def _get_git_revision():
    try:
        revision = subprocess.check_output(REVISION_CMD.split()).strip()
    except subprocess.CalledProcessError:
        logger.info('Failed to execute git to get revision')
        return None
    return revision.decode('utf-8')
def commit(pads, *args, **kwargs):
    message = "Added results for run " + pads.api.active_run().info.run_id
    pads.managed_result_git.commit_changes(message=message)
    repo = pads.managed_result_git.repo
    remotes = repo.remotes
    if not remotes:
        logger.warning("Your results don't have any remote repository set. "
                       "Set a remote repository to enable automatic pushing.")
    else:
        for remote in remotes:
            name, url = remote.name, list(remote.urls)[0]
            try:
                # Check if the remote repo is bare and, if it is, initialize it with a temporary local repo
                pads.managed_result_git.is_remote_empty(remote=name, remote_url=url, init=True)
                # Stash the current state
                repo.git.stash('push', '--include-untracked')
                # Force pull
                repo.git.pull(name, 'master', '--allow-unrelated-histories')
                # Push merged changes
                repo.git.push(name, 'master')
                logger.info("Pushed your results automatically to " + name + " @:" + url)
                # Pop the stash
                repo.git.stash('pop')
            except Exception as e:
                logger.error("Pushing logs to remote failed due to this error '{}'".format(str(e)))
def numpy_crawler(obj: Crawler, target_columns=None, **kwargs):
    logger.info("Detecting a dataset object of type 'numpy.ndarray'. Crawling any available metadata...")
    # , (obj.data[:, i].min(), obj.data[:, i].max())
    if len(obj.data.shape) == 2:
        features = [(str(i), str(obj.data[:, i].dtype), False) for i in range(obj.data.shape[1])]
    else:
        # TODO for multidim datasets
        features = None
    metadata = {"type": str(obj.format), "shape": obj.data.shape, "features": features}
    metadata = {**metadata, **kwargs}
    targets = None
    try:
        if target_columns:
            targets = obj.data[:, target_columns]
            if isinstance(target_columns, Iterable):
                for c in target_columns:
                    feature = metadata["features"][c]
                    metadata["features"][c] = (feature[0], feature[1], True)
            else:
                feature = metadata["features"][target_columns]
                metadata["features"][target_columns] = (feature[0], feature[1], True)
    except Exception as e:
        logger.warning(str(e))
    return obj.data, metadata, targets
def activate_tracking(self, reload_modules=False, reload_warnings=True, clear_imports=False, affected_modules=None):
    """
    Function to duck punch all objects defined in the mapping files. Ideally, this is called before importing any
    libraries that should be tracked.

    :param affected_modules: Affected modules of the mapping files.
    :param clear_imports: Clear imports after punching. CAREFUL, THIS IS EXPERIMENTAL!
    :param reload_warnings: Show warnings for affected modules which were already imported before importlib was
        extended.
    :param reload_modules: Force a reload of affected modules. CAREFUL, THIS IS EXPERIMENTAL!
    :return:
    """
    if affected_modules is None:
        # Modules are affected if they are mapped by a library or are already punched
        affected_modules = self.wrap_manager.module_wrapper.punched_module_names | \
                           {l.name for l in self.mapping_registry.get_libraries()}

    global tracking_active
    if not tracking_active:
        logger.info("Activating tracking by extending importlib...")

        from pypads.app.pypads import set_current_pads
        set_current_pads(self)

        # Add our loader to the meta_path
        extend_import_module()

        import sys
        import importlib
        loaded_modules = [(name, module) for name, module in sys.modules.items()]
        for name, module in loaded_modules:
            if self.is_affected_module(name, affected_modules):
                if reload_warnings:
                    logger.warning(
                        name + " was imported before PyPads. To enable tracking, import PyPads first or use "
                               "reload_modules / clear_imports. Instances created before activation are not tracked.")
                if clear_imports:
                    del sys.modules[name]
                if reload_modules:
                    try:
                        spec = importlib.util.find_spec(module.__name__)
                        duck_punch_loader(spec)
                        loader = spec.loader
                        module = loader.load_module(module.__name__)
                        loader.exec_module(module)
                        importlib.reload(module)
                    except Exception as e:
                        logger.debug("Couldn't reload module " + str(e))
        tracking_active = True
    else:
        # TODO check if a second tracker / tracker activation doesn't break the tracking
        logger.warning("Tracking was already activated.")
    return self
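# A minimal usage sketch (experiment name and sklearn import below are only placeholders): set up PyPads
# before importing the libraries that should be tracked, as the tests further down in this section also do.
from pypads.app.base import PyPads

tracker = PyPads(autostart=False)
tracker.activate_tracking()                            # extend importlib with the PyPads loader
tracker.start_track(experiment_name="Example run")     # start the tracked run

from sklearn.tree import DecisionTreeClassifier        # imported after activation, so it can be wrapped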
def entry(_cls, *args, _pypads_context=context, pypads_mapped_by=mappings, **kwargs):
    logger.debug("Call to tracked class method " + str(fn))

    global error
    if self._pypads.api.active_run():
        error = False
        with self._make_call(_cls, fn_reference) as call:
            accessor = call.call_id
            # Add the function to the callback stack
            callback = types.MethodType(fn, _cls)

            # For every hook add
            if self._is_skip_recursion(accessor):
                logger.info("Skipping " + str(accessor.context.container.__name__) + "." + str(
                    accessor.wrappee.__name__))
                out = callback(*args, **kwargs)
                return out

            hooks = context.get_hooks(fn)
            for (h, config) in hooks:
                c = self._add_hook(h, config, callback, call, context.get_wrap_metas(fn))
                if c:
                    callback = types.MethodType(c, _cls)

            # Start executing the stack
            out = callback(*args, **kwargs)
    else:
        if not error:
            error = True
            logger.error(
                "No run was active to log your hooks. You may want to start a run with PyPads().start_track()")
        callback = types.MethodType(fn, _cls)
        out = callback(*args, **kwargs)
    return out
def keras_crawler(obj: Crawler, **kwargs):
    logger.info("Detecting a loaded keras dataset object. Crawling any available metadata...")
    (X_train, y_train), (X_test, y_test) = obj.data
    import numpy as np
    targets = np.concatenate([y_train, y_test])
    data = np.concatenate([np.concatenate([X_train, X_test]), targets.reshape(len(targets), 1)], axis=1)
    metadata = {"format": obj.format, "shape": data.shape}
    metadata = {**metadata, **kwargs}
    return data, metadata, targets
def torch_crawler(obj: Crawler, **kwargs):
    logger.info("Detecting a loaded torchvision dataset object. Crawling any available metadata...")
    data = obj.data.data.numpy()
    targets = obj.data.targets.numpy()
    train = obj.data.train
    source = obj.data.training_file if train else obj.data.test_file
    metadata = {"format": obj.format, "shape": data.shape, "classes": obj.data.classes,
                "Description": obj.data.__repr__(), "training_data": train, "source": source}
    # metadata = {**metadata, **kwargs}
    return data, metadata, targets
def sklearn_crawler(obj: Crawler, **kwargs):
    logger.info("Detecting a loaded sklearn dataset object. Crawling any available metadata...")
    import numpy as np
    if "return_X_y" in kwargs and kwargs.get("return_X_y"):
        X, y = obj.data
        data = np.concatenate([X, y.reshape(len(y), 1)], axis=1)
        features = [(str(i), str(X[:, i].dtype), False) for i in range(X.shape[1])]
        features.append(("class", str(y.dtype), True))
        metadata = {"type": str(obj.format), "features": features, "shape": (X.shape[0], X.shape[1] + 1)}
        metadata = {**metadata, **kwargs}
        return data, metadata, y
    else:
        return bunch_crawler(obj, **kwargs)
def generator():
    logger.info("Detected splitting. Tracking of splits started...")
    pads.cache.add("tracking_mode", "single")
    if _logger_output.splits is None:
        splits = SplitTO(parent=_logger_output)
    else:
        splits = _logger_output.splits
    train, test, val = splitter_output(_return, fn=_pypads_env.callback)
    split_id = uuid.uuid4()
    pads.cache.run_add("current_split", split_id)
    splits.add_split(split_id, train, test, val)
    _logger_output.splits = splits
    return _return
def dataframe_crawler(obj: Crawler, target_columns, **kwargs):
    logger.info("Detecting a dataset object of type 'pandas.DataFrame'. Crawling any available metadata...")
    data = obj.data
    features = []
    for i, col in enumerate(data.columns):
        flag = col in target_columns if target_columns is not None else False
        features.append((col, str(data[col].dtype), flag))
    metadata = {"type": str(obj.format), "shape": data.shape, "features": features}
    metadata = {**metadata, **kwargs}
    targets = None
    if target_columns is not None:
        targets = data[target_columns].values
    else:
        logger.warning("Target values might be inaccurate or not tracked.")
    return data, metadata, targets
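# A minimal sketch of what dataframe_crawler returns; the SimpleNamespace stand-in is hypothetical and only
# mimics the two attributes this function reads from a Crawler instance (.data and .format).
import pandas as pd
from types import SimpleNamespace

_df = pd.DataFrame({"sepal_len": [5.1, 4.9], "species": ["setosa", "versicolor"]})
_fake_crawler = SimpleNamespace(data=_df, format="pandas.DataFrame")

_data, _metadata, _targets = dataframe_crawler(_fake_crawler, target_columns=["species"])
# _metadata["features"] -> [('sepal_len', 'float64', False), ('species', 'object', True)]
# _targets              -> values of the selected target column(s)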
def generator():
    pads.cache.add("tracking_mode", "multiple")
    logger.info("Detected splitting. Tracking of splits started...")
    if _logger_output.splits is None:
        splits = SplitTO(parent=_logger_output)
    else:
        splits = _logger_output.splits
    for r in items:
        split_id = uuid.uuid4()
        pads.cache.run_add("current_split", split_id)
        train, test, val = splitter_output(r, fn=_pypads_env.callback)
        splits.add_split(split_id, train, test, val)
        _logger_output.splits = splits
        yield r
def __post__(self, ctx, *args, _logger_call, _pypads_pre_return, _pypads_result, _logger_output, _args, _kwargs,
             **kwargs):
    from pypads.app.pypads import get_current_pads
    pads = get_current_pads()
    if _logger_output.splits is None:
        splits = SplitTO(parent=_logger_output)
    else:
        splits = _logger_output.splits
    logger.info("Detected splitting. Tracking of splits started...")
    train, test, val = splitter_output(_pypads_result, fn=ctx)
    split_id = uuid.uuid4()
    pads.cache.run_add("current_split", split_id)
    splits.add_split(split_id, train, test, val)
    # splits.store(_logger_output, "splits")
    _logger_output.splits = splits
def __call_wrapped__(self, ctx, *args, _pypads_env: InjectionLoggerEnv, _logger_call, _logger_output, _args, _kwargs):
    from pypads.app.pypads import get_current_pads
    pads = get_current_pads()
    if pads.cache.run_get("parameter_search", False):
        logger.info("Executing a parameter search under a nested run.")
        with pads.api.intermediate_run(experiment_id=pads.api.active_run().info.experiment_id,
                                       clear_cache=False, setups=False):
            _return, time = OriginalExecutor(fn=_pypads_env.callback)(*_args, **_kwargs)
        return _return, time
    else:
        return OriginalExecutor(fn=_pypads_env.callback)(*_args, **_kwargs)
def add_results(self, cv_results: dict):
    """
    Parse the result dict of a sklearn grid search.
    """
    logger.info("Logging grid search results....")
    mean_scores = validate_type(cv_results.get('mean_test_score', []))
    std_scores = validate_type(cv_results.get('std_test_score', []))
    rankings = validate_type(cv_results.get('rank_test_score', []))
    for i, params in enumerate(cv_results.get('params', [])):
        self.results.append(
            self.ParamSearchModel.SearchModel(
                index=validate_type(i),
                setting=validate_type(params),
                mean_score=mean_scores[i],
                std_score=std_scores[i],
                ranking=rankings[i]))
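# A minimal sketch of the cv_results dict expected here; the keys mirror sklearn's GridSearchCV.cv_results_
# (only the entries read by add_results are shown), the values are made up for illustration.
cv_results_example = {
    "params": [{"max_depth": 2}, {"max_depth": 4}],
    "mean_test_score": [0.81, 0.87],
    "std_test_score": [0.02, 0.01],
    "rank_test_score": [2, 1],
}
# Typical call site after a fitted search (hypothetical holder object):
#   param_search_output.add_results(search.cv_results_)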
def _call(self, *args, _pypads_env: LoggerEnv, _logger_call, _logger_output, **kwargs):
    pads = _pypads_env.pypads
    logger.info("Tracking execution to run with id " + pads.api.active_run().info.run_id)
    dependencies = DependencyTO(parent=_logger_output)
    try:
        # Execute pip freeze
        try:
            # noinspection PyProtectedMember,PyPackageRequirements
            from pip._internal.operations import freeze
        except ImportError:  # pip < 10.0
            # noinspection PyUnresolvedReferences,PyPackageRequirements
            from pip.operations import freeze
        dependencies.add_dependency(list(freeze.freeze()))
    except Exception as e:
        _logger_output.set_failure_state(e)
    finally:
        _logger_output.dependencies = dependencies.store()
def _init_git_repo(self, path, source=True):
    """
    Initializes a new git repo if none is found.
    :param path: Path at which the repository should be initialized.
    :param source: Whether to create an initial commit after initialization.
    :return:
    """
    import git
    try:
        self.repo = git.Repo.init(path, bare=False)
        self._add_git_ignore()
        if source:
            self.commit_changes(message="Pypads initial commit")
        logger.info("Repository was successfully initialized")
    except (InvalidGitRepositoryError, GitCommandError, GitError) as e:
        logger.error("No repository was present and git could not initialize a repository in this directory"
                     " {0} because of exception: {1}".format(path, e))
def test_default_tracking(self):
    """
    This example will track the experiment execution with the default configuration.
    :return:
    """
    # --------------------------- setup of the tracking ---------------------------
    # Activate tracking of pypads
    from pypads.app.base import PyPads
    tracker = PyPads(autostart=False, log_level="WARNING")
    tracker.start_track(experiment_name="1. Experiment")

    tracker.actuators.set_random_seed(seed=1)

    import timeit
    t = timeit.Timer(sklearn_simple_decision_tree_experiment)
    from pypads import logger
    logger.info(t.timeit(1))

    # --------------------------- asserts ---------------------------
    import mlflow
    run = mlflow.active_run()
    assert tracker.api.active_run().info.run_id == run.info.run_id
    tracker.api.end_run()
def test_default_tracking(self):
    """
    This example will track the experiment execution with the default configuration.
    :return:
    """
    # --------------------------- setup of the tracking ---------------------------
    # Activate tracking of pypads
    from pypads.app.base import PyPads
    tracker = PyPads(uri="http://mlflow.padre-lab.eu")
    tracker.activate_tracking()
    tracker.start_track()

    from pypads import logger
    logger.info("Test logger")

    import timeit
    t = timeit.Timer(sklearn_simple_decision_tree_experiment)
    logger.info(t.timeit(1))

    # --------------------------- asserts ---------------------------
    tracker.api.end_run()
import argparse

# Initialize parser
from pypads import logger

parser = argparse.ArgumentParser()

# Adding optional argument
parser.add_argument("-o", "--OntologyUri", default="https://www.padre-lab.eu/onto/",
                    help="Set the base URI for concepts defined in an ontology.")
# TODO add ontology password

# Read arguments from command line
args, _ = parser.parse_known_args()

if args.OntologyUri:
    logger.info("Setting PyPads base ontology URI to: %s" % args.OntologyUri)
    ontology_uri = args.OntologyUri
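# A minimal usage sketch (script name is hypothetical): the flag overrides the default base URI,
# otherwise parse_known_args() falls back to https://www.padre-lab.eu/onto/.
#
#   python run_with_pypads.py --OntologyUri https://example.org/onto/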
def __pre__(self, ctx, *args, **kwargs):
    logger.info("Pypads tracked class " + str(ctx.__class__) + " initialized.")
def activate(pypads, *args, **kwargs):
    logger.info("Trying to configure onto plugin for pypads...")
    configure_plugin(pypads, *args, **kwargs)
    logger.info("Finished configuring onto plugin for pypads!")
def add_wrappings(self, module):
    """
    Function that looks for matched mappings and injects the corresponding logging functionality.
    :param self: context
    :param module: module to be wrapped
    """
    from pypads.app.pypads import current_pads
    reference = module.__name__

    # History to check if a class inherits a wrapping intra-module
    mro_entry_history = {}

    if current_pads:
        # TODO we might want to make this configurable/improve performance.
        #  This looks at every imported class and every mapping.
        # On execution of a module we search for relevant mappings
        # For every var on module
        try:
            members = inspect.getmembers(
                module, lambda x: hasattr(x, "__module__") and x.__module__ == module.__name__)
        except Exception as e:
            logger.debug("getmembers of inspect failed on module '" + str(module.__name__) + "' with exception "
                         + str(e) + ". Falling back to dir to get the members of the module.")
            members = [(name, getattr(module, name)) for name in dir(module)]

        for name, obj in members:
            if obj is not None:
                obj_ref = ".".join([reference, name])
                package = Package(module, PackagePath(obj_ref))

                # Skip modules if they are from another package for now
                if inspect.ismodule(obj):
                    if not module.__name__.split(".")[0] == obj.__name__.split(".")[0]:
                        continue

                mappings = set()
                if inspect.isclass(obj) and hasattr(obj, "mro"):
                    try:
                        # Look at the MRO and add classes to be punched which inherit from our punched classes
                        mro_ = obj.mro()[1:]
                        for entry in mro_:
                            if entry not in mro_entry_history.keys():
                                mro_entry_history[entry] = [obj]
                            else:
                                mro_entry_history[entry].append(obj)
                            if hasattr(entry, "_pypads_mapping_" + entry.__name__):
                                found_mappings = _add_inherited_mapping(obj, entry)
                                mappings = mappings.union(found_mappings)
                    except Exception as e:
                        logger.debug("Skipping some superclasses of " + str(obj) + ". " + str(e))
                mappings = mappings.union(_get_relevant_mappings(package))
                if len(mappings) > 0:
                    if not has_delayed_wrapping():
                        current_pads.wrap_manager.wrap(
                            obj, Context(module, reference),
                            {MatchedMapping(mapping, package.path) for mapping in mappings})
                    else:
                        _first_in_queue = list(_import_loggers_queues.keys())[0]
                        if _first_in_queue not in _wrapping_queues:
                            _wrapping_queues[_first_in_queue] = []
                        _wrapping_queues[_first_in_queue].append(
                            (obj, Context(module, reference),
                             {MatchedMapping(mapping, package.path) for mapping in mappings}))

        if reference in _import_loggers_queues:
            # Execute import loggers of this reference
            while len(_import_loggers_queues[reference]) > 0:
                (fn, config) = _import_loggers_queues[reference].pop()
                fn(self)
            del _import_loggers_queues[reference]

        if reference in _wrapping_queues:
            for (obj, ctx, mm) in _wrapping_queues[reference]:
                current_pads.wrap_manager.wrap(obj, ctx, mm)
            del _wrapping_queues[reference]

        if reference in current_pads.wrap_manager.module_wrapper.punched_module_names:
            logger.info(f"PyPads wrapped functions of module {reference}.")
def __post__(self, ctx, *args, _logger_call, _pypads_pre_return, _pypads_result, _logger_output, _args, _kwargs,
             **kwargs):
    """
    :param ctx:
    :param args:
    :param _pypads_result:
    :param kwargs:
    :return:
    """
    from pypads.app.pypads import get_current_pads
    pads = get_current_pads()

    preds = _pypads_result
    if pads.cache.run_exists("predictions"):
        preds = pads.cache.run_pop("predictions")

    # Check if there is info about decision scores
    probabilities = None
    if pads.cache.run_exists("probabilities"):
        probabilities = pads.cache.run_pop("probabilities")

    # Check if there is info on truth values
    targets = None
    if pads.cache.run_exists("targets"):
        targets = pads.cache.run_get("targets")

    # Check if there exists information about the current split
    current_split = None
    split_id = None
    mode = None
    splits = None
    if pads.cache.run_exists("current_split"):
        split_id = pads.cache.run_get("current_split")
        splitter = pads.cache.run_get(pads.cache.run_get("split_tracker"))
        splits = splitter.get("output").splits.splits
        mode = pads.cache.get("tracking_mode", "single")
        current_split = splits.get(str(split_id), None)

    # Depending on available info log the predictions
    if current_split is None:
        logger.warning("No split information was found in the cache of the current run; individual decision "
                       "tracking might be missing truth values. Try to decorate your splitter!")
    else:
        logger.info("Logging single instance / individual decisions depending on the availability of split "
                    "information, predictions, probabilities and target values.")
        if mode == "multiple" and _len(preds) == _len(targets):
            _logger_output.individual_decisions = []
            for split_id, split in splits.items():
                decisions = SingleInstanceTO(split_id=uuid.UUID(split_id), parent=_logger_output)
                if split.test_set is not None:
                    try:
                        for i, instance in enumerate(split.test_set):
                            prediction = preds[i]
                            probability_scores = []
                            if probabilities is not None:
                                probability_scores = _tolist(probabilities[i])
                            truth = None
                            if targets is not None:
                                truth = targets[instance]
                            decisions.add_decision(instance=instance, truth=truth, prediction=prediction,
                                                   probabilities=probability_scores)
                        _logger_output.individual_decisions.append(decisions.store())
                    except Exception as e:
                        logger.warning("Could not log single instance decisions due to this error '%s'" % str(e))
        else:
            decisions = SingleInstanceTO(split_id=split_id, parent=_logger_output)
            if current_split.test_set is not None:
                try:
                    for i, instance in enumerate(current_split.test_set):
                        prediction = preds[i]
                        probability_scores = []
                        if probabilities is not None:
                            probability_scores = _tolist(probabilities[i])
                        truth = None
                        if targets is not None:
                            truth = targets[instance]
                        decisions.add_decision(instance=instance, truth=truth, prediction=prediction,
                                               probabilities=probability_scores)
                    _logger_output.individual_decisions = decisions.store()
                except Exception as e:
                    logger.warning("Could not log single instance decisions due to this error '%s'" % str(e))
def series_crawler(obj: Crawler, **kwargs):
    logger.info("Detecting a dataset object of type 'pandas.Series'. Crawling any available metadata...")
    data = obj.data
    metadata = {"type": str(obj.format), "shape": data.shape}
    metadata = {**metadata, **kwargs}
    return data, metadata, None
def graph_crawler(obj: Crawler, **kwargs):
    logger.info("Detecting a dataset object of type 'networkx.Graph'. Crawling any available metadata...")
    graph = obj.data
    metadata = {"type": str(obj.format), "shape": (graph.number_of_edges(), graph.number_of_nodes())}
    metadata = {**metadata, **kwargs}
    return graph, metadata, None
def __post__(self, ctx, *args, _pypads_env: InjectionLoggerEnv, _logger_call: InjectionLoggerCallModel, _logger_output,
             _pypads_result, _args, _kwargs, _pypads_write_format=FileFormats.pickle, **kwargs):
    pads = _pypads_env.pypads

    # If the return object is None, take the object instance ctx
    dataset_object = _pypads_result if _pypads_result is not None else ctx

    mapping_data = _pypads_env.data
    dataset_data = data_str(mapping_data, "dataset", "@schema", default={})

    # Get additional arguments if given by the user
    _dataset_kwargs = dict()
    if pads.cache.run_exists("dataset_kwargs"):
        _dataset_kwargs = pads.cache.run_get("dataset_kwargs")

    # Scrape the data object
    crawler = Crawler(dataset_object, ctx=_logger_call.original_call.call_id.context.container,
                      callback=_logger_call.original_call.call_id.wrappee, kw=_kwargs)
    data, metadata, targets = crawler.crawl(**_dataset_kwargs)
    if targets is not None:
        pads.cache.run_add("targets", targets)

    # Look for metadata information given by the user when using the decorators
    if pads.cache.run_exists("dataset_metadata"):
        metadata = {**metadata, **pads.cache.run_get("dataset_metadata")}

    # Get the dataset object name
    if hasattr(dataset_object, "name"):
        ds_name = dataset_object.name
    elif pads.cache.run_exists("dataset_name") and pads.cache.run_get("dataset_name") is not None:
        ds_name = pads.cache.run_get("dataset_name")
    else:
        ds_name = _logger_call.original_call.call_id.wrappee.__qualname__

    # Compile identifying hash
    try:
        data_hash = persistent_hash(str(dataset_object))
    except Exception:
        logger.warning("Could not compute the hash of the dataset object, falling back to dataset name hash...")
        data_hash = persistent_hash((str(ds_name), str(metadata)))

    # Create referencing object
    dto = DatasetTO(parent=_logger_output, name=ds_name, shape=metadata.get("shape", None), metadata=metadata,
                    repository_reference=data_hash, repository_type=_pypads_env.pypads.dataset_repository.name)

    # Add to repo if needed
    if not pads.dataset_repository.has_object(uid=data_hash):
        logger.info("Detected dataset was not found in the store. Adding an entry...")
        repo_obj = pads.dataset_repository.get_object(uid=data_hash)
        if isinstance(data, dict):
            binary_refs = []
            for k, v in data.items():
                binary_refs.append(
                    repo_obj.log_mem_artifact(dto.name + "_" + k, v, write_format=_pypads_write_format,
                                              description="Dataset binary part: {}".format(k),
                                              additional_data=metadata, holder=dto))
        else:
            binary_refs = repo_obj.log_mem_artifact(dto.name, dataset_object, write_format=_pypads_write_format,
                                                    description="Dataset binary", additional_data=metadata,
                                                    holder=dto)
        logger.info("Entry added in the dataset repository.")

        documentation = "Documentation missing"
        if ctx:
            documentation = ctx.__doc__
        elif _logger_call.original_call.call_id.wrappee.__doc__:
            documentation = _logger_call.original_call.call_id.wrappee.__doc__

        # Create repository object
        dro = DatasetRepositoryObject(name=data_str(dataset_data, "rdfs:label", default=dto.name),
                                      uid=data_hash,
                                      description=data_str(dataset_data, "rdfs:description",
                                                           default="Some unknown Dataset"),
                                      documentation=data_str(dataset_data, "padre:documentation",
                                                             default=documentation),
                                      binary_references=binary_refs,
                                      location=_logger_call.original_call.call_id.context.reference,
                                      additional_data=dataset_data)
        repo_obj.log_json(dro)

    # Store object
    _logger_output.dataset = dto.store()
def activate(pypads, *args, **kwargs):
    from pypads_padre.app.plugin import configure_plugin
    logger.info("Trying to configure padre plugin for pypads...")
    configure_plugin(pypads, *args, **kwargs)
    logger.info("Finished configuring padre plugin for pypads!")