class Modules(Enum):
    """Maps each supported ML framework to the dotted path of its datasets module."""
    # Members are defined conditionally: a framework only appears in this enum
    # when its package is importable, so iterating the enum yields only usable
    # dataset sources in the current environment.
    if is_package_available('sklearn'):
        sklearn = "sklearn.datasets"
    if is_package_available('keras'):
        keras = "keras.datasets"
    if is_package_available('torchvision'):
        # NOTE: the member is named "torch" although the package checked is torchvision.
        torch = "torchvision.datasets"
def check_determinism():
    """
    Check the environment for known sources of non-determinism and tag the
    current run accordingly.

    Currently inspects TensorFlow 1.14/1.15/2.0 cuDNN-related environment
    variables, which are known to introduce GPU-specific non-determinism.
    Emits warnings and sets "non-determinism" tags on the active run.
    """
    import re
    from pypads.app.pypads import get_current_pads
    pads = get_current_pads()
    if is_package_available('tensorflow'):
        import tensorflow
        tf_version = tensorflow.version.VERSION
        # Fix: the original called tensorflow.match(...), which does not exist
        # (AttributeError at runtime); the pattern is a plain regex match.
        if re.match(r"(1\.(14|15)|2\.0)", tf_version):
            if "TF_USE_CUDNN_AUTOTUNE" in os.environ:
                logger.warning(
                    "When using TF auto-tuning of cuDNN convolution algorithms your experiment might"
                    " be non-deterministic.")
                pads.api.set_tag("non-determinism", "CUDNN_AUTOTUNE")
            # Environment values are strings; the original compared with
            # `is not 1` (identity check against an int literal, always True
            # for strings). Compare against the string "1" instead.
            if ("TF_CUDNN_DETERMINISTIC" not in os.environ
                    or (not os.environ["TF_CUDNN_DETERMINISTIC"]
                        and os.environ["TF_CUDNN_DETERMINISTIC"] != "1")):
                if not is_package_available("tfdeterminism"):
                    logger.warning(
                        "Your experiment might include a gpu-specific sources of non-determinism."
                        " See https://github.com/NVIDIA/tensorflow-determinism"
                    )
                    pads.api.set_tag(
                        "non-determinism",
                        "TF auto-tuning of cuDNN convolution algorithms (see multi-algo note)"
                    )
class Types(Enum):
    """Maps common dataset container types to either the real class (when the
    providing package is installed) or its dotted-path string as a fallback."""
    # NOTE(review): inside an Enum body, `from pkg import Name` itself creates
    # an enum member named `Name` (the metaclass dict captures the binding).
    # That is why the numpy branch uses `Ndarray = ndarray` instead of
    # `ndarray = ndarray` (reassigning a key raises in _EnumDict), and why
    # Bunch/DataFrame/Series/Graph also end up as alias members. Confirm this
    # is intentional before "cleaning it up".
    if is_package_available('sklearn') and tracking_active:
        from sklearn.utils import Bunch
        bunch = Bunch
    else:
        bunch = "sklearn.utils.Bunch"
    if is_package_available('numpy'):
        from numpy import ndarray
        # Alias member: `ndarray` is already the member created by the import.
        Ndarray = ndarray
    else:
        ndarray = 'numpy.ndarray'
    if is_package_available('pandas'):
        from pandas import DataFrame, Series
        dataframe = DataFrame
        series = Series
    else:
        dataframe = 'pandas.DataFrame'
        series = 'pandas.Series'
    if is_package_available('networkx'):
        from networkx import Graph
        graph = Graph
    else:
        graph = 'networkx.Graph'
    # Builtin dict and typing.Tuple are always available.
    dict = dict
    tuple = Tuple
def ner_tagging(corpus):
    """Extract noun chunks and named entities from *corpus*.

    Uses spaCy's ``en_core_web_sm`` model when spaCy is installed. Chunks and
    entities containing code-like characters are filtered out.

    :param corpus: text to analyze.
    :return: a ``(str(nouns), str(entities))`` pair of stringified sets when
        spaCy is available, otherwise ``None`` (the nltk fallback is a TODO).
    """
    if is_package_available("spacy"):
        import spacy
        pipeline = spacy.load("en_core_web_sm")
        analyzed = pipeline(corpus)
        # Noun chunks: drop anything that looks like code (assignments, dotted paths).
        noun_texts = {
            chunk.text
            for chunk in analyzed.noun_chunks
            if "=" not in chunk.text and "." not in chunk.text
        }
        # Entities: additionally reject backticks and slashes.
        entity_texts = {
            ent.text
            for ent in analyzed.ents
            if not any(bad in ent.text for bad in ("=", ".", "`", "/"))
        }
        return str(noun_texts), str(entity_texts)
    elif is_package_available("nltk"):
        # TODO use nltk to find named entities https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
        pass
def _call(self, *args, _pypads_env: LoggerEnv, _pypads_autologgers=None, _logger_call, _logger_output, **kwargs):
    """
    Enable mlflow autologging for every requested framework that is imported,
    installed, and not yet activated.

    :param _pypads_autologgers: iterable of framework names to consider;
        defaults to all supported frameworks.

    Each framework is activated at most once per process (tracked in the
    module-level ``added_auto_logs`` set).
    """
    import importlib

    if _pypads_autologgers is None:
        _pypads_autologgers = [
            "keras", "tensorflow", "xgboost", "gluon", "spark", "lightgbm", "sklearn"
        ]

    # (autologger name, pip package whose availability is checked).
    # The names double as mlflow submodule names (mlflow.<name>.autolog);
    # note 'spark' is provided by the 'pyspark' package.
    specs = [
        ("tensorflow", "tensorflow"),
        ("keras", "keras"),
        ("xgboost", "xgboost"),
        ("gluon", "gluon"),
        ("spark", "pyspark"),
        ("lightgbm", "lightgbm"),
        ("sklearn", "sklearn"),
    ]

    for name, package in specs:
        # Only activate when requested, the framework is already imported by
        # the user, it hasn't been activated before, and the package resolves.
        if (name in _pypads_autologgers and name in sys.modules
                and name not in added_auto_logs and is_package_available(package)):
            added_auto_logs.add(name)
            mlflow_module = importlib.import_module("mlflow." + name)
            mlflow_module.autolog()
def finalize_output(pads, logger_call, output, *args, **kwargs):
    """Render the tracked pipeline graph to a PNG artifact (when graphviz
    tooling is available) and store the pipeline tracking object on *output*."""
    pipeline: PipelineTO = pads.cache.run_get("pipeline")
    from networkx import MultiDiGraph
    network: MultiDiGraph = pipeline.nx_network

    target_dir = get_temp_folder()
    image_path = os.path.join(target_dir, "pipeline_graph.png")
    if not os.path.exists(target_dir):
        pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)

    # NOTE(review): "agraph" does not look like a standalone installable
    # package (agraph support ships with networkx/pygraphviz) — confirm this
    # first check is meaningful or drop it.
    can_draw = (is_package_available("agraph")
                and is_package_available("graphviz")
                and is_package_available("pygraphviz"))
    if can_draw:
        from networkx.drawing.nx_agraph import to_agraph
        rendered = to_agraph(network)
        rendered.layout('dot')
        rendered.draw(image_path)
        pipeline.store_artifact(
            image_path, "pipeline_graph.png",
            description="A depiction of the underlying pipeline of the experiment.")

    output.pipeline = pipeline.store()
def is_installed(self):
    """
    Check if a match is installed
    :return:
    """
    if self.regex:
        # Regex matches: accept if any discovered version satisfies the constraint.
        discovered = find_package_regex_versions(self.name).values()
        return any(self.allows(v) for v in discovered if v is not None)

    # Exact-name match: guard clause for missing package.
    if not is_package_available(self.name):
        return False

    version = find_package_version(self.name)
    if version is None:
        raise VersionNotFoundException(
            "Couldn't find version for lib {}".format(self.name))
    return self.allows(version)
def set_random_seed(seed):
    """Seed all available random number generators with *seed*.

    Seeds the stdlib ``random`` module, numpy's global RNG, and — when torch
    is installed — torch's CPU and CUDA generators. Also records the seed in
    the module-level ``padre_seed``.
    """
    import random

    global padre_seed
    padre_seed = seed

    # Standard library RNG.
    random.seed(seed)

    # Numpy global RNG. Note: global seeds for numpy seem to not work with RandomState().
    numpy.random.seed(seed)

    # Pytorch, only when present.
    if is_package_available("torch"):
        # noinspection PyPackageRequirements,PyUnresolvedReferences
        import torch
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
import os import time from functools import wraps from typing import List from pypads.utils.util import is_package_available if is_package_available("joblib"): import joblib original_delayed = joblib.delayed @wraps(original_delayed) def punched_delayed(fn): """Decorator used to capture the arguments of a function.""" @wraps(fn) def wrapped_function(*args, _pypads_cache=None, _pypads_config=None, _pypads_active_run_id=None, _pypads_tracking_uri=None, _pypads_affected_modules=None, _pypads_triggering_process=None, **kwargs): from pypads.parallel.util import _pickle_tuple, _cloudpickle_tuple from pypads import logger # only if pads data was passed if _pypads_active_run_id: # noinspection PyUnresolvedReferences from pypads.app import pypads
# --- TorchVision Dataset object --- def torch_crawler(obj: Crawler, **kwargs): logger.info("Detecting a torchvision dataset loaded object. Crawling any available metadata...") data = obj.data.data.numpy() targets = obj.data.targets.numpy() train = obj.data.train source = obj.data.training_file if train else obj.data.test_file metadata = {"format": obj.format, "shape": data.shape, "classes": obj.data.classes, "Description": obj.data.__repr__(), "training_data": train, "source": source} # metadata = {**metadata, **kwargs} return data, metadata, targets if is_package_available("torchvision"): Crawler.register_fn(Modules.torch.value, torch_crawler) # --- Keras datasets --- def keras_crawler(obj: Crawler, **kwargs): logger.info("Detecting a keras dataset loaded object. Crawling any available metadata...") (X_train, y_train), (X_test, y_test) = obj.data import numpy as np targets = np.concatenate([y_train, y_test]) data = np.concatenate([np.concatenate([X_train, X_test]), targets.reshape(len(targets), 1)], axis=1) metadata = {"format": obj.format, "shape": data.shape} metadata = {**metadata, **kwargs} return data, metadata, targets
def numpy_seed(seed):
    """Replacement for ``numpy.random.seed`` that also records the seed in the
    active pypads run before delegating to the original implementation.

    Tracking failures never prevent the seed from being applied.
    """
    try:
        from pypads.app.pypads import get_current_pads
        pads = get_current_pads()
        pads.cache.run_add("numpy.random.seed", seed)
        log_random_seed("numpy.random.seed")
        return original_numpy(seed)
    except Exception as e:
        import warnings
        # Fix: the original built a Warning instance without raising or
        # logging it (a silent no-op); actually emit the warning.
        warnings.warn("Tracker failed to log the set seed because %s" % str(e))
        return original_numpy(seed)


numpy.random.seed = numpy_seed

# --- pytorch seed ---
if is_package_available("torch"):
    # noinspection PyPackageRequirements,PyUnresolvedReferences
    import torch

    original_torch = torch.manual_seed

    def torch_seed(seed):
        """Replacement for ``torch.manual_seed`` that also records the seed in
        the active pypads run before delegating to the original implementation.

        Tracking failures never prevent the seed from being applied.
        """
        try:
            from pypads.app.pypads import get_current_pads
            pads = get_current_pads()
            pads.cache.run_add("torch.seed", seed)
            log_random_seed("torch.seed")
            return original_torch(seed)
        except Exception as e:
            import warnings
            # Fix: same silent no-op Warning(...) as above; emit it properly.
            warnings.warn("Tracker failed to log the set seed because %s" % str(e))
            return original_torch(seed)