class EnvCreateArgs(RelatedConfigMixin):
    model = StrSequenceField(str, required=True)
    source = related.StringField(required=True)
    dataloader = StrSequenceField(str, default=[], required=False)
    env = related.StringField(default=None, required=False)
    gpu = related.BooleanField(default=False, required=False)
    tmpdir = related.StringField(default=None, required=False)
    vep = related.BooleanField(default=False, required=False)
class DataLoaderArgument(RelatedConfigMixin):
    # MAYBE - make this a general argument class
    doc = related.StringField("", required=False)
    example = AnyField(required=False)
    name = related.StringField(required=False)
    type = related.StringField(default='str', required=False)
    optional = related.BooleanField(default=False, required=False)
    tags = StrSequenceField(str, default=[], required=False)  # TODO - restrict the tags

    def __attrs_post_init__(self):
        if self.doc == "":
            logger.warning("doc empty for one of the dataloader `args` fields")
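# Usage sketch: a DataLoaderArgument as it would be parsed from the `args:`
# section of a dataloader.yaml. This assumes RelatedConfigMixin.from_config
# accepts a plain dict (as it is used throughout this module); the field
# values below are illustrative, not taken from a real dataloader.
intervals_arg = DataLoaderArgument.from_config({
    "name": "intervals_file",
    "doc": "bed6 file containing intervals to predict on",
    "type": "str",
    "optional": False,
})
assert not intervals_arg.optional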
class Info(RelatedConfigMixin):
    """Class holding information about the component.

    Parses the info section in component.yaml:

    info:
      authors:
        - name: Ziga Avsec
      doc: RBP binding prediction
      name: rbp_eclip
      version: 0.1
    """
    authors = related.SequenceField(Author, repr=True, required=False)
    doc = related.StringField("", required=False)  # free-text description of the model
    name = related.StringField(required=False)  # TODO - deprecate
    version = related.StringField(default="0.1", required=False)
    license = related.StringField(default="MIT", required=False)  # license of the model/dataloader - defaults to MIT
    tags = StrSequenceField(str, default=[], required=False)

    def __attrs_post_init__(self):
        if self.authors and self.doc == "":
            logger.warning("doc empty for the `info:` field")
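# Usage sketch: parsing the `info:` section shown in the docstring above.
# Assumes related coerces the author dicts into Author objects via
# RelatedConfigMixin.from_config; the values mirror the docstring example.
info = Info.from_config({
    "authors": [{"name": "Ziga Avsec"}],
    "doc": "RBP binding prediction",
    "name": "rbp_eclip",
    "version": "0.1",
})
assert info.license == "MIT"  # not specified above, so the default applies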
class ArraySchema(RelatedConfigMixin):
    """
    Args:
      shape: Tuple of shape (same as in Keras for the input)
      doc: Description of the array
      special_type: str, special type name. Could also be an array of special entries?
      metadata_entries: str or list of metadata
    """
    verbose = True
    shape = TupleIntField()
    doc = related.StringField("", required=False)
    # MAYBE - allow a list of strings?
    #         - could be useful when a single array can have multiple 'attributes'
    name = related.StringField(required=False)
    special_type = related.ChildField(ArraySpecialType, required=False)
    associated_metadata = StrSequenceField(str, default=[], required=False)
    column_labels = StrSequenceField(str, default=[], required=False)
    # either a list or a path to a file --> need to check whether it's a list

    # TODO shall we have
    # - associated_metadata in ArraySchema
    # OR
    # - associated_array in MetadataField?

    # assert that there are no Nones in the shape, assume that channels is the only 4 or it is the last
    # update the model schema shape on calling batch_iter method
    # overwrite the batch_iter method of the returned dataloader --> decorator needed

    def print_msg(self, msg):
        if self.verbose:
            print("ArraySchema mismatch")
            print(msg)

    def _validate_list_column_labels(self):
        dim_ok = len(self.shape) >= 1
        if dim_ok and (self.shape[0] is not None):
            dim_ok &= len(self.column_labels) == self.shape[0]
        if not dim_ok:
            self.print_msg("Column annotation does not match array dimension with shape %s and %d labels (%s ...)"
                           % (str(self.shape), len(self.column_labels), str(self.column_labels)[:30]))

    def __attrs_post_init__(self):
        if len(self.column_labels) > 1:
            # check that the number of labels matches the columns
            self._validate_list_column_labels()
        elif len(self.column_labels) == 1:
            label = self.column_labels.list[0]
            import os
            # If the single entry is a path to an existing file, read the labels
            # from it. A missing path should raise an exception only at test time
            # and only a warning at prediction time.
            if os.path.exists(label):
                with open(label, "r") as ifh:
                    object.__setattr__(self, "column_labels", [l.rstrip() for l in ifh])
                self._validate_list_column_labels()
            else:
                object.__setattr__(self, "column_labels", None)

    def compatible_with_batch(self, batch, verbose=True):
        """Checks compatibility with a particular batch of data

        Args:
          batch: numpy array
          verbose: print the fail reason
        """
        def print_msg(msg):
            if verbose:
                print("ArraySchema mismatch")
                print(msg)

        # type = np.ndarray
        if not isinstance(batch, np.ndarray):
            print_msg("Expecting a np.ndarray. Got type(batch) = {0}".format(type(batch)))
            return False

        if not batch.ndim >= 1:
            print_msg("The array is a scalar (expecting at least the batch dimension)")
            return False

        return self.compatible_with_schema(ArraySchema(shape=batch.shape[1:], doc=""))

    def compatible_with_schema(self, schema, name_self="", name_schema="", verbose=True):
        """Checks the compatibility with another schema

        Args:
          schema: Other ArraySchema
          name_self: How to call self in the error messages
          name_schema: How to call the other ArraySchema in the error messages
          verbose: bool, describe what went wrong through print()
        """
        def print_msg(msg):
            if verbose:
                # print("ArraySchema mismatch")
                print(msg)

        if not isinstance(schema, ArraySchema):
            print_msg("Expecting ArraySchema. Got type({0} schema) = {1}".format(name_schema, type(schema)))
            return False

        def print_msg_template():
            print("ArraySchema mismatch")
            print("Array shapes don't match for the fields:")
            print("--")
            print(name_self)
            print("--")
            print(self.get_config_as_yaml())
            print("--")
            print(name_schema)
            print("--")
            print(schema.get_config_as_yaml())
            print("--")
            print("Provided shape (without the batch axis): {0}, expected shape: {1} ".format(bshape, self.shape))

        bshape = schema.shape
        if not len(bshape) == len(self.shape):
            print_msg_template()
            return False
        for i in range(len(bshape)):
            if bshape[i] is not None and self.shape[i] is not None:
                # shapes don't match
                if not bshape[i] == self.shape[i]:
                    print_msg_template()
                    return False
        return True
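# Usage sketch: validating a batch against a declared schema. `np` is the
# numpy module already imported at the top of this file; the shapes below are
# illustrative.
dna_schema = ArraySchema(shape=(101, 4), doc="one-hot encoded DNA sequence")
ok_batch = np.zeros((32, 101, 4))    # batch axis first; it is stripped before comparison
bad_batch = np.zeros((32, 101, 5))   # channel dimension mismatch
assert dna_schema.compatible_with_batch(ok_batch)
assert not dna_schema.compatible_with_batch(bad_batch)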
class Dependencies(RelatedConfigMixin):
    conda = StrSequenceField(str, default=[], required=False, repr=True)
    pip = StrSequenceField(str, default=[], required=False, repr=True)
    # not really required
    conda_channels = related.SequenceField(str, default=["defaults"], required=False, repr=True)

    def __attrs_post_init__(self):
        """In case conda or pip are filenames pointing to existing files,
        read the files and populate the package names
        """
        if len(self.conda) == 1 and self.conda[0].endswith(".txt") and \
                os.path.exists(self.conda[0]):
            # found a conda txt file
            object.__setattr__(self, "conda", read_txt(self.conda[0]))

        if len(self.pip) == 1 and self.pip[0].endswith(".txt") and \
                os.path.exists(self.pip[0]):
            # found a pip txt file
            object.__setattr__(self, "pip", read_txt(self.pip[0]))

    def all_installed(self, verbose=False):
        """Validate if all the dependencies are installed as requested

        Args:
          verbose: if True, display warnings if the dependencies are not installed

        Returns:
          (bool): True if all the required package versions are installed
            and False otherwise
        """
        norm = self.normalized()
        for pkg in list(norm.conda) + list(norm.pip):
            if not kconda.is_installed(pkg):
                if verbose:
                    pkg_name, req_version = kconda.version_split(pkg)
                    found_version = kconda.get_package_version(pkg_name)
                    if found_version is None:
                        print("Package '{}' is not installed".format(pkg_name))
                    else:
                        print("Installed package '{}={}' doesn't "
                              "comply with '{}'".format(pkg_name, found_version, pkg))
                return False
        return True

    def install_pip(self, dry_run=False):
        print("pip dependencies to be installed:")
        print(self.pip)
        if dry_run:
            return
        else:
            kconda.install_pip(self.pip)

    def install_conda(self, dry_run=False):
        print("Conda dependencies to be installed:")
        print(self.conda)
        if dry_run:
            return
        else:
            channels, packages = self._get_channels_packages()
            kconda.install_conda(packages, channels)

    def install(self, dry_run=False):
        self.install_conda(dry_run)
        self.install_pip(dry_run)

    def merge(self, dependencies):
        """Merge these dependencies with another set of dependencies

        Use case: merging the dependencies of model and dataloader

        Args:
          dependencies: Dependencies instance

        Returns:
          new Dependencies instance
        """
        return Dependencies(
            conda=unique_list(list(self.conda) + list(dependencies.conda)),
            pip=kconda.normalize_pip(list(self.pip) + list(dependencies.pip)),
            conda_channels=unique_list(list(self.conda_channels) + list(dependencies.conda_channels)))

    def normalized(self):
        """Normalize the list of dependencies
        """
        channels, packages = self._get_channels_packages()
        if isinstance(packages, related.types.TypedSequence):
            packages = packages.list
        if isinstance(channels, related.types.TypedSequence):
            channels = channels.list
        return Dependencies(
            conda=packages,
            pip=kconda.normalize_pip(list(self.pip)),
            conda_channels=channels)

    def _get_channels_packages(self):
        """Get conda channels and packages separated from each other (by '::')
        """
        if len(self.conda) == 0:
            return self.conda_channels, self.conda
        channels, packages = list(zip(*map(kconda.parse_conda_package, self.conda)))
        channels = unique_list(list(channels) + list(self.conda_channels))
        packages = unique_list(list(packages))

        # Handle channel order
        if "bioconda" in channels and "conda-forge" not in channels:
            # Insert 'conda-forge' right after 'bioconda' if it is not included
            channels.insert(channels.index("bioconda") + 1, "conda-forge")
        if "pysam" in packages and "bioconda" in channels:
            if channels.index("defaults") < channels.index("bioconda"):
                logger.warning("Swapping channel order - putting defaults last. "
                               "Using pysam from bioconda instead of anaconda")
                channels.remove("defaults")
                channels.insert(len(channels), "defaults")
        return channels, packages

    def to_env_dict(self, env_name):
        deps = self.normalized()
        channels, packages = deps._get_channels_packages()
        if isinstance(packages, related.types.TypedSequence):
            packages = packages.list
        if isinstance(channels, related.types.TypedSequence):
            channels = channels.list
        env_dict = OrderedDict(
            name=env_name,
            channels=channels,
            dependencies=packages + [OrderedDict(pip=kconda.normalize_pip(deps.pip))])
        return env_dict

    @classmethod
    def from_env_dict(cls, env_dict):
        cfg = {}
        cfg["conda_channels"] = env_dict['channels']
        cfg["conda"] = [el for el in env_dict['dependencies']
                        if not isinstance(el, OrderedDict)]
        pip = [el for el in env_dict['dependencies']
               if isinstance(el, OrderedDict)]
        if len(pip) == 1:
            cfg["pip"] = pip[0]['pip']
        elif len(pip) > 1:
            raise Exception("Malformatted conda environment yaml!")
        return cls.from_config(cfg)

    def to_env_file(self, env_name, path):
        """Dump the dependencies to a file
        """
        with open(path, 'w') as f:
            d = self.to_env_dict(env_name)

            # add python if not present
            add_py = True
            for dep in d['dependencies']:
                if isinstance(dep, str) and dep.startswith("python"):
                    add_py = False
            if add_py:
                d['dependencies'] = ["python"] + d['dependencies']
            # -----
            # remove fields that are empty
            out = []
            for k in d:
                if not (isinstance(d[k], list) and len(d[k]) == 0):
                    out.append((k, d[k]))
            # -----
            f.write(yaml_ordered_dump(OrderedDict(out),
                                      indent=2,
                                      default_flow_style=False))

    def gpu(self):
        """Get the GPU version of the dependencies
        """
        def replace_gpu(dep):
            if dep.startswith("tensorflow") and "gpu" not in dep:
                new_dep = dep.replace("tensorflow", "tensorflow-gpu")
                logger.info("use gpu: Replacing the dependency {0} with {1}".format(dep, new_dep))
                return new_dep
            if dep.startswith("pytorch-cpu"):
                new_dep = dep.replace("pytorch-cpu", "pytorch")
                logger.info("use gpu: Replacing the dependency {0} with {1}".format(dep, new_dep))
                return new_dep
            return dep

        deps = self.normalized()
        return Dependencies(
            conda=[replace_gpu(dep) for dep in deps.conda],
            pip=[replace_gpu(dep) for dep in deps.pip],
            conda_channels=deps.conda_channels)

    def osx(self):
        """Get the OSX-compatible dependencies
        """
        from sys import platform
        if platform != 'darwin':
            logger.warning("Calling osx dependency conversion on non-osx platform: {}".format(platform))

        def replace_osx(dep):
            if dep.startswith("pytorch-cpu"):
                new_dep = dep.replace("pytorch-cpu", "pytorch")
                logger.info("osx: Replacing the dependency {0} with {1}".format(dep, new_dep))
                return new_dep
            return dep

        deps = self.normalized()
        return Dependencies(
            conda=[replace_osx(dep) for dep in deps.conda],
            pip=[replace_osx(dep) for dep in deps.pip],
            conda_channels=deps.conda_channels)