def add_label(self, label_name: str, values: list = None, auxiliary_labels: list = None, positive_class=None):
    vals = list(values) if values else None

    if label_name in self._labels and self._labels[label_name] is not None and len(self._labels[label_name]) > 0:
        warnings.warn("Label " + label_name + " has already been set. Overriding existing values...", Warning)

    if positive_class is not None:
        # coerce positive_class to str when all label values are strings, so the membership check is type-consistent
        if all(isinstance(val, str) for val in values) and not isinstance(positive_class, str):
            positive_class = str(positive_class)
        ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')
    else:
        positive_class = self._get_default_positive_class(values)
        if positive_class:
            logging.info(f"LabelConfiguration: set default positive class '{positive_class}' for label {label_name}")

    self._labels[label_name] = Label(label_name, vals, auxiliary_labels, positive_class)
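
# NOTE: a minimal sketch of the validator pattern the functions in this file rely on. The real
# immuneML ParameterValidator has its own implementation and message format; the signature here is
# only inferred from the call sites below (value, valid_values, location, parameter_name), and the
# class name is a stand-in.
class SketchParameterValidator:

    @staticmethod
    def assert_in_valid_list(value, valid_values: list, location: str, parameter_name: str):
        # fail fast with a message naming the component ('location') and the offending parameter
        assert value in valid_values, \
            f"{location}: {value} is not a valid value for parameter {parameter_name}. Valid values are: {valid_values}."

# usage: SketchParameterValidator.assert_in_valid_list("HTML", ["HTML"], "OutputParser", "format")
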
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path) -> MLApplicationInstruction:
    location = MLApplicationParser.__name__
    ParameterValidator.assert_keys(instruction.keys(), ['type', 'dataset', 'number_of_processes', 'config_path', 'store_encoded_data'], location, key)
    ParameterValidator.assert_in_valid_list(instruction['dataset'], symbol_table.get_keys_by_type(SymbolType.DATASET), location, f"{key}: dataset")
    ParameterValidator.assert_type_and_value(instruction['number_of_processes'], int, location, f"{key}: number_of_processes", min_inclusive=1)
    ParameterValidator.assert_type_and_value(instruction['config_path'], str, location, f'{key}: config_path')
    ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, location, f'{key}: store_encoded_data')

    hp_setting, label = self._parse_hp_setting(instruction, path, key)

    instruction = MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']), name=key,
                                           number_of_processes=instruction['number_of_processes'],
                                           label_configuration=LabelConfiguration([label]),
                                           hp_setting=hp_setting,
                                           store_encoded_data=instruction['store_encoded_data'])

    return instruction
def __init__(self, k: int, skip_first_n_aa: int, skip_last_n_aa: int, abundance: str, normalize_all_features: bool, name: str = None):
    location = "AtchleyKmerEncoder"
    ParameterValidator.assert_type_and_value(k, int, location, "k", 1)
    ParameterValidator.assert_type_and_value(skip_first_n_aa, int, location, "skip_first_n_aa", 0)
    ParameterValidator.assert_type_and_value(skip_last_n_aa, int, location, "skip_last_n_aa", 0)
    ParameterValidator.assert_in_valid_list(abundance.upper(), [ab.name for ab in RelativeAbundanceType], location, "abundance")
    ParameterValidator.assert_type_and_value(normalize_all_features, bool, location, "normalize_all_features")

    self.k = k
    self.skip_first_n_aa = skip_first_n_aa
    self.skip_last_n_aa = skip_last_n_aa
    self.abundance = RelativeAbundanceType[abundance.upper()]
    self.normalize_all_features = normalize_all_features
    self.name = name
    self.scaler_path = None
    self.vectorizer_path = None
def _prepare_parameters(use_positional_info: bool, distance_to_seq_middle: int, flatten: bool, sequence_type: str, name: str = None):
    location = OneHotEncoder.__name__

    ParameterValidator.assert_type_and_value(use_positional_info, bool, location, "use_positional_info")
    if use_positional_info:
        ParameterValidator.assert_type_and_value(distance_to_seq_middle, int, location, "distance_to_seq_middle", min_inclusive=1)
    else:
        distance_to_seq_middle = None

    ParameterValidator.assert_type_and_value(flatten, bool, location, "flatten")
    ParameterValidator.assert_type_and_value(sequence_type, str, location, 'sequence_type')
    ParameterValidator.assert_in_valid_list(sequence_type.upper(), [item.name for item in SequenceType], location, 'sequence_type')

    return {"use_positional_info": use_positional_info,
            "distance_to_seq_middle": distance_to_seq_middle,
            "flatten": flatten,
            "sequence_type": SequenceType[sequence_type.upper()],
            "name": name}
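
# _prepare_parameters above normalizes a user-facing string into an enum member: validate
# value.upper() against the enum's member names, then index the enum by name. A self-contained
# illustration of that round trip (SequenceType itself lives in immuneML; this stand-in enum and
# helper are hypothetical):
from enum import Enum

class ExampleSequenceType(Enum):
    AMINO_ACID = "sequence_aa"
    NUCLEOTIDE = "sequence"

def to_sequence_type(value: str) -> ExampleSequenceType:
    valid = [item.name for item in ExampleSequenceType]
    assert value.upper() in valid, f"ExampleEncoder: {value} is not a valid sequence_type. Valid values are: {valid}."
    return ExampleSequenceType[value.upper()]  # name-based lookup, e.g. 'amino_acid' -> AMINO_ACID

# to_sequence_type("amino_acid") -> ExampleSequenceType.AMINO_ACID
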
def parse_object(specs, valid_class_names: list, class_name_ending: str, class_path: str, location: str, key: str,
                 builder: bool = False, return_params_dict: bool = False):
    class_name = ObjectParser.get_class_name(specs, valid_class_names, class_name_ending, location, key)
    ParameterValidator.assert_in_valid_list(class_name, valid_class_names, location, key)

    cls = ReflectionHandler.get_class_by_name(f"{class_name}{class_name_ending}", class_path)
    params = ObjectParser.get_all_params(specs, class_path, class_name, key)

    try:
        if "name" not in inspect.signature(cls.__init__).parameters.keys():
            params.pop("name", None)  # drop 'name' safely; del would raise KeyError if it is absent
        obj = cls.build_object(**params) if builder and hasattr(cls, "build_object") else cls(**params)
    except TypeError as err:
        raise AssertionError(f"{location}: invalid parameter {err.args[0]} when specifying parameters in {specs} "
                             f"under key {key}. Valid parameter names are: "
                             f"{list(inspect.signature(cls.__init__).parameters.keys())}")

    return (obj, {class_name: params}) if return_params_dict else obj
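
# parse_object resolves "<ClassName><Ending>" to a class object at runtime via ReflectionHandler.
# A hedged, dependency-free sketch of that idea using importlib (the real ReflectionHandler walks
# immuneML's package tree by relative path; the module-path convention here is made up for
# illustration):
import importlib

def get_class_by_name_sketch(class_name: str, module_path: str) -> type:
    # e.g. get_class_by_name_sketch("JSONDecoder", "json.decoder")
    module = importlib.import_module(module_path)
    assert hasattr(module, class_name), f"ReflectionSketch: {class_name} not found in {module_path}."
    return getattr(module, class_name)
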
def _parse_dataset(key: str, dataset_specs: dict, symbol_table: SymbolTable, result_path: Path) -> SymbolTable:
    location = "ImportParser"

    ParameterValidator.assert_keys(list(dataset_specs.keys()), ImportParser.valid_keys, location, f"datasets:{key}", False)

    valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataImport, "Import", "IO/dataset_import/")
    ParameterValidator.assert_in_valid_list(dataset_specs["format"], valid_formats, location, "format")

    import_cls = ReflectionHandler.get_class_by_name("{}Import".format(dataset_specs["format"]))
    params = ImportParser._prepare_params(dataset_specs, result_path, key)

    if "is_repertoire" in params:
        ParameterValidator.assert_type_and_value(params["is_repertoire"], bool, location, "is_repertoire")

        if params["is_repertoire"]:
            if import_cls != IReceptorImport:
                assert "metadata_file" in params, f"{location}: Missing parameter: metadata_file under {key}/params/"
                ParameterValidator.assert_type_and_value(params["metadata_file"], Path, location, "metadata_file")
        else:
            assert "paired" in params, f"{location}: Missing parameter: paired under {key}/params/"
            ParameterValidator.assert_type_and_value(params["paired"], bool, location, "paired")

            if params["paired"]:
                assert "receptor_chains" in params, f"{location}: Missing parameter: receptor_chains under {key}/params/"
                ParameterValidator.assert_in_valid_list(params["receptor_chains"], ["_".join(cp.value) for cp in ChainPair], location, "receptor_chains")

    try:
        dataset = import_cls.import_dataset(params, key)
        dataset.name = key
        symbol_table.add(key, SymbolType.DATASET, dataset)
    except KeyError as key_error:
        raise KeyError(f"{key_error}\n\nAn error occurred during parsing of dataset {key}. "
                       f"The keyword {key_error.args[0]} was missing. This either means this argument was "
                       f"not defined under definitions/datasets/{key}/params, or this column was missing from "
                       f"an input data file.")
    except Exception as ex:
        raise Exception(f"{ex}\n\nAn error occurred while parsing the dataset {key}. See the log above for more details.")

    return symbol_table
def parse(specs: dict, symbol_table: SymbolTable) -> dict:
    if "output" in specs:
        ParameterValidator.assert_keys(specs["output"], ["format"], "OutputParser", "output")
        ParameterValidator.assert_in_valid_list(specs["output"]["format"], ["HTML"], "OutputParser", "format")
    else:
        specs["output"] = {"format": "HTML"}

    symbol_table.add("output", SymbolType.OUTPUT, specs["output"])

    return specs["output"]
def _prepare_parameters(distance_metric: str, attributes_to_match: list, sequence_batch_size: int, context: dict = None):
    valid_metrics = [metric.name for metric in DistanceMetricType]
    ParameterValidator.assert_in_valid_list(distance_metric, valid_metrics, "DistanceEncoder", "distance_metric")

    return {"distance_metric": DistanceMetricType[distance_metric.upper()],
            "attributes_to_match": attributes_to_match,
            "sequence_batch_size": sequence_batch_size,
            "context": context}
def build_object(cls, **kwargs):
    ParameterValidator.assert_keys_present(list(kwargs.keys()), ['file_format', 'name'], DesignMatrixExporter.__name__, DesignMatrixExporter.__name__)
    ParameterValidator.assert_in_valid_list(kwargs['file_format'], ['npy', 'csv', 'npy.zip', 'csv.zip', 'hdf5.zip'], DesignMatrixExporter.__name__, 'file_format')

    return DesignMatrixExporter(**kwargs)
def build_object(cls, **kwargs):
    location = cls.__name__
    ParameterValidator.assert_keys(kwargs.keys(), ["filter_sequence_type", "batch_size", "count_agg"], location, "DuplicateSequenceFilter")
    ParameterValidator.assert_in_valid_list(kwargs["filter_sequence_type"].upper(), [item.name for item in SequenceType], location, "filter_sequence_type")
    ParameterValidator.assert_in_valid_list(kwargs["count_agg"].upper(), [item.name for item in CountAggregationFunction], location, "count_agg")
    ParameterValidator.assert_type_and_value(kwargs["batch_size"], int, location, "batch_size", 1)

    return DuplicateSequenceFilter(filter_sequence_type=SequenceType[kwargs["filter_sequence_type"].upper()],
                                   batch_size=kwargs["batch_size"],
                                   count_agg=CountAggregationFunction[kwargs["count_agg"].upper()])
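
# build_object above is a validate-then-construct factory: raw YAML kwargs are checked and
# normalized (string -> enum) before the constructor ever sees them, so the instance can trust its
# inputs. A minimal, self-contained sketch of the same pattern (this class and enum are hypothetical,
# not part of immuneML):
from enum import Enum

class AggMode(Enum):
    SUM = "sum"
    MAX = "max"

class SketchFilter:

    def __init__(self, mode: AggMode, batch_size: int):
        self.mode = mode
        self.batch_size = batch_size

    @classmethod
    def build_object(cls, **kwargs):
        assert set(kwargs.keys()) == {"mode", "batch_size"}, f"SketchFilter: unexpected keys {sorted(kwargs.keys())}."
        assert kwargs["mode"].upper() in [m.name for m in AggMode], f"SketchFilter: invalid mode {kwargs['mode']}."
        assert isinstance(kwargs["batch_size"], int) and kwargs["batch_size"] >= 1, "SketchFilter: batch_size must be a positive int."
        return cls(mode=AggMode[kwargs["mode"].upper()], batch_size=kwargs["batch_size"])

# usage: SketchFilter.build_object(mode="sum", batch_size=4)
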
def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:
    ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)
    valid_instructions = [cls[:-6] for cls in ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")]
    ParameterValidator.assert_in_valid_list(instruction["type"], valid_instructions, "InstructionParser", "type")

    default_params = DefaultParamsLoader.load("instructions/", instruction["type"])
    instruction = {**default_params, **instruction}
    parser = ReflectionHandler.get_class_by_name("{}Parser".format(instruction["type"]), "instruction_parsers/")()
    instruction_object = parser.parse(key, instruction, symbol_table, path)

    symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)

    return instruction, symbol_table
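
# The line `instruction = {**default_params, **instruction}` above merges loaded defaults with the
# user's spec; because later unpackings win, user-specified keys override the defaults. A worked
# example with made-up values:
default_params = {"number_of_processes": 1, "store_encoded_data": False}
user_spec = {"type": "MLApplication", "store_encoded_data": True}
merged = {**default_params, **user_spec}
# merged == {"number_of_processes": 1, "store_encoded_data": True, "type": "MLApplication"}
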
def add_label(self, label: str, values: list = None, auxiliary_labels: list = None, positive_class=None):
    vals = list(values) if values else None

    if label in self._labels and self._labels[label] is not None and len(self._labels[label]) > 0:
        warnings.warn("Label " + label + " has already been set. Overriding existing values...", Warning)

    if positive_class is not None:
        if all(isinstance(val, str) for val in values) and not isinstance(positive_class, str):
            positive_class = str(positive_class)
        ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')

    self._labels[label] = Label(label, vals, auxiliary_labels, positive_class)
def _prepare_parameters(normalization_type: str, reads: str, sequence_encoding: str, k: int = 0, k_left: int = 0,
                        k_right: int = 0, min_gap: int = 0, max_gap: int = 0, metadata_fields_to_include: list = None,
                        name: str = None, scale_to_unit_variance: bool = False, scale_to_zero_mean: bool = False,
                        sequence_type: str = None):
    location = KmerFrequencyEncoder.__name__

    ParameterValidator.assert_in_valid_list(normalization_type.upper(), [item.name for item in NormalizationType], location, "normalization_type")
    ParameterValidator.assert_in_valid_list(reads.upper(), [item.name for item in ReadsType], location, "reads")
    ParameterValidator.assert_in_valid_list(sequence_encoding.upper(), [item.name for item in SequenceEncodingType], location, "sequence_encoding")
    ParameterValidator.assert_type_and_value(scale_to_zero_mean, bool, location, "scale_to_zero_mean")
    ParameterValidator.assert_type_and_value(scale_to_unit_variance, bool, location, "scale_to_unit_variance")
    ParameterValidator.assert_type_and_value(sequence_type, str, location, 'sequence_type')
    ParameterValidator.assert_in_valid_list(sequence_type.upper(), [st.name for st in SequenceType], location, 'sequence_type')

    if "IMGT" in sequence_encoding.upper():
        assert sequence_type.upper() == SequenceType.AMINO_ACID.name, \
            f"{location}: for IMGT-based k-mer frequency encoding (here: {sequence_encoding.upper()}), " \
            f"sequence type has to be 'amino_acid'."

    vars_to_check = {"k": k, "k_left": k_left, "k_right": k_right, "min_gap": min_gap, "max_gap": max_gap}
    for param in vars_to_check.keys():
        ParameterValidator.assert_type_and_value(vars_to_check[param], int, location, param, min_inclusive=0)

    if "gap" in sequence_encoding.lower():
        assert k_left != 0 and k_right != 0, \
            f"KmerFrequencyEncoder: sequence encoding {sequence_encoding} was chosen, but k_left ({k_left}) " \
            f"or k_right ({k_right}) have to be set and larger than 0."

    return {"normalization_type": NormalizationType[normalization_type.upper()],
            "reads": ReadsType[reads.upper()],
            "sequence_encoding": SequenceEncodingType[sequence_encoding.upper()],
            "name": name,
            "scale_to_zero_mean": scale_to_zero_mean,
            "scale_to_unit_variance": scale_to_unit_variance,
            'sequence_type': SequenceType[sequence_type.upper()],
            **vars_to_check}
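
# Beyond per-value membership checks, _prepare_parameters above also enforces a cross-parameter
# constraint: gapped sequence encodings are only meaningful when both flank sizes are set. The
# minimal form of that dependency check, extracted for clarity (standalone helper, not immuneML API):
def check_gap_params(sequence_encoding: str, k_left: int, k_right: int):
    if "gap" in sequence_encoding.lower():
        assert k_left > 0 and k_right > 0, \
            f"gapped encoding {sequence_encoding} requires k_left ({k_left}) and k_right ({k_right}) to be larger than 0."
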
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path = None) -> SubsamplingInstruction:
    valid_keys = ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"]
    ParameterValidator.assert_keys(instruction.keys(), valid_keys, SubsamplingParser.__name__, key)

    dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
    ParameterValidator.assert_in_valid_list(instruction['dataset'], dataset_keys, SubsamplingParser.__name__, f'{key}/dataset')

    dataset = symbol_table.get(instruction['dataset'])
    ParameterValidator.assert_type_and_value(instruction['subsampled_dataset_sizes'], list, SubsamplingParser.__name__, f'{key}/subsampled_dataset_sizes')
    ParameterValidator.assert_all_type_and_value(instruction['subsampled_dataset_sizes'], int, SubsamplingParser.__name__,
                                                 f'{key}/subsampled_dataset_sizes', 1, dataset.get_example_count())

    valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, 'Exporter', "dataset_export/")
    ParameterValidator.assert_type_and_value(instruction['dataset_export_formats'], list, SubsamplingParser.__name__, f"{key}/dataset_export_formats")
    ParameterValidator.assert_all_in_valid_list(instruction['dataset_export_formats'], valid_export_formats, SubsamplingParser.__name__, f"{key}/dataset_export_formats")

    return SubsamplingInstruction(dataset=dataset,
                                  subsampled_dataset_sizes=instruction['subsampled_dataset_sizes'],
                                  dataset_export_formats=[ReflectionHandler.get_class_by_name(export_format + "Exporter", "dataset_export/")
                                                          for export_format in instruction['dataset_export_formats']],
                                  name=key)
def _prepare_parameters(vector_size: int, k: int, model_type: str, name: str = None):
    location = "Word2VecEncoder"
    ParameterValidator.assert_type_and_value(vector_size, int, location, "vector_size", min_inclusive=1)
    ParameterValidator.assert_type_and_value(k, int, location, "k", min_inclusive=1)
    ParameterValidator.assert_in_valid_list(model_type.upper(), [item.name for item in ModelType], location, "model_type")

    return {"vector_size": vector_size, "k": k, "model_type": ModelType[model_type.upper()], "name": name}
def _get_implanting_strategy(key: str, signal: dict) -> SignalImplantingStrategy:
    valid_strategies = [cls[:-10] for cls in ReflectionHandler.discover_classes_by_partial_name("Implanting", "simulation/signal_implanting_strategy/")]
    ParameterValidator.assert_in_valid_list(signal["implanting"], valid_strategies, "SignalParser", key)

    defaults = DefaultParamsLoader.load("signal_implanting_strategy/", f"{signal['implanting']}Implanting")
    signal = {**defaults, **signal}

    ParameterValidator.assert_keys_present(list(signal.keys()), ["motifs", "implanting", "sequence_position_weights"], SignalParser.__name__, key)

    implanting_comp = None
    if 'implanting_computation' in signal:
        implanting_comp = signal['implanting_computation'].lower()
        ParameterValidator.assert_in_valid_list(implanting_comp, [el.name.lower() for el in ImplantingComputation],
                                                SignalParser.__name__, 'implanting_computation')
        implanting_comp = ImplantingComputation[implanting_comp.upper()]

    implanting_strategy = ReflectionHandler.get_class_by_name(f"{signal['implanting']}Implanting")(GappedMotifImplanting(),
                                                                                                   signal["sequence_position_weights"],
                                                                                                   implanting_comp)

    return implanting_strategy
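
# _get_implanting_strategy treats implanting_computation as optional: when present, it is validated
# case-insensitively against the enum's lowercased member names, then looked up by uppercase name;
# when absent, None flows through. A compact form of that optional round trip (the enum and helper
# below are stand-ins, not immuneML API):
from enum import Enum, auto

class ExampleComputation(Enum):
    ROUND = auto()
    POISSON = auto()

def parse_optional_computation(signal: dict):
    if 'implanting_computation' not in signal:
        return None  # the key is optional; downstream code must accept None
    value = signal['implanting_computation'].lower()
    valid = [el.name.lower() for el in ExampleComputation]
    assert value in valid, f"SignalParser: {value} is not valid for implanting_computation. Valid values: {valid}."
    return ExampleComputation[value.upper()]
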