Example 1
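    # Registers a label on the configuration: warns when the label was already set, validates a
    # user-supplied positive_class against the allowed values, or falls back to a default positive class.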
    def add_label(self,
                  label_name: str,
                  values: list = None,
                  auxiliary_labels: list = None,
                  positive_class=None):

        vals = list(values) if values else None

        if label_name in self._labels and self._labels[
                label_name] is not None and len(self._labels[label_name]) > 0:
            warnings.warn(
                "Label " + label_name +
                " has already been set. Overriding existing values...",
                Warning)

        if positive_class is not None:
            if all(isinstance(val, str)
                   for val in values) and not isinstance(positive_class, str):
                positive_class = str(positive_class)
            ParameterValidator.assert_in_valid_list(positive_class, values,
                                                    Label.__name__,
                                                    'positive_class')
        else:
            positive_class = self._get_default_positive_class(values)
            if positive_class:
                logging.info(
                    f"LabelConfiguration: set default positive class '{positive_class}' for label {label_name}"
                )

        self._labels[label_name] = Label(label_name, vals, auxiliary_labels,
                                         positive_class)
Example 2
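    # Parses an ML application instruction: checks the allowed keys, the referenced dataset,
    # number_of_processes, config_path and store_encoded_data, then builds the MLApplicationInstruction.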
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable,
              path: Path) -> MLApplicationInstruction:
        location = MLApplicationParser.__name__
        ParameterValidator.assert_keys(instruction.keys(), [
            'type', 'dataset', 'number_of_processes', 'config_path',
            'store_encoded_data'
        ], location, key)
        ParameterValidator.assert_in_valid_list(
            instruction['dataset'],
            symbol_table.get_keys_by_type(SymbolType.DATASET), location,
            f"{key}: dataset")
        ParameterValidator.assert_type_and_value(
            instruction['number_of_processes'],
            int,
            location,
            f"{key}: number_of_processes",
            min_inclusive=1)
        ParameterValidator.assert_type_and_value(instruction['config_path'],
                                                 str, location,
                                                 f'{key}: config_path')
        ParameterValidator.assert_type_and_value(
            instruction['store_encoded_data'], bool, location,
            f'{key}: store_encoded_data')

        hp_setting, label = self._parse_hp_setting(instruction, path, key)

        instruction = MLApplicationInstruction(
            dataset=symbol_table.get(instruction['dataset']),
            name=key,
            number_of_processes=instruction['number_of_processes'],
            label_configuration=LabelConfiguration([label]),
            hp_setting=hp_setting,
            store_encoded_data=instruction['store_encoded_data'])

        return instruction
Example 3
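 # Validates the AtchleyKmerEncoder parameters (k, the numbers of amino acids to skip at the start
 # and end, the relative abundance type and the normalization flag) before storing them on the encoder.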
 def __init__(self,
              k: int,
              skip_first_n_aa: int,
              skip_last_n_aa: int,
              abundance: str,
              normalize_all_features: bool,
              name: str = None):
     location = "AtchleyKmerEncoder"
     ParameterValidator.assert_type_and_value(k, int, location, "k", 1)
     ParameterValidator.assert_type_and_value(skip_first_n_aa, int,
                                              location, "skip_first_n_aa",
                                              0)
     ParameterValidator.assert_type_and_value(skip_last_n_aa, int, location,
                                              "skip_last_n_aa", 0)
     ParameterValidator.assert_in_valid_list(
         abundance.upper(), [ab.name for ab in RelativeAbundanceType],
         location, "abundance")
     ParameterValidator.assert_type_and_value(normalize_all_features, bool,
                                              location,
                                              "normalize_all_features")
     self.k = k
     self.skip_first_n_aa = skip_first_n_aa
     self.skip_last_n_aa = skip_last_n_aa
     self.abundance = RelativeAbundanceType[abundance.upper()]
     self.normalize_all_features = normalize_all_features
     self.name = name
     self.scaler_path = None
     self.vectorizer_path = None
Example 4
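    # Validates the OneHotEncoder parameters: the positional-info flag, the distance to the sequence
    # middle (only when positional info is used), the flatten flag and the sequence type.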
    def _prepare_parameters(use_positional_info: bool,
                            distance_to_seq_middle: int,
                            flatten: bool,
                            sequence_type: str,
                            name: str = None):

        location = OneHotEncoder.__name__

        ParameterValidator.assert_type_and_value(use_positional_info, bool,
                                                 location,
                                                 "use_positional_info")
        if use_positional_info:
            ParameterValidator.assert_type_and_value(distance_to_seq_middle,
                                                     int,
                                                     location,
                                                     "distance_to_seq_middle",
                                                     min_inclusive=1)
        else:
            distance_to_seq_middle = None

        ParameterValidator.assert_type_and_value(flatten, bool, location,
                                                 "flatten")
        ParameterValidator.assert_type_and_value(sequence_type, str, location,
                                                 'sequence_type')
        ParameterValidator.assert_in_valid_list(
            sequence_type.upper(), [item.name for item in SequenceType],
            location, 'sequence_type')

        return {
            "use_positional_info": use_positional_info,
            "distance_to_seq_middle": distance_to_seq_middle,
            "flatten": flatten,
            "sequence_type": SequenceType[sequence_type.upper()],
            "name": name
        }
Example 5
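    # Resolves a class from the specification, validates its name, and instantiates it (via
    # build_object when available), turning a TypeError into a descriptive AssertionError.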
    def parse_object(specs,
                     valid_class_names: list,
                     class_name_ending: str,
                     class_path: str,
                     location: str,
                     key: str,
                     builder: bool = False,
                     return_params_dict: bool = False):
        class_name = ObjectParser.get_class_name(specs, valid_class_names,
                                                 class_name_ending, location,
                                                 key)
        ParameterValidator.assert_in_valid_list(class_name, valid_class_names,
                                                location, key)

        cls = ReflectionHandler.get_class_by_name(
            f"{class_name}{class_name_ending}", class_path)
        params = ObjectParser.get_all_params(specs, class_path, class_name,
                                             key)

        try:
            if "name" not in inspect.signature(cls.__init__).parameters.keys():
                del params["name"]
            obj = cls.build_object(
                **params) if builder and hasattr(cls, "build_object") else cls(
                    **params)
        except TypeError as err:
            raise AssertionError(
                f"{location}: invalid parameter {err.args[0]} when specifying parameters in {specs} "
                f"under key {key}. Valid parameter names are: "
                f"{[name for name in inspect.signature(cls.__init__).parameters.keys()]}"
            )

        return (obj, {class_name: params}) if return_params_dict else obj
Example 6
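    # Parses a dataset import specification: validates the keys, the import format and the
    # repertoire/receptor-specific parameters, then imports the dataset and adds it to the symbol table.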
    def _parse_dataset(key: str, dataset_specs: dict,
                       symbol_table: SymbolTable,
                       result_path: Path) -> SymbolTable:
        location = "ImportParser"

        ParameterValidator.assert_keys(list(dataset_specs.keys()),
                                       ImportParser.valid_keys, location,
                                       f"datasets:{key}", False)

        valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataImport, "Import", "IO/dataset_import/")
        ParameterValidator.assert_in_valid_list(dataset_specs["format"],
                                                valid_formats, location,
                                                "format")

        import_cls = ReflectionHandler.get_class_by_name("{}Import".format(
            dataset_specs["format"]))
        params = ImportParser._prepare_params(dataset_specs, result_path, key)

        if "is_repertoire" in params:
            ParameterValidator.assert_type_and_value(params["is_repertoire"],
                                                     bool, location,
                                                     "is_repertoire")

            if params["is_repertoire"] == True:
                if import_cls != IReceptorImport:
                    assert "metadata_file" in params, f"{location}: Missing parameter: metadata_file under {key}/params/"
                    ParameterValidator.assert_type_and_value(
                        params["metadata_file"], Path, location,
                        "metadata_file")

            if params["is_repertoire"] == False:
                assert "paired" in params, f"{location}: Missing parameter: paired under {key}/params/"
                ParameterValidator.assert_type_and_value(
                    params["paired"], bool, location, "paired")

                if params["paired"] == True:
                    assert "receptor_chains" in params, f"{location}: Missing parameter: receptor_chains under {key}/params/"
                    ParameterValidator.assert_in_valid_list(
                        params["receptor_chains"],
                        ["_".join(cp.value) for cp in ChainPair], location,
                        "receptor_chains")

        try:
            dataset = import_cls.import_dataset(params, key)
            dataset.name = key
            symbol_table.add(key, SymbolType.DATASET, dataset)
        except KeyError as key_error:
            raise KeyError(
                f"{key_error}\n\nAn error occurred during parsing of dataset {key}. "
                f"The keyword {key_error.args[0]} was missing. This either means this argument was "
                f"not defined under definitions/datasets/{key}/params, or this column was missing from "
                f"an input data file. ")
        except Exception as ex:
            raise Exception(
                f"{ex}\n\nAn error occurred while parsing the dataset {key}. See the log above for more details."
            )

        return symbol_table
Example 7
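    # Parses the output section of the specification, allowing only the HTML format and
    # defaulting to it when no output section is given.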
    def parse(specs: dict, symbol_table: SymbolTable) -> dict:
        if "output" in specs:
            ParameterValidator.assert_keys(specs["output"], ["format"], "OutputParser", "output")
            ParameterValidator.assert_in_valid_list(specs["output"]["format"], ["HTML"], "OutputParser", "format")
        else:
            specs["output"] = {"format": "HTML"}
        symbol_table.add("output", SymbolType.OUTPUT, specs["output"])

        return specs["output"]
Example 8
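    # Validates the distance metric against the supported DistanceMetricType values and
    # assembles the DistanceEncoder parameters.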
    def _prepare_parameters(distance_metric: str, attributes_to_match: list, sequence_batch_size: int, context: dict = None):
        valid_metrics = [metric.name for metric in DistanceMetricType]
        ParameterValidator.assert_in_valid_list(distance_metric, valid_metrics, "DistanceEncoder", "distance_metric")

        return {
            "distance_metric": DistanceMetricType[distance_metric.upper()],
            "attributes_to_match": attributes_to_match,
            "sequence_batch_size": sequence_batch_size,
            "context": context
        }
Example 9
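    # Builds a DesignMatrixExporter after checking that file_format and name are present and
    # that the requested format is one of the supported ones.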
    def build_object(cls, **kwargs):
        ParameterValidator.assert_keys_present(list(kwargs.keys()),
                                               ['file_format', 'name'],
                                               DesignMatrixExporter.__name__,
                                               DesignMatrixExporter.__name__)
        ParameterValidator.assert_in_valid_list(
            kwargs['file_format'],
            ['npy', 'csv', 'npy.zip', 'csv.zip', 'hdf5.zip'],
            DesignMatrixExporter.__name__, 'file_format')

        return DesignMatrixExporter(**kwargs)
Example 10
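 # Builds a DuplicateSequenceFilter: validates the sequence type, the count aggregation
 # function and the batch size before constructing the filter.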
 def build_object(cls, **kwargs):
     location = cls.__name__
     ParameterValidator.assert_keys(kwargs.keys(), ["filter_sequence_type", "batch_size", "count_agg"], location,
                                    "DuplicateSequenceFilter")
     ParameterValidator.assert_in_valid_list(kwargs["filter_sequence_type"].upper(), [item.name for item in SequenceType],
                                             location, "filter_sequence_type")
     ParameterValidator.assert_in_valid_list(kwargs["count_agg"].upper(), [item.name for item in CountAggregationFunction], location,
                                             "count_agg")
     ParameterValidator.assert_type_and_value(kwargs["batch_size"], int, location, "batch_size", 1)
     return DuplicateSequenceFilter(filter_sequence_type=SequenceType[kwargs["filter_sequence_type"].upper()],
                                    batch_size=kwargs["batch_size"], count_agg=CountAggregationFunction[kwargs["count_agg"].upper()])
Example 11
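    # Parses a single instruction: validates its type against the discovered instruction parsers,
    # merges in default parameters, delegates to the type-specific parser and registers the result.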
    def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:

        ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)
        valid_instructions = [cls[:-6] for cls in ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")]
        ParameterValidator.assert_in_valid_list(instruction["type"], valid_instructions, "InstructionParser", "type")

        default_params = DefaultParamsLoader.load("instructions/", instruction["type"])
        instruction = {**default_params, **instruction}
        parser = ReflectionHandler.get_class_by_name("{}Parser".format(instruction["type"]), "instruction_parsers/")()
        instruction_object = parser.parse(key, instruction, symbol_table, path)

        symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)
        return instruction, symbol_table
Example 12
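    # Registers a label on the configuration: warns when the label was already set and validates
    # a user-supplied positive_class against the allowed values.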
    def add_label(self, label: str, values: list = None, auxiliary_labels: list = None, positive_class=None):

        vals = list(values) if values else None

        if label in self._labels and self._labels[label] is not None and len(self._labels[label]) > 0:
            warnings.warn("Label " + label + " has already been set. Overriding existing values...", Warning)

        if positive_class is not None:
            if all(isinstance(val, str) for val in values) and not isinstance(positive_class, str):
                positive_class = str(positive_class)
            ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')

        self._labels[label] = Label(label, vals, auxiliary_labels, positive_class)
Example 13
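    # Validates the KmerFrequencyEncoder parameters: normalization, reads and sequence encoding types,
    # scaling flags, sequence type (amino_acid is required for IMGT-based encodings) and the k-mer/gap
    # sizes (k_left and k_right must be non-zero for gapped encodings).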
    def _prepare_parameters(normalization_type: str, reads: str, sequence_encoding: str, k: int = 0, k_left: int = 0,
                            k_right: int = 0, min_gap: int = 0, max_gap: int = 0, metadata_fields_to_include: list = None, name: str = None,
                            scale_to_unit_variance: bool = False, scale_to_zero_mean: bool = False, sequence_type: str = None):

        location = KmerFrequencyEncoder.__name__

        ParameterValidator.assert_in_valid_list(normalization_type.upper(), [item.name for item in NormalizationType], location, "normalization_type")
        ParameterValidator.assert_in_valid_list(reads.upper(), [item.name for item in ReadsType], location, "reads")
        ParameterValidator.assert_in_valid_list(sequence_encoding.upper(), [item.name for item in SequenceEncodingType], location, "sequence_encoding")
        ParameterValidator.assert_type_and_value(scale_to_zero_mean, bool, location, "scale_to_zero_mean")
        ParameterValidator.assert_type_and_value(scale_to_unit_variance, bool, location, "scale_to_unit_variance")
        ParameterValidator.assert_type_and_value(sequence_type, str, location, 'sequence_type')
        ParameterValidator.assert_in_valid_list(sequence_type.upper(), [st.name for st in SequenceType], location, 'sequence_type')

        if "IMGT" in sequence_encoding.upper():
            assert sequence_type.upper() == SequenceType.AMINO_ACID.name, f"{location}: for IMGT-based k-mer frequency encoding (here: " \
                                                                     f"{sequence_encoding.upper()}), sequence type has to be 'amino_acid'."

        vars_to_check = {"k": k, "k_left": k_left, "k_right": k_right, "min_gap": min_gap, "max_gap": max_gap}
        for param in vars_to_check.keys():
            ParameterValidator.assert_type_and_value(vars_to_check[param], int, location, param, min_inclusive=0)

        if "gap" in sequence_encoding.lower():
            assert k_left != 0 and k_right != 0, f"KmerFrequencyEncoder: sequence encoding {sequence_encoding} was chosen, but k_left " \
                                                 f"({k_left}) or k_right ({k_right}) have to be set and larger than 0."

        return {
            "normalization_type": NormalizationType[normalization_type.upper()],
            "reads": ReadsType[reads.upper()],
            "sequence_encoding": SequenceEncodingType[sequence_encoding.upper()],
            "name": name,
            "scale_to_zero_mean": scale_to_zero_mean, "scale_to_unit_variance": scale_to_unit_variance,
            'sequence_type': SequenceType[sequence_type.upper()],
            **vars_to_check
        }
Example 14
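    # Parses a subsampling instruction: validates the keys, the referenced dataset, the requested
    # subsample sizes and the export formats, then builds the SubsamplingInstruction.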
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: Path = None) -> SubsamplingInstruction:
        valid_keys = [
            "type", "dataset", "subsampled_dataset_sizes",
            "dataset_export_formats"
        ]
        ParameterValidator.assert_keys(instruction.keys(), valid_keys,
                                       SubsamplingParser.__name__, key)

        dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
        ParameterValidator.assert_in_valid_list(instruction['dataset'],
                                                dataset_keys,
                                                SubsamplingParser.__name__,
                                                f'{key}/dataset')

        dataset = symbol_table.get(instruction['dataset'])
        ParameterValidator.assert_type_and_value(
            instruction['subsampled_dataset_sizes'], list,
            SubsamplingParser.__name__, f'{key}/subsampled_dataset_sizes')
        ParameterValidator.assert_all_type_and_value(
            instruction['subsampled_dataset_sizes'], int,
            SubsamplingParser.__name__, f'{key}/subsampled_dataset_sizes', 1,
            dataset.get_example_count())

        valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, 'Exporter', "dataset_export/")
        ParameterValidator.assert_type_and_value(
            instruction['dataset_export_formats'], list,
            SubsamplingParser.__name__, f"{key}/dataset_export_formats")
        ParameterValidator.assert_all_in_valid_list(
            instruction['dataset_export_formats'], valid_export_formats,
            SubsamplingParser.__name__, f"{key}/dataset_export_formats")

        return SubsamplingInstruction(
            dataset=dataset,
            subsampled_dataset_sizes=instruction['subsampled_dataset_sizes'],
            dataset_export_formats=[
                ReflectionHandler.get_class_by_name(export_format + "Exporter",
                                                    "dataset_export/")
                for export_format in instruction['dataset_export_formats']
            ],
            name=key)
Example 15
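 # Validates the Word2VecEncoder parameters: vector size, k and model type.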
 def _prepare_parameters(vector_size: int,
                         k: int,
                         model_type: str,
                         name: str = None):
     location = "Word2VecEncoder"
     ParameterValidator.assert_type_and_value(vector_size,
                                              int,
                                              location,
                                              "vector_size",
                                              min_inclusive=1)
     ParameterValidator.assert_type_and_value(k,
                                              int,
                                              location,
                                              "k",
                                              min_inclusive=1)
     ParameterValidator.assert_in_valid_list(
         model_type.upper(), [item.name for item in ModelType], location,
         "model_type")
     return {
         "vector_size": vector_size,
         "k": k,
         "model_type": ModelType[model_type.upper()],
         "name": name
     }
Example 16
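    # Resolves the signal implanting strategy: validates the strategy name and the required keys,
    # merges in default parameters, optionally validates the implanting computation, and
    # instantiates the strategy.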
    def _get_implanting_strategy(key: str,
                                 signal: dict) -> SignalImplantingStrategy:

        valid_strategies = [
            cls[:-10]
            for cls in ReflectionHandler.discover_classes_by_partial_name(
                "Implanting", "simulation/signal_implanting_strategy/")
        ]
        ParameterValidator.assert_in_valid_list(signal["implanting"],
                                                valid_strategies,
                                                "SignalParser", key)

        defaults = DefaultParamsLoader.load(
            "signal_implanting_strategy/", f"{signal['implanting']}Implanting")
        signal = {**defaults, **signal}

        ParameterValidator.assert_keys_present(
            list(signal.keys()),
            ["motifs", "implanting", "sequence_position_weights"],
            SignalParser.__name__, key)

        implanting_comp = None
        if 'implanting_computation' in signal:
            implanting_comp = signal['implanting_computation'].lower()
            ParameterValidator.assert_in_valid_list(
                implanting_comp,
                [el.name.lower() for el in ImplantingComputation],
                SignalParser.__name__, 'implanting_computation')
            implanting_comp = ImplantingComputation[implanting_comp.upper()]

        implanting_strategy = ReflectionHandler.get_class_by_name(
            f"{signal['implanting']}Implanting")(
                GappedMotifImplanting(), signal["sequence_position_weights"],
                implanting_comp)

        return implanting_strategy
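
All of the examples above funnel their checks through ParameterValidator. As a reading aid, here is a minimal hypothetical sketch of the contract the two most common calls assume, with signatures inferred from the call sites above; the real ParameterValidator implementation may differ.

class ParameterValidator:
    # Hypothetical sketch inferred from the call sites above; not the library's actual code.

    @staticmethod
    def assert_in_valid_list(value, valid_values: list, location: str, parameter_name: str):
        # Fail with a descriptive message when the value is not among the allowed ones.
        assert value in valid_values, \
            f"{location}: {value} is not a valid value for {parameter_name}. Valid values are: {valid_values}."

    @staticmethod
    def assert_type_and_value(value, expected_type, location: str, parameter_name: str,
                              min_inclusive=None, max_inclusive=None):
        # Check the type first, then the optional inclusive bounds used for numeric parameters.
        assert isinstance(value, expected_type), \
            f"{location}: {parameter_name} must be of type {expected_type.__name__}, got {type(value).__name__}."
        if min_inclusive is not None:
            assert value >= min_inclusive, f"{location}: {parameter_name} must be at least {min_inclusive}."
        if max_inclusive is not None:
            assert value <= max_inclusive, f"{location}: {parameter_name} must be at most {max_inclusive}."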