Example #1
    def _prepare_parameters(reference: dict,
                            max_edit_distances: dict,
                            name: str = None):
        location = "MatchedReceptorsEncoder"

        legal_chains = [
            chain
            for receptor in (TCABReceptor(), TCGDReceptor(), BCReceptor())
            for chain in receptor.get_chains()
        ]

        # A single int is shorthand: use the same maximum edit distance
        # for every legal chain.
        if isinstance(max_edit_distances, int):
            max_edit_distances = {
                chain: max_edit_distances
                for chain in legal_chains
            }
        elif isinstance(max_edit_distances, dict):
            ParameterValidator.assert_keys(max_edit_distances.keys(),
                                           legal_chains,
                                           location,
                                           "max_edit_distances",
                                           exclusive=False)
        else:
            # Neither int nor dict: this assertion necessarily fails and
            # reports a descriptive type error to the user.
            ParameterValidator.assert_type_and_value(max_edit_distances, dict,
                                                     location,
                                                     'max_edit_distances')

        reference_receptors = MatchedReferenceUtil.prepare_reference(
            reference, location=location, paired=True)

        return {
            "reference_receptors": reference_receptors,
            "max_edit_distances": max_edit_distances,
            "name": name
        }
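The int-or-dict normalization at the top of this method is a useful pattern on its own. A minimal standalone sketch of the same idea (the chain names below are placeholders, not immuneML's actual chain identifiers):

    def normalize_per_chain(value, legal_chains: list) -> dict:
        # A single int is shorthand for "the same value for every chain".
        if isinstance(value, int):
            return {chain: value for chain in legal_chains}
        # A dict may not mention chains outside the legal set.
        if isinstance(value, dict):
            unknown = set(value) - set(legal_chains)
            if unknown:
                raise KeyError(f"unknown chains: {sorted(unknown)}")
            return value
        raise TypeError(f"expected int or dict, got {type(value).__name__}")

    print(normalize_per_chain(1, ["ALPHA", "BETA"]))  # {'ALPHA': 1, 'BETA': 1}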
Example #2
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: str = None) -> DatasetExportInstruction:
        location = "DatasetExportParser"
        ParameterValidator.assert_keys(list(instruction.keys()),
                                       DatasetExportParser.VALID_KEYS,
                                       location, key)
        valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, "Exporter", 'dataset_export/')
        ParameterValidator.assert_all_in_valid_list(
            instruction["export_formats"], valid_formats, location,
            "export_formats")
        ParameterValidator.assert_all_in_valid_list(
            instruction["datasets"],
            symbol_table.get_keys_by_type(SymbolType.DATASET), location,
            "datasets")

        return DatasetExportInstruction(
            datasets=[
                symbol_table.get(dataset_key)
                for dataset_key in instruction["datasets"]
            ],
            exporters=[
                # format_name avoids shadowing the method's key parameter,
                # which is reused below as the instruction name.
                ReflectionHandler.get_class_by_name(f"{format_name}Exporter",
                                                    "dataset_export/")
                for format_name in instruction["export_formats"]
            ],
            name=key)
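ReflectionHandler.get_class_by_name resolves a class from the "<format>Exporter" naming convention. A purely hypothetical stand-in for the idea, not immuneML's actual reflection machinery:

    import importlib

    def get_class_by_name(class_name: str, package: str):
        # Assumes one module per class, named after the class itself,
        # e.g. dataset_export.AIRRExporter.AIRRExporter.
        module = importlib.import_module(f"{package}.{class_name}")
        return getattr(module, class_name)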
Example #3
    def _check_specs(self, workflow_specification):
        location = 'MultiDatasetBenchmarkTool'
        ParameterValidator.assert_keys(
            workflow_specification.keys(),
            ['definitions', 'instructions', 'output'], location,
            'YAML specification')

        self._check_dataset_specs(workflow_specification, location)
        self._check_instruction_specs(workflow_specification, location)
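Nearly every example here leans on ParameterValidator.assert_keys. Inferred from the call sites alone (a sketch, not the actual immuneML implementation): with the default exclusive=True the supplied keys must match the valid keys exactly, while exclusive=False only forbids unknown keys:

    def assert_keys(keys, valid_keys, location: str, parameter_name: str,
                    exclusive: bool = True):
        keys, valid_keys = set(keys), set(valid_keys)
        assert keys <= valid_keys, \
            f"{location}: unknown keys under {parameter_name}: {sorted(keys - valid_keys)}"
        if exclusive:
            assert keys == valid_keys, \
                f"{location}: missing keys under {parameter_name}: {sorted(valid_keys - keys)}"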
Example #4
    def parse(specs: dict, symbol_table: SymbolTable) -> dict:
        if "output" in specs:
            ParameterValidator.assert_keys(specs["output"], ["format"],
                                           "OutputParser", "output")
            ParameterValidator.assert_in_valid_list(specs["output"]["format"],
                                                    ["HTML"], "OutputParser",
                                                    "format")
        else:
            specs["output"] = {"format": "HTML"}
        symbol_table.add("output", SymbolType.OUTPUT, specs["output"])

        return specs["output"]
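Because the default is only inserted when the key is absent, the if/else above can be collapsed with dict.setdefault. A behavior-equivalent sketch of the same function:

    def parse(specs: dict, symbol_table: SymbolTable) -> dict:
        # setdefault returns the existing value, or inserts and returns the default.
        output = specs.setdefault("output", {"format": "HTML"})
        ParameterValidator.assert_keys(output, ["format"], "OutputParser", "output")
        ParameterValidator.assert_in_valid_list(output["format"], ["HTML"],
                                                "OutputParser", "format")
        symbol_table.add("output", SymbolType.OUTPUT, output)
        return output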
Example #5
    def _parse_settings(self, instruction: dict,
                        symbol_table: SymbolTable) -> list:
        try:
            settings = []
            for index, setting in enumerate(instruction["settings"]):
                if "preprocessing" in setting:
                    ParameterValidator.assert_type_and_value(
                        setting["preprocessing"], str,
                        TrainMLModelParser.__name__, f'settings: {index+1}. '
                        f'element: preprocessing')
                    if symbol_table.contains(setting["preprocessing"]):
                        preprocessing_sequence = symbol_table.get(
                            setting["preprocessing"])
                        preproc_name = setting["preprocessing"]
                    else:
                        raise KeyError(
                            f"{TrainMLModelParser.__name__}: preprocessing was set in the TrainMLModel instruction to value "
                            f"{setting['preprocessing']}, but no such preprocessing was defined in the specification under "
                            f"definitions: {PreprocessingParser.keyword}.")
                else:
                    setting["preprocessing"] = None
                    preprocessing_sequence = []
                    preproc_name = None

                ParameterValidator.assert_keys(
                    setting.keys(), ["preprocessing", "ml_method", "encoding"],
                    TrainMLModelParser.__name__,
                    f"settings, {index + 1}. entry")

                encoder = symbol_table.get(setting["encoding"]).build_object(symbol_table.get(instruction["dataset"]),
                                                                             **symbol_table.get_config(setting["encoding"])["encoder_params"])\
                    .set_context({"dataset": symbol_table.get(instruction['dataset'])})

                s = HPSetting(encoder=encoder,
                              encoder_name=setting["encoding"],
                              encoder_params=symbol_table.get_config(
                                  setting["encoding"])["encoder_params"],
                              ml_method=symbol_table.get(setting["ml_method"]),
                              ml_method_name=setting["ml_method"],
                              ml_params=symbol_table.get_config(
                                  setting["ml_method"]),
                              preproc_sequence=preprocessing_sequence,
                              preproc_sequence_name=preproc_name)
                settings.append(s)
            return settings
        except KeyError as key_error:
            raise KeyError(
                f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under settings in TrainMLModel instruction."
            )
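One detail of the except clause: re-raising inside an except block implicitly chains the original KeyError as __context__; an explicit `from` would mark it as the direct cause. A sketch of the pattern in isolation:

    def read_setting(setting: dict, field: str):
        try:
            return setting[field]
        except KeyError as key_error:
            raise KeyError(
                f"parameter {key_error.args[0]} was not defined under settings"
            ) from key_error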
Example #6
    def parse_signals(signals: dict, symbol_table: SymbolTable):
        for key, signal_spec in signals.items():

            ParameterValidator.assert_keys_present(signal_spec.keys(), SignalParser.VALID_KEYS, "SignalParser", key)

            implanting_strategy = SignalParser._get_implanting_strategy(key, signal_spec)

            ParameterValidator.assert_keys(signal_spec["motifs"], symbol_table.get_keys_by_type(SymbolType.MOTIF), "SignalParser",
                                           f"motifs in signal {key}", False)

            signal_motifs = [symbol_table.get(motif_id) for motif_id in signal_spec["motifs"]]
            signal = Signal(key, signal_motifs, implanting_strategy)
            symbol_table.add(key, SymbolType.SIGNAL, signal)

        return symbol_table, signals
Example #7
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str) -> MLApplicationInstruction:
        location = MLApplicationParser.__name__
        ParameterValidator.assert_keys(instruction.keys(), ['type', 'dataset', 'label', 'pool_size', 'config_path', 'store_encoded_data'], location, key)
        ParameterValidator.assert_in_valid_list(instruction['dataset'], symbol_table.get_keys_by_type(SymbolType.DATASET), location, f"{key}: dataset")
        ParameterValidator.assert_type_and_value(instruction['pool_size'], int, location, f"{key}: pool_size", min_inclusive=1)
        ParameterValidator.assert_type_and_value(instruction['label'], str, location, f'{key}: label')
        ParameterValidator.assert_type_and_value(instruction['config_path'], str, location, f'{key}: config_path')
        ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, location, f'{key}: store_encoded_data')

        hp_setting, label = self._parse_hp_setting(instruction, path, key)

        # Returning the new object directly avoids rebinding the instruction
        # parameter, which is still being read while building the arguments.
        return MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']), name=key, pool_size=instruction['pool_size'],
                                        label_configuration=LabelConfiguration([label]), hp_setting=hp_setting,
                                        store_encoded_data=instruction['store_encoded_data'])
Example #8
    def parse_motifs(motifs: dict, symbol_table: SymbolTable):

        valid_motif_keys = [
            "seed", "instantiation", "seed_chain1", "seed_chain2",
            "name_chain1", "name_chain2"
        ]
        for key in motifs.keys():

            ParameterValidator.assert_keys(motifs[key].keys(),
                                           valid_motif_keys,
                                           "MotifParser",
                                           key,
                                           exclusive=False)

            motif = MotifParser._parse_motif(key, motifs[key])
            symbol_table.add(key, SymbolType.MOTIF, motif)

        return symbol_table, motifs
Example #9
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: str = None) -> ExploratoryAnalysisInstruction:
        exp_analysis_units = {}

        ParameterValidator.assert_keys(instruction, ["analyses", "type"],
                                       "ExploratoryAnalysisParser",
                                       "ExploratoryAnalysis")
        for analysis_key, analysis in instruction["analyses"].items():

            params = self._prepare_params(analysis, symbol_table)
            exp_analysis_units[analysis_key] = ExploratoryAnalysisUnit(
                **params)

        process = ExploratoryAnalysisInstruction(
            exploratory_analysis_units=exp_analysis_units, name=key)
        return process
Example #10
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str = None) -> SubsamplingInstruction:

        valid_keys = ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"]
        ParameterValidator.assert_keys(instruction.keys(), valid_keys, SubsamplingParser.__name__, key)

        dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
        ParameterValidator.assert_in_valid_list(instruction['dataset'], dataset_keys, SubsamplingParser.__name__, f'{key}/dataset')

        dataset = symbol_table.get(instruction['dataset'])
        ParameterValidator.assert_type_and_value(instruction['subsampled_dataset_sizes'], list, SubsamplingParser.__name__, f'{key}/subsampled_dataset_sizes')
        ParameterValidator.assert_all_type_and_value(instruction['subsampled_dataset_sizes'], int, SubsamplingParser.__name__,
                                                     f'{key}/subsampled_dataset_sizes', 1, dataset.get_example_count())

        valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, 'Exporter', "dataset_export/")
        ParameterValidator.assert_type_and_value(instruction['dataset_export_formats'], list, SubsamplingParser.__name__, f"{key}/dataset_export_formats")
        ParameterValidator.assert_all_in_valid_list(instruction['dataset_export_formats'], valid_export_formats, SubsamplingParser.__name__, f"{key}/dataset_export_formats")

        return SubsamplingInstruction(dataset=dataset, subsampled_dataset_sizes=instruction['subsampled_dataset_sizes'],
                                      dataset_export_formats=[ReflectionHandler.get_class_by_name(export_format + "Exporter", "dataset_export/")
                                                              for export_format in instruction['dataset_export_formats']], name=key)
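Reading the bounded call above: assert_all_type_and_value receives 1 and dataset.get_example_count() as limits, so every subsampled size must be an int within that range. A hedged restatement of the check:

    sizes = instruction['subsampled_dataset_sizes']
    assert all(isinstance(s, int) and 1 <= s <= dataset.get_example_count()
               for s in sizes), f"{key}/subsampled_dataset_sizes out of range"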
Example #11
    def build_object(cls, **kwargs):
        location = cls.__name__
        ParameterValidator.assert_keys(
            kwargs.keys(), ["filter_sequence_type", "batch_size", "count_agg"],
            location, "DuplicateSequenceFilter")
        ParameterValidator.assert_in_valid_list(
            kwargs["filter_sequence_type"].upper(),
            [item.name for item in SequenceType], location,
            "filter_sequence_type")
        ParameterValidator.assert_in_valid_list(
            kwargs["count_agg"].upper(),
            [item.name
             for item in CountAggregationFunction], location, "count_agg")
        ParameterValidator.assert_type_and_value(kwargs["batch_size"], int,
                                                 location, "batch_size", 1)
        return DuplicateSequenceFilter(
            filter_sequence_type=SequenceType[
                kwargs["filter_sequence_type"].upper()],
            batch_size=kwargs["batch_size"],
            count_agg=CountAggregationFunction[kwargs["count_agg"].upper()])
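The return statement indexes enums by member name, e.g. SequenceType[...]. Because the names were just validated against [item.name for item in SequenceType], the lookups cannot raise KeyError. A self-contained sketch with hypothetical members:

    from enum import Enum

    class SequenceType(Enum):  # hypothetical members, for illustration only
        AMINO_ACID = "amino_acid"
        NUCLEOTIDE = "nucleotide"

    name = "amino_acid".upper()
    assert name in [item.name for item in SequenceType]
    print(SequenceType[name])  # SequenceType.AMINO_ACID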
Example #12
    def _prepare_params(self, analysis: dict,
                        symbol_table: SymbolTable) -> dict:

        valid_keys = [
            "dataset", "report", "preprocessing_sequence", "labels",
            "encoding", "number_of_processes"
        ]
        ParameterValidator.assert_keys(list(analysis.keys()), valid_keys,
                                       "ExploratoryAnalysisParser", "analysis",
                                       False)

        params = {
            "dataset": symbol_table.get(analysis["dataset"]),
            "report": copy.deepcopy(symbol_table.get(analysis["report"]))
        }

        optional_params = self._prepare_optional_params(analysis, symbol_table)
        params = {**params, **optional_params}

        return params
Example #13
    def _parse_dataset(key: str, dataset_specs: dict, symbol_table: SymbolTable, result_path: str) -> SymbolTable:
        location = "ImportParser"

        ParameterValidator.assert_keys(list(dataset_specs.keys()), ImportParser.valid_keys, location, f"datasets:{key}", False)

        valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataImport, "Import", "IO/dataset_import/")
        ParameterValidator.assert_in_valid_list(dataset_specs["format"], valid_formats, location, "format")

        import_cls = ReflectionHandler.get_class_by_name("{}Import".format(dataset_specs["format"]))
        params = ImportParser._prepare_params(dataset_specs, result_path, key)

        if "is_repertoire" in params:
            ParameterValidator.assert_type_and_value(params["is_repertoire"], bool, location, "is_repertoire")

            if params["is_repertoire"]:
                # Repertoire datasets require a metadata file, except for IReceptorImport.
                if import_cls != IReceptorImport:
                    assert "metadata_file" in params, f"{location}: Missing parameter: metadata_file under {key}/params/"
                    ParameterValidator.assert_type_and_value(params["metadata_file"], str, location, "metadata_file")
            else:
                # Sequence/receptor datasets must state whether chains are paired.
                assert "paired" in params, f"{location}: Missing parameter: paired under {key}/params/"
                ParameterValidator.assert_type_and_value(params["paired"], bool, location, "paired")

                if params["paired"]:
                    assert "receptor_chains" in params, f"{location}: Missing parameter: receptor_chains under {key}/params/"
                    ParameterValidator.assert_in_valid_list(params["receptor_chains"], ["_".join(cp.value) for cp in ChainPair], location, "receptor_chains")

        try:
            dataset = import_cls.import_dataset(params, key)
            dataset.name = key
            symbol_table.add(key, SymbolType.DATASET, dataset)
        except KeyError as key_error:
            raise KeyError(f"{key_error}\n\nAn error occurred during parsing of dataset {key}. "
                           f"The keyword {key_error.args[0]} was missing. This either means this argument was "
                           f"not defined under definitions/datasets/{key}/params, or this column was missing from "
                           f"an input data file. ")
        except Exception as ex:
            raise Exception(f"{ex}\n\nAn error occurred while parsing the dataset {key}. See the log above for more details.")

        return symbol_table
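The nested checks encode conditional requirements: metadata_file only matters for repertoire datasets, paired only for non-repertoire ones, and receptor_chains only when paired. A compact restatement with a hypothetical helper (ignoring the IReceptorImport exemption):

    def conditionally_required(params: dict) -> list:
        # Returns which extra keys the branching above would demand.
        if "is_repertoire" not in params:
            return []
        if params["is_repertoire"]:
            return ["metadata_file"]
        if params.get("paired"):
            return ["paired", "receptor_chains"]
        return ["paired"]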
Example #14
    def build_object(cls, **kwargs):

        ParameterValidator.assert_keys(
            kwargs.keys(),
            ['reference_path', 'comparison_attributes', 'name', 'label'],
            ReferenceSequenceOverlap.__name__,
            f"reports: {kwargs['name'] if 'name' in kwargs else ''}")

        assert os.path.isfile(kwargs['reference_path']), f"{ReferenceSequenceOverlap.__name__}: 'reference_path' for report {kwargs['name']} is not " \
                                                         f"a valid file path."

        reference_sequences_df = pd.read_csv(kwargs['reference_path'])
        attributes = reference_sequences_df.columns.tolist()

        ParameterValidator.assert_keys_present(
            expected_values=kwargs['comparison_attributes'],
            values=attributes,
            location=ReferenceSequenceOverlap.__name__,
            parameter_name='columns in file under reference_path')

        return ReferenceSequenceOverlap(**kwargs)
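Here assert_keys_present runs in the opposite direction from assert_keys: every requested comparison attribute must appear among the CSV's columns, while extra columns are fine. A sketch of that containment check:

    missing = [attr for attr in kwargs['comparison_attributes']
               if attr not in reference_sequences_df.columns]
    assert not missing, f"columns missing from reference file: {missing}"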
Example #15
    def prepare_reference(reference_params: dict, location: str, paired: bool):
        ParameterValidator.assert_keys(list(reference_params.keys()),
                                       ["format", "params"], location,
                                       "reference")

        seq_import_params = reference_params["params"] if "params" in reference_params else {}

        assert os.path.isfile(seq_import_params["path"]), f"{location}: the file {seq_import_params['path']} does not exist. " \
                                                          f"Specify the correct path under reference."

        if "paired" in seq_import_params:
            assert seq_import_params["paired"] == paired, f"{location}: paired must be {paired} for SequenceImport"
        else:
            seq_import_params["paired"] = paired

        format_str = reference_params["format"]

        if format_str == "IRIS":  # todo refactor this when refactoring IRISSequenceImport
            receptors = IRISSequenceImport.import_items(**seq_import_params)
        else:
            import_class = ReflectionHandler.get_class_by_name(
                "{}Import".format(format_str))
            params = DefaultParamsLoader.load(
                EnvironmentSettings.default_params_path + "datasets/",
                DefaultParamsLoader.convert_to_snake_case(format_str))
            for key, value in seq_import_params.items():
                params[key] = value
            params["paired"] = paired

            processed_params = DatasetImportParams.build_object(**params)

            receptors = ImportHelper.import_items(
                import_class, reference_params["params"]["path"],
                processed_params)

        return receptors
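The key-by-key copy into params followed by the paired override is a plain dict merge; in modern Python the three statements collapse to one:

    params = {**params, **seq_import_params, "paired": paired}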
Example #16
    def _parse_simulation(key: str, simulation: dict,
                          symbol_table: SymbolTable) -> SymbolTable:

        location = "SimulationParser"
        valid_implanting_keys = [
            "dataset_implanting_rate", "repertoire_implanting_rate", "signals",
            "is_noise"
        ]
        implantings = []

        for impl_key, implanting in simulation.items():

            ParameterValidator.assert_keys(implanting.keys(),
                                           valid_implanting_keys,
                                           location,
                                           impl_key,
                                           exclusive=False)
            ParameterValidator.assert_keys(
                implanting["signals"],
                symbol_table.get_keys_by_type(SymbolType.SIGNAL), location,
                impl_key, False)

            implanting_params = copy.deepcopy(implanting)
            implanting_params["signals"] = [
                symbol_table.get(signal) for signal in implanting["signals"]
            ]
            implanting_params["name"] = impl_key

            implantings.append(Implanting(**implanting_params))

        assert sum([settings["dataset_implanting_rate"] for settings in simulation.values()]) <= 1, \
            "The total dataset implanting rate can not exceed 1."

        symbol_table.add(key, SymbolType.SIMULATION, Simulation(implantings))

        return symbol_table
Example #17
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: str = None) -> TrainMLModelInstruction:

        valid_keys = [
            "assessment", "selection", "dataset", "strategy", "labels",
            "metrics", "settings", "number_of_processes", "type", "reports",
            "optimization_metric", 'refit_optimal_model', 'store_encoded_data'
        ]
        ParameterValidator.assert_type_and_value(instruction['settings'], list,
                                                 TrainMLModelParser.__name__,
                                                 'settings')
        ParameterValidator.assert_keys(list(instruction.keys()), valid_keys,
                                       TrainMLModelParser.__name__,
                                       "TrainMLModel")
        ParameterValidator.assert_type_and_value(
            instruction['refit_optimal_model'], bool,
            TrainMLModelParser.__name__, 'refit_optimal_model')
        ParameterValidator.assert_type_and_value(instruction['metrics'], list,
                                                 TrainMLModelParser.__name__,
                                                 'metrics')
        ParameterValidator.assert_type_and_value(
            instruction['optimization_metric'], str,
            TrainMLModelParser.__name__, 'optimization_metric')
        ParameterValidator.assert_type_and_value(
            instruction['number_of_processes'], int,
            TrainMLModelParser.__name__, 'number_of_processes')
        ParameterValidator.assert_type_and_value(instruction['strategy'], str,
                                                 TrainMLModelParser.__name__,
                                                 'strategy')
        ParameterValidator.assert_type_and_value(
            instruction['store_encoded_data'], bool,
            TrainMLModelParser.__name__, 'store_encoded_data')

        settings = self._parse_settings(instruction, symbol_table)
        dataset = symbol_table.get(instruction["dataset"])
        assessment = self._parse_split_config(key, instruction, "assessment",
                                              symbol_table, len(settings))
        selection = self._parse_split_config(key, instruction, "selection",
                                             symbol_table, len(settings))
        assessment, selection = self._update_split_configs(
            assessment, selection, dataset)
        label_config = self._create_label_config(instruction, dataset, key)
        strategy = ReflectionHandler.get_class_by_name(
            instruction["strategy"], "hyperparameter_optimization/")
        metrics = {Metric[metric.upper()] for metric in instruction["metrics"]}
        optimization_metric = Metric[
            instruction["optimization_metric"].upper()]
        metric_search_criterion = Metric.get_search_criterion(
            optimization_metric)
        path = self._prepare_path(instruction)
        context = self._prepare_context(instruction, symbol_table)
        reports = self._prepare_reports(instruction["reports"], symbol_table)

        hp_instruction = TrainMLModelInstruction(
            dataset=dataset,
            hp_strategy=strategy(settings, metric_search_criterion),
            hp_settings=settings,
            assessment=assessment,
            selection=selection,
            metrics=metrics,
            optimization_metric=optimization_metric,
            refit_optimal_model=instruction['refit_optimal_model'],
            label_configuration=label_config,
            path=path,
            context=context,
            store_encoded_data=instruction['store_encoded_data'],
            number_of_processes=instruction["number_of_processes"],
            reports=reports,
            name=key)

        return hp_instruction
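The long run of assert_type_and_value calls here (and in Example #7) could be driven by a table instead. A hypothetical refactoring sketch, not how TrainMLModelParser is actually written:

    EXPECTED_TYPES = {"settings": list, "refit_optimal_model": bool,
                      "metrics": list, "optimization_metric": str,
                      "number_of_processes": int, "strategy": str,
                      "store_encoded_data": bool}

    for field, expected in EXPECTED_TYPES.items():
        ParameterValidator.assert_type_and_value(instruction[field], expected,
                                                 TrainMLModelParser.__name__, field)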