Ejemplo n.º 1
0
    def _prepare_specs(self):
        """Load the YAML specification, validate its layout and update its paths.

        The specification read from `self.yaml_path` must contain the keys
        `definitions` and `instructions` (`output` is the only other key
        allowed). Exactly one instruction has to be listed under
        `instructions`; its key is stored in `self.instruction_name` and its
        `type` must equal the name of `TrainMLModelInstruction` without its
        last 11 characters. Finally the specification is handed to
        `Util.check_paths` and `Util.update_result_paths`.

        Raises:
            AssertionError: if more or fewer than one instruction is given,
                or if the instruction `type` is not the expected one.
        """
        tool_name = GalaxyTrainMLModel.__name__

        with open(self.yaml_path, "r") as file:
            specs = yaml.safe_load(file)

        # top level: required keys first, then reject anything unknown
        ParameterValidator.assert_keys_present(
            specs.keys(), ["definitions", "instructions"], tool_name,
            "YAML specification")
        ParameterValidator.assert_all_in_valid_list(
            specs.keys(), ["definitions", "instructions", "output"],
            tool_name, "YAML specification")

        ParameterValidator.assert_type_and_value(specs["instructions"], dict,
                                                 tool_name, "instructions")

        instruction_names = list(specs["instructions"].keys())
        assert len(instruction_names) == 1, \
            f"{tool_name}: one instruction has to be specified under " \
            f"`instructions`, got the following instead: {instruction_names}."

        self.instruction_name = instruction_names[0]
        instruction = specs['instructions'][self.instruction_name]

        ParameterValidator.assert_type_and_value(instruction, dict, tool_name,
                                                 self.instruction_name)
        ParameterValidator.assert_keys_present(instruction.keys(), ['type'],
                                               tool_name,
                                               self.instruction_name)

        # the expected type is the instruction class name minus its last 11 characters
        expected_type = TrainMLModelInstruction.__name__[:-11]
        assert instruction['type'] == expected_type, \
            f"{tool_name}: instruction `type` under {self.instruction_name} has to be {expected_type} " \
            f"for this tool."

        Util.check_paths(specs, tool_name)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
Ejemplo n.º 2
0
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: str = None) -> DatasetExportInstruction:
        """Validate a dataset export specification and build the instruction.

        Args:
            key: name of the instruction; used in validation messages and as
                the name of the returned instruction.
            instruction: specification dict; its `export_formats` and
                `datasets` entries are read here.
            symbol_table: symbol table used to look up the listed datasets.
            path: not used by this method.

        Returns:
            A DatasetExportInstruction holding the datasets fetched from the
            symbol table and one exporter class per requested export format.
        """
        parser_name = "DatasetExportParser"

        ParameterValidator.assert_keys(list(instruction.keys()),
                                       DatasetExportParser.VALID_KEYS,
                                       parser_name, key)

        known_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, "Exporter", 'dataset_export/')
        ParameterValidator.assert_all_in_valid_list(
            instruction["export_formats"], known_formats, parser_name,
            "export_formats")
        ParameterValidator.assert_all_in_valid_list(
            instruction["datasets"],
            symbol_table.get_keys_by_type(SymbolType.DATASET), parser_name,
            "datasets")

        datasets = [
            symbol_table.get(dataset_key)
            for dataset_key in instruction["datasets"]
        ]
        # exporter classes are resolved by name: "<format>Exporter"
        exporters = [
            ReflectionHandler.get_class_by_name(f"{export_format}Exporter",
                                                "dataset_export/")
            for export_format in instruction["export_formats"]
        ]

        return DatasetExportInstruction(datasets=datasets,
                                        exporters=exporters,
                                        name=key)
Ejemplo n.º 3
0
    def _parse_ml_method(ml_method_id: str, ml_specification) -> tuple:
        """Create an ML method instance from its specification.

        Args:
            ml_method_id: identifier of the ML method; used in error messages
                and assigned to the `name` attribute of the created method.
            ml_specification: either a string with the ML method class name
                (treated as that class with an empty parameter dict) or a dict
                mapping the class name to its parameters, optionally together
                with `model_selection_cv` and `model_selection_n_folds`.

        Returns:
            A tuple `(method, ml_specification)`: the object returned by
            `MLParser.create_method_instance` and the specification merged
            with the loaded defaults, where the entry of the ML method class
            holds the parameters returned by `create_method_instance`.

        Raises:
            AssertionError: if, after merging with the defaults, the
                specification does not consist of exactly 3 keys.
        """
        model_selection_keys = ["model_selection_cv", "model_selection_n_folds"]

        valid_class_values = ReflectionHandler.all_nonabstract_subclass_basic_names(MLMethod, "", "ml_methods/")

        # a bare class name is shorthand for that class without explicit parameters
        if isinstance(ml_specification, str):
            ml_specification = {ml_specification: {}}

        # user-provided entries override the general ML method defaults
        ml_specification = {**DefaultParamsLoader.load("ml_methods/", "MLMethod"), **ml_specification}
        ml_specification_keys = list(ml_specification.keys())

        ParameterValidator.assert_all_in_valid_list(ml_specification_keys, model_selection_keys + valid_class_values,
                                                    "MLParser", ml_method_id)

        non_default_keys = [key for key in ml_specification_keys if key not in model_selection_keys]

        # the check demands exactly one ML method name next to the two model selection keys,
        # so the message says "exactly" rather than "at least"
        assert len(ml_specification_keys) == 3, f"MLParser: ML method {ml_method_id} was not correctly specified. Expected exactly 1 key " \
                                                f"(ML method name), got {len(ml_specification_keys) - 2} instead: " \
                                                f"{str(non_default_keys)[1:-1]}."

        ml_method_class_name = non_default_keys[0]
        ml_method_class = ReflectionHandler.get_class_by_name(ml_method_class_name, "ml_methods/")

        # class-specific defaults are filled in underneath the user-provided parameters
        ml_specification[ml_method_class_name] = {**DefaultParamsLoader.load("ml_methods/", ml_method_class_name, log_if_missing=False),
                                                  **ml_specification[ml_method_class_name]}

        method, params = MLParser.create_method_instance(ml_specification, ml_method_class, ml_method_id)
        ml_specification[ml_method_class_name] = params
        method.name = ml_method_id

        return method, ml_specification
Ejemplo n.º 4
0
    def _prepare_report_config(self, instruction_key, instruction, split_key,
                               symbol_table):
        if "reports" in instruction[split_key]:
            location = f"{instruction_key}/{split_key}/reports"
            report_types = list(signature(ReportConfig).parameters.keys())
            ParameterValidator.assert_all_in_valid_list(
                instruction[split_key]["reports"].keys(), report_types,
                location, "reports")

            for report_type in instruction[split_key]["reports"]:
                ParameterValidator.assert_type_and_value(
                    instruction[split_key]["reports"][report_type], list,
                    f"{location}/{report_type}", report_type)

            report_config_input = {
                report_type: {
                    report_id: symbol_table.get(report_id)
                    for report_id in instruction[split_key]["reports"]
                    [report_type]
                }
                for report_type in instruction[split_key]["reports"]
            }
        else:
            report_config_input = {}

        return report_config_input
Ejemplo n.º 5
0
    def build_object(cls, **kwargs):
        """Build a Coefficients object from keyword arguments.

        Reads `coefs_to_plot` (upper-cased before validation against the names
        of CoefficientPlottingSetting) and the optional `name`. `cutoff` is
        read and validated only when the CUTOFF setting is requested, and
        `n_largest` only when the N_LARGEST setting is requested; otherwise
        each defaults to an empty list.

        Returns:
            Coefficients created from the selected settings, `cutoff`,
            `n_largest` and `name` (None when no name was given).
        """
        location = "Coefficients"
        coefs_to_plot = [coef.upper() for coef in kwargs["coefs_to_plot"]]

        name = kwargs.get("name")

        valid_settings = [item.name.upper() for item in CoefficientPlottingSetting]
        ParameterValidator.assert_all_in_valid_list(coefs_to_plot, valid_settings, location, "coefs_to_plot")

        cutoff = []
        if CoefficientPlottingSetting.CUTOFF.name in coefs_to_plot:
            cutoff = kwargs["cutoff"]
            ParameterValidator.assert_type_and_value(cutoff, list, location, "cutoff")
            ParameterValidator.assert_all_type_and_value(cutoff, Number, location, "cutoff", min_inclusive=1e-15)

        n_largest = []
        if CoefficientPlottingSetting.N_LARGEST.name in coefs_to_plot:
            n_largest = kwargs["n_largest"]
            ParameterValidator.assert_type_and_value(n_largest, list, location, "n_largest")
            ParameterValidator.assert_all_type_and_value(n_largest, int, location, "n_largest", min_inclusive=1)

        # translate the validated names into enum members
        settings = CoefficientPlottingSettingList()
        for setting_name in coefs_to_plot:
            settings.append(CoefficientPlottingSetting[setting_name.upper()])

        return Coefficients(settings, cutoff, n_largest, name)
Ejemplo n.º 6
0
    def import_dataset(params, name: str) -> SequenceDataset:
        """
        Returns a randomly generated sequence dataset according to the parameters;
        the `name` argument is not used by this function.

        YAML specification:

            result_path: path/where/to/store/results/
            sequence_count: 100 # number of random sequences to generate
            length_probabilities:
                14: 0.8 # 80% of all generated sequences will have length 14
                15: 0.2 # 20% of all generated sequences will have length 15
            labels:
                epitope1: # label name
                    True: 0.5 # 50% of the sequences will have class True
                    False: 0.5 # 50% of the sequences will have class False
                epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                    1: 0.3 # 30% of the generated sequences will have class 1
                    0: 0.7 # 70% of the generated sequences will have class 0

        """
        accepted_keys = [
            "sequence_count", "length_probabilities", "labels", "result_path"
        ]
        # only rejects unknown keys; a missing key surfaces as KeyError below
        ParameterValidator.assert_all_in_valid_list(
            list(params.keys()), accepted_keys, "RandomSequenceDatasetImport",
            "params")

        sequence_count = params["sequence_count"]
        length_probabilities = params["length_probabilities"]
        labels = params["labels"]
        result_path = params["result_path"]

        return RandomDatasetGenerator.generate_sequence_dataset(
            sequence_count=sequence_count,
            length_probabilities=length_probabilities,
            labels=labels,
            path=result_path)
Ejemplo n.º 7
0
    def import_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
        """Generate a random repertoire dataset from the given parameters.

        Args:
            params: dict that may only contain the keys `result_path`,
                `repertoire_count`, `sequence_count_probabilities`,
                `sequence_length_probabilities` and `labels`; all of them are
                read and forwarded to the generator.
            dataset_name: not used by this function.

        Returns:
            The dataset produced by
            `RandomDatasetGenerator.generate_repertoire_dataset`.
        """
        accepted_keys = ["result_path", "repertoire_count", "sequence_count_probabilities", "sequence_length_probabilities", "labels"]
        # only rejects unknown keys; a missing key surfaces as KeyError below
        ParameterValidator.assert_all_in_valid_list(list(params.keys()), accepted_keys, "RandomRepertoireDatasetImport", "params")

        repertoire_count = params["repertoire_count"]
        sequence_count_probabilities = params["sequence_count_probabilities"]
        sequence_length_probabilities = params["sequence_length_probabilities"]
        labels = params["labels"]
        result_path = params["result_path"]

        return RandomDatasetGenerator.generate_repertoire_dataset(
            repertoire_count=repertoire_count,
            sequence_count_probabilities=sequence_count_probabilities,
            sequence_length_probabilities=sequence_length_probabilities,
            labels=labels,
            path=result_path)
Ejemplo n.º 8
0
    def update_specs(self):
        """Load the YAML specification, validate it and update its paths.

        The specification read from `self.yaml_path` must contain
        `definitions` and `instructions`; `output` is the only other key
        allowed. The dataset and instruction sections are then checked by
        `_check_dataset` and `_check_instruction`, and the specification is
        handed to `Util.check_paths` and `Util.update_result_paths`.
        """
        tool_name = DatasetGenerationTool.__name__
        required_keys = ["definitions", "instructions"]

        with open(self.yaml_path, 'r') as file:
            specs = yaml.safe_load(file)

        # required keys first, then reject anything outside the allowed set
        ParameterValidator.assert_keys_present(specs.keys(), required_keys, tool_name, "YAML specification")
        ParameterValidator.assert_all_in_valid_list(specs.keys(), required_keys + ["output"], tool_name, "YAML specification")

        self._check_dataset(specs)
        self._check_instruction(specs)

        Util.check_paths(specs, tool_name)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
Ejemplo n.º 9
0
    def parse_encoder(key: str, specs: dict):
        """Resolve the encoder class and its parameters from a specification.

        Args:
            key: identifier of the encoding; used in validation messages.
            specs: specification of the encoder, passed on to ObjectParser.

        Returns:
            A tuple `(encoder, params)`: the encoder class returned by
            `ObjectParser.get_class` and the parameters returned by
            `ObjectParser.get_all_params`.
        """
        class_path = "encodings"

        known_encoders = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DatasetEncoder, "Encoder", class_path)
        encoder = ObjectParser.get_class(specs, known_encoders, "Encoder",
                                         class_path, "EncodingParser", key)

        # the class name minus its last 7 characters is passed as the encoder name
        params = ObjectParser.get_all_params(specs, class_path,
                                             encoder.__name__[:-7], key)

        # every constructor argument except `self` is an accepted parameter name
        init_signature = inspect.signature(encoder.__init__)
        accepted_params = [
            arg for arg in list(init_signature.parameters.keys())
            if arg != "self"
        ]

        # NOTE(review): this location says "EncoderParser" while get_class
        # above receives "EncodingParser" - confirm which one is intended
        encoder_label = encoder.__name__.replace('Encoder', '')
        ParameterValidator.assert_all_in_valid_list(
            params.keys(), accepted_params, "EncoderParser",
            f"{key}/{encoder_label}")

        return encoder, params
Ejemplo n.º 10
0
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str = None) -> SubsamplingInstruction:
        """Validate a subsampling specification and build the instruction.

        Args:
            key: name of the instruction; used in validation messages and as
                the name of the returned instruction.
            instruction: specification dict with the keys `type`, `dataset`,
                `subsampled_dataset_sizes` and `dataset_export_formats`.
            symbol_table: symbol table holding the dataset referenced by
                `instruction['dataset']`.
            path: not used by this method.

        Returns:
            A SubsamplingInstruction with the dataset fetched from the symbol
            table, the requested sizes and one exporter class per export
            format.
        """
        parser_name = SubsamplingParser.__name__

        ParameterValidator.assert_keys(instruction.keys(),
                                       ["type", "dataset", "subsampled_dataset_sizes", "dataset_export_formats"],
                                       parser_name, key)

        dataset_keys = symbol_table.get_keys_by_type(SymbolType.DATASET)
        ParameterValidator.assert_in_valid_list(instruction['dataset'], dataset_keys, parser_name, f'{key}/dataset')
        dataset = symbol_table.get(instruction['dataset'])

        # sizes: a list of ints checked against the bounds 1 and the dataset's example count
        sizes = instruction['subsampled_dataset_sizes']
        sizes_location = f'{key}/subsampled_dataset_sizes'
        ParameterValidator.assert_type_and_value(sizes, list, parser_name, sizes_location)
        ParameterValidator.assert_all_type_and_value(sizes, int, parser_name, sizes_location, 1, dataset.get_example_count())

        # export formats: a list of names for which an exporter class exists
        valid_export_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, 'Exporter', "dataset_export/")
        export_formats = instruction['dataset_export_formats']
        formats_location = f"{key}/dataset_export_formats"
        ParameterValidator.assert_type_and_value(export_formats, list, parser_name, formats_location)
        ParameterValidator.assert_all_in_valid_list(export_formats, valid_export_formats, parser_name, formats_location)

        exporters = [ReflectionHandler.get_class_by_name(export_format + "Exporter", "dataset_export/")
                     for export_format in export_formats]

        return SubsamplingInstruction(dataset=dataset, subsampled_dataset_sizes=sizes,
                                      dataset_export_formats=exporters, name=key)
Ejemplo n.º 11
0
    def _extract_reports(self):
        """Load the benchmark reports named in the workflow specification.

        Reads the YAML file at `self.specification_path`, takes the
        `benchmark_reports` list of the first instruction, checks that every
        listed key is defined under `definitions/reports`, parses those report
        definitions and stores the resulting report objects in `self.reports`.
        """
        with open(self.specification_path, "r") as file:
            workflow_specification = yaml.safe_load(file)

        first_instruction = list(workflow_specification['instructions'].values())[0]
        report_keys = first_instruction['benchmark_reports']
        report_definitions = workflow_specification['definitions']['reports']

        ParameterValidator.assert_all_in_valid_list(
            report_keys, list(report_definitions.keys()),
            MultiDatasetBenchmarkTool.__name__, "benchmark_reports")

        # keep only the definitions that the instruction actually refers to
        selected_reports = {
            report_key: definition
            for report_key, definition in report_definitions.items()
            if report_key in report_keys
        }

        symbol_table, _ = ReportParser.parse_reports(selected_reports, SymbolTable())
        self.reports = [
            entry.item for entry in symbol_table.get_by_type(SymbolType.REPORT)
        ]