Example 1
    def prepare_specs(self):
        with self.yaml_path.open("r") as file:
            specs = yaml.safe_load(file)

        self.instruction_name = Util.check_instruction_type(
            specs, DataSimulationTool.__name__, self.expected_instruction)
        self.export_format = Util.check_export_format(
            specs, DataSimulationTool.__name__, self.instruction_name)

        ParameterValidator.assert_keys_present(specs["definitions"],
                                               ["datasets"],
                                               DataSimulationTool.__name__,
                                               "definitions/datasets")
        ParameterValidator.assert_type_and_value(
            specs['definitions']['datasets'], dict,
            DataSimulationTool.__name__, "definitions/datasets")

        dataset_names = list(specs['definitions']['datasets'].keys())
        assert len(dataset_names) == 1, f"{DataSimulationTool.__name__}: one dataset has to be defined under definitions/datasets, got " \
                                        f"{dataset_names} instead."

        self.dataset_name = dataset_names[0]

        Util.check_paths(specs, DataSimulationTool.__name__)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
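For reference, a specification that passes these checks needs exactly one dataset under definitions/datasets and one instruction of the expected type. A minimal sketch of the parsed dict, assuming the expected instruction type is Simulation (the names my_dataset and my_sim and the AIRR format are illustrative, not required by the tool):

    # Hypothetical yaml.safe_load result; only the fixed keys ("definitions",
    # "datasets", "instructions", "type", "export_formats") are mandated here.
    specs = {
        "definitions": {"datasets": {"my_dataset": {}}},  # exactly one dataset
        "instructions": {"my_sim": {"type": "Simulation",  # must match expected_instruction
                                    "export_formats": ["AIRR"]}},
    }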
Example 2
    def build(cls, **kwargs):
        ParameterValidator.assert_keys_present(
            list(kwargs.keys()),
            ['metadata_file', 'name', 'repertoire_ids', 'metadata_fields'],
            RepertoireDataset.__name__, "repertoire dataset")
        repertoires = []
        metadata_df = pd.read_csv(kwargs['metadata_file'],
                                  comment=Constants.COMMENT_SIGN)
        for index, row in metadata_df.iterrows():
            filename = Path(kwargs['metadata_file']).parent / row['filename']
            if not filename.is_file() and 'repertoires' in str(filename):
                filename = filename.parent.parent / Path(row['filename']).name
            repertoire = Repertoire(data_filename=filename,
                                    metadata_filename=filename.parent /
                                    f'{filename.stem}_metadata.yaml',
                                    identifier=row['identifier'])
            repertoires.append(repertoire)

        if "repertoire_ids" in kwargs.keys(
        ) and "repertoires" not in kwargs.keys(
        ) and kwargs['repertoire_ids'] is not None:
            assert all(rep.identifier == kwargs['repertoire_ids'][i] for i, rep in enumerate(repertoires)), \
                f"{RepertoireDataset.__name__}: repertoire ids from the iml_dataset file and metadata file don't match for the dataset " \
                f"{kwargs['name']} with identifier {kwargs['identifier']}."

        return RepertoireDataset(**{**kwargs, **{"repertoires": repertoires}})
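A sketch of how build might be called, assuming a metadata CSV with at least filename and identifier columns (one row per repertoire); the path, names, and the metadata_fields values below are hypothetical:

    # Hypothetical usage; metadata.csv rows point to per-repertoire data files.
    dataset = RepertoireDataset.build(
        metadata_file=Path("results/metadata.csv"),
        name="my_dataset",
        repertoire_ids=None,  # None skips the identifier cross-check above
        metadata_fields=["filename", "identifier"])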
Example 3
    def _prepare_specs(self):
        with self.yaml_path.open("r") as file:
            specs = yaml.safe_load(file)

        ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], GalaxyTrainMLModel.__name__, "YAML specification")
        ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], GalaxyTrainMLModel.__name__,
                                                    "YAML specification")

        ParameterValidator.assert_type_and_value(specs["instructions"], dict, GalaxyTrainMLModel.__name__, "instructions")

        assert len(list(specs["instructions"].keys())) == 1, f"{GalaxyTrainMLModel.__name__}: one instruction has to be specified under " \
                                                             f"`instructions`, got the following instead: {list(specs['instructions'].keys())}."

        self.instruction_name = list(specs["instructions"].keys())[0]

        ParameterValidator.assert_type_and_value(specs['instructions'][self.instruction_name], dict, GalaxyTrainMLModel.__name__,
                                                 self.instruction_name)
        ParameterValidator.assert_keys_present(specs['instructions'][self.instruction_name].keys(), ['type'], GalaxyTrainMLModel.__name__,
                                               self.instruction_name)

        assert specs['instructions'][self.instruction_name]['type'] == TrainMLModelInstruction.__name__[:-11], \
            f"{GalaxyTrainMLModel.__name__}: instruction `type` under {self.instruction_name} has to be {TrainMLModelInstruction.__name__[:-11]} " \
            f"for this tool."

        assert len(specs['instructions'][self.instruction_name]['labels']) == 1, \
            f"{GalaxyTrainMLModel.__name__}: one label has to be specified under `labels`, " \
            f"got the following instead: {specs['instructions'][self.instruction_name]['labels']}."
        Util.check_paths(specs, GalaxyTrainMLModel.__name__)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
Example 4
    def build_object(cls, **kwargs):
        ParameterValidator.assert_keys_present(list(kwargs.keys()),
                                               ['file_format', 'name'],
                                               DesignMatrixExporter.__name__,
                                               DesignMatrixExporter.__name__)
        ParameterValidator.assert_in_valid_list(
            kwargs['file_format'],
            ['npy', 'csv', 'npy.zip', 'csv.zip', 'hdf5.zip'],
            DesignMatrixExporter.__name__, 'file_format')

        return DesignMatrixExporter(**kwargs)
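Calling it is then a one-liner; a minimal sketch (the name value is arbitrary, while file_format must be one of the five listed options):

    exporter = DesignMatrixExporter.build_object(file_format="csv", name="design_matrix")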
Example 5
    def _check_dataset(self, specs):
        ParameterValidator.assert_keys_present(specs["definitions"].keys(),
                                               ['datasets'],
                                               DatasetGenerationTool.__name__,
                                               'definitions')
        assert len(specs['definitions']['datasets'].keys()) == 1, \
            f"{DatasetGenerationTool.__name__}: only one dataset can be defined with this Galaxy tool, got these " \
            f"instead: {list(specs['definitions']['datasets'].keys())}."

        assert len(specs['instructions'].keys()) == 1, \
            f"{DatasetGenerationTool.__name__}: only one instruction of type DatasetExport can be defined with this Galaxy tool, got these " \
            f"instructions instead: {list(specs['instructions'].keys())}."
Example 6
    def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:

        ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)
        valid_instructions = [cls[:-6] for cls in ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")]
        ParameterValidator.assert_in_valid_list(instruction["type"], valid_instructions, "InstructionParser", "type")

        default_params = DefaultParamsLoader.load("instructions/", instruction["type"])
        instruction = {**default_params, **instruction}
        parser = ReflectionHandler.get_class_by_name("{}Parser".format(instruction["type"]), "instruction_parsers/")()
        instruction_object = parser.parse(key, instruction, symbol_table, path)

        symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)
        return instruction, symbol_table
Example 7
    def check_instruction_type(specs: dict, tool_name,
                               expected_instruction) -> str:
        ParameterValidator.assert_keys_present(list(specs.keys()),
                                               ['definitions', 'instructions'],
                                               tool_name, "YAML specification")
        assert len(list(specs['instructions'].keys())) == 1, f"{tool_name}: multiple instructions were given " \
                                                             f"({str(list(specs['instructions'].keys()))[1:-1]}), but only one instruction of type " \
                                                             f"{expected_instruction} should be specified."
        instruction_name = list(specs['instructions'].keys())[0]
        instruction_type = specs['instructions'][instruction_name]['type']
        assert instruction_type == expected_instruction, \
            f"{tool_name}: instruction type has to be '{expected_instruction}', got {instruction_type} instead."

        return instruction_name
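A dict that passes this check, sketched in Python (the instruction name my_instruction, the tool name MyTool, and the DatasetExport type are placeholder assumptions):

    specs = {
        "definitions": {},
        "instructions": {"my_instruction": {"type": "DatasetExport"}},
    }
    # Returns "my_instruction"; a second instruction or a different type
    # would trip one of the assertions above.
    name = Util.check_instruction_type(specs, "MyTool", "DatasetExport")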
Example 8
    def build_object(cls, **kwargs):
        ParameterValidator.assert_keys_present(list(kwargs.keys()),
                                               ["implanted_motifs_per_label"],
                                               "MotifSeedRecovery",
                                               "MotifSeedRecovery report")

        implanted_motifs_per_label = kwargs["implanted_motifs_per_label"]

        ParameterValidator.assert_type_and_value(
            implanted_motifs_per_label, dict, "MotifSeedRecovery",
            "implanted_motifs_per_label")

        for label_name in implanted_motifs_per_label.keys():
            ParameterValidator.assert_type_and_value(
                implanted_motifs_per_label[label_name], dict,
                "MotifSeedRecovery",
                f"implanted_motifs_per_label/{label_name}")

            ParameterValidator.assert_keys_present(
                implanted_motifs_per_label[label_name].keys(),
                ["hamming_distance", "seeds", "gap_sizes"],
                "MotifSeedRecovery",
                f"implanted_motifs_per_label/{label_name}")
            ParameterValidator.assert_type_and_value(
                implanted_motifs_per_label[label_name]["hamming_distance"],
                bool, "MotifSeedRecovery",
                f"implanted_motifs_per_label/{label_name}/hamming_distance")
            ParameterValidator.assert_type_and_value(
                implanted_motifs_per_label[label_name]["gap_sizes"], list,
                "MotifSeedRecovery",
                f"implanted_motifs_per_label/{label_name}/gap_sizes")
            ParameterValidator.assert_type_and_value(
                implanted_motifs_per_label[label_name]["seeds"], list,
                "MotifSeedRecovery",
                f"implanted_motifs_per_label/{label_name}/seeds")
            for gap_size in implanted_motifs_per_label[label_name]["gap_sizes"]:
                ParameterValidator.assert_type_and_value(
                    gap_size,
                    int,
                    "MotifSeedRecovery",
                    f"implanted_motifs_per_label/{label_name}/gap_sizes",
                    min_inclusive=0)
            for seed in implanted_motifs_per_label[label_name]["seeds"]:
                ParameterValidator.assert_type_and_value(
                    seed, str, "MotifSeedRecovery",
                    f"implanted_motifs_per_label/{label_name}/seeds")

        return MotifSeedRecovery(implanted_motifs_per_label)
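The shape of a valid implanted_motifs_per_label argument, with placeholder label and seed values:

    # One entry per label; hamming_distance is a bool, gap_sizes holds
    # non-negative ints, seeds holds strings (all values here are illustrative).
    report = MotifSeedRecovery.build_object(implanted_motifs_per_label={
        "my_label": {"hamming_distance": False,
                     "gap_sizes": [0, 1],
                     "seeds": ["AAA", "CCC"]}})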
Example 9
    def check_export_format(specs: dict, tool_name: str,
                            instruction_name: str):
        ParameterValidator.assert_keys_present(
            list(specs['instructions'][instruction_name].keys()),
            ["export_formats"], tool_name,
            f"{instruction_name}/export_formats")
        ParameterValidator.assert_type_and_value(
            specs['instructions'][instruction_name]["export_formats"], list,
            tool_name, f"{instruction_name}/export_formats")

        assert len(specs['instructions'][instruction_name]["export_formats"]) == 1, \
            f"{tool_name}: only one format can be specified under export_formats parameter under " \
            f"{instruction_name}/export_formats, got {specs['instructions'][instruction_name]['export_formats']} instead."

        return specs['instructions'][instruction_name]["export_formats"][0]
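So a passing instruction carries exactly one entry in its export_formats list; sketched as the parsed dict (the names and the AIRR format are illustrative):

    specs = {"instructions": {"my_export": {"export_formats": ["AIRR"]}}}
    fmt = Util.check_export_format(specs, "MyTool", "my_export")  # returns "AIRR"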
Example 10
    def _check_instruction(self, specs):
        instruction_name = Util.check_instruction_type(
            specs, DatasetGenerationTool.__name__,
            DatasetExportInstruction.__name__[:-11])

        for key in ['datasets', 'export_formats']:
            ParameterValidator.assert_keys_present(
                list(specs['instructions'][instruction_name].keys()), [key],
                DatasetGenerationTool.__name__, instruction_name)
            ParameterValidator.assert_type_and_value(
                specs["instructions"][instruction_name][key], list,
                DatasetGenerationTool.__name__, f"{instruction_name}/{key}")

            assert len(specs['instructions'][instruction_name][key]) == 1, \
                f"{DatasetGenerationTool.__name__}: this tool accepts only one item under {key}, got {specs['instructions'][instruction_name][key]} " \
                f"instead."
Example 11
    def _check_specs(self):
        with open(self.yaml_path, "r") as file:
            specs = yaml.safe_load(file)

        instruction_name = Util.check_instruction_type(
            specs, GalaxyMLApplicationTool.__name__,
            MLApplicationInstruction.__name__[:-11])

        ParameterValidator.assert_keys_present(
            list(specs['instructions'][instruction_name].keys()),
            ["dataset", "config_path"], GalaxyMLApplicationTool.__name__,
            instruction_name)

        assert os.path.isfile(specs['instructions'][instruction_name]['config_path']), \
            f"{GalaxyMLApplicationTool.__name__}: file specified under 'config_path' parameter " \
            f"({specs['instructions'][instruction_name]['config_path']}) is not available. Please check if it was correctly uploaded or if the file" \
            f" name is correct."
Example 12
    def _check_dataset_specs(self, workflow_specification, location):
        ParameterValidator.assert_type_and_value(
            workflow_specification['definitions'], dict, location,
            'definitions')
        ParameterValidator.assert_keys_present(
            workflow_specification['definitions'].keys(), ['datasets'],
            location, 'definitions')
        ParameterValidator.assert_type_and_value(
            workflow_specification['definitions']['datasets'], dict, location,
            'datasets')

        dataset_names = list(
            workflow_specification['definitions']['datasets'].keys())

        assert len(dataset_names) > 1, \
            f"MultiDatasetBenchmarkTool: there is only one dataset specified ({dataset_names[0]}), while this tool operates on multiple datasets. " \
            f"If only one dataset is needed, consider using the training instruction directly."
Example 13
    def _update_specs(self):
        with self.yaml_path.open('r') as file:
            specs = yaml.safe_load(file)

        ParameterValidator.assert_keys_present(specs.keys(),
                                               ["definitions", "instructions"],
                                               DatasetGenerationTool.__name__,
                                               "YAML specification")
        ParameterValidator.assert_all_in_valid_list(
            specs.keys(), ["definitions", "instructions", "output"],
            DatasetGenerationTool.__name__, "YAML specification")

        self._check_dataset(specs)
        self._check_instruction(specs)

        Util.check_paths(specs, DatasetGenerationTool.__name__)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
Example 14
    def import_repertoire_dataset(import_class, params: DatasetImportParams,
                                  dataset_name: str) -> RepertoireDataset:
        """
        Function to create a dataset from the metadata and a list of repertoire files and exports dataset pickle file

        Arguments:
            import_class: class to use for import
            params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
            dataset_name: user-defined name of the dataset

        Returns:
            RepertoireDataset object that was created
        """
        metadata = pd.read_csv(params.metadata_file, sep=",")

        ParameterValidator.assert_keys_present(
            metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
            f'{dataset_name}: params: metadata_file')

        PathBuilder.build(params.result_path / "repertoires/")

        arguments = [(import_class, row, params)
                     for index, row in metadata.iterrows()]
        with Pool(params.number_of_processes) as pool:
            repertoires = pool.starmap(ImportHelper.load_repertoire_as_object,
                                       arguments)

        new_metadata_file = ImportHelper.make_new_metadata_file(
            repertoires, metadata, params.result_path, dataset_name)

        potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
        dataset = RepertoireDataset(
            labels={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
            repertoires=repertoires,
            metadata_file=new_metadata_file,
            name=dataset_name)

        PickleExporter.export(dataset, params.result_path)

        return dataset
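The metadata file this function expects is a plain CSV with a mandatory filename column; every other column is treated as a candidate label. A hypothetical two-repertoire example (column names besides filename are made up):

    filename,subject_id,disease
    rep1.tsv,subject_1,True
    rep2.tsv,subject_2,False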
Example 15
    def prepare_specs(self):
        with self.yaml_path.open("r") as file:
            specs = yaml.safe_load(file)

        self.instruction_name = Util.check_instruction_type(
            specs, DataSimulationTool.__name__, self.expected_instruction)
        self.export_format = Util.check_export_format(
            specs, DataSimulationTool.__name__, self.instruction_name)

        ParameterValidator.assert_keys_present(specs["definitions"],
                                               ["datasets"],
                                               DataSimulationTool.__name__,
                                               "definitions/datasets")
        ParameterValidator.assert_type_and_value(
            specs['definitions']['datasets'], dict,
            DataSimulationTool.__name__, "definitions/datasets")

        self.dataset_name = "dataset"
        Util.update_dataset_key(specs, DataSimulationTool.__name__,
                                self.dataset_name)

        Util.check_paths(specs, DataSimulationTool.__name__)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)
Example 16
    def build_object(cls, **kwargs):

        ParameterValidator.assert_keys(
            kwargs.keys(),
            ['reference_path', 'comparison_attributes', 'name', 'label'],
            ReferenceSequenceOverlap.__name__,
            f"reports: {kwargs['name'] if 'name' in kwargs else ''}")

        kwargs['reference_path'] = Path(kwargs['reference_path'])

        assert kwargs['reference_path'].is_file(), f"{ReferenceSequenceOverlap.__name__}: 'reference_path' for report {kwargs['name']} is not " \
                                                   f"a valid file path."

        reference_sequences_df = pd.read_csv(kwargs['reference_path'])
        attributes = reference_sequences_df.columns.tolist()

        ParameterValidator.assert_keys_present(
            expected_values=kwargs['comparison_attributes'],
            values=attributes,
            location=ReferenceSequenceOverlap.__name__,
            parameter_name='columns in file under reference_path')

        return ReferenceSequenceOverlap(**kwargs)
Example 17
    def parse_signals(signals: dict, symbol_table: SymbolTable):
        for key, signal_spec in signals.items():

            ParameterValidator.assert_keys_present(signal_spec.keys(),
                                                   SignalParser.VALID_KEYS,
                                                   "SignalParser", key)

            implanting_strategy = SignalParser._get_implanting_strategy(
                key, signal_spec)

            ParameterValidator.assert_keys(
                signal_spec["motifs"],
                symbol_table.get_keys_by_type(SymbolType.MOTIF),
                "SignalParser", f"motifs in signal {key}", False)

            signal_motifs = [
                symbol_table.get(motif_id)
                for motif_id in signal_spec["motifs"]
            ]
            signal = Signal(key, signal_motifs, implanting_strategy)
            symbol_table.add(key, SymbolType.SIGNAL, signal)

        return symbol_table, signals
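The signals argument would look roughly like the following parsed dict, assuming SignalParser.VALID_KEYS covers at least motifs and implanting, that motifs m1 and m2 were registered in the symbol table earlier, and that HealthySequence names an available implanting strategy (all assumptions here):

    signals = {
        "my_signal": {
            "motifs": ["m1", "m2"],           # keys already registered in the symbol table
            "implanting": "HealthySequence",  # resolved by _get_implanting_strategy
        }
    }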
Example 18
    def parse(self,
              key: str,
              instruction: dict,
              symbol_table: SymbolTable,
              path: Path = None) -> DatasetExportInstruction:
        location = "DatasetExportParser"
        ParameterValidator.assert_keys(
            list(instruction.keys()), DatasetExportParser.REQUIRED_KEYS +
            DatasetExportParser.OPTIONAL_KEYS, location, key, False)
        ParameterValidator.assert_keys_present(
            list(instruction.keys()), DatasetExportParser.REQUIRED_KEYS,
            location, key)

        valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, "Exporter", 'dataset_export/')
        ParameterValidator.assert_all_in_valid_list(
            instruction["export_formats"], valid_formats, location,
            "export_formats")
        ParameterValidator.assert_all_in_valid_list(
            instruction["datasets"],
            symbol_table.get_keys_by_type(SymbolType.DATASET), location,
            "datasets")

        return DatasetExportInstruction(
            datasets=[
                symbol_table.get(dataset_key)
                for dataset_key in instruction["datasets"]
            ],
            exporters=[
                ReflectionHandler.get_class_by_name(f"{key}Exporter",
                                                    "dataset_export/")
                for key in instruction["export_formats"]
            ],
            preprocessing_sequence=symbol_table.get(
                instruction["preprocessing_sequence"])
            if "preprocessing_sequence" in instruction else None,
            name=key)
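The instruction dict this parser consumes would look roughly as follows; the dataset key d1 must already be in the symbol table, and each export format must correspond to an existing *Exporter class (AIRR is an assumed example):

    instruction = {
        "type": "DatasetExport",
        "datasets": ["d1"],
        "export_formats": ["AIRR"],
        # "preprocessing_sequence": "p1",  # optional key
    }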
Example 19
    def _check_instruction_specs(self, workflow_specification, location):
        ParameterValidator.assert_type_and_value(
            workflow_specification['instructions'], dict, location,
            'instructions')

        instruction_names = list(workflow_specification['instructions'].keys())
        assert len(instruction_names) == 1, f"MultiDatasetBenchmarkTool: there can be only one instruction specified for this tool. " \
                                            f"Currently the following instructions are specified: {instruction_names}."

        ParameterValidator.assert_keys_present(
            workflow_specification['instructions'][
                instruction_names[0]].keys(), ['type', 'datasets'], location,
            instruction_names[0])

        instruction_type = workflow_specification['instructions'][
            instruction_names[0]]['type']
        assert instruction_type == 'TrainMLModel', \
            f"MultiDatasetBenchmarkTool: this tool works only with instruction of type 'TrainMLModel', got {instruction_type} instead."

        datasets_in_instruction = workflow_specification['instructions'][
            instruction_names[0]]['datasets']
        assert len(datasets_in_instruction) > 1, \
            f'{location}: this tool takes multiple dataset names as input, but only {len(datasets_in_instruction)} were provided: ' \
            f'{datasets_in_instruction}.'
Example 20
    def _get_implanting_strategy(key: str,
                                 signal: dict) -> SignalImplantingStrategy:

        valid_strategies = [
            cls[:-10]
            for cls in ReflectionHandler.discover_classes_by_partial_name(
                "Implanting", "simulation/signal_implanting_strategy/")
        ]
        ParameterValidator.assert_in_valid_list(signal["implanting"],
                                                valid_strategies,
                                                "SignalParser", key)

        defaults = DefaultParamsLoader.load(
            "signal_implanting_strategy/", f"{signal['implanting']}Implanting")
        signal = {**defaults, **signal}

        ParameterValidator.assert_keys_present(
            list(signal.keys()),
            ["motifs", "implanting", "sequence_position_weights"],
            SignalParser.__name__, key)

        implanting_comp = None
        if 'implanting_computation' in signal:
            implanting_comp = signal['implanting_computation'].lower()
            ParameterValidator.assert_in_valid_list(
                implanting_comp,
                [el.name.lower() for el in ImplantingComputation],
                SignalParser.__name__, 'implanting_computation')
            implanting_comp = ImplantingComputation[implanting_comp.upper()]

        implanting_strategy = ReflectionHandler.get_class_by_name(
            f"{signal['implanting']}Implanting")(
                GappedMotifImplanting(), signal["sequence_position_weights"],
                implanting_comp)

        return implanting_strategy