Example #1
    def test_generate(self):
        path = EnvironmentSettings.root_path / "test/tmp/mlsettingsperformance/"
        PathBuilder.build(path)

        report = MLSettingsPerformance(**{"single_axis_labels": False, "x_label_position": None, "y_label_position": None})

        report.result_path = path
        report.state = self._create_state_object(path / "input_data/")

        result = report.generate_report()

        self.assertTrue(os.path.isfile(path / "performance.csv"))
        self.assertTrue(os.path.isfile(path / "performance.html"))

        self.assertIsInstance(result, ReportResult)
        self.assertEqual(result.output_figures[0].path, path / "performance.html")
        self.assertEqual(result.output_tables[0].path, path / "performance.csv")

        written_data = pd.read_csv(path / "performance.csv")
        self.assertEqual(list(written_data.columns), ["fold", "label", "encoding", "ml_method", "performance"])

        shutil.rmtree(path)
Example #2
    def test_subsampling(self):

        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "subsampling_workflow/")
        repertoire_specs = self.build_specs(path)

        specs_filename = path / "specs.yaml"
        with open(specs_filename, "w") as file:
            yaml.dump(repertoire_specs, file)

        app = ImmuneMLApp(specs_filename, path / "result/")
        app.run()

        shutil.rmtree(path)
Example #3
    def test_run(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "performance_overview/")
        specs_file = self._prepare_specs(path)

        tool = MultiDatasetBenchmarkTool(specs_file, path / "result/")
        tool.run()

        self.assertTrue(os.path.isfile(path / "result/benchmarking_reports/performance_overview/precision_recall_data_d1.csv"))
        self.assertTrue(os.path.isfile(path / "result/benchmarking_reports/performance_overview/precision_recall_data_d2.csv"))
        self.assertTrue(os.path.isfile(path / "result/benchmarking_reports/performance_overview/precision_recall_curve.html"))
        self.assertTrue(os.path.isfile(path / "result/benchmarking_reports/performance_overview/roc_curve.html"))

        shutil.rmtree(path)
Example #4
    def _generate(self) -> ReportResult:
        self.result_path = PathBuilder.build(self.result_path / self.name)
        self._extract_label()

        hp_items = [state.optimal_hp_items[self.label] for state in self.instruction_states]
        overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

        labels = [state.dataset.name for state in self.instruction_states]
        figure_path = self._make_figure(overlap_matrix, labels)
        data_path = self._export_matrix(overlap_matrix, labels)

        return ReportResult(output_figures=[ReportOutput(figure_path, 'sequence overlap across datasets')],
                            output_tables=[ReportOutput(data_path, 'sequence overlap across datasets (csv)')])
Example #5
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        Preprocessor.check_dataset_type(dataset, [RepertoireDataset],
                                        "ChainRepertoireFilter")
        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])
        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            if all(sequence.metadata.chain == Chain.get_chain(
                    params["keep_chain"])
                   for sequence in repertoire.sequences):
                repertoires.append(repertoire)
                indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(
            processed_dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset,
                                       "ChainRepertoireFilter")

        return processed_dataset
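A minimal call sketch for the filter above; the dataset variable, chain value, and output path are hypothetical placeholders rather than values from the source:

    # Hypothetical usage, assuming `dataset` is an existing RepertoireDataset:
    # keep only repertoires in which every sequence is a TRB chain.
    filtered = ChainRepertoireFilter.process(dataset, {"keep_chain": "TRB",
                                                       "result_path": Path("filtered_output/")})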
Example #6
    def _implant_signals_in_repertoires(
            simulation_state: SimulationState = None) -> Dataset:

        repertoires_path = PathBuilder.build(simulation_state.result_path /
                                             "repertoires")
        processed_repertoires = SignalImplanter._implant_signals(
            simulation_state, SignalImplanter._process_repertoire,
            repertoires_path)
        # merge the original dataset labels (if present) with one boolean label
        # per implanted signal
        labels = {**(simulation_state.dataset.labels if simulation_state.dataset.labels is not None else {}),
                  **{signal.id: [True, False] for signal in simulation_state.signals}}
        processed_dataset = RepertoireDataset(repertoires=processed_repertoires, labels=labels,
                                              name=simulation_state.dataset.name,
                                              metadata_file=Path(SignalImplanter._create_metadata_file(processed_repertoires, simulation_state)))
        return processed_dataset
Example #7
    def test_sequence_dataset(self):

        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "integration_dataset_gen_html_sequence/")
        dataset_path = path / "sequence_dataset/"

        specs = {
            "definitions": {
                "datasets": {
                    "sequencedataset": {
                        "format": "RandomSequenceDataset",
                        "params": {
                            "sequence_count": 10,
                            "length_probabilities": {
                                10: 1
                            },
                            "labels": {
                                "epitope_a": {
                                    True: 0.5,
                                    False: 0.5
                                },
                                "epitope_b": {
                                    True: 0.5,
                                    False: 0.5
                                }
                            },
                            "result_path": str(dataset_path)
                        }
                    }
                }
            },
            "instructions": {
                "instr1": {
                    "type": "DatasetExport",
                    "export_formats": ["Pickle", "AIRR"],
                    "datasets": ["sequencedataset"]
                }
            },
            "output": {
                "format": "HTML"
            }
        }

        specs_path = path / "specs.yaml"
        with open(specs_path, "w") as file:
            yaml.dump(specs, file)

        app = ImmuneMLApp(specs_path, path / "result/")
        app.run()

        shutil.rmtree(path)
Example #8
    def _generate(self) -> ReportResult:
        X = self.train_dataset.encoded_data
        predicted_y = self.method.predict(X, self.label)[self.label.name]
        predicted_proba_y = self.method.predict_proba(X, self.label)[self.label.name]
        true_y = self.train_dataset.encoded_data.labels[self.label.name]
        classes = self.method.get_classes()

        PathBuilder.build(self.result_path)

        scores = {}
        output = {
            'tables': [],
            'figures': []
        }

        for metric in self.metrics_set:
            _score = TrainingPerformance._compute_score(
                Metric[metric],
                predicted_y,
                predicted_proba_y,
                true_y,
                classes,
            )
            if metric == 'CONFUSION_MATRIX':
                self._generate_heatmap(classes, classes, _score, metric, output)
            else:
                scores[metric] = _score

        scores_df = pd.DataFrame.from_dict(scores, orient='index')
        scores_df.columns = [self.label.name]

        self._generate_barplot(scores_df, output)

        return ReportResult(self.name,
                            info="Plots the evaluation metrics for the performance given machine learning model and training dataset.",
                            output_tables=output['tables'],
                            output_figures=output['figures'])
Example #9
    def test_sequence_dataset(self):
        path = EnvironmentSettings.root_path / "test/tmp/ioairr/"
        PathBuilder.build(path)
        self.create_dummy_dataset(path, False)

        column_mapping = self.get_column_mapping()
        params = {
            "is_repertoire": False,
            "result_path": path,
            "path": path,
            "import_out_of_frame": False,
            "import_with_stop_codon": False,
            "import_productive": True,
            "region_type": "IMGT_CDR3",
            "import_empty_nt_sequences": True,
            "import_empty_aa_sequences": False,
            "column_mapping": column_mapping,
            "import_illegal_characters": False,
            "separator": "\t",
            "sequence_file_size": 1
        }

        dataset = AIRRImport.import_dataset(params, "airr_sequence_dataset")

        self.assertEqual(5, dataset.get_example_count())
        self.assertEqual(5, len(dataset.get_filenames()))

        for idx, sequence in enumerate(dataset.get_data()):
            self.assertEqual(sequence.amino_acid_sequence, "ASGVAGTFDY")

        v_genes = sorted(
            ["IGHV4-59", "IGHV4-34", "IGHV4-31", "IGHV4-31", "IGHV4-31"])
        self.assertListEqual(
            sorted(
                [sequence.metadata.v_gene for sequence in dataset.get_data()]),
            v_genes)

        shutil.rmtree(path)
Example #10
    def test_minimal_dataset(self):
        # test to make sure import works with minimally specified input
        path = EnvironmentSettings.root_path / "test/tmp/ioairr/"
        PathBuilder.build(path)
        file1_content = """sequence_id	junction_aa
IVKNQEJ01BVGQ6	CASGVAGTFDYW
IVKNQEJ01AQVWS	CASGVAGTFDYW
IVKNQEJ01AOYFZ	CASGVAGNFLLX
IVKNQEJ01EI5S4	CASGVAGTFDYW"""

        with open(path / "rep1.tsv", "w") as file:
            file.writelines(file1_content)

        with open(path / "metadata.csv", "w") as file:
            file.writelines("""filename,subject_id
rep1.tsv,1""")

        column_mapping = self.get_column_mapping()

        params = {
            "is_repertoire": True,
            "result_path": path,
            "path": path,
            "metadata_file": path / "metadata.csv",
            "import_out_of_frame": False,
            "import_with_stop_codon": False,
            "import_productive": True,
            "region_type": "IMGT_CDR3",
            "import_empty_nt_sequences": True,
            "import_empty_aa_sequences": False,
            "column_mapping": column_mapping,
            "import_illegal_characters": False,
            "separator": "\t"
        }

        AIRRImport.import_dataset(params, "airr_minimal_repertoire_dataset")

        shutil.rmtree(path)
Example #11
    def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
        """
        Creates a dataset from a metadata file and a list of repertoire files, and exports the dataset as a pickle file

        Arguments:
            import_class: class to use for import
            params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
            dataset_name: user-defined name of the dataset

        Returns:
            RepertoireDataset object that was created

        """

        try:
            metadata = pd.read_csv(params.metadata_file, sep=",")
        except Exception as e:
            raise Exception(f"{e}\nAn error occurred while reading in the metadata file {params.metadata_file}. Please see the error log above for "
                            f"more details on this error and the documentation for the expected format of the metadata.")

        ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                               f'{dataset_name}: params: metadata_file')

        PathBuilder.build(params.result_path / "repertoires/")

        arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
        with Pool(params.number_of_processes) as pool:
            repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

        new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

        potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
        dataset = RepertoireDataset(labels={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                    repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)

        PickleExporter.export(dataset, params.result_path)

        return dataset
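A hedged usage sketch for the importer above; `params` and the dataset name are assumptions for illustration:

    # Hypothetical call, assuming `params` is a DatasetImportParams instance whose
    # metadata file contains the required "filename" column (validated above).
    dataset = ImportHelper.import_repertoire_dataset(AIRRImport, params, "my_dataset")
    print(dataset.get_example_count())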
Example #12
    def test_repertoiredataset(self):

        path = EnvironmentSettings.tmp_test_path / "repertoiredataset_yaml"
        PathBuilder.build(path)
        self.create_dummy_dataset(path, write_metadata=True)

        old_wd = os.getcwd()

        os.chdir(path)

        yamlbuilder_main([
            "-r", "VDJdb", "-o",
            str(path), "-f", "repertoire.yaml", "-m", "metadata.csv", "-i",
            "True"
        ])

        with open(path / "repertoire.yaml", "r") as file:
            loaded_receptor = yaml.load(file, Loader=yaml.FullLoader)

            self.assertDictEqual(
                loaded_receptor["definitions"]["datasets"], {
                    "dataset": {
                        "format": "VDJdb",
                        "params": {
                            "path": "./",
                            "metadata_file": "metadata.csv",
                            "is_repertoire": True,
                            "region_type": RegionType.IMGT_CDR3.name,
                            "result_path": "./"
                        }
                    }
                })

        ImmuneMLParser.parse_yaml_file(path / "repertoire.yaml")

        os.chdir(old_wd)

        shutil.rmtree(path)
Example #13
    def test_sequence_export(self):
        path = EnvironmentSettings.tmp_test_path / "airr_exporter_receptor/"
        PathBuilder.build(path)

        dataset = self.create_dummy_sequencedataset(path)

        path_exported = path / "exported_sequences"
        AIRRExporter.export(dataset, path_exported)

        resulting_data = pd.read_csv(path_exported / "batch1.tsv", sep="\t")

        self.assertListEqual(list(resulting_data["sequence_id"]), ["1a", "1b"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]),
                             ["AAATTT", "ATATAT"])
        self.assertListEqual(list(resulting_data["v_call"]),
                             ["TRAV1", "TRBV1"])
        self.assertListEqual(list(resulting_data["j_call"]),
                             ["TRAJ1", "TRBJ1"])
        self.assertListEqual(list(resulting_data["d_call"]),
                             ["TRAD1", "TRBD1"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRA", "TRB"])
        self.assertListEqual(list(resulting_data["custom1"]), ["cust1", nan])
        self.assertListEqual(list(resulting_data["custom2"]), [nan, "cust1"])
        self.assertListEqual(list(resulting_data["productive"]), ['T', 'T'])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F', 'F'])

        resulting_data = pd.read_csv(path_exported / "batch2.tsv", sep="\t")
        self.assertListEqual(list(resulting_data["sequence_id"]), ["2b"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]), ["ATATAT"])
        self.assertListEqual(list(resulting_data["v_call"]), ["TRBV1"])
        self.assertListEqual(list(resulting_data["j_call"]), ["TRBJ1"])
        self.assertListEqual(list(resulting_data["d_call"]), ["TRBD1"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRB"])
        self.assertListEqual(list(resulting_data["custom2"]), ["cust1"])
        self.assertListEqual(list(resulting_data["productive"]), ['T'])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F'])

        shutil.rmtree(path)
Example #14
    def test_encode(self):
        path = EnvironmentSettings.tmp_test_path / "distance_encoder/"
        PathBuilder.build(path)

        dataset = self.create_dataset(path)

        enc = DistanceEncoder.build_object(
            dataset, **{
                "distance_metric": DistanceMetricType.JACCARD.name,
                "attributes_to_match": ["sequence_aas"],
                "sequence_batch_size": 20
            })

        enc.set_context({"dataset": dataset})
        encoded = enc.encode(
            dataset,
            EncoderParams(result_path=path,
                          label_config=LabelConfiguration(
                              [Label("l1", [0, 1]),
                               Label("l2", [2, 3])]),
                          pool_size=4,
                          filename="dataset.pkl"))

        self.assertEqual(8, encoded.encoded_data.examples.shape[0])
        self.assertEqual(8, encoded.encoded_data.examples.shape[1])

        self.assertEqual(0, encoded.encoded_data.examples.iloc[0, 0])
        self.assertEqual(0, encoded.encoded_data.examples.iloc[1, 1])
        self.assertEqual(0, encoded.encoded_data.examples.iloc[0, 4])

        self.assertTrue(
            np.array_equal([1, 0, 1, 0, 1, 0, 1, 0],
                           encoded.encoded_data.labels["l1"]))
        self.assertTrue(
            np.array_equal([2, 3, 2, 3, 2, 3, 3, 3],
                           encoded.encoded_data.labels["l2"]))

        shutil.rmtree(path)
Example #15
    def test_load(self):
        x = np.array([[1, 0, 0], [0, 1, 1], [1, 1, 1], [0, 1, 1]])
        y = {"default": np.array([1, 0, 2, 0])}

        knn = KNN()
        knn.fit(EncodedData(examples=sparse.csr_matrix(x), labels=y), Label("default"))

        path = EnvironmentSettings.root_path / "test/tmp/loadtestsklearn/"
        PathBuilder.build(path)

        with open(path / "knn.pickle", "wb") as file:
            pickle.dump(knn.model, file)

        config = MLMethodConfiguration()
        config.labels_with_values = {"default": [0, 1, 2]}
        config.store(path / "config.json")

        knn2 = KNN()
        knn2.load(path)

        self.assertTrue(isinstance(knn2.model, KNeighborsClassifier))

        shutil.rmtree(path)
Example #16
    def store(self, path: Path, feature_names=None, details_path=None):
        content = self._convert_object_to_dict()
        PathBuilder.build(path)
        file_path = path / FilenameHandler.get_filename(
            self.__class__.__name__, "pickle")

        with file_path.open("wb") as file:
            pickle.dump(content, file)

        if details_path is None:
            params_path = path / FilenameHandler.get_filename(
                self.__class__.__name__, "yaml")
        else:
            params_path = details_path

        with params_path.open("w") as file:
            desc = {
                self.label_name: {
                    **content, "feature_names": feature_names,
                    "classes": list(self.class_mapping.values())
                }
            }
            yaml.dump(desc, file)
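A hedged call sketch for the store method above; the `method` instance, path, and feature names are hypothetical:

    # Hypothetical usage: writes <ClassName>.pickle with the model content and
    # <ClassName>.yaml with the parameters, feature names, and classes, as above.
    method.store(Path("stored_model/"), feature_names=["feat_1", "feat_2"])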
Example #17
    def _output_specs(file_path=None,
                      result_path=None,
                      definitions: dict = None,
                      instructions: dict = None,
                      output: dict = None) -> Path:
        filepath = ImmuneMLParser._get_full_specs_filepath(
            file_path, result_path)

        result = {
            "definitions": definitions,
            "instructions": instructions,
            "output": output
        }
        result = ImmuneMLParser._paths_to_strings_recursive(result)

        PathBuilder.build(filepath.parent)
        with filepath.open("w") as file:
            yaml.dump(result, file)

        print(
            f"{datetime.datetime.now()}: Full specification is available at {filepath}.\n",
            flush=True)
        return filepath
Example #18
    def test_load_sequence_dataset(self):
        path = EnvironmentSettings.root_path / "test/tmp/mixcr/"
        PathBuilder.build(path)
        self.create_dummy_dataset(path, add_metadata=False)

        params = DefaultParamsLoader.load(
            EnvironmentSettings.default_params_path / "datasets/", "mixcr")
        params["is_repertoire"] = False
        params["paired"] = False
        params["result_path"] = path
        params["path"] = path

        dataset = MiXCRImport.import_dataset(params,
                                             "mixcr_repertoire_dataset")

        seqs = [sequence for sequence in dataset.get_data()]

        self.assertTrue(seqs[0].amino_acid_sequence
                        in ["AVLETSGSRLT", "ALVTDSWGKLQ"])  # OSX/windows
        self.assertTrue(seqs[0].metadata.v_gene in ["TRAV21",
                                                    "TRAV6"])  # OSX/windows

        shutil.rmtree(path)
Example #19
    def test_dataset_generation(self):

        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "cv_split_variant/")
        repertoire_specs = self.build_specs(path)

        specs_filename = path / "specs.yaml"
        with open(specs_filename, "w") as file:
            yaml.dump(repertoire_specs, file)

        app = ImmuneMLApp(specs_filename, path / "result/")
        app.run()

        shutil.rmtree(path)
Example #20
    def _vectorize_examples(self, examples, params: EncoderParams,
                            keys: set) -> Tuple[np.ndarray, list]:

        if self.vectorizer_path is None:
            self.vectorizer_path = params.result_path / "vectorizer_keys.yaml"

        if params.learn_model is True:
            kmer_keys = sorted(list(keys))
            PathBuilder.build(params.result_path)
            with self.vectorizer_path.open("w") as file:
                yaml.dump(kmer_keys, file)
        else:
            with self.vectorizer_path.open("r") as file:
                kmer_keys = yaml.safe_load(file)

        vectorized_examples = [
            np.array([
                np.array(example[key]) if key in example else
                np.zeros(self.k * Util.ATCHLEY_FACTOR_COUNT + 1)
                for key in kmer_keys
            ]) for example in examples
        ]
        return np.array(vectorized_examples, dtype=np.float32), kmer_keys
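For orientation, a sketch of the padding arithmetic above under assumed values (k = 4 is hypothetical; five Atchley factors are assumed for Util.ATCHLEY_FACTOR_COUNT):

    # With k = 4 and Util.ATCHLEY_FACTOR_COUNT = 5, a k-mer key missing from an
    # example is padded with zeros of length 4 * 5 + 1 = 21, so the returned
    # array has shape (n_examples, n_kmer_keys, 21).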
Example #21
    def store(self, path: Path, feature_names=None, details_path: Path = None):
        PathBuilder.build(path)
        file_path = path / f"{self._get_model_filename()}.pickle"
        with file_path.open("wb") as file:
            dill.dump(self.model, file)

        if details_path is None:
            params_path = path / f"{self._get_model_filename()}.yaml"
        else:
            params_path = details_path

        with params_path.open("w") as file:
            desc = {
                **(self.get_params()),
                "feature_names": feature_names,
                "classes": self.model.classes_.tolist(),
                "class_mapping": self.class_mapping,
            }

            if self.label is not None:
                desc["label"] = vars(self.label)

            yaml.dump(desc, file)
Example #22
    def encode(self, dataset, params: EncoderParams) -> RepertoireDataset:
        result_path = params.result_path / "encoding"
        PathBuilder.build(result_path)

        self.export_repertoire_tsv_files(result_path)

        labels = params.label_config.get_labels_by_name()
        metadata_filepath = self.export_metadata_file(dataset, labels,
                                                      result_path)

        encoded_dataset = dataset.clone()
        encoded_dataset.encoded_data = EncodedData(
            examples=None,
            labels=dataset.get_metadata(labels)
            if params.encode_labels else None,
            example_ids=dataset.repertoire_ids,
            encoding=DeepRCEncoder.__name__,
            info={
                "metadata_filepath": metadata_filepath,
                "max_sequence_length": self.max_sequence_length
            })

        return encoded_dataset
Example #23
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
        PathBuilder.build(path)

        reps = [Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")], path=path,
                                                       metadata={"subject_id": "patient1"}),
                Repertoire.build_from_sequence_objects([ReceptorSequence("AAC", identifier="2")], path=path,
                                                       metadata={"subject_id": "patient1"}),
                Repertoire.build_from_sequence_objects([ReceptorSequence("AAC", identifier="3")], path=path,
                                                       metadata={"subject_id": "patient3"})]

        dataset = RepertoireDataset(repertoires=reps)

        dataset2 = SubjectRepertoireCollector().process_dataset(dataset, path / "result")

        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual(3, len(dataset.get_data()))

        values = [2, 1]
        for index, rep in enumerate(dataset2.get_data()):
            self.assertEqual(values[index], len(rep.sequences))

        shutil.rmtree(path)
Example #24
    def test_encode(self):
        path = EnvironmentSettings.tmp_test_path / "deeprc_encoder/"
        PathBuilder.build(path)
        PathBuilder.build(path / "encoded_data/")

        main_dataset, sub_dataset = self.create_datasets(path)

        enc = DeepRCEncoder.build_object(sub_dataset, **{})

        enc.set_context({"dataset": main_dataset})

        encoded = enc.encode(
            sub_dataset,
            EncoderParams(result_path=path / "encoded_data/",
                          label_config=LabelConfiguration(
                              [Label("l1", [0, 1]),
                               Label("l2", [2, 3])]),
                          pool_size=4))

        self.assertListEqual(encoded.encoded_data.example_ids,
                             sub_dataset.get_repertoire_ids())
        self.assertTrue(
            os.path.isfile(encoded.encoded_data.info["metadata_filepath"]))

        metadata_content = pd.read_csv(
            encoded.encoded_data.info["metadata_filepath"], sep="\t")
        self.assertListEqual(list(metadata_content["ID"]),
                             sub_dataset.get_repertoire_ids())

        for repertoire in main_dataset.repertoires:
            rep_path = path / f"encoded_data/encoding/{repertoire.identifier}.tsv"
            self.assertTrue(os.path.isfile(rep_path))
            repertoire_tsv = pd.read_csv(rep_path, sep="\t")
            self.assertListEqual(list(repertoire_tsv["amino_acid"]),
                                 list(repertoire.get_sequence_aas()))

        shutil.rmtree(path)
Example #25
    def __init__(self,
                 train_dataset: Dataset,
                 test_dataset: Dataset,
                 label: Label,
                 metrics: set,
                 optimization_metric: Metric,
                 path: Path,
                 ml_reports: List[MLReport] = None,
                 encoding_reports: list = None,
                 data_reports: list = None,
                 number_of_processes: int = 2,
                 label_config: LabelConfiguration = None,
                 report_context: dict = None,
                 hp_setting: HPSetting = None):
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.label = label
        self.label_config = label_config
        self.method = copy.deepcopy(hp_setting.ml_method)
        self.path = PathBuilder.build(path) if path is not None else None
        self.ml_details_path = path / "ml_details.yaml" if path is not None else None
        self.ml_score_path = path / "ml_score.csv" if path is not None else None
        self.train_predictions_path = path / "train_predictions.csv" if path is not None else None
        self.test_predictions_path = path / "test_predictions.csv" if path is not None else None
        self.report_path = PathBuilder.build(path / "reports") if path is not None else None
        self.number_of_processes = number_of_processes
        assert all(isinstance(metric, Metric) for metric in metrics), \
            "MLProcess: all metrics must be instances of Metric."
        self.metrics = metrics
        self.metrics.add(Metric.BALANCED_ACCURACY)
        self.optimization_metric = optimization_metric
        self.ml_reports = ml_reports if ml_reports is not None else []
        self.encoding_reports = encoding_reports if encoding_reports is not None else []
        self.data_reports = data_reports if data_reports is not None else []
        self.report_context = report_context
        self.hp_setting = copy.deepcopy(hp_setting)
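A construction sketch under stated assumptions; every argument below is a hypothetical placeholder for an object built elsewhere, and hp_setting cannot be None in practice because its ml_method is deep-copied in the constructor:

    # Hypothetical instantiation; train_data, test_data and hp_setting are
    # assumed to exist already.
    process = MLProcess(train_dataset=train_data, test_dataset=test_data,
                        label=Label("CMV"), metrics={Metric.BALANCED_ACCURACY},
                        optimization_metric=Metric.BALANCED_ACCURACY,
                        path=Path("ml_process/"), hp_setting=hp_setting)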
Example #26
    def test_generate(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "kernel_sequence_logo/")
        dataset = RandomDatasetGenerator.generate_receptor_dataset(
            receptor_count=500,
            chain_1_length_probabilities={4: 1},
            chain_2_length_probabilities={4: 1},
            labels={"CMV": {
                True: 0.5,
                False: 0.5
            }},
            path=path / "dataset")
        enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode(
            dataset,
            EncoderParams(path / "result",
                          LabelConfiguration([Label("CMV", [True, False])])))
        cnn = ReceptorCNN(kernel_count=2,
                          kernel_size=[3],
                          positional_channels=3,
                          sequence_type="amino_acid",
                          device="cpu",
                          number_of_threads=4,
                          random_seed=1,
                          learning_rate=0.01,
                          iteration_count=10,
                          l1_weight_decay=0.1,
                          evaluate_at=5,
                          batch_size=100,
                          training_percentage=0.8,
                          l2_weight_decay=0.0)
        cnn.fit(enc_dataset.encoded_data, "CMV")

        report = KernelSequenceLogo(method=cnn, result_path=path / "logos/")
        report.generate_report()

        self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_1.png"))
        self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_2.png"))
        self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_1.png"))
        self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_2.png"))
        self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_1.csv"))
        self.assertTrue(os.path.isfile(path / "logos/alpha_kernel_3_2.csv"))
        self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_1.csv"))
        self.assertTrue(os.path.isfile(path / "logos/beta_kernel_3_2.csv"))
        self.assertTrue(
            os.path.isfile(path / "logos/fully_connected_layer_weights.csv"))
        self.assertTrue(
            os.path.isfile(path / "logos/fully_connected_layer_weights.html"))

        shutil.rmtree(path)
Example #27
    def build_like(cls,
                   repertoire,
                   indices_to_keep: list,
                   result_path: Path,
                   filename_base: str = None):
        if indices_to_keep is not None and len(indices_to_keep) > 0:
            PathBuilder.build(result_path)

            data = repertoire.load_data()
            data = data[indices_to_keep]
            identifier = uuid4().hex
            filename_base = filename_base if filename_base is not None else identifier

            data_filename = result_path / f"{filename_base}.npy"
            np.save(str(data_filename), data)

            metadata_filename = result_path / f"{filename_base}_metadata.yaml"
            shutil.copyfile(repertoire.metadata_filename, metadata_filename)

            new_repertoire = Repertoire(data_filename, metadata_filename,
                                        identifier)
            return new_repertoire
        else:
            return None
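A hedged usage sketch for build_like; the `repertoire` variable and indices are hypothetical:

    # Hypothetical call: copy a repertoire, keeping only its first two entries.
    subset = Repertoire.build_like(repertoire, indices_to_keep=[0, 1],
                                   result_path=Path("subset/"))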
Example #28
    def store(self, path: Path, feature_names=None, details_path: Path = None):
        PathBuilder.build(path)
        torch.save(
            copy.deepcopy(self.logistic_regression).state_dict(),
            str(path / "log_reg.pt"))
        custom_vars = copy.deepcopy(vars(self))

        coefficients_df = pd.DataFrame(
            custom_vars["logistic_regression"].linear.weight.detach().numpy(),
            columns=feature_names)
        coefficients_df["bias"] = custom_vars[
            "logistic_regression"].linear.bias.detach().numpy()
        coefficients_df.to_csv(path / "coefficients.csv", index=False)

        del custom_vars["result_path"]
        del custom_vars["logistic_regression"]
        del custom_vars["label"]

        if self.label:
            custom_vars["label"] = vars(self.label)

        params_path = path / "custom_params.yaml"
        with params_path.open('w') as file:
            yaml.dump(custom_vars, file)
Example #29
    def export(hp_item: HPItem, path: Path) -> Path:
        PathBuilder.build(path)
        preproc_filename = MLExporter._store_preprocessing_sequence(
            hp_item.hp_setting.preproc_sequence, path).name
        encoder_filename = MLExporter._store_encoder(
            hp_item.hp_setting.encoder, path).name

        hp_item.method.store(path, hp_item.method.get_feature_names())
        labels_with_values = {
            hp_item.method.get_label_name(): hp_item.method.get_classes()
        }

        method_config = MLMethodConfiguration(
            labels_with_values=labels_with_values,
            software_used=hp_item.method.get_package_info(),
            encoding_name=hp_item.hp_setting.encoder_name,
            encoding_parameters=hp_item.hp_setting.encoder_params,
            encoding_file=encoder_filename,
            encoding_class=type(hp_item.hp_setting.encoder).__name__,
            ml_method=type(hp_item.method).__name__,
            ml_method_name=hp_item.method.name,
            train_dataset_id=hp_item.train_dataset.identifier,
            train_dataset_name=hp_item.train_dataset.name,
            preprocessing_sequence_name=hp_item.hp_setting.preproc_sequence_name,
            preprocessing_file=os.path.basename(preproc_filename),
            preprocessing_parameters={
                type(seq).__name__: {str(key): str(val) for key, val in vars(seq).items()}
                for seq in hp_item.hp_setting.preproc_sequence
            })

        method_config.store(path / 'ml_config.yaml')

        return path
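A minimal call sketch for the exporter above; `hp_item` and the path are assumptions:

    # Hypothetical usage: stores the trained method, encoder, preprocessing
    # sequence, and ml_config.yaml in the given directory, returning the path.
    export_path = MLExporter.export(hp_item, Path("exported_model/"))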
Example #30
    def test_simulation(self):
        path = EnvironmentSettings.tmp_test_path / "integration_simulation/"
        self.prepare_dataset(path)
        specs_path = self.prepare_specs(path)

        PathBuilder.build(path / "result/")

        app = ImmuneMLApp(specification_path=specs_path,
                          result_path=path / "result/")
        app.run()

        self.assertTrue(os.path.isfile(path / "result/inst1/metadata.csv"))

        metadata_df = pd.read_csv(path / "result/inst1/metadata.csv",
                                  comment=Constants.COMMENT_SIGN)
        self.assertTrue("signal1" in metadata_df.columns)
        self.assertEqual(17, sum(metadata_df["signal1"]))

        self.assertTrue(os.path.isfile(path / "result/index.html"))
        self.assertTrue(
            os.path.isfile(
                path / "result/inst1/exported_dataset/pickle/d1.iml_dataset"))

        shutil.rmtree(path)