def test_generate(self):
        """Export a small encoded dataset as CSV and verify every file the report writes.

        The dataset holds a 3x4 sparse matrix (values 0..11), two labels ("l1", "l2"),
        three example ids and four feature names. After ``generate_report()`` the test
        checks that ``design_matrix.csv``, ``labels.csv`` and ``encoding_details.yaml``
        exist in the result directory and that their contents match the input data.
        """
        dataset = RepertoireDataset(encoded_data=EncodedData(examples=csr_matrix(np.arange(12).reshape(3, 4)),
                                                             labels={"l1": [1, 0, 1], "l2": [0, 0, 1]},
                                                             example_ids=[0, 1, 2],
                                                             feature_names=["f1", "f2", "f3", "f4"],
                                                             encoding="test_encoding"))

        path = EnvironmentSettings.tmp_test_path / "designmatrrixexporterreport/"
        # Register the removal up front so the directory is deleted even when an
        # assertion below fails; otherwise stale files would leak into other tests
        # that use the same directory. ignore_errors covers the case where the
        # report failed before creating anything.
        self.addCleanup(shutil.rmtree, path, ignore_errors=True)

        report = DesignMatrixExporter(dataset, path, name='report', file_format='csv')
        report.generate_report()

        # All three output files must be present before their contents are inspected.
        for file_name in ("design_matrix.csv", "labels.csv", "encoding_details.yaml"):
            self.assertTrue(os.path.isfile(path / file_name), file_name)

        # The design matrix must round-trip to the dense form of the input matrix.
        matrix = pd.read_csv(path / "design_matrix.csv", sep=",").values
        self.assertTrue(np.array_equal(matrix, np.arange(12).reshape(3, 4)))

        # One row per example, one column per label, in the order l1, l2.
        labels = pd.read_csv(path / "labels.csv", sep=",").values
        self.assertTrue(np.array_equal(labels, np.array([[1, 0], [0, 0], [1, 1]])))

        with open(path / "encoding_details.yaml", "r") as file:
            loaded = yaml.safe_load(file)

        for key in ("feature_names", "encoding", "example_ids"):
            self.assertIn(key, loaded)

        self.assertTrue(np.array_equal(loaded["example_ids"], np.array([0, 1, 2])))
        self.assertTrue(np.array_equal(loaded["feature_names"], np.array(["f1", "f2", "f3", "f4"])))
        self.assertEqual("test_encoding", loaded["encoding"])
Beispiel #2
0
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Imports a MiXCR repertoire dataset, encodes it with KmerFrequencyEncoder and exports the design matrix as csv

    Arguments:
        path_to_dataset_directory (str): directory holding the repertoire files with .tsv extension in MiXCR format
        result_path (str): directory where the results are stored; the csv export is written to its "csv_exported" subdirectory
        metadata_path(str): metadata csv file; if None, a file with columns "filename", "subject_id", "disease" is generated
            by generate_random_metadata, otherwise the given file must include filename and subject_id columns and an
            arbitrary disease column
    Returns:
         encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    dataset_dir = Path(path_to_dataset_directory)
    output_dir = Path(result_path)

    # fall back to generated metadata when the caller did not supply a file
    metadata_file = generate_random_metadata(dataset_dir, output_dir) if metadata_path is None else Path(metadata_path)

    import_params = {
        "is_repertoire": True,
        "path": dataset_dir,
        "metadata_file": metadata_file,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": output_dir,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }
    dataset = MiXCRImport().import_dataset(import_params, "mixcr_dataset")

    # first label of the dataset is the one used for ML prediction - by default: "disease" with values True/False
    label_name = list(dataset.labels.keys())[0]

    encoder = KmerFrequencyEncoder.build_object(
        dataset,
        normalization_type="relative_frequency",  # relative frequency of k-mers in each repertoire
        reads="unique",  # every sequence counts once, clonal counts are not used
        k=2,  # k-mer length
        sequence_type="amino_acid",
        sequence_encoding="continuous_kmer",  # overlapping k-mers from each sequence
    )
    label_config = LabelConfiguration([Label(label_name, dataset.labels[label_name])])
    encoder_params = EncoderParams(result_path=output_dir, label_config=label_config)

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder, encoder_params))

    DesignMatrixExporter(dataset=encoded_dataset,
                         result_path=output_dir / "csv_exported",
                         file_format='csv').generate_report()

    return encoded_dataset
Beispiel #3
0
    def test_run(self):
        """Run an exploratory analysis with a DesignMatrixExporter report and check its CSV output.

        A dataset is created under a temporary directory, a one-row VDJdb reference
        file is written next to it, and a single analysis unit ("named_analysis_4")
        combining MatchedSequencesRepertoireEncoder with a CSV DesignMatrixExporter is
        executed through ExploratoryAnalysisInstruction. The test passes when the
        exported ``design_matrix.csv`` exists in the expected results subdirectory.
        """
        path = EnvironmentSettings.tmp_test_path / "explanalysisprocintegration/"
        PathBuilder.build(path)
        # Guarantee removal of the temporary directory even if the instruction
        # raises or the assertion fails, so no stale results outlive the test.
        self.addCleanup(shutil.rmtree, path, ignore_errors=True)
        os.environ["cache_type"] = "test"

        dataset = self.create_dataset(path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [0, 1])
        label_config.add_label("l2", [2, 3])

        # Tab-separated reference content (header + one row); kept verbatim.
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
        100a	TRA	AAAC	TRAV12	TRAJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV
        """

        # write() is the right call for a single string; writelines() would
        # iterate it character by character.
        with open(path / "refs.tsv", "w") as file:
            file.write(file_content)

        refs = {
            "params": {
                "path": path / "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        units = {
            "named_analysis_4":
            ExploratoryAnalysisUnit(
                dataset=dataset,
                report=DesignMatrixExporter(name='report', file_format='csv'),
                label_config=label_config,
                encoder=MatchedSequencesRepertoireEncoder.build_object(
                    dataset, **{
                        "max_edit_distance": 1,
                        "reference": refs
                    }))
        }

        process = ExploratoryAnalysisInstruction(units, name="exp")
        process.run(path / "results/")

        self.assertTrue(
            os.path.isfile(
                path /
                "results/exp/analysis_named_analysis_4/report/design_matrix.csv"
            ))
    def test_exporter(self):
        """Check that the design matrix is written in every supported file format.

        The report is generated once as CSV; afterwards ``file_format`` is switched on
        the same report object and ``_export_matrix()`` is called again for each of the
        remaining formats, asserting after every export that
        ``design_matrix.<format>`` exists. Finally, building the exporter with an
        unknown file format must raise ``AssertionError``.
        """
        dataset = RepertoireDataset(encoded_data=EncodedData(examples=csr_matrix(np.arange(12).reshape(3, 4)),
                                                             labels={"l1": [1, 0, 1], "l2": [0, 0, 1]},
                                                             example_ids=[0, 1, 2],
                                                             feature_names=["f1", "f2", "f3", "f4"],
                                                             encoding="test_encoding"))

        path = EnvironmentSettings.tmp_test_path / "designmatrrixexporterreport/"
        # Remove the output directory no matter how the test ends; leftover files
        # in this directory could make the existence checks pass spuriously in a
        # later run or in another test that uses the same directory.
        self.addCleanup(shutil.rmtree, path, ignore_errors=True)

        report = DesignMatrixExporter(dataset=dataset, result_path=path,
                                      name="design_matrix", file_format='csv')
        report.generate_report()
        self.assertTrue(os.path.isfile(path / "design_matrix.csv"))

        # Re-export the matrix in each remaining format, in the original order;
        # the loop stops at the first failing format.
        for file_format in ('csv.zip', 'npy', 'npy.zip', 'hdf5', 'hdf5.zip'):
            report.file_format = file_format
            report._export_matrix()
            self.assertTrue(os.path.isfile(path / ("design_matrix." + file_format)), file_format)

        # An unsupported format has to be rejected when the object is built.
        with self.assertRaises(AssertionError):
            DesignMatrixExporter.build_object(**{'file_format': "random"})