def test_generate(self):
        dataset = RepertoireDataset(encoded_data=EncodedData(examples=csr_matrix(np.arange(12).reshape(3, 4)),
                                                             labels={"l1": [1, 0, 1], "l2": [0, 0, 1]},
                                                             example_ids=[0, 1, 2],
                                                             feature_names=["f1", "f2", "f3", "f4"],
                                                             encoding="test_encoding"))

        path = EnvironmentSettings.tmp_test_path / "designmatrixexporterreport/"

        report = DesignMatrixExporter(dataset, path, name='report', file_format='csv')
        report.generate_report()
        self.assertTrue(os.path.isfile(path / "design_matrix.csv"))

        self.assertTrue(os.path.isfile(path / "labels.csv"))
        self.assertTrue(os.path.isfile(path / "encoding_details.yaml"))

        matrix = pd.read_csv(path / "design_matrix.csv", sep=",").values
        self.assertTrue(np.array_equal(matrix, np.arange(12).reshape(3, 4)))

        labels = pd.read_csv(path / "labels.csv", sep=",").values
        self.assertTrue(np.array_equal(labels, np.array([[1, 0], [0, 0], [1, 1]])))

        with open(path / "encoding_details.yaml", "r") as file:
            loaded = yaml.safe_load(file)

        self.assertTrue("feature_names" in loaded)
        self.assertTrue("encoding" in loaded)
        self.assertTrue("example_ids" in loaded)

        self.assertTrue(np.array_equal(loaded["example_ids"], np.array([0, 1, 2])))
        self.assertTrue(np.array_equal(loaded["feature_names"], np.array(["f1", "f2", "f3", "f4"])))
        self.assertEqual("test_encoding", loaded["encoding"])

        shutil.rmtree(path)
    def test_generate(self):
        path = EnvironmentSettings.root_path / "test/tmp/featuredistribution/"
        PathBuilder.build(path)

        dataset = self._create_dummy_encoded_data(path)

        report = FeatureComparison.build_object(**{"dataset": dataset,
                                                     "result_path": path,
                                                     "comparison_label": "patient"})

        self.assertTrue(report.check_prerequisites())

        result = report.generate_report()

        self.assertIsInstance(result, ReportResult)

        self.assertEqual(result.output_figures[0].path, path / "feature_comparison.html")
        self.assertEqual(result.output_tables[0].path, path / "feature_values.csv")

        content = pd.read_csv(path / "feature_values.csv")
        self.assertListEqual(list(content.columns),
                             ["patient", "example_id", "sequence", "feature", "value"])

        # the report should build successfully, but check_prerequisites should return False when the data is not encoded
        report = FeatureDistribution.build_object(**{"dataset": RepertoireDataset(),
                                                     "result_path": path})

        self.assertFalse(report.check_prerequisites())

        shutil.rmtree(path)
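
    # A minimal sketch of the report workflow exercised above, assuming an already
    # encoded RepertoireDataset; it reuses only build_object, check_prerequisites
    # and generate_report as shown in test_generate. The helper name is hypothetical.
    def _run_feature_comparison(self, dataset, path):
        report = FeatureComparison.build_object(**{"dataset": dataset,
                                                   "result_path": path,
                                                   "comparison_label": "patient"})
        if not report.check_prerequisites():
            return None
        result = report.generate_report()
        # ReportResult lists the produced files under output_figures and output_tables
        return result.output_figures[0].path, result.output_tables[0].path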
    def _create_dummy_data(self, path, dataset_type):
        PathBuilder.build(path)
        dataset = None

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1" for i in range(5)],
            j_genes=["J1-1" for i in range(5)],
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                "custom_2":
                [f"CUST-A" for i in range(3)] + [f"CUST-B" for i in range(2)]
            },
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            receptordataset_filename = path / "receptors.pkl"
            with open(receptordataset_filename, "wb") as file:
                pickle.dump(test_repertoire.receptors, file)

            dataset = ReceptorDataset(filenames=[receptordataset_filename],
                                      identifier="receptor_dataset")

        elif dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            dataset = RepertoireDataset(repertoires=[test_repertoire])

        return dataset
    def test_exporter(self):
        dataset = RepertoireDataset(encoded_data=EncodedData(examples=csr_matrix(np.arange(12).reshape(3, 4)),
                                                             labels={"l1": [1, 0, 1], "l2": [0, 0, 1]},
                                                             example_ids=[0, 1, 2],
                                                             feature_names=["f1", "f2", "f3", "f4"],
                                                             encoding="test_encoding"))

        path = EnvironmentSettings.tmp_test_path / "designmatrixexporterreport/"

        report = DesignMatrixExporter(dataset=dataset, result_path=path,
                                      name="design_matrix", file_format='csv')
        report.generate_report()
        self.assertTrue(os.path.isfile(path / "design_matrix.csv"))
        report.file_format = 'csv.zip'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.csv.zip"))

        report.file_format = 'npy'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.npy"))
        report.file_format = 'npy.zip'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.npy.zip"))

        report.file_format = 'hdf5'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.hdf5"))
        report.file_format = 'hdf5.zip'
        report._export_matrix()
        self.assertTrue(os.path.isfile(path / "design_matrix.hdf5.zip"))
        shutil.rmtree(path)

        with self.assertRaises(AssertionError):
            DesignMatrixExporter.build_object(**{'file_format': "random"})
    def _create_dummy_data(self, path, dataset_type):
        PathBuilder.build(path)
        dataset = None

        test_repertoire = Repertoire.build(
            sequence_aas=[
                "DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"
            ],
            v_genes=["V1-1" for i in range(5)],
            j_genes=["J1-1" for i in range(5)],
            chains=[
                Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA
            ],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(5)],
                "custom_2":
                [f"CUST-A" for i in range(3)] + [f"CUST-B" for i in range(2)]
            },
            cell_ids=["1", "1", "1", "2", '2'],
            path=path)

        if dataset_type == "receptor":

            dataset = ReceptorDataset.build_from_objects(
                test_repertoire.receptors, 100, path, name="receptor_dataset")
            dataset.identifier = 'receptor_dataset'

        elif dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            dataset = RepertoireDataset(repertoires=[test_repertoire])

        return dataset
    def test_get_normalized_sequence_lengths(self):
        path = EnvironmentSettings.root_path / "test/tmp/datareports/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(sequence_objects=[
            ReceptorSequence(amino_acid_sequence="AAA", identifier="1"),
            ReceptorSequence(amino_acid_sequence="AAAA", identifier="2"),
            ReceptorSequence(amino_acid_sequence="AAAAA", identifier="3"),
            ReceptorSequence(amino_acid_sequence="AAA", identifier="4")
        ],
                                                      path=path,
                                                      metadata={})
        rep2 = Repertoire.build_from_sequence_objects(sequence_objects=[
            ReceptorSequence(amino_acid_sequence="AAA", identifier="5"),
            ReceptorSequence(amino_acid_sequence="AAAA", identifier="6"),
            ReceptorSequence(amino_acid_sequence="AAAA", identifier="7"),
            ReceptorSequence(amino_acid_sequence="AAA", identifier="8")
        ],
                                                      path=path,
                                                      metadata={})

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        sld = SequenceLengthDistribution(dataset, 1, path)

        result = sld.generate_report()
        self.assertTrue(os.path.isfile(result.output_figures[0].path))

        shutil.rmtree(path)
    def make_random_dataset(self, path):
        alphabet = EnvironmentSettings.get_sequence_alphabet()
        sequences = [["".join([rn.choice(alphabet) for i in range(20)]) for i in range(100)] for i in range(40)]

        repertoires, metadata = RepertoireBuilder.build(sequences, path, subject_ids=[i % 2 for i in range(len(sequences))])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        PickleExporter.export(dataset, path)
    def test_repertoire_export(self):
        path = EnvironmentSettings.tmp_test_path / "airr_exporter_repertoire/"
        PathBuilder.build(path)

        repertoire, metadata_path = self.create_dummy_repertoire(path)
        dataset = RepertoireDataset(repertoires=[repertoire], metadata_file=metadata_path)

        path_exported = path / "exported"
        AIRRExporter.export(dataset, path_exported)

        resulting_data = pd.read_csv(path_exported / f"repertoires/{repertoire.identifier}.tsv", sep="\t")

        self.assertListEqual(list(resulting_data["sequence_id"]), ["receptor_1", "receptor_2"])
        self.assertListEqual(list(resulting_data["cdr3"]), ["GCTGCTGCT", "GGTGGTGGT"])
        self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAA", "GGG"])
        self.assertListEqual(list(resulting_data["v_call"]), ["TRBV1", "TRAV2*01"])
        self.assertListEqual(list(resulting_data["j_call"]), ["TRBJ1", "TRAJ2"])
        self.assertListEqual(list(resulting_data["d_call"]), ["TRBD1", "TRAD2"])
        self.assertListEqual(list(resulting_data["locus"]), ["TRB", "TRA"])
        self.assertListEqual(list(resulting_data["duplicate_count"]), [5, 15])
        self.assertListEqual(list(resulting_data["custom_test"]), ["cust1", "cust2"])
        self.assertListEqual(list(resulting_data["productive"]), ['T', 'F'])
        self.assertListEqual(list(resulting_data["stop_codon"]), ['F', 'F'])

        shutil.rmtree(path)
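
    # A minimal sketch, assuming a dataset built as in test_repertoire_export:
    # AIRRExporter writes one tab-separated file per repertoire under
    # <path>/repertoires/, which pandas can load back for inspection.
    def _export_and_reload(self, dataset, path):
        AIRRExporter.export(dataset, path)
        repertoire = dataset.repertoires[0]
        return pd.read_csv(path / f"repertoires/{repertoire.identifier}.tsv", sep="\t")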
    def create_dataset(self):
        path = Path(
            os.path.relpath(EnvironmentSettings.root_path /
                            "test/tmp/immunemlapp/initial_dataset"))
        PathBuilder.build(path)

        repertoire_count = 30
        repertoires, metadata = RepertoireBuilder.build(
            [["AA", "AAAA", "AAAA", "AAA"] for i in range(repertoire_count)],
            path, {
                "CD": [
                    'yes' if i % 2 == 0 else 'no'
                    for i in range(repertoire_count)
                ],
                "CMV": [
                    True if i % 2 == 1 else False
                    for i in range(repertoire_count)
                ]
            }, [[{
                "chain": "A" if i % 2 == 0 else "B",
                "count": random.randint(2, 5)
            } for i in range(4)] for j in range(repertoire_count)])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    labels={
                                        "CD": [True, False],
                                        "CMV": [True, False]
                                    },
                                    name="d1")
        PickleExporter.export(dataset, path)

        return path / "d1.iml_dataset"
    def test_create_model(self):
        test_path = EnvironmentSettings.root_path / "test/tmp/w2v_test_tmp/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA")
        sequence2 = ReceptorSequence("CASSCCC")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        model_creator = KmerPairModelCreator()
        model = model_creator.create_model(dataset=dataset, k=2, vector_size=16,
                                           batch_size=1, model_path=test_path / "model.model")

        self.assertTrue(isinstance(model, Word2Vec))
        self.assertTrue("CA" in model.wv.vocab)
        self.assertEqual(400, len(model.wv.vocab))

        shutil.rmtree(test_path)
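
    # A short sketch of querying the trained model, assuming the gensim 3.x API the
    # test above relies on (model.wv.vocab): with k=2 over the 20-letter amino acid
    # alphabet the vocabulary holds 400 k-mers, each mapped to a vector of length
    # vector_size. The helper name is hypothetical.
    def _kmer_vector(self, model, kmer="CA"):
        if kmer in model.wv.vocab:
            return model.wv[kmer]  # numpy array of length vector_size (16 above)
        return None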
    def _create_dummy_encoded_data(self, path):
        n_subjects = 50
        n_features = 30

        kmers = [''.join(random.choices(string.ascii_uppercase, k=3)) for i in range(n_features)]

        encoded_data = {
            'examples': sparse.csr_matrix(
                np.random.normal(50, 10, n_subjects * n_features).reshape((n_subjects, n_features))),
            'example_ids': [''.join(random.choices(string.ascii_uppercase, k=4)) for i in range(n_subjects)],
            'labels': {},
            'feature_names': kmers,
            'feature_annotations': pd.DataFrame({
                "sequence": kmers
            }),
            'encoding': "random"
        }

        metadata_filepath = path / "metadata.csv"

        metadata = pd.DataFrame({"patient": np.array([i % 2 == 0 for i in range(n_subjects)])})
        metadata.to_csv(metadata_filepath, index=False)

        dataset = RepertoireDataset(encoded_data=EncodedData(**encoded_data), metadata_file=metadata_filepath)

        return dataset
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
        PathBuilder.build(path)

        reps = [
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAA", identifier="1")],
                path=path,
                metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAC", identifier="2")],
                path=path,
                metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAC", identifier="3")],
                path=path,
                metadata={"subject_id": "patient3"})
        ]

        dataset = RepertoireDataset(repertoires=reps)

        dataset2 = SubjectRepertoireCollector.process(
            dataset, {"result_path": path / "result"})

        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual(3, len(dataset.get_data()))

        values = [2, 1]
        for index, rep in enumerate(dataset2.get_data()):
            self.assertEqual(values[index], len(rep.sequences))

        shutil.rmtree(path)
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        encoded_dataset = RepertoireDataset(
            repertoires=dataset.repertoires,
            labels=dataset.labels,
            metadata_file=dataset.metadata_file)

        feature_annotations = self._get_feature_info()
        encoded_repertoires, labels, example_ids = self._encode_repertoires(
            dataset, params)

        encoded_dataset.add_encoded_data(
            EncodedData(
                # examples contains a np.ndarray with counts
                examples=encoded_repertoires,
                # example_ids contains a list of repertoire identifiers
                example_ids=example_ids,
                # feature_names contains a list of reference receptor identifiers
                feature_names=[
                    "{receptor_id}.{chain}".format(
                        receptor_id=row["receptor_id"], chain=row["chain"])
                    for index, row in feature_annotations.iterrows()
                ],
                # feature_annotations contains a PD dataframe with sequence and VDJ gene usage per reference receptor
                feature_annotations=feature_annotations,
                labels=labels,
                encoding=MatchedReceptorsEncoder.__name__))

        return encoded_dataset
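
    # A hedged sketch of reading back the EncodedData filled in above; it touches only
    # the fields _encode_new_dataset sets (examples, example_ids, feature_names, encoding).
    @staticmethod
    def _describe_encoding(encoded_dataset):
        encoded = encoded_dataset.encoded_data
        # examples holds one row per repertoire and one "<receptor_id>.<chain>" column
        return {"n_examples": len(encoded.example_ids),
                "n_features": len(encoded.feature_names),
                "encoding": encoded.encoding}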
    def _create_report(self, path):
        report = ConfounderAnalysis.build_object(
            metadata_labels=["age", "HLA"], name='test')

        report.ml_details_path = path / "ml_details.yaml"
        report.label = Label("disease")
        report.result_path = path
        encoder = KmerFrequencyEncoder.build_object(
            RepertoireDataset(), **{
                "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                "reads": ReadsType.UNIQUE.name,
                "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                "k": 3,
                "sequence_type": SequenceType.AMINO_ACID.name
            })
        report.train_dataset = self._encode_dataset(
            encoder, self._make_dataset(path / "train", size=100), path)
        report.test_dataset = self._encode_dataset(
            encoder, self._make_dataset(path / "test", size=40), path, learn_model=False)
        report.method = self._create_dummy_lr_model(
            path, report.train_dataset.encoded_data, Label("disease"))

        return report
    def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        labels = params.label_config.get_labels_by_name()

        assert len(labels) == 1, \
            "SequenceAbundanceEncoder: this encoding works only for single label."

        examples = self._calculate_sequence_abundance(dataset,
                                                      self.comparison_data,
                                                      labels[0], params)

        encoded_data = EncodedData(
            examples,
            dataset.get_metadata([labels[0]]) if params.encode_labels else None,
            dataset.get_repertoire_ids(),
            [SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
             SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE],
            encoding=SequenceAbundanceEncoder.__name__,
            info={'relevant_sequence_path': self.relevant_sequence_csv_path})

        encoded_dataset = RepertoireDataset(labels=dataset.labels,
                                            encoded_data=encoded_data,
                                            repertoires=dataset.repertoires)

        return encoded_dataset
    def _construct_test_repertoiredataset(self, path, positional):
        receptors1 = ReceptorSequenceList()
        receptors2 = ReceptorSequenceList()

        if positional:
            for seq in [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                        ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]:
                receptors1.append(seq)
            receptors2.append(ReceptorSequence("TTTTTTTTTTTTT", identifier="1"))
        else:
            for seq in [ReceptorSequence("AAAA", identifier="1"), ReceptorSequence("ATA", identifier="2"),
                        ReceptorSequence("ATA", identifier="3")]:
                receptors1.append(seq)
            for seq in [ReceptorSequence("ATA", identifier="1"), ReceptorSequence("TAA", identifier="2")]:
                receptors2.append(seq)

        rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                      metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)

        rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                      metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        return dataset, lc
    def create_dummy_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path, labels={"label1": ["val1", "val2"], "label2": ["val1", "val2"]})

        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
        dataset.name = "my_dataset"
        PickleExporter.export(dataset, path)

        return f"{dataset.name}.iml_dataset"
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        encoded_data = self._encode_data(dataset, params)

        encoded_dataset = RepertoireDataset(repertoires=dataset.repertoires,
                                            encoded_data=encoded_data,
                                            labels=dataset.labels,
                                            metadata_file=dataset.metadata_file)

        return encoded_dataset
    def test_run(self):

        path = EnvironmentSettings.root_path / "test/tmp/smmodel/"
        PathBuilder.build(path)
        repertoires, metadata = RepertoireBuilder.build(
            [["AAA", "CCC"], ["TTTT"]] * 16, path,
            {"default": [1, 2] * 16})
        dataset = RepertoireDataset(repertoires=repertoires,
                                    labels={"default": [1, 2]},
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("default", [1, 2])

        encoder_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
        hp_settings = [
            HPSetting(Word2VecEncoder.build_object(dataset, **encoder_params), encoder_params,
                      LogisticRegression(),
                      {"model_selection_cv": False, "model_selection_n_folds": -1}, [])
        ]

        split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                              ReportConfig())
        split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                             ReportConfig())

        instruction = TrainMLModelInstruction(
            dataset, GridSearch(hp_settings), hp_settings,
            split_config_assessment, split_config_selection,
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config,
            path)
        semantic_model = SemanticModel([instruction], path)

        semantic_model.run()

        shutil.rmtree(path)
    def test_run(self):
        path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"],
             ["AA"], ["CC"], ["AA"], ["CC"]], path)[0])
        dataset.encoded_data = EncodedData(
            examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3],
                               [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
            labels={
                "l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3],
                "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
            })

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 3])

        label = Label(name='l1', values=[1, 3])

        method1 = LogisticRegression()
        method1.fit(dataset.encoded_data, label=label)

        res = MLMethodAssessment.run(
            MLMethodAssessmentParams(
                dataset=dataset,
                method=method1,
                metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
                optimization_metric=Metric.LOG_LOSS,
                predictions_path=path / "predictions.csv",
                label=label,
                ml_score_path=path / "ml_score.csv",
                split_index=1,
                path=path))

        self.assertTrue(isinstance(res, dict))
        self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

        self.assertTrue(os.path.isfile(path / "ml_score.csv"))

        df = pd.read_csv(path / "ml_score.csv")
        self.assertTrue(df.shape[0] == 1)

        df = pd.read_csv(path / "predictions.csv")
        self.assertEqual(12, df.shape[0])

        shutil.rmtree(path)
    def test_match(self):
        path = EnvironmentSettings.root_path / "test/tmp/seqmatch/"
        PathBuilder.build(path)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAAAAA",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="3"),
                ReceptorSequence(amino_acid_sequence="CCCCCC",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="4"),
                ReceptorSequence(amino_acid_sequence="AAAACC",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="5"),
                ReceptorSequence(amino_acid_sequence="TADQVF",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J3"),
                                 identifier="6")
            ],
            metadata={"CD": True},
            path=path)

        dataset = RepertoireDataset(repertoires=[repertoire])
        sequences = [
            ReceptorSequence("AAAACA",
                             metadata=SequenceMetadata(chain="A",
                                                       v_gene="V1",
                                                       j_gene="J2"),
                             identifier="1"),
            ReceptorSequence("TADQV",
                             metadata=SequenceMetadata(chain="A",
                                                       v_gene="V1",
                                                       j_gene="J3"),
                             identifier="2")
        ]

        matcher = SequenceMatcher()
        result = matcher.match(dataset, sequences, 2,
                               SequenceMatchingSummaryType.PERCENTAGE)

        self.assertTrue("repertoires" in result)
        self.assertEqual(1, len(result["repertoires"][0]["sequences"][3]["matching_sequences"]))
        self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
        self.assertEqual(1, len(result["repertoires"]))

        shutil.rmtree(path)
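
    # A minimal sketch of walking the nested result asserted above: the matcher returns
    # a dict with a "repertoires" list, each entry carrying its "metadata" plus a
    # per-sequence "matching_sequences" list. The helper name is hypothetical.
    def _count_matches(self, result):
        return [sum(len(seq["matching_sequences"]) for seq in repertoire["sequences"])
                for repertoire in result["repertoires"]]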
    def create_dataset(self, path: str) -> RepertoireDataset:
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"], ["A", "B"], ["B", "C"],
             ["D"], ["E", "F"]], path, {
                "l1": [1, 0, 1, 0, 1, 0, 1, 0],
                "l2": [2, 3, 2, 3, 2, 3, 3, 3]
            })
        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)
        return dataset
    def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
        labels = params.label_config.get_labels_by_name()

        assert len(labels) == 1, f"SequenceCountEncoder: this encoding works only for single label, got {labels} instead."

        encoded_data = self._encode_sequence_count(dataset, self.comparison_data, labels[0], params)

        encoded_dataset = RepertoireDataset(labels=dataset.labels, encoded_data=encoded_data, repertoires=dataset.repertoires)

        return encoded_dataset
    def _implant_signals_in_repertoires(
            simulation_state: SimulationState = None) -> Dataset:

        PathBuilder.build(simulation_state.result_path / "repertoires")
        processed_repertoires = SignalImplanter._implant_signals(
            simulation_state, SignalImplanter._process_repertoire)
        processed_dataset = RepertoireDataset(
            repertoires=processed_repertoires,
            labels={**(simulation_state.dataset.labels if simulation_state.dataset.labels is not None else {}),
                    **{signal.id: [True, False] for signal in simulation_state.signals}},
            name=simulation_state.dataset.name,
            metadata_file=SignalImplanter._create_metadata_file(processed_repertoires, simulation_state))
        return processed_dataset
    def _build_test_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build(
            [["GGG", "III", "LLL", "MMM"],
             ["DDD", "EEE", "FFF", "III", "LLL", "MMM"], ["CCC", "FFF", "MMM"],
             ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
            labels={"l1": [True, True, False, False]},
            path=path)

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata,
                                    identifier="1")

        return dataset
    def test_get_metadata_fields(self):

        path = EnvironmentSettings.tmp_test_path / "repertoire_dataset/"
        PathBuilder.build(path)

        repertoires, metadata = RepertoireBuilder.build([["AA"], ["BB"]], path, {"l1": [1, 2], "hla": ["A", "B"]}, subject_ids=["d1", "d2"])
        dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

        self.assertTrue("l1" in dataset.get_metadata_fields())
        self.assertTrue("hla" in dataset.get_metadata_fields())
        self.assertTrue("subject_id" in dataset.get_metadata_fields())

        shutil.rmtree(path)
    def create_datasets(self, path: Path):
        repertoires, metadata = RepertoireBuilder.build(
            [["A", "B"], ["B", "C"], ["D"], ["E", "F"]], path, {
                "l1": [1, 0, 1, 0],
                "l2": [2, 3, 2, 3]
            })

        main_dataset = RepertoireDataset(repertoires=repertoires,
                                         metadata_file=metadata)
        sub_dataset = main_dataset.make_subset([0, 1],
                                               path=path,
                                               dataset_type="subset")
        return main_dataset, sub_dataset
    def create_dummy_data(self, path):
        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "yes", "no"]
        }

        base_metadata = {
            "v_gene": "TRBV1",
            "j_gene": "TRBJ1",
            "chain": Chain.BETA.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAAA"], ["SSSS"], ["SSSS", "CCCC"]],
            path=path,
            labels=labels,
            seq_metadata=[[{**base_metadata, "count": 10}],
                          [{**base_metadata, "count": 10}],
                          [{**base_metadata, "count": 5}, {**base_metadata, "count": 5}]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
100	TRB	AAAA	TRBV1	TRBJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
200	TRB	SSSS	TRBV1	TRBJ1	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0"""

        with open(path / "refs.tsv", "w") as file:
            file.writelines(file_content)

        reference_sequences = {
            "params": {
                "path": path / "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        return dataset, label_config, reference_sequences, labels
    def create_dataset(self, path):
        repertoires, metadata = RepertoireBuilder.build(
            [["AAA"], ["AAAC"], ["ACA"], ["CAAA"], ["AAAC"], ["AAA"]], path, {
                "l1": [1, 1, 1, 0, 0, 0],
                "l2": [2, 3, 2, 3, 2, 3]
            })

        dataset = RepertoireDataset(repertoires=repertoires,
                                    labels={
                                        "l1": [0, 1],
                                        "l2": [2, 3]
                                    },
                                    metadata_file=metadata)
        return dataset
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/clones_per_repertoire_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["ACF", "ACF", "ACF"],
                                                                       ["ACF", "ACF"],
                                                                       ["ACF", "ACF", "ACF", "ACF"]], path)[0])

        dataset1 = ClonesPerRepertoireFilter(**{"lower_limit": 3, "result_path": path}).process_dataset(dataset, path)
        self.assertEqual(2, dataset1.get_example_count())

        dataset2 = ClonesPerRepertoireFilter(**{"upper_limit": 2, "result_path": path}).process_dataset(dataset, path)
        self.assertEqual(1, dataset2.get_example_count())

        self.assertRaises(AssertionError, ClonesPerRepertoireFilter(**{"lower_limit": 10, "result_path": path}).process_dataset, dataset, path)

        shutil.rmtree(path)
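
    # A hedged sketch of combining both bounds by chaining two filters, staying within
    # the API shown above (process_dataset returns a new dataset); the limits are
    # illustrative values and the helper name is hypothetical.
    def _keep_mid_sized_repertoires(self, dataset, path):
        filtered = ClonesPerRepertoireFilter(**{"lower_limit": 2, "result_path": path}).process_dataset(dataset, path)
        return ClonesPerRepertoireFilter(**{"upper_limit": 3, "result_path": path}).process_dataset(filtered, path)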