Example #1
    def test(self):

        path = EnvironmentSettings.tmp_test_path + "integration_sequence_classification/"
        dataset = self.create_dataset(path)

        os.environ["cache_type"] = "test"
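        # 3-mer frequency encoding: overlapping (continuous) k-mers, counted over unique reads and normalized to relative frequencies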
        encoder_params = {
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "k": 3
        }

        hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                               encoder_params=encoder_params,
                               ml_method=LogisticRegression(),
                               ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                               preproc_sequence=[])

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

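        # Nested evaluation: a single random 50/50 split at both the assessment and selection level, optimizing balanced accuracy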
        instruction = TrainMLModelInstruction(
            dataset, GridSearch([hp_setting]), [hp_setting],
            SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
            SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)

        result = instruction.run(result_path=path)

        shutil.rmtree(path)
Example #2
    def create_dataset(self, path, dataset_size: int = 50):

        sequences = []

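        # Alternate between two fixed sequences so that label l1 (1 or 2) is fully determined by the sequence content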
        for i in range(dataset_size):
            if i % 2 == 0:
                sequences.append(
                    ReceptorSequence(
                        amino_acid_sequence="AAACCC",
                        identifier=str(i),
                        metadata=SequenceMetadata(custom_params={"l1": 1})))
            else:
                sequences.append(
                    ReceptorSequence(
                        amino_acid_sequence="ACACAC",
                        identifier=str(i),
                        metadata=SequenceMetadata(custom_params={"l1": 2})))

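        # The dataset only references the pickled sequence file; the sequences themselves stay on disk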
        PathBuilder.build(path)
        filename = "{}sequences.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = SequenceDataset(params={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")
        return dataset
Example #3
    def _create_label_config(self, instruction: dict, dataset: Dataset,
                             instruction_key: str) -> LabelConfiguration:
        labels = instruction["labels"]

        self._check_label_format(labels, instruction_key)

        label_config = LabelConfiguration()
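        # Labels may be listed as plain names or as {name: {'positive_class': ...}}; values come from dataset params or metadata when available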
        for label in labels:
            label_name = label if isinstance(label, str) else list(
                label.keys())[0]
            positive_class = label[label_name]['positive_class'] if isinstance(
                label, dict) else None
            if dataset.params is not None and label_name in dataset.params:
                label_values = dataset.params[label_name]
            elif hasattr(dataset, "get_metadata"):
                label_values = list(
                    set(dataset.get_metadata([label_name])[label_name]))
            else:
                label_values = []
                warnings.warn(
                    f"{TrainMLModelParser.__name__}: for instruction {instruction_key}, label values could not be recovered for label "
                    f"{label}, using empty list instead.  This could cause problems with some encodings. "
                    f"If that might be the case, check if the dataset {dataset.name} has been properly loaded."
                )

            label_config.add_label(label_name,
                                   label_values,
                                   positive_class=positive_class)
        return label_config
Example #4
    def _construct_test_repertoiredataset(self, path, positional):
        receptors1 = ReceptorSequenceList()
        receptors2 = ReceptorSequenceList()

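        # positional=True yields long identical sequences (presumably for position-dependent encodings); otherwise short, varied ones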
        if positional:
            for seq in [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                        ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]:
                receptors1.append(seq)
            for seq in [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]:
                receptors2.append(seq)
        else:
            for seq in [ReceptorSequence("AAAA", identifier="1"),
                        ReceptorSequence("ATA", identifier="2"),
                        ReceptorSequence("ATA", identifier="3")]:
                receptors1.append(seq)
            for seq in [ReceptorSequence("ATA", identifier="1"),
                        ReceptorSequence("TAA", identifier="2")]:
                receptors2.append(seq)

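        # Build two labelled repertoires and store them under path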
        rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                      metadata={
                                                          "l1": 1,
                                                          "l2": 2,
                                                          "subject_id": "1"
                                                      },
                                                      path=path)

        rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                      metadata={
                                                          "l1": 0,
                                                          "l2": 3,
                                                          "subject_id": "2"
                                                      },
                                                      path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        return dataset, lc
Example #5
    def construct_test_flatten_dataset(self, path):
        sequences = [ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1", metadata=SequenceMetadata(custom_params={"l1": 1})),
                     ReceptorSequence(amino_acid_sequence="ATATAT", identifier="2", metadata=SequenceMetadata(custom_params={"l1": 2}))]

        PathBuilder.build(path)
        filename = "{}sequences.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        return SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
Example #6
    def create_dummy_data(self, path):
        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "yes", "no"]
        }

        metadata = {
            "v_gene": "TRBV1",
            "j_gene": "TRBJ1",
            "chain": Chain.BETA.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[["AAAA"], ["SSSS"], ["SSSS", "CCCC"]],
            path=path,
            labels=labels,
            seq_metadata=[[{**metadata, "count": 10}],
                          [{**metadata, "count": 10}],
                          [{**metadata, "count": 5}, {**metadata, "count": 5}]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

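        # Minimal VDJdb-format reference file (tab-separated); the CDR3s AAAA and SSSS match sequences in the repertoires above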
        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
100	TRB	AAAA	TRBV1	TRBJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
200	TRB	SSSS	TRBV1	TRBJ1	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0"""

        with open(path + "refs.tsv", "w") as file:
            file.writelines(file_content)

        reference_sequences = {
            "params": {
                "path": path + "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        return dataset, label_config, reference_sequences, labels
Example #7
    def test_run(self):
        path = EnvironmentSettings.tmp_test_path + "explanalysisprocintegration/"
        PathBuilder.build(path)
        os.environ["cache_type"] = "test"

        dataset = self.create_dataset(path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [0, 1])
        label_config.add_label("l2", [2, 3])

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
        100a	TRA	AAAC	TRAV12	TRAJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV
        """

        with open(path + "refs.tsv", "w") as file:
            file.writelines(file_content)

        refs = {
            "params": {
                "path": path + "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

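        # One analysis unit: match repertoire sequences to the reference with edit distance <= 1 and export the design matrix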
        units = {
            "named_analysis_4":
            ExploratoryAnalysisUnit(
                dataset=dataset,
                report=DesignMatrixExporter(),
                label_config=label_config,
                encoder=MatchedSequencesRepertoireEncoder.build_object(
                    dataset, **{
                        "max_edit_distance": 1,
                        "reference": refs
                    }))
        }

        process = ExploratoryAnalysisInstruction(units, name="exp")
        process.run(path + "results/")

        self.assertTrue(
            os.path.isfile(
                path +
                "results/exp/analysis_named_analysis_4/report/design_matrix.csv"
            ))

        shutil.rmtree(path)
Example #8
    def test_run(self):
        path = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1")],
            metadata={
                "l1": 1,
                "l2": 2
            },
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="2")],
            metadata={
                "l1": 0,
                "l2": 3
            },
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
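        # Learn a 6-dimensional Word2Vec embedding of 3-mers (sequence-level model)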
        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": ModelType.SEQUENCE.name,
                "vector_size": 6
            })

        res = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=encoder,
                              encoder_params=EncoderParams(
                                  model={},
                                  pool_size=2,
                                  label_config=lc,
                                  result_path=path,
                                  filename="dataset.csv"),
                              store_encoded_data=False))

        self.assertTrue(isinstance(res, RepertoireDataset))
        self.assertTrue(res.encoded_data.examples.shape[0] == 2)

        shutil.rmtree(path)
Example #9
    def test_encode(self):

        test_path = EnvironmentSettings.root_path + "test/tmp/w2v/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": "sequence",
                "vector_size": 16
            })

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

        self.assertIsNotNone(encoded_dataset.encoded_data)
        self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == 2)
        self.assertTrue(encoded_dataset.encoded_data.examples.shape[1] == 16)
        self.assertTrue(len(encoded_dataset.encoded_data.labels["T1D"]) == 2)
        self.assertTrue(encoded_dataset.encoded_data.labels["T1D"][0] == "T1D")
        self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

        shutil.rmtree(test_path)
Example #10
    def test_run(self):
        path = EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"]], path)[0])
        dataset.encoded_data = EncodedData(
            examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
            labels={"l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3], "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]}
        )

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 3])

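        # Fit logistic regression on the pre-encoded data, then assess it and write predictions and scores to CSV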
        method1 = LogisticRegression()
        method1.fit(dataset.encoded_data, label_name='l1')

        res = MLMethodAssessment.run(MLMethodAssessmentParams(
            dataset=dataset,
            method=method1,
            metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
            optimization_metric=Metric.LOG_LOSS,
            predictions_path=EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/predictions.csv",
            label="l1",
            ml_score_path=EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/ml_score.csv",
            split_index=1,
            path=EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/"
        ))

        self.assertTrue(isinstance(res, dict))
        self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

        self.assertTrue(os.path.isfile(EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/ml_score.csv"))

        df = pd.read_csv(EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/ml_score.csv")
        self.assertTrue(df.shape[0] == 1)

        df = pd.read_csv(EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/predictions.csv")
        self.assertEqual(12, df.shape[0])

        shutil.rmtree(EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/")
Example #11
    def test_run(self):

        path = EnvironmentSettings.root_path + "test/tmp/smmodel/"
        PathBuilder.build(path)
        repertoires, metadata = RepertoireBuilder.build([["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
                                                       ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"]], path,
                                                      {"default": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
                                                                   1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]})
        dataset = RepertoireDataset(repertoires=repertoires,
                                    params={"default": [1, 2]},
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("default", [1, 2])

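        # Single hyperparameter setting: 8-dimensional Word2Vec 3-mer encoding with logistic regression and no internal model selection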
        hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **{"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}),
                                 {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3},
                                 LogisticRegression(),
                                 {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

        split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
        split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

        instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                              split_config_assessment,
                                              split_config_selection,
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                              label_config, path)
        semantic_model = SemanticModel([instruction], path)

        semantic_model.run()

        shutil.rmtree(path)
Example #12
    def _construct_test_dataset(self, path, dataset_size: int = 50):
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAA"),
                         beta=ReceptorSequence(amino_acid_sequence="ATA"),
                         metadata={"l1": 1},
                         identifier=str("1")),
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="ATA"),
                         beta=ReceptorSequence(amino_acid_sequence="ATT"),
                         metadata={"l1": 2},
                         identifier=str("2"))
        ]

        PathBuilder.build(path)
        filename = "{}receptors.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = ReceptorDataset(params={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")
        return dataset, lc
Example #13
    def test_encode(self):
        path = EnvironmentSettings.root_path + "test/tmp/kmerfreqenc/"

        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1"),
                                                       ReceptorSequence("ATA", identifier="2"),
                                                       ReceptorSequence("ATA", identifier='3')],
                                                      metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)

        rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="1"),
                                                       ReceptorSequence("TAA", identifier="2"),
                                                       ReceptorSequence("AAC", identifier="3")],
                                                      metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

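        # Three encoders differing only in sequence encoding (identity vs. continuous 3-mers) and normalization (relative frequency vs. binary)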
        encoder = KmerFrequencyEncoder.build_object(dataset, **{
                "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                "reads": ReadsType.UNIQUE.name,
                "sequence_encoding": SequenceEncodingType.IDENTITY.name,
                "k": 3
            })

        d1 = encoder.encode(dataset, EncoderParams(
            result_path=path + "1/",
            label_config=lc,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        encoder = KmerFrequencyEncoder.build_object(dataset, **{
                "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                "reads": ReadsType.UNIQUE.name,
                "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                "k": 3
            })

        d2 = encoder.encode(dataset, EncoderParams(
            result_path=path + "2/",
            label_config=lc,
            pool_size=2,
            learn_model=True,
            model={},
            filename="dataset.csv"
        ))

        encoder3 = KmerFrequencyEncoder.build_object(dataset, **{
            "normalization_type": NormalizationType.BINARY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "k": 3
        })

        d3 = encoder3.encode(dataset, EncoderParams(
            result_path=path + "3/",
            label_config=lc,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        shutil.rmtree(path)

        self.assertTrue(isinstance(d1, RepertoireDataset))
        self.assertTrue(isinstance(d2, RepertoireDataset))
        self.assertEqual(0.67, np.round(d2.encoded_data.examples[0, 2], 2))
        self.assertEqual(0.0, np.round(d3.encoded_data.examples[0, 1], 2))
        self.assertTrue(isinstance(encoder, KmerFrequencyEncoder))
Example #14
    def test_encode(self):
        path = EnvironmentSettings.root_path + "test/tmp/evennessenc/"

        PathBuilder.build(path)

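        # rep1 mixes clone counts (1, 10, 100), so its evenness profile decays for alpha > 0; rep2 has equal counts and stays at 1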
        rep1 = Repertoire.build_from_sequence_objects(
            sequence_objects=[ReceptorSequence("AAA", metadata=SequenceMetadata(count=10)) for i in range(1000)] +
                             [ReceptorSequence("AAA", metadata=SequenceMetadata(count=100)) for i in range(1000)] +
                             [ReceptorSequence("AAA", metadata=SequenceMetadata(count=1)) for i in range(1000)],
            metadata={"l1": "test_1", "l2": 2},
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            sequence_objects=[ReceptorSequence("AAA", metadata=SequenceMetadata(count=10)) for i in range(1000)],
            metadata={"l1": "test_2", "l2": 3},
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", ["test_1", "test_2"])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        encoder = EvennessProfileEncoder.build_object(
            dataset, **{
                "min_alpha": 0,
                "max_alpha": 10,
                "dimension": 51
            })

        d1 = encoder.encode(
            dataset, EncoderParams(
                result_path=path + "1/",
                label_config=lc,
            ))

        encoder = EvennessProfileEncoder.build_object(
            dataset, **{
                "min_alpha": 0,
                "max_alpha": 10,
                "dimension": 11
            })

        d2 = encoder.encode(
            dataset,
            EncoderParams(result_path=path, label_config=lc, pool_size=2))

        self.assertAlmostEqual(d1.encoded_data.examples[0, 0], 1)
        self.assertAlmostEqual(d1.encoded_data.examples[0, 1], 0.786444)
        self.assertAlmostEqual(d1.encoded_data.examples[1, 0], 1)
        self.assertAlmostEqual(d1.encoded_data.examples[1, 1], 1)

        shutil.rmtree(path)
Example #15
    def create_dummy_data(self, path):

        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "no", "no"]
        }

        metadata_alpha = {
            "v_gene": "V1",
            "j_gene": "J1",
            "chain": Chain.LIGHT.value
        }
        metadata_beta = {
            "v_gene": "V1",
            "j_gene": "J1",
            "chain": Chain.HEAVY.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[[
                "XXAGQXGSSNTGKLIXX", "XXAGQXGSSNTGKLIYY", "XXSAGQGETQYXX"
            ], ["ASSXRXX"], ["XXIXXNDYKLSXX", "CCCC", "SSSS", "TTTT"]],
            path=path,
            labels=labels,
            seq_metadata=[[{**metadata_alpha, "count": 10, "v_gene": "IGLV35"},
                           {**metadata_alpha, "count": 10},
                           {**metadata_beta, "count": 10, "v_gene": "IGHV29-1"}],
                          [{**metadata_beta, "count": 10, "v_gene": "IGHV7-3"}],
                          [{**metadata_alpha, "count": 5, "v_gene": "IGLV26-2"},
                           {**metadata_alpha, "count": 2},
                           {**metadata_beta, "count": 1},
                           {**metadata_beta, "count": 2}]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

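        # Reference motifs as a tab-separated file: per-chain regular expressions with optional V gene restrictions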
        file_content = """id	IGLV	IGHV	IGL_regex	IGH_regex
1	IGLV35	IGHV29-1	AGQ.GSSNTGKLI	S[APGFTVML]GQGETQY
2		IGHV7-3		ASS.R.*
3	IGLV26-1		I..NDYKLS	
4	IGLV26-2		I..NDYKLS	
"""

        filepath = path + "reference_motifs.tsv"
        with open(filepath, "w") as file:
            file.writelines(file_content)

        return dataset, label_config, filepath, labels
Example #16
    def test(self):

        sequences = [
            ReceptorSequence(
                amino_acid_sequence="AAACCC",
                identifier="1",
                metadata=SequenceMetadata(custom_params={"l1": 1})),
            ReceptorSequence(
                amino_acid_sequence="ACACAC",
                identifier="2",
                metadata=SequenceMetadata(custom_params={"l1": 2})),
            ReceptorSequence(
                amino_acid_sequence="CCCAAA",
                identifier="3",
                metadata=SequenceMetadata(custom_params={"l1": 1})),
            ReceptorSequence(
                amino_acid_sequence="AAACCC",
                identifier="4",
                metadata=SequenceMetadata(custom_params={"l1": 2})),
            ReceptorSequence(
                amino_acid_sequence="ACACAC",
                identifier="5",
                metadata=SequenceMetadata(custom_params={"l1": 1})),
            ReceptorSequence(
                amino_acid_sequence="CCCAAA",
                identifier="6",
                metadata=SequenceMetadata(custom_params={"l1": 2})),
            ReceptorSequence(
                amino_acid_sequence="AAACCC",
                identifier="7",
                metadata=SequenceMetadata(custom_params={"l1": 1})),
            ReceptorSequence(
                amino_acid_sequence="ACACAC",
                identifier="8",
                metadata=SequenceMetadata(custom_params={"l1": 2})),
            ReceptorSequence(
                amino_acid_sequence="CCCAAA",
                identifier="9",
                metadata=SequenceMetadata(custom_params={"l1": 1}))
        ]

        path = EnvironmentSettings.tmp_test_path + "kmrefreqseqfacencoder/"
        PathBuilder.build(path)
        filename = "{}sequences.pkl".format(path)
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = SequenceDataset(params={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")

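        # Each sequence is encoded independently by its 3-mer relative frequencies, so identical sequences get identical feature vectors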
        encoder = KmerFreqSequenceEncoder.build_object(
            dataset, **{
                "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                "reads": ReadsType.UNIQUE.name,
                "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                "k": 3
            })

        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(result_path=path + "2/",
                          label_config=lc,
                          pool_size=2,
                          learn_model=True,
                          model={},
                          filename="dataset.csv"))

        self.assertEqual(9, encoded_dataset.encoded_data.examples.shape[0])
        self.assertTrue(
            all(identifier in encoded_dataset.encoded_data.example_ids for
                identifier in ['1', '2', '3', '4', '5', '6', '7', '8', '9']))
        self.assertTrue(
            numpy.array_equal(encoded_dataset.encoded_data.examples[0].A,
                              encoded_dataset.encoded_data.examples[3].A))

        shutil.rmtree(path)
Example #17
    def create_encoded_matchedregex(self, path):
        # Setting up dummy data
        labels = {
            "subject_id": ["subject_1", "subject_2", "subject_3"],
            "label": ["yes", "no", "no"]
        }

        metadata_alpha = {
            "v_gene": "V1",
            "j_gene": "J1",
            "chain": Chain.ALPHA.value
        }
        metadata_beta = {
            "v_gene": "V1",
            "j_gene": "J1",
            "chain": Chain.BETA.value
        }

        repertoires, metadata = RepertoireBuilder.build(
            sequences=[[
                "XXAGQXGSSNTGKLIXX", "XXAGQXGSSNTGKLIYY", "XXSAGQGETQYXX"
            ], ["ASSXRXX"], ["XXIXXNDYKLSXX", "CCCC", "SSSS", "TTTT"]],
            path=path,
            labels=labels,
            seq_metadata=[[{**metadata_alpha, "count": 10, "v_gene": "TRAV35"},
                           {**metadata_alpha, "count": 10},
                           {**metadata_beta, "count": 10, "v_gene": "TRBV29-1"}],
                          [{**metadata_beta, "count": 10, "v_gene": "TRBV7-3"}],
                          [{**metadata_alpha, "count": 5, "v_gene": "TRAV26-2"},
                           {**metadata_alpha, "count": 2},
                           {**metadata_beta, "count": 1},
                           {**metadata_beta, "count": 2}]],
            subject_ids=labels["subject_id"])

        dataset = RepertoireDataset(repertoires=repertoires,
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("subject_id", labels["subject_id"])
        label_config.add_label("label", labels["label"])

        file_content = """id	TRAV	TRBV	TRA_regex	TRB_regex
1	TRAV35	TRBV29-1	AGQ.GSSNTGKLI	S[APGFTVML]GQGETQY
2		TRBV7-3		ASS.R.*
3	TRAV26-1		I..NDYKLS	
4	TRAV26-2		I..NDYKLS	
        """

        filepath = path + "reference_motifs.tsv"
        with open(filepath, "w") as file:
            file.writelines(file_content)

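        # Match sequences against each reference regex, ignoring V genes; sum_counts=True presumably aggregates clone counts of the matches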
        encoder = MatchedRegexEncoder.build_object(
            dataset, **{
                "motif_filepath": filepath,
                "match_v_genes": False,
                "sum_counts": True
            })

        encoded = encoder.encode(
            dataset,
            EncoderParams(result_path=path,
                          label_config=label_config,
                          filename="dataset.csv"))

        return encoded