Esempio n. 1
0
    def test_process(self):
        """Check SubjectRepertoireCollector.process on repertoires sharing a subject id.

        Three single-sequence repertoires are built, two of them with
        ``subject_id`` ``patient1``. The processed dataset is expected to hold
        two repertoires with 2 and 1 sequences (in that order), while the input
        dataset must still report three repertoires.
        """
        path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
        PathBuilder.build(path)
        # Registered right after creation so the scratch directory is removed
        # even when an assertion below fails (a trailing rmtree would be skipped).
        self.addCleanup(shutil.rmtree, path)

        fixtures = [("AAA", "1", "patient1"),
                    ("AAC", "2", "patient1"),
                    ("AAC", "3", "patient3")]
        reps = [
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence(sequence, identifier=identifier)],
                path=path,
                metadata={"subject_id": subject_id})
            for sequence, identifier, subject_id in fixtures
        ]

        dataset = RepertoireDataset(repertoires=reps)

        dataset2 = SubjectRepertoireCollector.process(
            dataset, {"result_path": path / "result"})

        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual(3, len(dataset.get_data()))

        expected_sequence_counts = [2, 1]
        for index, rep in enumerate(dataset2.get_data()):
            self.assertEqual(expected_sequence_counts[index], len(rep.sequences))
Esempio n. 2
0
    def _construct_test_repertoiredataset(self, path, positional):
        """Build a two-repertoire dataset and a label configuration for tests.

        Args:
            path: directory handed to ``Repertoire.build_from_sequence_objects``.
            positional: when truthy, long homopolymer sequences are used;
                otherwise short mixed sequences are used.

        Returns:
            Tuple ``(dataset, lc)``: the ``RepertoireDataset`` holding both
            repertoires and a ``LabelConfiguration`` declaring ``l1`` and ``l2``.
        """
        receptors1 = ReceptorSequenceList()
        receptors2 = ReceptorSequenceList()

        if positional:
            sequences1 = [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                          ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]
            sequences2 = [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]
        else:
            sequences1 = [ReceptorSequence("AAAA", identifier="1"),
                          ReceptorSequence("ATA", identifier="2"),
                          ReceptorSequence("ATA", identifier='3')]
            sequences2 = [ReceptorSequence("ATA", identifier="1"),
                          ReceptorSequence("TAA", identifier="2")]

        # Plain loops instead of list comprehensions: the appends are side
        # effects, so building a throwaway list of ``None`` values is misleading.
        for seq in sequences1:
            receptors1.append(seq)
        for seq in sequences2:
            receptors2.append(seq)

        rep1 = Repertoire.build_from_sequence_objects(
            receptors1, metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            receptors2, metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        return dataset, lc
    def test_get_normalized_sequence_lengths(self):
        """Check that SequenceLengthDistribution.generate_report produces a figure file.

        Two repertoires of four sequences each are built, a report is generated
        for the resulting dataset, and the path of the first output figure is
        expected to exist on disk.
        """
        path = EnvironmentSettings.root_path / "test/tmp/datareports/"
        PathBuilder.build(path)
        # Ensures the scratch directory is removed even if the assertion fails.
        self.addCleanup(shutil.rmtree, path)

        rep1 = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAA", identifier="1"),
                ReceptorSequence(amino_acid_sequence="AAAA", identifier="2"),
                ReceptorSequence(amino_acid_sequence="AAAAA", identifier="3"),
                ReceptorSequence(amino_acid_sequence="AAA", identifier="4"),
            ],
            path=path,
            metadata={})
        rep2 = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAA", identifier="5"),
                ReceptorSequence(amino_acid_sequence="AAAA", identifier="6"),
                ReceptorSequence(amino_acid_sequence="AAAA", identifier="7"),
                ReceptorSequence(amino_acid_sequence="AAA", identifier="8"),
            ],
            path=path,
            metadata={})

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        sld = SequenceLengthDistribution(dataset, 1, path)

        result = sld.generate_report()
        self.assertTrue(os.path.isfile(result.output_figures[0].path))
    def test_create_model(self):
        """Check KmerPairModelCreator.create_model on a tiny two-repertoire dataset.

        The returned model is expected to be a ``Word2Vec`` instance whose
        vocabulary contains the 2-mer ``CA`` and has 400 entries in total.
        """
        test_path = EnvironmentSettings.root_path / "test/tmp/w2v_test_tmp/"

        PathBuilder.build(test_path)
        # Ensures the scratch directory is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, test_path)

        sequence1 = ReceptorSequence("CASSVFA")
        sequence2 = ReceptorSequence("CASSCCC")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        model_creator = KmerPairModelCreator()
        model = model_creator.create_model(dataset=dataset,
                                           k=2,
                                           vector_size=16,
                                           batch_size=1,
                                           model_path=test_path / "model.model")

        # Dedicated assertions report the offending value on failure, unlike
        # assertTrue(<bool expression>), which only says "False is not true".
        self.assertIsInstance(model, Word2Vec)
        self.assertIn("CA", model.wv.vocab)
        self.assertEqual(400, len(model.wv.vocab))
Esempio n. 5
0
    def create_dummy_repertoire(self, path):
        """Create a two-sequence repertoire and a one-row ``metadata.csv`` in ``path``.

        Args:
            path: directory used for the repertoire files and ``metadata.csv``.

        Returns:
            Tuple ``(repertoire, metadata_path)`` where ``metadata_path`` is
            ``path / "metadata.csv"``.
        """
        # First sequence: beta chain fixture.
        beta_metadata = SequenceMetadata(v_gene="TRBV1",
                                         j_gene="TRBJ1",
                                         chain=Chain.BETA,
                                         count=5,
                                         region_type="IMGT_CDR3",
                                         frame_type="IN",
                                         custom_params={"d_call": "TRBD1",
                                                        "custom_test": "cust1"})
        beta_sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                         nucleotide_sequence="GCTGCTGCT",
                                         identifier="receptor_1",
                                         metadata=beta_metadata)

        # Second sequence: alpha chain fixture with an explicit allele and no frame type.
        alpha_metadata = SequenceMetadata(v_gene="TRAV2", v_allele="TRAV2*01",
                                          j_gene="TRAJ2",
                                          chain=Chain.ALPHA,
                                          count=15,
                                          frame_type=None,
                                          region_type="IMGT_CDR3",
                                          custom_params={"d_call": "TRAD2",
                                                         "custom_test": "cust2"})
        alpha_sequence = ReceptorSequence(amino_acid_sequence="GGG",
                                          nucleotide_sequence="GGTGGTGGT",
                                          identifier="receptor_2",
                                          metadata=alpha_metadata)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[beta_sequence, alpha_sequence],
            path=path,
            metadata={"subject_id": "REP1"})

        metadata_path = path / "metadata.csv"
        metadata_frame = pd.DataFrame({
            "filename": [f"{repertoire.identifier}_data.npy"],
            "subject_id": ["1"],
            "repertoire_identifier": [repertoire.identifier],
        })
        metadata_frame.to_csv(metadata_path, index=False)

        return repertoire, metadata_path
Esempio n. 6
0
    def test_run(self):
        """Check DataEncoder.run with a Word2VecEncoder on two single-sequence repertoires.

        The result is expected to be a ``RepertoireDataset`` whose encoded
        examples have two rows.
        """
        path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
        PathBuilder.build(path)
        # Ensures the scratch directory is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1")],
            metadata={"l1": 1, "l2": 2},
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="2")],
            metadata={"l1": 0, "l2": 3},
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(
            dataset,
            k=3,
            model_type=ModelType.SEQUENCE.name,
            vector_size=6)

        encoder_params = EncoderParams(model={},
                                       pool_size=2,
                                       label_config=lc,
                                       result_path=path,
                                       filename="dataset.csv")
        res = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=encoder,
                              encoder_params=encoder_params,
                              store_encoded_data=False))

        # Dedicated assertions show the actual value when the check fails.
        self.assertIsInstance(res, RepertoireDataset)
        self.assertEqual(2, res.encoded_data.examples.shape[0])
    def test_process(self):
        """Check ChainRepertoireFilter.process with ``keep_chain`` set to ``ALPHA``.

        Two single-sequence repertoires (chains ``A`` and ``B``) are built.
        The filtered dataset is expected to keep one repertoire (sequence
        ``AAA``, metadata ``CD`` equal to 1) and leave the input dataset with
        two repertoires. Filtering with ``keep_chain`` ``GAMMA`` is expected to
        raise ``AssertionError``.
        """
        path = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
        PathBuilder.build(path)
        # Ensures the scratch directory is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
            path=path,
            metadata={})
        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
            path=path,
            metadata={})

        metadata = pd.DataFrame({"CD": [1, 0]})
        metadata.to_csv(path / "metadata.csv")

        dataset = RepertoireDataset(repertoires=[rep1, rep2],
                                    metadata_file=path / "metadata.csv")

        dataset2 = ChainRepertoireFilter.process(
            dataset, {"keep_chain": "ALPHA", "result_path": path / "results"})

        self.assertEqual(1, len(dataset2.get_data()))
        self.assertEqual(2, len(dataset.get_data()))

        metadata_dict = dataset2.get_metadata(["CD"])
        self.assertEqual(1, len(metadata_dict["CD"]))
        self.assertEqual(1, metadata_dict["CD"][0])

        for rep in dataset2.get_data():
            self.assertEqual("AAA", rep.sequences[0].get_sequence())

        self.assertRaises(AssertionError, ChainRepertoireFilter.process,
                          dataset,
                          {"keep_chain": "GAMMA", "result_path": path / "results"})
Esempio n. 8
0
    def test_encode(self):
        """Check Word2VecEncoder encoding of a two-repertoire dataset.

        The encoded data is expected to have shape ``(2, 16)``, two ``T1D``
        labels with the first equal to ``"T1D"``, and the built encoder is
        expected to be a ``W2VRepertoireEncoder``.
        """
        test_path = EnvironmentSettings.root_path / "test/tmp/w2v/"

        PathBuilder.build(test_path)
        # Ensures the scratch directory is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(
            dataset, k=3, model_type="sequence", vector_size=16)

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)
        encoded_data = encoded_dataset.encoded_data

        # Dedicated assertions show the actual value when the check fails.
        self.assertIsNotNone(encoded_data)
        self.assertEqual(2, encoded_data.examples.shape[0])
        self.assertEqual(16, encoded_data.examples.shape[1])
        self.assertEqual(2, len(encoded_data.labels["T1D"]))
        self.assertEqual("T1D", encoded_data.labels["T1D"][0])
        self.assertIsInstance(encoder, W2VRepertoireEncoder)
Esempio n. 9
0
    def test_match_repertoire(self):
        """Check SequenceMatcher.match_repertoire for COUNT and CLONAL_PERCENTAGE summaries.

        A four-sequence repertoire is matched against two reference sequences
        with the value ``2`` passed as the fourth argument. The COUNT result is
        expected to expose the keys ``sequences``, ``repertoire`` and
        ``repertoire_index``, per-sequence match counts of 1, 0, 1, 1 and the
        repertoire metadata. The CLONAL_PERCENTAGE result is expected to report
        ``0.8``.
        """
        path = EnvironmentSettings.root_path / "test/tmp/seqmatchrep/"
        PathBuilder.build(path)
        # Ensures the scratch directory is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAAAAA",
                                 identifier="1",
                                 metadata=SequenceMetadata(chain="A", count=3)),
                ReceptorSequence(amino_acid_sequence="CCCCCC",
                                 identifier="2",
                                 metadata=SequenceMetadata(chain="A", count=2)),
                ReceptorSequence(amino_acid_sequence="AAAACC",
                                 identifier="3",
                                 metadata=SequenceMetadata(chain="A", count=1)),
                ReceptorSequence(amino_acid_sequence="TADQVF",
                                 identifier="4",
                                 metadata=SequenceMetadata(chain="A", count=4)),
            ],
            metadata={"CD": True},
            path=path)

        sequences = [
            ReceptorSequence("AAAACA", metadata=SequenceMetadata(chain="A")),
            ReceptorSequence("TADQV", metadata=SequenceMetadata(chain="A")),
        ]

        matcher = SequenceMatcher()
        result = matcher.match_repertoire(repertoire, 0, sequences, 2,
                                          SequenceMatchingSummaryType.COUNT)

        # assertIn names the missing key on failure, unlike assertTrue(k in d).
        for key in ("sequences", "repertoire", "repertoire_index"):
            self.assertIn(key, result)

        self.assertEqual(4, len(result["sequences"]))
        expected_match_counts = [1, 0, 1, 1]
        for index, expected_count in enumerate(expected_match_counts):
            self.assertEqual(
                expected_count,
                len(result["sequences"][index]["matching_sequences"]))

        matched = [r for r in result["sequences"]
                   if len(r["matching_sequences"]) > 0]
        self.assertEqual(3, len(matched))
        self.assertTrue(result["metadata"]["CD"])

        result = matcher.match_repertoire(
            repertoire, 0, sequences, 2,
            SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
        self.assertEqual(0.8, result["clonal_percentage"])
Esempio n. 10
0
    def test_match(self):
        """Check SequenceMatcher.match on a dataset holding a single repertoire.

        The result is expected to contain a ``repertoires`` list with one entry
        that carries the repertoire metadata and whose fourth sequence has
        exactly one matching reference sequence.
        """
        path = EnvironmentSettings.root_path / "test/tmp/seqmatch/"
        PathBuilder.build(path)
        # Ensures the scratch directory is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        repertoire = Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence="AAAAAA",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="3"),
                ReceptorSequence(amino_acid_sequence="CCCCCC",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="4"),
                ReceptorSequence(amino_acid_sequence="AAAACC",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J2"),
                                 identifier="5"),
                ReceptorSequence(amino_acid_sequence="TADQVF",
                                 metadata=SequenceMetadata(chain="A",
                                                           v_gene="V1",
                                                           j_gene="J3"),
                                 identifier="6"),
            ],
            metadata={"CD": True},
            path=path)

        dataset = RepertoireDataset(repertoires=[repertoire])
        sequences = [
            ReceptorSequence("AAAACA",
                             metadata=SequenceMetadata(chain="A",
                                                       v_gene="V1",
                                                       j_gene="J2"),
                             identifier="1"),
            ReceptorSequence("TADQV",
                             metadata=SequenceMetadata(chain="A",
                                                       v_gene="V1",
                                                       j_gene="J3"),
                             identifier="2"),
        ]

        matcher = SequenceMatcher()
        result = matcher.match(dataset, sequences, 2,
                               SequenceMatchingSummaryType.PERCENTAGE)

        # assertIn names the missing key on failure, unlike assertTrue(k in d).
        self.assertIn("repertoires", result)
        first_repertoire = result["repertoires"][0]
        self.assertEqual(
            1, len(first_repertoire["sequences"][3]["matching_sequences"]))
        self.assertTrue(first_repertoire["metadata"]["CD"])
        self.assertEqual(1, len(result["repertoires"]))
    def _build_new_repertoire(self, sequences, repertoire_metadata, signal, path: Path) -> Repertoire:
        """Build a repertoire from ``sequences`` whose metadata flags ``signal`` as present.

        Args:
            sequences: sequence objects forwarded to the repertoire builder.
            repertoire_metadata: metadata of the source repertoire, or ``None``;
                it is deep-copied, so the caller's object is not modified.
            signal: object whose ``id`` is used as the metadata key.
            path: location forwarded to the repertoire builder.

        Returns:
            The repertoire created by ``Repertoire.build_from_sequence_objects``.
        """
        metadata = {} if repertoire_metadata is None else copy.deepcopy(repertoire_metadata)

        # Only the signal id is recorded at repertoire level; the specific motif
        # and motif instance are kept on each receptor sequence instead.
        metadata[signal.id] = True

        return Repertoire.build_from_sequence_objects(sequences, path, metadata)
Esempio n. 12
0
    def test_create_sentences_from_repertoire(self):
        """Check KmerHelper.create_sentences_from_repertoire for k=3 amino-acid k-mers.

        A repertoire of three 4-letter sequences is expected to yield three
        sentences, the first consisting of the two k-mers ``AAC`` and ``ACT``.
        """
        path = EnvironmentSettings.tmp_test_path / "kmer/"
        PathBuilder.build(path)
        # Ensures the scratch directory is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        rep = Repertoire.build_from_sequence_objects(
            [ReceptorSequence(amino_acid_sequence="AACT"),
             ReceptorSequence(amino_acid_sequence="ACCT"),
             ReceptorSequence(amino_acid_sequence="AACT")],
            path, {})

        sentences = KmerHelper.create_sentences_from_repertoire(rep, 3, sequence_type=SequenceType.AMINO_ACID)

        self.assertEqual(3, len(sentences))
        # One assertion per condition so a failure pinpoints which one broke.
        first_sentence = sentences[0]
        self.assertEqual(2, len(first_sentence))
        self.assertIn("AAC", first_sentence)
        self.assertIn("ACT", first_sentence)
Esempio n. 13
0
    def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
        """Create one repertoire per entry of ``sequences`` plus a ``metadata.csv`` file.

        Args:
            sequences: list of lists of amino acid sequence strings, one inner
                list per repertoire.
            path: output directory; repertoires are stored under
                ``path / "repertoires"`` and the metadata file at
                ``path / "metadata.csv"``.
            labels: optional mapping from label name to a per-repertoire list of
                values; added to each repertoire's metadata and to the CSV.
            seq_metadata: optional per-repertoire lists of keyword dicts used to
                construct each ``SequenceMetadata``; when omitted, a fixed
                TRB default is used for every sequence.
            subject_ids: optional subject id per repertoire; when omitted, ids
                of the form ``rep_<index>`` are generated.

        Returns:
            Tuple ``(repertoires, metadata_path)``.

        Raises:
            AssertionError: if the lengths of ``subject_ids`` or ``seq_metadata``
                do not line up with ``sequences``.
        """
        if subject_ids is not None:
            assert len(subject_ids) == len(sequences)

        if seq_metadata is not None:
            assert len(sequences) == len(seq_metadata)
            for rep_position, rep_sequence_list in enumerate(sequences):
                assert len(rep_sequence_list) == len(seq_metadata[rep_position])

        PathBuilder.build(path)
        rep_path = PathBuilder.build(path / "repertoires")

        if subject_ids is None:
            subject_ids = []
        total_repertoires = len(sequences)

        repertoires = []
        for rep_index, sequence_list in enumerate(sequences):
            # Fill in a generated subject id while fewer ids than repertoires exist.
            if len(subject_ids) < total_repertoires:
                subject_ids.append("rep_" + str(rep_index))

            rep_sequences = ReceptorSequenceList()
            for seq_index, sequence in enumerate(sequence_list):
                if seq_metadata is not None:
                    sequence_metadata = SequenceMetadata(**seq_metadata[rep_index][seq_index])
                else:
                    sequence_metadata = SequenceMetadata(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01", j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01", count=1, chain="TRB", region_type="IMGT_CDR3")

                rep_sequences.append(ReceptorSequence(amino_acid_sequence=sequence,
                                                      metadata=sequence_metadata,
                                                      identifier=str(seq_index)))

            metadata = {}
            if labels is not None:
                for key in labels.keys():
                    metadata[key] = labels[key][rep_index]
            # The subject id is written last, so it overrides a label of the same name.
            metadata["subject_id"] = subject_ids[rep_index]

            repertoires.append(
                Repertoire.build_from_sequence_objects(rep_sequences, rep_path, metadata, filename_base=f"rep_{rep_index}"))

        columns = {"filename": [repertoire.data_filename for repertoire in repertoires],
                   "subject_id": subject_ids,
                   "repertoire_identifier": [repertoire.identifier for repertoire in repertoires]}
        if labels is not None:
            columns.update(labels)

        metadata_path = path / "metadata.csv"
        pd.DataFrame(columns).to_csv(metadata_path, index=False)

        return repertoires, metadata_path
Esempio n. 14
0
    def _process_repertoire(index, repertoire, current_implanting,
                            simulation_state) -> Repertoire:
        """Return the repertoire produced for ``repertoire`` in the current simulation.

        With an implanting given, the work is delegated to
        ``SignalImplanter._implant_in_repertoire``. Otherwise the repertoire is
        rebuilt under ``simulation_state.result_path / "repertoires"`` and every
        signal in ``simulation_state.signals`` is marked ``False`` in its
        metadata under the key ``signal_<id>``.
        """
        if current_implanting is None:
            untouched = Repertoire.build_from_sequence_objects(
                repertoire.sequences,
                simulation_state.result_path / "repertoires",
                repertoire.metadata)

            # NOTE(review): the flags are set on the metadata object after the
            # build call; whether they are also persisted depends on the
            # builder, which is not visible here.
            for signal in simulation_state.signals:
                untouched.metadata[f"signal_{signal.id}"] = False

            return untouched

        return SignalImplanter._implant_in_repertoire(
            index, repertoire, current_implanting, simulation_state)
Esempio n. 15
0
    def implant_in_repertoire(self, repertoire: Repertoire,
                              repertoire_implanting_rate: float, signal,
                              path: Path):
        """Build a repertoire of newly created sequences carrying ``signal``.

        Args:
            repertoire: source repertoire; its ``sequences``, ``identifier`` and
                ``metadata`` are read.
            repertoire_implanting_rate: fraction of the sequence count used to
                derive how many new sequences to create (rounded up).
            signal: object providing ``motifs`` and ``id``.
            path: location forwarded to the repertoire builder.

        Returns:
            The repertoire built from the new sequences, with a deep copy of the
            source metadata extended by ``signal_<id>`` set to ``True``.

        Raises:
            AssertionError: if any motif seed contains ``/`` or if the computed
                number of new sequences is not positive.
        """
        has_gapped_motif = any("/" in motif.seed for motif in signal.motifs)
        assert not has_gapped_motif, \
            f'FullSequenceImplanting: motifs cannot include gaps. Check motifs {[motif.identifier for motif in signal.motifs]}.'

        sequences = repertoire.sequences
        new_sequence_count = math.ceil(repertoire_implanting_rate * len(sequences))
        assert new_sequence_count > 0, (
            f"FullSequenceImplanting: there are too few sequences ({len(sequences)}) in the repertoire with identifier {repertoire.identifier} "
            f"to have the given repertoire implanting rate ({repertoire_implanting_rate}). Please consider increasing the repertoire implanting rate."
        )

        new_sequences = self._create_new_sequences(sequences, new_sequence_count, signal)

        # Deep copy so the source repertoire's metadata is left untouched.
        implanted_metadata = copy.deepcopy(repertoire.metadata)
        implanted_metadata[f"signal_{signal.id}"] = True

        return Repertoire.build_from_sequence_objects(new_sequences, path, implanted_metadata)
Esempio n. 16
0
    def _process_repertoire(index,
                            repertoire,
                            current_implanting,
                            simulation_state,
                            output_path: Path = None) -> Repertoire:
        """Return the repertoire produced for ``repertoire`` in the current simulation.

        With an implanting given, the work is delegated to
        ``SignalImplanter._implant_in_repertoire``. Otherwise the repertoire is
        rebuilt under ``simulation_state.result_path / "repertoires"`` with a
        copy of its metadata in which every signal id in
        ``simulation_state.signals`` maps to ``False``.

        ``output_path`` is accepted but not used by this implementation.
        """
        if current_implanting is not None:
            return SignalImplanter._implant_in_repertoire(
                index, repertoire, current_implanting, simulation_state)

        # Start from a shallow copy so the source repertoire's metadata dict is
        # not modified, then flag every known signal as absent.
        new_metadata = {**repertoire.metadata}
        for signal in simulation_state.signals:
            new_metadata[f"{signal.id}"] = False

        return Repertoire.build_from_sequence_objects(
            repertoire.sequences,
            simulation_state.result_path / "repertoires",
            metadata=new_metadata)
    def test_encode(self):
        """Check KmerFrequencyEncoder with three parameter combinations.

        The same two-repertoire dataset is encoded with (1) relative frequency
        and IDENTITY encoding, (2) relative frequency and CONTINUOUS_KMER
        encoding, and (3) binary normalization and CONTINUOUS_KMER encoding.
        The first two results are expected to be ``RepertoireDataset``
        instances, and specific cells of the second and third encodings are
        expected to round to ``0.67`` and ``0.0``.
        """
        path = EnvironmentSettings.root_path / "test/tmp/kmerfreqenc/"

        PathBuilder.build(path)
        # Ensures the scratch directory is removed even when building or
        # encoding raises; an inline rmtree after encoding would be skipped.
        self.addCleanup(shutil.rmtree, path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1"),
             ReceptorSequence("ATA", identifier="2"),
             ReceptorSequence("ATA", identifier='3')],
            metadata={"l1": 1, "l2": 2, "subject_id": "1"},
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="1"),
             ReceptorSequence("TAA", identifier="2"),
             ReceptorSequence("AAC", identifier="3")],
            metadata={"l1": 0, "l2": 3, "subject_id": "2"},
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        encoder = KmerFrequencyEncoder.build_object(
            dataset,
            normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=SequenceEncodingType.IDENTITY.name,
            k=3)

        d1 = encoder.encode(
            dataset,
            EncoderParams(result_path=path / "1/",
                          label_config=lc,
                          learn_model=True,
                          model={},
                          filename="dataset.pkl"))

        encoder = KmerFrequencyEncoder.build_object(
            dataset,
            normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
            k=3)

        d2 = encoder.encode(
            dataset,
            EncoderParams(result_path=path / "2/",
                          label_config=lc,
                          pool_size=2,
                          learn_model=True,
                          model={},
                          filename="dataset.csv"))

        encoder3 = KmerFrequencyEncoder.build_object(
            dataset,
            normalization_type=NormalizationType.BINARY.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
            k=3)

        d3 = encoder3.encode(
            dataset,
            EncoderParams(result_path=path / "3/",
                          label_config=lc,
                          learn_model=True,
                          model={},
                          filename="dataset.pkl"))

        # Dedicated assertions show the actual type when the check fails.
        self.assertIsInstance(d1, RepertoireDataset)
        self.assertIsInstance(d2, RepertoireDataset)
        self.assertEqual(0.67, np.round(d2.encoded_data.examples[0, 2], 2))
        self.assertEqual(0.0, np.round(d3.encoded_data.examples[0, 1], 2))
        self.assertIsInstance(encoder, KmerFrequencyEncoder)
Esempio n. 18
0
 def _store_repertoire(self, repertoire, sequences):
     """Build a repertoire from ``sequences`` under ``self.result_path``.

     The metadata of the given ``repertoire`` is passed on to the new one.
     """
     return Repertoire.build_from_sequence_objects(sequence_objects=sequences,
                                                   path=self.result_path,
                                                   metadata=repertoire.metadata)
Esempio n. 19
0
    def test_run(self):
        """Check SignalImplanter.run on ten identical repertoires with two implantings.

        Both implantings use a dataset implanting rate of 0.2; the first
        implants signals ``s1`` and ``s2``, the second only ``s2``. The
        resulting dataset is expected to keep ten examples, to carry both
        signal ids in every repertoire's metadata, to flag ``s2`` as ``True``
        in four repertoires and ``s1`` in two, and to list every repertoire's
        data file among the dataset filenames.
        """
        path = EnvironmentSettings.tmp_test_path / "signalImplanter/"

        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(path, exist_ok=True)
        # Ensures the scratch directory is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, path)

        sequences = [
            ReceptorSequence("ACDEFG", identifier="1"),
            ReceptorSequence("ACDEFG", identifier="2"),
            ReceptorSequence("ACDEFG", identifier="3"),
            ReceptorSequence("ACDEFG", identifier="4")
        ]

        r = [
            Repertoire.build_from_sequence_objects(
                sequence_objects=sequences, path=path, metadata={})
            for _ in range(10)
        ]

        dataset = RepertoireDataset(repertoires=r)

        m1 = Motif(identifier="m1",
                   instantiation=GappedKmerInstantiation(),
                   seed="CAS")
        m2 = Motif(identifier="m2",
                   instantiation=GappedKmerInstantiation(),
                   seed="CCC")
        s1 = Signal(identifier="s1",
                    motifs=[m1],
                    implanting_strategy=HealthySequenceImplanting(
                        GappedMotifImplanting(),
                        implanting_computation=ImplantingComputation.ROUND))
        s2 = Signal(identifier="s2",
                    motifs=[m1, m2],
                    implanting_strategy=HealthySequenceImplanting(
                        GappedMotifImplanting(),
                        implanting_computation=ImplantingComputation.ROUND))

        simulation = Simulation([
            Implanting(dataset_implanting_rate=0.2,
                       repertoire_implanting_rate=0.5,
                       signals=[s1, s2],
                       name="i1"),
            Implanting(dataset_implanting_rate=0.2,
                       repertoire_implanting_rate=0.5,
                       signals=[s2],
                       name="i2")
        ])

        input_params = SimulationState(dataset=dataset,
                                       result_path=path,
                                       simulation=simulation,
                                       signals=[s1, s2],
                                       formats=["ImmuneML"])

        new_dataset = SignalImplanter.run(input_params)
        reps_with_s2 = sum(
            rep.metadata[s2.id] is True
            for rep in new_dataset.get_data(batch_size=10))
        reps_with_s1 = sum(
            rep.metadata[s1.id] is True
            for rep in new_dataset.get_data(batch_size=10))

        self.assertEqual(10, len(new_dataset.get_example_ids()))
        for signal in (s1, s2):
            self.assertTrue(
                all(signal.id in rep.metadata.keys()
                    for rep in new_dataset.get_data(batch_size=10)))

        # assertEqual reports the actual counts on failure.
        self.assertEqual(4, reps_with_s2)
        self.assertEqual(2, reps_with_s1)

        metadata_filenames = [
            filename.name for filename in new_dataset.get_filenames()
        ]
        self.assertTrue(
            all(repertoire.data_filename.name in metadata_filenames
                for repertoire in new_dataset.repertoires))
 def store_repertoire(path, repertoire, sequences):
     """Build a new repertoire from ``sequences`` carrying over ``repertoire``'s metadata.

     Args:
         path: forwarded as the second argument of
             ``Repertoire.build_from_sequence_objects``.
         repertoire: existing repertoire whose ``metadata`` is reused.
         sequences: sequence objects the new repertoire is built from.

     Returns:
         Whatever ``Repertoire.build_from_sequence_objects`` returns for the
         given sequences, path and metadata.
     """
     carried_metadata = repertoire.metadata
     return Repertoire.build_from_sequence_objects(sequences, path, carried_metadata)
Esempio n. 21
0
    def test_find_label_associated_sequence_p_values(self):
        """Check the p-values computed for label ``l1`` over hand-built comparison batches.

        Four repertoires are created (``l1`` is True for the first two and
        False for the last two). Five batches holding nine items in total are
        attached to a ``ComparisonData`` object, and the p-values returned by
        ``SequenceFilterHelper.find_label_associated_sequence_p_values`` are
        compared against precomputed expected values.

        The temporary directory is removed even when an assertion fails, so a
        failing run does not leave files behind for later runs.
        """
        path = EnvironmentSettings.tmp_test_path / "comparison_data_find_label_assocseqpvalues/"
        PathBuilder.build(path)

        try:
            repertoires = [
                Repertoire.build_from_sequence_objects(
                    [ReceptorSequence()], path,
                    {"l1": val, "subject_id": subject_id})
                for val, subject_id in zip([True, True, False, False],
                                           ["rep_0", "rep_1", "rep_2", "rep_3"])
            ]

            # maps each repertoire identifier to its position in `repertoires`
            col_name_index = {
                repertoire.identifier: index
                for index, repertoire in enumerate(repertoires)
            }

            comparison_data = ComparisonData(
                repertoire_ids=[repertoire.identifier for repertoire in repertoires],
                comparison_attributes=["sequence_aas"],
                sequence_batch_size=4,
                path=path)

            # (matrix, items) per batch; the batch identifier is its position in this list.
            # presumably one matrix row per item and one column per repertoire - confirm
            batch_specs = [
                ([[1., 0., 0., 0.], [1., 1., 0., 0.]], [('GGG', ), ('III', )]),
                ([[1., 1., 0., 1.], [1., 1., 1., 1.]], [('LLL', ), ('MMM', )]),
                ([[0., 1., 0., 0.], [0., 1., 0., 1.]], [('DDD', ), ('EEE', )]),
                ([[0., 1., 1., 1.], [0., 0., 1., 1.]], [('FFF', ), ('CCC', )]),
                ([[0., 0., 0., 1.]], [('AAA', )]),
            ]
            comparison_data.batches = [
                ComparisonDataBatch(matrix=np.array(matrix),
                                    items=items,
                                    repertoire_index_mapping=col_name_index,
                                    path=path,
                                    identifier=identifier)
                for identifier, (matrix, items) in enumerate(batch_specs)
            ]

            p_values = SequenceFilterHelper.find_label_associated_sequence_p_values(
                comparison_data, repertoires,
                Label('l1', [True, False], positive_class=True))

            # NOTE(review): the trailing literal 2 presumably equals
            # SequenceFilterHelper.INVALID_P_VALUE - confirm before replacing it
            expected_p_values = [
                SequenceFilterHelper.INVALID_P_VALUE, 0.1666666666666667,
                0.5000000000000001, 1., SequenceFilterHelper.INVALID_P_VALUE,
                0.8333333333333331, 1., 1., 2,
            ]

            # the failure message carries the actual values instead of printing them on every run
            self.assertTrue(
                np.allclose(expected_p_values, p_values, equal_nan=True),
                msg="unexpected p-values: {}".format(p_values))
        finally:
            shutil.rmtree(path)
Esempio n. 22
0
    def test_encode(self):
        """Encode two repertoires with ``EvennessProfileEncoder`` and check the first profile values.

        ``rep1`` holds 3000 "AAA" sequences (1000 each with count 10, 100
        and 1); ``rep2`` holds 1000 "AAA" sequences, all with count 10. The
        dataset is encoded twice: with ``dimension=51`` (asserted on) and with
        ``dimension=11`` and ``pool_size=2`` (only executed).

        The temporary directory is removed even when an assertion fails, so a
        failing run does not leave files behind for later runs.
        """
        path = EnvironmentSettings.root_path / "test/tmp/evennessenc/"

        PathBuilder.build(path)

        def sequences_with_count(count):
            # 1000 separate sequence objects sharing the same sequence and count
            return [
                ReceptorSequence("AAA", metadata=SequenceMetadata(count=count))
                for _ in range(1000)
            ]

        try:
            rep1 = Repertoire.build_from_sequence_objects(
                sequence_objects=(sequences_with_count(10)
                                  + sequences_with_count(100)
                                  + sequences_with_count(1)),
                metadata={"l1": "test_1", "l2": 2},
                path=path)

            rep2 = Repertoire.build_from_sequence_objects(
                sequence_objects=sequences_with_count(10),
                metadata={"l1": "test_2", "l2": 3},
                path=path)

            lc = LabelConfiguration()
            lc.add_label("l1", ["test_1", "test_2"])
            lc.add_label("l2", [0, 3])

            dataset = RepertoireDataset(repertoires=[rep1, rep2])

            encoder = EvennessProfileEncoder.build_object(
                dataset, min_alpha=0, max_alpha=10, dimension=51)

            d1 = encoder.encode(
                dataset,
                EncoderParams(result_path=path / "1/", label_config=lc))

            encoder = EvennessProfileEncoder.build_object(
                dataset, min_alpha=0, max_alpha=10, dimension=11)

            # NOTE(review): the result of this second run is not asserted on;
            # presumably it only checks that encoding with pool_size=2 completes - confirm
            encoder.encode(
                dataset,
                EncoderParams(result_path=path, label_config=lc, pool_size=2))

            self.assertAlmostEqual(d1.encoded_data.examples[0, 0], 1)
            self.assertAlmostEqual(d1.encoded_data.examples[0, 1], 0.786444)
            self.assertAlmostEqual(d1.encoded_data.examples[1, 0], 1)
            self.assertAlmostEqual(d1.encoded_data.examples[1, 1], 1)
        finally:
            shutil.rmtree(path)