Beispiel #1
0
    def _construct_test_dataset(self, path):
        """Build a three-sequence dataset together with its label configuration.

        Every sequence carries the custom parameters "l1" and "l2"; both labels
        are registered on the returned configuration with the values 1 and 2.

        Args:
            path: location handed to ``SequenceDataset.build_from_objects``.

        Returns:
            A ``(dataset, label_configuration)`` tuple.
        """
        # One row per example: (identifier, amino acid sequence, l1, l2).
        fixture_rows = (
            ("1", "AAAA", 1, 1),
            ("2", "ATA", 2, 1),
            ("3", "ATT", 1, 2),
        )

        sequences = []
        for seq_id, amino_acids, l1_value, l2_value in fixture_rows:
            label_values = {"l1": l1_value, "l2": l2_value}
            sequences.append(
                ReceptorSequence(
                    amino_acid_sequence=amino_acids,
                    identifier=seq_id,
                    metadata=SequenceMetadata(custom_params=label_values),
                )
            )

        label_config = LabelConfiguration()
        for label_name in ("l1", "l2"):
            label_config.add_label(label_name, [1, 2])

        dataset = SequenceDataset.build_from_objects(
            sequences=sequences, file_size=10, path=path)

        return dataset, label_config
Beispiel #2
0
 def create_dummy_sequencedataset(self, path):
     """Create a small sequence dataset with one alpha and two beta chains.

     The dataset is built in the "sequences" subdirectory of ``path`` (which
     is created through ``PathBuilder.build``) with a file size of 2.

     Args:
         path: base directory; must support the ``/`` operator.

     Returns:
         The result of ``SequenceDataset.build_from_objects``.
     """
     # One row per sequence:
     # (identifier, amino acids, v gene, j gene, chain, custom parameters).
     # Each row owns its own custom-parameter dict so no state is shared.
     fixture_rows = [
         ("1a", "AAATTT", "TRAV1", "TRAJ1", Chain.ALPHA,
          {"d_call": "TRAD1", "custom1": "cust1"}),
         ("1b", "ATATAT", "TRBV1", "TRBJ1", Chain.BETA,
          {"d_call": "TRBD1", "custom2": "cust1"}),
         ("2b", "ATATAT", "TRBV1", "TRBJ1", Chain.BETA,
          {"d_call": "TRBD1", "custom2": "cust1"}),
     ]

     sequences = [
         ReceptorSequence(
             amino_acid_sequence=amino_acids,
             identifier=seq_id,
             metadata=SequenceMetadata(v_gene=v_gene, j_gene=j_gene,
                                       chain=chain, frame_type="IN",
                                       custom_params=custom),
         )
         for seq_id, amino_acids, v_gene, j_gene, chain, custom in fixture_rows
     ]

     target_dir = path / "sequences"
     PathBuilder.build(target_dir)

     return SequenceDataset.build_from_objects(sequences, 2, target_dir)
Beispiel #3
0
    def construct_test_flatten_dataset(self, path):
        """Build a two-sequence dataset labelled with the custom parameter "l1".

        ``path`` is first passed to ``PathBuilder.build`` and then used as the
        dataset location.

        Args:
            path: location handed to ``PathBuilder.build`` and to
                ``SequenceDataset.build_from_objects``.

        Returns:
            The result of ``SequenceDataset.build_from_objects``.
        """
        # One row per example: (identifier, amino acid sequence, l1 value).
        fixture_rows = (
            ("1", "AAATTT", 1),
            ("2", "ATATAT", 2),
        )
        sequences = [
            ReceptorSequence(
                amino_acid_sequence=amino_acids,
                identifier=seq_id,
                metadata=SequenceMetadata(custom_params={"l1": l1_value}),
            )
            for seq_id, amino_acids, l1_value in fixture_rows
        ]

        PathBuilder.build(path)

        dataset = SequenceDataset.build_from_objects(
            sequences=sequences, file_size=10, path=path)
        return dataset
    def test(self):
        """Encode nine sequences with ``KmerFreqSequenceEncoder`` and check the output.

        The fixture cycles through three 6-letter sequences (identifiers "1" to
        "9") with the label "l1" alternating between 1 and 2. The encoder is
        configured for continuous 3-mers on nucleotide sequences with relative
        frequency normalization and unique reads.

        Checks performed:
            * the encoded matrix has nine rows,
            * every identifier "1".."9" appears in ``example_ids``,
            * rows 0 and 3 of the encoded matrix are equal (presumably the
              encodings of sequences "1" and "4", which share the same
              sequence -- this assumes row order follows input order).

        The temporary directory is removed even when the encoder or one of the
        assertions fails, so a failing run does not leave files behind.
        """
        # The three sequences repeat in order while the label alternates 1, 2;
        # this reproduces identifiers "1".."9" of the original literal list.
        motifs = ("AAACCC", "ACACAC", "CCCAAA")
        sequences = [
            ReceptorSequence(
                amino_acid_sequence=motifs[index % 3],
                nucleotide_sequence=motifs[index % 3],
                identifier=str(index + 1),
                metadata=SequenceMetadata(custom_params={"l1": index % 2 + 1}))
            for index in range(9)
        ]

        path = EnvironmentSettings.tmp_test_path / "kmrefreqseqfacencoder/"
        PathBuilder.build(path)

        # Everything after directory creation runs under try/finally so the
        # temporary files are always cleaned up.
        try:
            dataset = SequenceDataset.build_from_objects(
                sequences, 100, PathBuilder.build(path / 'data'), 'd2')

            lc = LabelConfiguration()
            lc.add_label("l1", [1, 2])

            encoder = KmerFreqSequenceEncoder.build_object(
                dataset, **{
                    "normalization_type":
                    NormalizationType.RELATIVE_FREQUENCY.name,
                    "reads": ReadsType.UNIQUE.name,
                    "sequence_encoding":
                    SequenceEncodingType.CONTINUOUS_KMER.name,
                    "sequence_type": SequenceType.NUCLEOTIDE.name,
                    "k": 3
                })

            encoded_dataset = encoder.encode(
                dataset,
                EncoderParams(result_path=path / "2/",
                              label_config=lc,
                              pool_size=2,
                              learn_model=True,
                              model={},
                              filename="dataset.csv"))

            encoded_data = encoded_dataset.encoded_data

            self.assertEqual(9, encoded_data.examples.shape[0])

            # One assertion per identifier, so a failure names the missing id.
            for identifier in ['1', '2', '3', '4', '5', '6', '7', '8', '9']:
                self.assertIn(identifier, encoded_data.example_ids)

            self.assertTrue(
                numpy.array_equal(encoded_data.examples[0].A,
                                  encoded_data.examples[3].A))
        finally:
            shutil.rmtree(path)