Ejemplo n.º 1
0
    def _encode_data(self, dataset: SequenceDataset, params: EncoderParams):
        """Encode all sequences of ``dataset`` and wrap them in EncodedData.

        Every sequence is padded to the length of the longest sequence in
        the dataset before being passed to ``_encode_sequence_list``. When
        ``self.flatten`` is set, the encoded array is reshaped to
        ``(n_examples, max_len * len(self.onehot_dimensions))`` and the
        nested feature names are flattened to a single list.

        Args:
            dataset: dataset whose sequence objects are read via
                ``get_data(params.pool_size)``.
            params: encoder parameters; ``encode_labels`` decides whether
                labels are extracted.

        Returns:
            EncodedData holding examples, labels (or None), example ids and
            feature names.
        """
        receptor_sequences = list(dataset.get_data(params.pool_size))
        n_examples = len(receptor_sequences)

        raw_sequences = [item.get_sequence() for item in receptor_sequences]
        ids = dataset.get_example_ids()
        # The longest sequence determines the padded length of every example.
        longest = max(len(raw) for raw in raw_sequences)

        if params.encode_labels:
            encoded_labels = self._get_labels(receptor_sequences, params)
        else:
            encoded_labels = None

        matrix = self._encode_sequence_list(raw_sequences,
                                            pad_n_sequences=n_examples,
                                            pad_sequence_len=longest)
        names = self._get_feature_names(longest)

        if self.flatten:
            flat_width = longest * len(self.onehot_dimensions)
            matrix = matrix.reshape((n_examples, flat_width))
            names = [name for group in names for name in group]

        return EncodedData(examples=matrix,
                           labels=encoded_labels,
                           example_ids=ids,
                           feature_names=names,
                           encoding=OneHotEncoder.__name__)
Ejemplo n.º 2
0
    def test_make_subset(self):
        """make_subset should keep exactly the examples at the given indices.

        Writes 100 sequences into ten pickled batch files of ten sequences
        each, builds a dataset on top of them, takes a 15-element subset and
        checks both the identifiers and the size of the result.
        """
        all_sequences = [
            ReceptorSequence(amino_acid_sequence="AAA", identifier=str(number))
            for number in range(100)
        ]

        path = EnvironmentSettings.tmp_test_path / "element_generator_subset/"
        PathBuilder.build(path)

        batch_files = []
        for batch_no in range(10):
            batch_file = path / f"batch{batch_no}.pkl"
            first = batch_no * 10
            with batch_file.open("wb") as handle:
                pickle.dump(all_sequences[first:first + 10], handle)
            batch_files.append(batch_file)

        full_dataset = SequenceDataset(filenames=batch_files, file_size=10)

        indices = [1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92]

        subset = full_dataset.make_subset(indices, path, SequenceDataset.TRAIN)

        # Identifiers were assigned as str(position), so they map back to indices.
        for loaded_batch in subset.get_batch(1000):
            for loaded_sequence in loaded_batch:
                self.assertTrue(int(loaded_sequence.identifier) in indices)

        self.assertEqual(15, subset.get_example_count())

        shutil.rmtree(path)
Ejemplo n.º 3
0
    def test_make_subset(self):
        """make_subset should keep exactly the examples at the given indices.

        Stores 100 sequences as ten ``.npy`` record-array batches of ten
        sequences each, builds a dataset on top of them, takes a 15-element
        subset and checks both the identifiers and the size of the result.
        """
        sequences = [ReceptorSequence(amino_acid_sequence="AAA", identifier=str(i)) for i in range(100)]

        path = EnvironmentSettings.tmp_test_path / "element_generator_subset/"
        PathBuilder.build(path)

        filenames = []
        for i in range(10):
            filepath = path / f"batch{i}.npy"
            chunk = sequences[i * 10:(i + 1) * 10]
            # np.rec is the public namespace for fromrecords; np.core is
            # private and deprecated since NumPy 2.0.
            sequence_matrix = np.rec.fromrecords([seq.get_record() for seq in chunk],
                                                 names=ReceptorSequence.get_record_names())
            np.save(str(filepath), sequence_matrix, allow_pickle=False)
            filenames.append(filepath)

        d = SequenceDataset(filenames=filenames, file_size=10)

        indices = [1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92]

        d2 = d.make_subset(indices, path, SequenceDataset.TRAIN)

        # Identifiers were assigned as str(position), so they map back to indices.
        for batch in d2.get_batch(1000):
            for sequence in batch:
                self.assertIn(int(sequence.identifier), indices)

        self.assertEqual(15, d2.get_example_count())

        shutil.rmtree(path)
Ejemplo n.º 4
0
    def _encode_new_dataset(self, dataset: SequenceDataset,
                            params: EncoderParams):
        """Return a new SequenceDataset carrying the encoding of ``dataset``.

        The new dataset reuses the filenames, labels and file size of the
        input dataset and attaches the result of ``_encode_data``.
        """
        return SequenceDataset(encoded_data=self._encode_data(dataset, params),
                               filenames=dataset.get_filenames(),
                               labels=dataset.labels,
                               file_size=dataset.file_size)
    def create_dataset(self, path, dataset_size: int = 50):
        """Build a pickled sequence dataset with an alternating label ``l1``.

        Sequences at even positions get "AAACCC" with l1 == 1, sequences at
        odd positions get "ACACAC" with l1 == 2. All sequences are pickled
        into a single file ``sequences.pkl`` under ``path``.

        Args:
            path: directory in which the pickle file is created.
            dataset_size: number of sequences to generate.

        Returns:
            SequenceDataset with identifier "d1" pointing at the pickle file.
        """

        def make_sequence(position):
            is_even = position % 2 == 0
            return ReceptorSequence(
                amino_acid_sequence="AAACCC" if is_even else "ACACAC",
                identifier=str(position),
                metadata=SequenceMetadata(
                    custom_params={"l1": 1 if is_even else 2}))

        sequences = [make_sequence(position) for position in range(dataset_size)]

        PathBuilder.build(path)
        filename = path / "sequences.pkl"
        with open(filename, "wb") as handle:
            pickle.dump(sequences, handle)

        # NOTE(review): this label configuration is built but never used or
        # returned by this method - confirm whether it can be dropped.
        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        return SequenceDataset(labels={"l1": [1, 2]},
                               filenames=[filename],
                               identifier="d1")
Ejemplo n.º 6
0
    def _construct_test_dataset(self, path):
        """Create a three-sequence dataset plus its label configuration.

        Args:
            path: location handed to ``SequenceDataset.build``.

        Returns:
            Tuple ``(dataset, label_configuration)`` where the configuration
            declares labels "l1" and "l2", each with classes 1 and 2.
        """
        # (amino acid sequence, identifier, l1, l2)
        specs = (
            ("AAAA", "1", 1, 1),
            ("ATA", "2", 2, 1),
            ("ATT", "3", 1, 2),
        )
        sequences = [
            ReceptorSequence(
                amino_acid_sequence=amino_acids,
                identifier=identifier,
                metadata=SequenceMetadata(custom_params={"l1": l1, "l2": l2}))
            for amino_acids, identifier, l1, l2 in specs
        ]

        lc = LabelConfiguration()
        for label_name in ("l1", "l2"):
            lc.add_label(label_name, [1, 2])

        built = SequenceDataset.build(sequences=sequences,
                                      file_size=10,
                                      path=path)
        return built, lc
Ejemplo n.º 7
0
    def import_sequence_dataset(import_class, params, dataset_name: str):
        """Import sequence files and store them as fixed-size pickle batches.

        Items from consecutive input files are accumulated; whenever more
        than ``params.sequence_file_size`` items are pending, a batch file
        ``batch_<n>.pickle`` is written under ``params.result_path``. After
        the last input file every remaining item is written as well.

        Args:
            import_class: importer passed through to ``ImportHelper.import_items``.
            params: import parameters (paths, batch size, ``paired`` flag).
            dataset_name: name given to the resulting dataset.

        Returns:
            A ReceptorDataset when ``params.paired`` is truthy, otherwise a
            SequenceDataset; the dataset is also exported via PickleExporter.
        """
        PathBuilder.build(params.result_path)

        filenames = ImportHelper.get_sequence_filenames(params.path, dataset_name)

        batch_no = 0
        batch_paths = []
        dataset_params = {}
        pending = None

        for index, filename in enumerate(filenames):
            imported = ImportHelper.import_items(import_class, filename, params)
            if pending is None:
                pending = imported
            else:
                pending = np.append(pending, imported)
            dataset_params = ImportHelper.extract_sequence_dataset_params(pending, params)

            # Flush full batches; on the last file flush whatever remains.
            while len(pending) > params.sequence_file_size or (index == len(filenames) - 1 and len(pending) > 0):
                batch_paths.append(params.result_path / "batch_{}.pickle".format(batch_no))
                ImportHelper.store_sequence_items(batch_paths, pending, params.sequence_file_size)
                pending = pending[params.sequence_file_size:]
                batch_no += 1

        init_kwargs = {
            "filenames": batch_paths,
            "file_size": params.sequence_file_size,
            "name": dataset_name,
            "labels": dataset_params,
        }

        dataset_class = ReceptorDataset if params.paired else SequenceDataset
        dataset = dataset_class(**init_kwargs)

        PickleExporter.export(dataset, params.result_path)

        return dataset
Ejemplo n.º 8
0
    def _encode_data(self, dataset: SequenceDataset, params: EncoderParams):
        """Encode all sequences of ``dataset`` and wrap them in EncodedData.

        Sequences are read for ``self.sequence_type``. Every sequence is
        padded to the length of the longest one before being passed to
        ``_encode_sequence_list``. When ``self.flatten`` is set, the encoded
        array is reshaped to
        ``(n_examples, max_len * len(self.onehot_dimensions))`` and the
        nested feature names are flattened to a single list.

        Args:
            dataset: dataset whose sequence objects are read via
                ``get_data(params.pool_size)``.
            params: encoder parameters; ``encode_labels`` decides whether
                labels are extracted.

        Returns:
            EncodedData holding examples, labels (or None), example ids and
            feature names.

        Raises:
            ValueError: if any sequence is None for the requested sequence type.
        """
        receptor_sequences = list(dataset.get_data(params.pool_size))
        n_examples = len(receptor_sequences)

        raw_sequences = [
            item.get_sequence(self.sequence_type) for item in receptor_sequences
        ]

        for raw in raw_sequences:
            if raw is None:
                raise ValueError(
                    f"{OneHotEncoder.__name__}: sequence dataset {dataset.name} (id: {dataset.identifier}) contains empty sequences for the specified "
                    f"sequence type {self.sequence_type.name.lower()}. Please check that the dataset is imported correctly."
                )

        ids = dataset.get_example_ids()
        # The longest sequence determines the padded length of every example.
        longest = max(len(raw) for raw in raw_sequences)

        if params.encode_labels:
            encoded_labels = self._get_labels(receptor_sequences, params)
        else:
            encoded_labels = None

        matrix = self._encode_sequence_list(raw_sequences,
                                            pad_n_sequences=n_examples,
                                            pad_sequence_len=longest)
        names = self._get_feature_names(longest)

        if self.flatten:
            flat_width = longest * len(self.onehot_dimensions)
            matrix = matrix.reshape((n_examples, flat_width))
            names = [name for group in names for name in group]

        return EncodedData(examples=matrix,
                           labels=encoded_labels,
                           example_ids=ids,
                           feature_names=names,
                           encoding=OneHotEncoder.__name__)
Ejemplo n.º 9
0
 def create_dummy_sequencedataset(self, path):
     """Build a small SequenceDataset with one alpha and two beta sequences.

     The sequences are stored under ``path / "sequences"`` through
     ``SequenceDataset.build_from_objects`` with 2 as its second argument.
     """
     alpha_metadata = SequenceMetadata(v_gene="TRAV1", j_gene="TRAJ1", chain=Chain.ALPHA, frame_type="IN",
                                       custom_params={"d_call": "TRAD1", "custom1": "cust1"})
     alpha_sequence = ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1a", metadata=alpha_metadata)

     # The two beta sequences differ only in their identifier.
     beta_sequences = [
         ReceptorSequence(amino_acid_sequence="ATATAT", identifier=beta_id,
                          metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, frame_type="IN",
                                                    custom_params={"d_call": "TRBD1", "custom2": "cust1"}))
         for beta_id in ("1b", "2b")
     ]

     sequences = [alpha_sequence, *beta_sequences]
     sequences_path = path / "sequences"
     PathBuilder.build(sequences_path)
     return SequenceDataset.build_from_objects(sequences, 2, sequences_path)
Ejemplo n.º 10
0
    def construct_test_flatten_dataset(self, path):
        """Create a two-sequence dataset under ``path`` for flatten tests.

        The sequences "AAATTT" and "ATATAT" get identifiers "1" and "2" and
        label ``l1`` values 1 and 2 respectively.
        """
        sequences = [
            ReceptorSequence(
                amino_acid_sequence=amino_acids,
                identifier=str(number),
                metadata=SequenceMetadata(custom_params={"l1": number}))
            for number, amino_acids in enumerate(("AAATTT", "ATATAT"), start=1)
        ]

        PathBuilder.build(path)

        return SequenceDataset.build(sequences=sequences, file_size=10, path=path)
Ejemplo n.º 11
0
    def load_sequence_dataset(params: dict, dataset_name: str) -> Dataset:
        """Import IRIS sequence files and store them as pickle batches.

        Items are written to ``batch_<n>.pickle`` files under the result
        path in chunks of ``sequence_file_size``. Items left over from one
        input file (fewer than a full batch) are carried over and merged
        with the items of the next file, so nothing is dropped between
        files; after the last file everything that remains is written.

        Args:
            params: keyword arguments for ``IRISImportParams.build_object``.
            dataset_name: name given to the resulting dataset.

        Returns:
            A ReceptorDataset when the import is paired, otherwise a
            SequenceDataset, both pointing at the written batch files.
        """

        iris_params = IRISImportParams.build_object(**params)

        filenames = ImportHelper.get_sequence_filenames(iris_params.path, dataset_name)
        file_index = 0
        dataset_filenames = []
        leftover = []

        for index, filename in enumerate(filenames):
            items = IRISSequenceImport.import_items(filename, paired=iris_params.paired,
                                                    all_dual_chains=iris_params.import_dual_chains,
                                                    all_genes=iris_params.import_all_gene_combinations)

            # Prepend what the previous file could not fill a batch with;
            # previously this remainder was silently overwritten and lost.
            # NOTE(review): assumes imported items form a plain sequence that
            # can be concatenated as a list - confirm against import_items.
            if len(leftover) > 0:
                items = list(leftover) + list(items)

            # Flush full batches; on the last file flush whatever remains.
            while len(items) > iris_params.sequence_file_size or (index == len(filenames) - 1 and len(items) > 0):
                dataset_filenames.append(iris_params.result_path / "batch_{}.pickle".format(file_index))
                ImportHelper.store_sequence_items(dataset_filenames, items, iris_params.sequence_file_size)
                items = items[iris_params.sequence_file_size:]
                file_index += 1

            leftover = items

        return ReceptorDataset(filenames=dataset_filenames, file_size=iris_params.sequence_file_size, name=dataset_name) if iris_params.paired \
            else SequenceDataset(filenames=dataset_filenames, file_size=iris_params.sequence_file_size, name=dataset_name)
Ejemplo n.º 12
0
    def generate_sequence_dataset(sequence_count: int,
                                  length_probabilities: dict, labels: dict,
                                  path: Path):
        """
        Creates sequence_count receptor sequences (single chain) where the length of each sequence is sampled independently from
        the length_probabilities distribution. The labels are also randomly assigned to sequences from the distribution given in
        labels. In this case, labels are multi-class, so each sequence will get one class from each label. This means that negative
        classes for the labels should be included as well in the specification.

        Each sequence additionally gets a "subject" custom parameter of the form subj_<n>, numbered from 1. All sequences are
        stored in a single file batch01.npy under path.

        An example of input parameters is given below:

        sequence_count: 100 # generate 100 TRB ReceptorSequences
        length_probabilities:
            14: 0.8 # 80% of all generated sequences will have length 14
            15: 0.2 # 20% of all generated sequences will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the sequences will have class True
                False: 0.5 # 50% of the sequences will have class False
            epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                1: 0.3 # 30% of the generated sequences will have class 1
                0: 0.7 # 70% of the generated sequences will have class 0

        Returns a SequenceDataset that points at the stored file and lists the classes of every label.
        """
        RandomDatasetGenerator._check_sequence_dataset_generation_params(
            sequence_count, length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        chain = "TRB"
        lengths = list(length_probabilities.keys())
        length_weights = list(length_probabilities.values())

        sequences = []
        for i in range(sequence_count):
            # Draw order (length, residues, labels) is kept fixed so that a
            # seeded random generator keeps producing the same dataset.
            length = random.choices(lengths, length_weights)[0]
            amino_acids = "".join(random.choices(alphabet, k=length))

            custom_params = {
                label: random.choices(list(label_dict.keys()),
                                      list(label_dict.values()),
                                      k=1)[0]
                for label, label_dict in labels.items()
            }
            custom_params["subject"] = f"subj_{i + 1}"

            sequences.append(
                ReceptorSequence(
                    amino_acids,
                    metadata=SequenceMetadata(
                        count=1,
                        v_subgroup=chain + "V1",
                        v_gene=chain + "V1-1",
                        v_allele=chain + "V1-1*01",
                        j_subgroup=chain + "J1",
                        j_gene=chain + "J1-1",
                        j_allele=chain + "J1-1*01",
                        chain=chain,
                        custom_params=custom_params)))

        filename = path / "batch01.npy"

        # np.rec is the public namespace for fromrecords; np.core is private
        # and deprecated since NumPy 2.0.
        sequence_matrix = np.rec.fromrecords(
            [seq.get_record() for seq in sequences],
            names=ReceptorSequence.get_record_names())
        np.save(str(filename), sequence_matrix, allow_pickle=False)

        return SequenceDataset(
            labels={
                label: list(label_dict.keys())
                for label, label_dict in labels.items()
            },
            filenames=[filename],
            file_size=sequence_count)
    def test(self):
        """Encode nine sequences with a k-mer frequency encoder (k=3).

        The fixture cycles through three sequences, so sequences 1 and 4 are
        identical and must end up with identical encodings. Also checks that
        all nine examples and their identifiers are present.
        """
        repeating_sequences = ("AAACCC", "ACACAC", "CCCAAA")

        sequences = []
        for number in range(1, 10):
            residues = repeating_sequences[(number - 1) % 3]
            # Label l1 alternates: 1 for odd identifiers, 2 for even ones.
            label_value = 1 if number % 2 else 2
            sequences.append(
                ReceptorSequence(
                    amino_acid_sequence=residues,
                    nucleotide_sequence=residues,
                    identifier=str(number),
                    metadata=SequenceMetadata(
                        custom_params={"l1": label_value})))

        path = EnvironmentSettings.tmp_test_path / "kmrefreqseqfacencoder/"
        PathBuilder.build(path)
        dataset = SequenceDataset.build_from_objects(
            sequences, 100, PathBuilder.build(path / 'data'), 'd2')

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        encoder = KmerFreqSequenceEncoder.build_object(
            dataset,
            normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
            sequence_type=SequenceType.NUCLEOTIDE.name,
            k=3)

        encoder_params = EncoderParams(result_path=path / "2/",
                                       label_config=lc,
                                       pool_size=2,
                                       learn_model=True,
                                       model={},
                                       filename="dataset.csv")
        encoded_dataset = encoder.encode(dataset, encoder_params)

        encoded = encoded_dataset.encoded_data
        expected_ids = ['1', '2', '3', '4', '5', '6', '7', '8', '9']

        self.assertEqual(9, encoded.examples.shape[0])
        self.assertTrue(
            all(identifier in encoded.example_ids
                for identifier in expected_ids))
        # Examples 0 and 3 come from the same sequence "AAACCC".
        self.assertTrue(
            numpy.array_equal(encoded.examples[0].A, encoded.examples[3].A))

        shutil.rmtree(path)