def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
        """
        One-hot encodes both chains of every receptor in the dataset.

        For each receptor, the sequence of type self.sequence_type is read from every chain listed by get_chains(). The
        sequences of each chain are encoded with self._encode_sequence_list, padded to the longest sequence found in either
        chain, and the two per-chain arrays are stacked along axis 1. When self.flatten is set, examples are reshaped to one
        row per receptor and the nested feature names are flattened into a single list.

        Raises:
            ValueError: if any chain has no sequence (None) for the requested sequence type.

        Returns:
            EncodedData holding the examples, labels (None unless params.encode_labels is set), example ids, feature names
            and, under info["chain_names"], the chain names (None when the receptors do not all report the same chains).
        """
        receptors = list(dataset.get_data())

        sequences_per_receptor = []
        for receptor in receptors:
            sequences_per_receptor.append([getattr(receptor, chain_name).get_sequence(self.sequence_type)
                                           for chain_name in receptor.get_chains()])

        # transpose: one tuple of sequences per chain instead of one list per receptor
        chain_1_seqs, chain_2_seqs = zip(*sequences_per_receptor)
        all_seqs = chain_1_seqs + chain_2_seqs

        if any(seq is None for seq in all_seqs):
            raise ValueError(
                f"{OneHotEncoder.__name__}: receptor dataset {dataset.name} (id: {dataset.identifier}) contains empty sequences for the "
                f"specified sequence type {self.sequence_type.name.lower()}. Please check that the dataset is imported correctly."
            )

        # both chains are padded to the same length so that they can be stacked
        longest = max(len(seq) for seq in all_seqs)

        example_ids = dataset.get_example_ids()
        if params.encode_labels:
            labels = self._get_labels(receptors, params)
        else:
            labels = None

        receptor_count = len(receptors)
        encoded_per_chain = [self._encode_sequence_list(chain_seqs, pad_n_sequences=receptor_count, pad_sequence_len=longest)
                             for chain_seqs in (chain_1_seqs, chain_2_seqs)]
        examples = np.stack(encoded_per_chain, axis=1)

        feature_names = self._get_feature_names(longest, receptors[0].get_chains())

        if self.flatten:
            examples = examples.reshape((receptor_count, 2 * longest * len(self.onehot_dimensions)))

            flat_feature_names = []
            for names_per_chain in feature_names:
                for names_per_position in names_per_chain:
                    flat_feature_names.extend(names_per_position)
            feature_names = flat_feature_names

        # chain names are only reported when every receptor agrees with the first one
        reference_chains = receptors[0].get_chains()
        if all(receptor.get_chains() == reference_chains for receptor in receptors):
            chain_names = reference_chains
        else:
            chain_names = None

        return EncodedData(examples=examples,
                           labels=labels,
                           example_ids=example_ids,
                           feature_names=feature_names,
                           encoding=OneHotEncoder.__name__,
                           info={"chain_names": chain_names})
    def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
        """
        One-hot encodes both chains of every receptor in the dataset.

        The sequences of each chain are encoded with self._encode_sequence_list, padded to the longest sequence found in
        either chain, and the two per-chain arrays are stacked along axis 1. When self.flatten is set, examples are reshaped
        to one row per receptor and the nested feature names are flattened into a single list.

        Raises:
            ValueError: if any chain returns None as its sequence.

        Returns:
            EncodedData holding the examples, labels (None unless params.encode_labels is set), example ids, feature names
            and, under info["chain_names"], the chain names (None when the receptors do not all report the same chains).
        """
        receptor_objs = list(dataset.get_data())
        sequences = [[getattr(obj, chain).get_sequence() for chain in obj.get_chains()]
                     for obj in receptor_objs]
        first_chain_seqs, second_chain_seqs = zip(*sequences)
        all_seqs = first_chain_seqs + second_chain_seqs

        # Fail early with a readable message: len(None) below would otherwise raise an opaque TypeError.
        if any(seq is None for seq in all_seqs):
            raise ValueError(
                f"{OneHotEncoder.__name__}: receptor dataset (id: {getattr(dataset, 'identifier', None)}) contains empty "
                f"sequences. Please check that the dataset is imported correctly.")

        # both chains are padded to the same length so that they can be stacked
        max_seq_len = max(len(seq) for seq in all_seqs)

        example_ids = dataset.get_example_ids()
        labels = self._get_labels(receptor_objs, params) if params.encode_labels else None

        examples_first_chain = self._encode_sequence_list(first_chain_seqs,
                                                          pad_n_sequences=len(receptor_objs),
                                                          pad_sequence_len=max_seq_len)
        examples_second_chain = self._encode_sequence_list(second_chain_seqs,
                                                           pad_n_sequences=len(receptor_objs),
                                                           pad_sequence_len=max_seq_len)

        examples = np.stack((examples_first_chain, examples_second_chain), axis=1)

        feature_names = self._get_feature_names(max_seq_len, receptor_objs[0].get_chains())

        if self.flatten:
            examples = examples.reshape((len(receptor_objs), 2 * max_seq_len * len(self.onehot_dimensions)))
            feature_names = [item for sublist in feature_names for subsublist in sublist for item in subsublist]

        # chain names are only reported when every receptor agrees with the first one
        first_receptor_chains = receptor_objs[0].get_chains()
        same_chains = all(receptor_obj.get_chains() == first_receptor_chains for receptor_obj in receptor_objs)

        encoded_data = EncodedData(examples=examples,
                                   labels=labels,
                                   example_ids=example_ids,
                                   feature_names=feature_names,
                                   encoding=OneHotEncoder.__name__,
                                   info={"chain_names": first_receptor_chains if same_chains else None})

        return encoded_data
    def _encode_examples(self, dataset: ReceptorDataset,
                         params: EncoderParams):
        """
        Encodes each chain of each receptor with the prepared sequence encoder and merges the per-chain counts.

        Every chain of a receptor is encoded into its own Counter via self._encode_sequence; the counts are then passed
        through self._add_chain_to_name and the results for the first two chains are added together to form one encoded
        receptor. Labels are collected from receptor metadata only when params.encode_labels is set.

        Returns:
            a tuple (encoded_receptors, receptor_ids, labels, feature_names) where labels is None if labels are not encoded
        """
        label_config = params.label_config
        if params.encode_labels:
            labels = {name: [] for name in label_config.get_labels_by_name()}
        else:
            labels = None

        sequence_encoder = self._prepare_sequence_encoder()
        feature_names = sequence_encoder.get_feature_names(params)

        counts_per_receptor, receptor_ids = [], []
        chains = []

        for receptor in dataset.get_data(params.pool_size):
            chains = receptor.get_chains()

            chain_counts = {}
            for chain in chains:
                chain_counts[chain] = self._encode_sequence(receptor.get_chain(chain), params, sequence_encoder, Counter())

            counts_per_receptor.append(chain_counts)
            receptor_ids.append(receptor.identifier)

            if params.encode_labels:
                for label_name in label_config.get_labels_by_name():
                    labels[label_name].append(receptor.metadata[label_name])

        # note: the chains of the last receptor seen above are used to look up the counts of all receptors
        encoded_receptors = []
        for chain_counts in counts_per_receptor:
            named_counts = [self._add_chain_to_name(chain_counts[chain], chain) for chain in chains]
            encoded_receptors.append(named_counts[0] + named_counts[1])

        return encoded_receptors, receptor_ids, labels, feature_names
Esempio n. 4
0
    def import_sequence_dataset(import_class, params, dataset_name: str):
        """
        Imports all sequence files of a dataset, stores the items in batch files and returns the resulting dataset.

        Items imported from consecutive files are accumulated; whenever more than params.sequence_file_size items are
        pending (or the last file was read and anything is left), a new "batch_<n>.pickle" file is stored under
        params.result_path. The dataset is a ReceptorDataset if params.paired is set and a SequenceDataset otherwise,
        and it is exported with PickleExporter before being returned.
        """
        PathBuilder.build(params.result_path)

        filenames = ImportHelper.get_sequence_filenames(params.path, dataset_name)

        batch_index = 0
        batch_paths = []
        labels = {}
        pending = None

        for index, filename in enumerate(filenames):
            imported = ImportHelper.import_items(import_class, filename, params)
            if pending is None:
                pending = imported
            else:
                pending = np.append(pending, imported)
            labels = ImportHelper.extract_sequence_dataset_params(pending, params)

            is_last_file = index == len(filenames) - 1
            # for the last file, keep flushing until nothing is left, even if the final batch is not full
            while len(pending) > params.sequence_file_size or (is_last_file and len(pending) > 0):
                batch_paths.append(params.result_path / "batch_{}.pickle".format(batch_index))
                ImportHelper.store_sequence_items(batch_paths, pending, params.sequence_file_size)
                pending = pending[params.sequence_file_size:]
                batch_index += 1

        init_kwargs = {"filenames": batch_paths, "file_size": params.sequence_file_size, "name": dataset_name, "labels": labels}

        if params.paired:
            dataset = ReceptorDataset(**init_kwargs)
        else:
            dataset = SequenceDataset(**init_kwargs)

        PickleExporter.export(dataset, params.result_path)

        return dataset
    def _create_dummy_data(self, path, dataset_type):
        """
        Builds a small test dataset from one repertoire of five sequences.

        Returns a ReceptorDataset built from the repertoire's receptors when dataset_type is "receptor", a
        RepertoireDataset wrapping the repertoire when it is "repertoire", and None for any other value.
        """
        PathBuilder.build(path)

        n_sequences = 5
        test_repertoire = Repertoire.build(
            sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
            v_genes=["V1-1"] * n_sequences,
            j_genes=["J1-1"] * n_sequences,
            chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
            custom_lists={"custom_1": [f"CUST-{i}" for i in range(n_sequences)],
                          "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2},
            cell_ids=["1", "1", "1", "2", "2"],
            path=path)

        if dataset_type == "receptor":
            dataset = ReceptorDataset.build_from_objects(test_repertoire.receptors, 100, path, name="receptor_dataset")
            dataset.identifier = 'receptor_dataset'
            return dataset

        if dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            return RepertoireDataset(repertoires=[test_repertoire])

        return None
    def create_dataset(self, path, dataset_size: int = 50):
        """
        Creates a receptor dataset of dataset_size receptors alternating between two sequence/label combinations.

        Receptors at even positions use "ACACAC" for both chains and label l1=1, the others use "DDDEEE" and l1=2.
        The receptors are pickled to path / "receptors.pkl" and a ReceptorDataset pointing to that file is returned.
        """
        seq1 = ReceptorSequence(amino_acid_sequence="ACACAC")
        seq2 = ReceptorSequence(amino_acid_sequence="DDDEEE")

        receptors = []
        for i in range(dataset_size):
            sequence, label = (seq1, 1) if i % 2 == 0 else (seq2, 2)
            receptors.append(TCABReceptor(alpha=sequence,
                                          beta=sequence,
                                          metadata={"l1": label},
                                          identifier=str(i)))

        PathBuilder.build(path)
        filename = path / "receptors.pkl"
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        # NOTE: this label configuration is built but not used by the returned dataset
        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        return ReceptorDataset(labels={"l1": [1, 2]},
                               filenames=[filename],
                               identifier="d1")
Esempio n. 7
0
    def create_dummy_receptordataset(self, path):
        """
        Builds a ReceptorDataset of two TCABReceptor objects with fully specified chain metadata.

        The dataset is stored under path / "receptors" with a file size of 2.
        """

        def build_chain(sequence, identifier, chain, locus, custom_key):
            # gene names are derived from the locus prefix, e.g. "TRA" -> TRAV1 / TRAJ1 / TRAD1
            return ReceptorSequence(amino_acid_sequence=sequence, identifier=identifier,
                                    metadata=SequenceMetadata(v_gene=f"{locus}V1", j_gene=f"{locus}J1",
                                                              chain=chain,
                                                              frame_type="IN",
                                                              custom_params={"d_call": f"{locus}D1",
                                                                             custom_key: "cust1"}))

        receptors = [
            TCABReceptor(identifier="1",
                         alpha=build_chain("AAATTT", "1a", Chain.ALPHA, "TRA", "custom1"),
                         beta=build_chain("ATATAT", "1b", Chain.BETA, "TRB", "custom1")),
            TCABReceptor(identifier="2",
                         alpha=build_chain("AAAAAA", "2a", Chain.ALPHA, "TRA", "custom2"),
                         beta=build_chain("AAAAAA", "2b", Chain.BETA, "TRB", "custom2")),
        ]

        receptors_path = path / "receptors"
        PathBuilder.build(receptors_path)
        return ReceptorDataset.build_from_objects(receptors, 2, receptors_path)
    def _create_dummy_data(self, path, dataset_type):
        """
        Builds a small test dataset from one repertoire of five sequences.

        For dataset_type "receptor", the repertoire's receptors are pickled to path / "receptors.pkl" and a
        ReceptorDataset pointing to that file is returned; for "repertoire", a RepertoireDataset wrapping the repertoire
        is returned. Any other value yields None.
        """
        PathBuilder.build(path)

        n_sequences = 5
        test_repertoire = Repertoire.build(
            sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
            v_genes=["V1-1"] * n_sequences,
            j_genes=["J1-1"] * n_sequences,
            chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
            custom_lists={"custom_1": [f"CUST-{i}" for i in range(n_sequences)],
                          "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2},
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            receptors_file = path / "receptors.pkl"
            with open(receptors_file, "wb") as file:
                pickle.dump(test_repertoire.receptors, file)

            return ReceptorDataset(filenames=[receptors_file],
                                   identifier="receptor_dataset")

        if dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            return RepertoireDataset(repertoires=[test_repertoire])

        return None
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """
        Encodes the dataset and returns a new ReceptorDataset that points to the same files and labels as the input
        dataset and carries the encoded data.
        """
        # the encoding is the first argument so that it is still computed before the dataset's files are queried
        return ReceptorDataset(encoded_data=self._encode_data(dataset, params),
                               filenames=dataset.get_filenames(),
                               labels=dataset.labels)
    def test(self):
        """
        Encodes four receptors with continuous 3-mer relative frequencies and checks the number of examples, the example
        ids, that receptors with identical sequences get identical encodings, and that chain-prefixed feature names exist.
        """
        # (alpha sequence, beta sequence, identifier); receptors 1 and 3 are identical, as are 2 and 4
        receptor_specs = [("AAACCC", "AAACCC", "1"),
                          ("AAA", "CCC", "2"),
                          ("AAACCC", "AAACCC", "3"),
                          ("AAA", "CCC", "4")]
        receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha),
                                  beta=ReceptorSequence(amino_acid_sequence=beta),
                                  identifier=identifier)
                     for alpha, beta, identifier in receptor_specs]

        path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
        PathBuilder.build(path / 'data')
        dataset = ReceptorDataset.build_from_objects(receptors, path=path, file_size=10)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        encoder = KmerFreqReceptorEncoder.build_object(dataset,
                                                       normalization_type=NormalizationType.RELATIVE_FREQUENCY.name,
                                                       reads=ReadsType.UNIQUE.name,
                                                       sequence_encoding=SequenceEncodingType.CONTINUOUS_KMER.name,
                                                       sequence_type=SequenceType.AMINO_ACID.name,
                                                       k=3)

        encoder_params = EncoderParams(result_path=path / "2/",
                                       label_config=lc,
                                       pool_size=2,
                                       learn_model=True,
                                       model={},
                                       filename="dataset.csv",
                                       encode_labels=False)
        encoded_data = encoder.encode(dataset, encoder_params).encoded_data

        self.assertEqual(4, encoded_data.examples.shape[0])

        for identifier in ['1', '2', '3', '4']:
            self.assertTrue(identifier in encoded_data.example_ids)

        first_example = encoded_data.examples[0].A
        third_example = encoded_data.examples[2].A
        self.assertTrue(numpy.array_equal(first_example, third_example))

        for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"]:
            self.assertTrue(feature_name in encoded_data.feature_names)

        shutil.rmtree(path)
Esempio n. 11
0
 def _build_labels(self, dataset: ReceptorDataset,
                   params: EncoderParams) -> dict:
     labels = {
         label: []
         for label in params.label_config.get_labels_by_name()
     }
     for receptor in dataset.get_data():
         for label in labels.keys():
             labels[label].append(receptor.metadata[label])
     return labels
Esempio n. 12
0
    def _implant_signals_in_receptors(
            simulation_state: SimulationState) -> Dataset:
        """
        Implants signals in the receptors of the simulation state's dataset and returns the resulting ReceptorDataset.

        The new dataset keeps the file size and name of the original one and is stored under simulation_state.result_path.
        Its labels are the original dataset's labels (if any) extended by one [True, False] entry per signal.
        """
        processed_receptors = SignalImplanter._implant_signals(simulation_state, SignalImplanter._process_receptor)

        source_dataset = simulation_state.dataset
        processed_dataset = ReceptorDataset.build(receptors=processed_receptors,
                                                  file_size=source_dataset.file_size,
                                                  name=source_dataset.name,
                                                  path=simulation_state.result_path)

        # start from a copy of the existing labels; entries added for signals take precedence on key clashes
        labels = dict(source_dataset.labels) if source_dataset.labels is not None else {}
        for signal in simulation_state.signals:
            labels[signal] = [True, False]
        processed_dataset.labels = labels

        return processed_dataset
    def _construct_test_dataset(self, path, dataset_size: int = 50):
        """
        Builds a ReceptorDataset of two receptors labeled l1=1 and l1=2 together with the matching label configuration.

        Note that dataset_size is not used: the dataset always consists of the same two receptors.

        Returns:
            a tuple (dataset, label_configuration)
        """
        # (alpha sequence, beta sequence, value of label l1, identifier)
        receptor_specs = (("AAAA", "ATA", 1, "1"),
                          ("ATA", "ATT", 2, "2"))
        receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha),
                                  beta=ReceptorSequence(amino_acid_sequence=beta),
                                  metadata={"l1": label},
                                  identifier=identifier)
                     for alpha, beta, label, identifier in receptor_specs]

        PathBuilder.build(path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        return ReceptorDataset.build(receptors, 2, path), lc
    def _import_from_files(filenames: List[str], generic_params: DatasetImportParams) -> ReceptorDataset:
        """
        Imports paired-chain receptors from files that have one receptor per line and builds a ReceptorDataset.

        For each file: rows with missing values and duplicated rows are removed, columns are renamed according to
        generic_params.column_mapping, the first and last amino acid (or the first and last three nucleotides) are
        stripped from the alpha/beta sequence columns that are present, and gene columns are expanded for every chain
        with SingleLineReceptorImport.make_gene_columns. Each remaining row becomes one receptor; all columns whose
        name contains neither a chain name nor "v_gene", "j_gene", "count" or "identifier" are stored as receptor metadata.

        Args:
            filenames: paths of the files to import
            generic_params: import parameters (separator, columns to load, column mapping, receptor chains, region type,
                sequence file size and result path are read from it)

        Returns:
            a ReceptorDataset built from all imported receptors
        """
        elements = []

        for file in filenames:
            # dropna() and drop_duplicates() return new data frames, so their results have to be kept for the filtering to
            # take effect; the index is reset so that the remaining rows are numbered consecutively again
            df = pd.read_csv(file, sep=generic_params.separator, usecols=generic_params.columns_to_load) \
                .dropna() \
                .drop_duplicates() \
                .rename(columns=generic_params.column_mapping) \
                .reset_index(drop=True)

            # strip one amino acid / three nucleotides from both ends of the sequences that were loaded
            for chain_prefix in ("alpha", "beta"):
                for suffix, trim in (("_amino_acid_sequence", 1), ("_nucleotide_sequence", 3)):
                    column = chain_prefix + suffix
                    if column in df:
                        df[column] = df[column].str[trim:-trim]

            chain_vals = list(generic_params.receptor_chains.value)
            chain_names = [Chain.get_chain(ch).name.lower() for ch in chain_vals]

            for chain_name in chain_names:
                df = SingleLineReceptorImport.make_gene_columns(df, ["v", "j"], chain_name)

            # columns whose name contains any of these fragments describe the chains and are not receptor metadata
            non_metadata_fragments = ["v_gene", 'j_gene', "count", "identifier"] + chain_names

            for _, row in df.iterrows():
                sequences = {}
                for chain_val, chain_name in zip(chain_vals, chain_names):
                    aa_column = chain_name + "_amino_acid_sequence"
                    nt_column = chain_name + "_nucleotide_sequence"

                    sequences[chain_val] = ReceptorSequence(
                        amino_acid_sequence=row[aa_column] if aa_column in row else None,
                        nucleotide_sequence=row[nt_column] if nt_column in row else None,
                        metadata=SequenceMetadata(v_gene=row[f"{chain_name}_v_gene"], v_allele=row[f"{chain_name}_v_allele"],
                                                  v_subgroup=row[f'{chain_name}_v_subgroup'],
                                                  j_gene=row[f"{chain_name}_j_gene"], j_allele=row[f"{chain_name}_j_allele"],
                                                  j_subgroup=row[f'{chain_name}_j_subgroup'],
                                                  chain=chain_name, count=row["count"],
                                                  region_type=generic_params.region_type.value))

                receptor_metadata = {key: row[key] for key in row.keys()
                                     if all(fragment not in key for fragment in non_metadata_fragments)}

                elements.append(ReceptorBuilder.build_object(sequences, row["identifier"], receptor_metadata))

        return ReceptorDataset.build(elements, generic_params.sequence_file_size, generic_params.result_path)
Esempio n. 15
0
    def load_sequence_dataset(params: dict, dataset_name: str) -> Dataset:
        """
        Imports all IRIS sequence files of a dataset, stores the items in batch files and returns the resulting dataset.

        Items are written to "batch_<n>.pickle" files under the result path, each holding at most sequence_file_size
        items. Items of a file that do not fill a complete batch are carried over and combined with the items of the
        next file, so that no item is dropped between files; after the last file, everything that is left is written.

        Args:
            params: keyword arguments from which IRISImportParams is built
            dataset_name: name of the dataset, also used to locate the files to import

        Returns:
            a ReceptorDataset if the import is paired, otherwise a SequenceDataset
        """

        iris_params = IRISImportParams.build_object(**params)

        filenames = ImportHelper.get_sequence_filenames(iris_params.path, dataset_name)
        file_index = 0
        dataset_filenames = []
        items = []

        for index, filename in enumerate(filenames):
            new_items = IRISSequenceImport.import_items(filename, paired=iris_params.paired,
                                                        all_dual_chains=iris_params.import_dual_chains,
                                                        all_genes=iris_params.import_all_gene_combinations)

            # keep the remainder of the previous file(s): overwriting it here would silently lose those items
            items = [*items, *new_items] if len(items) > 0 else new_items

            is_last_file = index == len(filenames) - 1
            while len(items) > iris_params.sequence_file_size or (is_last_file and len(items) > 0):
                dataset_filenames.append(iris_params.result_path / "batch_{}.pickle".format(file_index))
                ImportHelper.store_sequence_items(dataset_filenames, items, iris_params.sequence_file_size)
                items = items[iris_params.sequence_file_size:]
                file_index += 1

        init_kwargs = {"filenames": dataset_filenames, "file_size": iris_params.sequence_file_size, "name": dataset_name}

        return ReceptorDataset(**init_kwargs) if iris_params.paired else SequenceDataset(**init_kwargs)
Esempio n. 16
0
    def test_split_dataset(self):
        """
        Splits ten receptors belonging to three subjects with leave-one-out stratification and checks that each test
        dataset holds exactly one subject while the matching train dataset holds only the other subjects.
        """
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "leave_one_out_splitter/")

        # subjects 0, 1, 2 are assigned in turn
        receptors = [TCABReceptor(ReceptorSequence(), ReceptorSequence(), {"subject": i % 3})
                     for i in range(10)]

        filename = path / "batch1.pickle"
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        dataset = ReceptorDataset(filenames=[filename])

        result_paths = [path / f"result_{i}/" for i in range(1, 4)]
        split_config = SplitConfig(SplitType.LEAVE_ONE_OUT_STRATIFICATION,
                                   split_count=3,
                                   leave_one_out_config=LeaveOneOutConfig("subject", 1))
        params = DataSplitterParams(dataset,
                                    SplitType.LEAVE_ONE_OUT_STRATIFICATION,
                                    3,
                                    paths=result_paths,
                                    split_config=split_config)

        train_datasets, test_datasets = LeaveOneOutSplitter.split_dataset(params)

        self.assertEqual(3, len(train_datasets))
        self.assertEqual(3, len(test_datasets))

        for subject in range(3):
            for receptor in test_datasets[subject].get_data():
                self.assertTrue(receptor.metadata["subject"] == subject)
            for receptor in train_datasets[subject].get_data():
                self.assertTrue(receptor.metadata["subject"] != subject)

        shutil.rmtree(path)
    def construct_test_flatten_dataset(self, path):
        """
        Builds a ReceptorDataset of three receptors (the last two have the same content) stored under path with a file
        size of 10.
        """
        # (alpha sequence, alpha id, beta sequence, beta id, value of label l1, receptor id)
        receptor_specs = [("AAATTT", "1a", "ATATAT", "1b", 1, "1"),
                          ("AAAAAA", "2a", "AAAAAA", "2b", 2, "2"),
                          ("AAAAAA", "2a", "AAAAAA", "2b", 2, "2")]

        receptors = []
        for alpha_seq, alpha_id, beta_seq, beta_id, label, receptor_id in receptor_specs:
            receptors.append(TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_seq, identifier=alpha_id),
                                          beta=ReceptorSequence(amino_acid_sequence=beta_seq, identifier=beta_id),
                                          metadata={"l1": label},
                                          identifier=receptor_id))

        return ReceptorDataset.build(receptors, 10, path)
Esempio n. 18
0
    def generate_receptor_dataset(receptor_count: int,
                                  chain_1_length_probabilities: dict,
                                  chain_2_length_probabilities: dict,
                                  labels: dict, path: Path):
        """
        Creates receptor_count receptors where the length of sequences in each chain is sampled independently for each sequence from
        chain_n_length_probabilities distribution. The labels are also randomly assigned to receptors from the distribution given in
        labels. In this case, labels are multi-class, so each receptor will get one class from each label. This means that negative
        classes for the labels should be included as well in the specification. chain 1 and 2 in this case refer to alpha and beta
        chain of a T-cell receptor.

        The receptors are stored as a record array in path / "batch01.npy" and a ReceptorDataset pointing to that file is
        returned. Every receptor additionally gets a "subject" metadata entry ("subj_1", "subj_2", ...).

        An example of input parameters is given below:

        receptor_count: 100 # generate 100 TCABReceptor objects
        chain_1_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 1) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 1) will have length 15
        chain_2_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 2) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 2) will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the receptors will have class True
                False: 0.5 # 50% of the receptors will have class False
            epitope2: # next label with classes that will be assigned to receptors independently of the previous label or other parameters
                1: 0.3 # 30% of the generated receptors will have class 1
                0: 0.7 # 70% of the generated receptors will have class 0
        """
        RandomDatasetGenerator._check_receptor_dataset_generation_params(
            receptor_count, chain_1_length_probabilities,
            chain_2_length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        def get_random_sequence(proba, chain, cell_id):
            # the length is drawn from the given distribution first, then the residues are drawn uniformly
            length = random.choices(list(proba.keys()), list(proba.values()))[0]
            return ReceptorSequence("".join(random.choices(alphabet, k=length)),
                                    metadata=SequenceMetadata(count=1,
                                                              v_subgroup=chain + "V1",
                                                              v_gene=chain + "V1-1",
                                                              v_allele=chain + "V1-1*01",
                                                              j_subgroup=chain + "J1",
                                                              j_gene=chain + "J1-1",
                                                              j_allele=chain + "J1-1*01",
                                                              chain=chain,
                                                              cell_id=cell_id))

        receptors = []
        for i in range(receptor_count):
            alpha = get_random_sequence(chain_1_length_probabilities, "TRA", i)
            beta = get_random_sequence(chain_2_length_probabilities, "TRB", i)

            # one class per label, drawn independently for each label
            metadata = {label: random.choices(list(label_dict.keys()), list(label_dict.values()), k=1)[0]
                        for label, label_dict in labels.items()}
            metadata["subject"] = f"subj_{i + 1}"

            receptors.append(TCABReceptor(alpha=alpha, beta=beta, metadata=metadata))

        filename = path / "batch01.npy"

        # np.rec is the public home of fromrecords; the np.core namespace is private and deprecated since NumPy 2.0
        receptor_matrix = np.rec.fromrecords([receptor.get_record() for receptor in receptors],
                                             names=TCABReceptor.get_record_names())
        np.save(str(filename), receptor_matrix, allow_pickle=False)

        return ReceptorDataset(labels={label: list(label_dict.keys()) for label, label_dict in labels.items()},
                               filenames=[filename],
                               file_size=receptor_count,
                               element_class_name=type(receptors[0]).__name__ if len(receptors) > 0 else None)
Esempio n. 19
0
    def prepare_tcr_dist_dataframe(dataset: ReceptorDataset,
                                   label_names: list) -> pd.DataFrame:
        """
        Builds a data frame with one row per receptor, using the column names subject, epitope, count, v_a_gene, j_a_gene,
        cdr3_a_aa, v_b_gene, j_b_gene, cdr3_b_aa and clone_id.

        The subject is taken from receptor metadata if present and is otherwise "sub" followed by the receptor identifier.
        The count is the chains' count if both chains have the same, non-missing count, and 1 otherwise. The columns
        cdr3_b_nucseq and cdr3_a_nucseq are added only if nucleotide sequences are available for both chains of all receptors.

        Raises:
            NotImplementedError: if more than one label name is given
        """
        if len(label_names) > 1:
            raise NotImplementedError(
                f"TCRdist: multiple labels specified ({str(label_names)[1:-1]}), but only single label binary class "
                f"is currently supported in immuneML.")
        label_name = label_names[0]

        columns = {name: [] for name in ("subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa",
                                         "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id")}
        cdr3_a_nucseq, cdr3_b_nucseq = [], []

        for receptor in dataset.get_data():
            if "subject" in receptor.metadata:
                columns["subject"].append(receptor.metadata["subject"])
            else:
                columns["subject"].append("sub" + receptor.identifier)
            columns["epitope"].append(receptor.metadata[label_name])

            alpha = receptor.get_chain("alpha")
            beta = receptor.get_chain("beta")

            if alpha.metadata.count == beta.metadata.count and beta.metadata.count is not None:
                columns["count"].append(alpha.metadata.count)
            else:
                columns["count"].append(1)

            columns["v_a_gene"].append(TCRdistHelper.add_default_allele_to_v_gene(alpha.metadata.v_allele))
            columns["j_a_gene"].append(alpha.metadata.j_allele)
            columns["cdr3_a_aa"].append(alpha.amino_acid_sequence)
            cdr3_a_nucseq.append(alpha.nucleotide_sequence)

            columns["v_b_gene"].append(TCRdistHelper.add_default_allele_to_v_gene(beta.metadata.v_allele))
            columns["j_b_gene"].append(beta.metadata.j_allele)
            columns["cdr3_b_aa"].append(beta.amino_acid_sequence)
            cdr3_b_nucseq.append(beta.nucleotide_sequence)

            columns["clone_id"].append(receptor.identifier)

        alpha_nucseqs_complete = all(item is not None for item in cdr3_a_nucseq)
        if alpha_nucseqs_complete and all(item is not None for item in cdr3_b_nucseq):
            # nucleotide columns go last, beta before alpha
            columns["cdr3_b_nucseq"] = cdr3_b_nucseq
            columns["cdr3_a_nucseq"] = cdr3_a_nucseq

        return pd.DataFrame(columns)