Beispiel #1
0
    def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
        """
        Encodes all receptors of the dataset and wraps the result in an EncodedData object.

        The sequences of the two chains of every receptor are encoded separately with self._encode_sequence_list,
        both padded to the longest sequence found in either chain, and then stacked along axis 1. If self.flatten
        is set, the examples are reshaped to one row per receptor and the nested feature names are flattened.

        Labels are collected with self._get_labels only when params.encode_labels is set, otherwise they are None.
        The "chain_names" entry of the info dict is the chain list of the first receptor when all receptors report
        the same chains, and None otherwise.
        """
        receptors = list(dataset.get_data())
        n_receptors = len(receptors)

        # one [first chain sequence, second chain sequence] entry per receptor
        per_receptor_seqs = []
        for receptor in receptors:
            per_receptor_seqs.append([getattr(receptor, chain_name).get_sequence()
                                      for chain_name in receptor.get_chains()])
        first_chain_seqs, second_chain_seqs = zip(*per_receptor_seqs)

        # both chains are padded to the same length so that they can be stacked
        max_seq_len = max(len(seq) for seq in first_chain_seqs + second_chain_seqs)

        example_ids = dataset.get_example_ids()
        if params.encode_labels:
            labels = self._get_labels(receptors, params)
        else:
            labels = None

        encoded_chains = [
            self._encode_sequence_list(chain_seqs,
                                       pad_n_sequences=n_receptors,
                                       pad_sequence_len=max_seq_len)
            for chain_seqs in (first_chain_seqs, second_chain_seqs)
        ]
        examples = np.stack(encoded_chains, axis=1)

        feature_names = self._get_feature_names(max_seq_len, receptors[0].get_chains())

        if self.flatten:
            flat_shape = (n_receptors, 2 * max_seq_len * len(self.onehot_dimensions))
            examples = examples.reshape(flat_shape)

            # collapse the two outer nesting levels of the feature names into a single flat list
            flat_names = []
            for outer in feature_names:
                for inner in outer:
                    flat_names.extend(inner)
            feature_names = flat_names

        reference_chains = receptors[0].get_chains()
        chains_agree = all(receptor.get_chains() == reference_chains for receptor in receptors)

        return EncodedData(examples=examples,
                           labels=labels,
                           example_ids=example_ids,
                           feature_names=feature_names,
                           encoding=OneHotEncoder.__name__,
                           info={"chain_names": reference_chains if chains_agree else None})
Beispiel #2
0
    def import_sequence_dataset(import_class, params, dataset_name: str):
        """
        Imports the items from all sequence files of a dataset, stores them in pickled batches of at most
        params.sequence_file_size items under params.result_path, and returns the resulting dataset.

        Items left over after a file has been processed are carried over to the next file; after the last
        file everything that remains is written out. A ReceptorDataset is created when params.paired is set,
        otherwise a SequenceDataset. The dataset is exported with PickleExporter before it is returned.

        NOTE(review): dataset_params is recomputed on every file from the items pending at that point, so only
        the value computed for the last file is kept - confirm this is intended.
        """
        PathBuilder.build(params.result_path)

        filenames = ImportHelper.get_sequence_filenames(params.path, dataset_name)
        last_index = len(filenames) - 1
        batch_size = params.sequence_file_size

        batch_paths = []
        dataset_params = {}
        pending = None

        for index, filename in enumerate(filenames):
            loaded = ImportHelper.import_items(import_class, filename, params)
            pending = loaded if pending is None else np.append(pending, loaded)
            dataset_params = ImportHelper.extract_sequence_dataset_params(pending, params)

            is_last_file = index == last_index
            while len(pending) > batch_size or (is_last_file and len(pending) > 0):
                # batches are numbered consecutively, starting at 0
                batch_number = len(batch_paths)
                batch_paths.append(params.result_path + f"batch_{batch_number}.pickle")
                ImportHelper.store_sequence_items(batch_paths, pending, batch_size)
                pending = pending[batch_size:]

        dataset_class = ReceptorDataset if params.paired else SequenceDataset
        dataset = dataset_class(filenames=batch_paths,
                                file_size=params.sequence_file_size,
                                name=dataset_name,
                                params=dataset_params)

        PickleExporter.export(dataset, params.result_path)

        return dataset
Beispiel #3
0
    def prepare_tcr_dist_dataframe(dataset: ReceptorDataset, labels: list) -> pd.DataFrame:
        """
        Builds a data frame with one row per receptor of the dataset, using the column names
        subject, epitope, count, v_a_gene, j_a_gene, cdr3_a_aa, v_b_gene, j_b_gene, cdr3_b_aa and clone_id.

        - subject is taken from the receptor metadata if present, otherwise it is "sub" + receptor identifier
        - epitope is the value of the (single) label in the receptor metadata
        - count is the alpha chain count if it equals the beta chain count and is not None, otherwise 1
        - the columns cdr3_b_nucseq and cdr3_a_nucseq are added only if no receptor has a missing (None)
          nucleotide sequence in either chain

        Raises NotImplementedError if more than one label is given.
        """
        if len(labels) > 1:
            raise NotImplementedError(f"TCRdist: multiple labels specified ({str(labels)[1:-1]}), but only single label binary class "
                                      f"is currently supported in immuneML.")
        label = labels[0]

        base_columns = ["subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa",
                        "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id"]
        table = {column: [] for column in base_columns}
        alpha_nucseqs, beta_nucseqs = [], []

        for receptor in dataset.get_data():
            metadata = receptor.metadata
            table["subject"].append(metadata["subject"] if "subject" in metadata else "sub" + receptor.identifier)
            table["epitope"].append(metadata[label])

            alpha = receptor.get_chain("alpha")
            beta = receptor.get_chain("beta")

            alpha_count, beta_count = alpha.metadata.count, beta.metadata.count
            if alpha_count == beta_count and beta_count is not None:
                table["count"].append(alpha_count)
            else:
                table["count"].append(1)

            table["v_a_gene"].append(TCRdistHelper.add_default_allele_to_v_gene(alpha.metadata.v_allele))
            table["j_a_gene"].append(alpha.metadata.j_allele)
            table["cdr3_a_aa"].append(alpha.amino_acid_sequence)
            alpha_nucseqs.append(alpha.nucleotide_sequence)

            table["v_b_gene"].append(TCRdistHelper.add_default_allele_to_v_gene(beta.metadata.v_allele))
            table["j_b_gene"].append(beta.metadata.j_allele)
            table["cdr3_b_aa"].append(beta.amino_acid_sequence)
            beta_nucseqs.append(beta.nucleotide_sequence)

            table["clone_id"].append(receptor.identifier)

        # nucleotide columns are all-or-nothing: a single missing sequence leaves both columns out
        nucseqs_complete = all(seq is not None for seq in alpha_nucseqs) and all(seq is not None for seq in beta_nucseqs)
        if nucseqs_complete:
            table["cdr3_b_nucseq"] = beta_nucseqs
            table["cdr3_a_nucseq"] = alpha_nucseqs

        return pd.DataFrame(table)
    def _create_dummy_data(self, path, dataset_type):
        """
        Creates a small test dataset under the given path.

        A repertoire with five sequences is always built. For dataset_type "receptor", the receptors of that
        repertoire are pickled to <path>/receptors.pkl and a ReceptorDataset pointing to that file is returned.
        For dataset_type "repertoire", a RepertoireDataset holding the repertoire is returned. For any other
        dataset_type the result is None.
        """
        PathBuilder.build(path)

        n_sequences = 5
        test_repertoire = Repertoire.build(
            sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
            v_genes=["V1-1"] * n_sequences,
            j_genes=["J1-1"] * n_sequences,
            chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
            custom_lists={
                "custom_1": [f"CUST-{i}" for i in range(n_sequences)],
                "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2
            },
            cell_ids=[1, 1, 1, 2, 2],
            path=path)

        if dataset_type == "receptor":
            receptors_file = f"{path}/receptors.pkl"
            with open(receptors_file, "wb") as file:
                pickle.dump(test_repertoire.receptors, file)

            return ReceptorDataset(filenames=[receptors_file],
                                   identifier="receptor_dataset")

        if dataset_type == "repertoire":
            test_repertoire.identifier = "repertoire_dataset"
            return RepertoireDataset(repertoires=[test_repertoire])

        # unknown dataset type: nothing to build
        return None
Beispiel #5
0
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """
        Encodes the dataset with self._encode_data and returns a new ReceptorDataset that reuses the
        filenames and params of the input dataset and carries the encoded data.
        """
        # the encoding is computed before the filenames are read, keeping the original call order
        return ReceptorDataset(encoded_data=self._encode_data(dataset, params),
                               filenames=dataset.get_filenames(),
                               params=dataset.params)
Beispiel #6
0
    def _implant_signals_in_receptors(simulation_state: SimulationState) -> Dataset:
        """
        Implants signals in the receptors of the simulation state's dataset and builds a new ReceptorDataset
        from the processed receptors, reusing the file size and name of the original dataset and storing
        the result under simulation_state.result_path.

        The params of the new dataset are the params of the original dataset (if any) extended with one entry
        per item in simulation_state.signals, each mapped to the classes [True, False].
        """
        receptors = SignalImplanter._implant_signals(simulation_state, SignalImplanter._process_receptor)

        source_dataset = simulation_state.dataset
        processed_dataset = ReceptorDataset.build(receptors=receptors,
                                                  file_size=source_dataset.file_size,
                                                  name=source_dataset.name,
                                                  path=simulation_state.result_path)

        # start from a copy of the original params so that the source dataset is left untouched
        if source_dataset.params is not None:
            merged_params = {**source_dataset.params}
        else:
            merged_params = {}
        for signal in simulation_state.signals:
            merged_params[signal] = [True, False]

        processed_dataset.params = merged_params

        return processed_dataset
    def generate_receptor_dataset(receptor_count: int, chain_1_length_probabilities: dict, chain_2_length_probabilities: dict, labels: dict,
                                  path: str):
        """
        Generates receptor_count random receptors (TCABReceptor objects), pickles them to a single file in the given path
        and returns a ReceptorDataset pointing to that file.

        For every chain of every receptor, the sequence length is sampled independently from the corresponding
        chain_n_length_probabilities distribution and the sequence itself is sampled from the sequence alphabet.
        Chain 1 is used for the alpha chain and chain 2 for the beta chain of the receptor.

        Labels are multi-class: each receptor gets exactly one class per label, sampled from the distribution given
        for that label and independently of the other labels. Negative classes therefore have to be listed explicitly.
        In addition, every receptor gets a "subject" entry in its metadata.

        An example of input parameters is given below:

        receptor_count: 100 # generate 100 receptors
        chain_1_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 1) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 1) will have length 15
        chain_2_length_probabilities:
            14: 0.8 # 80% of all generated sequences for all receptors (for chain 2) will have length 14
            15: 0.2 # 20% of all generated sequences across all receptors (for chain 2) will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # 50% of the receptors will have class True
                False: 0.5 # 50% of the receptors will have class False
            epitope2: # next label with classes that will be assigned to receptors independently of the previous label or other parameters
                1: 0.3 # 30% of the generated receptors will have class 1
                0: 0.7 # 70% of the generated receptors will have class 0
        """
        RandomDatasetGenerator._check_receptor_dataset_generation_params(receptor_count, chain_1_length_probabilities,
                                                                         chain_2_length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        def make_random_chain(length_probabilities, chain, cell_id):
            # the length is drawn before the residues, so the sequence of random draws stays the same
            length = random.choices(list(length_probabilities.keys()), length_probabilities.values())[0]
            sequence = "".join(random.choices(alphabet, k=length))
            chain_metadata = SequenceMetadata(count=1,
                                              v_subgroup=chain + "V1",
                                              v_gene=chain + "V1-1",
                                              v_allele=chain + "V1-1*01",
                                              j_subgroup=chain + "J1",
                                              j_gene=chain + "J1-1",
                                              j_allele=chain + "J1-1*01",
                                              chain=chain,
                                              cell_id=cell_id)
            return ReceptorSequence(sequence, metadata=chain_metadata)

        receptors = []
        for i in range(receptor_count):
            alpha = make_random_chain(chain_1_length_probabilities, "TRA", i)
            beta = make_random_chain(chain_2_length_probabilities, "TRB", i)

            receptor_metadata = {label: random.choices(list(classes.keys()), classes.values(), k=1)[0]
                                 for label, classes in labels.items()}
            receptor_metadata["subject"] = f"subj_{i + 1}"

            receptors.append(TCABReceptor(alpha=alpha, beta=beta, metadata=receptor_metadata))

        directory = path if path[-1] == '/' else path + '/'
        filename = f"{directory}batch01.pickle"

        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        dataset_params = {label: list(classes) for label, classes in labels.items()}

        return ReceptorDataset(params=dataset_params, filenames=[filename], file_size=receptor_count)
Beispiel #8
0
 def _build_labels(self, dataset: ReceptorDataset,
                   params: EncoderParams) -> dict:
     """
     Collects the label values of all receptors in the dataset.

     Returns a dict with one key per label name reported by params.label_config.get_labels_by_name();
     each value is the list of receptor.metadata[label] entries, in the order of dataset.get_data().
     """
     label_names = params.label_config.get_labels_by_name()
     labels = {name: [] for name in label_names}

     for receptor in dataset.get_data():
         for name, values in labels.items():
             values.append(receptor.metadata[name])

     return labels
Beispiel #9
0
    def create_dummy_receptordataset(self, path):
        """
        Builds a ReceptorDataset with two TCABReceptor objects (identifiers "1" and "2") for testing.

        Every chain has V and J genes set, frame type "IN" and two custom parameters: "d_call" and one
        receptor-specific key ("custom1" for receptor "1", "custom2" for receptor "2"), both chains of a
        receptor sharing the same custom key. The dataset is built with file size 2 at "<path>receptors".
        """
        gene_calls = {
            "alpha": {"chain": Chain.ALPHA, "v_gene": "TRAV1", "j_gene": "TRAJ1", "d_call": "TRAD1"},
            "beta": {"chain": Chain.BETA, "v_gene": "TRBV1", "j_gene": "TRBJ1", "d_call": "TRBD1"}
        }

        def make_chain(chain_key, sequence, identifier, custom_key):
            calls = gene_calls[chain_key]
            chain_metadata = SequenceMetadata(v_gene=calls["v_gene"],
                                              j_gene=calls["j_gene"],
                                              chain=calls["chain"],
                                              frame_type="IN",
                                              custom_params={
                                                  "d_call": calls["d_call"],
                                                  custom_key: "cust1"
                                              })
            return ReceptorSequence(amino_acid_sequence=sequence,
                                    identifier=identifier,
                                    metadata=chain_metadata)

        # (receptor identifier, alpha sequence, beta sequence, name of the custom parameter)
        receptor_specs = [
            ("1", "AAATTT", "ATATAT", "custom1"),
            ("2", "AAAAAA", "AAAAAA", "custom2")
        ]

        receptors = []
        for receptor_id, alpha_seq, beta_seq, custom_key in receptor_specs:
            receptors.append(
                TCABReceptor(identifier=receptor_id,
                             alpha=make_chain("alpha", alpha_seq, receptor_id + "a", custom_key),
                             beta=make_chain("beta", beta_seq, receptor_id + "b", custom_key)))

        return ReceptorDataset.build(receptors, 2, f"{path}receptors")
Beispiel #10
0
    def load_sequence_dataset(params: dict, dataset_name: str) -> Dataset:
        """
        Imports the items from all sequence files of a dataset, stores them in pickled batches of at most
        sequence_file_size items under result_path and returns the resulting dataset.

        Items that do not fill a whole batch after a file has been processed are carried over to the next
        file, so no imported item is lost between files; after the last file everything that remains is
        written out.

        Arguments:
            params: keyword arguments used to build the IRISImportParams object
            dataset_name: name of the dataset, also passed on when looking up the sequence files

        Returns:
            a ReceptorDataset if the import is paired, a SequenceDataset otherwise
        """
        iris_params = IRISImportParams.build_object(**params)

        filenames = ImportHelper.get_sequence_filenames(iris_params.path, dataset_name)
        file_index = 0
        dataset_filenames = []
        leftover = []

        for index, filename in enumerate(filenames):
            items = IRISSequenceImport.import_items(filename, paired=iris_params.paired,
                                                    all_dual_chains=iris_params.import_dual_chains,
                                                    all_genes=iris_params.import_all_gene_combinations)

            # Items left from the previous file were not yet written; prepend them so they are not dropped.
            # NOTE(review): assumes the imported items can be handled as a plain list - confirm with import_items.
            if len(leftover) > 0:
                items = list(leftover) + list(items)

            is_last_file = index == len(filenames) - 1
            while len(items) > iris_params.sequence_file_size or (is_last_file and len(items) > 0):
                dataset_filenames.append(iris_params.result_path + "batch_{}.pickle".format(file_index))
                ImportHelper.store_sequence_items(dataset_filenames, items, iris_params.sequence_file_size)
                items = items[iris_params.sequence_file_size:]
                file_index += 1

            leftover = items

        dataset_class = ReceptorDataset if iris_params.paired else SequenceDataset

        return dataset_class(filenames=dataset_filenames, file_size=iris_params.sequence_file_size, name=dataset_name)
Beispiel #11
0
    def construct_test_flatten_dataset(self, path):
        """
        Builds a ReceptorDataset with three TCABReceptor objects, each with a value for label "l1" in its metadata.

        The second and the third receptor are identical (same identifiers, sequences and label value).
        The dataset is built with file size 10 at "<path>receptors.pkl".
        """
        # (receptor identifier, alpha sequence, beta sequence, value of label "l1")
        receptor_specs = [
            ("1", "AAATTT", "ATATAT", 1),
            ("2", "AAAAAA", "AAAAAA", 2),
            ("2", "AAAAAA", "AAAAAA", 2)
        ]

        receptors = []
        for receptor_id, alpha_seq, beta_seq, label_value in receptor_specs:
            alpha = ReceptorSequence(amino_acid_sequence=alpha_seq,
                                     identifier=receptor_id + "a")
            beta = ReceptorSequence(amino_acid_sequence=beta_seq,
                                    identifier=receptor_id + "b")
            receptors.append(TCABReceptor(alpha=alpha,
                                          beta=beta,
                                          metadata={"l1": label_value},
                                          identifier=receptor_id))

        return ReceptorDataset.build(receptors, 10, f"{path}receptors.pkl")
Beispiel #12
0
    def _construct_test_dataset(self, path, dataset_size: int = 50):
        """
        Pickles two TCABReceptor objects to "<path>receptors.pkl" and returns a tuple of the ReceptorDataset
        pointing to that file and a LabelConfiguration with label "l1" and classes [1, 2].

        Note: dataset_size is accepted but not used; the dataset always consists of the same two receptors.
        """
        # (receptor identifier, alpha sequence, beta sequence, value of label "l1")
        receptor_specs = [
            ("1", "AAAA", "ATA", 1),
            ("2", "ATA", "ATT", 2)
        ]
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_seq),
                         beta=ReceptorSequence(amino_acid_sequence=beta_seq),
                         metadata={"l1": label_value},
                         identifier=receptor_id)
            for receptor_id, alpha_seq, beta_seq, label_value in receptor_specs
        ]

        PathBuilder.build(path)
        filename = f"{path}receptors.pkl"
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 2])

        dataset = ReceptorDataset(params={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")

        return dataset, label_config
    def _import_from_files(
            filenames: List[str],
            generic_params: DatasetImportParams) -> ReceptorDataset:
        """
        Reads receptors from the given files (one receptor per line) and builds a ReceptorDataset from them.

        For every file, the columns listed in generic_params.columns_to_load are read, rows with missing values
        and duplicated rows are removed, and the columns are renamed according to generic_params.column_mapping.
        The first and last character are cut from the alpha/beta amino acid sequences and the first and last
        three characters from the alpha/beta nucleotide sequences, for whichever of these columns are present.

        For every remaining row one receptor is built with one ReceptorSequence per chain in
        generic_params.receptor_chains. All columns whose name contains neither "v_gene", "j_gene", "count",
        "identifier" nor one of the chain names are passed on as receptor metadata.

        Arguments:
            filenames: paths of the files to import
            generic_params: import parameters (separator, columns, column mapping, chains, region type,
                sequence file size and result path are read from it)

        Returns:
            the dataset built by ReceptorDataset.build from all imported receptors
        """
        elements = []

        chain_vals = list(generic_params.receptor_chains.value)
        chain_names = [Chain.get_chain(ch).name.lower() for ch in chain_vals]

        # columns containing any of these substrings are not treated as receptor-level metadata
        non_metadata_markers = ["v_gene", 'j_gene', "count", "identifier"] + chain_names

        # (column name, number of characters removed from each end of the sequence)
        trimmed_columns = [("alpha_amino_acid_sequence", 1), ("beta_amino_acid_sequence", 1),
                           ("alpha_nucleotide_sequence", 3), ("beta_nucleotide_sequence", 3)]

        for file in filenames:
            df = pd.read_csv(file,
                             sep=generic_params.separator,
                             usecols=generic_params.columns_to_load)
            # dropna() and drop_duplicates() return new data frames, so the result has to be kept for
            # the cleaning to have any effect
            df = df.dropna().drop_duplicates().rename(columns=generic_params.column_mapping)

            for column, trim in trimmed_columns:
                if column in df:
                    df[column] = df[column].str[trim:-trim]

            for chain_name in chain_names:
                df = SingleLineReceptorImport.make_gene_columns(
                    df, ["v", "j"], chain_name)

            for _, row in df.iterrows():
                sequences = {}
                for chain_val, chain_name in zip(chain_vals, chain_names):
                    aa_column = chain_name + "_amino_acid_sequence"
                    nt_column = chain_name + "_nucleotide_sequence"

                    sequences[chain_val] = ReceptorSequence(
                        amino_acid_sequence=row[aa_column] if aa_column in row else None,
                        nucleotide_sequence=row[nt_column] if nt_column in row else None,
                        metadata=SequenceMetadata(
                            v_gene=row[f"{chain_name}_v_gene"],
                            v_allele=row[f"{chain_name}_v_allele"],
                            v_subgroup=row[f'{chain_name}_v_subgroup'],
                            j_gene=row[f"{chain_name}_j_gene"],
                            j_allele=row[f"{chain_name}_j_allele"],
                            j_subgroup=row[f'{chain_name}_j_subgroup'],
                            chain=chain_name,
                            count=row["count"],
                            region_type=generic_params.region_type.value))

                receptor_metadata = {
                    key: row[key]
                    for key in row.keys()
                    if not any(marker in key for marker in non_metadata_markers)
                }

                elements.append(
                    ReceptorBuilder.build_object(sequences, row["identifier"], receptor_metadata))

        return ReceptorDataset.build(elements,
                                     generic_params.sequence_file_size,
                                     generic_params.result_path)