def _encode_examples(self, dataset: ReceptorDataset,
                         params: EncoderParams):
        """Encode each receptor in the dataset as chain-labelled feature counts.

        For every receptor, each of its chains is passed through
        ``self._encode_sequence`` (starting from an empty ``Counter``), the
        per-chain result is relabelled with ``self._add_chain_to_name`` and the
        results of the first two chains are added together.

        The chain list used for a receptor is always that receptor's own
        ``get_chains()`` result. (Previously the chain list of the *last*
        receptor in the dataset was reused for all receptors after the loop,
        which broke as soon as receptors did not share the same chain names.)

        Args:
            dataset: source of receptors; read in via
                ``dataset.get_data(params.pool_size)``.
            params: encoder parameters; ``label_config``, ``encode_labels``
                and ``pool_size`` are read here, and the object is forwarded
                to the sequence encoder helpers.

        Returns:
            A tuple ``(encoded_receptors, receptor_ids, labels, feature_names)``
            where ``encoded_receptors`` and ``receptor_ids`` have one entry per
            receptor, ``labels`` maps each label name to a list of per-receptor
            values (or is ``None`` when ``params.encode_labels`` is falsy), and
            ``feature_names`` is whatever the sequence encoder reports.
        """
        label_config = params.label_config
        if params.encode_labels:
            labels = {label: [] for label in label_config.get_labels_by_name()}
        else:
            labels = None

        encoded_receptors, receptor_ids = [], []

        sequence_encoder = self._prepare_sequence_encoder()
        feature_names = sequence_encoder.get_feature_names(params)

        for receptor in dataset.get_data(params.pool_size):
            # Query the chain names once and use them consistently for this
            # receptor only.
            chains = receptor.get_chains()

            counts = {chain: Counter() for chain in chains}
            for chain in chains:
                counts[chain] = self._encode_sequence(
                    receptor.get_chain(chain), params, sequence_encoder,
                    counts[chain])

            named_counts = [
                self._add_chain_to_name(counts[chain], chain)
                for chain in chains
            ]
            # Only the first two chains are combined, as in the original code;
            # presumably receptors are always paired (two chains) -- confirm.
            encoded_receptors.append(named_counts[0] + named_counts[1])
            receptor_ids.append(receptor.identifier)

            if params.encode_labels:
                for label_name in label_config.get_labels_by_name():
                    labels[label_name].append(receptor.metadata[label_name])

        return encoded_receptors, receptor_ids, labels, feature_names
    def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
        """Encode both chains of every receptor and wrap them in EncodedData.

        The sequence of type ``self.sequence_type`` is read from each chain of
        each receptor, both chains are encoded with
        ``self._encode_sequence_list`` using a common padded length, and the
        two results are stacked along axis 1. When ``self.flatten`` is set the
        examples are reshaped to
        ``(n_receptors, 2 * max_seq_len * len(self.onehot_dimensions))`` and
        the nested feature names are flattened into a single list.

        Args:
            dataset: receptor dataset providing ``get_data()`` and
                ``get_example_ids()``.
            params: encoder parameters; labels are collected only when
                ``params.encode_labels`` is truthy.

        Returns:
            An ``EncodedData`` object whose ``info["chain_names"]`` holds the
            chain names if all receptors report the same ones, else ``None``.

        Raises:
            ValueError: if any chain sequence of the requested type is None.
        """
        receptors = list(dataset.get_data())
        n_receptors = len(receptors)

        per_receptor_seqs = []
        for receptor in receptors:
            per_receptor_seqs.append([
                getattr(receptor, chain_name).get_sequence(self.sequence_type)
                for chain_name in receptor.get_chains()
            ])
        # Transpose receptor-major lists into one tuple per chain; the
        # unpacking expects exactly two resulting tuples.
        first_chain_seqs, second_chain_seqs = zip(*per_receptor_seqs)
        both_chains = (first_chain_seqs, second_chain_seqs)

        for chain_seqs in both_chains:
            if any(seq is None for seq in chain_seqs):
                raise ValueError(
                    f"{OneHotEncoder.__name__}: receptor dataset {dataset.name} (id: {dataset.identifier}) contains empty sequences for the "
                    f"specified sequence type {self.sequence_type.name.lower()}. Please check that the dataset is imported correctly."
                )

        # Both chains are padded to the longest sequence found in either one.
        max_seq_len = max(
            len(seq) for chain_seqs in both_chains for seq in chain_seqs)

        example_ids = dataset.get_example_ids()
        if params.encode_labels:
            labels = self._get_labels(receptors, params)
        else:
            labels = None

        encoded_chains = tuple(
            self._encode_sequence_list(chain_seqs,
                                       pad_n_sequences=n_receptors,
                                       pad_sequence_len=max_seq_len)
            for chain_seqs in both_chains)
        examples = np.stack(encoded_chains, axis=1)

        feature_names = self._get_feature_names(max_seq_len,
                                                receptors[0].get_chains())

        if self.flatten:
            flat_width = 2 * max_seq_len * len(self.onehot_dimensions)
            examples = examples.reshape((n_receptors, flat_width))

            # Collapse the two outer nesting levels of the feature names.
            flat_names = []
            for outer in feature_names:
                for inner in outer:
                    flat_names.extend(inner)
            feature_names = flat_names

        # Chain names are reported only when every receptor agrees on them.
        if all(receptor.get_chains() == receptors[0].get_chains()
               for receptor in receptors):
            chain_names = receptors[0].get_chains()
        else:
            chain_names = None

        return EncodedData(examples=examples,
                           labels=labels,
                           example_ids=example_ids,
                           feature_names=feature_names,
                           encoding=OneHotEncoder.__name__,
                           info={"chain_names": chain_names})
Beispiel #3
0
 def _build_labels(self, dataset: ReceptorDataset,
                   params: EncoderParams) -> dict:
     """Collect the label values of every receptor, grouped by label name.

     Args:
         dataset: receptor dataset; iterated via ``dataset.get_data()``.
         params: encoder parameters; the label names come from
             ``params.label_config.get_labels_by_name()``.

     Returns:
         A dict mapping each label name to a list holding, in iteration
         order, the value found under that name in each receptor's
         ``metadata``.
     """
     label_names = params.label_config.get_labels_by_name()
     collected = {name: [] for name in label_names}

     for receptor in dataset.get_data():
         for name, values in collected.items():
             values.append(receptor.metadata[name])

     return collected
    def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
        """Encode both chains of every receptor and wrap them in EncodedData.

        Each chain's ``get_sequence()`` result is collected per receptor, both
        chains are encoded with ``self._encode_sequence_list`` using a shared
        padded length, and the two encodings are stacked along axis 1. With
        ``self.flatten`` set, the examples are reshaped to
        ``(n_receptors, 2 * max_seq_len * len(self.onehot_dimensions))`` and
        the nested feature names are flattened into one list.

        Args:
            dataset: receptor dataset providing ``get_data()`` and
                ``get_example_ids()``.
            params: encoder parameters; labels are collected only when
                ``params.encode_labels`` is truthy.

        Returns:
            An ``EncodedData`` object whose ``info["chain_names"]`` holds the
            chain names if all receptors report the same ones, else ``None``.
        """
        receptors = list(dataset.get_data())
        n_receptors = len(receptors)

        per_receptor_seqs = []
        for receptor in receptors:
            per_receptor_seqs.append([
                getattr(receptor, chain_name).get_sequence()
                for chain_name in receptor.get_chains()
            ])
        # Transpose receptor-major lists into one tuple per chain; the
        # unpacking expects exactly two resulting tuples.
        first_chain_seqs, second_chain_seqs = zip(*per_receptor_seqs)
        both_chains = (first_chain_seqs, second_chain_seqs)

        # Both chains are padded to the longest sequence found in either one.
        max_seq_len = max(
            len(seq) for chain_seqs in both_chains for seq in chain_seqs)

        example_ids = dataset.get_example_ids()
        if params.encode_labels:
            labels = self._get_labels(receptors, params)
        else:
            labels = None

        encoded_chains = tuple(
            self._encode_sequence_list(chain_seqs,
                                       pad_n_sequences=n_receptors,
                                       pad_sequence_len=max_seq_len)
            for chain_seqs in both_chains)
        examples = np.stack(encoded_chains, axis=1)

        feature_names = self._get_feature_names(max_seq_len,
                                                receptors[0].get_chains())

        if self.flatten:
            flat_width = 2 * max_seq_len * len(self.onehot_dimensions)
            examples = examples.reshape((n_receptors, flat_width))

            # Collapse the two outer nesting levels of the feature names.
            flat_names = []
            for outer in feature_names:
                for inner in outer:
                    flat_names.extend(inner)
            feature_names = flat_names

        # Chain names are reported only when every receptor agrees on them.
        if all(receptor.get_chains() == receptors[0].get_chains()
               for receptor in receptors):
            chain_names = receptors[0].get_chains()
        else:
            chain_names = None

        return EncodedData(examples=examples,
                           labels=labels,
                           example_ids=example_ids,
                           feature_names=feature_names,
                           encoding=OneHotEncoder.__name__,
                           info={"chain_names": chain_names})
Beispiel #5
0
    def prepare_tcr_dist_dataframe(dataset: ReceptorDataset,
                                   label_names: list) -> pd.DataFrame:
        """Build a per-receptor DataFrame of alpha/beta chain data.

        One row is produced per receptor in ``dataset.get_data()`` with the
        columns ``subject``, ``epitope``, ``count``, ``v_a_gene``,
        ``j_a_gene``, ``cdr3_a_aa``, ``v_b_gene``, ``j_b_gene``,
        ``cdr3_b_aa`` and ``clone_id``. The columns ``cdr3_b_nucseq`` and
        ``cdr3_a_nucseq`` are appended only if no receptor has a ``None``
        nucleotide sequence on either chain.

        Args:
            dataset: receptor dataset whose receptors expose "alpha" and
                "beta" chains through ``get_chain``.
            label_names: list of label names; the first entry is used as the
                metadata key for the ``epitope`` column.

        Returns:
            A ``pd.DataFrame`` with the columns described above.

        Raises:
            NotImplementedError: if more than one label name is given.
        """
        if len(label_names) > 1:
            raise NotImplementedError(
                f"TCRdist: multiple labels specified ({str(label_names)[1:-1]}), but only single label binary class "
                f"is currently supported in immuneML.")
        label_name = label_names[0]

        columns = {
            column: []
            for column in ("subject", "epitope", "count", "v_a_gene",
                           "j_a_gene", "cdr3_a_aa", "v_b_gene", "j_b_gene",
                           "cdr3_b_aa", "clone_id")
        }
        # Kept separately because they are only emitted when complete; the
        # key order here is the column order used in the result.
        nucleotide_columns = {"cdr3_b_nucseq": [], "cdr3_a_nucseq": []}

        for receptor in dataset.get_data():
            if "subject" in receptor.metadata:
                subject = receptor.metadata["subject"]
            else:
                subject = "sub" + receptor.identifier
            columns["subject"].append(subject)
            columns["epitope"].append(receptor.metadata[label_name])

            alpha = receptor.get_chain("alpha")
            alpha_count = alpha.metadata.count
            beta = receptor.get_chain("beta")
            beta_count = beta.metadata.count

            # The chain count is used only when both chains agree on a
            # non-None value; otherwise the receptor counts once.
            if alpha_count == beta_count and beta_count is not None:
                columns["count"].append(alpha_count)
            else:
                columns["count"].append(1)

            columns["v_a_gene"].append(
                TCRdistHelper.add_default_allele_to_v_gene(
                    alpha.metadata.v_allele))
            columns["j_a_gene"].append(alpha.metadata.j_allele)
            columns["cdr3_a_aa"].append(alpha.amino_acid_sequence)
            nucleotide_columns["cdr3_a_nucseq"].append(
                alpha.nucleotide_sequence)

            columns["v_b_gene"].append(
                TCRdistHelper.add_default_allele_to_v_gene(
                    beta.metadata.v_allele))
            columns["j_b_gene"].append(beta.metadata.j_allele)
            columns["cdr3_b_aa"].append(beta.amino_acid_sequence)
            nucleotide_columns["cdr3_b_nucseq"].append(
                beta.nucleotide_sequence)

            columns["clone_id"].append(receptor.identifier)

        nucleotide_complete = (
            all(seq is not None
                for seq in nucleotide_columns["cdr3_a_nucseq"])
            and all(seq is not None
                    for seq in nucleotide_columns["cdr3_b_nucseq"]))
        if nucleotide_complete:
            columns.update(nucleotide_columns)

        return pd.DataFrame(columns)