Exemple #1
0
    def get_repertoire_contents(repertoire, compairr_params):
        """
        Collect the sequence, count and (optionally) gene attributes of a repertoire in a DataFrame.

        Rows with missing values in the columns that matter for the given parameters are dropped
        (with a warning stating how many were removed). If counts are ignored, every count is set
        to 1. The columns are finally renamed: the sequence column becomes "junction_aa" (regardless
        of the configured sequence type), "v_genes"/"j_genes" become "v_call"/"j_call", "counts"
        becomes "duplicate_count" and "identifier" becomes "repertoire_id".

        :param repertoire: object exposing ``get_attributes(list)`` and ``identifier``
        :param compairr_params: object exposing the ``ignore_genes`` and ``ignore_counts`` flags
        :return: the resulting pandas DataFrame
        """
        sequence_column = EnvironmentSettings.get_sequence_type().value
        gene_columns = [] if compairr_params.ignore_genes else ["v_genes", "j_genes"]

        # counts are always fetched, even if they are overwritten below
        fetched = repertoire.get_attributes([sequence_column, "counts"] + gene_columns)
        contents = pd.DataFrame({**fetched, "identifier": repertoire.identifier})

        # only columns that are actually used may cause a row to be discarded
        required_columns = [sequence_column]
        if not compairr_params.ignore_counts:
            required_columns.append("counts")
        required_columns.extend(gene_columns)

        n_rows_before = len(contents)
        contents.dropna(inplace=True, subset=required_columns)
        n_removed = n_rows_before - len(contents)

        if n_removed > 0:
            warnings.warn(
                f"CompAIRRHelper: removed {n_removed} entries from repertoire {repertoire.identifier} due to missing values.")

        if compairr_params.ignore_counts:
            contents["counts"] = 1

        new_column_names = {
            sequence_column: "junction_aa",
            "v_genes": "v_call",
            "j_genes": "j_call",
            "counts": "duplicate_count",
            "identifier": "repertoire_id",
        }
        contents.rename(columns=new_column_names, inplace=True)

        return contents
Exemple #2
0
    def drop_illegal_character_sequences(
            dataframe: pd.DataFrame,
            import_illegal_characters: bool) -> pd.DataFrame:
        """
        Remove rows whose sequence contains characters outside the legal alphabet.

        Nothing is done when ``import_illegal_characters`` is True. Otherwise the alphabet for the
        configured sequence type is used (extended with the stop codon for amino acid sequences),
        offending rows are dropped from ``dataframe`` in place and a warning reports how many
        sequences were removed.

        :param dataframe: data with a column named after the configured sequence type's value
        :param import_illegal_characters: if True, the dataframe is returned untouched
        :return: the same dataframe object that was passed in
        """
        if import_illegal_characters:
            return dataframe

        sequence_type = EnvironmentSettings.get_sequence_type()
        sequence_name = sequence_type.name.lower().replace("_", " ")

        # Copy before extending: appending directly to the returned list would leak the stop codon
        # into other users of the alphabet if EnvironmentSettings hands out a shared list.
        legal_alphabet = list(EnvironmentSettings.get_sequence_alphabet(sequence_type))
        if sequence_type == SequenceType.AMINO_ACID:
            legal_alphabet.append(Constants.STOP_CODON)

        is_illegal_seq = [
            ImportHelper.is_illegal_sequence(sequence, legal_alphabet)
            for sequence in dataframe[sequence_type.value]
        ]
        n_illegal = sum(is_illegal_seq)

        if n_illegal > 0:
            dataframe.drop(dataframe.loc[is_illegal_seq].index,
                           inplace=True)
            warnings.warn(
                f"{ImportHelper.__name__}: {n_illegal} sequences were removed from the dataset because their {sequence_name} sequence contained illegal characters. "
            )
        return dataframe
    def _build_new_sequence(self, sequence: ReceptorSequence, position, signal: dict) -> ReceptorSequence:
        """
        Create a new ReceptorSequence with the signal's motif instance written in at ``position``.

        The motif instance may contain a "/" separating a left and a right part; the characters of
        the original sequence that fall into the gap between the two parts are preserved. The
        resulting sequence has the same length as the original one when the motif fits within it.

        :param sequence: the sequence to implant into (it is not modified)
        :param position: index in the original sequence where the motif starts
        :param signal: dict with keys "motif_instance", "signal_id" and "motif_id"
        :return: a new ReceptorSequence carrying an implant annotation and a deep copy of the metadata
        """
        motif_instance = signal["motif_instance"]

        if "/" in motif_instance.instance:
            left_part, right_part = motif_instance.instance.split("/")
        else:
            left_part, right_part = motif_instance.instance, ""

        # boundaries of the gap between the left and right motif parts
        gap_start = position + len(left_part)
        gap_end = gap_start + motif_instance.gap

        original = sequence.get_sequence()
        new_sequence_string = "".join([
            original[:position],                  # untouched prefix
            left_part,
            original[gap_start:gap_end],          # original characters kept inside the gap
            right_part,
            original[gap_end + len(right_part):]  # untouched suffix
        ])

        annotation = SequenceAnnotation()
        annotation.add_implant(ImplantAnnotation(signal_id=signal["signal_id"],
                                                 motif_id=signal["motif_id"],
                                                 motif_instance=signal["motif_instance"],
                                                 position=position))

        implanted = ReceptorSequence()
        implanted.set_annotation(annotation)
        implanted.set_metadata(copy.deepcopy(sequence.metadata))
        implanted.set_sequence(new_sequence_string, EnvironmentSettings.get_sequence_type())

        return implanted
    def get_relevant_sequence_attributes(self):
        """
        Return the attribute names to use: the configured sequence type's value,
        followed by "v_genes" and "j_genes" unless genes are ignored.
        """
        sequence_attribute = EnvironmentSettings.get_sequence_type().value

        if self.compairr_params.ignore_genes:
            return [sequence_attribute]

        return [sequence_attribute, "v_genes", "j_genes"]
    def __init__(self,
                 use_positional_info: bool,
                 distance_to_seq_middle: int,
                 flatten: bool,
                 name: str = None):
        """
        Store the encoder settings and derive the positional weights and one-hot dimensions.

        :param use_positional_info: if True, "start", "mid" and "end" are appended to the one-hot dimensions
        :param distance_to_seq_middle: number of positional steps; when falsy (None or 0) no
            positional weights are computed and both ``pos_increasing`` and ``pos_decreasing`` are None
        :param flatten: stored as-is on the instance
        :param name: optional name, stored as-is on the instance
        """
        self.use_positional_info = use_positional_info
        self.distance_to_seq_middle = distance_to_seq_middle
        self.flatten = flatten
        self.name = name

        if distance_to_seq_middle:
            # evenly spaced weights 0, 1/d, ..., (d-1)/d and the same values in reverse
            self.pos_increasing = [
                1 / self.distance_to_seq_middle * i
                for i in range(self.distance_to_seq_middle)
            ]
            self.pos_decreasing = self.pos_increasing[::-1]
        else:
            # always define both attributes so later reads cannot raise AttributeError
            self.pos_increasing = None
            self.pos_decreasing = None

        # todo check this / explain in docs
        # NOTE(review): the weights above are computed from the unscaled distance; only the stored
        # distance is tripled for nucleotides (presumably three nucleotides per amino acid - confirm).
        if (self.distance_to_seq_middle is not None
                and EnvironmentSettings.get_sequence_type() == SequenceType.NUCLEOTIDE):
            self.distance_to_seq_middle = self.distance_to_seq_middle * 3

        # todo test this
        if self.use_positional_info:
            self.onehot_dimensions = self.ALPHABET + ["start", "mid", "end"]
        else:
            self.onehot_dimensions = self.ALPHABET
Exemple #6
0
    def _encode_repertoire(self, repertoire, params: EncoderParams):
        """
        One-hot encode the sequences of a single repertoire.

        The sequences are padded to ``self.max_rep_len`` sequences and ``self.max_seq_len`` positions.

        :param repertoire: object exposing ``get_attribute(name)`` and ``identifier``
        :param params: encoder parameters; labels are only collected if ``params.encode_labels`` is set
        :return: tuple (encoded sequences, repertoire identifier, labels or None)
        """
        sequence_attribute = EnvironmentSettings.get_sequence_type().value
        sequences = repertoire.get_attribute(sequence_attribute)

        onehot_encoded = self._encode_sequence_list(sequences,
                                                    pad_n_sequences=self.max_rep_len,
                                                    pad_sequence_len=self.max_seq_len)
        example_id = repertoire.identifier

        if params.encode_labels:
            labels = self._get_repertoire_labels(repertoire, params)
        else:
            labels = None

        return onehot_encoded, example_id, labels
    def get_sequence(self, sequence_type: SequenceType = None):
        """Return the amino acid sequence if the requested type is SequenceType.AMINO_ACID, otherwise the
        nucleotide sequence; when no type is given, the type preset in EnvironmentSettings is used"""

        if sequence_type is None:
            requested_type = EnvironmentSettings.get_sequence_type()
        else:
            requested_type = sequence_type

        if requested_type == SequenceType.AMINO_ACID:
            return self.amino_acid_sequence
        return self.nucleotide_sequence
 def get_sequence(self):
     """
     :return: the amino acid sequence if the sequence type preset in EnvironmentSettings
     is SequenceType.AMINO_ACID, the nucleotide sequence otherwise
     """
     uses_amino_acids = EnvironmentSettings.get_sequence_type() == SequenceType.AMINO_ACID
     return self.amino_acid_sequence if uses_amino_acids else self.nucleotide_sequence
    def write_sequence_set_file(self, sequence_set, filename, offset=0):
        """
        Write a set of sequences to a tab-separated file.

        The header consists of the sequence column ("junction_aa" for amino acids, "junction"
        otherwise), the v_call/j_call columns unless genes are ignored, "duplicate_count" and
        "repertoire_id". Each entry is written on its own line with a duplicate count of 1 and a
        running number (starting at ``offset``) as its repertoire id.

        :param sequence_set: iterable of string sequences (e.g. tuples) whose items are tab-joined
        :param filename: path of the file to (over)write
        :param offset: first repertoire id to assign
        """
        if EnvironmentSettings.get_sequence_type() == SequenceType.AMINO_ACID:
            sequence_col = "junction_aa"
        else:
            sequence_col = "junction"

        if self.compairr_params.ignore_genes:
            vj_header = ""
        else:
            vj_header = "\tv_call\tj_call"

        with open(filename, "w") as file:
            file.write(
                f"{sequence_col}{vj_header}\tduplicate_count\trepertoire_id\n")

            file.writelines(
                "\t".join(sequence_info) + f"\t1\t{running_id}\n"
                for running_id, sequence_info in enumerate(sequence_set, start=offset)
            )
Exemple #10
0
    def _set_max_dims(self, dataset):
        """
        Determine and store the largest repertoire size and the longest sequence in the dataset.

        Sets ``self.max_rep_len`` to the maximum number of sequences in any repertoire and
        ``self.max_seq_len`` to the maximum sequence length across all repertoires. Both are 0
        for a dataset without repertoires; an empty repertoire contributes 0 instead of raising.

        :param dataset: object exposing an iterable ``repertoires``, each with ``get_attribute(name)``
        """
        sequence_attribute = EnvironmentSettings.get_sequence_type().value

        max_rep_len = 0
        max_seq_len = 0

        for repertoire in dataset.repertoires:
            sequences = repertoire.get_attribute(sequence_attribute)
            max_rep_len = max(len(sequences), max_rep_len)
            # default=0 keeps max() from raising ValueError on a repertoire with no sequences
            longest_in_repertoire = max((len(seq) for seq in sequences), default=0)
            max_seq_len = max(longest_in_repertoire, max_seq_len)

        self.max_rep_len = max_rep_len
        self.max_seq_len = max_seq_len