Beispiel #1
0
    def _build_new_sequence(self, sequence: ReceptorSequence, position, signal: dict) -> ReceptorSequence:
        """
        Build a new sequence in which the signal's motif instance is implanted at `position`.

        The motif instance string may contain one "/" that separates a left and a right part.
        The left part is written starting at `position`, the following `gap` characters of the
        original sequence are kept, and the right part is written after them.

        :param sequence: sequence to implant into; it is only read here, not modified
        :param position: index in the sequence string where the left motif part starts
        :param signal: dict providing the keys "signal_id", "motif_id" and "motif_instance"
        :return: a new ReceptorSequence holding the implanted string, a deep copy of the
            original metadata and an annotation that records the implant
        """
        motif_instance = signal["motif_instance"]
        gap_length = motif_instance.gap
        instance_string = motif_instance.instance

        if "/" not in instance_string:
            left_part, right_part = instance_string, ""
        else:
            # two-way unpacking: an instance with more than one "/" raises ValueError
            left_part, right_part = instance_string.split("/")

        original = sequence.get_sequence()
        gap_start = position + len(left_part)
        gap_end = gap_start + gap_length

        prefix = original[:position]
        gap_content = original[gap_start:gap_end]
        suffix = original[gap_end + len(right_part):]
        implanted = "".join((prefix, left_part, gap_content, right_part, suffix))

        annotation = SequenceAnnotation()
        annotation.add_implant(ImplantAnnotation(signal_id=signal["signal_id"],
                                                 motif_id=signal["motif_id"],
                                                 motif_instance=signal["motif_instance"],
                                                 position=position))

        result = ReceptorSequence()
        result.set_annotation(annotation)
        result.set_metadata(copy.deepcopy(sequence.metadata))
        result.set_sequence(implanted, EnvironmentSettings.get_sequence_type())
        return result
Beispiel #2
0
 def get_sequence(self):
     """
     Return the sequence matching the sequence type currently set in EnvironmentSettings.

     :return: the amino acid sequence if the configured type is SequenceType.AMINO_ACID,
     the nucleotide sequence otherwise
     """
     use_amino_acids = EnvironmentSettings.get_sequence_type() == SequenceType.AMINO_ACID
     return self.amino_acid_sequence if use_amino_acids else self.nucleotide_sequence
    def _set_max_dims(self, dataset):
        """
        Store the largest repertoire size and the longest sequence length found in the dataset.

        The results are written to `self.max_rep_len` (maximum number of sequences in any
        repertoire) and `self.max_seq_len` (maximum length of any single sequence). Both are 0
        when the dataset has no repertoires.

        :param dataset: object exposing an iterable `repertoires`; each repertoire must provide
            `get_attribute(name)` returning a sized collection of sequences
        """
        # the attribute name does not change between repertoires, so look it up once
        sequence_attribute = EnvironmentSettings.get_sequence_type().value

        max_rep_len = 0
        max_seq_len = 0

        for repertoire in dataset.repertoires:
            sequences = repertoire.get_attribute(sequence_attribute)
            max_rep_len = max(max_rep_len, len(sequences))
            # default=0 keeps a repertoire without sequences from raising ValueError
            longest_in_repertoire = max((len(seq) for seq in sequences), default=0)
            max_seq_len = max(max_seq_len, longest_in_repertoire)

        self.max_rep_len = max_rep_len
        self.max_seq_len = max_seq_len
    def _encode_repertoire(self, repertoire, params: EncoderParams):
        """
        Encode the sequences of one repertoire, padded to the stored maximum dimensions.

        :param repertoire: repertoire providing `get_attribute(name)` and `identifier`
        :param params: encoder parameters; labels are collected only if `params.encode_labels`
            is truthy
        :return: tuple (encoded sequences, repertoire identifier, labels or None)
        """
        sequence_attribute = EnvironmentSettings.get_sequence_type().value
        repertoire_sequences = repertoire.get_attribute(sequence_attribute)

        # pad to the dimensions held in self.max_rep_len / self.max_seq_len
        encoded = self._encode_sequence_list(repertoire_sequences,
                                             pad_n_sequences=self.max_rep_len,
                                             pad_sequence_len=self.max_seq_len)
        identifier = repertoire.identifier

        if params.encode_labels:
            repertoire_labels = self._get_repertoire_labels(repertoire, params)
        else:
            repertoire_labels = None

        return encoded, identifier, repertoire_labels
Beispiel #5
0
    def drop_illegal_character_sequences(dataframe: pd.DataFrame, import_illegal_characters: bool) -> pd.DataFrame:
        """
        Remove rows whose sequence contains characters outside the legal alphabet.

        The sequence type and its alphabet are taken from EnvironmentSettings; for amino acid
        sequences the stop codon symbol is additionally accepted. Rows are dropped from the
        given dataframe in place and a warning reporting the number of removed sequences is
        issued when at least one row was dropped.

        :param dataframe: dataframe with a column named after the current sequence type's value
        :param import_illegal_characters: if truthy, nothing is checked and the dataframe is
            returned unchanged
        :return: the same dataframe object that was passed in
        """
        if import_illegal_characters:
            return dataframe

        sequence_type = EnvironmentSettings.get_sequence_type()

        # Copy before extending so the list handed out by EnvironmentSettings is never modified
        # (NOTE(review): whether that list is shared between calls is not visible here).
        legal_alphabet = list(EnvironmentSettings.get_sequence_alphabet(sequence_type))
        if sequence_type == SequenceType.AMINO_ACID:
            legal_alphabet.append(Constants.STOP_CODON)

        is_illegal_seq = [ImportHelper.is_illegal_sequence(sequence, legal_alphabet)
                          for sequence in dataframe[sequence_type.value]]
        n_illegal = sum(is_illegal_seq)

        if n_illegal > 0:
            # the readable name is only needed for the warning text
            sequence_name = sequence_type.name.lower().replace("_", " ")
            dataframe.drop(dataframe.loc[is_illegal_seq].index, inplace=True)
            warnings.warn(
                f"{ImportHelper.__name__}: {n_illegal} sequences were removed from the dataset because their {sequence_name} sequence contained illegal characters. ")
        return dataframe