Esempio n. 1
0
    def test_create_kmers_from_string(self):
        """Check that a string yields every overlapping 3-mer and that a too-short string yields none."""
        kmers = KmerHelper.create_kmers_from_string("ABCDEFG", 3)
        # one assertion per expected k-mer so that a failure names the missing one
        for expected_kmer in ("ABC", "BCD", "CDE", "DEF", "EFG"):
            self.assertIn(expected_kmer, kmers)
        self.assertEqual(5, len(kmers))

        # a string shorter than k must not produce any k-mer
        kmers = KmerHelper.create_kmers_from_string("AB", 3)
        self.assertEqual(0, len(kmers))
Esempio n. 2
0
    def test_create_kmers_from_sequence(self):
        """Check that a ReceptorSequence yields every overlapping 3-mer and that a too-short one yields none."""
        kmers = KmerHelper.create_kmers_from_sequence(ReceptorSequence(amino_acid_sequence="ABCDEFG"), 3, sequence_type=SequenceType.AMINO_ACID)
        # one assertion per expected k-mer so that a failure names the missing one
        for expected_kmer in ("ABC", "BCD", "CDE", "DEF", "EFG"):
            self.assertIn(expected_kmer, kmers)
        self.assertEqual(5, len(kmers))

        # a sequence shorter than k must not produce any k-mer
        kmers = KmerHelper.create_kmers_from_sequence(ReceptorSequence(amino_acid_sequence="AB"), 3, sequence_type=SequenceType.AMINO_ACID)
        self.assertEqual(0, len(kmers))
    def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: Path):
        """
        Build a Word2Vec model whose vocabulary is every possible k-mer, train it on the k-mer sentences of each
        repertoire in the dataset, and save it.

        Args:
            dataset: dataset whose ``get_data(batch_size=...)`` is iterated to obtain repertoires
            k: k-mer length used for both the vocabulary and the training sentences
            vector_size: passed to ``Word2Vec`` as ``size``
            batch_size: forwarded to ``dataset.get_data``
            model_path: location the model is saved to (converted with ``str``)

        Returns:
            the trained Word2Vec model
        """
        word2vec = Word2Vec(size=vector_size, min_count=1, window=5)  # empty model, vocabulary is added below

        # every possible k-mer over the alphabet, each wrapped as a one-word sentence for build_vocab
        alphabet = EnvironmentSettings.get_sequence_alphabet()
        vocabulary = [[kmer] for kmer in KmerHelper.create_all_kmers(k=k, alphabet=alphabet)]
        word2vec.build_vocab(vocabulary)
        vocabulary_size = len(vocabulary)

        # NOTE(review): total_words is the vocabulary size, not the number of words in the sentences - confirm intended
        for repertoire in dataset.get_data(batch_size=batch_size):
            repertoire_sentences = KmerHelper.create_sentences_from_repertoire(repertoire=repertoire, k=k)
            word2vec.train(sentences=repertoire_sentences, total_words=vocabulary_size, epochs=15)

        word2vec.save(str(model_path))
        return word2vec
Esempio n. 4
0
    def test_create_kmers_within_HD(self):
        """Check that "ACT" with alphabet "ACTEF" and distance 1 gives 15 entries, each sharing a letter with "ACT"."""
        kmers = KmerHelper.create_kmers_within_HD("ACT", list("ACTEF"), 1)
        self.assertEqual(15, len(kmers))

        # the length is pinned to 15 above, so direct iteration visits the same entries as indexing 0..14
        reference_letters = set("ACT")
        for entry in kmers:
            # the second element of each entry must have at least one character in common with "ACT"
            self.assertTrue(reference_letters.intersection(set(entry[1])))
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Creates gapped k-mers from a sequence: a k-mer of length k_left on the left side of the gap and a k-mer of
        length k_right on the right side, with the gap bounds min_gap and max_gap passed on to KmerHelper.

        :param sequence: ReceptorSequence
        :param params: EncoderParams; from params.model the keys "k_left", "k_right" (defaults to k_left),
                        "max_gap", "min_gap" (defaults to 0) and "sequence_type" (defaults to None) are read
        :return: the result of KmerHelper.create_gapped_kmers_from_sequence, or None (after emitting a warning)
                 when the sequence is shorter than k_left + k_right + max_gap
        """
        settings = params.model
        k_left = settings.get('k_left')
        k_right = settings.get('k_right', k_left)
        max_gap = settings.get('max_gap')
        min_gap = settings.get('min_gap', 0)
        sequence_type = settings.get('sequence_type', None)

        sequence_length = len(sequence.get_sequence(sequence_type))

        # long enough for the widest gap: delegate to the helper
        if sequence_length >= k_left + k_right + max_gap:
            return KmerHelper.create_gapped_kmers_from_sequence(sequence,
                                                                k_left=k_left,
                                                                max_gap=max_gap,
                                                                min_gap=min_gap,
                                                                k_right=k_right,
                                                                sequence_type=sequence_type)

        warnings.warn('Sequence length is less than k_left + k_right + max_gap. Ignoring sequence')
        return None
Esempio n. 6
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Creates gapped k-mers via KmerHelper.create_IMGT_gapped_kmers_from_sequence and flattens each returned
        entry into a single string by joining its parts with Constants.FEATURE_DELIMITER.

        :param sequence: ReceptorSequence
        :param params: EncoderParams; from params.model the keys "k_left", "k_right" (defaults to k_left),
                        "max_gap" and "min_gap" (defaults to 0) are read
        :return: list of strings (one per entry returned by the helper), or None (after emitting a warning) when
                 the sequence is shorter than k_left + k_right + max_gap
        """
        settings = params.model
        k_left = settings.get('k_left')
        k_right = settings.get('k_right', k_left)
        max_gap = settings.get('max_gap')
        min_gap = settings.get('min_gap', 0)

        sequence_length = len(sequence.get_sequence())

        if sequence_length < k_left + k_right + max_gap:
            warnings.warn('Sequence length is less than k_left + k_right + max_gap. Ignoring sequence')
            return None

        raw_features = KmerHelper.create_IMGT_gapped_kmers_from_sequence(sequence,
                                                                         k_left=k_left,
                                                                         max_gap=max_gap,
                                                                         min_gap=min_gap,
                                                                         k_right=k_right)

        # each entry is an iterable of parts; stringify the parts and join them into one feature name
        joined_features = []
        for parts in raw_features:
            joined_features.append(Constants.FEATURE_DELIMITER.join([str(part) for part in parts]))

        return joined_features
Esempio n. 7
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Encodes a receptor sequence as its k-mers.

        Args:
            sequence: ReceptorSequence object
            params: EncoderParams object; params.model["k"] is the k-mer length and the optional
                params.model["sequence_type"] (None when absent) selects which sequence is used

        Returns:
            the result of KmerHelper.create_kmers_from_sequence, or None (after logging a warning) when the
            sequence is shorter than k

        """
        settings = params.model
        k = settings["k"]
        sequence_type = settings.get('sequence_type', None)

        length = len(sequence.get_sequence(sequence_type))

        # long enough: delegate to the helper
        if length >= k:
            return KmerHelper.create_kmers_from_sequence(sequence=sequence, k=k, sequence_type=sequence_type)

        logging.warning(f'KmerSequenceEncoder: Sequence length {length} is less than {k}. Ignoring sequence...')
        return None
Esempio n. 8
0
 def test_create_all_kmers(self):
     """Check that 16 k-mers of length 2 are generated over "ABCD", including "BD" and "DA"."""
     alphabet = list("ABCD")
     k = 2
     kmers = KmerHelper.create_all_kmers(k=k, alphabet=alphabet)
     self.assertEqual(len(kmers), 16)
     # assertIn reports the missing k-mer and the container on failure
     for expected_kmer in ("BD", "DA"):
         self.assertIn(expected_kmer, kmers)
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Creates k-mers via KmerHelper.create_IMGT_kmers_from_sequence and flattens each returned entry into a
        single string by joining its parts with Constants.FEATURE_DELIMITER.

        :param sequence: ReceptorSequence
        :param params: EncoderParams (params.model["k"] is the k-mer length; the optional
                        params.model["sequence_type"] is None when absent)
        :return: list of strings (one per entry returned by the helper), or None (after logging a warning) when
                 the sequence is shorter than k
        """
        settings = params.model
        k = settings["k"]
        sequence_type = settings.get('sequence_type', None)

        sequence_length = len(sequence.get_sequence(sequence_type))

        if sequence_length < k:
            logging.warning('Sequence length is less than k. Ignoring sequence')
            return None

        raw_features = KmerHelper.create_IMGT_kmers_from_sequence(sequence=sequence, k=k, sequence_type=sequence_type)

        # each entry is an iterable of parts; stringify the parts and join them into one feature name
        joined_features = []
        for parts in raw_features:
            joined_features.append(Constants.FEATURE_DELIMITER.join([str(part) for part in parts]))

        return joined_features
Esempio n. 10
0
 def test_create_IMGT_kmers_from_sequence(self):
     """Check that the expected (3-mer, position) pairs are produced for "CASSRYUF"."""
     kmers = KmerHelper.create_IMGT_kmers_from_sequence(ReceptorSequence("CASSRYUF"), 3, sequence_type=SequenceType.AMINO_ACID)
     expected_pairs = [
         ("CAS", 105),
         ("ASS", 106),
         ("SSR", 107),
         ("SRY", 108),
         ("RYU", 114),
         ("YUF", 115),
     ]
     # one assertion per pair so that a failure names the missing pair
     for pair in expected_pairs:
         self.assertIn(pair, kmers)
Esempio n. 11
0
 def test_create_IMGT_gapped_kmers_from_sequence(self):
     """Check that the expected (gapped k-mer, position) pairs are produced for "CASSRYUF"."""
     kmers = KmerHelper.create_IMGT_gapped_kmers_from_sequence(
         ReceptorSequence("CASSRYUF"), 2, 1, 1, 1)
     expected_pairs = [
         ('CA.S', 105),
         ('AS.R', 106),
         ('SS.Y', 107),
         ('SR.U', 108),
         ('RY.F', 114),
     ]
     # assertTrue(all([...])) only reports "False is not true"; assertIn names the missing pair
     for pair in expected_pairs:
         self.assertIn(pair, kmers)
Esempio n. 12
0
    def create_model(self, dataset: RepertoireDataset, k: int,
                     vector_size: int, batch_size: int, model_path: Path):
        """
        Build a Word2Vec model whose vocabulary is every possible k-mer and train it, for each k-mer, on the
        sentences returned by KmerHelper.create_kmers_within_HD with distance 1; then save the model.

        Args:
            dataset: not used in this method body
            k: k-mer length used to generate the vocabulary
            vector_size: passed to ``Word2Vec`` as ``size``
            batch_size: not used in this method body
            model_path: location the model is saved to (converted with ``str``)

        Returns:
            the trained Word2Vec model
        """
        word2vec = Word2Vec(size=vector_size, min_count=1, window=5)  # empty model, vocabulary is added below

        # every possible k-mer over the alphabet, each wrapped as a one-word sentence for build_vocab
        vocabulary = [
            [kmer] for kmer in KmerHelper.create_all_kmers(k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
        ]
        word2vec.build_vocab(vocabulary)

        # each vocabulary entry is a single-element list, so unpack the k-mer directly
        for (kmer,) in vocabulary:
            neighbour_sentences = KmerHelper.create_kmers_within_HD(kmer=kmer,
                                                                    alphabet=EnvironmentSettings.get_sequence_alphabet(),
                                                                    distance=1)
            word2vec.train(sentences=neighbour_sentences,
                           total_words=len(vocabulary),
                           epochs=word2vec.epochs)

        word2vec.save(str(model_path))
        return word2vec
Esempio n. 13
0
    def _encode_repertoire(self, repertoire, vectors):
        """
        Sum the word vectors of all k-mers (length ``self.k``) of all sequences in the repertoire.

        Args:
            repertoire: object whose ``sequences`` attribute is iterated
            vectors: object providing ``vector_size`` and ``get_vector(kmer)``

        Returns:
            numpy array of length ``vectors.vector_size``; k-mers for which ``get_vector`` raises KeyError
            contribute nothing
        """
        dimension = vectors.vector_size
        total = np.zeros(dimension)

        for sequence in repertoire.sequences:
            # accumulate per sequence first, then fold into the repertoire total
            per_sequence = np.zeros(dimension)
            for kmer in KmerHelper.create_kmers_from_sequence(sequence=sequence, k=self.k):
                try:
                    embedding = vectors.get_vector(kmer)
                except KeyError:
                    # k-mer has no vector: skip it
                    continue
                per_sequence = np.add(per_sequence, embedding)

            total = np.add(total, per_sequence)

        return total
Esempio n. 14
0
    def test_create_sentences_from_repertoire(self):
        """Check that a repertoire of three sequences yields three sentences, the first holding "AAC" and "ACT"."""
        path = EnvironmentSettings.tmp_test_path / "kmer/"
        PathBuilder.build(path)

        # try/finally so the temporary directory is removed even when an assertion (or the build) fails
        try:
            rep = Repertoire.build_from_sequence_objects([ReceptorSequence(amino_acid_sequence="AACT"),
                                                          ReceptorSequence(amino_acid_sequence="ACCT"),
                                                          ReceptorSequence(amino_acid_sequence="AACT")], path, {})

            sentences = KmerHelper.create_sentences_from_repertoire(rep, 3, sequence_type=SequenceType.AMINO_ACID)

            self.assertEqual(3, len(sentences))
            # separate assertions so that a failure shows which expectation was violated
            self.assertEqual(2, len(sentences[0]))
            self.assertIn("AAC", sentences[0])
            self.assertIn("ACT", sentences[0])
        finally:
            shutil.rmtree(path)
Esempio n. 15
0
    def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
        """
        Creates k-mers from a sequence by delegating to KmerHelper.create_kmers_from_sequence.

        :param sequence: ReceptorSequence
        :param params: EncoderParams (where params.model["k"] is used as the k-mer length)
        :return: the result of KmerHelper.create_kmers_from_sequence, or None (after logging a warning) when the
                 sequence is shorter than k
        """
        k = params.model["k"]
        sequence_length = len(sequence.get_sequence())

        # long enough: delegate to the helper
        if sequence_length >= k:
            return KmerHelper.create_kmers_from_sequence(sequence, k)

        logging.warning('KmerSequenceEncoder: Sequence length is less than k. Ignoring sequence...')
        return None
Esempio n. 16
0
    def compute_tcrb_relative_abundance(sequences: np.ndarray, counts: np.ndarray, k: int) -> dict:
        """
        Computes the relative abundance of k-mers in the repertoire, where C is the template count of a receptor sequence and T is the total
        count across all receptor sequences. Each receptor sequence gets the relative abundance C / T, and a k-mer's relative abundance is the
        maximum of these values over the receptor sequences in which the k-mer appears:

        .. math::

            T^{TCR \\beta} = \\sum_{TCR\\beta} C^{TCR\\beta}

            RA^{TCR\\beta} = \\frac{C^{TCR\\beta}}{T^{TCR\\beta}}

            RA = \\max_{\\underset{with \\, kmer}{TCR\\beta}} {RA^{TCR \\beta}}

        For more details, please see the original publication: Ostmeyer J, Christley S, Toby IT, Cowell LG. Biophysicochemical motifs in T cell
        receptor sequences distinguish repertoires from tumor-infiltrating lymphocytes and adjacent healthy tissue. Cancer Res. Published online
        January 1, 2019:canres.2292.2018. `doi:10.1158/0008-5472.CAN-18-2292 <https://cancerres.aacrjournals.org/content/canres/79/7/1671.full.pdf>`_

        Arguments:

            sequences: an array of (amino acid) sequences (corresponding to a repertoire)
            counts: an array of counts, one for each of the sequences
            k: the length of the k-mer (in the publication referenced above, k is 4)

        Returns:

            a dictionary where keys are k-mers and values are their relative abundances in the given list of sequences

        """
        abundance_per_sequence = counts / np.sum(counts)

        max_abundance_by_kmer = {}
        for position, sequence in enumerate(sequences):
            for kmer in KmerHelper.create_kmers_from_string(sequence, k):
                candidate = abundance_per_sequence[position]
                known = max_abundance_by_kmer.get(kmer)
                # keep the largest abundance seen so far for this k-mer
                if known is None or known < candidate:
                    max_abundance_by_kmer[kmer] = candidate

        return max_abundance_by_kmer
Esempio n. 17
0
    def compute_relative_abundance(sequences: np.ndarray, counts: np.ndarray, k: int) -> dict:
        """
        Computes the relative abundance of k-mers in the repertoire per following equations where C is the template count, T is the total count and
        RA is relative abundance (the output of the function for each k-mer separately):

        .. math::

            C^{kmer}=\\sum_{\\underset{with kmer}{TCR \\beta}} C^{TCR \\beta}

            T^{kmer} = \\sum_{kmer} C^{kmer}

            RA = \\frac{C^{kmer}}{T^{kmer}}

        A sequence contributes its count once per distinct k-mer it contains, even if the k-mer occurs several times in that sequence.
        K-mers whose accumulated count is not positive are left out of the result. Counts are assumed to be non-negative.

        For more details, please see the original publication: Ostmeyer J, Christley S, Toby IT, Cowell LG. Biophysicochemical motifs in T cell
        receptor sequences distinguish repertoires from tumor-infiltrating lymphocytes and adjacent healthy tissue. Cancer Res. Published online
        January 1, 2019:canres.2292.2018. `doi:10.1158/0008-5472.CAN-18-2292 <https://cancerres.aacrjournals.org/content/canres/79/7/1671.full.pdf>`_

        Arguments:

            sequences: an array of (amino acid) sequences (corresponding to a repertoire)
            counts: an array of counts for each of the sequences
            k: the length of the k-mer (in the publication referenced above, k is 4)

        Returns:

            a dictionary where keys are k-mers and values are their relative abundances in the given list of sequences

        """

        c_kmers = Counter()
        for index, sequence in enumerate(sequences):
            kmers = KmerHelper.create_kmers_from_string(sequence, k)
            # Accumulate key by key rather than with `Counter += dict`: the in-place addition rescans the whole
            # counter after every sequence to drop non-positive entries, which is quadratic over a repertoire.
            # dict.fromkeys de-duplicates the k-mers of this sequence while keeping their first-seen order.
            for kmer in dict.fromkeys(kmers):
                c_kmers[kmer] += counts[index]

        # drop non-positive totals once, at the end (matches what Counter addition does for non-negative counts)
        positive_counts = {kmer: count for kmer, count in c_kmers.items() if count > 0}

        t_kmers = sum(positive_counts.values())

        return {kmer: count / t_kmers for kmer, count in positive_counts.items()}
Esempio n. 18
0
 def test_create_gapped_kmers_from_string(self):
     """Check that the expected gapped k-mers are produced for "CASSRYUF"."""
     kmers = KmerHelper.create_gapped_kmers_from_string(
         "CASSRYUF", 2, 1, 1, 1)
     # assertTrue(all([...])) only reports "False is not true"; assertIn names the missing k-mer
     for expected_kmer in ['CA.S', 'AS.R', 'SS.Y', 'SR.U', 'RY.F']:
         self.assertIn(expected_kmer, kmers)