def test_create_kmers_from_string(self):
    """Check k-mer extraction from a plain string, including an input shorter than k."""
    kmers = KmerHelper.create_kmers_from_string("ABCDEFG", 3)
    expected_kmers = ["ABC", "BCD", "CDE", "DEF", "EFG"]
    self.assertTrue(all(expected in kmers for expected in expected_kmers))
    self.assertEqual(5, len(kmers))

    # "AB" is shorter than k=3, so the result must be empty.
    kmers_from_short_string = KmerHelper.create_kmers_from_string("AB", 3)
    self.assertEqual(0, len(kmers_from_short_string))
def test_create_kmers_from_sequence(self):
    """Check k-mer extraction from a ReceptorSequence, including a sequence shorter than k."""
    long_sequence = ReceptorSequence(amino_acid_sequence="ABCDEFG")
    kmers = KmerHelper.create_kmers_from_sequence(long_sequence, 3, sequence_type=SequenceType.AMINO_ACID)
    expected_kmers = ["ABC", "BCD", "CDE", "DEF", "EFG"]
    self.assertTrue(all(expected in kmers for expected in expected_kmers))
    self.assertEqual(5, len(kmers))

    # The sequence "AB" is shorter than k=3, so the result must be empty.
    short_sequence = ReceptorSequence(amino_acid_sequence="AB")
    kmers_from_short_sequence = KmerHelper.create_kmers_from_sequence(
        short_sequence, 3, sequence_type=SequenceType.AMINO_ACID)
    self.assertEqual(0, len(kmers_from_short_sequence))
def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: Path):
    """
    Builds a Word2Vec model whose vocabulary is every possible k-mer over the sequence
    alphabet, trains it on the k-mer sentences of each repertoire in the dataset, and
    stores it on disk.

    Args:
        dataset: dataset whose repertoires are read via get_data(batch_size=...)
        k: k-mer length
        vector_size: passed to Word2Vec as `size`
        batch_size: passed to dataset.get_data
        model_path: location the model is saved to

    Returns:
        the trained Word2Vec model
    """
    model = Word2Vec(size=vector_size, min_count=1, window=5)  # creates an empty model

    # each possible k-mer becomes a one-word sentence, so the vocabulary holds all of them
    vocabulary = [
        [kmer]
        for kmer in KmerHelper.create_all_kmers(k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
    ]
    model.build_vocab(vocabulary)

    for repertoire in dataset.get_data(batch_size=batch_size):
        model.train(
            sentences=KmerHelper.create_sentences_from_repertoire(repertoire=repertoire, k=k),
            total_words=len(vocabulary),
            epochs=15,
        )

    model.save(str(model_path))
    return model
def test_create_kmers_within_HD(self):
    """Check that 15 entries are produced for "ACT" over a 5-letter alphabet at distance 1."""
    kmers = KmerHelper.create_kmers_within_HD("ACT", list("ACTEF"), 1)
    self.assertEqual(15, len(kmers))

    # The second element of every entry must share at least one letter with the original k-mer.
    original_letters = set("ACT")
    for entry in kmers:
        self.assertTrue(original_letters & set(entry[1]))
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Creates gapped k-mers from a sequence as features for use in KmerFrequencyEncoder.
    A gapped k-mer has a k-mer of length k_left on the left side of the gap and a k-mer
    of length k_right on the right side; min_gap and max_gap are forwarded to KmerHelper
    as the gap-length bounds.

    :param sequence: ReceptorSequence
    :param params: EncoderParams (within the "model", the following keys are used:
        "k_left", "k_right" (defaults to k_left), "max_gap", "min_gap" (defaults to 0)
        and "sequence_type")
    :return: the gapped k-mers produced by KmerHelper.create_gapped_kmers_from_sequence,
        or None (after a warning) if the sequence is shorter than k_left + k_right + max_gap
    """
    settings = params.model
    k_left = settings.get('k_left')
    k_right = settings.get('k_right', k_left)
    max_gap = settings.get('max_gap')
    min_gap = settings.get('min_gap', 0)
    sequence_type = settings.get('sequence_type', None)

    sequence_length = len(sequence.get_sequence(sequence_type))
    is_too_short = sequence_length < k_left + k_right + max_gap
    if is_too_short:
        warnings.warn('Sequence length is less than k_left + k_right + max_gap. Ignoring sequence')
        return None

    return KmerHelper.create_gapped_kmers_from_sequence(
        sequence,
        k_left=k_left,
        max_gap=max_gap,
        min_gap=min_gap,
        k_right=k_right,
        sequence_type=sequence_type,
    )
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Creates gapped k-mers with IMGT positions from a sequence as features for use in
    KmerFrequencyEncoder. A gapped k-mer has a k-mer of length k_left on the left side of
    the gap and a k-mer of length k_right on the right side; min_gap and max_gap are
    forwarded to KmerHelper as the gap-length bounds.

    :param sequence: ReceptorSequence
    :param params: EncoderParams (within the "model", the following keys are used:
        "k_left", "k_right" (defaults to k_left), "max_gap", "min_gap" (defaults to 0))
    :return: list of strings, one per gapped k-mer, where the parts of each k-mer entry are
        joined by Constants.FEATURE_DELIMITER; None (after a warning) if the sequence is
        shorter than k_left + k_right + max_gap
    """
    settings = params.model
    k_left = settings.get('k_left')
    k_right = settings.get('k_right', k_left)
    max_gap = settings.get('max_gap')
    min_gap = settings.get('min_gap', 0)

    sequence_length = len(sequence.get_sequence())
    is_too_short = sequence_length < k_left + k_right + max_gap
    if is_too_short:
        warnings.warn('Sequence length is less than k_left + k_right + max_gap. Ignoring sequence')
        return None

    kmers_with_positions = KmerHelper.create_IMGT_gapped_kmers_from_sequence(
        sequence,
        k_left=k_left,
        max_gap=max_gap,
        min_gap=min_gap,
        k_right=k_right,
    )

    # flatten every entry into a single feature name
    feature_names = []
    for entry in kmers_with_positions:
        feature_names.append(Constants.FEATURE_DELIMITER.join(str(part) for part in entry))
    return feature_names
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Encodes a receptor sequence into a sequence of k-mers

    Args:
        sequence: ReceptorSequence object
        params: EncoderParams object; params.model["k"] is the k-mer length and the optional
            params.model["sequence_type"] selects which sequence is read

    Returns:
        the k-mers produced by KmerHelper.create_kmers_from_sequence, or None (after a
        logged warning) if the sequence is shorter than k
    """
    k = params.model["k"]
    sequence_type = params.model.get('sequence_type', None)
    length = len(sequence.get_sequence(sequence_type))

    if length >= k:
        return KmerHelper.create_kmers_from_sequence(sequence=sequence, k=k, sequence_type=sequence_type)

    logging.warning(f'KmerSequenceEncoder: Sequence length {length} is less than {k}. Ignoring sequence...')
    return None
def test_create_all_kmers(self):
    """Check that all 4^2 = 16 two-letter k-mers are generated over a four-letter alphabet."""
    kmers = KmerHelper.create_all_kmers(k=2, alphabet=list("ABCD"))

    self.assertEqual(len(kmers), 16)
    for expected_kmer in ("BD", "DA"):
        self.assertIn(expected_kmer, kmers)
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Creates overlapping continuous k-mers with IMGT positions from a sequence as features
    for use in KmerFrequencyEncoder.

    :param sequence: ReceptorSequence
    :param params: EncoderParams, same object as passed into KmerFrequencyEncoder
        (params.model["k"] is used, together with the optional params.model["sequence_type"])
    :return: list of strings, one per k-mer, where the parts of each k-mer entry are joined by
        Constants.FEATURE_DELIMITER; None (after a logged warning) if the sequence is shorter than k
    """
    k = params.model["k"]
    sequence_type = params.model.get('sequence_type', None)

    if len(sequence.get_sequence(sequence_type)) < k:
        logging.warning('Sequence length is less than k. Ignoring sequence')
        return None

    kmers_with_positions = KmerHelper.create_IMGT_kmers_from_sequence(
        sequence=sequence, k=k, sequence_type=sequence_type)

    # flatten every entry into a single feature name
    feature_names = []
    for entry in kmers_with_positions:
        feature_names.append(Constants.FEATURE_DELIMITER.join(str(part) for part in entry))
    return feature_names
def test_create_IMGT_kmers_from_sequence(self):
    """Check the (k-mer, position) pairs produced for the sequence "CASSRYUF" with k=3."""
    kmers = KmerHelper.create_IMGT_kmers_from_sequence(
        ReceptorSequence("CASSRYUF"), 3, sequence_type=SequenceType.AMINO_ACID)

    expected_pairs = [
        ("CAS", 105),
        ("ASS", 106),
        ("SSR", 107),
        ("SRY", 108),
        ("RYU", 114),
        ("YUF", 115),
    ]
    for pair in expected_pairs:
        self.assertIn(pair, kmers)
def test_create_IMGT_gapped_kmers_from_sequence(self):
    """Check the (gapped k-mer, position) pairs produced for the sequence "CASSRYUF"."""
    kmers = KmerHelper.create_IMGT_gapped_kmers_from_sequence(ReceptorSequence("CASSRYUF"), 2, 1, 1, 1)

    expected_pairs = [
        ('CA.S', 105),
        ('AS.R', 106),
        ('SS.Y', 107),
        ('SR.U', 108),
        ('RY.F', 114),
    ]
    self.assertTrue(all(pair in kmers for pair in expected_pairs))
def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: Path):
    """
    Builds a Word2Vec model whose vocabulary is every possible k-mer over the sequence
    alphabet and trains it, for each k-mer, on the output of
    KmerHelper.create_kmers_within_HD with distance 1. The model is then stored on disk.

    Args:
        dataset: not used by this implementation
        k: k-mer length
        vector_size: passed to Word2Vec as `size`
        batch_size: not used by this implementation
        model_path: location the model is saved to

    Returns:
        the trained Word2Vec model
    """
    model = Word2Vec(size=vector_size, min_count=1, window=5)  # creates an empty model

    # each possible k-mer becomes a one-word sentence, so the vocabulary holds all of them
    vocabulary = [
        [kmer]
        for kmer in KmerHelper.create_all_kmers(k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
    ]
    model.build_vocab(vocabulary)

    for (kmer,) in vocabulary:
        neighbourhood = KmerHelper.create_kmers_within_HD(
            kmer=kmer,
            alphabet=EnvironmentSettings.get_sequence_alphabet(),
            distance=1,
        )
        model.train(sentences=neighbourhood, total_words=len(vocabulary), epochs=model.epochs)

    model.save(str(model_path))
    return model
def _encode_repertoire(self, repertoire, vectors):
    """
    Sums the word vectors of all k-mers of all sequences in a repertoire.

    Args:
        repertoire: object whose `sequences` attribute is iterated
        vectors: object exposing `vector_size` and `get_vector(kmer)`

    Returns:
        numpy array of length vectors.vector_size holding the summed vectors
    """
    repertoire_total = np.zeros(vectors.vector_size)

    for sequence in repertoire.sequences:
        sequence_kmers = KmerHelper.create_kmers_from_sequence(sequence=sequence, k=self.k)
        sequence_total = np.zeros(vectors.vector_size)

        for kmer in sequence_kmers:
            try:
                kmer_vector = vectors.get_vector(kmer)
            except KeyError:
                # k-mers for which get_vector raises KeyError contribute nothing to the sum
                continue
            sequence_total = np.add(sequence_total, kmer_vector)

        repertoire_total = np.add(repertoire_total, sequence_total)

    return repertoire_total
def test_create_sentences_from_repertoire(self):
    """
    Check that a repertoire of three sequences yields three sentences and that the
    first sentence consists of the two 3-mers of "AACT".
    """
    path = EnvironmentSettings.tmp_test_path / "kmer/"
    PathBuilder.build(path)

    try:
        rep = Repertoire.build_from_sequence_objects(
            [
                ReceptorSequence(amino_acid_sequence="AACT"),
                ReceptorSequence(amino_acid_sequence="ACCT"),
                ReceptorSequence(amino_acid_sequence="AACT"),
            ],
            path,
            {},
        )

        sentences = KmerHelper.create_sentences_from_repertoire(rep, 3, sequence_type=SequenceType.AMINO_ACID)

        self.assertEqual(3, len(sentences))
        self.assertTrue(len(sentences[0]) == 2 and "AAC" in sentences[0] and "ACT" in sentences[0])
    finally:
        # clean up the temporary directory even when an assertion above fails,
        # so a failing run does not leave files behind for later tests
        shutil.rmtree(path)
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
    """
    Creates k-mers from a sequence via KmerHelper.create_kmers_from_sequence, as features
    for use in KmerFrequencyEncoder.

    :param sequence: ReceptorSequence
    :param params: EncoderParams, same object as passed into KmerFrequencyEncoder
        (params.model["k"] is used)
    :return: the k-mers returned by KmerHelper.create_kmers_from_sequence, or None
        (after a logged warning) if the sequence is shorter than k
    """
    k = params.model["k"]
    sequence_is_long_enough = len(sequence.get_sequence()) >= k

    if sequence_is_long_enough:
        return KmerHelper.create_kmers_from_sequence(sequence, k)

    logging.warning('KmerSequenceEncoder: Sequence length is less than k. Ignoring sequence...')
    return None
def compute_tcrb_relative_abundance(sequences: np.ndarray, counts: np.ndarray, k: int) -> dict:
    """
    Computes the relative abundance of k-mers in the repertoire per the following equations, where C is the
    template count for the given receptor sequence and T is the total count across all receptor sequences.
    The relative abundance is first computed per receptor sequence; for each k-mer only the maximum is kept,
    so that the k-mer's relative abundance equals the abundance of the most frequent receptor sequence in
    which the k-mer appears:

    .. math::

        T^{TCR \\beta} = \\sum_{TCR\\beta} C^{TCR\\beta}

        RA^{TCR\\beta} = \\frac{C^{TCR\\beta}}{T^{TCR\\beta}}

        RA = \\max_{\\underset{with \\, kmer}{TCR\\beta}} {RA^{TCR \\beta}}

    For more details, please see the original publication: Ostmeyer J, Christley S, Toby IT, Cowell LG.
    Biophysicochemical motifs in T cell receptor sequences distinguish repertoires from tumor-infiltrating
    lymphocytes and adjacent healthy tissue. Cancer Res. Published online January 1, 2019:canres.2292.2018.
    `doi:10.1158/0008-5472.CAN-18-2292 <https://cancerres.aacrjournals.org/content/canres/79/7/1671.full.pdf>`_

    Arguments:

        sequences: an array of (amino acid) sequences (corresponding to a repertoire)

        counts: an array of counts for each of the sequences

        k: the length of the k-mer (in the publication referenced above, k is 4)

    Returns:

        a dictionary where keys are k-mers and values are their relative abundances in the given list of sequences

    """
    abundance_per_sequence = counts / np.sum(counts)
    max_abundance_per_kmer = {}

    for position, sequence in enumerate(sequences):
        for kmer in KmerHelper.create_kmers_from_string(sequence, k):
            candidate = abundance_per_sequence[position]
            best_so_far = max_abundance_per_kmer.get(kmer)
            # keep only the largest per-sequence abundance seen for this k-mer
            if best_so_far is None or best_so_far < candidate:
                max_abundance_per_kmer[kmer] = candidate

    return max_abundance_per_kmer
def compute_relative_abundance(sequences: np.ndarray, counts: np.ndarray, k: int) -> dict:
    """
    Computes the relative abundance of k-mers in the repertoire per the following equations, where C is the
    template count, T is the total count and RA is the relative abundance (the output of the function for
    each k-mer separately):

    .. math::

        C^{kmer}=\\sum_{\\underset{with kmer}{TCR \\beta}} C^{TCR \\beta}

        T^{kmer} = \\sum_{kmer} C^{kmer}

        RA = \\frac{C^{kmer}}{T^{kmer}}

    For more details, please see the original publication: Ostmeyer J, Christley S, Toby IT, Cowell LG.
    Biophysicochemical motifs in T cell receptor sequences distinguish repertoires from tumor-infiltrating
    lymphocytes and adjacent healthy tissue. Cancer Res. Published online January 1, 2019:canres.2292.2018.
    `doi:10.1158/0008-5472.CAN-18-2292 <https://cancerres.aacrjournals.org/content/canres/79/7/1671.full.pdf>`_

    Arguments:

        sequences: an array of (amino acid) sequences (corresponding to a repertoire)

        counts: an array of counts for each of the sequences

        k: the length of the k-mer (in the publication referenced above, k is 4)

    Returns:

        a dictionary where keys are k-mers and values are their relative abundances in the given list of sequences

    """
    count_per_kmer = Counter()

    for position, sequence in enumerate(sequences):
        sequence_kmers = KmerHelper.create_kmers_from_string(sequence, k)
        # the dict collapses repeated k-mers, so a sequence contributes its count once per distinct k-mer
        count_per_kmer += {kmer: counts[position] for kmer in sequence_kmers}

    total_count = sum(count_per_kmer.values())

    return {kmer: kmer_count / total_count for kmer, kmer_count in count_per_kmer.items()}
def test_create_gapped_kmers_from_string(self):
    """Check the gapped k-mers produced for the string "CASSRYUF"."""
    kmers = KmerHelper.create_gapped_kmers_from_string("CASSRYUF", 2, 1, 1, 1)

    expected_kmers = ('CA.S', 'AS.R', 'SS.Y', 'SR.U', 'RY.F')
    self.assertTrue(all(expected in kmers for expected in expected_kmers))