def test_find_subsequence_indices(self):
    """Check find_substring_indices for misses, hits, overlap, offset and empty inputs."""
    # Both captured streams are bound to `_` and never inspected.
    with captured_output() as (_, _):
        # a motif that never occurs produces no indices
        misses = find_substring_indices("AAAAA", 'G', offset=0, overlap=True)
        self.assertEqual(list(misses), [])

        # every position matches a single-character motif
        single_hits = find_substring_indices("AAAAA", 'A', offset=0, overlap=True)
        self.assertEqual(list(single_hits), [0, 1, 2, 3, 4])
        pair_hits = find_substring_indices("AAAAA", 'AA', offset=0, overlap=True)
        self.assertEqual(list(pair_hits), [0, 1, 2, 3])

        # with overlap disabled, matches may not share characters
        disjoint_hits = find_substring_indices("AAAAA", 'AA', offset=0, overlap=False)
        self.assertEqual(list(disjoint_hits), [0, 2])

        # the offset is added to the start index of each match
        shifted_hits = find_substring_indices("ATGCATGC", 'ATGCATGC', offset=1, overlap=True)
        self.assertEqual(list(shifted_hits), [1])
        # same call a second time: a fresh result must give the same answer
        shifted_hits = find_substring_indices("ATGCATGC", 'ATGCATGC', offset=1, overlap=True)
        self.assertEqual(list(shifted_hits), [1])

        # compare with gatc motifs
        gatc_hits = find_substring_indices("gatcgatc", 'gatc', offset=1, overlap=True)
        self.assertEqual(list(gatc_hits), [1, 5])

        # an empty sequence is rejected once the result is first advanced
        empty_sequence_hits = find_substring_indices("", 'gatc', offset=1, overlap=True)
        with self.assertRaises(AssertionError):
            next(empty_sequence_hits)

        # an empty motif is rejected the same way
        empty_motif_hits = find_substring_indices("gatcgatc", '', offset=1, overlap=True)
        with self.assertRaises(AssertionError):
            next(empty_motif_hits)
def find_gatc_motifs(sequence):
    """Locate the 'A' of every 'GATC' motif in a nucleotide sequence

    The sequence is uppercased before searching, so lowercase input matches too.

    :param sequence: since GATC motif is in DNA, expecting a DNA nucleotide sequence
    :return: the result of find_substring_indices for "GATC" with offset=1,
             i.e. the index of the 'A' (position 1 of the motif) for each match
    """
    motif = "GATC"
    uppercase_sequence = sequence.upper()
    return find_substring_indices(uppercase_sequence, motif, offset=1)
def find_motifs_sequence_positions(sequence, motifs, overlap=False):
    """Find locations of edited nucleotides in a sequence using find and replace motifs

    note: the sequence and each "find" motif are converted to uppercase before matching

    :param sequence: nucleotide sequence
    :param motifs: list of motif's which need to be replaced: eg [[find, replace]], [["CCAGG", "CEAGG"]]
    :param overlap: boolean option to look for motif overlaps
    :return: generator yielding (index, old_char, substitution_char) for every motif match
    :raises AssertionError: when an entry of motifs is not a two-element list, or when two
                            edits target the same sequence index
    """
    already_replaced_indexes = set()
    # Loop invariant: uppercase the sequence once instead of once per motif pair.
    upper_sequence = sequence.upper()
    # gather motifs
    for motif_pair in motifs:
        # Check the type before the length so a non-sized entry is reported as a malformed
        # motif rather than as a TypeError from len(). Raised explicitly (not via `assert`)
        # so the validation still runs under `python -O`.
        if not (isinstance(motif_pair, list) and len(motif_pair) == 2):
            raise AssertionError(
                "Motifs must be structured as list of lists, even for one motif find and replace")
        find_motif, replace_motif = motif_pair
        # find edit character and offset
        offset, old_char, substitution_char = find_modification_index_and_character(
            find_motif, replace_motif)
        for index in find_substring_indices(upper_sequence, find_motif.upper(),
                                            offset=offset, overlap=overlap):
            # make sure that there is no overlapping assignments of characters
            if index in already_replaced_indexes:
                raise AssertionError(
                    "Motifs has two different edits to a single nucleotide "
                    "location. Check motifs {}".format(motifs))
            already_replaced_indexes.add(index)
            yield index, old_char, substitution_char
def write_file(test_seq, kmer, outfile):
    """Write the 1-based, non-overlapping positions of kmer in test_seq to outfile

    The first line of the file is the kmer itself. Each match then gets one record of
    the form "<position> <len(kmer)>M".

    NOTE(review): the original statement nesting was not recoverable from the source
    layout; this assumes the length/"M" suffix is written once per match - confirm.

    :param test_seq: sequence to search
    :param kmer: substring to look for
    :param outfile: path of the file to (over)write
    """
    with open(outfile, "w") as fh:
        # Collect all positions before writing anything, shifting from 0-based to 1-based.
        positions = [hit + 1 for hit in find_substring_indices(test_seq, kmer, overlap=False)]
        records = [kmer + "\n"]
        records.extend("{} {}M\n".format(position, len(kmer)) for position in positions)
        fh.writelines(records)
def test_count_kmers(self):
    """Check count_kmers input validation and compare its counts with find_substring_indices."""
    # an empty string and a kmer length of zero are both rejected
    self.assertRaises(AssertionError, count_kmers, '', 1)
    self.assertRaises(AssertionError, count_kmers, 'asdf', 0)

    for _ in range(10):
        rand_len = np.random.randint(1, 1000)
        random_string = get_random_string(rand_len)
        rand_k = np.random.randint(1, 10)
        kmer_counts = count_kmers(random_string, rand_k)
        # every reported count must equal the number of indices found for that kmer
        for kmer, reported_count in kmer_counts.items():
            found_count = sum(1 for _hit in find_substring_indices(random_string, kmer))
            self.assertEqual(reported_count, found_count,
                             "random_string: {} \n kmer: {}".format(random_string, kmer))