Ejemplo n.º 1
0
 def test_find_subsequence_indices(self):
     """Exercise find_substring_indices: absent/present motifs, overlap flag, offset and empty inputs."""
     # (sequence, motif, offset, overlap, expected indices), checked in this order
     cases = [
         # no substrings
         ("AAAAA", 'G', 0, True, []),
         # yes substrings
         ("AAAAA", 'A', 0, True, [0, 1, 2, 3, 4]),
         ("AAAAA", 'AA', 0, True, [0, 1, 2, 3]),
         # test overlap: with overlap disabled, hits may not share characters
         ("AAAAA", 'AA', 0, False, [0, 2]),
         # test offset: the reported index is shifted by the offset
         ("ATGCATGC", 'ATGCATGC', 1, True, [1]),
         # identical call again; a fresh generator must give the same answer
         ("ATGCATGC", 'ATGCATGC', 1, True, [1]),
         # compare with gatc motifs
         ("gatcgatc", 'gatc', 1, True, [1, 5]),
     ]
     # inputs for which the first item requested from the generator must fail
     empty_inputs = [
         ("", 'gatc'),
         ("gatcgatc", ''),
     ]
     with captured_output() as (_, _):
         for sequence, motif, offset, overlap, expected in cases:
             found = find_substring_indices(sequence, motif, offset=offset, overlap=overlap)
             self.assertEqual(list(found), expected)
         # if empty string passed
         for sequence, motif in empty_inputs:
             generator = find_substring_indices(sequence, motif, offset=1, overlap=True)
             with self.assertRaises(AssertionError):
                 next(generator)
Ejemplo n.º 2
0
def find_gatc_motifs(sequence):
    """Locate the 'A' of every 'GATC' motif in a nucleotide sequence.

    The sequence is converted to uppercase before searching.

    :param sequence: nucleotide sequence to scan; GATC is a DNA motif, so a DNA sequence is expected
    :return: the result of find_substring_indices for motif "GATC" with offset=1,
             i.e. a generator of the index of the 'A' within each 'GATC'
    """
    motif = "GATC"
    # the 'A' sits one character after the start of the motif
    adenine_offset = 1
    uppercase_sequence = sequence.upper()
    return find_substring_indices(uppercase_sequence, motif, offset=adenine_offset)
Ejemplo n.º 3
0
def find_motifs_sequence_positions(sequence, motifs, overlap=False):
    """Find the positions in a nucleotide sequence that are edited by find-and-replace motifs.

    note: the sequence and every "find" motif are converted to uppercase before searching

    :param sequence: nucleotide sequence
    :param motifs: list of motifs which need to be replaced: eg [[find, replace]], [["CCAGG", "CEAGG"]];
                   every entry must itself be a list of exactly two items
    :param overlap: boolean option to look for motif overlaps
    :return: generator yielding (index, old_char, substitution_char) for each edited position, where
             offset, old_char and substitution_char come from find_modification_index_and_character and
             index comes from find_substring_indices called with that offset
    :raises AssertionError: if an entry of motifs is not a two-item list, or if two edits land on the
                            same index of the sequence
    """
    # upper-case once instead of once per motif pair
    upper_sequence = sequence.upper()
    already_replaced_indexes = set()
    # gather motifs
    for motif_pair in motifs:
        # Check the type before the length so a non-sized entry fails with the documented
        # AssertionError rather than a TypeError from len(). Raised explicitly (instead of using an
        # assert statement) so the validation is not stripped when Python runs with -O.
        if not (isinstance(motif_pair, list) and len(motif_pair) == 2):
            raise AssertionError(
                "Motifs must be structured as list of lists, even for one motif find and replace")
        find_motif, replace_motif = motif_pair
        # find edit character and offset
        offset, old_char, substitution_char = find_modification_index_and_character(
            find_motif, replace_motif)
        for index in find_substring_indices(upper_sequence,
                                            find_motif.upper(),
                                            offset=offset,
                                            overlap=overlap):
            # make sure that there is no overlapping assignments of characters
            if index in already_replaced_indexes:
                raise AssertionError("Motifs has two different edits to a single nucleotide "
                                     "location. Check motifs {}".format(motifs))
            already_replaced_indexes.add(index)

            yield index, old_char, substitution_char
Ejemplo n.º 4
0
def write_file(test_seq, kmer, outfile):
    """Write a kmer and the positions of its non-overlapping hits in a sequence to a text file.

    The first line of the file is the kmer. Each following line is
    "<position> <len(kmer)>M", where position is an index reported by
    find_substring_indices (overlap=False) plus one.
    NOTE(review): the +1 presumably converts 0-based indices to 1-based - confirm
    against the default offset of find_substring_indices.

    :param test_seq: sequence to search
    :param kmer: substring to look for; also written as the header line
    :param outfile: path of the file to create or overwrite
    """
    with open(outfile, "w") as fh:
        # collect every hit before anything is written
        shifted_starts = [start + 1 for start in find_substring_indices(test_seq, kmer, overlap=False)]
        fh.write(kmer + "\n")
        # the part after the position is the same for every hit
        record_tail = str(len(kmer)) + "M" + "\n"
        fh.writelines(str(start) + " " + record_tail for start in shifted_starts)
Ejemplo n.º 5
0
 def test_count_kmers(self):
     """count_kmers must reject an empty string and k == 0, and agree with find_substring_indices."""
     # invalid arguments are expected to trip an assertion inside count_kmers
     for invalid_args in (('', 1), ('asdf', 0)):
         self.assertRaises(AssertionError, count_kmers, *invalid_args)
     for _ in range(10):
         # random draws stay in the order: string length, string, k
         string_length = np.random.randint(1, 1000)
         random_string = get_random_string(string_length)
         k = np.random.randint(1, 10)
         counts_by_kmer = count_kmers(random_string, k)
         for kmer, reported_count in counts_by_kmer.items():
             # cross-check each count against the number of indices found by searching for the kmer
             n_found = sum(1 for _hit in find_substring_indices(random_string, kmer))
             self.assertEqual(reported_count, n_found,
                              "random_string: {} \n kmer: {}".format(random_string, kmer))