def test_dump(self):
    """Dump a BWT to a file, load it back, and check the two agree.

    The index is written with ``dump`` to a temporary file and reloaded
    through ``BWT('', file)``; the ``seq``, ``sa`` and ``alphabet``
    attributes of both objects must then be equal.
    """
    s = 'ACGTACGTAGCAT'
    bwt = BWT(s)
    # The context manager closes (and thereby removes) the temporary file
    # even when dumping or reloading raises, so a failing test cannot leak
    # the file handle.
    with TemporaryFile() as t:
        bwt.dump(t)
        # Rewind so the reload starts reading at the beginning of the dump.
        t.seek(0)
        bwt2 = BWT('', t)
    assert_array_equal(bwt.seq, bwt2.seq)
    assert_array_equal(bwt.sa, bwt2.sa)
    self.assertEqual(bwt.alphabet, bwt2.alphabet)
def test_search(self):
    """Check ``BWT.search`` on a tiny text, a fixed 1000-mer and random data.

    Hits are compared as 0-based start offsets of the pattern in the text.
    """
    # Small hand-checkable case: 'ACGT' starts at offsets 0 and 4.
    s1 = 'ACGTACGTAGCAT'
    alphabet = list(set(s1))
    bwt1 = BWT(s1)
    self.assertEqual(list(bwt1.search('ACGT')), [0, 4])
    del s1, bwt1

    # Fixed sequence with known hit positions for 'ACGT'.
    s2 = (
        'TTCGGCAGTCACCGCGGTTTTCTCGAACGCTTAGGGATAATCGGATCATTCTCACGTAACAGGGTTACGGAGAGTGGAGTGGT'
        'TGCATGTTAGCCCGCTCTATGCTGGCACCGTGCGGCCACAAACTTATACGTTCTCAGCAGGTATTGTCTCCGGCAATGTTCTC'
        'TACTGAGGGATGCAATGACATTACGCCACTTATCATTTTAGAAATGCGAGCTTCGAGGGCTGGTGCCGACAGGGCCCTAGCTT'
        'CGCGCTGCTACACTCTCTAGTCACTAGACACATCTCCATGGGGGAGATAATCTTCGTTTTCCGAGCATGAAACGTACCGGTTA'
        'CACCCATCATAACGGTGAGAGTTAGTGTGGTTTTTCGGACCAACGGACTGCTGGGTTGCGTGGTTCATTGGTCCTTGACGACG'
        'AGCAATAGCATGACGCTTTAATAAATCGTTCTACGTTGGGAGGTAACGCGGAATCCCAAGGCCCTCGACATCGTCCTCTCCAT'
        'ACGAGACACGAAACATTTATATCACTTCGGGCCATTATGCATATCAGTTGGCTGGTTCCTCTTGACGTTAAAATAGGTGGGAA'
        'GTTATTCGATACCAACGTCTGCAGGTACCGAATAGTGCACGGCGACACACCACAGGGATCTATTAATAAACCTGGTGATGTGA'
        'TTGGTCCGACTTCGACTAACAACGATTCGACCAGTCTTAATTCTGATCTCGTGACCCGTGTCCTTATTCCATCTAATGAAATT'
        'CCTGTGGCGATCGGGCACTTCCCGGCGTAAAGTAACCTCCGGACGGCCTAGATTCTATTCTGAAGCTCGCTCGTTTGGAACTT'
        'GGGGCGCTGAGTCATTACGGGCGCGGTTCTACCATGCACAGCAATATTACACCACTCCTCCCACAGAATCTTCCGACGTGAAG'
        'GAATGCCCGCAGACAGACATGCTGGTGAAACTGCACCACGACCTTTCGCAACCAGCGCCGGGCGAAAGTCAGTTCAGTCTGCC'
        'GGAC'
    )
    bwt2 = BWT(s2)
    self.assertEqual(sorted(bwt2.search('ACGT')),
                     [53, 130, 320, 447, 562, 595, 905])
    del s2, bwt2

    # Randomised cross-check against a regex scan. The zero-width lookahead
    # makes finditer report overlapping occurrences as well.
    s3 = ''.join(choice(alphabet) for _ in range(1000))
    bwt3 = BWT(s3)
    # A random 100-mer practically never occurs in a random 1000-mer, so on
    # its own it only exercises the "no hit" path ...
    n3 = ''.join(choice(alphabet) for _ in range(100))
    # ... therefore also search for a short needle cut out of the text
    # itself, which is guaranteed to occur at least once.
    start = choice(range(len(s3) - 3))
    planted = s3[start:start + 4]
    for needle in (n3, planted):
        expected = [m.start()
                    for m in re.finditer('(?=' + needle + ')', s3)]
        # Compare positions, not just counts, so wrong offsets are caught.
        self.assertEqual(sorted(bwt3.search(needle)), expected)
if __name__ == '__main__':
    # Command-line entry point: build or load BWT indices for FASTA files.
    # Parse some arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('command', help="Command [index, search]")
    parser.add_argument('files', metavar='file', nargs='+',
                        help='files to index/search (FASTA)')
    parser.add_argument('--fastq', help='fastq file containing reads to map')
    parser.add_argument('--fasta', help='fasta file containing reads to map')
    parser.add_argument('--ed', type=int,
                        help='maximum edit distance to search for')
    args = parser.parse_args()
    # Echo the parsed arguments; the parenthesised form prints the same
    # text under Python 2 and is also valid Python 3.
    print(args)

    # 'index' goes through all the input files, creates an index for each
    # and dumps it next to the input as '<file>.index'.
    if args.command == 'index':
        for fasta in args.files:
            # str(seq) replaces the deprecated Seq.tostring().
            bwt = BWT(str(SeqIO.read(fasta, 'fasta').seq))
            with open(fasta + '.index', 'w') as out:
                bwt.dump(out)
            # Drop the index before building the next one.
            del bwt

    # For the meat:
    elif args.command == 'search':
        # Pull all of our genomes into memory, keyed by FASTA path.
        indices = {}
        # Try to grab existing indices, otherwise create them.
        for fasta in args.files:
            try:
                with open(fasta + '.index') as f:
                    indices[fasta] = BWT('', f)
            except IOError:
                # No readable '<file>.index': build from the FASTA instead.
                indices[fasta] = BWT(str(SeqIO.read(fasta, 'fasta').seq))