def split_data(fastaFN, partitions):
    """
    Distribute the records of a FASTA file round-robin into buckets.

    :type fastaFN: str
    :param fastaFN: Path of the FASTA file to read.
    :type partitions: int
    :param partitions: Number of buckets to create. Must be positive if the
                       file contains any record: with no buckets the index
                       cycle is empty and ``next()`` raises StopIteration.
    :rtype: list
    :return: A list of ``partitions`` lists. The i-th record yielded by
             ``util.parseFASTA`` is appended to bucket ``i % partitions``.
    """
    import sys

    buckets = [[] for _ in range(partitions)]
    ring = cycle(range(partitions))

    # 'U' is only needed (and only accepted everywhere) on Python 2. Python 3
    # translates newlines by default and rejects the flag from 3.11 onwards.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'

    with open(fastaFN, mode) as inF:
        for seq in util.parseFASTA(inF):
            # The next() builtin works on Python 2.6+ and 3; the .next()
            # method only exists on Python 2 iterators.
            buckets[next(ring)].append(seq)
    return buckets
def test_parseFASTA(self):
    """
    Testing parseFASTA function.

    Parses ``phylotoast/test/test_FASTA.fna`` and compares both the number
    of records and each record against a hand-written expectation.

    :return: Returns OK if test goals were achieved, otherwise raises error.
    """
    FASTARecord = namedtuple("FASTA_Record", "id descr data")
    # Materialise the result so its length can be checked whether
    # parseFASTA returns a list or a lazy iterable.
    parseFASTA_result = list(ut.parseFASTA("phylotoast/test/test_FASTA.fna"))
    manually_parsed = [
        FASTARecord(
            id="PIDF154_1",
            descr="HU82XDC01DBOHO orig_bc=ACAGGTCG new_bc=ACAGGTCG bc_diffs=0",
            data="AGTGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGGAGATTAAGTAGCTTGCTATTTAATCTTAGTGGCGCACGGGTGAGTAATATATAGCTAATCTGCCCTACACTAGAGGACAACAGTTGGAAACGACTGCTAATACTCTATACTCCTTCTTTACATAAGTTAAGTCGGGAAAGTTTTTCGGTGTAGGATGAGGCTATATCGTATCAGCTAGTGGTAGGTAACGGCCTACCAAGGCTATGACGCGTAACTGGTCTGAGAGGATGATCAGTCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTAGGGGAATATTGCTCAAATGGGGGGAAAACCCTGAAAGCAGCAACGCCGCGTGGAGGATGACACTTTTCGGA",
        ),
        FASTARecord(
            id="PIDTA158_2",
            descr="HU82XDC01A3N0T orig_bc=ACCGCAGG new_bc=ACCGCAGG bc_diffs=0",
            data="GATGAACGCTAGCGATAGGCTTAACACATGCAAGTCGAGGGCATCACGAATTAGCAATAGTTTGGTGGCGACCGGCGCACGGGTGCGTAACACGTATACAACCTACCTTCAATTGGGGAATAACCTGGAGAAATTTGGACTAATACCCCATAGTAAACGGGAGAGGCATTCTTTTTTGTTTAAAGATTTATTGATTGGAGATGGGTATGCGTAGGATTAGCTAGTTGGTAAGGTAACGGCTTACCAAGGCAACGATCCTTAGGGGTT",
        ),
        FASTARecord(
            id="PIDF160_3",
            descr="HU82XDC01DTNIU orig_bc=ACCGTAGA new_bc=ACCGTAGA bc_diffs=0",
            data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCTACTTGAACTTCGGTTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGTTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTTTAGGGCATCCTAAGATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGAACGGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGAATATTGGGACAATGGAACCAAAAGTCTGATCCAGCAATTCTGTGTGCACGATG",
        ),
        FASTARecord(
            id="PIDTA.TB168_4",
            descr="HU82XDC01ETBU0 orig_bc=GCGCAACG new_bc=GCGCAACG bc_diffs=0",
            data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCAACTTGAACTTCGGTTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTATATGGCATCGTATAATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGATCGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGGAATATTGGGACAATGGGACCGAGAGTCTGATCCAGCAACTCTGTGTGCACGAT",
        ),
        FASTARecord(
            id="PIDTA.TB140_5",
            descr="HU82XDC01AVWB9 orig_bc=ACTGGAGA new_bc=ACTGGAGA bc_diffs=0",
            data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCAACTTGAATTTGGGTTTTAACTTAGATTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTAGAAATGAATGCTAATACCTGATATTATGATTTTAAGGCATCTTAGAATTATGAAAGCTATAAGCACTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACAGCTCACCAAGGC",
        ),
    ]

    # zip() stops at the shorter input, so without this check a parser that
    # returned too few (or no) records would still pass the loop below.
    self.assertEqual(
        len(parseFASTA_result),
        len(manually_parsed),
        msg="Unexpected number of FASTA records parsed.",
    )
    for rec1, rec2 in zip(parseFASTA_result, manually_parsed):
        self.assertEqual(rec1, rec2,
                         msg="FASTA records not parsed as expected.")
def main():
    """
    Filter a representative-sequence FASTA file down to the OTUs of a BIOM table.

    Collects the ``id`` of every entry under ``rows`` in the JSON file at
    ``args.biom_fp``, parses the FASTA input ``args.repset_fp`` with
    ``util.parseFASTA`` and writes to ``args.repset_out_fp`` every record
    whose ID is in that set. Only the first record seen for a given ID is
    written; later duplicates are skipped.
    """
    args = handle_program_options()

    # Plain 'r' rather than 'rU': the 'U' flag raises ValueError on
    # Python 3.11+, and newline translation is irrelevant here because
    # CR and LF are both insignificant whitespace to the JSON parser.
    with open(args.biom_fp, 'r') as bf:
        biom_otus = {row['id'] for row in json.load(bf)['rows']}

    repset = util.parseFASTA(args.repset_fp)
    fasta_str = ">{} {}\n{}\n"
    seq_ids = set()  # IDs already written, used to drop duplicate records
    with open(args.repset_out_fp, 'w') as out_f:
        for seq in repset:
            if seq.id not in seq_ids and seq.id in biom_otus:
                seq_ids.add(seq.id)
                out_f.write(fasta_str.format(seq.id, seq.descr, seq.data))
def test_parseFASTA(self):
    """
    Testing parseFASTA function.

    Parses ``phylotoast/test/test_FASTA.fna`` and checks the record count
    and every record against a manually written list of expected records.

    :return: Returns OK if test goals were achieved, otherwise raises error.
    """
    FASTARecord = namedtuple("FASTA_Record", "id descr data")
    # list() makes len() usable even if parseFASTA yields records lazily.
    parseFASTA_result = list(ut.parseFASTA("phylotoast/test/test_FASTA.fna"))
    manually_parsed = [
        FASTARecord(id="PIDF154_1",
                    descr="HU82XDC01DBOHO orig_bc=ACAGGTCG new_bc=ACAGGTCG bc_diffs=0",
                    data="AGTGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGGAGATTAAGTAGCTTGCTATTTAATCTTAGTGGCGCACGGGTGAGTAATATATAGCTAATCTGCCCTACACTAGAGGACAACAGTTGGAAACGACTGCTAATACTCTATACTCCTTCTTTACATAAGTTAAGTCGGGAAAGTTTTTCGGTGTAGGATGAGGCTATATCGTATCAGCTAGTGGTAGGTAACGGCCTACCAAGGCTATGACGCGTAACTGGTCTGAGAGGATGATCAGTCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTAGGGGAATATTGCTCAAATGGGGGGAAAACCCTGAAAGCAGCAACGCCGCGTGGAGGATGACACTTTTCGGA"),
        FASTARecord(id="PIDTA158_2",
                    descr="HU82XDC01A3N0T orig_bc=ACCGCAGG new_bc=ACCGCAGG bc_diffs=0",
                    data="GATGAACGCTAGCGATAGGCTTAACACATGCAAGTCGAGGGCATCACGAATTAGCAATAGTTTGGTGGCGACCGGCGCACGGGTGCGTAACACGTATACAACCTACCTTCAATTGGGGAATAACCTGGAGAAATTTGGACTAATACCCCATAGTAAACGGGAGAGGCATTCTTTTTTGTTTAAAGATTTATTGATTGGAGATGGGTATGCGTAGGATTAGCTAGTTGGTAAGGTAACGGCTTACCAAGGCAACGATCCTTAGGGGTT"),
        FASTARecord(id="PIDF160_3",
                    descr="HU82XDC01DTNIU orig_bc=ACCGTAGA new_bc=ACCGTAGA bc_diffs=0",
                    data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCTACTTGAACTTCGGTTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGTTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTTTAGGGCATCCTAAGATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGAACGGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGAATATTGGGACAATGGAACCAAAAGTCTGATCCAGCAATTCTGTGTGCACGATG"),
        FASTARecord(id="PIDTA.TB168_4",
                    descr="HU82XDC01ETBU0 orig_bc=GCGCAACG new_bc=GCGCAACG bc_diffs=0",
                    data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCAACTTGAACTTCGGTTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTATATGGCATCGTATAATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGATCGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGGAATATTGGGACAATGGGACCGAGAGTCTGATCCAGCAACTCTGTGTGCACGAT"),
        FASTARecord(id="PIDTA.TB140_5",
                    descr="HU82XDC01AVWB9 orig_bc=ACTGGAGA new_bc=ACTGGAGA bc_diffs=0",
                    data="GATGAACGCTGACAGAATGCTTAACACATGCAAGTCAACTTGAATTTGGGTTTTAACTTAGATTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTAGAAATGAATGCTAATACCTGATATTATGATTTTAAGGCATCTTAGAATTATGAAAGCTATAAGCACTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACAGCTCACCAAGGC"),
    ]

    # Guard against a vacuous pass: zip() silently truncates to the shorter
    # sequence, so missing records would otherwise go unnoticed.
    self.assertEqual(
        len(parseFASTA_result), len(manually_parsed),
        msg="Unexpected number of FASTA records parsed."
    )
    for rec1, rec2 in zip(parseFASTA_result, manually_parsed):
        self.assertEqual(
            rec1, rec2, msg="FASTA records not parsed as expected."
        )