def test_parse_gg_summary_flat(self):
    """Records parsed from the flat gg summary text match the expected ones"""
    # one plain dict per record expected back from the parser, in file order
    expected_fields = [
        {'prokMSA_id':'1', 'ncbi_acc_w_ver':'xyzf'},
        {'prokMSA_id':'25', 'ncbi_acc_w_ver':'abcd', 'country':'australia'},
        {'prokMSA_id':'50', 'ncbi_acc_w_ver':'223xx'},
    ]
    expected = [GreengenesRecord(fields) for fields in expected_fields]

    # the parser is a generator, so materialize it before comparing
    summary_file = StringIO(gg_summary)
    observed = list(parse_gg_summary_flat(summary_file))

    self.assertEqual(observed, expected)
def test_write_gg_record(self):
    """A written gg record contains one line per field plus BEGIN/END"""
    expected_lines = [
        'BEGIN',
        'prokmsa_id=123',
        'gg_id=',
        'hugenholtz_tax_string=',
        'ncbi_acc_w_ver=xyz',
        'ncbi_gi=333',
        'n_pos_aligned=',
        'n_pos_unaligned=',
        'db_name=',
        'gold_id=',
        'decision=',
        'prokmsaname=',
        'isolation_source=',
        'clone=foo',
        'organism=',
        'strain=',
        'specific_host=',
        'authors=',
        'title=',
        'pubmed=123',
        'journal=',
        'study_id=',
        'submit_date=',
        'country=',
        'ncbi_tax_string=',
        'silva_tax_string=',
        'rdp_tax_string=',
        'greengenes_tax_string=',
        'non_acgt_percent=0.5',
        'perc_ident_to_invariant_core=',
        'small_gap_intrusions=',
        'bellerophon=',
        'bel3_div_ratio=',
        'chim_slyr_a=',
        'chim_slyr_b=',
        'chim_slyr_a_tax=',
        'chim_slyr_b_tax=',
        'aligned_seq=',
        'unaligned_seq=',
        'END',
        '',
    ]
    # line order is not part of the check, so both sides are compared sorted
    expected_lines.sort()

    populated_fields = {
        'prokmsa_id': 123,
        'ncbi_acc_w_ver': 'xyz',
        'ncbi_gi': '333',
        'pubmed': 123,
        'clone': 'foo',
        'non_acgt_percent': '0.5',
    }
    record = GreengenesRecord(populated_fields)

    out_file = StringIO()
    write_gg_record(out_file, record)

    # rewind so the freshly written text can be read back
    out_file.seek(0)
    written_text = out_file.read()
    observed_lines = written_text.splitlines()
    observed_lines.sort()

    self.assertEqual(observed_lines, expected_lines)
def get_genbank_summary(r):
    """Build a GreengenesRecord summarizing r

    Each (field name, function) pair in parse_funs is applied in order: the
    function is called with r and its result is stored under the field name.
    """
    summary = GreengenesRecord()
    for field_name, extract in parse_funs:
        field_value = extract(r)
        summary[field_name] = field_value
    return summary
def parse_gg_summary_flat(open_file):
    """Parse a flat greengenes summary file from flat_files

    The first line must be a header starting with '#'; the rest of that line
    is split on tabs to obtain the column names. Every following line is
    split on tabs and paired positionally with the column names to fill one
    GreengenesRecord, which is then yielded.

    This is a generator, so the header check and the warning below only
    happen once iteration starts.

    open_file : file-like object supporting readline() and line iteration

    Yields one GreengenesRecord per data line; values are stored as the raw
    strings read from the file.

    Raises ValueError if the first line does not start with '#'.
    """
    header_line = open_file.readline()
    if not header_line.startswith('#'):
        # call form of raise is valid on both Python 2 and Python 3
        raise ValueError("Missing the header!")

    header = header_line[1:].strip().split('\t')

    # a parenthesized single argument prints the same text on Python 2 and 3
    print("WARNING: NOT SETTING TYPES CURRENTLY")

    for line in open_file:
        record = GreengenesRecord()
        # NOTE: strip() removes leading/trailing tabs as well as the newline,
        # so empty first/last fields are dropped; zip() then stops at the
        # shorter sequence and the unmatched columns are simply not set.
        for key, value in zip(header, line.strip().split('\t')):
            record[key] = value
        # record.setTypes() is currently disabled, hence the warning above
        yield record
def test_get_genbank_summary(self):
    """Summary extracted from the gb1 fixture matches the expected record"""
    expected_fields = {
        'ncbi_acc_w_ver':'AGIY01000001.1',
        'ncbi_gi':'354825968',
        'gold_id':'Gi05850',
        'decision':'named_isolate',
        'isolation_source':'anaerobic digested sludge',
        'organism':'Methanolinea tarda NOBI-1',
        'strain':'NOBI-1',
        'prokmsaname':'Methanolinea tarda NOBI-1',
        'specific_host':'Methanolinea tarda NOBI-1 ctg73, whole genome shotgun sequence.',
        'authors':'Lucas,S., Han,J., Lapidus,A., Cheng,J.-F., Goodwin,L., Pitluck,S., Peters,L., Land,M.L., Hauser,L., Imachi,H., Sekiguchi,Y., Kamagata,Y., Cadillo-Quiroz,H., Zinder,S., Liu,W.T., Tamaki,H. and Woyke,T.J.',
        'title':'The draft genome of Methanolinea tarda NOBI-1',
        'submit_date':'31-OCT-2011',
        'country':'Japan: Nagaoka',
        # not currently checked:
        #'NCBI_tax_id':'882090',
        'ncbi_tax_string':'Archaea; Euryarchaeota; Methanomicrobia; Methanomicrobiales; Methanoregulaceae; Methanolinea',
    }
    expected = GreengenesRecord(expected_fields)

    observed = get_genbank_summary(self.gb1)

    self.assertEqual(observed, expected)
def setUp(self):
    """Create the GreengenesRecord fixture stored on self.ggrecord"""
    initial_fields = {'prokmsa_id':123}
    self.ggrecord = GreengenesRecord(initial_fields)