def test_embl_to_gb(self):
    """Round-trip an EMBL file through GenBank and back to EMBL.

    EMBL records can hold more information than GenBank ones (e.g. more
    than one date, the EMBL class, DOI cross references), so the fixture
    used here is a simple record for which the chain

        embl file -> embl object -> gb file -> gb object -> embl file

    is expected to give back exactly the text of the starting file.
    """
    source_record = DNA.read(self.single_rna_simple_fp, format="embl")

    # serialise the EMBL-derived object as GenBank into an in-memory
    # buffer, then parse that buffer back as a GenBank record
    with io.StringIO() as gb_buffer:
        DNA.write(source_record, format="genbank", file=gb_buffer)
        gb_buffer.seek(0)
        gb_record = DNA.read(gb_buffer, format="genbank")

    # serialise the GenBank-derived object as EMBL and keep the raw text
    with io.StringIO() as embl_buffer:
        DNA.write(gb_record, format="embl", file=embl_buffer)
        obs = embl_buffer.getvalue()

    # the regenerated EMBL text must equal the original file content
    with open(self.single_rna_simple_fp) as source_fh:
        exp = source_fh.read()

    self.assertEqual(exp, obs)
def reformat_egid(genbank_fp, output_dir):
    """ Reformat input genome to the formats accepted by EGID.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    Input to EGID are five obsolete NCBI standard files: gbk, fna, faa, ffn
    and ptt. They are written to `output_dir` as id.gbk, id.fna, id.faa,
    id.ffn and id.ptt.

    The faa, ffn and ptt files are opened in one ``with`` statement so that
    all three handles are closed (and flushed) even if writing a record
    raises part-way through.
    """
    (gb, genes) = _merge_genbank_seqs(genbank_fp)
    DNA.write(gb, join(output_dir, 'id.fna'), format='fasta')
    DNA.write(gb, join(output_dir, 'id.gbk'), format='genbank')
    nucl_seq = str(gb)
    # a ptt file contains the following columns:
    fields = ('Location', 'Strand', 'Length', 'PID', 'Gene', 'Synonym',
              'Code', 'COG', 'Product')
    with open(join(output_dir, 'id.faa'), 'w') as faa_f, \
            open(join(output_dir, 'id.ffn'), 'w') as ffn_f, \
            open(join(output_dir, 'id.ptt'), 'w') as ptt_f:
        ptt_f.write('locus001\n' + str(len(genes)) + ' proteins\n')
        ptt_f.write('\t'.join(fields) + '\n')
        # genes are emitted in order of their start coordinate; gid is an
        # incremental integer (starting at 1) assigned to the current gene
        ordered_genes = sorted(genes.items(), key=lambda x: x[1][1])
        for gid, (gene, info) in enumerate(ordered_genes, start=1):
            # each value of `genes` is indexed as: [0] the sequence written
            # to the faa file, [1] start, [2] end, [3] strand
            gene_seq, start, end, strand = info[0], info[1], info[2], info[3]
            faa_f.write('>' + gene + '\n' + gene_seq + '\n')
            ptt_f.write('\t'.join((
                str(start) + '..' + str(end),  # Location
                strand,                        # Strand
                str(len(gene_seq)),            # Length
                str(gid),                      # PID
                '-',                           # Gene
                'gene' + str(gid),             # Synonym
                '-', '-', '-',                 # Code, COG, Product
            )) + '\n')
            # start is 1-based and end is inclusive, hence the start-1 slice
            gene_nucl = nucl_seq[start-1:end]
            if strand == '+':  # positive strand
                ffn_f.write('>locus001:' + str(start) + '-' + str(end) +
                            '\n' + gene_nucl + '\n')
            else:  # negative strand (reverse complement)
                rc_seq = str(DNA(gene_nucl).reverse_complement())
                ffn_f.write('>locus001:c' + str(end) + '-' + str(start) +
                            '\n' + rc_seq + '\n')
def reformat_genemark(genbank_fp, output_dir):
    """ Reformat input genome to the formats accepted by GeneMark.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    GeneMark's acceptable input file format is FASTA (genome sequence).
    Besides the FASTA file (id.fna), a GenBank copy of the merged genome
    (id.gbk) is also written to `output_dir`.
    """
    # only the first item returned by the merge helper (the genome
    # sequence) is needed here
    merged = _merge_genbank_seqs(genbank_fp)
    genome = merged[0]
    targets = (('id.fna', 'fasta'), ('id.gbk', 'genbank'))
    for file_name, file_format in targets:
        DNA.write(genome, join(output_dir, file_name), format=file_format)
def test_gb_to_embl(self):
    """Round-trip a GenBank file through EMBL and back to GenBank.

    The chain gb file -> gb object -> embl file -> embl object -> gb file
    is expected to reproduce the text of the starting GenBank file.
    """
    source_record = DNA.read(self.genbank_fp, format="genbank")

    # serialise the GenBank-derived object as EMBL into an in-memory
    # buffer and parse it back. EMBL can't deal with genbank version
    # (ie M14399.1  GI:145229)
    with io.StringIO() as embl_buffer:
        DNA.write(source_record, format="embl", file=embl_buffer)
        embl_buffer.seek(0)
        embl_record = DNA.read(embl_buffer, format="embl")

    # serialise the EMBL-derived object as GenBank and keep the raw text
    with io.StringIO() as gb_buffer:
        DNA.write(embl_record, format="genbank", file=gb_buffer)
        obs = gb_buffer.getvalue()

    # the regenerated GenBank text must equal the original file content
    with open(self.genbank_fp) as source_fh:
        exp = source_fh.read()

    self.assertEqual(exp, obs)