def prep_genbank_files(templateFile, fasta_files, annotDir,
                       master_source_table=None, comment=None, sequencing_tech=None,
                       coverage_table=None, biosample_map=None):
    ''' Prepare genbank submission files.  Requires .fasta and .tbl files as input,
        as well as numerous other metadata files for the submission.  Creates a
        directory full of files (.sqn in particular) that can be sent to GenBank.

        Every sequence record in each input fasta is split out and processed
        under the name "<fasta basename>-<1-based record index>"; a .fsa and a
        .cmt file (and, if requested, a .src file) are created per record.

        Arguments:
            templateFile: passed through to tbl2asn.
            fasta_files: iterable of fasta paths; each must end in ".fasta".
            annotDir: output directory, created if needed.  The .tbl files are
                presumably expected to be here already, since tbl2asn matches
                files by name prefix -- TODO confirm against the caller.
            master_source_table: optional file copied to "<name>.src" for
                every record.
            comment: passed through to tbl2asn.
            sequencing_tech: passed to the structured comment as seq_tech.
            coverage_table: optional tab file with 'sample' and
                'aln2self_cov_median' columns.
            biosample_map: optional tab file with 'sample' and 'BioSample'
                columns.

        Coverage and BioSample values are looked up by the per-record name
        first and then by the fasta file's base name, so tables keyed by
        either convention are honored.

        Raises:
            Exception: if an input file name does not end in ".fasta".  Files
                listed before the offending one have already been processed.
    '''

    # get coverage map
    coverage = {}
    if coverage_table:
        for row in util.file.read_tabfile_dict(coverage_table):
            if row.get('sample') and row.get('aln2self_cov_median'):
                coverage[row['sample']] = row['aln2self_cov_median']

    # get biosample id map
    biosample = {}
    if biosample_map:
        for row in util.file.read_tabfile_dict(biosample_map):
            if row.get('sample') and row.get('BioSample'):
                biosample[row['sample']] = row['BioSample']

    # make output directory
    util.file.mkdir_p(annotDir)
    for fn in fasta_files:
        if not fn.endswith('.fasta'):
            raise Exception("fasta files must end in .fasta")
        sample_base = os.path.basename(fn)[:-6]

        # metadata recorded against the whole sample; used as a fallback for
        # segments that have no entry of their own
        base_biosample = biosample.get(sample_base)
        base_coverage = coverage.get(sample_base)

        # for each segment/chromosome in the fasta file,
        # create a separate new *.fsa file
        with open(fn, "r") as inf:
            asm_fasta = Bio.SeqIO.parse(inf, 'fasta')
            for idx, seq_obj in enumerate(asm_fasta):
                sample = sample_base + "-" + str(idx + 1)

                # write the segment to a temp .fasta file
                # in the same dir so fasta2fsa functions as expected
                # (note: opening with "w" overwrites any existing file of
                # this name in the input directory)
                out_file_name = os.path.join(os.path.dirname(fn), sample + ".fasta")
                try:
                    with open(out_file_name, "w") as out_chr_fasta:
                        Bio.SeqIO.write(seq_obj, out_chr_fasta, "fasta")

                    # make .fsa files
                    fasta2fsa(out_file_name, annotDir,
                              biosample=biosample.get(sample, base_biosample))
                finally:
                    # remove the temp .fasta file even when writing or
                    # converting fails, so nothing is left in the input dir
                    if os.path.exists(out_file_name):
                        os.unlink(out_file_name)

                # make .src files
                if master_source_table:
                    shutil.copy(master_source_table, os.path.join(annotDir, sample + '.src'))

                # make .cmt files
                make_structured_comment_file(os.path.join(annotDir, sample + '.cmt'),
                                             name=sample,
                                             coverage=coverage.get(sample, base_coverage),
                                             seq_tech=sequencing_tech)

    # run tbl2asn (relies on filenames matching by prefix)
    tbl2asn = tools.tbl2asn.Tbl2AsnTool()
    tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True)
def prep_genbank_files(templateFile, fasta_files, annotDir,
                       master_source_table=None, comment=None, sequencing_tech=None,
                       coverage_table=None, biosample_map=None):
    ''' Build the set of files needed for a GenBank submission.

        Takes .fasta inputs (one sample per file) plus optional metadata
        tables and fills annotDir with per-sample .fsa, .src and .cmt files,
        then runs tbl2asn over that directory to produce the files (.sqn in
        particular) that get sent to GenBank.

        Arguments:
            templateFile: handed to tbl2asn unchanged.
            fasta_files: iterable of paths, each required to end in ".fasta";
                the sample name is the file's base name without that suffix.
            annotDir: output directory (created if missing).
            master_source_table: if given, copied to "<sample>.src".
            comment: handed to tbl2asn unchanged.
            sequencing_tech: forwarded to the structured comment as seq_tech.
            coverage_table: optional tab file; the 'aln2self_cov_median'
                column is indexed by the 'sample' column.
            biosample_map: optional tab file; the 'BioSample' column is
                indexed by the 'sample' column.

        Raises:
            Exception: when a fasta path lacks the ".fasta" suffix (inputs
                listed before it have already been processed by then).
    '''

    def _column_by_sample(table_path, column):
        # Map 'sample' -> column for rows where both are non-empty; later
        # rows win on duplicate sample names.  No table means an empty map.
        if not table_path:
            return {}
        return {
            rec['sample']: rec[column]
            for rec in util.file.read_tabfile_dict(table_path)
            if rec.get('sample') and rec.get(column)
        }

    cov_by_sample = _column_by_sample(coverage_table, 'aln2self_cov_median')
    biosample_by_sample = _column_by_sample(biosample_map, 'BioSample')

    # output directory must exist before any per-sample file is written
    util.file.mkdir_p(annotDir)

    suffix = '.fasta'
    for fasta_path in fasta_files:
        if not fasta_path.endswith(suffix):
            raise Exception("fasta files must end in .fasta")
        sample_name = os.path.basename(fasta_path)[:-len(suffix)]

        # .fsa: sequence file, tagged with the sample's BioSample id if known
        fasta2fsa(fasta_path, annotDir, biosample=biosample_by_sample.get(sample_name))

        # .src: every sample gets its own copy of the master source table
        if master_source_table:
            src_path = os.path.join(annotDir, sample_name + '.src')
            shutil.copy(master_source_table, src_path)

        # .cmt: structured comment with coverage and sequencing technology
        cmt_path = os.path.join(annotDir, sample_name + '.cmt')
        make_structured_comment_file(cmt_path,
                                     name=sample_name,
                                     coverage=cov_by_sample.get(sample_name),
                                     seq_tech=sequencing_tech)

    # finally let tbl2asn process everything collected in annotDir
    tools.tbl2asn.Tbl2AsnTool().execute(templateFile, annotDir,
                                        comment=comment, per_genome_comment=True)