def parse_annotation_line(line, genes, molecules):
    """
    Parse one row of a 10-column, tab-delimited annotation file and attach the
    resulting gene / mRNA / CDS objects (with functional annotation) to ``genes``.

    Columns read (0-based):
        0  transcript ID (also used to derive the gene ID)
        1  CDS ID
        3  fallback gene product name (used only when column 5 is empty)
        5  preferred gene product name
        8  comma-separated GO terms
        9  comma-separated EC numbers

    Args:
        line: Raw text line from the annotation file.
        genes: dict of gene objects keyed by gene ID.  Updated in place.
        molecules: Container of known molecule IDs; the transcript ID must be
            a member of it.

    Returns:
        False if the row does not have exactly 10 columns (a warning is
        printed and nothing is modified), otherwise None.

    Raises:
        Exception: if the transcript ID is not present in ``molecules``.
    """
    cols = line.split("\t")

    if len(cols) != 10:
        print("WARNING: Ignoring the following line because I expected 10 columns:\n{0}".format(line))
        return False

    cols[9] = cols[9].rstrip()

    transcript_id = cols[0]
    CDS_id = cols[1]
    gene_id = get_gene_id_from_transcript(transcript_id)

    # str.split() never yields None, so a missing value shows up as an empty
    #  string.  Fall back to column 3 in that case.
    gene_product_name = cols[5] if cols[5] else cols[3]

    if transcript_id not in molecules:
        raise Exception("ERROR: found molecule {0} in referenced in annotation tab file but not in genomic_fasta file".format(transcript_id))

    if gene_id in genes:
        gene = genes[gene_id]
    else:
        gene = things.Gene(id=gene_id)
        genes[gene_id] = gene

    mRNA = things.mRNA(id=transcript_id)
    gene.add_mRNA(mRNA)

    # NOTE: this local must not be called 'annotation'.  Doing so makes the name
    #  local to the function and hides the 'annotation' module, which raises
    #  UnboundLocalError on the very line that tries to use the module.
    func_annot = annotation.FunctionalAnnotation(product_name=gene_product_name)

    # only values containing a digit followed by at least one more character are kept
    ec_num_pattern = re.compile(r'\d+.')
    for ec_num in cols[9].split(','):
        if ec_num_pattern.search(ec_num):
            func_annot.add_ec_number(annotation.ECAnnotation(number=ec_num))

    # only values containing at least one digit are kept
    go_pattern = re.compile(r'(\d+)')
    for go_term in cols[8].split(','):
        if go_pattern.search(go_term):
            func_annot.add_go_annotation(annotation.GOAnnotation(go_id=go_term))

    CDS = things.CDS(id=CDS_id, annotation=func_annot)
    mRNA.add_CDS(CDS)
def parse_gff3(gff3_file):
    """
    Read the gene and mRNA rows of a GFF3 file into feature objects.

    Rows that do not have exactly 9 tab-separated columns (comments, blank
    lines, FASTA, ...) are skipped, as are feature types other than 'gene'
    and 'mRNA'.  Coordinates are converted from 1-based inclusive to
    0-based interbase (fmin = start - 1, fmax = end).

    Args:
        gff3_file: Path to the GFF3 file to read.

    Returns:
        A tuple ``(assemblies, genes)``: a dict of assembly objects keyed by
        molecule ID, and a dict of gene objects keyed by their ID attribute.

    Raises:
        KeyError: if an mRNA row references a Parent gene not yet seen.
    """
    assemblies = dict()
    genes = dict()

    with open(gff3_file) as gff3_fh:
        for line in gff3_fh:
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]
            rfmin = int(cols[3]) - 1
            rfmax = int(cols[4])
            feat_id = column_9_value(cols[8], 'ID')

            # The strand used to be computed into a separate variable and then
            #  discarded, so every feature was located with strand=None.
            if cols[6] == '-':
                rstrand = -1
            elif cols[6] == '+':
                rstrand = 1
            else:
                rstrand = 0

            if cols[2] == 'gene':
                gene = things.Gene(id=feat_id)
                gene.locate_on(assembly=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                genes[feat_id] = gene

            elif cols[2] == 'mRNA':
                mRNA = things.mRNA(id=feat_id)
                mRNA.locate_on(assembly=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                parent_id = column_9_value(cols[8], 'Parent')
                genes[parent_id].add_mRNA(mRNA)

    return (assemblies, genes)
def main():
    """
    Command-line entry point: convert Prodigal GFF output into GFF3.

    Every CDS row of the input becomes a complete gene model (gene, mRNA,
    CDS and exon) which is written to the output file immediately.  Comment
    lines, rows without 9 columns, and non-CDS rows are ignored.

    Raises:
        Exception: if two CDS rows would produce the same mRNA ID.
    """
    parser = argparse.ArgumentParser(
        description='Convert GFF output from Prodigal into GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Prodigal')
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()

    ## Used for tracking the exon count for each mRNA (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.input) as fin, open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_id = gff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            if feat_type != "CDS":
                continue

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            # gene
            gene = things.Gene(id=feat_id)
            gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

            # mRNA
            mRNA_id = feat_id + '.t1'

            # The counter is keyed by the mRNA ID, so the duplicate check has to
            #  use that same key to ever be able to fire.
            if mRNA_id in exon_count_by_mRNA:
                raise Exception("ERROR: two different mRNAs found with same ID: {0}".format(mRNA_id))

            exon_count_by_mRNA[mRNA_id] = 0

            mRNA = things.mRNA(id=mRNA_id, parent=gene)
            mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
            gene.add_mRNA(mRNA)

            # CDS
            CDS = things.CDS(id=mRNA_id + '.cds', parent=mRNA)
            CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]))
            mRNA.add_CDS(CDS)

            # exons weren't explicitly defined in the input file, so we need to derive new IDs for them
            exon_count_by_mRNA[mRNA_id] += 1
            exon_id = "{0}.exon{1}".format(mRNA_id, exon_count_by_mRNA[mRNA_id])
            exon = things.Exon(id=exon_id, parent=mRNA)
            exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
            mRNA.add_exon(exon)

            gene.print_as(fh=fout, source='Prodigal_v2.6.3', format='gff3')
def _emit_cegma_gene(gene, mRNA, assembly, fmin, fmax, strand, fh):
    """Locate a finished gene and its mRNA on the assembly, then write the gene as GFF3."""
    gene.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
    mRNA.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
    gene.print_as(fh=fh, source='cegma', format='gff3')


def main():
    """
    Command-line entry point: convert CEGMA GFF output into spec-legal GFF3.

    A row of type 'Single' or 'First' starts a new gene/mRNA pair.  Every
    row of type 'First', 'Internal', 'Terminal' or 'Single' adds one CDS and
    one exon to the current mRNA and widens the gene's extent if needed.
    A gene is written once the next gene starts, or at end of input.
    """
    parser = argparse.ArgumentParser(description='Converts CEGMA GFF output to spec-legal GFF3')

    # output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to parse' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_mRNA = None

    current_gene_fmin = None
    current_gene_fmax = None
    current_gene_strand = None

    next_id_nums = {'gene': 1, 'mRNA': 1, 'CDS': 1, 'exon': 1}
    exon_column_types = ['First', 'Internal', 'Terminal', 'Single']

    with open(args.input_file, 'r') as fin, open(args.output_file, 'w') as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith('#'):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_fmin = int(cols[3]) - 1
            feat_fmax = int(cols[4])

            if feat_type == 'Single' or feat_type == 'First':
                # If there's an existing gene already, print it out.  This has to happen
                #  before current_assembly is switched to the new row's molecule.
                if current_gene is not None:
                    _emit_cegma_gene(current_gene, current_mRNA, current_assembly,
                                     current_gene_fmin, current_gene_fmax, current_gene_strand, fout)

                # initialize this assembly if we haven't seen it yet
                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]

                feat_id = "cegma.gene.{0}".format(next_id_nums['gene'])
                next_id_nums['gene'] += 1
                current_gene = things.Gene(id=feat_id)
                current_gene_strand = cols[6]
                current_gene_fmin = feat_fmin
                current_gene_fmax = feat_fmax

                mRNA_id = "cegma.mRNA.{0}".format(next_id_nums['mRNA'])
                next_id_nums['mRNA'] += 1
                current_mRNA = things.mRNA(id=mRNA_id, parent=current_gene)
                current_gene.add_mRNA(current_mRNA)

            # CEGMA versions < 2.5 had two rows for each exon.  We don't need to process both of them, so
            #  we skip the Exon one because its phase information is incorrect.
            if feat_type in exon_column_types:
                CDS_id = "cegma.CDS.{0}".format(next_id_nums['CDS'])
                next_id_nums['CDS'] += 1
                CDS = things.CDS(id=CDS_id, parent=current_mRNA)
                CDS.locate_on(target=current_assembly, fmin=feat_fmin, fmax=feat_fmax, strand=cols[6], phase=cols[7])
                current_mRNA.add_CDS(CDS)

                exon_id = "cegma.exon.{0}".format(next_id_nums['exon'])
                next_id_nums['exon'] += 1
                exon = things.Exon(id=exon_id, parent=current_mRNA)
                exon.locate_on(target=current_assembly, fmin=feat_fmin, fmax=feat_fmax, strand=cols[6])
                current_mRNA.add_exon(exon)

                # the gene spans the union of all of its exons
                if feat_fmin < current_gene_fmin:
                    current_gene_fmin = feat_fmin

                if feat_fmax > current_gene_fmax:
                    current_gene_fmax = feat_fmax

        # don't forget the last gene
        if current_gene is not None:
            _emit_cegma_gene(current_gene, current_mRNA, current_assembly,
                             current_gene_fmin, current_gene_fmax, current_gene_strand, fout)
def _write_genbank_as_gff3(gbk_fh, ofh, include_fasta):
    """Stream the GenBank records in gbk_fh to ofh as GFF3, optionally followed by a ##FASTA section."""
    ofh.write("##gff-version 3\n")

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False

    features_skipped_count = 0

    # each gb_record is a SeqRecord object
    for gb_record in SeqIO.parse(gbk_fh, "genbank"):
        mol_id = gb_record.name

        if mol_id not in assemblies:
            assemblies[mol_id] = things.Assembly(id=mol_id)

        if len(str(gb_record.seq)) > 0:
            seqs_pending_writes = True
            assemblies[mol_id].residues = str(gb_record.seq)
            assemblies[mol_id].length = len(str(gb_record.seq))

        current_assembly = assemblies[mol_id]

        # each feat is a SeqFeature object
        for feat in gb_record.features:
            fmin = int(feat.location.start)
            fmax = int(feat.location.end)

            if feat.location.strand == 1:
                strand = '+'
            elif feat.location.strand == -1:
                strand = '-'
            else:
                raise Exception("ERROR: unstranded feature encountered: {0}".format(feat))

            if feat.type == 'source':
                continue

            if feat.type == 'gene':
                # print the previous gene (if there is one)
                if current_gene is not None:
                    gene.print_as(fh=ofh, source='GenBank', format='gff3')

                locus_tag = feat.qualifiers['locus_tag'][0]
                gene = things.Gene(id=locus_tag, locus_tag=locus_tag)
                gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                current_gene = gene
                current_RNA = None

            elif feat.type == 'mRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.mRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )

                mRNA = things.mRNA(id=feat_id, parent=current_gene, locus_tag=locus_tag)
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_mRNA(mRNA)
                current_RNA = mRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'tRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.tRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )

                tRNA = things.tRNA(id=feat_id, parent=current_gene)
                tRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_tRNA(tRNA)
                current_RNA = tRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'rRNA':
                locus_tag = feat.qualifiers['locus_tag'][0]
                rna_count_by_gene[locus_tag] += 1
                feat_id = "{0}.rRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )

                rRNA = things.rRNA(id=feat_id, parent=current_gene)
                rRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_rRNA(rRNA)
                current_RNA = rRNA

                if feat_id in exon_count_by_RNA:
                    raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_RNA[feat_id] = 0

            elif feat.type == 'CDS':
                locus_tag = feat.qualifiers['locus_tag'][0]
                # If processing a prokaryotic GBK, we'll encounter CDS before mRNA, so we have to
                #  manually make one
                # NOTE(review): the annotation/polypeptide steps below are kept inside this
                #  branch (they reference the mRNA created here) - confirm this nesting
                #  against the upstream script.
                if current_RNA is None:
                    feat_id = "{0}.mRNA.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                    mRNA = things.mRNA(id=feat_id, parent=current_gene)
                    mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    gene.add_mRNA(mRNA)
                    current_RNA = mRNA

                    if 'product' in feat.qualifiers:
                        product = feat.qualifiers['product'][0]
                    else:
                        product = None

                    if 'gene' in feat.qualifiers:
                        gene_symbol = feat.qualifiers['gene'][0]
                    else:
                        gene_symbol = None

                    annot = annotation.FunctionalAnnotation(product_name=product, gene_symbol=gene_symbol)

                    if 'db_xref' in feat.qualifiers:
                        for dbxref in feat.qualifiers['db_xref']:
                            annot.add_dbxref(dbxref)

                    polypeptide_id = "{0}.polypeptide.{1}".format( locus_tag, rna_count_by_gene[locus_tag] )
                    polypeptide = things.Polypeptide(id=polypeptide_id, parent=mRNA, annotation=annot)
                    mRNA.add_polypeptide(polypeptide)

                exon_count_by_RNA[current_RNA.id] += 1
                # every segment of a multi-part CDS is written with this same ID
                cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                current_CDS_phase = 0

                for loc in feat.location.parts:
                    subfmin = int(loc.start)
                    subfmax = int(loc.end)

                    CDS = things.CDS(id=cds_id, parent=current_RNA)
                    CDS.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand, phase=current_CDS_phase )
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    # 0 + 6 = 0     TTGCAT
                    # 0 + 7 = 2     TTGCATG
                    # 1 + 6 = 1     TTGCAT
                    # 2 + 7 = 1     TTGCATG
                    # general: 3 - ((length - previous phase) % 3)
                    current_CDS_phase = 3 - (((subfmax - subfmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    exon = things.Exon(id=exon_id, parent=current_RNA)
                    exon.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand )
                    current_RNA.add_exon(exon)
                    exon_count_by_RNA[current_RNA.id] += 1

            else:
                print("WARNING: The following feature was skipped:\n{0}".format(feat))
                features_skipped_count += 1

    # don't forget to do the last gene, if there were any
    if current_gene is not None:
        gene.print_as(fh=ofh, source='GenBank', format='gff3')

    if include_fasta is True:
        if seqs_pending_writes is True:
            ofh.write("##FASTA\n")
            for assembly_id in assemblies:
                ofh.write(">{0}\n".format(assembly_id))
                ofh.write("{0}\n".format(utils.wrapped_fasta(assemblies[assembly_id].residues)))

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(features_skipped_count))


def main():
    """
    Command-line entry point: convert a GenBank flat file to GFF3.

    Output goes to --output_file when given, otherwise to STDOUT.  Unless
    --no_fasta is passed, a ##FASTA section with the record sequences is
    appended when at least one record carries sequence.

    Raises:
        Exception: on an unstranded feature, or when two RNA features
            resolve to the same generated ID.
    """
    parser = argparse.ArgumentParser( description='Convert GenBank flat files to GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GBK file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('--with_fasta', dest='fasta', action='store_true', help='Include the FASTA section with genomic sequence at end of file. (default)' )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false' )
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        with open(args.input_file, "r") as gbk_fh:
            _write_genbank_as_gff3(gbk_fh, ofh, args.fasta)
    finally:
        # only close what we opened ourselves; STDOUT must stay usable
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """
    Command-line entry point: convert Augustus output (native GTF, or the
    GFF produced with --gff) into GFF3.

    Only 'gene', 'transcript' and 'CDS' rows are used.  GTF-style 9th
    columns are first rewritten into GFF3 attribute form.  One exon is
    derived per CDS row.  Comment lines are collected and written, followed
    by the gene itself, when the gene's '# end gene' comment is reached.

    Raises:
        Exception: on a GTF 9th column that does not match the expected
            pattern, a duplicate transcript ID, or a CDS whose parent
            transcript has not been seen.
    """
    parser = argparse.ArgumentParser( description='Convert native (GTF) or GFF output from Augustus into GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None

    gene = None
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    ## The output format is GTF by default and (mostly) GFF if the --gff option is used.
    #   If GTF is detected, let's start by transforming the 9th column into GFF so the
    #   libraries can use it
    #   g1  ->  ID=g1
    #   g1.t1  ->  ID=g1.t1;Parent=g1
    #   transcript_id "g1.t1"; gene_id "g1";  ->  ID=g1.t1.cds;Parent=g1.t1
    gtf_gene_re = re.compile(r'(g\d+)')
    gtf_transcript_re = re.compile(r'((g\d+).t\d+)')
    gtf_cds_re = re.compile(r'transcript_id "(g\d+.t\d+)"; gene_id "g\d+";')

    with open(args.input) as fin, open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write( "".join(current_gene_comment_lines) )
                    gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ['gene', 'transcript', 'CDS']:
                continue

            # the input can be in GTF or GFF.  We need to reformat the 9th column for the GTF entries
            if not cols[8].startswith('ID') and not cols[8].startswith('Parent'):
                if feat_type == 'gene':
                    m_gene = gtf_gene_re.match(cols[8])
                    if m_gene:
                        cols[8] = "ID={0}".format(m_gene.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but gene row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'transcript':
                    m_transcript = gtf_transcript_re.match(cols[8])
                    if m_transcript:
                        cols[8] = "ID={0};Parent={1}".format(m_transcript.group(1), m_transcript.group(2))
                    else:
                        raise Exception("ERROR: GTF detected but transcript row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'CDS':
                    m_CDS = gtf_cds_re.match(cols[8])
                    if m_CDS:
                        cols[8] = "ID={0}.cds;Parent={0}".format(m_CDS.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but CDS row has bad 9th column format: {0}".format(cols[8]))

            feat_id = gff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            if feat_type == "gene":
                gene = things.Gene(id=feat_id)
                gene.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )

            elif feat_type == "transcript":
                mRNA = things.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception( "ERROR: two different mRNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = gff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                # Attach to the transcript named by Parent, which is not necessarily
                #  the transcript that was read most recently.
                parent_mRNA = mRNAs[parent_id]

                CDS = things.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6], phase=int(cols[7]) )
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = things.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )
                parent_mRNA.add_exon(exon)
def _write_cufflinks_as_gff3(gtf_fh, ofh, export_mode):
    """Convert the Cufflinks GTF rows in gtf_fh to GFF3 on ofh, as gene models or as cDNA_match features."""
    ofh.write("##gff-version 3\n")

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None
    current_match = None
    current_CDS_phase = 0

    exon_count_by_RNA = defaultdict(int)

    for line in gtf_fh:
        cols = line.split("\t")

        if len(cols) != 9:
            print("SKIPPING: {0}".format(line))
            continue

        mol_id = cols[0]

        if mol_id not in assemblies:
            assemblies[mol_id] = things.Assembly(id=mol_id)

        current_assembly = assemblies[mol_id]
        ftype = cols[2]
        fmin = int(cols[3]) - 1
        fmax = int(cols[4])
        strand = cols[6]
        col9 = cols[8]

        # this makes it look like GFF column 9 so I can use biocodeutils.column_9_value(str, key)
        col9 = col9.replace(' "', '="')
        gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
        transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

        if ftype == 'transcript':
            if export_mode == 'model':
                # a change of gene_id closes the previous gene
                if current_gene is not None and current_gene.id != gene_id:
                    current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

                if current_gene is None or current_gene.id != gene_id:
                    current_gene = things.Gene(id=gene_id)
                    current_gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                current_gene.add_mRNA(mRNA)
                current_RNA = mRNA
                exon_count_by_RNA[transcript_id] = 0
                current_CDS_phase = 0

            elif export_mode == 'cDNA_match':
                if current_match is not None and current_match.id != transcript_id:
                    current_match.print_as( fh=ofh, source='Cufflinks', format='gff3' )

                current_match = things.Match(id=transcript_id, subclass='cDNA_match', length=fmax - fmin)
                current_match.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

        elif ftype == 'exon':
            exon_number = gff.column_9_value(col9, 'exon_number').replace('"', '')

            if export_mode == 'model':
                exon_count_by_RNA[transcript_id] += 1

                cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                CDS = things.CDS(id=cds_id, parent=current_RNA)
                CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=current_CDS_phase )
                current_RNA.add_CDS(CDS)

                # calculate the starting phase for the next CDS feature (in case there is one)
                current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                if current_CDS_phase == 3:
                    current_CDS_phase = 0

                exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                exon = things.Exon(id=exon_id, parent=current_RNA)
                exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                current_RNA.add_exon(exon)

            elif export_mode == 'cDNA_match':
                mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                mp = things.MatchPart(id=mp_id, parent=current_match, length=fmax - fmin)
                mp.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                current_match.add_part(mp)

    # don't forget to do the last gene, if there were any
    if export_mode == 'model':
        if current_gene is not None:
            # the source must match the one used for every other gene in this file
            current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

    elif export_mode == 'cDNA_match':
        if current_match is not None:
            current_match.print_as( fh=ofh, source='Cufflinks', format='gff3' )


def main():
    """
    Command-line entry point: convert Cufflinks GTF output to GFF3.

    In 'model' export mode each transcript row becomes an mRNA under a gene
    and each exon row becomes a CDS + exon pair.  In 'cDNA_match' mode each
    transcript becomes a match and each exon a match_part.  Output goes to
    --output_file when given, otherwise to STDOUT.

    Raises:
        Exception: if --export_mode is not 'model' or 'cDNA_match'.
    """
    parser = argparse.ArgumentParser( description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GTF file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('-e', '--export_mode', type=str, required=False, default='model', help='Export mode for results (model or cDNA_match)' )
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception("ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'")

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        with open(args.input_file, "r") as gtf_fh:
            _write_cufflinks_as_gff3(gtf_fh, ofh, args.export_mode)
    finally:
        # only close what we opened ourselves; STDOUT must stay usable
        if ofh is not sys.stdout:
            ofh.close()
def get_gff3_features(gff3_file, assemblies=None):
    '''
    Parses the passed GFF3 file and returns two dicts, loaded with biocode.biothings objects:

    1. The first dict are the Assembly objects, keyed on assembly ID.  Each Assembly has all of the
       children populated, so you can fully recover gene, RNA, exon and CDS features iterating on
       the assembly.
    2. The second dict is a flat structure of all the descendent feature objects of the Assemblies
       keyed by the feature IDs.

    If the file has an embedded ##FASTA section, sequence for any record whose ID matches a
    known assembly is appended to that assembly's residues (and its length is updated).
    Records for other IDs are ignored.

    Args:
        gff3_file: Path to the GFF3 file to parse.
        assemblies: Optional dict of existing Assembly objects keyed by ID.  It is
            updated in place; a new dict is created if omitted.

    Returns:
        The tuple (assemblies, features).

    Raises:
        Exception: if FASTA sequence appears before any FASTA header, if a feature's
            coordinates are reversed, if a feature lists multiple parents, or if a
            feature references a Parent which hasn't been defined yet.

    See the documentation for each feature type in biocode.biothings for more info
    '''
    if assemblies is None:
        assemblies = dict()

    features = dict()

    # these are related to parsing any embedded FASTA
    in_fasta_section = False
    is_assembly_fasta = False
    current_fasta_id = None
    lnum = 0
    FASTA_RE = re.compile(r'^\#\#FASTA\s*$')
    FASTA_HEADER_RE = re.compile(r'>(\S+)\s*(.*)')

    with open(gff3_file) as gff3_fh:
        for line in gff3_fh:
            lnum = lnum + 1

            if in_fasta_section:
                m = FASTA_HEADER_RE.search(line)
                if m:
                    current_fasta_id = m.group(1)
                    is_assembly_fasta = current_fasta_id in assemblies
                else:
                    if current_fasta_id is None:
                        if len(line.rstrip()) > 0:
                            raise Exception("FASTA parse error - sequence appears without preceding fasta id at line " + str(lnum))

                    if is_assembly_fasta:
                        # must be a sequence line for an assembly
                        # python 2.6+ makes string concatenation amortized O(n)
                        #  http://stackoverflow.com/a/4435752/1368079
                        assemblies[current_fasta_id].residues += str(line.rstrip())
                        assemblies[current_fasta_id].length = len( assemblies[current_fasta_id].residues )

                continue

            elif FASTA_RE.match(line):
                # all data to the end of the file must be FASTA
                in_fasta_section = True
                continue

            # ignore all other comments
            if line.startswith('#'):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]

            # initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id, residues='')

            current_assembly = assemblies[mol_id]

            # convert 1-based inclusive GFF coordinates to 0-based interbase
            rfmin = int(cols[3]) - 1
            rfmax = int(cols[4])
            atts = column_9_dict(cols[8])
            feat_id = atts.get('ID')
            parent_id = atts.get('Parent')
            parent_feat = None

            # sanity check
            if rfmin > rfmax:
                raise Exception("ERROR: Coordinates in GFF for feature id {0} appear to be reversed and violate GFF3 specification: {1} > {2}".format(feat_id, cols[3], cols[4]))

            locus_tag = atts.get('locus_tag')

            # shared features are not yet supported
            if isinstance(parent_id, list):
                raise Exception("This line contains a shared feature with multiple parents.  This isn't yet supported:\n{0}".format(line))

            if parent_id is not None:
                if parent_id in features:
                    parent_feat = features[parent_id]
                else:
                    raise Exception("Error in GFF3: Parent {0} referenced by a child feature before it was defined".format(parent_id) )

            if cols[6] == '-':
                rstrand = -1
            elif cols[6] == '+':
                rstrand = 1
            else:
                rstrand = 0

            phase = cols[7]

            if cols[2] == 'gene':
                gene = things.Gene(id=feat_id, locus_tag=locus_tag)
                gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                features[feat_id] = gene
                current_assembly.add_gene(gene)

            elif cols[2] == 'mRNA':
                mRNA = things.mRNA(id=feat_id, parent=parent_feat, locus_tag=locus_tag)
                mRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                parent_feat.add_mRNA(mRNA)
                features[feat_id] = mRNA

            elif cols[2] == 'rRNA':
                rRNA = things.rRNA(id=feat_id, parent=parent_feat, locus_tag=locus_tag)
                rRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                parent_feat.add_rRNA(rRNA)
                rRNA.annotation = parse_annotation_from_column_9(cols[8])
                features[feat_id] = rRNA

            elif cols[2] == 'tRNA':
                tRNA = things.tRNA(id=feat_id, parent=parent_feat, locus_tag=locus_tag)
                tRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                parent_feat.add_tRNA(tRNA)
                tRNA.annotation = parse_annotation_from_column_9(cols[8])
                features[feat_id] = tRNA

            elif cols[2] == 'exon':
                exon = things.Exon(id=feat_id, parent=parent_feat)
                exon.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                parent_feat.add_exon(exon)
                features[feat_id] = exon

            elif cols[2] == 'CDS':
                # an unspecified phase ('.') is treated as phase 0
                if phase == '.':
                    phase = 0
                else:
                    phase = int(phase)

                CDS = things.CDS(id=feat_id, parent=parent_feat, phase=phase)
                CDS.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand, phase=phase)
                parent_feat.add_CDS(CDS)
                features[feat_id] = CDS

            elif cols[2] == 'polypeptide':
                polypeptide = things.Polypeptide(id=feat_id, parent=parent_feat)
                polypeptide.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                parent_feat.add_polypeptide(polypeptide)
                polypeptide.annotation = parse_annotation_from_column_9(cols[8])
                features[feat_id] = polypeptide

            elif cols[2] == 'five_prime_UTR':
                utr = things.FivePrimeUTR(id=feat_id, parent=parent_feat)
                utr.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                parent_feat.add_five_prime_UTR(utr)
                features[feat_id] = utr

            elif cols[2] == 'three_prime_UTR':
                utr = things.ThreePrimeUTR(id=feat_id, parent=parent_feat)
                utr.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand)
                parent_feat.add_three_prime_UTR(utr)
                features[feat_id] = utr

            else:
                sys.stderr.write( "Skipping feature {0} with type {1}\n".format(feat_id, cols[2]) )
                continue

            features[feat_id].length = rfmax - rfmin

    return (assemblies, features)
def main():
    """
    Command-line entry point: convert Metagenemark GFF output to GFF3.

    Every feature row (type 'gene' or 'CDS', depending on the Metagenemark
    version) becomes a gene with one mRNA, CDS, exon and polypeptide, with
    IDs built from --prefix and the row's gene_id number.  Comment lines are
    carried over to the output, except the original ##gff-version header.
    When --protein_fasta is given, the embedded '##Protein N' blocks are
    also written to that file as FASTA.

    Raises:
        Exception: on a feature type other than 'gene' or 'CDS', a 9th column
            without a gene_id, or a malformed '##Protein' line.
    """
    parser = argparse.ArgumentParser( description='Metagenemark GFF -> GFF3 conversion script')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Metagenemark' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='Prefix to use in ID generation')
    parser.add_argument('-pf', '--protein_fasta', type=str, required=False, help='Optional protein FASTA to be written')
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None

    # key like 2 = SRS014890.polypeptide.2
    polypeptide_lookup = dict()
    writing_protein = False

    current_gene_comment_lines = list()

    protein_re = re.compile(r"##Protein (\d+)")
    mol_id_re = re.compile(r'^(\S+) ')
    gene_id_re = re.compile(r'gene_id[ =](\d+)')

    # The protein FASTA is optional, so every write to it has to be guarded.
    protein_out = None
    if args.protein_fasta is not None:
        protein_out = open(args.protein_fasta, mode='wt', encoding='utf-8')

    try:
        with open(args.input) as fin, open(args.output, mode='wt', encoding='utf-8') as fout:
            fout.write("##gff-version 3\n")

            for line in fin:
                if line.startswith("#"):
                    if line.startswith("##FASTA"):
                        # extra '#' so this isn't read as a real FASTA directive in the output
                        current_gene_comment_lines.append("#{0}".format(line))

                    elif line.startswith("##end-Protein"):
                        writing_protein = False
                        current_gene_comment_lines.append(line)

                    # since we're already doing our own header, don't duplicate the old one
                    elif line.startswith("##gff-version"):
                        continue
                    else:
                        if line.startswith("##Protein "):
                            m = protein_re.match(line)
                            if m:
                                writing_protein = True
                                if protein_out is not None:
                                    protein_out.write(">{0}\n".format(polypeptide_lookup[m.group(1)]))
                            else:
                                raise Exception("ERROR: Expected line to match: ##Protein N")
                        elif writing_protein:
                            # strip the leading '##' from the sequence line
                            if protein_out is not None:
                                protein_out.write(line[2:])

                        current_gene_comment_lines.append(line)

                else:
                    cols = line.split("\t")

                    if len(cols) != 9:
                        continue

                    # keep only the part of the molecule ID before the first space
                    mol_id = cols[0]
                    mol_id_m = mol_id_re.match(mol_id)

                    if mol_id_m:
                        mol_id = mol_id_m.group(1)

                    feat_type = cols[2]

                    ## we expect only gene types here
                    if feat_type not in ['gene', 'CDS']:
                        raise Exception("ERROR: expected only 'gene' or 'CDS' feature types as input (depending on metagenemark version).")

                    m_gene = gene_id_re.match(cols[8])

                    if m_gene:
                        gene_num = m_gene.group(1)
                    else:
                        raise Exception("ERROR: expected 9th column to have gene ids like: gene_id 5")

                    ## initialize this assembly if we haven't seen it yet
                    if mol_id not in assemblies:
                        assemblies[mol_id] = things.Assembly(id=mol_id)

                    current_assembly = assemblies[mol_id]

                    fmin = int(cols[3]) - 1
                    fmax = int(cols[4])
                    strand = cols[6]

                    gene = things.Gene(id="{0}.gene.{1}".format(args.prefix, gene_num))
                    gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                    mRNA = things.mRNA(id="{0}.mRNA.{1}".format(args.prefix, gene_num), parent=gene.id)
                    mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    gene.add_mRNA(mRNA)

                    CDS = things.CDS(id="{0}.CDS.{1}".format(args.prefix, gene_num), parent=mRNA.id)
                    CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]) )
                    mRNA.add_CDS(CDS)

                    exon = things.Exon(id="{0}.exon.{1}".format(args.prefix, gene_num), parent=mRNA.id)
                    exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    mRNA.add_exon(exon)

                    polypeptide_id = "{0}.polypeptide.{1}".format(args.prefix, gene_num)
                    polypeptide = things.Polypeptide(id=polypeptide_id, parent=mRNA.id)
                    polypeptide.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    mRNA.add_polypeptide(polypeptide)
                    polypeptide_lookup[gene_num] = polypeptide_id

                    gene.print_as(fh=fout, source='GeneMark.hmm', format='gff3')
                    fout.write( "".join(current_gene_comment_lines) )
                    current_gene_comment_lines = list()

            # comments that follow the last feature row would otherwise be lost
            fout.write( "".join(current_gene_comment_lines) )
    finally:
        if protein_out is not None:
            protein_out.close()
def _print_pasa_gene(gene, mRNA, assembly, fmin, fmax, strand, fout, source):
    """Locate *gene* and *mRNA* on *assembly* over the given span, link them, and print the gene as GFF3."""
    mRNA.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
    gene.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
    gene.add_mRNA(mRNA)
    assembly.add_gene(gene)
    gene.print_as(fh=fout, source=source, format='gff3')


def main():
    """Convert a PASA GFF file of cDNA_match rows into gene models in GFF3.

    Consecutive rows sharing the same ``ID`` attribute are grouped into one
    gene with a single mRNA; each row contributes one CDS and one exon.  The
    gene/mRNA span is the minimum start and maximum end seen across its rows,
    and the strand is taken from the first row of the group.  A gene is
    printed to ``--output`` when the ID changes and once more at end of input.

    Raises:
        Exception: if a 9-column row has a feature type other than
            'cDNA_match'.
    """
    parser = argparse.ArgumentParser(description='Convert PASA GFF file to canonical gene models')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a GFF file created by PASA')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-s', '--source', type=str, required=False, default='PASA',
                        help='Value to use for the 2nd (source) column')
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None

    gene = None
    mRNA = None
    gene_fmin = None
    gene_fmax = None
    gene_strand = None

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        with open(args.input) as input_fh:
            for line in input_fh:
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                mol_id = cols[0]
                feat_type = cols[2]
                feat_id = gff.column_9_value(cols[8], 'ID')

                # we expect all columns to be cDNA_match
                if feat_type != 'cDNA_match':
                    raise Exception("ERROR: expected all columns to be of type 'cDNA_match' but found a {0}".format(feat_type))

                ## initialize this assembly if we haven't seen it yet
                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                row_fmin = int(cols[3]) - 1
                row_fmax = int(cols[4])
                row_strand = cols[6]

                if gene is None or feat_id != gene.id:
                    if gene is not None:
                        # finish the previous one first; current_assembly still
                        # refers to the molecule of the previous row here
                        _print_pasa_gene(gene, mRNA, current_assembly, gene_fmin, gene_fmax,
                                         gene_strand, fout, args.source)

                    # now start a new one
                    gene = things.Gene(id=feat_id)
                    mRNA = things.mRNA(id="{0}.mRNA".format(feat_id), parent=gene)
                    exon_count_by_mRNA[mRNA.id] = 0

                    gene_fmin = row_fmin
                    gene_fmax = row_fmax
                    gene_strand = row_strand

                current_assembly = assemblies[mol_id]

                # each row is a new CDS/exon for the current mRNA
                CDS = things.CDS(id="{0}.CDS".format(feat_id), parent=mRNA.id)

                # FIX THIS PHASE
                CDS.locate_on(target=current_assembly, fmin=row_fmin, fmax=row_fmax,
                              strand=row_strand, phase='.')
                mRNA.add_CDS(CDS)

                exon_count_by_mRNA[mRNA.id] += 1
                exon_id = "{0}.exon{1}".format(mRNA.id, exon_count_by_mRNA[mRNA.id])
                exon = things.Exon(id=exon_id, parent=mRNA.id)
                exon.locate_on(target=current_assembly, fmin=row_fmin, fmax=row_fmax,
                               strand=row_strand)
                mRNA.add_exon(exon)

                # widen the gene span so it covers every row seen for this ID
                if row_fmin < gene_fmin:
                    gene_fmin = row_fmin

                if row_fmax > gene_fmax:
                    gene_fmax = row_fmax

        # don't orphan the last one
        if gene is not None:
            _print_pasa_gene(gene, mRNA, current_assembly, gene_fmin, gene_fmax,
                             gene_strand, fout, args.source)
def parse_annotation_line(line, genes, molecules):
    """Parse one tab-delimited annotation row and attach it to *genes*.

    The row must have exactly 10 columns.  Column 1 is the transcript ID,
    column 2 the CDS ID, column 6 the product name (column 4 is used when
    column 6 is empty), column 9 a comma-separated list of GO terms and
    column 10 a comma-separated list of EC numbers.  A new mRNA carrying a
    CDS with the functional annotation is added to the gene derived from the
    transcript ID; the gene is created and stored in *genes* if not yet known.

    Args:
        line: The raw text line from the annotation file.
        genes: Dict of gene ID -> gene object; updated in place.
        molecules: Container of known transcript IDs (membership-tested only).

    Returns:
        False when the line does not have 10 columns (a warning is printed
        and nothing is modified); otherwise None.

    Raises:
        Exception: if the transcript ID is not present in *molecules*.
    """
    cols = line.split("\t")

    if len(cols) != 10:
        print("WARNING: Ignoring the following line because I expected 10 columns:\n{0}".format(line))
        return False

    cols[9] = cols[9].rstrip()

    transcript_id = cols[0]
    CDS_id = cols[1]
    gene_id = get_gene_id_from_transcript(transcript_id)

    # str.split() never yields None, so an absent value shows up as an empty string
    gene_product_name = cols[5] if cols[5] else cols[3]

    if transcript_id not in molecules:
        raise Exception("ERROR: found molecule {0} in referenced in annotation tab file but not in genomic_fasta file".format(transcript_id))

    if gene_id in genes:
        gene = genes[gene_id]
    else:
        gene = things.Gene(id=gene_id)
        genes[gene_id] = gene

    mRNA = things.mRNA(id=transcript_id)
    gene.add_mRNA(mRNA)

    # must not be named 'annotation': that would shadow the annotation module
    # for the whole function and make this very call fail
    func_annot = annotation.FunctionalAnnotation(product_name=gene_product_name)

    # entries without any digit (including the empty string) are ignored
    ec_num_pattern = re.compile(r'\d+.')
    for ec_num in cols[9].split(','):
        if ec_num_pattern.search(ec_num):
            func_annot.add_ec_number(annotation.ECAnnotation(number=ec_num))

    go_pattern = re.compile(r'(\d+)')
    for go_term in cols[8].split(','):
        if go_pattern.search(go_term):
            func_annot.add_go_annotation(annotation.GOAnnotation(go_id=go_term))

    CDS = things.CDS(id=CDS_id, annotation=func_annot)
    mRNA.add_CDS(CDS)
def main():
    """Convert Cufflinks GTF output to GFF3.

    In 'model' export mode each ``transcript`` row becomes an mRNA under a
    gene (a new gene is started whenever the gene_id changes) and each
    ``exon`` row becomes a CDS plus an exon on the current mRNA.  In
    'cDNA_match' mode each transcript becomes a match and each exon a
    match_part.  Features are printed to ``--output_file`` or STDOUT.

    Raises:
        Exception: if ``--export_mode`` is neither 'model' nor 'cDNA_match'.
    """
    parser = argparse.ArgumentParser(description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GTF file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument('-e', '--export_mode', type=str, required=False, default='model',
                        help='Export mode for results (model or cDNA_match)')
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception("ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'")

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_gene = None
        current_RNA = None
        current_match = None

        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as gtf_fh:
            for line in gtf_fh:
                cols = line.split("\t")

                if len(cols) != 9:
                    # diagnostics go to STDERR so they can't end up inside GFF3 written to STDOUT
                    print("SKIPPING: {0}".format(line), file=sys.stderr)
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype = cols[2]
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so I can use biocodeutils.column_9_value(str, key)
                col9 = cols[8].replace(' "', '="')

                gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if args.export_mode == 'model':
                        if current_gene is None or current_gene.id != gene_id:
                            # the gene_id changed: emit the finished gene, then start the next
                            if current_gene is not None:
                                current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

                            current_gene = things.Gene(id=gene_id)
                            current_gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                                   strand=strand)

                        mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                        mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        exon_count_by_RNA[transcript_id] = 0
                        current_CDS_phase = 0

                    elif args.export_mode == 'cDNA_match':
                        if current_match is not None and current_match.id != transcript_id:
                            current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        current_match = things.Match(id=transcript_id, subclass='cDNA_match',
                                                     length=fmax - fmin)
                        current_match.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                                strand=strand)

                elif ftype == 'exon':
                    exon_number = gff.column_9_value(col9, 'exon_number').replace('"', '')

                    if args.export_mode == 'model':
                        exon_count_by_RNA[transcript_id] += 1

                        cds_id = "{0}.CDS.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        CDS = things.CDS(id=cds_id, parent=current_RNA)
                        CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand,
                                      phase=current_CDS_phase)
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        exon = things.Exon(id=exon_id, parent=current_RNA)
                        exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_RNA.add_exon(exon)

                    elif args.export_mode == 'cDNA_match':
                        mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                        mp = things.MatchPart(id=mp_id, parent=current_match, length=fmax - fmin)
                        mp.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_match.add_part(mp)

        # don't forget to do the last gene, if there were any
        if args.export_mode == 'model':
            if current_gene is not None:
                # same source label as every other gene printed by this converter
                current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

        elif args.export_mode == 'cDNA_match':
            if current_match is not None:
                current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')
    finally:
        if ofh is not sys.stdout:
            ofh.close()
def _print_cegma_gene(gene, mRNA, assembly, fmin, fmax, strand, fout):
    """Locate *gene* and *mRNA* on *assembly* over the given span and print the gene as GFF3."""
    gene.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
    mRNA.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
    gene.print_as(fh=fout, source='cegma', format='gff3')


def main():
    """Convert CEGMA GFF output to GFF3 gene models.

    A row of type 'Single' or 'First' starts a new gene with one mRNA (the
    previously open gene, if any, is printed first).  Every row of type
    'First', 'Internal', 'Terminal' or 'Single' adds one CDS and one exon to
    the open mRNA and widens the gene span.  Rows of any other type, comment
    lines and rows without 9 columns are ignored.  IDs are sequential, e.g.
    ``cegma.gene.1``.

    Raises:
        Exception: if an exon-type row appears before any 'First'/'Single'
            row has opened a gene.
    """
    parser = argparse.ArgumentParser(description='Converts CEGMA GFF output to spec-legal GFF3')

    # output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_mRNA = None

    current_gene_fmin = None
    current_gene_fmax = None
    current_gene_strand = None

    next_id_nums = {'gene': 1, 'mRNA': 1, 'CDS': 1, 'exon': 1}

    exon_column_types = ['First', 'Internal', 'Terminal', 'Single']

    with open(args.output_file, 'w') as fout:
        fout.write("##gff-version 3\n")

        with open(args.input_file, 'r') as input_fh:
            for line in input_fh:
                if line.startswith('#'):
                    continue

                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                mol_id = cols[0]
                feat_type = cols[2]
                feat_fmin = int(cols[3]) - 1
                feat_fmax = int(cols[4])

                if feat_type in ('Single', 'First'):
                    # If there's an existing gene already, print it out
                    if current_gene is not None:
                        _print_cegma_gene(current_gene, current_mRNA, current_assembly,
                                          current_gene_fmin, current_gene_fmax,
                                          current_gene_strand, fout)

                    # initialize this assembly if we haven't seen it yet
                    if mol_id not in assemblies:
                        assemblies[mol_id] = things.Assembly(id=mol_id)

                    current_assembly = assemblies[mol_id]

                    gene_id = "cegma.gene.{0}".format(next_id_nums['gene'])
                    next_id_nums['gene'] += 1
                    current_gene = things.Gene(id=gene_id)
                    current_gene_strand = cols[6]
                    current_gene_fmin = feat_fmin
                    current_gene_fmax = feat_fmax

                    mRNA_id = "cegma.mRNA.{0}".format(next_id_nums['mRNA'])
                    next_id_nums['mRNA'] += 1
                    current_mRNA = things.mRNA(id=mRNA_id, parent=current_gene)
                    current_gene.add_mRNA(current_mRNA)

                # CEGMA versions < 2.5 had two rows for each exon.  We don't need to process both of them, so
                #  we skip the Exon one because its phase information is incorrect.
                if feat_type in exon_column_types:
                    if current_mRNA is None:
                        raise Exception("ERROR: found a {0} row before any First or Single row:\n{1}".format(feat_type, line))

                    CDS_id = "cegma.CDS.{0}".format(next_id_nums['CDS'])
                    next_id_nums['CDS'] += 1
                    CDS = things.CDS(id=CDS_id, parent=current_mRNA)
                    CDS.locate_on(target=current_assembly, fmin=feat_fmin, fmax=feat_fmax,
                                  strand=cols[6], phase=cols[7])
                    current_mRNA.add_CDS(CDS)

                    exon_id = "cegma.exon.{0}".format(next_id_nums['exon'])
                    next_id_nums['exon'] += 1
                    exon = things.Exon(id=exon_id, parent=current_mRNA)
                    exon.locate_on(target=current_assembly, fmin=feat_fmin, fmax=feat_fmax,
                                   strand=cols[6])
                    current_mRNA.add_exon(exon)

                    # widen the gene span to include this exon
                    if feat_fmin < current_gene_fmin:
                        current_gene_fmin = feat_fmin

                    if feat_fmax > current_gene_fmax:
                        current_gene_fmax = feat_fmax

        # don't forget the last gene
        if current_gene is not None:
            _print_cegma_gene(current_gene, current_mRNA, current_assembly,
                              current_gene_fmin, current_gene_fmax,
                              current_gene_strand, fout)
def main():
    """Convert StringTie GTF output to GFF3 gene models.

    Each ``transcript`` row becomes an mRNA under a gene (a new gene is
    started, and the previous one printed, whenever the gene_id changes);
    each ``exon`` row becomes a CDS plus an exon on the current mRNA, with
    the CDS phase carried forward from the preceding exon lengths.  Output
    goes to ``--output_file`` or STDOUT.
    """
    parser = argparse.ArgumentParser(description='A GTF -> GFF3 conversion script for StringTie output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GTF file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_gene = None
        current_RNA = None

        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as gtf_fh:
            for line in gtf_fh:
                cols = line.split("\t")

                if len(cols) != 9:
                    # diagnostics go to STDERR so they can't end up inside GFF3 written to STDOUT
                    print("SKIPPING: {0}".format(line), file=sys.stderr)
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype = cols[2]
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so I can use biocodeutils.column_9_value(str, key)
                col9 = cols[8].replace(' "', '="')

                gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if current_gene is None or current_gene.id != gene_id:
                        # the gene_id changed: emit the finished gene, then start the next
                        if current_gene is not None:
                            current_gene.print_as(fh=ofh, source='StringTie', format='gff3')

                        current_gene = things.Gene(id=gene_id)
                        current_gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                               strand=strand)

                    mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                    mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                    current_gene.add_mRNA(mRNA)
                    current_RNA = mRNA
                    exon_count_by_RNA[transcript_id] = 0
                    current_CDS_phase = 0

                elif ftype == 'exon':
                    exon_count_by_RNA[transcript_id] += 1

                    cds_id = "{0}.CDS.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    CDS = things.CDS(id=cds_id, parent=current_RNA)
                    CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand,
                                  phase=current_CDS_phase)
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                    exon = things.Exon(id=exon_id, parent=current_RNA)
                    exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                    current_RNA.add_exon(exon)

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='StringTie', format='gff3')
    finally:
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Convert a GenBank flat file to GFF3.

    Every record becomes an assembly (keeping its residues when present).
    Within a record, ``gene`` features open a new gene, ``mRNA``/``tRNA``/
    ``rRNA`` features are attached to the open gene, and each ``CDS``
    feature adds a polypeptide plus one CDS and one exon per location part
    to the current RNA (an mRNA is synthesised when a CDS is met before any
    RNA feature).  ``source`` features are ignored; any other feature type
    is counted and reported as skipped.  Genes are printed to
    ``--output_file`` or STDOUT, optionally followed by a ``##FASTA``
    section.

    Raises:
        Exception: for a feature whose strand is neither 1 nor -1, or when
            two RNA features resolve to the same generated ID.
    """
    parser = argparse.ArgumentParser(description='Convert GenBank flat files to GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GBK file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument('--with_fasta', dest='fasta', action='store_true',
                        help='Include the FASTA section with genomic sequence at end of file. (default)')
    parser.add_argument('--no_fasta', dest='fasta', action='store_false')
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    assemblies = dict()
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False
    features_skipped_count = 0

    def register_rna_id(feat_id):
        """Start the exon counter for a new RNA ID, refusing an ID seen before."""
        if feat_id in exon_count_by_RNA:
            raise Exception("ERROR: two different RNAs found with same ID: {0}".format(feat_id))
        exon_count_by_RNA[feat_id] = 0

    try:
        ofh.write("##gff-version 3\n")

        with open(args.input_file, "r") as gbk_fh:
            # each gb_record is a SeqRecord object
            for gb_record in SeqIO.parse(gbk_fh, "genbank"):
                mol_id = gb_record.name

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                residues = str(gb_record.seq)
                if len(residues) > 0:
                    seqs_pending_writes = True
                    assemblies[mol_id].residues = residues
                    assemblies[mol_id].length = len(residues)

                current_assembly = assemblies[mol_id]

                # each feat is a SeqFeature object
                for feat in gb_record.features:
                    fmin = int(feat.location.start)
                    fmax = int(feat.location.end)

                    if feat.location.strand == 1:
                        strand = '+'
                    elif feat.location.strand == -1:
                        strand = '-'
                    else:
                        raise Exception("ERROR: unstranded feature encountered: {0}".format(feat))

                    if feat.type == 'source':
                        continue

                    if feat.type == 'gene':
                        # print the previous gene (if there is one)
                        if current_gene is not None:
                            current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

                        locus_tag = feat.qualifiers['locus_tag'][0]
                        current_gene = things.Gene(id=locus_tag, locus_tag=locus_tag)
                        current_gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                               strand=strand)
                        current_RNA = None

                    elif feat.type == 'mRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.mRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        mRNA = things.mRNA(id=feat_id, parent=current_gene, locus_tag=locus_tag)
                        mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        register_rna_id(feat_id)

                    elif feat.type == 'tRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.tRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        # the 'product' qualifier is what gets stored as the anticodon
                        if 'product' in feat.qualifiers:
                            anticodon = feat.qualifiers['product'][0]
                        else:
                            anticodon = None

                        tRNA = things.tRNA(id=feat_id, parent=current_gene, anticodon=anticodon)
                        tRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_gene.add_tRNA(tRNA)
                        current_RNA = tRNA
                        register_rna_id(feat_id)

                    elif feat.type == 'rRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.rRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        if 'product' in feat.qualifiers:
                            product = feat.qualifiers['product'][0]
                        else:
                            product = None

                        annot = annotation.FunctionalAnnotation(product_name=product)

                        rRNA = things.rRNA(id=feat_id, parent=current_gene, annotation=annot)
                        rRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_gene.add_rRNA(rRNA)
                        current_RNA = rRNA
                        register_rna_id(feat_id)

                    elif feat.type == 'CDS':
                        locus_tag = feat.qualifiers['locus_tag'][0]

                        # If processing a prokaryotic GBK, we'll encounter CDS before mRNA, so we have to
                        #  manually make one
                        if current_RNA is None:
                            feat_id = "{0}.mRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])
                            mRNA = things.mRNA(id=feat_id, parent=current_gene)
                            mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                                           strand=strand)
                            current_gene.add_mRNA(mRNA)
                            current_RNA = mRNA

                        if 'product' in feat.qualifiers:
                            product = feat.qualifiers['product'][0]
                        else:
                            product = None

                        if 'gene' in feat.qualifiers:
                            gene_symbol = feat.qualifiers['gene'][0]
                        else:
                            gene_symbol = None

                        annot = annotation.FunctionalAnnotation(product_name=product,
                                                                gene_symbol=gene_symbol)

                        if 'db_xref' in feat.qualifiers:
                            for dbxref in feat.qualifiers['db_xref']:
                                annot.add_dbxref(dbxref)

                        # attach to the RNA the CDS/exons below are added to, never a stale one
                        polypeptide_id = "{0}.polypeptide.{1}".format(locus_tag,
                                                                      rna_count_by_gene[locus_tag])
                        polypeptide = things.Polypeptide(id=polypeptide_id, parent=current_RNA,
                                                         annotation=annot)
                        current_RNA.add_polypeptide(polypeptide)

                        exon_count_by_RNA[current_RNA.id] += 1
                        # one CDS ID is shared by every part of this CDS feature
                        cds_id = "{0}.CDS.{1}".format(current_RNA.id,
                                                      exon_count_by_RNA[current_RNA.id])
                        current_CDS_phase = 0

                        for loc in feat.location.parts:
                            subfmin = int(loc.start)
                            subfmax = int(loc.end)

                            CDS = things.CDS(id=cds_id, parent=current_RNA)
                            CDS.locate_on(target=current_assembly, fmin=subfmin, fmax=subfmax,
                                          strand=strand, phase=current_CDS_phase)
                            current_RNA.add_CDS(CDS)

                            # calculate the starting phase for the next CDS feature (in case there is one)
                            # 0 + 6 = 0     TTGCAT
                            # 0 + 7 = 2     TTGCATG
                            # 1 + 6 = 1     TTGCAT
                            # 2 + 7 = 1     TTGCATG
                            # general: 3 - ((length - previous phase) % 3)
                            current_CDS_phase = 3 - (((subfmax - subfmin) - current_CDS_phase) % 3)
                            if current_CDS_phase == 3:
                                current_CDS_phase = 0

                            exon_id = "{0}.exon.{1}".format(current_RNA.id,
                                                            exon_count_by_RNA[current_RNA.id])
                            exon = things.Exon(id=exon_id, parent=current_RNA)
                            exon.locate_on(target=current_assembly, fmin=subfmin, fmax=subfmax,
                                           strand=strand)
                            current_RNA.add_exon(exon)
                            exon_count_by_RNA[current_RNA.id] += 1

                    else:
                        # warnings go to STDERR so they can't end up inside GFF3 written to STDOUT
                        print("WARNING: The following feature was skipped:\n{0}".format(feat),
                              file=sys.stderr)
                        features_skipped_count += 1

        # don't forget to do the last gene, if there were any; done once after
        # all records because earlier genes are printed when the next gene starts
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

        if args.fasta and seqs_pending_writes:
            ofh.write("##FASTA\n")
            for assembly_id in assemblies:
                ofh.write(">{0}\n".format(assembly_id))
                ofh.write("{0}\n".format(utils.wrapped_fasta(assemblies[assembly_id].residues)))
    finally:
        if ofh is not sys.stdout:
            ofh.close()

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(features_skipped_count),
              file=sys.stderr)
if cols[2] == 'gene': if last_gene is not None: print("\n") pprint ("DEBUG: last gene was ({0}): {1}".format(last_gene.id, vars(last_gene))) gene = things.Gene(id=feat_id) gene.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand) print("DEBUG: locating gene {0} on {1} at coordinates fmin:{2}-fmax:{3} strand:{4}".format(feat_id, mol_id, rfmin, rfmax, rstrand) ) features[feat_id] = gene current_assembly.add_gene(gene) last_gene = gene elif cols[2] == 'mRNA': mRNA = things.mRNA(id=feat_id, parent=parent_feat) mRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand) print("DEBUG: attaching mRNA:{0} to parent gene:{1}".format(feat_id, parent_feat.id) ) parent_feat.add_mRNA(mRNA) features[feat_id] = mRNA elif cols[2] == 'rRNA': rRNA = things.rRNA(id=feat_id, parent=parent_feat) rRNA.locate_on(target=current_assembly, fmin=rfmin, fmax=rfmax, strand=rstrand) parent_feat.add_rRNA(rRNA) features[feat_id] = rRNA ======= ## then mark any that align to it (except self) for sbj_gene in things: if qry_gene.id == sbj_gene.id: continue