def _find_homopolymer_runs(residues, length_limit):
    """Return (start, length, base) for each non-N run longer than length_limit.

    'start' is a 0-based index into residues.  Runs are reported in the order
    in which they occur and never overlap.
    """
    runs = []
    run_base = None
    run_start = 0
    run_length = 0

    for idx, base in enumerate(residues):
        if base == run_base:
            run_length += 1
            continue

        if run_base is not None and run_length > length_limit and run_base != 'N':
            runs.append((run_start, run_length, run_base))

        run_base, run_start, run_length = base, idx, 1

    # The last run has no following base to trigger the check inside the loop.
    if run_base is not None and run_length > length_limit and run_base != 'N':
        runs.append((run_start, run_length, run_base))

    return runs


def main():
    """Mask long homopolymer runs in a FASTA file with N characters.

    Reads the FASTA file named by -i, replaces every run of a single non-N
    residue that is longer than -hll with the same number of Ns, and writes
    the result to -o (or STDOUT).  A warning naming each masked run is written
    to STDERR; positions in the warnings are 1-based.
    """
    parser = argparse.ArgumentParser( description='Replaces long homopolymeric stretches with N characters')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-hll', '--homopolymer_length_limit', type=int, required=True, help='Stretches of non-N residues longer than this will be replaced with Ns' )
    args = parser.parse_args()

    if args.output is None:
        out_fh = sys.stdout
    else:
        out_fh = open( args.output, 'wt' )

    internal_msg = "WARNING: Replacing {3} bp of {2}s in Sequence ID {0} starting at position {1}\n"
    terminal_msg = "WARNING: Replacing {3} bp of {2} bases in Sequence ID {0} starting at position {1}\n"

    try:
        sys.stderr.write("INFO: Parsing input FASTA\n")
        sys.stderr.flush()
        seqs = biocodeutils.fasta_dict_from_file( args.input )

        sys.stderr.write("INFO: Looking for homopolymeric runs > {0} bp\n".format(args.homopolymer_length_limit))
        sys.stderr.flush()

        for seq_id in seqs:
            residues = seqs[seq_id]['s']

            # Rebuild the sequence piecewise so that every masked run is kept,
            # not only the most recently found one.
            pieces = []
            cursor = 0

            for start, length, base in _find_homopolymer_runs(residues, args.homopolymer_length_limit):
                # runs which terminate the sequence get their own message wording
                if start + length == len(residues):
                    msg = terminal_msg
                else:
                    msg = internal_msg

                sys.stderr.write(msg.format(seq_id, start + 1, base, length))
                sys.stderr.flush()

                pieces.append(residues[cursor:start])
                pieces.append('N' * length)
                cursor = start + length

            pieces.append(residues[cursor:])
            seqs[seq_id]['s'] = ''.join(pieces)

            out_fh.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            out_fh.write(biocodeutils.wrapped_fasta(seqs[seq_id]['s']))
            out_fh.write("\n")
    finally:
        # never close STDOUT on behalf of the interpreter
        if out_fh is not sys.stdout:
            out_fh.close()
def write_fasta(self, fh=None, path=None):
    '''
    Writes the current set in FASTA format.

    You can either pass the fh or path arguments.  If an open file handle
    already exists, fh is appropriate.  Instead, if you have just a path you
    want to be written to, pass the 'path' argument instead.  When both are
    given, 'path' wins.

    The header format in the FASTA entries depends on the type of elements in
    the set: PolypeptideSet members are labelled with annotation_string(),
    AssemblySet members with their id.

    The handle written to (whether passed in or opened here) is closed before
    returning, including when a write fails.

    Raises an Exception if the set is of any other type, or if neither fh nor
    path was provided.
    '''
    set_type = self.__class__

    # Validate before touching the file system so that an unsupported set
    # does not leave an empty file behind.
    if set_type == PolypeptideSet:
        molecules = self.polypeptides
    elif set_type == AssemblySet:
        molecules = self.assemblies
    else:
        raise Exception("ERROR: writing FASTA not supported in MoleculeSets of this type: {0}".format(self.__class__))

    if path is not None:
        fh = open(path, 'wt')
    elif fh is None:
        raise Exception("ERROR: write_fasta() requires either the fh or the path argument")

    try:
        for molecule in molecules:
            if set_type == PolypeptideSet:
                header = molecule.annotation_string()
            else:
                header = molecule.id

            fh.write(">{0}\n".format(header))
            fh.write("{0}\n".format(biocodeutils.wrapped_fasta(molecule.residues)))
    finally:
        fh.close()
def write_fasta(self, fh=None, path=None):
    '''
    Dumps every molecule of this set as a FASTA record.

    Output goes to 'fh' when an already-open handle is supplied, or to a file
    newly opened at 'path' when that argument is given ('path' takes
    precedence).  The handle is closed once all records have been written.

    Only PolypeptideSet and AssemblySet are supported; any other set type
    raises an Exception.  Each record header is the molecule's
    annotation_string().
    '''
    if path is not None:
        fh = open(path, 'wt')

    set_class = self.__class__

    if set_class == PolypeptideSet:
        members = self.polypeptides
    elif set_class == AssemblySet:
        members = self.assemblies
    else:
        raise Exception(
            "ERROR: writing FASTA not supported in MoleculeSets of this type: {0}"
            .format(self.__class__))

    for member in members:
        defline = member.annotation_string()
        fh.write(">{0}\n".format(defline))

        wrapped_residues = biocodeutils.wrapped_fasta(member.residues)
        fh.write("{0}\n".format(wrapped_residues))

    fh.close()
def main():
    """Keep only the longest sequence of each Trinity component.

    Sequence IDs are expected to look like compN_cN_seqN.  For every distinct
    'compN' prefix the longest sequence is retained and written, under its own
    ID and header, to -o (or STDOUT), wrapped at 60 residues per line.

    Raises an Exception if any ID does not contain a 'compN_' prefix.
    """
    parser = argparse.ArgumentParser( description='Filters trinity output for longest subcomponents based on naming convention')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output', type=str, required=False, help='Output file to be created. Default = STDOUT' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        seqs = biocodeutils.fasta_dict_from_file( args.input )

        # component ID -> (sequence ID, FASTA entry) of the longest member so far
        longest = dict()
        component_pattern = re.compile(r"(comp\d+)_")

        for seq_id in seqs:
            m = component_pattern.search(seq_id)

            if m is None:
                raise Exception("ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}".format(seq_id))

            component_id = m.group(1)
            entry = seqs[seq_id]

            if component_id not in longest or len(entry['s']) > len(longest[component_id][1]['s']):
                longest[component_id] = (seq_id, entry)

        for c_id in longest:
            # use the ID stored with the entry, not whatever ID was parsed last
            longest_id, entry = longest[c_id]
            seq_wrapped = biocodeutils.wrapped_fasta(entry['s'], every=60)
            fout.write(">{0} {1}\n{2}\n".format(longest_id, entry['h'], seq_wrapped))
    finally:
        if fout is not sys.stdout:
            fout.close()
def main():
    """Report mRNAs whose CDS translation contains internal stop codons.

    Loads the features of a GFF3 file (residues optionally taken from a
    separate genome FASTA), translates the CDS of every mRNA and counts those
    whose translation contains a '*' anywhere other than at the end.  The
    first -p offenders are printed in full, all offenders are optionally
    written as protein FASTA to -o, and summary counts are printed at the end.
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report non-terminal internal stops.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)' )
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional. Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features( args.input_file )

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and
    # translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse the result for both the check
                    # and the report
                    translated_seq = biocodeutils.translate(coding_seq)

                    # terminal stops are expected; only embedded ones count
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assemblies[assembly_id])
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand) )
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )

                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id) )
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Write the longest sequence of every Trinity component.

    IDs must contain a 'compN_' prefix (compN_cN_seqN convention), otherwise
    an Exception is raised.  The winning entry of each component is tagged
    with its own ID under the 'longest_id' key and written, wrapped at 60
    residues per line, to -o or STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Filters trinity output for longest subcomponents based on naming convention')

    # command-line options
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Output file to be created. Default = STDOUT')
    opts = parser.parse_args()

    # write to the requested file, falling back to STDOUT
    if opts.output is None:
        fout = sys.stdout
    else:
        fout = open(opts.output, 'wt')

    seqs = biocodeutils.fasta_dict_from_file(opts.input)
    components = dict()

    for seq_id in seqs:
        match = re.search("(comp\d+)_", seq_id)

        if not match:
            raise Exception(
                "ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}"
                .format(seq_id))

        comp_id = match.group(1)
        candidate = seqs[seq_id]
        is_new = comp_id not in components

        if is_new or len(candidate['s']) > len(components[comp_id]['s']):
            candidate['longest_id'] = seq_id
            components[comp_id] = candidate

    for comp_id in components:
        winner = components[comp_id]
        wrapped = biocodeutils.wrapped_fasta(winner['s'], every=60)
        fout.write(">{0} {1}\n{2}\n".format(winner['longest_id'], winner['h'], wrapped))
def main():
    """Merge the N-masking of several versions of the same FASTA file.

    The last file on the command line is the seed.  Every other file is
    compared to it base by base: wherever another file has an N, the merged
    sequence gets an N too.  Any other disagreement between two non-N bases
    is reported on STDERR and the seed's base is kept.  The merged sequences
    are written to -o (or STDOUT).

    Raises an Exception if a file contains a sequence ID missing from the
    seed, or a sequence whose length differs from the seed's.
    """
    parser = argparse.ArgumentParser( description='Merge masked FASTA files')

    ## output file to be written
    parser.add_argument('fasta_files', metavar='N', type=str, nargs='+', help='Pass one or more FASTA files')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        # the last file given is the seed; the rest are merged into it
        seed_file = args.fasta_files[-1]
        other_files = args.fasta_files[:-1]

        seqs = biocodeutils.fasta_dict_from_file( seed_file )

        # python strings are immutable, so we need to transform these into lists
        for seq_id in seqs:
            seqs[seq_id]['s'] = list(seqs[seq_id]['s'])

        for fasta_file in other_files:
            new_seqs = biocodeutils.fasta_dict_from_file( fasta_file )

            for seq_id in new_seqs:
                # make sure it exists in the source file
                if seq_id not in seqs:
                    raise Exception("ERROR: Seq ID {0} was found in file {1} but not in the seed file".format(seq_id, fasta_file) )

                merged = seqs[seq_id]['s']
                incoming = new_seqs[seq_id]['s']

                # they should also be the same length.  Compare the residues
                # themselves; the entry dicts always have the same number of keys.
                if len(merged) != len(incoming):
                    raise Exception("ERROR: Seq ID {0} was found in {1} and the seed file but had different lengths".format(seq_id, fasta_file))

                for i, base in enumerate(incoming):
                    if base == merged[i]:
                        continue

                    if base == 'N':
                        merged[i] = 'N'
                    elif merged[i] != 'N':
                        # keep diagnostics off STDOUT, which may carry the FASTA
                        print("WARNING: Disagreement {0}-{1} at position {2}".format(base, merged[i], i), file=sys.stderr )

        # now done, print out the results
        for seq_id in seqs:
            ofh.write( ">{0} {1}\n{2}\n".format( seq_id, seqs[seq_id]['h'], biocodeutils.wrapped_fasta(''.join(seqs[seq_id]['s'])) ) )
    finally:
        if ofh is not sys.stdout:
            ofh.close()
def _merge_masking(seed_bases, other_bases, warn_fh):
    """Copy every N of other_bases into the list seed_bases, in place.

    Positions where the two disagree and neither is N are left as in
    seed_bases and reported on warn_fh (0-based position).
    """
    for position, (kept, base) in enumerate(zip(seed_bases, other_bases)):
        if base == kept:
            continue

        if base == 'N':
            seed_bases[position] = 'N'
        elif kept != 'N':
            warn_fh.write("WARNING: Disagreement {0}-{1} at position {2}\n".format(base, kept, position))


def main():
    """Combine the N-masked positions of several copies of one FASTA file.

    The final positional argument is the seed file; all preceding files are
    merged into it so that a position is N in the output if it is N in any
    input.  Conflicting non-N bases are reported on STDERR.  The result is
    written to -o, or to STDOUT when -o is omitted.

    Raises an Exception when a non-seed file has an ID unknown to the seed
    or a sequence of a different length than the seed's.
    """
    parser = argparse.ArgumentParser( description='Merge masked FASTA files')

    ## output file to be written
    parser.add_argument('fasta_files', metavar='N', type=str, nargs='+', help='Pass one or more FASTA files')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    args = parser.parse_args()

    files = list(args.fasta_files)

    # pull off a file and index it
    seqs = biocodeutils.fasta_dict_from_file( files.pop() )

    # python strings are immutable, so we need to transform these into lists
    for seq_id in seqs:
        seqs[seq_id]['s'] = list(seqs[seq_id]['s'])

    for fasta_file in files:
        new_seqs = biocodeutils.fasta_dict_from_file( fasta_file )

        for seq_id in new_seqs:
            # make sure it exists in the source file
            if seq_id not in seqs:
                raise Exception("ERROR: Seq ID {0} was found in file {1} but not in the seed file".format(seq_id, fasta_file) )

            # they should also be the same length; this must look at the
            # residues, because len() of the entry itself only counts its keys
            if len(seqs[seq_id]['s']) != len(new_seqs[seq_id]['s']):
                raise Exception("ERROR: Seq ID {0} was found in {1} and the seed file but had different lengths".format(seq_id, fasta_file))

            # warnings go to STDERR so they cannot end up inside the FASTA
            _merge_masking(seqs[seq_id]['s'], new_seqs[seq_id]['s'], sys.stderr)

    ## output will either be a file or STDOUT
    if args.output_file is None:
        ofh = sys.stdout
    else:
        ofh = open(args.output_file, 'wt')

    # now done, print out the results
    try:
        for seq_id in seqs:
            ofh.write( ">{0} {1}\n{2}\n".format( seq_id, seqs[seq_id]['h'], biocodeutils.wrapped_fasta(''.join(seqs[seq_id]['s'])) ) )
    finally:
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Re-wrap the sequences of a FASTA file to a fixed line width.

    Every entry of the -i file is written to -o (or STDOUT) as
    '>ID HEADER' followed by its residues wrapped at -w characters per line.
    """
    parser = argparse.ArgumentParser(
        description='Reformats a FASTA file such that there are no more than -w characters of sequence residues per line.')

    # command-line options
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-w', '--width', type=int, required=False, default=60,
                        help='Width - number of residues per line')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Output file to be created. Default = STDOUT')
    opts = parser.parse_args()

    # a file when -o was given, STDOUT otherwise
    if opts.output is None:
        fout = sys.stdout
    else:
        fout = open(opts.output, 'wt')

    seqs = biocodeutils.fasta_dict_from_file(opts.input)

    for seq_id in seqs:
        record = seqs[seq_id]
        wrapped = biocodeutils.wrapped_fasta(record['s'], every=opts.width)
        fout.write(">{0} {1}\n{2}\n".format(seq_id, record['h'], wrapped))
def write_fasta_results(f, polypeptides):
    """
    Writes one FASTA record per polypeptide to the handle f.

    Headers are built as:

        >ID PRODUCT_NAME gene::GENE_SYMBOL ec::EC_NUMBERS go::GO_TERMS

    where the gene::, ec:: and go:: fields only appear when the annotation
    has a gene symbol, EC numbers or GO annotations respectively.  Multiple
    EC numbers / GO terms are comma-separated, and GO ids are prefixed with
    'GO:'.  polypeptides is indexed by polypeptide ID.
    """
    for polypeptide_id in polypeptides:
        polypeptide = polypeptides[polypeptide_id]

        go_terms = ["GO:{0}".format(go_annot.go_id)
                    for go_annot in polypeptide.annotation.go_annotations]
        go_string = ",".join(go_terms).rstrip(',')

        ec_numbers = ["{0}".format(ec_annot.number)
                      for ec_annot in polypeptide.annotation.ec_numbers]
        ec_string = ",".join(ec_numbers).rstrip(',')

        fields = ["{0} {1}".format(polypeptide_id, polypeptide.annotation.product_name)]

        if polypeptide.annotation.gene_symbol is not None:
            fields.append("gene::{0}".format(polypeptide.annotation.gene_symbol))

        if ec_string != "":
            fields.append("ec::{0}".format(ec_string))

        if go_string != "":
            fields.append("go::{0}".format(go_string))

        f.write(">{0}\n".format(" ".join(fields)))
        f.write("{0}\n".format(biocodeutils.wrapped_fasta(polypeptide.residues)))
def main():
    """Re-wrap a FASTA file to a fixed line width, optionally upper-casing it.

    Each entry of the -i file is written to -o (or STDOUT) with its residues
    wrapped at -w characters per line.  With -uc the residues are converted
    to upper case first.
    """
    parser = argparse.ArgumentParser(
        description='Reformats a FASTA file such that there are no more than -w characters of sequence residues per line.')

    # command-line options
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-w', '--width', type=int, required=False, default=60,
                        help='Width - number of residues per line')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Output file to be created. Default = STDOUT')
    parser.add_argument('-uc', '--upper_case', action='store_true', required=False,
                        help='Forces all bases to be upper-case')
    opts = parser.parse_args()

    # a file when -o was given, STDOUT otherwise
    if opts.output is None:
        fout = sys.stdout
    else:
        fout = open(opts.output, 'wt')

    seqs = biocodeutils.fasta_dict_from_file(opts.input)

    for seq_id in seqs:
        record = seqs[seq_id]

        if opts.upper_case:
            record['s'] = record['s'].upper()

        wrapped = biocodeutils.wrapped_fasta(record['s'], every=opts.width)
        fout.write(">{0} {1}\n{2}\n".format(seq_id, record['h'], wrapped))
def main():
    """Run the phase check over every CDS of a GFF3 file and re-export it.

    Features are loaded from -i (residues optionally from -g), each CDS is
    handed to check_and_update_phase(), and every gene is printed as GFF3 to
    -o with column 2 set to -s.  A ##FASTA section follows, containing each
    assembly whose length is greater than zero.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    # command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-s', '--source', type=str, required=False, default='.',
                        help='Optional. Sets the value for column 2 in all rows. Default = .')
    opts = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(opts.input_file)

    fout = open(opts.output_file, mode='wt', encoding='utf-8')
    fout.write("##gff-version 3\n")

    # residues come from the separate FASTA file when one was passed
    if opts.genome_fasta is not None:
        process_assembly_fasta(assemblies, opts.genome_fasta)

    for assembly_id in assemblies:
        for gene in assemblies[assembly_id].genes():
            for mRNA in gene.mRNAs():
                for CDS in mRNA.CDSs():
                    check_and_update_phase(CDS)

            gene.print_as(fh=fout, source=opts.source, format='gff3')

    # the ##FASTA directive is written once, just before the first sequence
    fasta_directive_pending = True

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]

        if not (assembly.length > 0):
            continue

        if fasta_directive_pending:
            fout.write("##FASTA\n")
            fasta_directive_pending = False

        fout.write(">{0}\n".format(assembly.id))
        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(assembly.residues)))
def write_fasta_results( f, polypeptides ):
    """
    Exports the annotated polypeptides as FASTA on the handle f.

    Each header looks like:

        >ID PRODUCT_NAME gene::GENE_SYMBOL ec::EC_NUMBERS go::GO_TERMS

    The gene::, ec:: and go:: parts are omitted when the annotation has no
    gene symbol, no EC numbers or no GO annotations.  EC numbers and GO terms
    are comma-separated.
    """
    for polypeptide_id in polypeptides:
        polypeptide = polypeptides[polypeptide_id]

        go_string = ",".join(
            "GO:{0}".format(go_annot.go_id) for go_annot in polypeptide.annotation.go_annotations
        ).rstrip(',')

        ec_string = ",".join(
            "{0}".format(ec_annot.number) for ec_annot in polypeptide.annotation.ec_numbers
        ).rstrip(',')

        header = "{0} {1}".format(polypeptide_id, polypeptide.annotation.product_name)

        # optional fields, appended in a fixed order
        if polypeptide.annotation.gene_symbol is not None:
            header += " gene::{0}".format(polypeptide.annotation.gene_symbol)

        if ec_string != "":
            header += " ec::{0}".format(ec_string)

        if go_string != "":
            header += " go::{0}".format(go_string)

        residues = biocodeutils.wrapped_fasta(polypeptide.residues)

        f.write( ">{0}\n".format( header ) )
        f.write( "{0}\n".format( residues ) )
def main():
    """Check the phase of each CDS in a GFF3 file and write the file back out.

    Reads the features from -i, optionally attaches residues from -g, passes
    every CDS to check_and_update_phase() and prints each gene in GFF3 format
    to -o using -s as the source column.  Assemblies with a length above zero
    are appended in a ##FASTA section.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report/correct phase columns.'
    )

    ## input / output options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created')
    parser.add_argument(
        '-g', '--genome_fasta', type=str, required=False,
        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-s', '--source', type=str, required=False, default='.',
        help='Optional. Sets the value for column 2 in all rows. Default = .'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    gff_out = open(args.output_file, mode='wt', encoding='utf-8')
    gff_out.write("##gff-version 3\n")

    # attach residues from the external FASTA file, if there is one
    if args.genome_fasta is not None:
        process_assembly_fasta(assemblies, args.genome_fasta)

    for assembly_id in assemblies:
        molecule = assemblies[assembly_id]

        for gene in molecule.genes():
            for mRNA in gene.mRNAs():
                for CDS in mRNA.CDSs():
                    check_and_update_phase(CDS)

            # the gene is exported only after all of its CDSs were checked
            gene.print_as(fh=gff_out, source=args.source, format='gff3')

    sequences_written = 0

    for assembly_id in assemblies:
        molecule = assemblies[assembly_id]

        if molecule.length > 0:
            # the section marker precedes the first sequence only
            if sequences_written == 0:
                gff_out.write("##FASTA\n")

            sequences_written += 1
            gff_out.write(">{0}\n".format(molecule.id))
            gff_out.write("{0}\n".format(biocodeutils.wrapped_fasta(molecule.residues)))
def main():
    """Export the CDS or protein sequence of every mRNA in a GFF3 file.

    For each mRNA a FASTA record is written to -o (or STDOUT).  The record ID
    is the mRNA's locus_tag when it has one, otherwise its id; the first
    product name found among its polypeptide annotations is appended to the
    header.  With --check_ends, non-canonical start/stop codons are reported
    on STDERR.  -t selects between the raw CDS residues and their translation.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS seqeunces from a GFF3 file')

    # command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False, default='protein',
                        choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    opts = parser.parse_args()

    # a file when -o was given, STDOUT otherwise
    if opts.output_file is None:
        fout = sys.stdout
    else:
        fout = open(opts.output_file, 'wt')

    (assemblies, features) = biocodegff.get_gff3_features(opts.input_file)

    # only the standard codon table is handled
    start_codons = ['ATG', 'GTG', 'TTG']
    stop_codons = ['TAG', 'TAA', 'TGA']

    # residues from an external FASTA file, if the user passed one
    if opts.fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, opts.fasta)

    for assembly_id in assemblies:
        for gene in assemblies[assembly_id].genes():
            for mRNA in gene.mRNAs():
                # the locus tag, when present, replaces the feature ID
                export_id = mRNA.id if mRNA.locus_tag is None else mRNA.locus_tag

                # first product name available on any polypeptide, else None
                export_header = next(
                    (polypeptide.annotation.product_name
                     for polypeptide in mRNA.polypeptides()
                     if polypeptide.annotation is not None
                     and polypeptide.annotation.product_name is not None),
                    None)

                if export_header is None:
                    fout.write(">{0}\n".format(export_id))
                else:
                    fout.write(">{0} {1}\n".format(export_id, export_header))

                coding_seq = mRNA.get_CDS_residues()

                if opts.check_ends:
                    first_codon = coding_seq[0:3].upper()
                    if first_codon not in start_codons:
                        sys.stderr.write(
                            "WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(first_codon, mRNA.id))

                    last_codon = coding_seq[-3:].upper()
                    if last_codon not in stop_codons:
                        sys.stderr.write(
                            "WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(last_codon, mRNA.id))

                if opts.type == 'cds':
                    export_seq = coding_seq
                else:
                    export_seq = biocodeutils.translate(coding_seq)

                fout.write("{0}\n".format(biocodeutils.wrapped_fasta(export_seq)))
def main():
    """Convert a GenBank flat file to GFF3.

    Every record of the -i file becomes an assembly.  Its gene, mRNA, tRNA,
    rRNA and CDS features are turned into biothings objects, and each gene is
    printed as GFF3 (source 'GenBank') to -o or STDOUT as soon as the next
    gene starts, or at the end of the input.  Each part of a CDS location
    yields one CDS and one exon on the current RNA.  'source' features are
    ignored; any other feature type is skipped with a warning.  Unless
    --no_fasta is given, a ##FASTA section with the assembly residues is
    appended when at least one record had sequence.

    Raises an Exception for features without a strand, for an RNA feature
    that precedes every gene, or for a CDS that precedes every RNA.
    """
    parser = argparse.ArgumentParser( description='Convert GenBank flat files to GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GBK file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('--with_fasta', dest='fasta', action='store_true', help='Include the FASTA section with genomic sequence at end of file. (default)' )
    parser.add_argument('--no_fasta', dest='fasta', action='store_false' )
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    ofh.write("##gff-version 3\n")

    assemblies = dict()
    current_assembly = None
    current_gene = None
    current_RNA = None

    rna_count_by_gene = defaultdict(int)
    exon_count_by_RNA = defaultdict(int)

    seqs_pending_writes = False
    features_skipped_count = 0

    # The RNA feature types are handled identically apart from the biothings
    # class to instantiate and the Gene method used to attach the result.
    rna_handlers = {
        'mRNA': (biothings.mRNA, 'add_mRNA'),
        'tRNA': (biothings.tRNA, 'add_tRNA'),
        'rRNA': (biothings.rRNA, 'add_rRNA'),
    }

    with open(args.input_file, "r") as gbk_fh:
        # each gb_record is a SeqRecord object
        for gb_record in SeqIO.parse(gbk_fh, "genbank"):
            mol_id = gb_record.name

            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly( id=mol_id )

            residues = str(gb_record.seq)
            if len(residues) > 0:
                seqs_pending_writes = True
                assemblies[mol_id].residues = residues
                assemblies[mol_id].length = len(residues)

            current_assembly = assemblies[mol_id]

            # each feat is a SeqFeature object
            for feat in gb_record.features:
                fmin = int(feat.location.start)
                fmax = int(feat.location.end)

                if feat.location.strand == 1:
                    strand = '+'
                elif feat.location.strand == -1:
                    strand = '-'
                else:
                    raise Exception("ERROR: unstranded feature encountered: {0}".format(feat))

                if feat.type == 'source':
                    continue

                if feat.type == 'gene':
                    # print the previous gene (if there is one)
                    if current_gene is not None:
                        current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

                    locus_tag = feat.qualifiers['locus_tag'][0]
                    current_gene = biothings.Gene( id=locus_tag )
                    current_gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                elif feat.type in rna_handlers:
                    if current_gene is None:
                        raise Exception("ERROR: {0} feature encountered before any gene feature: {1}".format(feat.type, feat))

                    rna_class, add_method = rna_handlers[feat.type]

                    locus_tag = feat.qualifiers['locus_tag'][0]
                    rna_count_by_gene[locus_tag] += 1
                    feat_id = "{0}.{1}.{2}".format( locus_tag, feat.type, rna_count_by_gene[locus_tag] )

                    current_RNA = rna_class( id=feat_id, parent=current_gene )
                    current_RNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    getattr(current_gene, add_method)(current_RNA)

                    if feat_id in exon_count_by_RNA:
                        raise Exception( "ERROR: two different RNAs found with same ID: {0}".format(feat_id) )

                    exon_count_by_RNA[feat_id] = 0

                elif feat.type == 'CDS':
                    if current_RNA is None:
                        raise Exception("ERROR: CDS feature encountered before any RNA feature: {0}".format(feat))

                    rna_id = current_RNA.id
                    exon_count_by_RNA[rna_id] += 1

                    # all parts of one CDS feature share a single ID
                    cds_id = "{0}.CDS.{1}".format( rna_id, exon_count_by_RNA[rna_id] )
                    current_CDS_phase = 0

                    for loc in feat.location.parts:
                        subfmin = int(loc.start)
                        subfmax = int(loc.end)

                        CDS = biothings.CDS( id=cds_id, parent=current_RNA )
                        CDS.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand, phase=current_CDS_phase )
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        #   0 + 6 = 0     TTGCAT
                        #   0 + 7 = 2     TTGCATG
                        #   1 + 6 = 1     TTGCAT
                        #   2 + 7 = 1     TTGCATG
                        # general: 3 - ((length - previous phase) % 3)
                        current_CDS_phase = 3 - (((subfmax - subfmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format( rna_id, exon_count_by_RNA[rna_id] )
                        exon = biothings.Exon( id=exon_id, parent=current_RNA )
                        exon.locate_on( target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand )
                        current_RNA.add_exon(exon)
                        exon_count_by_RNA[rna_id] += 1

                else:
                    print("WARNING: The following feature was skipped:\n{0}".format(feat))
                    features_skipped_count += 1

    # don't forget to do the last gene, if there were any.  This happens once,
    # after all records, because a gene is otherwise only printed when the
    # next gene feature is reached.
    if current_gene is not None:
        current_gene.print_as(fh=ofh, source='GenBank', format='gff3')

    if args.fasta is True:
        if seqs_pending_writes is True:
            ofh.write("##FASTA\n")

            for assembly_id in assemblies:
                ofh.write(">{0}\n".format(assembly_id))
                ofh.write("{0}\n".format(biocodeutils.wrapped_fasta(assemblies[assembly_id].residues)))

    if features_skipped_count > 0:
        print("Warning: {0} unsupported feature types were skipped".format(features_skipped_count))

    if ofh is not sys.stdout:
        ofh.close()
def _has_internal_stop(protein):
    """Return True if protein contains a '*' anywhere before its trailing stops."""
    return '*' in protein.rstrip('*')


def main():
    """Count and report mRNAs whose translated CDS has an embedded stop codon.

    The GFF3 file given with -i is loaded (residues optionally from -g) and
    the CDS of each mRNA is translated.  Stops at the very end of the
    translation are ignored.  The first -p mRNAs with an embedded stop are
    printed with their CDS and translation; when -o is given, every such
    translation is written there as FASTA.  Totals are printed at the end.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence report non-terminal internal stops.'
    )

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3')
    parser.add_argument(
        '-g', '--genome_fasta', type=str, required=False,
        help=
        'Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-p', '--print_n_with_stops', type=int, required=False, default=0,
        help=
        'Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)'
    )
    parser.add_argument(
        '-o', '--output_fasta', type=str, required=False,
        help=
        'Optional. Writes an output (translated) FASTA file for all those features which had internal stops'
    )
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and
    # translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # one translation serves both the test and the reports
                    translated_seq = biocodeutils.translate(coding_seq)

                    if _has_internal_stop(translated_seq):
                        mRNAs_with_stops += 1

                        if fasta_out_fh is not None:
                            loc = mRNA.location_on(assemblies[assembly_id])
                            fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                                mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand))
                            fasta_out_fh.write("{0}\n".format(
                                biocodeutils.wrapped_fasta(translated_seq)))

                        if is_debug_target:
                            print("TRANSLATION WITH STOP ({1}): {0}".format(
                                translated_seq, mRNA.id))

                        if mRNAs_with_stops <= args.print_n_with_stops:
                            print("\nmRNA id: {0}".format(mRNA.id))
                            print("\tCDS:{0}".format(coding_seq))
                            print("\tTRANSLATION WITH STOP ({1}): {0}".format(
                                translated_seq, mRNA.id))
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Export the protein or CDS sequence of every mRNA in a GFF3 file as FASTA.

    Each mRNA becomes one FASTA record.  The record ID is the mRNA's
    ``locus_tag`` when it has one, otherwise its ``id``; the header text is
    the product name of the first polypeptide that carries one, if any.

    With ``--type cds`` the CDS residues are written as-is; otherwise they
    are translated first.  With ``--check_ends`` a warning is written to
    STDERR for every mRNA whose first or last codon is not in the standard
    start/stop codon lists.  Output goes to ``--output_file`` or STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS seqeunces from a GFF3 file')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False,
                        default='protein', choices=['protein', 'cds'],
                        help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    ## initial values of id and header to export (can be
                    ## overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one; the first
                    ## polypeptide with a product name wins.
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    coding_seq = mRNA.get_CDS_residues(for_translation=True)

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        # ... and the terminal one
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # Close only a handle we opened ourselves; never close STDOUT.
        if fout is not sys.stdout:
            fout.close()
def main():
    """Tally read orientations per transcript from a SAM file.

    The SAM file (``--sam_file``) is read line by line; header lines
    (starting with ``@``) and lines with fewer than five tab-separated
    columns are skipped.  Consecutive lines sharing the same reference name
    (column 3) are treated as one transcript, so the input is expected to be
    grouped by reference.  For every transcript the reads are counted by
    read direction (last character of the read name) and by whether FLAG
    bit 16 is set.

    A transcript is considered correctly oriented when its ``1``/``T`` count
    exceeds its ``1``/``F`` count; all others are recorded for correction.
    A tally line per transcript and overall totals are printed to STDOUT.

    If ``--fasta_in`` is passed, ``--fasta_out`` must be passed too; every
    input sequence is then written out, reverse-complemented when its ID was
    recorded for correction.

    Raises:
        Exception: if ``-fi`` is passed without ``-fo``.
        KeyError: if a read name does not end in ``1`` or ``2``.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    parser.add_argument('-s', '--sam_file', type=str, required=True,
                        help='Input SAM file with reads aligned to reference')
    parser.add_argument('-fi', '--fasta_in', type=str, required=False,
                        help='Path to a FASTA file representing sequences that were aligned against. If this is passed, you should also pass the -fo argument')
    parser.add_argument('-fo', '--fasta_out', type=str, required=False,
                        help='If passed along with -fi, the orientation-corrected sequences will be written here.')
    args = parser.parse_args()

    seqs = dict()
    out_fh = None

    if args.fasta_in is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta_in)

        if args.fasta_out is not None:
            out_fh = open(args.fasta_out, 'w')
        else:
            raise Exception(
                "ERROR: You must pass a value for -fo if you pass -fi")

    total_read_mappings = 0
    last_transcript_id = None
    counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}
    transcript_count = 0
    orientation_totals = {'correct': 0, 'incorrect': 0}
    transcripts_to_correct = dict()

    def close_out_transcript(transcript_id, tally):
        """Classify one finished transcript and print its tally line."""
        ## determine transcript orientation
        ## Given an RF library, the 1:T count should outnumber the 1:F one
        if tally['1']['T'] > tally['1']['F']:
            orientation_totals['correct'] += 1
        else:
            orientation_totals['incorrect'] += 1
            transcripts_to_correct[transcript_id] = 1

        ## report counts
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(
            transcript_id, tally['1']['T'], tally['1']['F'],
            tally['2']['T'], tally['2']['F']))

    try:
        with open(args.sam_file) as sam_fh:
            for line in sam_fh:
                if line.startswith('@'):
                    continue

                cols = line.split("\t")
                if len(cols) < 5:
                    continue

                read_dir = cols[0][-1]
                transcript_id = cols[2]
                total_read_mappings += 1

                # FLAG bit 16 (0x10) set -> counted under 'T', otherwise 'F'
                seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

                if transcript_id != last_transcript_id:
                    transcript_count += 1

                    if last_transcript_id is not None:
                        close_out_transcript(last_transcript_id, counts)

                    ## reset
                    last_transcript_id = transcript_id
                    counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}

                # Count every read, including the first one of a transcript
                # (which used to be dropped when the tallies were reset).
                counts[read_dir][seq_revcomped] += 1

        # The final transcript has no following line to trigger its
        # evaluation, so handle it explicitly.
        if last_transcript_id is not None:
            close_out_transcript(last_transcript_id, counts)

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in transcripts_to_correct:
                seq['s'] = biocodeutils.reverse_complement(seq['s'])

            out_fh.write(">{0} {2}\n{1}\n".format(
                seq_id, biocodeutils.wrapped_fasta(seq['s']), seq['h']))
    finally:
        if out_fh is not None:
            out_fh.close()

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(
        orientation_totals['correct']))
    print("Transcripts in reverse orientation: {0}".format(
        orientation_totals['incorrect']))
def main():
    """Replace long homopolymer runs in a FASTA file with N characters.

    Every sequence from ``--input`` is scanned for runs of one repeated
    character.  A run longer than ``--homopolymer_length_limit`` whose
    character is not ``N`` is replaced by the same number of ``N``
    characters, and a warning with the 1-based start position is written to
    STDERR.  All sequences, masked or not, are written as FASTA to
    ``--output`` or STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Replaces long homopolymeric stretches with N characters')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument(
        '-hll', '--homopolymer_length_limit', type=int, required=True,
        help=
        'Stretches of non-N residues longer than this will be replaced with Ns'
    )
    args = parser.parse_args()

    if args.output is None:
        out_fh = sys.stdout
    else:
        out_fh = open(args.output, 'wt')

    try:
        sys.stderr.write("INFO: Parsing input FASTA\n")
        sys.stderr.flush()
        seqs = biocodeutils.fasta_dict_from_file(args.input)

        length_limit = args.homopolymer_length_limit
        sys.stderr.write(
            "INFO: Looking for homopolymeric runs > {0} bp\n".format(
                length_limit))
        sys.stderr.flush()

        for seq_id in seqs:
            residues = seqs[seq_id]['s']

            # Mask in a mutable copy so every replacement accumulates;
            # rebuilding from the untouched input each time kept only the
            # last interior run.
            masked = list(residues)

            run_base = None
            run_length = 0
            run_start_idx = 0

            for base_index, base in enumerate(residues):
                if base == run_base:
                    run_length += 1
                    continue

                # A different base closes the previous run; mask it if it
                # was too long.
                if run_length > length_limit and run_base != 'N':
                    sys.stderr.write(
                        "WARNING: Replacing {3} bp of {2}s in Sequence ID {0} starting at position {1}\n"
                        .format(seq_id, run_start_idx + 1, run_base,
                                run_length))
                    sys.stderr.flush()
                    masked[run_start_idx:base_index] = 'N' * run_length

                run_base = base
                run_length = 1
                run_start_idx = base_index

            ## check after the last base for any run which terminates the
            ## sequence (reported 1-based, like the interior runs)
            if run_length > length_limit and run_base != 'N':
                sys.stderr.write(
                    "WARNING: Replacing {3} bp of {2} bases in Sequence ID {0} starting at position {1}\n"
                    .format(seq_id, run_start_idx + 1, run_base, run_length))
                sys.stderr.flush()
                masked[run_start_idx:] = 'N' * run_length

            seqs[seq_id]['s'] = "".join(masked)

            out_fh.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            out_fh.write(biocodeutils.wrapped_fasta(seqs[seq_id]['s']))
            out_fh.write("\n")
    finally:
        # Close only a handle we opened ourselves; never close STDOUT.
        if out_fh is not sys.stdout:
            out_fh.close()
def main():
    """Write gene sequences as FASTA with intron residues lower-cased.

    Genes are read from the GFF3 in ``--input_file``; residues come from
    ``--fasta`` when passed.  Each gene's sequence is upper-cased, then the
    span of every intron of its mRNA is lower-cased.  With ``--type CDS``
    the sequence is additionally trimmed to the range from the first CDS
    ``fmin`` to the last CDS ``fmax``.  Genes with more than one mRNA are
    skipped with a message.  Records are labelled with the gene's
    ``locus_tag`` if set, otherwise its ``id``, and written to
    ``--output_file`` or STDOUT.
    """
    parser = argparse.ArgumentParser(description='')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False,
                        default='mRNA', choices=['mRNA', 'CDS'],
                        help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # Set this to a gene ID to process only that gene and print debug
    # statements for it; leave as None for normal operation, e.g.
    #   debugging_gene = 'D9AE6116893A0D5711D56C0F1E6CF58C'
    debugging_gene = None
    debug_mode = debugging_gene is not None

    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)

        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                if debug_mode and gene.id != debugging_gene:
                    continue

                if gene.locus_tag is None:
                    gene_label = gene.id
                else:
                    gene_label = gene.locus_tag

                gene_seq = gene.get_residues().upper()
                gene_loc = gene.location_on(assembly)

                ## we have to do this here because of the coordinates: the
                ## slicing below indexes from gene_loc.fmin, so reverse-strand
                ## sequences are flipped first and flipped back before writing
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                if debug_mode:
                    print("INFO: Processing gene with length {0} at {1}-{2}".format(
                        len(gene_seq), gene_loc.fmin, gene_loc.fmax))

                if len(gene.mRNAs()) > 1:
                    print("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)".format(gene.id))
                    continue

                # this helps us get where the intron is on the gene
                offset = gene_loc.fmin

                for mRNA in gene.mRNAs():
                    for intron in mRNA.introns(on=assembly):
                        intron_loc = intron.location_on(assembly)
                        rel_start = intron_loc.fmin - offset
                        rel_end = intron_loc.fmax - offset

                        lower_mid = gene_seq[rel_start:rel_end].lower()
                        gene_seq = gene_seq[0:rel_start] + lower_mid + gene_seq[rel_end:]

                        if debug_mode:
                            print("INFO:\tfound intron at {0}-{1}".format(
                                intron_loc.fmin, intron_loc.fmax))
                            print("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(
                                rel_start, rel_end))
                            print("INFO:\tgenerating lower case seq of length: {0}\n".format(
                                len(lower_mid)))

                    if debug_mode:
                        print("INFO: seq length before CDS processing is: {0}".format(
                            len(gene_seq)))

                    ## do we need to trim down to the CDS range?
                    if args.type == 'CDS':
                        CDSs = sorted(mRNA.CDSs())
                        CDS_min = CDSs[0].location_on(assembly).fmin
                        CDS_max = CDSs[-1].location_on(assembly).fmax

                        if debug_mode:
                            print("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(
                                CDS_max, CDS_min, CDS_max - CDS_min))

                        if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                            fmin_chomp = CDS_min - offset
                            fmax_chomp = gene_loc.fmax - CDS_max

                            if debug_mode:
                                print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                                    gene.id, gene_loc.fmin, gene_loc.fmax,
                                    gene_loc.strand, CDS_min, CDS_max))
                                print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(
                                    fmin_chomp, fmax_chomp))
                                print("\tpulling range: gene_seq[{0} : {1}]".format(
                                    fmin_chomp, len(gene_seq) - fmax_chomp))

                            gene_seq = gene_seq[fmin_chomp:len(gene_seq) - fmax_chomp]

                            if debug_mode:
                                print("\tGene {0} CDS seq: {1}".format(
                                    gene.id, gene_seq))

                ## make sure to switch it back
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                # NOTE(review): one record is written per gene, after any mRNA
                # processing, so genes without an mRNA are exported too --
                # confirm that this is the intended nesting.
                ofh.write(">{0}\n{1}\n".format(
                    gene_label, biocodeutils.wrapped_fasta(gene_seq)))
    finally:
        # Close only a handle we opened ourselves; never close STDOUT.
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Convert a GenBank flat file to GFF3.

    Records are read from ``--input_file`` with ``SeqIO.parse``.  ``gene``,
    ``mRNA``, ``tRNA``, ``rRNA`` and ``CDS`` features are turned into
    ``biothings`` objects and each gene is printed as GFF3 once the next
    gene (or the end of input) is reached.  ``source`` features are ignored;
    any other feature type is reported and counted as skipped.  Unless
    ``--no_fasta`` is passed, a ``##FASTA`` section with the assembly
    residues follows when at least one record carried sequence.  Output goes
    to ``--output_file`` or STDOUT.

    Raises:
        Exception: if a feature has no strand, or two RNAs end up with the
            same generated ID.
    """
    parser = argparse.ArgumentParser(
        description='Convert GenBank flat files to GFF3 format')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input GBK file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument('--with_fasta', dest='fasta', action='store_true',
                        help='Include the FASTA section with genomic sequence at end of file. (default)')
    parser.add_argument('--no_fasta', dest='fasta', action='store_false')
    parser.set_defaults(fasta=True)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_assembly = None
        current_gene = None
        current_RNA = None

        rna_count_by_gene = defaultdict(int)
        exon_count_by_RNA = defaultdict(int)

        seqs_pending_writes = False
        features_skipped_count = 0

        # each gb_record is a SeqRecord object
        with open(args.input_file, "r") as gbk_fh:
            for gb_record in SeqIO.parse(gbk_fh, "genbank"):
                mol_id = gb_record.name

                if mol_id not in assemblies:
                    assemblies[mol_id] = biothings.Assembly(id=mol_id)

                if len(str(gb_record.seq)) > 0:
                    seqs_pending_writes = True
                    assemblies[mol_id].residues = str(gb_record.seq)
                    assemblies[mol_id].length = len(str(gb_record.seq))

                current_assembly = assemblies[mol_id]

                # each feat is a SeqFeature object
                for feat in gb_record.features:
                    fmin = int(feat.location.start)
                    fmax = int(feat.location.end)

                    if feat.location.strand == 1:
                        strand = '+'
                    elif feat.location.strand == -1:
                        strand = '-'
                    else:
                        raise Exception("ERROR: unstranded feature encountered: {0}".format(feat))

                    if feat.type == 'source':
                        continue

                    if feat.type == 'gene':
                        # print the previous gene (if there is one)
                        if current_gene is not None:
                            gene.print_as(fh=ofh, source='GenBank', format='gff3')

                        locus_tag = feat.qualifiers['locus_tag'][0]
                        gene = biothings.Gene(id=locus_tag)
                        gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_gene = gene
                        current_RNA = None

                    elif feat.type == 'mRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.mRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        mRNA = biothings.mRNA(id=feat_id, parent=current_gene)
                        mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        gene.add_mRNA(mRNA)
                        current_RNA = mRNA

                        if feat_id in exon_count_by_RNA:
                            raise Exception("ERROR: two different RNAs found with same ID: {0}".format(feat_id))
                        else:
                            exon_count_by_RNA[feat_id] = 0

                    elif feat.type == 'tRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.tRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        tRNA = biothings.tRNA(id=feat_id, parent=current_gene)
                        tRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        gene.add_tRNA(tRNA)
                        current_RNA = tRNA

                        if feat_id in exon_count_by_RNA:
                            raise Exception("ERROR: two different RNAs found with same ID: {0}".format(feat_id))
                        else:
                            exon_count_by_RNA[feat_id] = 0

                    elif feat.type == 'rRNA':
                        locus_tag = feat.qualifiers['locus_tag'][0]
                        rna_count_by_gene[locus_tag] += 1
                        feat_id = "{0}.rRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])

                        rRNA = biothings.rRNA(id=feat_id, parent=current_gene)
                        rRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        gene.add_rRNA(rRNA)
                        current_RNA = rRNA

                        if feat_id in exon_count_by_RNA:
                            raise Exception("ERROR: two different RNAs found with same ID: {0}".format(feat_id))
                        else:
                            exon_count_by_RNA[feat_id] = 0

                    elif feat.type == 'CDS':
                        locus_tag = feat.qualifiers['locus_tag'][0]

                        # If processing a prokaryotic GBK, we'll encounter CDS
                        # before mRNA, so we have to manually make one
                        if current_RNA is None:
                            feat_id = "{0}.mRNA.{1}".format(locus_tag, rna_count_by_gene[locus_tag])
                            mRNA = biothings.mRNA(id=feat_id, parent=current_gene)
                            mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                            gene.add_mRNA(mRNA)
                            current_RNA = mRNA

                        # NOTE(review): nesting reconstructed from collapsed
                        # source -- this increment is taken to run for every
                        # CDS feature, not only when an mRNA was synthesized.
                        exon_count_by_RNA[current_RNA.id] += 1
                        # all parts of one CDS feature share this ID
                        cds_id = "{0}.CDS.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        current_CDS_phase = 0

                        for loc in feat.location.parts:
                            subfmin = int(loc.start)
                            subfmax = int(loc.end)

                            CDS = biothings.CDS(id=cds_id, parent=current_RNA)
                            CDS.locate_on(target=current_assembly, fmin=subfmin, fmax=subfmax,
                                          strand=strand, phase=current_CDS_phase)
                            current_RNA.add_CDS(CDS)

                            # calculate the starting phase for the next CDS
                            # feature (in case there is one)
                            # 0 + 6 = 0     TTGCAT
                            # 0 + 7 = 2     TTGCATG
                            # 1 + 6 = 1     TTGCAT
                            # 2 + 7 = 1     TTGCATG
                            # general: 3 - ((length - previous phase) % 3)
                            current_CDS_phase = 3 - (((subfmax - subfmin) - current_CDS_phase) % 3)
                            if current_CDS_phase == 3:
                                current_CDS_phase = 0

                            # every CDS part also gets a matching exon
                            exon_id = "{0}.exon.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                            exon = biothings.Exon(id=exon_id, parent=current_RNA)
                            exon.locate_on(target=current_assembly, fmin=subfmin, fmax=subfmax, strand=strand)
                            current_RNA.add_exon(exon)
                            exon_count_by_RNA[current_RNA.id] += 1

                    else:
                        print("WARNING: The following feature was skipped:\n{0}".format(feat))
                        features_skipped_count += 1

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            gene.print_as(fh=ofh, source='GenBank', format='gff3')

        if args.fasta and seqs_pending_writes:
            ofh.write("##FASTA\n")

            for assembly_id in assemblies:
                ofh.write(">{0}\n".format(assembly_id))
                ofh.write("{0}\n".format(biocodeutils.wrapped_fasta(assemblies[assembly_id].residues)))

        if features_skipped_count > 0:
            print("Warning: {0} unsupported feature types were skipped".format(features_skipped_count))
    finally:
        # Close only a handle we opened ourselves; never close STDOUT.
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Tally read orientations per transcript from a SAM file.

    The SAM file (``--sam_file``) is read line by line; header lines
    (starting with ``@``) and lines with fewer than five tab-separated
    columns are skipped.  Consecutive lines sharing the same reference name
    (column 3) are treated as one transcript, so the input is expected to be
    grouped by reference.  For every transcript the reads are counted by
    read direction (last character of the read name) and by whether FLAG
    bit 16 is set.

    A transcript is considered correctly oriented when its ``1``/``T`` count
    exceeds its ``1``/``F`` count; all others are recorded for correction.
    A tally line per transcript and overall totals are printed to STDOUT.

    If ``--fasta_in`` is passed, ``--fasta_out`` must be passed too; every
    input sequence is then written out, reverse-complemented when its ID was
    recorded for correction.

    Raises:
        Exception: if ``-fi`` is passed without ``-fo``.
        KeyError: if a read name does not end in ``1`` or ``2``.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    parser.add_argument('-s', '--sam_file', type=str, required=True,
                        help='Input SAM file with reads aligned to reference')
    parser.add_argument(
        '-fi', '--fasta_in', type=str, required=False,
        help=
        'Path to a FASTA file representing sequences that were aligned against. If this is passed, you should also pass the -fo argument'
    )
    parser.add_argument(
        '-fo', '--fasta_out', type=str, required=False,
        help=
        'If passed along with -fi, the orientation-corrected sequences will be written here.'
    )
    args = parser.parse_args()

    seqs = dict()
    out_fh = None

    if args.fasta_in is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta_in)

        if args.fasta_out is not None:
            out_fh = open(args.fasta_out, 'w')
        else:
            raise Exception(
                "ERROR: You must pass a value for -fo if you pass -fi")

    total_read_mappings = 0
    last_transcript_id = None
    counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}
    transcript_count = 0
    orientation_totals = {'correct': 0, 'incorrect': 0}
    transcripts_to_correct = dict()

    def close_out_transcript(transcript_id, tally):
        """Classify one finished transcript and print its tally line."""
        ## determine transcript orientation
        ## Given an RF library, the 1:T count should outnumber the 1:F one
        if tally['1']['T'] > tally['1']['F']:
            orientation_totals['correct'] += 1
        else:
            orientation_totals['incorrect'] += 1
            transcripts_to_correct[transcript_id] = 1

        ## report counts
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(
            transcript_id, tally['1']['T'], tally['1']['F'],
            tally['2']['T'], tally['2']['F']))

    try:
        with open(args.sam_file) as sam_fh:
            for line in sam_fh:
                if line.startswith('@'):
                    continue

                cols = line.split("\t")
                if len(cols) < 5:
                    continue

                read_dir = cols[0][-1]
                transcript_id = cols[2]
                total_read_mappings += 1

                # FLAG bit 16 (0x10) set -> counted under 'T', otherwise 'F'
                seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

                if transcript_id != last_transcript_id:
                    transcript_count += 1

                    if last_transcript_id is not None:
                        close_out_transcript(last_transcript_id, counts)

                    ## reset
                    last_transcript_id = transcript_id
                    counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}

                # Count every read, including the first one of a transcript
                # (which used to be dropped when the tallies were reset).
                counts[read_dir][seq_revcomped] += 1

        # The final transcript has no following line to trigger its
        # evaluation, so handle it explicitly.
        if last_transcript_id is not None:
            close_out_transcript(last_transcript_id, counts)

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in transcripts_to_correct:
                seq['s'] = biocodeutils.reverse_complement(seq['s'])

            out_fh.write(">{0} {2}\n{1}\n".format(
                seq_id, biocodeutils.wrapped_fasta(seq['s']), seq['h']))
    finally:
        if out_fh is not None:
            out_fh.close()

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(
        orientation_totals['correct']))
    print("Transcripts in reverse orientation: {0}".format(
        orientation_totals['incorrect']))
def main():
    """Write gene sequences as FASTA with intron residues lower-cased.

    Genes are read from the GFF3 in ``--input_file``; residues come from
    ``--fasta`` when passed.  Each gene's sequence is upper-cased, then the
    span of every intron of its mRNA is lower-cased.  With ``--type CDS``
    the sequence is additionally trimmed to the range from the first CDS
    ``fmin`` to the last CDS ``fmax``.  Genes with more than one mRNA are
    skipped with a message.  Records are labelled with the gene's
    ``locus_tag`` if set, otherwise its ``id``, and written to
    ``--output_file`` or STDOUT.
    """
    parser = argparse.ArgumentParser(description='')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-f', '--fasta', type=str, required=False,
        help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False,
                        default='mRNA', choices=['mRNA', 'CDS'],
                        help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # Set this to a gene ID to process only that gene and print debug
    # statements for it; leave as None for normal operation, e.g.
    #   debugging_gene = 'D9AE6116893A0D5711D56C0F1E6CF58C'
    debugging_gene = None
    debug_mode = debugging_gene is not None

    if args.fasta is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta)

        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                if debug_mode and gene.id != debugging_gene:
                    continue

                if gene.locus_tag is None:
                    gene_label = gene.id
                else:
                    gene_label = gene.locus_tag

                gene_seq = gene.get_residues().upper()
                gene_loc = gene.location_on(assembly)

                ## we have to do this here because of the coordinates: the
                ## slicing below indexes from gene_loc.fmin, so reverse-strand
                ## sequences are flipped first and flipped back before writing
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                if debug_mode:
                    print(
                        "INFO: Processing gene with length {0} at {1}-{2}".format(
                            len(gene_seq), gene_loc.fmin, gene_loc.fmax))

                if len(gene.mRNAs()) > 1:
                    print(
                        "ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)"
                        .format(gene.id))
                    continue

                # this helps us get where the intron is on the gene
                offset = gene_loc.fmin

                for mRNA in gene.mRNAs():
                    for intron in mRNA.introns(on=assembly):
                        intron_loc = intron.location_on(assembly)
                        rel_start = intron_loc.fmin - offset
                        rel_end = intron_loc.fmax - offset

                        lower_mid = gene_seq[rel_start:rel_end].lower()
                        gene_seq = gene_seq[0:rel_start] + lower_mid + gene_seq[rel_end:]

                        if debug_mode:
                            print("INFO:\tfound intron at {0}-{1}".format(
                                intron_loc.fmin, intron_loc.fmax))
                            print(
                                "INFO:\tlower-casing offset adjusted coordinates: {0}-{1}"
                                .format(rel_start, rel_end))
                            print(
                                "INFO:\tgenerating lower case seq of length: {0}\n"
                                .format(len(lower_mid)))

                    if debug_mode:
                        print("INFO: seq length before CDS processing is: {0}".format(
                            len(gene_seq)))

                    ## do we need to trim down to the CDS range?
                    if args.type == 'CDS':
                        CDSs = sorted(mRNA.CDSs())
                        CDS_min = CDSs[0].location_on(assembly).fmin
                        CDS_max = CDSs[-1].location_on(assembly).fmax

                        if debug_mode:
                            print(
                                "INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}"
                                .format(CDS_max, CDS_min, CDS_max - CDS_min))

                        if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                            fmin_chomp = CDS_min - offset
                            fmax_chomp = gene_loc.fmax - CDS_max

                            if debug_mode:
                                print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                                    gene.id, gene_loc.fmin, gene_loc.fmax,
                                    gene_loc.strand, CDS_min, CDS_max))
                                print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(
                                    fmin_chomp, fmax_chomp))
                                print(
                                    "\tpulling range: gene_seq[{0} : {1}]".format(
                                        fmin_chomp, len(gene_seq) - fmax_chomp))

                            gene_seq = gene_seq[fmin_chomp:len(gene_seq) - fmax_chomp]

                            if debug_mode:
                                print("\tGene {0} CDS seq: {1}".format(
                                    gene.id, gene_seq))

                ## make sure to switch it back
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                # NOTE(review): one record is written per gene, after any mRNA
                # processing, so genes without an mRNA are exported too --
                # confirm that this is the intended nesting.
                ofh.write(">{0}\n{1}\n".format(
                    gene_label, biocodeutils.wrapped_fasta(gene_seq)))
    finally:
        # Close only a handle we opened ourselves; never close STDOUT.
        if ofh is not sys.stdout:
            ofh.close()