def main():
    """Reverse or reverse-complement selected records of a multi-FASTA file.

    Command-line options (parsed with argparse):
      -f/--fasta_file   path to the input FASTA file (required)
      -i/--id_file      path to a file holding one sequence ID per line (required)
      -a/--action       'reverse' or 'revcomp' (required)
      -o/--output_file  path to write to; STDOUT is used when omitted

    Every record read from the FASTA file is written out, 60 residues per
    line. Only records whose ID appears in the ID file are transformed first.
    """
    parser = argparse.ArgumentParser(
        description='Reverse or reverse-complement selected sequences within a multi-FASTA')

    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-i', '--id_file', type=str, required=True,
                        help='Path to file with IDs to process')
    parser.add_argument('-a', '--action', type=str, required=True,
                        choices=['reverse', 'revcomp'],
                        help='What should be done to the sequences in the ID file')
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    # output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # A set keeps the per-record membership test constant-time, and the
        # context manager guarantees the ID file handle is released.
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == 'reverse':
                    seq['s'] = seq['s'][::-1]
                elif args.action == 'revcomp':
                    seq['s'] = utils.reverse_complement(seq['s'])

            # write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq['s']), 60):
                fout.write(seq['s'][i:i + 60] + "\n")
    finally:
        # never close STDOUT; only close a file this function opened
        if fout is not sys.stdout:
            fout.close()
def main():
    """Extract coordinate-defined regions from a multi-FASTA file.

    Command-line options (parsed with argparse):
      -f/--fasta_file       path to the input FASTA file (required)
      -c/--coords_file      tab-delimited file describing the regions (required)
      -m/--mol_col          1-based column holding the molecule identifier
      -x/--start_coord_col  1-based column holding the start coordinate
      -y/--stop_coord_col   1-based column holding the stop coordinate
      -n/--name_col         optional 1-based column naming the exported fragment
      -o/--output_file      path to write to; STDOUT is used when omitted

    Rows with fewer than three tab-separated columns are skipped. Each
    remaining row yields one FASTA record, written 60 residues per line and
    reverse-complemented when the converted coordinates report strand -1.

    Raises:
        Exception: if a row names a molecule absent from the FASTA file.
    """
    parser = argparse.ArgumentParser(description='Extract regions from a multi-FASTA file')

    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-c', '--coords_file', type=str, required=True,
                        help='Path to a tab-delimited file with coordinates')
    parser.add_argument('-m', '--mol_col', type=int, required=True,
                        help='Tabdel file column with molecule identifiers')
    parser.add_argument('-x', '--start_coord_col', type=int, required=True,
                        help='Tabdel file column with coordinate start positions')
    parser.add_argument('-y', '--stop_coord_col', type=int, required=True,
                        help='Tabdel file column with coordinate stop positions')
    parser.add_argument('-n', '--name_col', type=int, required=False, default=None,
                        help='Optional tabdel file column with name for exported fragment')
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    # output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # column options are 1-based on the command line; convert them once
        start_col = args.start_coord_col - 1
        stop_col = args.stop_coord_col - 1
        mol_col = args.mol_col - 1
        name_col = None if args.name_col is None else args.name_col - 1

        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                cols = line.rstrip().split('\t')

                if len(cols) < 3:
                    continue

                (fmin, fmax, strand) = utils.humancoords_to_0interbase(
                    int(cols[start_col]), int(cols[stop_col]))
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception("ERROR: molecule ID ({0}) not found in FASTA file".format(mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format(mol_id, fmin, fmax, strand)
                else:
                    seq_id = cols[name_col]

                if strand == -1:
                    seq = utils.reverse_complement(seq)

                # write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i:i + 60] + "\n")
    finally:
        # never close STDOUT; only close a file this function opened
        if fout is not sys.stdout:
            fout.close()
def main():
    """Reverse or reverse-complement selected records of a multi-FASTA file.

    Command-line options (parsed with argparse):
      -f/--fasta_file   path to the input FASTA file (required)
      -i/--id_file      path to a file holding one sequence ID per line (required)
      -a/--action       "reverse" or "revcomp" (required)
      -o/--output_file  path to write to; STDOUT is used when omitted

    Every record read from the FASTA file is written out, 60 residues per
    line. Only records whose ID appears in the ID file are transformed first.
    """
    parser = argparse.ArgumentParser(
        description="Reverse or reverse-complement selected sequences within a multi-FASTA"
    )

    parser.add_argument("-f", "--fasta_file", type=str, required=True,
                        help="Path to an input FASTA file")
    parser.add_argument("-i", "--id_file", type=str, required=True,
                        help="Path to file with IDs to process")
    parser.add_argument(
        "-a",
        "--action",
        type=str,
        required=True,
        choices=["reverse", "revcomp"],
        help="What should be done to the sequences in the ID file",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=False,
        default=None,
        help="Optional Path to an output file to be created",
    )
    args = parser.parse_args()

    # output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, "wt")

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # A set keeps the per-record membership test constant-time, and the
        # context manager guarantees the ID file handle is released.
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == "reverse":
                    seq["s"] = seq["s"][::-1]
                elif args.action == "revcomp":
                    seq["s"] = utils.reverse_complement(seq["s"])

            # write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq["s"]), 60):
                fout.write(seq["s"][i : i + 60] + "\n")
    finally:
        # never close STDOUT; only close a file this function opened
        if fout is not sys.stdout:
            fout.close()
def get_residues(self):
    """Populate and return this gene's residues from the molecule it sits on.

    The slice [fmin:fmax] of the molecule's residues is stored on
    ``self.residues`` and its length on ``self.length``. When the location's
    strand is -1 the stored residues are replaced by their reverse complement.

    Raises:
        Exception: if the gene has no location, has more than one location,
            or its molecule has no stored residues.
    """
    location_count = len(self.locations)

    if location_count == 0:
        raise Exception("ERROR: gene.get_residues() requested but gene {0} isn't located on anything.".format(self.id))
    if location_count > 1:
        raise Exception("ERROR: gene {0} is located on multiple molecules. Can't automatically extract the residues.".format(self.id))

    gene_loc = self.location()
    molecule = gene_loc.on

    # the molecule must actually carry sequence before we can slice it
    if not len(molecule.residues) > 0:
        raise Exception("ERROR: gene.get_residues() requested but its molecule {0} has no stored residues".format(molecule.id))

    fragment = molecule.residues[gene_loc.fmin:gene_loc.fmax]
    self.residues = fragment
    self.length = len(fragment)

    if gene_loc.strand == -1:
        self.residues = utils.reverse_complement(fragment)

    return self.residues
def main():
    """Extend GFF3 gene models that lack a terminal stop to the next in-frame stop.

    Command-line options (parsed with argparse):
      -i/--input_file        path to the input GFF3 (required)
      -g/--genome_fasta      FASTA for the molecules, when not embedded in the GFF
      -o/--output_gff        optional path for the updated GFF3
      -el/--extension_limit  how far past the mRNA boundary to search (default 100)

    For every mRNA whose translated CDS does not end in '*', the assembly is
    scanned codon by codon away from the CDS end (downward on the reverse
    strand) until a stop codon is found, the extension limit is passed, or
    the scan position drops below 1. Progress and summary counts are printed
    to STDOUT. The GFF3 is written only when --output_gff was supplied.
    """
    parser = argparse.ArgumentParser(description='Extends GFF gene models to the first in-frame stop')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False,
                        help='Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    parser.add_argument('-el', '--extension_limit', type=int, required=False, default=100,
                        help='Optional. Limits how far an extension will happen looking for an in-frame stop codon')
    args = parser.parse_args()

    (assemblies, _features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}".format(coding_seq[-3:], translation[-3:]))

                mRNA_loc = mRNA.location_on(assembly)
                CDSs = sorted(mRNA.CDSs())
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print("\tmRNA:{0}-{1} ({3}), CDS end: {2}. Extending ... \n\t".format(
                    mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos, mRNA_loc.strand), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                # NOTE(review): this first step is applied on BOTH strands, so on
                # the forward strand the first codon inspected begins 3 bp past
                # the in-frame CDS end -- confirm that is intended.
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print(" Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        print(" Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos, CDS_pos - 3), end='')
                        else:
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos - 3, CDS_pos), end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                mRNA.extend_stop(on=assembly, to=(CDS_pos + 3))
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional; opening None would raise TypeError, so only
    # write (and reliably close) the file when a path was actually given.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Tally read orientations per transcript from a SAM file.

    Command-line options (parsed with argparse):
      -s/--sam_file    SAM file of reads aligned to the reference (required)
      -fi/--fasta_in   optional FASTA of the sequences that were aligned against
      -fo/--fasta_out  where orientation-corrected sequences are written;
                       required whenever -fi is given

    Header lines (starting with '@') and lines with fewer than five
    tab-separated columns are ignored. Alignments are grouped by consecutive
    runs of the same reference name (column 3). The last character of the
    read name selects the '1' or '2' tally, and FLAG bit 16 selects 'T' or
    'F'. A transcript counts as correctly oriented when its 1:T count exceeds
    its 1:F count; all others are reverse-complemented in the FASTA output.

    Raises:
        Exception: if -fi is passed without -fo.
        KeyError: if a read name does not end in '1' or '2'.
    """
    parser = argparse.ArgumentParser(
        description='Reports per-transcript read orientation from a SAM file and optionally writes orientation-corrected FASTA')

    parser.add_argument('-s', '--sam_file', type=str, required=True,
                        help='Input SAM file with reads aligned to reference')
    parser.add_argument('-fi', '--fasta_in', type=str, required=False,
                        help='Path to a FASTA file representing sequences that were aligned against. If this is passed, you should also pass the -fo argument')
    parser.add_argument('-fo', '--fasta_out', type=str, required=False,
                        help='If passed along with -fi, the orientation-corrected sequences will be written here.')
    args = parser.parse_args()

    if args.fasta_in is not None and args.fasta_out is None:
        raise Exception("ERROR: You must pass a value for -fo if you pass -fi")

    seqs = dict()
    if args.fasta_in is not None:
        seqs = utils.fasta_dict_from_file(args.fasta_in)

    def new_counts():
        # fresh tally keyed by read direction, then by whether SEQ was revcomped
        return {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}

    orientation_totals = {True: 0, False: 0}
    transcripts_to_correct = set()

    def finish_transcript(transcript_id, tally):
        # Classify one completed transcript and print its tally line.
        # Given an RF library, the 1:T count should outnumber the 1:F one
        correctly_oriented = tally['1']['T'] > tally['1']['F']
        orientation_totals[correctly_oriented] += 1

        if not correctly_oriented:
            transcripts_to_correct.add(transcript_id)

        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(
            transcript_id, tally['1']['T'], tally['1']['F'],
            tally['2']['T'], tally['2']['F']))

    total_read_mappings = 0
    transcript_count = 0
    last_transcript_id = None
    counts = new_counts()

    with open(args.sam_file) as sam_fh:
        for line in sam_fh:
            if line.startswith('@'):
                continue

            cols = line.split("\t")
            if len(cols) < 5:
                continue

            read_dir = cols[0][-1]
            transcript_id = cols[2]
            total_read_mappings += 1
            seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

            if transcript_id != last_transcript_id:
                transcript_count += 1

                if last_transcript_id is not None:
                    finish_transcript(last_transcript_id, counts)

                last_transcript_id = transcript_id
                counts = new_counts()

            # Count every read, including the one that opened a new
            # transcript (that read used to be dropped from the tally).
            counts[read_dir][seq_revcomped] += 1

    # The final transcript has no successor to trigger its report.
    if last_transcript_id is not None:
        finish_transcript(last_transcript_id, counts)

    if args.fasta_in is not None:
        with open(args.fasta_out, 'w') as out_fh:
            for seq_id in seqs:
                seq = seqs[seq_id]

                if seq_id in transcripts_to_correct:
                    seq['s'] = utils.reverse_complement(seq['s'])

                out_fh.write(">{0} {2}\n{1}\n".format(seq_id, utils.wrapped_fasta(seq['s']), seq['h']))

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(orientation_totals[True]))
    print("Transcripts in reverse orientation: {0}".format(orientation_totals[False]))
def main():
    """Extend GFF3 gene models that lack a terminal stop to the next in-frame stop.

    Command-line options (parsed with argparse):
      -i/--input_file        path to the input GFF3 (required)
      -g/--genome_fasta      FASTA for the molecules, when not embedded in the GFF
      -o/--output_gff        optional path for the updated GFF3
      -el/--extension_limit  how far past the mRNA boundary to search (default 100)

    For every mRNA whose translated CDS does not end in '*', the assembly is
    scanned codon by codon away from the CDS end (downward on the reverse
    strand) until a stop codon is found, the extension limit is passed, or
    the scan position drops below 1. Progress and summary counts are printed
    to STDOUT. The GFF3 is written only when --output_gff was supplied.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    parser.add_argument(
        '-el',
        '--extension_limit',
        type=int,
        required=False,
        default=100,
        help=
        'Optional. Limits how far an extension will happen looking for an in-frame stop codon'
    )
    args = parser.parse_args()

    (assemblies, _features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print(
                    "\tcoding sequence ends with {0}, last three a.a.: {1}"
                    .format(coding_seq[-3:], translation[-3:]))

                mRNA_loc = mRNA.location_on(assembly)
                CDSs = sorted(mRNA.CDSs())
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(
                        assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(
                        assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print(
                    "\tmRNA:{0}-{1} ({3}), CDS end: {2}. Extending ... \n\t"
                    .format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos,
                            mRNA_loc.strand),
                    end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                # NOTE(review): this first step is applied on BOTH strands, so on
                # the forward strand the first codon inspected begins 3 bp past
                # the in-frame CDS end -- confirm that is intended.
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (
                            mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print(" Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        print(" Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(
                                next_codon, CDS_pos, CDS_pos - 3),
                                  end='')
                        else:
                            print(".{0}({1}-{2})".format(
                                next_codon, CDS_pos - 3, CDS_pos),
                                  end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                mRNA.extend_stop(on=assembly,
                                                 to=(CDS_pos + 3))
                                print(
                                    " Found a stop, extending to: {0} ({1})"
                                    .format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(
                                    " Found a stop, extending to: {0} ({1})"
                                    .format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos: SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional; opening None would raise TypeError, so only
    # write (and reliably close) the file when a path was actually given.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Extract coordinate-defined regions from a multi-FASTA file.

    Command-line options (parsed with argparse):
      -f/--fasta_file       path to the input FASTA file (required)
      -c/--coords_file      tab-delimited file describing the regions (required)
      -m/--mol_col          1-based column holding the molecule identifier
      -x/--start_coord_col  1-based column holding the start coordinate
      -y/--stop_coord_col   1-based column holding the stop coordinate
      -n/--name_col         optional 1-based column naming the exported fragment
      -o/--output_file      path to write to; STDOUT is used when omitted

    Rows with fewer than three tab-separated columns are skipped. Each
    remaining row yields one FASTA record, written 60 residues per line and
    reverse-complemented when the converted coordinates report strand -1.

    Raises:
        Exception: if a row names a molecule absent from the FASTA file.
    """
    parser = argparse.ArgumentParser(
        description='Extract regions from a multi-FASTA file')

    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-c',
                        '--coords_file',
                        type=str,
                        required=True,
                        help='Path to a tab-delimited file with coordinates')
    parser.add_argument('-m',
                        '--mol_col',
                        type=int,
                        required=True,
                        help='Tabdel file column with molecule identifiers')
    parser.add_argument(
        '-x',
        '--start_coord_col',
        type=int,
        required=True,
        help='Tabdel file column with coordinate start positions')
    parser.add_argument(
        '-y',
        '--stop_coord_col',
        type=int,
        required=True,
        help='Tabdel file column with coordinate stop positions')
    parser.add_argument(
        '-n',
        '--name_col',
        type=int,
        required=False,
        default=None,
        help='Optional tabdel file column with name for exported fragment')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    # output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # column options are 1-based on the command line; convert them once
        start_col = args.start_coord_col - 1
        stop_col = args.stop_coord_col - 1
        mol_col = args.mol_col - 1
        name_col = None if args.name_col is None else args.name_col - 1

        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                cols = line.rstrip().split('\t')

                if len(cols) < 3:
                    continue

                (fmin, fmax, strand) = utils.humancoords_to_0interbase(
                    int(cols[start_col]), int(cols[stop_col]))
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception(
                        "ERROR: molecule ID ({0}) not found in FASTA file".format(
                            mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format(mol_id, fmin, fmax,
                                                        strand)
                else:
                    seq_id = cols[name_col]

                if strand == -1:
                    seq = utils.reverse_complement(seq)

                # write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i:i + 60] + "\n")
    finally:
        # never close STDOUT; only close a file this function opened
        if fout is not sys.stdout:
            fout.close()