def get_residues(self):
    """Extract, cache and return this gene's residues from the molecule it is located on.

    The gene must have exactly one location.  The residues are sliced from
    the located molecule as ``mol.residues[loc.fmin:loc.fmax]`` and, when the
    location's strand is -1, passed through ``biocodeutils.reverse_complement``.

    Side effects:
        Sets ``self.residues`` to the extracted string and ``self.length``
        to its length.

    Returns:
        The extracted residues.

    Raises:
        Exception: if the gene has no location, has more than one location,
            or the molecule it is located on has no stored residues.
    """
    location_count = len(self.locations)

    if location_count == 0:
        raise Exception(
            "ERROR: gene.get_residues() requested but gene {0} isn't located on anything."
            .format(self.id))

    if location_count > 1:
        raise Exception(
            "ERROR: gene {0} is located on multiple molecules. Can't automatically extract the residues."
            .format(self.id))

    loc = self.location()
    mol = loc.on

    # Make sure the molecule has its residues populated.  An unset (None)
    # value is treated like an empty one so the caller gets this explicit
    # message rather than a TypeError from len(None).
    if mol.residues is None or len(mol.residues) <= 0:
        raise Exception(
            "ERROR: gene.get_residues() requested but its molecule {0} has no stored residues"
            .format(mol.id))

    self.residues = mol.residues[loc.fmin:loc.fmax]
    self.length = len(self.residues)

    if loc.strand == -1:
        self.residues = biocodeutils.reverse_complement(self.residues)

    return self.residues
def get_residues(self):
    """Extract, cache and return this CDS's residues from the molecule it is located on.

    The CDS must have exactly one location.  The residues are sliced from
    the located molecule as ``mol.residues[loc.fmin : loc.fmax]`` and, when
    the location's strand is -1, passed through
    ``biocodeutils.reverse_complement``.

    Side effects:
        Sets ``self.residues`` to the extracted string and ``self.length``
        to its length.

    Returns:
        The extracted residues.

    Raises:
        Exception: if the CDS has no location, has more than one location,
            or the molecule it is located on has no stored residues.
    """
    location_count = len(self.locations)

    if location_count == 0:
        raise Exception(
            "ERROR: CDS.get_residues() requested but CDS {0} isn't located on anything.".format(self.id)
        )

    if location_count > 1:
        raise Exception(
            "ERROR: CDS {0} is located on multiple molecules. Can't automatically extract the residues.".format(
                self.id
            )
        )

    loc = self.location()
    mol = loc.on

    # Make sure the molecule has its residues populated.  An unset (None)
    # value is treated like an empty one so the caller gets this explicit
    # message rather than a TypeError from len(None).
    if mol.residues is None or len(mol.residues) <= 0:
        raise Exception(
            "ERROR: CDS.get_residues() requested but its molecule {0} has no stored residues".format(mol.id)
        )

    self.residues = mol.residues[loc.fmin : loc.fmax]
    self.length = len(self.residues)

    if loc.strand == -1:
        self.residues = biocodeutils.reverse_complement(self.residues)

    return self.residues
def main():
    """Reverse or reverse-complement selected records of a multi-FASTA file.

    Command-line options:
        -f / --fasta_file   input FASTA file (required)
        -i / --id_file      file listing one sequence ID per line (required)
        -a / --action       'reverse' or 'revcomp' (required)
        -o / --output_file  output path; STDOUT is used when omitted

    Every sequence read from the FASTA file is written out, with only its ID
    on the header line and the residues wrapped at 60 characters per line.
    Sequences whose ID appears in the ID file are transformed first.
    """
    parser = argparse.ArgumentParser(
        description=
        'Reverse or reverse-complement selected sequences within a multi-FASTA'
    )

    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-i', '--id_file', type=str, required=True,
                        help='Path to file with IDs to process')
    parser.add_argument(
        '-a', '--action', type=str, required=True,
        choices=['reverse', 'revcomp'],
        help='What should be done to the sequences in the ID file')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta_file)

        # A set gives constant-time membership tests in the loop below, and
        # the context manager guarantees the ID file handle is released.
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == 'reverse':
                    seq['s'] = seq['s'][::-1]
                elif args.action == 'revcomp':
                    seq['s'] = biocodeutils.reverse_complement(seq['s'])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq['s']), 60):
                fout.write(seq['s'][i:i + 60] + "\n")
    finally:
        # Only close what this function opened; never close STDOUT.
        if fout is not sys.stdout:
            fout.close()
def main():
    """Reverse or reverse-complement selected records of a multi-FASTA file.

    Command-line options:
        -f / --fasta_file   input FASTA file (required)
        -i / --id_file      file listing one sequence ID per line (required)
        -a / --action       "reverse" or "revcomp" (required)
        -o / --output_file  output path; STDOUT is used when omitted

    Every sequence read from the FASTA file is written out, with only its ID
    on the header line and the residues wrapped at 60 characters per line.
    Sequences whose ID appears in the ID file are transformed first.
    """
    parser = argparse.ArgumentParser(
        description="Reverse or reverse-complement selected sequences within a multi-FASTA"
    )

    parser.add_argument("-f", "--fasta_file", type=str, required=True, help="Path to an input FASTA file")
    parser.add_argument("-i", "--id_file", type=str, required=True, help="Path to file with IDs to process")
    parser.add_argument(
        "-a",
        "--action",
        type=str,
        required=True,
        choices=["reverse", "revcomp"],
        help="What should be done to the sequences in the ID file",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=False,
        default=None,
        help="Optional Path to an output file to be created",
    )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, "wt")

    try:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta_file)

        # A set gives constant-time membership tests in the loop below, and
        # the context manager guarantees the ID file handle is released.
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == "reverse":
                    seq["s"] = seq["s"][::-1]
                elif args.action == "revcomp":
                    seq["s"] = biocodeutils.reverse_complement(seq["s"])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq["s"]), 60):
                fout.write(seq["s"][i : i + 60] + "\n")
    finally:
        # Only close what this function opened; never close STDOUT.
        if fout is not sys.stdout:
            fout.close()
def main():
    """Extract coordinate-defined regions from a multi-FASTA file.

    Command-line options:
        -f / --fasta_file       input FASTA file (required)
        -c / --coords_file      tab-delimited file of coordinates (required)
        -m / --mol_col          column holding the molecule ID (required)
        -x / --start_coord_col  column holding the start position (required)
        -y / --stop_coord_col   column holding the stop position (required)
        -n / --name_col         optional column holding the output record name
        -o / --output_file      output path; STDOUT is used when omitted

    Column numbers are given 1-based on the command line and converted to
    0-based list indexes here.  Rows with fewer than three columns are
    skipped.  Each remaining row is converted with
    ``biocodeutils.humancoords_to_0interbase`` into ``(fmin, fmax, strand)``,
    the slice ``[fmin:fmax]`` is taken from the named molecule, and it is
    passed through ``biocodeutils.reverse_complement`` when strand is -1.
    Records are written wrapped at 60 characters per line.

    Raises:
        Exception: if a row names a molecule ID absent from the FASTA file.
    """
    parser = argparse.ArgumentParser( description='Extract regions from a multi-FASTA file')

    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-c', '--coords_file', type=str, required=True, help='Path to a tab-delimited file with coordinates' )
    parser.add_argument('-m', '--mol_col', type=int, required=True, help='Tabdel file column with molecule identifiers' )
    parser.add_argument('-x', '--start_coord_col', type=int, required=True, help='Tabdel file column with coordinate start positions' )
    parser.add_argument('-y', '--stop_coord_col', type=int, required=True, help='Tabdel file column with coordinate stop positions' )
    parser.add_argument('-n', '--name_col', type=int, required=False, default=None, help='Optional tabdel file column with name for exported fragment' )
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None, help='Optional Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = biocodeutils.fasta_dict_from_file( args.fasta_file )

        # Command-line column numbers are 1-based; list indexes are 0-based.
        start_col = args.start_coord_col - 1
        stop_col = args.stop_coord_col - 1
        mol_col = args.mol_col - 1

        # The context manager releases the coordinates file handle even if
        # a row triggers an error.
        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                line = line.rstrip()
                cols = line.split('\t')

                if len(cols) < 3:
                    continue

                (fmin, fmax, strand) = biocodeutils.humancoords_to_0interbase( int(cols[start_col]), int(cols[stop_col]) )
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception("ERROR: molecule ID ({0}) not found in FASTA file".format(mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if args.name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format( mol_id, fmin, fmax, strand )
                else:
                    seq_id = cols[int(args.name_col) - 1]

                if strand == -1:
                    seq = biocodeutils.reverse_complement(seq)

                ## write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i : i + 60] + "\n")
    finally:
        # Only close what this function opened; never close STDOUT.
        if fout is not sys.stdout:
            fout.close()
def main():
    """Judge transcript orientation from read alignments in a SAM file.

    Command-line options:
        -s  / --sam_file   SAM file of reads aligned to the transcripts (required)
        -fi / --fasta_in   optional FASTA file of the transcripts aligned against
        -fo / --fasta_out  where corrected sequences go; required with -fi

    Alignment lines are expected to be grouped by reference name (column 3):
    a change of reference name closes the tally for the previous transcript.
    For each alignment the last character of the read name selects the mate
    ('1' or '2', the only keys of the tally) and bit 16 of the FLAG column
    decides whether it counts as 'T' or 'F'.  A transcript is counted as
    correctly oriented when its 1:T tally exceeds its 1:F tally; otherwise it
    is queued for correction.  One tab-separated tally line is printed per
    transcript, followed by four summary lines.

    When -fi is given, every sequence is written to -fo, with queued ones
    first passed through ``biocodeutils.reverse_complement``.

    Raises:
        Exception: if -fi is passed without -fo.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    parser.add_argument('-s', '--sam_file', type=str, required=True, help='Input SAM file with reads aligned to reference' )
    parser.add_argument('-fi', '--fasta_in', type=str, required=False, help='Path to a FASTA file representing sequences that were aligned against. If this is passed, you should also pass the -fo argument' )
    parser.add_argument('-fo', '--fasta_out', type=str, required=False, help='If passed along with -fi, the orientation-corrected sequences will be written here.' )
    args = parser.parse_args()

    seqs = dict()
    out_fh = None

    if args.fasta_in is not None:
        seqs = biocodeutils.fasta_dict_from_file( args.fasta_in )

        if args.fasta_out is not None:
            out_fh = open(args.fasta_out, 'w')
        else:
            raise Exception("ERROR: You must pass a value for -fo if you pass -fi")

    total_read_mappings = 0
    last_transcript_id = None
    counts = None
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0
    transcripts_to_correct = dict()

    def close_transcript(finished_id, tally):
        """Classify one finished transcript, record the verdict and print its tallies."""
        nonlocal correct_orientation_count, incorrect_orientation_count

        ## Given an RF library, the 1:T count should outnumber the 1:F one
        if tally['1']['T'] > tally['1']['F']:
            correct_orientation_count += 1
        else:
            incorrect_orientation_count += 1
            transcripts_to_correct[finished_id] = 1

        ## report counts
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(finished_id, tally['1']['T'], tally['1']['F'], tally['2']['T'], tally['2']['F']))

    try:
        with open(args.sam_file) as sam_fh:
            for line in sam_fh:
                if line.startswith('@'):
                    continue

                cols = line.split("\t")
                if len(cols) < 5:
                    continue

                read_dir = cols[0][-1]
                transcript_id = cols[2]
                total_read_mappings += 1

                seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

                if transcript_id != last_transcript_id:
                    if last_transcript_id is not None:
                        close_transcript(last_transcript_id, counts)

                    ## start a fresh tally for the new transcript
                    transcript_count += 1
                    last_transcript_id = transcript_id
                    counts = { '1':{'T':0,'F':0}, '2':{'T':0,'F':0} }

                # Tally every alignment, including the first one seen for a
                # transcript (previously that one was dropped).
                counts[read_dir][seq_revcomped] += 1

        # The final transcript has no following line to trigger its
        # evaluation, so close it explicitly.
        if last_transcript_id is not None:
            close_transcript(last_transcript_id, counts)

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in transcripts_to_correct:
                seq['s'] = biocodeutils.reverse_complement(seq['s'])

            out_fh.write(">{0} {2}\n{1}\n".format(seq_id, biocodeutils.wrapped_fasta(seq['s']), seq['h']))
    finally:
        if out_fh is not None:
            out_fh.close()

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(incorrect_orientation_count))
def main():
    """Judge transcript orientation from read alignments in a SAM file.

    Command-line options:
        -s  / --sam_file   SAM file of reads aligned to the transcripts (required)
        -fi / --fasta_in   optional FASTA file of the transcripts aligned against
        -fo / --fasta_out  where corrected sequences go; required with -fi

    Alignment lines are expected to be grouped by reference name (column 3):
    a change of reference name closes the tally for the previous transcript.
    For each alignment the last character of the read name selects the mate
    ('1' or '2', the only keys of the tally) and bit 16 of the FLAG column
    decides whether it counts as 'T' or 'F'.  A transcript is counted as
    correctly oriented when its 1:T tally exceeds its 1:F tally; otherwise it
    is queued for correction.  One tab-separated tally line is printed per
    transcript, followed by four summary lines.

    When -fi is given, every sequence is written to -fo, with queued ones
    first passed through ``biocodeutils.reverse_complement``.

    Raises:
        Exception: if -fi is passed without -fo.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    parser.add_argument('-s', '--sam_file', type=str, required=True,
                        help='Input SAM file with reads aligned to reference')
    parser.add_argument(
        '-fi', '--fasta_in', type=str, required=False,
        help=
        'Path to a FASTA file representing sequences that were aligned against. If this is passed, you should also pass the -fo argument'
    )
    parser.add_argument(
        '-fo', '--fasta_out', type=str, required=False,
        help=
        'If passed along with -fi, the orientation-corrected sequences will be written here.'
    )
    args = parser.parse_args()

    seqs = dict()
    out_fh = None

    if args.fasta_in is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta_in)

        if args.fasta_out is not None:
            out_fh = open(args.fasta_out, 'w')
        else:
            raise Exception(
                "ERROR: You must pass a value for -fo if you pass -fi")

    total_read_mappings = 0
    last_transcript_id = None
    counts = None
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0
    transcripts_to_correct = dict()

    def close_transcript(finished_id, tally):
        """Classify one finished transcript, record the verdict and print its tallies."""
        nonlocal correct_orientation_count, incorrect_orientation_count

        ## Given an RF library, the 1:T count should outnumber the 1:F one
        if tally['1']['T'] > tally['1']['F']:
            correct_orientation_count += 1
        else:
            incorrect_orientation_count += 1
            transcripts_to_correct[finished_id] = 1

        ## report counts
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(
            finished_id, tally['1']['T'], tally['1']['F'],
            tally['2']['T'], tally['2']['F']))

    try:
        with open(args.sam_file) as sam_fh:
            for line in sam_fh:
                if line.startswith('@'):
                    continue

                cols = line.split("\t")
                if len(cols) < 5:
                    continue

                read_dir = cols[0][-1]
                transcript_id = cols[2]
                total_read_mappings += 1

                seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

                if transcript_id != last_transcript_id:
                    if last_transcript_id is not None:
                        close_transcript(last_transcript_id, counts)

                    ## start a fresh tally for the new transcript
                    transcript_count += 1
                    last_transcript_id = transcript_id
                    counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}

                # Tally every alignment, including the first one seen for a
                # transcript (previously that one was dropped).
                counts[read_dir][seq_revcomped] += 1

        # The final transcript has no following line to trigger its
        # evaluation, so close it explicitly.
        if last_transcript_id is not None:
            close_transcript(last_transcript_id, counts)

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in transcripts_to_correct:
                seq['s'] = biocodeutils.reverse_complement(seq['s'])

            out_fh.write(">{0} {2}\n{1}\n".format(
                seq_id, biocodeutils.wrapped_fasta(seq['s']), seq['h']))
    finally:
        if out_fh is not None:
            out_fh.close()

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(
        correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(
        incorrect_orientation_count))