def main():
    """Stream a FASTA and report a running mean and standard deviation of sequence lengths.

    Progress is written to stderr (overwriting one line with \r); a final
    summary line is left on stderr when the stream ends.
    """
    parser = argparse.ArgumentParser(
        description="Find mapping distance of paired end reads. Takes an ordered (by query) alignment to a transcriptome.\nSomething that works for an input thus far is like:\nhisat --reorder -x mytranscriptome -1 my_1.fastq -2 my_2.fastq | this_script.py -")
    parser.add_argument('input_fasta', help="FASTAFILE or - for stdin")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input_fasta != '-':
        inf = open(args.input_fasta)
    fh = FastaHandleReader(inf)
    data = []
    sys.stderr.write("Reads Mean Stddev\n")
    while True:
        entry = fh.read_entry()
        if not entry:
            break
        data.append(len(entry['seq']))
        # stddev is undefined for fewer than two observations
        if len(data) < 2:
            continue
        if len(data) % 1000 == 0:
            sys.stderr.write(str(len(data)) + " " + str(int(mean(data))) + " " + str(int(stddev(data))) + " \r")
    # bug fix: the final summary used to run unconditionally and crashed on
    # streams with fewer than two reads
    if len(data) >= 2:
        sys.stderr.write(str(len(data)) + " " + str(int(mean(data))) + " " + str(int(stddev(data))) + " \r")
    sys.stderr.write("\n")
    # bug fix: the input file handle was never closed
    if inf is not sys.stdin:
        inf.close()
def main():
    """Recompute match/mismatch/N statistics of a PSL stream.

    Reads PSL lines (file or stdin), finds each entry's query sequence in a
    name-ordered query FASTA, attaches the reference genome dictionary, and
    reprints each entry followed by a pretty-printed alignment.
    """
    parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
    parser.add_argument('input',help="PSLFILE or - for STIDN")
    parser.add_argument('reference',help="FASTAFILE reference genome")
    parser.add_argument('query',help="FASTAFILE query sequences")
    parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
    args = parser.parse_args()
    # Read in the reference genome
    sys.stderr.write("Reading in reference genome\n")
    g = read_fasta_into_hash(args.reference)
    sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    fhr = FastaHandleReader(open(args.query))
    last_fasta = fhr.read_entry()
    if not last_fasta:
        sys.stderr.write("ERROR: No query sequences\n")
        sys.exit()
    for line in inf:
        p = PSLBasics.PSL(line)
        if not p.validate():
            sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
            # bug fix: the warning says "skipping" but the invalid line used
            # to fall through and be processed anyway
            continue
        n = p.value('qName')
        # advance through the name-ordered query FASTA to this entry
        while last_fasta and last_fasta['name'] != n:
            last_fasta = fhr.read_entry()
        if not last_fasta:
            # bug fix: exhaustion is now checked AFTER the scan; previously the
            # while loop dereferenced None before this error could fire
            sys.stderr.write("ERROR: Ran out of query sequences too soon. Are they sorted properly\n")
            sys.exit()
        p.set_query(last_fasta['seq'])
        p.set_reference_dictionary(g)
        # NOTE(review): unlike the sibling variant of this script, nothing here
        # calls p.correct_stats(); presumably set_query/set_reference_dictionary
        # trigger the recomputation -- confirm against PSLBasics
        print(p.get_line())
        p.pretty_print(50)
    fhr.close()
def main():
    """Rename reads to synthetic PacBio-style ccs names.

    Works on FASTA, FASTQ, or GPD input; optionally writes an
    old-name -> new-name conversion table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help="Use - for STDIN")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--fasta', action='store_true')
    group.add_argument('--fastq', action='store_true')
    group.add_argument('--gpd', action='store_true')
    parser.add_argument('--output_table', help='save coversion to file')
    parser.add_argument('-o', '--output')
    args = parser.parse_args()
    if args.input == '-':
        args.input = sys.stdin
    else:
        args.input = open(args.input)
    if args.output:
        args.output = open(args.output, 'w')
    else:
        args.output = sys.stdout
    # bug fix: the table file is now opened regardless of -o; it used to be
    # opened only together with -o, leaving a bare filename string (and an
    # AttributeError on .write) when output went to stdout
    if args.output_table:
        args.output_table = open(args.output_table, 'w')
    if args.gpd:
        z = 0
        for line in args.input:
            f = line.rstrip().split("\t")
            z += 1
            name = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(z) + '/ccs'
            if args.output_table:
                args.output_table.write(f[0] + "\t" + name + "\n")
            # GPD columns 1 and 2 both carry the name
            f[0] = name
            f[1] = name
            args.output.write("\t".join(f) + "\n")
        if args.output is not sys.stdout:
            args.output.close()
        if args.output_table:
            args.output_table.close()
        return
    if args.fasta:
        args.input = FastaHandleReader(args.input)
    elif args.fastq:
        args.input = FastqHandleReader(args.input)
    z = 0
    while True:
        e = args.input.read_entry()
        if not e:
            break
        z += 1
        name = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(z) + '/ccs'
        if args.fastq:
            # bug fix: a newline now separates the '+' line from the quality
            # string; records used to come out as malformed 3-line FASTQ
            args.output.write('@' + name + "\n" + e['seq'] + "\n" + '+' + "\n" + e['qual'] + "\n")
        elif args.fasta:
            args.output.write('>' + name + "\n" + e['seq'] + "\n")
        if args.output_table:
            args.output_table.write(e['name'] + "\t" + name + "\n")
    if args.output is not sys.stdout:
        args.output.close()
    if args.output_table:
        args.output_table.close()
def main():
    """Compute a genome mappability track by re-aligning every k-length window.

    Builds a four-stage pipeline (hisat -> hisat_to_mapping_count ->
    counts_to_mappability -> bed_tools merge) and feeds every
    fragment_length-sized window of the reference into its head.
    """
    parser = argparse.ArgumentParser(description="Convert a genome to its mappability",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('reference_genome',help="Use - for STDIN")
    parser.add_argument('-k','--fragment_length',type=int,default=36,help="length of fragment to check mappability")
    parser.add_argument('-x','--genome_index',required=True)
    parser.add_argument('--threads',type=int,default=cpu_count(),help="Thread count")
    parser.add_argument('-o','--output',help="set for output file otherwise will be STDOUT")
    parser.add_argument('--type',choices=['mean','median','geometric_mean'],default='mean',help="How to combine window results")
    args = parser.parse_args()
    if args.output:
        args.output = open(args.output,'w')
    else:
        args.output = sys.stdout
    udir = os.path.dirname(os.path.realpath(__file__))
    # Build the pipeline back-to-front so each stage's stdout is available as
    # the previous stage's destination.
    cmd4 = 'bed_tools.py - --merge --break_merge_on_feature'
    p4 = Popen(cmd4.split(),stdin=PIPE,stdout=args.output)
    cmd3 = udir+'/counts_to_mappability.py - --fragment_length '+str(args.fragment_length)
    cmd3 += ' --'+args.type
    p3 = Popen(cmd3.split(),stdin=PIPE,stdout=p4.stdin)
    cmd2 = 'hisat_to_mapping_count.py -'
    p2 = Popen(cmd2.split(),stdin=PIPE,stdout=p3.stdin)
    cmd1 = 'hisat -x '+args.genome_index+' -U - -f --reorder -p '+str(args.threads)
    p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin)
    # bug fix: honor the documented "- for STDIN" option; the path was
    # previously open()ed unconditionally
    inf = sys.stdin if args.reference_genome == '-' else open(args.reference_genome)
    fhr = FastaHandleReader(inf)
    while True:
        e = fhr.read_entry()
        if not e:
            break
        # bug fix: +1 so the final window touching the end of the sequence is
        # emitted; range(0, len-k) dropped the last k-mer
        for i in range(0,len(e['seq'])-args.fragment_length+1):
            p1.stdin.write('>'+e['name']+':'+str(i+1)+'-'+str(i+args.fragment_length)+"\n")
            p1.stdin.write(e['seq'][i:i+args.fragment_length].upper()+"\n")
    # drain the pipeline in order; communicate() closes our write end of each
    p1.communicate()
    p2.communicate()
    p3.communicate()
    p4.communicate()
    if args.output is not sys.stdout:
        args.output.close()
def main():
    """Recompute match/mismatch/N statistics of a PSL stream.

    Reads PSL lines (file or stdin), finds each entry's query sequence in a
    name-ordered query FASTA, attaches the reference genome dictionary, and
    reprints each entry followed by a pretty-printed alignment.
    """
    parser = argparse.ArgumentParser(
        description="Correct the matches/mismatches and Ncount of a PSL file")
    parser.add_argument('input', help="PSLFILE or - for STIDN")
    parser.add_argument('reference', help="FASTAFILE reference genome")
    parser.add_argument('query', help="FASTAFILE query sequences")
    parser.add_argument('--minimum_intron_size', type=int, default=68, help="INT")
    args = parser.parse_args()
    # Read in the reference genome
    sys.stderr.write("Reading in reference genome\n")
    g = read_fasta_into_hash(args.reference)
    sys.stderr.write("Finished reading " + str(len(g.keys())) + " reference sequences\n")
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    fhr = FastaHandleReader(open(args.query))
    last_fasta = fhr.read_entry()
    if not last_fasta:
        sys.stderr.write("ERROR: No query sequences\n")
        sys.exit()
    for line in inf:
        p = PSLBasics.PSL(line)
        if not p.validate():
            sys.stderr.write(
                "WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"
                + line.rstrip() + "\n")
            # bug fix: the warning says "skipping" but the invalid line used
            # to fall through and be processed anyway
            continue
        n = p.value('qName')
        # advance through the name-ordered query FASTA to this entry
        while last_fasta and last_fasta['name'] != n:
            last_fasta = fhr.read_entry()
        if not last_fasta:
            # bug fix: exhaustion is now checked AFTER the scan; previously the
            # while loop dereferenced None before this error could fire
            sys.stderr.write("ERROR: Ran out of query sequences too soon. Are they sorted properly\n")
            sys.exit()
        p.set_query(last_fasta['seq'])
        p.set_reference_dictionary(g)
        # NOTE(review): unlike the sibling variant of this script, nothing here
        # calls p.correct_stats(); presumably set_query/set_reference_dictionary
        # trigger the recomputation -- confirm against PSLBasics
        print(p.get_line())
        p.pretty_print(50)
    fhr.close()
def main():
    """Hunt for primer candidates: abundant k-mers without AT-rich ends.

    Counts k-mers across the input sequences, periodically pruning rare and
    AT-edged k-mers, then collapses near-identical and overlapping candidates
    before printing the survivors with their counts.
    """
    parser = argparse.ArgumentParser(description="Find primers in a sequence")
    parser.add_argument('input', help="FASTA_FILE genome or - for STDIN")
    parser.add_argument('--AT_end_limit', type=int, default=4,
                        help='Maxmimum number of A/T to look for at the end')
    parser.add_argument('--overlap_join', type=int, default=8,
                        help='Join together matches with this much exact overlap')
    parser.add_argument('--end_criteria', type=int, default=5000,
                        help='Stop when you have seen a k-mer this many times')
    parser.add_argument('--total_candidates', type=int, default=100,
                        help='Look at this number of candidates')
    parser.add_argument('--kmersize', type=int, default=18,
                        help='Look at this number of candidates')
    args = parser.parse_args()
    args.input = sys.stdin if args.input == '-' else open(args.input)
    #tx = read_fasta_into_hash(args.transcriptome_fasta)
    totals = {}
    total_length = 0
    lenlow = lenhigh = args.kmersize
    counts = {}
    reader = FastaHandleReader(args.input)
    entry_count = 0
    while True:
        entry = reader.read_entry()
        if not entry:
            break
        entry_count += 1
        sys.stderr.write(str(entry_count) + "\r")
        explode(counts, entry['seq'], lenlow, lenhigh)
        longest = 0
        if entry_count % 20 == 0:
            # periodic prune: drop rare k-mers and those with AT-rich edges
            for kmer in list(counts.keys()):
                if counts[kmer] <= 3:
                    del counts[kmer]
                elif edgeAT(kmer) > args.AT_end_limit:
                    del counts[kmer]
            ranked = sorted(counts, key=counts.get, reverse=True)
            # stop once the second-most-frequent survivor is common enough
            if len(ranked) > 1 and counts[ranked[1]] > args.end_criteria:
                break
    sys.stderr.write("\n")
    # keep only the top candidates, ranked by count
    top = sorted(counts, key=counts.get, reverse=True)[0:args.total_candidates]
    rankedsets = {}
    for rank, kmer in enumerate(top):
        rankedsets[rank] = [kmer, counts[kmer]]
    # collapse highly similar candidate sets until the count stabilizes
    previous = -1
    current = len(rankedsets.keys())
    while current != previous:
        sys.stderr.write(str(current) + " \r")
        previous = current
        reduceranks(rankedsets)  # remove highly similar sets
        current = len(rankedsets.keys())
    sys.stderr.write("\n")
    # now merge candidates that overlap each other, again to a fixed point
    previous = 0
    current = len(rankedsets.keys())
    while current != previous:
        previous = current
        combine_overlapping(rankedsets, args)
        current = len(rankedsets.keys())
    for rank in sorted(rankedsets.keys()):
        print(rankedsets[rank][0] + "\t" + str(rankedsets[rank][1]))
def main():
    """Recompute the match/mismatch/N and gap statistics of a PSL stream.

    Reads PSL lines (file or stdin), finds each entry's query sequence in a
    name-ordered query FASTA, lets PSLBasics recompute the stats, and reprints
    the corrected entry.

    Note: a large unreachable block after the per-line `continue` -- a manual
    re-derivation of matches/mismatches/gaps that duplicated p.correct_stats()
    -- has been removed; it could never execute.
    """
    parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
    parser.add_argument('input',help="PSLFILE or - for STIDN")
    parser.add_argument('reference',help="FASTAFILE reference genome")
    parser.add_argument('query',help="FASTAFILE query sequences")
    parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
    args = parser.parse_args()
    # Read in the reference genome
    sys.stderr.write("Reading in reference genome\n")
    g = read_fasta_into_hash(args.reference)
    sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    fhr = FastaHandleReader(open(args.query))
    last_fasta = fhr.read_entry()
    if not last_fasta:
        sys.stderr.write("ERROR: No query sequences\n")
        sys.exit()
    for line in inf:
        p = PSLBasics.PSL(line)
        if not p.validate():
            sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
            # bug fix: the warning says "skipping" but the invalid line used
            # to fall through and be processed anyway
            continue
        n = p.value('qName')
        # advance through the name-ordered query FASTA to this entry
        while last_fasta and last_fasta['name'] != n:
            last_fasta = fhr.read_entry()
        if not last_fasta:
            # bug fix: exhaustion is now checked AFTER the scan; previously the
            # while loop dereferenced None before this error could fire
            sys.stderr.write("ERROR: Ran out of query sequences too soon. Are they sorted properly\n")
            sys.exit()
        p.set_query(last_fasta['seq'])
        p.set_reference_dictionary(g)
        p.correct_stats()
        print(p.get_line())
    fhr.close()
def main():
    """Locate primer candidates as abundant k-mers lacking AT-rich ends.

    Accumulates k-mer counts over the input, with periodic pruning of rare or
    AT-edged k-mers, then repeatedly collapses similar and overlapping
    candidates before reporting them with their counts.
    """
    parser = argparse.ArgumentParser(description="Find primers in a sequence")
    parser.add_argument('input', help="FASTA_FILE genome or - for STDIN")
    parser.add_argument('--AT_end_limit', type=int, default=4,
                        help='Maxmimum number of A/T to look for at the end')
    parser.add_argument('--overlap_join', type=int, default=8,
                        help='Join together matches with this much exact overlap')
    parser.add_argument('--end_criteria', type=int, default=5000,
                        help='Stop when you have seen a k-mer this many times')
    parser.add_argument('--total_candidates', type=int, default=100,
                        help='Look at this number of candidates')
    parser.add_argument('--kmersize', type=int, default=18,
                        help='Look at this number of candidates')
    args = parser.parse_args()
    if args.input == '-':
        source = sys.stdin
    else:
        source = open(args.input)
    args.input = source
    #tx = read_fasta_into_hash(args.transcriptome_fasta)
    totals = {}
    total_length = 0
    low = args.kmersize
    high = args.kmersize
    counts = {}
    seen = 0
    fasta = FastaHandleReader(args.input)
    while True:
        rec = fasta.read_entry()
        if not rec:
            break
        seen += 1
        sys.stderr.write(str(seen) + "\r")
        explode(counts, rec['seq'], low, high)
        longest = 0
        if seen % 20 != 0:
            continue
        # every 20th record: prune rare k-mers and AT-edged k-mers
        for candidate in list(counts.keys()):
            if counts[candidate] <= 3:
                del counts[candidate]
            else:
                if edgeAT(candidate) > args.AT_end_limit:
                    del counts[candidate]
        ordered = sorted(counts, key=counts.get, reverse=True)
        # once the runner-up is frequent enough, we have seen plenty
        if len(ordered) > 1:
            if counts[ordered[1]] > args.end_criteria:
                break
    sys.stderr.write("\n")
    # rank the surviving k-mers and keep the best total_candidates of them
    rankedsets = {}
    best = sorted(counts, key=counts.get, reverse=True)[0:args.total_candidates]
    for position, candidate in enumerate(best):
        rankedsets[position] = [candidate, counts[candidate]]
    # iterate reduceranks until no more highly similar sets get removed
    before = -1
    after = len(rankedsets.keys())
    while after != before:
        sys.stderr.write(str(after) + " \r")
        before = after
        reduceranks(rankedsets)  # remove highly similar sets
        after = len(rankedsets.keys())
    sys.stderr.write("\n")
    # now iterate combine_overlapping the same way over the best candidates
    before = 0
    after = len(rankedsets.keys())
    while after != before:
        before = after
        combine_overlapping(rankedsets, args)
        after = len(rankedsets.keys())
    for position in sorted(rankedsets.keys()):
        print(rankedsets[position][0] + "\t" + str(rankedsets[position][1]))