def main():
  parser = argparse.ArgumentParser(description="Find mapping distance of paired end reads. Takes an ordered (by query) alignment to a transcriptome.\nSomething that works for an input thus far is like:\nhisat --reorder -x mytranscriptome -1 my_1.fastq -2 my_2.fastq | this_script.py -")
  parser.add_argument('input_sam',help="SAMFILE ordered alignment to a transcriptome, or - for stdin")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input_sam != '-':
    inf = open(args.input_sam)
  msr = SamBasics.MultiEntrySamReader(inf)
  spcf = SamBasics.SAMtoPSLconversionFactory()
  data = []
  sys.stderr.write("Pairs Mean Stddev\n")
  while True:
    entries = msr.read_entries()
    if not entries: break
    if len(entries) != 2: continue  # only consider read pairs
    [e1, e2] = entries
    if e1.check_flag(4) or e2.check_flag(4): continue  # skip pairs with an unmapped mate
    if not (e1.check_flag(2) and e2.check_flag(2)): continue  # both mates must be mapped in a proper pair
    if not ((e1.check_flag(64) and e2.check_flag(128)) or (e1.check_flag(128) and e2.check_flag(64))): continue  # need one first-in-pair and one second-in-pair mate
    p1 = spcf.convert_line(e1.get_line())
    p2 = spcf.convert_line(e2.get_line())
    if not p1 or not p2: continue
    p1 = PSLBasics.PSL(p1)
    p2 = PSLBasics.PSL(p2)
    # outer distance spanned by the two mates on the transcript
    dist = max(p2.value('tEnd')-p1.value('tStart'), p1.value('tEnd')-p2.value('tStart'))
    data.append(dist)
    if len(data) < 2: continue
    if len(data) % 1000 == 0:
      sys.stderr.write(str(len(data))+" "+str(int(mean(data)))+" "+str(int(stddev(data)))+" \r")
  sys.stderr.write(str(len(data))+" "+str(int(mean(data)))+" "+str(int(stddev(data)))+" \r")
  sys.stderr.write("\n")
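# mean() and stddev() are called above but are not Python 2 builtins; they are presumably
# defined elsewhere in this script.  A minimal sketch of helpers with the behavior the
# progress report relies on (hypothetical implementations, using the sample standard deviation):
def mean(data):
  # arithmetic mean of a non-empty list of numbers
  return float(sum(data))/len(data)

def stddev(data):
  # sample standard deviation; needs at least two observations
  m = mean(data)
  return (sum((x-m)**2 for x in data)/float(len(data)-1))**0.5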
def main():
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STDIN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  last_fasta = fhr.read_entry()
  if not last_fasta:
    sys.stderr.write("ERROR: No query sequences\n")
    sys.exit()
  for line in inf:
    p = PSLBasics.PSL(line)
    if not p.validate():
      sys.stderr.write("WARNING: skipping invalid PSL entry.  This script fixes improper match, mismatch and gap counts; it doesn't perform miracles.  Problem line:\n"+line.rstrip()+"\n")
      continue
    n = p.value('qName')
    if not last_fasta:
      sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly?\n")
      sys.exit()
    # advance through the (query-name ordered) fasta until we reach this PSL entry's query
    while last_fasta['name'] != n:
      last_fasta = fhr.read_entry()
      if not last_fasta:
        sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly?\n")
        sys.exit()
    p.set_query(last_fasta['seq'])
    p.set_reference_dictionary(g)
    p.correct_stats()
    print p.get_line()
    continue
    # NOTE: everything below this 'continue' is unreachable; it is the older manual
    # recalculation that correct_stats() replaced, retained here as-is.
    f = last_fasta
    nCount = 0
    matches = 0
    misMatches = 0
    prev_qE = 0
    prev_tE = 0
    qNumInsert = 0
    qBaseInsert = 0
    tNumInsert = 0
    tBaseInsert = 0
    for i in range(p.value('blockCount')):
      blen = p.value('blockSizes')[i]
      qS = p.value('qStarts')[i] # query start
      qE = qS + blen             # query end
      tS = p.value('tStarts')[i] # target start
      tE = tS + blen             # target end
      # Work on gaps
      if prev_qE > 0 or prev_tE > 0: # if its not our first time through
        tgap = tS-prev_tE
        if tgap < args.minimum_intron_size and tgap > 0:
          tNumInsert += 1
          tBaseInsert += tgap
        qgap = qS-prev_qE
        if qgap > 0:
          qNumInsert += 1
          qBaseInsert += qgap
      query = f['seq']
      if p.value('strand') == '-':
        query = rc(f['seq'])
      qseq = query[qS:qE].upper()
      rseq = g[p.value('tName')][tS:tE].upper()
      #print qseq+"\n"+rseq+"\n"
      for j in range(0,blen):
        if qseq[j] == 'N': nCount += 1
        elif qseq[j] == rseq[j]: matches += 1
        else: misMatches += 1
      prev_qE = qE
      prev_tE = tE
    p.entry['matches'] = matches
    p.entry['misMatches'] = misMatches
    p.entry['nCount'] = nCount
    p.entry['qNumInsert'] = qNumInsert
    p.entry['qBaseInsert'] = qBaseInsert
    p.entry['tNumInsert'] = tNumInsert
    p.entry['tBaseInsert'] = tBaseInsert
    p.entry['qSize'] = len(query)
    p.entry['tSize'] = len(g[p.value('tName')])
    print p.get_line()
    #p.pretty_print(100)
  fhr.close()
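# read_fasta_into_hash(), rc() and FastaHandleReader come from this codebase's sequence
# utilities rather than the standard library.  Minimal stand-ins with the behavior the
# function above relies on (hypothetical sketches, not the project's own implementations):
def read_fasta_into_hash(fasta_filename):
  # map each sequence name (first whitespace-delimited token after '>') to its full sequence
  seqs = {}
  name = None
  with open(fasta_filename) as inf:
    for line in inf:
      line = line.rstrip()
      if line.startswith('>'):
        name = line[1:].split()[0]
        seqs[name] = ''
      elif name is not None:
        seqs[name] += line
  return seqs

def rc(seq):
  # reverse complement of a DNA sequence; unknown characters become 'N'
  comp = {'A':'T','T':'A','C':'G','G':'C','a':'t','t':'a','c':'g','g':'c','N':'N','n':'n'}
  return ''.join([comp.get(b,'N') for b in reversed(seq)])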
def main():
  parser = argparse.ArgumentParser(description="Analyze ORDERED psl alignments of long reads.")
  parser.add_argument('psl_file',help="Alignment file.  Must be ordered by query name.  Use - for stdin")
  parser.add_argument('-o','--output',help="Write to output file, default is STDOUT")
  parser.add_argument('--noheader',action='store_true')
  parser.add_argument('--minimum_coverage',type=int,help="Only consider alignments with at least this many bp aligned")
  parser.add_argument('--threads',type=int,default=multiprocessing.cpu_count(),help="INT default cpu_count")
  parser.add_argument('--tempbuffer',help="DIRECTORY store the results in a temporary file until they are ready to output.  Suggest using /tmp if you don't know what to use")
  args = parser.parse_args()
  seen_names = set()
  last_name = ''
  buffer = PSLBasics.MultiplePSLAlignments()
  inf = sys.stdin
  if args.psl_file != '-':
    inf = open(args.psl_file)
  global of
  tname = None
  if args.tempbuffer:
    if not args.output:
      sys.stderr.write("ERROR: if you want to buffer outputs in a temp file you need to specify a final output file.\n")
      sys.exit()
    rnum = random.randint(1,1000000000)
    tname = args.tempbuffer.rstrip('/')+'/weirathe.'+str(rnum)+'.meta'
    of = open(tname,'w')
  if args.output and not args.tempbuffer:
    of = open(args.output,'w')
  global lock
  if args.threads > 1:
    pool = multiprocessing.Pool(args.threads)
  for line in inf:
    e = PSLBasics.line_to_entry(line.rstrip())
    if e['qName'] != last_name: # we have a new query name
      if e['qName'] in seen_names:
        sys.stderr.write("ERROR: psl entries are not ordered by query name.\n")
        sys.exit()
      seen_names.add(e['qName'])
      if buffer.get_alignment_count() > 0:
        #process_buffer(buffer)
        if args.threads > 1:
          pool.apply_async(process_buffer,[buffer],callback=print_result)
        else:
          res = process_buffer(buffer)
          print_result(res)
      buffer = PSLBasics.MultiplePSLAlignments()
      if args.minimum_coverage > 1:
        buffer.set_minimum_coverage(args.minimum_coverage)
      last_name = e['qName']
    buffer.add_entry(PSLBasics.PSL(line.rstrip()))
  inf.close()
  if buffer.get_alignment_count() > 0: # process whatever is left in the last buffer
    if args.threads > 1:
      pool.apply_async(process_buffer,[buffer],callback=print_result)
    else:
      res = process_buffer(buffer)
      print_result(res)
  if args.threads > 1:
    pool.close()
    pool.join()
  of.close()
  if args.tempbuffer:
    # copy the buffered results from the temp file to the final output
    of = open(args.output,'w')
    with open(tname) as inf:
      for line in inf:
        of.write(line)
    of.close()
    os.remove(tname)
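# process_buffer() and print_result() are called above but defined elsewhere in the script,
# as are the module-level globals 'of' and 'lock' (presumably sys.stdout and a
# multiprocessing.Lock by default).  Given how they are used with Pool.apply_async, the
# callback is presumably a lock-guarded write along these lines (a hypothetical sketch,
# not the author's exact code):
def print_result(res):
  global of, lock
  if not res: return
  lock.acquire()   # serialize writes between the callback thread and the main thread
  of.write(res)    # assumes process_buffer() returns a ready-to-print string (or None)
  of.flush()
  lock.release()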