def main():
    """Rewrite a SAM stream so splicemap-format consumers can read it.

    Header lines are echoed unchanged.  Alignments with flag bit 4 set
    (unmapped) are dropped.  For every remaining alignment the flag column is
    collapsed to "16" when bit 16 is set and to "0" otherwise, and the
    tab-separated columns at indices 4, 6, 7 and 8 are overwritten with
    "0", "*", "0" and "0".  Output goes to STDOUT.
    """
    parser = argparse.ArgumentParser(description="Make sam file compatible with tools counting on a splicemap format sam file.")
    parser.add_argument('in_sam', help="FILENAME of sam file, or '-' for STDIN")
    args = parser.parse_args()

    if args.in_sam == '-':
        source = sys.stdin
    else:
        source = open(args.in_sam)

    emit = sys.stdout.write
    for raw in source:
        record = raw.rstrip()
        if SamBasics.is_header(record):
            emit(record + "\n")
            continue
        columns = record.split("\t")
        flag = SamBasics.sam_line_to_dictionary(record)['flag']
        if SamBasics.check_flag(flag, 4):
            # skip the unmapped reads
            continue
        columns[1] = "16" if SamBasics.check_flag(flag, 16) else "0"
        columns[4] = "0"
        columns[6] = "*"
        columns[7] = "0"
        columns[8] = "0"
        emit("\t".join(columns) + "\n")
def main():
    """Rewrite a SAM stream so splicemap-format consumers can read it.

    Header lines are echoed unchanged.  Alignments with flag bit 4 set
    (unmapped) are dropped.  For every remaining alignment the flag column is
    collapsed to "16" when bit 16 is set and to "0" otherwise, and the
    tab-separated columns at indices 4, 6, 7 and 8 are overwritten with
    "0", "*", "0" and "0".  Output goes to STDOUT.
    """
    parser = argparse.ArgumentParser(
        description="Make sam file compatible with tools counting on a splicemap format sam file.")
    parser.add_argument('in_sam', help="FILENAME of sam file, or '-' for STDIN")
    args = parser.parse_args()

    if args.in_sam == '-':
        source = sys.stdin
    else:
        source = open(args.in_sam)

    emit = sys.stdout.write
    for raw in source:
        record = raw.rstrip()
        if SamBasics.is_header(record):
            emit(record + "\n")
            continue
        columns = record.split("\t")
        flag = SamBasics.sam_line_to_dictionary(record)['flag']
        if SamBasics.check_flag(flag, 4):
            # skip the unmapped reads
            continue
        columns[1] = "16" if SamBasics.check_flag(flag, 16) else "0"
        columns[4] = "0"
        columns[6] = "*"
        columns[7] = "0"
        columns[8] = "0"
        emit("\t".join(columns) + "\n")
def get_exons_from_seqs(seqs, d, spcf, min_intron_size=68):
    """Render every alignment of one read as tab-separated exon lines.

    Args:
        seqs: list of ``[qname, rname, strand, pos, cigar]`` lists.  The first
            element is reported as primary ('P'), all later ones as
            secondary ('S').
        d: SAM entry dictionary used as a template; it is copied, never
            modified.
        spcf: converter whose ``convert_line`` turns a SAM line into a PSL
            line.
        min_intron_size: deletions shorter than this count as indels, and the
            same value is passed to ``GenePredBasics.smooth_gaps``.  Defaults
            to 68, the value that used to be hard-coded.

    Returns:
        One newline-terminated line per exon with the columns chrom, exon
        start, exon end, strand, name, possible matches, indels, P/S and
        query start.  An empty string when ``seqs`` is empty.
    """
    skips = frozenset(['H', 'D', 'N'])  # ops excluded from the placeholder sequence length
    rows = []
    for sind, seq in enumerate(seqs, 1):
        psec = 'P' if sind == 1 else 'S'  # primary or secondary
        d1 = d.copy()
        d1['rname'] = seq[1]
        d1['flag'] = 0 if seq[2] == '+' else 16
        d1['pos'] = seq[3]
        d1['cigar'] = seq[4]
        cigar = SamBasics.parse_cigar(seq[4])
        d1['cigar_array'] = cigar

        # A leading clip gives the query offset; guard against an empty
        # parsed CIGAR instead of raising IndexError.
        qstart = 0
        if cigar and cigar[0]['op'] in ('S', 'H'):
            qstart = cigar[0]['val']

        total_length = 0
        possible_matches = 0
        indels = 0
        for ce in cigar:
            op = ce['op']
            val = ce['val']
            if op not in skips:
                total_length += val
            if op == 'M':
                possible_matches += val
            elif op == 'I':
                indels += val
            elif op == 'D' and val < min_intron_size:
                indels += val

        # Only the length matters to the converters, so use a placeholder.
        d1['seq'] = 'N' * total_length
        nline = SamBasics.entry_to_line(d1)
        pline = spcf.convert_line(nline)
        pentry = PSLBasics.line_to_entry(pline)
        gline = PSLBasics.convert_entry_to_genepred_line(pentry)
        gentry = GenePredBasics.line_to_entry(gline)
        gsmooth = GenePredBasics.smooth_gaps(gentry, min_intron_size)

        tail = "\t".join([gsmooth['strand'], gsmooth['name'],
                          str(possible_matches), str(indels),
                          psec, str(qstart)]) + "\n"
        for i, exon_start in enumerate(gsmooth['exonStarts']):
            rows.append(gsmooth['chrom'] + "\t" + str(exon_start) + "\t"
                        + str(gsmooth['exonEnds'][i]) + "\t" + tail)
    # Join once instead of growing a string inside the loops.
    return "".join(rows)
def get_exons_from_seqs(seqs, d, spcf, min_intron_size=68):
    """Render every alignment of one read as tab-separated exon lines.

    Args:
        seqs: list of ``[qname, rname, strand, pos, cigar]`` lists.  The first
            element is reported as primary ('P'), all later ones as
            secondary ('S').
        d: SAM entry dictionary used as a template; it is copied, never
            modified.
        spcf: converter whose ``convert_line`` turns a SAM line into a PSL
            line.
        min_intron_size: deletions shorter than this count as indels, and the
            same value is passed to ``GenePredBasics.smooth_gaps``.  Defaults
            to 68, the value that used to be hard-coded.

    Returns:
        One newline-terminated line per exon with the columns chrom, exon
        start, exon end, strand, name, possible matches, indels, P/S and
        query start.  An empty string when ``seqs`` is empty.
    """
    skips = frozenset(['H', 'D', 'N'])  # ops excluded from the placeholder sequence length
    rows = []
    for sind, seq in enumerate(seqs, 1):
        psec = 'P' if sind == 1 else 'S'  # primary or secondary
        d1 = d.copy()
        d1['rname'] = seq[1]
        d1['flag'] = 0 if seq[2] == '+' else 16
        d1['pos'] = seq[3]
        d1['cigar'] = seq[4]
        cigar = SamBasics.parse_cigar(seq[4])
        d1['cigar_array'] = cigar

        # A leading clip gives the query offset; guard against an empty
        # parsed CIGAR instead of raising IndexError.
        qstart = 0
        if cigar and cigar[0]['op'] in ('S', 'H'):
            qstart = cigar[0]['val']

        total_length = 0
        possible_matches = 0
        indels = 0
        for ce in cigar:
            op = ce['op']
            val = ce['val']
            if op not in skips:
                total_length += val
            if op == 'M':
                possible_matches += val
            elif op == 'I':
                indels += val
            elif op == 'D' and val < min_intron_size:
                indels += val

        # Only the length matters to the converters, so use a placeholder.
        d1['seq'] = 'N' * total_length
        nline = SamBasics.entry_to_line(d1)
        pline = spcf.convert_line(nline)
        pentry = PSLBasics.line_to_entry(pline)
        gline = PSLBasics.convert_entry_to_genepred_line(pentry)
        gentry = GenePredBasics.line_to_entry(gline)
        gsmooth = GenePredBasics.smooth_gaps(gentry, min_intron_size)

        tail = "\t".join([gsmooth['strand'], gsmooth['name'],
                          str(possible_matches), str(indels),
                          psec, str(qstart)]) + "\n"
        for i, exon_start in enumerate(gsmooth['exonStarts']):
            rows.append(gsmooth['chrom'] + "\t" + str(exon_start) + "\t"
                        + str(gsmooth['exonEnds'][i]) + "\t" + tail)
    # Join once instead of growing a string inside the loops.
    return "".join(rows)
def make_exons(args, thread_index, thread_count):
    """Write the exon lines for this worker's share of the alignments.

    Mapped alignments are streamed from ``samtools view -F 4``.  Line number
    ``z`` (counted from 1) is handled by this worker only when
    ``z % thread_count == thread_index``.  Each handled alignment, plus its
    ``XA:Z:`` alternatives when ``args.use_secondary_alignments`` is set, is
    converted by ``get_exons_from_seqs`` and appended to
    ``<args.tempdir>/bedpart.<thread_index>.bed``.

    Args:
        args: parsed options; reads ``sam_file``, ``reference_genome``,
            ``tempdir`` and ``use_secondary_alignments``.
        thread_index: index of this worker.
        thread_count: total number of workers.

    Raises:
        SystemExit: with status 1 when an ``XA:Z:`` entry cannot be parsed.
    """
    # Pass an argument list rather than a shell string so file names that
    # contain spaces or shell metacharacters are handed to samtools verbatim.
    cmd = ['samtools', 'view', '-F', '4']
    if not re.search(r'\.bam$', args.sam_file):
        cmd.append('-S')
    cmd.append(args.sam_file)

    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.reference_genome:
        spcf.set_genome(args.reference_genome)

    xa_tag = re.compile(r'XA:Z:(\S+)')
    xa_entry = re.compile(r'([^,]+),([+-])(\d+),([^,]+)')

    fname = args.tempdir + '/bedpart.' + str(thread_index) + '.bed'
    sampipe = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        with open(fname, 'w') as of:
            z = 0
            for line in sampipe.stdout:
                z += 1
                if z % thread_count != thread_index:
                    continue
                line = line.rstrip()
                if SamBasics.is_header(line):
                    continue
                d = SamBasics.sam_line_to_dictionary(line)
                strand = '-' if SamBasics.check_flag(d['flag'], 16) else '+'
                seqs = [[d['qname'], d['rname'], strand, d['pos'], d['cigar']]]
                m = xa_tag.search(line)
                if m and args.use_secondary_alignments:
                    for secondary in m.group(1).rstrip(";").split(";"):
                        m1 = xa_entry.match(secondary)
                        if not m1:
                            sys.stderr.write("strange secondary format " + secondary + "\n")
                            # Non-zero status so callers can see the failure.
                            sys.exit(1)
                        seqs.append([d['qname'], m1.group(1), m1.group(2),
                                     int(m1.group(3)), m1.group(4)])
                of.write(get_exons_from_seqs(seqs, d, spcf))
    finally:
        # Close our end of the pipe and reap the child on every exit path.
        sampipe.stdout.close()
        sampipe.wait()
def main():
    """Count distinct mapped mates in a SAM/BAM file.

    Every mapped alignment contributes ``<qname>.1`` (flag bit 64) or
    ``<qname>.2`` (flag bit 128) to a ``sort | uniq | wc -l`` pipeline, so the
    result is the number of distinct mapped mates.  A mapped alignment with
    neither bit set aborts the run.  With ``--add_report`` the count is
    written to ``<input minus .sam/.bam>.mapped_reads``; otherwise it is
    printed to STDOUT.

    Raises:
        SystemExit: with status 1 on an unusable input name or an alignment
            that is neither first nor second mate.
    """
    parser = argparse.ArgumentParser(
        description="Get read counts from sam or bam.")
    parser.add_argument('input', help="FILENAME sam or bam")
    # The help text now names the suffix that is actually written below.
    parser.add_argument(
        '--add_report',
        action='store_true',
        help="make a new file where we replace sam or bam with a .mapped_reads")
    args = parser.parse_args()

    baseinput = None
    if args.add_report:
        m = re.match(r'(.+)\.[bs]am', args.input)
        if not m:
            sys.stderr.write("bad inputfile type should be .bam or .sam\n")
            sys.exit(1)
        baseinput = m.group(1)

    # Argument list instead of a shell string: the file name is passed to
    # samtools untouched even if it contains spaces or metacharacters.
    view_cmd = ['samtools', 'view']
    if re.search(r'\.sam$', args.input):
        view_cmd.append('-S')
    view_cmd.append(args.input)

    p = subprocess.Popen('sort | uniq | wc -l',
                         shell=True,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    viewer = subprocess.Popen(view_cmd, stdout=subprocess.PIPE)
    z = 0
    try:
        for line in viewer.stdout:
            z += 1
            if z % 100000 == 0:
                sys.stderr.write(str(z) + " alignments processed\r")
            line = line.rstrip()
            d = SamBasics.sam_line_to_dictionary(line)
            if SamBasics.check_flag(d['flag'], 4):
                continue
            if SamBasics.check_flag(d['flag'], 64):
                p.stdin.write(d['qname'] + '.1' + "\n")
            elif SamBasics.check_flag(d['flag'], 128):
                p.stdin.write(d['qname'] + '.2' + "\n")
            else:
                sys.stderr.write("Unrecognized\n")
                sys.exit(1)
    finally:
        viewer.stdout.close()
        viewer.wait()
    sys.stderr.write("\n")

    # communicate() closes the pipeline's stdin and collects the wc output.
    aligned_reads = int(p.communicate()[0].rstrip())
    if args.add_report:
        with open(baseinput + '.mapped_reads', 'w') as of:
            of.write(str(aligned_reads) + "\n")
        return
    sys.stdout.write(str(aligned_reads) + "\n")
def main():
    """Report mean and stddev of the mapping distance of read pairs.

    Reads query-grouped SAM entries through ``SamBasics.MultiEntrySamReader``
    and keeps only groups of exactly two entries in which both are mapped
    (flag bit 4 clear), both are flagged as properly paired (bit 2) and one
    is the first mate (bit 64) and the other the second mate (bit 128).  The
    distance of a pair is the larger of ``p2.tEnd - p1.tStart`` and
    ``p1.tEnd - p2.tStart`` taken from the PSL conversions.  Progress and the
    final statistics go to STDERR.
    """
    parser = argparse.ArgumentParser(
        description="Find mapping distance of paired end reads. Takes an ordered (by query) alignment to a transcriptome.\nSomething that works for an input thus far is like:\nhisat --reorder -x mytranscriptome -1 my_1.fastq -2 my_2.fastq | this_script.py -")
    parser.add_argument(
        'input_sam',
        help="SAMFILE ordered alignment a transcriptome or - for stdin")
    args = parser.parse_args()

    def stats_line(values):
        # One progress row: pair count, integer mean, integer stddev.
        return (str(len(values)) + " " + str(int(mean(values))) + " "
                + str(int(stddev(values))) + " \r")

    inf = sys.stdin
    if args.input_sam != '-':
        inf = open(args.input_sam)
    data = []
    try:
        msr = SamBasics.MultiEntrySamReader(inf)
        spcf = SamBasics.SAMtoPSLconversionFactory()
        sys.stderr.write("Pairs Mean Stddev\n")
        while True:
            entries = msr.read_entries()
            if not entries:
                break
            if len(entries) != 2:
                continue
            e1, e2 = entries
            if e1.check_flag(4) or e2.check_flag(4):
                continue
            # Require BOTH mates to be properly paired.  The previous
            # `not e1.check_flag(2) and e2.check_flag(2)` parsed as
            # `(not a) and b`, which let pairs through when neither mate
            # was properly paired.
            if not (e1.check_flag(2) and e2.check_flag(2)):
                continue
            if not ((e1.check_flag(64) and e2.check_flag(128)) or
                    (e1.check_flag(128) and e2.check_flag(64))):
                continue
            p1 = spcf.convert_line(e1.get_line())
            p2 = spcf.convert_line(e2.get_line())
            if not p1 or not p2:
                continue
            p1 = PSLBasics.PSL(p1)
            p2 = PSLBasics.PSL(p2)
            dist = max(
                p2.value('tEnd') - p1.value('tStart'),
                p1.value('tEnd') - p2.value('tStart'))
            data.append(dist)
            if len(data) >= 2 and len(data) % 1000 == 0:
                sys.stderr.write(stats_line(data))
    finally:
        if inf is not sys.stdin:
            inf.close()

    # The in-loop report is only produced for two or more pairs; apply the
    # same guard here so an input with 0 or 1 usable pairs does not fail
    # while computing the statistics.
    if len(data) >= 2:
        sys.stderr.write(stats_line(data))
    else:
        sys.stderr.write(str(len(data)) + " pairs; too few to compute mean and stddev")
    sys.stderr.write("\n")
def make_exons(args, thread_index, thread_count):
    """Write the exon lines for this worker's share of the alignments.

    Mapped alignments are streamed from ``samtools view -F 4``.  Line number
    ``z`` (counted from 1) is handled by this worker only when
    ``z % thread_count == thread_index``.  Each handled alignment, plus its
    ``XA:Z:`` alternatives when ``args.use_secondary_alignments`` is set, is
    converted by ``get_exons_from_seqs`` and appended to
    ``<args.tempdir>/bedpart.<thread_index>.bed``.

    Args:
        args: parsed options; reads ``sam_file``, ``reference_genome``,
            ``tempdir`` and ``use_secondary_alignments``.
        thread_index: index of this worker.
        thread_count: total number of workers.

    Raises:
        SystemExit: with status 1 when an ``XA:Z:`` entry cannot be parsed.
    """
    # Pass an argument list rather than a shell string so file names that
    # contain spaces or shell metacharacters are handed to samtools verbatim.
    cmd = ['samtools', 'view', '-F', '4']
    if not re.search(r'\.bam$', args.sam_file):
        cmd.append('-S')
    cmd.append(args.sam_file)

    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.reference_genome:
        spcf.set_genome(args.reference_genome)

    xa_tag = re.compile(r'XA:Z:(\S+)')
    xa_entry = re.compile(r'([^,]+),([+-])(\d+),([^,]+)')

    fname = args.tempdir + '/bedpart.' + str(thread_index) + '.bed'
    sampipe = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        with open(fname, 'w') as of:
            z = 0
            for line in sampipe.stdout:
                z += 1
                if z % thread_count != thread_index:
                    continue
                line = line.rstrip()
                if SamBasics.is_header(line):
                    continue
                d = SamBasics.sam_line_to_dictionary(line)
                strand = '-' if SamBasics.check_flag(d['flag'], 16) else '+'
                seqs = [[d['qname'], d['rname'], strand, d['pos'], d['cigar']]]
                m = xa_tag.search(line)
                if m and args.use_secondary_alignments:
                    for secondary in m.group(1).rstrip(";").split(";"):
                        m1 = xa_entry.match(secondary)
                        if not m1:
                            sys.stderr.write("strange secondary format " + secondary + "\n")
                            # Non-zero status so callers can see the failure.
                            sys.exit(1)
                        seqs.append([d['qname'], m1.group(1), m1.group(2),
                                     int(m1.group(3)), m1.group(4)])
                of.write(get_exons_from_seqs(seqs, d, spcf))
    finally:
        # Close our end of the pipe and reap the child on every exit path.
        sampipe.stdout.close()
        sampipe.wait()
def main():
    """Convert a PSL file (plain or gzipped) into SAM.

    A SAM header built from the reference FASTA is written first, then one
    SAM line per PSL line for which ``pscf.convert_line`` returns a value.
    Lines for which it returns nothing are counted and reported on STDERR.
    Output goes to the ``-o`` file when given, otherwise to STDOUT.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('psl', help="FILENAME of psl file (can be gzipped)")
    parser.add_argument('refgenome', help="FASTA of the reference genome")
    parser.add_argument('--min_intron_size', default=68, type=int, help="INT minimum intron size")
    parser.add_argument('--fastq_reads', help="FASTQ of the reads")
    parser.add_argument('--fasta_reads', help="FASTA of the reads")
    parser.add_argument('--skip_directionless_splice', action='store_true', help='only output reads where canonical splice sites indicate direciton if junctions are present')
    parser.add_argument('-o', help="FILENAME to save sam output")
    args = parser.parse_args()

    log = sys.stderr.write
    converter = SamBasics.PSLtoSAMconversionFactory()
    converter.set_min_intron_size(args.min_intron_size)
    log("Creating header from reference fasta\n")
    if args.skip_directionless_splice:
        converter.set_skip_directionless_splice()
    header = SamBasics.construct_header_from_reference_fasta(args.refgenome)

    # A single sink replaces the repeated "file or STDOUT" branching.
    sink = open(args.o, 'w') if args.o else sys.stdout
    sink.write(header)

    log("setting reference fasta for conversion\n")
    converter.set_reference_genome(args.refgenome)
    log("determining mapping counts from psl\n")
    converter.set_mapping_counts(args.psl)
    log("Establishing library of reads\n")
    if args.fastq_reads:
        converter.set_read_fastq(args.fastq_reads)
    elif args.fasta_reads:
        converter.set_read_fasta(args.fasta_reads)
    log("Performing conversion\n")

    if args.psl.endswith('.gz'):
        psl_handle = gzip.open(args.psl)
    else:
        psl_handle = open(args.psl)

    skipped = 0
    for psl_line in psl_handle:
        samline = converter.convert_line(psl_line.rstrip())
        if samline:
            sink.write(samline + "\n")
        else:
            # happens if we are skipping directionless splice
            skipped += 1
            log("\rskipping directionless splice (" + str(skipped) + ") ")

    if args.o:
        sink.close()
    psl_handle.close()
    log("\n")
def main():
    """Count distinct mapped mates in a SAM/BAM file.

    Every mapped alignment contributes ``<qname>.1`` (flag bit 64) or
    ``<qname>.2`` (flag bit 128) to a ``sort | uniq | wc -l`` pipeline, so the
    result is the number of distinct mapped mates.  A mapped alignment with
    neither bit set aborts the run.  With ``--add_report`` the count is
    written to ``<input minus .sam/.bam>.mapped_reads``; otherwise it is
    printed to STDOUT.

    Raises:
        SystemExit: with status 1 on an unusable input name or an alignment
            that is neither first nor second mate.
    """
    parser = argparse.ArgumentParser(description="Get read counts from sam or bam.")
    parser.add_argument('input', help="FILENAME sam or bam")
    # The help text now names the suffix that is actually written below.
    parser.add_argument('--add_report', action='store_true', help="make a new file where we replace sam or bam with a .mapped_reads")
    args = parser.parse_args()

    baseinput = None
    if args.add_report:
        m = re.match(r'(.+)\.[bs]am', args.input)
        if not m:
            sys.stderr.write("bad inputfile type should be .bam or .sam\n")
            sys.exit(1)
        baseinput = m.group(1)

    # Argument list instead of a shell string: the file name is passed to
    # samtools untouched even if it contains spaces or metacharacters.
    view_cmd = ['samtools', 'view']
    if re.search(r'\.sam$', args.input):
        view_cmd.append('-S')
    view_cmd.append(args.input)

    p = subprocess.Popen('sort | uniq | wc -l', shell=True,
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    viewer = subprocess.Popen(view_cmd, stdout=subprocess.PIPE)
    z = 0
    try:
        for line in viewer.stdout:
            z += 1
            if z % 100000 == 0:
                sys.stderr.write(str(z) + " alignments processed\r")
            line = line.rstrip()
            d = SamBasics.sam_line_to_dictionary(line)
            if SamBasics.check_flag(d['flag'], 4):
                continue
            if SamBasics.check_flag(d['flag'], 64):
                p.stdin.write(d['qname'] + '.1' + "\n")
            elif SamBasics.check_flag(d['flag'], 128):
                p.stdin.write(d['qname'] + '.2' + "\n")
            else:
                sys.stderr.write("Unrecognized\n")
                sys.exit(1)
    finally:
        viewer.stdout.close()
        viewer.wait()
    sys.stderr.write("\n")

    # communicate() closes the pipeline's stdin and collects the wc output.
    aligned_reads = int(p.communicate()[0].rstrip())
    if args.add_report:
        with open(baseinput + '.mapped_reads', 'w') as of:
            of.write(str(aligned_reads) + "\n")
        return
    sys.stdout.write(str(aligned_reads) + "\n")
def read_sam_file(self,filename):
    """Record every fourth line read from *filename*.

    Lines are pulled in groups of four through
    ``SamBasics.GenericSamReader``; the last line of each complete group is
    handed to ``self.record_observation``.  Reading stops at the first line
    that is empty after stripping, or once ``self.max_read_count`` groups
    have been recorded.
    """
    reader = SamBasics.GenericSamReader(filename)
    recorded = 0
    while recorded < self.max_read_count:
        group = []
        for _ in range(4):
            text = reader.readline().rstrip()
            if not text:
                break
            group.append(text)
        if len(group) < 4:
            # hit an empty line or the end of input part-way through a group
            break
        self.record_observation(group[3])
        recorded += 1
    reader.close()
def _intron_bounds(start_loc, cigar_array, min_intron_size):
    """Return ``[lbound, rbound, right_size]`` for every qualifying intron.

    Walks the CIGAR from ``start_loc`` (the alignment's POS).  ``lbound`` is
    the reference coordinate at which an ``N`` gap of at least
    ``min_intron_size`` starts, ``rbound`` the coordinate at which the next
    exon starts, and ``right_size`` the length of the CIGAR operation that
    follows the gap.
    """
    # Per the SAM specification only these operations consume reference
    # bases.  S, H and P must not move the reference coordinate, and an N gap
    # shorter than min_intron_size still has to.
    ref_consuming = ('M', 'D', 'N', '=', 'X')
    bounds = []
    current_loc = start_loc
    for i, ce in enumerate(cigar_array):
        if ce['op'] == 'N' and ce['val'] >= min_intron_size:
            lbound = current_loc
            current_loc += ce['val']
            rbound = current_loc
            right_size = cigar_array[i + 1]['val']
            bounds.append([lbound, rbound, right_size])
        elif ce['op'] in ref_consuming:
            current_loc += ce['val']
    return bounds


def main():
    """Read a SAM file and write its splice junctions as a BED track.

    For every mapped alignment on a chromosome present in the reference,
    each ``N`` gap of at least ``--min_intron_size`` is turned into a
    12-column BED entry with two 50-base blocks flanking the intron.  The
    strand is taken from the intron's flanking dinucleotides via
    ``is_canon`` / ``is_revcanon``; junctions that match neither are skipped
    with a warning.  The name column of each junction is
    ``(nR)[width_nNR](nUR/nMR)``: supporting reads, spread of the
    right-hand block sizes, distinct alignment positions, and the number of
    supporting reads seen once / more than once in the input.
    """
    parser = argparse.ArgumentParser(description="Read a sam file and output a bed file in the format of junction_color.bed")
    parser.add_argument('-o', '--output', help='FILENAME is output')
    parser.add_argument('--min_intron_size', type=int, default=68, help='minimum intron size')
    parser.add_argument('infile', help='FILENAME of sam file or "-" for STDIN')
    parser.add_argument('reference_genome', help='FILENAME of the reference genome')
    args = parser.parse_args()

    # get our reference genome
    sys.stderr.write("reading reference genome\n")
    with open(args.reference_genome) as genome_handle:
        g = FastaData(genome_handle.read())
    sys.stderr.write("finished reading reference genome\n")
    # Build the membership set once instead of calling g.keys() per line.
    known_chroms = set(g.keys())

    read_mapping_count = {}
    junctions = {}
    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    sys.stderr.write("reading through sam file\n")
    zall = 0
    zn = 0
    try:
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                continue
            d = SamBasics.sam_line_to_dictionary(line)
            chrom = d['rname']
            if chrom == '*':
                continue
            if chrom not in known_chroms:
                sys.stderr.write("WARNING: " + chrom + " not in reference, skipping\n")
                continue
            if SamBasics.check_flag(d['flag'], 0x4):
                # we can ignore the unmapped things for now
                continue
            mate = 'U'
            if SamBasics.check_flag(d['flag'], 0x40):
                mate = 'L'
            elif SamBasics.check_flag(d['flag'], 0x80):
                mate = 'R'
            actual_read = d['qname'] + "\t" + mate
            read_mapping_count[actual_read] = read_mapping_count.get(actual_read, 0) + 1

            bounds = _intron_bounds(d['pos'], d['cigar_array'], args.min_intron_size)
            # no qualifying intron means there are no splices to report
            for bound in bounds:
                zall += 1
                # first two and last two bases of the intron
                intronflank = g[chrom][bound[0]-1:bound[0]+1].upper() + '-' + \
                              g[chrom][bound[1]-3:bound[1]-1].upper()
                if is_canon(intronflank):
                    strand = '+'
                elif is_revcanon(intronflank):
                    strand = '-'
                else:
                    # We can't deal with the non-canonical splice sorry
                    zn += 1
                    sys.stderr.write("WARNING skipping non-canonical splice (" + str(zn) + "/" + str(zall) + ")\r")
                    continue
                out_start = bound[0] - 51
                out_end = bound[1] + 49
                entry = "\t".join([
                    chrom,
                    str(out_start),
                    str(out_end),
                    '*',  # the name is filled in when the entry is written
                    str(50),
                    strand,
                    str(out_start),
                    str(out_end),
                    '0,0,0',
                    str(2),
                    '50,50',
                    '0,' + str(bound[1] - bound[0] + 50),
                ])
                if entry not in junctions:
                    junctions[entry] = {'reads': set(),
                                        'positions': set(),
                                        'right_sizes': set()}
                junctions[entry]['reads'].add(actual_read)
                junctions[entry]['positions'].add(d['pos'])
                junctions[entry]['right_sizes'].add(bound[2])
    finally:
        if inf is not sys.stdin:
            inf.close()
    sys.stderr.write("\n")
    sys.stderr.write("finished reading sam\n")

    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    try:
        if len(junctions) > 0:
            # if we have stuff lets print a header
            of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n")
        for entry in junctions:
            info = junctions[entry]
            nR = len(info['reads'])
            width = max(info['right_sizes']) - min(info['right_sizes'])
            nNR = len(info['positions'])
            nUR = 0
            nMR = 0
            for read in info['reads']:
                if read_mapping_count[read] == 1:
                    nUR += 1
                elif read_mapping_count[read] > 1:
                    nMR += 1
                else:
                    sys.stderr.write("ERROR: nonsense read count\n")
                    return
            name = '(' + str(nR) + ')[' + str(width) + '_' + str(nNR) + '](' + str(nUR) + '/' + str(nMR) + ')'
            bed = entry.split("\t")
            bed[3] = name
            of.write("\t".join(bed) + "\n")
    finally:
        if args.output:
            of.close()
def _emit_psl(of, psl_line, serial, give_unique_names):
    """Write one PSL line to *of* and return its ``PSL`` object.

    When *give_unique_names* is true the query name is replaced by
    ``'Q' + str(serial)`` before writing.
    """
    pobj = PSL(psl_line)
    if give_unique_names:
        pobj.entry['qName'] = 'Q' + str(serial)
    of.write(pobj.get_line() + "\n")
    return pobj


def main():
    """Convert a SAM file into a PSL file.

    Header lines are fed to the converter.  Every alignment the converter can
    translate is written as PSL; optionally its SA:Z and/or XA:Z alignments
    are written too.  With ``--output_fasta`` / ``--output_fastq`` the read
    of each converted primary alignment is also written, reverse-complemented
    (and with reversed qualities) when flag bit 16 is set.

    Raises:
        SystemExit: with status 1 when sequence output is combined with
            secondary/alternative alignments, or when a read's sequence and
            quality lengths differ in FASTQ mode.
    """
    parser = argparse.ArgumentParser(
        description="Convert a sam file into a psl file")
    parser.add_argument('--genome',
                        help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments',
                        action='store_true',
                        help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments',
                        action='store_true',
                        help="Report XA:Z alternative alignments as well")
    parser.add_argument(
        '--get_all_alignments',
        action='store_true',
        help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names',
                        action='store_true',
                        help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--output_fasta',
        help="FILENAME to save an outgoing fasta. Only works for primary alignments.")
    group.add_argument(
        '--output_fastq',
        help="FILENAME to save an outgoing fastq. Only works for primary alignments.")
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o',
                        '--output',
                        help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()

    if (args.output_fasta or args.output_fastq) and (
            args.get_secondary_alignments or args.get_alternative_alignments
            or args.get_all_alignments):
        sys.stderr.write(
            "ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n")
        sys.exit(1)

    # The requested extra-alignment sources do not change per line, so work
    # them out once.  Order is kept: SA:Z first, then XA:Z.
    extra_sources = []
    if args.get_secondary_alignments or args.get_all_alignments:
        extra_sources.append(SamBasics.get_secondary_alignments)
    if args.get_alternative_alignments or args.get_all_alignments:
        extra_sources.append(SamBasics.get_alternative_alignments)

    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome:
        spcf.set_genome(args.genome)
    off = None
    if args.output_fasta:
        off = open(args.output_fasta, 'w')
    if args.output_fastq:
        off = open(args.output_fastq, 'w')

    z = 0
    try:
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue
            # We have a line to convert
            psl = spcf.convert_line(line)
            if psl:
                z += 1
                pobj = _emit_psl(of, psl, z, args.give_unique_names)
                if args.output_fastq or args.output_fasta:
                    sam = SamBasics.SAM(line)
                    sequence = sam.value('seq').upper()
                    quality = sam.value('qual')
                    if sam.check_flag(16):
                        sequence = rc(sam.value('seq').upper())
                        quality = sam.value('qual')[::-1]
                    if args.output_fasta:
                        off.write(">" + pobj.value('qName') + "\n" + sequence + "\n")
                    elif args.output_fastq:
                        if len(sequence) != len(quality):
                            sys.stderr.write("ERROR: sequence " + sequence +
                                             " length (" + str(len(sequence)) +
                                             ") doesnt match quality " + quality +
                                             " length (" + str(len(quality)) + ")\n")
                            sys.exit(1)
                        off.write("@" + pobj.value('qName') + "\n" + sequence +
                                  "\n" + "+\n" + quality + "\n")
            # Lets look for secondary / alternative alignments to convert
            for source in extra_sources:
                for samline in source(line):
                    psl = spcf.convert_line(samline)
                    if psl:
                        z += 1
                        _emit_psl(of, psl, z, args.give_unique_names)
    finally:
        # The sequence output used to be left open; close it with the rest.
        if off is not None:
            off.close()
        inf.close()
        of.close()
def _emit_psl(of, psl_line, serial, give_unique_names):
    """Write one PSL line to *of* and return its ``PSL`` object.

    When *give_unique_names* is true the query name is replaced by
    ``'Q' + str(serial)`` before writing.
    """
    pobj = PSL(psl_line)
    if give_unique_names:
        pobj.entry['qName'] = 'Q' + str(serial)
    of.write(pobj.get_line() + "\n")
    return pobj


def main():
    """Convert a SAM file into a PSL file.

    Header lines are fed to the converter.  Every alignment the converter can
    translate is written as PSL; optionally its SA:Z and/or XA:Z alignments
    are written too.  With ``--output_fasta`` / ``--output_fastq`` the read
    of each converted primary alignment is also written, reverse-complemented
    (and with reversed qualities) when flag bit 16 is set.

    Raises:
        SystemExit: with status 1 when sequence output is combined with
            secondary/alternative alignments, or when a read's sequence and
            quality lengths differ in FASTQ mode.
    """
    parser = argparse.ArgumentParser(description="Convert a sam file into a psl file")
    parser.add_argument('--genome', help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments', action='store_true', help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments', action='store_true', help="Report XA:Z alternative alignments as well")
    parser.add_argument('--get_all_alignments', action='store_true', help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names', action='store_true', help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--output_fasta', help="FILENAME to save an outgoing fasta. Only works for primary alignments.")
    group.add_argument('--output_fastq', help="FILENAME to save an outgoing fastq. Only works for primary alignments.")
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o', '--output', help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()

    if (args.output_fasta or args.output_fastq) and (
            args.get_secondary_alignments or args.get_alternative_alignments
            or args.get_all_alignments):
        sys.stderr.write("ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n")
        sys.exit(1)

    # The requested extra-alignment sources do not change per line, so work
    # them out once.  Order is kept: SA:Z first, then XA:Z.
    extra_sources = []
    if args.get_secondary_alignments or args.get_all_alignments:
        extra_sources.append(SamBasics.get_secondary_alignments)
    if args.get_alternative_alignments or args.get_all_alignments:
        extra_sources.append(SamBasics.get_alternative_alignments)

    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome:
        spcf.set_genome(args.genome)
    off = None
    if args.output_fasta:
        off = open(args.output_fasta, 'w')
    if args.output_fastq:
        off = open(args.output_fastq, 'w')

    z = 0
    try:
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue
            # We have a line to convert
            psl = spcf.convert_line(line)
            if psl:
                z += 1
                pobj = _emit_psl(of, psl, z, args.give_unique_names)
                if args.output_fastq or args.output_fasta:
                    sam = SamBasics.SAM(line)
                    sequence = sam.value('seq').upper()
                    quality = sam.value('qual')
                    if sam.check_flag(16):
                        sequence = rc(sam.value('seq').upper())
                        quality = sam.value('qual')[::-1]
                    if args.output_fasta:
                        off.write(">" + pobj.value('qName') + "\n" + sequence + "\n")
                    elif args.output_fastq:
                        if len(sequence) != len(quality):
                            sys.stderr.write("ERROR: sequence " + sequence + " length (" + str(len(sequence)) + ") doesnt match quality " + quality + " length (" + str(len(quality)) + ")\n")
                            sys.exit(1)
                        off.write("@" + pobj.value('qName') + "\n" + sequence + "\n" + "+\n" + quality + "\n")
            # Lets look for secondary / alternative alignments to convert
            for source in extra_sources:
                for samline in source(line):
                    psl = spcf.convert_line(samline)
                    if psl:
                        z += 1
                        _emit_psl(of, psl, z, args.give_unique_names)
    finally:
        # The sequence output used to be left open; close it with the rest.
        if off is not None:
            off.close()
        inf.close()
        of.close()
def main():
    """Convert a PSL file (plain or gzipped) into SAM.

    A SAM header built from the reference FASTA is written first, then one
    SAM line per PSL line for which ``pscf.convert_line`` returns a value.
    Lines for which it returns nothing are counted and reported on STDERR.
    Output goes to the ``-o`` file when given, otherwise to STDOUT.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('psl', help="FILENAME of psl file (can be gzipped)")
    parser.add_argument('refgenome', help="FASTA of the reference genome")
    parser.add_argument('--min_intron_size',
                        default=68,
                        type=int,
                        help="INT minimum intron size")
    parser.add_argument('--fastq_reads', help="FASTQ of the reads")
    parser.add_argument('--fasta_reads', help="FASTA of the reads")
    parser.add_argument(
        '--skip_directionless_splice',
        action='store_true',
        help='only output reads where canonical splice sites indicate direciton if junctions are present')
    parser.add_argument('-o', help="FILENAME to save sam output")
    args = parser.parse_args()

    log = sys.stderr.write
    converter = SamBasics.PSLtoSAMconversionFactory()
    converter.set_min_intron_size(args.min_intron_size)
    log("Creating header from reference fasta\n")
    if args.skip_directionless_splice:
        converter.set_skip_directionless_splice()
    header = SamBasics.construct_header_from_reference_fasta(args.refgenome)

    # A single sink replaces the repeated "file or STDOUT" branching.
    sink = open(args.o, 'w') if args.o else sys.stdout
    sink.write(header)

    log("setting reference fasta for conversion\n")
    converter.set_reference_genome(args.refgenome)
    log("determining mapping counts from psl\n")
    converter.set_mapping_counts(args.psl)
    log("Establishing library of reads\n")
    if args.fastq_reads:
        converter.set_read_fastq(args.fastq_reads)
    elif args.fasta_reads:
        converter.set_read_fasta(args.fasta_reads)
    log("Performing conversion\n")

    if args.psl.endswith('.gz'):
        psl_handle = gzip.open(args.psl)
    else:
        psl_handle = open(args.psl)

    skipped = 0
    for psl_line in psl_handle:
        samline = converter.convert_line(psl_line.rstrip())
        if samline:
            sink.write(samline + "\n")
        else:
            # happens if we are skipping directionless splice
            skipped += 1
            log("\rskipping directionless splice (" + str(skipped) + ") ")

    if args.o:
        sink.close()
    psl_handle.close()
    log("\n")
def _score_alignment(cigar, pos, chrom_seq, read_seq):
    """Return ``(aligned_length, mismatches)`` for one alignment.

    ``aligned_length`` is the summed length of the M/X/= operations.
    ``mismatches`` counts differing bases inside those operations plus every
    soft-clipped or inserted base.
    """
    sumlen = 0
    mismatches = 0
    read_index = 1
    chrom_index = pos
    for e in cigar:
        op = e['op']
        val = e['val']
        if re.match('[MX=]', op):
            sumlen += val  # keep track of our match length
            refseq = chrom_seq[chrom_index - 1:chrom_index - 1 + val].upper()
            readseq = read_seq[read_index - 1:read_index - 1 + val].upper()
            mismatches += sum(1 for a, b in zip(refseq, readseq) if a != b)
            # Bases missing from either slice (alignment running past the
            # end of a sequence) cannot match; count them instead of raising
            # IndexError.
            mismatches += val - min(len(refseq), len(readseq))
            read_index += val
            chrom_index += val
        elif re.match('[SI]', op):
            mismatches += val
            read_index += val
        elif re.match('[ND]', op):
            chrom_index += val
        elif op == 'H':
            # Hard clips consume neither reference nor the stored read
            # sequence (SAM specification), so no index moves.  Advancing the
            # reference here used to shift every following comparison.
            pass
        else:
            sys.stderr.write("warning: strange SAM op\n")
    return sumlen, mismatches


def check_parameters(z,gz,ifile,tdir,max_allowed_mismatches,Q,fsize):
    """Align one FASTQ chunk with bwa and report how much of it mapped.

    Runs ``bwa mem <ifile> <tdir>/<z>.fq`` piped into ``samtools view -S -``.
    Alignments with more than ``max_allowed_mismatches`` mismatches are
    ignored.  For each remaining read name the longest aligned length is
    kept.  ``[z, mapped_reads, mapped_bases]`` is put on ``Q`` and a
    ``<Q.qsize()>/<fsize>`` progress line is written to STDERR.

    Args:
        z: chunk identifier; also the base name of the FASTQ file.
        gz: dict mapping reference names to zlib-compressed sequences.
        ifile: bwa index passed to ``bwa mem``.
        tdir: directory that holds ``<z>.fq``.
        max_allowed_mismatches: upper bound on mismatches per alignment.
        Q: queue-like object providing ``put`` and ``qsize``.
        fsize: total number of chunks, used only for the progress display.
    """
    g = {}
    for n in gz:
        g[n] = zlib.decompress(gz[n])

    # Argument lists keep paths that contain whitespace intact; the previous
    # str.split() on a joined command line broke them apart.
    cmd1 = ["bwa", "mem", ifile, tdir + '/' + str(z) + '.fq']
    cmd2 = ["samtools", "view", "-S", "-"]

    best_length = {}  # read name -> longest accepted alignment length
    with open(os.devnull, 'w') as FNULL:
        stream1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, stderr=FNULL)
        stream2 = subprocess.Popen(cmd2, stdin=stream1.stdout,
                                   stdout=subprocess.PIPE, stderr=FNULL)
        # samtools owns the read end now; drop our copy so bwa is signalled
        # if samtools exits early.
        stream1.stdout.close()
        try:
            while True:
                line = stream2.stdout.readline()
                if not line:
                    break
                f = line.rstrip().split("\t")
                if f[2] == '*':
                    continue
                d = SamBasics.sam_line_to_dictionary(line)
                sumlen, mismatches = _score_alignment(
                    d['cigar_array'], d['pos'], g[d['rname']], d['seq'])
                if mismatches > max_allowed_mismatches:
                    continue
                # save the biggest sum for the read name
                if d['qname'] not in best_length:
                    best_length[d['qname']] = 0
                if sumlen > best_length[d['qname']]:
                    best_length[d['qname']] = sumlen
        finally:
            # Reap both children so they do not linger as zombies.
            stream2.stdout.close()
            stream2.wait()
            stream1.wait()

    mapped_reads = len(best_length)
    mapped_bases = sum(best_length.values())
    res = [z, mapped_reads, mapped_bases]
    Q.put(res)
    progress = Q.qsize()
    sys.stderr.write('\r' + (' ' * 40))
    sys.stderr.write('\r' + str(progress) + "/" + str(fsize))
    sys.stderr.flush()
    return