import argparse, sys
import SamBasics

def main():
  parser = argparse.ArgumentParser(description="Make sam file compatible with tools counting on a splicemap format sam file.")
  parser.add_argument('in_sam', help="FILENAME of sam file, or '-' for STDIN")
  args = parser.parse_args()
  inf = sys.stdin
  if args.in_sam != '-':
    inf = open(args.in_sam)
  for line in inf:
    line = line.rstrip()
    if SamBasics.is_header(line):
      print line
      continue
    f = line.split("\t")
    e = SamBasics.sam_line_to_dictionary(line)
    if SamBasics.check_flag(e['flag'], 4):
      continue  # skip the unmapped reads
    # keep only the reverse-strand bit of the FLAG and clear the mapping-quality,
    # mate and template fields that SpliceMap-style tools do not expect
    if SamBasics.check_flag(e['flag'], 16):
      f[1] = "16"
    else:
      f[1] = "0"
    f[4] = "0"  # MAPQ
    f[6] = "*"  # RNEXT
    f[7] = "0"  # PNEXT
    f[8] = "0"  # TLEN
    print "\t".join(f)
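# The scripts in this file lean on SamBasics.check_flag to test single SAM FLAG bits
# (4 = unmapped, 16 = reverse strand). A minimal sketch of such a helper, assuming it
# simply masks the integer FLAG; the project's actual SamBasics implementation may differ.
def _check_flag_sketch(flag, bit):
  """Return True if the SAM FLAG integer has the given bit set."""
  return bool(flag & bit)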
def make_exons(args, thread_index, thread_count):
  is_sam = True
  if re.search('\.bam$', args.sam_file):
    is_sam = False
  stag = ''
  if is_sam:
    stag = '-S'
  cmd = 'samtools view -F 4 ' + stag + ' ' + args.sam_file
  spcf = SamBasics.SAMtoPSLconversionFactory()
  if args.reference_genome:
    spcf.set_genome(args.reference_genome)
  sampipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
  fname = args.tempdir + '/bedpart.' + str(thread_index) + '.bed'
  of = open(fname, 'w')
  z = 0
  with sampipe.stdout as inf:
    for line in inf:
      z += 1
      if z % thread_count != thread_index:
        continue  # round-robin partition of alignments across workers
      line = line.rstrip()
      if SamBasics.is_header(line):
        continue
      d = SamBasics.sam_line_to_dictionary(line)
      strand = '+'
      if SamBasics.check_flag(d['flag'], 16):
        strand = '-'
      seqs = []
      sequence = d['seq']
      seqs.append([d['qname'], d['rname'], strand, d['pos'], d['cigar']])
      m = re.search('XA:Z:(\S+)', line)
      if m and args.use_secondary_alignments:
        secondaries = m.group(1).rstrip(";").split(";")
        for secondary in secondaries:
          m1 = re.match('([^,]+),([+-])(\d+),([^,]+)', secondary)
          if not m1:
            sys.stderr.write("strange secondary format " + secondary + "\n")
            sys.exit()
          seqs.append([d['qname'], m1.group(1), m1.group(2), int(m1.group(3)), m1.group(4)])
      #p.apply_async(get_exons_from_seqs,[seqs,d,spcf])
      exons = get_exons_from_seqs(seqs, d, spcf)
      of.write(exons)
      #return exons
  of.close()
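# make_exons partitions alignments round-robin (z % thread_count == thread_index), with
# each worker writing its own tempdir/bedpart.<index>.bed. A minimal sketch of how a
# driver could fan the work out with the standard multiprocessing module; the worker
# launch and any later merging of the bedpart files are assumptions, not project code.
from multiprocessing import Process

def _run_make_exons_workers(args, thread_count):
  workers = [Process(target=make_exons, args=(args, i, thread_count)) for i in range(thread_count)]
  for p in workers: p.start()
  for p in workers: p.join()  # tempdir/bedpart.*.bed can be concatenated afterwards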
import argparse, sys, re
import SamBasics
# FastaData, is_canon and is_revcanon are provided elsewhere in this project.

def main():
  parser = argparse.ArgumentParser(description="Read a sam file and output a bed file in the format of junction_color.bed")
  parser.add_argument('-o', '--output', help='FILENAME is output')
  parser.add_argument('--min_intron_size', type=int, default=68, help='minimum intron size')
  parser.add_argument('infile', help='FILENAME of sam file or "-" for STDIN')
  parser.add_argument('reference_genome', help='FILENAME of the reference genome')
  args = parser.parse_args()
  # get our reference genome
  sys.stderr.write("reading reference genome\n")
  #g = SequenceBasics.read_fasta_into_hash(args.reference_genome)
  g = FastaData(open(args.reference_genome).read())
  sys.stderr.write("finished reading reference genome\n")
  inf = sys.stdin
  read_mapping_count = {}
  junctions = {}
  if args.infile != '-':
    inf = open(args.infile)
  sys.stderr.write("reading through sam file\n")
  zall = 0
  zn = 0
  while True:
    line = inf.readline()
    if not line: break
    line = line.rstrip()
    if SamBasics.is_header(line): continue
    d = SamBasics.sam_line_to_dictionary(line)
    chrom = d['rname']
    if chrom == '*': continue
    if chrom not in g.keys():
      sys.stderr.write("WARNING: " + chrom + " not in reference, skipping\n")
      continue
    mate = 'U'
    if SamBasics.check_flag(d['flag'], 0x4):  # check if its unmapped
      continue  # we can ignore the unmapped things for now
    if SamBasics.check_flag(d['flag'], 0x40):
      mate = 'L'
    elif SamBasics.check_flag(d['flag'], 0x80):
      mate = 'R'
    actual_read = d['qname'] + "\t" + mate
    if actual_read not in read_mapping_count:
      read_mapping_count[actual_read] = 0
    read_mapping_count[actual_read] += 1
    has_intron = 0
    start_loc = d['pos']
    current_loc = start_loc
    bounds = []
    for i in range(0, len(d['cigar_array'])):
      ce = d['cigar_array'][i]
      if ce['op'] == 'N' and ce['val'] >= args.min_intron_size:
        has_intron = 1
        lbound = current_loc  # should be the intron start base index-1
        current_loc += ce['val']
        rbound = current_loc  # should be the second exon start base index-1
        right_size = d['cigar_array'][i+1]['val']
        bounds.append([lbound, rbound, right_size])
      elif ce['op'] == 'D' or ce['op'] == 'N':
        current_loc += ce['val']  # deletions and sub-threshold gaps still consume reference
      elif re.match('[=XM]', ce['op']):
        current_loc += ce['val']  # per the SAM spec only M/=/X/D/N consume reference; S/H/P/I do not
    if has_intron == 0:
      continue  # there are no splices to report here
    for bound in bounds:
      zall += 1
      intronflank = g[chrom][bound[0]-1:bound[0]+1].upper() + '-' + g[chrom][bound[1]-3:bound[1]-1].upper()
      strand = ''
      if is_canon(intronflank):  # its a positive strand
        strand = '+'
      elif is_revcanon(intronflank):  # its a negative strand
        strand = '-'
      else:  # we can't deal with the non-canonical splice, sorry
        zn += 1
        sys.stderr.write("WARNING skipping non-canonical splice (" + str(zn) + "/" + str(zall) + ")\r")
        continue
      # If we are still in we have successfully found a splice
      out_chrom = chrom
      out_start = bound[0] - 51
      out_end = bound[1] + 49
      out_name = '*'  # this will be done later
      out_score = 50
      out_strand = strand
      out_thickStart = out_start
      out_thickEnd = out_end
      out_rgb = '0,0,0'
      out_block_count = 2
      out_block_sizes = '50,50'
      out_block_starts = '0,' + str(bound[1] - bound[0] + 50)
      bed = [out_chrom, str(out_start), str(out_end), out_name, str(out_score),
             out_strand, str(out_thickStart), str(out_thickEnd), out_rgb,
             str(out_block_count), out_block_sizes, out_block_starts]
      entry = "\t".join(bed)
      if entry not in junctions:
        junctions[entry] = {}
        junctions[entry]['reads'] = set()
        junctions[entry]['positions'] = set()
        junctions[entry]['right_sizes'] = set()
      junctions[entry]['reads'].add(actual_read)
      junctions[entry]['positions'].add(d['pos'])
      junctions[entry]['right_sizes'].add(bound[2])
  sys.stderr.write("\n")
  sys.stderr.write("finished reading sam\n")
  of = sys.stdout
  if args.output:
    of = open(args.output, 'w')
  if len(junctions) > 0:  # if we have stuff lets print a header
    of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n")
  for entry in junctions:
    nR = len(junctions[entry]['reads'])
    width = max(junctions[entry]['right_sizes']) - min(junctions[entry]['right_sizes'])
    nNR = len(junctions[entry]['positions'])
    nUR = 0
    nMR = 0
    for read in junctions[entry]['reads']:
      if read_mapping_count[read] == 1:
        nUR += 1
      elif read_mapping_count[read] > 1:
        nMR += 1
      else:
        sys.stderr.write("ERROR: nonsense read count\n")
        return
    name = '(' + str(nR) + ')[' + str(width) + '_' + str(nNR) + '](' + str(nUR) + '/' + str(nMR) + ')'
    bed = entry.split("\t")
    bed[3] = name
    of.write("\t".join(bed) + "\n")
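# is_canon and is_revcanon are called above but defined elsewhere in the project. A
# minimal sketch, assuming the usual canonical splice dinucleotides: GT..AG on the
# forward strand and its reverse complement CT..AC on the reverse strand (the flank
# string built above has the form "GT-AG"). The project's versions may also accept
# the minor canonical classes (GC-AG, AT-AC).
def _is_canon_sketch(flank):
  return flank == 'GT-AG'

def _is_revcanon_sketch(flank):
  return flank == 'CT-AC'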
import argparse, sys
import SamBasics
# PSL and rc (reverse complement) are provided elsewhere in this project.

def main():
  parser = argparse.ArgumentParser(description="Convert a sam file into a psl file")
  parser.add_argument('--genome', help="FASTA input file of reference genome")
  parser.add_argument('--get_secondary_alignments', action='store_true', help="Report SA:Z secondary alignments as well")
  parser.add_argument('--get_alternative_alignments', action='store_true', help="Report XA:Z alternative alignments as well")
  parser.add_argument('--get_all_alignments', action='store_true', help="Report SA:Z and XA:Z alternative alignments as well")
  parser.add_argument('--give_unique_names', action='store_true', help="Output query names will be unique.")
  group = parser.add_mutually_exclusive_group()
  group.add_argument('--output_fasta', help="FILENAME to save an outgoing fasta. Only works for primary alignments.")
  group.add_argument('--output_fastq', help="FILENAME to save an outgoing fastq. Only works for primary alignments.")
  parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
  parser.add_argument('-o', '--output', help="FILENAME for the output, STDOUT if not set.")
  args = parser.parse_args()
  if (args.output_fasta or args.output_fastq) and (args.get_secondary_alignments or args.get_alternative_alignments or args.get_all_alignments):
    sys.stderr.write("ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n")
    sys.exit()
  inf = sys.stdin
  if args.infile != '-':
    inf = open(args.infile)
  of = sys.stdout
  if args.output:
    of = open(args.output, 'w')
  spcf = SamBasics.SAMtoPSLconversionFactory()
  if args.genome:
    spcf.set_genome(args.genome)
  off = None
  if args.output_fasta:
    off = open(args.output_fasta, 'w')
  if args.output_fastq:
    off = open(args.output_fastq, 'w')
  z = 0
  for line in inf:
    line = line.rstrip()
    if SamBasics.is_header(line):
      spcf.read_header_line(line)
      continue
    # We have a line to convert
    psl = spcf.convert_line(line)
    if psl:
      pobj = PSL(psl)
      z += 1
      if args.give_unique_names:
        pobj.entry['qName'] = 'Q' + str(z)
      of.write(pobj.get_line() + "\n")
      if args.output_fastq or args.output_fasta:
        sam = SamBasics.SAM(line)
        sequence = sam.value('seq').upper()
        quality = sam.value('qual')
        if sam.check_flag(16):  # reverse strand: restore the original read orientation
          sequence = rc(sam.value('seq').upper())
          quality = sam.value('qual')[::-1]
        if args.output_fasta:
          off.write(">" + pobj.value('qName') + "\n" + sequence + "\n")
        elif args.output_fastq:
          if len(sequence) == len(quality):
            off.write("@" + pobj.value('qName') + "\n" + sequence + "\n" + "+\n" + quality + "\n")
          else:
            sys.stderr.write("ERROR: sequence " + sequence + " length (" + str(len(sequence)) + ") doesn't match quality " + quality + " length (" + str(len(quality)) + ")\n")
            sys.exit()
    # Lets look for secondary alignments to convert
    if args.get_secondary_alignments or args.get_all_alignments:
      secondary_alignments = SamBasics.get_secondary_alignments(line)
      for samline in secondary_alignments:
        psl = spcf.convert_line(samline)
        if psl:
          z += 1
          pobj = PSL(psl)
          if args.give_unique_names:
            pobj.entry['qName'] = 'Q' + str(z)
          of.write(pobj.get_line() + "\n")
    if args.get_alternative_alignments or args.get_all_alignments:
      alternative_alignments = SamBasics.get_alternative_alignments(line)
      for samline in alternative_alignments:
        psl = spcf.convert_line(samline)
        if psl:
          z += 1
          pobj = PSL(psl)
          if args.give_unique_names:
            pobj.entry['qName'] = 'Q' + str(z)
          of.write(pobj.get_line() + "\n")
  inf.close()
  of.close()
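# rc is used above to put reverse-strand reads back into their original orientation but
# is defined in the project's sequence utilities. A minimal stand-in, assuming plain
# DNA strings; the project's own rc may handle a wider alphabet.
_COMPLEMENT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}

def _rc_sketch(seq):
  """Return the reverse complement of a DNA string."""
  return ''.join([_COMPLEMENT.get(b, 'N') for b in reversed(seq.upper())])

# Illustrative invocation (script and file names here are hypothetical):
#   samtools view -h aln.bam | python sam_to_psl.py --genome ref.fa --give_unique_names - -o aln.psl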