def construct_header_from_reference_fasta(ref_fasta_filename):
    """Build SAM header text from the sequences of a reference FASTA file.

    The file is read in full and handed to ``FastaData``. The returned text
    has an ``@HD`` line declaring coordinate sort order, one ``@SQ`` line per
    sequence in sorted name order, and a closing ``@PG`` line. Every sequence
    name and its length is also reported on stderr.

    :param ref_fasta_filename: path of the reference FASTA file.
    :return: the header as one string; every line ends with a newline.
    """
    # Read inside a context manager so the handle is closed right away
    # instead of being left for the garbage collector.
    with open(ref_fasta_filename) as fasta_handle:
        g = FastaData(fasta_handle.read())

    lengths = {}
    for name in sorted(g.keys()):
        lengths[name] = len(g[name])
        sys.stderr.write(
            name + " is there at length " + str(lengths[name]) + "\n")

    # Collect the lines and join once rather than growing a string with +=.
    lines = ["@HD\tVN:1.0\tSO:coordinate\n"]
    for name in sorted(lengths):
        lines.append("@SQ\tSN:" + name + "\tLN:" + str(lengths[name]) + "\n")
    lines.append("@PG\tID:SamBasics.py\tVN:1.0\n")
    return "".join(lines)
def construct_header_from_reference_fasta(ref_fasta_filename):
    """Return a SAM header describing every sequence in a reference FASTA.

    The FASTA text is parsed by ``FastaData``; each sequence contributes one
    ``@SQ`` line (sorted by name) placed between the ``@HD`` line and the
    ``@PG`` line. The name and length of each sequence are echoed to stderr
    while the lengths are gathered.

    :param ref_fasta_filename: path of the reference FASTA file.
    :return: the newline-terminated header lines concatenated into a string.
    """
    # The original left this handle open; ``with`` closes it deterministically.
    with open(ref_fasta_filename) as handle:
        g = FastaData(handle.read())

    chrs = {}
    for name in sorted(g.keys()):
        chrs[name] = len(g[name])
        sys.stderr.write(
            name + " is there at length " + str(chrs[name]) + "\n")

    sq_lines = [
        "@SQ\tSN:" + name + "\tLN:" + str(chrs[name]) + "\n"
        for name in sorted(chrs)
    ]
    return (
        "@HD\tVN:1.0\tSO:coordinate\n"
        + "".join(sq_lines)
        + "@PG\tID:SamBasics.py\tVN:1.0\n"
    )
def main():
    """Convert FASTQ reads into unaligned SAM records written to stdout.

    Command-line arguments: the FASTQ ``input`` (``-`` for STDIN), an
    optional ``--reference`` FASTA whose sequences become ``@SQ`` header
    lines, and ``--no_qual`` to emit ``*`` in place of the quality string.

    Each entry yielded by ``FastqHandle`` becomes one record with flag 4,
    no reference name, position or CIGAR, and an ``XO:Z:NM`` tag.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-r', '--reference', help="reference genome FASTA")
    parser.add_argument('--no_qual', action='store_true',
                        help="dont put in quality")
    args = parser.parse_args()

    ref = {}
    if args.reference:
        # Close the reference handle once its contents have been read.
        with open(args.reference, 'rb') as ref_handle:
            ref = FastaData(ref_handle.read())

    reading_stdin = args.input == '-'
    if reading_stdin:
        reads_in = sys.stdin
    else:
        reads_in = open(args.input)

    emit = sys.stdout.write
    try:
        # SAM header fields must be TAB-delimited; use explicit \t escapes.
        emit('@HD\tVN:1.0\tSO:unsorted\n')
        emit('@PG\tID:FA2UN\tPN:FA2UN\tVN:2016-06-09\tCL:'
             + ' '.join(sys.argv) + '\n')
        if ref:
            for name in sorted(ref.keys()):
                emit("@SQ\tSN:" + name + "\tLN:" + str(len(ref[name])) + "\n")

        for e in FastqHandle(reads_in):
            qual = "*" if args.no_qual else e.qual
            # QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL tag
            fields = [e.name, "4", "*", "0", "0", "*", "*", "0", "0",
                      e.seq, qual, "XO:Z:NM"]
            emit("\t".join(fields) + "\n")
    finally:
        # Only close what this function opened; leave STDIN alone.
        if not reading_stdin:
            reads_in.close()
def main():
    """Write the reads of a FASTQ file to stdout as unaligned SAM.

    Arguments parsed from the command line:

    * ``input`` -- FASTQ file name, or ``-`` to read STDIN.
    * ``-r/--reference`` -- optional reference FASTA; when given, one
      ``@SQ`` line per sequence (sorted by name) is added to the header.
    * ``--no_qual`` -- write ``*`` instead of each read's quality string.

    Every read is emitted with flag 4 and placeholder alignment fields,
    followed by the tag ``XO:Z:NM``.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-r', '--reference', help="reference genome FASTA")
    parser.add_argument('--no_qual', action='store_true',
                        help="dont put in quality")
    args = parser.parse_args()

    ref = {}
    if args.reference:
        # ``with`` releases the reference handle instead of leaking it.
        with open(args.reference, 'rb') as ref_handle:
            ref = FastaData(ref_handle.read())

    opened_here = args.input != '-'
    source = open(args.input) if opened_here else sys.stdin

    try:
        header = [
            # Header fields are separated by TABs as SAM requires.
            '@HD\tVN:1.0\tSO:unsorted',
            '@PG\tID:FA2UN\tPN:FA2UN\tVN:2016-06-09\tCL:'
            + ' '.join(sys.argv),
        ]
        if ref:
            for name in sorted(ref.keys()):
                header.append(
                    "@SQ\tSN:" + name + "\t" + 'LN:' + str(len(ref[name])))
        for header_line in header:
            sys.stdout.write(header_line + "\n")

        # Fixed columns between the read name and the sequence:
        # FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN
        placeholder = "4\t*\t0\t0\t*\t*\t0\t0"
        for e in FastqHandle(source):
            if args.no_qual:
                qual = "*"
            else:
                qual = e.qual
            sys.stdout.write(
                "\t".join([e.name, placeholder, e.seq, qual, "XO:Z:NM"])
                + "\n")
    finally:
        if opened_here:
            source.close()
def _collect_introns(cigar_array, start_loc, min_intron_size):
    """Find the introns of one alignment that are at least min_intron_size long.

    Walks the CIGAR operations starting at ``start_loc`` and returns a list
    of ``[lbound, rbound, right_size]``: the location where the intron
    starts, the location just past it, and the length of the CIGAR
    operation that follows the intron (0 when nothing follows).
    """
    bounds = []
    current_loc = start_loc
    for i, ce in enumerate(cigar_array):
        op = ce['op']
        if op == 'N' and ce['val'] >= min_intron_size:
            lbound = current_loc
            current_loc += ce['val']
            right_size = 0
            if i + 1 < len(cigar_array):
                right_size = cigar_array[i + 1]['val']
            bounds.append([lbound, current_loc, right_size])
        elif op in ('M', '=', 'X', 'D', 'N'):
            # Only these operations consume reference bases. Clips and
            # padding (S, H, P) and insertions (I) must not move the
            # reference position, while a short N still has to.
            current_loc += ce['val']
    return bounds


def _junction_bed_entry(chrom, lbound, rbound, strand):
    """Format one junction as a 12-column BED line with 50 bp flanking blocks."""
    out_start = lbound - 51
    out_end = rbound + 49
    fields = [
        chrom,
        str(out_start),
        str(out_end),
        '*',                       # name is filled in when writing output
        '50',                      # score
        strand,
        str(out_start),            # thickStart
        str(out_end),              # thickEnd
        '0,0,0',                   # itemRgb
        '2',                       # block count
        '50,50',                   # block sizes
        '0,' + str(rbound - lbound + 50),  # block starts
    ]
    return "\t".join(fields)


def main():
    """Read a SAM file and write its splice junctions as a BED track.

    Command-line arguments: the SAM ``infile`` (``-`` for STDIN), the
    ``reference_genome`` FASTA, ``-o/--output`` (defaults to stdout) and
    ``--min_intron_size`` (default 68).

    Mapped alignments on known reference sequences are scanned for ``N``
    CIGAR operations of at least the minimum size. Junctions whose flanking
    dinucleotides are neither canonical nor reverse-canonical (as decided by
    ``is_canon`` / ``is_revcanon``) are skipped with a warning. Each
    remaining junction is written once, named
    ``(reads)[width_positions](unique/multi)`` from the supporting reads.
    """
    parser = argparse.ArgumentParser(
        description="Read a sam file and output a bed file in the format of junction_color.bed")
    parser.add_argument('-o', '--output', help='FILENAME is output')
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help='minimum intron size')
    parser.add_argument('infile',
                        help='FILENAME of sam file or "-" for STDIN')
    parser.add_argument('reference_genome',
                        help='FILENAME of the reference genome')
    args = parser.parse_args()

    # get our reference genome
    sys.stderr.write("reading reference genome\n")
    with open(args.reference_genome) as genome_handle:
        g = FastaData(genome_handle.read())
    sys.stderr.write("finished reading reference genome\n")
    # Build the name lookup once instead of calling g.keys() per SAM line.
    known_chroms = set(g.keys())

    read_mapping_count = {}
    junctions = {}
    zall = 0
    zn = 0

    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    sys.stderr.write("reading through sam file\n")
    try:
        for line in iter(inf.readline, ''):
            line = line.rstrip()
            if SamBasics.is_header(line):
                continue
            d = SamBasics.sam_line_to_dictionary(line)
            chrom = d['rname']
            if chrom == '*':
                continue
            if chrom not in known_chroms:
                sys.stderr.write(
                    "WARNING: " + chrom + " not in reference, skipping\n")
                continue
            if SamBasics.check_flag(d['flag'], 0x4):
                continue  # unmapped reads carry no junctions
            mate = 'U'
            if SamBasics.check_flag(d['flag'], 0x40):
                mate = 'L'
            elif SamBasics.check_flag(d['flag'], 0x80):
                mate = 'R'
            actual_read = d['qname'] + "\t" + mate
            read_mapping_count[actual_read] = \
                read_mapping_count.get(actual_read, 0) + 1

            bounds = _collect_introns(
                d['cigar_array'], d['pos'], args.min_intron_size)
            for bound in bounds:
                zall += 1
                intronflank = g[chrom][bound[0] - 1:bound[0] + 1].upper() \
                    + '-' + g[chrom][bound[1] - 3:bound[1] - 1].upper()
                if is_canon(intronflank):
                    strand = '+'
                elif is_revcanon(intronflank):
                    strand = '-'
                else:
                    # We can't deal with the non-canonical splice sorry
                    zn += 1
                    sys.stderr.write(
                        "WARNING skipping non-canonical splice ("
                        + str(zn) + "/" + str(zall) + ")\r")
                    continue
                entry = _junction_bed_entry(chrom, bound[0], bound[1], strand)
                if entry not in junctions:
                    junctions[entry] = {
                        'reads': set(),
                        'positions': set(),
                        'right_sizes': set(),
                    }
                junctions[entry]['reads'].add(actual_read)
                junctions[entry]['positions'].add(d['pos'])
                junctions[entry]['right_sizes'].add(bound[2])
    finally:
        if inf is not sys.stdin:
            inf.close()
    sys.stderr.write("\n")
    sys.stderr.write("finished reading sam\n")

    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    try:
        if len(junctions) > 0:  # if we have stuff lets print a header
            of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n")
        for entry in junctions:
            support = junctions[entry]
            nR = len(support['reads'])
            width = max(support['right_sizes']) - min(support['right_sizes'])
            nNR = len(support['positions'])
            nUR = 0
            nMR = 0
            for read in support['reads']:
                if read_mapping_count[read] == 1:
                    nUR += 1
                elif read_mapping_count[read] > 1:
                    nMR += 1
                else:
                    sys.stderr.write("ERROR: nonsense read count\n")
                    return
            name = '(' + str(nR) + ')[' + str(width) + '_' + str(nNR) \
                + '](' + str(nUR) + '/' + str(nMR) + ')'
            bed = entry.split("\t")
            bed[3] = name
            of.write("\t".join(bed) + "\n")
    finally:
        # Close only a file this function opened, never sys.stdout.
        if args.output:
            of.close()
def main(args):
    """Relabel bedgraph depths with coverage strata and merge adjacent runs.

    The total reference size comes from ``args.reference_genome`` (a FASTA
    parsed by ``FastaData``) or, when that is not set, from the second
    tab-separated column of ``args.reference_lengths``. Size thresholds run
    from ``args.minimum_coverage`` upward, multiplying alternately by 5 and
    2, and end with the total size; the largest threshold is labelled 1 and
    the smallest gets the highest label.

    Depths are ranked from deepest to shallowest. Each depth gets the label
    of the smallest threshold that is not below the number of bases at that
    depth or deeper. Consecutive entries that touch and share chromosome and
    label are merged before being written.

    :param args: namespace providing ``seed``, ``reference_genome``,
        ``reference_lengths``, ``minimum_coverage``, ``output_key``,
        ``input`` (``-`` for STDIN), ``output`` (empty for STDOUT),
        ``dont_make_unique`` and ``unique_scale``. Paths ending in ``.gz``
        are opened with ``gzip``.
    """
    random.seed(args.seed)

    # ``total`` rather than ``sum`` so the builtin is not shadowed.
    total = 0
    if args.reference_genome:
        with open(args.reference_genome) as ref_handle:
            ref = FastaData(ref_handle.read())
        for name in ref.keys():
            total += len(ref[name])
    else:
        with open(args.reference_lengths) as lengths_handle:
            for line in lengths_handle:
                total += int(line.rstrip().split("\t")[1])

    # Threshold ladder: c, 5c, 10c, 50c, ... capped by the total size.
    values = {}
    z = 0
    c = args.minimum_coverage
    while c < total:
        z += 1
        values[c] = z
        c = c * 5
        if c >= total:
            break
        z += 1
        values[c] = z
        c = c * 2
    z += 1
    values[total] = z
    # Flip the numbering so the whole-reference threshold is stratum 1.
    for c in sorted(values):
        values[c] = z - values[c] + 1

    if args.output_key:
        with open(args.output_key, 'w') as key_handle:
            key_handle.write("bp_size\tstrata_label\n")
            for c in sorted(values):
                key_handle.write(str(c) + "\t" + str(values[c]) + "\n")

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    vals = []
    try:
        z = 0
        for line in inf:
            z += 1
            if z % 100000 == 0:
                sys.stderr.write(str(z) + " bed entries read \r")
            f = line.rstrip().split("\t")
            addition = 0
            if not args.dont_make_unique:
                # Small random offset added to each depth value.
                addition = +args.unique_scale * random.random()
            vals.append([f[0], int(f[1]), int(f[2]), float(f[3]) + addition])
    finally:
        if inf is not sys.stdin:
            inf.close()
    sys.stderr.write("\n")

    # keep track of the number of bases at each depth
    depths = {}
    z = 0
    for f in vals:
        z += 1
        if z % 100000 == 0:
            sys.stderr.write(str(z) + " bed entries read \r")
        depths[f[3]] = depths.get(f[3], 0) + (f[2] - f[1])
    sys.stderr.write("\n")

    # Walk depths from deepest to shallowest, accumulating covered bases and
    # advancing through the thresholds. The index never passes the final
    # threshold, so input covering more bases than the reference lands in
    # stratum 1 instead of raising IndexError.
    stratas = sorted(values)
    idx = 0
    pos = 0
    depth_strata = {}
    for d in sorted(depths, reverse=True):
        pos += depths[d]
        while idx < len(stratas) - 1 and stratas[idx] < pos:
            idx += 1
        depth_strata[d] = values[stratas[idx]]

    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        # ``buffer`` stays None for empty input, so nothing is written and
        # nothing is indexed.
        buffer = None
        for val in vals:
            val[3] = depth_strata[val[3]]
            if (buffer is not None and val[1] == buffer[2]
                    and val[3] == buffer[3] and val[0] == buffer[0]):
                buffer[2] = val[2]  # extend the current run
                continue
            if buffer is not None:
                of.write(buffer[0] + "\t" + str(buffer[1]) + "\t"
                         + str(buffer[2]) + "\t" + str(buffer[3]) + "\n")
            buffer = val
        if buffer is not None:
            of.write(buffer[0] + "\t" + str(buffer[1]) + "\t"
                     + str(buffer[2]) + "\t" + str(buffer[3]) + "\n")
    finally:
        # Close files opened here; sys.stdout is only flushed.
        if of is sys.stdout:
            of.flush()
        else:
            of.close()
def main():
    """Run the input lines through ``do_buffer`` and write SAM or BAM output.

    Arguments come from ``do_inputs()``. Output goes to stdout, or through
    ``samtools view -Sb`` when ``args.output`` ends in ``.bam``; any other
    output name is rejected. The reference FASTA is loaded into a dict of
    upper-cased sequences, a SAM header is written, and each buffer of input
    lines is passed to ``do_buffer`` with the results handed to ``do_out``.
    With ``args.threads > 1`` the work is submitted to a ``Pool``. The
    temporary directory is removed at the end unless
    ``args.specific_tempdir`` is set.
    """
    global of
    # do our inputs
    args = do_inputs()

    of = sys.stdout
    p = None
    if args.output:
        if args.output[-4:] == '.bam':
            # Pass an argument list rather than splitting a command string,
            # so an output path containing whitespace stays one argument.
            p = Popen(['samtools', 'view', '-Sb', '-', '-o', args.output],
                      stdin=PIPE)
            of = p.stdin
        else:
            sys.stderr.write(
                "ERROR: stdout and .bam are the only valid output formats\n")
            # Non-zero status so callers can detect the failure.
            sys.exit(1)

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    sys.stderr.write("reading reference genome\n")
    with open(args.reference) as ref_handle:
        ref = FastaData(ref_handle.read())
    shared = {}
    for name in sorted(ref.keys()):
        sys.stderr.write("reading " + name + "\n")
        shared[name] = ref[name].upper()
        # Drop each sequence from ``ref`` once its upper-cased copy is kept.
        ref.remove(name)
    sys.stderr.write("finished reading shared memory reference\n")

    sys.stderr.write("Now make the header\n")
    of.write("@HD\tVN:1.0\tSO:unknown\n")
    of.write("@PG\tID:SLR\n")
    for name in sorted(shared.keys()):
        of.write("@SQ\tSN:" + name + "\tLN:" + str(len(shared[name])) + "\n")

    # Any thread count below 2 runs inline instead of hitting an undefined
    # pool.
    pool = None
    if args.threads > 1:
        pool = Pool(processes=args.threads)

    def dispatch(batch):
        """Process one batch inline or hand it to the pool."""
        if pool is None:
            do_out(do_buffer(batch, shared, args))
        else:
            pool.apply_async(do_buffer, args=(batch[:], shared, args,),
                             callback=do_out)

    buffer = []
    max_buffer = 1
    z = 0
    try:
        for line in inf:
            z += 1
            if z % 1000 == 0:
                sys.stderr.write(str(z) + " \r")
            buffer.append(line)
            if len(buffer) >= max_buffer:
                dispatch(buffer)
                buffer = []
        if len(buffer) > 0:
            dispatch(buffer)
    finally:
        if inf is not sys.stdin:
            inf.close()

    if pool is not None:
        pool.close()
        pool.join()
    sys.stderr.write("\n")

    if p is not None:
        p.communicate()  # closes samtools' stdin and waits for it to finish
    else:
        of.flush()  # stdout is flushed, not closed

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Convert bedgraph depths into coverage-strata labels and merge runs.

    Steps:

    1. Sum the reference size, either from the ``FastaData`` sequences of
       ``args.reference_genome`` or from column two of the tab-separated
       ``args.reference_lengths`` file.
    2. Build size thresholds starting at ``args.minimum_coverage`` and
       multiplying alternately by 5 and 2, finishing with the total size.
       Labels count down so the total size is stratum 1.
    3. Read the bed entries, rank depths from highest to lowest, and label
       each depth with the smallest threshold not below the cumulative
       number of bases at that depth or higher.
    4. Write the entries with the label in place of the depth, merging
       consecutive entries that touch and share chromosome and label.

    :param args: namespace providing ``seed``, ``reference_genome``,
        ``reference_lengths``, ``minimum_coverage``, ``output_key``,
        ``input`` (``-`` for STDIN), ``output`` (empty for STDOUT),
        ``dont_make_unique`` and ``unique_scale``. Paths ending in ``.gz``
        are opened with ``gzip``.
    """
    random.seed(args.seed)

    # Named ``genome_size`` so the builtin ``sum`` is not shadowed.
    genome_size = 0
    if args.reference_genome:
        with open(args.reference_genome) as ref_handle:
            ref = FastaData(ref_handle.read())
        for name in ref.keys():
            genome_size += len(ref[name])
    else:
        with open(args.reference_lengths) as inf:
            for line in inf:
                f = line.rstrip().split("\t")
                genome_size += int(f[1])

    # Thresholds in increasing order: c, 5c, 10c, 50c, ..., genome_size.
    thresholds = []
    c = args.minimum_coverage
    factor = 5
    while c < genome_size:
        thresholds.append(c)
        c = c * factor
        factor = 2 if factor == 5 else 5
    thresholds.append(genome_size)
    # Largest threshold gets label 1, smallest gets the highest label.
    values = {}
    for rank, size in enumerate(thresholds):
        values[size] = len(thresholds) - rank

    ### Now values contains the stratified coverage values
    if args.output_key:
        with open(args.output_key, 'w') as key_out:
            key_out.write("bp_size\tstrata_label\n")
            for size in thresholds:
                key_out.write(str(size) + "\t" + str(values[size]) + "\n")

    if args.input == '-':
        inf = sys.stdin
    elif args.input[-3:] == '.gz':
        inf = gzip.open(args.input)
    else:
        inf = open(args.input)
    vals = []
    try:
        for z, line in enumerate(inf, 1):
            if z % 100000 == 0:
                sys.stderr.write(str(z) + " bed entries read \r")
            f = line.rstrip().split("\t")
            addition = 0
            if not args.dont_make_unique:
                # Small random offset added to each depth value.
                addition = +args.unique_scale * random.random()
            vals.append([f[0], int(f[1]), int(f[2]), float(f[3]) + addition])
    finally:
        if inf is not sys.stdin:
            inf.close()
    sys.stderr.write("\n")

    # keep track of the number of bases at each depth
    depths = {}
    for z, f in enumerate(vals, 1):
        if z % 100000 == 0:
            sys.stderr.write(str(z) + " bed entries read \r")
        depth = f[3]
        if depth not in depths:
            depths[depth] = 0
        depths[depth] += f[2] - f[1]
    sys.stderr.write("\n")

    # Deepest first: accumulate covered bases and step through thresholds.
    # The cursor stops at the last threshold, so a bed file covering more
    # bases than the reference falls into stratum 1 instead of raising
    # IndexError on an emptied list.
    cursor = 0
    last = len(thresholds) - 1
    pos = 0
    depth_strata = {}
    for d in sorted(depths, reverse=True):
        pos += depths[d]
        while cursor < last and thresholds[cursor] < pos:
            cursor += 1
        depth_strata[d] = values[thresholds[cursor]]

    def write_entry(handle, entry):
        """Write one [chrom, start, end, label] entry as a tab-separated line."""
        handle.write(entry[0] + "\t" + str(entry[1]) + "\t"
                     + str(entry[2]) + "\t" + str(entry[3]) + "\n")

    if not args.output:
        of = sys.stdout
    elif args.output[-3:] == '.gz':
        of = gzip.open(args.output, 'w')
    else:
        of = open(args.output, 'w')
    try:
        # With no input entries ``buffer`` stays None and nothing is written.
        buffer = None
        for val in vals:
            val[3] = depth_strata[val[3]]
            touching = (buffer is not None and val[1] == buffer[2]
                        and val[3] == buffer[3] and val[0] == buffer[0])
            if touching:
                buffer[2] = val[2]
                continue
            if buffer is not None:
                write_entry(of, buffer)
            buffer = val
        if buffer is not None:
            write_entry(of, buffer)
    finally:
        # Never close sys.stdout; only close handles opened above.
        if of is sys.stdout:
            of.flush()
        else:
            of.close()
def main():
    """Drive the run: read input lines, process them, and emit SAM or BAM.

    ``do_inputs()`` supplies the arguments. Output is stdout unless
    ``args.output`` names a ``.bam`` file, in which case the SAM text is
    piped into ``samtools view -Sb``; other output names abort with an
    error. The reference sequences are upper-cased into a plain dict and
    used for the ``@SQ`` header lines. Input lines are buffered and sent to
    ``do_buffer``; its results go to ``do_out``, directly or as the callback
    of a ``Pool`` task when ``args.threads > 1``. Unless
    ``args.specific_tempdir`` is set, ``args.tempdir`` is removed last.
    """
    global of
    # do our inputs
    args = do_inputs()

    of = sys.stdout
    p = None
    if args.output:
        if args.output[-4:] != '.bam':
            sys.stderr.write(
                "ERROR: stdout and .bam are the only valid output formats\n")
            # Exit with a failure status; a bare sys.exit() reported success.
            sys.exit(1)
        # An argv list keeps an output path with spaces as a single argument,
        # which splitting a command string on whitespace would not.
        samtools_cmd = ['samtools', 'view', '-Sb', '-', '-o', args.output]
        p = Popen(samtools_cmd, stdin=PIPE)
        of = p.stdin

    if args.input == '-':
        inf = sys.stdin
    elif args.input[-3:] == '.gz':
        inf = gzip.open(args.input)
    else:
        inf = open(args.input)

    sys.stderr.write("reading reference genome\n")
    with open(args.reference) as ref_handle:
        ref = FastaData(ref_handle.read())
    shared = {}
    for name in sorted(ref.keys()):
        sys.stderr.write("reading " + name + "\n")
        shared[name] = ref[name].upper()
        # The sequence is removed from ``ref`` after its copy is stored.
        ref.remove(name)
    sys.stderr.write("finished reading shared memory reference\n")

    sys.stderr.write("Now make the header\n")
    header = ["@HD\tVN:1.0\tSO:unknown\n", "@PG\tID:SLR\n"]
    for name in sorted(shared.keys()):
        header.append(
            "@SQ\tSN:" + name + "\tLN:" + str(len(shared[name])) + "\n")
    for header_line in header:
        of.write(header_line)

    # Thread counts below 2 are processed inline; previously a count below
    # 1 referenced a pool that was never created.
    use_pool = args.threads > 1
    if use_pool:
        pool = Pool(processes=args.threads)

    def submit(batch):
        """Run do_buffer on one batch and route the results to do_out."""
        if use_pool:
            pool.apply_async(do_buffer, args=(batch[:], shared, args,),
                             callback=do_out)
        else:
            results = do_buffer(batch, shared, args)
            do_out(results)

    max_buffer = 1
    buffer = []
    try:
        for z, line in enumerate(inf, 1):
            if z % 1000 == 0:
                sys.stderr.write(str(z) + " \r")
            buffer.append(line)
            if len(buffer) >= max_buffer:
                submit(buffer)
                buffer = []
        if buffer:
            submit(buffer)
    finally:
        # Release the input handle unless it is STDIN.
        if inf is not sys.stdin:
            inf.close()

    if use_pool:
        pool.close()
        pool.join()
    sys.stderr.write("\n")

    if p is None:
        of.flush()  # leave sys.stdout open
    else:
        p.communicate()  # closes the pipe and waits for samtools

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)