def ConvertToBedViaR(cel_file, bpmapfile):
    '''Call R (rMAT + Biobase) to dump one CEL file as a tab-separated table.

    cel_file  -- path of the CEL file passed to rMAT's BPMAPCelParser
    bpmapfile -- path of the BPMAP mapping file passed to BPMAPCelParser

    The output path is StringUtils.rreplace(cel_file, 'CEL', 'BED', 2) with
    "like" appended (presumably "x.CEL" -> "x.BEDlike"; confirm against
    StringUtils.rreplace).  The table written by R has the columns chrNo,
    probePos and MATScore.  Nothing is returned.
    '''
    # R session this function reproduces (redundant variables removed):
    #   library(rMAT)
    #   library(Biobase)
    #
    #   expName <- "2012-07-12_SIR2_IP";        # cel file name
    #   bpmapFile <- "Sc03b_MR_v04.bpmap";      # the mapping!
    #   seqHeader <- ReadBPMAPAllSeqHeader(bpmapFile);
    #   arrayFile1 <- c("2012-07-12_SIR2_IP.CEL");
    #   ScSet <- BPMAPCelParser(bpmapFile, arrayFile1, verbose = FALSE,
    #                           groupName = "Sc", seqName="chr");
    #   data <- list(chrNo = ScSet1@featureChromosome,
    #                probePos = ScSet1@featurePosition,
    #                MATScore = exprs(ScSet1));  # last parameter is the raw data.
    #   write.table(data, file = paste(expName,"_exp1_AllData.txt",sep=''),
    #               append = FALSE, row.names = FALSE, sep = "\t");
    bedfile = StringUtils.rreplace(cel_file, 'CEL', 'BED', 2) + "like"

    print("input file: %s" % (cel_file))
    print("Will be writing out to %s" % (bedfile))

    print("importing rMat and Biobase libraries")
    for r_library in ('rMAT', 'Biobase'):
        importr(r_library)

    # The seqHeader step of the R session is intentionally not executed:
    # robjects.r('seqHeader <- ReadBPMAPAllSeqHeader(\"' + bpmapfile + '\")')

    print("creating scSet")
    parse_command = ''.join([
        'ScSet <- BPMAPCelParser(\"', bpmapfile,
        '\", c(\"', cel_file,
        '\"), verbose = FALSE, groupName = "Sc", seqName="chr")',
    ])
    robjects.r(parse_command)

    print("creating data")
    # The last list member (MATScore) is the raw data.
    robjects.r('data <- list(chrNo = ScSet@featureChromosome, probePos = ScSet@featurePosition, MATScore = exprs(ScSet))')

    print("writing table")
    # "\t" is a real tab character inside the R string literal.
    write_command = ''.join([
        'write.table(data, file = \"', bedfile,
        '\", append = FALSE, quote = FALSE, row.names = FALSE, sep = "\t")',
    ])
    robjects.r(write_command)
# Scale the baseline against every file found in args.BEDlikefiles and apply it.
# `args`, ReadBaseline, FindAverageProbeIntensity and ApplyBaseline are defined
# outside this section.
files = os.listdir(args.BEDlikefiles)
starttime = time.time()
num_probes = 2635714  # hard-coded length of each probe array

print("initializing array...")
# Two rows of num_probes floats; list repetition is safe because floats are
# immutable, and much cheaper than a per-element comprehension.
probeset = [[0.0] * num_probes for _ in xrange(0, 2)]
print("reading baseline from %s" % (args.Baselinefile))
print("probeset[%i][%i]" % (len(probeset), len(probeset[0])))
ReadBaseline(args.Baselinefile, probeset)
blaverage = FindAverageProbeIntensity(probeset)
print("baseline average probe intensity is: %s" % (blaverage,))
print("done.")

# op = StringUtils.rreplace(sys.argv[1], 'BED', 'NORMAL', 1)
for i, f in enumerate(files):
    filetime = time.time()
    of = StringUtils.rreplace(f, '.BEDlike', '.normalized.BEDlike', 1)
    # os.path.join (instead of plain concatenation) keeps the path valid when
    # the directory argument was given without a trailing separator.
    bed_path = os.path.join(args.BEDlikefiles, f)

    bedprobes = [[0.0] * num_probes for _ in xrange(0, 2)]
    print("determining average probe intensity from %s" % (bed_path,))
    print("bedprobes[%i][%i]" % (len(bedprobes), len(bedprobes[0])))
    ReadBaseline(bed_path, bedprobes)
    bedaverage = FindAverageProbeIntensity(bedprobes)
    print("average probe intensity is: %s" % (bedaverage,))

    # The baseline is only ever scaled up, never down.
    scaling = float(bedaverage) / blaverage
    if scaling < 1:
        scaling = 1
    print("baseline needs to be scaled by a factor of %s" % (scaling,))

    # File numbers passed to ApplyBaseline are 1-based.
    # NOTE(review): the output path is still prefix-concatenated, exactly as
    # before; args.output_path presumably must end with a path separator.
    ApplyBaseline(i + 1, bed_path, "%s%s" % (args.output_path, of), probeset, scaling)
    print("File %i - %s processed in %f seconds" % (i + 1, f, time.time() - filetime))

# ProduceStats(len(files) + 1, probeset)
print('Completed in %s seconds' % int((time.time() - starttime)))
def run(PARAM, metadata_file, tohide):
    '''Read metadata_file and update the matching sample entries in MongoDB.

    PARAM         -- parameter object; "server", "port" and "default_database"
                     are read from it with PARAM.get()
    metadata_file -- tab-separated file whose first row is a header.  Column 0
                     is a file name (".CEL" is replaced by ".normalized.waves"
                     through StringUtils.rreplace); columns 1-16 hold the
                     metadata fields listed in `fields` below, in that order.
    tohide        -- truthiness is stored as the boolean 'hide' field

    Rows whose derived file_name has no entry in the 'samples' collection are
    counted and reported, not inserted.  Exits through sys.exit() when
    metadata_file does not exist.  A row with fewer than 17 columns raises
    IndexError, as before.
    '''
    if not os.path.isfile(metadata_file):
        print("Unable to find file %s" % metadata_file)
        sys.exit()

    print("Data being imported into database:  %s" % (PARAM.get('default_database'),))
    print("opening connection(s) to MongoDB...")
    mongo = Mongo_Connector.MongoConnector(PARAM.get("server"), PARAM.get("port"), PARAM.get('default_database'))

    collection_name = 'samples'
    # Metadata keys for columns 1..16 of each row.
    fields = ("exp_date", "strain_number", "strain_background", "mutations",
              "researcher", "type", "antibody", "catalog_number",
              "antibody_volume", "array_type", "array_lot_number", "protocol",
              "crosslinking_time", "pubmed_id", "sample_id", "comments")
    missing = 0

    # try/finally so the connection is released even when a row is malformed.
    try:
        print("processing %s..." % metadata_file)
        with open(metadata_file, 'r') as f:
            for row_number, line in enumerate(f):
                if row_number == 0:
                    continue  # header row

                # Strip the line terminator first so the last column
                # ("comments") is stored without a trailing newline and an
                # empty comment becomes "" for both "\n" and "\r\n" files.
                a = line.rstrip("\r\n").split("\t")

                # Find file name
                file_name = StringUtils.rreplace(a[0], ".CEL", ".normalized.waves", 1)
                print("Checking for file %s in database %s" % (file_name, PARAM.get('default_database')))

                # check if entry exists in database
                if mongo.find(collection_name, {"file_name": file_name}, {"_id": 1}).count() > 0:
                    print("Entry exists; updating metadata for  %s" % (file_name,))
                    sample_update = dict((key, a[column]) for column, key in enumerate(fields, 1))
                    sample_update['hide'] = bool(tohide)
                    mongo.update(collection_name, {"file_name": file_name}, {"$set": sample_update})
                    print("\tFinished updating metadata for  %s" % (file_name,))
                else:
                    print("NOT FOUND:  %s" % (file_name,))
                    missing += 1

        print("Done all updates from metadata file  %s" % (metadata_file,))
        if missing > 0:
            print("%s files were unable to be updated as no matching file_name was found in the waves table." % missing)
    finally:
        mongo.close()
# Column order of every row written to the splice summary file (also its header).
_SPLICE_ROW_FIELDS = (
    'gene', 'chr', 'p1', 'p2', 'e11', 'e12', 'i1', 'i2', 'e21', 'e22',
    'u1', 'u2', 'p', 'pe1', 'e1', 'e1i', 'i', 'ie2', 'e2', 'e2u', 'u')

# (counter key, left-boundary key, right-boundary key) triples.
_SPLICE_BODY_REGIONS = (('e1', 'e11', 'e12'), ('i', 'i1', 'i2'), ('e2', 'e21', 'e22'))
_SPLICE_SITE_WINDOWS = (('pe1', 'pe11', 'pe12'), ('e1i', 'e1i1', 'e1i2'),
                        ('ie2', 'ie21', 'ie22'), ('e2u', 'e2u1', 'e2u2'))


def _splice_bin_index(value, width, bins):
    '''Return the histogram bin for value, clamped to the range [0, bins - 1].'''
    return min(max(int(value / width), 0), bins - 1)


def _splice_auto_threshold(waves, heisig):
    '''Derive the noise/signal threshold from the wave list (height or sigma).'''
    if heisig:
        bins = 70  # based on max peak height of 7
        edges = [(i + 1) * 0.1 for i in range(0, bins)]
        counts = [0] * bins
        for wave in waves:  # histogram of heights, bin size 0.1
            counts[_splice_bin_index(wave['height'], 0.1, bins)] += 1
        # Regress on bins 10-14 only; the original note says bins 0-9
        # (heights 0 to 0.9) are not present in the data.
        slr = scipystats.linregress(edges[10:15], counts[10:15])
        slope = slr[0]
        intercept = slr[1]
        # x-intercept of the fitted line is taken as the noise/signal boundary
        xint = abs(intercept / slope)
        print("height threshold between noise and signal is  %s" % (xint,))
        return round(xint, 2)

    bins = 300  # based on max sigma of 300
    threshes = list(range(1, bins + 1))
    counts = [0] * bins
    for wave in waves:  # histogram of sigmas, bin size 1
        counts[_splice_bin_index(wave['stddev'], 1, bins)] += 1
    highest = 0
    ind = -1  # stays -1 (i.e. the last threshold) when there are no waves
    for i in range(0, bins):
        if counts[i] > highest:
            highest = counts[i]
            ind = i
    thresh = threshes[ind]  # most frequent sigma bin, plus one
    print("sigma threshold between noise and signal is  %s" % (thresh,))
    return thresh


def _splice_first_hit(gene, pos, windows):
    '''Return the first (counter, lo, hi) window containing pos, else None.'''
    for window in windows:
        if gene[window[1]] <= pos <= gene[window[2]]:
            return window
    return None


def _splice_tally_wave(gene, pos, left, right):
    '''Add one wave to at most one body region and one splice-site window.'''
    # The sigma-extended edges are only used at the outer ends of the gene
    # (start of promoter, end of 3' UTR); inner regions use the peak position.
    if right >= gene['p1'] and pos <= gene['p2']:
        body = ('p', 'p1', 'p2')
    else:
        body = _splice_first_hit(gene, pos, _SPLICE_BODY_REGIONS)
        if body is None and pos >= gene['u1'] and left <= gene['u2']:
            body = ('u', 'u1', 'u2')
    site = _splice_first_hit(gene, pos, _SPLICE_SITE_WINDOWS)
    for window in (body, site):
        if window is not None:
            counter, lo_key, hi_key = window
            # each wave contributes 1 / (right boundary - left boundary)
            gene[counter] += 1.0 / (gene[hi_key] - gene[lo_key])


def _splice_summary_row(gene):
    '''Format one gene as a tab-separated summary line.'''
    return "\t".join(str(gene[key]) for key in _SPLICE_ROW_FIELDS)


def _splice_wait_for_writer(print_thread, print_queue):
    '''Wait for the print queue to drain, then stop and close the writer.'''
    if print_thread is None or not print_thread.is_alive():
        return
    while print_queue.qsize() > 0:
        print("waiting on print_queue to empty %s" % (print_queue.qsize(),))
        time.sleep(1)
    print_thread.END_PROCESSES = True
    print_thread.f.close()


def find_waves_in_spliced_genes(splicefile, wavesfile, output, autothresh, heisig, thresh):
    '''Count waves per region of each spliced gene and write a summary table.

    splicefile -- tab-separated file, no header: gene, chromosome number
                  (1-16), strand flag (1 = forward, anything else = reverse)
                  and six coordinates in columns 3-8 (exon1, intron and exon2
                  boundaries; each pair is swapped for the reverse strand)
    wavesfile  -- tab-separated file: chromosome, position, sigma (int),
                  height (float); lines starting with "#" are skipped
    output     -- passed to PrintThread.StringWriter as the output location
    autothresh -- when true, derive the threshold from the waves themselves
    heisig     -- true: threshold on wave height; false: threshold on sigma
    thresh     -- threshold used when autothresh is false

    Waves at or below the threshold are dropped.  Each remaining wave is
    matched to the first gene (sorted by chromosome name, then promoter start)
    whose promoter-to-UTR span it overlaps within one sigma.  Per-gene values
    go through a print queue to "<waves basename>_<thresh>_splice_summary.txt";
    summary averages are printed to stdout.  Nothing is returned.
    '''
    chromosomes = {1: "I", 2: "II", 3: "III", 4: "IV", 5: "V", 6: "VI", 7: "VII", 8: "VIII",
                   9: "IX", 10: "X", 11: "XI", 12: "XII", 13: "XIII", 14: "XIV", 15: "XV", 16: "XVI"}

    print("Reading files...")
    waves = []
    with open(wavesfile, 'r') as f:
        for line in f:
            if line.startswith("#"):
                continue
            a = line.split("\t")
            waves.append({
                "chr": "chr" + a[0].replace("chr", "").upper(),
                "pos": int(a[1]),
                "stddev": int(a[2]),
                "height": float(a[3]),
                "used": False,
            })

    if autothresh:
        thresh = _splice_auto_threshold(waves, heisig)

    # remove waves that don't meet threshold
    strength = 'height' if heisig else 'stddev'
    waves = [wave for wave in waves if wave[strength] > thresh]
    waves.sort(key=lambda wave: (wave['chr'], wave['pos']))  # sorted by position

    splice = []
    with open(splicefile, 'r') as f:
        for line in f:
            a = line.split("\t")
            chrom = "chr" + chromosomes[int(a[1])]
            forward = int(a[2]) == 1
            c3, c4, c5, c6, c7, c8 = [int(a[k]) for k in range(3, 9)]
            if forward:
                e11, e12, i1, i2, e21, e22 = c3, c4, c5, c6, c7, c8
            else:
                e11, e12, i1, i2, e21, e22 = c4, c3, c6, c5, c8, c7
            u1 = e22 + 1
            splice.append({
                "gene": a[0], "chr": chrom,
                # promoter: the 300 positions before exon 1
                "p1": e11 - 301, "p2": e11 - 1,
                "e11": e11, "e12": e12,   # exon 1
                "i1": i1, "i2": i2,       # intron
                "e21": e21, "e22": e22,   # exon 2
                # 3' UTR: the 300 positions after exon 2
                "u1": u1, "u2": e22 + 301,
                # counts in each region
                'p': 0, 'e1': 0, 'i': 0, 'e2': 0, 'u': 0,
                # splice site windows (-99 / +101 around each boundary)
                'pe11': e11 - 99, 'pe12': e11 + 101,
                'e1i1': i1 - 99, 'e1i2': i1 + 101,
                'ie21': e21 - 99, 'ie22': e21 + 101,
                'e2u1': u1 - 99, 'e2u2': u1 + 101,
                # splice site counts
                'pe1': 0, 'e1i': 0, 'ie2': 0, 'e2u': 0,
            })
    splice.sort(key=lambda gene: (gene['chr'], gene['p1']))

    print_queue = multiprocessing.Queue()
    # launch thread to read and process the print queue
    print_thread = PrintThread.StringWriter(
        print_queue, output,
        StringUtils.rreplace(os.path.basename(wavesfile), ".normalized.waves",
                             "_" + str(thresh) + "_splice_summary.txt", 1),
        True, True)
    print_queue.put("\t".join(_SPLICE_ROW_FIELDS))  # header

    print("Now finding peaks in each region...")
    # Merge-walk the two sorted lists; a gene's row is written once the
    # current wave lies past it.
    w = 0
    b = 0
    while b < len(splice) and w < len(waves):
        wave = waves[w]
        gene = splice[b]
        pos = wave['pos']
        left = pos - 1 * wave['stddev']
        right = pos + 1 * wave['stddev']
        if wave['chr'] == gene['chr'] and right >= gene['p1'] and left <= gene['u2']:
            _splice_tally_wave(gene, pos, left, right)
            wave['used'] = True
            w += 1
        elif wave['chr'] > gene['chr'] or (left > gene['u2'] and wave['chr'] == gene['chr']):
            # wave is past this gene: write the gene and move to the next one
            print_queue.put(_splice_summary_row(gene))
            b += 1
        else:
            # wave is before this gene
            w += 1

    # Ran out of waves: write the remaining genes with whatever they hold.
    for gene in splice[b:]:
        print_queue.put(_splice_summary_row(gene))

    _splice_wait_for_writer(print_thread, print_queue)

    # quick summary statistics
    unassigned = sum(1 for wave in waves if not wave["used"])
    if heisig:
        print("Height threshold applied to waves: %s" % (thresh,))
    else:
        print("Sigma threshold applied to waves: %s" % (thresh,))

    counters = ('p', 'pe1', 'e1', 'e1i', 'i', 'ie2', 'e2', 'e2u', 'u')
    totals = dict((key, 0) for key in counters)
    emptygenes = 0
    for gene in splice:
        for key in counters:
            totals[key] += gene[key]
        if gene['p'] == 0 and gene['e1'] == 0 and gene['i'] == 0 and gene['e2'] == 0 and gene['u'] == 0:
            emptygenes += 1  # no waves in this gene

    # Genes without any wave are left out of the averages.  When every gene is
    # empty the averages are reported as 0 instead of dividing by zero.
    ln = len(splice) - emptygenes
    averages = dict((key, (float(totals[key]) / ln) if ln else 0.0) for key in counters)

    print("%s waves were not part of a spliced gene" % unassigned)
    print("%s genes had no waves" % emptygenes)
    print("Region\tAverage_num_waves")
    print("Prom\t%f" % averages['p'])
    print("Exon_1\t%f" % averages['e1'])
    print("Intron\t%f" % averages['i'])
    print("Exon_2\t%f" % averages['e2'])
    print("3'_UTR\t%f" % averages['u'])
    print("\n")
    print("prom_exon1\t%f" % averages['pe1'])
    print("exon1_intron\t%f" % averages['e1i'])
    print("intron_exon2\t%f" % averages['ie2'])
    print("exon2_utr\t%f" % averages['e2u'])
def FindBaseline(file_name, normalize = False):
    '''Read a tab-separated probe file and write it out as an interpolated wig track.

    file_name -- input file; the first row is a header, every other row is
                 chromosome, position (int), value (float)
    normalize -- when true, the most frequent value is subtracted from every
                 value (negative results are clamped to 0) before writing

    The output name is file_name with '.BEDlike' removed through
    StringUtils.rreplace; writing goes through WigFileThread.WigFileWriter.
    Nothing is returned.  An input without data rows fails at data[0].

    TODO: break this into smaller functions

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    source; confirm in particular that the trailing add_map flush belongs
    inside the outer while loop.
    '''
    f = open(file_name, 'r')  # open file
    print "processing data file (" + file_name + ")..."
    first_line = True
    headers = []
    data = []
    for line in f:
        if (first_line):
            # header row: lower-cased and stripped of newlines (not used afterwards)
            headers = line.split("\t")
            for h in range(len(headers)):
                headers[h] = headers[h].lower()
                if headers[h].find("\n") != -1:
                    headers[h] = headers[h].replace("\n", "")
            first_line = False
        else:
            a = line.split("\t")
            # row is defined elsewhere; it exposes .chromosome, .position,
            # .value and .setv() as used below
            r = row(a[0], int(a[1]), float(a[2]))
            data.append(r)
    f.close()
    if normalize:
        # find most common number: histogram keyed by the string form of each value
        v_sum = 0
        hist = {}
        for x in range(len(data)):
            v_sum += data[x].value
            if hist.has_key(str(data[x].value)):
                hist[str(data[x].value)] += 1
            else:
                hist[str(data[x].value)] = 1
        v_avg = v_sum / len(data)
        print "v average = %f" % (v_avg)
        point = 0
        largest = 0
        for g, y in hist.iteritems():
            if y > largest:
                largest = y
                point = g
                # print "g, y (%s, %i)" % (g, y)
        # print "point, largest (%s, %i)" % (point, largest)
        point = float(point)
        # shift every value down by the most common value, clamping at zero
        for g in range(len(data)):
            v = data[g].value - point
            if v < 0:
                v = 0
            data[g].setv(v)
    # create wig file
    f_w_name = StringUtils.rreplace(file_name, '.BEDlike', '', 1)
    # f_w_name = StringUtils.rreplace(f_w_name, 'BED', 'WIG', 1)
    # #should these two lines just replace .BEDlike with .WIG?
    trackname = os.path.basename(f_w_name)
    # print "Writing to %s" % (f_w_name)
    # f = open(f_w_name, 'w')  # open file
    current_chr = data[0].chromosome
    # chr_yeast is a lookup defined elsewhere, indexed by chromosome name
    print "New Chromosome %s (%s)" % (current_chr, chr_yeast[current_chr])
    last_bp = 0
    last_ht = 0
    wigfile = WigFileThread.WigFileWriter(None)
    wigfile.start_wig_writer(os.path.dirname(f_w_name), os.path.basename(f_w_name), trackname)
    # for x in range (0, 5020):
    #     print "x=%i position=%i, value=%f" % (x, data[x].position + 1, data[x].value)
    x = 0
    a = 0  # counts non-interpolated points; never read afterwards
    # gc.disable()
    coverage_map = []
    l = len(data)
    x = 1  # the walk starts at the second data row; data[0] only seeds current_chr
    block_left = 0
    while x < l:
        # print "x of len: %i/%i" % (x, l)
        if data[x].chromosome != current_chr:
            if len(coverage_map) > 0:
                wigfile.add_map(coverage_map, chr_yeast[current_chr], block_left)  # for yeast chromosome nomenclature (roman numeral)
                # wigfile.add_map(coverage_map, current_chr, block_left)
                coverage_map = []
            # TODO: taper off chromosome
            # TODO: taper "on" new chromosome
            print "New Chromosome %s (%s)" % (data[x].chromosome, chr_yeast[data[x].chromosome])
            current_chr = data[x].chromosome  # switch chromosomes,
        if data[x - 1].chromosome == data[x].chromosome:
            block_left = data[x - 1].position + 2  # shift by 1, and block starts after the zero.
            last_bp = data[x - 1].position + 1
        else:
            block_left = data[x].position + 2
            last_bp = data[x].position + 1
        # print "x=%i data=%i, value=%f" % (x, data[x].position + 1, data[x].value)
        # Extend the current block, one entry per base, interpolating linearly
        # across gaps between consecutive probes.
        while x < l and data[x].value >= 0 and data[x].chromosome == current_chr:
            diff = (data[x].position + 1) - last_bp
            # if diff > 6:
            #     diff = 6
            if (diff > 1):
                slope = float(data[x].value - last_ht) / (diff)  # slope between the two
                for y in range(1, diff):
                    coverage_map.append(round(last_ht + (slope * y), 2))
                coverage_map.append(round(data[x].value, 2))
            else:
                coverage_map.append(round(data[x].value, 2))
                a += 1
            last_bp = data[x].position + 1
            last_ht = data[x].value
            # If the next probe is non-positive or on another chromosome,
            # taper this block down towards zero across the gap.
            if x < l - 1 and (data[x + 1].value <= 0 or data[x + 1].chromosome != current_chr):
                diff = (data[x + 1].position + 1) - (data[x].position + 1)
                slope = float(0 - data[x].value) / (diff)
                for y in range(1, diff):
                    coverage_map.append(round(data[x].value + (slope * y), 2))
                last_ht = 0
            x += 1
        # skip probes with a non-positive value before starting the next block
        while x < l and data[x].value <= 0:
            x += 1
        last_ht = 0
        if len(coverage_map) > 0:
            wigfile.add_map(coverage_map, chr_yeast[current_chr], block_left)  # for yeast chromosome nomenclature (roman numeral)
            # print "writing map"
            # wigfile.add_map(coverage_map, current_chr, block_left)
            coverage_map = []
    # gc.enable()
    print "Closing Wigwriter. This may take some time."
    wigfile.close_wig_writer()
    print "Wigwriter closed."
def _promoter_bin_index(raw_index, bins):
    '''Clamp a computed histogram index to the range [0, bins - 1].'''
    return min(max(raw_index, 0), bins - 1)


def _promoter_auto_threshold(waves, heisig):
    '''Derive the noise/signal threshold from the wave list (height or sigma).'''
    if heisig:
        bins = 70  # based on max peak height of 7
        edges = [(i + 1) * 0.1 for i in range(0, bins)]
        counts = [0] * bins
        for wave in waves:  # histogram of heights, bin size 0.1
            counts[_promoter_bin_index(int(wave['height'] / 0.1), bins)] += 1
        # Regress on bins 10-14 only; the original note says bins 0-9
        # (heights 0 to 0.9) are not present in the data.
        slr = scipystats.linregress(edges[10:15], counts[10:15])
        slope = slr[0]
        intercept = slr[1]
        # x-intercept of the fitted line is taken as the noise/signal boundary
        xint = abs(intercept / slope)
        print("height threshold between noise and signal is  %s" % (xint,))
        return round(xint, 2)

    bins = 300  # based on max sigma of 300
    threshes = list(range(1, bins + 1))
    counts = [0] * bins
    for wave in waves:  # histogram of sigmas, bin size 1
        counts[_promoter_bin_index(int(wave['stddev']), bins)] += 1
    highest = 0
    ind = -1  # stays -1 (i.e. the last threshold) when there are no waves
    for i in range(0, bins):
        if counts[i] > highest:
            highest = counts[i]
            ind = i
    thresh = threshes[ind]  # most frequent sigma bin, plus one
    print("sigma threshold between noise and signal is  %s" % (thresh,))
    return thresh


def _promoter_row(region, count, height):
    '''Format one promoter bin as a tab-separated summary line.'''
    return (region['gene'] + "\t" + region['chr'] + "\t" + str(region['start']) + "\t" +
            str(region['end']) + "\t" + str(count) + "\t" + str(height))


def _promoter_wait_for_writer(print_thread, print_queue):
    '''Wait for the print queue to drain, then stop and close the writer.'''
    if print_thread is None or not print_thread.is_alive():
        return
    while print_queue.qsize() > 0:
        print("waiting on print_queue to empty %s" % (print_queue.qsize(),))
        time.sleep(1)
    print_thread.END_PROCESSES = True
    print_thread.f.close()


def find_waves_in_promoter(orffile, wavesfile, output, autothresh, heisig):
    '''Count waves falling in a window at each gene start and write summaries.

    orffile    -- tab-separated file with a header row: gene, chromosome
                  number, two coordinates (columns 2 and 3) and a strand flag
                  in column 4 (1 selects column 2, anything else column 3);
                  chromosome 17 (assumed mitochondrial) is skipped
    wavesfile  -- tab-separated file: chromosome, position, sigma (int),
                  height (float); lines starting with "#" are skipped
    output     -- output location; also used as a prefix when the summary
                  file is read back
    autothresh -- when true, derive the threshold from the waves; otherwise
                  the user is prompted for it
    heisig     -- true: threshold on wave height; false: threshold on sigma

    The window size is read interactively.  Per-bin wave counts and summed
    heights are written to "<waves name>_<thresh>_promoter_summary.txt"; a
    histogram of bins per wave count goes to "..._promoter_counts.txt".
    Nothing is returned.
    '''
    chromosomes = {1: "I", 2: "II", 3: "III", 4: "IV", 5: "V", 6: "VI", 7: "VII", 8: "VIII",
                   9: "IX", 10: "X", 11: "XI", 12: "XII", 13: "XIII", 14: "XIV", 15: "XV", 16: "XVI"}

    print("Reading files...")
    waves = []
    with open(wavesfile, 'r') as f:
        for line in f:
            if line.startswith("#"):
                continue
            a = line.split("\t")
            waves.append({
                "chr": "chr" + a[0].replace("chr", "").upper(),
                "pos": int(a[1]),
                "stddev": int(a[2]),
                "height": float(a[3]),
                "used": False,
            })

    # automatically determine noise, or ask for the threshold
    if autothresh:
        thresh = _promoter_auto_threshold(waves, heisig)
    elif heisig:
        usr_in = raw_input("What would you like to use as the minimum wave height? ")
        thresh = float(usr_in)
    else:
        usr_in = raw_input("What would you like to use as the minimum wave sigma? ")
        thresh = int(usr_in)

    # remove waves that don't meet threshold
    strength = 'height' if heisig else 'stddev'
    waves = [wave for wave in waves if wave[strength] > thresh]
    waves.sort(key=lambda wave: (wave['chr'], wave['pos']))  # sorted by position

    usr_in = raw_input("Distance upstream of TSS to check for waves: ")
    prom = int(usr_in)

    bed = []
    with open(orffile, 'r') as f:
        next(f)  # skip header row
        for line in f:
            a = line.split("\t")
            if int(a[1]) == 17:  # assumed mitochondrial chromosome
                continue
            chrom = "chr" + chromosomes[int(a[1])]
            # NOTE(review): the prompt says "upstream", but the window as
            # coded is [col2, col2 + prom] for flag 1 and [col3 - prom, col3]
            # otherwise; the earlier variant (col2 - prom .. col2) was
            # commented out in the original.  Confirm which is intended.
            if int(a[4]) == 1:
                start = int(a[2])
                end = int(a[2]) + prom
            else:
                start = int(a[3]) - prom
                end = int(a[3])
            bed.append({'gene': a[0], "chr": chrom, "start": start, "end": end})
    bed.sort(key=lambda region: (region['chr'], region['start']))

    waves_name = os.path.basename(wavesfile)
    summary_name = StringUtils.rreplace(waves_name, ".waves", "_" + str(thresh) + "_promoter_summary.txt", 1)

    maxperbin = 0
    print_queue = multiprocessing.Queue()
    # launch thread to read and process the print queue
    print_thread = PrintThread.StringWriter(print_queue, output, summary_name, True, True)

    print("Now finding peaks in each promoter...")
    # Merge-walk the two sorted lists; count/height accumulate for bed[b].
    w = 0
    b = 0
    count = 0
    height = 0
    while b < len(bed) and w < len(waves):
        wave = waves[w]
        region = bed[b]
        left = wave['pos'] - 1 * wave['stddev']
        right = wave['pos'] + 1 * wave['stddev']
        if wave['chr'] == region['chr'] and right >= region['start'] and left <= region['end']:
            count += 1
            if count > maxperbin:
                maxperbin = count
            height += wave['height']
            wave['used'] = True
            w += 1
        elif wave['chr'] > region['chr'] or (right > region['end'] and wave['chr'] == region['chr']):
            # wave is past bin: write the bin and move to the next one
            print_queue.put(_promoter_row(region, count, height))
            b += 1
            count = 0
            height = 0
        else:
            # wave is before bin
            w += 1

    # Ran out of waves.  The bin in progress keeps what it accumulated
    # (previously it was overwritten with zeros); later bins are empty.
    while b < len(bed):
        print_queue.put(_promoter_row(bed[b], count, height))
        b += 1
        count = 0
        height = 0

    _promoter_wait_for_writer(print_thread, print_queue)

    print_queue = multiprocessing.Queue()
    # launch thread to read and process the print queue
    print_thread = PrintThread.StringWriter(
        print_queue, output,
        StringUtils.rreplace(waves_name, ".waves", "_" + str(thresh) + "_promoter_counts.txt", 1),
        True, True)

    # quick summary statistics
    print_queue.put("nwaves\tnbins\tavgBinSize")
    unassigned = 0
    un_height = 0
    for wave in waves:
        if not wave["used"]:
            unassigned += 1
            un_height += wave["height"]

    # for each bin of height 0.5 (labelled 1.0 .. 7.0), determine the
    # proportion of waves not mapped.
    # TODO (kept from the original): the binning is suspect.  Indices are now
    # clamped, so heights below 1.0 land in the first bin and heights of 7.5
    # or more in the last, instead of wrapping around or raising IndexError.
    nbins = 13
    total = [0] * nbins
    unmap = [0] * nbins
    for wave in waves:
        slot = _promoter_bin_index(int(wave['height'] / 0.5) - 2, nbins)
        total[slot] += 1
        if not wave['used']:
            unmap[slot] += 1
    # TODO: Print these values in some meaningful way.
    prop = [0.0] * nbins
    for i in range(0, len(prop)):
        if total[i] != 0:
            prop[i] = float(unmap[i]) / float(total[i])
        else:
            prop[i] = 0
    print("Proportions of unused waves in each bin:")
    print("[1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0]")
    print(total)
    print(unmap)
    print(prop)

    print("Max per bin is: %s" % (maxperbin,))
    if heisig:
        print("Height threshold applied to waves: %s" % (thresh,))
    else:
        print("Sigma threshold applied to waves: %s" % (thresh,))

    # Read the summary file back: bins per wave count, and their sizes.
    counts = [0] * (maxperbin + 1)
    sizes = [[] for _ in range(0, maxperbin + 1)]
    with open(output + summary_name, 'r', 0) as f:
        for line in f:
            a = line.split("\t")
            counts[int(a[4])] += 1
            sizes[int(a[4])].append(int(a[3]) - int(a[2]))

    # Percentages and averages fall back to 0 when their denominator is 0.
    pct_unassigned = round(float(unassigned) / len(waves) * 100, 2) if waves else 0.0
    print("%s waves of %s were unassigned to a BED bin (%f%%)." % (unassigned, len(waves), pct_unassigned))
    avg_un_height = (un_height / unassigned) if unassigned else 0
    print("Average height of waves not part of a bin: %s" % (avg_un_height,))
    print_queue.put(str(unassigned) + "\t0\t0")
    print("Total number of bins: %s" % (len(bed),))

    for i in range(0, maxperbin + 1):
        if len(sizes[i]) != 0:
            avg = float(sum(sizes[i])) / len(sizes[i])
        else:
            avg = 0
        if i == 0:
            pct_empty = (float(counts[i]) / len(bed) * 100) if bed else 0.0
            print("%i bins have no waves mapped to them (%s%%)" % (counts[i], pct_empty))
            print("%i bins have at least 1 wave mapped to them." % (len(bed) - counts[i]))
        print_queue.put(str(i) + "\t" + str(counts[i]) + "\t" + str(avg))

    _promoter_wait_for_writer(print_thread, print_queue)
if __name__ == "__main__":
    # Command-line entry point: load the parameter file, apply the optional
    # command-line overrides, report the effective settings and run main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "parameter_file",
        type = str,
        help = ".input parameterfile")
    parser.add_argument(
        "-data_file",
        type = str,
        help = "override the source file in the parameter input file, containing data upon which waves will be called")
    parser.add_argument(
        "-output_path",
        type = str,
        help = "override the path for output files")
    parser.add_argument(
        "-noise_compensation", "-nc",
        type = int,
        default = 1,
        help = "increase the amount of noise acceptable for wave calling - set to 16 for chip-chip, 1 for chip_seq.")
    args = parser.parse_args()

    p = Parameters.parameter()
    param = create_param_obj(args.parameter_file)
    param.set("noise_compensation", args.noise_compensation)

    # Command-line values take precedence over the parameter file.
    if args.data_file:
        param.set("input_file", args.data_file)
        # The output file_name follows the input file name with '.wig'
        # replaced by '' (".waves" gets added later).
        ofile = StringUtils.rreplace(os.path.basename(args.data_file), '.wig', '', 1)
        param.set("file_name", ofile)
    if args.output_path:
        param.set("output_path", args.output_path)

    print("param file:  %s" % (args.parameter_file,))
    print("input_file:  %s" % (param.get("input_file"),))
    print("output_path:  %s" % (param.get("output_path"),))
    print("output_file_name:  %s" % (param.get("file_name"),))

    main(param)