def main(argv):
    """Build a combined graph file from the reads of a BAM file.

    Steps, in order: read the chromosome table from the BAM file with
    ``SeparateByChrom.getChromsFromBam``, split the BAM file into
    per-chromosome ``.bed`` files, call ``makeGraphFile`` with the window and
    fragment sizes, hand the ``.graph`` extension and the output name to
    ``SeparateByChrom.combineAllGraphFiles``, then remove the ``.bed`` and
    ``.graph`` intermediates.

    Args:
        argv: Command-line tokens passed to ``OptionParser.parse_args``.
            ``-b``, ``-w``, ``-i`` and ``-o`` are required.  ``-s`` is still
            accepted for backward compatibility but is not used, because the
            chromosomes come from the BAM file.  The window and fragment
            sizes are parsed as integers.

    Raises:
        SystemExit: With status 1, after printing the usage text, when a
            required option is missing.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc",
                      metavar="<str>")
    # The value is handed to the BAM helpers below, so the help text says so.
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bamfile", help="bam file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size",
                      metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")
    (opt, args) = parser.parse_args(argv)

    # Fail early with the usage text instead of letting a None option crash
    # somewhere inside the helpers.  parser.exit() is used so this block does
    # not need any module beyond optparse.
    required = (opt.bamfile, opt.window_size, opt.fragment_size, opt.outfile)
    if any(value is None for value in required):
        parser.print_help()
        parser.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bamfile)
    chroms = list(chromsDict.keys())

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bamfile, '.bed')
    makeGraphFile(chroms, chromsDict, opt.window_size, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)

    SeparateByChrom.cleanup(chroms, ".bed")
    SeparateByChrom.cleanup(chroms, ".graph")
def main(argv):
    """Filter the reads of a BAM file and write the result to a new file.

    The BAM file is split into per-chromosome ``.bed1`` files (optionally
    restricted by the ``-f``/``-F``/``-q`` read filters).  When the threshold
    is positive, ``strand_broken_remove`` is run for every chromosome that
    has a ``.bed1`` file and the ``.bed2`` files are combined; otherwise the
    ``.bed1`` files are combined directly.  Both sets of intermediate files
    are removed afterwards, even when a step fails.

    Args:
        argv: Command-line tokens passed to ``OptionParser.parse_args``.
            ``-b``, ``-t`` and ``-o`` are required.  ``-s`` is still accepted
            but is not used, because chromosomes come from the BAM file.

    Raises:
        SystemExit: With status 1, after printing the usage text, when a
            required option is missing.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species under consideration",
                      metavar="<str>")
    parser.add_option("-b", "--raw_bam_file", action="store", type="string",
                      dest="bam_file", help="raw bam file", metavar="<file>")
    parser.add_option("-t", "--threshold", action="store", type="int",
                      dest="threshold", help="threshold for copy number",
                      metavar="<int>")
    parser.add_option("-o", "--output_file_name", action="store",
                      type="string", dest="out_file", help="output file name",
                      metavar="<file>")
    # Options to filter reads.
    parser.add_option("-f", "--requiredFlag", type='int',
                      help="Required bit in sam flag. Same as samtools view -f")
    parser.add_option("-F", "--filterFlag", type='int',
                      help="Filter out bit in sam flag, Same as samtools view -F")
    parser.add_option("-q", "--mapq", type='int',
                      help="minimum mapq for a read to be kept")
    (opt, args) = parser.parse_args(argv)

    # Check the options that are actually consumed rather than counting argv
    # tokens: a token count accepts eight unrelated tokens and rejects the
    # valid "--option=value" spelling.
    if opt.bam_file is None or opt.threshold is None or opt.out_file is None:
        parser.print_help()
        sys.exit(1)

    chroms = SeparateByChrom.getChromsFromBam(opt.bam_file)
    try:
        SeparateByChrom.separateByChromBamToBed(
            chroms, opt.bam_file, '.bed1',
            requiredFlag=opt.requiredFlag,
            filterFlag=opt.filterFlag,
            mapq=opt.mapq)

        if opt.threshold > 0:
            for chrom in chroms:
                if Utility.fileExists(chrom + ".bed1"):
                    strand_broken_remove(chrom, opt.threshold)
            combined_extension = '.bed2'
        else:
            combined_extension = '.bed1'

        SeparateByChrom.combineAllGraphFilesBedToBam(
            chroms, combined_extension, opt.bam_file, opt.out_file)
    finally:
        # Always drop the per-chromosome intermediates.  Cleanup was already
        # called unconditionally for '.bed2', including runs that never
        # create those files.
        SeparateByChrom.cleanup(chroms, '.bed1')
        SeparateByChrom.cleanup(chroms, '.bed2')
def main(argv):
    """Score summary-graph windows against a random background and write islands.

    Each window of the summary graph whose read count reaches
    ``background.min_tags_in_window`` gets the score ``-log(poisson(count,
    average))`` (capped at 1000 when the probability drops below 1e-250);
    other windows get -1.  Windows with a positive score are merged with
    ``combine_proximal_islands`` and kept when ``find_region_above_threshold``
    accepts them for the threshold returned by
    ``background.find_island_threshold(evalue)``.  Accepted islands are
    written as tab-separated ``chrom, start, end, value`` lines; progress is
    reported on stderr.

    Args:
        argv: Command-line tokens passed to ``OptionParser.parse_args``.
            All seven options (``-B -b -w -g -t -e -f``) are required.

    Raises:
        SystemExit: With status 1, after printing the usage text, when a
            required option is missing.
    """
    parser = OptionParser()
    parser.add_option("-B", "--bam", action="store", type="string", dest="bam",
                      help="Any suitable bam file that can be used to extrcat chroms from header",
                      metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store", type="string",
                      dest="summarygraph", help="summarygraph",
                      metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int",
                      dest="window_size", help="window_size(in bps)",
                      metavar="<int>")
    parser.add_option("-g", "--gap_size(bp)", action="store", type="int",
                      dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ",
                      action="store", type="float", dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")
    parser.add_option("-e", "--evalue ", action="store", type="float",
                      dest="evalue",
                      help="evalue that determines score threshold for significant islands",
                      metavar="<float>")
    parser.add_option("-f", "--out_island_file", action="store",
                      type="string", dest="out_island_file",
                      help="output island file name", metavar="<file>")
    (opt, args) = parser.parse_args(argv)

    # Validate the parsed options rather than the number of argv tokens, so
    # a repeated option cannot mask a missing one.
    required = (opt.bam, opt.summarygraph, opt.window_size, opt.gap,
                opt.fraction, opt.evalue, opt.out_island_file)
    if any(value is None for value in required):
        parser.print_help()
        sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(
        opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))

    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue,
        genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" % (min_tags_in_window))

    sys.stderr.write("Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n")

    # Read in the summary graph file.
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Replace every window's read count by its score (in place) and keep the
    # windows with a positive score, per chromosome.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) == 0:
            continue
        eligible = []
        for window in windows:
            read_count = window.value
            if read_count < min_tags_in_window:
                score = -1
            else:
                prob = poisson(read_count, average)
                if prob < 1e-250:
                    # Outside of the scale, take an arbitrary number.
                    score = 1000
                else:
                    score = -log(prob)
            window.value = score
            if score > 0:
                eligible.append(window)
        filtered_bed_val[chrom] = eligible

    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # The context manager closes the output file even if island building
    # raises part-way through.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            eligible = filtered_bed_val[chrom]
            if len(eligible) == 0:
                continue
            islands = combine_proximal_islands(eligible, opt.gap, 2)
            islands = find_region_above_threshold(islands, score_threshold)
            total_number_islands += len(islands)
            if len(islands) == 0:
                sys.stderr.write("\t" + chrom + " does not have any islands meeting the required significance\n")
                continue
            for island in islands:
                outputfile.write(chrom + "\t" + str(island.start) + "\t" +
                                 str(island.end) + "\t" + str(island.value) +
                                 "\n")

    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))
def main(argv):
    """Build a combined graph file from the reads of a BAM file.

    Steps, in order: read the chromosome table from the BAM file with
    ``SeparateByChrom.getChromsFromBam``, split the BAM file into
    per-chromosome ``.bed`` files, call ``makeGraphFile`` with the window and
    fragment sizes, hand the ``.graph`` extension and the output name to
    ``SeparateByChrom.combineAllGraphFiles``, then remove the ``.bed`` and
    ``.graph`` intermediates.

    Args:
        argv: Command-line tokens passed to ``OptionParser.parse_args``.
            ``-b``, ``-w``, ``-i`` and ``-o`` are required.  ``-s`` is still
            accepted for backward compatibility but is not used, because the
            chromosomes come from the BAM file.  The window and fragment
            sizes are parsed as integers.

    Raises:
        SystemExit: With status 1, after printing the usage text, when a
            required option is missing.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc",
                      metavar="<str>")
    # The value is handed to the BAM helpers below, so the help text says so.
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bamfile", help="bam file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size",
                      metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")
    (opt, args) = parser.parse_args(argv)

    # Fail early with the usage text instead of letting a None option crash
    # somewhere inside the helpers.  parser.exit() is used so this block does
    # not need any module beyond optparse.
    required = (opt.bamfile, opt.window_size, opt.fragment_size, opt.outfile)
    if any(value is None for value in required):
        parser.print_help()
        parser.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bamfile)
    chroms = list(chromsDict.keys())

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bamfile, '.bed')
    makeGraphFile(chroms, chromsDict, opt.window_size, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)

    SeparateByChrom.cleanup(chroms, ".bed")
    SeparateByChrom.cleanup(chroms, ".graph")
def main(argv):
    """Filter the reads of a BAM file and write the result to a new file.

    The BAM file is split into per-chromosome ``.bed1`` files (optionally
    restricted by the ``-f``/``-F``/``-q`` read filters).  When the threshold
    is positive, ``strand_broken_remove`` is run for every chromosome that
    has a ``.bed1`` file and the ``.bed2`` files are combined; otherwise the
    ``.bed1`` files are combined directly.  Both sets of intermediate files
    are removed afterwards, even when a step fails.

    Args:
        argv: Command-line tokens passed to ``OptionParser.parse_args``.
            ``-b``, ``-t`` and ``-o`` are required.  ``-s`` is still accepted
            but is not used, because chromosomes come from the BAM file.

    Raises:
        SystemExit: With status 1, after printing the usage text, when a
            required option is missing.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species under consideration",
                      metavar="<str>")
    parser.add_option("-b", "--raw_bam_file", action="store", type="string",
                      dest="bam_file", help="raw bam file", metavar="<file>")
    parser.add_option("-t", "--threshold", action="store", type="int",
                      dest="threshold", help="threshold for copy number",
                      metavar="<int>")
    parser.add_option("-o", "--output_file_name", action="store",
                      type="string", dest="out_file", help="output file name",
                      metavar="<file>")
    # Options to filter reads.
    parser.add_option("-f", "--requiredFlag", type='int',
                      help="Required bit in sam flag. Same as samtools view -f")
    parser.add_option("-F", "--filterFlag", type='int',
                      help="Filter out bit in sam flag, Same as samtools view -F")
    parser.add_option("-q", "--mapq", type='int',
                      help="minimum mapq for a read to be kept")
    (opt, args) = parser.parse_args(argv)

    # Check the options that are actually consumed rather than counting argv
    # tokens: a token count accepts eight unrelated tokens and rejects the
    # valid "--option=value" spelling.
    if opt.bam_file is None or opt.threshold is None or opt.out_file is None:
        parser.print_help()
        sys.exit(1)

    chroms = SeparateByChrom.getChromsFromBam(opt.bam_file)
    try:
        SeparateByChrom.separateByChromBamToBed(
            chroms, opt.bam_file, '.bed1',
            requiredFlag=opt.requiredFlag,
            filterFlag=opt.filterFlag,
            mapq=opt.mapq)

        if opt.threshold > 0:
            for chrom in chroms:
                if Utility.fileExists(chrom + ".bed1"):
                    strand_broken_remove(chrom, opt.threshold)
            combined_extension = '.bed2'
        else:
            combined_extension = '.bed1'

        SeparateByChrom.combineAllGraphFilesBedToBam(
            chroms, combined_extension, opt.bam_file, opt.out_file)
    finally:
        # Always drop the per-chromosome intermediates.  Cleanup was already
        # called unconditionally for '.bed2', including runs that never
        # create those files.
        SeparateByChrom.cleanup(chroms, '.bed1')
        SeparateByChrom.cleanup(chroms, '.bed2')
def _count_reads_on_islands(read_file, island_starts, island_ends,
                            fragment_size):
    """Count, per island, the reads of one per-chromosome read file.

    Lines starting with ``#`` are skipped.  Every other line is split on
    whitespace, turned into a position by
    ``associate_tags_with_regions.tag_position`` and looked up with
    ``associate_tags_with_regions.find_readcount_on_islands``; a
    non-negative result is used as the index of the island that is hit.

    Returns:
        A list with one counter per island, in the order of ``island_starts``.
    """
    counts = [0] * len(island_starts)
    with open(read_file, 'r') as reads:
        for line in reads:
            if re.match("#", line):
                continue
            sline = line.strip().split()
            position = associate_tags_with_regions.tag_position(
                sline, fragment_size)
            index = associate_tags_with_regions.find_readcount_on_islands(
                island_starts, island_ends, position)
            if index >= 0:
                counts[index] += 1
    return counts


def main(argv):
    """Summarise chip and control read counts on islands with p-values.

    For every island the chip and control reads falling on it are counted, a
    fold change and a Poisson survival-function p-value are computed against
    the (library-size scaled) control count, and a rank-corrected value
    ``pvalue * n / rank`` capped at 1 is derived.  One tab-separated line per
    island is written: chrom, start, end, chip count, control count, p-value,
    fold change, corrected value.

    Args:
        argv: Command-line tokens passed to ``OptionParser.parse_args``.
            ``-a -b -f -d -o -t`` are all required.

    Raises:
        SystemExit: With status 1 when a required option is missing (usage
            text is printed) or when a read file does not exist.
    """
    parser = OptionParser()
    # Both read files are handed to the BAM helpers below, so both help
    # texts say BAM.
    parser.add_option("-a", "--rawchipreadfile", action="store",
                      type="string", dest="chipreadfile", metavar="<file>",
                      help="raw read file from chip in BAM format")
    parser.add_option("-b", "--rawcontrolreadfile", action="store",
                      type="string", dest="controlreadfile", metavar="<file>",
                      help="raw read file from control in BAM format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ",
                      action="store", type="float", dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")
    (opt, args) = parser.parse_args(argv)

    required = (opt.chipreadfile, opt.controlreadfile, opt.fragment_size,
                opt.islandfile, opt.out_file, opt.fraction)
    if any(value is None for value in required):
        parser.print_help()
        sys.exit(1)

    # Check the inputs before anything reads them; previously the checks ran
    # only after both files had already been opened by the helpers below.
    for read_path in (opt.chipreadfile, opt.controlreadfile):
        if not Utility.fileExists(read_path):
            sys.stderr.write(read_path + " not found")
            sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.chipreadfile)
    chroms = list(chromsDict.keys())
    genomesize = sum(chromsDict.values()) * opt.fraction

    chip_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.controlreadfile)
    sys.stderr.write("chip library size %s\n" % (chip_library_size))
    sys.stderr.write("control library size %s\n" % (control_library_size))

    islands = BED.BED(chroms, opt.islandfile, "BED3", 0)

    # Separate both libraries by chromosome: chip -> .bed1, control -> .bed2.
    SeparateByChrom.separateByChromBamToBed(chroms, opt.chipreadfile, '.bed1')
    SeparateByChrom.separateByChromBamToBed(chroms, opt.controlreadfile,
                                            '.bed2')

    totalchip = 0
    totalcontrol = 0
    island_chip_readcount = {}
    island_control_readcount = {}
    for chrom in chromsDict:
        if chrom not in islands.keys():
            continue
        island_list = islands[chrom]
        if len(island_list) == 0:
            continue
        # Sorted in place so the per-island counters line up with the island
        # order used when the results are written below.
        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))
        island_starts = [island.start for island in island_list]
        island_ends = [island.end for island in island_list]

        chip_counts = _count_reads_on_islands(
            chrom + ".bed1", island_starts, island_ends, opt.fragment_size)
        control_counts = _count_reads_on_islands(
            chrom + ".bed2", island_starts, island_ends, opt.fragment_size)
        island_chip_readcount[chrom] = chip_counts
        island_control_readcount[chrom] = control_counts
        totalchip += sum(chip_counts)
        totalcontrol += sum(control_counts)

    scaling_factor = chip_library_size * 1.0 / control_library_size

    # Same text the former print statements produced (two spaces after the
    # colon), written in a form that parses on Python 2 and 3.
    sys.stdout.write("Total number of chip reads on islands is:  %s\n" % (totalchip,))
    sys.stdout.write("Total number of control reads on islands is:  %s\n" % (totalcontrol,))

    pvalue_list = []
    result_list = []
    for chrom in sorted(chroms):
        if chrom not in islands.keys():
            continue
        island_list = islands[chrom]
        if len(island_list) == 0:
            continue
        per_island = zip(island_list, island_chip_readcount[chrom],
                         island_control_readcount[chrom])
        for item, observation, control_tag in per_island:
            if control_tag > 0:
                average = control_tag * scaling_factor
            else:
                # No control read on the island: fall back to the expected
                # control count for the island length, capped at 0.25.
                length = item.end - item.start + 1
                average = length * control_library_size * 1.0 / genomesize
                average = min(0.25, average) * scaling_factor
            fc = float(observation) / float(average)
            if observation > average:
                # [()] unwraps the value returned by sf into a scalar.
                pvalue = scipy.stats.poisson.sf(observation, average)[()]
            else:
                pvalue = 1
            pvalue_list.append(pvalue)
            result_list.append((item.chrom, item.start, item.end,
                                observation, control_tag, pvalue, fc))

    # rankdata accepts any array-like, so the list is passed directly.
    pvalue_ranks = scipy.stats.rankdata(pvalue_list)
    totalnumber = len(result_list)
    with open(opt.out_file, 'w') as out:
        for record, pvalue, rank in zip(result_list, pvalue_list,
                                        pvalue_ranks):
            alpha = pvalue * totalnumber / rank
            if alpha > 1:
                alpha = 1
            fields = record + (alpha,)
            out.write("\t".join(str(field) for field in fields) + "\n")

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
def _count_reads_on_islands(read_file, island_starts, island_ends,
                            fragment_size):
    """Count, per island, the reads of one per-chromosome read file.

    Lines starting with ``#`` are skipped.  Every other line is split on
    whitespace, turned into a position by
    ``associate_tags_with_regions.tag_position`` and looked up with
    ``associate_tags_with_regions.find_readcount_on_islands``; a
    non-negative result is used as the index of the island that is hit.

    Returns:
        A list with one counter per island, in the order of ``island_starts``.
    """
    counts = [0] * len(island_starts)
    with open(read_file, 'r') as reads:
        for line in reads:
            if re.match("#", line):
                continue
            sline = line.strip().split()
            position = associate_tags_with_regions.tag_position(
                sline, fragment_size)
            index = associate_tags_with_regions.find_readcount_on_islands(
                island_starts, island_ends, position)
            if index >= 0:
                counts[index] += 1
    return counts


def main(argv):
    """Summarise chip and control read counts on islands with p-values.

    For every island the chip and control reads falling on it are counted, a
    fold change and a Poisson survival-function p-value are computed against
    the (library-size scaled) control count, and a rank-corrected value
    ``pvalue * n / rank`` capped at 1 is derived.  One tab-separated line per
    island is written: chrom, start, end, chip count, control count, p-value,
    fold change, corrected value.

    Args:
        argv: Command-line tokens passed to ``OptionParser.parse_args``.
            ``-a -b -f -d -o -t`` are all required.

    Raises:
        SystemExit: With status 1 when a required option is missing (usage
            text is printed) or when a read file does not exist.
    """
    parser = OptionParser()
    # Both read files are handed to the BAM helpers below, so both help
    # texts say BAM.
    parser.add_option("-a", "--rawchipreadfile", action="store",
                      type="string", dest="chipreadfile", metavar="<file>",
                      help="raw read file from chip in BAM format")
    parser.add_option("-b", "--rawcontrolreadfile", action="store",
                      type="string", dest="controlreadfile", metavar="<file>",
                      help="raw read file from control in BAM format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ",
                      action="store", type="float", dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")
    (opt, args) = parser.parse_args(argv)

    required = (opt.chipreadfile, opt.controlreadfile, opt.fragment_size,
                opt.islandfile, opt.out_file, opt.fraction)
    if any(value is None for value in required):
        parser.print_help()
        sys.exit(1)

    # Check the inputs before anything reads them; previously the checks ran
    # only after both files had already been opened by the helpers below.
    for read_path in (opt.chipreadfile, opt.controlreadfile):
        if not Utility.fileExists(read_path):
            sys.stderr.write(read_path + " not found")
            sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.chipreadfile)
    chroms = list(chromsDict.keys())
    genomesize = sum(chromsDict.values()) * opt.fraction

    chip_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.controlreadfile)
    sys.stderr.write("chip library size %s\n" % (chip_library_size))
    sys.stderr.write("control library size %s\n" % (control_library_size))

    islands = BED.BED(chroms, opt.islandfile, "BED3", 0)

    # Separate both libraries by chromosome: chip -> .bed1, control -> .bed2.
    SeparateByChrom.separateByChromBamToBed(chroms, opt.chipreadfile, '.bed1')
    SeparateByChrom.separateByChromBamToBed(chroms, opt.controlreadfile,
                                            '.bed2')

    totalchip = 0
    totalcontrol = 0
    island_chip_readcount = {}
    island_control_readcount = {}
    for chrom in chromsDict:
        if chrom not in islands.keys():
            continue
        island_list = islands[chrom]
        if len(island_list) == 0:
            continue
        # Sorted in place so the per-island counters line up with the island
        # order used when the results are written below.
        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))
        island_starts = [island.start for island in island_list]
        island_ends = [island.end for island in island_list]

        chip_counts = _count_reads_on_islands(
            chrom + ".bed1", island_starts, island_ends, opt.fragment_size)
        control_counts = _count_reads_on_islands(
            chrom + ".bed2", island_starts, island_ends, opt.fragment_size)
        island_chip_readcount[chrom] = chip_counts
        island_control_readcount[chrom] = control_counts
        totalchip += sum(chip_counts)
        totalcontrol += sum(control_counts)

    scaling_factor = chip_library_size * 1.0 / control_library_size

    # Same text the former print statements produced (two spaces after the
    # colon), written in a form that parses on Python 2 and 3.
    sys.stdout.write("Total number of chip reads on islands is:  %s\n" % (totalchip,))
    sys.stdout.write("Total number of control reads on islands is:  %s\n" % (totalcontrol,))

    pvalue_list = []
    result_list = []
    for chrom in sorted(chroms):
        if chrom not in islands.keys():
            continue
        island_list = islands[chrom]
        if len(island_list) == 0:
            continue
        per_island = zip(island_list, island_chip_readcount[chrom],
                         island_control_readcount[chrom])
        for item, observation, control_tag in per_island:
            if control_tag > 0:
                average = control_tag * scaling_factor
            else:
                # No control read on the island: fall back to the expected
                # control count for the island length, capped at 0.25.
                length = item.end - item.start + 1
                average = length * control_library_size * 1.0 / genomesize
                average = min(0.25, average) * scaling_factor
            fc = float(observation) / float(average)
            if observation > average:
                # [()] unwraps the value returned by sf into a scalar.
                pvalue = scipy.stats.poisson.sf(observation, average)[()]
            else:
                pvalue = 1
            pvalue_list.append(pvalue)
            result_list.append((item.chrom, item.start, item.end,
                                observation, control_tag, pvalue, fc))

    # rankdata accepts any array-like, so the list is passed directly.
    pvalue_ranks = scipy.stats.rankdata(pvalue_list)
    totalnumber = len(result_list)
    with open(opt.out_file, 'w') as out:
        for record, pvalue, rank in zip(result_list, pvalue_list,
                                        pvalue_ranks):
            alpha = pvalue * totalnumber / rank
            if alpha > 1:
                alpha = 1
            fields = record + (alpha,)
            out.write("\t".join(str(field) for field in fields) + "\n")

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
def main(argv):
    """Score summary-graph windows against a random background and write islands.

    Each window of the summary graph whose read count reaches
    ``background.min_tags_in_window`` gets the score ``-log(poisson(count,
    average))`` (capped at 1000 when the probability drops below 1e-250);
    other windows get -1.  Windows with a positive score are merged with
    ``combine_proximal_islands`` and kept when ``find_region_above_threshold``
    accepts them for the threshold returned by
    ``background.find_island_threshold(evalue)``.  Accepted islands are
    written as tab-separated ``chrom, start, end, value`` lines; progress is
    reported on stderr.

    Args:
        argv: Command-line tokens passed to ``OptionParser.parse_args``.
            All seven options (``-B -b -w -g -t -e -f``) are required.

    Raises:
        SystemExit: With status 1, after printing the usage text, when a
            required option is missing.
    """
    parser = OptionParser()
    parser.add_option("-B", "--bam", action="store", type="string", dest="bam",
                      help="Any suitable bam file that can be used to extrcat chroms from header",
                      metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store", type="string",
                      dest="summarygraph", help="summarygraph",
                      metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int",
                      dest="window_size", help="window_size(in bps)",
                      metavar="<int>")
    parser.add_option("-g", "--gap_size(bp)", action="store", type="int",
                      dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ",
                      action="store", type="float", dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")
    parser.add_option("-e", "--evalue ", action="store", type="float",
                      dest="evalue",
                      help="evalue that determines score threshold for significant islands",
                      metavar="<float>")
    parser.add_option("-f", "--out_island_file", action="store",
                      type="string", dest="out_island_file",
                      help="output island file name", metavar="<file>")
    (opt, args) = parser.parse_args(argv)

    # Validate the parsed options rather than the number of argv tokens, so
    # a repeated option cannot mask a missing one.
    required = (opt.bam, opt.summarygraph, opt.window_size, opt.gap,
                opt.fraction, opt.evalue, opt.out_island_file)
    if any(value is None for value in required):
        parser.print_help()
        sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(
        opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))

    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue,
        genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" % (min_tags_in_window))

    sys.stderr.write("Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n")

    # Read in the summary graph file.
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Replace every window's read count by its score (in place) and keep the
    # windows with a positive score, per chromosome.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) == 0:
            continue
        eligible = []
        for window in windows:
            read_count = window.value
            if read_count < min_tags_in_window:
                score = -1
            else:
                prob = poisson(read_count, average)
                if prob < 1e-250:
                    # Outside of the scale, take an arbitrary number.
                    score = 1000
                else:
                    score = -log(prob)
            window.value = score
            if score > 0:
                eligible.append(window)
        filtered_bed_val[chrom] = eligible

    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # The context manager closes the output file even if island building
    # raises part-way through.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            eligible = filtered_bed_val[chrom]
            if len(eligible) == 0:
                continue
            islands = combine_proximal_islands(eligible, opt.gap, 2)
            islands = find_region_above_threshold(islands, score_threshold)
            total_number_islands += len(islands)
            if len(islands) == 0:
                sys.stderr.write("\t" + chrom + " does not have any islands meeting the required significance\n")
                continue
            for island in islands:
                outputfile.write(chrom + "\t" + str(island.start) + "\t" +
                                 str(island.end) + "\t" + str(island.value) +
                                 "\n")

    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))