def get_island_read_counts(species, summary_graph_file, islands_file,
                           window_size, out_file,
                           window_read_count_threshold=0):
    """
    Filter a summary graph by islands and return the read count on islands.

    The windows on islands are obtained from
    filter_summary_graphs.find_windows_on_islands, which receives species,
    both input file names, window_size, out_file and the threshold.  Their
    total tag count is then computed by
    get_total_tag_counts.get_total_tag_counts_bed_graph.

    species must be a key of GenomeData.species_chroms; this is asserted.
    window_read_count_threshold (default 0) is forwarded unchanged to both
    helper calls.

    Returns whatever get_total_tag_counts_bed_graph reports for the windows
    on islands.
    """
    assert species in GenomeData.species_chroms

    island_windows = filter_summary_graphs.find_windows_on_islands(
        species, summary_graph_file, islands_file, window_size, out_file,
        window_read_count_threshold)

    # The file-name argument is an empty string while the windows are handed
    # over as the second argument -- presumably the helper then counts the
    # supplied windows instead of reading a file; confirm against the helper.
    return get_total_tag_counts.get_total_tag_counts_bed_graph(
        "", island_windows, window_read_count_threshold)
def main(argv): parser = OptionParser() parser.add_option("-s", "--species", action="store", type="string", dest="species", help="mm8, hg18, background, etc", metavar="<str>") parser.add_option("-b", "--summary_graph_file", action="store", type="string", dest="summary_graph_file", help="bed summary graph file of", metavar="<file>") parser.add_option("-i", "--islands_file", action="store", type="string", dest="islands_file", help="islands file", metavar="<file>") parser.add_option("-w", "--window_size", action="store", type="int", dest="window_size", help="window size of summary", metavar="<int>") parser.add_option("-o", "--islands_score_histogram_file", action="store", type="string", dest="islands_score_histogram_file", help="islands histogram file", metavar="<file>") parser.add_option("-q", "--islands_length_histogram_file", action="store", type="string", dest="islands_length_histogram_file", help="islands length histogram file", metavar="<file>") parser.add_option("-r", "--island_filtered_summary_graph", action="store", type="string", dest="island_filtered_summary_graph", default = "", help=" Optional. The default is not to do it. 
", metavar="<file>") (opt, args) = parser.parse_args(argv) if len(argv) < 12: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): genome_length = sum ( GenomeData.species_chrom_lengths[opt.species].values()); chroms = GenomeData.species_chroms[opt.species]; total_tag_counts = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summary_graph_file); print "Total read count is:" , total_tag_counts; total_length = find_islands_length_histogram({}, opt.islands_file, opt.islands_length_histogram_file); print "Total islands length is: ", total_length, "; Length coverage = total_length_of_islands/genome_length is: ", total_length*1.0/genome_length; bin_size=0.1; total_score = find_islands_score_histogram({}, opt.islands_file, bin_size, opt.islands_score_histogram_file); print "Total islands score is: ", total_score; if (opt.island_filtered_summary_graph != ""): read_count_on_islands = get_island_read_counts (opt.species, opt.summary_graph_file, opt.islands_file, opt.window_size, opt.island_filtered_summary_graph, 0) print "Total read count on island is: ", read_count_on_islands, " Read count coverage=read_count_on_islands/Total-read-count: ", read_count_on_islands/float(total_tag_counts); else: print "This species is not in my list!";
def main(argv):
    """
    Probability scoring with random background model.

    Chromosome names and lengths come from SeparateByChrom.getChromsFromBam
    (the -B option) rather than from a species table.  All progress
    messages go to stderr.

    Command-line options (parsed from argv with optparse):
      -B  bam file whose header supplies the chromosomes
      -b  summary graph file
      -w  window size in bp
      -g  gap size in bp
      -t  mappable fraction of the genome size
      -e  e-value used to obtain the island score threshold
      -f  output island file

    Steps:
      1. Compute the effective genome length (fraction * summed chromosome
         lengths, truncated to int) and the average read count per window.
      2. Score every window of the summary graph: windows with fewer reads
         than background.min_tags_in_window get -1, the others get
         -log(poisson(read_count, average)), capped at 1000.  The score
         replaces the window's value in place; windows with a positive
         score are kept.
      3. Merge kept windows with combine_proximal_islands, keep the regions
         returned by find_region_above_threshold for the score threshold
         from background.find_island_threshold(opt.evalue), and write them
         as tab-separated "chrom start end value" lines.

    Prints the help text and exits with status 1 when argv has fewer than
    14 entries.
    """
    parser = OptionParser()
    parser.add_option("-B", "--bam", action="store", type="string",
                      dest="bam",
                      help="Any suitable bam file that can be used to extrcat chroms from header",
                      metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store", type="string",
                      dest="summarygraph", help="summarygraph",
                      metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int",
                      dest="window_size", help="window_size(in bps)",
                      metavar="<int>")
    parser.add_option("-g", "--gap_size(bp)", action="store", type="int",
                      dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ",
                      action="store", type="float", dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")
    parser.add_option("-e", "--evalue ", action="store", type="float",
                      dest="evalue",
                      help="evalue that determines score threshold for significant islands",
                      metavar="<float>")
    parser.add_option("-f", "--out_island_file", action="store",
                      type="string", dest="out_island_file",
                      help="output island file name", metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(
        opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))

    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)
    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue,
        genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n"
                     % (min_tags_in_window))

    sys.stderr.write("Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n")

    # Read in the summary graph file.
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Replace each window's read count by its score and keep only windows
    # with a positive score (only enrichment matters).
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) > 0:
            kept_windows = []
            filtered_bed_val[chrom] = kept_windows
            for window in windows:
                read_count = window.value
                if read_count < min_tags_in_window:
                    score = -1
                else:
                    prob = poisson(read_count, average)
                    if prob < 1e-250:
                        # Outside of the scale, take an arbitrary number
                        # (this also keeps log() away from 0).
                        score = 1000
                    else:
                        score = -log(prob)
                window.value = score
                if score > 0:
                    kept_windows.append(window)

    # Determine the score threshold from the random background.
    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # "with" guarantees the output file is closed even if island building
    # or writing raises part-way through.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val:
            if len(filtered_bed_val[chrom]) > 0:
                # NOTE(review): the meaning of the literal 2 is defined by
                # combine_proximal_islands, which is not visible here.
                islands = combine_proximal_islands(
                    filtered_bed_val[chrom], opt.gap, 2)
                islands = find_region_above_threshold(islands,
                                                      score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for island in islands:
                        outputfile.write(
                            chrom + "\t" + str(island.start) + "\t"
                            + str(island.end) + "\t" + str(island.value)
                            + "\n")
                else:
                    sys.stderr.write(
                        "\t" + chrom
                        + " does not have any islands meeting the required significance\n")

    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))
def main(argv): ''' Coarse graining test chr1, input must only have chr1 ''' parser = OptionParser() parser.add_option("-s", "--species", action="store", type="string", dest="species", help="mm8, hg18, background, etc", metavar="<str>") parser.add_option("-b", "--summarygraph", action="store",type="string", dest="summarygraph", help="summarygraph", metavar="<file>") parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>") parser.add_option("-g", "--graining_size", action="store", type="int", dest="step", help="graining unit size (>0)", metavar="<int>") parser.add_option("-e", "--score", action="store", type="int", dest="score", help="graining criterion, 0<score<=graining_size", metavar="<int>") parser.add_option("-t", "--mappable_faction_of_genome_size", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>") parser.add_option("-f", "--output_file", action="store", type="string", dest="out_file", help="output file name", metavar="<file>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) print "Coarse-graining approach to identify ChIP-Seq enriched domains:" if opt.species in GenomeData.species_chroms.keys(): print "Species: ", opt.species; print "Window_size: ", opt.window_size; print "Coarse graining step: ", opt.step; print "Coarse graining score:", opt.score; chroms = GenomeData.species_chroms[opt.species] total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph); print "Total read count:", total_read_count genome_length = sum (GenomeData.species_chrom_lengths[opt.species].values()); genome_length = int(opt.fraction * genome_length); average = float(total_read_count) * opt.window_size/genome_length; print "Effective genome length: ", genome_length; print "window average:", average; min_tags_in_window = int(average) + 1 print "Minimum read count in a qualified 
window: ", min_tags_in_window print "Generate preprocessed data list"; #read in the summary graph file bed_val = BED.BED(opt.species, opt.summarygraph, "BED_GRAPH"); #generate the probscore summary graph file, only care about enrichment for chrom in chroms: if chrom in bed_val.keys() and len(bed_val[chrom]) > 0: chrom_length = GenomeData.species_chrom_lengths[opt.species][chrom] eligible_start_list = [] for index in xrange(len(bed_val[chrom])): read_count = bed_val[chrom][index].value; if read_count >= min_tags_in_window: eligible_start_list.append(bed_val[chrom][index].start) print "Coarse graining:"; (result_list, island_list) = coarsegraining(eligible_start_list, opt.window_size, opt.step, opt.score, chrom_length) print "Trace back...", len(island_list) islands = traceback(island_list, opt.window_size, opt.step, 0, chrom_length, chrom) print len(islands), "islands found in", chrom f = open(chrom + ".islandstemp", 'w') for i in range(0, len(islands)): f.write(chrom + '\t' + str(int(islands[i].start)) + '\t' + str(int(islands[i].end)) + '\t1\n') f.close() o = open(opt.out_file, 'w') o.write('track type=bedGraph name=' + opt.out_file + '\n') o.close() SeparateByChrom.combineAllGraphFiles(chroms, ".islandstemp", opt.out_file) SeparateByChrom.cleanup(chroms, ".islandstemp") #else: #print "input data error!" else: print "This species is not in my list!";
def main(argv): """ Probability scoring with random background model. """ parser = OptionParser() parser.add_option("-s", "--species", action="store", type="string", dest="species", help="mm8, hg18, background, etc", metavar="<str>") parser.add_option("-b", "--summarygraph", action="store", type="string", dest="summarygraph", help="summarygraph", metavar="<file>") parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>") parser.add_option("-g", "--gap_size(bp)", action="store", type="int", dest="gap", help="gap size (in bps)", metavar="<int>") parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>") parser.add_option( "-e", "--evalue ", action="store", type="float", dest="evalue", help="evalue that determines score threshold for significant islands", metavar="<float>") parser.add_option("-f", "--out_island_file", action="store", type="string", dest="out_island_file", help="output island file name", metavar="<file>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): print "Species: ", opt.species print "Window_size: ", opt.window_size print "Gap size: ", opt.gap print "E value is:", opt.evalue total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph( opt.summarygraph) print "Total read count:", total_read_count genome_length = sum( GenomeData.species_chrom_lengths[opt.species].values()) print "Genome Length: ", genome_length genome_length = int(opt.fraction * genome_length) average = float(total_read_count) * opt.window_size / genome_length print "Effective genome Length: ", genome_length print "Window average:", average print "opt.evalue:", opt.evalue window_pvalue = 0.20 bin_size = 0.001 print "Total read count:", total_read_count print "window_size:", opt.window_size 
print "opt.gap:", opt.gap print "Window pvalue:", window_pvalue print "genome_length:", genome_length background = Background_island_probscore_statistics.Background_island_probscore_statistics( total_read_count, opt.window_size, opt.gap, window_pvalue, genome_length, bin_size) min_tags_in_window = background.min_tags_in_window print "Minimum num of tags in a qualified window: ", min_tags_in_window print "Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows " #read in the summary graph file print "opt.summarygraph " * 5 + opt.summarygraph bed_val = BED.BED(opt.species, opt.summarygraph, "BED_GRAPH") #generate the probscore summary graph file, only care about enrichment #filter the summary graph to get rid of windows whose scores are less than window_score_threshold filtered_bed_val = {} for chrom in bed_val.keys(): if len(bed_val[chrom]) > 0: filtered_bed_val[chrom] = [] for index in xrange(len(bed_val[chrom])): read_count = bed_val[chrom][index].value # if chrom == "chr1" and bed_val[chrom][index].start in [36532800, 36533000, 36533200, 36533400, 36533600, 36533800, 36534000, 36534200, 36534400, 36534600, 36534800, 36535000]: # print("read_count", read_count, "start", bed_val[chrom][index].start) if (read_count < min_tags_in_window): score = -1 #score = 0; else: prob = poisson(read_count, average) if prob < 1e-250: score = 1000 #outside of the scale, take an arbitrary number. 
else: score = -log(prob) bed_val[chrom][index].value = score if score > 0: filtered_bed_val[chrom].append((bed_val[chrom])[index]) #print chrom, start, read_count, score; #write the probscore summary graph file #Background_simulation_pr.output_bedgraph(bed_val, opt.out_sgraph_file); #Background_simulation_pr.output_bedgraph(filtered_bed_val, opt.out_sgraph_file+".filtered"); print "Determine the score threshold from random background" #determine threshold from random background hist_outfile = "L" + str(genome_length) + "_W" + str( opt.window_size) + "_G" + str( opt.gap) + "_s" + str(min_tags_in_window) + "_T" + str( total_read_count) + "_B" + str( bin_size) + "_calculatedprobscoreisland.hist" print "opt.evalue", opt.evalue score_threshold = background.find_island_threshold(opt.evalue) # background.output_distribution(hist_outfile); print "The score threshold is: ", score_threshold print "Make and write islands" total_number_islands = 0 outputfile = open(opt.out_island_file, 'w') for chrom in filtered_bed_val.keys(): if len(filtered_bed_val[chrom]) > 0: islands = combine_proximal_islands(filtered_bed_val[chrom], opt.gap, 2) islands = find_region_above_threshold(islands, score_threshold) total_number_islands += len(islands) if len(islands) > 0: for i in islands: outline = chrom + "\t" + str(i.start) + "\t" + str( i.end) + "\t" + str(i.value) + "\n" outputfile.write(outline) else: print "\t", chrom, "does not have any islands meeting the required significance" outputfile.close() print "Total number of islands: ", total_number_islands else: print "This species is not in my list!"
def main(argv):
    """
    Probability scoring with random background model.

    Chromosome names and lengths come from SeparateByChrom.getChromsFromBam
    (the -B option) rather than from a species table.  All progress
    messages go to stderr.

    Command-line options (parsed from argv with optparse):
      -B  bam file whose header supplies the chromosomes
      -b  summary graph file
      -w  window size in bp
      -g  gap size in bp
      -t  mappable fraction of the genome size
      -e  e-value used to obtain the island score threshold
      -f  output island file

    Steps:
      1. Compute the effective genome length (fraction * summed chromosome
         lengths, truncated to int) and the average read count per window.
      2. Score every window of the summary graph: windows with fewer reads
         than background.min_tags_in_window get -1, the others get
         -log(poisson(read_count, average)), capped at 1000.  The score
         replaces the window's value in place; windows with a positive
         score are kept.
      3. Merge kept windows with combine_proximal_islands, keep the regions
         returned by find_region_above_threshold for the score threshold
         from background.find_island_threshold(opt.evalue), and write them
         as tab-separated "chrom start end value" lines.

    Prints the help text and exits with status 1 when argv has fewer than
    14 entries.
    """
    parser = OptionParser()
    parser.add_option(
        "-B", "--bam", action="store", type="string", dest="bam",
        help="Any suitable bam file that can be used to extrcat chroms from header",
        metavar="<str>")
    parser.add_option(
        "-b", "--summarygraph", action="store", type="string",
        dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option(
        "-w", "--window_size(bp)", action="store", type="int",
        dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option(
        "-g", "--gap_size(bp)", action="store", type="int", dest="gap",
        help="gap size (in bps)", metavar="<int>")
    parser.add_option(
        "-t", "--mappable_fraction_of_genome_size ", action="store",
        type="float", dest="fraction",
        help="mapable fraction of genome size", metavar="<float>")
    parser.add_option(
        "-e", "--evalue ", action="store", type="float", dest="evalue",
        help="evalue that determines score threshold for significant islands",
        metavar="<float>")
    parser.add_option(
        "-f", "--out_island_file", action="store", type="string",
        dest="out_island_file", help="output island file name",
        metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(
        opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))

    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)
    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue,
        genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n"
                     % (min_tags_in_window))

    sys.stderr.write(
        "Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n"
    )

    # Read in the summary graph file.
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Replace each window's read count by its score and keep only windows
    # with a positive score (only enrichment matters).
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) > 0:
            kept_windows = []
            filtered_bed_val[chrom] = kept_windows
            for window in windows:
                read_count = window.value
                if read_count < min_tags_in_window:
                    score = -1
                else:
                    prob = poisson(read_count, average)
                    if prob < 1e-250:
                        # Outside of the scale, take an arbitrary number
                        # (this also keeps log() away from 0).
                        score = 1000
                    else:
                        score = -log(prob)
                window.value = score
                if score > 0:
                    kept_windows.append(window)

    # Determine the score threshold from the random background.
    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # "with" guarantees the output file is closed even if island building
    # or writing raises part-way through.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val:
            if len(filtered_bed_val[chrom]) > 0:
                # NOTE(review): the meaning of the literal 2 is defined by
                # combine_proximal_islands, which is not visible here.
                islands = combine_proximal_islands(
                    filtered_bed_val[chrom], opt.gap, 2)
                islands = find_region_above_threshold(islands,
                                                      score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for island in islands:
                        outputfile.write(
                            chrom + "\t" + str(island.start) + "\t"
                            + str(island.end) + "\t" + str(island.value)
                            + "\n")
                else:
                    sys.stderr.write(
                        "\t" + chrom
                        + " does not have any islands meeting the required significance\n"
                    )

    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))