コード例 #1
0
def get_island_read_counts(species, summary_graph_file, islands_file, window_size, out_file, window_read_count_threshold=0):
    """
    Filter a summary graph by the islands and return the read count on islands.

    The windows of summary_graph_file that lie on the islands in islands_file
    are obtained from filter_summary_graphs.find_windows_on_islands (which
    also receives out_file and window_read_count_threshold).  Those windows
    are then handed to get_total_tag_counts.get_total_tag_counts_bed_graph,
    whose result is returned unchanged.

    species must be one of the keys of GenomeData.species_chroms; this is
    enforced with an assertion.
    """
    assert (species in GenomeData.species_chroms.keys())

    island_windows = filter_summary_graphs.find_windows_on_islands(
        species, summary_graph_file, islands_file, window_size, out_file,
        window_read_count_threshold)

    # The file-name argument is left empty because the already loaded windows
    # are passed in directly.
    return get_total_tag_counts.get_total_tag_counts_bed_graph(
        "", island_windows, window_read_count_threshold)
コード例 #2
0
def main(argv):
    """
    Print summary statistics for a set of islands.

    Reports, on stdout:
      * the total read count of the summary graph,
      * the total island length and its ratio to the genome length,
      * the total island score,
      * and, only when -r is given, the read count on islands and its ratio
        to the total read count.

    The two histogram file names are passed on to
    find_islands_length_histogram / find_islands_score_histogram
    (presumably written there -- confirm against those helpers).

    argv is the option list handed to OptionParser.parse_args.  When it has
    fewer than 12 entries (six mandatory options plus their values) the help
    text is printed and the process exits with status 1.  An unknown species
    only produces a message; nothing else is done.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8, hg18, background, etc", metavar="<str>")
    parser.add_option("-b", "--summary_graph_file", action="store", type="string",
                      dest="summary_graph_file", help="bed summary graph file of", metavar="<file>")
    parser.add_option("-i", "--islands_file", action="store", type="string",
                      dest="islands_file", help="islands file", metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size of summary", metavar="<int>")
    parser.add_option("-o", "--islands_score_histogram_file", action="store", type="string",
                      dest="islands_score_histogram_file", help="islands histogram file", metavar="<file>")
    parser.add_option("-q", "--islands_length_histogram_file", action="store", type="string",
                      dest="islands_length_histogram_file", help="islands length histogram file", metavar="<file>")
    parser.add_option("-r", "--island_filtered_summary_graph", action="store", type="string",
                      dest="island_filtered_summary_graph", default="", help=" Optional. The default is not to do it. ", metavar="<file>")

    (opt, _unused_args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species not in GenomeData.species_chroms:
        print("This species is not in my list!")
        return

    genome_length = sum(GenomeData.species_chrom_lengths[opt.species].values())

    # Every print below takes ONE pre-formatted string so the line is
    # byte-identical under the Python 2 print statement and the Python 3
    # print function.  The doubled spaces reproduce the separator the
    # original comma-separated print statements emitted.
    total_tag_counts = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summary_graph_file)
    print("Total read count is: %s" % (total_tag_counts,))

    total_length = find_islands_length_histogram({}, opt.islands_file, opt.islands_length_histogram_file)
    print("Total islands length is:  %s ;      Length coverage = total_length_of_islands/genome_length is:  %s"
          % (total_length, total_length * 1.0 / genome_length))

    bin_size = 0.1
    total_score = find_islands_score_histogram({}, opt.islands_file, bin_size, opt.islands_score_histogram_file)
    print("Total islands score is:  %s" % (total_score,))

    # The island-filtered summary graph is optional; an empty name means "skip".
    if opt.island_filtered_summary_graph != "":
        read_count_on_islands = get_island_read_counts(
            opt.species, opt.summary_graph_file, opt.islands_file,
            opt.window_size, opt.island_filtered_summary_graph, 0)
        print("Total read count on island is:  %s  Read count coverage=read_count_on_islands/Total-read-count:  %s"
              % (read_count_on_islands, read_count_on_islands / float(total_tag_counts)))
コード例 #3
0
def main(argv):
    """
    Probability scoring with random background model.

    Steps, with progress written to stderr:
      1. Take the chromosomes from SeparateByChrom.getChromsFromBam(opt.bam);
         the genome length is the sum of that mapping's values, scaled by the
         -t fraction.
      2. Read the summary graph as BED_GRAPH and replace each window's value
         by a score: -1 when the read count is below
         background.min_tags_in_window, otherwise
         -log(poisson(read_count, average)), or the arbitrary value 1000 when
         that probability is below 1e-250.
      3. Keep the windows with a positive score, merge them with
         combine_proximal_islands and keep the islands returned by
         find_region_above_threshold for the threshold derived from -e.
      4. Write the islands to the -f file as tab-separated
         chrom, start, end, value lines.

    argv is the option list handed to OptionParser.parse_args.  When it has
    fewer than 14 entries (seven options plus their values) the help text is
    printed and the process exits with status 1.
    """
    parser = OptionParser()
    parser.add_option("-B", "--bam", action="store", type="string", dest="bam", help="Any suitable bam file that can be used to extrcat chroms from header", metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store", type="string", dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option("-g", "--gap_size(bp)", action="store", type="int", dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")
    parser.add_option("-e", "--evalue ", action="store", type="float", dest="evalue", help="evalue that determines score threshold for significant islands", metavar="<float>")
    parser.add_option("-f", "--out_island_file", action="store", type="string", dest="out_island_file", help="output island file name", metavar="<file>")

    (opt, _unused_args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    # Chromosome names and lengths come from the BAM file rather than from a
    # species table.
    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))
    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue, genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" % (min_tags_in_window))

    sys.stderr.write("Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n")
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Score every window in place (only enrichment matters) and collect the
    # windows whose score is positive.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) > 0:
            eligible_windows = []
            for window in windows:
                read_count = window.value
                if read_count < min_tags_in_window:
                    score = -1
                else:
                    prob = poisson(read_count, average)
                    if prob < 1e-250:
                        # Outside of the scale, take an arbitrary number.
                        score = 1000
                    else:
                        score = -log(prob)
                window.value = score
                if score > 0:
                    eligible_windows.append(window)
            filtered_bed_val[chrom] = eligible_windows

    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # The context manager closes the output file even if island building fails.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            if len(filtered_bed_val[chrom]) > 0:
                islands = combine_proximal_islands(filtered_bed_val[chrom], opt.gap, 2)
                islands = find_region_above_threshold(islands, score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for island in islands:
                        outputfile.write("%s\t%s\t%s\t%s\n" % (chrom, island.start, island.end, island.value))
                else:
                    sys.stderr.write("\t" + chrom + " does not have any islands meeting the required significance\n")
    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))
コード例 #4
0
def main(argv):
    '''
    Coarse-graining approach to identify ChIP-Seq enriched domains.

    For every chromosome of the species that has windows in the summary
    graph, the start positions of windows with at least int(average) + 1
    reads are passed to coarsegraining(); the island list it returns goes to
    traceback(), and the resulting islands are written to
    "<chrom>.islandstemp".  Afterwards the output file is created with a
    bedGraph track line, and SeparateByChrom.combineAllGraphFiles / cleanup
    are called for the per-chromosome temp files.

    NOTE(review): the original docstring said "Coarse graining test chr1,
    input must only have chr1", yet the loop below visits every chromosome
    of the species -- confirm whether that restriction still applies.

    argv is the option list handed to OptionParser.parse_args.  When it has
    fewer than 14 entries (seven options plus their values) the help text is
    printed and the process exits with status 1.  An unknown species only
    produces a message.
    '''
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string", dest="species", help="mm8, hg18, background, etc", metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store", type="string", dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option("-g", "--graining_size", action="store", type="int", dest="step", help="graining unit size (>0)", metavar="<int>")
    parser.add_option("-e", "--score", action="store", type="int", dest="score", help="graining criterion, 0<score<=graining_size", metavar="<int>")
    parser.add_option("-t", "--mappable_faction_of_genome_size", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")
    parser.add_option("-f", "--output_file", action="store", type="string", dest="out_file", help="output file name", metavar="<file>")

    (opt, _unused_args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    # Every print below takes ONE pre-formatted string so the line is
    # byte-identical under the Python 2 print statement and the Python 3
    # print function.  The doubled spaces reproduce the separator the
    # original comma-separated print statements emitted.
    print("Coarse-graining approach to identify ChIP-Seq enriched domains:")
    if opt.species not in GenomeData.species_chroms:
        print("This species is not in my list!")
        return

    print("Species:  %s" % (opt.species,))
    print("Window_size:  %s" % (opt.window_size,))
    print("Coarse graining step:  %s" % (opt.step,))
    print("Coarse graining score: %s" % (opt.score,))
    chroms = GenomeData.species_chroms[opt.species]
    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph)
    print("Total read count: %s" % (total_read_count,))
    genome_length = sum(GenomeData.species_chrom_lengths[opt.species].values())
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    average = float(total_read_count) * opt.window_size / genome_length
    print("Effective genome length:  %s" % (genome_length,))
    print("window average: %s" % (average,))

    min_tags_in_window = int(average) + 1
    print("Minimum read count in a qualified window:  %s" % (min_tags_in_window,))

    print("Generate preprocessed data list")
    bed_val = BED.BED(opt.species, opt.summarygraph, "BED_GRAPH")
    for chrom in chroms:
        if chrom in bed_val.keys() and len(bed_val[chrom]) > 0:
            chrom_length = GenomeData.species_chrom_lengths[opt.species][chrom]
            # Only windows reaching the minimum read count take part.
            eligible_start_list = [
                window.start for window in bed_val[chrom]
                if window.value >= min_tags_in_window
            ]
            print("Coarse graining:")
            (_unused_result_list, island_list) = coarsegraining(
                eligible_start_list, opt.window_size, opt.step, opt.score, chrom_length)
            print("Trace back... %s" % (len(island_list),))
            islands = traceback(island_list, opt.window_size, opt.step, 0, chrom_length, chrom)
            print("%s islands found in %s" % (len(islands), chrom))
            # Per-chromosome temp file; closed even if a write fails.
            with open(chrom + ".islandstemp", 'w') as temp_file:
                for island in islands:
                    temp_file.write(chrom + '\t' + str(int(island.start)) + '\t' + str(int(island.end)) + '\t1\n')

    # Create the output file with its track line before the temp files are
    # combined into it.
    with open(opt.out_file, 'w') as out:
        out.write('track type=bedGraph name=' + opt.out_file + '\n')
    # NOTE(review): temp files exist only for chromosomes that had data, but
    # the two calls below receive the full chromosome list -- confirm that
    # SeparateByChrom tolerates missing files.
    SeparateByChrom.combineAllGraphFiles(chroms, ".islandstemp", opt.out_file)
    SeparateByChrom.cleanup(chroms, ".islandstemp")
コード例 #5
0
def main(argv):
    """
    Probability scoring with random background model.

    Steps, with progress printed on stdout:
      1. Look up the species in GenomeData; the genome length is the sum of
         its chromosome lengths, scaled by the -t fraction.
      2. Read the summary graph as BED_GRAPH and replace each window's value
         by a score: -1 when the read count is below
         background.min_tags_in_window, otherwise
         -log(poisson(read_count, average)), or the arbitrary value 1000 when
         that probability is below 1e-250.
      3. Keep the windows with a positive score, merge them with
         combine_proximal_islands and keep the islands returned by
         find_region_above_threshold for the threshold derived from -e.
      4. Write the islands to the -f file as tab-separated
         chrom, start, end, value lines.

    argv is the option list handed to OptionParser.parse_args.  When it has
    fewer than 14 entries (seven options plus their values) the help text is
    printed and the process exits with status 1.  An unknown species only
    produces a message.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8, hg18, background, etc", metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store", type="string",
                      dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int",
                      dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option("-g", "--gap_size(bp)", action="store", type="int",
                      dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float",
                      dest="fraction", help="mapable fraction of genome size", metavar="<float>")
    parser.add_option("-e", "--evalue ", action="store", type="float",
                      dest="evalue", help="evalue that determines score threshold for significant islands", metavar="<float>")
    parser.add_option("-f", "--out_island_file", action="store", type="string",
                      dest="out_island_file", help="output island file name", metavar="<file>")

    (opt, _unused_args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    # Every print below takes ONE pre-formatted string so the line is
    # byte-identical under the Python 2 print statement and the Python 3
    # print function.  The doubled spaces reproduce the separator the
    # original comma-separated print statements emitted.
    if opt.species not in GenomeData.species_chroms:
        print("This species is not in my list!")
        return

    print("Species:  %s" % (opt.species,))
    print("Window_size:  %s" % (opt.window_size,))
    print("Gap size:  %s" % (opt.gap,))
    print("E value is: %s" % (opt.evalue,))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph)
    print("Total read count: %s" % (total_read_count,))
    genome_length = sum(GenomeData.species_chrom_lengths[opt.species].values())
    print("Genome Length:  %s" % (genome_length,))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    average = float(total_read_count) * opt.window_size / genome_length
    print("Effective genome Length:  %s" % (genome_length,))
    print("Window average: %s" % (average,))
    print("opt.evalue: %s" % (opt.evalue,))

    window_pvalue = 0.20
    bin_size = 0.001
    # NOTE(review): the next lines repeat values already reported above; they
    # look like debugging output and are kept only so stdout stays unchanged.
    print("Total read count: %s" % (total_read_count,))
    print("window_size: %s" % (opt.window_size,))
    print("opt.gap: %s" % (opt.gap,))
    print("Window pvalue: %s" % (window_pvalue,))
    print("genome_length: %s" % (genome_length,))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue, genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    print("Minimum num of tags in a qualified window:  %s" % (min_tags_in_window,))

    print("Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows ")
    # NOTE(review): debugging output (label repeated five times), kept so
    # stdout stays unchanged.
    print("opt.summarygraph " * 5 + opt.summarygraph)
    bed_val = BED.BED(opt.species, opt.summarygraph, "BED_GRAPH")

    # Score every window in place (only enrichment matters) and collect the
    # windows whose score is positive.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) > 0:
            eligible_windows = []
            for window in windows:
                read_count = window.value
                if read_count < min_tags_in_window:
                    score = -1
                else:
                    prob = poisson(read_count, average)
                    if prob < 1e-250:
                        # Outside of the scale, take an arbitrary number.
                        score = 1000
                    else:
                        score = -log(prob)
                window.value = score
                if score > 0:
                    eligible_windows.append(window)
            filtered_bed_val[chrom] = eligible_windows

    print("Determine the score threshold from random background")
    print("opt.evalue %s" % (opt.evalue,))
    score_threshold = background.find_island_threshold(opt.evalue)
    print("The score threshold is:  %s" % (score_threshold,))

    print("Make and write islands")
    total_number_islands = 0
    # The context manager closes the output file even if island building fails.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            if len(filtered_bed_val[chrom]) > 0:
                islands = combine_proximal_islands(filtered_bed_val[chrom], opt.gap, 2)
                islands = find_region_above_threshold(islands, score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for island in islands:
                        outputfile.write("%s\t%s\t%s\t%s\n" % (chrom, island.start, island.end, island.value))
                else:
                    # The Python 2 print statement emits no separator after an
                    # item ending in a tab, hence no space between "\t" and
                    # the chromosome name.
                    print("\t%s does not have any islands meeting the required significance" % (chrom,))
    print("Total number of islands:  %s" % (total_number_islands,))
コード例 #6
0
def main(argv):
    """
    Probability scoring with random background model.

    Steps, with progress written to stderr:
      1. Take the chromosomes from SeparateByChrom.getChromsFromBam(opt.bam);
         the genome length is the sum of that mapping's values, scaled by the
         -t fraction.
      2. Read the summary graph as BED_GRAPH and replace each window's value
         by a score: -1 when the read count is below
         background.min_tags_in_window, otherwise
         -log(poisson(read_count, average)), or the arbitrary value 1000 when
         that probability is below 1e-250.
      3. Keep the windows with a positive score, merge them with
         combine_proximal_islands and keep the islands returned by
         find_region_above_threshold for the threshold derived from -e.
      4. Write the islands to the -f file as tab-separated
         chrom, start, end, value lines.

    argv is the option list handed to OptionParser.parse_args.  When it has
    fewer than 14 entries (seven options plus their values) the help text is
    printed and the process exits with status 1.
    """
    parser = OptionParser()
    parser.add_option(
        "-B", "--bam", action="store", type="string", dest="bam",
        help="Any suitable bam file that can be used to extrcat chroms from header",
        metavar="<str>")
    parser.add_option(
        "-b", "--summarygraph", action="store", type="string",
        dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option(
        "-w", "--window_size(bp)", action="store", type="int",
        dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option(
        "-g", "--gap_size(bp)", action="store", type="int",
        dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option(
        "-t", "--mappable_fraction_of_genome_size ", action="store",
        type="float", dest="fraction",
        help="mapable fraction of genome size", metavar="<float>")
    parser.add_option(
        "-e", "--evalue ", action="store", type="float", dest="evalue",
        help="evalue that determines score threshold for significant islands",
        metavar="<float>")
    parser.add_option(
        "-f", "--out_island_file", action="store", type="string",
        dest="out_island_file", help="output island file name",
        metavar="<file>")

    (opt, _unused_args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    # Chromosome names and lengths come from the BAM file rather than from a
    # species table.
    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(
        opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))
    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue,
        genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" %
                     (min_tags_in_window))

    sys.stderr.write(
        "Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n"
    )
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Score every window in place (only enrichment matters) and collect the
    # windows whose score is positive.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) > 0:
            eligible_windows = []
            for window in windows:
                read_count = window.value
                if read_count < min_tags_in_window:
                    score = -1
                else:
                    prob = poisson(read_count, average)
                    if prob < 1e-250:
                        # Outside of the scale, take an arbitrary number.
                        score = 1000
                    else:
                        score = -log(prob)
                window.value = score
                if score > 0:
                    eligible_windows.append(window)
            filtered_bed_val[chrom] = eligible_windows

    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # The context manager closes the output file even if island building fails.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            if len(filtered_bed_val[chrom]) > 0:
                islands = combine_proximal_islands(filtered_bed_val[chrom],
                                                   opt.gap, 2)
                islands = find_region_above_threshold(islands, score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for island in islands:
                        outputfile.write(
                            "%s\t%s\t%s\t%s\n" %
                            (chrom, island.start, island.end, island.value))
                else:
                    sys.stderr.write(
                        "\t" + chrom +
                        " does not have any islands meeting the required significance\n"
                    )
    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))