def main(argv):
    """
    Build a summary graph file from the reads in a BAM file.

    Steps, in order: get the chromosomes of the BAM file from
    SeparateByChrom.getChromsFromBam, split the BAM file with
    SeparateByChrom.separateByChromBamToBed (suffix ".bed"), run
    makeGraphFile, combine the ".graph" files into the output file and
    clean up the intermediate ".bed" and ".graph" files.

    argv: command-line arguments, handed unchanged to OptionParser.parse_args.

    The window size and the fragment size are parsed as integers.
    Exits with status 2 (through parser.error) when -b is not given.
    """
    parser = OptionParser()
    # -s is still accepted for backward compatibility but is not used below.
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc", metavar="<str>")
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bamfile", help="bam file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size", metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if opt.bamfile is None:
        # Every later step reads the BAM file; stop with a usage message
        # instead of handing None to SeparateByChrom.
        parser.error("no bam file given (use -b/--bed_file)")

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bamfile)
    chroms = chromsDict.keys()

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bamfile, '.bed')
    makeGraphFile(chroms, chromsDict, opt.window_size, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)

    # Remove the intermediate files of both stages.
    SeparateByChrom.cleanup(chroms, ".bed")
    SeparateByChrom.cleanup(chroms, ".graph")
def main(argv):
    """
    Filter the reads of a BAM file and write the result to a new file.

    The BAM file is split with SeparateByChrom.separateByChromBamToBed
    (suffix ".bed1"), applying the optional sam-flag / mapq filters.  With a
    positive threshold, strand_broken_remove is run for every chromosome
    whose ".bed1" file exists and the ".bed2" files are combined into the
    output file; otherwise the ".bed1" files are combined directly.
    NOTE(review): strand_broken_remove is presumably what produces the
    ".bed2" files -- confirm against its definition.

    argv: command-line arguments, handed unchanged to OptionParser.parse_args;
    help is printed and the program exits with status 1 when argv has fewer
    than 8 entries.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species under consideration", metavar="<str>")
    parser.add_option("-b", "--raw_bam_file", action="store", type="string",
                      dest="bam_file", help="raw bam file", metavar="<file>")
    parser.add_option("-t", "--threshold", action="store", type="int",
                      dest="threshold", help="threshold for copy number", metavar="<int>")
    parser.add_option("-o", "--output_file_name", action="store", type="string",
                      dest="out_file", help="output file name", metavar="<file>")
    # Options to filter reads
    parser.add_option("-f", "--requiredFlag", type='int', help="Required bit in sam flag. Same as samtools view -f")
    parser.add_option("-F", "--filterFlag", type='int', help="Filter out bit in sam flag, Same as samtools view -F")
    parser.add_option("-q", "--mapq", type='int', help="minimum mapq for a read to be kept")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    chroms = SeparateByChrom.getChromsFromBam(opt.bam_file)

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bam_file, '.bed1',
                                            requiredFlag=opt.requiredFlag,
                                            filterFlag=opt.filterFlag,
                                            mapq=opt.mapq)

    # An omitted -t leaves threshold as None; treat it like "no threshold"
    # explicitly instead of relying on how None compares with an int.
    use_threshold = opt.threshold is not None and opt.threshold > 0
    if use_threshold:
        for chrom in chroms:
            if Utility.fileExists(chrom + ".bed1"):
                strand_broken_remove(chrom, opt.threshold)
        combined_suffix = '.bed2'
    else:
        combined_suffix = '.bed1'
    SeparateByChrom.combineAllGraphFilesBedToBam(chroms, combined_suffix, opt.bam_file, opt.out_file)

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
Example #3
0
def main(argv):
    """
    Probability scoring with random background model.

    Every window of the summary graph gets the score -log(poisson(read_count,
    average)) (1000 when the probability is below 1e-250, -1 when the window
    has fewer reads than the background model's min_tags_in_window).  Windows
    with a positive score are passed to combine_proximal_islands and
    find_region_above_threshold, and the resulting islands are written to the
    output island file as tab-separated chrom, start, end, value.

    argv: command-line arguments, handed unchanged to OptionParser.parse_args;
    help is printed and the program exits with status 1 when argv has fewer
    than 14 entries.
    """
    parser = OptionParser()

    parser.add_option("-B", "--bam", action="store", type="string", dest="bam", help="Any suitable bam file that can be used to extrcat chroms from header", metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store",type="string", dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option("-g", "--gap_size(bp)", action="store", type="int",  dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")
    parser.add_option("-e", "--evalue ", action="store", type="float", dest="evalue", help="evalue that determines score threshold for significant islands", metavar="<float>")
    parser.add_option("-f", "--out_island_file", action="store", type="string", dest="out_island_file", help="output island file name", metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))
    # The genome length is the sum of the values of the chromosome mapping.
    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    # Expected number of reads per window under a uniform background.
    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(total_read_count, opt.window_size, opt.gap, window_pvalue, genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" % (min_tags_in_window))

    sys.stderr.write("Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n")
    # Read in the summary graph file.
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Replace each window's read count by its score (only enrichment is of
    # interest) and keep the windows whose score is positive.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) > 0:
            kept_windows = []
            for window in windows:
                read_count = window.value
                if read_count < min_tags_in_window:
                    score = -1
                else:
                    prob = poisson(read_count, average)
                    if prob < 1e-250:
                        # Outside of the scale, take an arbitrary number.
                        score = 1000
                    else:
                        score = -log(prob)
                window.value = score
                if score > 0:
                    kept_windows.append(window)
            filtered_bed_val[chrom] = kept_windows

    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # The with-block closes the island file even if island building fails.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            if len(filtered_bed_val[chrom]) > 0:
                islands = combine_proximal_islands(filtered_bed_val[chrom], opt.gap, 2)
                islands = find_region_above_threshold(islands, score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for island in islands:
                        outputfile.write(chrom + "\t" + str(island.start) + "\t" + str(island.end) + "\t" + str(island.value) + "\n")
                else:
                    sys.stderr.write("\t" + chrom + " does not have any islands meeting the required significance\n")
    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))
Example #4
0
def main(argv):
    """
    Build a summary graph file from the reads in a BAM file.

    Steps, in order: get the chromosomes of the BAM file from
    SeparateByChrom.getChromsFromBam, split the BAM file with
    SeparateByChrom.separateByChromBamToBed (suffix ".bed"), run
    makeGraphFile, combine the ".graph" files into the output file and
    clean up the intermediate ".bed" and ".graph" files.

    argv: command-line arguments, handed unchanged to OptionParser.parse_args.

    The window size and the fragment size are parsed as integers.
    Exits with status 2 (through parser.error) when -b is not given.
    """
    parser = OptionParser()
    # -s is still accepted for backward compatibility but is not used below.
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="mm8,hg18,dm2,etc",
                      metavar="<str>")
    parser.add_option("-b",
                      "--bed_file",
                      action="store",
                      type="string",
                      dest="bamfile",
                      help="bam file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w",
                      "--window_size",
                      action="store",
                      type="int",
                      dest="window_size",
                      help="window size",
                      metavar="<int>")
    parser.add_option("-i",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      help="output bed summary file name",
                      metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if opt.bamfile is None:
        # Every later step reads the BAM file; stop with a usage message
        # instead of handing None to SeparateByChrom.
        parser.error("no bam file given (use -b/--bed_file)")

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bamfile)
    chroms = chromsDict.keys()

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bamfile, '.bed')
    makeGraphFile(chroms, chromsDict, opt.window_size, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)

    # Remove the intermediate files of both stages.
    SeparateByChrom.cleanup(chroms, ".bed")
    SeparateByChrom.cleanup(chroms, ".graph")
Example #5
0
def main(argv):
    """
    Filter the reads of a BAM file and write the result to a new file.

    The BAM file is split with SeparateByChrom.separateByChromBamToBed
    (suffix ".bed1"), applying the optional sam-flag / mapq filters.  With a
    positive threshold, strand_broken_remove is run for every chromosome
    whose ".bed1" file exists and the ".bed2" files are combined into the
    output file; otherwise the ".bed1" files are combined directly.
    NOTE(review): strand_broken_remove is presumably what produces the
    ".bed2" files -- confirm against its definition.

    argv: command-line arguments, handed unchanged to OptionParser.parse_args;
    help is printed and the program exits with status 1 when argv has fewer
    than 8 entries.
    """
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species under consideration",
                      metavar="<str>")
    parser.add_option("-b",
                      "--raw_bam_file",
                      action="store",
                      type="string",
                      dest="bam_file",
                      help="raw bam file",
                      metavar="<file>")
    parser.add_option("-t",
                      "--threshold",
                      action="store",
                      type="int",
                      dest="threshold",
                      help="threshold for copy number",
                      metavar="<int>")
    parser.add_option("-o",
                      "--output_file_name",
                      action="store",
                      type="string",
                      dest="out_file",
                      help="output file name",
                      metavar="<file>")
    # Options to filter reads
    parser.add_option(
        "-f",
        "--requiredFlag",
        type='int',
        help="Required bit in sam flag. Same as samtools view -f")
    parser.add_option(
        "-F",
        "--filterFlag",
        type='int',
        help="Filter out bit in sam flag, Same as samtools view -F")
    parser.add_option("-q",
                      "--mapq",
                      type='int',
                      help="minimum mapq for a read to be kept")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    chroms = SeparateByChrom.getChromsFromBam(opt.bam_file)

    SeparateByChrom.separateByChromBamToBed(chroms,
                                            opt.bam_file,
                                            '.bed1',
                                            requiredFlag=opt.requiredFlag,
                                            filterFlag=opt.filterFlag,
                                            mapq=opt.mapq)

    # An omitted -t leaves threshold as None; treat it like "no threshold"
    # explicitly instead of relying on how None compares with an int.
    use_threshold = opt.threshold is not None and opt.threshold > 0
    if use_threshold:
        for chrom in chroms:
            if Utility.fileExists(chrom + ".bed1"):
                strand_broken_remove(chrom, opt.threshold)
        combined_suffix = '.bed2'
    else:
        combined_suffix = '.bed1'
    SeparateByChrom.combineAllGraphFilesBedToBam(chroms, combined_suffix,
                                                 opt.bam_file, opt.out_file)

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
Example #6
0
def main(argv):
    """
    Compare chip and control read counts on islands and write a summary.

    Both BAM files are split per chromosome (".bed1" for chip, ".bed2" for
    control), the reads falling on each island are counted, and for every
    island a fold change and a Poisson p-value of the chip count against the
    scaled control count are computed.  Islands without control reads use an
    expected control count derived from the island length, capped at 0.25.

    Output columns (tab separated): chrom, start, end, chip read count,
    control read count, p-value, fold change, alpha, where alpha is
    p-value * number of islands / rank of the p-value, capped at 1.

    argv: command-line arguments, handed unchanged to OptionParser.parse_args.
    Exits with status 1 when one of the read files does not exist.
    """
    parser = OptionParser()
    parser.add_option("-a",
                      "--rawchipreadfile",
                      action="store",
                      type="string",
                      dest="chipreadfile",
                      metavar="<file>",
                      help="raw read file from chip in BAM format")
    parser.add_option("-b",
                      "--rawcontrolreadfile",
                      action="store",
                      type="string",
                      dest="controlreadfile",
                      metavar="<file>",
                      help="raw read file from control in BAM format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandfile",
                      metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="island read count summary file")
    parser.add_option("-t",
                      "--mappable_fraction_of_genome_size ",
                      action="store",
                      type="float",
                      dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")

    (opt, args) = parser.parse_args(argv)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.chipreadfile)
    # Effective genome size: sum of the mapping's values times the fraction.
    genomesize = sum(chromsDict.values()) * opt.fraction

    chip_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.controlreadfile)
    sys.stderr.write("chip library size  %s\n" % (chip_library_size))
    sys.stderr.write("control library size %s\n" % (control_library_size))

    totalchip = 0
    totalcontrol = 0

    islands = BED.BED(chromsDict.keys(), opt.islandfile, "BED3", 0)

    # separate by chrom the chip library
    if Utility.fileExists(opt.chipreadfile):
        SeparateByChrom.separateByChromBamToBed(chromsDict.keys(),
                                                opt.chipreadfile, '.bed1')
    else:
        sys.stderr.write(opt.chipreadfile + " not found")
        sys.exit(1)
    # separate by chrom the control library
    if Utility.fileExists(opt.controlreadfile):
        SeparateByChrom.separateByChromBamToBed(chromsDict.keys(),
                                                opt.controlreadfile, '.bed2')
    else:
        sys.stderr.write(opt.controlreadfile + " not found")
        sys.exit(1)

    island_chip_readcount = {}
    island_control_readcount = {}

    for chrom in chromsDict:
        if chrom in islands.keys() and len(islands[chrom]) != 0:
            island_list = islands[chrom]
            # Sorted in place, so the second pass below sees the same order
            # as the per-island counts collected here.
            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'))

            island_start_list = [item.start for item in island_list]
            island_end_list = [item.end for item in island_list]

            chip_counts = _count_reads_on_islands(
                chrom + ".bed1", island_start_list, island_end_list,
                opt.fragment_size)
            totalchip += sum(chip_counts)
            island_chip_readcount[chrom] = chip_counts

            control_counts = _count_reads_on_islands(
                chrom + ".bed2", island_start_list, island_end_list,
                opt.fragment_size)
            totalcontrol += sum(control_counts)
            island_control_readcount[chrom] = control_counts

    # Control counts are scaled by the ratio of the library sizes.
    scaling_factor = chip_library_size * 1.0 / control_library_size

    # Same bytes as the former print statements, valid on Python 2 and 3.
    sys.stdout.write(
        "Total number of chip reads on islands is:  %s\n" % totalchip)
    sys.stdout.write(
        "Total number of control reads on islands is:  %s\n" % totalcontrol)

    # The with-block closes the summary file even if the statistics fail.
    with open(opt.out_file, 'w') as out:
        pvalue_list = []
        result_list = []
        for chrom in sorted(chromsDict.keys()):
            if chrom in islands.keys() and len(islands[chrom]) != 0:
                chip_counts = island_chip_readcount[chrom]
                control_counts = island_control_readcount[chrom]
                for index, item in enumerate(islands[chrom]):
                    observation = chip_counts[index]
                    control_tag = control_counts[index]
                    if control_tag > 0:
                        average = control_tag * scaling_factor
                    else:
                        # No control read on the island: fall back to the
                        # length-based expectation, capped at 0.25.
                        length = item.end - item.start + 1
                        average = length * control_library_size * 1.0 / genomesize
                        average = min(0.25, average) * scaling_factor
                    fc = float(observation) / float(average)
                    if observation > average:
                        pvalue = scipy.stats.poisson.sf(observation,
                                                        average)[()]
                    else:
                        pvalue = 1
                    pvalue_list.append(pvalue)
                    result_list.append({
                        'chrom': item.chrom,
                        'start': item.start,
                        'end': item.end,
                        'chip': observation,
                        'control': control_tag,
                        'pvalue': pvalue,
                        'fc': fc,
                    })

        # rankdata accepts the plain list; no array conversion is needed.
        pvaluerankarray = scipy.stats.rankdata(pvalue_list)
        totalnumber = len(result_list)
        for i, item in enumerate(result_list):
            alpha = pvalue_list[i] * totalnumber / pvaluerankarray[i]
            if alpha > 1:
                alpha = 1
            outline = item['chrom'] + "\t" + "\t".join([
                str(item['start']),
                str(item['end']),
                str(item['chip']),
                str(item['control']),
                str(item['pvalue']),
                str(item['fc']),
                str(alpha),
            ]) + "\n"
            out.write(outline)

    SeparateByChrom.cleanup(chromsDict.keys(), '.bed1')
    SeparateByChrom.cleanup(chromsDict.keys(), '.bed2')


def _count_reads_on_islands(read_file, island_start_list, island_end_list,
                            fragment_size):
    """
    Count, per island, the reads of one read file that fall on it.

    read_file: path of the file to read; lines starting with "#" are skipped.
    island_start_list / island_end_list: parallel lists of island boundaries.
    fragment_size: passed to associate_tags_with_regions.tag_position.

    Returns a list of counts parallel to the island lists.  Reads for which
    find_readcount_on_islands returns a negative index are not counted.
    """
    counts = [0] * len(island_start_list)
    with open(read_file, 'r') as reads:
        for line in reads:
            if line.startswith("#"):
                continue
            sline = line.strip().split()
            position = associate_tags_with_regions.tag_position(
                sline, fragment_size)
            index = associate_tags_with_regions.find_readcount_on_islands(
                island_start_list, island_end_list, position)
            if index >= 0:
                counts[index] += 1
    return counts
def main(argv):
    """
    Compare chip and control read counts on islands and write a summary.

    Both BAM files are split per chromosome (".bed1" for chip, ".bed2" for
    control), the reads falling on each island are counted, and for every
    island a fold change and a Poisson p-value of the chip count against the
    scaled control count are computed.  Islands without control reads use an
    expected control count derived from the island length, capped at 0.25.

    Output columns (tab separated): chrom, start, end, chip read count,
    control read count, p-value, fold change, alpha, where alpha is
    p-value * number of islands / rank of the p-value, capped at 1.

    argv: command-line arguments, handed unchanged to OptionParser.parse_args.
    Exits with status 1 when one of the read files does not exist.
    """
    parser = OptionParser()
    parser.add_option("-a", "--rawchipreadfile", action="store", type="string", dest="chipreadfile", metavar="<file>", help="raw read file from chip in BAM format")
    parser.add_option("-b", "--rawcontrolreadfile", action="store", type="string", dest="controlreadfile", metavar="<file>", help="raw read file from control in BAM format")
    parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after CHIP experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")

    (opt, args) = parser.parse_args(argv)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.chipreadfile)
    # Effective genome size: sum of the mapping's values times the fraction.
    genomesize = sum(chromsDict.values()) * opt.fraction

    chip_library_size = get_total_tag_counts.get_total_tag_counts_bam(opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts_bam(opt.controlreadfile)
    sys.stderr.write("chip library size  %s\n" % (chip_library_size))
    sys.stderr.write("control library size %s\n" % (control_library_size))

    totalchip = 0
    totalcontrol = 0

    islands = BED.BED(chromsDict.keys(), opt.islandfile, "BED3", 0)

    # separate by chrom the chip library
    if Utility.fileExists(opt.chipreadfile):
        SeparateByChrom.separateByChromBamToBed(chromsDict.keys(), opt.chipreadfile, '.bed1')
    else:
        sys.stderr.write(opt.chipreadfile + " not found")
        sys.exit(1)
    # separate by chrom the control library
    if Utility.fileExists(opt.controlreadfile):
        SeparateByChrom.separateByChromBamToBed(chromsDict.keys(), opt.controlreadfile, '.bed2')
    else:
        sys.stderr.write(opt.controlreadfile + " not found")
        sys.exit(1)

    island_chip_readcount = {}
    island_control_readcount = {}

    for chrom in chromsDict:
        if chrom in islands.keys() and len(islands[chrom]) != 0:
            island_list = islands[chrom]
            # Sorted in place, so the second pass below sees the same order
            # as the per-island counts collected here.
            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'))

            island_start_list = [item.start for item in island_list]
            island_end_list = [item.end for item in island_list]

            chip_counts = _count_reads_on_islands(chrom + ".bed1", island_start_list, island_end_list, opt.fragment_size)
            totalchip += sum(chip_counts)
            island_chip_readcount[chrom] = chip_counts

            control_counts = _count_reads_on_islands(chrom + ".bed2", island_start_list, island_end_list, opt.fragment_size)
            totalcontrol += sum(control_counts)
            island_control_readcount[chrom] = control_counts

    # Control counts are scaled by the ratio of the library sizes.
    scaling_factor = chip_library_size * 1.0 / control_library_size

    # Same bytes as the former print statements, valid on Python 2 and 3.
    sys.stdout.write("Total number of chip reads on islands is:  %s\n" % totalchip)
    sys.stdout.write("Total number of control reads on islands is:  %s\n" % totalcontrol)

    # The with-block closes the summary file even if the statistics fail.
    with open(opt.out_file, 'w') as out:
        pvalue_list = []
        result_list = []
        for chrom in sorted(chromsDict.keys()):
            if chrom in islands.keys() and len(islands[chrom]) != 0:
                chip_counts = island_chip_readcount[chrom]
                control_counts = island_control_readcount[chrom]
                for index, item in enumerate(islands[chrom]):
                    observation = chip_counts[index]
                    control_tag = control_counts[index]
                    if control_tag > 0:
                        average = control_tag * scaling_factor
                    else:
                        # No control read on the island: fall back to the
                        # length-based expectation, capped at 0.25.
                        length = item.end - item.start + 1
                        average = length * control_library_size * 1.0 / genomesize
                        average = min(0.25, average) * scaling_factor
                    fc = float(observation) / float(average)
                    if observation > average:
                        pvalue = scipy.stats.poisson.sf(observation, average)[()]
                    else:
                        pvalue = 1
                    pvalue_list.append(pvalue)
                    result_list.append({'chrom': item.chrom, 'start': item.start, 'end': item.end, 'chip': observation, 'control': control_tag, 'pvalue': pvalue, 'fc': fc})

        # rankdata accepts the plain list; no array conversion is needed.
        pvaluerankarray = scipy.stats.rankdata(pvalue_list)
        totalnumber = len(result_list)
        for i, item in enumerate(result_list):
            alpha = pvalue_list[i] * totalnumber / pvaluerankarray[i]
            if alpha > 1:
                alpha = 1
            outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n"
            out.write(outline)

    SeparateByChrom.cleanup(chromsDict.keys(), '.bed1')
    SeparateByChrom.cleanup(chromsDict.keys(), '.bed2')


def _count_reads_on_islands(read_file, island_start_list, island_end_list, fragment_size):
    """
    Count, per island, the reads of one read file that fall on it.

    read_file: path of the file to read; lines starting with "#" are skipped.
    island_start_list / island_end_list: parallel lists of island boundaries.
    fragment_size: passed to associate_tags_with_regions.tag_position.

    Returns a list of counts parallel to the island lists.  Reads for which
    find_readcount_on_islands returns a negative index are not counted.
    """
    counts = [0] * len(island_start_list)
    with open(read_file, 'r') as reads:
        for line in reads:
            if line.startswith("#"):
                continue
            sline = line.strip().split()
            position = associate_tags_with_regions.tag_position(sline, fragment_size)
            index = associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position)
            if index >= 0:
                counts[index] += 1
    return counts
Example #8
0
def main(argv):
    """
    Probability scoring with random background model.

    Reads the per-window read counts of a summary graph, converts every
    window that holds at least ``background.min_tags_in_window`` reads into
    the score ``-log(poisson(read_count, average))``, hands the positively
    scored windows of each chromosome to ``combine_proximal_islands`` and
    ``find_region_above_threshold`` and writes the resulting islands to the
    output file as tab-separated ``chrom, start, end, value`` lines.

    Progress and the derived parameters are reported on stderr.

    argv: command line argument list handed to ``OptionParser.parse_args``.
          When it holds fewer than 14 entries the usage message is printed
          and the process exits with status 1.
    """
    parser = OptionParser()

    parser.add_option(
        "-B", "--bam", action="store", type="string", dest="bam",
        help="Any suitable bam file that can be used to extrcat chroms from header",
        metavar="<str>")
    parser.add_option(
        "-b", "--summarygraph", action="store", type="string",
        dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option(
        "-w", "--window_size(bp)", action="store", type="int",
        dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option(
        "-g", "--gap_size(bp)", action="store", type="int",
        dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option(
        "-t", "--mappable_fraction_of_genome_size ", action="store",
        type="float", dest="fraction",
        help="mapable fraction of genome size", metavar="<float>")
    parser.add_option(
        "-e", "--evalue ", action="store", type="float", dest="evalue",
        help="evalue that determines score threshold for significant islands",
        metavar="<float>")
    parser.add_option(
        "-f", "--out_island_file", action="store", type="string",
        dest="out_island_file", help="output island file name",
        metavar="<file>")

    opt, _ = parser.parse_args(argv)
    # Seven options with one value each: anything shorter cannot be a
    # complete invocation.
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    # The keys of this dictionary are used as the chromosome names and its
    # values are summed up as the genome length below.
    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(
        opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))

    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective length, i.e. the raw
    # length scaled by the mappable fraction.
    genome_length = int(opt.fraction * genome_length)

    # Expected number of reads per window.
    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue,
        genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" %
                     (min_tags_in_window))

    sys.stderr.write(
        "Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n"
    )
    # Read in the summary graph file.
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Replace each window's read count by its score (in place) and collect
    # the windows with a positive score per chromosome.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) > 0:
            eligible_windows = []
            filtered_bed_val[chrom] = eligible_windows
            for window in windows:
                read_count = window.value
                if read_count < min_tags_in_window:
                    # Too few reads: mark the window as ineligible.
                    score = -1
                else:
                    prob = poisson(read_count, average)
                    if prob < 1e-250:
                        # Outside of the scale, take an arbitrary number.
                        score = 1000
                    else:
                        score = -log(prob)
                window.value = score
                if score > 0:
                    eligible_windows.append(window)

    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # The context manager closes the output file even if island
    # construction raises part-way through.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom, eligible_windows in filtered_bed_val.items():
            if len(eligible_windows) > 0:
                islands = combine_proximal_islands(eligible_windows,
                                                   opt.gap, 2)
                islands = find_region_above_threshold(islands,
                                                      score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for island in islands:
                        outline = "\t".join([
                            chrom,
                            str(island.start),
                            str(island.end),
                            str(island.value),
                        ]) + "\n"
                        outputfile.write(outline)
                else:
                    sys.stderr.write(
                        "\t" + chrom +
                        " does not have any islands meeting the required significance\n"
                    )
    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))