Python SeparateByChrom.getChromsFromBam Examples

Programming Language: Python

Class/Type: SeparateByChrom

Method/Function: getChromsFromBam

Examples at hotexamples.com: 8

Python SeparateByChrom.getChromsFromBam - 8 examples found. These are the top rated real world Python examples of SeparateByChrom.getChromsFromBam extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

cleanup(30)

separateByChrom(27)

combineAllGraphFiles(9)

getChromsFromBam(4)

separateByChromBamToBed(3)

combineAllGraphFilesBedToBam(1)

Example #1

Show file

File: run-make-graph-file-by-chrom_bam.py Project: dariober/SICERpy

def main(argv):
    """
    Build a summary graph file from the reads of a BAM file.

    The chromosomes are obtained from the BAM file with
    SeparateByChrom.getChromsFromBam, the reads are separated by chromosome
    (suffix '.bed'), makeGraphFile is run on them, the per-chromosome
    '.graph' results are combined into the requested output file and the
    '.bed' and '.graph' intermediates are cleaned up at the end.

    argv -- command line arguments, passed unchanged to
            OptionParser.parse_args().

    Note that window_size and fragment_size are parsed as integers
    (type="int"), not as strings.
    """
    parser = OptionParser()
    # -s/--species is still accepted so existing command lines keep working,
    # but its value is not read anywhere in this function.
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc", metavar="<str>")
    # The long flag keeps its historical name; the file itself is consumed as
    # BAM (getChromsFromBam / separateByChromBamToBed), hence the help text.
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bamfile", help="bam file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size", metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")

    (opt, args) = parser.parse_args(argv)

    # chromsDict is handed to makeGraphFile where the chromosome lengths were
    # expected -- presumably it maps chromosome name -> length (TODO confirm).
    chromsDict = SeparateByChrom.getChromsFromBam(opt.bamfile)
    chroms = chromsDict.keys()

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bamfile, '.bed')
    makeGraphFile(chroms, chromsDict, opt.window_size, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)

    # Remove the per-chromosome intermediates.
    SeparateByChrom.cleanup(chroms, ".bed")
    SeparateByChrom.cleanup(chroms, ".graph")

Example #2

Show file

File: remove_redundant_reads_bam.py Project: dariober/SICERpy

def main(argv):
    """
    Filter the reads of a BAM file chromosome by chromosome and write the
    result to a new file.

    The reads are separated by chromosome into '.bed1' files (optionally
    filtered by sam flag and mapq). When the threshold is positive,
    strand_broken_remove is run on every chromosome that has a '.bed1' file
    and the '.bed2' files are combined into the output; otherwise the
    '.bed1' files are combined directly. Both sets of intermediates are
    cleaned up afterwards.

    argv -- command line arguments; the usage text is printed and the
            process exits with status 1 when fewer than 8 items are given.
    """
    cli = OptionParser()
    cli.add_option("-s", "--species", dest="species", action="store",
                   type="string", metavar="<str>",
                   help="species under consideration")
    cli.add_option("-b", "--raw_bam_file", dest="bam_file", action="store",
                   type="string", metavar="<file>", help="raw bam file")
    cli.add_option("-t", "--threshold", dest="threshold", action="store",
                   type="int", metavar="<int>",
                   help="threshold for copy number")
    cli.add_option("-o", "--output_file_name", dest="out_file",
                   action="store", type="string", metavar="<file>",
                   help="output file name")
    # Read filters, named after the matching samtools view switches.
    cli.add_option("-f", "--requiredFlag", type='int',
                   help="Required bit in sam flag. Same as samtools view -f")
    cli.add_option("-F", "--filterFlag", type='int',
                   help="Filter out bit in sam flag, Same as samtools view -F")
    cli.add_option("-q", "--mapq", type='int',
                   help="minimum mapq for a read to be kept")

    opt, _positional = cli.parse_args(argv)
    if len(argv) < 8:
        cli.print_help()
        sys.exit(1)

    chrom_names = SeparateByChrom.getChromsFromBam(opt.bam_file)

    SeparateByChrom.separateByChromBamToBed(
        chrom_names, opt.bam_file, '.bed1',
        requiredFlag=opt.requiredFlag,
        filterFlag=opt.filterFlag,
        mapq=opt.mapq)

    if opt.threshold > 0:
        for name in chrom_names:
            if Utility.fileExists(name + ".bed1"):
                strand_broken_remove(name, opt.threshold)
        merged_suffix = '.bed2'
    else:
        # No thresholding requested: combine the unprocessed files.
        merged_suffix = '.bed1'
    SeparateByChrom.combineAllGraphFilesBedToBam(
        chrom_names, merged_suffix, opt.bam_file, opt.out_file)

    for suffix in ('.bed1', '.bed2'):
        SeparateByChrom.cleanup(chrom_names, suffix)

Example #3

Show file

File: find_islands_in_pr.py Project: dariober/SICERpy

def main(argv):
    """
    Probability scoring with random background model.

    Steps performed:
      1. Read the chromosomes from the BAM file and derive the effective
         genome length as int(fraction * sum of chromsDict.values()).
      2. Replace each window's read count in the summary graph with a score:
         -1 when the count is below background.min_tags_in_window, otherwise
         -log(poisson(count, average)), capped at 1000 when the probability
         is below 1e-250. Only windows with a positive score are kept.
      3. Combine the kept windows into islands, keep the islands above the
         score threshold derived from the e-value, and write them to the
         output file as tab separated chrom, start, end, score.

    Progress information is written to stderr.

    argv -- command line arguments; the usage text is printed and the
            process exits with status 1 when fewer than 14 items are given.
    """
    parser = OptionParser()

    # NOTE(review): some long flags below contain "(bp)" or a trailing space;
    # they are kept exactly as they were so existing command lines still work.
    parser.add_option("-B", "--bam", action="store", type="string", dest="bam", help="Any suitable bam file that can be used to extrcat chroms from header", metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store",type="string", dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option("-g", "--gap_size(bp)", action="store", type="int",  dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")
    parser.add_option("-e", "--evalue ", action="store", type="float", dest="evalue", help="evalue that determines score threshold for significant islands", metavar="<float>")
    parser.add_option("-f", "--out_island_file", action="store", type="string", dest="out_island_file", help="output island file name", metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))
    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    # Expected number of reads per window.
    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue,
        genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" % (min_tags_in_window))

    sys.stderr.write("Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n")
    # Read in the summary graph file.
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Turn read counts into scores (only enrichment matters) and keep the
    # windows whose score is positive.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) == 0:
            continue
        kept = []
        for window in windows:
            read_count = window.value
            if read_count < min_tags_in_window:
                score = -1
            else:
                prob = poisson(read_count, average)
                if prob < 1e-250:
                    # Outside of the scale, take an arbitrary number.
                    score = 1000
                else:
                    score = -log(prob)
            # The window is updated in place, as before.
            window.value = score
            if score > 0:
                kept.append(window)
        filtered_bed_val[chrom] = kept

    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # The context manager closes the output file even if island building fails.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            if len(filtered_bed_val[chrom]) > 0:
                islands = combine_proximal_islands(filtered_bed_val[chrom], opt.gap, 2)
                islands = find_region_above_threshold(islands, score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for i in islands:
                        outline = chrom + "\t" + str(i.start) + "\t" + str(i.end) + "\t" + str(i.value) + "\n"
                        outputfile.write(outline)
                else:
                    sys.stderr.write("\t" + chrom + " does not have any islands meeting the required significance\n")
    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))

Example #4

Show file

def main(argv):
    """
    Make a summary graph file out of a BAM file.

    Chromosomes come from SeparateByChrom.getChromsFromBam; the BAM reads are
    separated per chromosome with suffix '.bed', makeGraphFile processes
    them, the '.graph' files are combined into the output file, and the
    '.bed' / '.graph' intermediates are cleaned up.

    argv -- command line arguments, forwarded as-is to
            OptionParser.parse_args().

    Note the window_size and the fragment_size are both parsed as integers
    (type="int").
    """
    # (short flag, long flag, type, dest, help, metavar) -- registered in this
    # order so the generated usage text lists the options as before.
    # "--bed_file" keeps its historical name, but the file is read as BAM, so
    # the help text says so. "--species" is accepted but not used below.
    option_table = (
        ("-s", "--species", "string", "species",
         "mm8,hg18,dm2,etc", "<str>"),
        ("-b", "--bed_file", "string", "bamfile",
         "bam file to make graph file of", "<file>"),
        ("-w", "--window_size", "int", "window_size",
         "window size", "<int>"),
        ("-i", "--fragment_size", "int", "fragment_size",
         "size of fragments after CHIP experiment", "<int>"),
        ("-o", "--outfile", "string", "outfile",
         "output bed summary file name", "<file>"),
    )
    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, help_text, metavar in option_table:
        parser.add_option(short_flag,
                          long_flag,
                          action="store",
                          type=value_type,
                          dest=dest,
                          help=help_text,
                          metavar=metavar)

    (opt, args) = parser.parse_args(argv)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bamfile)
    chroms = chromsDict.keys()

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bamfile, '.bed')

    # chromsDict takes the place of the chromosome-length table here;
    # presumably it maps chromosome name -> length (TODO confirm).
    makeGraphFile(chroms, chromsDict, opt.window_size, opt.fragment_size)

    SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)

    # Drop the per-chromosome intermediates.
    for suffix in (".bed", ".graph"):
        SeparateByChrom.cleanup(chroms, suffix)

Example #5

Show file

def main(argv):
    """
    Split a BAM file by chromosome, optionally run strand_broken_remove on
    each chromosome, and combine the result into the output file.

    With a positive threshold, every chromosome that has a '.bed1' file is
    passed to strand_broken_remove and the '.bed2' files are combined;
    otherwise the '.bed1' files are combined unchanged. The '.bed1' and
    '.bed2' intermediates are cleaned up in both cases.

    argv -- command line arguments; fewer than 8 items prints the usage text
            and exits with status 1.
    """
    parser = OptionParser()

    # Options that share the full store/type/dest/help/metavar shape, added
    # in their original order so the usage text is unchanged.
    stored_options = (
        ("-s", "--species", "string", "species",
         "species under consideration", "<str>"),
        ("-b", "--raw_bam_file", "string", "bam_file",
         "raw bam file", "<file>"),
        ("-t", "--threshold", "int", "threshold",
         "threshold for copy number", "<int>"),
        ("-o", "--output_file_name", "string", "out_file",
         "output file name", "<file>"),
    )
    for short_flag, long_flag, value_type, dest, help_text, metavar in stored_options:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, help=help_text,
                          metavar=metavar)

    # Read-filter options; dest is left to optparse's default naming.
    read_filters = (
        ("-f", "--requiredFlag",
         "Required bit in sam flag. Same as samtools view -f"),
        ("-F", "--filterFlag",
         "Filter out bit in sam flag, Same as samtools view -F"),
        ("-q", "--mapq",
         "minimum mapq for a read to be kept"),
    )
    for short_flag, long_flag, help_text in read_filters:
        parser.add_option(short_flag, long_flag, type='int', help=help_text)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    bam_chroms = SeparateByChrom.getChromsFromBam(opt.bam_file)

    SeparateByChrom.separateByChromBamToBed(bam_chroms, opt.bam_file, '.bed1',
                                            requiredFlag=opt.requiredFlag,
                                            filterFlag=opt.filterFlag,
                                            mapq=opt.mapq)

    apply_threshold = opt.threshold > 0
    if apply_threshold:
        for chrom_name in bam_chroms:
            if Utility.fileExists(chrom_name + ".bed1"):
                strand_broken_remove(chrom_name, opt.threshold)

    # '.bed2' files are combined only when the threshold step ran.
    suffix_to_combine = '.bed2' if apply_threshold else '.bed1'
    SeparateByChrom.combineAllGraphFilesBedToBam(bam_chroms, suffix_to_combine,
                                                 opt.bam_file, opt.out_file)

    SeparateByChrom.cleanup(bam_chroms, '.bed1')
    SeparateByChrom.cleanup(bam_chroms, '.bed2')

Example #6

Show file

def _count_reads_per_island(read_file, island_start_list, island_end_list,
                            fragment_size):
    """
    Count, for every island, the reads of read_file that are assigned to it.

    Lines starting with '#' are skipped. Each remaining line is split on
    whitespace and positioned with associate_tags_with_regions.tag_position;
    find_readcount_on_islands then returns the island index, where a negative
    index means the read is not counted.

    Returns a list of counts parallel to island_start_list.
    """
    counts = [0] * len(island_start_list)
    with open(read_file, 'r') as reads:
        for line in reads:
            if line.startswith("#"):
                continue
            position = associate_tags_with_regions.tag_position(
                line.split(), fragment_size)
            index = associate_tags_with_regions.find_readcount_on_islands(
                island_start_list, island_end_list, position)
            if index >= 0:
                counts[index] += 1
    return counts


def main(argv):
    """
    Count chip and control reads on islands and write a per-island summary.

    For every island the output file gets one tab separated line with
    chrom, start, end, chip read count, control read count, p-value,
    fold change and alpha. alpha is pvalue * number_of_islands / rank of the
    p-value, capped at 1.

    The chip and control BAM files are separated by chromosome into '.bed1'
    and '.bed2' intermediates, which are cleaned up at the end. Library
    sizes go to stderr and the on-island totals to stdout.

    argv -- command line arguments, passed to OptionParser.parse_args().
            The process exits with status 1 when either read file is not
            found.
    """
    parser = OptionParser()
    # Help text for -a says BAM: the file is read with getChromsFromBam,
    # get_total_tag_counts_bam and separateByChromBamToBed.
    parser.add_option("-a",
                      "--rawchipreadfile",
                      action="store",
                      type="string",
                      dest="chipreadfile",
                      metavar="<file>",
                      help="raw read file from chip in BAM format")
    parser.add_option("-b",
                      "--rawcontrolreadfile",
                      action="store",
                      type="string",
                      dest="controlreadfile",
                      metavar="<file>",
                      help="raw read file from control in BAM format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandfile",
                      metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="island read count summary file")
    # NOTE(review): this long flag is registered with a trailing space; kept
    # as-is so existing command lines still work.
    parser.add_option("-t",
                      "--mappable_fraction_of_genome_size ",
                      action="store",
                      type="float",
                      dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")

    (opt, args) = parser.parse_args(argv)

    # Check both inputs before anything reads them, so a missing file is
    # reported with a clear message instead of failing part-way through.
    for read_path in (opt.chipreadfile, opt.controlreadfile):
        if not Utility.fileExists(read_path):
            sys.stderr.write("%s not found\n" % read_path)
            sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.chipreadfile)
    genomesize = sum(chromsDict.values()) * opt.fraction

    chip_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.controlreadfile)
    sys.stderr.write("chip library size  %s\n" % (chip_library_size))
    sys.stderr.write("control library size %s\n" % (control_library_size))

    islands = BED.BED(chromsDict.keys(), opt.islandfile, "BED3", 0)

    # Separate both libraries by chromosome.
    SeparateByChrom.separateByChromBamToBed(chromsDict.keys(),
                                            opt.chipreadfile, '.bed1')
    SeparateByChrom.separateByChromBamToBed(chromsDict.keys(),
                                            opt.controlreadfile, '.bed2')

    totalchip = 0
    totalcontrol = 0
    island_chip_readcount = {}
    island_control_readcount = {}

    for chrom in chromsDict:
        if chrom not in islands.keys() or len(islands[chrom]) == 0:
            continue
        island_list = islands[chrom]
        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))

        island_start_list = [island.start for island in island_list]
        island_end_list = [island.end for island in island_list]

        chip_counts = _count_reads_per_island(
            chrom + ".bed1", island_start_list, island_end_list,
            opt.fragment_size)
        island_chip_readcount[chrom] = chip_counts
        totalchip += sum(chip_counts)

        control_counts = _count_reads_per_island(
            chrom + ".bed2", island_start_list, island_end_list,
            opt.fragment_size)
        island_control_readcount[chrom] = control_counts
        totalcontrol += sum(control_counts)

    scaling_factor = chip_library_size * 1.0 / control_library_size

    # Same text the former Python 2 print statements produced (the string,
    # a separating space, then the number).
    sys.stdout.write(
        "Total number of chip reads on islands is:  %s\n" % totalchip)
    sys.stdout.write(
        "Total number of control reads on islands is:  %s\n" % totalcontrol)

    pvalue_list = []
    result_list = []
    for chrom in sorted(chromsDict.keys()):
        if chrom not in islands.keys() or len(islands[chrom]) == 0:
            continue
        island_list = islands[chrom]
        for index, item in enumerate(island_list):
            observation = island_chip_readcount[chrom][index]
            control_tag = island_control_readcount[chrom][index]
            if control_tag > 0:
                average = control_tag * scaling_factor
            else:
                # No control reads on the island: use the genome-wide control
                # density over the island length, capped at 0.25 reads.
                length = item.end - item.start + 1
                average = length * control_library_size * 1.0 / genomesize
                average = min(0.25, average) * scaling_factor
            fc = float(observation) / float(average)
            if observation > average:
                pvalue = scipy.stats.poisson.sf(observation, average)[()]
            else:
                pvalue = 1
            pvalue_list.append(pvalue)
            result_list.append((item.chrom, item.start, item.end,
                                observation, control_tag, pvalue, fc))

    # rankdata accepts the plain list directly.
    pvaluerankarray = scipy.stats.rankdata(pvalue_list)
    totalnumber = len(result_list)
    with open(opt.out_file, 'w') as out:
        for i, record in enumerate(result_list):
            alpha = pvalue_list[i] * totalnumber / pvaluerankarray[i]
            if alpha > 1:
                alpha = 1
            fields = [str(value) for value in record]
            fields.append(str(alpha))
            out.write("\t".join(fields) + "\n")

    SeparateByChrom.cleanup(chromsDict.keys(), '.bed1')
    SeparateByChrom.cleanup(chromsDict.keys(), '.bed2')

Example #7

Show file

File: associate_tags_with_chip_and_control_w_fc_q_bam.py Project: dariober/SICERpy

def main(argv):
    """
    Associate chip and control reads with islands and report, per island,
    the read counts, a Poisson p-value, the fold change and alpha.

    Output (opt.out_file) is one tab separated line per island:
    chrom, start, end, chip count, control count, pvalue, fc, alpha, where
    alpha = pvalue * number_of_islands / rank(pvalue), capped at 1.

    Both BAM inputs are separated by chromosome into '.bed1' (chip) and
    '.bed2' (control) intermediates that are cleaned up at the end. Library
    sizes are written to stderr, on-island totals to stdout.

    argv -- command line arguments, passed to OptionParser.parse_args().
            Exits with status 1 when a read file is not found.
    """
    parser = OptionParser()
    # The chip file is read as BAM (getChromsFromBam, get_total_tag_counts_bam,
    # separateByChromBamToBed), so the -a help text says BAM.
    parser.add_option("-a", "--rawchipreadfile", action="store", type="string", dest="chipreadfile", metavar="<file>", help="raw read file from chip in BAM format")
    parser.add_option("-b", "--rawcontrolreadfile", action="store", type="string", dest="controlreadfile", metavar="<file>", help="raw read file from control in BAM format")
    parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after CHIP experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file")
    # NOTE(review): long flag has a trailing space; left untouched for
    # compatibility with existing command lines.
    parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")

    (opt, args) = parser.parse_args(argv)

    # Validate the inputs up front: previously these checks ran only after
    # both files had already been read.
    if not Utility.fileExists(opt.chipreadfile):
        sys.stderr.write("%s not found\n" % opt.chipreadfile)
        sys.exit(1)
    if not Utility.fileExists(opt.controlreadfile):
        sys.stderr.write("%s not found\n" % opt.controlreadfile)
        sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.chipreadfile)
    genomesize = sum(chromsDict.values()) * opt.fraction

    chip_library_size = get_total_tag_counts.get_total_tag_counts_bam(opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts_bam(opt.controlreadfile)
    sys.stderr.write("chip library size  %s\n" % (chip_library_size))
    sys.stderr.write("control library size %s\n" % (control_library_size))

    islands = BED.BED(chromsDict.keys(), opt.islandfile, "BED3", 0)

    # Separate by chrom the chip library, then the control library.
    SeparateByChrom.separateByChromBamToBed(chromsDict.keys(), opt.chipreadfile, '.bed1')
    SeparateByChrom.separateByChromBamToBed(chromsDict.keys(), opt.controlreadfile, '.bed2')

    island_chip_readcount = {}
    island_control_readcount = {}
    # Per-chromosome read file suffix -> store for that library's counts.
    libraries = (
        ('.bed1', island_chip_readcount),
        ('.bed2', island_control_readcount),
    )
    reads_on_islands = {'.bed1': 0, '.bed2': 0}

    for chrom in chromsDict:
        if chrom in islands.keys() and len(islands[chrom]) != 0:
            island_list = islands[chrom]
            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'))

            island_start_list = []
            island_end_list = []
            for item in island_list:
                island_start_list.append(item.start)
                island_end_list.append(item.end)

            for suffix, readcount_by_chrom in libraries:
                readcount_list = [0] * len(island_list)
                # 'with' guarantees the read file is closed on any error.
                with open(chrom + suffix, 'r') as f:
                    for line in f:
                        if re.match("#", line):
                            continue
                        sline = line.strip().split()
                        position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position)
                        if index >= 0:
                            readcount_list[index] += 1
                            reads_on_islands[suffix] += 1
                readcount_by_chrom[chrom] = readcount_list

    totalchip = reads_on_islands['.bed1']
    totalcontrol = reads_on_islands['.bed2']
    scaling_factor = chip_library_size * 1.0 / control_library_size

    # Byte-identical to the former Python 2 print statements, which put a
    # separating space between the string and the number.
    sys.stdout.write("Total number of chip reads on islands is:  %s\n" % totalchip)
    sys.stdout.write("Total number of control reads on islands is:  %s\n" % totalcontrol)

    pvalue_list = []
    result_list = []
    for chrom in sorted(chromsDict.keys()):
        if chrom in islands.keys() and len(islands[chrom]) != 0:
            per_island = zip(islands[chrom],
                             island_chip_readcount[chrom],
                             island_control_readcount[chrom])
            for item, observation, control_tag in per_island:
                if control_tag > 0:
                    average = control_tag * scaling_factor
                else:
                    # Island without control reads: expected control reads
                    # from the genome-wide density, capped at 0.25.
                    length = item.end - item.start + 1
                    average = length * control_library_size * 1.0 / genomesize
                    average = min(0.25, average) * scaling_factor
                fc = float(observation) / float(average)
                if observation > average:
                    pvalue = scipy.stats.poisson.sf(observation, average)[()]
                else:
                    pvalue = 1
                pvalue_list.append(pvalue)
                item_dic = {
                    'chrom': item.chrom,
                    'start': item.start,
                    'end': item.end,
                    'chip': observation,
                    'control': control_tag,
                    'pvalue': pvalue,
                    'fc': fc,
                }
                result_list.append(item_dic)

    # rankdata takes the list as-is; no intermediate array is needed.
    pvaluerankarray = scipy.stats.rankdata(pvalue_list)
    totalnumber = len(result_list)
    columns = ('start', 'end', 'chip', 'control', 'pvalue', 'fc')
    with open(opt.out_file, 'w') as out:
        for i in range(totalnumber):
            item = result_list[i]
            alpha = pvalue_list[i] * totalnumber / pvaluerankarray[i]
            if alpha > 1:
                alpha = 1
            outline = item['chrom']
            for column in columns:
                outline += "\t" + str(item[column])
            outline += "\t" + str(alpha) + "\n"
            out.write(outline)

    SeparateByChrom.cleanup(chromsDict.keys(), '.bed1')
    SeparateByChrom.cleanup(chromsDict.keys(), '.bed2')

Example #8

Show file

File: find_islands_in_pr.py Project: jrtejedor/SICERpy

def main(argv):
    """
    Probability scoring with random background model.

    Reads a summary graph, replaces each window's read count with a score,
    keeps only the positively scored windows, merges them into islands and
    writes the islands whose score passes the background-derived threshold
    to a tab-separated file (chrom, start, end, score). Progress and
    parameters are reported on stderr.

    argv: list of command-line tokens handed to OptionParser.parse_args.
          If it holds fewer than 14 tokens the help text is printed and the
          process exits with status 1.
    """
    parser = OptionParser()

    #parser.add_option("-s", "--species", action="store", type="string", dest="species", help="mm8, hg18, background, etc", metavar="<str>")
    parser.add_option(
        "-B",
        "--bam",
        action="store",
        type="string",
        dest="bam",
        help=
        "Any suitable bam file that can be used to extrcat chroms from header",
        metavar="<str>")
    parser.add_option("-b",
                      "--summarygraph",
                      action="store",
                      type="string",
                      dest="summarygraph",
                      help="summarygraph",
                      metavar="<file>")
    parser.add_option("-w",
                      "--window_size(bp)",
                      action="store",
                      type="int",
                      dest="window_size",
                      help="window_size(in bps)",
                      metavar="<int>")
    parser.add_option("-g",
                      "--gap_size(bp)",
                      action="store",
                      type="int",
                      dest="gap",
                      help="gap size (in bps)",
                      metavar="<int>")
    parser.add_option("-t",
                      "--mappable_fraction_of_genome_size ",
                      action="store",
                      type="float",
                      dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")
    parser.add_option(
        "-e",
        "--evalue ",
        action="store",
        type="float",
        dest="evalue",
        help="evalue that determines score threshold for significant islands",
        metavar="<float>")
    parser.add_option("-f",
                      "--out_island_file",
                      action="store",
                      type="string",
                      dest="out_island_file",
                      help="output island file name",
                      metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    # Seven options, each a flag plus a value: anything shorter cannot
    # carry the full parameter set.
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    #if opt.species in GenomeData.species_chroms.keys():

    # Keys are used as the chromosome list and values are summed as the
    # genome length below.
    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(
        opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))
    genome_length = sum(chromsDict.values(
    ))  ## sum (GenomeData.species_chrom_lengths[opt.species].values());
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    # Expected number of reads per window over the effective genome.
    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(
        total_read_count, opt.window_size, opt.gap, window_pvalue,
        genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" %
                     (min_tags_in_window))

    sys.stderr.write(
        "Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n"
    )
    #read in the summary graph file
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    #generate the probscore summary graph file, only care about enrichment
    #filter the summary graph to get rid of windows whose scores are less than window_score_threshold

    filtered_bed_val = {}

    for chrom in bed_val.keys():
        if len(bed_val[chrom]) > 0:
            filtered_bed_val[chrom] = []
            # Iterate the window objects directly; each one is rescored in
            # place, so bed_val ends up holding scores instead of counts.
            for window in bed_val[chrom]:
                read_count = window.value
                if (read_count < min_tags_in_window):
                    # Ineligible window: negative score keeps it out of
                    # the filtered list.
                    score = -1
                else:
                    # NOTE(review): poisson() is defined elsewhere;
                    # presumably the Poisson probability of read_count
                    # given the window average -- confirm.
                    prob = poisson(read_count, average)
                    if prob < 1e-250:
                        #outside of the scale, take an arbitrary number.
                        score = 1000
                    else:
                        score = -log(prob)
                window.value = score
                if score > 0:
                    filtered_bed_val[chrom].append(window)

    #write the probscore summary graph file
    #Background_simulation_pr.output_bedgraph(bed_val, opt.out_sgraph_file);

    #Background_simulation_pr.output_bedgraph(filtered_bed_val, opt.out_sgraph_file+".filtered");

    sys.stderr.write("Determine the score threshold from random background\n")
    #determine threshold from random background
    # The histogram of the background distribution is not written out, so
    # no histogram file name is built here.
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # The context manager closes the island file even if island
    # construction raises part-way through.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            if len(filtered_bed_val[chrom]) > 0:
                # NOTE(review): the meaning of the trailing 2 is defined by
                # combine_proximal_islands, which is not visible here.
                islands = combine_proximal_islands(filtered_bed_val[chrom],
                                                   opt.gap, 2)
                islands = find_region_above_threshold(islands,
                                                      score_threshold)
                total_number_islands += len(islands)
                if len(islands) > 0:
                    for i in islands:
                        outline = chrom + "\t" + str(i.start) + "\t" + str(
                            i.end) + "\t" + str(i.value) + "\n"
                        outputfile.write(outline)
                else:
                    sys.stderr.write(
                        "\t" + chrom +
                        " does not have any islands meeting the required significance\n"
                    )
    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))