Esempio n. 1
0
def combine_proximal_islands(islands, gap, window_size_buffer=3):
    """
    Merge islands that lie within a short distance of each other.

    islands: a list of BED_GRAPH-style objects exposing start, end and
        value attributes. If Utility.is_bed_sorted reports the list as
        unsorted it is sorted in place by start. The island objects are
        modified: each merged region reuses the first island object of
        its run, with end and value updated.
    gap: the largest distance (next.start - current.end) still bridged.
        Use 0 when no gap is allowed, or one window size (e.g. 200) to
        allow a single empty window.
    window_size_buffer: extra slack added to gap before the comparison.

    Return a new list of combined regions; an empty input yields [].
    """
    merge_distance = gap + window_size_buffer
    merged = []
    if len(islands) == 0:
        return merged

    if not Utility.is_bed_sorted(islands):
        islands.sort(key=operator.attrgetter('start'))

    current = islands[0]
    for candidate in islands[1:]:
        if candidate.start - current.end <= merge_distance:
            # Sorting is by start only, so a candidate may end before the
            # region built so far; never let the merged end move backwards.
            current.end = max(current.end, candidate.end)
            current.value += candidate.value
        else:
            merged.append(current)
            current = candidate
    # The region still being built when the input runs out.
    merged.append(current)
    return merged
Esempio n. 2
0
def find_read_copy_distribution(sorted_bed_list):
    """
    Build the histogram of tag copy numbers.

    Input:
        sorted_bed_list: a list of sorted bed6 objects. The tags are
        assumed to come from one chromosome and one direction.
    Return: a list in which entry c is the number of distinct start
        positions carried by exactly c tags. The list has at least 100
        entries and grows as needed so that the largest copy number
        seen is a valid index.
    """
    assert (Utility.is_bed_sorted(sorted_bed_list) == 1)

    histogram = [0] * 100

    def record(copies):
        # Grow the histogram just enough for `copies` to be a valid index.
        shortfall = copies + 1 - len(histogram)
        if shortfall > 0:
            histogram.extend([0] * shortfall)
        histogram[copies] += 1

    run_start = None
    run_length = 0
    for tag in sorted_bed_list:
        if run_length and tag.start != run_start:
            # A new start position closes the run counted so far.
            record(run_length)
            run_length = 0
        if run_length == 0:
            run_start = tag.start
        run_length += 1
    # The last run has no following tag to close it.
    if run_length:
        record(run_length)
    return histogram
Esempio n. 3
0
def find_multi_copy_reads(sorted_bed_list, threshold):
    """
    Collect the reads whose start position is shared by many tags.

    Input:
        sorted_bed_list: a list of sorted bed6 objects. The tags are
            assumed to come from one chromosome and one direction.
        threshold: the minimum copy number for a read to be reported.
    Return: the list of BED6 reads whose copy number is above or equal
        to threshold; every copy of such a read is included, in input
        order.
    """
    assert (Utility.is_bed_sorted(sorted_bed_list) == 1)

    selected = []
    run = []
    run_start = None
    for tag in sorted_bed_list:
        if run and tag.start != run_start:
            # The run of identical starts just ended; keep it if large enough.
            if len(run) >= threshold:
                selected.extend(run)
            run = []
        if not run:
            run_start = tag.start
        run.append(tag)
    # The final run is not followed by a differing tag, so check it here.
    if run and len(run) >= threshold:
        selected.extend(run)
    return selected
Esempio n. 4
0
def find_n_copy_reads(sorted_bed_list, n):
    """
    Collect the reads whose start position is shared by exactly n tags.

    Input:
        sorted_bed_list: a list of sorted bed6 objects. The tags are
            assumed to come from one chromosome and one direction.
        n: the copy number a read must have to be reported.
    Return: the list of BED6 reads with copy number equal to n; every
        copy of such a read is included, in input order.
    """
    assert (Utility.is_bed_sorted(sorted_bed_list) == 1)

    n_copy_read_list = []
    run = []
    run_start = None
    for item in sorted_bed_list:
        if run and item.start != run_start:
            # A different start closes the current run of copies.
            if len(run) == n:
                n_copy_read_list.extend(run)
            run = []
        if not run:
            run_start = item.start
        run.append(item)
    # Last run: compare against n (the earlier code referenced an
    # undefined name here and failed on every non-empty input).
    if run and len(run) == n:
        n_copy_read_list.extend(run)
    return n_copy_read_list
Esempio n. 5
0
def filter_reads(sorted_bed_list, cutoff, outfile):
    """
    Write the reads to outfile with redundant copies beyond cutoff removed.

    A read has n copies in sorted_bed_list (tags sharing one start). If
    n <= cutoff, all the copies are retained; if n > cutoff, only cutoff
    copies are retained. The record written for each retained copy is
    the first tag of its run, passed to write(tag, out).

    sorted_bed_list: list of bed objects, checked with
        Utility.is_bed_sorted.
    cutoff: the largest number of copies kept per read.
    outfile: path of the file to write. It is not created when
        sorted_bed_list is empty.

    Return: the number of reads remaining (records written).
    """
    assert (Utility.is_bed_sorted(sorted_bed_list) == 1)
    retained = 0
    if len(sorted_bed_list) == 0:
        return retained

    out = open(outfile, 'w')
    try:
        current_tag = sorted_bed_list[0]
        current_value = current_tag.start
        copies_seen = 0
        for item in sorted_bed_list:
            if item.start != current_value:
                # First copy of a new read: restart the per-read counter.
                current_tag = item
                current_value = item.start
                copies_seen = 0
            copies_seen += 1
            if copies_seen <= cutoff:
                write(current_tag, out)
                retained += 1
    finally:
        # Close the file even if write() fails part-way through.
        out.close()
    return retained
def main(argv):
    """
    Count the reads falling on each island and write them to a file.

    argv: argument list given to OptionParser.parse_args. When it has
        fewer than 10 entries the help text is printed and the process
        exits with status 1.

    Steps: the raw read file is split per chromosome into
    "<chrom>.bed1" files; each non-comment line is mapped to a position
    with tag_position and assigned to an island with
    find_readcount_on_islands (a negative index means no island). For
    every island one tab-separated line is written to the output file:
    chrom, start, end, read count, and read count scaled to reads per
    1000000 reads of the library. The per-chromosome files are removed
    afterwards and the total number of reads on islands is printed.

    Exits with status 1 when the species is unknown or the read file is
    missing.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a", "--rawreadfile", action="store", type="string",
                      dest="readfile", metavar="<file>",
                      help="raw read file in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-b", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    # print(...) with one string argument emits the same text under the
    # Python 2 print statement and the Python 3 function; multi-item
    # prints are joined with the single space the statement inserted.
    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)
    if Utility.fileExists(opt.readfile):
        SeparateByChrom.separateByChrom(chroms, opt.readfile, '.bed1')
    else:
        print(" ".join([str(opt.readfile), " not found"]))
        sys.exit(1)

    total = 0
    library_size = get_total_tag_counts.get_total_tag_counts(opt.readfile)
    scaling_factor = 1000000

    out = open(opt.out_file, 'w')
    try:
        for chrom in chroms:
            if chrom not in islands.keys():
                continue
            island_list = islands[chrom]
            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'))

            island_start_list = [item.start for item in island_list]
            island_end_list = [item.end for item in island_list]
            island_readcount_list = [0] * len(island_list)

            f = open(chrom + ".bed1", 'r')
            try:
                for line in f:
                    if line.startswith("#"):
                        continue
                    sline = line.strip().split()
                    position = tag_position(sline, opt.fragment_size)
                    index = find_readcount_on_islands(
                        island_start_list, island_end_list, position)
                    if index >= 0:
                        island_readcount_list[index] += 1
                        total += 1
            finally:
                f.close()

            for item, readcount in zip(island_list, island_readcount_list):
                normalized_read_count = (
                    readcount / float(library_size) * scaling_factor)
                out.write("\t".join([item.chrom,
                                     str(item.start),
                                     str(item.end),
                                     str(readcount),
                                     str(normalized_read_count)]) + "\n")
    finally:
        # Close the output even when a chromosome fails part-way through.
        out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    print(" ".join(["Total number of reads on islands are: ", str(total)]))
def main(argv):
    """
    Score every island for ChIP enrichment over control and write a summary.

    argv: argument list given to OptionParser.parse_args. When it has
        fewer than 14 entries the help text is printed and the process
        exits with status 1.

    Steps:
      1. Split the ChIP and control read files per chromosome into
         "<chrom>.bed1" and "<chrom>.bed2" files.
      2. Count the ChIP and control reads assigned to each island.
      3. For each island compute the expected count ("average"): the
         control count times chip_library_size / control_library_size,
         or, when the island has no control reads, the genome-wide
         control density over the island length capped at 0.25 and
         scaled the same way. The fold change is observation / average
         and the p-value is the Poisson survival function when the
         observation exceeds the average, otherwise 1.
      4. Write one tab-separated line per island: chrom, start, end,
         chip count, control count, p-value, fold change, and
         p-value * number_of_islands / rank capped at 1.
      5. Remove the per-chromosome files.

    Exits with status 1 when the species is unknown or a read file is
    missing.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a", "--rawchipreadfile", action="store",
                      type="string", dest="chipreadfile", metavar="<file>",
                      help="raw read file from chip in bed format")
    parser.add_option("-b", "--rawcontrolreadfile", action="store",
                      type="string", dest="controlreadfile",
                      metavar="<file>",
                      help="raw read file from control in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")
    # The option name previously carried a stray trailing space.
    parser.add_option("-t", "--mappable_fraction_of_genome_size",
                      action="store", type="float", dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    def report(*fields):
        # Same text as the Python 2 statement `print a, b, ...`:
        # str() of every field, separated by single spaces.
        print(" ".join([str(field) for field in fields]))

    def count_reads_on_islands(read_file, island_start_list, island_end_list):
        # Per-island read counts for one per-chromosome read file.
        counts = [0] * len(island_start_list)
        f = open(read_file, 'r')
        try:
            for line in f:
                if line.startswith("#"):
                    continue
                sline = line.strip().split()
                position = associate_tags_with_regions.tag_position(
                    sline, opt.fragment_size)
                index = associate_tags_with_regions.find_readcount_on_islands(
                    island_start_list, island_end_list, position)
                if index >= 0:
                    counts[index] += 1
        finally:
            f.close()
        return counts

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        genomesize = sum(
            GenomeData.species_chrom_lengths[opt.species].values())
        genomesize = opt.fraction * genomesize
    else:
        report("This species is not recognized, exiting")
        sys.exit(1)

    chip_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.controlreadfile)
    report("chip library size  ", chip_library_size)
    report("control library size  ", control_library_size)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the chip library
    if Utility.fileExists(opt.chipreadfile):
        SeparateByChrom.separateByChrom(chroms, opt.chipreadfile, '.bed1')
    else:
        report(opt.chipreadfile, " not found")
        sys.exit(1)
    # separate by chrom the control library
    if Utility.fileExists(opt.controlreadfile):
        SeparateByChrom.separateByChrom(chroms, opt.controlreadfile, '.bed2')
    else:
        report(opt.controlreadfile, " not found")
        sys.exit(1)

    totalchip = 0
    totalcontrol = 0
    island_chip_readcount = {}
    island_control_readcount = {}

    for chrom in chroms:
        if chrom not in islands.keys() or len(islands[chrom]) == 0:
            continue
        island_list = islands[chrom]
        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))

        island_start_list = [item.start for item in island_list]
        island_end_list = [item.end for item in island_list]

        chip_counts = count_reads_on_islands(
            chrom + ".bed1", island_start_list, island_end_list)
        totalchip += sum(chip_counts)
        island_chip_readcount[chrom] = chip_counts

        control_counts = count_reads_on_islands(
            chrom + ".bed2", island_start_list, island_end_list)
        totalcontrol += sum(control_counts)
        island_control_readcount[chrom] = control_counts

    scaling_factor = chip_library_size * 1.0 / control_library_size

    report("Total number of chip reads on islands is: ", totalchip)
    report("Total number of control reads on islands is: ", totalcontrol)

    pvalue_list = []
    result_list = []
    for chrom in chroms:
        if chrom not in islands.keys() or len(islands[chrom]) == 0:
            continue
        rows = zip(islands[chrom],
                   island_chip_readcount[chrom],
                   island_control_readcount[chrom])
        for item, observation, control_tag in rows:
            if control_tag > 0:
                average = control_tag * scaling_factor
            else:
                # No control reads on this island: fall back to the
                # genome-wide control density, capped at 0.25 reads.
                length = item.end - item.start + 1
                average = length * control_library_size * 1.0 / genomesize
                average = min(0.25, average) * scaling_factor
            fc = float(observation) / float(average)
            if observation > average:
                # [()] unwraps the zero-dimensional result into a scalar.
                pvalue = scipy.stats.poisson.sf(observation, average)[()]
            else:
                pvalue = 1
            pvalue_list.append(pvalue)
            result_list.append((item.chrom, item.start, item.end,
                                observation, control_tag, pvalue, fc))

    pvaluerankarray = scipy.stats.rankdata(scipy.array(pvalue_list))
    totalnumber = len(result_list)

    out = open(opt.out_file, 'w')
    try:
        for i, row in enumerate(result_list):
            (row_chrom, row_start, row_end,
             row_chip, row_control, row_pvalue, row_fc) = row
            alpha = row_pvalue * totalnumber / pvaluerankarray[i]
            if alpha > 1:
                alpha = 1
            out.write("\t".join([row_chrom,
                                 str(row_start),
                                 str(row_end),
                                 str(row_chip),
                                 str(row_control),
                                 str(row_pvalue),
                                 str(row_fc),
                                 str(alpha)]) + "\n")
    finally:
        # Close the summary file even if a row fails to format or write.
        out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
Esempio n. 8
0
def main(argv):
    """
    Compare the read counts of two libraries (A and B) on a set of islands.

    argv: argument list given to OptionParser.parse_args. When it has
        fewer than 12 entries the help text is printed and the process
        exits with status 1.

    Steps:
      1. Split read files A and B per chromosome into "<chrom>.bed1"
         and "<chrom>.bed2" files.
      2. Count the A and B reads assigned to each island.
      3. Compute a p-value in both directions with pvaule(), using
         A_library_size / B_library_size (or its inverse) as scaling
         factor and a pseudo count of 1, then derive FDR values with
         fdr().
      4. Write a header line and one tab-separated line per island:
         coordinates, raw and per-million normalized counts for A and
         B, and fold change / p-value / FDR in both directions.
      5. Remove the per-chromosome files and print the Pearson and
         Spearman correlations of the normalized counts.

    Exits with status 1 when the species is unknown or a read file is
    missing.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-a", "--rawreadfileA", action="store", type="string",
                      dest="readfileA", metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b", "--rawreadfileB", action="store", type="string",
                      dest="readfileB", metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    def report(*fields):
        # Same text as the Python 2 statement `print a, b, ...`:
        # str() of every field, separated by single spaces.
        print(" ".join([str(field) for field in fields]))

    def count_reads_on_islands(read_file, island_start_list, island_end_list):
        # Per-island read counts for one per-chromosome read file.
        counts = [0] * len(island_start_list)
        f = open(read_file, 'r')
        try:
            for line in f:
                if line.startswith("#"):
                    continue
                sline = line.strip().split()
                position = associate_tags_with_regions.tag_position(
                    sline, opt.fragment_size)
                index = associate_tags_with_regions.find_readcount_on_islands(
                    island_start_list, island_end_list, position)
                if index >= 0:
                    counts[index] += 1
        finally:
            f.close()
        return counts

    def has_islands(chrom):
        # True when the island file has at least one island on chrom.
        return chrom in islands.keys() and len(islands[chrom]) != 0

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        report("This species is not recognized, exiting")
        sys.exit(1)

    if not Utility.fileExists(opt.readfileA):
        report(opt.readfileA, " not found")
        sys.exit(1)
    if not Utility.fileExists(opt.readfileB):
        report(opt.readfileB, " not found")
        sys.exit(1)

    A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA)
    B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB)
    report("Library size of ", opt.readfileA, ":  ", A_library_size)
    report("Library size of ", opt.readfileB, ":  ", B_library_size)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the A library
    SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1')
    # separate by chrom the B library
    SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2')

    totalA = 0
    totalB = 0
    island_A_readcount = {}
    island_B_readcount = {}

    # Find read counts on the islands
    for chrom in chroms:
        if not has_islands(chrom):
            continue
        island_list = islands[chrom]
        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))

        island_start_list = [item.start for item in island_list]
        island_end_list = [item.end for item in island_list]

        A_counts = count_reads_on_islands(
            chrom + ".bed1", island_start_list, island_end_list)
        totalA += sum(A_counts)
        island_A_readcount[chrom] = A_counts

        B_counts = count_reads_on_islands(
            chrom + ".bed2", island_start_list, island_end_list)
        totalB += sum(B_counts)
        island_B_readcount[chrom] = B_counts

    report("Total number of A reads on islands is: ", totalA)
    report("Total number of B reads on islands is: ", totalB)

    # Calculate the p value in both directions.
    library_scaling_factor = A_library_size * 1.0 / B_library_size
    pseudo_count = 1
    pvalue_A_vs_B_list = []
    pvalue_B_vs_A_list = []
    for chrom in chroms:
        if not has_islands(chrom):
            continue
        pairs = zip(island_A_readcount[chrom], island_B_readcount[chrom])
        for Acount, Bcount in pairs:
            pvalue_A_vs_B_list.append(
                pvaule(Acount, Bcount, library_scaling_factor, pseudo_count))
            pvalue_B_vs_A_list.append(
                pvaule(Bcount, Acount, 1 / library_scaling_factor,
                       pseudo_count))

    # Calculate the FDR
    fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list)
    fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list)

    # Output the islands read counts, normalized read counts, fc, pvalue
    # both ways
    scaling_factor = 1000000
    header_fields = ['#chrom', 'start', 'end',
                     "Readcount_A", 'Normalized_Readcount_A',
                     'ReadcountB', 'Normalized_Readcount_B',
                     "Fc_A_vs_B", "pvalue_A_vs_B", "FDR_A_vs_B",
                     "Fc_B_vs_A", "pvalue_B_vs_A", "FDR_B_vs_A"]
    out = open(opt.out_file, 'w')
    try:
        out.write("\t".join(header_fields) + "\n")
        # ii runs over all islands in the order used for the p-value lists.
        ii = 0
        for chrom in chroms:
            if not has_islands(chrom):
                continue
            rows = zip(islands[chrom],
                       island_A_readcount[chrom],
                       island_B_readcount[chrom])
            for item, Acount, Bcount in rows:
                normalized_A = Acount / float(A_library_size) * scaling_factor
                normalized_B = Bcount / float(B_library_size) * scaling_factor
                fc_A_vs_B = ((Acount + pseudo_count) * 1.0 /
                             (Bcount + pseudo_count)) / library_scaling_factor
                fc_B_vs_A = ((Bcount + pseudo_count) * 1.0 /
                             (Acount + pseudo_count)) * library_scaling_factor
                out.write("\t".join([item.chrom,
                                     str(item.start),
                                     str(item.end),
                                     str(Acount),
                                     str(normalized_A),
                                     str(Bcount),
                                     str(normalized_B),
                                     str(fc_A_vs_B),
                                     str(pvalue_A_vs_B_list[ii]),
                                     str(fdr_A_vs_B_list[ii]),
                                     str(fc_B_vs_A),
                                     str(pvalue_B_vs_A_list[ii]),
                                     str(fdr_B_vs_A_list[ii])]) + "\n")
                ii += 1
    finally:
        # Close the summary file even if a row fails to format or write.
        out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')

    # Calculate the correlations using normalized read counts. Each
    # chromosome is prepended, identically for A and B, so the two
    # arrays stay aligned island by island.
    A_array = ()
    B_array = ()
    for chrom in chroms:
        if not has_islands(chrom):
            continue
        A_array = scipy.concatenate(
            (scipy.array(island_A_readcount[chrom]), A_array))
        B_array = scipy.concatenate(
            (scipy.array(island_B_readcount[chrom]), B_array))
    # Normalization to reads per million
    A_array = A_array / float(A_library_size) * scaling_factor
    B_array = B_array / float(B_library_size) * scaling_factor
    pearson = scipy.stats.pearsonr(A_array, B_array)
    report("Pearson's correlation is: ", pearson[0],
           " with p-value ", pearson[1])
    spearman = scipy.stats.spearmanr(A_array, B_array)
    report("Spearman's correlation is: ", spearman[0],
           " with p-value ", spearman[1])