Esempio n. 1
0
def main(argv):
    """Command-line entry point: randomly down-sample a raw BED file.

    Parses ``-f`` (input raw bed file), ``-n`` (number of tags to keep) and
    ``-o`` (output file), writes the sample with random_sample() and then
    prints the tag count of the output file as reported by
    get_total_tag_counts.get_total_tag_counts().

    argv: the argument list handed to OptionParser.parse_args().

    Prints the usage text and exits with status 1 when any of the three
    options is missing.
    """
    parser = OptionParser()
    parser.add_option("-f", "--rawtagfile", action="store", type="string",
                      dest="raw_bed_file", help="raw bed file",
                      metavar="<file>")
    parser.add_option("-n", "--desirednumberoftags", action="store",
                      type="int", dest="desired_number_tags",
                      help="desired number of tags", metavar="<int>")
    parser.add_option("-o", "--slicedrawtagfile", action="store",
                      type="string", dest="out_file_name",
                      help="sliced raw bed file", metavar="<file>")
    (opt, args) = parser.parse_args(argv)

    # Check the parsed values instead of len(argv): the length test rejects
    # valid "--option=value" invocations and lets a missing option slip
    # through as None, which only fails later with an obscure error.
    if None in (opt.raw_bed_file, opt.desired_number_tags, opt.out_file_name):
        parser.print_help()
        sys.exit(1)

    random_sample(opt.desired_number_tags, opt.raw_bed_file, opt.out_file_name)
    total = get_total_tag_counts.get_total_tag_counts(opt.out_file_name)
    # Single parenthesised expression: prints the same text as before.
    print("The number of tags in " + opt.out_file_name + ' is ' + str(total))
Esempio n. 2
0
def random_sample(desired_number_tags, raw_bed_file, out_file_name):
    """
    Write a random sample of lines from a raw bed file.

    desired_number_tags line indices are drawn with random.sample() out of
    the total reported by get_total_tag_counts.get_total_tag_counts(), and
    the matching lines of raw_bed_file are written to out_file_name in
    their original order.  The indices are matched against line numbers,
    so this presumably expects the reported total to equal the number of
    lines in the file -- TODO confirm.

    If a reproducible result is needed, seed the random module before
    calling this function.

    Fails an assertion when more tags are requested than the file holds.
    """
    current_total = int(get_total_tag_counts.get_total_tag_counts(raw_bed_file))
    assert current_total >= desired_number_tags, \
        "cannot sample more tags than the file contains"
    sample_list = sorted(random.sample(xrange(current_total), desired_number_tags))

    # 'with' releases both handles even when a read or write fails.
    with open(raw_bed_file, 'r') as infile:
        with open(out_file_name, 'w') as outfile:
            index = 0
            for count, line in enumerate(infile):
                # Stop once every sampled line is written.  Testing this
                # before indexing also covers desired_number_tags == 0,
                # where sample_list is empty and sample_list[0] would
                # raise IndexError.
                if index == len(sample_list):
                    break
                if count == sample_list[index]:
                    outfile.write(line)
                    index += 1
Esempio n. 3
0
def random_sample(desired_number_tags, raw_bed_file, out_file_name):
    """
    Read a raw bed file and write a random selection of its lines.

    The number of available tags is taken from
    get_total_tag_counts.get_total_tag_counts(); desired_number_tags
    indices are drawn from that range with random.sample() and the lines
    at those positions are copied to out_file_name, keeping file order.
    This presumably relies on the tag total matching the line count of
    raw_bed_file -- TODO confirm.

    If a reproducible result is needed, set the seed of the random module
    before calling this function.

    Fails an assertion when desired_number_tags exceeds the tag total.
    """
    current_total = int(
        get_total_tag_counts.get_total_tag_counts(raw_bed_file))
    assert current_total >= desired_number_tags, \
        "cannot sample more tags than the file contains"
    sample_list = random.sample(xrange(current_total), desired_number_tags)
    sample_list.sort()

    # Context managers close both files even if copying fails part-way.
    with open(raw_bed_file, 'r') as infile:
        with open(out_file_name, 'w') as outfile:
            pending = iter(sample_list)
            # wanted is None when nothing is left to copy; this also
            # handles desired_number_tags == 0, which used to raise
            # IndexError on the first line of a non-empty file.
            wanted = next(pending, None)
            for count, line in enumerate(infile):
                if wanted is None:
                    break
                if count == wanted:
                    outfile.write(line)
                    wanted = next(pending, None)
Esempio n. 4
0
def main(argv):
    """Command-line entry point: randomly down-sample a raw BED file.

    Options: ``-f`` raw bed file to sample from, ``-n`` desired number of
    tags, ``-o`` file receiving the sampled tags.  After random_sample()
    has written the output, its tag count (from
    get_total_tag_counts.get_total_tag_counts()) is printed.

    argv: the argument list handed to OptionParser.parse_args().

    Prints the usage text and exits with status 1 when any option is
    missing.
    """
    # The indentation is uniform throughout; the original mixed spaces and
    # a tab on one line, which fails under "python -tt" and Python 3.
    parser = OptionParser()
    parser.add_option("-f", "--rawtagfile", action="store", type="string",
                      dest="raw_bed_file", help="raw bed file",
                      metavar="<file>")
    parser.add_option("-n", "--desirednumberoftags", action="store", type="int",
                      dest="desired_number_tags", help="desired number of tags",
                      metavar="<int>")
    parser.add_option("-o", "--slicedrawtagfile", action="store", type="string",
                      dest="out_file_name", help="sliced raw bed file",
                      metavar="<file>")
    (opt, args) = parser.parse_args(argv)

    # Validate what was parsed, not how many words were typed: counting
    # argv entries rejects "--option=value" and accepts missing options.
    required = (opt.raw_bed_file, opt.desired_number_tags, opt.out_file_name)
    if None in required:
        parser.print_help()
        sys.exit(1)

    random_sample(opt.desired_number_tags, opt.raw_bed_file, opt.out_file_name)
    total = get_total_tag_counts.get_total_tag_counts(opt.out_file_name)
    # Single parenthesised expression: prints the same text as before.
    print("The number of tags in " + opt.out_file_name + ' is ' + str(total))
Esempio n. 5
0
def slice(desired_number_tags, raw_bed_file, out_file_name):
	"""
	Read a raw bed file and take thedesired number of lines
	
	"""
	current_total = get_total_tag_counts.get_total_tag_counts(raw_bed_file);
	if current_total <= desired_number_tags:
		# copy the file.
		shutil.copy(raw_bed_file, out_file_name); 
		print "existing number of tags is ", current_total,  "<= desired, no need to sample";
	else:
		count =0.0;
		infile = open(raw_bed_file,'r');
		outfile = open(out_file_name, 'w');
		for line in infile:
			if count >= desired_number_tags: break;	
			outfile.write(line);
			count +=1;
		outfile.close();
		infile.close();
Esempio n. 6
0
def slice(desired_number_tags, raw_bed_file, out_file_name):
    """
	Read a raw bed file and take thedesired number of lines
	
	"""
    current_total = get_total_tag_counts.get_total_tag_counts(raw_bed_file)
    if current_total <= desired_number_tags:
        # copy the file.
        shutil.copy(raw_bed_file, out_file_name)
        print "existing number of tags is ", current_total, "<= desired, no need to sample"
    else:
        count = 0.0
        infile = open(raw_bed_file, 'r')
        outfile = open(out_file_name, 'w')
        for line in infile:
            if count >= desired_number_tags: break
            outfile.write(line)
            count += 1
        outfile.close()
        infile.close()
def main(argv):
	parser = OptionParser()
	parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>")
	parser.add_option("-a", "--rawreadfileA", action="store", type="string", dest="readfileA", metavar="<file>", help="raw read file A in bed format")
	parser.add_option("-b", "--rawreadfileB", action="store", type="string", dest="readfileB", metavar="<file>", help="raw read file B in bed format")
	parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after A experiment")
	parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format")
	parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 12:
        	parser.print_help()
        	sys.exit(1)
		
	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species];
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	if not Utility.fileExists(opt.readfileA):
		print opt.readfileA, " not found";
		sys.exit(1)
	if not Utility.fileExists(opt.readfileB):
		print opt.readfileB, " not found";
		sys.exit(1)	
	
	A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA);
	B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB);
	print "Library size of ", opt.readfileA, ":  ", A_library_size
	print "Library size of ", opt.readfileB, ":  ", B_library_size
	
	totalA = 0;
	totalB = 0;
	
	islands = BED.BED(opt.species, opt.islandfile, "BED3", 0);
	
	# separate by chrom the A library
	SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1');
	# separate by chrom the B library
	SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2');
	
	
	island_A_readcount = {};
	island_B_readcount = {};
	
	#Find read counts on the islands
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				if Utility.is_bed_sorted(island_list) == 0:
					island_list.sort(key=operator.attrgetter('start'));
					
				island_start_list = []
				island_end_list = []
				for item in island_list:
					island_start_list.append(item.start)
					island_end_list.append(item.end)
	
				island_A_readcount_list=[0]*len(island_list);
				read_file = chrom + ".bed1";
				f = open(read_file,'r')
				for line in f:
					if not re.match("#", line):
						line = line.strip()
						sline = line.split()
						position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
						index =associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position);
						if index >= 0:
							island_A_readcount_list[index] += 1;
							totalA += 1;
				f.close();
				island_A_readcount[chrom] = island_A_readcount_list;
							
				island_B_readcount_list=[0]*len(island_list);
				read_file = chrom + ".bed2";
				f = open(read_file,'r')
				for line in f:
					if not re.match("#", line):
						line = line.strip()
						sline = line.split()
						position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
						index = associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position);
						if index >= 0:
							island_B_readcount_list[index] += 1;
							totalB += 1;
				f.close();		
				island_B_readcount[chrom] = island_B_readcount_list;			
						
	#A_background_read = A_library_size - totalA;
	#B_background_read = B_library_size - totalB;
	
	print "Total number of A reads on islands is: ", totalA; 
	print "Total number of B reads on islands is: ", totalB; 

	# Calculate the p value.
	library_scaling_factor = A_library_size*1.0/B_library_size; #A vs B
	pseudo_count = 1; 
	pvalue_A_vs_B_list = [];
	pvalue_B_vs_A_list = [];
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				for index in xrange(len(island_list)):
					item = island_list[index];
					Acount = (island_A_readcount[chrom])[index]; 
					Bcount = (island_B_readcount[chrom])[index];
					pvalue_A_vs_B = pvaule (Acount, Bcount, library_scaling_factor, pseudo_count);
					pvalue_A_vs_B_list.append(pvalue_A_vs_B);
					pvalue_B_vs_A = pvaule (Bcount, Acount, 1/library_scaling_factor, pseudo_count);
					pvalue_B_vs_A_list.append(pvalue_B_vs_A);
	#Calculate the FDR
	fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list);
	fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list);


	#Output the islands read counts, normalized read counts, fc, pvalue both ways
	scaling_factor = 1000000; 
	out = open(opt.out_file, 'w');
	outline = '#chrom' + "\t" + 'start' + "\t" + 'end' + "\t" + "Readcount_A" + "\t" + 'Normalized_Readcount_A' + "\t" + 'ReadcountB' + "\t" + 'Normalized_Readcount_B' + "\t" + "Fc_A_vs_B" + "\t" + "pvalue_A_vs_B" + "\t" + "FDR_A_vs_B" + "\t" + "Fc_B_vs_A" + "\t" + "pvalue_B_vs_A" + "\t" + "FDR_B_vs_A"  + "\n"; 	
	out.write(outline);
	ii=0;
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				for index in xrange(len(island_list)):
					item = island_list[index];
					Acount = (island_A_readcount[chrom])[index]; 
					Bcount = (island_B_readcount[chrom])[index];
					normalized_A = Acount/ float(A_library_size) * scaling_factor;
					normalized_B = Bcount/ float(B_library_size) * scaling_factor;
					fc_A_vs_B = ((Acount + pseudo_count)*1.0/(Bcount + pseudo_count))/library_scaling_factor;
					fc_B_vs_A = ((Bcount + pseudo_count)*1.0/(Acount + pseudo_count)) * library_scaling_factor;
					outline = item.chrom + "\t" + str(item.start) + "\t" + str(item.end) + "\t" + str(Acount) + "\t"  +  str(normalized_A) + "\t"  +  str(Bcount) + "\t" + str(normalized_B) + "\t" +  str(fc_A_vs_B) + "\t" + str(pvalue_A_vs_B_list[ii]) + "\t" + str(fdr_A_vs_B_list[ii]) + "\t" + str(fc_B_vs_A) + "\t" + str(pvalue_B_vs_A_list[ii]) + "\t" + str(fdr_B_vs_A_list[ii]) + "\n";	
					out.write(outline);
					ii += 1;		
	out.close();

	SeparateByChrom.cleanup(chroms, '.bed1');
	SeparateByChrom.cleanup(chroms, '.bed2');


	# Calculate the correlations using normalized read counts
	A_array=();
	B_array=();
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				temp_array= scipy.array(island_A_readcount[chrom]);
				A_array=scipy.concatenate((temp_array, A_array));
				temp_array= scipy.array(island_B_readcount[chrom]);
				B_array=scipy.concatenate((temp_array, B_array));
	#Normalization to reads per million
	A_array = A_array/float(A_library_size) * scaling_factor;
	B_array = B_array/float(B_library_size) * scaling_factor;
	pearson=scipy.stats.pearsonr(A_array, B_array);
	print "Pearson's correlation is: ", pearson[0], " with p-value ",  pearson[1];
	spearman = scipy.stats.spearmanr(A_array, B_array);
	print "Spearman's correlation is: ", spearman[0], " with p-value ",  spearman[1];
Esempio n. 8
0
def main(argv):
    parser = OptionParser()

    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedfile",
                      metavar="<file>",
                      help="ChIP seq read file")
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option("-t",
                      "--RE_tree_pickle_file",
                      action="store",
                      type="string",
                      dest="RE_Tree",
                      metavar="<file>",
                      help="file with RE tree in pickle format")
    parser.add_option(
        "-l",
        "--RE_annotation_file_location",
        action="store",
        type="string",
        dest="RE_file_location",
        metavar="<file>",
        help="location of RE files named in repClass_repFamily_repName.txt")
    parser.add_option("-u",
                      "--upstream_extension",
                      action="store",
                      type="int",
                      dest="upstream_extension",
                      help="upstream extension from start",
                      metavar="<int>")
    parser.add_option("-d",
                      "--downstream_extension",
                      action="store",
                      type="int",
                      dest="downstream_extension",
                      help="downstream extension from end",
                      metavar="<int>")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-n",
                      "--feature_name",
                      action="store",
                      type="string",
                      dest="feature_name",
                      help="name of the library",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 16:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()
    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    total_count = get_total_tag_counts.get_total_tag_counts(opt.bedfile)

    #Separate_by_chrom on bedfile
    lib_name = (opt.bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(opt.bedfile):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)
    else:
        print bedfile, " is not found"
        sys.exit(1)

    #load the RE tree to get the RE file names
    re_tree = pickle.load(open(opt.RE_Tree, 'rb'))
    (numb_classes, numb_families, numb_names) = numbers(re_tree)
    print "There are %d classes, %d family, and %d names." % (
        numb_classes, numb_families, numb_names)

    #Prepare the summary
    read_counts = {}
    for reClass in re_tree.keys():
        read_counts[reClass] = {}
        for reFamily in re_tree[reClass].keys():
            read_counts[reClass][reFamily] = {}
            for reName in re_tree[reClass][reFamily]:
                read_counts[reClass][reFamily][reName] = {}

    #cycle through chrom
    for chrom in chroms:
        print chrom
        chrom_length = chrom_lengths[chrom]
        chrombed = chrom + extension
        if Utility_extended.fileExists(chrombed):
            # load in each read and shift
            tag_position_list = []
            inf = open(chrombed, 'r')
            for line in inf:
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    tag_position_list.append(
                        associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size))
            inf.close()
            if not Utility_extended.is_list_sorted(tag_position_list):
                tag_position_list.sort()  #[tag_positions]

        min_re_length = 10
        for reClass in re_tree.keys():
            for reFamily in re_tree[reClass].keys():
                for reName in re_tree[reClass][reFamily]:
                    re_file_name = "_".join([reClass, reFamily, reName
                                             ]) + ".txt"
                    #{id:{feature_name:value}}
                    rc_dic = get_read_count(
                        opt.RE_file_location, re_file_name, opt.feature_name,
                        chrom, chrom_length, tag_position_list, total_count,
                        opt.upstream_extension, opt.downstream_extension,
                        min_re_length)
                    # id is unique and updated only once, so this should be ok
                    read_counts[reClass][reFamily][reName].update(rc_dic)

    #{reClass:{reFamily:{reName:{id:feature_name, value}}}}
    #feature_name include: feature_name + "_rc", feature_name + "_rpkm"
    #output_file_name = feature_name + "_on_" + "mm9_rmsk.pkl"
    #output = open(output_file_name, 'wb')
    #pickle.dump(read_counts, output)
    #output.close()

    #instead of outputing a huge one, let's output many small pieces
    breakdown_and_output(read_counts, opt.feature_name)

    repClass = 'LTR'
    repFamily = 'ERV1'
    repName = 'RLTR4_Mm'
    outfile_name = lib_name + "_on_" + "_".join([repClass, repFamily, repName
                                                 ]) + ".dat"
    test(read_counts, repClass, repFamily, repName, outfile_name)

    SeparateByChrom.cleanup(chroms, extension)

    print "it took", time.time() - startTime, "seconds."
Esempio n. 9
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a",
                      "--rawchipreadfile",
                      action="store",
                      type="string",
                      dest="chipreadfile",
                      metavar="<file>",
                      help="raw read file from chip in bed format")
    parser.add_option("-b",
                      "--rawcontrolreadfile",
                      action="store",
                      type="string",
                      dest="controlreadfile",
                      metavar="<file>",
                      help="raw read file from control in bed format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandfile",
                      metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="island read count summary file")
    parser.add_option("-t",
                      "--mappable_fraction_of_genome_size ",
                      action="store",
                      type="float",
                      dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        genomesize = sum(
            GenomeData.species_chrom_lengths[opt.species].values())
        genomesize = opt.fraction * genomesize
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    chip_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.controlreadfile)
    print "chip library size  ", chip_library_size
    print "control library size  ", control_library_size

    totalchip = 0
    totalcontrol = 0

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the chip library
    if Utility.fileExists(opt.chipreadfile):
        SeparateByChrom.separateByChrom(chroms, opt.chipreadfile, '.bed1')
    else:
        print opt.chipreadfile, " not found"
        sys.exit(1)
    # separate by chrom the control library
    if Utility.fileExists(opt.controlreadfile):
        SeparateByChrom.separateByChrom(chroms, opt.controlreadfile, '.bed2')
    else:
        print opt.controlreadfile, " not found"
        sys.exit(1)

    island_chip_readcount = {}
    island_control_readcount = {}

    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                if Utility.is_bed_sorted(island_list) == 0:
                    island_list.sort(key=operator.attrgetter('start'))

                island_start_list = []
                island_end_list = []
                for item in island_list:
                    island_start_list.append(item.start)
                    island_end_list.append(item.end)

                island_chip_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed1"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_chip_readcount_list[index] += 1
                            totalchip += 1
                f.close()
                island_chip_readcount[chrom] = island_chip_readcount_list

                island_control_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed2"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_control_readcount_list[index] += 1
                            totalcontrol += 1
                f.close()

                island_control_readcount[chrom] = island_control_readcount_list

    chip_background_read = chip_library_size - totalchip
    control_background_read = control_library_size - totalcontrol
    #scaling_factor = chip_background_read*1.0/control_background_read;
    scaling_factor = chip_library_size * 1.0 / control_library_size

    print "Total number of chip reads on islands is: ", totalchip
    print "Total number of control reads on islands is: ", totalcontrol

    #print "chip_background_read   ", chip_background_read
    #print "control_background_read   ", control_background_read

    out = open(opt.out_file, 'w')
    pvalue_list = []
    result_list = []
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    observation = (island_chip_readcount[chrom])[index]
                    control_tag = (island_control_readcount[chrom])[index]
                    if (island_control_readcount[chrom])[index] > 0:
                        #average = (island_control_readcount[chrom])[index] * scaling_factor;
                        average = control_tag * scaling_factor
                        fc = float(observation) / float(average)
                    else:
                        length = item.end - item.start + 1
                        average = length * control_library_size * 1.0 / genomesize
                        average = min(0.25, average) * scaling_factor
                        fc = float(observation) / float(average)
                    if observation > average:
                        pvalue = scipy.stats.poisson.sf(
                            (island_chip_readcount[chrom])[index], average)[()]
                    else:
                        pvalue = 1
                    pvalue_list.append(pvalue)
                    item_dic = {}
                    item_dic['chrom'] = item.chrom
                    item_dic['start'] = item.start
                    item_dic['end'] = item.end
                    item_dic['chip'] = observation
                    item_dic['control'] = control_tag
                    item_dic['pvalue'] = pvalue
                    item_dic['fc'] = fc
                    result_list.append(item_dic)

    pvaluearray = scipy.array(pvalue_list)
    pvaluerankarray = scipy.stats.rankdata(pvaluearray)
    totalnumber = len(result_list)
    for i in range(totalnumber):
        item = result_list[i]
        alpha = pvalue_list[i] * totalnumber / pvaluerankarray[i]
        if alpha > 1:
            alpha = 1
        outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(
            item['end']) + "\t" + str(item['chip']) + "\t" + str(
                item['control']) + "\t" + str(item['pvalue']) + "\t" + str(
                    item['fc']) + "\t" + str(alpha) + "\n"
        out.write(outline)

    #pvalue_list.sort()
    #for item in result_list:
    #pvalue = float(item['pvalue'])
    #alpha = pvalue * len(result_list) / (pvalue_list.index(pvalue) + 1)
    #if alpha > 1:
    #alpha = 1;
    #outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n";
    #out.write(outline);
    out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
Esempio n. 10
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a",
                      "--rawreadfile",
                      action="store",
                      type="string",
                      dest="readfile",
                      metavar="<file>",
                      help="raw read file in bed format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-b",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandfile",
                      metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="island read count file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)
    if Utility.fileExists(opt.readfile):
        SeparateByChrom.separateByChrom(chroms, opt.readfile, '.bed1')
    else:
        print opt.readfile, " not found"
        sys.exit(1)

    total = 0
    library_size = get_total_tag_counts.get_total_tag_counts(opt.readfile)

    scaling_factor = 1000000
    out = open(opt.out_file, 'w')
    for chrom in chroms:
        if chrom in islands.keys():
            island_list = islands[chrom]
            island_readcount_list = [0] * len(island_list)

            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'))

            island_start_list = []
            island_end_list = []
            for item in island_list:
                island_start_list.append(item.start)
                island_end_list.append(item.end)

            read_file = chrom + ".bed1"
            f = open(read_file, 'r')
            for line in f:
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    position = tag_position(sline, opt.fragment_size)
                    index = find_readcount_on_islands(island_start_list,
                                                      island_end_list,
                                                      position)
                    if index >= 0:
                        island_readcount_list[index] += 1
                        total += 1
            f.close()

            for index in xrange(len(island_list)):
                item = island_list[index]
                normalized_read_count = island_readcount_list[index] / float(
                    library_size) * scaling_factor
                outline = item.chrom + "\t" + str(item.start) + "\t" + str(
                    item.end) + "\t" + str(
                        island_readcount_list[index]) + "\t" + str(
                            normalized_read_count) + "\n"
                out.write(outline)

    SeparateByChrom.cleanup(chroms, '.bed1')
    out.close()
    print "Total number of reads on islands are: ", total
Esempio n. 11
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-a",
                      "--rawreadfileA",
                      action="store",
                      type="string",
                      dest="readfileA",
                      metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b",
                      "--rawreadfileB",
                      action="store",
                      type="string",
                      dest="readfileB",
                      metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-d",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandfile",
                      metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="island read count summary file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    if not Utility.fileExists(opt.readfileA):
        print opt.readfileA, " not found"
        sys.exit(1)
    if not Utility.fileExists(opt.readfileB):
        print opt.readfileB, " not found"
        sys.exit(1)

    A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA)
    B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB)
    print "Library size of ", opt.readfileA, ":  ", A_library_size
    print "Library size of ", opt.readfileB, ":  ", B_library_size

    totalA = 0
    totalB = 0

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the A library
    SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1')
    # separate by chrom the B library
    SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2')

    island_A_readcount = {}
    island_B_readcount = {}

    #Find read counts on the islands
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                if Utility.is_bed_sorted(island_list) == 0:
                    island_list.sort(key=operator.attrgetter('start'))

                island_start_list = []
                island_end_list = []
                for item in island_list:
                    island_start_list.append(item.start)
                    island_end_list.append(item.end)

                island_A_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed1"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_A_readcount_list[index] += 1
                            totalA += 1
                f.close()
                island_A_readcount[chrom] = island_A_readcount_list

                island_B_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed2"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_B_readcount_list[index] += 1
                            totalB += 1
                f.close()
                island_B_readcount[chrom] = island_B_readcount_list

    #A_background_read = A_library_size - totalA;
    #B_background_read = B_library_size - totalB;

    print "Total number of A reads on islands is: ", totalA
    print "Total number of B reads on islands is: ", totalB

    # Calculate the p value.
    library_scaling_factor = A_library_size * 1.0 / B_library_size
    #A vs B
    pseudo_count = 1
    pvalue_A_vs_B_list = []
    pvalue_B_vs_A_list = []
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    Acount = (island_A_readcount[chrom])[index]
                    Bcount = (island_B_readcount[chrom])[index]
                    pvalue_A_vs_B = pvaule(Acount, Bcount,
                                           library_scaling_factor,
                                           pseudo_count)
                    pvalue_A_vs_B_list.append(pvalue_A_vs_B)
                    pvalue_B_vs_A = pvaule(Bcount, Acount,
                                           1 / library_scaling_factor,
                                           pseudo_count)
                    pvalue_B_vs_A_list.append(pvalue_B_vs_A)
    #Calculate the FDR
    fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list)
    fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list)

    #Output the islands read counts, normalized read counts, fc, pvalue both ways
    scaling_factor = 1000000
    out = open(opt.out_file, 'w')
    outline = '#chrom' + "\t" + 'start' + "\t" + 'end' + "\t" + "Readcount_A" + "\t" + 'Normalized_Readcount_A' + "\t" + 'ReadcountB' + "\t" + 'Normalized_Readcount_B' + "\t" + "Fc_A_vs_B" + "\t" + "pvalue_A_vs_B" + "\t" + "FDR_A_vs_B" + "\t" + "Fc_B_vs_A" + "\t" + "pvalue_B_vs_A" + "\t" + "FDR_B_vs_A" + "\n"
    out.write(outline)
    ii = 0
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    Acount = (island_A_readcount[chrom])[index]
                    Bcount = (island_B_readcount[chrom])[index]
                    normalized_A = Acount / float(
                        A_library_size) * scaling_factor
                    normalized_B = Bcount / float(
                        B_library_size) * scaling_factor
                    fc_A_vs_B = (
                        (Acount + pseudo_count) * 1.0 /
                        (Bcount + pseudo_count)) / library_scaling_factor
                    fc_B_vs_A = (
                        (Bcount + pseudo_count) * 1.0 /
                        (Acount + pseudo_count)) * library_scaling_factor
                    print("Acount", Acount, "Bcount", Bcount, "pseudo_count",
                          pseudo_count, "library_scaling_factor",
                          library_scaling_factor, "fc_A_vs_B", fc_A_vs_B,
                          "fc_B_vs_A", fc_B_vs_A)
                    outline = item.chrom + "\t" + str(item.start) + "\t" + str(
                        item.end) + "\t" + str(Acount) + "\t" + str(
                            normalized_A) + "\t" + str(Bcount) + "\t" + str(
                                normalized_B
                            ) + "\t" + str(fc_A_vs_B) + "\t" + str(
                                pvalue_A_vs_B_list[ii]) + "\t" + str(
                                    fdr_A_vs_B_list[ii]
                                ) + "\t" + str(fc_B_vs_A) + "\t" + str(
                                    pvalue_B_vs_A_list[ii]) + "\t" + str(
                                        fdr_B_vs_A_list[ii]) + "\n"
                    out.write(outline)
                    ii += 1
    out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')

    # Calculate the correlations using normalized read counts
    A_array = ()
    B_array = ()
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                temp_array = scipy.array(island_A_readcount[chrom])
                A_array = scipy.concatenate((temp_array, A_array))
                temp_array = scipy.array(island_B_readcount[chrom])
                B_array = scipy.concatenate((temp_array, B_array))
    #Normalization to reads per million
    A_array = A_array / float(A_library_size) * scaling_factor
    B_array = B_array / float(B_library_size) * scaling_factor
    pearson = scipy.stats.pearsonr(A_array, B_array)
    print "Pearson's correlation is: ", pearson[0], " with p-value ", pearson[
        1]
    spearman = scipy.stats.spearmanr(A_array, B_array)
    print "Spearman's correlation is: ", spearman[
        0], " with p-value ", spearman[1]
def main(argv):
    parser = OptionParser()

    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedfile",
                      metavar="<file>",
                      help="ChIP seq read file")
    parser.add_option("-f",
                      "--fragmentsize",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      help="fragment size of ChIP-seq reads, in bps",
                      metavar="<int>")
    parser.add_option("-g",
                      "--known_genes_file",
                      action="store",
                      type="string",
                      dest="known_genes",
                      metavar="<file>",
                      help="file with known genes in UCSC format")
    parser.add_option("-r",
                      "--'Promoter' or 'GeneBody' or 'PromoterGenebody'",
                      action="store",
                      type="string",
                      dest="region_type",
                      metavar="<str>",
                      help="region to count tags in")
    parser.add_option("-u",
                      "--promoter_upstream_extension",
                      action="store",
                      type="int",
                      dest="promoter_upstream_extension",
                      help="upstream extension of promoter region from TSS",
                      metavar="<int>")
    parser.add_option("-d",
                      "--promoter_downstream_extension",
                      action="store",
                      type="int",
                      dest="promoter_downstream_extension",
                      help="downstream extension of promoter region from TSS",
                      metavar="<int>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    genes = get_read_count_on_genes(opt.bedfile, opt.fragment_size,
                                    opt.known_genes, opt.region_type,
                                    opt.promoter_upstream_extension,
                                    opt.promoter_downstream_extension)

    totalcount = get_total_tag_counts.get_total_tag_counts(opt.bedfile)

    f = open(opt.out_file, 'w')
    non_zero_genes = 0
    total_read_count_on_genes = 0
    items = genes.items()
    # convert to a list
    items.sort(key=operator.itemgetter(1), reverse=True)

    for gene in items:
        if gene[1] > 0:
            non_zero_genes += 1
            total_read_count_on_genes += gene[1]
        normalized_count = gene[1] / float(totalcount) * 1000000
        f.write(gene[0] + '\t' + str(gene[1]) + '\t' + str(normalized_count) +
                '\n')
    f.close()

    print "Total number of ", opt.region_type, ": ", len(genes.keys())
    print "Number of ", opt.region_type, " overlapped with islands: ", non_zero_genes
    print total_read_count_on_genes, "of the ", totalcount, " reads are on ", opt.region_type
Esempio n. 13
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-f",
                      "--fastafile",
                      action="store",
                      type="string",
                      dest="fasta_file",
                      metavar="<file>",
                      help="fasta file for the sequences")
    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedFile",
                      metavar="<file>",
                      help="ChIP seq read file")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 6:
        parser.print_help()
        sys.exit(1)

    chrom_lengths = get_chrom_length.get_chrom_lengths(opt.fasta_file)
    print "There are %d RE species" % len(chrom_lengths)

    chrom_rc = {}

    f = open(opt.out_file, 'w')
    outline = "# Name" + "\t" + "RC" + "\t" + "RPM" + "\t" + "RPKM" + '\n'
    f.write(outline)

    inf = open(opt.bedFile, "r")
    for line in inf:
        if not re.match("#", line):
            line = line.strip()
            sline = line.split()
            chrom = sline[0]
            if chrom in chrom_rc.keys():
                chrom_rc[chrom] += 1.0
            else:
                chrom_rc[chrom] = 1.0
    inf.close()

    #chroms with reads
    chroms = list(set(chrom_rc.keys()) & set(chrom_lengths.keys()))

    total_length = sum(chrom_lengths.values())
    totalcount = get_total_tag_counts.get_total_tag_counts(opt.bedFile) * 1.0
    #print totalcount

    basel_rpkm = (1000000.0) / (total_length / 1000.0)

    print "Basel RPKM for chroms with reads is %f" % basel_rpkm

    for chrom in chroms:
        rc = chrom_rc[chrom]
        rpm = (rc / totalcount) * 1000000
        rpkm = rpm * 1000.0 / chrom_lengths[chrom]
        outline = chrom + "\t" + str(rc) + "\t" + str(rpm) + "\t" + str(
            rpkm) + "\n"
        f.write(outline)
    f.close()
def main(argv):
    parser = OptionParser()
    parser.add_option("-a",
                      "--rawreadfileA",
                      action="store",
                      type="string",
                      dest="readfileA",
                      metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b",
                      "--rawreadfileB",
                      action="store",
                      type="string",
                      dest="readfileB",
                      metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-g",
                      "--known_genes_file",
                      action="store",
                      type="string",
                      dest="known_genes",
                      metavar="<file>",
                      help="file with known genes in UCSC format")
    parser.add_option(
        "-r",
        "--'Promoter' or 'GeneBody' or 'PromoterGenebody' or 'ExonicRegion'",
        action="store",
        type="string",
        dest="region_type",
        metavar="<str>",
        help="region to count tags in")
    parser.add_option("-u",
                      "--promoter_upstream_extension",
                      action="store",
                      type="int",
                      dest="promoter_upstream_extension",
                      help="upstream extension of promoter region from TSS",
                      default=5000,
                      metavar="<int>")
    parser.add_option("-d",
                      "--promoter_downstream_extension",
                      action="store",
                      type="int",
                      dest="promoter_downstream_extension",
                      help="downstream extension of promoter region from TSS",
                      default=1000,
                      metavar="<int>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 16:
        parser.print_help()
        sys.exit(1)

    if not Utility.fileExists(opt.readfileA):
        print opt.readfileA, " not found"
        sys.exit(1)
    if not Utility.fileExists(opt.readfileB):
        print opt.readfileB, " not found"
        sys.exit(1)

    scaling_factor = 1000000

    A_read_count_on_genes = get_read_count_on_genes.get_read_count_on_genes(
        opt.readfileA, opt.fragment_size, opt.known_genes, opt.region_type,
        opt.promoter_upstream_extension, opt.promoter_downstream_extension)
    B_read_count_on_genes = get_read_count_on_genes.get_read_count_on_genes(
        opt.readfileB, opt.fragment_size, opt.known_genes, opt.region_type,
        opt.promoter_upstream_extension, opt.promoter_downstream_extension)
    assert len(A_read_count_on_genes.keys()) == len(
        B_read_count_on_genes.keys())

    A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA)
    B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB)
    non_zero_A_genes = 0
    non_zero_B_genes = 0
    total_A_read_count_on_genes = 0
    total_B_read_count_on_genes = 0

    f = open(opt.out_file, 'w')
    for g in A_read_count_on_genes.keys():
        if A_read_count_on_genes[g] > 0:
            non_zero_A_genes += 1
            total_A_read_count_on_genes += A_read_count_on_genes[g]
        normalized_A_count = A_read_count_on_genes[g] / float(
            A_library_size) * scaling_factor
        if B_read_count_on_genes[g] > 0:
            non_zero_B_genes += 1
            total_B_read_count_on_genes += B_read_count_on_genes[g]
        normalized_B_count = B_read_count_on_genes[g] / float(
            B_library_size) * scaling_factor
        outline = g + '\t' + str(
            A_read_count_on_genes[g]
        ) + '\t' + str(normalized_A_count) + '\t' + str(
            B_read_count_on_genes[g]) + '\t' + str(normalized_B_count) + '\n'
        f.write(outline)
    f.close()

    print "Total number of ", opt.region_type, ": ", len(
        A_read_count_on_genes.keys())
    print "Number of ", opt.region_type, " overlapped with ", opt.readfileA, " islands: ", non_zero_A_genes
    print "Number of ", opt.region_type, " overlapped with ", opt.readfileB, " islands: ", non_zero_B_genes
    print total_A_read_count_on_genes, "of the ", A_library_size, " ", opt.readfileA, " reads are on ", opt.region_type
    print total_B_read_count_on_genes, "of the ", B_library_size, " ", opt.readfileB, " reads are on ", opt.region_type

    # Calculate the correlations, those genes whose read counts are zero in both libraries are not counted.
    A_list = []
    B_list = []

    for g in A_read_count_on_genes.keys():
        #if A_read_count_on_genes[g] > 0.5  or B_read_count_on_genes[g] > 0.5 :
        if A_read_count_on_genes[g] >= -1 or B_read_count_on_genes[g] >= -1:
            A_list.append(A_read_count_on_genes[g])
            B_list.append(B_read_count_on_genes[g])

    A_array = scipy.array(A_list) / float(A_library_size) * scaling_factor
    B_array = scipy.array(B_list) / float(B_library_size) * scaling_factor

    print "Number of ", opt.region_type, " with non-zero read count in either libraries: ", len(
        A_list)
    pearson = scipy.stats.pearsonr(A_array, B_array)
    print "Pearson's correlation is: ", pearson[0], " with p-value ", pearson[
        1]
    spearman = scipy.stats.spearmanr(A_array, B_array)
    print "Spearman's correlation is: ", spearman[
        0], " with p-value ", spearman[1]

    #Calculate the non-parametric Kolmogorov-Smirnof statistic on 2 samples
    #From scipy.stats: This tests whether 2 samples are drawn from the same distribution. Note that, like in the case of the one-sample K-S test, the distribution is assumed to be continuous.This is the two-sided test, one-sided tests are not implemented. The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution. If the K-S statistic is small or the p-value is high, then we cannot reject the hypothesis that the distributions of the two samples are the same.
    #The ideal situation would be the following. For example, we have a gene and we made measurement of the expression of this gene multiple times under two conditions. To address the questions whether this gene is expressed in a similar manner under the two conditions (w/o assuming normal distribution etc), we can use KS test.In this case, the situation is different, we have only one measurement for each gene under each condition. What we are doing is to pool all genes together to generate the distribution of gene expressions/mark-levels for each condition, and we are asking whether the distributions differ in the two conditions. The caveat is that even if the two distributions are the same, the behavior of individual genes can be quite different under different conditions.

    ksTestResult = scipy.stats.ks_2samp(A_array, B_array)
    print "P-value from Kolmogorov-Smirnof test is ", ksTestResult[
        1], " with statistic value: ", ksTestResult[0]

    #Generate the scatter plot figure.
    size = len(A_list) * len(A_list)
    plt.plot(A_array, B_array, "bo", markersize=3.0)
    #plt.scatter(A_array, B_array, s=size)
    mytext = "PearsonR=" + str(pearson[0]) + "; SpearmanR=" + str(spearman[0])
    title_line = "Read count correlation on " + opt.region_type
    #plt.title(title_line, fontsize=6)
    plt.title(title_line)
    x = opt.readfileA.split("/")[-1]
    y = opt.readfileB.split("/")[-1]
    plt.ylabel(y)
    plt.xlabel(x)
    ax = plt.gca()
    ax.set_xscale('log')
    ax.set_yscale('log')
    xmin, xmax = ax.get_xlim()
    ymin, ymax = ax.get_xlim()
    plt.text(xmin, ymax * 0.8, mytext, fontsize=7)
    ax.set_aspect(1.)
    epsfilename = opt.out_file.split(".")[0] + ".eps"
    plt.savefig(epsfilename, format="eps")
    pngfilename = opt.out_file.split(".")[0] + ".png"
    plt.savefig(pngfilename, format="png")
def main(argv):
	parser = OptionParser()
	parser.add_option("-f", "--forwardreadfile", action="store", type="string", dest="ReadsOnForwardStrand", help="input bed file for RNASeq raw reads on forward strand", metavar="<file>")
	parser.add_option("-r", "--reversereadfile", action="store", type="string", dest="ReadsOnReverseStrand", help="input bed file for RNASeq raw reads on reverse strand", metavar="<file>")
	parser.add_option("-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help="file with curated known genes clustered by entrez ID in pickle format")
	parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>")
	parser.add_option("-s", "--species", action="store", type="string", dest="species",help="species, mm8, hg18, etc", metavar="<str>")
	parser.add_option("-p", "--PAfile", action="store", type="string", dest="PAfile", help="input bed3 file", metavar="<file>")	
	parser.add_option("-e", "--extension", action="store", type="int", dest="extension",help="integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end", metavar="<float>")
		

	(opt, args) = parser.parse_args(argv)

	if len(argv) < 14:
		parser.print_help()
		sys.exit(1)

	startTime = time.time()

	allowance = 10

	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species]
		chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
	else:
		print "This species is not recognized, exiting"
		sys.exit(1)

	# entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
	annotation = open(opt.entrez_genes, 'rb')
	entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation))
	annotation.close()

	# test module
	test = 0
	if test == 1:
		print "Testing gene structure"
		test_id = 79947
		Entrez.test_gene_structure(entrez_gene_collection, test_id)


	# Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd
	entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd()
	print "There are ", len(entrez_ids_with_unique_cdsEnd), " Entrez IDs each of which has a unique cdsEnd."


	#get total read count
	totalcount_F = get_total_tag_counts.get_total_tag_counts(opt.ReadsOnForwardStrand)
	totalcount_R = get_total_tag_counts.get_total_tag_counts(opt.ReadsOnReverseStrand)
	totalcount = totalcount_F + totalcount_R
	print totalcount_F, totalcount_R

	#Clear the file and write the first line
	outf = open(opt.outfile, 'w')
	
	#outline to use to output polyA information for a species	
	#outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "UTRstart" + "\t" + "PolyAsites" + "\n"
	#outline to use to output RUDs
	outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "Basic_RUD" + "\t" + "List_of_subRUDs" + "\n"
	outf.write(outline)
	outf.close()

	#index: column in bed file for sorting
	index = 2

	print "Process genes on forward strand"
	entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids("+", entrez_ids_with_unique_cdsEnd)
	print "There are ", len(entrez_ids_on_forward_strand), " Entrez IDs on forward strand."
	entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))

	Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, chroms, opt.outfile, allowance, opt.PAfile, opt.extension, index)


	print "Process genes on reverse strand"
	entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids("-", entrez_ids_with_unique_cdsEnd)
	print "There are ", len(entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand."
	entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))

	Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, chroms, opt.outfile, allowance, opt.PAfile, opt.extension, index)

	print "it took", time.time() - startTime, "seconds."
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-f",
        "--forwardreadfile",
        action="store",
        type="string",
        dest="ReadsOnForwardStrand",
        help="input bed file for RNASeq raw reads on forward strand",
        metavar="<file>")
    parser.add_option(
        "-r",
        "--reversereadfile",
        action="store",
        type="string",
        dest="ReadsOnReverseStrand",
        help="input bed file for RNASeq raw reads on reverse strand",
        metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option(
        "-g",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      help="outfile name",
                      metavar="<file>")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-d",
                      "--3UTRdownstreamextension",
                      action="store",
                      type="int",
                      dest="downstream_extension",
                      help="3UTR down stream extension",
                      metavar="<int>")

    (opt, args) = parser.parse_args(argv)

    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    allowance = 10

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    annotation = open(opt.entrez_genes, 'rb')
    entrez_gene_collection = Entrez.KnownEntrezGenes(chroms,
                                                     pickle.load(annotation))
    annotation.close()

    # test module
    test = 0
    if test == 1:
        print "Testing gene structure"
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd
    entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd(
    )
    print "There are ", len(entrez_ids_with_unique_cdsEnd
                            ), " Entrez IDs each of which has a unique cdsEnd."

    #get total read count
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    totalcount = totalcount_F + totalcount_R
    print totalcount_F, totalcount_R

    #Clear the file and write the first line, needs to be modified
    outf = open(opt.outfile, 'w')
    #outline = "# Entrez ID \t Main Refseq ID \t 3UTR union length \t Length Index \t PA Multiplicity Index \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n"
    outline = "# Entrez ID \t 3UTR Union length \t RUD \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n"
    outf.write(outline)
    outf.close()

    #index: column in bed file for sorting
    index = 2

    print "Process genes on forward strand"
    entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids(
        "+", entrez_ids_with_unique_cdsEnd)
    print "There are ", len(
        entrez_ids_on_forward_strand), " Entrez IDs on forward strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))

    Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, index,
                       chroms, opt.fragment_size, opt.downstream_extension,
                       opt.outfile)

    print "Process genes on reverse strand"
    entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids(
        "-", entrez_ids_with_unique_cdsEnd)
    print "There are ", len(
        entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))

    Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, index,
                       chroms, opt.fragment_size, opt.downstream_extension,
                       opt.outfile)

    print "it took", time.time() - startTime, "seconds."
Esempio n. 17
0
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-f",
        "--forwardreadfile",
        action="store",
        type="string",
        dest="ReadsOnForwardStrand",
        help="input bed file for RNASeq raw reads on forward strand",
        metavar="<file>")
    parser.add_option(
        "-r",
        "--reversereadfile",
        action="store",
        type="string",
        dest="ReadsOnReverseStrand",
        help="input bed file for RNASeq raw reads on reverse strand",
        metavar="<file>")
    parser.add_option(
        "-g",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    test = 0

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    ##################################################################3
    #The column numbers are 1 based instead of 0 based!
    #For positive strand
    start_index_P = 2
    #For negative strand
    start_index_N = 3
    ##################################################################3

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    # entrez_gene_collection is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    annotation = open(opt.entrez_genes, 'rb')
    entrez_gene_collection = Entrez.KnownEntrezGenes(chroms,
                                                     pickle.load(annotation))
    annotation.close()

    # test module
    test = 0
    if test == 1:
        print "Testing gene structure"
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    totalcount = totalcount_F + totalcount_R
    print totalcount_F, totalcount_R

    #Clear the file.
    outf = open(opt.outfile, 'w')
    outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t  Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n"
    outf.write(outline)
    outf.close()

    # The RNA seq data are strand specific. Only use + reads on genes on forward strand, and - reads on genes on reverse strand.
    print "Process genes on forward strand"
    entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids(
        "+")
    print "There are ", len(
        entrez_ids_on_forward_strand), " Entrez IDs on forward strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))

    (forward_reads_on_shared_exons, forward_reads_on_shared_introns,
     forward_reads_on_merged_transcripts,
     forward_summary) = calculateExonIntrons(entrez_gene_subset,
                                             opt.ReadsOnForwardStrand,
                                             start_index_P, chroms,
                                             opt.fragment_size, totalcount,
                                             opt.outfile)

    print "Process genes on reverse strand"
    entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids(
        "-")
    print "There are ", len(
        entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))

    (reverse_reads_on_shared_exons, reverse_reads_on_shared_introns,
     reverse_reads_on_merged_transcripts,
     reverse_summary) = calculateExonIntrons(entrez_gene_subset,
                                             opt.ReadsOnReverseStrand,
                                             start_index_N, chroms,
                                             opt.fragment_size, totalcount,
                                             opt.outfile)

    #combine the densities
    # {entrezID:[((start, end), read_count)]}
    reads_on_shared_exons = {}
    reads_on_shared_exons.update(forward_reads_on_shared_exons)
    reads_on_shared_exons.update(reverse_reads_on_shared_exons)
    name = opt.outfile + "_shared_exons.pkl"
    output = open(name, 'wb')
    pickle.dump(reads_on_shared_exons, output)
    output.close()

    if test == 1:
        test_distribution_dic(reads_on_shared_exons, test_id)

    # {entrezID:[((start, end), read_count)]}
    reads_on_shared_introns = {}
    reads_on_shared_introns.update(forward_reads_on_shared_introns)
    reads_on_shared_introns.update(reverse_reads_on_shared_introns)
    #store the info in a pickle file
    name = opt.outfile + "_shared_introns.pkl"
    output = open(name, 'wb')
    pickle.dump(reads_on_shared_introns, output)
    output.close()

    if test == 1:
        test_distribution_dic(reads_on_shared_introns, test_id)

    reads_on_merged_transcripts = {}
    reads_on_merged_transcripts.update(forward_reads_on_merged_transcripts)
    reads_on_merged_transcripts.update(reverse_reads_on_merged_transcripts)
    #store the info in a pickle file
    name = opt.outfile + "_merged_transcripts.pkl"
    output = open(name, 'wb')
    pickle.dump(reads_on_merged_transcripts, output)
    output.close()

    summary = {}
    summary.update(forward_summary)
    summary.update(reverse_summary)
    name = opt.outfile + "_summary.pkl"
    output = open(name, 'wb')
    pickle.dump(summary, output)
    output.close()

    print "it took", time.time() - startTime, "seconds."
def main(argv):
    parser = OptionParser()

    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedFile",
                      metavar="<file>",
                      help="ChIP seq read file")
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option("-g",
                      "--known_genes_file",
                      action="store",
                      type="string",
                      dest="known_genes",
                      metavar="<file>",
                      help="file with known genes in UCSC format")
    parser.add_option(
        "-r",
        "--'Promoter' or 'GeneBody' or 'PromoterGenebody' or 'ExonicRegion'",
        action="store",
        type="string",
        dest="region_type",
        metavar="<str>",
        help="region to count tags in")
    parser.add_option("-u",
                      "--promoter_upstream_extension",
                      action="store",
                      type="int",
                      dest="promoter_upstream_extension",
                      help="upstream extension of promoter region from TSS",
                      metavar="<int>")
    parser.add_option("-d",
                      "--promoter_downstream_extension",
                      action="store",
                      type="int",
                      dest="promoter_downstream_extension",
                      help="downstream extension of promoter region from TSS",
                      metavar="<int>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    known_genes = UCSC_revised.KnownGenes(opt.known_genes)
    chroms = known_genes.keys()
    #Promoter and GeneBody are mutually exclusive.
    #Promoter: TSS-upstreamextention, TSS+downstreamextension
    #GeneBody: TSS+downstreamextension, TES
    #PromoterGenebody: TSS-upstreamextention,  TES.
    allowed_region_type = [
        'Promoter', 'GeneBody', 'PromoterGenebody', 'ExonicRegion'
    ]
    if opt.region_type not in allowed_region_type:
        print " The allowed region types are Promoter, GeneBody, PromoterGenebody and ExonicRegion. The region type is not recognized, exiting"
        sys.exit(1)

    if opt.region_type == 'Promoter':
        region_dic = known_genes.getPromoters(
            opt.promoter_upstream_extension, opt.promoter_downstream_extension)
    elif opt.region_type == 'GeneBody':
        region_dic = known_genes.getGenebodys(
            opt.promoter_downstream_extension)
    elif opt.region_type == 'PromoterGenebody':
        region_dic = known_genes.getPromotergenebodys(
            opt.promoter_upstream_extension)

    libName = (opt.bedFile).split('/')[-1]
    libName = libName.split('.')[0]
    extension = "-" + libName + '.bed1'
    if Utility_extended.fileExists(opt.bedFile):
        SeparateByChrom.separateByChrom(chroms, opt.bedFile, extension)
    else:
        print opt.bedFile, " not found"
        sys.exit(1)

    totalcount = get_total_tag_counts.get_total_tag_counts(opt.bedFile)

    f = open(opt.out_file, 'w')
    outline = "# GeneName" + '\t' + "Read Count" + '\t' + "RPKM" + '\n'
    f.write(outline)

    for chrom in chroms:
        chrombed = chrom + extension
        if Utility_extended.fileExists(chrombed):
            gene_coords = known_genes[chrom]
            if len(gene_coords) > 0:
                if opt.region_type == 'ExonicRegion':
                    (gene_name_list, region_length_list,
                     read_count_list) = get_read_count_on_exons(
                         gene_coords, chrombed, opt.fragment_size)
                else:
                    (gene_name_list, region_length_list,
                     read_count_list) = get_read_count_on_genic_regions(
                         region_dic[chrom], chrombed, opt.fragment_size)
                    #test_get_read_count_on_genic_regions("AAAS", gene_name_list, region_length_list, read_count_list)
                    #test_get_read_count_on_genic_regions("AACS", gene_name_list, region_length_list, read_count_list)
                assert len(gene_name_list) == len(region_length_list)
                assert len(gene_name_list) == len(read_count_list)
                RPKM = [0] * len(gene_name_list)
                for i in xrange(len(gene_name_list)):
                    if region_length_list[i] > 0:
                        RPKM[i] = read_count_list[i] / (region_length_list[i] /
                                                        1000.0) / (totalcount /
                                                                   1000000.0)
                        outline = gene_name_list[i] + '\t' + str(
                            read_count_list[i]) + '\t' + str(RPKM[i]) + '\n'
                        f.write(outline)
    f.close()

    SeparateByChrom.cleanup(chroms, extension)

    print "it took", time.time() - startTime, "seconds."
Esempio n. 19
0
                      help="bed file to make graph file of",
                      metavar="<file>")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragsize",
                      help="fragment size for chipseq experiment",
                      metavar="<file>")
    (opt, args) = parser.parse_args(sys.argv)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    genome_length = float(sum(chrom_lengths.values()))
    total_read_count = get_total_tag_counts.get_total_tag_counts(opt.bedfile)
    tag_coords = get_bed_coordinates(opt.bedfile, chroms, chrom_lengths,
                                     opt.fragsize)

    #print total_read_count
    #print "Genome size: " , genome_length*0.76

    outname = opt.bedfile[:-4] + '_windowsizes.txt'
    outfile = open(outname, 'w')

    pool = multiprocessing.Pool()
    results = {}
    binwidths = []

    for chrom in chroms:
        results[chrom] = pool.apply_async(optimize.fminbound, (
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>"
    )
    parser.add_option(
        "-a",
        "--rawreadfile",
        action="store",
        type="string",
        dest="readfile",
        metavar="<file>",
        help="raw read file in bed format",
    )
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        metavar="<int>",
        help="average size of a fragment after CHIP experiment",
    )
    parser.add_option(
        "-b",
        "--islandfile",
        action="store",
        type="string",
        dest="islandfile",
        metavar="<file>",
        help="island file in BED format",
    )
    parser.add_option(
        "-o",
        "--outfile",
        action="store",
        type="string",
        dest="out_file",
        metavar="<file>",
        help="island read count file",
    )

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)
    if Utility.fileExists(opt.readfile):
        SeparateByChrom.separateByChrom(chroms, opt.readfile, ".bed1")
    else:
        print opt.readfile, " not found"
        sys.exit(1)

    total = 0
    library_size = get_total_tag_counts.get_total_tag_counts(opt.readfile)

    scaling_factor = 1000000
    out = open(opt.out_file, "w")
    for chrom in chroms:
        if chrom in islands.keys():
            island_list = islands[chrom]
            island_readcount_list = [0] * len(island_list)

            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter("start"))

            island_start_list = []
            island_end_list = []
            for item in island_list:
                island_start_list.append(item.start)
                island_end_list.append(item.end)

            read_file = chrom + ".bed1"
            f = open(read_file, "r")
            for line in f:
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    position = tag_position(sline, opt.fragment_size)
                    index = find_readcount_on_islands(island_start_list, island_end_list, position)
                    if index >= 0:
                        island_readcount_list[index] += 1
                        total += 1
            f.close()

            for index in xrange(len(island_list)):
                item = island_list[index]
                normalized_read_count = island_readcount_list[index] / float(library_size) * scaling_factor
                outline = (
                    item.chrom
                    + "\t"
                    + str(item.start)
                    + "\t"
                    + str(item.end)
                    + "\t"
                    + str(island_readcount_list[index])
                    + "\t"
                    + str(normalized_read_count)
                    + "\n"
                )
                out.write(outline)

    SeparateByChrom.cleanup(chroms, ".bed1")
    out.close()
    print "Total number of reads on islands are: ", total
Esempio n. 21
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-r",
                      "--readfile",
                      action="store",
                      type="string",
                      dest="Reads",
                      help="input bed file for non-strand specific raw reads",
                      metavar="<file>")
    parser.add_option(
        "-g",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    # entrez_gene_collection is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    annotation = open(opt.entrez_genes, 'rb')
    entrez_gene_collection = Entrez.KnownEntrezGenes(chroms,
                                                     pickle.load(annotation))
    annotation.close()

    # test module
    test = 0
    if test == 1:
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    rawreadslibName1 = (opt.Reads).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    totalcount = 0
    if Utility_extended.fileExists(opt.Reads) == 1:
        totalcount = get_total_tag_counts.get_total_tag_counts(opt.Reads)
    else:  # if the all file exist, then use the all file, otherwise use the chrom separated file
        for chrom in chroms:
            chrombed = chrom + rawreadsextension1
            totalcount1 = get_total_tag_counts.get_total_tag_counts(chrombed)
            print chrom, totalcount1
            totalcount += totalcount1

    (reads_on_shared_exons, reads_on_shared_introns,
     reads_on_merged_transcripts,
     summary) = calculate_non_strandspecific_rc_on_ExonIntrons(
         entrez_gene_collection, opt.Reads, chroms, opt.fragment_size)

    #Clear the file.
    outf = open(opt.outfile, 'w')
    outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t  Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n"
    outf.write(outline)
    for entrez_id in entrez_gene_collection.entrez_ids:
        gene = (entrez_gene_collection.entrez_genes)[entrez_id]
        gene_symbol = []
        for transcript in gene.transcripts:
            if transcript.additional_annotations[0] not in gene_symbol:
                gene_symbol.append(transcript.additional_annotations[0])
        outline = str(entrez_id) + '\t' + str(
            summary[entrez_id]["merged_exons_rc"]
        ) + '\t' + str(
            summary[entrez_id]["merged_exons_total_length"]
        ) + '\t' + str(summary[entrez_id]["merged_exon_RPKM"]) + '\t' + str(
            summary[entrez_id]["shared_exons_rc"]
        ) + '\t' + str(
            summary[entrez_id]["shared_exons_total_length"]
        ) + '\t' + str(summary[entrez_id]["shared_exon_RPKM"]) + '\t' + str(
            summary[entrez_id]["shared_introns_rc"]
        ) + '\t' + str(
            summary[entrez_id]["shared_introns_total_length"]
        ) + '\t' + str(summary[entrez_id]["shared_intron_RPKM"]) + '\t' + str(
            summary[entrez_id]["merged_transcript_rc"]) + '\t' + str(
                summary[entrez_id]["merged_transcript_length"]) + '\t' + str(
                    summary[entrez_id]
                    ["merged_transcript_RPKM"]) + '\t' + ','.join([
                        transcript.name for transcript in gene.transcripts
                    ]) + '\t' + ','.join(gene_symbol) + '\n'
        outf.write(outline)
    outf.close()

    # {entrezID:[((start, end), read_count)]}
    name = opt.outfile + "_shared_exons.pkl"
    output = open(name, 'wb')
    pickle.dump(reads_on_shared_exons, output)
    output.close()

    # {entrezID:[((start, end), read_count)]}
    name = opt.outfile + "_shared_introns.pkl"
    output = open(name, 'wb')
    pickle.dump(reads_on_shared_introns, output)
    output.close()

    #store the info in a pickle file
    name = opt.outfile + "_merged_transcripts.pkl"
    output = open(name, 'wb')
    pickle.dump(reads_on_merged_transcripts, output)
    output.close()

    name = opt.outfile + "_summary.pkl"
    output = open(name, 'wb')
    pickle.dump(summary, output)
    output.close()

    print "it took", time.time() - startTime, "seconds."
def main(argv):
    parser = OptionParser()
    parser.add_option("-i",
                      "--readfile",
                      action="store",
                      type="string",
                      dest="ReadFile",
                      help="input bed file for raw reads",
                      metavar="<file>")
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option("-g",
                      "--known_genes_file",
                      action="store",
                      type="string",
                      dest="known_genes",
                      metavar="<file>",
                      help="file with known genes in UCSC format")
    parser.add_option(
        "-r",
        "--RegionType",
        action="store",
        type="string",
        dest="region_type",
        metavar="<str>",
        help=
        " Region to count tags in: Promoter(txStart-upstream, txStart+downstream), GeneBody (txStart + downstream, txEnd), ExtendedGeneBodys(txStart-upstream, txEnd+downstream), PromoterGenebody(txStart-upstream, txEnd), GeneEnd(txEnd-upstream, txEnd+downstream), ExonicRegion (per exon), IntronicRegion (per intron), Exonictranscript (per transcript), IntronicTranscript (per transcript), 5UTR(txStart, cdsStart), 3UTR(cdsEnd, txEnd)"
    )
    parser.add_option(
        "-u",
        "--upstream_extension",
        action="store",
        type="int",
        dest="upstream_extension",
        help=
        "upstream extension of region or location, for Promoter, ExtendedGeneBody, PromoterGenebody and GeneEnd ",
        metavar="<int>")
    parser.add_option(
        "-d",
        "--downstream_extension",
        action="store",
        type="int",
        dest="downstream_extension",
        help=
        "downstream extension of region or location, for Promoter, GeneBody, ExtendedGeneBody and GeneEnd ",
        metavar="<int>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 16:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    known_genes = UCSC.KnownGenes_list(opt.known_genes)
    chroms = list(set(known_genes.keys()) & set(chroms))

    #Promoter and GeneBody are mutually exclusive.
    #Promoter: TSS-upstreamextention, TSS+downstreamextension
    #GeneBody: TSS-downstreamextension, TES
    #PromoterGenebody: TSS-upstreamextention,TES
    #TES:TES-upstreamextention, TES+downstreamextension
    #
    #ExonicRegions:
    #IntronicRegions

    print "There are ", known_genes.getNumGenes(), " genes. "

    totalcount = get_total_tag_counts.get_total_tag_counts(opt.ReadFile)

    #Clear the file.
    outf = open(opt.outfile, 'w')
    #outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t  Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n"
    #outf.write(outline)
    outf.close()

    # The RNA seq data are strand specific. Only use + reads on genes on forward strand, and - reads on genes on reverse strand.

    allowed_region_type_1 = [
        'Promoter', 'GeneBody', 'ExtendedGeneBody', 'PromoterGenebody',
        'GeneEnd', 'ExonicRegion', 'IntronicRegion', '5UTR', '3UTR'
    ]
    allowed_region_type_2 = ["ExonicTranscript", "IntronicTranscript"]

    if opt.region_type in allowed_region_type_1:
        getReadCount(known_genes, opt.ReadFile, chroms, opt.fragment_size,
                     opt.region_type, opt.upstream_extension,
                     opt.downstream_extension, totalcount, opt.outfile)
    elif opt.region_type in allowed_region_type_2:
        get_read_count_on_onic_transcript(known_genes, opt.ReadFile, chroms,
                                          opt.fragment_size, opt.region_type,
                                          totalcount, opt.outfile)
    else:
        print " The allowed region types are ", allowed_region_type, " .The region type is not recognized, exiting"
        sys.exit(1)

    print "it took", time.time() - startTime, "seconds."
Esempio n. 23
0
def calculate_non_strandspecific_rc_on_ExonIntrons(entrez_genes, bedfile,
                                                   chroms, fragment_size):
    """Count non-strand-specific reads on exons, introns and transcripts.

    The bed file is split by strand into ``bedfile + "_P"`` and
    ``bedfile + "_n"``.  Each half is counted against ALL genes with the
    strand-specific ``calculateExonIntrons`` (using that half's own read
    total), and the two results are merged with ``combine_rc`` and
    ``combine_summary``.

    Args:
        entrez_genes: EntrezGenes-style object; per the original notes its
            core, ``entrez_genes.entrez_genes``, is a dict keyed by entrez_id.
        bedfile: path of the read file in bed format.
        chroms: chromosome names to process.
        fragment_size: forwarded unchanged to ``calculateExonIntrons``.

    Returns:
        A 4-tuple ``(all_reads_on_shared_exons, all_reads_on_shared_introns,
        all_reads_on_merged_transcripts, all_summary)``:

        * the first three are ``{entrezID: [((start, end), read_count)]}``
        * ``all_summary`` is ``{entrezID: {attribute: value}}``; the
          attributes expected by callers are ``merged_exons_rc``,
          ``merged_exon_RPKM``, ``merged_exons_total_length``,
          ``shared_exons_rc``, ``shared_exon_RPKM``,
          ``shared_exons_total_length``, ``shared_introns_rc``,
          ``shared_intron_RPKM``, ``shared_introns_total_length``,
          ``merged_transcript_rc``, ``merged_transcript_RPKM`` and
          ``merged_transcript_length``.

        If ``bedfile`` does not exist, four empty dicts are returned.
    """
    # Name pattern of the per-chromosome files removed at the end:
    # <chrom>-<library name>.<suffix>1
    lib_name = (bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # e.g. "txt"
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"

    # Bind the results up front so that the return statement is valid even
    # when the read file is missing.
    all_reads_on_shared_exons = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_shared_introns = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_merged_transcripts = {}  # {entrezID:[((start, end), read_count)]}
    all_summary = {}  # {entrezID:{attributes}}

    if Utility_extended.fileExists(bedfile):
        # Partition the bed file into reads on the positive strand and reads
        # on the negative strand.
        # NOTE(review): these two files are not removed by this function.
        p_file_name = bedfile + "_P"
        n_file_name = bedfile + "_n"
        Utility_extended.separate_by_strand(bedfile, p_file_name, n_file_name)

        # Column (1-based, not 0-based!) of the read coordinate:
        # column 2 for the positive strand, column 3 for the negative strand.
        start_index_P = 2
        start_index_N = 3

        p_totalcount = get_total_tag_counts.get_total_tag_counts(p_file_name)
        (
            forward_reads_on_shared_exons, forward_reads_on_shared_introns,
            forward_reads_on_merged_transcripts, forward_summary
        ) = get_strandspecific_read_count_on_ExonsIntrons.calculateExonIntrons(
            entrez_genes, p_file_name, start_index_P, chroms, fragment_size,
            p_totalcount, None)

        n_totalcount = get_total_tag_counts.get_total_tag_counts(n_file_name)
        (
            reverse_reads_on_shared_exons, reverse_reads_on_shared_introns,
            reverse_reads_on_merged_transcripts, reverse_summary
        ) = get_strandspecific_read_count_on_ExonsIntrons.calculateExonIntrons(
            entrez_genes, n_file_name, start_index_N, chroms, fragment_size,
            n_totalcount, None)

        all_reads_on_shared_exons = combine_rc(forward_reads_on_shared_exons,
                                               reverse_reads_on_shared_exons)
        all_reads_on_shared_introns = combine_rc(
            forward_reads_on_shared_introns, reverse_reads_on_shared_introns)
        all_reads_on_merged_transcripts = combine_rc(
            forward_reads_on_merged_transcripts,
            reverse_reads_on_merged_transcripts)
        all_summary = combine_summary(forward_summary, reverse_summary,
                                      p_totalcount, n_totalcount)

    SeparateByChrom.cleanup(chroms, extension)
    return (all_reads_on_shared_exons, all_reads_on_shared_introns,
            all_reads_on_merged_transcripts, all_summary)
Esempio n. 24
0
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-f",
        "--forwardreadfile",
        action="store",
        type="string",
        dest="ReadsOnForwardStrand",
        help="input bed file for RNASeq raw reads on forward strand",
        metavar="<file>")
    parser.add_option(
        "-r",
        "--reversereadfile",
        action="store",
        type="string",
        dest="ReadsOnReverseStrand",
        help="input bed file for RNASeq raw reads on reverse strand",
        metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      help="outfile name",
                      metavar="<file>")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-p",
                      "--PAfile",
                      action="store",
                      type="string",
                      dest="PAfile",
                      help="input bed3 file",
                      metavar="<file>")
    parser.add_option(
        "-e",
        "--extension",
        action="store",
        type="int",
        dest="extension",
        help=
        "integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end",
        metavar="<float>")

    (opt, args) = parser.parse_args(argv)

    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    allowance = 10

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    annotation = open(opt.entrez_genes, 'rb')
    entrez_gene_collection = Entrez.KnownEntrezGenes(chroms,
                                                     pickle.load(annotation))
    annotation.close()

    # test module
    test = 0
    if test == 1:
        print "Testing gene structure"
        test_id = 79947
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd
    entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd(
    )
    print "There are ", len(entrez_ids_with_unique_cdsEnd
                            ), " Entrez IDs each of which has a unique cdsEnd."

    #get total read count
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    totalcount = totalcount_F + totalcount_R
    print totalcount_F, totalcount_R

    #Clear the file and write the first line
    outf = open(opt.outfile, 'w')

    #outline to use to output polyA information for a species
    #outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "UTRstart" + "\t" + "PolyAsites" + "\n"
    #outline to use to output RUDs
    outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "Basic_RUD" + "\t" + "List_of_subRUDs" + "\n"
    outf.write(outline)
    outf.close()

    #index: column in bed file for sorting
    index = 2

    print "Process genes on forward strand"
    entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids(
        "+", entrez_ids_with_unique_cdsEnd)
    print "There are ", len(
        entrez_ids_on_forward_strand), " Entrez IDs on forward strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))

    Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, chroms,
                       opt.outfile, allowance, opt.PAfile, opt.extension,
                       index)

    print "Process genes on reverse strand"
    entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids(
        "-", entrez_ids_with_unique_cdsEnd)
    print "There are ", len(
        entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))

    Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, chroms,
                       opt.outfile, allowance, opt.PAfile, opt.extension,
                       index)

    print "it took", time.time() - startTime, "seconds."
def main(argv):
	parser = OptionParser()
	parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>")
	parser.add_option("-a", "--rawchipreadfile", action="store", type="string", dest="chipreadfile", metavar="<file>", help="raw read file from chip in bed format")
	parser.add_option("-b", "--rawcontrolreadfile", action="store", type="string", dest="controlreadfile", metavar="<file>", help="raw read file from control in bed format")
	parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after CHIP experiment")
	parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format")
	parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file")
	parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 14:
        	parser.print_help()
        	sys.exit(1)
		
	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species];
		genomesize = sum (GenomeData.species_chrom_lengths[opt.species].values());
		genomesize = opt.fraction * genomesize;
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	chip_library_size=get_total_tag_counts.get_total_tag_counts(opt.chipreadfile);
	control_library_size=get_total_tag_counts.get_total_tag_counts(opt.controlreadfile);
	print "chip library size  ", chip_library_size
	print "control library size  ", control_library_size
	
	totalchip = 0;
	totalcontrol = 0;
	
	islands = BED.BED(opt.species, opt.islandfile, "BED3", 0);
	
	# separate by chrom the chip library
	if Utility.fileExists(opt.chipreadfile):
		SeparateByChrom.separateByChrom(chroms, opt.chipreadfile, '.bed1');
	else:
		print opt.chipreadfile, " not found";
		sys.exit(1)
	# separate by chrom the control library
	if Utility.fileExists(opt.controlreadfile):
		SeparateByChrom.separateByChrom(chroms, opt.controlreadfile, '.bed2');
	else:
		print opt.controlreadfile, " not found";
		sys.exit(1)	
	
	island_chip_readcount = {};
	island_control_readcount = {};
	
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				if Utility.is_bed_sorted(island_list) == 0:
					island_list.sort(key=operator.attrgetter('start'));
					
				island_start_list = []
				island_end_list = []
				for item in island_list:
					island_start_list.append(item.start)
					island_end_list.append(item.end)
	
				island_chip_readcount_list=[0]*len(island_list);
				read_file = chrom + ".bed1";
				f = open(read_file,'r')
				for line in f:
					if not re.match("#", line):
						line = line.strip()
						sline = line.split()
						position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
						index =associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position);
						if index >= 0:
							island_chip_readcount_list[index] += 1;
							totalchip += 1;
				f.close();
				island_chip_readcount[chrom] = island_chip_readcount_list;
							
				island_control_readcount_list=[0]*len(island_list);
				read_file = chrom + ".bed2";
				f = open(read_file,'r')
				for line in f:
					if not re.match("#", line):
						line = line.strip()
						sline = line.split()
						position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
						index = associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position);
						if index >= 0:
							island_control_readcount_list[index] += 1;
							totalcontrol += 1;
				f.close();
							
				island_control_readcount[chrom] = island_control_readcount_list;			
						
	chip_background_read = chip_library_size - totalchip;
	control_background_read = control_library_size - totalcontrol;
	#scaling_factor = chip_background_read*1.0/control_background_read;
	scaling_factor = chip_library_size*1.0/control_library_size;
	
	
	print "Total number of chip reads on islands is: ", totalchip; 
	print "Total number of control reads on islands is: ", totalcontrol; 

	#print "chip_background_read   ", chip_background_read
	#print "control_background_read   ", control_background_read

	out = open(opt.out_file, 'w');
	pvalue_list = [];
	result_list = [];
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				for index in xrange(len(island_list)):
					item = island_list[index];
					observation = (island_chip_readcount[chrom])[index];
					control_tag = (island_control_readcount[chrom])[index];
					if (island_control_readcount[chrom])[index] > 0:
						#average = (island_control_readcount[chrom])[index] * scaling_factor;
						average = control_tag * scaling_factor
						fc = float(observation)/float(average);
					else:
						length = item.end - item.start + 1;
						average = length * control_library_size *1.0/genomesize;			
						average = min(0.25, average)* scaling_factor;
						fc = float(observation)/float(average);
					if observation > average:
						pvalue = scipy.stats.poisson.sf((island_chip_readcount[chrom])[index], average)[()]; 
					else:
						pvalue = 1;
					pvalue_list.append(pvalue);
					item_dic = {}
					item_dic['chrom'] = item.chrom
					item_dic['start'] = item.start
					item_dic['end'] = item.end
					item_dic['chip'] = observation
					item_dic['control'] = control_tag
					item_dic['pvalue'] = pvalue
					item_dic['fc'] = fc
					result_list.append(item_dic)
	
	pvaluearray=scipy.array(pvalue_list);
	pvaluerankarray=scipy.stats.rankdata(pvaluearray);
	totalnumber = len(result_list);
	for i in range(totalnumber):
		item = result_list[i];
		alpha = pvalue_list[i] * totalnumber/pvaluerankarray[i];
		if alpha > 1:
			alpha = 1;
		outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n";	
		out.write(outline);
					
	#pvalue_list.sort()
	#for item in result_list:
		#pvalue = float(item['pvalue'])
		#alpha = pvalue * len(result_list) / (pvalue_list.index(pvalue) + 1)
		#if alpha > 1:
			#alpha = 1;
		#outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n";	
		#out.write(outline);		
	out.close();
	
	
	SeparateByChrom.cleanup(chroms, '.bed1');
	SeparateByChrom.cleanup(chroms, '.bed2');