def main(argv):
    """Filter a raw BED file chromosome by chromosome and write the result.

    Options read from ``argv``: -s/--species, -b/--raw_bed_file,
    -t/--threshold (int) and -o/--output_file_name.

    The BED file is split into per-chromosome ``<chrom>.bed1`` files,
    ``strand_broken_remove`` is called for every chromosome whose ``.bed1``
    file exists, the ``<chrom>.bed2`` files are combined into the output
    file, and both sets of temporary files are cleaned up.

    Exits with status 1 when ``argv`` holds fewer than 8 entries (after
    printing the usage) or when the species is not a key of
    ``GenomeData.species_chroms``.
    """
    option_specs = [
        ("-s", "--species", "string", "species",
         "species under consideration", "<str>"),
        ("-b", "--raw_bed_file", "string", "bed_file",
         "raw bed file", "<file>"),
        ("-t", "--threshold", "int", "threshold",
         "threshold for copy number", "<int>"),
        ("-o", "--output_file_name", "string", "out_file",
         "output file name", "<file>"),
    ]
    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, help_text, metavar in option_specs:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, help=help_text,
                          metavar=metavar)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    if opt.species not in GenomeData.species_chroms.keys():
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    SeparateByChrom.separateByChrom(chroms, opt.bed_file, '.bed1')

    # Only chromosomes that actually received reads have a .bed1 file.
    for chrom in chroms:
        if not Utility.fileExists(chrom + ".bed1"):
            continue
        strand_broken_remove(chrom, opt.threshold)

    SeparateByChrom.combineAllGraphFiles(chroms, '.bed2', opt.out_file)

    for temporary_extension in ('.bed1', '.bed2'):
        SeparateByChrom.cleanup(chroms, temporary_extension)
def main(argv):
    """Run ``generate_all_functions_2`` on two summary graph files for chr1.

    Options read from ``argv``: -s/--species, -a/--summary_graph_file1,
    -b/--summary_graph_file2, -i/--windows_size (int),
    -d/--data_resolution (int) and -o/--outfile.

    Both files are split by chromosome (extensions ``.bed1`` and ``.bed2``),
    the comparison is run, and the temporary files are cleaned up.

    Exits with status 1 when ``argv`` holds fewer than 8 entries or the
    species is not a key of ``species_chroms``.
    """
    option_specs = [
        ("-s", "--species", "string", "species",
         "<str>", "species, mm8, hg18"),
        ("-a", "--summary_graph_file1", "string", "bedfile1",
         "<file>", "summary graph file 1 in bed format"),
        ("-b", "--summary_graph_file2", "string", "bedfile2",
         "<file>", "summary graph file 2 in bed format"),
        ("-i", "--windows_size", "int", "window_size",
         "<int>", "window size in summary graph file"),
        ("-d", "--data_resolution", "int", "step",
         "<int>",
         "distance between data points, must be integer times of window size"),
        ("-o", "--outfile", "string", "out_file",
         "<file>", "output file extension"),
    ]
    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, metavar, help_text in option_specs:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, metavar=metavar,
                          help=help_text)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    if opt.species not in species_chroms.keys():
        print("This species is not recognized, exiting")
        sys.exit(1)
    # NOTE(review): the species' chromosome list is looked up but every call
    # below is restricted to chr1 -- confirm whether that is intentional.
    chroms = species_chroms[opt.species]
    chrom_lengths = species_chrom_lengths[opt.species]

    SeparateByChrom.separateByChrom(['chr1'], opt.bedfile1, '.bed1')
    SeparateByChrom.separateByChrom(['chr1'], opt.bedfile2, '.bed2')

    generate_all_functions_2('.bed1', '.bed2', ['chr1'], opt.window_size,
                             opt.step, opt.out_file, chrom_lengths)

    SeparateByChrom.cleanup(['chr1'], '.bed1')
    SeparateByChrom.cleanup(['chr1'], '.bed2')
def find_windows_on_islands(species, summary_graph_file, islands_file,
                            window_size, out_file,
                            window_read_count_threshold=0):
    """Return, per chromosome, the summary-graph windows kept by
    ``filter_out_uncovered_windows`` for that chromosome's islands.

    Both input files are split by chromosome into temporary
    ``<chrom>.summarygraph`` and ``<chrom>.islands`` files; a chromosome is
    processed only when both of its files exist.  The temporary files are
    cleaned up before returning.

    When ``out_file`` is not the empty string, every returned window whose
    ``value`` is at least ``window_read_count_threshold`` is also written to
    ``out_file`` as a tab-separated ``chrom, start, end, value`` line.  The
    threshold affects only the file, not the returned dictionary.
    """
    graph_ext = ".summarygraph"
    island_ext = ".islands"
    chroms = species_chroms[species]

    SeparateByChrom.separateByChrom(chroms, summary_graph_file, graph_ext)
    SeparateByChrom.separateByChrom(chroms, islands_file, island_ext)

    windows_on_island = {}
    for chrom in chroms:
        graph_path = chrom + graph_ext
        island_path = chrom + island_ext
        if not (Utility.fileExists(graph_path)
                and Utility.fileExists(island_path)):
            continue
        summary = BED.BED(species, graph_path, "BED_GRAPH", 0)
        islands = BED.BED(species, island_path, "BED_GRAPH", 0)
        windows_on_island[chrom] = filter_out_uncovered_windows(
            islands[chrom], summary[chrom], window_size)

    if out_file != "":
        handle = open(out_file, 'w')
        for chrom in chroms:
            # Chromosomes without both input files have no entry.
            for window in windows_on_island.get(chrom, ()):
                if window.value >= window_read_count_threshold:
                    fields = [window.chrom, str(window.start),
                              str(window.end), str(window.value)]
                    handle.write('\t'.join(fields) + '\n')
        handle.close()

    SeparateByChrom.cleanup(chroms, graph_ext)
    SeparateByChrom.cleanup(chroms, island_ext)
    return windows_on_island
def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms,
                         fragment_size, totalcount, out_file):
    """Run ``calculateExonIntrons_by_chrom`` on every chromosome and merge
    the per-chromosome results.

    ``bedfile`` is split into sorted per-chromosome files (unless files with
    the derived extension already exist for ``chroms``), each chromosome is
    processed with the genes returned by ``entrez_genes.subset_by_chrom``,
    and the per-chromosome files are cleaned up afterwards.

    Returns a 4-tuple of dictionaries merged over all chromosomes:
    ``(reads_on_shared_exons, reads_on_shared_introns,
    reads_on_merged_transcripts, summary)``.  Per the original comments the
    first three map an Entrez ID to ``[((start, end), read_count)]``.

    Exits with status 1 when ``bedfile`` does not exist.
    """
    lib_name = bedfile.split('/')[-1]   # drop the directory part
    suffix = lib_name.split('.')[-1]    # text after the last dot
    lib_name = lib_name.split('.')[0]   # text before the first dot
    extension = "-" + lib_name + '.' + suffix + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, extension) != 1:
        # Separate by chrom and sort by the requested column.
        print("%s %s %s" % (chroms, extension,
                            " files do not exist, separate by chroms. "))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension,
                                                [column_index])

    all_reads_on_shared_exons = {}
    all_reads_on_shared_introns = {}
    all_reads_on_merged_transcripts = {}
    all_summary = {}

    for chrom in chroms:
        chrombed = chrom + extension
        entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
            [chrom], entrez_genes.subset_by_chrom(chrom))
        (reads_on_shared_exons, reads_on_shared_introns,
         reads_on_merged_transcripts, summary) = calculateExonIntrons_by_chrom(
            entrez_genes_by_chrom, chrombed, fragment_size, totalcount,
            out_file)
        all_reads_on_shared_exons.update(reads_on_shared_exons)
        all_reads_on_shared_introns.update(reads_on_shared_introns)
        all_reads_on_merged_transcripts.update(reads_on_merged_transcripts)
        all_summary.update(summary)

    SeparateByChrom.cleanup(chroms, extension)

    # Return the merged summary.  The previous code returned `summary`,
    # i.e. only the last chromosome's result, and raised NameError when
    # `chroms` was empty.
    return (all_reads_on_shared_exons, all_reads_on_shared_introns,
            all_reads_on_merged_transcripts, all_summary)
def main(argv):
    """Build a combined summary graph file from a BED file.

    Options read from ``argv``: -s/--species, -b/--bed_file,
    -w/--window_size (int), -i/--fragment_size (int) and -o/--outfile.

    For a species found in ``GenomeData.species_chroms`` the BED file is
    split into per-chromosome ``.bed`` files, ``makeGraphFile`` is run, the
    ``.graph`` files are combined into the output file, and the temporary
    files are cleaned up.  For any other species only a message is printed.

    Prints the usage and exits with status 1 when ``argv`` holds fewer than
    10 entries.
    """
    option_specs = [
        ("-s", "--species", "string", "species",
         "mm8,hg18,dm2,etc", "<str>"),
        ("-b", "--bed_file", "string", "bedfile",
         "bed file to make graph file of", "<file>"),
        ("-w", "--window_size", "int", "window_size",
         "window size", "<int>"),
        ("-i", "--fragment_size", "int", "fragment_size",
         "size of fragments after CHIP experiment", "<int>"),
        ("-o", "--outfile", "string", "outfile",
         "output bed summary file name", "<file>"),
    ]
    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, help_text, metavar in option_specs:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, help=help_text,
                          metavar=metavar)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species not in GenomeData.species_chroms.keys():
        print(opt.species + " is not in the species list ")
        return

    chroms = GenomeData.species_chroms[opt.species]
    chrom_lengths = GenomeData.species_chrom_lengths[opt.species]

    SeparateByChrom.separateByChrom(chroms, opt.bedfile, ".bed")
    makeGraphFile(chroms, chrom_lengths, opt.window_size, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)

    for temporary_extension in (".bed", ".graph"):
        SeparateByChrom.cleanup(chroms, temporary_extension)
def get_read_count_on_genes(rawreadfile, fragment_size, knowngenefile,
                            regiontype, promoter_upstream_extension,
                            promoter_downstream_extension):
    """Count reads falling on a chosen region of every known gene.

    ``regiontype`` selects which regions are requested from the known-gene
    table:

    * ``'Promoter'``         -- ``getPromoters(upstream, downstream)``
    * ``'GeneBody'``         -- ``getGenebodys(downstream)``
    * ``'PromoterGenebody'`` -- ``getPromotergenebodys(upstream)``

    Any other value, or a missing ``rawreadfile``, prints a message and
    exits with status 1.

    The read file is split into per-chromosome ``.bed1`` files; lines
    starting with ``#`` are skipped.  The temporary files are cleaned up
    before returning.

    Return: a dictionary with key of gene name and value of read count.
    """
    knowngenes = UCSC.KnownGenes(knowngenefile)
    chroms = knowngenes.keys()

    if regiontype == 'Promoter':
        region_dic = knowngenes.getPromoters(promoter_upstream_extension,
                                             promoter_downstream_extension)
    elif regiontype == 'GeneBody':
        region_dic = knowngenes.getGenebodys(promoter_downstream_extension)
    elif regiontype == 'PromoterGenebody':
        region_dic = knowngenes.getPromotergenebodys(
            promoter_upstream_extension)
    else:
        print(" The allowed region types are Promoter, GeneBody and PromoterGenebody. The region type is not recognized, exiting")
        sys.exit(1)

    if not Utility.fileExists(rawreadfile):
        print("%s %s" % (rawreadfile, " not found"))
        sys.exit(1)
    SeparateByChrom.separateByChrom(chroms, rawreadfile, '.bed1')

    genes = {}
    for chrom in chroms:
        (names, starts, ends) = get_feature_lists(region_dic[chrom])

        reads = open(chrom + ".bed1", 'r')
        positions = [
            associate_tags_with_regions.tag_position(line.strip().split(),
                                                     fragment_size)
            for line in reads
            if not line.startswith("#")
        ]
        reads.close()
        positions.sort()

        # One count per region, in the same order as the region lists.
        counts = associate_tags_with_regions.find_readcount_on_regions(
            positions, starts, ends)
        assert len(names) == len(counts)
        genes.update(zip(names, counts))

    SeparateByChrom.cleanup(chroms, '.bed1')
    return genes
def main(argv):
    """Build a combined summary graph file from a BAM file.

    Options read from ``argv``:

    * -s/--species       accepted but not used by this function
    * -b/--bed_file      path of the BAM file (the long option name is kept
                         unchanged so existing command lines still work)
    * -w/--window_size   window size, parsed as int
    * -i/--fragment_size fragment size, parsed as int
    * -o/--outfile       name of the combined output file

    The chromosomes come from ``SeparateByChrom.getChromsFromBam``.  The BAM
    file is split into per-chromosome ``.bed`` files, ``makeGraphFile`` is
    run, the ``.graph`` files are combined into the output file, and the
    temporary files are cleaned up.

    Exits through ``parser.error`` (usage message, status 2) when no BAM
    file is given.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc",
                      metavar="<str>")
    # The help text used to say "bed file" although the input is read as BAM.
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bamfile", help="bam file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size",
                      metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")

    (opt, args) = parser.parse_args(argv)

    # Without -b the BAM reader would be handed None; stop with a usage
    # message instead.
    if opt.bamfile is None:
        parser.error("a BAM file must be given with -b")

    # NOTE(review): chromsDict is passed to makeGraphFile as its second
    # argument -- presumably it maps chromosome name to length; verify
    # against getChromsFromBam.
    chromsDict = SeparateByChrom.getChromsFromBam(opt.bamfile)

    SeparateByChrom.separateByChromBamToBed(chromsDict.keys(), opt.bamfile,
                                            '.bed')
    makeGraphFile(chromsDict.keys(), chromsDict, opt.window_size,
                  opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chromsDict.keys(), ".graph",
                                         opt.outfile)

    SeparateByChrom.cleanup(chromsDict.keys(), ".bed")
    SeparateByChrom.cleanup(chromsDict.keys(), ".graph")
def main(argv):
    """Filter a raw BED file with ``filter_tags_by_islands`` and combine the
    per-chromosome results into one file.

    Options read from ``argv``: -s/--species, -a/--rawbedfile,
    -i/--fragment_size (int), -b/--islandfile and -o/--outfile.

    The island file is loaded as BED3, the raw BED file is split into
    per-chromosome ``.bed1`` files, the filter is run, the
    ``_filtered.bed1`` files are combined into the output file, and the
    temporary files are cleaned up.

    Exits with status 1 when ``argv`` holds fewer than 10 entries or the
    species is not a key of ``species_chroms``.
    """
    option_specs = [
        ("-s", "--species", "string", "species",
         "<str>", "species, mm8, hg18"),
        ("-a", "--rawbedfile", "string", "bedfile",
         "<file>", "raw data file in bed format"),
        ("-i", "--fragment_size", "int", "fragment_size",
         "<int>", "average size of a fragment after CHIP experiment"),
        ("-b", "--islandfile", "string", "islandbedfile",
         "<file>", "island file"),
        ("-o", "--outfile", "string", "out_file",
         "<file>", "filtered raw bed file"),
    ]
    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, metavar, help_text in option_specs:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, metavar=metavar,
                          help=help_text)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species not in species_chroms.keys():
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = species_chroms[opt.species]

    islands = BED.BED(opt.species, opt.islandbedfile, "BED3", 0)

    raw_extension = '.bed1'
    filtered_extension = '_filtered.bed1'

    SeparateByChrom.separateByChrom(chroms, opt.bedfile, raw_extension)
    filter_tags_by_islands(chroms, islands, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, filtered_extension,
                                         opt.out_file)

    SeparateByChrom.cleanup(chroms, raw_extension)
    SeparateByChrom.cleanup(chroms, filtered_extension)
def main(argv):
    """Filter a BAM file chromosome by chromosome and write a BAM file.

    Options read from ``argv``: -s/--species, -b/--raw_bam_file,
    -t/--threshold (int), -o/--output_file_name, plus the read filters
    -f/--requiredFlag, -F/--filterFlag and -q/--mapq (all int), which are
    forwarded to ``SeparateByChrom.separateByChromBamToBed``.

    The chromosomes come from ``SeparateByChrom.getChromsFromBam``.  With a
    threshold above zero, ``strand_broken_remove`` is called for every
    chromosome whose ``.bed1`` file exists and the ``.bed2`` files are
    combined; otherwise the ``.bed1`` files are combined directly.  Both
    sets of temporary files are cleaned up at the end.

    Prints the usage and exits with status 1 when ``argv`` holds fewer than
    8 entries.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species under consideration",
                      metavar="<str>")
    parser.add_option("-b", "--raw_bam_file", action="store", type="string",
                      dest="bam_file", help="raw bam file", metavar="<file>")
    parser.add_option("-t", "--threshold", action="store", type="int",
                      dest="threshold", help="threshold for copy number",
                      metavar="<int>")
    parser.add_option("-o", "--output_file_name", action="store",
                      type="string", dest="out_file",
                      help="output file name", metavar="<file>")

    # Read filters.
    parser.add_option("-f", "--requiredFlag", type='int',
                      help="Required bit in sam flag. Same as samtools view -f")
    parser.add_option("-F", "--filterFlag", type='int',
                      help="Filter out bit in sam flag, Same as samtools view -F")
    parser.add_option("-q", "--mapq", type='int',
                      help="minimum mapq for a read to be kept")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    chroms = SeparateByChrom.getChromsFromBam(opt.bam_file)

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bam_file, '.bed1',
                                            requiredFlag=opt.requiredFlag,
                                            filterFlag=opt.filterFlag,
                                            mapq=opt.mapq)

    if opt.threshold > 0:
        for chrom in chroms:
            if not Utility.fileExists(chrom + ".bed1"):
                continue
            strand_broken_remove(chrom, opt.threshold)
        combined_extension = '.bed2'
    else:
        # No filtering requested: write the split reads back unchanged.
        combined_extension = '.bed1'

    SeparateByChrom.combineAllGraphFilesBedToBam(chroms, combined_extension,
                                                 opt.bam_file, opt.out_file)

    for temporary_extension in ('.bed1', '.bed2'):
        SeparateByChrom.cleanup(chroms, temporary_extension)
def find_windows_on_islands(species, summary_graph_file, islands_file,
                            window_size, out_file,
                            window_read_count_threshold=0):
    """Collect the summary-graph windows that
    ``filter_out_uncovered_windows`` keeps for each chromosome's islands.

    Returns a dictionary mapping chromosome name to that list of windows;
    chromosomes lacking either temporary input file get no entry.

    If ``out_file`` is not ``""``, windows whose ``value`` reaches
    ``window_read_count_threshold`` are additionally written to it, one
    tab-separated ``chrom start end value`` line per window.

    Temporary ``.summarygraph`` and ``.islands`` files are created by
    splitting the inputs by chromosome and are cleaned up before returning.
    """
    extensions = {"summary": ".summarygraph", "islands": ".islands"}
    chroms = species_chroms[species]

    SeparateByChrom.separateByChrom(chroms, summary_graph_file,
                                    extensions["summary"])
    SeparateByChrom.separateByChrom(chroms, islands_file,
                                    extensions["islands"])

    windows_on_island = {}
    for chrom in chroms:
        summary_path = chrom + extensions["summary"]
        islands_path = chrom + extensions["islands"]
        if Utility.fileExists(summary_path) and Utility.fileExists(islands_path):
            summary = BED.BED(species, summary_path, "BED_GRAPH", 0)
            islands = BED.BED(species, islands_path, "BED_GRAPH", 0)
            kept = filter_out_uncovered_windows(islands[chrom],
                                                summary[chrom], window_size)
            windows_on_island[chrom] = kept

    write_output = out_file != ""
    if write_output:
        out = open(out_file, 'w')
        for chrom in chroms:
            if chrom not in windows_on_island:
                continue
            for item in windows_on_island[chrom]:
                if item.value < window_read_count_threshold:
                    continue
                out.write(item.chrom + '\t' + str(item.start) + '\t'
                          + str(item.end) + '\t' + str(item.value) + '\n')
        out.close()

    SeparateByChrom.cleanup(chroms, extensions["summary"])
    SeparateByChrom.cleanup(chroms, extensions["islands"])
    return windows_on_island
def main(argv):
    """Write the summary-graph windows kept by
    ``filter_out_uncovered_windows`` for the given islands.

    Options read from ``argv``: -s/--species, -a/--summarygraphfile,
    -b/--islandfile, -o/--outfile and -w/--window_size (int).

    Both input files are split by chromosome; for every chromosome that has
    both temporary files, the kept windows are written to the output file
    as tab-separated ``chrom start end value`` lines.  The temporary files
    are cleaned up at the end.

    Exits with status 1 when ``argv`` holds fewer than 10 entries or the
    species is not a key of ``species_chroms``.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a", "--summarygraphfile", action="store",
                      type="string", dest="bedfile", metavar="<file>",
                      help="summary graph file")
    parser.add_option("-b", "--islandfile", action="store", type="string",
                      dest="islandbedfile", metavar="<file>",
                      help="island file")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="filtered summary graph file")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size",
                      help="window size of summary graph", metavar="<int>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species not in species_chroms.keys():
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = species_chroms[opt.species]

    graph_ext = ".summarygraph"
    island_ext = ".islands"

    SeparateByChrom.separateByChrom(chroms, opt.bedfile, graph_ext)
    SeparateByChrom.separateByChrom(chroms, opt.islandbedfile, island_ext)

    out = open(opt.out_file, 'w')
    for chrom in chroms:
        graph_path = chrom + graph_ext
        island_path = chrom + island_ext
        if not (Utility.fileExists(graph_path)
                and Utility.fileExists(island_path)):
            continue
        summary = BED.BED(opt.species, graph_path, "BED_GRAPH", 0)
        islands = BED.BED(opt.species, island_path, "BED_GRAPH", 0)
        kept = filter_out_uncovered_windows(islands[chrom], summary[chrom],
                                            opt.window_size)
        for window in kept:
            fields = [window.chrom, str(window.start), str(window.end),
                      str(window.value)]
            out.write('\t'.join(fields) + '\n')
    out.close()

    SeparateByChrom.cleanup(chroms, graph_ext)
    SeparateByChrom.cleanup(chroms, island_ext)
def main(argv):
    """Turn a BED file into a single summary graph file.

    Command-line options (``argv``): -s/--species, -b/--bed_file,
    -w/--window_size, -i/--fragment_size and -o/--outfile.  The two sizes
    are parsed as integers.

    Fewer than 10 entries in ``argv`` prints the usage and exits with
    status 1.  A species missing from ``GenomeData.species_chroms`` only
    prints a message.  Otherwise the BED file is split per chromosome
    (``.bed``), ``makeGraphFile`` is run, the ``.graph`` files are combined
    into the output file, and the temporary files are cleaned up.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc",
                      metavar="<str>")
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bedfile", help="bed file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size",
                      metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    species_known = opt.species in GenomeData.species_chroms.keys()
    if not species_known:
        print(opt.species + " is not in the species list ")
    else:
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]

        bed_extension = ".bed"
        graph_extension = ".graph"

        SeparateByChrom.separateByChrom(chroms, opt.bedfile, bed_extension)
        makeGraphFile(chroms, chrom_lengths, opt.window_size,
                      opt.fragment_size)
        SeparateByChrom.combineAllGraphFiles(chroms, graph_extension,
                                             opt.outfile)

        SeparateByChrom.cleanup(chroms, bed_extension)
        SeparateByChrom.cleanup(chroms, graph_extension)
def main(argv):
    """Filter a raw BED file by islands and write the filtered reads plus
    the per-island results.

    Options read from ``argv``: -s/--species, -a/--rawbedfile,
    -i/--shift (int), -b/--islandfile and -o/--outfile.

    The raw BED file is split into per-chromosome files whose extension is
    derived from its base name (``-<name>.bed1``).  For every chromosome
    present in the island file, ``filter_tags_by_islands`` writes a
    ``-filtered`` file and its return value is written by
    ``output_island_with_rc`` to a ``-islands`` file.  The ``-islands``
    files are combined into ``islands-<name>.bed1`` and the ``-filtered``
    files into the output file; all temporary files are then cleaned up.

    Exits with status 1 when ``argv`` holds fewer than 10 entries or the
    species is not a key of ``species_chroms``.
    """
    option_specs = [
        ("-s", "--species", "string", "species",
         "<str>", "species, mm8, hg18, etc"),
        ("-a", "--rawbedfile", "string", "bedfile",
         "<file>", "raw data file in bed format"),
        ("-i", "--shift", "int", "shift",
         "<int>",
         "shift for finding the center of DNA fragment represented by the read"),
        ("-b", "--islandfile", "string", "islandbedfile",
         "<file>", "island file"),
        ("-o", "--outfile", "string", "out_file",
         "<file>", "filtered raw bed file"),
    ]
    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, metavar, help_text in option_specs:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, metavar=metavar,
                          help=help_text)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species not in species_chroms.keys():
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = species_chroms[opt.species]

    islands = BED_revised.BED_annotated(opt.species, opt.islandbedfile,
                                        "BED3", 0)

    # Library name: file name without directories, cut at the first dot.
    file_name = opt.bedfile.split('/')[-1]
    libName = file_name.split('.')[0]
    extension = "-" + libName + '.bed1'
    filtered_extension = "-filtered" + extension
    islands_extension = "-islands" + extension

    SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)

    for chrom in chroms:
        if chrom not in islands.keys():
            continue
        island_rc = filter_tags_by_islands(chrom + extension, islands[chrom],
                                           opt.shift,
                                           chrom + filtered_extension,
                                           boundary_extension=0)
        output_island_with_rc(island_rc, chrom + islands_extension)

    SeparateByChrom.combineAllGraphFiles(chroms, islands_extension,
                                         "islands" + extension)
    SeparateByChrom.combineAllGraphFiles(chroms, filtered_extension,
                                         opt.out_file)

    for temporary_extension in (extension, filtered_extension,
                                islands_extension):
        SeparateByChrom.cleanup(chroms, temporary_extension)
def main(argv):
    """Union the islands of one or two island files into a single file.

    Options read from ``argv``: -a/--islandfile1, -b/--islandfile2,
    -s/--species and -o/--outputfile.

    The first file is always split by chromosome (``.island1``).  The second
    file is used only when it exists (``.island2``).  For every chromosome
    a ``<chrom>.output`` file is created and, when the chromosome has any
    islands, ``union_islands_to_file`` is called on the concatenated island
    lists.  The ``.output`` files are combined into the output file and all
    temporary files are cleaned up.

    Exits with status 1 when ``argv`` holds fewer than 8 entries or the
    species is not a key of ``species_chroms``.
    """
    option_specs = [
        ("-a", "--islandfile1", "islandfile1", "<file>",
         "file 1 with islands info to be unioned"),
        ("-b", "--islandfile2", "islandfile2", "<file>",
         "file 2 with islands info to be unioned; if no, type in any word"),
    ]
    parser = OptionParser()
    for short_flag, long_flag, dest, metavar, help_text in option_specs:
        parser.add_option(short_flag, long_flag, action="store",
                          type="string", dest=dest, metavar=metavar,
                          help=help_text)
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8 or hg18",
                      metavar="<str>")
    parser.add_option("-o", "--outputfile", action="store", type="string",
                      dest="outfile", metavar="<file>",
                      help="output file name")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    if opt.species not in species_chroms.keys():
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = species_chroms[opt.species]

    SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1')

    have_second_file = Utility.fileExists(opt.islandfile2)
    if have_second_file:
        SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2')

    for chrom in chroms:
        out = open(chrom + '.output', 'w')
        bed_vals_1 = BED.BED(opt.species, chrom + '.island1', "BED3", 0)
        if have_second_file:
            bed_vals_2 = BED.BED(opt.species, chrom + '.island2', "BED3", 0)
            islandlist = bed_vals_1[chrom] + bed_vals_2[chrom]
        else:
            islandlist = bed_vals_1[chrom]
        # The .output file is created even when there is nothing to union.
        if len(islandlist) > 0:
            union_islands_to_file(islandlist, out)
        out.close()

    if have_second_file:
        SeparateByChrom.cleanup(chroms, '.island2')

    SeparateByChrom.combineAllGraphFiles(chroms, '.output', opt.outfile)
    SeparateByChrom.cleanup(chroms, '.output')
    SeparateByChrom.cleanup(chroms, '.island1')
def main(argv):
    """Score every island by comparing ChIP and control read counts.

    Options read from ``argv``: -s/--species, -a/--rawchipreadfile,
    -b/--rawcontrolreadfile, -f/--fragment_size (int), -d/--islandfile,
    -o/--outfile and -t (float, mappable fraction of the genome).

    Steps, as performed below:

    1. The effective genome size is the sum of the species' chromosome
       lengths multiplied by the mappable fraction.
    2. Library sizes are read with ``get_total_tag_counts``; both read
       files are split by chromosome (``.bed1`` ChIP, ``.bed2`` control).
    3. For each chromosome with islands, the islands are sorted by start if
       needed and each read is assigned to an island via
       ``find_readcount_on_islands``.
    4. The expected count of an island is its control count times
       ``chip_library_size / control_library_size``; with no control reads
       it is ``min(0.25, length * control_library_size / genomesize)`` times
       the same factor.  The p-value is the Poisson survival function of
       the ChIP count when it exceeds the expectation, else 1.
    5. Each output line holds chrom, start, end, ChIP count, control count,
       p-value, fold change, and ``pvalue * N / rank`` capped at 1.

    Exits with status 1 when ``argv`` holds fewer than 14 entries, the
    species is unknown, or a read file does not exist.
    """
    option_specs = [
        ("-s", "--species", "string", "species",
         "species, mm8, hg18", "<str>"),
        ("-a", "--rawchipreadfile", "string", "chipreadfile",
         "raw read file from chip in bed format", "<file>"),
        ("-b", "--rawcontrolreadfile", "string", "controlreadfile",
         "raw read file from control in bed format", "<file>"),
        ("-f", "--fragment_size", "int", "fragment_size",
         "average size of a fragment after CHIP experiment", "<int>"),
        ("-d", "--islandfile", "string", "islandfile",
         "island file in BED format", "<file>"),
        ("-o", "--outfile", "string", "out_file",
         "island read count summary file", "<file>"),
        # The trailing blank in the long option name is part of the
        # existing interface and is kept as is.
        ("-t", "--mappable_fraction_of_genome_size ", "float", "fraction",
         "mapable fraction of genome size", "<float>"),
    ]
    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, help_text, metavar in option_specs:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, help=help_text,
                          metavar=metavar)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    if opt.species not in GenomeData.species_chroms.keys():
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]
    genomesize = sum(GenomeData.species_chrom_lengths[opt.species].values())
    genomesize = opt.fraction * genomesize

    chip_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.controlreadfile)
    print("%s %s" % ("chip library size  ", chip_library_size))
    print("%s %s" % ("control library size  ", control_library_size))

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # Split both libraries by chromosome: ChIP first, then control.
    for read_path, extension in ((opt.chipreadfile, '.bed1'),
                                 (opt.controlreadfile, '.bed2')):
        if not Utility.fileExists(read_path):
            print("%s %s" % (read_path, " not found"))
            sys.exit(1)
        SeparateByChrom.separateByChrom(chroms, read_path, extension)

    def count_reads_on_islands(read_file, starts, ends):
        """Return (per-island counts, number of reads on any island)."""
        counts = [0] * len(starts)
        on_islands = 0
        handle = open(read_file, 'r')
        for line in handle:
            if re.match("#", line):
                continue
            sline = line.strip().split()
            position = associate_tags_with_regions.tag_position(
                sline, opt.fragment_size)
            index = associate_tags_with_regions.find_readcount_on_islands(
                starts, ends, position)
            if index >= 0:
                counts[index] += 1
                on_islands += 1
        handle.close()
        return counts, on_islands

    totalchip = 0
    totalcontrol = 0
    island_chip_readcount = {}
    island_control_readcount = {}

    for chrom in chroms:
        if chrom not in islands.keys():
            continue
        if len(islands[chrom]) == 0:
            continue
        island_list = islands[chrom]
        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))
        island_start_list = [island.start for island in island_list]
        island_end_list = [island.end for island in island_list]

        chip_counts, chip_hits = count_reads_on_islands(
            chrom + ".bed1", island_start_list, island_end_list)
        totalchip += chip_hits
        island_chip_readcount[chrom] = chip_counts

        control_counts, control_hits = count_reads_on_islands(
            chrom + ".bed2", island_start_list, island_end_list)
        totalcontrol += control_hits
        island_control_readcount[chrom] = control_counts

    # Scale control counts by the ratio of the two library sizes.
    scaling_factor = chip_library_size * 1.0 / control_library_size

    print("%s %s" % ("Total number of chip reads on islands is: ", totalchip))
    print("%s %s" % ("Total number of control reads on islands is: ",
                     totalcontrol))

    out = open(opt.out_file, 'w')
    pvalue_list = []
    result_list = []

    for chrom in chroms:
        if chrom not in islands.keys():
            continue
        if len(islands[chrom]) == 0:
            continue
        rows = zip(islands[chrom], island_chip_readcount[chrom],
                   island_control_readcount[chrom])
        for island, observation, control_tag in rows:
            if control_tag > 0:
                average = control_tag * scaling_factor
            else:
                # No control reads: fall back to a genome-wide background
                # estimate, capped at 0.25 before scaling.
                length = island.end - island.start + 1
                average = length * control_library_size * 1.0 / genomesize
                average = min(0.25, average) * scaling_factor
            fc = float(observation) / float(average)

            if observation > average:
                pvalue = scipy.stats.poisson.sf(observation, average)[()]
            else:
                pvalue = 1

            pvalue_list.append(pvalue)
            result_list.append({
                'chrom': island.chrom,
                'start': island.start,
                'end': island.end,
                'chip': observation,
                'control': control_tag,
                'pvalue': pvalue,
                'fc': fc,
            })

    pvaluearray = scipy.array(pvalue_list)
    pvaluerankarray = scipy.stats.rankdata(pvaluearray)
    totalnumber = len(result_list)

    for row, pvalue, rank in zip(result_list, pvalue_list, pvaluerankarray):
        alpha = pvalue * totalnumber / rank
        if alpha > 1:
            alpha = 1
        fields = [row['chrom'], str(row['start']), str(row['end']),
                  str(row['chip']), str(row['control']), str(row['pvalue']),
                  str(row['fc']), str(alpha)]
        out.write("\t".join(fields) + "\n")

    out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
def main(argv):
    """Write raw and normalized read counts for every island.

    Options read from ``argv``: -s/--species, -a/--rawreadfile,
    -f/--fragment_size (int), -b/--islandfile and -o/--outfile.

    The read file is split into per-chromosome ``.bed1`` files.  For every
    chromosome present in the island file, the islands are sorted by start
    if needed, each read (lines starting with ``#`` are skipped) is assigned
    to an island via ``find_readcount_on_islands``, and one tab-separated
    line per island is written: chrom, start, end, read count, and read
    count per million reads of the library.  The total number of reads on
    islands is printed at the end.

    Exits with status 1 when ``argv`` holds fewer than 10 entries, the
    species is unknown, or the read file does not exist.
    """
    option_specs = [
        ("-s", "--species", "string", "species",
         "species, mm8, hg18", "<str>"),
        ("-a", "--rawreadfile", "string", "readfile",
         "raw read file in bed format", "<file>"),
        ("-f", "--fragment_size", "int", "fragment_size",
         "average size of a fragment after CHIP experiment", "<int>"),
        ("-b", "--islandfile", "string", "islandfile",
         "island file in BED format", "<file>"),
        ("-o", "--outfile", "string", "out_file",
         "island read count file", "<file>"),
    ]
    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, help_text, metavar in option_specs:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, help=help_text,
                          metavar=metavar)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species not in GenomeData.species_chroms.keys():
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    if not Utility.fileExists(opt.readfile):
        print("%s %s" % (opt.readfile, " not found"))
        sys.exit(1)
    SeparateByChrom.separateByChrom(chroms, opt.readfile, '.bed1')

    total = 0
    library_size = get_total_tag_counts.get_total_tag_counts(opt.readfile)
    scaling_factor = 1000000   # counts are reported per million reads

    out = open(opt.out_file, 'w')
    for chrom in chroms:
        if chrom not in islands.keys():
            continue
        island_list = islands[chrom]
        island_readcount_list = [0] * len(island_list)

        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))
        island_start_list = [island.start for island in island_list]
        island_end_list = [island.end for island in island_list]

        reads = open(chrom + ".bed1", 'r')
        for line in reads:
            if re.match("#", line):
                continue
            sline = line.strip().split()
            position = tag_position(sline, opt.fragment_size)
            index = find_readcount_on_islands(island_start_list,
                                              island_end_list, position)
            if index >= 0:
                island_readcount_list[index] += 1
                total += 1
        reads.close()

        for island, read_count in zip(island_list, island_readcount_list):
            normalized_read_count = (read_count / float(library_size)
                                     * scaling_factor)
            fields = [island.chrom, str(island.start), str(island.end),
                      str(read_count), str(normalized_read_count)]
            out.write("\t".join(fields) + "\n")

    SeparateByChrom.cleanup(chroms, '.bed1')
    out.close()
    print("%s %s" % ("Total number of reads on islands are: ", total))
def main(argv):
    """Accumulate read-count correlations over islands and plot the result.

    The read file is split by chromosome, the reads landing on each island
    are extracted into island-specific files, and for every island an
    auto-correlation ("+auto" / "-auto") or cross-correlation ("cross") of
    binned read counts is computed.  The per-island results are summed per
    distance, divided by the number of contributing points and by the square
    of (reads on islands / 1e6), written to the output file as
    "distance<TAB>correlation" lines, and plotted to ``<out_file>.eps``.

    argv: list of command-line arguments (options -b, -s, -i, -w, -m, -n,
        -r, -t, -f, -o).

    Exits with status 1 when fewer than 20 arguments are given, the species
    is unknown, the read file or island file is missing, or the correlation
    type is not one of +auto, -auto, cross.
    """
    parser = OptionParser()
    parser.add_option("-b", "--readfile", action="store", type="string",
                      dest="readFile", metavar="<file>",
                      help="raw read file in bed format")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-i", "--islands", action="store", type="string",
                      dest="islandFile", metavar="<file>",
                      help="island File in chrom start end ... format")
    parser.add_option("-w", "--binSize", action="store", type="int",
                      dest="binSize", metavar="<int>",
                      help="bin size for resolution")
    parser.add_option("-m", "--minimum-number-of-points-per-island",
                      action="store", type="int",
                      dest="minimumRequiredPoints", metavar="<int>",
                      help="minimum-number-of-data-points-needed-per-island")
    parser.add_option("-n", "--maxdistance", action="store", type="int",
                      dest="maxDistance", metavar="<int>",
                      help=" max distance for correlation, in terms of bin size")
    parser.add_option("-r", "--resolution", action="store", type="int",
                      dest="resolution", metavar="<int>",
                      help=" resolution in distance in terms of bin size")
    parser.add_option("-t", "--type", action="store", type="string",
                      dest="type", metavar="<str>",
                      help=" type of correlation, +auto, -auto, cross")
    parser.add_option("-f", "--shift", action="store", type="int",
                      dest="shift", metavar="<int>",
                      help=" shift of reads, only useful when calculate cross-correlation or combining the plus and minus reads")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>", help="output file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 20:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # Validate every input before any temporary file is created, so a bad
    # argument cannot leave per-chromosome files behind.
    if Utility.fileExists(opt.readFile) == 0:
        print("%s  does not exist" % opt.readFile)
        sys.exit(1)
    if not Utility.fileExists(opt.islandFile):
        print("%s  does not exist" % opt.islandFile)
        sys.exit(1)
    if opt.type not in ("+auto", "-auto", "cross"):
        print("Type of correlation must be one of: +auto, -auto, cross")
        sys.exit(1)

    libName = (opt.readFile).split('/')[-1]
    libName = libName.split('.')[0]
    extension = "-" + libName + '.bed1'
    SeparateByChrom.separateByChrom(chroms, opt.readFile, extension)

    try:
        print("Species:  %s" % opt.species)
        print("Read File:  %s" % opt.readFile)
        print("Island File:  %s" % opt.islandFile)
        print("Bin Size:  %s bp" % opt.binSize)
        print("Resolution:  %s  bins" % opt.resolution)
        print("Max distance:  %s  bins" % opt.maxDistance)
        print("Type of correlation:  %s" % opt.type)
        print("Reads shift:  %s" % opt.shift)

        # Here we are assuming that the file has the format
        # chrom start end + ..... for each line
        islandDic = BED_revised.BED(opt.species, opt.islandFile, "BED3")
        for chrom in islandDic.keys():
            # Clean up potential island-specific read files
            filter_raw_tags_by_islands_dev.cleanup_files(islandDic[chrom],
                                                         extension)

        total = opt.maxDistance // opt.resolution + 1
        distances = [0] * total
        numberOfPointsCollector = [0] * total
        correlationCollector = [0] * total
        totalReadCount = 0

        for chrom in chroms:
            chrombed = chrom + extension
            if not Utility.fileExists(chrombed):
                continue
            if chrom not in islandDic.keys() or len(islandDic[chrom]) == 0:
                continue
            # First find all the reads that land on islands and save them in
            # island-specific temporary files, then bin each island using
            # only its own read file.
            currentReadCount = filter_raw_tags_by_islands_dev.find_reads_on_each_island(
                chrombed, islandDic[chrom], opt.shift, extension)
            totalReadCount += currentReadCount
            for island in islandDic[chrom]:
                assert (island.start >= 0)
                assert (island.end <= chrom_lengths[chrom])
                islandReadFile = island.chrom + "-" + str(
                    island.start) + "-" + str(island.end) + extension
                numberOfPoints = [0] * total
                correlations = [0] * total
                if opt.type == "+auto":
                    readCountVector = generateReadCountVector(
                        island.chrom, island.start, island.end, opt.binSize,
                        "+", opt.shift, islandReadFile)
                    (numberOfPoints, correlations) = autoCorrelations(
                        readCountVector, opt.resolution, opt.maxDistance,
                        opt.minimumRequiredPoints)
                elif opt.type == "-auto":
                    readCountVector = generateReadCountVector(
                        island.chrom, island.start, island.end, opt.binSize,
                        "-", opt.shift, islandReadFile)
                    (numberOfPoints, correlations) = autoCorrelations(
                        readCountVector, opt.resolution, opt.maxDistance,
                        opt.minimumRequiredPoints)
                elif opt.type == "cross":
                    plusReadCountVector = generateReadCountVector(
                        island.chrom, island.start, island.end, opt.binSize,
                        "+", opt.shift, islandReadFile)
                    minusReadCountVector = generateReadCountVector(
                        island.chrom, island.start, island.end, opt.binSize,
                        "-", opt.shift, islandReadFile)
                    (numberOfPoints, correlations) = crossCorrelations(
                        plusReadCountVector, minusReadCountVector,
                        opt.resolution, opt.maxDistance,
                        opt.minimumRequiredPoints)

                assert (len(numberOfPoints) == total)
                assert (len(correlations) == total)
                print("%s %s %s" % (chrom, island.start, island.end))
                for i in range(total):
                    # NOTE(review): diagnostic for distances this island
                    # contributed no points to; nesting reconstructed from a
                    # whitespace-collapsed source -- confirm.
                    if numberOfPoints[i] == 0:
                        distances[i] = i * opt.resolution * opt.binSize
                        print("%s \t%s \t%s" %
                              (distances[i], numberOfPointsCollector[i],
                               correlationCollector[i]))
                    numberOfPointsCollector[i] += numberOfPoints[i]
                    correlationCollector[i] += correlations[i]

        # Normalization and output
        read_norm = (totalReadCount/1000000.0)*(totalReadCount/1000000.0)
        with open(opt.out_file, 'w') as f:
            for i in range(total):
                distances[i] = i * opt.resolution * opt.binSize
                # A distance nobody contributed to keeps its accumulated
                # value instead of raising ZeroDivisionError.
                if numberOfPointsCollector[i] != 0:
                    # normalize by the number of points
                    correlationCollector[i] /= numberOfPointsCollector[i]
                if read_norm != 0:
                    correlationCollector[i] /= read_norm
                print("%s \t%s \t%s" %
                      (distances[i], numberOfPointsCollector[i],
                       correlationCollector[i]))
                outline = str(distances[i]) + "\t" + str(
                    correlationCollector[i]) + "\n"
                f.write(outline)
    finally:
        SeparateByChrom.cleanup(chroms, extension)

    # Plot it out
    title = libName + " " + opt.type + " correlation"
    legend = "B" + str(opt.binSize) + " S" + str(opt.shift)
    plot_profile(distances[1:], correlationCollector[1:], 0, title, legend,
                 opt.out_file + '.eps')
def main(argv):
    """Compare the read counts of two libraries (A and B) on a set of islands.

    Both read files are split by chromosome and every read is assigned to an
    island.  For each island the output file receives: raw and
    reads-per-million counts for A and B, the fold change with a pseudo count
    of 1 (scaled by the library size ratio), a p-value from ``pvaule`` and an
    FDR from ``fdr``, each in both directions (A vs B and B vs A).  Pearson
    and Spearman correlations of the normalized island counts are printed.

    argv: list of command-line arguments (options -s, -a, -b, -f, -d, -o).

    Exits with status 1 when fewer than 12 arguments are given, the species
    is unknown, a read file is missing, or a library contains no reads.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-a", "--rawreadfileA", action="store", type="string",
                      dest="readfileA", metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b", "--rawreadfileB", action="store", type="string",
                      dest="readfileB", metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    if not Utility.fileExists(opt.readfileA):
        print("%s  not found" % opt.readfileA)
        sys.exit(1)
    if not Utility.fileExists(opt.readfileB):
        print("%s  not found" % opt.readfileB)
        sys.exit(1)

    A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA)
    B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB)
    print("Library size of  %s :  %s" % (opt.readfileA, A_library_size))
    print("Library size of  %s :  %s" % (opt.readfileB, B_library_size))
    # Both sizes are used as divisors below (scaling factor and its inverse).
    if A_library_size == 0 or B_library_size == 0:
        print("Both libraries must contain at least one read, exiting")
        sys.exit(1)

    def count_reads_on_islands(read_file, starts, ends):
        """Return the per-island read counts for one per-chromosome file."""
        counts = [0] * len(starts)
        with open(read_file, 'r') as handle:
            for line in handle:
                if line.startswith("#"):
                    continue
                sline = line.strip().split()
                position = associate_tags_with_regions.tag_position(
                    sline, opt.fragment_size)
                index = associate_tags_with_regions.find_readcount_on_islands(
                    starts, ends, position)
                # A negative index means the read is on no island.
                if index >= 0:
                    counts[index] += 1
        return counts

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)
    # Chromosomes that actually carry islands, in species order; every later
    # pass iterates this list so the p-value/FDR lists stay aligned with the
    # output rows.
    island_chroms = [
        chrom for chrom in chroms
        if chrom in islands.keys() and len(islands[chrom]) != 0
    ]

    # separate by chrom the A library
    SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1')
    # separate by chrom the B library
    SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2')

    island_A_readcount = {}
    island_B_readcount = {}
    # Find read counts on the islands; temp files are removed even on error.
    try:
        for chrom in island_chroms:
            island_list = islands[chrom]
            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'))
            island_start_list = [item.start for item in island_list]
            island_end_list = [item.end for item in island_list]
            island_A_readcount[chrom] = count_reads_on_islands(
                chrom + ".bed1", island_start_list, island_end_list)
            island_B_readcount[chrom] = count_reads_on_islands(
                chrom + ".bed2", island_start_list, island_end_list)
    finally:
        SeparateByChrom.cleanup(chroms, '.bed1')
        SeparateByChrom.cleanup(chroms, '.bed2')

    totalA = sum(sum(counts) for counts in island_A_readcount.values())
    totalB = sum(sum(counts) for counts in island_B_readcount.values())
    print("Total number of A reads on islands is:  %s" % totalA)
    print("Total number of B reads on islands is:  %s" % totalB)

    # Calculate the p value.
    library_scaling_factor = A_library_size * 1.0 / B_library_size  # A vs B
    pseudo_count = 1
    pvalue_A_vs_B_list = []
    pvalue_B_vs_A_list = []
    for chrom in island_chroms:
        for Acount, Bcount in zip(island_A_readcount[chrom],
                                  island_B_readcount[chrom]):
            pvalue_A_vs_B_list.append(
                pvaule(Acount, Bcount, library_scaling_factor, pseudo_count))
            pvalue_B_vs_A_list.append(
                pvaule(Bcount, Acount, 1 / library_scaling_factor,
                       pseudo_count))

    # Calculate the FDR
    fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list)
    fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list)

    # Output the islands read counts, normalized read counts, fc, pvalue
    # both ways
    scaling_factor = 1000000
    header = [
        '#chrom', 'start', 'end', "Readcount_A", 'Normalized_Readcount_A',
        'ReadcountB', 'Normalized_Readcount_B', "Fc_A_vs_B", "pvalue_A_vs_B",
        "FDR_A_vs_B", "Fc_B_vs_A", "pvalue_B_vs_A", "FDR_B_vs_A"
    ]
    with open(opt.out_file, 'w') as out:
        out.write("\t".join(header) + "\n")
        ii = 0  # running row index into the p-value / FDR lists
        for chrom in island_chroms:
            island_list = islands[chrom]
            for index in range(len(island_list)):
                item = island_list[index]
                Acount = (island_A_readcount[chrom])[index]
                Bcount = (island_B_readcount[chrom])[index]
                normalized_A = Acount / float(A_library_size) * scaling_factor
                normalized_B = Bcount / float(B_library_size) * scaling_factor
                fc_A_vs_B = ((Acount + pseudo_count) * 1.0 /
                             (Bcount + pseudo_count)) / library_scaling_factor
                fc_B_vs_A = ((Bcount + pseudo_count) * 1.0 /
                             (Acount + pseudo_count)) * library_scaling_factor
                # NOTE(review): looks like leftover per-island debug output;
                # kept verbatim -- confirm whether it can be dropped.
                print("Acount", Acount, "Bcount", Bcount, "pseudo_count",
                      pseudo_count, "library_scaling_factor",
                      library_scaling_factor, "fc_A_vs_B", fc_A_vs_B,
                      "fc_B_vs_A", fc_B_vs_A)
                outline = item.chrom + "\t" + "\t".join([
                    str(item.start), str(item.end), str(Acount),
                    str(normalized_A), str(Bcount), str(normalized_B),
                    str(fc_A_vs_B), str(pvalue_A_vs_B_list[ii]),
                    str(fdr_A_vs_B_list[ii]), str(fc_B_vs_A),
                    str(pvalue_B_vs_A_list[ii]), str(fdr_B_vs_A_list[ii])
                ]) + "\n"
                out.write(outline)
                ii += 1

    # Calculate the correlations using normalized read counts
    A_array = ()
    B_array = ()
    for chrom in island_chroms:
        temp_array = scipy.array(island_A_readcount[chrom])
        A_array = scipy.concatenate((temp_array, A_array))
        temp_array = scipy.array(island_B_readcount[chrom])
        B_array = scipy.concatenate((temp_array, B_array))

    # Without islands A_array is still the empty tuple and cannot be scaled;
    # a correlation also needs at least two data points.
    if len(A_array) < 2:
        print("Fewer than two islands, correlations are not calculated")
        return

    # Normalization to reads per million
    A_array = A_array / float(A_library_size) * scaling_factor
    B_array = B_array / float(B_library_size) * scaling_factor
    pearson = scipy.stats.pearsonr(A_array, B_array)
    print("Pearson's correlation is:  %s  with p-value  %s" %
          (pearson[0], pearson[1]))
    spearman = scipy.stats.spearmanr(A_array, B_array)
    print("Spearman's correlation is:  %s  with p-value  %s" %
          (spearman[0], spearman[1]))
def get_read_count_on_genes(rawreadfile, fragment_size, knowngenefile, regiontype, promoter_upstream_extension, promoter_downstream_extension): """ This one provides an integrated module, where the chrom separation etc is done inside. Promoter and GeneBody are mutually exclusive. Promoter: TSS-upstreamextention, TSS+downstreamextension GeneBody: TSS+downstreamextension, TES ExonicRegion: exons of a gene taken together PromoterGenebody: Promoter + gene body. Return: a dictionary with key of gene name and value of read count """ knowngenes = UCSC_revised.KnownGenes(knowngenefile) chroms = knowngenes.keys() allowed_region_type = [ 'Promoter', 'GeneBody', 'PromoterGenebody', 'ExonicRegion' ] if regiontype not in allowed_region_type: print " The allowed region types are Promoter, GeneBody, PromoterGenebody and ExonicRegion. The region type is not recognized, exiting" sys.exit(1) if regiontype == 'Promoter': region_dic = knowngenes.getPromoters(promoter_upstream_extension, promoter_downstream_extension) if regiontype == 'GeneBody': region_dic = knowngenes.getGenebodys(promoter_downstream_extension) if regiontype == 'PromoterGenebody': region_dic = knowngenes.getPromotergenebodys( promoter_upstream_extension) libName = (rawreadfile).split('/')[-1] libName = libName.split('.')[0] extension = "-" + libName + ".bed1" if Utility_extended.fileExists(rawreadfile): SeparateByChrom.separateByChrom(chroms, rawreadfile, extension) else: print rawreadfile, " not found" sys.exit(1) genes = {} #for output for chrom in chroms: chrombed = chrom + extension if Utility_extended.fileExists(chrombed): gene_coords = knowngenes[chrom] if len(gene_coords) > 0: if regiontype == 'ExonicRegion': (gene_name_list, region_length_list, read_count_list) = get_read_count_on_exons( gene_coords, chrombed, fragment_size) else: (gene_name_list, region_length_list, read_count_list) = get_read_count_on_genic_regions( region_dic[chrom], chrombed, fragment_size) assert len(gene_name_list) == 
len(region_length_list) assert len(gene_name_list) == len(read_count_list) #RPKM = [0] * len(gene_name_list) for i in xrange(len(gene_name_list)): #if region_length_list[i] > 0: # RPKM[i] = read_count_list[i] / (region_length_list[i]/1000.0) / (totalcount/1000000.0) genes[gene_name_list[i]] = read_count_list[i] #outline = gene_name_list[i] + '\t' + str(read_count_list[i]) + '\t' + str(RPKM[i]) + '\n' #f.write(outline) SeparateByChrom.cleanup(chroms, extension) return genes
def main(argv):
    """Compare the read counts of two libraries (A and B) on a set of islands.

    Both read files are split by chromosome and every read is assigned to an
    island.  For each island the output file receives: raw and
    reads-per-million counts for A and B, the fold change with a pseudo count
    of 1 (scaled by the library size ratio), a p-value from ``pvaule`` and an
    FDR from ``fdr``, each in both directions (A vs B and B vs A).  Pearson
    and Spearman correlations of the normalized island counts are printed.

    argv: list of command-line arguments (options -s, -a, -b, -f, -d, -o).

    Exits with status 1 when fewer than 12 arguments are given, the species
    is unknown, a read file is missing, or a library contains no reads.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-a", "--rawreadfileA", action="store", type="string",
                      dest="readfileA", metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b", "--rawreadfileB", action="store", type="string",
                      dest="readfileB", metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    if not Utility.fileExists(opt.readfileA):
        print("%s  not found" % opt.readfileA)
        sys.exit(1)
    if not Utility.fileExists(opt.readfileB):
        print("%s  not found" % opt.readfileB)
        sys.exit(1)

    A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA)
    B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB)
    print("Library size of  %s :  %s" % (opt.readfileA, A_library_size))
    print("Library size of  %s :  %s" % (opt.readfileB, B_library_size))
    # Both sizes are used as divisors below (scaling factor and its inverse).
    if A_library_size == 0 or B_library_size == 0:
        print("Both libraries must contain at least one read, exiting")
        sys.exit(1)

    def count_reads_on_islands(read_file, starts, ends):
        """Return the per-island read counts for one per-chromosome file."""
        counts = [0] * len(starts)
        with open(read_file, 'r') as handle:
            for line in handle:
                if line.startswith("#"):
                    continue
                sline = line.strip().split()
                position = associate_tags_with_regions.tag_position(
                    sline, opt.fragment_size)
                index = associate_tags_with_regions.find_readcount_on_islands(
                    starts, ends, position)
                # A negative index means the read is on no island.
                if index >= 0:
                    counts[index] += 1
        return counts

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)
    # Chromosomes that actually carry islands, in species order; every later
    # pass iterates this list so the p-value/FDR lists stay aligned with the
    # output rows.
    island_chroms = [
        chrom for chrom in chroms
        if chrom in islands.keys() and len(islands[chrom]) != 0
    ]

    # separate by chrom the A library
    SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1')
    # separate by chrom the B library
    SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2')

    island_A_readcount = {}
    island_B_readcount = {}
    # Find read counts on the islands; temp files are removed even on error.
    try:
        for chrom in island_chroms:
            island_list = islands[chrom]
            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'))
            island_start_list = [item.start for item in island_list]
            island_end_list = [item.end for item in island_list]
            island_A_readcount[chrom] = count_reads_on_islands(
                chrom + ".bed1", island_start_list, island_end_list)
            island_B_readcount[chrom] = count_reads_on_islands(
                chrom + ".bed2", island_start_list, island_end_list)
    finally:
        SeparateByChrom.cleanup(chroms, '.bed1')
        SeparateByChrom.cleanup(chroms, '.bed2')

    totalA = sum(sum(counts) for counts in island_A_readcount.values())
    totalB = sum(sum(counts) for counts in island_B_readcount.values())
    print("Total number of A reads on islands is:  %s" % totalA)
    print("Total number of B reads on islands is:  %s" % totalB)

    # Calculate the p value.
    library_scaling_factor = A_library_size * 1.0 / B_library_size  # A vs B
    pseudo_count = 1
    pvalue_A_vs_B_list = []
    pvalue_B_vs_A_list = []
    for chrom in island_chroms:
        for Acount, Bcount in zip(island_A_readcount[chrom],
                                  island_B_readcount[chrom]):
            pvalue_A_vs_B_list.append(
                pvaule(Acount, Bcount, library_scaling_factor, pseudo_count))
            pvalue_B_vs_A_list.append(
                pvaule(Bcount, Acount, 1 / library_scaling_factor,
                       pseudo_count))

    # Calculate the FDR
    fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list)
    fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list)

    # Output the islands read counts, normalized read counts, fc, pvalue
    # both ways
    scaling_factor = 1000000
    header = [
        '#chrom', 'start', 'end', "Readcount_A", 'Normalized_Readcount_A',
        'ReadcountB', 'Normalized_Readcount_B', "Fc_A_vs_B", "pvalue_A_vs_B",
        "FDR_A_vs_B", "Fc_B_vs_A", "pvalue_B_vs_A", "FDR_B_vs_A"
    ]
    with open(opt.out_file, 'w') as out:
        out.write("\t".join(header) + "\n")
        ii = 0  # running row index into the p-value / FDR lists
        for chrom in island_chroms:
            island_list = islands[chrom]
            for index in range(len(island_list)):
                item = island_list[index]
                Acount = (island_A_readcount[chrom])[index]
                Bcount = (island_B_readcount[chrom])[index]
                normalized_A = Acount / float(A_library_size) * scaling_factor
                normalized_B = Bcount / float(B_library_size) * scaling_factor
                fc_A_vs_B = ((Acount + pseudo_count) * 1.0 /
                             (Bcount + pseudo_count)) / library_scaling_factor
                fc_B_vs_A = ((Bcount + pseudo_count) * 1.0 /
                             (Acount + pseudo_count)) * library_scaling_factor
                outline = item.chrom + "\t" + "\t".join([
                    str(item.start), str(item.end), str(Acount),
                    str(normalized_A), str(Bcount), str(normalized_B),
                    str(fc_A_vs_B), str(pvalue_A_vs_B_list[ii]),
                    str(fdr_A_vs_B_list[ii]), str(fc_B_vs_A),
                    str(pvalue_B_vs_A_list[ii]), str(fdr_B_vs_A_list[ii])
                ]) + "\n"
                out.write(outline)
                ii += 1

    # Calculate the correlations using normalized read counts
    A_array = ()
    B_array = ()
    for chrom in island_chroms:
        temp_array = scipy.array(island_A_readcount[chrom])
        A_array = scipy.concatenate((temp_array, A_array))
        temp_array = scipy.array(island_B_readcount[chrom])
        B_array = scipy.concatenate((temp_array, B_array))

    # Without islands A_array is still the empty tuple and cannot be scaled;
    # a correlation also needs at least two data points.
    if len(A_array) < 2:
        print("Fewer than two islands, correlations are not calculated")
        return

    # Normalization to reads per million
    A_array = A_array / float(A_library_size) * scaling_factor
    B_array = B_array / float(B_library_size) * scaling_factor
    pearson = scipy.stats.pearsonr(A_array, B_array)
    print("Pearson's correlation is:  %s  with p-value  %s" %
          (pearson[0], pearson[1]))
    spearman = scipy.stats.spearmanr(A_array, B_array)
    print("Spearman's correlation is:  %s  with p-value  %s" %
          (spearman[0], spearman[1]))
def main(argv):
    """Filter a summary graph with an island file, chromosome by chromosome.

    The summary graph file and the island file are each split by chromosome.
    For every chromosome that has both files, the windows returned by
    ``filter_out_uncovered_windows(islands, summary, window_size)`` are
    written to the output file as "chrom<TAB>start<TAB>end<TAB>value" lines.

    argv: list of command-line arguments (options -s, -a, -b, -o, -w).

    Exits with status 1 when fewer than 10 arguments are given or the
    species is unknown.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a", "--summarygraphfile", action="store",
                      type="string", dest="bedfile", metavar="<file>",
                      help="summary graph file")
    parser.add_option("-b", "--islandfile", action="store", type="string",
                      dest="islandbedfile", metavar="<file>",
                      help="island file")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="filtered summary graph file")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size",
                      help="window size of summary graph", metavar="<int>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in species_chroms.keys():
        chroms = species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    summary_graph_extension = ".summarygraph"
    island_extension = ".islands"
    SeparateByChrom.separateByChrom(chroms, opt.bedfile,
                                    summary_graph_extension)
    SeparateByChrom.separateByChrom(chroms, opt.islandbedfile,
                                    island_extension)

    # The finally clause removes the per-chromosome files and the with block
    # closes the output even when a chromosome fails to load or filter.
    try:
        with open(opt.out_file, 'w') as f:
            for chrom in chroms:
                summary_file = chrom + summary_graph_extension
                island_file = chrom + island_extension
                if not (Utility.fileExists(summary_file)
                        and Utility.fileExists(island_file)):
                    continue
                summary = BED.BED(opt.species, summary_file, "BED_GRAPH", 0)
                islands = BED.BED(opt.species, island_file, "BED_GRAPH", 0)
                result = filter_out_uncovered_windows(
                    islands[chrom], summary[chrom], opt.window_size)
                for item in result:
                    f.write(item.chrom + '\t' + str(item.start) + '\t' +
                            str(item.end) + '\t' + str(item.value) + '\n')
    finally:
        SeparateByChrom.cleanup(chroms, summary_graph_extension)
        SeparateByChrom.cleanup(chroms, island_extension)
def main(argv):
    """Build a per-location tag profile matrix around TSS, TES or TFBS sites.

    The tag file is split by chromosome; for every chromosome with both tags
    and locations, the matching ``get*PMProfileMatrix`` function produces one
    score vector per location.  Each vector is divided by
    (reads / 1e6) * (window_size / 1000) * norm and written to the output
    file as "<key><TAB>score<TAB>score...".  The average over all locations
    is plotted to ``Overall_profile.png``.

    argv: list of command-line arguments (options -k, -b, -c, -o, -n, -s,
        -u, -d, -r, -w, -p, -m).

    Exits with status 1 when fewer than 24 arguments are given, the species
    is unknown, or the site type is not TSS, TES or TFBS.
    """
    parser = OptionParser()
    parser.add_option("-k", "--known_genes_file", action="store",
                      type="string", dest="known_file",
                      help="file with known genes", metavar="<file>")
    parser.add_option("-b", "--bedfile", action="store", type="string",
                      dest="bedfile", help="file with tags in bed format",
                      metavar="<file>")
    parser.add_option("-c", "--TypeOfSites", action="store", type="string",
                      dest="type", help="TSS, TES, TFBS", metavar="<str>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="outfile name", metavar="<file>")
    parser.add_option(
        "-n", "--normalization", action="store", type="float", dest="norm",
        help="additional normalization in addition to number of reads per million and window_size per 1K")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species", metavar="<str>")
    parser.add_option("-u", "--UpstreamExtension", action="store",
                      type="int", dest="upstreamExtension",
                      help="UpstreamExtension", metavar="<int>")
    parser.add_option("-d", "--DownstreamExtension", action="store",
                      type="int", dest="downstreamExtension",
                      help="DownstreamExtension", metavar="<int>")
    parser.add_option("-r", "--resolution", action="store", type="int",
                      dest="resolution",
                      help="resolution of the profile, eg, 5",
                      metavar="<int>")
    parser.add_option(
        "-w", "--WindowSize", action="store", type="int", dest="window_size",
        help="window size for averaging. When window size > resolution, there is smoothing",
        metavar="<int>")
    parser.add_option("-p", "--plusReadShift", action="store", type="int",
                      dest="pshift", help="plusReadShift", metavar="<int>")
    parser.add_option("-m", "--minusReadShift", action="store", type="int",
                      dest="mshift", help="minusReadShift", metavar="<int>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 24:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # Reject an unknown site type before any temporary file is created.
    if opt.type not in ("TSS", "TES", "TFBS"):
        print("Only three types of locations are allowed: TSS, TES, TFBS")
        sys.exit(1)

    libName = (opt.bedfile).split('/')[-1]
    libName = libName.split('.')[0]
    extension = "-" + libName + '.bed1'
    SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)

    num_genes = 0
    num_tags = 0
    all_genes_scores = {}  # {name: [scores]}
    try:
        numPoints = float(opt.upstreamExtension +
                          opt.downstreamExtension) / float(opt.resolution)
        print("Upstream extension:  %s" % opt.upstreamExtension)
        print("Downstream extension:  %s" % opt.downstreamExtension)
        print("Resolution: %s" % opt.resolution)
        print("Scanning window size:  %s" % opt.window_size)
        print("Number of Points %s" % numPoints)

        # Pick the location source and the scoring function once; the
        # per-chromosome loop below is identical for all three site types.
        if opt.type == "TSS":
            coords = UCSC.KnownGenes(opt.known_file)
            get_profile_matrix = getTSSPMProfileMatrix
        elif opt.type == "TES":
            coords = UCSC.KnownGenes(opt.known_file)
            get_profile_matrix = getTESPMProfileMatrix
        else:
            # Build coords.  Here we are assuming that the file has the
            # format "chrom location ..." for each line: chrom is sline[0],
            # location is sline[1].
            coords = {}
            if opt.known_file:
                with open(opt.known_file, 'r') as infile:
                    for line in infile:
                        # check to make sure not a header line
                        if line.startswith("track"):
                            continue
                        sline = line.strip().split()
                        if not sline:
                            continue  # blank line
                        coords.setdefault(sline[0], []).append(int(sline[1]))
            get_profile_matrix = getTFBSPMProfileMatrix

        for chrom in chroms:
            chrombed = chrom + extension
            if not Utility.fileExists(chrombed):
                continue
            bed_vals = BED.BED(opt.species, chrombed, "BED2")
            num_tags += bed_vals.getNumVals()
            if (chrom in coords.keys()) and (len(coords[chrom]) > 0):
                num_genes += len(coords[chrom])
                mycoords = {chrom: coords[chrom]}
                scoredic = get_profile_matrix(
                    mycoords, opt.upstreamExtension, opt.downstreamExtension,
                    opt.resolution, opt.window_size, opt.pshift, opt.mshift,
                    bed_vals)
                all_genes_scores.update(scoredic)
    finally:
        SeparateByChrom.cleanup(chroms, extension)

    normalization = num_tags / 1000000.0
    normalization *= opt.window_size / 1000.0
    normalization *= opt.norm

    # export the normalized result to a file.
    with open(opt.outfile, 'w') as outFile:
        for mykey in all_genes_scores.keys():
            outline = str(mykey) + "\t" + "\t".join(
                [str(item / normalization)
                 for item in all_genes_scores[mykey]]) + '\n'
            outFile.write(outline)

    print("Number of locations:  %s" % num_genes)
    print("Number of reads:  %s" % num_tags)
    print("normalization =  %s" % normalization)

    # Testing
    num_points = int(numPoints)
    overall_profile = [0] * num_points
    for mykey in all_genes_scores.keys():
        assert (len(all_genes_scores[mykey]) == num_points)
        for j in range(num_points):
            overall_profile[j] += (all_genes_scores[mykey])[j] / normalization
    # With no scored location the profile stays all zeros rather than
    # dividing by zero.
    if num_genes > 0:
        overall_profile = [item / float(num_genes)
                           for item in overall_profile]

    pylab.clf()
    pylab.plot(overall_profile, "b")
    pylab.savefig("Overall_profile.png", format='png')
def main(argv):
    """Count how many islands are assigned to repetitive elements (REs).

    The island bed file is split by chromosome (unless the per-chromosome
    files already exist).  For every chromosome the islands are loaded as
    (start, end) tuples and passed, together with each RE annotation file
    named ``<class>_<family>_<name>.txt`` from the RE tree, to
    ``assign_islands_to_REs``; an island is counted as an RE island when any
    RE file flags it.  Per-chromosome and overall counts are printed.

    argv: list of command-line arguments (options -b, -t, -l, -u, -d, -s).

    Exits with status 1 when fewer than 12 arguments are given, the species
    is unknown, or the island bed file does not exist.
    """
    parser = OptionParser()
    parser.add_option("-b", "--bedfile", action="store", type="string",
                      dest="bedfile", metavar="<file>",
                      help="island bed file")
    parser.add_option("-t", "--RE_tree_pickle_file", action="store",
                      type="string", dest="RE_Tree", metavar="<file>",
                      help="file with RE tree in pickle format")
    parser.add_option("-l", "--RE_annotation_file_location", action="store",
                      type="string", dest="RE_file_location",
                      metavar="<file>",
                      help="location of RE files named in repClass_repFamily_repName.txt")
    parser.add_option("-u", "--upstream_extension", action="store",
                      type="int", dest="upstream_extension",
                      help="upstream extension from start", metavar="<int>")
    parser.add_option("-d", "--downstream_extension", action="store",
                      type="int", dest="downstream_extension",
                      help="downstream extension from end", metavar="<int>")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # Separate_by_chrom on bedfile
    lib_name = (opt.bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(opt.bedfile):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)
    else:
        print("%s  is not found" % opt.bedfile)
        sys.exit(1)

    print("\nLoad the RE tree to get the RE file names")
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use RE tree pickles from a trusted source.
    with open(opt.RE_Tree, 'rb') as tree_file:
        re_tree = pickle.load(tree_file)
    (numb_classes, numb_families,
     numb_names) = get_read_count_on_REs.numbers(re_tree)
    print("There are %d classes, %d family, and %d names." %
          (numb_classes, numb_families, numb_names))

    total_num_islands = 0
    total_num_RE_islands = 0
    min_re_length = 10

    # cycle through chrom
    for chrom in chroms:
        # Get the islands
        island_list = []
        print(chrom)
        chrom_length = chrom_lengths[chrom]
        chrombed = chrom + extension
        if Utility_extended.fileExists(chrombed):
            # load in each island
            with open(chrombed, 'r') as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    sline = line.strip().split()
                    island_list.append((int(sline[1]), int(sline[2])))
            if Utility_extended.is_tuplelist_sorted(island_list, 0) != 1:
                # sort by start, assume non-overlapping
                island_list.sort(key=lambda island: island[0])
        else:
            print("%s can not be found" % chrombed)

        island_flags = [0] * len(island_list)
        for reClass in re_tree.keys():
            for reFamily in re_tree[reClass].keys():
                for reName in re_tree[reClass][reFamily]:
                    re_file_name = "_".join([reClass, reFamily, reName
                                             ]) + ".txt"
                    this_island_flags = assign_islands_to_REs(
                        opt.RE_file_location, re_file_name, chrom,
                        chrom_length, island_list, opt.upstream_extension,
                        opt.downstream_extension, min_re_length)
                    # Collect the results into island_flags
                    for i, flag in enumerate(this_island_flags):
                        if flag == 1:
                            island_flags[i] = 1

        print("There are %d island on %s" % (len(island_list), chrom))
        print("There are %d RE islands" % (sum(island_flags)))
        total_num_islands += len(island_list)
        total_num_RE_islands += sum(island_flags)

    SeparateByChrom.cleanup(chroms, extension)
    print("There are %d islands" % (total_num_islands))
    print("There are %d RE islands" % (total_num_RE_islands))
def main(argv): desc = """This is a template for the analysis of aggretated tag distribution with respect to a set of points, such as the TSSs of known genes, with one profile from each strand.""" parser = OptionParser(description=desc) parser.add_option("-k", "--known_genes_file", action="store", type="string", dest="known_file", help="file with known genes", metavar="<file>") parser.add_option("-b", "--bedfile", action="store", type="string", dest="bedfile", help="file with tags in bed format", metavar="<file>") parser.add_option("-c", "--TypeOfSites", action="store", type="string", dest="type", help="TSS, TES, TFBS", metavar="<str>") parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>") parser.add_option( "-n", "--normalization", action="store", type="float", dest="norm", help= "additional normalization in addition to number of sites, number of reads per million and window_size per 1K" ) parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species", metavar="<str>") parser.add_option("-u", "--UpstreamExtension", action="store", type="int", dest="upstreamExtension", help="UpstreamExtension", metavar="<int>") parser.add_option("-d", "--DownstreamExtension", action="store", type="int", dest="downstreamExtension", help="DownstreamExtension", metavar="<int>") parser.add_option("-r", "--resolution", action="store", type="int", dest="resolution", help="resolution of the profile, eg, 5", metavar="<int>") parser.add_option( "-w", "--WindowSize", action="store", type="int", dest="window_size", help= "window size for averaging. 
When window size > resolution, there is smoothing", metavar="<int>") (opt, args) = parser.parse_args(argv) if len(argv) < 20: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) #t0 = time.time() libName = (opt.bedfile).split('/')[-1] libName = libName.split('.')[0] extension = "-" + libName + '.bed1' SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension) num_genes = 0 num_tags = 0 profiles = {} numPoints = float(opt.upstreamExtension + opt.downstreamExtension) / float( opt.resolution) print "Upstream extension: ", opt.upstreamExtension print "Downstream extension: ", opt.downstreamExtension print "Resolution:", opt.resolution print "Scanning window size: ", opt.window_size print "Number of Points", numPoints plus_score_profile = [0] * int(numPoints) minus_score_profile = [0] * int(numPoints) if (opt.type == "TSS"): coords = UCSC.KnownGenes(opt.known_file) for chrom in chroms: mycoords = {} chrombed = chrom + extension if Utility.fileExists(chrombed): bed_vals = {} bed_vals = BED.BED(opt.species, chrombed, "BED2") num_tags += bed_vals.getNumVals() if (chrom in coords.keys()): if (len(coords[chrom]) > 0): num_genes += len(coords[chrom]) mycoords[chrom] = coords[chrom] profiles[chrom] = getTSSProfile( mycoords, opt.upstreamExtension, opt.downstreamExtension, opt.resolution, opt.window_size, 75, 75, bed_vals) elif (opt.type == "TES"): coords = UCSC.KnownGenes(opt.known_file) for chrom in chroms: mycoords = {} chrombed = chrom + extension if Utility.fileExists(chrombed): bed_vals = {} bed_vals = BED.BED(opt.species, chrombed, "BED2") num_tags += bed_vals.getNumVals() if (chrom in coords.keys()): if (len(coords[chrom]) > 0): num_genes += len(coords[chrom]) mycoords[chrom] = coords[chrom] profiles[chrom] = getTESProfile( mycoords, opt.upstreamExtension, 
opt.downstreamExtension, opt.resolution, opt.window_size, 75, 75, bed_vals) elif (opt.type == "TFBS"): # Build coords # Here we are assuming that the file has the format chrom location + .....for each line # chrom is sline[0], location is sline[1] coords = {} if (opt.known_file): infile = open(opt.known_file, 'r') for line in infile: """ check to make sure not a header line """ if not re.match("track", line): line = line.strip() sline = line.split() if sline[0] not in coords.keys(): coords[sline[0]] = [] coords[sline[0]].append(atoi(sline[1])) infile.close() for chrom in chroms: mycoords = {} chrombed = chrom + extension if Utility.fileExists(chrombed): bed_vals = {} bed_vals = BED.BED(opt.species, chrombed, "BED2") num_tags += bed_vals.getNumVals() if chrom in coords.keys(): if len(coords[chrom]) > 0: num_genes += len(coords[chrom]) mycoords[chrom] = coords[chrom] profiles[chrom] = getTFBSProfile( mycoords, opt.upstreamExtension, opt.downstreamExtension, opt.resolution, opt.window_size, 75, 75, bed_vals) else: print "Only three types of locations are allowed: TSS, TES, TFBS" sys.exit(1) for chrom in profiles.keys(): (plus_scores, minus_scores) = profiles[chrom] assert (int(numPoints) == len(plus_scores)) assert (int(numPoints) == len(minus_scores)) for i in xrange(int(numPoints)): plus_score_profile[i] += plus_scores[i] minus_score_profile[i] += minus_scores[i] SeparateByChrom.cleanup(chroms, extension) normalization = num_tags / 1000000.0 normalization *= num_genes normalization *= opt.window_size / 1000.0 normalization *= opt.norm print "Number of locations: ", num_genes print "Number of reads: ", num_tags print "Normalization is by total number of reads per million. normalization = ", normalization xValues = output(opt.upstreamExtension, opt.resolution, plus_score_profile, minus_score_profile, normalization, opt.outfile)
def main(argv): ''' Coarse graining test chr1, input must only have chr1 ''' parser = OptionParser() parser.add_option("-s", "--species", action="store", type="string", dest="species", help="mm8, hg18, background, etc", metavar="<str>") parser.add_option("-b", "--summarygraph", action="store",type="string", dest="summarygraph", help="summarygraph", metavar="<file>") parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>") parser.add_option("-g", "--graining_size", action="store", type="int", dest="step", help="graining unit size (>0)", metavar="<int>") parser.add_option("-e", "--score", action="store", type="int", dest="score", help="graining criterion, 0<score<=graining_size", metavar="<int>") parser.add_option("-t", "--mappable_faction_of_genome_size", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>") parser.add_option("-f", "--output_file", action="store", type="string", dest="out_file", help="output file name", metavar="<file>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) print "Coarse-graining approach to identify ChIP-Seq enriched domains:" if opt.species in GenomeData.species_chroms.keys(): print "Species: ", opt.species; print "Window_size: ", opt.window_size; print "Coarse graining step: ", opt.step; print "Coarse graining score:", opt.score; chroms = GenomeData.species_chroms[opt.species] total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph); print "Total read count:", total_read_count genome_length = sum (GenomeData.species_chrom_lengths[opt.species].values()); genome_length = int(opt.fraction * genome_length); average = float(total_read_count) * opt.window_size/genome_length; print "Effective genome length: ", genome_length; print "window average:", average; min_tags_in_window = int(average) + 1 print "Minimum read count in a qualified 
window: ", min_tags_in_window print "Generate preprocessed data list"; #read in the summary graph file bed_val = BED.BED(opt.species, opt.summarygraph, "BED_GRAPH"); #generate the probscore summary graph file, only care about enrichment for chrom in chroms: if chrom in bed_val.keys() and len(bed_val[chrom]) > 0: chrom_length = GenomeData.species_chrom_lengths[opt.species][chrom] eligible_start_list = [] for index in xrange(len(bed_val[chrom])): read_count = bed_val[chrom][index].value; if read_count >= min_tags_in_window: eligible_start_list.append(bed_val[chrom][index].start) print "Coarse graining:"; (result_list, island_list) = coarsegraining(eligible_start_list, opt.window_size, opt.step, opt.score, chrom_length) print "Trace back...", len(island_list) islands = traceback(island_list, opt.window_size, opt.step, 0, chrom_length, chrom) print len(islands), "islands found in", chrom f = open(chrom + ".islandstemp", 'w') for i in range(0, len(islands)): f.write(chrom + '\t' + str(int(islands[i].start)) + '\t' + str(int(islands[i].end)) + '\t1\n') f.close() o = open(opt.out_file, 'w') o.write('track type=bedGraph name=' + opt.out_file + '\n') o.close() SeparateByChrom.combineAllGraphFiles(chroms, ".islandstemp", opt.out_file) SeparateByChrom.cleanup(chroms, ".islandstemp") #else: #print "input data error!" else: print "This species is not in my list!";
def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms, fragment_size, totalcount, out_file=None): """ entrez_genes is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object return: all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]} all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]} all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]} all_summary = {} # {entrezID:{attribute:value}} (summary[entrez_id])["merged_exons_rc"] = merged_exons_rc (summary[entrez_id])["merged_exon_RPKM"] = merged_exon_RPKM (summary[entrez_id])["merged_exons_total_length"] = merged_exons_total_length (summary[entrez_id])["shared_exons_rc"] = shared_exons_rc (summary[entrez_id])["shared_exon_RPKM"] = shared_exon_RPKM (summary[entrez_id])["shared_exons_total_length"] = shared_exons_total_length (summary[entrez_id])["shared_introns_rc"] = shared_introns_rc (summary[entrez_id])["shared_intron_RPKM"] = shared_intron_RPKM (summary[entrez_id])["shared_introns_total_length"] = shared_introns_total_length (summary[entrez_id])["merged_transcript_rc"] = merged_transcript_rc (summary[entrez_id])["merged_transcript_RPKM"] = merged_transcript_RPKM (summary[entrez_id])["merged_transcript_length"] = merged_transcript_length """ lib_name = (bedfile).split('/')[-1] # remove directory suffix = lib_name.split('.')[-1] # txt lib_name = lib_name.split('.')[0] extension = "-" + lib_name + '.' + suffix + "1" if Utility_extended.fileExists(bedfile): if Utility_extended.chrom_files_exist(chroms, extension) != 1: # Separate by chrom and sort by start print chroms, extension, " files do not exist, separate by chroms. 
" Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension, [column_index]) else: print bedfile, " is not found" sys.exit(1) all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]} all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]} all_reads_on_merged_transcripts = { } #{entrezID:[((start, end), read_count)]} all_summary = {} # {entrezID:{attributes}} for chrom in chroms: chrombed = chrom + extension if chrom in entrez_genes.chroms: entrez_genes_by_chrom = Entrez.KnownEntrezGenes( [chrom], entrez_genes.subset_by_chrom(chrom)) (reads_on_shared_exons, reads_on_shared_introns, reads_on_merged_transcripts, summary) = calculateExonIntrons_by_chrom(entrez_genes_by_chrom, chrombed, fragment_size, totalcount, out_file) #if chrom == chroms[0]: #myid = reads_on_shared_exons.keys()[0] #test(entrez_genes_by_chrom, reads_on_shared_introns, myid) all_reads_on_shared_exons.update(reads_on_shared_exons) all_reads_on_shared_introns.update(reads_on_shared_introns) all_reads_on_merged_transcripts.update(reads_on_merged_transcripts) all_summary.update(summary) print len(all_summary.keys()) SeparateByChrom.cleanup(chroms, extension) return (all_reads_on_shared_exons, all_reads_on_shared_introns, all_reads_on_merged_transcripts, all_summary)
def main(argv): parser = OptionParser() parser.add_option("-a", "--islandfile1", action="store", type="string", dest="islandfile1", metavar="<file>", help="file 1 with islands info to be compared") parser.add_option("-b", "--islandfile2", action="store", type="string", dest="islandfile2", metavar="<file>", help="file 2 with islands info to be compared") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8 or hg18", metavar="<str>") parser.add_option("-p", "--overlapin1", action="store", type="string", dest="overlapin1", metavar="<file>", help="file for islands in 1 overlapping with islands in 2") parser.add_option("-q", "--nonoverlapin1", action="store", type="string", dest="nonoverlapin1", help="file for islands in 1 not overlapping with islands in 2 ", metavar="<file>") (opt, args) = parser.parse_args(argv) if len(argv) < 10: parser.print_help() sys.exit(1) if opt.species in species_chroms.keys(): chroms = species_chroms[opt.species]; else: print "This species is not recognized, exiting"; sys.exit(1); total_overlap_number_1 = 0 total_islands_1 = 0 SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1') SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2') for chrom in chroms: f = open(chrom + '.1in2', 'w') g = open(chrom + '.1notin2', 'w') bed_vals_2 = BED.BED(opt.species, chrom+'.island2', "BED3", 0) if Utility.fileExists(chrom+'.island1') and len(bed_vals_2[chrom])>0: islandlist2 = bed_vals_2[chrom]; if (are_islands_sorted(islandlist2) != 1): islandlist2.sort(key=operator.attrgetter('start')); (island2_start_list, island2_end_list) = union_islands(islandlist2) islands1 = open(chrom+'.island1', 'r') for line in islands1: if not re.match("#", line): total_islands_1 += 1 line = line.strip() sline = line.split() start = int(sline[1]) end = int(sline[2]) if (region_overlap(start, end, island2_start_list, island2_end_list) == 1): f.write('\t'.join(sline) + '\n') total_overlap_number_1 += 1; 
else: g.write('\t'.join(sline) + '\n'); elif Utility.fileExists(chrom+'.island1') and (len(bed_vals_2[chrom])==0): islands1 = open(chrom+'.island1', 'r') for line in islands1: if not re.match("#", line): total_islands_1 += 1 line = line.strip() sline = line.split() g.write('\t'.join(sline) + '\n'); f.close() g.close() print "total number of island in "+opt.islandfile1+": ", total_islands_1; print "total number of island in "+opt.overlapin1+": ", total_overlap_number_1; SeparateByChrom.combineAllGraphFiles(chroms, '.1in2', opt.overlapin1); SeparateByChrom.combineAllGraphFiles(chroms, '.1notin2', opt.nonoverlapin1); SeparateByChrom.cleanup(chroms, '.1in2') SeparateByChrom.cleanup(chroms, '.1notin2') SeparateByChrom.cleanup(chroms, '.island1') SeparateByChrom.cleanup(chroms, '.island2')
def main(argv): parser = OptionParser() parser.add_option("-k", "--known_gene_file", action="store", type="string", dest="genefile", help="file with known gene info", metavar="<file>") parser.add_option("-b", "--bedfile", action="store", type="string", dest="bedfile", help="file with tags in bed format", metavar="<file>") parser.add_option("-c", "--TypeOfSites", action="store", type="string", dest="type", help="GENE, ISLAND", metavar="<str>") parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species", metavar="<str>") parser.add_option("-u", "--UpstreamExtension", action="store", type="int", dest="upstreamExtension", help="UpstreamExtension", metavar="<int>") parser.add_option("-d", "--DownstreamExtension", action="store", type="int", dest="downstreamExtension", help="DownstreamExtension", metavar="<int>") parser.add_option( "-r", "--resolution", action="store", type="int", dest="resolution", help="resolution of the upstream and downstream profile, eg, 5", metavar="<int>") parser.add_option( "-w", "--WindowSize", action="store", type="int", dest="window_size", help= "window size for averaging for the upstream and downstream profile. 
When window size > resolution, there is smoothing", metavar="<int>") parser.add_option("-g", "--genicPartition", action="store", type="int", dest="genicPartition", help="genicPartition, eg, 20", metavar="<int>") parser.add_option("-p", "--plusReadShift", action="store", type="int", dest="pshift", help="plusReadShift", metavar="<int>") parser.add_option("-m", "--minusReadShift", action="store", type="int", dest="mshift", help="minusReadShift", metavar="<int>") (opt, args) = parser.parse_args(argv) if len(argv) < 24: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) #t0 = time.time() libName = (opt.bedfile).split('/')[-1] libName = libName.split('.')[0] extension = "-" + libName + '.bed1' SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension) num_genes = 0 num_tags = 0 if (opt.upstreamExtension % opt.resolution != 0): print "Please choose the resolution commensurate with the length of the upstream region" sys.exit(1) if (opt.downstreamExtension % opt.resolution != 0): print "Please choose the resolution commensurate with the length of the downstream region" sys.exit(1) upstreamNumPoints = opt.upstreamExtension / opt.resolution all_gene_scores = {} print "Species: ", opt.species print "Upstream extension: ", opt.upstreamExtension print "Downstream extension: ", opt.downstreamExtension print "Upstream and Downstream resolution:", opt.resolution print "Upstream and Downstream Scanning window size: ", opt.window_size print "Genic partition: ", opt.genicPartition print "Plus reads shift: ", opt.pshift print "Minus reads shift: ", opt.mshift if opt.type == "GENE": coords = UCSC.KnownGenes(opt.genefile) elif opt.type == "ISLAND": # Build coords in the mode of a pseudo ucsc file, all pseudo genes are in the positive direction # Here we are assuming that the 
file has the format chrom start end + .....for each line # chrom is sline[0], start is sline[1], end is sline[2] strand = '+' coords = {} index = 0 infile = open(opt.genefile, 'r') for line in infile: """ check to make sure not a header line """ if not re.match("track", line): index += 1 line = line.strip() sline = line.split() if sline[0] not in coords.keys(): coords[sline[0]] = [] name = "Island" + str(index) chrom = sline[0] txStart = atoi(sline[1]) txEnd = atoi(sline[2]) # (name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds) mycoord = UCSC.UCSC(name, chrom, strand, txStart, txEnd, txStart, txEnd, 0, '0', '0') coords[chrom].append(mycoord) infile.close() else: print "Only two types of locations are allowed: GENE, ISLAND" sys.exit(1) normalization = num_tags / 1000000.0 minimum_genic_resolution = 10 old_num_genes = getNumGenes(coords) coords = findEligibleGenes( coords, chroms, opt.genicPartition, minimum_genic_resolution) # no longer a knowngene object num_genes = getNumGenes(coords) print num_genes - new_num_genes, " genes whose length does not support minimal genic resolution of ", minimum_genic_resolution, " or are on exotic chroms, are discarded" print "Number of " + opt.type + ": ", num_genes print "Number of reads: ", num_tags for chrom in chroms: chrombed = chrom + extension if Utility.fileExists(chrombed): bed_vals = {} bed_vals = BED_revised.BED(opt.species, chrombed, "BED2") num_tags += bed_vals.getNumVals() if (chrom in coords.keys()): if (len(coords[chrom]) > 0): mycoords = {} scoredic_upstream = {} scoredic_downstream = {} scoredic_genebody = {} mycoords[chrom] = coords[chrom] scoredic_upstream = GenerateProfileMatrixAroundLocations.getTSSPMProfileMatrix( mycoords, opt.upstreamExtension, 0, opt.resolution, opt.window_size, opt.pshift, opt.mshift, bed_vals) scoredic_downstream = GenerateProfileMatrixAroundLocations.getTESPMProfileMatrix( mycoords, 0, opt.downstreamExtension, opt.resolution, opt.window_size, 
opt.pshift, opt.mshift, bed_vals) scoredic_genebody = getGeneBodyProfileMatrix( mycoords, opt.genicPartition, opt.pshift, opt.mshift, bed_vals, minimum_genic_resolution) myid_set = list( set(scoredic_upstream.keys()) & set(scoredic_downstream.keys()) & set(scoredic_genebody.keys())) chrom_gene_scores = {} count_normalization = float(opt.window_size) / 1000.0 for myid in myid_set: chrom_gene_scores[myid] = [ item / count_normalization for item in scoredic_upstream[myid] ] + scoredic_genebody.keys[mykey] + [ item / count_normalization for item in scoredic_downstream[mykey] ] all_genes_scores.update(chrom_gene_scores) # Save in a file outFile = open(opt.outfile, 'w') for mykey in xrange(len(all_genes_scores.keys())): outline = mykey + "\t" + "\t".join( [str(item / normalization) for item in all_genes_scores[mykey]]) + '\n' outFile.write(outline) outFile.close() #test totalPoints = upstreamNumPoints + opt.genicPartition + downstreamNumPoints half_partition = int(opt.resolution / 2.0) upstreamXcoordinates = [0.0] * upstreamNumPoints for i in xrange(upstreamNumPoints): upstreamXcoordinates[ i] = -1.0 * opt.upstreamExtension + half_partition + i * opt.resolution downstreamXcoordinates = [0.0] * downstreamNumPoints for i in xrange(downstreamNumPoints): downstreamXcoordinates[i] = half_partition + i * opt.resolution genebodyXcoordinates = [0.0] * opt.genicPartition for i in xrange(opt.genicPartition): genebodyXcoordinates[i] = float((i + 1)) / opt.genicPartition overallXcoordinates = upstreamXcoordinates + genebodyXcoordinates + downstreamXcoordinates overall_score_profile = [0] * overallXcoordinates for mykey in xrange(len(all_genes_scores.keys())): assert (len(overallXcoordinates) == len(all_genes_scores[mykey])) for i in xrange(overallXcoordinates): overall_score_profile[i] += (all_genes_scores[mykey])[i] #Plot it out xcords = [0] * len(overallXcoordinates) for i in xrange(len(xcords)): xcords[i] = i libName = (opt.bedfile).split('/')[-1] libName = libName.split('.')[0] 
annotationName = (opt.genefile).split('/')[-1] annotationName = annotationName.split('.')[0] title = libName + " on " + annotationName legend = "" GenerateAroundRegions.plot_profile(opt.upstreamExtension, downstreamExtension, opt.resolution, opt.genicPartition, xcords, overall_score_profile, 0, title, legend, opt.outfile + '_plot.eps') SeparateByChrom.cleanup(chroms, extension)
def calculate_non_strandspecific_rc_on_ExonIntrons(entrez_genes, bedfile, chroms, fragment_size): """ entrez_genes is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object return: all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]} all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]} all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]} all_summary = {} # {entrezID:{attribute:value}} (summary[entrez_id])["merged_exons_rc"] = merged_exons_rc (summary[entrez_id])["merged_exon_RPKM"] = merged_exon_RPKM (summary[entrez_id])["merged_exons_total_length"] = merged_exons_total_length (summary[entrez_id])["shared_exons_rc"] = shared_exons_rc (summary[entrez_id])["shared_exon_RPKM"] = shared_exon_RPKM (summary[entrez_id])["shared_exons_total_length"] = shared_exons_total_length (summary[entrez_id])["shared_introns_rc"] = shared_introns_rc (summary[entrez_id])["shared_intron_RPKM"] = shared_intron_RPKM (summary[entrez_id])["shared_introns_total_length"] = shared_introns_total_length (summary[entrez_id])["merged_transcript_rc"] = merged_transcript_rc (summary[entrez_id])["merged_transcript_RPKM"] = merged_transcript_RPKM (summary[entrez_id])["merged_transcript_length"] = merged_transcript_length """ lib_name = (bedfile).split('/')[-1] # remove directory suffix = lib_name.split('.')[-1] # txt lib_name = lib_name.split('.')[0] extension = "-" + lib_name + '.' + suffix + "1" if Utility_extended.fileExists(bedfile): p_file_name = bedfile + "_P" n_file_name = bedfile + "_n" Utility_extended.separate_by_strand( bedfile, p_file_name, n_file_name ) #partition the bed file into reads in positive strand and negative strand ##################################################################3 #The column numbers are 1 based instead of 0 based! 
#For positive strand start_index_P = 2 #For negative strand start_index_N = 3 ##################################################################3 p_totalcount = get_total_tag_counts.get_total_tag_counts(p_file_name) ( forward_reads_on_shared_exons, forward_reads_on_shared_introns, forward_reads_on_merged_transcripts, forward_summary ) = get_strandspecific_read_count_on_ExonsIntrons.calculateExonIntrons( entrez_genes, p_file_name, start_index_P, chroms, fragment_size, p_totalcount, None) n_totalcount = get_total_tag_counts.get_total_tag_counts(n_file_name) ( reverse_reads_on_shared_exons, reverse_reads_on_shared_introns, reverse_reads_on_merged_transcripts, reverse_summary ) = get_strandspecific_read_count_on_ExonsIntrons.calculateExonIntrons( entrez_genes, n_file_name, start_index_N, chroms, fragment_size, n_totalcount, None) all_reads_on_shared_exons = { } # {entrezID:[((start, end), read_count)]} all_reads_on_shared_introns = { } # {entrezID:[((start, end), read_count)]} all_reads_on_merged_transcripts = { } #{entrezID:[((start, end), read_count)]} all_summary = {} # {entrezID:{attributes}} all_reads_on_shared_exons = combine_rc(forward_reads_on_shared_exons, reverse_reads_on_shared_exons) all_reads_on_shared_introns = combine_rc( forward_reads_on_shared_introns, reverse_reads_on_shared_introns) all_reads_on_merged_transcripts = combine_rc( forward_reads_on_merged_transcripts, reverse_reads_on_merged_transcripts) all_summary = combine_summary(forward_summary, reverse_summary, p_totalcount, n_totalcount) SeparateByChrom.cleanup(chroms, extension) return (all_reads_on_shared_exons, all_reads_on_shared_introns, all_reads_on_merged_transcripts, all_summary)
def main(argv): parser = OptionParser() parser.add_option("-b", "--bedfile", action="store", type="string", dest="bedfile", metavar="<file>", help="ChIP seq read file") parser.add_option( "-f", "--fragment_size", action="store", type="int", dest="fragment_size", help= "fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps", metavar="<int>") parser.add_option("-t", "--RE_tree_pickle_file", action="store", type="string", dest="RE_Tree", metavar="<file>", help="file with RE tree in pickle format") parser.add_option( "-l", "--RE_annotation_file_location", action="store", type="string", dest="RE_file_location", metavar="<file>", help="location of RE files named in repClass_repFamily_repName.txt") parser.add_option("-u", "--upstream_extension", action="store", type="int", dest="upstream_extension", help="upstream extension from start", metavar="<int>") parser.add_option("-d", "--downstream_extension", action="store", type="int", dest="downstream_extension", help="downstream extension from end", metavar="<int>") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-n", "--feature_name", action="store", type="string", dest="feature_name", help="name of the library", metavar="<str>") (opt, args) = parser.parse_args(argv) if len(argv) < 16: parser.print_help() sys.exit(1) startTime = time.time() if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) total_count = get_total_tag_counts.get_total_tag_counts(opt.bedfile) #Separate_by_chrom on bedfile lib_name = (opt.bedfile).split('/')[-1] # remove directory suffix = lib_name.split('.')[-1] # txt lib_name = lib_name.split('.')[0] extension = "-" + lib_name + '.' 
+ suffix + "1" if Utility_extended.fileExists(opt.bedfile): if Utility_extended.chrom_files_exist(chroms, extension) != 1: SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension) else: print bedfile, " is not found" sys.exit(1) #load the RE tree to get the RE file names re_tree = pickle.load(open(opt.RE_Tree, 'rb')) (numb_classes, numb_families, numb_names) = numbers(re_tree) print "There are %d classes, %d family, and %d names." % ( numb_classes, numb_families, numb_names) #Prepare the summary read_counts = {} for reClass in re_tree.keys(): read_counts[reClass] = {} for reFamily in re_tree[reClass].keys(): read_counts[reClass][reFamily] = {} for reName in re_tree[reClass][reFamily]: read_counts[reClass][reFamily][reName] = {} #cycle through chrom for chrom in chroms: print chrom chrom_length = chrom_lengths[chrom] chrombed = chrom + extension if Utility_extended.fileExists(chrombed): # load in each read and shift tag_position_list = [] inf = open(chrombed, 'r') for line in inf: if not re.match("#", line): line = line.strip() sline = line.split() tag_position_list.append( associate_tags_with_regions.tag_position( sline, opt.fragment_size)) inf.close() if not Utility_extended.is_list_sorted(tag_position_list): tag_position_list.sort() #[tag_positions] min_re_length = 10 for reClass in re_tree.keys(): for reFamily in re_tree[reClass].keys(): for reName in re_tree[reClass][reFamily]: re_file_name = "_".join([reClass, reFamily, reName ]) + ".txt" #{id:{feature_name:value}} rc_dic = get_read_count( opt.RE_file_location, re_file_name, opt.feature_name, chrom, chrom_length, tag_position_list, total_count, opt.upstream_extension, opt.downstream_extension, min_re_length) # id is unique and updated only once, so this should be ok read_counts[reClass][reFamily][reName].update(rc_dic) #{reClass:{reFamily:{reName:{id:feature_name, value}}}} #feature_name include: feature_name + "_rc", feature_name + "_rpkm" #output_file_name = feature_name + "_on_" + "mm9_rmsk.pkl" 
#output = open(output_file_name, 'wb') #pickle.dump(read_counts, output) #output.close() #instead of outputing a huge one, let's output many small pieces breakdown_and_output(read_counts, opt.feature_name) repClass = 'LTR' repFamily = 'ERV1' repName = 'RLTR4_Mm' outfile_name = lib_name + "_on_" + "_".join([repClass, repFamily, repName ]) + ".dat" test(read_counts, repClass, repFamily, repName, outfile_name) SeparateByChrom.cleanup(chroms, extension) print "it took", time.time() - startTime, "seconds."
def main(argv):
    """
    Turn the reads of a BAM file into a combined summary graph file.

    The chromosomes are taken from the BAM file itself
    (SeparateByChrom.getChromsFromBam), so the --species option is parsed
    but not used.  The reads are written to per-chromosome '.bed' files,
    makeGraphFile() is run on them with the window size and fragment size
    (both parsed as ints), the per-chromosome '.graph' files are combined
    into opt.outfile, and the temporary '.bed' and '.graph' files are
    removed.
    """
    option_table = (
        (("-s", "--species"), dict(action="store", type="string", dest="species", help="mm8,hg18,dm2,etc", metavar="<str>")),
        (("-b", "--bed_file"), dict(action="store", type="string", dest="bamfile", help="bed file to make graph file of", metavar="<file>")),
        (("-w", "--window_size"), dict(action="store", type="int", dest="window_size", help="window size", metavar="<int>")),
        (("-i", "--fragment_size"), dict(action="store", type="int", dest="fragment_size", help="size of fragments after CHIP experiment", metavar="<int>")),
        (("-o", "--outfile"), dict(action="store", type="string", dest="outfile", help="output bed summary file name", metavar="<file>")),
    )
    parser = OptionParser()
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)
    (opt, args) = parser.parse_args(argv)

    # No argument-count or species check is performed here: the chromosome
    # set comes from the BAM file.
    chrom_info = SeparateByChrom.getChromsFromBam(opt.bamfile)

    SeparateByChrom.separateByChromBamToBed(chrom_info.keys(), opt.bamfile, '.bed')
    makeGraphFile(chrom_info.keys(), chrom_info, opt.window_size, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chrom_info.keys(), ".graph", opt.outfile)

    for temp_suffix in (".bed", ".graph"):
        SeparateByChrom.cleanup(chrom_info.keys(), temp_suffix)
def main(argv): parser = OptionParser() parser.add_option("-a", "--islandfile1", action="store", type="string", dest="islandfile1", metavar="<file>", help="file 1 with islands info to be compared") parser.add_option("-b", "--islandfile2", action="store", type="string", dest="islandfile2", metavar="<file>", help="file 2 with islands info to be compared") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8 or hg18", metavar="<str>") parser.add_option( "-p", "--overlapin1", action="store", type="string", dest="overlapin1", metavar="<file>", help="file for islands in 1 overlapping with islands in 2") parser.add_option( "-q", "--nonoverlapin1", action="store", type="string", dest="nonoverlapin1", help="file for islands in 1 not overlapping with islands in 2 ", metavar="<file>") (opt, args) = parser.parse_args(argv) if len(argv) < 10: parser.print_help() sys.exit(1) if opt.species in species_chroms.keys(): chroms = species_chroms[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) total_overlap_number_1 = 0 total_islands_1 = 0 SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1') SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2') for chrom in chroms: f = open(chrom + '.1in2', 'w') g = open(chrom + '.1notin2', 'w') bed_vals_2 = BED.BED(opt.species, chrom + '.island2', "BED3", 0) if Utility.fileExists(chrom + '.island1') and len(bed_vals_2[chrom]) > 0: islandlist2 = bed_vals_2[chrom] if (are_islands_sorted(islandlist2) != 1): islandlist2.sort(key=operator.attrgetter('start')) (island2_start_list, island2_end_list) = union_islands(islandlist2) islands1 = open(chrom + '.island1', 'r') for line in islands1: if not re.match("#", line): total_islands_1 += 1 line = line.strip() sline = line.split() start = int(sline[1]) end = int(sline[2]) if (region_overlap(start, end, island2_start_list, island2_end_list) == 1): f.write('\t'.join(sline) + '\n') total_overlap_number_1 
+= 1 else: g.write('\t'.join(sline) + '\n') elif Utility.fileExists(chrom + '.island1') and (len(bed_vals_2[chrom]) == 0): islands1 = open(chrom + '.island1', 'r') for line in islands1: if not re.match("#", line): total_islands_1 += 1 line = line.strip() sline = line.split() g.write('\t'.join(sline) + '\n') f.close() g.close() print "total number of island in " + opt.islandfile1 + ": ", total_islands_1 print "total number of island in " + opt.overlapin1 + ": ", total_overlap_number_1 SeparateByChrom.combineAllGraphFiles(chroms, '.1in2', opt.overlapin1) SeparateByChrom.combineAllGraphFiles(chroms, '.1notin2', opt.nonoverlapin1) SeparateByChrom.cleanup(chroms, '.1in2') SeparateByChrom.cleanup(chroms, '.1notin2') SeparateByChrom.cleanup(chroms, '.island1') SeparateByChrom.cleanup(chroms, '.island2')
def main(argv): parser = OptionParser() parser.add_option("-a", "--islandfile1", action="store", type="string", dest="islandfile1", metavar="<file>", help="file 1 with islands info to be compared") parser.add_option("-b", "--islandfile2", action="store", type="string", dest="islandfile2", metavar="<file>", help="file 2 with islands info to be compared") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8 or hg18", metavar="<str>") parser.add_option( "-p", "--overlapin1", action="store", type="string", dest="overlapin1", metavar="<file>", help="file for islands in 1 overlapping with islands in 2") parser.add_option( "-q", "--nonoverlapin1", action="store", type="string", dest="nonoverlapin1", help="file for islands in 1 not overlapping with islands in 2 ", metavar="<file>") (opt, args) = parser.parse_args(argv) if len(argv) < 10: parser.print_help() sys.exit(1) if opt.species in species_chroms.keys(): chroms = species_chroms[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) total_overlap_number_1 = 0 total_islands_1 = 0 SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1') SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2') for chrom in chroms: f = open(chrom + '.1in2', 'w') g = open(chrom + '.1notin2', 'w') bed_vals_1 = BED.BED(opt.species, chrom + '.island1', "BED_GRAPH", 0) bed_vals_2 = BED.BED(opt.species, chrom + '.island2', "BED_GRAPH", 0) if len(bed_vals_1[chrom]) > 0 and len(bed_vals_2[chrom]) > 0: islandlist1 = bed_vals_1[chrom] islandlist2 = bed_vals_2[chrom] total_islands_1 += len(bed_vals_1[chrom]) for islandlist1_item in islandlist1: start = islandlist1_item.start end = islandlist1_item.end if (region_overlap(start, end, islandlist2) == 1): write(islandlist1_item, f) total_overlap_number_1 += 1 else: write(islandlist1_item, g) elif (len(bed_vals_1[chrom]) > 0) and (len(bed_vals_2[chrom]) == 0): total_islands_1 += len(bed_vals_1[chrom]) for 
islandlist1_item in bed_vals_1[chrom]: write(islandlist1_item, g) f.close() g.close() print "total number of island in " + opt.islandfile1 + ": ", total_islands_1 print "total number of island in " + opt.overlapin1 + ": ", total_overlap_number_1 SeparateByChrom.combineAllGraphFiles(chroms, '.1in2', opt.overlapin1) SeparateByChrom.combineAllGraphFiles(chroms, '.1notin2', opt.nonoverlapin1) SeparateByChrom.cleanup(chroms, '.1in2') SeparateByChrom.cleanup(chroms, '.1notin2') SeparateByChrom.cleanup(chroms, '.island1') SeparateByChrom.cleanup(chroms, '.island2')
def main(argv): parser = OptionParser() parser.add_option( "-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>" ) parser.add_option( "-a", "--rawreadfile", action="store", type="string", dest="readfile", metavar="<file>", help="raw read file in bed format", ) parser.add_option( "-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after CHIP experiment", ) parser.add_option( "-b", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format", ) parser.add_option( "-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count file", ) (opt, args) = parser.parse_args(argv) if len(argv) < 10: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) islands = BED.BED(opt.species, opt.islandfile, "BED3", 0) if Utility.fileExists(opt.readfile): SeparateByChrom.separateByChrom(chroms, opt.readfile, ".bed1") else: print opt.readfile, " not found" sys.exit(1) total = 0 library_size = get_total_tag_counts.get_total_tag_counts(opt.readfile) scaling_factor = 1000000 out = open(opt.out_file, "w") for chrom in chroms: if chrom in islands.keys(): island_list = islands[chrom] island_readcount_list = [0] * len(island_list) if Utility.is_bed_sorted(island_list) == 0: island_list.sort(key=operator.attrgetter("start")) island_start_list = [] island_end_list = [] for item in island_list: island_start_list.append(item.start) island_end_list.append(item.end) read_file = chrom + ".bed1" f = open(read_file, "r") for line in f: if not re.match("#", line): line = line.strip() sline = line.split() position = tag_position(sline, opt.fragment_size) index = find_readcount_on_islands(island_start_list, island_end_list, position) if 
index >= 0: island_readcount_list[index] += 1 total += 1 f.close() for index in xrange(len(island_list)): item = island_list[index] normalized_read_count = island_readcount_list[index] / float(library_size) * scaling_factor outline = ( item.chrom + "\t" + str(item.start) + "\t" + str(item.end) + "\t" + str(island_readcount_list[index]) + "\t" + str(normalized_read_count) + "\n" ) out.write(outline) SeparateByChrom.cleanup(chroms, ".bed1") out.close() print "Total number of reads on islands are: ", total
def main(argv): parser = OptionParser() parser.add_option("-b", "--bedfile", action="store", type="string", dest="bedFile", metavar="<file>", help="ChIP seq read file") parser.add_option( "-f", "--fragment_size", action="store", type="int", dest="fragment_size", help= "fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps", metavar="<int>") parser.add_option("-g", "--known_genes_file", action="store", type="string", dest="known_genes", metavar="<file>", help="file with known genes in UCSC format") parser.add_option( "-r", "--'Promoter' or 'GeneBody' or 'PromoterGenebody' or 'ExonicRegion'", action="store", type="string", dest="region_type", metavar="<str>", help="region to count tags in") parser.add_option("-u", "--promoter_upstream_extension", action="store", type="int", dest="promoter_upstream_extension", help="upstream extension of promoter region from TSS", metavar="<int>") parser.add_option("-d", "--promoter_downstream_extension", action="store", type="int", dest="promoter_downstream_extension", help="downstream extension of promoter region from TSS", metavar="<int>") parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="output file name for genes and tag numbers") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) startTime = time.time() known_genes = UCSC_revised.KnownGenes(opt.known_genes) chroms = known_genes.keys() #Promoter and GeneBody are mutually exclusive. #Promoter: TSS-upstreamextention, TSS+downstreamextension #GeneBody: TSS+downstreamextension, TES #PromoterGenebody: TSS-upstreamextention, TES. allowed_region_type = [ 'Promoter', 'GeneBody', 'PromoterGenebody', 'ExonicRegion' ] if opt.region_type not in allowed_region_type: print " The allowed region types are Promoter, GeneBody, PromoterGenebody and ExonicRegion. 
The region type is not recognized, exiting" sys.exit(1) if opt.region_type == 'Promoter': region_dic = known_genes.getPromoters( opt.promoter_upstream_extension, opt.promoter_downstream_extension) elif opt.region_type == 'GeneBody': region_dic = known_genes.getGenebodys( opt.promoter_downstream_extension) elif opt.region_type == 'PromoterGenebody': region_dic = known_genes.getPromotergenebodys( opt.promoter_upstream_extension) libName = (opt.bedFile).split('/')[-1] libName = libName.split('.')[0] extension = "-" + libName + '.bed1' if Utility_extended.fileExists(opt.bedFile): SeparateByChrom.separateByChrom(chroms, opt.bedFile, extension) else: print opt.bedFile, " not found" sys.exit(1) totalcount = get_total_tag_counts.get_total_tag_counts(opt.bedFile) f = open(opt.out_file, 'w') outline = "# GeneName" + '\t' + "Read Count" + '\t' + "RPKM" + '\n' f.write(outline) for chrom in chroms: chrombed = chrom + extension if Utility_extended.fileExists(chrombed): gene_coords = known_genes[chrom] if len(gene_coords) > 0: if opt.region_type == 'ExonicRegion': (gene_name_list, region_length_list, read_count_list) = get_read_count_on_exons( gene_coords, chrombed, opt.fragment_size) else: (gene_name_list, region_length_list, read_count_list) = get_read_count_on_genic_regions( region_dic[chrom], chrombed, opt.fragment_size) #test_get_read_count_on_genic_regions("AAAS", gene_name_list, region_length_list, read_count_list) #test_get_read_count_on_genic_regions("AACS", gene_name_list, region_length_list, read_count_list) assert len(gene_name_list) == len(region_length_list) assert len(gene_name_list) == len(read_count_list) RPKM = [0] * len(gene_name_list) for i in xrange(len(gene_name_list)): if region_length_list[i] > 0: RPKM[i] = read_count_list[i] / (region_length_list[i] / 1000.0) / (totalcount / 1000000.0) outline = gene_name_list[i] + '\t' + str( read_count_list[i]) + '\t' + str(RPKM[i]) + '\n' f.write(outline) f.close() SeparateByChrom.cleanup(chroms, extension) print "it 
took", time.time() - startTime, "seconds."
def main(argv):
    """
    Filter a BAM file, optionally removing redundant reads.

    The chromosome set is read from the BAM file itself, so -s/--species is
    parsed but not consulted.  Reads are split into per-chromosome '.bed1'
    files, applying the optional -f/-F/-q read filters.  When the threshold
    is greater than 0, strand_broken_remove is run on every chromosome file
    that exists and the resulting '.bed2' files are combined into the output;
    otherwise the '.bed1' files are combined directly.  Both sets of
    intermediate files are cleaned up at the end.

    Prints the help text and exits with status 1 when fewer than 8 argv
    entries are given.
    """
    parser = OptionParser()
    # (short flag, long flag, type, dest, help, metavar)
    for short_flag, long_flag, value_type, dest, help_text, metavar in (
            ("-s", "--species", "string", "species",
             "species under consideration", "<str>"),
            ("-b", "--raw_bam_file", "string", "bam_file",
             "raw bam file", "<file>"),
            ("-t", "--threshold", "int", "threshold",
             "threshold for copy number", "<int>"),
            ("-o", "--output_file_name", "string", "out_file",
             "output file name", "<file>")):
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, help=help_text,
                          metavar=metavar)

    ## Options to filter reads; dest is derived from the long option name.
    for short_flag, long_flag, help_text in (
            ("-f", "--requiredFlag",
             "Required bit in sam flag. Same as samtools view -f"),
            ("-F", "--filterFlag",
             "Filter out bit in sam flag, Same as samtools view -F"),
            ("-q", "--mapq",
             "minimum mapq for a read to be kept")):
        parser.add_option(short_flag, long_flag, type='int', help=help_text)

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    chroms = SeparateByChrom.getChromsFromBam(opt.bam_file)

    SeparateByChrom.separateByChromBamToBed(chroms, opt.bam_file, '.bed1',
                                            requiredFlag=opt.requiredFlag,
                                            filterFlag=opt.filterFlag,
                                            mapq=opt.mapq)

    # Without a positive threshold the split files are recombined untouched.
    combined_suffix = '.bed1'
    if opt.threshold > 0:
        for chrom in chroms:
            if Utility.fileExists(chrom + ".bed1"):
                strand_broken_remove(chrom, opt.threshold)
        combined_suffix = '.bed2'

    SeparateByChrom.combineAllGraphFilesBedToBam(chroms, combined_suffix,
                                                 opt.bam_file, opt.out_file)

    for suffix in ('.bed1', '.bed2'):
        SeparateByChrom.cleanup(chroms, suffix)
def main(argv): parser = OptionParser() parser.add_option("-k", "--known_gene_file", action="store", type="string", dest="genefile", help="file with known gene info", metavar="<file>") parser.add_option("-b", "--bedfile", action="store", type="string", dest="bedfile", help="file with tags in bed format", metavar="<file>") parser.add_option("-n", "--name", action="store", type="string", dest="name", help="name for plotting", metavar="<str>") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species", metavar="<str>") parser.add_option("-c", "--number", action="store", type="int", dest="exonnumber", help="number of exons", metavar="<int>") parser.add_option("-f", "--fragmentsize", action="store", type="int", dest="fragment_size", help="fragment size", metavar="<int>") (opt, args) = parser.parse_args(argv) if len(argv) < 12: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species]; chrom_length = GenomeData.species_chrom_lengths[opt.species]; else: print "This species is not recognized, exiting"; sys.exit(1); gene_coords = UCSC.KnownGenes(opt.genefile); #separate by chrom if Utility.fileExists(opt.bedfile): SeparateByChrom.separateByChrom(chroms, opt.bedfile, '.bed1'); else: print opt.beddfile, " not found"; sys.exit(1) total_exon_sums = [0]*opt.exonnumber; total_intron_sums = [0]*opt.exonnumber; total_exon_sizes = [0]*opt.exonnumber; total_intron_sizes = [0]*opt.exonnumber; total_num_tags = 0; for chrom in chroms: read_file = chrom + ".bed1"; bed_vals = BED.BED(opt.species, read_file, "BED6", 0); total_num_tags += bed_vals.getNumVals(); (exon_counts, intron_counts, exon_seq_sizes, intron_seq_sizes) = getExonIntronDensities(gene_coords, bed_vals, opt.exonnumber, opt.fragment_size); for j in range(opt.exonnumber): total_exon_sums[j] += exon_counts[j]; total_intron_sums[j] += intron_counts[j]; total_exon_sizes[j] += exon_seq_sizes[j]; total_intron_sizes[j] 
+= intron_seq_sizes[j]; """ print everything out to a file """ outfilename = '%s-exon-intron-scores' % opt.name; outFile = open(outfilename, 'w'); for j in range(len(exon_counts)): exon_density = float(total_exon_sums[j]) / float(total_exon_sizes[j]); intron_density = float(total_intron_sums[j]) / float(total_intron_sizes[j]); exon_density /= float(total_num_tags); intron_density /= float(total_num_tags); outline = str(j+1) + " " + str(exon_density) + " " + str(intron_density) + "\n"; outFile.write(outline); outFile.close(); SeparateByChrom.cleanup(chroms,'.bed1');
def main(argv): parser = OptionParser() parser.add_option("-a", "--islandfile1", action="store", type="string", dest="islandfile1", metavar="<file>", help="file 1 with islands info to be unioned") parser.add_option( "-b", "--islandfile2", action="store", type="string", dest="islandfile2", metavar="<file>", help="file 2 with islands info to be unioned; if no, type in any word") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8 or hg18", metavar="<str>") parser.add_option("-o", "--outputfile", action="store", type="string", dest="outfile", metavar="<file>", help="output file name") (opt, args) = parser.parse_args(argv) if len(argv) < 8: parser.print_help() sys.exit(1) if opt.species in species_chroms.keys(): chroms = species_chroms[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1') if Utility.fileExists(opt.islandfile2): SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2') for chrom in chroms: f = open(chrom + '.output', 'w') bed_vals_1 = BED.BED(opt.species, chrom + '.island1', "BED3", 0) bed_vals_2 = BED.BED(opt.species, chrom + '.island2', "BED3", 0) if len(bed_vals_1[chrom]) > 0 or len(bed_vals_2[chrom]) > 0: islandlist = bed_vals_1[chrom] + bed_vals_2[chrom] union_islands_to_file(islandlist, f) f.close() SeparateByChrom.cleanup(chroms, '.island2') else: for chrom in chroms: f = open(chrom + '.output', 'w') bed_vals_1 = BED.BED(opt.species, chrom + '.island1', "BED3", 0) if len(bed_vals_1[chrom]) > 0: islandlist = bed_vals_1[chrom] union_islands_to_file(islandlist, f) f.close() SeparateByChrom.combineAllGraphFiles(chroms, '.output', opt.outfile) SeparateByChrom.cleanup(chroms, '.output') SeparateByChrom.cleanup(chroms, '.island1')
def main(argv): parser = OptionParser() parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>") parser.add_option("-a", "--rawchipreadfile", action="store", type="string", dest="chipreadfile", metavar="<file>", help="raw read file from chip in bed format") parser.add_option("-b", "--rawcontrolreadfile", action="store", type="string", dest="controlreadfile", metavar="<file>", help="raw read file from control in BAM format") parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after CHIP experiment") parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format") parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file") parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species]; genomesize = sum (GenomeData.species_chrom_lengths[opt.species].values()); genomesize = opt.fraction * genomesize; else: print "This species is not recognized, exiting"; sys.exit(1); chip_library_size=get_total_tag_counts.get_total_tag_counts_bam(opt.chipreadfile); control_library_size=get_total_tag_counts.get_total_tag_counts_bam(opt.controlreadfile); print "chip library size ", chip_library_size print "control library size ", control_library_size totalchip = 0; totalcontrol = 0; islands = BED.BED(opt.species, opt.islandfile, "BED3", 0); # separate by chrom the chip library if Utility.fileExists(opt.chipreadfile): SeparateByChrom.separateByChromBamToBed(chroms, opt.chipreadfile, 
'.bed1'); else: print opt.chipreadfile, " not found"; sys.exit(1) # separate by chrom the control library if Utility.fileExists(opt.controlreadfile): SeparateByChrom.separateByChromBamToBed(chroms, opt.controlreadfile, '.bed2'); else: print opt.controlreadfile, " not found"; sys.exit(1) island_chip_readcount = {}; island_control_readcount = {}; for chrom in chroms: if chrom in islands.keys(): if len(islands[chrom]) != 0: island_list = islands[chrom]; if Utility.is_bed_sorted(island_list) == 0: island_list.sort(key=operator.attrgetter('start')); island_start_list = [] island_end_list = [] for item in island_list: island_start_list.append(item.start) island_end_list.append(item.end) island_chip_readcount_list=[0]*len(island_list); read_file = chrom + ".bed1"; f = open(read_file,'r') for line in f: if not re.match("#", line): line = line.strip() sline = line.split() position = associate_tags_with_regions.tag_position(sline, opt.fragment_size) index =associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position); if index >= 0: island_chip_readcount_list[index] += 1; totalchip += 1; f.close(); island_chip_readcount[chrom] = island_chip_readcount_list; island_control_readcount_list=[0]*len(island_list); read_file = chrom + ".bed2"; f = open(read_file,'r') for line in f: if not re.match("#", line): line = line.strip() sline = line.split() position = associate_tags_with_regions.tag_position(sline, opt.fragment_size) index = associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position); if index >= 0: island_control_readcount_list[index] += 1; totalcontrol += 1; f.close(); island_control_readcount[chrom] = island_control_readcount_list; chip_background_read = chip_library_size - totalchip; control_background_read = control_library_size - totalcontrol; #scaling_factor = chip_background_read*1.0/control_background_read; scaling_factor = chip_library_size*1.0/control_library_size; print "Total number 
of chip reads on islands is: ", totalchip; print "Total number of control reads on islands is: ", totalcontrol; #print "chip_background_read ", chip_background_read #print "control_background_read ", control_background_read out = open(opt.out_file, 'w'); pvalue_list = []; result_list = []; for chrom in chroms: if chrom in islands.keys(): if len(islands[chrom]) != 0: island_list = islands[chrom]; for index in xrange(len(island_list)): item = island_list[index]; observation = (island_chip_readcount[chrom])[index]; control_tag = (island_control_readcount[chrom])[index]; if (island_control_readcount[chrom])[index] > 0: #average = (island_control_readcount[chrom])[index] * scaling_factor; average = control_tag * scaling_factor fc = float(observation)/float(average); else: length = item.end - item.start + 1; average = length * control_library_size *1.0/genomesize; average = min(0.25, average)* scaling_factor; fc = float(observation)/float(average); if observation > average: pvalue = scipy.stats.poisson.sf((island_chip_readcount[chrom])[index], average)[()]; else: pvalue = 1; pvalue_list.append(pvalue); item_dic = {} item_dic['chrom'] = item.chrom item_dic['start'] = item.start item_dic['end'] = item.end item_dic['chip'] = observation item_dic['control'] = control_tag item_dic['pvalue'] = pvalue item_dic['fc'] = fc result_list.append(item_dic) pvaluearray=scipy.array(pvalue_list); pvaluerankarray=scipy.stats.rankdata(pvaluearray); totalnumber = len(result_list); for i in range(totalnumber): item = result_list[i]; alpha = pvalue_list[i] * totalnumber/pvaluerankarray[i]; if alpha > 1: alpha = 1; outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n"; out.write(outline); #pvalue_list.sort() #for item in result_list: #pvalue = float(item['pvalue']) #alpha = pvalue * len(result_list) / (pvalue_list.index(pvalue) + 1) 
#if alpha > 1: #alpha = 1; #outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n"; #out.write(outline); out.close(); SeparateByChrom.cleanup(chroms, '.bed1'); SeparateByChrom.cleanup(chroms, '.bed2');
def AssignPeaksToEntrez3UTRs(entrez_genes, peakfile, chroms, chrom_lengths, peak_threshold, downstream_extension):
    """
    Collect, for each Entrez gene, the peaks lying on its representative 3UTR.

    The peak file is separated by chromosome into temporary files; these are
    removed with SeparateByChrom.cleanup before returning.  For each gene the
    transcript returned by identify_transcript_with_longest_3UTR() defines the
    region: cdsEnd .. txEnd + downstream_extension (capped at the chromosome
    length) on the plus strand, txStart - downstream_extension (floored at 0)
    .. cdsStart on the minus strand.  Only peaks on the matching strand whose
    value in column 10 (0-based) is >= peak_threshold are considered.

    Exits with status 1 if peakfile does not exist.

    Returns {entrez_id:(gene, ThreeUTR_length, peaks_on_3UTR)}
    gene: gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
    ThreeUTR_length: end - start + 1 of the region above, so the length includes the downstream extension
    peaks_on_3UTR: [(location, read_count)]

    NOTE(review): the nesting below was reconstructed from a flattened copy of
    this function; in particular the gene loop is assumed to sit inside the
    "peak file exists" check -- confirm against the original layout.
    """

    peaks_on_entrez_3UTRs = {} #store the peaks for each 3UTR of the entrez cluster. {Entrez_ID: (gene, ThreeUTR_length, peaks_on_3UTR)}

    if Utility_extended.fileExists(peakfile):
        # Read the peaks, which is assumed to have the pseudo ucsc format
        # The per-chromosome extension is "-<name>.<suffix>1", built from the
        # peak file's base name and its last dot-separated component.
        island_libName1 = (peakfile).split('/')[-1]
        island_suffix1 = island_libName1.split('.')[-1]
        island_libName1 = island_libName1.split('.')[0]
        island_extension1 = "-" + island_libName1 + '.' + island_suffix1 + "1"
        SeparateByChrom.separateByChrom(chroms, peakfile, island_extension1)
    else:
        print peakfile, " is not found";
        sys.exit(1)

    for chrom in chroms:
        if chrom in entrez_genes.chroms:
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
            this_chrom_length = chrom_lengths[chrom]
            # Load in the PA peak information
            if Utility_extended.fileExists(chrom + island_extension1):
                inf = open(chrom + island_extension1, 'r')
                # Read in the peaks and separate the forward strand peaks and the reverse strand peaks
                five_peaks = [] # peaks on forward strand, element (location, read_count)
                three_peaks = [] # peaks on reverse strand, element (location, read_count)
                for line in inf:
                    line = line.strip();
                    sline = line.split();
                    strand = sline[2]
                    # Plus-strand peaks take their location from column 3,
                    # minus-strand peaks from column 4; the value compared
                    # with peak_threshold is column 10 in both cases.
                    if plus.match(strand):
                        if float(sline[10]) >= peak_threshold:
                            five_peaks.append ((int(sline[3]), float(sline[10])))
                    elif minus.match(strand):
                        if float(sline[10]) >= peak_threshold:
                            three_peaks.append ((int(sline[4]), float(sline[10])))
                five_peaks = sorted(five_peaks, key = itemgetter(0)) #sort according to location
                five_peaks_location = [item[0] for item in five_peaks]
                three_peaks = sorted(three_peaks, key = itemgetter(0))
                three_peaks_location = [item[0] for item in three_peaks]
                inf.close()

                for entrez_id in entrez_genes_by_chrom.entrez_ids:
                    gene = entrez_genes_by_chrom.entrez_genes[entrez_id] # an EntrezGene class object
                    # For the set of transcripts, use the longest 3UTR at the designated representative 3UTR
                    transcript_with_longest_3UTR = gene.identify_transcript_with_longest_3UTR() # a UCSC class object

                    # NOTE(review): the two strand tests are independent `if`s;
                    # a strand matching neither leaves start/end/peaks_on_3UTR
                    # at whatever the previous gene set (or undefined).
                    if plus.match(transcript_with_longest_3UTR.strand):
                        start = transcript_with_longest_3UTR.cdsEnd
                        end = min(transcript_with_longest_3UTR.txEnd + downstream_extension, this_chrom_length)
                        # Sorted locations allow a binary search for the
                        # inclusive [start, end] window.
                        start_ind = bisect.bisect_left(five_peaks_location, start);
                        end_ind = bisect.bisect_right(five_peaks_location, end);
                        peaks_on_3UTR = five_peaks[start_ind: end_ind] #[(mode_location, readcount)]
                    if minus.match(transcript_with_longest_3UTR.strand):
                        start = max(transcript_with_longest_3UTR.txStart - downstream_extension, 0)
                        end = transcript_with_longest_3UTR.cdsStart
                        start_ind = bisect.bisect_left(three_peaks_location, start);
                        end_ind = bisect.bisect_right(three_peaks_location, end);
                        peaks_on_3UTR = three_peaks[start_ind: end_ind]

                    ThreeUTR_length = end - start + 1 #length includes the downstream extension
                    peaks_on_entrez_3UTRs[entrez_id] = (gene, ThreeUTR_length, peaks_on_3UTR)

    SeparateByChrom.cleanup(chroms, island_extension1)
    return peaks_on_entrez_3UTRs
def get_read_count_on_exons(genefile, bedfile, species, output): gene_coords = UCSC.KnownGenes(genefile) bed_vals = my_BED.Starts(species, bedfile) #print bed_vals.keys() --- only those chroms in species hg18 stored in my_BED.py num_tags = bed_vals.getNumVals() print num_tags if Utility.fileExists(bedfile): SeparateByChrom.separateByChrom(bed_vals.keys(), bedfile, '.bed1') else: print bedfile, " not found" sys.exit(1) outFile = open(output, 'w') tag_starts = [] exon_sums = 0 exon_sizes = 0 exon_density = 0 RPKM = 0 for chrom in gene_coords.keys(): if chrom in bed_vals.keys(): #print chrom bed_vals = my_BED.Starts(species, chrom + '.bed1') tag_starts = bed_vals[chrom] #print len(tag_starts) tag_starts.sort() for g in gene_coords[chrom]: if len(tag_starts) > 0: #print 'tag_start_length='+str(len(tag_starts)) #print 'exonCount='+ str(g.exonCount) exon_Starts = g.exonStarts.split(',') exon_Ends = g.exonEnds.split(',') assert len(exon_Starts) == len(exon_Ends) if g.exonCount > 0: exon_sums = 0 exon_sizes = 0 if plus.match(g.strand): for i in range(0, int(g.exonCount)): exon_sums += countTagsInWindow( int(exon_Starts[i]), int(exon_Ends[i]), tag_starts) exon_sizes += abs( int(exon_Ends[i]) - int(exon_Starts[i])) ## exon_density is per mappedreads(million) * exonlength(kb), edgeR will divide this by mappedreads exon_density = (float(exon_sums) / float(exon_sizes)) RPKM = (exon_density / float(num_tags)) * 1000 * 1000000 #print exon_density elif minus.match(g.strand): for i in range(0, int(g.exonCount)): exon_sums += countTagsInWindow( int(exon_Starts[-2 - i]), int(exon_Ends[-2 - i]), tag_starts) exon_sizes += abs( int(exon_Ends[-2 - i]) - int(exon_Starts[-2 - i])) exon_density = (float(exon_sums) / float(exon_sizes)) RPKM = (exon_density / float(num_tags)) * 1000 * 1000000 #print exon_density print g.name, exon_sums, exon_sizes, exon_density, RPKM else: print g.name, exon_sums, exon_sizes, exon_density, RPKM outline = str(g.name) + "\t" + str(RPKM) + "\n" #outline = 
str(g.name) + "\t" + str(exon_sums) + "\t" + str(exon_sizes)+"\t" + str(exon_density)+"\t" + str(RPKM) + "\n" outFile.write(outline) outFile.close() SeparateByChrom.cleanup(bed_vals.keys(), '.bed1') return 0