def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks.

    Peaks are sorted by descending score (5th tab-separated column).  For
    every motif file, every z-score listed in the ``motifs/motif_zscores``
    config entry and every motif, the running fraction of peaks that contain
    at least one motif hit is computed, written to disk and plotted.

    Args:
        in_files: sequence whose first item is the peaks file path and whose
            remaining items are motif file paths.  A motif file is tried as
            a pickle first and otherwise read as ``name<TAB>consensus`` lines.
        out_patterns: unused; kept so the call signature stays unchanged.
        in_prefix: first part of every output file name.
        in_suffix: second part of every output file name.

    For each z-score ``Z`` these files are written, all prefixed with
    ``in_prefix + in_suffix``:
        ``.zZ.peak_motif_presence``                  running fraction per motif
        ``.zZ.peak_motif_locations``                 one line per peak
        ``.zZ.peak_motif_locations.bed``             one line per motif hit
        ``.zZ.peak_motif_presence.png``              plot over all peaks
        ``.zZ.top10percent.peak_motif_presence.png`` plot over the first 10%

    Raises:
        ValueError: if a peak line has a non-numeric score column.
    """
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_base = in_prefix + in_suffix
    out_summary = out_base + '.%s.peak_motif_presence'
    out_png = out_base + '.%s.peak_motif_presence.png'
    out_locations = out_base + '.%s.peak_motif_locations'
    out_locations_bed = out_base + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))

    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    try:
        peaks = _read_peaks_sorted_by_score(in_peaks)
        # NOTE(review): hits accumulate in this mapping across z-scores and
        # motif files, so later location files also list earlier hits --
        # confirm whether that is intended.
        motifs_in_peaks = dict((tuple(p), defaultdict(list)) for p in peaks)
        # NOTE(review): output names do not contain the motif file name, so
        # each motif file overwrites the outputs of the previous one.
        for m_file in in_motifs:
            m_file_short = re.sub(
                r'((treat|fastq|fastq_illumina|min_qual|bowtie|' +
                r'maq|peaks|with_mean_sd|discovered|' +
                r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|[0-9]+_around|small_sample)\.)+(motifs\.*)*',
                '', m_file)
            cur_motifs = _load_motif_file(m_file)
            all_motif_percent = {}
            for zscore in cfg.get('motifs', 'motif_zscores').strip().split(','):
                for name, pwm in cur_motifs.items():
                    all_motif_percent[name] = _running_motif_presence(
                        wb_genome, peaks, name, pwm, zscore, motifs_in_peaks)
                # having calculated for all motifs in this file,
                # plot a figure and give a summary
                tag = 'z' + zscore
                with open(out_summary % tag, 'w') as outfile:
                    outfile.writelines(
                        '%s\t%s\n' % (name, percent)
                        for name, percent in all_motif_percent.items())
                _write_peak_motif_locations(
                    out_locations % tag, out_locations_bed % tag, peaks,
                    sorted(cur_motifs), motifs_in_peaks)
                _plot_motif_presence(all_motif_percent, out_png, tag,
                                     m_file_short, in_peaks)
    finally:
        # always restore the global matplotlib setting, even on failure
        matplotlib.rcParams['font.size'] = old_size


def _read_peaks_sorted_by_score(in_peaks):
    """Return peak rows (lists of str fields) sorted by descending score.

    The file is read once and closed.  The diagnostic output of the original
    implementation (file name, first line, list of scores) is preserved.
    """
    print(in_peaks)
    with open(in_peaks) as peak_file:
        lines = peak_file.readlines()
    print(lines[0] if lines else '')
    rows = [line.strip().split('\t') for line in lines]
    scores = []
    for line, row in zip(lines, rows):
        try:
            scores.append(float(row[4]))
        except ValueError:
            # report the offending line, then let the error propagate
            print('here is the error! %s' % line.strip())
            raise
    print(scores)
    return sorted(rows, key=lambda row: float(row[4]), reverse=True)


def _load_motif_file(m_file):
    """Load motifs from a pickle, or from ``name<TAB>consensus`` text lines."""
    motifs = {}
    with open(m_file) as infile:
        try:
            # SECURITY: unpickling runs arbitrary code; use trusted files only
            motifs.update(pickle.load(infile))
        except Exception:
            # not a usable pickle: re-read the file as consensus text
            infile.seek(0)
            for line in infile:
                name, consensus = line.strip('\n').split('\t')
                motifs[name] = sequence_motif.makePWMFromIUPAC(consensus)
    return motifs


def _running_motif_presence(genome, peaks, name, pwm, zscore, motifs_in_peaks):
    """Return the running fraction of peaks with a hit; record hit locations.

    ``zscore`` is the raw config string; it is converted inside the ``try``
    so a conversion failure is logged like any other search failure.
    """
    with_motif = 0
    percent_with = []  # percent with motif at each peak
    for total, p in enumerate(peaks):
        chrom, start, stop = p[0], int(p[1]), int(p[2])
        region = genome[chrom][start:stop]
        # extend peaks to at least pwm length
        while len(region) < len(pwm):
            region = genome[chrom][region.start - 5:region.stop + 5]
            # catch nasty infinite loops for very short scaffolds
            if len(region) == len(genome[chrom]):
                break
        # check if the motif occurs in the region
        try:
            hits = list(pwm.find_in_region(region, zscore=float(zscore)))
        except Exception as e:
            # placeholders are required: extra logging args are %-formatted
            log.debug('issue with sequence %s %s %s', repr(region), name, e)
            hits = []
        if hits:
            with_motif += 1
            # NOTE(review): hits are offset by the original peak start even
            # when the region was extended above -- confirm hit coordinates.
            motifs_in_peaks[tuple(p)][name].extend(
                (h[0] + start, h[1] + start, '+' if h[2] == 1 else '-')
                for h in hits)
        percent_with.append(float(with_motif) / (total + 1))
    return percent_with


def _write_peak_motif_locations(locations_path, bed_path, peaks, motif_names,
                                motifs_in_peaks):
    """Write the per-peak motif table and the per-hit bed file."""
    with open(locations_path, 'w') as outfile:
        with open(bed_path, 'w') as out_bed:
            # header is 6 columns of peak info, then motif info
            outfile.write('\t'.join(['p_chrom', 'p_start', 'p_stop',
                                     'p_name', 'p_score', 'p_strand']))
            for motif_name in motif_names:
                outfile.write('\t%s\t#instances_%s' % (motif_name, motif_name))
            outfile.write('\n')
            # one line per peak, then the motif counts and instances;
            # instances for each motif are all in one column
            for p in peaks:
                outfile.write('\t'.join(map(str, p)))
                for motif_name in motif_names:
                    hits = motifs_in_peaks[tuple(p)][motif_name]
                    outfile.write('\t%s\t%s' % (len(hits), hits))
                    for h in hits:
                        out_bed.write('\t'.join(map(str, [
                            p[0], h[0], h[1], motif_name, 1000, h[2]
                        ])) + '\n')
                outfile.write('\n')


def _plot_motif_presence(all_motif_percent, out_png, tag, m_file_short,
                         in_peaks):
    """Save the full and the top-10% running-presence plots."""
    ordered = sorted(all_motif_percent.items())
    names = [k for k, _ in ordered]
    datapoints = numpy.array([v for _, v in ordered]).T
    # plot original data
    pyplot.plot(datapoints)
    pyplot.legend(names)
    pyplot.title('Motifs from\n%s\nPresence in\n%s' % (m_file_short, in_peaks))
    pyplot.savefig(out_png % tag)
    pyplot.close()
    # plot top 10% of data; floor division keeps the slice bound an integer
    plot_top = len(datapoints) // 10
    pyplot.plot(datapoints[:plot_top, :])
    pyplot.legend(names)
    pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' % (
        m_file_short, in_peaks))
    pyplot.savefig(out_png % (tag + '.top10percent'))
    pyplot.close()
def main(argv=None):
    # NOTE: the docstring is reused as part of the --help usage text, so
    # implementation notes are kept in comments rather than in the docstring.
    #
    # argv: list of command line arguments (defaults to sys.argv[1:]).
    # Exits with status -1 when the peaks file or the motif source is missing.
    # For every selected motif the peaks are searched for motif instances and,
    # unless --no_bg is given, compared against background sequences; one
    # result row per motif is appended to --output_file or written to stderr.
    """ Calculate significance of a motif in peaks with genomic background
    Can use restricted annotationDB, such as only promoter regions
    """
    parser = optparse.OptionParser(
        "%prog [options] peaks.bed [outfile] \n" + main.__doc__)
    parser.add_option("--genome", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option(
        "--motif_file", '-m', dest="motif_file", type="string",
        help="""The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g., {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""")
    parser.add_option(
        "--consensus_file", '-c', dest="consensus_file", type="string",
        help="""index file for consensus motifs (IUPAC format, one per line in the file""")
    parser.add_option(
        "--motif_key", '-k', dest="motif_key", type="string",
        help="""The key for the current motif in motif_file, default=all""")
    parser.add_option(
        '--zscore', '-z', dest='zscore', type='float', default=4.29,
        help="""Calculate threshold score estimate from this Z-score. [default=%default]""")
    parser.add_option(
        '--overlap_resource', dest='overlap_resource', type='string',
        help="""Only count fg and bg that overlap with pygr resource""")
    parser.add_option(
        '--bg_samples', dest='bg_samples', type='string',
        help="""Pickled or Fasta file of background sequences to use instead of sampling the genome""")
    parser.add_option('--no_bg', dest='no_bg', action='store_true',
                      help="""skip sampling in the background""")
    parser.add_option(
        '--report_region', type='string',
        help='Report the genomic regions of peaks with motif instances to this file')
    parser.add_option(
        "--output_file", '-f', dest="output_file", type="string",
        help="""Append the zscore information to the given file""")

    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.print_help()
        print('Specify the peaks bed file!')
        sys.exit(-1)
    if not opts.motif_file and not opts.consensus_file:
        parser.print_help()
        print('Specify the motif file!')
        sys.exit(-1)

    def emit(line):
        # append a result row to --output_file, or fall back to stderr
        if opts.output_file:
            with open(opts.output_file, 'a') as out:
                out.write(line + '\n')
        else:
            sys.stderr.write(line + '\n')

    updated_motifs = False
    print('# Loading resources...')
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        annotMap = worldbase(opts.overlap_resource)
        annotDB = worldbase(opts.overlap_resource + '_db')

    allMotifs = {}
    # load pickled dict of motifs
    if opts.motif_file:
        # SECURITY: unpickling runs arbitrary code; use trusted files only
        with open(opts.motif_file, 'rb') as infile:
            allMotifs.update(pickle.load(infile))
    # create consensus dict of motifs
    if opts.consensus_file:
        with open(opts.consensus_file) as infile:
            for line in infile:
                name, consensus = line.strip().split('\t')
                allMotifs[name] = makePWMFromIUPAC(consensus)
    if opts.motif_key:
        allKeys = [opts.motif_key]
    else:
        allKeys = list(allMotifs)

    # write a header; the newline keeps the first result row on its own line
    if opts.output_file:
        header = '\t'.join(['peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z',
                            'hypergeo_pvalue', 'fgMatches', 'fgSize',
                            'fgMatches/fgSize', 'bgMatches', 'bgSize'])
        with open(opts.output_file, 'w') as out:
            out.write(header + '\n')

    for motifKey in allKeys:
        print('# Loaded motif %s...' % motifKey)
        pwm = allMotifs[motifKey]
        if isinstance(pwm, list):
            pwm = Motif(pwm)
            allMotifs[motifKey] = pwm
        if not pwm.bg_calculated():
            print('# Calculating motif background distribution...')
            pwm.calculate_background(genome)
            updated_motifs = True
        print('motif %s: length=%s threshold=%s mean=%s sd=%s' % (
            motifKey, len(pwm), pwm.get_threshold(opts.zscore),
            pwm._mean, pwm._sd))

        with open(args[0]) as peak_file:
            allPeaks = list(readBedLines(peak_file.readlines()))
        peakSizes = [stop - start for _, start, stop, _ in allPeaks]

        print('# Searching foreground sequence...')
        sys.stdout.flush()
        peakRegions = (genome[chrom][start:stop]
                       for chrom, start, stop, _ in allPeaks)
        if opts.overlap_resource:
            # only peaks overlapping the resource count as foreground
            candidates = [region for region in peakRegions
                          if len(annotMap[region]) > 0]
            fgSize = len(candidates)
        else:
            candidates = peakRegions
            fgSize = len(allPeaks)
        # regions with at least one motif instance; defined on both paths so
        # --report_region also works together with --overlap_resource
        matchingPeaks = [
            region for region in candidates
            if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
        fgMatches = len(matchingPeaks)

        # NOTE(review): this file is rewritten for every motif, so only the
        # last motif's regions remain when several motifs are processed.
        if opts.report_region is not None:
            with open(opts.report_region, 'w') as outfile:
                outfile.writelines(
                    '%s\t%s\t%s\n' % (region.id, region.start, region.stop)
                    for region in matchingPeaks)

        if opts.no_bg:
            fields = [opts.zscore, fgMatches, fgSize,
                      float(fgMatches) / fgSize]
            emit('\t'.join([args[0], motifKey] + [str(f) for f in fields]))
            continue

        print('# Searching background sequence...')
        sys.stdout.flush()
        if opts.bg_samples:
            try:
                with open(opts.bg_samples, 'rb') as infile:
                    bgSamples = pickle.load(infile)
            except Exception:
                try:
                    # left open on purpose: the parser may read lazily
                    bgSamples = parseFastaLines(open(opts.bg_samples))
                except Exception:
                    raise RuntimeError(
                        "specified background samples file %s "
                        "was neither a pickled file nor a fasta file!"
                        % opts.bg_samples)
        elif opts.overlap_resource:
            bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000)
        else:
            bgSamples = sample_genome(genome, peakSizes, sampleSize=100000)

        bgSize = 0
        bgMatches = 0
        for region in bgSamples:
            bgSize += 1
            if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0:
                bgMatches += 1

        # calculate significance of foreground vs. background
        zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize)
        pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize)
        fields = ['thesh_z=' + str(opts.zscore), zscore, pvalue, fgMatches,
                  fgSize, float(fgMatches) / fgSize, bgMatches, bgSize]
        emit('\t'.join([args[0], motifKey] + [str(f) for f in fields]))

    if updated_motifs:
        if opts.motif_file:
            print('# Saving motif info back to %s' % opts.motif_file)
            with open(opts.motif_file, 'wb') as outfile:
                pickle.dump(allMotifs, outfile)
        else:
            # only --consensus_file was given: there is no pickle to update
            print('# No motif_file given, not saving motif info')
def main(argv=None):
    # NOTE: the docstring is reused as part of the --help usage text, so
    # implementation notes are kept in comments rather than in the docstring.
    #
    # argv: list of command line arguments (defaults to sys.argv[1:]).
    # Exits with status -1 when the peaks file or the motif source is missing.
    # With --search_genome and --report_region every motif is searched over
    # the whole genome and the hits are written as bed lines.  Otherwise the
    # peaks are searched for motif instances and, unless --no_bg is given,
    # compared against background sequences; one result row per motif is
    # appended to --output_file or written to stderr.
    """ Calculate significance of a motif in peaks with genomic background
    Can use restricted annotationDB, such as only promoter regions
    """
    parser = optparse.OptionParser(
        "%prog [options] peaks.bed [outfile] \n" + main.__doc__)
    parser.add_option("--genome", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option(
        "--motif_file", '-m', dest="motif_file", type="string",
        help="""The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g., {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""")
    parser.add_option(
        "--consensus_file", '-c', dest="consensus_file", type="string",
        help="""index file for consensus motifs (IUPAC format, one per line in the file""")
    parser.add_option(
        "--motif_key", '-k', dest="motif_key", type="string",
        help="""The key for the current motif in motif_file, default=all""")
    parser.add_option(
        '--zscore', '-z', dest='zscore', type='float', default=4.29,
        help="""Calculate threshold score estimate from this Z-score. [default=%default]""")
    parser.add_option(
        '--overlap_resource', dest='overlap_resource', type='string',
        help="""Only count fg and bg that overlap with pygr resource""")
    parser.add_option(
        '--bg_samples', dest='bg_samples', type='string',
        help="""Pickled or Fasta file of background sequences to use instead of sampling the genome""")
    parser.add_option('--no_bg', dest='no_bg', action='store_true',
                      help="""skip sampling in the background""")
    parser.add_option(
        '--report_region', type='string',
        help='Report the genomic regions of peaks with motif instances to this file')
    parser.add_option(
        "--output_file", '-f', dest="output_file", type="string",
        help="""Append the zscore information to the given file""")
    parser.add_option('--search_genome', action='store_true')

    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.print_help()
        print('Specify the peaks bed file!')
        sys.exit(-1)
    if not opts.motif_file and not opts.consensus_file:
        parser.print_help()
        print('Specify the motif file!')
        sys.exit(-1)

    def emit(line):
        # append a result row to --output_file, or fall back to stderr
        if opts.output_file:
            with open(opts.output_file, 'a') as out:
                out.write(line + '\n')
        else:
            sys.stderr.write(line + '\n')

    updated_motifs = False
    print('# Loading resources...')
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        annotMap = worldbase(opts.overlap_resource)
        annotDB = worldbase(opts.overlap_resource + '_db')

    allMotifs = {}
    motifs_are_pickled = False
    # load motifs from a transfac text file or a pickled dict
    if opts.motif_file:
        if opts.motif_file.endswith('.transfac'):
            with open(opts.motif_file, 'r') as infile:
                allMotifs.update(parseMotifsFromTransfac(infile.read()))
        else:
            # SECURITY: unpickling runs arbitrary code; trusted files only.
            # Binary mode matches the 'wb' used when saving below.
            with open(opts.motif_file, 'rb') as infile:
                allMotifs.update(pickle.load(infile))
            motifs_are_pickled = True
    # create consensus dict of motifs
    if opts.consensus_file:
        with open(opts.consensus_file) as infile:
            for line in infile:
                name, consensus = line.strip().split('\t')
                allMotifs[name] = makePWMFromIUPAC(consensus)
    if opts.motif_key:
        allKeys = [opts.motif_key]
    else:
        allKeys = list(allMotifs)

    # write a header; the newline keeps the first result row on its own line
    if opts.output_file:
        header = '\t'.join([
            'peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z',
            'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize',
            'bgMatches', 'bgSize'
        ])
        with open(opts.output_file, 'w') as out:
            out.write(header + '\n')

    # the genome-wide report is truncated once, then appended per motif so
    # the hits of every motif (each row carries the motif name) are kept
    genome_report_mode = 'w'

    for motifKey in allKeys:
        print('# Loaded motif %s...' % motifKey)
        pwm = allMotifs[motifKey]
        if isinstance(pwm, list):
            pwm = Motif(pwm)
            allMotifs[motifKey] = pwm
        if not pwm.bg_calculated():
            print('# Calculating motif background distribution...')
            pwm.calculate_background(genome)
            updated_motifs = True
        print('motif %s: length=%s threshold=%s mean=%s sd=%s max_score=%s' % (
            motifKey, len(pwm), pwm.get_threshold(opts.zscore),
            pwm._mean, pwm._sd, pwm.max_score()))

        if opts.search_genome and opts.report_region is not None:
            # search the genome with the motif
            print('searching genome!')
            with open(opts.report_region, genome_report_mode) as outfile:
                for chrom in genome:
                    # NOTE(review): no zscore is passed here, unlike the
                    # peak searches below -- confirm the default threshold.
                    for match in pwm.find_in_region(genome[chrom]):
                        outfile.write(
                            '{chrom}\t{start}\t{stop}\t{name}\t{score}\t{strand}\n'.format(
                                chrom=chrom, start=match[0], stop=match[1],
                                name=motifKey,
                                score=pwm.calc_score(match[3]),
                                strand='+' if match[2] == 1 else '-'))
            genome_report_mode = 'a'
            continue

        with open(args[0]) as peak_file:
            allPeaks = list(readBedLines(peak_file.readlines()))
        peakSizes = [stop - start for _, start, stop, _ in allPeaks]

        print('# Searching foreground sequence...')
        sys.stdout.flush()
        peakRegions = (genome[chrom][start:stop]
                       for chrom, start, stop, _ in allPeaks)
        if opts.overlap_resource:
            # only peaks overlapping the resource count as foreground
            candidates = [region for region in peakRegions
                          if len(annotMap[region]) > 0]
            fgSize = len(candidates)
        else:
            candidates = peakRegions
            fgSize = len(allPeaks)
        # regions with at least one motif instance; defined on both paths so
        # --report_region also works together with --overlap_resource
        matchingPeaks = [
            region for region in candidates
            if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
        fgMatches = len(matchingPeaks)

        # NOTE(review): this file is rewritten for every motif, so only the
        # last motif's regions remain when several motifs are processed.
        if opts.report_region is not None:
            with open(opts.report_region, 'w') as outfile:
                outfile.writelines(
                    '%s\t%s\t%s\n' % (region.id, region.start, region.stop)
                    for region in matchingPeaks)

        if opts.no_bg:
            fields = [opts.zscore, fgMatches, fgSize,
                      float(fgMatches) / fgSize]
            emit('\t'.join([args[0], motifKey] + [str(f) for f in fields]))
            continue

        print('# Searching background sequence...')
        sys.stdout.flush()
        if opts.bg_samples:
            try:
                with open(opts.bg_samples, 'rb') as infile:
                    bgSamples = pickle.load(infile)
            except Exception:
                try:
                    # left open on purpose: the parser may read lazily
                    bgSamples = parseFastaLines(open(opts.bg_samples))
                except Exception:
                    raise RuntimeError(
                        "specified background samples file %s "
                        "was neither a pickled file nor a fasta file!"
                        % opts.bg_samples)
        elif opts.overlap_resource:
            bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000)
        else:
            bgSamples = sample_genome(genome, peakSizes, sampleSize=100000)

        bgSize = 0
        bgMatches = 0
        for region in bgSamples:
            bgSize += 1
            if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0:
                bgMatches += 1

        # calculate significance of foreground vs. background
        zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize)
        pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize)
        fields = ['thesh_z=' + str(opts.zscore), zscore, pvalue, fgMatches,
                  fgSize, float(fgMatches) / fgSize, bgMatches, bgSize]
        emit('\t'.join([args[0], motifKey] + [str(f) for f in fields]))

    if updated_motifs:
        if motifs_are_pickled:
            print('# Saving motif info back to %s' % opts.motif_file)
            with open(opts.motif_file, 'wb') as outfile:
                pickle.dump(allMotifs, outfile)
        else:
            # never pickle over a .transfac text file, and there is nothing
            # to update when only --consensus_file was given
            print('# No pickled motif_file given, not saving motif info')
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks.

    Peaks are sorted by descending score (5th tab-separated column).  For
    every motif file, every z-score listed in the ``motifs/motif_zscores``
    config entry and every motif, the running fraction of peaks that contain
    at least one motif hit is computed, written to disk and plotted.

    Args:
        in_files: sequence whose first item is the peaks file path and whose
            remaining items are motif file paths.  A motif file is tried as
            a pickle first and otherwise read as ``name<TAB>consensus`` lines.
        out_patterns: unused; kept so the call signature stays unchanged.
        in_prefix: first part of every output file name.
        in_suffix: second part of every output file name.

    For each z-score ``Z`` these files are written, all prefixed with
    ``in_prefix + in_suffix``:
        ``.zZ.peak_motif_presence``                  running fraction per motif
        ``.zZ.peak_motif_locations``                 one line per peak
        ``.zZ.peak_motif_locations.bed``             one line per motif hit
        ``.zZ.peak_motif_presence.png``              plot over all peaks
        ``.zZ.top10percent.peak_motif_presence.png`` plot over the first 10%

    Raises:
        ValueError: if a peak line has a non-numeric score column.
    """
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_base = in_prefix + in_suffix
    out_summary = out_base + '.%s.peak_motif_presence'
    out_png = out_base + '.%s.peak_motif_presence.png'
    out_locations = out_base + '.%s.peak_motif_locations'
    out_locations_bed = out_base + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))

    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    try:
        peaks = _read_peaks_sorted_by_score(in_peaks)
        # NOTE(review): hits accumulate in this mapping across z-scores and
        # motif files, so later location files also list earlier hits --
        # confirm whether that is intended.
        motifs_in_peaks = dict((tuple(p), defaultdict(list)) for p in peaks)
        # NOTE(review): output names do not contain the motif file name, so
        # each motif file overwrites the outputs of the previous one.
        for m_file in in_motifs:
            m_file_short = re.sub(
                r'((treat|fastq|fastq_illumina|min_qual|bowtie|' +
                r'maq|peaks|with_mean_sd|discovered|' +
                r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|[0-9]+_around|small_sample)\.)+(motifs\.*)*',
                '', m_file)
            cur_motifs = _load_motif_file(m_file)
            all_motif_percent = {}
            for zscore in cfg.get('motifs', 'motif_zscores').strip().split(','):
                for name, pwm in cur_motifs.items():
                    all_motif_percent[name] = _running_motif_presence(
                        wb_genome, peaks, name, pwm, zscore, motifs_in_peaks)
                # having calculated for all motifs in this file,
                # plot a figure and give a summary
                tag = 'z' + zscore
                with open(out_summary % tag, 'w') as outfile:
                    outfile.writelines(
                        '%s\t%s\n' % (name, percent)
                        for name, percent in all_motif_percent.items())
                _write_peak_motif_locations(
                    out_locations % tag, out_locations_bed % tag, peaks,
                    sorted(cur_motifs), motifs_in_peaks)
                _plot_motif_presence(all_motif_percent, out_png, tag,
                                     m_file_short, in_peaks)
    finally:
        # always restore the global matplotlib setting, even on failure
        matplotlib.rcParams['font.size'] = old_size


def _read_peaks_sorted_by_score(in_peaks):
    """Return peak rows (lists of str fields) sorted by descending score.

    The file is read once and closed.  The diagnostic output of the original
    implementation (file name, first line, list of scores) is preserved.
    """
    print(in_peaks)
    with open(in_peaks) as peak_file:
        lines = peak_file.readlines()
    print(lines[0] if lines else '')
    rows = [line.strip().split('\t') for line in lines]
    scores = []
    for line, row in zip(lines, rows):
        try:
            scores.append(float(row[4]))
        except ValueError:
            # report the offending line, then let the error propagate
            print('here is the error! %s' % line.strip())
            raise
    print(scores)
    return sorted(rows, key=lambda row: float(row[4]), reverse=True)


def _load_motif_file(m_file):
    """Load motifs from a pickle, or from ``name<TAB>consensus`` text lines."""
    motifs = {}
    with open(m_file) as infile:
        try:
            # SECURITY: unpickling runs arbitrary code; use trusted files only
            motifs.update(pickle.load(infile))
        except Exception:
            # not a usable pickle: re-read the file as consensus text
            infile.seek(0)
            for line in infile:
                name, consensus = line.strip('\n').split('\t')
                motifs[name] = sequence_motif.makePWMFromIUPAC(consensus)
    return motifs


def _running_motif_presence(genome, peaks, name, pwm, zscore, motifs_in_peaks):
    """Return the running fraction of peaks with a hit; record hit locations.

    ``zscore`` is the raw config string; it is converted inside the ``try``
    so a conversion failure is logged like any other search failure.
    """
    with_motif = 0
    percent_with = []  # percent with motif at each peak
    for total, p in enumerate(peaks):
        chrom, start, stop = p[0], int(p[1]), int(p[2])
        region = genome[chrom][start:stop]
        # extend peaks to at least pwm length
        while len(region) < len(pwm):
            region = genome[chrom][region.start - 5:region.stop + 5]
            # catch nasty infinite loops for very short scaffolds
            if len(region) == len(genome[chrom]):
                break
        # check if the motif occurs in the region
        try:
            hits = list(pwm.find_in_region(region, zscore=float(zscore)))
        except Exception as e:
            # placeholders are required: extra logging args are %-formatted
            log.debug('issue with sequence %s %s %s', repr(region), name, e)
            hits = []
        if hits:
            with_motif += 1
            # NOTE(review): hits are offset by the original peak start even
            # when the region was extended above -- confirm hit coordinates.
            motifs_in_peaks[tuple(p)][name].extend(
                (h[0] + start, h[1] + start, '+' if h[2] == 1 else '-')
                for h in hits)
        percent_with.append(float(with_motif) / (total + 1))
    return percent_with


def _write_peak_motif_locations(locations_path, bed_path, peaks, motif_names,
                                motifs_in_peaks):
    """Write the per-peak motif table and the per-hit bed file."""
    with open(locations_path, 'w') as outfile:
        with open(bed_path, 'w') as out_bed:
            # header is 6 columns of peak info, then motif info
            outfile.write('\t'.join(['p_chrom', 'p_start', 'p_stop',
                                     'p_name', 'p_score', 'p_strand']))
            for motif_name in motif_names:
                outfile.write('\t%s\t#instances_%s' % (motif_name, motif_name))
            outfile.write('\n')
            # one line per peak, then the motif counts and instances;
            # instances for each motif are all in one column
            for p in peaks:
                outfile.write('\t'.join(map(str, p)))
                for motif_name in motif_names:
                    hits = motifs_in_peaks[tuple(p)][motif_name]
                    outfile.write('\t%s\t%s' % (len(hits), hits))
                    for h in hits:
                        out_bed.write('\t'.join(map(str, [
                            p[0], h[0], h[1], motif_name, 1000, h[2]
                        ])) + '\n')
                outfile.write('\n')


def _plot_motif_presence(all_motif_percent, out_png, tag, m_file_short,
                         in_peaks):
    """Save the full and the top-10% running-presence plots."""
    ordered = sorted(all_motif_percent.items())
    names = [k for k, _ in ordered]
    datapoints = numpy.array([v for _, v in ordered]).T
    # plot original data
    pyplot.plot(datapoints)
    pyplot.legend(names)
    pyplot.title('Motifs from\n%s\nPresence in\n%s' % (m_file_short, in_peaks))
    pyplot.savefig(out_png % tag)
    pyplot.close()
    # plot top 10% of data; floor division keeps the slice bound an integer
    plot_top = len(datapoints) // 10
    pyplot.plot(datapoints[:plot_top, :])
    pyplot.legend(names)
    pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' % (
        m_file_short, in_peaks))
    pyplot.savefig(out_png % (tag + '.top10percent'))
    pyplot.close()