Example #1
0
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks.

    The control reads file is located by rewriting the name of in_peaks
    ('treat' -> 'control', 'peaks' -> 'mapped_reads', and the top-N, summit
    and peak-caller suffixes removed).

    Args:
        in_peaks: path to the BED file of peaks.
        out_files: sequence whose first two items are the output paths for
            the sampled sequences (FASTA-style records) and for their
            locations (tab-separated id, start, stop, index, 0, strand).

    Raises:
        RuntimeError: if in_peaks yields no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # Read the widths inside a context manager so the peaks file handle is
    # closed instead of being leaked.
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start
                  for chrom, start, stop, strand in readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    # The control file stays open while the samples are consumed below.
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(wb_genome,
                                        peak_lengths,
                                        control_file,
                                        sampleSize=cfg.getint(
                                            'motifs',
                                            'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' %
                                  (index, repr(seq), str(seq)))
                    strand = '+' if seq.orientation == 1 else '-'
                    outlocations.write('\t'.join([
                        seq.id,
                        str(seq.start),
                        str(seq.stop),
                        str(index), '0', strand
                    ]) + '\n')
Example #2
0
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks.

    Args:
        in_peaks: path to the BED file of peaks; the path of the control
            mapped-reads file is derived from this name by the regex
            substitutions below.
        out_files: sequence whose first two items are the output paths for
            the sampled sequences (FASTA-style records) and for their
            locations (tab-separated id, start, stop, index, 0, strand).

    Raises:
        RuntimeError: if in_peaks yields no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # The peaks file used to be opened without ever being closed; the
    # context manager releases the handle once the widths are collected.
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand in
                                   readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    # (the substitutions are applied in this exact order)
    control_bed = in_peaks
    for pattern, replacement in (
            (r'treat', 'control'),
            (r'\.top[\d]+\.peaks$', ''),
            (r'_summits\.[\d]+_around', ''),
            (r'peaks', 'mapped_reads'),
            (r'\.(macs(14)*|arem|glitr)', '')):
        control_bed = re.sub(pattern, replacement, control_bed)
    # The control file stays open while the samples are consumed below.
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(
                wb_genome, peak_lengths, control_file,
                sampleSize=cfg.getint('motifs',
                                      'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq), str(seq)))
                    outlocations.write('\t'.join([
                        seq.id, str(seq.start), str(seq.stop), str(index), '0',
                        '+' if seq.orientation == 1 else '-']) + '\n')
Example #3
0
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks.

    Args:
        in_peaks: path to the BED file of peaks whose widths are mimicked.
        out_files: sequence whose first two items are the output paths for
            the sampled sequences (FASTA-style records) and for their
            locations (tab-separated id, start, stop, index, 0, strand).

    Raises:
        RuntimeError: if in_peaks yields no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # Read the widths inside a context manager so the peaks file handle is
    # closed instead of being leaked.
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start
                  for chrom, start, stop, strand in readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)

    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(
        wb_genome,
        peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_',
        weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                # %s on the sampled region is used as the sequence text
                outfile.write('>%s\n%s\n' % (index, line))
                strand = '+' if line.orientation == 1 else '-'
                outlocations.write('\t'.join([
                    line.id,
                    str(line.start),
                    str(line.stop),
                    str(index), '0', strand
                ]) + '\n')
Example #4
0
def get_top_peaks(in_peaks, out_subset, num_peaks_to_keep):
    """Write the highest-scoring peaks to out_subset for motif discovery.

    Peaks are ranked by the integer value of their fifth field (the score),
    highest first, and the first num_peaks_to_keep of them are written as
    tab-separated lines.
    """
    with open(in_peaks) as infile:
        # stable sort: peaks with equal scores keep their input order
        ranked = sorted(readBedLines(infile, dataOnly=False),
                        key=lambda fields: int(fields[4]),
                        reverse=True)
        with open(out_subset, 'w') as outfile:
            for fields in ranked[:num_peaks_to_keep]:
                outfile.write('\t'.join(map(str, fields)) + '\n')
Example #5
0
def get_top_peaks(in_peaks, out_subset, num_peaks_to_keep):
    """Keep only the best-scoring peaks as input to motif discovery.

    Reads every line of in_peaks, orders the peaks by descending integer
    score (fifth field) and writes the leading num_peaks_to_keep entries to
    out_subset, one tab-separated line per peak.
    """

    def score_of(peak):
        # the fifth field holds the score
        return int(peak[4])

    with open(in_peaks) as infile:
        peaks = list(readBedLines(infile, dataOnly=False))
        peaks.sort(key=score_of, reverse=True)
        with open(out_subset, 'w') as outfile:
            kept = peaks[:num_peaks_to_keep]
            rows = ['\t'.join([str(field) for field in peak]) + '\n'
                    for peak in kept]
            outfile.writelines(rows)
Example #6
0
def makeResourceFromBed(fileLines, genome, docstring='Temp Resource From BED', dataPath='memory'):
    'Generate a sqlite table, annotDB, and NLMSA from the given bed lines'
    # Returns the tuple (dataTable, annotDB, annotMap).
    # NOTE(review): the `docstring` argument is accepted but never used here.
    bedDict = makeDictFromBed(readBedLines(fileLines))
    # the table is named after the last component of dataPath
    tableName = os.path.split(dataPath)[1]
    # SQLite has a special name for in-memory tables
    if dataPath == 'memory':
        sqlDataPath = ':memory:'
    else:
        sqlDataPath = dataPath
    dataTable = convertDictToSQLite(bedDict, tableName, sqlDataPath)
    # defaultSliceAttrs is evaluated to obtain the slice-attribute mapping
    sliceAttrs = eval(defaultSliceAttrs)
    annotDB = annotation.AnnotationDB(dataTable, genome,
                                      sliceAttrDict=sliceAttrs)
    annotMap = makeNLMSA([annotDB], dataPath)
    return dataTable, annotDB, annotMap
Example #7
0
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks.

    Args:
        in_peaks: path to the BED file of peaks whose widths are mimicked.
        out_files: sequence whose first two items are the output paths for
            the sampled sequences (FASTA-style records) and for their
            locations (tab-separated id, start, stop, index, 0, strand).

    Raises:
        RuntimeError: if in_peaks yields no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # The peaks file used to be opened without ever being closed; the
    # context manager releases the handle once the widths are collected.
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand in
                                   readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)

    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    sample_size = cfg.getint('motifs', 'motif_significance_sample_size')
    exclude_repeats = cfg.getboolean('motifs', 'sampling_exclude_repeats')
    exclude_n = cfg.getboolean('motifs', 'sampling_exclude_N')
    s = sampling.sample_genome(wb_genome, peak_lengths,
                               sampleSize=sample_size,
                               excludeRepeat=exclude_repeats,
                               excludeN=exclude_n,
                               ignoreCharacters='_', weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                # %s on the sampled region is used as the sequence text
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join([
                    line.id, str(line.start), str(line.stop), str(index), '0',
                    '+' if line.orientation == 1 else '-']) + '\n')
Example #8
0
def main(argv=None):
    """ Calculate significance of a motif in peaks with genomic background
    Can use restricted annotationDB, such as only promoter regions """
    # NOTE: the docstring above is reused verbatim as the --help text, so the
    # detailed documentation lives in these comments instead.
    #
    # argv: list of command-line arguments (defaults to sys.argv[1:]).
    # Exactly one positional argument (the peaks BED file) is required,
    # together with --motif_file and/or --consensus_file; otherwise the help
    # is printed and the process exits with status -1.
    #
    # For every selected motif the peak regions (foreground) are searched for
    # motif instances and, unless --no_bg is given, compared with background
    # sequences taken from --bg_samples, the overlap resource or the genome.
    # One tab-separated row per motif is appended to --output_file, or
    # written to stderr when no output file was requested.
    #
    # Raises RuntimeError when --bg_samples can be read neither as a pickle
    # nor as a fasta file.

    parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n" +
                                   main.__doc__)
    parser.add_option("--genome", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option(
        "--motif_file", '-m', dest="motif_file", type="string",
        help=
        """The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g.,
                      {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""")
    parser.add_option(
        "--consensus_file", '-c', dest="consensus_file", type="string",
        help="""index file for consensus motifs (IUPAC format, one
                      per line in the file""")
    parser.add_option(
        "--motif_key", '-k', dest="motif_key", type="string",
        help="""The key for the current motif in motif_file, default=all""")
    parser.add_option(
        '--zscore', '-z', dest='zscore', type='float', default=4.29,
        help=
        """Calculate threshold score estimate from this Z-score. [default=%default]"""
    )
    parser.add_option(
        '--overlap_resource', dest='overlap_resource', type='string',
        help="""Only count fg and bg that overlap with pygr resource""")
    parser.add_option(
        '--bg_samples', dest='bg_samples', type='string',
        help=
        """Pickled or Fasta file of background sequences to use instead of sampling the genome"""
    )
    parser.add_option('--no_bg', dest='no_bg', action='store_true',
                      help="""skip sampling in the background""")
    parser.add_option(
        '--report_region', type='string',
        help=
        'Report the genomic regions of peaks with motif instances to this file'
    )
    parser.add_option(
        "--output_file", '-f', dest="output_file", type="string",
        help="""Append the zscore information to the given file""")
    parser.add_option('--search_genome', action='store_true')
    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.print_help()
        print('Specify the peaks bed file!')
        sys.exit(-1)
    if not opts.motif_file and not opts.consensus_file:
        parser.print_help()
        print('Specify the motif file!')
        sys.exit(-1)
    peaks_path = args[0]

    updated_motifs = False
    print('# Loading resources...')
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        annotMap = worldbase(opts.overlap_resource)
        annotDB = worldbase(opts.overlap_resource + '_db')

    allMotifs = {}
    # Only a pickled motif file may be rewritten at the end of the run.
    motifs_are_pickled = False
    if opts.motif_file:
        if opts.motif_file.endswith('.transfac'):
            with open(opts.motif_file, 'r') as infile:
                allMotifs.update(parseMotifsFromTransfac(infile.read()))
        else:
            # load pickled dict of motifs
            # NOTE(review): unpickling runs arbitrary code from the file;
            # only use motif files from a trusted source.
            with open(opts.motif_file, 'rb') as infile:
                allMotifs.update(pickle.load(infile))
            motifs_are_pickled = True
    # create consensus dict of motifs
    if opts.consensus_file:
        with open(opts.consensus_file) as infile:
            for line in infile:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing newline at EOF)
                    continue
                name, consensus = line.strip().split('\t')
                allMotifs[name] = makePWMFromIUPAC(consensus)

    if opts.motif_key:
        allKeys = [opts.motif_key]
    else:
        allKeys = list(allMotifs.keys())

    # write a header
    if opts.output_file:
        header = '\t'.join([
            'peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z',
            'hypergeo_pvalue', 'fgMatches', 'fgSize', 'fgMatches/fgSize',
            'bgMatches', 'bgSize'
        ])
        # The newline keeps the first result row from being glued onto the
        # header line (rows are appended to this file later).
        with open(opts.output_file, 'w') as outfile:
            outfile.write(header + '\n')

    # The peaks file is the same for every motif: parse it once, on demand.
    allPeaks = None
    peakSizes = None
    for motifKey in allKeys:
        print('# Loaded motif %s...' % motifKey)
        pwm = allMotifs[motifKey]
        if isinstance(pwm, list):
            pwm = Motif(pwm)
            allMotifs[motifKey] = pwm
        if not pwm.bg_calculated():
            print('# Calculating motif background distribution...')
            pwm.calculate_background(genome)
            updated_motifs = True
        print('motif %s: length=%s threshold=%s mean=%s sd=%s max_score=%s' % (
            motifKey, len(pwm), pwm.get_threshold(opts.zscore), pwm._mean,
            pwm._sd, pwm.max_score()))

        # NOTE(review): both uses of --report_region below reopen the file in
        # 'w' mode for every motif, so with several motifs only the last
        # motif's rows remain -- confirm this is intended.
        if opts.search_genome and opts.report_region is not None:
            # search the genome with the motif
            print('searching genome!')
            with open(opts.report_region, 'w') as outfile:
                for chrom in genome:
                    for match in pwm.find_in_region(genome[chrom]):
                        outstr = '{chrom}\t{start}\t{stop}\t{name}\t{score}\t{strand}\n'.format(
                            chrom=chrom,
                            start=match[0],
                            stop=match[1],
                            name=motifKey,
                            score=pwm.calc_score(match[3]),
                            strand='+' if match[2] == 1 else '-')
                        outfile.write(outstr)
            continue

        if allPeaks is None:
            with open(peaks_path) as infile:
                allPeaks = list(readBedLines(infile.readlines()))
            peakSizes = [stop - start for _, start, stop, _ in allPeaks]

        print('# Searching foreground sequence...')
        sys.stdout.flush()
        peakRegions = (genome[chrom][start:stop]
                       for chrom, start, stop, _ in allPeaks)
        if opts.overlap_resource:
            # check to see if the bed line overlaps the resource
            overlappingRegions = [region for region in peakRegions
                                  if len(annotMap[region]) > 0]
            fgSize = len(overlappingRegions)
            # Keep the regions with at least one motif instance so that
            # --report_region also works in this branch (matchingPeaks used
            # to be undefined here, which raised a NameError).
            matchingPeaks = [
                region for region in overlappingRegions
                if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
        else:
            matchingPeaks = [
                region for region in peakRegions
                if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
            fgSize = len(allPeaks)
        # count the number of peaks with at least one motif instance
        fgMatches = len(matchingPeaks)

        if opts.report_region is not None:
            with open(opts.report_region, 'w') as outfile:
                outfile.writelines('%s\t%s\t%s\n' %
                                   (region.id, region.start, region.stop)
                                   for region in matchingPeaks)

        if opts.no_bg:
            fields = [opts.zscore, fgMatches, fgSize,
                      float(fgMatches) / fgSize]
        else:
            print('# Searching background sequence...')
            sys.stdout.flush()
            if opts.bg_samples:
                try:
                    with open(opts.bg_samples, 'rb') as infile:
                        bgSamples = pickle.load(infile)
                except Exception:
                    try:
                        # The handle is deliberately left open: the parser
                        # may read lazily while bgSamples is iterated below.
                        bgSamples = parseFastaLines(open(opts.bg_samples))
                    except Exception:
                        raise RuntimeError(
                            "specified background samples file %s "
                            "was neither a pickled file nor a fasta file!" %
                            opts.bg_samples)
            elif opts.overlap_resource:
                bgSamples = sample_resource(annotDB,
                                            peakSizes,
                                            sampleSize=100000)
            else:
                bgSamples = sample_genome(genome, peakSizes, sampleSize=100000)
            bgSize = 0
            bgMatches = 0
            for region in bgSamples:
                bgSize += 1
                if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0:
                    bgMatches += 1

            # calculate significance of foreground vs. background
            zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize)
            pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches,
                                           bgSize)
            fields = [
                'thesh_z=' + str(opts.zscore), zscore, pvalue, fgMatches,
                fgSize, float(fgMatches) / fgSize, bgMatches, bgSize
            ]

        outstr = '\t'.join([peaks_path, motifKey] +
                           [str(field) for field in fields])
        if opts.output_file:
            with open(opts.output_file, 'a') as outfile:
                outfile.write(outstr + '\n')
        else:
            sys.stderr.write(outstr + '\n')

    if updated_motifs:
        if motifs_are_pickled:
            print('# Saving motif info back to %s' % opts.motif_file)
            with open(opts.motif_file, 'wb') as outfile:
                pickle.dump(allMotifs, outfile)
        else:
            # Without a pickled --motif_file there is nothing safe to write
            # to: the path is either missing or a .transfac text file that a
            # pickle dump would overwrite.
            print('# Not saving motif info: no pickled motif file to update')
def main():
    ''' Calculate the chance of observing a certain number of overlaps between two
        set of genomic regions.  Significance is estimated by randomly shuffling the
        positions (not changing the lengths or chromosomes) of one of the samples and reporting the number
        of overlapping sites in the shuffled sets.  '''
    # NOTE: the docstring above is reused verbatim as the --help text, so the
    # detailed documentation lives in these comments instead.
    #
    # Reads its arguments from sys.argv: --genome plus two positional bed
    # files are required; on bad usage the help is printed and the process
    # exits with status -1.
    #
    # The overlap count between the two files is reported, optionally the
    # regions unique to the first file are written to --unique_out and a
    # summary line is appended to --file_report.  When --num_shuffles > 0 the
    # first file is shuffled that many times to build the random overlap
    # distribution from which an empirical p-value is printed.
    usage = "%prog [options] bedFile1 bedFile2 \n" + main.__doc__
    parser = optparse.OptionParser(usage)
    parser.add_option('--genome', '-g', dest='genome', type='string', default=None,
                      help='The genome name the bed files come from, i.e, mm9 or hg19')
    parser.add_option('--num_shuffles', '-n', dest='num_shuffles', type='int', default=10000,
                      help='Number of times to shuffle bedFile1. default=%default')
    parser.add_option('--disjoint', '-d', action='store_true',
                      help='Make sure that there is no overlap in shuffled regions')
    parser.add_option('--quiet', '-q', action='store_true',
                      help='report only the overlap number (no messages)')
    parser.add_option('--unique_out', '-u', dest='unique_out', type='string', default=None,
                      help='print non-overlapping regions from bedfile1 to this file')
    parser.add_option('--report_col1', '-1', dest='report_col1', type='int', default=None,
                      help='bed column to use when reporting the overlap type. default:None')
    parser.add_option('--report_col2', '-2', dest='report_col2', type='int', default=None,
                      help='bed column to use when reporting the overlap type. default:None')
    parser.add_option('--file_report', '-f', dest='file_out', type='string', default=None,
                      help='where to file the overlap report')
    opts, args = parser.parse_args()
    if opts.genome is None:
        parser.print_help()
        sys.stderr.write('You must specify a genome!\n')
        sys.exit(-1)
    if opts.num_shuffles < 0:
        parser.print_help()
        sys.stderr.write('Must have a positive or 0 number of shuffles!\n')
        sys.exit(-1)
    if len(args) < 2:
        # Previously a missing bed file surfaced as a bare IndexError.
        parser.print_help()
        sys.stderr.write('You must specify two bed files!\n')
        sys.exit(-1)
    genome = getGenome(opts.genome)
    chromSizes = dict((chrom, len(seq)) for chrom, seq in genome.iteritems())
    # Context managers close the bed files once their lines are loaded.
    with open(args[0], 'r') as bedfile1:
        bedlines1 = sorted(readBedLines(bedfile1, dataOnly=False))
    with open(args[1], 'r') as bedfile2:
        bedlines2 = sorted(readBedLines(bedfile2, dataOnly=False))
    # getBedOverlap is handed the string 'none' when no column was requested
    if opts.report_col1 is None:
        opts.report_col1 = 'none'
    if opts.report_col2 is None:
        opts.report_col2 = 'none'

    if not opts.quiet:
        # no newline: the overlap summary below completes this line
        sys.stdout.write('Original data:\t%s in %s\t%s in %s\t' % (
            args[0], len(bedlines1), args[1], len(bedlines2)))
    if opts.unique_out:
        originalOverlapCount, uniqueBed1 = getBedOverlap(
            bedlines1, bedlines2, alreadySorted=True, reportUnique=True,
            featureColumn1=opts.report_col1, featureColumn2=opts.report_col2)
        with open(opts.unique_out, 'w') as outfile:
            outfile.write('\n'.join('\t'.join(map(str, bedFields))
                                    for bedFields in uniqueBed1))
    else:
        originalOverlapCount = getBedOverlap(
            bedlines1, bedlines2, alreadySorted=True,
            featureColumn1=opts.report_col1, featureColumn2=opts.report_col2)
    if not opts.quiet:
        print('with %s overlaps or %s unique to bedfile1' % (
            originalOverlapCount, len(bedlines1) - originalOverlapCount))
    else:
        sys.stdout.write('\t' + str(originalOverlapCount))

    if opts.file_out:
        if opts.unique_out:
            uniqueCount = len(uniqueBed1)
        else:
            uniqueCount = len(bedlines1) - originalOverlapCount
        with open(opts.file_out, 'a') as outfile:
            outfile.write('\t'.join([
                args[0], str(len(bedlines1)), args[1], str(len(bedlines2)),
                'overlap: %s' % originalOverlapCount,
                'unique to 1: %s' % uniqueCount]) + '\n')

    if opts.num_shuffles > 0:
        randOverlaps = [-1] * opts.num_shuffles  # preallocate

        # progress is written on one line: the message, then every 1000th index
        sys.stdout.write('Generating %s random shuffles...' % opts.num_shuffles)
        for i in range(opts.num_shuffles):
            if i % 1000 == 0:
                sys.stdout.write(' %s' % i)
                sys.stdout.flush()
            shuffledBeds1 = sorted(generateShuffledBed(bedlines1, chromSizes,
                                                       disjoint=opts.disjoint))
            randOverlaps[i] = getBedOverlap(shuffledBeds1, bedlines2,
                                            alreadySorted=True)
        sys.stdout.write('\n')
        # shuffles that overlap at least as often as the real data
        randomBetterCount = sum(1 for randVal in randOverlaps
                                if randVal >= originalOverlapCount)
        randNumDistinctVals = len(set(randOverlaps))
        randHist, bins = scipy.histogram(randOverlaps,
                                         bins=min(randNumDistinctVals, 15))
        print('Random overlap distribution is: \nbinCounts:\t%s\nbinEdges:  %s' % (randHist, bins))
        # the reported p-value is never smaller than 1/num_shuffles
        print('Random shuffle:\t%s with at least as many overlaps, pvalue %s %s' % (
            randomBetterCount,
            '<' if randomBetterCount == 0 else '=',
            max(1. / opts.num_shuffles,
                float(randomBetterCount) / opts.num_shuffles)))
        print('Random mean:\t%s\tstdev:%s' % (scipy.mean(randOverlaps),
                                              scipy.std(randOverlaps)))
def main(argv=None):
    """ Calculate significance of a motif in peaks with genomic background
    Can use restricted annotationDB, such as only promoter regions """
    # NOTE: the docstring above is reused verbatim as the --help text, so the
    # detailed documentation lives in these comments instead.
    #
    # argv: list of command-line arguments (defaults to sys.argv[1:]).
    # Exactly one positional argument (the peaks BED file) is required,
    # together with --motif_file and/or --consensus_file; otherwise the help
    # is printed and the process exits with status -1.
    #
    # For every selected motif the peak regions (foreground) are searched for
    # motif instances and, unless --no_bg is given, compared with background
    # sequences taken from --bg_samples, the overlap resource or the genome.
    # One tab-separated row per motif is appended to --output_file, or
    # written to stderr when no output file was requested.
    #
    # Raises RuntimeError when --bg_samples can be read neither as a pickle
    # nor as a fasta file.

    parser = optparse.OptionParser("%prog [options] peaks.bed [outfile] \n"+main.__doc__)
    parser.add_option("--genome", '-g', dest="genome_resource", type="string",
                      help="""The pygr resource for the genome""")
    parser.add_option("--motif_file", '-m', dest="motif_file", type="string",
                      help="""The index file for all motifs, as a pickled dictionary, of pwm's or Motifs e.g.,
                      {"LRH_1":[[.25,.25,.1,.4],[.2,.2,.3,.3]]}""")
    parser.add_option("--consensus_file", '-c', dest="consensus_file", type="string",
                      help="""index file for consensus motifs (IUPAC format, one
                      per line in the file""")
    parser.add_option("--motif_key", '-k', dest="motif_key", type="string",
                      help="""The key for the current motif in motif_file, default=all""")
    parser.add_option('--zscore', '-z', dest='zscore', type='float', default=4.29,
                      help="""Calculate threshold score estimate from this Z-score. [default=%default]""")
    parser.add_option('--overlap_resource', dest='overlap_resource', type='string',
                      help="""Only count fg and bg that overlap with pygr resource""")
    parser.add_option('--bg_samples', dest='bg_samples', type='string',
                      help="""Pickled or Fasta file of background sequences to use instead of sampling the genome""")
    parser.add_option('--no_bg', dest='no_bg', action='store_true',
                      help="""skip sampling in the background""")
    parser.add_option('--report_region', type='string', help='Report the genomic regions of peaks with motif instances to this file')
    parser.add_option("--output_file", '-f', dest="output_file", type="string",
                      help="""Append the zscore information to the given file""")
    if argv is None:
        argv = sys.argv[1:]
    opts, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.print_help()
        print('Specify the peaks bed file!')
        sys.exit(-1)
    if not opts.motif_file and not opts.consensus_file:
        parser.print_help()
        print('Specify the motif file!')
        sys.exit(-1)

    updated_motifs = False
    print('# Loading resources...')
    opts.genome_resource = getFullGenomeName(opts.genome_resource)
    genome = worldbase(opts.genome_resource)
    if opts.overlap_resource:
        annotMap = worldbase(opts.overlap_resource)
        annotDB = worldbase(opts.overlap_resource + '_db')

    allMotifs = {}
    # load pickled dict of motifs
    if opts.motif_file:
        # NOTE(review): unpickling runs arbitrary code from the file; only
        # use motif files from a trusted source.
        with open(opts.motif_file, 'rb') as infile:
            allMotifs.update(pickle.load(infile))
    # create consensus dict of motifs
    if opts.consensus_file:
        with open(opts.consensus_file) as infile:
            for line in infile:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing newline at EOF)
                    continue
                name, consensus = line.strip().split('\t')
                allMotifs[name] = makePWMFromIUPAC(consensus)

    if opts.motif_key:
        allKeys = [opts.motif_key]
    else:
        allKeys = list(allMotifs.keys())

    # write a header
    if opts.output_file:
        header = '\t'.join(['peaks', 'motif', 'threshold_z', 'vs_bg_normal_Z',
                            'hypergeo_pvalue', 'fgMatches', 'fgSize',
                            'fgMatches/fgSize', 'bgMatches', 'bgSize'])
        # The newline keeps the first result row from being glued onto the
        # header line (rows are appended to this file later).
        with open(opts.output_file, 'w') as outfile:
            outfile.write(header + '\n')

    # The peaks file is the same for every motif, so parse it only once.
    allPeaks = None
    peakSizes = None
    for motifKey in allKeys:
        print('# Loaded motif %s...' % motifKey)
        pwm = allMotifs[motifKey]
        if isinstance(pwm, list):
            pwm = Motif(pwm)
            allMotifs[motifKey] = pwm
        if not pwm.bg_calculated():
            print('# Calculating motif background distribution...')
            pwm.calculate_background(genome)
            updated_motifs = True
        print('motif %s: length=%s threshold=%s mean=%s sd=%s' % (
            motifKey, len(pwm), pwm.get_threshold(opts.zscore), pwm._mean,
            pwm._sd))
        if allPeaks is None:
            with open(args[0]) as infile:
                allPeaks = list(readBedLines(infile.readlines()))
            peakSizes = [stop - start for _, start, stop, _ in allPeaks]

        print('# Searching foreground sequence...')
        sys.stdout.flush()
        peakRegions = (genome[chrom][start:stop] for chrom, start, stop, _ in allPeaks)
        if opts.overlap_resource:
            # check to see if the bed line overlaps the resource
            overlappingRegions = [region for region in peakRegions
                                  if len(annotMap[region]) > 0]
            fgSize = len(overlappingRegions)
            # Keep the regions with at least one motif instance so that
            # --report_region also works in this branch (matchingPeaks used
            # to be undefined here, which raised a NameError).
            matchingPeaks = [region for region in overlappingRegions
                             if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
        else:
            matchingPeaks = [region for region in peakRegions
                             if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0]
            fgSize = len(allPeaks)
        # count the number of peaks with at least one motif instance
        fgMatches = len(matchingPeaks)

        # NOTE(review): the report file is reopened in 'w' mode for every
        # motif, so with several motifs only the last motif's regions
        # remain -- confirm this is intended.
        if opts.report_region is not None:
            with open(opts.report_region, 'w') as outfile:
                outfile.writelines('%s\t%s\t%s\n' % (region.id, region.start, region.stop)
                                   for region in matchingPeaks)

        if opts.no_bg:
            fields = [opts.zscore, fgMatches, fgSize,
                      float(fgMatches)/fgSize]
        else:
            print('# Searching background sequence...')
            sys.stdout.flush()
            if opts.bg_samples:
                try:
                    with open(opts.bg_samples, 'rb') as infile:
                        bgSamples = pickle.load(infile)
                except Exception:
                    try:
                        # The handle is deliberately left open: the parser
                        # may read lazily while bgSamples is iterated below.
                        bgSamples = parseFastaLines(open(opts.bg_samples))
                    except Exception:
                        raise RuntimeError("specified background samples file %s "
                                           "was neither a pickled file nor a fasta file!" %
                                           opts.bg_samples)
            elif opts.overlap_resource:
                bgSamples = sample_resource(annotDB, peakSizes, sampleSize=100000)
            else:
                bgSamples = sample_genome(genome, peakSizes, sampleSize=100000)
            bgSize = 0
            bgMatches = 0
            for region in bgSamples:
                bgSize += 1
                if len(pwm.find_in_region(region, zscore=opts.zscore)) > 0:
                    bgMatches += 1

            # calculate significance of foreground vs. background
            zscore = zscore_normal(fgMatches, fgSize, bgMatches, bgSize)
            pvalue = pvalue_hypergeometric(fgMatches, fgSize, bgMatches, bgSize)
            fields = ['thesh_z='+str(opts.zscore), zscore, pvalue, fgMatches,
                      fgSize, float(fgMatches)/fgSize, bgMatches, bgSize]

        outstr = '\t'.join([args[0], motifKey] + [str(field) for field in fields])
        if opts.output_file:
            with open(opts.output_file, 'a') as outfile:
                outfile.write(outstr + '\n')
        else:
            sys.stderr.write(outstr + '\n')

    if updated_motifs:
        if opts.motif_file:
            print('# Saving motif info back to %s' % opts.motif_file)
            with open(opts.motif_file, 'wb') as outfile:
                pickle.dump(allMotifs, outfile)
        else:
            # Only --consensus_file was given: there is no motif file to
            # write to (opening None used to raise a TypeError here).
            print('# Not saving motif info: no motif file to update')
def bed_to_glitr(in_bed, out_starts):
    """Write each read of in_bed as a (chrom, start, strand) line for GLITR.

    The stop coordinate of every read is dropped; the remaining three fields
    are written tab-separated, one read per line, to out_starts.
    """
    with open(in_bed) as infile:
        with open(out_starts, 'w') as outfile:
            for read in readBedLines(infile):
                chrom, start, _stop, strand = read
                columns = [chrom, str(start), strand]
                outfile.write('\t'.join(columns) + '\n')
Example #12
0
def main():
    ''' Calculate the chance of observing a certain number of overlaps between two
        set of genomic regions.  Significance is estimated by randomly shuffling the
        positions (not changing the lengths or chromosomes) of one of the samples and reporting the number
        of overlapping sites in the shuffled sets.  '''
    usage = "%prog [options] bedFile1 bedFile2 \n" + main.__doc__
    parser = optparse.OptionParser(usage)
    parser.add_option(
        '--genome',
        '-g',
        dest='genome',
        type='string',
        default=None,
        help='The genome name the bed files come from, i.e, mm9 or hg19')
    parser.add_option(
        '--num_shuffles',
        '-n',
        dest='num_shuffles',
        type='int',
        default=10000,
        help='Number of times to shuffle bedFile1. default=%default')
    parser.add_option(
        '--disjoint',
        '-d',
        action='store_true',
        help='Make sure that there is no overlap in shuffled regions')
    parser.add_option('--quiet',
                      '-q',
                      action='store_true',
                      help='report only the overlap number (no messages)')
    parser.add_option(
        '--unique_out',
        '-u',
        dest='unique_out',
        type='string',
        default=None,
        help='print non-overlapping regions from bedfile1 to this file')
    parser.add_option(
        '--report_col1',
        '-1',
        dest='report_col1',
        type='int',
        default=None,
        help='bed column to use when reporting the overlap type. default:None')
    parser.add_option(
        '--report_col2',
        '-2',
        dest='report_col2',
        type='int',
        default=None,
        help='bed column to use when reporting the overlap type. default:None')
    parser.add_option('--file_report',
                      '-f',
                      dest='file_out',
                      type='string',
                      default=None,
                      help='where to file the overlap report')
    opts, args = parser.parse_args()
    if opts.genome is None:
        parser.print_help()
        print >> sys.stderr, 'You must specify a genome!'
        sys.exit(-1)
    if opts.num_shuffles < 0:
        parser.print_help()
        print >> sys.stderr, 'Must have a positive or 0 number of shuffles!'
        sys.exit(-1)
    genome = getGenome(opts.genome)
    chromSizes = dict((chrom, len(seq)) for chrom, seq in genome.iteritems())
    bedfile1 = open(args[0], 'r')
    bedfile2 = open(args[1], 'r')
    bedlines1 = sorted(readBedLines(bedfile1, dataOnly=False))
    bedlines2 = sorted(readBedLines(bedfile2, dataOnly=False))
    if opts.report_col1 is None:
        opts.report_col1 = 'none'
    if opts.report_col2 is None:
        opts.report_col2 = 'none'

    if not opts.quiet:
        print 'Original data:\t%s in %s\t%s in %s\t' % (
            args[0], len(bedlines1), args[1], len(bedlines2)),
    if opts.unique_out:
        originalOverlapCount, uniqueBed1 = getBedOverlap(
            bedlines1,
            bedlines2,
            alreadySorted=True,
            reportUnique=True,
            featureColumn1=opts.report_col1,
            featureColumn2=opts.report_col2)
        with open(opts.unique_out, 'w') as outfile:
            outfile.writelines('\n'.join('\t'.join(map(str, bedFields))
                                         for bedFields in uniqueBed1))
    else:
        originalOverlapCount = getBedOverlap(bedlines1,
                                             bedlines2,
                                             alreadySorted=True,
                                             featureColumn1=opts.report_col1,
                                             featureColumn2=opts.report_col2)
    if not opts.quiet:
        print 'with %s overlaps or %s unique to bedfile1' % (
            originalOverlapCount, len(bedlines1) - originalOverlapCount)
    else:
        sys.stdout.write('\t' + str(originalOverlapCount))

    if opts.file_out:
        with open(opts.file_out, 'a') as outfile:
            print >> outfile, '\t'.join([
                args[0],
                str(len(bedlines1)), args[1],
                str(len(bedlines2)),
                'overlap: %s' % originalOverlapCount,
                'unique to 1: %s' %
                (len(uniqueBed1) if opts.unique_out else len(bedlines1) -
                 originalOverlapCount)
            ])

    if opts.num_shuffles > 0:
        randOverlaps = [-1] * opts.num_shuffles  # preallocate

        print 'Generating %s random shuffles...' % opts.num_shuffles,
        for i in xrange(opts.num_shuffles):
            if i % 1000 == 0:
                print i,
                sys.stdout.flush()
            shuffledBeds1 = sorted(
                generateShuffledBed(bedlines1,
                                    chromSizes,
                                    disjoint=opts.disjoint))
            overlapCount = getBedOverlap(shuffledBeds1,
                                         bedlines2,
                                         alreadySorted=True)
            randOverlaps[i] = overlapCount
        print
        randomBetterCount = len(
            filter(lambda randVal: randVal >= originalOverlapCount,
                   randOverlaps))
        randNumDistinctVals = len(set(randOverlaps))
        randHist, bins = scipy.histogram(randOverlaps,
                                         bins=min(randNumDistinctVals, 15))
        print 'Random overlap distribution is: \nbinCounts:\t%s\nbinEdges:  %s' % (
            randHist, bins)
        print 'Random shuffle:\t%s with at least as many overlaps, pvalue %s %s' % (
            randomBetterCount, '<' if randomBetterCount == 0 else '=',
            max(1. / opts.num_shuffles,
                float(randomBetterCount) / opts.num_shuffles))
        print 'Random mean:\t%s\tstdev:%s' % (scipy.mean(randOverlaps),
                                              scipy.std(randOverlaps))