Ejemplo n.º 1
0
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks.

    Peaks are ranked by descending score.  For every motif file, every
    z-score listed in the ``motifs/motif_zscores`` config option and every
    motif, the running fraction of peaks (best peak first) that contain at
    least one motif hit is computed, written out and plotted.

    Args:
        in_files: sequence whose first item is the peaks file and whose
            remaining items are motif files.  The peaks file is
            tab-separated with chrom, start, stop in columns 1-3 and a
            numeric score in column 5.  A motif file is either a pickled
            mapping of motif name -> PWM or text lines of
            ``name<TAB>IUPAC consensus``.
        out_patterns: unused; output names are built from ``in_prefix`` and
            ``in_suffix``.
        in_prefix: leading part of every output file name.
        in_suffix: appended to ``in_prefix`` before the per-z-score suffix.

    Output files, where ``<z>`` is ``'z'`` + the z-score string.  The names
    do not depend on the motif file, so each motif file overwrites the
    output of the previous one:
        ``<prefix><suffix>.<z>.peak_motif_presence``: motif name and its
            running-presence list, one motif per line.
        ``<prefix><suffix>.<z>.peak_motif_locations``: one row per peak
            with the hit count and hit list of every motif.
        ``<prefix><suffix>.<z>.peak_motif_locations.bed``: one row per hit.
        ``<prefix><suffix>.<z>.peak_motif_presence.png`` and
        ``<prefix><suffix>.<z>.top10percent.peak_motif_presence.png``:
            plots over all peaks and over the first tenth of the peaks.

    Raises:
        ValueError: if a peak score cannot be parsed as a float.
    """
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_base = in_prefix + in_suffix
    out_summary = out_base + '.%s.peak_motif_presence'
    out_png = out_base + '.%s.peak_motif_presence.png'
    out_locations = out_base + '.%s.peak_motif_locations'
    out_locations_bed = out_base + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    try:
        peaks = _read_peaks_by_score(in_peaks)
        log.debug('read %s peaks from %s', len(peaks), in_peaks)
        for m_file in in_motifs:
            # shortened motif file name, used only in the plot titles
            m_file_short = re.sub(
                r'((treat|fastq|fastq_illumina|min_qual|bowtie|'
                r'maq|peaks|with_mean_sd|discovered|'
                r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|'
                r'[0-9]+_around|small_sample)\.)+(motifs\.*)*',
                '', m_file)
            cur_motifs = _load_motifs(m_file)
            motif_names = sorted(cur_motifs)
            zscores = cfg.get('motifs', 'motif_zscores').strip().split(',')
            for zscore in zscores:
                z_tag = 'z' + zscore
                # start from empty hit lists for every threshold so the
                # location files of one z-score do not inherit the hits
                # found at a previous z-score or in a previous motif file
                motifs_in_peaks = dict((tuple(p), defaultdict(list))
                                       for p in peaks)
                all_motif_percent = {}
                for name, pwm in cur_motifs.items():
                    all_motif_percent[name] = _running_motif_presence(
                        wb_genome, peaks, name, pwm, zscore, motifs_in_peaks)

                # summary: motif name and its running-presence list
                with open(out_summary % z_tag, 'w') as outfile:
                    outfile.writelines(
                        '%s\t%s\n' % (name, percent)
                        for name, percent in all_motif_percent.items())

                # peak locations along with the motif instances in them
                _write_peak_motif_locations(
                    out_locations % z_tag, out_locations_bed % z_tag,
                    peaks, motif_names, motifs_in_peaks)

                ranked = sorted(all_motif_percent.items())
                names = [k for k, v in ranked]
                # transpose so each motif becomes one plotted column
                datapoints = numpy.array([v for k, v in ranked]).T

                # plot original data
                pyplot.plot(datapoints)
                pyplot.legend(names)
                pyplot.title('Motifs from\n%s\nPresence in\n%s' % (
                    m_file_short, in_peaks))
                pyplot.savefig(out_png % z_tag)
                pyplot.close()

                # plot top 10% of data (explicit floor division)
                plot_top = len(datapoints) // 10
                pyplot.plot(datapoints[:plot_top, :])
                pyplot.legend(names)
                pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' % (
                    m_file_short, in_peaks))
                pyplot.savefig(out_png % (z_tag + '.top10percent'))
                pyplot.close()
    finally:
        # always restore the global font size, even when something failed
        matplotlib.rcParams['font.size'] = old_size


def _read_peaks_by_score(in_peaks):
    """Return the rows of the peaks file, highest score (column 5) first."""
    with open(in_peaks) as peaks_file:
        rows = [line.strip().split('\t') for line in peaks_file]
    for row in rows:
        try:
            float(row[4])
        except ValueError:
            # name the offending row instead of failing anonymously in sort
            raise ValueError('non-numeric score (column 5) in %s: %r'
                             % (in_peaks, row))
    return sorted(rows, key=lambda row: float(row[4]), reverse=True)


def _load_motifs(m_file):
    """Return a {name: PWM} dict from a pickled or tab-separated motif file."""
    motifs = {}
    with open(m_file) as infile:
        try:
            # NOTE: unpickling can execute arbitrary code; only feed this
            # motif files that were produced by a trusted source.
            motifs.update(pickle.load(infile))
        except Exception:
            # not a usable pickle: fall back to "name<TAB>consensus" lines
            infile.seek(0)
            for line in infile:
                name, consensus = line.strip('\n').split('\t')
                motifs[name] = sequence_motif.makePWMFromIUPAC(consensus)
    return motifs


def _running_motif_presence(wb_genome, peaks, name, pwm, zscore,
                            motifs_in_peaks):
    """Return, per peak rank, the fraction of peaks so far with a motif hit.

    Hit locations are appended to ``motifs_in_peaks[tuple(peak)][name]`` as
    ``(start, stop, strand)`` tuples.
    """
    with_motif = 0
    percent_with = []  # percent with motif at each peak
    for total, p in enumerate(peaks):
        chrom, start, stop = p[0], int(p[1]), int(p[2])
        region = wb_genome[chrom][start:stop]
        # extend peaks to at least pwm length
        while len(region) < len(pwm):
            region = wb_genome[chrom][region.start - 5:region.stop + 5]
            # catch nasty infinite loops for very short scaffolds
            if len(region) == len(wb_genome[chrom]):
                break
        # check if the motif occurs in the region
        try:
            hits = list(pwm.find_in_region(region, zscore=float(zscore)))
        except Exception as e:
            # best effort: a region that cannot be scanned counts as no hit
            log.debug('issue with sequence %r %s %s', region, name, e)
            hits = []
        if hits:
            with_motif += 1
            # NOTE(review): offsets use the original peak start even when
            # the region was extended above -- confirm whether
            # find_in_region coordinates are relative to the region.
            motifs_in_peaks[tuple(p)][name].extend(
                (h[0] + start, h[1] + start, '+' if h[2] == 1 else '-')
                for h in hits)
        percent_with.append(float(with_motif) / (total + 1))
    return percent_with


def _write_peak_motif_locations(locations_path, bed_path, peaks, motif_names,
                                motifs_in_peaks):
    """Write the per-peak motif table and a BED row for every motif hit."""
    with open(locations_path, 'w') as outfile:
        with open(bed_path, 'w') as out_bed:
            # header is 6 columns of peak info, then motif info
            outfile.write('\t'.join(['p_chrom', 'p_start', 'p_stop',
                                     'p_name', 'p_score', 'p_strand']))
            for motif_name in motif_names:
                outfile.write('\t%s\t#instances_%s' % (motif_name,
                                                       motif_name))
            outfile.write('\n')

            # one line per peak, then for each motif its hit count and all
            # of its instances in a single column
            for p in peaks:
                outfile.write('\t'.join(map(str, p)))
                for motif_name in motif_names:
                    hits = motifs_in_peaks[tuple(p)][motif_name]
                    outfile.write('\t%s\t%s' % (len(hits), hits))
                    for h in hits:
                        out_bed.write('\t'.join(map(str, [
                            p[0], h[0], h[1], motif_name, 1000, h[2]
                        ])) + '\n')
                outfile.write('\n')
Ejemplo n.º 2
0
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count in
    treatment and control is the same.  This may generate many downsampled
    datasets.

    Nothing is written when the ``peaks/downsample_reads`` config option is
    false, or when both uniquefied files already have the same line count.
    Otherwise ``peaks/num_down_samples`` dataset pairs are written; in each
    pair the smaller file is copied unchanged and the larger file is
    randomly thinned to the same number of lines.

    Args:
        in_files: ``(treatment, control)`` pair of read file paths, each
            ending in ``mapped_reads``.
        out_files: unused; output paths are derived from ``in_files`` by
            replacing the trailing ``mapped_reads`` with
            ``matched_size_<i>.mapped_reads`` for sample index ``i``.

    Raises:
        RuntimeError: if an input path does not end in ``mapped_reads``.
    """
    # WARNING: this is a circular dependency.  It has to be included at runtime
    #    Top-level import will cause this module to load only 1/2 way
    #    we import here because we need to call this function directly,
    #    and not just when using ruffus
    from hts_waterworks.visualize import bed_uniquefy
    import shutil
    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
        return

    in_treat, in_control = in_files
    out_treat_template = re.sub(r'mapped_reads$',
                                'matched_size_%s.mapped_reads', in_treat)
    out_control_template = re.sub(r'mapped_reads$',
                                  'matched_size_%s.mapped_reads', in_control)
    if out_treat_template == in_treat:
        raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_treat, out_treat_template))
    if out_control_template == in_control:
        raise RuntimeError('regex substitution failed from %s to %s' % (
                                        in_control, out_control_template))

    tmp_paths = []
    try:
        # reserve four scratch files; close the handles right away because
        # only the names are used from here on
        for _ in range(4):
            handle = tempfile.NamedTemporaryFile(delete=False)
            handle.close()
            tmp_paths.append(handle.name)
        tmp_t_sorted, tmp_c_sorted, tmp_t_unique, tmp_c_unique = tmp_paths

        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)

        # uniquefy the reads
        max_reads = cfg.getint('visualization', 'uniquefy_track_max_reads')
        bed_uniquefy(tmp_t_sorted, tmp_t_unique, max_reads)
        bed_uniquefy(tmp_c_sorted, tmp_c_unique, max_reads)

        # one tag per line, so the tag count is the line count
        with open(tmp_t_unique) as handle:
            total_treat = sum(1 for _ in handle)
        with open(tmp_c_unique) as handle:
            total_control = sum(1 for _ in handle)

        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
            return

        # which side is copied as-is and which is thinned does not change
        # between samples, so decide it once
        if total_treat > total_control:
            # reduce number of treatment reads
            larger_total, smaller_total = total_treat, total_control
            in_orig, orig_template = tmp_c_unique, out_control_template
            in_subset, subset_template = tmp_t_unique, out_treat_template
        else:
            # reduce number of control reads
            larger_total, smaller_total = total_control, total_treat
            in_orig, orig_template = tmp_t_unique, out_treat_template
            in_subset, subset_template = tmp_c_unique, out_control_template

        # downsample num_down_samples times
        for sample_index in xrange(cfg.getint('peaks', 'num_down_samples')):
            inds_to_keep = set(random.sample(xrange(larger_total),
                                             smaller_total))
            # copy without a shell so unusual characters in paths are safe
            shutil.copy(in_orig, orig_template % sample_index)
            # subset the tags
            with open(in_subset) as infile:
                with open(subset_template % sample_index, 'w') as outfile:
                    outfile.writelines(
                        line for line_index, line in enumerate(infile)
                        if line_index in inds_to_keep)
    finally:
        # remove the scratch files even when a step above failed
        for path in tmp_paths:
            if os.path.exists(path):
                os.unlink(path)
Ejemplo n.º 3
0
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count in
    treatment and control is the same.  This may generate many downsampled
    datasets.

    Nothing is written when the ``peaks/downsample_reads`` config option is
    false, or when both uniquefied files already have the same line count.
    Otherwise ``peaks/num_down_samples`` dataset pairs are written; in each
    pair the smaller file is copied unchanged and the larger file is
    randomly thinned to the same number of lines.

    Args:
        in_files: ``(treatment, control)`` pair of read file paths, each
            ending in ``mapped_reads``.
        out_files: unused; output paths are derived from ``in_files`` by
            replacing the trailing ``mapped_reads`` with
            ``matched_size_<i>.mapped_reads`` for sample index ``i``.

    Raises:
        RuntimeError: if an input path does not end in ``mapped_reads``.
    """
    # WARNING: this is a circular dependency.  It has to be included at runtime
    #    Top-level import will cause this module to load only 1/2 way
    #    we import here because we need to call this function directly,
    #    and not just when using ruffus
    from hts_waterworks.visualize import bed_uniquefy
    import shutil
    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
        return

    in_treat, in_control = in_files
    out_treat_template = re.sub(r'mapped_reads$',
                                'matched_size_%s.mapped_reads', in_treat)
    out_control_template = re.sub(r'mapped_reads$',
                                  'matched_size_%s.mapped_reads', in_control)
    if out_treat_template == in_treat:
        raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_treat, out_treat_template))
    if out_control_template == in_control:
        raise RuntimeError('regex substitution failed from %s to %s' % (
                                        in_control, out_control_template))

    tmp_paths = []
    try:
        # reserve four scratch files; close the handles right away because
        # only the names are used from here on
        for _ in range(4):
            handle = tempfile.NamedTemporaryFile(delete=False)
            handle.close()
            tmp_paths.append(handle.name)
        tmp_t_sorted, tmp_c_sorted, tmp_t_unique, tmp_c_unique = tmp_paths

        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)

        # uniquefy the reads
        max_reads = cfg.getint('visualization', 'uniquefy_track_max_reads')
        bed_uniquefy(tmp_t_sorted, tmp_t_unique, max_reads)
        bed_uniquefy(tmp_c_sorted, tmp_c_unique, max_reads)

        # one tag per line, so the tag count is the line count
        with open(tmp_t_unique) as handle:
            total_treat = sum(1 for _ in handle)
        with open(tmp_c_unique) as handle:
            total_control = sum(1 for _ in handle)

        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
            return

        # which side is copied as-is and which is thinned does not change
        # between samples, so decide it once
        if total_treat > total_control:
            # reduce number of treatment reads
            larger_total, smaller_total = total_treat, total_control
            in_orig, orig_template = tmp_c_unique, out_control_template
            in_subset, subset_template = tmp_t_unique, out_treat_template
        else:
            # reduce number of control reads
            larger_total, smaller_total = total_control, total_treat
            in_orig, orig_template = tmp_t_unique, out_treat_template
            in_subset, subset_template = tmp_c_unique, out_control_template

        # downsample num_down_samples times
        for sample_index in xrange(cfg.getint('peaks', 'num_down_samples')):
            inds_to_keep = set(random.sample(xrange(larger_total),
                                             smaller_total))
            # copy without a shell so unusual characters in paths are safe
            shutil.copy(in_orig, orig_template % sample_index)
            # subset the tags
            with open(in_subset) as infile:
                with open(subset_template % sample_index, 'w') as outfile:
                    outfile.writelines(
                        line for line_index, line in enumerate(infile)
                        if line_index in inds_to_keep)
    finally:
        # remove the scratch files even when a step above failed
        for path in tmp_paths:
            if os.path.exists(path):
                os.unlink(path)
Ejemplo n.º 4
0
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks.

    Peaks are ranked by descending score.  For every motif file, every
    z-score listed in the ``motifs/motif_zscores`` config option and every
    motif, the running fraction of peaks (best peak first) that contain at
    least one motif hit is computed, written out and plotted.

    Args:
        in_files: sequence whose first item is the peaks file and whose
            remaining items are motif files.  The peaks file is
            tab-separated with chrom, start, stop in columns 1-3 and a
            numeric score in column 5.  A motif file is either a pickled
            mapping of motif name -> PWM or text lines of
            ``name<TAB>IUPAC consensus``.
        out_patterns: unused; output names are built from ``in_prefix`` and
            ``in_suffix``.
        in_prefix: leading part of every output file name.
        in_suffix: appended to ``in_prefix`` before the per-z-score suffix.

    Output files, where ``<z>`` is ``'z'`` + the z-score string.  The names
    do not depend on the motif file, so each motif file overwrites the
    output of the previous one:
        ``<prefix><suffix>.<z>.peak_motif_presence``: motif name and its
            running-presence list, one motif per line.
        ``<prefix><suffix>.<z>.peak_motif_locations``: one row per peak
            with the hit count and hit list of every motif.
        ``<prefix><suffix>.<z>.peak_motif_locations.bed``: one row per hit.
        ``<prefix><suffix>.<z>.peak_motif_presence.png`` and
        ``<prefix><suffix>.<z>.top10percent.peak_motif_presence.png``:
            plots over all peaks and over the first tenth of the peaks.

    Raises:
        ValueError: if a peak score cannot be parsed as a float.
    """
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_base = in_prefix + in_suffix
    out_summary = out_base + '.%s.peak_motif_presence'
    out_png = out_base + '.%s.peak_motif_presence.png'
    out_locations = out_base + '.%s.peak_motif_locations'
    out_locations_bed = out_base + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    try:
        peaks = _read_peaks_by_score(in_peaks)
        log.debug('read %s peaks from %s', len(peaks), in_peaks)
        for m_file in in_motifs:
            # shortened motif file name, used only in the plot titles
            m_file_short = re.sub(
                r'((treat|fastq|fastq_illumina|min_qual|bowtie|'
                r'maq|peaks|with_mean_sd|discovered|'
                r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|'
                r'[0-9]+_around|small_sample)\.)+(motifs\.*)*',
                '', m_file)
            cur_motifs = _load_motifs(m_file)
            motif_names = sorted(cur_motifs)
            zscores = cfg.get('motifs', 'motif_zscores').strip().split(',')
            for zscore in zscores:
                z_tag = 'z' + zscore
                # start from empty hit lists for every threshold so the
                # location files of one z-score do not inherit the hits
                # found at a previous z-score or in a previous motif file
                motifs_in_peaks = dict((tuple(p), defaultdict(list))
                                       for p in peaks)
                all_motif_percent = {}
                for name, pwm in cur_motifs.items():
                    all_motif_percent[name] = _running_motif_presence(
                        wb_genome, peaks, name, pwm, zscore, motifs_in_peaks)

                # summary: motif name and its running-presence list
                with open(out_summary % z_tag, 'w') as outfile:
                    outfile.writelines(
                        '%s\t%s\n' % (name, percent)
                        for name, percent in all_motif_percent.items())

                # peak locations along with the motif instances in them
                _write_peak_motif_locations(
                    out_locations % z_tag, out_locations_bed % z_tag,
                    peaks, motif_names, motifs_in_peaks)

                ranked = sorted(all_motif_percent.items())
                names = [k for k, v in ranked]
                # transpose so each motif becomes one plotted column
                datapoints = numpy.array([v for k, v in ranked]).T

                # plot original data
                pyplot.plot(datapoints)
                pyplot.legend(names)
                pyplot.title('Motifs from\n%s\nPresence in\n%s' % (
                    m_file_short, in_peaks))
                pyplot.savefig(out_png % z_tag)
                pyplot.close()

                # plot top 10% of data (explicit floor division)
                plot_top = len(datapoints) // 10
                pyplot.plot(datapoints[:plot_top, :])
                pyplot.legend(names)
                pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' % (
                    m_file_short, in_peaks))
                pyplot.savefig(out_png % (z_tag + '.top10percent'))
                pyplot.close()
    finally:
        # always restore the global font size, even when something failed
        matplotlib.rcParams['font.size'] = old_size


def _read_peaks_by_score(in_peaks):
    """Return the rows of the peaks file, highest score (column 5) first."""
    with open(in_peaks) as peaks_file:
        rows = [line.strip().split('\t') for line in peaks_file]
    for row in rows:
        try:
            float(row[4])
        except ValueError:
            # name the offending row instead of failing anonymously in sort
            raise ValueError('non-numeric score (column 5) in %s: %r'
                             % (in_peaks, row))
    return sorted(rows, key=lambda row: float(row[4]), reverse=True)


def _load_motifs(m_file):
    """Return a {name: PWM} dict from a pickled or tab-separated motif file."""
    motifs = {}
    with open(m_file) as infile:
        try:
            # NOTE: unpickling can execute arbitrary code; only feed this
            # motif files that were produced by a trusted source.
            motifs.update(pickle.load(infile))
        except Exception:
            # not a usable pickle: fall back to "name<TAB>consensus" lines
            infile.seek(0)
            for line in infile:
                name, consensus = line.strip('\n').split('\t')
                motifs[name] = sequence_motif.makePWMFromIUPAC(consensus)
    return motifs


def _running_motif_presence(wb_genome, peaks, name, pwm, zscore,
                            motifs_in_peaks):
    """Return, per peak rank, the fraction of peaks so far with a motif hit.

    Hit locations are appended to ``motifs_in_peaks[tuple(peak)][name]`` as
    ``(start, stop, strand)`` tuples.
    """
    with_motif = 0
    percent_with = []  # percent with motif at each peak
    for total, p in enumerate(peaks):
        chrom, start, stop = p[0], int(p[1]), int(p[2])
        region = wb_genome[chrom][start:stop]
        # extend peaks to at least pwm length
        while len(region) < len(pwm):
            region = wb_genome[chrom][region.start - 5:region.stop + 5]
            # catch nasty infinite loops for very short scaffolds
            if len(region) == len(wb_genome[chrom]):
                break
        # check if the motif occurs in the region
        try:
            hits = list(pwm.find_in_region(region, zscore=float(zscore)))
        except Exception as e:
            # best effort: a region that cannot be scanned counts as no hit
            log.debug('issue with sequence %r %s %s', region, name, e)
            hits = []
        if hits:
            with_motif += 1
            # NOTE(review): offsets use the original peak start even when
            # the region was extended above -- confirm whether
            # find_in_region coordinates are relative to the region.
            motifs_in_peaks[tuple(p)][name].extend(
                (h[0] + start, h[1] + start, '+' if h[2] == 1 else '-')
                for h in hits)
        percent_with.append(float(with_motif) / (total + 1))
    return percent_with


def _write_peak_motif_locations(locations_path, bed_path, peaks, motif_names,
                                motifs_in_peaks):
    """Write the per-peak motif table and a BED row for every motif hit."""
    with open(locations_path, 'w') as outfile:
        with open(bed_path, 'w') as out_bed:
            # header is 6 columns of peak info, then motif info
            outfile.write('\t'.join(['p_chrom', 'p_start', 'p_stop',
                                     'p_name', 'p_score', 'p_strand']))
            for motif_name in motif_names:
                outfile.write('\t%s\t#instances_%s' % (motif_name,
                                                       motif_name))
            outfile.write('\n')

            # one line per peak, then for each motif its hit count and all
            # of its instances in a single column
            for p in peaks:
                outfile.write('\t'.join(map(str, p)))
                for motif_name in motif_names:
                    hits = motifs_in_peaks[tuple(p)][motif_name]
                    outfile.write('\t%s\t%s' % (len(hits), hits))
                    for h in hits:
                        out_bed.write('\t'.join(map(str, [
                            p[0], h[0], h[1], motif_name, 1000, h[2]
                        ])) + '\n')
                outfile.write('\n')