Ejemplo n.º 1
0
def run_macs(in_files, out_peaks, max_fdr):
    """Call peaks with MACS (v1.3) and write FDR-filtered BED files.

    Runs the ``macs`` command line tool on a treatment/control pair and then
    converts the ``<name>_peaks.xls`` table it produces into two 6-column BED
    files, both restricted to rows whose FDR column is <= ``max_fdr``:

    * ``out_peaks`` -- the full peak regions.
    * ``out_peaks + '_summits.<size>_around'`` -- a window of
      ``peak_summit_size`` (from the ``[peaks]`` config section) centred on
      the midpoint of each peak, which is treated as the summit.

    Parameters:
        in_files:  sequence whose first element is a
                   ``(treatment_reads, control_reads)`` pair of file names;
                   the treatment name must match
                   ``(.*\\.treat)(.*)\\.mapped_reads``.
        out_peaks: path of the BED file to write for the peak regions.
        max_fdr:   maximum value of the FDR column (column 9 of the MACS
                   table) for a peak to be kept.

    Raises:
        AttributeError: if the treatment file name does not match the
                        expected ``.treat*.mapped_reads`` pattern.
    """
    in_treat, in_control = in_files[0]
    stem, tail = re.search(r'(.*\.treat)(.*)\.mapped_reads',
                           in_treat).groups()
    name = stem + tail + '.macs.peaks'
    # NOTE: max_fdr is taken from the argument; it used to be silently
    # overwritten with cfg.getfloat('peaks', 'max_FDR') here, which made the
    # parameter meaningless.
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                               cfg.get('peaks', 'macs_params'))
    sys_call(cmd)

    # Floor division keeps coordinates integral regardless of Python version.
    half_summit = cfg.getint('peaks', 'peak_summit_size') // 2
    # The raw config string is used in the file name, exactly as written in
    # the config file.
    summits_path = out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size')

    # Open the MACS table first so that no empty output files are left behind
    # if MACS failed to produce it, then fill both BED files in a single pass.
    with open(name + '_peaks.xls') as infile:
        with open(out_peaks, 'w') as peaks_out:
            with open(summits_path, 'w') as summits_out:
                rows = (line for line in infile if bedCommentFilter(line))
                for index, line in enumerate(rows):
                    fields = line.strip().split('\t')
                    if fields[0] == 'chr':
                        continue  # skip header
                    chrom = fields[0]
                    peak_name = 'MACS_peak_%s' % (index + 1)
                    # BED needs an integer score in [0, 1000]; column 7 is
                    # presumably MACS' -10*log10(pvalue) -- TODO confirm.
                    score = str(max(0, min(1000, int(float(fields[6])))))
                    p_start, p_stop = max(0, int(fields[1])), int(fields[2])
                    fdr = float(fields[8])
                    if fdr <= max_fdr:
                        peaks_out.write('\t'.join([chrom, str(p_start),
                                                   fields[2], peak_name,
                                                   score]) + '\t+\n')
                        # take region surrounding the peak center as summit
                        p_center = p_start + (p_stop - p_start) // 2
                        # Clamp at 0: a summit window wider than the distance
                        # to the chromosome start would otherwise produce a
                        # negative (invalid) BED coordinate.
                        s_start = max(0, p_center - half_summit)
                        s_stop = p_center + half_summit
                        summits_out.write('\t'.join([chrom, str(s_start),
                                                     str(s_stop), peak_name,
                                                     score]) + '\t+\n')
Ejemplo n.º 2
0
def short_name(x):
    """Return a compact label for a pileup/gene file name.

    The ``'hg19.refseq_genes.'`` prefix is removed, everything from the first
    ``'.trim_regex'`` onwards is dropped, and a strand suffix is appended:
    ``'.plus'`` when the original name contains ``'plus'`` anywhere,
    otherwise ``'.minus'``.
    """
    strand_tag = '.minus'
    if 'plus' in x:
        strand_tag = '.plus'
    without_prefix = x.replace('hg19.refseq_genes.', '')
    head = without_prefix.split('.trim_regex')[0]
    return head + strand_tag

@active_if(cfg.getboolean('PAS-Seq', 'test_differential_polya'))
@follows(remove_terminal_exon, pileup_normalize_per_million)
@split(get_refseq_genes, regex(r'(.*)'),
       add_inputs(pileup_normalize_per_million if
                  cfg.getboolean('visualization', 'normalize_per_million') else
                  remove_terminal_exon),
#@split(get_refseq_genes, regex(r'(.*)'), add_inputs('*.no_prime.norm_mil.pileup_reads'),
#@split('hg19.refseq_genes.extend3utr', regex(r'(.*)'), add_inputs('*.pileup_reads'),
           r'\1.*.polya.*.*_test',
           cfg.getint('PAS-Seq', 'compare_window_width'),
           r'\1.%s.vs.%s.polya.%s.fisher_test',
           r'\1.%s.vs.%s.polya.%s.t_test',
           r'\1.%s.vs.%s.polya.%s.avg_fisher_test',
           cfg.getfloat('PAS-Seq', 'min_score_for_site'))
def test_differential_polya(in_files, out_pattern, max_dist, out_template,
                            ttest_template, avg_fisher_template, min_score):
    """Test for differential poly-adenylation from PAS-seq pileups.

    Performs all pairwise tests of merged poly-A sites across all experiments.

    Parameters (as supplied by the ruffus ``@split`` decorator above):
        in_files:            the gene annotation file first, followed by the
                             additional inputs; only names ending in
                             ``pileup_reads`` are kept as read datasets.
        out_pattern:         glob pattern of the output files of this task.
        max_dist:            ``compare_window_width`` from the ``[PAS-Seq]``
                             config section.
        out_template:        ``%s``-template for the fisher test output name.
        ttest_template:      ``%s``-template for the t-test output name.
        avg_fisher_template: ``%s``-template for the averaged fisher test
                             output name.
        min_score:           ``min_score_for_site`` from the ``[PAS-Seq]``
                             config section.

    Raises:
        RuntimeError: if none of the inputs is a ``pileup_reads`` file.
    """
    print(in_files)
    in_genes, candidates = in_files[0], in_files[1:]
    all_reads = sorted(f for f in candidates if f.endswith('pileup_reads'))
    #read_counts = map(lambda f: sum(1 for i in open(f)), all_reads)
    if not all_reads:
        # Report the unfiltered inputs: the filtered list is empty here, so
        # showing it would not help diagnose what was actually passed in.
        # (The message used to reference an undefined name and was never
        # %-formatted, so this path raised NameError instead.)
        raise RuntimeError('differential polyadenylation requires multiple '
                           'input datasets! I saw %s ' % (candidates,))
    out_files = {}
Ejemplo n.º 3
0
                    add_inputs, regex, suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                           main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg, get_chrom_sizes, genome_path
import hts_waterworks.mapping as mapping
import hts_waterworks.clip_seq as clip_seq
from hts_waterworks.utils.common import (bedCommentFilter, readBedLines,
                                         parse_ucsc_range)


@active_if(cfg.getboolean('peaks', 'run_macs'))
@collate(mapping.all_mappers_output, regex(r'(.+)\.treat(.*)\.mapped_reads'), 
         add_inputs(r'\1.control\2.mapped_reads'), r'\1.treat\2.macs.peaks',
         cfg.getfloat('peaks', 'max_FDR'))
def run_macs(in_files, out_peaks, max_fdr):
    """Call peak with MACS (v1.3).
    Apply a maximum FDR threshold and treat centers as peak summits
    
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                               cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
    
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile: