def run_macs(in_files, out_peaks, max_fdr):
    """Call peaks with MACS (v1.3) and write FDR-filtered BED files.

    Runs ``macs`` on a treatment/control pair, then converts the resulting
    ``<name>_peaks.xls`` table into two BED-style files:

    * ``out_peaks``: one line per peak passing the FDR threshold, with an
      integer score and a ``+`` strand.
    * ``out_peaks + '_summits.<size>_around'``: a window of
      ``peak_summit_size`` (from the ``peaks`` config section) centered on
      the middle of each passing peak, i.e. the peak center is treated as
      the summit.

    Parameters:
        in_files: sequence whose first element is the pair
            ``(treatment_reads, control_reads)``; the treatment filename
            must match ``*.treat*.mapped_reads``.
        out_peaks: path of the peak BED file to write.
        max_fdr: maximum value of the FDR column (field 9 of the xls table)
            for a peak to be kept.

    Raises:
        AttributeError: if the treatment filename does not match the
            expected ``.treat*.mapped_reads`` pattern.
    """
    in_treat, in_control = in_files[0]
    name_head, name_tail = re.search(r'(.*\.treat)(.*)\.mapped_reads',
                                     in_treat).groups()
    name = name_head + name_tail + '.macs.peaks'
    # NOTE: max_fdr is taken from the argument; it used to be overwritten here
    # by re-reading cfg, which silently ignored whatever the caller passed.
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs_params'))
    sys_call(cmd)

    summit_size = cfg.getint('peaks', 'peak_summit_size')
    # explicit floor division: same result as Python 2 int '/', but stays an
    # integer if true division is ever enabled
    half_window = summit_size // 2
    summits_path = out_peaks + '_summits.%s_around' % \
        cfg.get('peaks', 'peak_summit_size')

    # convert to proper bedfile- ints for score and + for strand, and in the
    # same pass take the region surrounding the peak center as the summit
    with open(out_peaks, 'w') as peaks_out:
        with open(summits_path, 'w') as summits_out:
            for index, fields in _iter_macs_peak_rows(name + '_peaks.xls'):
                p_start = max(0, int(fields[1]))
                p_stop = int(fields[2])
                # BED scores must be integers within [0, 1000]
                score = str(max(0, min(1000, int(float(fields[6])))))
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    peak_id = 'MACS_peak_%s' % (index + 1)
                    peaks_out.write('\t'.join([fields[0], str(p_start),
                                               fields[2], peak_id, score])
                                    + '\t+\n')
                    p_center = p_start + (p_stop - p_start) // 2
                    # clamp like the peak start above: a peak close to the
                    # chromosome start would otherwise get a negative
                    # coordinate, which is not valid in a BED file
                    s_start = max(0, p_center - half_window)
                    s_stop = p_center + half_window
                    summits_out.write('\t'.join([fields[0], str(s_start),
                                                 str(s_stop), peak_id, score])
                                      + '\t+\n')


def _iter_macs_peak_rows(xls_path):
    """Yield ``(index, fields)`` for every data row of a MACS peaks table.

    ``index`` counts the lines that pass ``bedCommentFilter`` (the header row
    included), which keeps the peak numbering identical to the original
    two-loop implementation; the header row itself (first field ``chr``) is
    not yielded.
    """
    with open(xls_path) as infile:
        kept_lines = (line for line in infile if bedCommentFilter(line))
        for index, line in enumerate(kept_lines):
            fields = line.strip().split('\t')
            if fields[0] == 'chr':
                continue  # skip header
            yield index, fields
short_name = lambda x: x.replace('hg19.refseq_genes.', '').split('.trim_regex')[0] + ('.plus' if 'plus' in x else '.minus') @active_if(cfg.getboolean('PAS-Seq', 'test_differential_polya')) @follows(remove_terminal_exon, pileup_normalize_per_million) @split(get_refseq_genes, regex(r'(.*)'), add_inputs(pileup_normalize_per_million if cfg.getboolean('visualization', 'normalize_per_million') else remove_terminal_exon), #@split(get_refseq_genes, regex(r'(.*)'), add_inputs('*.no_prime.norm_mil.pileup_reads'), #@split('hg19.refseq_genes.extend3utr', regex(r'(.*)'), add_inputs('*.pileup_reads'), r'\1.*.polya.*.*_test', cfg.getint('PAS-Seq', 'compare_window_width'), r'\1.%s.vs.%s.polya.%s.fisher_test', r'\1.%s.vs.%s.polya.%s.t_test', r'\1.%s.vs.%s.polya.%s.avg_fisher_test', cfg.getfloat('PAS-Seq', 'min_score_for_site')) def test_differential_polya(in_files, out_pattern, max_dist, out_template, ttest_template, avg_fisher_template, min_score): """Test for differential poly-adenylation from PAS-seq pileups. Performs all pairwise tests of merged poly-A sites across all experiments. """ print in_files in_genes, all_reads = in_files[0], in_files[1:] all_reads = filter(lambda f: f.endswith('pileup_reads'), all_reads) all_reads = sorted(all_reads) #read_counts = map(lambda f: sum(1 for i in open(f)), all_reads) if len(all_reads) == 0: raise RuntimeError('differential polyadenylation requires multiple ' 'input datasets! I saw %s ', cur_reads) out_files = {}
add_inputs, regex, suffix, mkdir, jobs_limit, output_from) from ruffus.task import active_if from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log, main_mutex as log_mtx) from hts_waterworks.bootstrap import cfg, get_chrom_sizes, genome_path import hts_waterworks.mapping as mapping import hts_waterworks.clip_seq as clip_seq from hts_waterworks.utils.common import (bedCommentFilter, readBedLines, parse_ucsc_range) @active_if(cfg.getboolean('peaks', 'run_macs')) @collate(mapping.all_mappers_output, regex(r'(.+)\.treat(.*)\.mapped_reads'), add_inputs(r'\1.control\2.mapped_reads'), r'\1.treat\2.macs.peaks', cfg.getfloat('peaks', 'max_FDR')) def run_macs(in_files, out_peaks, max_fdr): """Call peak with MACS (v1.3). Apply a maximum FDR threshold and treat centers as peak summits """ in_treat, in_control = in_files[0] matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups() name = matches[0] + matches[1] + '.macs.peaks' max_fdr = cfg.getfloat('peaks', 'max_FDR') cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name, cfg.get('peaks', 'macs_params')) sys_call(cmd) # convert to proper bedfile- ints for score and + for strand with open(out_peaks, 'w') as outfile: