def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    'extend reads to the full fragment length and create per-strand bedgraphs from them'
    in_bed, in_chrom_sizes = in_files
    extend_by = (cfg.getint('DEFAULT', 'fragment_size') -
                 cfg.getint('DEFAULT', 'tag_size'))
    # one pass per strand: filter on the BED strand column, then count coverage
    for strand, out_bedgraph in zip(['+', '-'], out_bedgraphs):
        cmd = ("""slopBed -i %s -s -r %s -l 0 -g %s | """
               """awk '{if ($6 == "%s") print $0}' | """
               'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s') % (
                   in_bed, extend_by, in_chrom_sizes, strand,
                   cfg.get('DEFAULT', 'genome'), genome_path(), out_bedgraph)
        sys_call(cmd)

def bed_to_bedgraph(in_files, out_bedgraph):
    'extend reads to the full fragment length and create a bedgraph from them'
    in_bed, in_chrom_sizes = in_files
    cmd = ('slopBed -i %s -s -r %s -l 0 -g %s | '
           'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s') % (
               in_bed,
               cfg.getint('DEFAULT', 'fragment_size') -
                   cfg.getint('DEFAULT', 'tag_size'),
               in_chrom_sizes, cfg.get('DEFAULT', 'genome'),
               genome_path(), out_bedgraph)
    sys_call(cmd)

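# Usage sketch for the two bedgraph tasks above (hypothetical names and
# config values: fragment_size=200, tag_size=36, genome=hg19).  Each read is
# extended by 200 - 36 = 164 bp on its 3' end (slopBed -s -r 164 -l 0) before
# per-base coverage is counted, so the generated shell command looks like:
#   slopBed -i sample.treat.mapped_reads -s -r 164 -l 0 -g hg19.chrom.sizes \
#       | bedItemOverlapCount hg19 -chromSize=<genome_path>.chrom.sizes stdin \
#       > sample.treat.bedgraph
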
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks"""
    out_sample, out_locations = out_files[:2]
    peak_lengths = array('i', (stop - start
                               for chrom, start, stop, strand in
                                   readBedLines(open(in_peaks))))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(wb_genome, peak_lengths, control_file,
                                        sampleSize=cfg.getint('motifs',
                                            'motif_significance_sample_size'))
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq),
                                                    str(seq)))
                    outlocations.write('\t'.join([seq.id, str(seq.start),
                        str(seq.stop), str(index), '0',
                        '+' if seq.orientation == 1 else '-']) + '\n')

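# Filename-mapping sketch for the re.sub() chain above (hypothetical name):
#   'exp1.treat.bowtie.macs14.peaks.top500.peaks'
#       -> 'exp1.control.bowtie.mapped_reads'
# i.e. swap treat -> control, drop the .topN.peaks / _summits.N_around parts,
# rename peaks -> mapped_reads, and strip the peak-caller suffix.
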
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks"""
    out_sample, out_locations = out_files[:2]
    peak_lengths = array('i', (stop - start
                               for chrom, start, stop, strand in
                                   readBedLines(open(in_peaks))))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(wb_genome, peak_lengths,
                               sampleSize=cfg.getint('motifs',
                                   'motif_significance_sample_size'),
                               excludeRepeat=cfg.getboolean('motifs',
                                   'sampling_exclude_repeats'),
                               excludeN=cfg.getboolean('motifs',
                                   'sampling_exclude_N'),
                               ignoreCharacters='_', weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join([line.id, str(line.start),
                    str(line.stop), str(index), '0',
                    '+' if line.orientation == 1 else '-']) + '\n')

def run_mosaik_align(in_files, out_align):
    'align reads to reference using MosaikAligner'
    # MosaikAligner -in sequence_archives/c_elegans_chr2_test.dat
    #   -out sequence_archives/c_elegans_chr2_test_aligned.dat
    #   -ia reference/c.elegans_chr2.dat -hs 14 -act 17 -mm 2 -m unique
    in_reads, in_genome_dat, in_genome_jump, _, _ = in_files
    in_genome_jump = in_genome_jump.replace('_keys.jmp', '')
    cmd = 'MosaikAligner -in %s -ia %s -j %s -out %s -hs %s %s' % (
        in_reads, in_genome_dat, in_genome_jump, out_align,
        cfg.getint('mapping', 'mosaik_hash_size'),
        cfg.get('mapping', 'mosaik_params'))
    sys_call(cmd)

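# Note on the _keys.jmp strip above: MosaikJump (see run_mosiak_jump_reference
# below) writes three files sharing one basename (<base>_keys.jmp,
# <base>_meta.jmp, <base>_positions.jmp), while MosaikAligner's -j flag takes
# just <base>.  The task chain passes the _keys.jmp path, so stripping the
# suffix recovers the basename, e.g. (hypothetical name)
#   'hg19.fa.mosaik_jump_14_keys.jmp' -> 'hg19.fa.mosaik_jump_14'
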
def run_macs(in_files, out_peaks, max_fdr):
    """Call peaks with MACS (v1.3).
    Apply a maximum FDR threshold and treat centers as peak summits
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
    # convert to a proper bed file: integer scores and '+' for strand
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                             'MACS_peak_%s' % (index + 1),
                                             score]) + '\t+\n')
    # take the region surrounding the peak center as the summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % summit_size, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                score = str(max(0, min(1000, int(float(fields[6])))))
                p_start, p_stop = max(0, int(fields[1])), int(fields[2])
                p_center = p_start + (p_stop - p_start) / 2
                s_start = p_center - summit_size / 2
                s_stop = p_center + summit_size / 2
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], str(s_start),
                                             str(s_stop),
                                             'MACS_peak_%s' % (index + 1),
                                             score]) + '\t+\n')

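# Summit-window sketch for run_macs (hypothetical values): a peak spanning
# [1000, 1400) has center 1200, so with peak_summit_size=200 the summit file
# gets [1100, 1300).  Column use assumes the MACS 1.x _peaks.xls layout:
# fields[1]/fields[2] = start/end, fields[6] = -10*log10(p-value) (capped to
# 0..1000 as the BED score), fields[8] = FDR in percent.
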
def motif_select_random_seqs(in_fasta, out_pattern):
    """Split a fasta file into several chunks so motif discovery is easier"""
    name = re.search(r'(.*)\.fasta', in_fasta).groups()[0]
    with open(in_fasta) as infile:
        seqs = list(parseFastaLines(infile))
        if len(seqs) <= cfg.getint('motifs', 'motif_chunk_size'):
            num_chunks = 1
        else:
            num_chunks = cfg.getint('motifs', 'motif_num_chunks')
        # get a random sample of peaks for each chunk
        for i in xrange(num_chunks):
            with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
                subset = random.sample(seqs, min(len(seqs),
                                    cfg.getint('motifs', 'motif_chunk_size')))
                outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                                   for s in subset)

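# Chunking sketch (hypothetical numbers): with 1200 input sequences,
# motif_chunk_size=500 and motif_num_chunks=3, three files of 500 sequences
# each are written.  Each chunk is sampled without replacement, but the
# chunks are drawn independently, so a sequence may appear in more than one.
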
def find_nearby_genes(in_files, out_genes):
    """report which genes are within a certain distance of a peak"""
    in_peaks, in_genes = in_files[0]
    tmp_output = tempfile.NamedTemporaryFile(delete=False).name
    cmd = 'closestBed -a %s -b %s -t first -d > %s' % (in_peaks, in_genes,
                                                       tmp_output)
    sys_call(cmd)
    with open(tmp_output) as infile:
        with open(out_genes, 'w') as outfile:
            for line in infile:
                if not line:
                    continue
                fields = line.strip().split('\t')
                dist = int(fields[-1])
                if abs(dist) <= cfg.getint('genes', 'nearby_genes_max_dist'):
                    outfile.write(line)
    os.unlink(tmp_output)

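# closestBed -d appends the peak-to-gene distance as the final column of each
# output line (0 when the features overlap), so a hypothetical line like
#   chr1  1000  1200  peak_1  900  +  chr1  5000  7000  geneA  0  +  3801
# is kept only if that last field is within genes.nearby_genes_max_dist.
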
def run_macs14(in_files, out_peaks, max_fdr):
    """Call peaks using MACS (v1.4). Apply a maximum FDR threshold."""
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs14.peaks'
    cmd = 'macs14 -t %s -c %s --name=%s %s --diag' % (
        in_treat, in_control, name, cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    peaks_to_keep = set()
    # convert to a proper bed file: integer scores and '+' for strand
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                             'MACS14_peak_%s' % (index + 1),
                                             score]) + '\t+\n')
                    peaks_to_keep.add(index)
    # take the region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % summit_size, 'w') as outfile:
        with open(name + '_summits.bed') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                # score is the number of reads at the summit
                score = str(max(0, min(1000, int(float(fields[-1])))))
                start = str(max(0, int(fields[1]) - summit_size / 2))
                stop = str(int(fields[2]) + summit_size / 2)
                if index in peaks_to_keep:
                    outfile.write('\t'.join([fields[0], start, stop,
                                             'MACS14_peak_%s' % (index + 1),
                                             score]) + '\t+\n')

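# peaks_to_keep assumption: MACS 1.4 writes _peaks.xls and _summits.bed from
# the same peak list in the same order, so the enumerate() index from the
# first loop identifies the matching summit row in the second loop -- e.g.
# if xls row 3 fails the FDR cut, summit row 3 is dropped as well.
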
def run_macs14_no_control(in_treat, out_peaks):
    """Call peaks using MACS (v1.4) without control data"""
    cmd = 'macs14 -t %s --name=%s %s' % (in_treat, out_peaks,
                                         cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    peaks_to_keep = set()
    # convert to a proper bed file: integer scores and '+' for strand
    with open(out_peaks, 'w') as outfile:
        with open(out_peaks + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                outfile.write('\t'.join([fields[0], start, fields[2],
                                         'MACS14_peak_%s' % (index + 1),
                                         score]) + '\t+\n')
                peaks_to_keep.add(index)
    # take the region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % summit_size, 'w') as outfile:
        with open(out_peaks + '_summits.bed') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                           infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue  # skip header
                # score is the number of reads at the summit
                score = str(max(0, min(1000, int(float(fields[-1])))))
                start = str(max(0, int(fields[1]) - summit_size / 2))
                stop = str(int(fields[2]) + summit_size / 2)
                if index in peaks_to_keep:
                    outfile.write('\t'.join([fields[0], start, stop,
                                             'MACS14_peak_%s' % (index + 1),
                                             score]) + '\t+\n')

@transform(call_peaks.all_peak_caller_functions +
           [pas_seq.remove_terminal_exon] +
           [clip_seq.search_genome_consensus] +
           mapping.all_mappers_output + mapping.all_mappers_raw_reads,
           suffix(''), '.clipped.sorted')
def clip_and_sort_peaks(in_bed, out_sorted):
    """Sort the bed file and constrain bed regions to chromosome sizes"""
    with tempfile.NamedTemporaryFile() as tmp_clipped:
        cmd = 'bedClip %s %s.chrom.sizes %s' % (in_bed, genome_path(),
                                                tmp_clipped.name)
        sys_call(cmd)
        #cmd = 'bedSort %s %s' % (out_clipped, out_sorted)
        cmd = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % (
            tmp_clipped.name, out_sorted)
        sys_call(cmd)

@active_if(cfg.getboolean('visualization', 'uniquefy_track'))
@transform([clip_and_sort_peaks] + mapping.all_mappers_output, suffix(''),
           '.unique',
           cfg.getint('visualization', 'uniquefy_track_max_reads'))
def bed_uniquefy(in_bed, out_bed, max_reads):
    'Given a sorted bed file, remove tags that are on the same start, strand'
    with open(in_bed) as infile:
        with open(out_bed, 'w') as outfile:
            prev_start, prev_chrom = None, None
            plus_seen, minus_seen = 0, 0
            for line in infile:
                fields = line.split('\t')
                chrom, start, stop = fields[:3]
                if prev_start is None or prev_start != start or \
                                        prev_chrom != chrom:
                    prev_start, prev_chrom = start, chrom
                    plus_seen, minus_seen = 0, 0
                # keep at most max_reads extra tags per (chrom, start, strand);
                # the else branch below is reconstructed from the truncated
                # original by mirroring the '+' branch
                if len(fields) < 6 or fields[5] == '+':
                    if plus_seen <= max_reads:
                        outfile.write(line)
                    plus_seen += 1
                else:
                    if minus_seen <= max_reads:
                        outfile.write(line)
                    minus_seen += 1

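# Behavior sketch for bed_uniquefy (hypothetical tags): with max_reads=0,
# five '+' tags and two '-' tags all starting at chr1:100 collapse to one
# kept tag per strand, since a tag is written only while its strand's
# counter is still <= max_reads at that (chrom, start).
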
# (reconstructed head: the original fragment begins mid-function at the cmd2
# line below; the function name comes from the prev_output assignment that
# follows, and cmd1 is inferred from the parallel filter_min_quality task)
def filter_artifacts(in_fastq, out_fastq):
    """Remove reads flagged as artifacts by fastx_artifacts_filter"""
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastx_artifacts_filter -o %s -z' % (out_fastq)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    p2.communicate()
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)

if cfg.getboolean('filtering', 'filter_artifacts'):
    prev_output = filter_artifacts
    prev_suffix = ''

@active_if(cfg.getboolean('filtering', 'filter_quality'))
@transform(prev_output, suffix(prev_suffix), '.min_qual',
           cfg.getint('filtering', 'filter_min_quality'),
           cfg.getint('filtering', 'filter_percent_bases_at_min'))
def filter_min_quality(in_fastq, out_fastq, min_qual, min_percent):
    """Remove reads in which fewer than min_percent of bases have a quality
    score of at least min_qual
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastq_quality_filter -o %s -q %s -p %s' % (out_fastq, min_qual,
                                                       min_percent)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    p2.communicate()
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)

if cfg.getboolean('filtering', 'filter_quality'):
    prev_output = filter_min_quality

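# The two-Popen pattern above is just a shell pipe; with hypothetical values
# min_qual=20 and min_percent=80, filter_min_quality is equivalent to:
#   cat in.fastq | fastq_quality_filter -o in.fastq.min_qual -q 20 -p 80
# cat feeds the pipe so the fastx tool reads from stdin, and both return
# codes are checked because a failure in either stage should abort the run.
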
            # (tail of a truncated function that builds the MSA-backed gene
            # database; indentation reconstructed)
            p_start += len(region)
            gene_db[seq_id] = sequence
    msa.build(saveSeqDict=True)
    outfile.close()
    pickle.dump(gene_db, open(out_db, 'w'))

@active_if(cfg.getboolean('mapping', 'run_mosaik'))
@transform(reference_genomes, suffix(''), '.mosaik_dat')
def run_mosaik_build_reference(in_genome, out_bin):
    'convert reference to mosaik binary'
    cmd = 'MosaikBuild -fr %s -oa %s' % (in_genome, out_bin)
    sys_call(cmd)

mosaik_suffix_base = r'\1.mosaik_jump_%s' % cfg.getint('mapping',
                                                       'mosaik_hash_size')

@split(run_mosaik_build_reference, regex(r'(.*)\.mosaik_dat'),
       [mosaik_suffix_base + '_keys.jmp', mosaik_suffix_base + '_meta.jmp',
        mosaik_suffix_base + '_positions.jmp'],
       mosaik_suffix_base)
def run_mosiak_jump_reference(in_dat, _, out_jump_base):
    'create mosaik jump db on reference'
    cmd = 'MosaikJump -ia %s -out %s -hs %s' % (in_dat, out_jump_base,
                                    cfg.getint('mapping', 'mosaik_hash_size'))
    sys_call(cmd)

@active_if(cfg.getboolean('mapping', 'run_mosaik'))
@transform(preprocessing.final_output, suffix(''), '.mosaik_reads_dat')
def run_mosaik_build_reads(in_fastq, out_dat):
    'convert reads to mosaik binary'
    # (reconstructed; the original is truncated here) assumed to mirror the
    # reference build above, using MosaikBuild's fastq input flag
    cmd = 'MosaikBuild -q %s -out %s' % (in_fastq, out_dat)
    sys_call(cmd)

import hts_waterworks.utils.sequence_motif as sequence_motif
import hts_waterworks.utils.sampling as sampling
import hts_waterworks.utils.motif_significance as motif_significance
from hts_waterworks.bootstrap import cfg, get_genome, genome_path
import hts_waterworks.call_peaks as call_peaks
import hts_waterworks.annotation as annotation
#from ipdb import set_trace as breakpoint

# motif setup

@transform(call_peaks.all_peak_caller_functions +
           ['*.peaks_summits.%s_around' % cfg.get('peaks',
                                                  'peak_summit_size')],
           regex(r'(.*\.peaks$|.*\..*_around$|_genes.promoter.*_ext[\d]+$)'),
           r'\1.top%s.peaks' % cfg.getint('motifs', 'motif_chunk_size'),
           cfg.getint('motifs', 'motif_chunk_size'))
def get_top_peaks(in_peaks, out_subset, num_peaks_to_keep):
    """keep only the top peaks as input to motif discovery"""
    with open(in_peaks) as infile:
        seqs = list(readBedLines(infile, dataOnly=False))
        # sort by score, highest first
        seqs.sort(key=lambda x: int(x[4]), reverse=True)
        with open(out_subset, 'w') as outfile:
            subset = seqs[:num_peaks_to_keep]
            outfile.writelines('\t'.join(map(str, s)) + '\n' for s in subset)

#@follows(get_genome)
@transform([get_top_peaks], suffix(''), '.fasta')
def get_peak_sequence(in_peaks, out_fasta):
    """Get fasta file for peak summits"""

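# get_top_peaks sketch (hypothetical rows): given BED rows scored 900, 250,
# and 700 with num_peaks_to_keep=2, the rows scored 900 and 700 are written,
# in that order, since rows sort descending on the score column (x[4]).
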
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count
    in treatment and control is the same.  This may generate many downsampled
    datasets.
    """
    # WARNING: this is a circular dependency.  It has to be included at
    # runtime.  A top-level import would cause this module to load only
    # half-way.  We import here because we need to call this function
    # directly, and not just when using ruffus.
    from hts_waterworks.visualize import bed_uniquefy
    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
    else:
        in_treat, in_control = in_files
        out_treat_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_treat)
        out_control_template = re.sub(r'mapped_reads$',
                                      'matched_size_%s.mapped_reads',
                                      in_control)
        if out_treat_template == in_treat:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_treat, out_treat_template))
        if out_control_template == in_control:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_control, out_control_template))
        tmp_t_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_t_unique = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_unique = tempfile.NamedTemporaryFile(delete=False).name
        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)
        # uniquefy the reads
        bed_uniquefy(tmp_t_sorted, tmp_t_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        bed_uniquefy(tmp_c_sorted, tmp_c_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        total_treat = sum(1 for l in open(tmp_t_unique))
        total_control = sum(1 for l in open(tmp_c_unique))
        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
        else:
            # downsample num_down_samples times
            for i in xrange(cfg.getint('peaks', 'num_down_samples')):
                out_treat = out_treat_template % i
                out_control = out_control_template % i
                if total_treat > total_control:
                    # reduce the number of treatment reads
                    inds_to_keep = set(random.sample(xrange(total_treat),
                                                     total_control))
                    in_orig, out_orig = tmp_c_unique, out_control
                    in_subset, out_subset = tmp_t_unique, out_treat
                else:
                    # reduce the number of control reads
                    inds_to_keep = set(random.sample(xrange(total_control),
                                                     total_treat))
                    in_orig, out_orig = tmp_t_unique, out_treat
                    in_subset, out_subset = tmp_c_unique, out_control
                sys_call('cp %s %s' % (in_orig, out_orig))
                # subset the tags
                with open(in_subset) as infile:
                    with open(out_subset, 'w') as outfile:
                        outfile.writelines(
                            line for line_num, line in enumerate(infile)
                            if line_num in inds_to_keep)
        for f in [tmp_t_sorted, tmp_t_unique, tmp_c_sorted, tmp_c_unique]:
            os.unlink(f)

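# Downsampling sketch: random.sample(xrange(n_larger), n_smaller) picks a
# uniform subset of line indices without replacement.  With 10,000 unique
# treatment tags vs 8,000 control tags, each iteration copies the control
# file unchanged and writes a treatment file of exactly 8,000 random lines,
# so every downsampled pair has matched tag counts.
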
                    # (tail of a truncated AREM peak-conversion loop)
                    outfile.write('\t'.join([fields[0], start, stop,
                                             'AREM_peak_%s' % (index + 1),
                                             score]) + '\t+\n')

@active_if(cfg.getboolean('peaks', 'run_glitr'))
@transform(mapping.all_mappers_output, suffix('.mapped_reads'),
           '.mapped_reads_glitr')
def bed_to_glitr(in_bed, out_starts):
    """Convert reads to (chrom, start, strand) for GLITR"""
    with open(in_bed) as infile:
        with open(out_starts, 'w') as outfile:
            for chrom, start, stop, strand in readBedLines(infile):
                outfile.write('\t'.join([chrom, str(start), strand]) + '\n')

@active_if(cfg.getboolean('peaks', 'run_glitr'))
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@collate(bed_to_glitr,
         regex(r'(.+)\.(treat|control)\.(.+)\.mapped_reads_glitr$'),
         r'\1.treat.\3.glitr.ranges')
def run_glitr(in_files, out_peaks):
    """Call peaks with GLITR"""
    in_treat = filter(lambda f: '.treat.' in f, in_files)[0]
    in_control = filter(lambda f: '.control.' in f, in_files)[0]
    glitr_dir = in_treat + '.GLITR_out'
    cmd = ('rm -r %s; mkdir %s; cd %s; GLITR.pl --CHIP=../%s '
           '--CONTROL=../%s --GENOME=%s %s ') % (
               glitr_dir, glitr_dir, glitr_dir, in_treat, in_control,
               cfg.get('DEFAULT', 'genome').upper(),
               cfg.get('peaks', 'glitr_params'))
    sys_call(cmd)
    sys_call('cp %s/allChIP.FDR_*PercentFDR %s' % (glitr_dir, out_peaks))

from hts_waterworks.annotation import get_refseq_genes
from hts_waterworks.bootstrap import cfg
from hts_waterworks.utils.pas_seq_expression import (group_reads_by_gene,
                                                     group_adjacent_reads)
from hts_waterworks.utils.common import breakpoint

@active_if(False)
@files(None, '%s.polyA_DB' % cfg.get('DEFAULT', 'genome'),
       cfg.get('DEFAULT', 'genome'))
def get_polyA_DB(_, out_db, genome_build):
    cmd = (r"curl 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/"
           r"polyaDb.txt.gz' | gunzip - | cut -d $'\t' -f 2- > %s")
    cmd = cmd % (genome_build, out_db)
    sys_call(cmd, file_log=False)

@active_if(cfg.getint('PAS-Seq', 'min_read_count') > 0)
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(mapping.all_mappers_output, suffix('.mapped_reads'),
           '.overlap.mapped_reads', cfg.getint('PAS-Seq', 'min_read_count'))
def remove_nonoverlapping_reads(in_bed, out_bed, min_read_count):
    """Remove mapped reads that don't overlap with at least *min_read_count*
    other reads
    """
    # intersectBed -c counts each read's overlap with itself, so require
    # min_read_count + 1 total overlaps
    cmd = ("intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |"
           r"cut -f 1,2,3,4,5,6 > %s") % (in_bed, in_bed,
                                          min_read_count + 1, out_bed)
    sys_call(cmd, file_log=False)

@active_if(cfg.getboolean('PAS-Seq', 'merge_adjacent_reads'))
#@split(mapping.all_mappers_output, regex('(.*).mapped_reads$'),