Exemple #1
0
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and create one bedgraph per strand.

    in_files -- (in_bed, in_chrom_sizes) pair
    out_bedgraphs -- output paths; index 0 receives the "+" strand reads and
                     index 1 receives the "-" strand reads
    """
    in_bed, in_chrom_sizes = in_files
    # number of bases handed to slopBed's -r option (fragment minus tag length)
    extension = cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size")
    # the awk stage keeps only the reads whose 6th column equals the strand
    template = (
        """slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """
        + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    )
    for index, strand in enumerate(("+", "-")):
        cmd = template % (
            in_bed,
            extension,
            in_chrom_sizes,
            strand,
            cfg.get("DEFAULT", "genome"),
            genome_path(),
            # the original referenced an undefined name here (out_bedgraph)
            out_bedgraphs[index],
        )
        sys_call(cmd)
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and create a bedgraph from them.

    in_files is an (in_bed, in_chrom_sizes) pair; the bedgraph is written to
    out_bedgraph by a slopBed | bedItemOverlapCount shell pipeline.
    """
    in_bed, in_chrom_sizes = in_files
    template = ('slopBed -i %s -s -r %s -l 0 -g %s | ' +
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    # reads are extended by the part of the fragment the tag does not cover
    extension = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))
    genome_name = cfg.get('DEFAULT', 'genome')
    chrom_sizes_prefix = genome_path()
    sys_call(template % (in_bed, extension, in_chrom_sizes, genome_name,
                         chrom_sizes_prefix, out_bedgraph))
Exemple #3
0
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and create a bedgraph from them.

    in_files is an (in_bed, in_chrom_sizes) pair; out_bedgraph is the path the
    shell pipeline redirects its output to.
    """
    in_bed, in_chrom_sizes = in_files
    # how many bases slopBed is asked to add through its -r option
    extra_bases = cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size")
    arguments = (
        in_bed,
        extra_bases,
        in_chrom_sizes,
        cfg.get("DEFAULT", "genome"),
        genome_path(),
        out_bedgraph,
    )
    pipeline = "slopBed -i %s -s -r %s -l 0 -g %s | " + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    sys_call(pipeline % arguments)
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks.

    in_peaks -- bed file of peaks; its name is also used to derive the path
                of the matching control reads file
    out_files -- the first two entries are used: the fasta output for the
                 sampled sequences and a bed-like file with their locations

    Raises RuntimeError if in_peaks contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # read the widths inside a context manager so the peaks file gets closed
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand
                                   in readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    sample_size = cfg.getint('motifs', 'motif_significance_sample_size')
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(wb_genome, peak_lengths, control_file,
                                        sampleSize=sample_size)
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' % (index, repr(seq), str(seq)))
                    strand = '+' if seq.orientation == 1 else '-'
                    outlocations.write('\t'.join([seq.id, str(seq.start),
                                                  str(seq.stop), str(index),
                                                  '0', strand]) + '\n')
Exemple #5
0
def sample_control_like_peaks(in_peaks, out_files):
    """Sample from the control IgG, with similar widths as the peaks.

    in_peaks -- bed file of peaks; its name is also used to derive the path
                of the matching control reads file
    out_files -- the first two entries are used: the fasta output for the
                 sampled sequences and a bed-like file with their locations

    Raises RuntimeError if in_peaks contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # the peaks file is opened in a with-block so its handle is not leaked
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start
                  for chrom, start, stop, strand in readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    # do the dance to map peaks back to their control raw reads
    control_bed = re.sub(r'treat', 'control', in_peaks)
    control_bed = re.sub(r'\.top[\d]+\.peaks$', '', control_bed)
    control_bed = re.sub(r'_summits\.[\d]+_around', '', control_bed)
    control_bed = re.sub(r'peaks', 'mapped_reads', control_bed)
    control_bed = re.sub(r'\.(macs(14)*|arem|glitr)', '', control_bed)
    sample_size = cfg.getint('motifs', 'motif_significance_sample_size')
    with open(control_bed) as control_file:
        with open(out_locations, 'w') as outlocations:
            s = sampling.sample_middles(wb_genome,
                                        peak_lengths,
                                        control_file,
                                        sampleSize=sample_size)
            with open(out_sample, 'w') as outfile:
                for index, seq in enumerate(s):
                    # repr() gives location, str() gives sequence
                    outfile.write('>%s_%s\n%s\n' %
                                  (index, repr(seq), str(seq)))
                    strand = '+' if seq.orientation == 1 else '-'
                    outlocations.write('\t'.join([
                        seq.id,
                        str(seq.start),
                        str(seq.stop),
                        str(index), '0', strand
                    ]) + '\n')
Exemple #6
0
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks.

    in_peaks -- bed file of peaks whose widths drive the sampling
    out_files -- the first two entries are used: the fasta output for the
                 sampled sequences and a bed-like file with their locations

    Raises RuntimeError if in_peaks contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # the peaks file is opened in a with-block so its handle is not leaked
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start
                  for chrom, start, stop, strand in readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)

    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(
        wb_genome,
        peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_',
        weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                # the fasta header is just the running index of the sample
                outfile.write('>%s\n%s\n' % (index, line))
                strand = '+' if line.orientation == 1 else '-'
                outlocations.write('\t'.join([
                    line.id,
                    str(line.start),
                    str(line.stop),
                    str(index), '0', strand
                ]) + '\n')
Exemple #7
0
def run_mosaik_align(in_files, out_align):
    """Align reads to the reference using MosaikAligner.

    in_files must hold five entries: the reads archive, the reference archive,
    one jump database file name and two further entries that are ignored.
    """
    # MosaikAligner -in sequence_archives/c_elegans_chr2_test.dat -out sequence_archives/c_elegans_chr2_test_aligned.dat -ia reference/c.elegans_chr2.dat -hs 14 -act 17 -mm 2 -m unique
    in_reads, in_genome_dat, in_genome_jump, _, _ = in_files
    # -j is given the jump database name without the '_keys.jmp' ending
    jump_base = in_genome_jump.replace('_keys.jmp', '')
    hash_size = cfg.getint('mapping', 'mosaik_hash_size')
    extra_params = cfg.get('mapping', 'mosaik_params')
    template = 'MosaikAligner -in %s -ia %s -j %s -out %s -hs %s  %s'
    sys_call(template % (in_reads, in_genome_dat, jump_base, out_align,
                         hash_size, extra_params))
Exemple #8
0
def run_mosaik_align(in_files, out_align):
    """Align reads to the reference using MosaikAligner.

    in_files must hold five entries: the reads archive, the reference archive,
    one jump database file name and two further entries that are ignored.
    """
    # MosaikAligner -in sequence_archives/c_elegans_chr2_test.dat -out sequence_archives/c_elegans_chr2_test_aligned.dat -ia reference/c.elegans_chr2.dat -hs 14 -act 17 -mm 2 -m unique
    in_reads, in_genome_dat, in_genome_jump, _, _ = in_files
    # strip the '_keys.jmp' ending to get the name passed to -j
    in_genome_jump = in_genome_jump.replace('_keys.jmp', '')
    arguments = (
        in_reads,
        in_genome_dat,
        in_genome_jump,
        out_align,
        cfg.getint('mapping', 'mosaik_hash_size'),
        cfg.get('mapping', 'mosaik_params'),
    )
    sys_call('MosaikAligner -in %s -ia %s -j %s -out %s -hs %s  %s' % arguments)
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and create one bedgraph per strand.

    in_files -- (in_bed, in_chrom_sizes) pair
    out_bedgraphs -- output paths; index 0 receives the '+' strand reads and
                     index 1 receives the '-' strand reads
    """
    in_bed, in_chrom_sizes = in_files
    # number of bases handed to slopBed's -r option (fragment minus tag length)
    extension = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))
    # the awk stage keeps only the reads whose 6th column equals the strand
    template = ("""slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """ +
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    for index, strand in enumerate(('+', '-')):
        # the original referenced an undefined name (out_bedgraph) for the output
        cmd = template % (in_bed, extension, in_chrom_sizes, strand,
                          cfg.get('DEFAULT', 'genome'), genome_path(),
                          out_bedgraphs[index])
        sys_call(cmd)
def run_macs(in_files, out_peaks, max_fdr):
    """Call peak with MACS (v1.3).
    Apply a maximum FDR threshold and treat centers as peak summits

    in_files[0] is unpacked as (in_treat, in_control).  Two files are written:
    out_peaks, and out_peaks + '_summits.<N>_around' where <N> is the
    'peak_summit_size' setting of the 'peaks' config section.

    NOTE(review): the max_fdr argument is overwritten below with the 'max_FDR'
    setting of the 'peaks' config section, so the value passed by the caller is
    never used -- confirm this is intended.
    """
    in_treat, in_control = in_files[0]
    # derive the MACS run name from the treatment file name; if the name does
    # not match, re.search returns None and .groups() raises AttributeError
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                               cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
    
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            # index counts every line that passes bedCommentFilter, including
            # the header row that is skipped below
            for index, line in enumerate(itertools.ifilter(
                                        bedCommentFilter, infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                start = str(max(0, int(fields[1])))
                # column 7 is used as the score, clamped to the range 0..1000
                score = str(max(0, min(1000, int(float(fields[6])))))
                # column 9 is compared against the FDR threshold
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS_peak_%s' % (index + 1), score]) +
                                    '\t+\n')
    # take region surrounding the peak center as the summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        # the MACS table is read a second time for the summit regions
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                score = str(max(0, min(1000, int(float(fields[6])))))
                p_start, p_stop = max(0, int(fields[1])), int(fields[2])
                # NOTE(review): '/' on ints is integer division only under
                # Python 2; under Python 3 these coordinates would be floats
                p_center = p_start + (p_stop - p_start) / 2
                s_start = p_center - summit_size / 2
                s_stop = p_center + summit_size / 2
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], str(s_start),
                                    str(s_stop),
                                    'MACS_peak_%s' % (index + 1), score])
                                        + '\t+\n')
def motif_select_random_seqs(in_fasta, out_pattern):
    """Split a fasta file into several chunks so motif discovery is easier.

    Each chunk is a random sample of at most 'motif_chunk_size' sequences and
    is written to <name>.small_sample.<i>.fasta, where <name> is in_fasta
    without its '.fasta' ending.  out_pattern is not used in the body.

    Raises AttributeError if in_fasta does not match the '(.*).fasta' pattern.
    """
    name = re.search('(.*).fasta', in_fasta).groups()[0]
    # getint rather than get: get returns a string, which made the size
    # comparison meaningless and cannot be used as a loop count
    chunk_size = cfg.getint('motifs', 'motif_chunk_size')
    with open(in_fasta) as infile:
        seqs = list(parseFastaLines(infile))
    if len(seqs) <= chunk_size:
        num_chunks = 1
    else:
        num_chunks = cfg.getint('motifs', 'motif_num_chunks')
    sample_size = min(len(seqs), chunk_size)
    # get a random sample of peaks
    for i in xrange(num_chunks):
        with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
            subset = random.sample(seqs, sample_size)
            outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                               for s in subset)
Exemple #12
0
def motif_select_random_seqs(in_fasta, out_pattern):
    """Split a fasta file into several chunks so motif discovery is easier.

    Each chunk is a random sample of at most 'motif_chunk_size' sequences and
    is written to <name>.small_sample.<i>.fasta, where <name> is in_fasta
    without its '.fasta' ending.  out_pattern is not used in the body.

    Raises AttributeError if in_fasta does not match the '(.*).fasta' pattern.
    """
    name = re.search('(.*).fasta', in_fasta).groups()[0]
    # read both settings as integers; cfg.get would hand back strings, which
    # break the size comparison and the xrange() call
    chunk_size = cfg.getint('motifs', 'motif_chunk_size')
    with open(in_fasta) as infile:
        seqs = list(parseFastaLines(infile))
    if len(seqs) <= chunk_size:
        num_chunks = 1
    else:
        num_chunks = cfg.getint('motifs', 'motif_num_chunks')
    sample_size = min(len(seqs), chunk_size)
    # get a random sample of peaks
    for i in xrange(num_chunks):
        with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
            subset = random.sample(seqs, sample_size)
            outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                               for s in subset)
Exemple #13
0
def find_nearby_genes(in_files, out_genes):
    """Report which genes are within a certain distance of a peak.

    in_files[0] is unpacked as (in_peaks, in_genes).  closestBed is run on the
    pair and every output line whose last column (the distance) is at most
    the 'nearby_genes_max_dist' setting is copied to out_genes.
    """
    in_peaks, in_genes = in_files[0]
    max_dist = cfg.getint('genes', 'nearby_genes_max_dist')
    # only the name is needed; close the handle right away so it is not leaked
    tmp_handle = tempfile.NamedTemporaryFile(delete=False)
    tmp_output = tmp_handle.name
    tmp_handle.close()
    try:
        cmd = 'closestBed -a %s -b %s -t first -d > %s' % (in_peaks,
                                                           in_genes, tmp_output)
        sys_call(cmd)
        with open(tmp_output) as infile:
            with open(out_genes, 'w') as outfile:
                for line in infile:
                    # lines read from a file still carry their newline, so
                    # strip before testing for a blank line
                    if not line.strip():
                        continue
                    fields = line.strip().split('\t')
                    dist = int(fields[-1])
                    if abs(dist) <= max_dist:
                        outfile.write(line)
    finally:
        # remove the temporary file even when closestBed or parsing fails
        os.unlink(tmp_output)
Exemple #14
0
def find_nearby_genes(in_files, out_genes):
    """Report which genes are within a certain distance of a peak.

    in_files[0] is unpacked as (in_peaks, in_genes).  closestBed is run on the
    pair and every output line whose last column (the distance) is at most
    the 'nearby_genes_max_dist' setting is copied to out_genes.
    """
    in_peaks, in_genes = in_files[0]
    max_dist = cfg.getint('genes', 'nearby_genes_max_dist')
    # only the path is needed, so the handle is closed immediately
    tmp_handle = tempfile.NamedTemporaryFile(delete=False)
    tmp_output = tmp_handle.name
    tmp_handle.close()
    try:
        cmd = 'closestBed -a %s -b %s -t first -d > %s' % (in_peaks, in_genes,
                                                           tmp_output)
        sys_call(cmd)
        with open(tmp_output) as infile:
            with open(out_genes, 'w') as outfile:
                for line in infile:
                    # a blank line is '\n', not '', so test the stripped text
                    if not line.strip():
                        continue
                    fields = line.strip().split('\t')
                    dist = int(fields[-1])
                    if abs(dist) <= max_dist:
                        outfile.write(line)
    finally:
        # clean up the temporary file on both success and failure
        os.unlink(tmp_output)
def run_macs14(in_files, out_peaks, max_fdr):
    """Call peaks using MACS (v1.4). Apply a maximum FDR threshold.

    in_files[0] is unpacked as (in_treat, in_control).  Two files are written:
    out_peaks, and out_peaks + '_summits.<N>_around' where <N> is the
    'peak_summit_size' setting of the 'peaks' config section.
    """
    in_treat, in_control = in_files[0]
    # derive the MACS run name from the treatment file name; if the name does
    # not match, re.search returns None and .groups() raises AttributeError
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs14.peaks'
    cmd = 'macs14 -t %s -c %s --name=%s %s --diag' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    # enumerate() indices of the peaks that pass the FDR threshold
    peaks_to_keep = set()
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            # index counts every line that passes bedCommentFilter, including
            # the header row that is skipped below
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                start = str(max(0, int(fields[1])))
                # column 7 is used as the score, clamped to the range 0..1000
                score = str(max(0, min(1000, int(float(fields[6])))))
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS14_peak_%s' % (index + 1), score])
                                                + '\t+\n')
                    peaks_to_keep.add(index)
    # take region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(name + '_summits.bed') as infile:
            # NOTE(review): peaks are matched between the two files purely by
            # enumerate() index; this assumes a peak gets the same index in
            # '_peaks.xls' (where the header row consumes an index) and in
            # '_summits.bed' -- verify against real MACS output
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                # score is number of reads at summit
                score = str(max(0, min(1000, int(float(fields[-1])))))
                # NOTE(review): '/' on ints is integer division only under
                # Python 2; under Python 3 these coordinates would be floats
                start = str(max(0, int(fields[1]) - summit_size / 2))
                stop = str(int(fields[2]) + summit_size / 2)
                if index in peaks_to_keep:
                    # NOTE(review): the name prefix here is 'MACS_peak_' while
                    # the peaks file above uses 'MACS14_peak_'
                    outfile.write('\t'.join([fields[0], start, stop,
                                        'MACS_peak_%s' % (index + 1), score])
                                            + '\t+\n')
def run_macs14_no_control(in_treat, out_peaks):
    """Call peaks using MACS (v1.4) without control data

    out_peaks is used both as the MACS --name prefix and as the path of the
    converted bed file.  A second file, out_peaks + '_summits.<N>_around', is
    written where <N> is the 'peak_summit_size' setting of the 'peaks' config
    section.  No FDR threshold is applied in this function.
    """
    cmd = 'macs14 -t %s --name=%s %s' % (in_treat, out_peaks,
                                         cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    # enumerate() indices of every non-header row written to out_peaks
    peaks_to_keep = set()
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(out_peaks + '_peaks.xls') as infile:
            # index counts every line that passes bedCommentFilter, including
            # the header row that is skipped below
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                start = str(max(0, int(fields[1])))
                # column 7 is used as the score, clamped to the range 0..1000
                score = str(max(0, min(1000, int(float(fields[6])))))
                outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS14_peak_%s' % (index + 1), score])
                                                + '\t+\n')
                peaks_to_keep.add(index)
    # take region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(out_peaks + '_summits.bed') as infile:
            # NOTE(review): rows are matched to peaks_to_keep purely by
            # enumerate() index; this assumes the same peak gets the same
            # index in '_peaks.xls' and '_summits.bed' -- verify against real
            # MACS output
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                # score is number of reads at summit
                score = str(max(0, min(1000, int(float(fields[-1])))))
                # NOTE(review): '/' on ints is integer division only under
                # Python 2; under Python 3 these coordinates would be floats
                start = str(max(0, int(fields[1]) - summit_size / 2))
                stop = str(int(fields[2]) + summit_size / 2)
                if index in peaks_to_keep:
                    # NOTE(review): the name prefix here is 'MACS_peak_' while
                    # the peaks file above uses 'MACS14_peak_'
                    outfile.write('\t'.join([fields[0], start, stop,
                                        'MACS_peak_%s' % (index + 1), score])
                                            + '\t+\n')
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks.

    in_peaks -- bed file of peaks whose widths drive the sampling
    out_files -- the first two entries are used: the fasta output for the
                 sampled sequences and a bed-like file with their locations

    Raises RuntimeError if in_peaks contains no peaks.
    """
    out_sample, out_locations = out_files[:2]
    # read the widths inside a context manager so the peaks file gets closed
    with open(in_peaks) as peaks_file:
        peak_lengths = array('i', (stop - start
                                   for chrom, start, stop, strand
                                   in readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)

    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(wb_genome, peak_lengths,
                               sampleSize=cfg.getint('motifs',
                                        'motif_significance_sample_size'),
                               excludeRepeat=cfg.getboolean('motifs',
                                                'sampling_exclude_repeats'),
                               excludeN=cfg.getboolean('motifs',
                                                'sampling_exclude_N'),
                               ignoreCharacters='_', weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                # the fasta header is just the running index of the sample
                outfile.write('>%s\n%s\n' % (index, line))
                strand = '+' if line.orientation == 1 else '-'
                outlocations.write('\t'.join([line.id, str(line.start),
                                              str(line.stop), str(index), '0',
                                              strand]) + '\n')
@transform(call_peaks.all_peak_caller_functions + [pas_seq.remove_terminal_exon] + [clip_seq.search_genome_consensus] + mapping.all_mappers_output + mapping.all_mappers_raw_reads,
           suffix(''), '.clipped.sorted')
def clip_and_sort_peaks(in_bed, out_sorted):
    """Constrain bed regions to the chromosome sizes, then sort the result.

    bedClip writes into a temporary file, which is sorted by column 1 and then
    numerically by column 2 into out_sorted.
    """
    clip_template = 'bedClip %s %s.chrom.sizes %s'
    # (bedSort was the earlier choice for this step; plain sort is used now)
    sort_template = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s"
    with tempfile.NamedTemporaryFile() as tmp_clipped:
        clipped_path = tmp_clipped.name
        sys_call(clip_template % (in_bed, genome_path(), clipped_path))
        sys_call(sort_template % (clipped_path, out_sorted))

@active_if(cfg.getboolean('visualization', 'uniquefy_track'))
@transform([clip_and_sort_peaks] + mapping.all_mappers_output, suffix(''),
           '.unique',
           cfg.getint('visualization', 'uniquefy_track_max_reads'))
def bed_uniquefy(in_bed, out_bed, max_reads):
    'Given a sorted bed file, remove tags that are on the same start, strand'
    with open(in_bed) as infile:
        with open(out_bed, 'w') as outfile:
            prev_start, prev_chrom = None, None
            plus_seen, minus_seen = 0, 0
            for line in infile:
                fields = line.split('\t')
                chrom, start, stop = fields[:3]
                if prev_start is None or prev_start != start or \
                                                       prev_chrom != chrom:
                    prev_start, prev_chrom = start, chrom
                    plus_seen, minus_seen = 0, 0
                if len(fields) < 6 or fields[5] == '+':
                    if plus_seen <= max_reads:
    cmd2 = 'fastx_artifacts_filter -o %s -z' % (out_fastq)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    p2.communicate()
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
# When artifact filtering is switched on in the config, record filter_artifacts
# (with an empty suffix) in prev_output / prev_suffix; the @transform decorator
# of the next filtering stage reads these two names as its input and suffix.
if cfg.getboolean('filtering', 'filter_artifacts'):
    prev_output = filter_artifacts
    prev_suffix = ''


@active_if(cfg.getboolean('filtering', 'filter_quality'))
@transform(prev_output, suffix(prev_suffix), '.min_qual',
        cfg.getint('filtering', 'filter_min_quality'),
        cfg.getint('filtering', 'filter_percent_bases_at_min'))
def filter_min_quality(in_fastq, out_fastq, min_qual, min_percent):
    """Filter reads by base quality with fastq_quality_filter.

    in_fastq is piped through 'cat' into
    'fastq_quality_filter -o out_fastq -q min_qual -p min_percent'.
    Per the FASTX-Toolkit usage text, -q is the minimum quality score to keep
    and -p the minimum percent of bases that must have that quality.

    Raises CalledProcessError if either stage exits with a non-zero status.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastq_quality_filter -o %s -q %s -p %s' % (out_fastq,
                                                        min_qual, min_percent)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # drop our copy of the pipe so cat is not left blocked on a full pipe
    # if the filter exits early
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the process has been waited on, so the
    # first stage must be reaped before its status can be checked
    p1.wait()
    # report the filter first: when it dies early, cat fails as a consequence
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
# When quality filtering is switched on in the config, filter_min_quality
# becomes the value of prev_output.
if cfg.getboolean('filtering', 'filter_quality'):
    prev_output = filter_min_quality
Exemple #20
0
                p_start += len(region)
        gene_db[seq_id] = sequence
    msa.build(saveSeqDict=True)
    outfile.close()
    pickle.dump(gene_db, open(out_db, 'w'))


@active_if(cfg.getboolean('mapping', 'run_mosaik'))
@transform(reference_genomes, suffix(''), '.mosaik_dat')
def run_mosaik_build_reference(in_genome, out_bin):
    """Convert the reference to a mosaik binary with MosaikBuild."""
    build_template = 'MosaikBuild -fr %s -oa %s'
    sys_call(build_template % (in_genome, out_bin))


# common prefix of the jump database files: the matched reference name
# followed by '.mosaik_jump_<hash size>'
mosaik_suffix_base = r'\1.mosaik_jump_%s' % cfg.getint('mapping', 'mosaik_hash_size')
# the regex is a raw string so that '\.' is not treated as a string escape
@split(run_mosaik_build_reference, regex(r'(.*)\.mosaik_dat'),
       [mosaik_suffix_base + '_keys.jmp',
        mosaik_suffix_base + '_meta.jmp',
        mosaik_suffix_base + '_positions.jmp'], mosaik_suffix_base)
def run_mosiak_jump_reference(in_dat, _, out_jump_base):
    """Create the mosaik jump db on the reference.

    in_dat -- reference archive passed to MosaikJump via -ia
    _ -- the list of output files declared in the decorator (unused here)
    out_jump_base -- prefix passed to MosaikJump via -out
    """
    cmd = 'MosaikJump -ia %s -out %s -hs %s' % (in_dat, out_jump_base,
                                    cfg.getint('mapping', 'mosaik_hash_size'))
    sys_call(cmd)


@active_if(cfg.getboolean('mapping', 'run_mosaik'))
@transform(preprocessing.final_output, suffix(''), '.mosaik_reads_dat')
def run_mosaik_build_reads(in_fastq, out_dat):
    'convert reads to mosaik binary'
Exemple #21
0
def run_mosiak_jump_reference(in_dat, _, out_jump_base):
    """Create the mosaik jump db on the reference.

    in_dat is passed to MosaikJump via -ia and out_jump_base via -out; the
    second parameter is unused.
    """
    hash_size = cfg.getint('mapping', 'mosaik_hash_size')
    jump_template = 'MosaikJump -ia %s -out %s -hs %s'
    sys_call(jump_template % (in_dat, out_jump_base, hash_size))
import hts_waterworks.utils.sequence_motif as sequence_motif
import hts_waterworks.utils.sampling as sampling
import hts_waterworks.utils.motif_significance as motif_significance
from hts_waterworks.bootstrap import cfg, get_genome, genome_path
import hts_waterworks.call_peaks as call_peaks
import hts_waterworks.annotation as annotation


#from ipdb import set_trace as breakpoint

# motif setup

@transform(call_peaks.all_peak_caller_functions +
          ['*.peaks_summits.%s_around' % cfg.get('peaks', 'peak_summit_size')],
        regex(r'(.*\.peaks$|.*\..*_around$|_genes.promoter.*_ext[\d]+$)'),
        r'\1.top%s.peaks' % cfg.getint('motifs', 'motif_chunk_size'),
        cfg.getint('motifs', 'motif_chunk_size'))
def get_top_peaks(in_peaks, out_subset, num_peaks_to_keep):
    """Keep only the top peaks as input to motif discovery.

    Rows of in_peaks are ordered by the integer value of their fifth field,
    highest first, and the first num_peaks_to_keep rows are written
    tab-separated to out_subset.
    """
    with open(in_peaks) as infile:
        # sort by score, highest first
        ranked = sorted(readBedLines(infile, dataOnly=False),
                        key=lambda fields: int(fields[4]), reverse=True)
    best = ranked[:num_peaks_to_keep]
    with open(out_subset, 'w') as outfile:
        for fields in best:
            outfile.write('\t'.join(map(str, fields)) + '\n')

#@follows(get_genome)
@transform([get_top_peaks], suffix(''), '.fasta')
def get_peak_sequence(in_peaks, out_fasta):
    """Get fasta file for peak summits
Exemple #23
0
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count in
    treatment and control is the same.  This may generate many downsampled datasets.

    in_files is unpacked as (in_treat, in_control).  Output names are derived
    from the input names by replacing a trailing 'mapped_reads' with
    'matched_size_<i>.mapped_reads'; out_files is not referenced in the body.
    Nothing is written when the 'downsample_reads' setting is off or when
    both files already hold the same number of unique tags.

    Raises RuntimeError if an input name does not end in 'mapped_reads'.
    """
    # WARNING: this is a circular dependency.  It has to be included at runtime
    #    Top-level import will cause this module to load only 1/2 way
    #    we import here because we need to call this function directly,
    #    and not just when using ruffus
    from hts_waterworks.visualize import bed_uniquefy
    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
    else:
        in_treat, in_control = in_files
        # templates keep a '%s' placeholder that is filled with the
        # downsample round number further down
        out_treat_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_treat)
        out_control_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_control)
        # an unchanged string means the pattern did not match
        if out_treat_template == in_treat:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                                in_treat, out_treat_template))
        if out_control_template == in_control:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_control, out_control_template))
        # delete=False: the files must outlive the NamedTemporaryFile objects;
        # they are removed explicitly at the end of this branch
        tmp_t_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_t_unique = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_unique = tempfile.NamedTemporaryFile(delete=False).name
        
        # sort the reads
        # NOTE(review): bed_clip_and_sort is not defined in the visible part
        # of this file -- presumably provided elsewhere; confirm
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)
        
        # uniquefy the reads
        bed_uniquefy(tmp_t_sorted, tmp_t_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        bed_uniquefy(tmp_c_sorted, tmp_c_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        
        # count the lines (one tag per line) in each uniquefied file
        total_treat = sum(1 for l in open(tmp_t_unique))
        total_control = sum(1 for l in open(tmp_c_unique))
        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
        else:
            # downsample num_down_sample times
            for i in xrange(cfg.getint('peaks', 'num_down_samples')):
                out_treat = out_treat_template % i
                out_control = out_control_template % i
                # the larger file is subsampled to the size of the smaller
                # one; the smaller file is copied unchanged
                if total_treat > total_control:
                    # reduce number of treatment reads
                    inds_to_keep = set(random.sample(xrange(total_treat),
                                                                total_control))
                    in_orig, out_orig = tmp_c_unique, out_control
                    in_subset, out_subset = tmp_t_unique, out_treat
                else:
                    # reduce number of control reads
                    inds_to_keep = set(random.sample(xrange(total_control),
                                                     total_treat))
                    in_orig, out_orig = tmp_t_unique, out_treat
                    in_subset, out_subset = tmp_c_unique, out_control
                sys_call('cp %s %s' % (in_orig, out_orig))
                # subset the tags
                with open(in_subset) as infile:
                    with open(out_subset, 'w') as outfile:
                        # keep only the lines whose number was sampled; note
                        # that this generator reuses the name i of the outer loop
                        outfile.writelines(line for i, line in enumerate(infile) 
                                                        if i in inds_to_keep)
        # cleanup runs only in the downsampling branch and only if no
        # exception was raised above
        for f in [tmp_t_sorted, tmp_t_unique, tmp_c_sorted, tmp_c_unique]:
            os.unlink(f)
                    outfile.write('\t'.join([fields[0], start, stop,
                                        'AREM_peak_%s' % (index + 1), score])
                                                + '\t+\n')

@active_if(cfg.getboolean('peaks', 'run_glitr'))
@transform(mapping.all_mappers_output,
    suffix('.mapped_reads'), '.mapped_reads_glitr')
def bed_to_glitr(in_bed, out_starts):
    """Convert reads to tab-separated (chrom, start, strand) lines for GLITR.

    The stop coordinate of each read is dropped.
    """
    with open(in_bed) as infile:
        with open(out_starts, 'w') as outfile:
            outfile.writelines(
                '\t'.join([chrom, str(start), strand]) + '\n'
                for chrom, start, stop, strand in readBedLines(infile))

@active_if(cfg.getboolean('peaks', 'run_glitr'))
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@collate(bed_to_glitr,
         regex(r'(.+)\.(treat|control)\.(.+)\.mapped_reads_glitr$'),
         r'\1.treat.\3.glitr.ranges')
def run_glitr(in_files, out_peaks):
    """Call peaks with GLITR.

    in_files -- collated file names; the first one containing '.treat.' and
                the first one containing '.control.' are used
    out_peaks -- destination the GLITR result files are copied to

    GLITR runs inside a freshly created '<in_treat>.GLITR_out' directory.
    Raises IndexError if no treatment or no control file is among in_files.
    """
    # list comprehensions instead of filter(...)[0], which only works where
    # filter returns a list
    in_treat = [f for f in in_files if '.treat.' in f][0]
    in_control = [f for f in in_files if '.control.' in f][0]
    glitr_dir = in_treat + '.GLITR_out'
    # the inputs are referenced with '../' because the command first changes
    # into the output directory
    cmd = ('rm -r %s; mkdir %s; cd %s; GLITR.pl --CHIP=../%s ' + \
            '--CONTROL=../%s --GENOME=%s %s ') % (
                glitr_dir, glitr_dir, glitr_dir, in_treat, in_control,
                cfg.get('DEFAULT', 'genome').upper(),
                cfg.get('peaks', 'glitr_params'))
    sys_call(cmd)
    sys_call('cp %s/allChIP.FDR_*PercentFDR %s' % (glitr_dir, out_peaks))
Exemple #25
0
def clip_and_sort_peaks(in_bed, out_sorted):
    """Clip *in_bed* with bedClip, then sort the result into *out_sorted*.

    bedClip is given '<genome_path()>.chrom.sizes' and writes to a temporary
    file that is removed when the function returns.  The clipped file is
    then sorted by column 1, then numerically by column 2, with `sort`.
    """
    with tempfile.NamedTemporaryFile() as clipped:
        commands = (
            "bedClip %s %s.chrom.sizes %s" % (in_bed, genome_path(),
                                             clipped.name),
            r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % (clipped.name,
                                                             out_sorted),
        )
        for command in commands:
            sys_call(command)


@active_if(cfg.getboolean("visualization", "uniquefy_track"))
@transform(
    [clip_and_sort_peaks] + mapping.all_mappers_output,
    suffix(""),
    ".unique",
    cfg.getint("visualization", "uniquefy_track_max_reads"),
)
def bed_uniquefy(in_bed, out_bed, max_reads):
    "Given a sorted bed file, remove tags that are on the same start, strand"
    with open(in_bed) as infile:
        with open(out_bed, "w") as outfile:
            prev_start, prev_chrom = None, None
            plus_seen, minus_seen = 0, 0
            for line in infile:
                fields = line.split("\t")
                chrom, start, stop = fields[:3]
                if prev_start is None or prev_start != start or prev_chrom != chrom:
                    prev_start, prev_chrom = start, chrom
                    plus_seen, minus_seen = 0, 0
                if len(fields) < 6 or fields[5] == "+":
                    if plus_seen <= max_reads:
Exemple #26
0
def _count_lines(path):
    """Return the number of lines in the file at *path*."""
    with open(path) as handle:
        return sum(1 for _ in handle)


def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count in
    treatment and control is the same.  This may generate many downsampled datasets.

    in_files:  (treatment, control) pair of mapped-reads file names; both
               must end in 'mapped_reads'
    out_files: not used by the body; outputs are named by replacing the
               trailing 'mapped_reads' of each input with
               'matched_size_<i>.mapped_reads' for each down-sample index i

    Nothing is written when down-sampling is disabled in the config or when
    both unique tag counts are already identical.

    Raises RuntimeError if an input name does not end in 'mapped_reads'.
    """
    # WARNING: this is a circular dependency.  It has to be included at runtime
    #    Top-level import will cause this module to load only 1/2 way
    #    we import here because we need to call this function directly,
    #    and not just when using ruffus
    from hts_waterworks.visualize import bed_uniquefy
    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
        return

    in_treat, in_control = in_files
    out_treat_template = re.sub(r'mapped_reads$',
                                'matched_size_%s.mapped_reads', in_treat)
    out_control_template = re.sub(r'mapped_reads$',
                                  'matched_size_%s.mapped_reads', in_control)
    if out_treat_template == in_treat:
        raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_treat, out_treat_template))
    if out_control_template == in_control:
        raise RuntimeError('regex substitution failed from %s to %s' % (
                                        in_control, out_control_template))

    tmp_files = []
    try:
        # Reserve four scratch paths.  The handles are closed right away:
        # only the names are needed, the files are written by other steps.
        for _ in range(4):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            tmp_files.append(tmp.name)
        tmp_t_sorted, tmp_c_sorted, tmp_t_unique, tmp_c_unique = tmp_files

        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)

        # uniquefy the reads
        max_reads = cfg.getint('visualization', 'uniquefy_track_max_reads')
        bed_uniquefy(tmp_t_sorted, tmp_t_unique, max_reads)
        bed_uniquefy(tmp_c_sorted, tmp_c_unique, max_reads)

        total_treat = _count_lines(tmp_t_unique)
        total_control = _count_lines(tmp_c_unique)
        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
            return

        # downsample num_down_sample times
        for i in xrange(cfg.getint('peaks', 'num_down_samples')):
            out_treat = out_treat_template % i
            out_control = out_control_template % i
            if total_treat > total_control:
                # reduce number of treatment reads
                inds_to_keep = set(random.sample(xrange(total_treat),
                                                 total_control))
                in_orig, out_orig = tmp_c_unique, out_control
                in_subset, out_subset = tmp_t_unique, out_treat
            else:
                # reduce number of control reads
                inds_to_keep = set(random.sample(xrange(total_control),
                                                 total_treat))
                in_orig, out_orig = tmp_t_unique, out_treat
                in_subset, out_subset = tmp_c_unique, out_control
            # the smaller dataset is kept whole
            sys_call('cp %s %s' % (in_orig, out_orig))
            # subset the tags of the larger dataset by line index
            with open(in_subset) as infile:
                with open(out_subset, 'w') as outfile:
                    outfile.writelines(
                        line for line_no, line in enumerate(infile)
                        if line_no in inds_to_keep)
    finally:
        # always remove the scratch files, even when a step above failed
        for f in tmp_files:
            if os.path.exists(f):
                os.unlink(f)
Exemple #27
0
def run_mosiak_jump_reference(in_dat, _, out_jump_base):
    """Run MosaikJump on the reference archive *in_dat*.

    in_dat:        passed to MosaikJump as -ia
    _:             ignored by the body
    out_jump_base: passed to MosaikJump as -out

    The hash size (-hs) is read from the 'mapping' config section.
    """
    hash_size = cfg.getint('mapping', 'mosaik_hash_size')
    jump_cmd = 'MosaikJump -ia %s -out %s -hs %s' % (in_dat, out_jump_base,
                                                     hash_size)
    sys_call(jump_cmd)
Exemple #28
0
import hts_waterworks.utils.sampling as sampling
import hts_waterworks.utils.motif_significance as motif_significance
from hts_waterworks.bootstrap import cfg, get_genome, genome_path
import hts_waterworks.call_peaks as call_peaks
import hts_waterworks.annotation as annotation

#from ipdb import set_trace as breakpoint

# motif setup


@transform(
    call_peaks.all_peak_caller_functions +
    ['*.peaks_summits.%s_around' % cfg.get('peaks', 'peak_summit_size')],
    regex(r'(.*\.peaks$|.*\..*_around$|_genes.promoter.*_ext[\d]+$)'),
    r'\1.top%s.peaks' % cfg.getint('motifs', 'motif_chunk_size'),
    cfg.getint('motifs', 'motif_chunk_size'))
def get_top_peaks(in_peaks, out_subset, num_peaks_to_keep):
    """Write the highest-scoring peaks of *in_peaks* to *out_subset*.

    in_peaks:          peak file, parsed with readBedLines(dataOnly=False)
    out_subset:        output path; one tab-joined line per kept record
    num_peaks_to_keep: maximum number of records written

    Records are ranked by the integer value of their fifth field, highest
    first; records with equal scores keep their input order.
    """
    with open(in_peaks) as infile:
        ranked = sorted(readBedLines(infile, dataOnly=False),
                        key=lambda record: int(record[4]),
                        reverse=True)
    with open(out_subset, 'w') as outfile:
        for record in ranked[:num_peaks_to_keep]:
            outfile.write('\t'.join(map(str, record)) + '\n')


#@follows(get_genome)
@transform([get_top_peaks], suffix(''), '.fasta')
def get_peak_sequence(in_peaks, out_fasta):
Exemple #29
0
from hts_waterworks.annotation import get_refseq_genes
from hts_waterworks.bootstrap import cfg
from hts_waterworks.utils.pas_seq_expression import (group_reads_by_gene,
                                                     group_adjacent_reads)
from hts_waterworks.utils.common import breakpoint


@active_if(False)
@files(None, '%s.polyA_DB' % cfg.get('DEFAULT', 'genome'), cfg.get('DEFAULT', 'genome'))
def get_polyA_DB(_, out_db, genome_build):
    """Download the UCSC polyaDb table for *genome_build* into *out_db*.

    The gzipped table is fetched with curl, decompressed, and stripped of
    its first tab-separated column (cut -f 2-) before being written.
    The task is currently switched off via @active_if(False).
    """
    download_template = r"curl 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/polyaDb.txt.gz' | gunzip - | cut -d $'\t' -f 2- > %s"
    sys_call(download_template % (genome_build, out_db), file_log=False)


@active_if(cfg.getint('PAS-Seq', 'min_read_count') > 0)
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(mapping.all_mappers_output, suffix('.mapped_reads'),
           '.overlap.mapped_reads', cfg.getint('PAS-Seq', 'min_read_count'))
def remove_nonoverlapping_reads(in_bed, out_bed, min_read_count):
    """
    Keep only the reads of *in_bed* that overlap enough other reads.

    in_bed is intersected with itself (intersectBed -c appends an overlap
    count as the last column); lines whose count is at least
    *min_read_count* + 1 are kept and trimmed to their first six columns.
    NOTE(review): the + 1 presumably discounts each read's overlap with
    itself -- confirm against the intersectBed documentation.
    """
    overlap_threshold = min_read_count + 1
    pipeline = ("intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |"
                r"cut -f 1,2,3,4,5,6 > %s")
    sys_call(pipeline % (in_bed, in_bed, overlap_threshold, out_bed),
             file_log=False)


@active_if(cfg.getboolean('PAS-Seq', 'merge_adjacent_reads'))
#@split(mapping.all_mappers_output, regex('(.*).mapped_reads$'),
Exemple #30
0
                p_start += len(region)
        gene_db[seq_id] = sequence
    msa.build(saveSeqDict=True)
    outfile.close()
    pickle.dump(gene_db, open(out_db, 'w'))


@active_if(cfg.getboolean('mapping', 'run_mosaik'))
@transform(reference_genomes, suffix(''), '.mosaik_dat')
def run_mosaik_build_reference(in_genome, out_bin):
    """Run MosaikBuild on a reference genome.

    in_genome: passed to MosaikBuild as -fr
    out_bin:   passed to MosaikBuild as -oa
    """
    build_cmd = 'MosaikBuild -fr %s -oa %s' % (in_genome, out_bin)
    sys_call(build_cmd)


# Output name stem shared by the three jump-database files; the configured
# hash size is embedded in the name.
mosaik_suffix_base = r'\1.mosaik_jump_%s' % cfg.getint('mapping', 'mosaik_hash_size')
@split(run_mosaik_build_reference, regex(r'(.*)\.mosaik_dat'),
       [mosaik_suffix_base + '_keys.jmp',
        mosaik_suffix_base + '_meta.jmp',
        mosaik_suffix_base + '_positions.jmp'], mosaik_suffix_base)
def run_mosiak_jump_reference(in_dat, _, out_jump_base):
    """Run MosaikJump on the reference archive *in_dat*.

    in_dat:        passed to MosaikJump as -ia
    _:             the list of expected .jmp outputs; ignored by the body
    out_jump_base: passed to MosaikJump as -out

    The hash size (-hs) is read from the 'mapping' config section.
    """
    hash_size = cfg.getint('mapping', 'mosaik_hash_size')
    jump_cmd = 'MosaikJump -ia %s -out %s -hs %s' % (in_dat, out_jump_base,
                                                     hash_size)
    sys_call(jump_cmd)


@active_if(cfg.getboolean('mapping', 'run_mosaik'))
@transform(preprocessing.final_output, suffix(''), '.mosaik_reads_dat')
def run_mosaik_build_reads(in_fastq, out_dat):
    'convert reads to mosaik binary'
    # NOTE(review): p1, cmd1 and cmd2 are never assigned inside this function,
    # and neither in_fastq nor out_dat is used.  Unless those names exist at
    # module level, the first statement raises NameError -- the body looks
    # like it belongs to a different two-process pipeline helper.  Confirm
    # against the original source before relying on this task.
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # wait for the downstream process to finish
    p2.communicate()
    # surface a non-zero exit status of either process as CalledProcessError
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)


# When artifact filtering is enabled in the config, make that task the
# upstream stage: prev_output / prev_suffix are read by the @transform
# decorator of filter_min_quality below.
if cfg.getboolean('filtering', 'filter_artifacts'):
    prev_output = filter_artifacts
    prev_suffix = ''


@active_if(cfg.getboolean('filtering', 'filter_quality'))
@transform(prev_output, suffix(prev_suffix), '.min_qual',
           cfg.getint('filtering', 'filter_min_quality'),
           cfg.getint('filtering', 'filter_percent_bases_at_min'))
def filter_min_quality(in_fastq, out_fastq, min_qual, min_percent):
    """Filter reads by base quality with fastq_quality_filter.

    in_fastq:    FASTQ file streamed into the filter through `cat`
    out_fastq:   path the filter writes to (-o)
    min_qual:    minimum quality score, passed as -q
    min_percent: minimum percent of bases that must reach min_qual,
                 passed as -p

    Raises CalledProcessError if either process of the pipeline exits with
    a non-zero status.
    """
    cmd1 = 'cat %s' % in_fastq
    cmd2 = 'fastq_quality_filter -o %s -q %s -p %s' % (out_fastq, min_qual,
                                                       min_percent)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Drop the parent's copy of the pipe so `cat` gets SIGPIPE instead of
    # blocking forever if the filter exits before consuming all its input.
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the process is reaped; without this wait
    # a failure of `cat` (e.g. a missing input file) went unnoticed.
    p1.wait()
    # Report the filter first: when it dies early, `cat` fails only as a
    # consequence (broken pipe) and would hide the real cause.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)