Example #1
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks"""
    out_sample, out_locations = out_files[:2]
    peak_lengths = array(
        'i', (stop - start
              for chrom, start, stop, strand in readBedLines(open(in_peaks))))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)

    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(
        wb_genome,
        peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_',
        weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join([
                    line.id,
                    str(line.start),
                    str(line.stop),
                    str(index), '0', '+' if line.orientation == 1 else '-'
                ]) + '\n')
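For reference, a minimal sketch of the two records the loop above writes per sample: a FASTA entry named by the sample index, and a scored BED6 line recording where the sample was drawn from. The Region namedtuple is a hypothetical stand-in for the objects yielded by sampling.sample_genome:

from collections import namedtuple

# hypothetical stand-in for one sampled region (id/start/stop/orientation
# mirror the attributes used above; an explicit seq field is assumed here)
Region = namedtuple('Region', 'id start stop orientation seq')

def format_sample(index, region):
    """Return the (fasta, bed6) record pair written for one sampled region."""
    fasta = '>%s\n%s\n' % (index, region.seq)
    bed6 = '\t'.join([region.id, str(region.start), str(region.stop),
                      str(index), '0',
                      '+' if region.orientation == 1 else '-']) + '\n'
    return fasta, bed6

fasta, bed6 = format_sample(0, Region('chr1', 1000, 1016, 1, 'ACGT' * 4))
print fasta + bed6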
Example #2
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks"""
    out_sample, out_locations = out_files[:2]
    peak_lengths = array('i', (stop - start for chrom, start, stop, strand in
                        readBedLines(open(in_peaks))))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(wb_genome, peak_lengths,
                               sampleSize=cfg.getint('motifs',
                                        'motif_significance_sample_size'),
                               excludeRepeat=cfg.getboolean('motifs',
                                                'sampling_exclude_repeats'),
                               excludeN=cfg.getboolean('motifs',
                                                'sampling_exclude_N'),
                               ignoreCharacters='_', weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                outfile.write('>%s\n%s\n' % (index, line))
                outlocations.write('\t'.join([line.id, str(line.start),
                                             str(line.stop), str(index), '0',
                                '+' if line.orientation == 1 else '-']) + '\n')
Example #3
def trim_regex(in_fastq, out_fastq, trim_pattern):
    """Search the reads for a regex, and trim everything matching the pattern
        and all succeeding sequence.
    
    """
    pattern = re.compile(trim_pattern)
    with gzip.open(in_fastq) as infile:
        with gzip.open(out_fastq, 'w') as outfile:
            for header, seq, qual in parseFastq(infile):
                matches = [m.span() for m in pattern.finditer(seq)]
                if len(matches) > 0:
                    # match to re found--
                    #   trim the right-most hit and add the trimmed sequence to the read ID
                    m = matches[-1]
                    header = seq[m[0]:] + '_' + header
                    seq = seq[:m[0]]
                    qual = qual[:m[0]]
                if len(matches) > 0 or not cfg.getboolean('filtering', 'require_regex'):
                    if len(seq) >= 10:  # TODO: add adjustable min length
                        outfile.write('@%s\n%s\n+%s\n%s\n' % (header, seq,
                                                              header, qual))
Example #4
def trim_regex(in_fastq, out_fastq, trim_pattern):
    """Search the reads for a regex, and trim everything matching the pattern
        and all succeeding sequence.
    
    """
    pattern = re.compile(trim_pattern)
    with gzip.open(in_fastq) as infile:
        with gzip.open(out_fastq, 'w') as outfile:
            for header, seq, qual in parseFastq(infile):
                matches = [m.span() for m in pattern.finditer(seq)]
                if len(matches) > 0:
                    # match to re found--
                    #   trim the right-most hit and add the trimmed sequence to the read ID
                    m = matches[-1]
                    header = seq[m[0]:] + '_' + header
                    seq = seq[:m[0]]
                    qual = qual[:m[0]]
                if len(matches) > 0 or not cfg.getboolean(
                        'filtering', 'require_regex'):
                    if len(seq) >= 10:  # TODO: add adjustable min length
                        outfile.write('@%s\n%s\n+%s\n%s\n' %
                                      (header, seq, header, qual))
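A self-contained sketch of the trimming rule used by trim_regex above: the right-most regex hit and everything after it are cut from the read, and the clipped tail is prepended to the read ID. The pattern and read below are made up for illustration:

import re

pattern = re.compile('TTTT+')            # hypothetical adapter-like pattern
header, seq, qual = 'read1', 'ACGTTTTTTGCA', 'I' * 12

matches = [m.span() for m in pattern.finditer(seq)]
if matches:
    start = matches[-1][0]               # left edge of the right-most hit
    header = seq[start:] + '_' + header  # keep the clipped tail in the ID
    seq, qual = seq[:start], qual[:start]

print header  # TTTTTTGCA_read1
print seq     # ACG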
Example #5
import os
from subprocess import check_call

from Bio import SeqIO
from ruffus import (transform, follows, collate, files, split, merge,
                    suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg
from hts_waterworks.utils.common import parseFastq

# filtering
original_reads = '*.fastq'
prev_output = original_reads
prev_suffix = '.fastq'

@active_if(cfg.getboolean('filtering', 'convert_sanger_to_illumina'))
@transform(prev_output, suffix(prev_suffix), '.fastq_illumina')
def convert_fastq(in_fastq, out_fastq):
    'convert sanger fastq format (phred-33) to illumina format (phred-64)'
    base_out = os.path.splitext(out_fastq)[0]
    records = SeqIO.parse(in_fastq, "fastq")
    with open(base_out, 'w') as outfile:
        SeqIO.write(records, outfile, "fastq-illumina")
    check_call('gzip %s' % base_out, shell=True)
if cfg.getboolean('filtering', 'convert_sanger_to_illumina'):
    prev_output = convert_fastq
    prev_suffix = ''



@active_if(cfg.getboolean('filtering', 'clip_adapter'))
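A note on convert_fastq above: Sanger FASTQ stores a Phred score Q as ASCII(Q + 33), while Illumina 1.3+ FASTQ uses ASCII(Q + 64), so the conversion shifts every quality character up by 31. Biopython's "fastq-illumina" writer handles this internally; a minimal sketch of the offset itself:

def sanger_to_illumina_qual(qual):
    """Shift a Sanger (phred-33) quality string to Illumina 1.3+ (phred-64)."""
    return ''.join(chr(ord(c) + 31) for c in qual)

print sanger_to_illumina_qual('IIII#')  # -> 'hhhhB'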
Example #6
    with open(in_fasta) as infile:
        seqs = list(parseFastaLines(infile))
        if len(seqs) <= cfg.getint('motifs', 'motif_chunk_size'):
            num_chunks = 1
        else:
            num_chunks = cfg.getint('motifs', 'motif_num_chunks')
        # get a random sample of peaks
        for i in xrange(num_chunks):
            with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
                subset = random.sample(seqs, min(len(seqs),
                                    cfg.getint('motifs', 'motif_chunk_size')))
                outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                                                                for s in subset)

# motif discovery
@active_if(cfg.getboolean('motifs', 'run_meme'))
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(motif_select_random_seqs,
           #suffix('.fasta'), '.meme.discovered.motifs')
           #regex(r'(.*(?=_around).*(?=top).*).fasta$'),
           regex(r'(.*(?=top).*).fasta$'),
           r'\1.meme.discovered.motifs')
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme"""
    cmd = 'meme %s %s -oc %s_meme_out ' % (in_fasta,
                                           cfg.get('motifs', 'meme_params'),
                                           out_motifs)
    #if 'top' in in_fasta and 'around' in in_fasta:
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s_meme_out/meme.txt' % out_motifs)
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
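A detail in the random sampling above worth noting: the min() guard is required because random.sample raises ValueError when the requested sample exceeds the population, which is also why num_chunks collapses to 1 for small inputs. A quick illustration:

import random

seqs = range(5)
print random.sample(seqs, min(len(seqs), 10))  # all 5 items, shuffled order
# random.sample(seqs, 10) would raise ValueError: sample larger than population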
Example #7
@transform(
    call_peaks.all_peak_caller_functions
    + [pas_seq.remove_terminal_exon]
    + [clip_seq.search_genome_consensus]
    + mapping.all_mappers_output
    + mapping.all_mappers_raw_reads,
    suffix(""),
    ".clipped.sorted",
)
def clip_and_sort_peaks(in_bed, out_sorted):
    """Sort the bed file and constrain bed regions to chromosome sizes"""
    with tempfile.NamedTemporaryFile() as tmp_clipped:
        cmd = "bedClip %s %s.chrom.sizes %s" % (in_bed, genome_path(), tmp_clipped.name)
        sys_call(cmd)
        # cmd = 'bedSort %s %s' % (out_clipped, out_sorted)
        cmd = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % (tmp_clipped.name, out_sorted)
        sys_call(cmd)


@active_if(cfg.getboolean("visualization", "uniquefy_track"))
@transform(
    [clip_and_sort_peaks] + mapping.all_mappers_output,
    suffix(""),
    ".unique",
    cfg.getint("visualization", "uniquefy_track_max_reads"),
)
def bed_uniquefy(in_bed, out_bed, max_reads):
    "Given a sorted bed file, remove tags that are on the same start, strand"
    with open(in_bed) as infile:
        with open(out_bed, "w") as outfile:
            prev_start, prev_chrom = None, None
            plus_seen, minus_seen = 0, 0
            for line in infile:
                fields = line.split("\t")
                chrom, start, stop = fields[:3]
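bed_uniquefy is cut off above; based on its docstring and the plus_seen/minus_seen counters, a self-contained sketch of the intended behavior (keep at most max_reads tags per chrom/start/strand; a BED6 layout with strand in column 6 is assumed):

def bed_uniquefy_sketch(in_lines, max_reads):
    """Yield at most max_reads lines per (chrom, start, strand) from sorted
    BED input. A sketch of the logic, not the pipeline's exact code."""
    prev_start, prev_chrom = None, None
    plus_seen, minus_seen = 0, 0
    for line in in_lines:
        fields = line.split('\t')
        chrom, start = fields[0], fields[1]
        if prev_start is None or prev_start != start or prev_chrom != chrom:
            # new position: reset the per-strand counters
            prev_start, prev_chrom = start, chrom
            plus_seen, minus_seen = 0, 0
        strand = fields[5].strip() if len(fields) >= 6 else '+'  # assumed BED6
        if strand == '+':
            plus_seen += 1
            if plus_seen <= max_reads:
                yield line
        else:
            minus_seen += 1
            if minus_seen <= max_reads:
                yield line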
Example #8
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count in
    treatment and control is the same.  This may generate many downsampled datasets.
    """
    # WARNING: this is a circular dependency.  It has to be included at runtime
    #    Top-level import will cause this module to load only 1/2 way
    #    we import here because we need to call this function directly,
    #    and not just when using ruffus
    # bed_clip_and_sort is called below; assumed to live in visualize as well
    from hts_waterworks.visualize import bed_uniquefy, bed_clip_and_sort
    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
    else:
        in_treat, in_control = in_files
        out_treat_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_treat)
        out_control_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_control)
        if out_treat_template == in_treat:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                                in_treat, out_treat_template))
        if out_control_template == in_control:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_control, out_control_template))
        tmp_t_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_t_unique = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_unique = tempfile.NamedTemporaryFile(delete=False).name
        
        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)
        
        # uniquefy the reads
        bed_uniquefy(tmp_t_sorted, tmp_t_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        bed_uniquefy(tmp_c_sorted, tmp_c_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        
        total_treat = sum(1 for l in open(tmp_t_unique))
        total_control = sum(1 for l in open(tmp_c_unique))
        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
        else:
            # downsample num_down_sample times
            for i in xrange(cfg.getint('peaks', 'num_down_samples')):
                out_treat = out_treat_template % i
                out_control = out_control_template % i
                if total_treat > total_control:
                    # reduce number of treatment reads
                    inds_to_keep = set(random.sample(xrange(total_treat),
                                                                total_control))
                    in_orig, out_orig = tmp_c_unique, out_control
                    in_subset, out_subset = tmp_t_unique, out_treat
                else:
                    # reduce number of control reads
                    inds_to_keep = set(random.sample(xrange(total_control),
                                                     total_treat))
                    in_orig, out_orig = tmp_t_unique, out_treat
                    in_subset, out_subset = tmp_c_unique, out_control
                sys_call('cp %s %s' % (in_orig, out_orig))
                # subset the tags
                with open(in_subset) as infile:
                    with open(out_subset, 'w') as outfile:
                        outfile.writelines(line for i, line in enumerate(infile) 
                                                        if i in inds_to_keep)
        for f in [tmp_t_sorted, tmp_t_unique, tmp_c_sorted, tmp_c_unique]:
            os.unlink(f)
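The subsetting at the end keeps lines by index membership, the standard trick for downsampling a file to an exact line count. A toy version:

import random

lines = ['chr1\t%d\t%d\tread%d\n' % (i, i + 36, i) for i in xrange(10)]
keep = set(random.sample(xrange(len(lines)), 4))   # downsample 10 -> 4 reads
subset = [line for i, line in enumerate(lines) if i in keep]
print len(subset)  # 4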
Example #9
import hts_waterworks.call_peaks as call_peaks
import hts_waterworks.mapping as mapping



@transform('%s.*.gtfgenes' % cfg.get('DEFAULT', 'genome'), suffix('.gtfgenes'), '_genes')
def convert_gtf_genes_to_bed(in_gtf, out_gene_pred):
    """convert gtf genes to UCSC's genePred format"""
    sys_call('gtfToGenePred %s %s' % (in_gtf, out_gene_pred), file_log=False)

@transform('%s.*.gff3genes' % cfg.get('DEFAULT', 'genome'), suffix('.gff3genes'), '_genes')
def convert_gff3_genes_to_bed(in_gff3, out_gene_pred):
    """convert gff3 genes to UCSC's genePred format"""
    sys_call('gff3ToGenePred %s %s' % (in_gff3, out_gene_pred), file_log=False)

@active_if(cfg.getboolean('genes','download_refseq'))
@files(None, '%s.refseq_genes' % cfg.get('DEFAULT', 'genome'))
def get_refseq_genes(_, out_genes):
    """Download refseq genes from UCSC and reformat as BED"""
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/refGene.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f refGene.txt.gz')
    sys_call('mv refGene.txt %s' % out_genes)


@transform([get_refseq_genes, convert_gtf_genes_to_bed,
            convert_gff3_genes_to_bed],
    suffix('_genes'), '_genes.all')
def refseq_genes_to_bed(in_genes, out_bed):
    """convert refseq genes file to BED format"""
Example #10
@active_if(cfg.getint('PAS-Seq', 'min_read_count') > 0)
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(mapping.all_mappers_output, suffix('.mapped_reads'),
           '.overlap.mapped_reads', cfg.getint('PAS-Seq', 'min_read_count'))
def remove_nonoverlapping_reads(in_bed, out_bed, min_read_count):
    """
    Remove mapped reads that don't overlap with at least *min_read_count* reads
    """
    cmd = "intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |" \
          r"cut -f 1,2,3,4,5,6 > %s" % (in_bed, in_bed, min_read_count + 1,
                                        out_bed)
    sys_call(cmd, file_log=False)
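The min_read_count + 1 in the awk filter accounts for intersectBed -c counting each read's overlap with itself when -a and -b are the same file, so requiring min_read_count other reads means a count of at least min_read_count + 1. A sketch of the composed command (paths hypothetical):

in_bed, out_bed = 'sample.mapped_reads', 'sample.overlap.mapped_reads'
min_read_count = 2
cmd = "intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |" \
      r"cut -f 1,2,3,4,5,6 > %s" % (in_bed, in_bed, min_read_count + 1, out_bed)
print cmd  # awk keeps reads whose overlap count (last field) is >= 3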


@active_if(cfg.getboolean('PAS-Seq', 'merge_adjacent_reads'))
#@split(mapping.all_mappers_output, regex('(.*).mapped_reads$'),
@split(remove_nonoverlapping_reads, regex('(.*).mapped_reads$'),
           [r'\1.merged.mapped_reads', r'\1.merged.pileup_reads'],
           cfg.getint('PAS-Seq', 'merge_window_width'),
           cfg.getint('PAS-Seq', 'merge_num_iterations'),
           r'\1.merged.mapped_reads', r'\1.merged.pileup_reads',
           cfg.getint('PAS-Seq', 'min_read_count'))
def merge_adjacent_reads(in_bed, out_pattern, window_width, iterations,
                         out_merged, out_pileup, min_read_count):
    """Reassign read ends to a weighted average of adjacent reads"""
    # helper functions for parsing bed files; skip blank lines and
    #   comment ('#') or quoted ('"') header lines
    filter_lines = lambda l: l.strip() and not (l.startswith('#') or
                                                l.startswith('"'))
    read_bed_lines = lambda infile: itertools.ifilter(filter_lines, infile)
    
Example #11
import shutil

from ruffus import (transform, follows, collate, files, split, merge,
                    add_inputs, regex, suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                           main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg, get_chrom_sizes, genome_path
import hts_waterworks.mapping as mapping
import hts_waterworks.clip_seq as clip_seq
from hts_waterworks.utils.common import (bedCommentFilter, readBedLines,
                                         parse_ucsc_range)


@active_if(cfg.getboolean('peaks', 'run_macs'))
@collate(mapping.all_mappers_output, regex(r'(.+)\.treat(.*)\.mapped_reads'), 
         add_inputs(r'\1.control\2.mapped_reads'), r'\1.treat\2.macs.peaks',
         cfg.getfloat('peaks', 'max_FDR'))
def run_macs(in_files, out_peaks, max_fdr):
    """Call peak with MACS (v1.3).
    Apply a maximum FDR threshold and treat centers as peak summits
    
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                               cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
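A quick check of the name derivation in run_macs: the regex splits the treatment filename around '.treat' and the pieces are rejoined into the MACS output prefix:

import re

in_treat = 'exp1.treat.bowtie.mapped_reads'    # hypothetical filename
matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
print matches                                  # ('exp1.treat', '.bowtie')
print matches[0] + matches[1] + '.macs.peaks'  # exp1.treat.bowtie.macs.peaks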
Example #12
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count in
    treatment and control is the same.  This may generate many downsampled datasets.
    """
    # WARNING: this is a circular dependency.  It has to be included at runtime
    #    Top-level import will cause this module to load only 1/2 way
    #    we import here because we need to call this function directly,
    #    and not just when using ruffus
    # bed_clip_and_sort is called below; assumed to live in visualize as well
    from hts_waterworks.visualize import bed_uniquefy, bed_clip_and_sort
    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
    else:
        in_treat, in_control = in_files
        out_treat_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_treat)
        out_control_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_control)
        if out_treat_template == in_treat:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                                in_treat, out_treat_template))
        if out_control_template == in_control:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_control, out_control_template))
        tmp_t_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_t_unique = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_unique = tempfile.NamedTemporaryFile(delete=False).name
        
        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)
        
        # uniquefy the reads
        bed_uniquefy(tmp_t_sorted, tmp_t_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        bed_uniquefy(tmp_c_sorted, tmp_c_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        
        total_treat = sum(1 for l in open(tmp_t_unique))
        total_control = sum(1 for l in open(tmp_c_unique))
        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
        else:
            # downsample num_down_sample times
            for i in xrange(cfg.getint('peaks', 'num_down_samples')):
                out_treat = out_treat_template % i
                out_control = out_control_template % i
                if total_treat > total_control:
                    # reduce number of treatment reads
                    inds_to_keep = set(random.sample(xrange(total_treat),
                                                                total_control))
                    in_orig, out_orig = tmp_c_unique, out_control
                    in_subset, out_subset = tmp_t_unique, out_treat
                else:
                    # reduce number of control reads
                    inds_to_keep = set(random.sample(xrange(total_control),
                                                     total_treat))
                    in_orig, out_orig = tmp_t_unique, out_treat
                    in_subset, out_subset = tmp_c_unique, out_control
                sys_call('cp %s %s' % (in_orig, out_orig))
                # subset the tags
                with open(in_subset) as infile:
                    with open(out_subset, 'w') as outfile:
                        outfile.writelines(line for i, line in enumerate(infile) 
                                                        if i in inds_to_keep)
        for f in [tmp_t_sorted, tmp_t_unique, tmp_c_sorted, tmp_c_unique]:
            os.unlink(f)
Example #13
from ruffus import (transform, follows, files, split, merge, add_inputs,
                    regex, suffix, jobs_limit, mkdir)
from ruffus.task import active_if
from pygr import worldbase, cnestedlist, seqdb
import pybedtools

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                           main_mutex as log_mtx)
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: the references to map against for this run (genome, transcriptome, etc)
reference_genomes = [genome_path()]
if cfg.getboolean('mapping', 'map_to_transcriptome'):
    reference_genomes.append('*_genes.transcriptome.fasta')

@follows(mkdir('mapped'))
def make_mapping_dir():
    pass


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta',
        r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.
    Creates a fasta-file of resulting genes and a gene to genome alignment.
Example #14
        if len(seqs) <= cfg.getint('motifs', 'motif_chunk_size'):
            num_chunks = 1
        else:
            num_chunks = cfg.getint('motifs', 'motif_num_chunks')
        # get a random sample of peaks
        for i in xrange(num_chunks):
            with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
                subset = random.sample(
                    seqs,
                    min(len(seqs), cfg.getint('motifs', 'motif_chunk_size')))
                outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                                   for s in subset)


# motif discovery
@active_if(cfg.getboolean('motifs', 'run_meme'))
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(
    motif_select_random_seqs,
    #suffix('.fasta'), '.meme.discovered.motifs')
    #regex(r'(.*(?=_around).*(?=top).*).fasta$'),
    regex(r'(.*(?=top).*).fasta$'),
    r'\1.meme.discovered.motifs')
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme"""
    cmd = 'meme %s %s -oc %s_meme_out ' % (
        in_fasta, cfg.get('motifs', 'meme_params'), out_motifs)
    #if 'top' in in_fasta and 'around' in in_fasta:
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s_meme_out/meme.txt' %
                                            out_motifs)
Example #15
@transform('%s.*.gtfgenes' % cfg.get('DEFAULT', 'genome'), suffix('.gtfgenes'),
           '_genes')
def convert_gtf_genes_to_bed(in_gtf, out_gene_pred):
    """convert gtf genes to UCSC's genePred format"""
    sys_call('gtfToGenePred %s %s' % (in_gtf, out_gene_pred), file_log=False)


@transform('%s.*.gff3genes' % cfg.get('DEFAULT', 'genome'),
           suffix('.gff3genes'), '_genes')
def convert_gff3_genes_to_bed(in_gff3, out_gene_pred):
    """convert gff3 genes to UCSC's genePred format"""
    sys_call('gff3ToGenePred %s %s' % (in_gff3, out_gene_pred), file_log=False)


@active_if(cfg.getboolean('genes', 'download_refseq'))
@files(None, '%s.refseq_genes' % cfg.get('DEFAULT', 'genome'))
def get_refseq_genes(_, out_genes):
    """Download refseq genes from UCSC and reformat as BED"""
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/refGene.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f refGene.txt.gz')
    sys_call('mv refGene.txt %s' % out_genes)


@transform(
    [get_refseq_genes, convert_gtf_genes_to_bed, convert_gff3_genes_to_bed],
    suffix('_genes'), '_genes.all')
def refseq_genes_to_bed(in_genes, out_bed):
    """convert refseq genes file to BED format"""
Example #16
from ruffus import (transform, follows, files, split, merge, add_inputs,
                    regex, suffix, jobs_limit, mkdir)
from ruffus.task import active_if
from pygr import worldbase, cnestedlist, seqdb
import pybedtools

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                           main_mutex as log_mtx)
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: the references to map against for this run (genome, transcriptome, etc)
reference_genomes = [genome_path()]
if cfg.getboolean('mapping', 'map_to_transcriptome'):
    reference_genomes.append('*_genes.transcriptome.fasta')

@follows(mkdir('mapped'))
def make_mapping_dir():
    pass


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta',
        r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.
    Creates a fasta-file of resulting genes and a gene to genome alignment.
Example #17
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@follows(bootstrap.get_chrom_sizes)
@transform(call_peaks.all_peak_caller_functions + [pas_seq.remove_terminal_exon] + [clip_seq.search_genome_consensus] + mapping.all_mappers_output + mapping.all_mappers_raw_reads,
           suffix(''), '.clipped.sorted')
def clip_and_sort_peaks(in_bed, out_sorted):
    """Sort the bed file and constrain bed regions to chromosome sizes"""
    with tempfile.NamedTemporaryFile() as tmp_clipped:
        cmd = 'bedClip %s %s.chrom.sizes %s' % (in_bed, genome_path(),
                                                tmp_clipped.name)
        sys_call(cmd)
        #cmd = 'bedSort %s %s' % (out_clipped, out_sorted)
        cmd = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % (tmp_clipped.name, out_sorted)
        sys_call(cmd)

@active_if(cfg.getboolean('visualization', 'uniquefy_track'))
@transform([clip_and_sort_peaks] + mapping.all_mappers_output, suffix(''),
           '.unique',
           cfg.getint('visualization', 'uniquefy_track_max_reads'))
def bed_uniquefy(in_bed, out_bed, max_reads):
    'Given a sorted bed file, remove tags that are on the same start, strand'
    with open(in_bed) as infile:
        with open(out_bed, 'w') as outfile:
            prev_start, prev_chrom = None, None
            plus_seen, minus_seen = 0, 0
            for line in infile:
                fields = line.split('\t')
                chrom, start, stop = fields[:3]
                if prev_start is None or prev_start != start or \
                                                       prev_chrom != chrom:
                    prev_start, prev_chrom = start, chrom
import os
from subprocess import check_call

from Bio import SeqIO
from ruffus import (transform, follows, collate, files, split, merge, suffix,
                    mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg
from hts_waterworks.utils.common import parseFastq

# filtering
original_reads = '*.fastq'
prev_output = original_reads
prev_suffix = '.fastq'


@active_if(cfg.getboolean('filtering', 'convert_sanger_to_illumina'))
@transform(prev_output, suffix(prev_suffix), '.fastq_illumina')
def convert_fastq(in_fastq, out_fastq):
    'convert sanger fastq format (phred-33) to illumina format (phred-64)'
    base_out = os.path.splitext(out_fastq)[0]
    records = SeqIO.parse(in_fastq, "fastq")
    with open(base_out, 'w') as outfile:
        SeqIO.write(records, outfile, "fastq-illumina")
    check_call('gzip %s' % base_out, shell=True)


if cfg.getboolean('filtering', 'convert_sanger_to_illumina'):
    prev_output = convert_fastq
    prev_suffix = ''