Ejemplo n.º 1
0
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and create one bedgraph per strand.

    Args:
        in_files: pair ``(in_bed, in_chrom_sizes)``; the BED file of reads and
            the chromosome-sizes file handed to ``slopBed -g``.
        out_bedgraphs: sequence of two output paths.  Index 0 receives the
            coverage of lines whose sixth field is "+", index 1 the coverage
            of lines whose sixth field is "-".

    Each read is extended by ``fragment_size - tag_size`` (both read from the
    DEFAULT config section) with slopBed, filtered by strand with awk, and
    piped into bedItemOverlapCount.
    """
    in_bed, in_chrom_sizes = in_files
    extension = cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size")
    # Index both outputs up front (via the real parameter name) so that a
    # too-short out_bedgraphs fails before any command has been run.
    strand_outputs = (("+", out_bedgraphs[0]), ("-", out_bedgraphs[1]))
    for strand, out_path in strand_outputs:
        cmd = (
            """slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """
            + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
        ) % (
            in_bed,
            extension,
            in_chrom_sizes,
            strand,
            cfg.get("DEFAULT", "genome"),
            genome_path(),
            out_path,
        )
        sys_call(cmd)
Ejemplo n.º 2
0
def clip_and_sort_peaks(in_bed, out_sorted):
    """Constrain BED regions to the chromosome sizes, then sort them.

    bedClip writes its result to a named temporary file; that file is then
    sorted on the first field and numerically on the second field, with the
    sorted output redirected to ``out_sorted``.
    """
    clip_template = "bedClip %s %s.chrom.sizes %s"
    # $'\t' makes the shell pass a literal tab as the sort field separator
    sort_template = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s"
    with tempfile.NamedTemporaryFile() as clipped:
        clipped_path = clipped.name
        sys_call(clip_template % (in_bed, genome_path(), clipped_path))
        sys_call(sort_template % (clipped_path, out_sorted))
Ejemplo n.º 3
0
def clip_and_sort_peaks(in_bed, out_sorted):
    """Constrain BED regions to the chromosome sizes, then sort them.

    The clipped intermediate lives in a named temporary file that only exists
    for the duration of the two commands; the sorted result is redirected to
    ``out_sorted``.
    """
    with tempfile.NamedTemporaryFile() as scratch:
        clip_args = (in_bed, genome_path(), scratch.name)
        sys_call('bedClip %s %s.chrom.sizes %s' % clip_args)
        # sort on field 1, then numerically on field 2, using tab as separator
        sort_args = (scratch.name, out_sorted)
        sys_call(r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % sort_args)
Ejemplo n.º 4
0
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and create one bedgraph per strand.

    Args:
        in_files: pair ``(in_bed, in_chrom_sizes)``; the BED file of reads and
            the chromosome-sizes file handed to ``slopBed -g``.
        out_bedgraphs: sequence of two output paths.  Index 0 receives the
            coverage of lines whose sixth field is "+", index 1 the coverage
            of lines whose sixth field is "-".

    Each read is extended by ``fragment_size - tag_size`` (both read from the
    DEFAULT config section) with slopBed, filtered by strand with awk, and
    piped into bedItemOverlapCount.
    """
    in_bed, in_chrom_sizes = in_files
    extension = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))
    # Use the actual parameter name and resolve both outputs before running
    # anything, so a malformed out_bedgraphs fails early.
    plus_out, minus_out = out_bedgraphs[0], out_bedgraphs[1]
    for strand, out_path in (('+', plus_out), ('-', minus_out)):
        cmd = ("""slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """ +
               'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s') % (
                   in_bed, extension, in_chrom_sizes, strand,
                   cfg.get('DEFAULT', 'genome'), genome_path(), out_path)
        sys_call(cmd)
Ejemplo n.º 5
0
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and create a bedgraph from them.

    ``in_files`` is the pair ``(in_bed, in_chrom_sizes)``.  Reads are extended
    by ``fragment_size - tag_size`` with slopBed and the result is piped into
    bedItemOverlapCount, whose output is redirected to ``out_bedgraph``.
    """
    in_bed, in_chrom_sizes = in_files
    extend_by = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))
    genome_name = cfg.get('DEFAULT', 'genome')
    pipeline = ('slopBed -i %s -s -r %s -l 0 -g %s | '
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    sys_call(pipeline % (in_bed, extend_by, in_chrom_sizes,
                         genome_name, genome_path(), out_bedgraph))
Ejemplo n.º 6
0
def run_bowtie(in_fastq, out_bowtie):
    """Align reads to the reference using Bowtie.

    The fastq is decompressed with ``zcat`` and streamed to ``bowtie`` on
    stdin (the ``-`` argument); bowtie is given ``out_bowtie`` as its final
    argument.  Extra bowtie options come from the ``bowtie_params`` entry of
    the ``mapping`` config section.

    Raises:
        CalledProcessError: if either bowtie or zcat exits with a non-zero
            status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'bowtie %s %s - %s' % (genome_path(),
                                  cfg.get('mapping', 'bowtie_params'),
                                  out_bowtie)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Drop the parent's copy of the pipe so zcat receives SIGPIPE (instead of
    # blocking forever) if bowtie exits before consuming all of its input.
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the process has been waited on; without this
    # wait the zcat check below could never fire.
    p1.wait()
    # Report the downstream failure first: when bowtie dies, zcat's own
    # non-zero status is usually just the resulting broken pipe.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
Ejemplo n.º 7
0
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and create a bedgraph from them.

    ``in_files`` unpacks to ``(in_bed, in_chrom_sizes)``.  The slopBed output
    is piped straight into bedItemOverlapCount and redirected to
    ``out_bedgraph``.
    """
    in_bed, in_chrom_sizes = in_files
    # number of bases added to each read: fragment length minus tag length
    read_extension = cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size")
    genome_name = cfg.get("DEFAULT", "genome")
    chrom_sizes_prefix = genome_path()
    template = (
        "slopBed -i %s -s -r %s -l 0 -g %s | "
        "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    )
    values = (in_bed, read_extension, in_chrom_sizes, genome_name, chrom_sizes_prefix, out_bedgraph)
    sys_call(template % values)
Ejemplo n.º 8
0
def run_bowtie(in_fastq, out_bowtie):
    """Align reads to the reference using Bowtie.

    ``zcat`` decompresses ``in_fastq`` and its stdout is connected to the
    stdin of ``bowtie`` (the ``-`` argument); ``out_bowtie`` is passed as
    bowtie's final argument.  Additional bowtie options are taken from the
    ``bowtie_params`` entry of the ``mapping`` config section.

    Raises:
        CalledProcessError: if either bowtie or zcat exits with a non-zero
            status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'bowtie %s %s - %s' % (genome_path(),
                                  cfg.get('mapping', 'bowtie_params'),
                                  out_bowtie)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Close the parent's handle on the pipe so that zcat gets SIGPIPE rather
    # than hanging if bowtie terminates early.
    p1.stdout.close()
    p2.communicate()
    # Popen.returncode is None until wait()/poll()/communicate() is called, so
    # zcat must be reaped before its exit status can be inspected.
    p1.wait()
    # Check bowtie first: a bowtie failure typically also makes zcat exit
    # non-zero, and the bowtie error is the informative one.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
Ejemplo n.º 9
0
def run_tophat(in_fastq, out_tophat):
    """Gapped alignment of reads to the reference using TopHat.

    ``zcat`` decompresses ``in_fastq`` and streams it to ``tophat`` on stdin.
    TopHat is told to write into the directory ``<in_fastq>_tophat_out`` and
    to use ``hg19.refseq_genes.gff`` as its GTF annotation.  ``out_tophat`` is
    not referenced by the command itself.

    Raises:
        CalledProcessError: if either tophat or zcat exits with a non-zero
            status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'tophat %s - %s --output-dir=%s --GTF %s' % (
        genome_path(),
        cfg.get('mapping', 'bowtie_params'),
        '%s_tophat_out' % in_fastq,
        'hg19.refseq_genes.gff')
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Drop the parent's copy of the pipe so zcat receives SIGPIPE (instead of
    # blocking forever) if tophat exits before consuming all of its input.
    p1.stdout.close()
    p2.communicate()
    # returncode stays None until the process has been waited on; without this
    # wait the zcat check below could never fire.
    p1.wait()
    # Report the downstream failure first: when tophat dies, zcat's own
    # non-zero status is usually just the resulting broken pipe.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
Ejemplo n.º 10
0
def run_tophat(in_fastq, out_tophat):
    """Gapped alignment of reads to the reference using TopHat.

    The decompressed output of ``zcat in_fastq`` is connected to the stdin of
    ``tophat``.  The TopHat output directory is ``<in_fastq>_tophat_out`` and
    the GTF annotation is the fixed file ``hg19.refseq_genes.gff``;
    ``out_tophat`` does not appear in the command line.

    Raises:
        CalledProcessError: if either tophat or zcat exits with a non-zero
            status.
    """
    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'tophat %s - %s --output-dir=%s --GTF %s' % (
        genome_path(),
        cfg.get('mapping', 'bowtie_params'),
        '%s_tophat_out' % in_fastq,
        'hg19.refseq_genes.gff')
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Close the parent's handle on the pipe so that zcat gets SIGPIPE rather
    # than hanging if tophat terminates early.
    p1.stdout.close()
    p2.communicate()
    # Popen.returncode is None until wait()/poll()/communicate() is called, so
    # zcat must be reaped before its exit status can be inspected.
    p1.wait()
    # Check tophat first: a tophat failure typically also makes zcat exit
    # non-zero, and the tophat error is the informative one.
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    if p1.returncode:
        raise CalledProcessError(p1.returncode, cmd1)
Ejemplo n.º 11
0
def wig_to_bigwig(in_wig, out_bigwig):
    """Run wigToBigWig on ``in_wig``, writing the bigwig to ``out_bigwig``.

    The chromosome sizes file is ``<genome_path()>.chrom.sizes``.
    """
    command_args = (in_wig, genome_path(), out_bigwig)
    sys_call("wigToBigWig %s %s.chrom.sizes %s" % command_args)
Ejemplo n.º 12
0
def wig_to_bigwig(in_wig, out_bigwig):
    """Convert a wig file into a bigwig file via the wigToBigWig tool.

    Chromosome sizes are read from ``<genome_path()>.chrom.sizes``.
    """
    sizes_prefix = genome_path()
    sys_call('wigToBigWig %s %s.chrom.sizes %s'
             % (in_wig, sizes_prefix, out_bigwig))
Ejemplo n.º 13
0
def bedgraph_to_bigwig(in_bedgraph, out_bigwig):
    """Convert a bedgraph file to .bigwig for viewing on UCSC.

    Runs bedGraphToBigWig with ``<genome_path()>.chrom.sizes`` as the
    chromosome sizes file.
    """
    command_args = (in_bedgraph, genome_path(), out_bigwig)
    sys_call('bedGraphToBigWig %s %s.chrom.sizes %s' % command_args)
Ejemplo n.º 14
0
                                    str(int(float(foldchange))),'+']) + '\n')

@active_if(cfg.getboolean('peaks', 'run_QuEST'))
@transform(mapping.all_mappers_output,
    suffix('.mapped_reads'), '.mapped_reads_quest')
def bed_to_quest(in_bed, out_regions):
    """Rewrite a tab-delimited BED file as space-delimited positions.

    For each input line, the first two fields and the sixth field are written
    to ``out_regions`` separated by single spaces.
    """
    with open(in_bed) as bed_lines, open(out_regions, 'w') as quest_out:
        for record in bed_lines:
            columns = record.strip().split('\t')
            # fields 1-2 and field 6 of the BED line
            selected = (columns[0], columns[1], columns[5])
            quest_out.write(' '.join(selected) + '\n')

@collate(bed_to_quest,
         regex(r'(.*)\.(treat|control)\.(.*)\.mapped_reads_quest$'),
         r'\1.treat.\3.quest.peaks', '%s.chrom.sizes' % genome_path())
def run_quest(in_reads, out_peaks, chrom_sizes):
    """Run QuEST on the given treatment and control data.

    Args:
        in_reads: collection of input paths; the first one containing
            ``.treat.`` is used as the ChIP sample and the first one
            containing ``.control.`` as the control sample.
        out_peaks: destination for QuEST's accepted peak calls.
        chrom_sizes: path passed to ``generate_QuEST_parameters.pl -gt``.

    QuEST writes into ``<in_treat>_output``; its
    ``calls/peak_caller.ChIP.out.accepted`` file is copied to ``out_peaks``.

    Raises:
        IndexError: if no treatment or no control file is among ``in_reads``.
    """
    # Build real lists before indexing: filter() returns a non-subscriptable
    # iterator on Python 3, whereas a list comprehension works everywhere.
    in_treat = [f for f in in_reads if '.treat.' in f][0]
    in_control = [f for f in in_reads if '.control.' in f][0]
    # The echoed lines are fed to the script's stdin as its prompt answers.
    sys_call('echo "y\n1\n2\ny\n" | generate_QuEST_parameters.pl -QuEST_align_ChIP %s '
             '-QuEST_align_RX_noIP %s -gt %s -ap %s_output -silent' %
             (in_treat, in_control, chrom_sizes, in_treat))
    shutil.copy('%s_output/calls/peak_caller.ChIP.out.accepted' % in_treat, out_peaks)

@follows(run_quest)
@split(bed_to_quest, regex(r'(.*)\.treat\.(.*)\.mapped_reads_quest$'),
       r'\1.treat.\2.quest.*.wig',
       r'\1.treat.\2.quest.%s.wig',
       r'\1.treat.\2.mapped_reads_quest_output',
       '%s.chrom.sizes' % genome_path())
Ejemplo n.º 15
0
@transform(motif_select_random_seqs, suffix('.fasta'),
           '.nmica.discovered.motifs')
def discover_nmica_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running nestedMICA.

    Runs ``nminfer`` on ``in_fasta`` with the ``nmica_params`` options from
    the ``motifs`` config section, renames the resulting ``motifs.xms`` in the
    working directory to ``<in_fasta minus .fasta>.motifs.xms``, parses it and
    pickles the parsed motifs to ``out_motifs``.
    """
    cmd = 'nminfer -seqs %s %s ' % (in_fasta, cfg.get('motifs', 'nmica_params'))
    sys_call(cmd)
    motifs_name = in_fasta.replace('.fasta', '.motifs.xms')
    sys_call('mv motifs.xms %s' % motifs_name)
    motifs = sequence_motif.parse_xms_motifs(motifs_name)
    # Binary mode is what pickle expects, and the with-block guarantees the
    # file is flushed and closed even if dumping fails.
    with open(out_motifs, 'wb') as motifs_file:
        pickle.dump(motifs, motifs_file)

# motif enrichment
@follows(get_genome)
@files(None, '%s.genome_samples.size30.num%s.fasta' % (
    genome_path(), cfg.get('motifs', 'motif_threshold_sample_size')))
def sample_genome_short(_, out_samples):
    """Genomic sampling for threshold score.

    Builds a command line (output path, genome name, sample length 30 and the
    configured number of samples) and hands the split arguments to
    ``sampling.main``.
    """
    genome_name = cfg.get('DEFAULT', 'worldbase_genome')
    sample_count = cfg.get('motifs', 'motif_threshold_sample_size')
    command_line = '''%s --genome=%s --sample_length=30 --num_samples=%s
                       ''' % (out_samples, genome_name, sample_count)
    sampling.main(shlex.split(command_line))

@transform('*.known.motifs.transfac', suffix('.transfac'), '')
def convert_transfac_motifs(in_transfac, out_pickle):
    """Convert text files with motifs into our pickled format.

    Reads the whole of ``in_transfac``, parses it with
    ``sequence_motif.parseMotifsFromTransfac`` and pickles the result to
    ``out_pickle``.
    """
    # Context managers close both files deterministically instead of leaving
    # the handles to the garbage collector.
    with open(in_transfac) as transfac_file:
        transfac_str = transfac_file.read()
    m = sequence_motif.parseMotifsFromTransfac(transfac_str)
    # pickle output must go to a file opened in binary mode
    with open(out_pickle, 'wb') as pickle_file:
        pickle.dump(m, pickle_file)
Ejemplo n.º 16
0
def bed_to_bigbed(in_bed, out_bigbed):
    """Convert a BED file to .bigbed for viewing on the UCSC browser.

    Runs bedToBigBed with ``<genome_path()>.chrom.sizes`` as the chromosome
    sizes file.
    """
    command_args = (in_bed, genome_path(), out_bigbed)
    sys_call("bedToBigBed %s %s.chrom.sizes %s" % command_args)
Ejemplo n.º 17
0
def run_pash(in_fastq, out_pash):
    """Align reads using PASH.

    Invokes ``pash-3.0lx.exe`` with the genome path as ``-h``, the fastq as
    ``-v`` and ``out_pash`` as ``-o``, followed by the ``pash_params`` options
    from the ``mapping`` config section.
    """
    pash_args = (genome_path(), in_fastq, out_pash,
                 cfg.get('mapping', 'pash_params'))
    sys_call('pash-3.0lx.exe -h %s -v %s -o %s  %s ' % pash_args)
Ejemplo n.º 18
0
from subprocess import Popen, PIPE, CalledProcessError

from ruffus import (transform, follows, files, split, merge, add_inputs,
                    regex, suffix, jobs_limit, mkdir)
from ruffus.task import active_if
from pygr import worldbase, cnestedlist, seqdb
import pybedtools

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                           main_mutex as log_mtx)
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: References to map against for this run: always the genome, plus the
#: transcriptome fasta pattern when ``map_to_transcriptome`` is enabled.
reference_genomes = [genome_path()] + (
    ['*_genes.transcriptome.fasta']
    if cfg.getboolean('mapping', 'map_to_transcriptome') else [])

@follows(mkdir('mapped'))
def make_mapping_dir():
    """Placeholder task with an empty body.

    Its only declared prerequisite is ``mkdir('mapped')`` in the decorator.
    """


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta',
        r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.
Ejemplo n.º 19
0
def bed_to_bigbed(in_bed, out_bigbed):
    """Run bedToBigBed on ``in_bed`` to produce ``out_bigbed`` for UCSC viewing.

    Chromosome sizes are read from ``<genome_path()>.chrom.sizes``.
    """
    sizes_prefix = genome_path()
    sys_call('bedToBigBed %s %s.chrom.sizes %s'
             % (in_bed, sizes_prefix, out_bigbed))
Ejemplo n.º 20
0
def run_pash(in_fastq, out_pash):
    """Align reads using PASH.

    The command is ``pash-3.0lx.exe -h <genome> -v <fastq> -o <output>``
    followed by the ``pash_params`` value of the ``mapping`` config section.
    """
    reference = genome_path()
    extra_params = cfg.get('mapping', 'pash_params')
    sys_call('pash-3.0lx.exe -h %s -v %s -o %s  %s '
             % (reference, in_fastq, out_pash, extra_params))
Ejemplo n.º 21
0
from os.path import join

from ruffus import (transform, follows, files, split, merge, add_inputs,
                    regex, suffix, jobs_limit, mkdir)
from ruffus.task import active_if
from pygr import worldbase, cnestedlist, seqdb
import pybedtools

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                           main_mutex as log_mtx)
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: References to map against for this run: the genome is always included and
#: the transcriptome fasta pattern is added when ``map_to_transcriptome`` is set.
reference_genomes = [genome_path()]
if cfg.getboolean('mapping', 'map_to_transcriptome'):
    reference_genomes += ['*_genes.transcriptome.fasta']

@follows(mkdir('mapped'))
def make_mapping_dir():
    """Do nothing; this task only carries the ``mkdir('mapped')`` prerequisite."""


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta',
        r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.
Ejemplo n.º 22
0
def bedgraph_to_bigwig(in_bedgraph, out_bigwig):
    """Run bedGraphToBigWig to turn a bedgraph into a .bigwig for UCSC viewing.

    Chromosome sizes are read from ``<genome_path()>.chrom.sizes``.
    """
    sizes_prefix = genome_path()
    template = "bedGraphToBigWig %s %s.chrom.sizes %s"
    sys_call(template % (in_bedgraph, sizes_prefix, out_bigwig))
Ejemplo n.º 23
0
    """Discover sequence motifs in peaks by running nestedMICA"""
    cmd = 'nminfer -seqs %s %s ' % (in_fasta, cfg.get('motifs',
                                                      'nmica_params'))
    sys_call(cmd)
    motifs_name = in_fasta.replace('.fasta', '.motifs.xms')
    sys_call('mv motifs.xms %s' % motifs_name)
    motifs = sequence_motif.parse_xms_motifs(motifs_name)
    pickle.dump(motifs, open(out_motifs, 'w'))
    #args = shlex.split('%s %s' % (motifs_name, out_motifs))
    #parse_nmica_motifs.main(args)


# motif enrichment
@follows(get_genome)
@files(None,
       '%s.genome_samples.size30.num%s.fasta' % (
           genome_path(),
           cfg.get('motifs', 'motif_threshold_sample_size')))
def sample_genome_short(_, out_samples):
    """Genomic sampling for threshold score.

    Formats a command line naming the output file, the genome, a sample
    length of 30 and the configured sample count, then passes the split
    arguments to ``sampling.main``.
    """
    format_values = (out_samples,
                     cfg.get('DEFAULT', 'worldbase_genome'),
                     cfg.get('motifs', 'motif_threshold_sample_size'))
    command_line = '''%s --genome=%s --sample_length=30 --num_samples=%s
                       ''' % format_values
    argv = shlex.split(command_line)
    sampling.main(argv)


@transform('*.known.motifs.transfac', suffix('.transfac'), '')
def convert_transfac_motifs(in_transfac, out_pickle):
    """Convert text files with motifs into our pickled format.

    The full text of ``in_transfac`` is parsed by
    ``sequence_motif.parseMotifsFromTransfac`` and the parsed object is
    pickled to ``out_pickle``.
    """
    # Read inside a with-block so the input handle is closed promptly.
    with open(in_transfac) as transfac_file:
        transfac_str = transfac_file.read()
    m = sequence_motif.parseMotifsFromTransfac(transfac_str)
    # Binary mode is required for pickle data; the with-block ensures the
    # output is flushed and closed before the task returns.
    with open(out_pickle, 'wb') as pickle_file:
        pickle.dump(m, pickle_file)