def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and write one bedgraph per strand.

    in_files is a pair ``(in_bed, in_chrom_sizes)``.  out_bedgraphs is indexed
    as ``[0]`` for the "+" strand output and ``[1]`` for the "-" strand output.

    Each strand runs the same shell pipeline: slopBed pads the reads, awk
    keeps only lines whose 6th column equals the strand, and
    bedItemOverlapCount output is redirected to the per-strand file.
    """
    in_bed, in_chrom_sizes = in_files
    # One template for both strands; only the awk strand filter and the
    # output path differ between the two runs.
    pipeline = (
        """slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """
        + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    )
    for index, strand in enumerate("+-"):
        cmd = pipeline % (
            in_bed,
            # pad by fragment_size - tag_size (presumably the part of the
            # fragment the sequenced tag does not cover)
            cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size"),
            in_chrom_sizes,
            strand,
            cfg.get("DEFAULT", "genome"),
            genome_path(),
            # previously indexed the undefined name `out_bedgraph` (NameError)
            out_bedgraphs[index],
        )
        sys_call(cmd)
def clip_and_sort_peaks(in_bed, out_sorted):
    """Clip BED regions to the chromosome sizes, then sort them into out_sorted.

    bedClip writes into a named temporary file, which is removed when the
    ``with`` block exits; the sorted result is redirected to out_sorted.
    """
    with tempfile.NamedTemporaryFile() as clipped:
        clipped_path = clipped.name
        sys_call("bedClip %s %s.chrom.sizes %s" % (in_bed, genome_path(), clipped_path))
        # sort on column 1, then numerically on column 2, tab-delimited
        # (plain `sort` is used here rather than bedSort)
        sys_call(r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % (clipped_path, out_sorted))
def clip_and_sort_peaks(in_bed, out_sorted):
    """Constrain BED regions to the chromosome sizes and sort the result.

    The clipped intermediate lives in a named temporary file that is deleted
    when the ``with`` block ends; only out_sorted is kept.
    """
    with tempfile.NamedTemporaryFile() as scratch:
        clip_args = (in_bed, genome_path(), scratch.name)
        sys_call('bedClip %s %s.chrom.sizes %s' % clip_args)
        # tab-delimited sort: column 1, then column 2 numerically
        # (plain `sort` is used here rather than bedSort)
        sort_args = (scratch.name, out_sorted)
        sys_call(r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % sort_args)
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    """Extend reads to the full fragment length and create a bedgraph per strand.

    in_files is a pair ``(in_bed, in_chrom_sizes)``.  out_bedgraphs[0] receives
    the "+" strand bedgraph and out_bedgraphs[1] the "-" strand bedgraph.

    For each strand the reads are padded with slopBed, filtered by awk on the
    6th column, and piped to bedItemOverlapCount, whose output is redirected
    to the per-strand file.
    """
    in_bed, in_chrom_sizes = in_files
    # Shared command template; the strand symbol and output path are the only
    # parts that change between the two invocations.
    template = ("""slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    for position, strand in enumerate('+-'):
        # pad by fragment_size - tag_size (presumably the portion of the
        # fragment not covered by the sequenced tag)
        extension = (cfg.getint('DEFAULT', 'fragment_size')
                     - cfg.getint('DEFAULT', 'tag_size'))
        cmd = template % (in_bed, extension, in_chrom_sizes, strand,
                          cfg.get('DEFAULT', 'genome'), genome_path(),
                          # was `out_bedgraph[...]`: an undefined name (NameError)
                          out_bedgraphs[position])
        sys_call(cmd)
def bed_to_bedgraph(in_files, out_bedgraph):
    """Extend reads to the full fragment length and build a bedgraph from them.

    in_files is a pair ``(in_bed, in_chrom_sizes)``.  The reads are padded by
    slopBed and piped to bedItemOverlapCount, whose output is redirected to
    out_bedgraph.
    """
    in_bed, in_chrom_sizes = in_files
    # amount of padding passed to slopBed -r
    extension = (cfg.getint('DEFAULT', 'fragment_size')
                 - cfg.getint('DEFAULT', 'tag_size'))
    template = ('slopBed -i %s -s -r %s -l 0 -g %s | '
                'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s')
    sys_call(template % (in_bed, extension, in_chrom_sizes,
                         cfg.get('DEFAULT', 'genome'), genome_path(),
                         out_bedgraph))
def run_bowtie(in_fastq, out_bowtie):
    """Align reads to the reference using Bowtie.

    The fastq is decompressed with ``zcat`` and piped into ``bowtie``, which
    is given the genome path, the configured ``bowtie_params``, "-" as its
    input argument and out_bowtie as its output argument.

    Raises:
        CalledProcessError: if bowtie exits non-zero, or if zcat exits
            non-zero for a reason other than bowtie closing the pipe early.
    """
    import signal  # function-scope import: only needed for the SIGPIPE check

    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'bowtie %s %s - %s' % (genome_path(),
                                  cfg.get('mapping', 'bowtie_params'),
                                  out_bowtie)
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Drop the parent's copy of the pipe so zcat receives SIGPIPE (rather than
    # blocking forever on a full pipe) if bowtie stops reading early.
    p1.stdout.close()
    p2.communicate()
    # communicate() only reaps p2.  Without this wait() p1.returncode stays
    # None and a zcat failure (e.g. a corrupt archive) was never reported.
    p1.wait()
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    # A SIGPIPE exit from zcat only means bowtie finished first; depending on
    # the shell it shows up as -SIGPIPE or 128 + SIGPIPE.
    sigpipe_codes = (-signal.SIGPIPE, 128 + signal.SIGPIPE)
    if p1.returncode and p1.returncode not in sigpipe_codes:
        raise CalledProcessError(p1.returncode, cmd1)
def bed_to_bedgraph(in_files, out_bedgraph):
    """Pad reads out to the full fragment length and write them as a bedgraph.

    in_files is a pair ``(in_bed, in_chrom_sizes)``.  slopBed output is piped
    into bedItemOverlapCount and the result is redirected to out_bedgraph.
    """
    in_bed, in_chrom_sizes = in_files
    # value handed to slopBed -r
    pad = cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size")
    slop_part = "slopBed -i %s -s -r %s -l 0 -g %s | "
    count_part = "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    values = (
        in_bed,
        pad,
        in_chrom_sizes,
        cfg.get("DEFAULT", "genome"),
        genome_path(),
        out_bedgraph,
    )
    sys_call((slop_part + count_part) % values)
def run_tophat(in_fastq, out_tophat):
    """Gapped alignment of reads to the reference using TopHat.

    The fastq is decompressed with ``zcat`` and piped into ``tophat``.  TopHat
    is pointed at the ``<in_fastq>_tophat_out`` directory and at the
    hard-coded annotation file ``hg19.refseq_genes.gff``.

    NOTE(review): out_tophat is not referenced in the body; presumably the
    pipeline expects TopHat's output directory to stand in for it -- confirm.

    Raises:
        CalledProcessError: if tophat exits non-zero, or if zcat exits
            non-zero for a reason other than tophat closing the pipe early.
    """
    import signal  # function-scope import: only needed for the SIGPIPE check

    cmd1 = 'zcat %s' % in_fastq
    cmd2 = 'tophat %s - %s --output-dir=%s --GTF %s' % (
        genome_path(),
        cfg.get('mapping', 'bowtie_params'),
        '%s_tophat_out' % in_fastq,
        'hg19.refseq_genes.gff')
    p1 = Popen([cmd1], stdout=PIPE, shell=True)
    p2 = Popen([cmd2], stdin=p1.stdout, shell=True)
    # Drop the parent's copy of the pipe so zcat receives SIGPIPE (rather than
    # blocking forever on a full pipe) if tophat stops reading early.
    p1.stdout.close()
    p2.communicate()
    # communicate() only reaps p2.  Without this wait() p1.returncode stays
    # None and a zcat failure was never reported.
    p1.wait()
    if p2.returncode:
        raise CalledProcessError(p2.returncode, cmd2)
    # A SIGPIPE exit from zcat only means tophat finished first; depending on
    # the shell it shows up as -SIGPIPE or 128 + SIGPIPE.
    sigpipe_codes = (-signal.SIGPIPE, 128 + signal.SIGPIPE)
    if p1.returncode and p1.returncode not in sigpipe_codes:
        raise CalledProcessError(p1.returncode, cmd1)
def wig_to_bigwig(in_wig, out_bigwig):
    """Run wigToBigWig on in_wig, writing out_bigwig.

    The chromosome sizes file is ``<genome_path()>.chrom.sizes``.
    """
    arguments = (in_wig, genome_path(), out_bigwig)
    sys_call("wigToBigWig %s %s.chrom.sizes %s" % arguments)
def wig_to_bigwig(in_wig, out_bigwig):
    """Convert a wig file into a bigwig file with wigToBigWig.

    Chromosome sizes are read from ``<genome_path()>.chrom.sizes``.
    """
    template = 'wigToBigWig %s %s.chrom.sizes %s'
    sys_call(template % (in_wig, genome_path(), out_bigwig))
def bedgraph_to_bigwig(in_bedgraph, out_bigwig):
    """Turn a bedgraph into a .bigwig (for viewing on UCSC) via bedGraphToBigWig.

    Chromosome sizes are read from ``<genome_path()>.chrom.sizes``.
    """
    template = 'bedGraphToBigWig %s %s.chrom.sizes %s'
    sys_call(template % (in_bedgraph, genome_path(), out_bigwig))
str(int(float(foldchange))),'+']) + '\n') @active_if(cfg.getboolean('peaks', 'run_QuEST')) @transform(mapping.all_mappers_output, suffix('.mapped_reads'), '.mapped_reads_quest') def bed_to_quest(in_bed, out_regions): """Convert bed file input to space-delimited positions""" with open(in_bed) as infile: with open(out_regions, 'w') as outfile: for line in infile: fields = line.strip().split('\t') outfile.write(' '.join(fields[:2] + [fields[5]]) + '\n') @collate(bed_to_quest, regex(r'(.*)\.(treat|control)\.(.*)\.mapped_reads_quest$'), r'\1.treat.\3.quest.peaks', '%s.chrom.sizes' % genome_path()) def run_quest(in_reads, out_peaks, chrom_sizes): """Run QuEST on the given treatment and control data""" in_treat = filter(lambda f: '.treat.' in f, in_reads)[0] in_control = filter(lambda f: '.control.' in f, in_reads)[0] sys_call('echo "y\n1\n2\ny\n" | generate_QuEST_parameters.pl -QuEST_align_ChIP %s ' '-QuEST_align_RX_noIP %s -gt %s -ap %s_output -silent' % (in_treat, in_control, chrom_sizes, in_treat)) shutil.copy('%s_output/calls/peak_caller.ChIP.out.accepted' % in_treat, out_peaks) @follows(run_quest) @split(bed_to_quest, regex(r'(.*)\.treat\.(.*)\.mapped_reads_quest$'), r'\1.treat.\2.quest.*.wig', r'\1.treat.\2.quest.%s.wig', r'\1.treat.\2.mapped_reads_quest_output', '%s.chrom.sizes' % genome_path())
@transform(motif_select_random_seqs, suffix('.fasta'), '.nmica.discovered.motifs')
def discover_nmica_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running nestedMICA.

    Runs ``nminfer`` on in_fasta, moves the resulting ``motifs.xms`` next to
    the input as ``<in_fasta stem>.motifs.xms``, parses it and pickles the
    parsed motifs to out_motifs.
    """
    cmd = 'nminfer -seqs %s %s ' % (in_fasta, cfg.get('motifs', 'nmica_params'))
    sys_call(cmd)
    # the file is picked up from the fixed name motifs.xms; rename it so it
    # is tied to this input
    motifs_name = in_fasta.replace('.fasta', '.motifs.xms')
    sys_call('mv motifs.xms %s' % motifs_name)
    motifs = sequence_motif.parse_xms_motifs(motifs_name)
    # binary mode + context manager: pickle data is bytes, and the handle was
    # previously never closed
    with open(out_motifs, 'wb') as pickle_file:
        pickle.dump(motifs, pickle_file)


# motif enrichment
@follows(get_genome)
@files(None, '%s.genome_samples.size30.num%s.fasta' % (genome_path(),
                        cfg.get('motifs', 'motif_threshold_sample_size')))
def sample_genome_short(_, out_samples):
    """Genomic sampling for threshold score.

    Builds an argument list (output path, genome name, sample length 30 and
    the configured number of samples) and hands it to ``sampling.main``.
    """
    args = shlex.split('''%s --genome=%s --sample_length=30 --num_samples=%s
                       ''' % (out_samples,
                              cfg.get('DEFAULT', 'worldbase_genome'),
                              cfg.get('motifs', 'motif_threshold_sample_size')))
    sampling.main(args)


@transform('*.known.motifs.transfac', suffix('.transfac'), '')
def convert_transfac_motifs(in_transfac, out_pickle):
    """Convert text files with motifs into our pickled format.

    Reads the whole TRANSFAC text file, parses it with
    ``sequence_motif.parseMotifsFromTransfac`` and pickles the result to
    out_pickle.
    """
    # context managers so neither handle is left open
    with open(in_transfac) as transfac_file:
        transfac_str = transfac_file.read()
    m = sequence_motif.parseMotifsFromTransfac(transfac_str)
    # pickle data is bytes, so write in binary mode
    with open(out_pickle, 'wb') as pickle_file:
        pickle.dump(m, pickle_file)
def bed_to_bigbed(in_bed, out_bigbed):
    """Run bedToBigBed on in_bed to produce a .bigbed for the UCSC browser.

    Chromosome sizes are read from ``<genome_path()>.chrom.sizes``.
    """
    arguments = (in_bed, genome_path(), out_bigbed)
    sys_call("bedToBigBed %s %s.chrom.sizes %s" % arguments)
def run_pash(in_fastq, out_pash):
    """Align reads using PASH.

    Invokes ``pash-3.0lx.exe`` with the genome path as ``-h``, the reads as
    ``-v``, the output as ``-o`` and the configured ``pash_params`` appended.
    """
    template = 'pash-3.0lx.exe -h %s -v %s -o %s %s '
    reference = genome_path()
    extra_params = cfg.get('mapping', 'pash_params')
    sys_call(template % (reference, in_fastq, out_pash, extra_params))
from subprocess import Popen, PIPE, CalledProcessError from ruffus import (transform, follows, files, split, merge, add_inputs, regex, suffix, jobs_limit, mkdir) from ruffus.task import active_if from pygr import worldbase, cnestedlist, seqdb import pybedtools from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log, main_mutex as log_mtx) from hts_waterworks.bootstrap import (genome_path, get_genome, cfg, get_chrom_sizes) import hts_waterworks.preprocessing as preprocessing #: the references to map against for this run (genome, transcriptome, etc) reference_genomes = [genome_path()] if cfg.getboolean('mapping', 'map_to_transcriptome'): reference_genomes.append('*_genes.transcriptome.fasta') @follows(mkdir('mapped')) def make_mapping_dir(): pass @active_if(cfg.getboolean('mapping', 'map_to_transcriptome')) @split('*_genes', regex(r'(.*)_genes$'), [r'\1_genes.transcriptome.fasta', r'\1_genes.transcriptome.seqdb', r'\1_genes.transcriptome.msa']) def make_transcriptome(in_genes, out_files): """Splice UTR's and exons from gene annotations into a transcriptome.
def bed_to_bigbed(in_bed, out_bigbed):
    """Convert a BED file to .bigbed (for the UCSC browser) via bedToBigBed.

    Chromosome sizes are read from ``<genome_path()>.chrom.sizes``.
    """
    template = 'bedToBigBed %s %s.chrom.sizes %s'
    sys_call(template % (in_bed, genome_path(), out_bigbed))
from os.path import join from ruffus import (transform, follows, files, split, merge, add_inputs, regex, suffix, jobs_limit, mkdir) from ruffus.task import active_if from pygr import worldbase, cnestedlist, seqdb import pybedtools from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log, main_mutex as log_mtx) from hts_waterworks.bootstrap import (genome_path, get_genome, cfg, get_chrom_sizes) import hts_waterworks.preprocessing as preprocessing #: the references to map against for this run (genome, transcriptome, etc) reference_genomes = [genome_path()] if cfg.getboolean('mapping', 'map_to_transcriptome'): reference_genomes.append('*_genes.transcriptome.fasta') @follows(mkdir('mapped')) def make_mapping_dir(): pass @active_if(cfg.getboolean('mapping', 'map_to_transcriptome')) @split('*_genes', regex(r'(.*)_genes$'), [r'\1_genes.transcriptome.fasta', r'\1_genes.transcriptome.seqdb', r'\1_genes.transcriptome.msa']) def make_transcriptome(in_genes, out_files): """Splice UTR's and exons from gene annotations into a transcriptome.
def bedgraph_to_bigwig(in_bedgraph, out_bigwig):
    """Run bedGraphToBigWig on in_bedgraph to produce a .bigwig for UCSC.

    Chromosome sizes are read from ``<genome_path()>.chrom.sizes``.
    """
    arguments = (in_bedgraph, genome_path(), out_bigwig)
    sys_call("bedGraphToBigWig %s %s.chrom.sizes %s" % arguments)
"""Discover sequence motifs in peaks by running nestedMICA""" cmd = 'nminfer -seqs %s %s ' % (in_fasta, cfg.get('motifs', 'nmica_params')) sys_call(cmd) motifs_name = in_fasta.replace('.fasta', '.motifs.xms') sys_call('mv motifs.xms %s' % motifs_name) motifs = sequence_motif.parse_xms_motifs(motifs_name) pickle.dump(motifs, open(out_motifs, 'w')) #args = shlex.split('%s %s' % (motifs_name, out_motifs)) #parse_nmica_motifs.main(args) # motif enrichment @follows(get_genome) @files(None, '%s.genome_samples.size30.num%s.fasta' % (genome_path(), cfg.get('motifs', 'motif_threshold_sample_size'))) def sample_genome_short(_, out_samples): """Genomic sampling for threshold score""" args = shlex.split('''%s --genome=%s --sample_length=30 --num_samples=%s ''' % (out_samples, cfg.get('DEFAULT', 'worldbase_genome'), cfg.get('motifs', 'motif_threshold_sample_size'))) sampling.main(args) @transform('*.known.motifs.transfac', suffix('.transfac'), '') def convert_transfac_motifs(in_transfac, out_pickle): """Convert text files with motifs into our pickled format""" transfac_str = open(in_transfac).read() m = sequence_motif.parseMotifsFromTransfac(transfac_str) pickle.dump(m, open(out_pickle, 'w'))