def summarize_rpf_counts(inputfiles, outputfile):
    """Run $SUMMARIZE_RPF_COUNTS over the per-sample CDS read count tables.

    Two space-separated argument strings are assembled and handed to the
    command through the runproc() template:

    * ``filemapping`` -- one ``sample:path`` item per entry of
      Options.SHORTTAG_SAMPLES, where the path comes from
      Paths.cds_read_count_table(sample).
    * ``pairs`` -- one ``group:polya:rpf`` item per entry of
      Options.RPF_PAIRS.  Each entry is unpacked as
      ``(group, (rpf, polya))``, so the two sample names are emitted in
      the opposite order from how they are stored.

    inputfiles is not referenced directly in the body.  The command's
    standard output is redirected to outputfile.
    """
    # NOTE(review): runproc() is defined outside this block; the $name
    # placeholders presumably resolve against this function's variables
    # (filemapping, pairs, outputfile) -- confirm before renaming them.
    table_specs = []
    for tag in Options.SHORTTAG_SAMPLES:
        table_specs.append('%s:%s' % (tag, Paths.cds_read_count_table(tag)))
    filemapping = ' '.join(table_specs)

    pair_specs = []
    for group, (rpf_sample, polya_sample) in Options.RPF_PAIRS:
        pair_specs.append('%s:%s:%s' % (group, polya_sample, rpf_sample))
    pairs = ' '.join(pair_specs)

    runproc(
        """ $SUMMARIZE_RPF_COUNTS "$filemapping" "$pairs" > $outputfile""",
        outputfile)
def summarize_rpf_counts(inputfiles, outputfile):
    """Run $SUMMARIZE_RPF_COUNTS over the per-sample CDS read count tables.

    Two space-separated argument strings are assembled and handed to the
    command through the runproc() template:

    * ``filemapping`` -- one ``sample:path`` item per entry of
      Options.SHORTTAG_SAMPLES, where the path comes from
      Paths.cds_read_count_table(sample).
    * ``pairs`` -- one ``group:polya:rpf`` item per entry of
      Options.RPF_PAIRS.  Each entry is unpacked as
      ``(group, (rpf, polya))``, so the two sample names are emitted in
      the opposite order from how they are stored.

    inputfiles is not referenced directly in the body.  The command's
    standard output is redirected to outputfile.
    """
    # NOTE(review): runproc() is defined outside this block; the $name
    # placeholders presumably resolve against this function's variables
    # (filemapping, pairs, outputfile) -- confirm before renaming them.
    table_specs = []
    for tag in Options.SHORTTAG_SAMPLES:
        table_specs.append('%s:%s' % (tag, Paths.cds_read_count_table(tag)))
    filemapping = ' '.join(table_specs)

    pair_specs = []
    for group, (rpf_sample, polya_sample) in Options.RPF_PAIRS:
        pair_specs.append('%s:%s:%s' % (group, polya_sample, rpf_sample))
    pairs = ' '.join(pair_specs)

    runproc(
        """ $SUMMARIZE_RPF_COUNTS "$filemapping" "$pairs" > $outputfile""",
        outputfile)
def clip_refseq_enrichment_statistics(inputfiles, outputfile):
    """Run $STATS_CLIP_NRREFSEQ_ENRICHED and redirect its output to outputfile.

    The command template receives:

    * ``refsample`` -- the first entry of Options.CLIPCTL_SAMPLES.
    * ``clipsamples`` -- Options.CLIP_SAMPLES joined with commas.
    * ``inputfilesarg`` -- one ``sample:path`` item per entry of
      Options.ALLCLIP_SAMPLES, space-separated, where the path comes from
      Paths.genomespace_refseq_counts(sample).

    inputfiles is not referenced directly in the body.
    """
    # NOTE(review): runproc() is defined outside this block; the $name
    # placeholders presumably resolve against this function's variables
    # (inputfilesarg, refsample, clipsamples, outputfile) -- confirm before
    # renaming them.  $nr_refseq_db is not set here and must come from
    # whatever namespace runproc() consults.
    count_specs = []
    for name in Options.ALLCLIP_SAMPLES:
        count_specs.append(
            '%s:%s' % (name, Paths.genomespace_refseq_counts(name)))
    inputfilesarg = ' '.join(count_specs)

    # XXX fix this to support multiple controls
    refsample = Options.CLIPCTL_SAMPLES[0]
    clipsamples = ','.join(Options.CLIP_SAMPLES)

    # A backslash at the end of a line inside the literal joins it with the
    # next line, so the command reaches runproc() as a single line.
    runproc(""" $STATS_CLIP_NRREFSEQ_ENRICHED \
        $nr_refseq_db $refsample $clipsamples $inputfilesarg \
        > $outputfile""", outputfile)
def clip_refseq_enrichment_statistics(inputfiles, outputfile):
    """Run $STATS_CLIP_NRREFSEQ_ENRICHED and redirect its output to outputfile.

    The command template receives:

    * ``refsample`` -- the first entry of Options.CLIPCTL_SAMPLES.
    * ``clipsamples`` -- Options.CLIP_SAMPLES joined with commas.
    * ``inputfilesarg`` -- one ``sample:path`` item per entry of
      Options.ALLCLIP_SAMPLES, space-separated, where the path comes from
      Paths.genomespace_refseq_counts(sample).

    inputfiles is not referenced directly in the body.
    """
    # NOTE(review): runproc() is defined outside this block; the $name
    # placeholders presumably resolve against this function's variables
    # (inputfilesarg, refsample, clipsamples, outputfile) -- confirm before
    # renaming them.  $nr_refseq_db is not set here and must come from
    # whatever namespace runproc() consults.
    count_specs = []
    for name in Options.ALLCLIP_SAMPLES:
        count_specs.append(
            '%s:%s' % (name, Paths.genomespace_refseq_counts(name)))
    inputfilesarg = ' '.join(count_specs)

    # XXX fix this to support multiple controls
    refsample = Options.CLIPCTL_SAMPLES[0]
    clipsamples = ','.join(Options.CLIP_SAMPLES)

    # A backslash at the end of a line inside the literal joins it with the
    # next line, so the command reaches runproc() as a single line.
    runproc(""" $STATS_CLIP_NRREFSEQ_ENRICHED \
        $nr_refseq_db $refsample $clipsamples $inputfilesarg \
        > $outputfile""", outputfile)
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. # from ruffus import * import os from rnarry.nrclip import (Paths, Options, DerivedDatabaseBuilding, SequenceAnnotation, ContaminantFilter, SequenceProcessing) from rnarry.nrclip.PipelineControl import * @files([ Paths.genomespace_refseq_counts(sample) for sample in Options.ALLCLIP_SAMPLES ], Paths.clip_enrichment_summary) @follows(DerivedDatabaseBuilding.quantitate_refseq_in_gspace) def clip_refseq_enrichment_statistics(inputfiles, outputfile): inputfilesarg = ' '.join('%s:%s' % (smp, Paths.genomespace_refseq_counts(smp)) for smp in Options.ALLCLIP_SAMPLES) refsample = Options.CLIPCTL_SAMPLES[ 0] # XXX fix this to support multiple controls clipsamples = ','.join(Options.CLIP_SAMPLES) runproc( """ $STATS_CLIP_NRREFSEQ_ENRICHED \ $nr_refseq_db $refsample $clipsamples $inputfilesarg \
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

from ruffus import *
import os
from rnarry.nrclip import (
    Paths, Options, DerivedDatabaseBuilding, SequenceAnnotation,
    ContaminantFilter, SequenceProcessing)
from rnarry.nrclip.PipelineControl import *


@files([Paths.genomespace_refseq_counts(sample)
        for sample in Options.ALLCLIP_SAMPLES],
       Paths.clip_enrichment_summary)
@follows(DerivedDatabaseBuilding.quantitate_refseq_in_gspace)
def clip_refseq_enrichment_statistics(inputfiles, outputfile):
    """Run $STATS_CLIP_NRREFSEQ_ENRICHED and redirect its output to outputfile.

    The command template receives:

    * ``refsample`` -- the first entry of Options.CLIPCTL_SAMPLES.
    * ``clipsamples`` -- Options.CLIP_SAMPLES joined with commas.
    * ``inputfilesarg`` -- one ``sample:path`` item per entry of
      Options.ALLCLIP_SAMPLES, space-separated, where the path comes from
      Paths.genomespace_refseq_counts(sample).

    inputfiles is not referenced directly in the body.
    """
    # NOTE(review): runproc() is defined outside this block; the $name
    # placeholders presumably resolve against this function's variables
    # (inputfilesarg, refsample, clipsamples, outputfile) -- confirm before
    # renaming them.  $nr_refseq_db is not set here and must come from
    # whatever namespace runproc() consults.
    count_specs = []
    for name in Options.ALLCLIP_SAMPLES:
        count_specs.append(
            '%s:%s' % (name, Paths.genomespace_refseq_counts(name)))
    inputfilesarg = ' '.join(count_specs)

    # XXX fix this to support multiple controls
    refsample = Options.CLIPCTL_SAMPLES[0]
    clipsamples = ','.join(Options.CLIP_SAMPLES)

    # A backslash at the end of a line inside the literal joins it with the
    # next line, so the command reaches runproc() as a single line.
    runproc(""" $STATS_CLIP_NRREFSEQ_ENRICHED \
        $nr_refseq_db $refsample $clipsamples $inputfilesarg \
        > $outputfile""", outputfile)
from rnarry.nrclip.PipelineControl import * @files(for_each([Paths.nr_refseq_db, Paths.tspace_read_database], Paths.cds_read_count_table, Options.SHORTTAG_SAMPLES)) @follows(DataPreparation.build_nonredundant_refseq_database) @follows(TranscriptomeAnalysis.build_tspace_read_database) def count_cds_reads(inputfiles, outputfile, sample): nr_refseq_db, tspace = inputfiles runproc(""" $TSPACE_COUNT_CDS $nr_refseq_db $tspace > $outputfile""", outputfile) @files([Paths.cds_read_count_table(s) for s in Options.SHORTTAG_SAMPLES], Paths.rpf_summarized_table) @follows(count_cds_reads) def summarize_rpf_counts(inputfiles, outputfile): filemapping = ' '.join('%s:%s' % (s, Paths.cds_read_count_table(s)) for s in Options.SHORTTAG_SAMPLES) pairs = ' '.join('%s:%s:%s' % (grpname, polya, rpf) for grpname, (rpf, polya) in Options.RPF_PAIRS) runproc(""" $SUMMARIZE_RPF_COUNTS "$filemapping" "$pairs" > $outputfile""", outputfile) def tasks(): return [
@files(for_each(Paths.genomespace_read_database, Paths.genomespace_refseq_counts, Paths.ALL_SAMPLES)) @follows(build_genomespace_read_database) @follows(DataPreparation.build_nonredundant_refseq_database) def quantitate_refseq_in_gspace(inputfile, outputfile, sample): gspacedir = os.path.dirname(inputfile) runproc( '$COUNT_REFSEQ_IN_GSPACE $outputfile $nr_refseq_db $gspacedir', outputfile) @files([Paths.nr_refseq_list] + [Paths.genomespace_refseq_counts(sample) for sample in Paths.ALL_SAMPLES], Paths.genomespace_all_expressed_transcripts) @follows(quantitate_refseq_in_gspace) def make_list_of_expressed_transcripts(inputfiles, outputfile): inputlist = ' '.join(inputfiles) runproc(""" $ENV MINDEPTH=$GSPACE_STATS_MINIMUM_RAW_READS \ $REFSEQCNT_PICK_EXPRESSED $inputlist > $outputfile""", outputfile) @files([Paths.nr_refseq_db, Paths.genomespace_all_expressed_transcripts, Paths.genome_fasta, Paths.genomespace_read_database(Options.SNPREF_SAMPLE)], [Paths.reftranscriptome_sequences, Paths.reftranscriptome_cds_anno])
accession_pat = re.compile('ACC.*ID="([^"]*)";') with DeleteOnError(outputfile, gzip.open) as output: for line in urllib.urlopen(Paths.MIRBASE_URL): if line.startswith('#'): continue fields = line[:-1].split('\t') name = accession_pat.findall(fields[8])[0] print >> output, '\t'.join([ 'chr'+fields[0], str(int(fields[3])-1), fields[4], 'miRNA|%s|%s' % (name, name), '.', fields[6] ]) @files([(None, Paths.repeatmasker_original(chrom), chrom) for chrom in Options.UCSC_REPEATMASKER_CHROMOSOMES]) @jobs_limit(Options.MAX_PARALLEL_DOWNLOADING, 'download') def download_repeatmasker_catalogs(inputfile, outputfile, chrom): import urllib url = Paths.REPEATMASKER_URL % (Options.GENOME, chrom) print 'Downloading %s ...' % url urllib.urlretrieve(url, outputfile) @files([Paths.repeatmasker_original(chrom) for chrom in Options.UCSC_REPEATMASKER_CHROMOSOMES], Paths.repeatmasker_catalog) @follows(download_repeatmasker_catalogs) def merge_repeatmasker_catalogs(inputfiles, outputfile):
@files( for_each([Paths.nr_refseq_db, Paths.tspace_read_database], Paths.cds_read_count_table, Options.SHORTTAG_SAMPLES)) @follows(DataPreparation.build_nonredundant_refseq_database) @follows(TranscriptomeAnalysis.build_tspace_read_database) def count_cds_reads(inputfiles, outputfile, sample): nr_refseq_db, tspace = inputfiles runproc( """ $TSPACE_COUNT_CDS $nr_refseq_db $tspace > $outputfile""", outputfile) @files([Paths.cds_read_count_table(s) for s in Options.SHORTTAG_SAMPLES], Paths.rpf_summarized_table) @follows(count_cds_reads) def summarize_rpf_counts(inputfiles, outputfile): filemapping = ' '.join('%s:%s' % (s, Paths.cds_read_count_table(s)) for s in Options.SHORTTAG_SAMPLES) pairs = ' '.join('%s:%s:%s' % (grpname, polya, rpf) for grpname, (rpf, polya) in Options.RPF_PAIRS) runproc( """ $SUMMARIZE_RPF_COUNTS "$filemapping" "$pairs" > $outputfile""", outputfile) def tasks():