Ejemplo n.º 1
0
def summarize_rpf_counts(inputfiles, outputfile):
    """Launch the RPF count summarizer command for the short-tag samples.

    Two space-separated argument strings are assembled:

    * ``filemapping`` -- one ``sample:table`` token per entry of
      Options.SHORTTAG_SAMPLES, where the table path comes from
      Paths.cds_read_count_table(sample).
    * ``pairs`` -- one ``group:polya:rpf`` token per entry of
      Options.RPF_PAIRS.

    The command template and ``outputfile`` are then handed to runproc.
    ``inputfiles`` is not read here.

    NOTE(review): runproc presumably expands the ``$name`` placeholders
    from this function's local variables, so ``filemapping``, ``pairs``
    and ``outputfile`` must keep these names -- confirm in PipelineControl.
    """
    sample_tokens = []
    for s in Options.SHORTTAG_SAMPLES:
        sample_tokens.append('%s:%s' % (s, Paths.cds_read_count_table(s)))
    filemapping = ' '.join(sample_tokens)

    pair_tokens = []
    # Each pair is stored as (rpf, polya) but written polya-first.
    for grpname, (rpf, polya) in Options.RPF_PAIRS:
        pair_tokens.append('%s:%s:%s' % (grpname, polya, rpf))
    pairs = ' '.join(pair_tokens)

    command = """
        $SUMMARIZE_RPF_COUNTS "$filemapping" "$pairs" > $outputfile"""
    runproc(command, outputfile)
Ejemplo n.º 2
0
def summarize_rpf_counts(inputfiles, outputfile):
    """Summarize CDS read counts into RPF/poly(A) pairs via an external tool.

    Builds the ``sample:table`` mapping for every sample in
    Options.SHORTTAG_SAMPLES and the ``group:polya:rpf`` triples for every
    entry of Options.RPF_PAIRS, joins each with single spaces, and passes
    the command template together with ``outputfile`` to runproc.
    ``inputfiles`` is accepted but not used in the body.

    NOTE(review): the template refers to ``$filemapping``, ``$pairs`` and
    ``$outputfile``; runproc presumably resolves them from the caller's
    locals, so those names are load-bearing -- verify against runproc.
    """
    # One "sample:path" token per short-tag sample.
    table_tokens = ['%s:%s' % (s, Paths.cds_read_count_table(s))
                    for s in Options.SHORTTAG_SAMPLES]
    # Entries unpack as (group, (rpf, polya)); the token order is
    # group, polya, rpf.
    pair_tokens = ['%s:%s:%s' % (grpname, polya, rpf)
                   for grpname, (rpf, polya) in Options.RPF_PAIRS]

    filemapping = ' '.join(table_tokens)
    pairs = ' '.join(pair_tokens)

    runproc(
        """
        $SUMMARIZE_RPF_COUNTS "$filemapping" "$pairs" > $outputfile""",
        outputfile)
Ejemplo n.º 3
0
def clip_refseq_enrichment_statistics(inputfiles, outputfile):
    """Run the CLIP RefSeq enrichment statistics command.

    Prepares three values referenced by the command template:

    * ``inputfilesarg`` -- space-separated ``sample:counts_path`` tokens,
      one per sample in Options.ALLCLIP_SAMPLES.
    * ``refsample`` -- the first entry of Options.CLIPCTL_SAMPLES.
    * ``clipsamples`` -- Options.CLIP_SAMPLES joined with commas.

    The template and ``outputfile`` are passed to runproc. ``inputfiles``
    is not read; the count paths are recomputed from the sample names.

    NOTE(review): runproc presumably substitutes ``$name`` placeholders
    from this function's locals (``$nr_refseq_db`` is not a local here and
    must come from elsewhere) -- confirm in PipelineControl.
    """
    count_tokens = []
    for smp in Options.ALLCLIP_SAMPLES:
        count_tokens.append(
            '%s:%s' % (smp, Paths.genomespace_refseq_counts(smp)))
    inputfilesarg = ' '.join(count_tokens)

    # XXX fix this to support multiple controls
    refsample = Options.CLIPCTL_SAMPLES[0]
    clipsamples = ','.join(Options.CLIP_SAMPLES)

    command = """
        $STATS_CLIP_NRREFSEQ_ENRICHED \
            $nr_refseq_db $refsample $clipsamples $inputfilesarg \
            > $outputfile"""
    runproc(command, outputfile)
Ejemplo n.º 4
0
def clip_refseq_enrichment_statistics(inputfiles, outputfile):
    """Compute CLIP enrichment statistics over RefSeq counts.

    Assembles the ``sample:counts_path`` list for all samples in
    Options.ALLCLIP_SAMPLES, picks the first control sample from
    Options.CLIPCTL_SAMPLES, joins Options.CLIP_SAMPLES with commas, and
    forwards the command template plus ``outputfile`` to runproc.
    ``inputfiles`` is unused in the body.

    NOTE(review): ``inputfilesarg``, ``refsample``, ``clipsamples`` and
    ``outputfile`` appear as ``$name`` placeholders in the template;
    runproc presumably reads them from the caller's locals, so they must
    not be renamed -- verify against runproc.
    """
    count_tokens = [
        '%s:%s' % (smp, Paths.genomespace_refseq_counts(smp))
        for smp in Options.ALLCLIP_SAMPLES
    ]
    inputfilesarg = ' '.join(count_tokens)

    # XXX fix this to support multiple controls
    controls = Options.CLIPCTL_SAMPLES
    refsample = controls[0]

    clipsamples = ','.join(Options.CLIP_SAMPLES)

    runproc(
        """
        $STATS_CLIP_NRREFSEQ_ENRICHED \
            $nr_refseq_db $refsample $clipsamples $inputfilesarg \
            > $outputfile""",
        outputfile)
Ejemplo n.º 5
0
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

from ruffus import *
import os
from rnarry.nrclip import (Paths, Options, DerivedDatabaseBuilding,
                           SequenceAnnotation, ContaminantFilter,
                           SequenceProcessing)
from rnarry.nrclip.PipelineControl import *


@files([
    Paths.genomespace_refseq_counts(sample)
    for sample in Options.ALLCLIP_SAMPLES
], Paths.clip_enrichment_summary)
@follows(DerivedDatabaseBuilding.quantitate_refseq_in_gspace)
def clip_refseq_enrichment_statistics(inputfiles, outputfile):
    inputfilesarg = ' '.join('%s:%s' %
                             (smp, Paths.genomespace_refseq_counts(smp))
                             for smp in Options.ALLCLIP_SAMPLES)
    refsample = Options.CLIPCTL_SAMPLES[
        0]  # XXX fix this to support multiple controls
    clipsamples = ','.join(Options.CLIP_SAMPLES)

    runproc(
        """
        $STATS_CLIP_NRREFSEQ_ENRICHED \
            $nr_refseq_db $refsample $clipsamples $inputfilesarg \
Ejemplo n.º 6
0
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

from ruffus import *
import os
from rnarry.nrclip import (
    Paths, Options, DerivedDatabaseBuilding, SequenceAnnotation,
    ContaminantFilter, SequenceProcessing)
from rnarry.nrclip.PipelineControl import *


@files([Paths.genomespace_refseq_counts(sample)
        for sample in Options.ALLCLIP_SAMPLES],
       Paths.clip_enrichment_summary)
@follows(DerivedDatabaseBuilding.quantitate_refseq_in_gspace)
def clip_refseq_enrichment_statistics(inputfiles, outputfile):
    """Pipeline task: write the CLIP enrichment summary.

    Declared (via the decorators) with one RefSeq count file per sample
    in Options.ALLCLIP_SAMPLES as input and Paths.clip_enrichment_summary
    as output, following DerivedDatabaseBuilding.quantitate_refseq_in_gspace.

    The body rebuilds the ``sample:counts_path`` tokens from the sample
    names (``inputfiles`` itself is not read), takes the first control
    sample, comma-joins the CLIP samples, and passes the command template
    with ``outputfile`` to runproc.

    NOTE(review): runproc presumably fills the ``$name`` placeholders from
    this function's locals; keep ``inputfilesarg``, ``refsample`` and
    ``clipsamples`` named as they are -- confirm in PipelineControl.
    """
    def counts_token(smp):
        # "sample:path" pairing consumed by the statistics command.
        return '%s:%s' % (smp, Paths.genomespace_refseq_counts(smp))

    inputfilesarg = ' '.join(
        counts_token(smp) for smp in Options.ALLCLIP_SAMPLES)

    # XXX fix this to support multiple controls
    refsample = Options.CLIPCTL_SAMPLES[0]
    clipsamples = ','.join(Options.CLIP_SAMPLES)

    runproc(
        """
        $STATS_CLIP_NRREFSEQ_ENRICHED \
            $nr_refseq_db $refsample $clipsamples $inputfilesarg \
            > $outputfile""",
        outputfile)
Ejemplo n.º 7
0
from rnarry.nrclip.PipelineControl import *


@files(for_each([Paths.nr_refseq_db, Paths.tspace_read_database],
                Paths.cds_read_count_table,
                Options.SHORTTAG_SAMPLES))
@follows(DataPreparation.build_nonredundant_refseq_database)
@follows(TranscriptomeAnalysis.build_tspace_read_database)
def count_cds_reads(inputfiles, outputfile, sample):
    """Pipeline task: run the CDS read counting command for one sample.

    ``inputfiles`` must unpack into exactly two items, bound to
    ``nr_refseq_db`` and ``tspace``. The command template and
    ``outputfile`` are passed to runproc. ``sample`` is not used in the
    body.

    NOTE(review): runproc presumably expands ``$nr_refseq_db``, ``$tspace``
    and ``$outputfile`` from this function's local variables, which is why
    the unpacked names matter -- confirm in PipelineControl.
    """
    # The local names double as placeholder names in the template below.
    nr_refseq_db, tspace = inputfiles

    runproc("""
        $TSPACE_COUNT_CDS $nr_refseq_db $tspace > $outputfile""", outputfile)


@files([Paths.cds_read_count_table(s) for s in Options.SHORTTAG_SAMPLES],
       Paths.rpf_summarized_table)
@follows(count_cds_reads)
def summarize_rpf_counts(inputfiles, outputfile):
    """Pipeline task: produce the summarized RPF table.

    Declared with the CDS read count table of every sample in
    Options.SHORTTAG_SAMPLES as input and Paths.rpf_summarized_table as
    output, following count_cds_reads.

    The body formats ``sample:table`` tokens for each short-tag sample and
    ``group:polya:rpf`` tokens for each entry of Options.RPF_PAIRS, joins
    each set with spaces, and passes the command template with
    ``outputfile`` to runproc. ``inputfiles`` is not read.

    NOTE(review): runproc presumably resolves ``$filemapping`` and
    ``$pairs`` from this function's locals; do not rename them without
    checking PipelineControl.
    """
    def sample_token(name):
        return '%s:%s' % (name, Paths.cds_read_count_table(name))

    def pair_token(entry):
        # Entries are (group, (rpf, polya)); output order is polya, rpf.
        grpname, (rpf, polya) = entry
        return '%s:%s:%s' % (grpname, polya, rpf)

    filemapping = ' '.join(
        sample_token(name) for name in Options.SHORTTAG_SAMPLES)
    pairs = ' '.join(pair_token(entry) for entry in Options.RPF_PAIRS)

    runproc(
        """
        $SUMMARIZE_RPF_COUNTS "$filemapping" "$pairs" > $outputfile""",
        outputfile)


def tasks():
    return [
Ejemplo n.º 8
0

@files(for_each(Paths.genomespace_read_database,
                Paths.genomespace_refseq_counts,
                Paths.ALL_SAMPLES))
@follows(build_genomespace_read_database)
@follows(DataPreparation.build_nonredundant_refseq_database)
def quantitate_refseq_in_gspace(inputfile, outputfile, sample):
    """Pipeline task: run the RefSeq counting command for one sample.

    Takes the directory portion of ``inputfile`` as ``gspacedir`` and
    passes the command template with ``outputfile`` to runproc.
    ``sample`` is not used in the body.

    NOTE(review): runproc presumably expands ``$gspacedir`` and
    ``$outputfile`` from this function's locals (``$nr_refseq_db`` is not
    a local here) -- confirm in PipelineControl.
    """
    # Head component of the path, i.e. what os.path.dirname returns.
    gspacedir = os.path.split(inputfile)[0]

    command = '$COUNT_REFSEQ_IN_GSPACE $outputfile $nr_refseq_db $gspacedir'
    runproc(command, outputfile)


@files([Paths.nr_refseq_list] +
       [Paths.genomespace_refseq_counts(sample)
        for sample in Paths.ALL_SAMPLES],
       Paths.genomespace_all_expressed_transcripts)
@follows(quantitate_refseq_in_gspace)
def make_list_of_expressed_transcripts(inputfiles, outputfile):
    """Pipeline task: run the expressed-transcript picking command.

    Per the decorator, the inputs are Paths.nr_refseq_list followed by
    one RefSeq count file per sample in Paths.ALL_SAMPLES, and the output
    is Paths.genomespace_all_expressed_transcripts.

    All input paths are joined with single spaces into ``inputlist`` and
    the command template is passed to runproc together with
    ``outputfile``.

    NOTE(review): runproc presumably expands ``$inputlist`` and
    ``$outputfile`` from this function's locals and the upper-case
    placeholders from pipeline configuration -- confirm in
    PipelineControl.
    """
    # Referenced as $inputlist in the template; paths are not quoted there.
    inputlist = ' '.join(inputfiles)

    # The backslash-newline inside the literal is a Python line
    # continuation, so the command reaches runproc as one logical line.
    runproc("""
        $ENV MINDEPTH=$GSPACE_STATS_MINIMUM_RAW_READS \
        $REFSEQCNT_PICK_EXPRESSED $inputlist > $outputfile""", outputfile)


@files([Paths.nr_refseq_db, Paths.genomespace_all_expressed_transcripts,
        Paths.genome_fasta,
        Paths.genomespace_read_database(Options.SNPREF_SAMPLE)],
       [Paths.reftranscriptome_sequences, Paths.reftranscriptome_cds_anno])
Ejemplo n.º 9
0
    accession_pat = re.compile('ACC.*ID="([^"]*)";')

    with DeleteOnError(outputfile, gzip.open) as output:
        for line in urllib.urlopen(Paths.MIRBASE_URL):
            if line.startswith('#'):
                continue
            fields = line[:-1].split('\t')
            name = accession_pat.findall(fields[8])[0]
            print >> output, '\t'.join([
                'chr'+fields[0], str(int(fields[3])-1), fields[4],
                'miRNA|%s|%s' % (name, name), '.', fields[6]
            ])


@files([(None, Paths.repeatmasker_original(chrom), chrom)
        for chrom in Options.UCSC_REPEATMASKER_CHROMOSOMES])
@jobs_limit(Options.MAX_PARALLEL_DOWNLOADING, 'download')
def download_repeatmasker_catalogs(inputfile, outputfile, chrom):
    import urllib

    url = Paths.REPEATMASKER_URL % (Options.GENOME, chrom)
    print 'Downloading %s ...' % url
    urllib.urlretrieve(url, outputfile)


@files([Paths.repeatmasker_original(chrom)
        for chrom in Options.UCSC_REPEATMASKER_CHROMOSOMES],
       Paths.repeatmasker_catalog)
@follows(download_repeatmasker_catalogs)
def merge_repeatmasker_catalogs(inputfiles, outputfile):
Ejemplo n.º 10
0

@files(
    for_each([Paths.nr_refseq_db, Paths.tspace_read_database],
             Paths.cds_read_count_table, Options.SHORTTAG_SAMPLES))
@follows(DataPreparation.build_nonredundant_refseq_database)
@follows(TranscriptomeAnalysis.build_tspace_read_database)
def count_cds_reads(inputfiles, outputfile, sample):
    """Pipeline task: run the CDS read counting command for one sample.

    ``inputfiles`` is unpacked into exactly two values, ``nr_refseq_db``
    and ``tspace``; the command template and ``outputfile`` are then
    passed to runproc. ``sample`` is not referenced in the body.

    NOTE(review): the template's ``$nr_refseq_db``, ``$tspace`` and
    ``$outputfile`` are presumably resolved by runproc from this
    function's locals -- verify in PipelineControl before renaming.
    """
    # These local names are also the placeholder names used below.
    nr_refseq_db, tspace = inputfiles

    runproc(
        """
        $TSPACE_COUNT_CDS $nr_refseq_db $tspace > $outputfile""", outputfile)


@files([Paths.cds_read_count_table(s) for s in Options.SHORTTAG_SAMPLES],
       Paths.rpf_summarized_table)
@follows(count_cds_reads)
def summarize_rpf_counts(inputfiles, outputfile):
    """Pipeline task: run the RPF count summarizer.

    Inputs (per the decorator) are the CDS read count tables of all
    samples in Options.SHORTTAG_SAMPLES; the output is
    Paths.rpf_summarized_table. The task follows count_cds_reads.

    ``filemapping`` holds space-separated ``sample:table`` tokens and
    ``pairs`` holds space-separated ``group:polya:rpf`` tokens built from
    Options.RPF_PAIRS. Both are referenced by name in the command
    template passed to runproc along with ``outputfile``. ``inputfiles``
    is not read in the body.

    NOTE(review): runproc presumably takes ``$filemapping`` and ``$pairs``
    from this function's locals -- confirm in PipelineControl.
    """
    sample_fmt = '%s:%s'
    pair_fmt = '%s:%s:%s'

    filemapping = ' '.join(
        sample_fmt % (s, Paths.cds_read_count_table(s))
        for s in Options.SHORTTAG_SAMPLES)

    rendered_pairs = []
    for grpname, members in Options.RPF_PAIRS:
        # Stored as (rpf, polya); written out polya first.
        rpf, polya = members
        rendered_pairs.append(pair_fmt % (grpname, polya, rpf))
    pairs = ' '.join(rendered_pairs)

    command = """
        $SUMMARIZE_RPF_COUNTS "$filemapping" "$pairs" > $outputfile"""
    runproc(command, outputfile)


def tasks():