Example #1
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError("required file %s for %s (stage %i) does not exist." %
                             (filename, outfile, stage))

        if stage == 0:
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
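
For illustration, here is a minimal sketch of the shell pipeline this loop assembles, using an invented two-step path (the intermediate and output file names are hypothetical, not from the source):

# Sketch: reproduce the statement assembly for a hypothetical two-step path.
path = ["hg19ToMm10", "mm10ToRn5"]      # hypothetical intermediate map names
outfile = "hg19ToRn5.over.psl.gz"       # hypothetical output file

statement = []
for stage, part in enumerate(path):
    filename = part + ".over.psl.gz"
    if stage == 0:
        statement.append("gunzip < %(filename)s" % locals())
    else:
        statement.append("pslMap stdin <(gunzip < %(filename)s) stdout" % locals())
statement.append("gzip")
statement = " | ".join(statement) + " > %(outfile)s" % locals()

print(statement)
# gunzip < hg19ToMm10.over.psl.gz | pslMap stdin <(gunzip < mm10ToRn5.over.psl.gz) stdout | gzip > hg19ToRn5.over.psl.gz
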
def publish():
    '''publish files.'''

    # directory, files

    export_files = {"bigwigfiles": glob.glob("*/*.bigwig")}

    if PARAMS['ucsc_exclude']:
        for filetype, files in export_files.items():
            new_files = set(files)
            for f in files:
                for regex in P.asList(PARAMS['ucsc_exclude']):
                    if re.match(regex, f):
                        new_files.remove(f)
                        break

            export_files[filetype] = list(new_files)

    # publish web pages
    E.info("publishing report")
    P.publish_report(export_files=export_files)

    E.info("publishing UCSC data hub")
    P.publish_tracks(export_files)
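
A minimal sketch of the exclusion logic above, with an invented ucsc_exclude list and file names (neither comes from the source). Note that re.match() anchors at the start of the path, so a pattern has to cover any leading directory:

import re

# Hypothetical exclusion patterns and export file list, for illustration only.
ucsc_exclude = [".*input.*"]
files = ["tracks/input.bigwig", "tracks/liver.bigwig"]

new_files = set(files)
for f in files:
    for regex in ucsc_exclude:
        if re.match(regex, f):
            new_files.remove(f)
            break

print(sorted(new_files))   # ['tracks/liver.bigwig']
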
Example #3
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; 
                | gzip 
                > %(outfile)s'''
    P.run()


###################################################################
###################################################################
###################################################################


@files([(x, "%s_%s.output.gz" % (x[:-len(".features.gz")], y), y)
        for x, y in itertools.product(glob.glob("*.features.gz"),
                                      P.asList(PARAMS["polyphen_models"]))])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''

    to_cluster = False

    # need to run in chunks for large feature files
    statement = """gunzip 
        < %(infile)s
        | %(cmd-farm)s
            --split-at-lines=10000
            --output-header
        "perl %(polyphen_home)s/bin/run_weka_cpp.pl 
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           -p 
Example #4
              --fdr=%(edger_fdr)f"
              | grep -v "warnings"
              | gzip
              > %(outfile)s '''

    P.run()


@follows(aggregateTiledReadCounts,
         mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
@files([((data, design), "diff_methylation/%s_%s.deseq.gz" %
         (P.snip(os.path.basename(data),
                 ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table. It is edited slightly so that its
    columns and FDR values are comparable to cuffdiff output.
    '''

    runDE(infiles, outfile, "deseq")
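
The @files decorator above pre-computes one task per combination of count table and design file. A minimal sketch of how that task list expands, with invented file names; P.snip() is replaced by a local helper under the assumption that it simply strips a known suffix:

import itertools
import os

def snip(s, suffix):
    # Assumed behaviour of P.snip(): remove a known suffix.
    assert s.endswith(suffix)
    return s[:-len(suffix)]

# Hypothetical inputs; the real lists come from glob.glob() and PARAMS.
counts = ["diff_methylation/liver.counts.tsv.gz",
          "diff_methylation/kidney.counts.tsv.gz"]
designs = ["design1.tsv"]

tasks = [((data, design), "diff_methylation/%s_%s.deseq.gz" %
          (snip(os.path.basename(data), ".counts.tsv.gz"),
           snip(os.path.basename(design), ".tsv")))
         for data, design in itertools.product(counts, designs)]

for infiles, outfile in tasks:
    print(infiles, "->", outfile)
# ('diff_methylation/liver.counts.tsv.gz', 'design1.tsv') -> diff_methylation/liver_design1.deseq.gz
# ('diff_methylation/kidney.counts.tsv.gz', 'design1.tsv') -> diff_methylation/kidney_design1.deseq.gz
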


#########################################################################
#########################################################################
#########################################################################

Example #5
    >  %(outfile)s
    '''

    P.run()


##########################################################################
##########################################################################
##########################################################################
# extracting alignments from maf files
##########################################################################
if "maf_dir" in PARAMS and "maf_tracks" in PARAMS:

    @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" %
             (PARAMS["%s_label" % track], PARAMS["maf_master"]), track)
            for track in P.asList(PARAMS["maf_tracks"])])
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''build pairwise genomic alignment from MAF files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        to_cluster = True

        for infile in infiles:

            E.info("adding %s" % infile)
Example #6
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s'''
    P.run()

###################################################################
###################################################################
###################################################################
# do not run in parallel. run_weka.pl creates a $testfile
# that is not unique. run_weka.pl and pph2arff.pl could either
# be patched or the following jobs run in sequence.


@jobs_limit(1, "polyphen")
@files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x)
        for x in P.asList(PARAMS["polyphen_models"])])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''
    # options
    # -f: feature set, default is F11
    # -c: classifier, default is NBd (Naive Bayes with discretization)
    # -l: model name, default is HumDiv

    statement = '''
    %(polyphen_home)s/bin/run_weka.pl
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           %(infile)s
    | gzip
    > %(outfile)s
    2> %(outfile)s.log
Example #7
        "genesets_abinitio_coding": "pruned.gtf.gz",
        "genesets_abinitio_lncrna": "pruned.gtf.gz",
        "genesets_reference": "reference.gtf.gz",
        "genesets_refcoding": "refcoding.gtf.gz",
        "genesets_previous": ""
    })

PARAMS = P.PARAMS

PARAMS.update(
    P.peekParameters(PARAMS["annotations_annotations_dir"],
                     "pipeline_annotations.py",
                     prefix="annotations_",
                     update_interface=True))

PREVIOUS = P.asList(PARAMS["genesets_previous"])
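
P.peekParameters() above imports the configuration of pipeline_annotations.py and exposes it under the annotations_ prefix. A hypothetical sketch of the prefixing effect (parameter names and values are invented for illustration):

# Hypothetical illustration of the prefixing: parameters read from the
# other pipeline become visible as "annotations_<name>" in PARAMS.
peeked = {"database": "csvdb", "interface_geneset_gtf": "geneset.gtf.gz"}
PARAMS = {}
PARAMS.update({"annotations_%s" % key: value for key, value in peeked.items()})

print(PARAMS["annotations_database"])   # csvdb
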


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()

    return dbh
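
A short usage sketch for connect(), assuming it returns the connection handle as above; the table queried in the attached annotations database is a hypothetical name:

# Usage sketch: query the attached "annotations" database through the
# same connection. "gene_info" is a hypothetical table name.
dbh = connect()
cc = dbh.cursor()
cc.execute("SELECT COUNT(*) FROM annotations.gene_info")
print(cc.fetchone()[0])
cc.close()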