Example #1
0
    def __call__(self, track, slice=None):
        """Fetch TPM values for the configured genes of interest.

        Joins per-gene sailfish expression values with the sample
        factor annotations, keeps only the genes listed in the
        pipeline's ``genes_of_interest`` parameter, and returns the
        result indexed by factor.
        """

        tpm_sql = """
        SELECT TPM, gene_id, sample_name
        FROM sailfish_genes AS A
        JOIN samples AS B
        ON A.sample_id = B.id"""

        factor_sql = '''
        SELECT factor, factor_value, sample_name
        FROM samples AS A
        JOIN factors AS B
        ON A.id = B.sample_id
        WHERE factor != 'genome'
        '''

        expression = self.getDataFrame(tpm_sql)
        factors = self.getDataFrame(factor_sql)

        # one row per (gene, sample, factor) combination
        combined = pd.merge(expression, factors,
                            left_on="sample_name", right_on="sample_name")

        genes_of_interest = Pipeline.asList(Pipeline.peekParameters(
            ".", "pipeline_rnaseqqc.py")['genes_of_interest'])

        subset = combined[combined['gene_id'].isin(genes_of_interest)]
        subset['TPM'] = subset['TPM'].astype(float)

        return subset.reset_index().set_index("factor")
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.

    Chains the per-stage ``<part>.over.psl.gz`` files listed in the
    ``<track>_path`` parameter through ``pslMap`` and writes the
    gzipped combined map to `outfile`.

    Raises
    ------
    ValueError
        If a required intermediate psl file is missing.
    '''

    # NOTE(review): picked up by P.run() from the caller's locals —
    # do not rename or remove
    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            # fixed error message grammar ("not exist" -> "does not exist")
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            # first stage: just decompress the initial map
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            # later stages: project the running map through the next one
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
Example #3
0
def publish():
    '''publish files.

    Collects bigwig files for export, drops any file matching one of
    the regular expressions in the ``ucsc_exclude`` parameter, then
    publishes the report and a UCSC data hub.
    '''

    # directory, files
    export_files = {"bigwigfiles": glob.glob("*/*.bigwig")}

    if PARAMS['ucsc_exclude']:
        # hoisted: the exclusion patterns do not depend on the filetype
        exclude = P.asList(PARAMS['ucsc_exclude'])
        for filetype, files in export_files.items():
            # filter in place; a list comprehension keeps the glob
            # order (the previous set() round-trip returned files in
            # arbitrary order)
            export_files[filetype] = [
                f for f in files
                if not any(re.match(regex, f) for regex in exclude)]

    # publish web pages
    E.info("publishing report")
    P.publish_report(export_files=export_files)

    E.info("publishing UCSC data hub")
    P.publish_tracks(export_files)
    >  %(outfile)s
    '''

    P.run()


##########################################################################
##########################################################################
##########################################################################
# extracting alignments from maf files
##########################################################################
if "maf_dir" in PARAMS and "maf_tracks" in PARAMS:

    @files([(("%s/*.maf.gz" % PARAMS["maf_dir"]), "%sTo%s.raw.psl.gz" %
             (PARAMS["%s_label" % track], PARAMS["maf_master"]), track)
            for track in P.asList(PARAMS["maf_tracks"])])
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''build pairwise genomic aligment from maf files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        to_cluster = True

        for infile in infiles:

            E.info("adding %s" % infile)
Example #5
0
        "genesets_abinitio_coding": "pruned.gtf.gz",
        "genesets_abinitio_lncrna": "pruned.gtf.gz",
        "genesets_reference": "reference.gtf.gz",
        "genesets_refcoding": "refcoding.gtf.gz",
        "genesets_previous": ""
    })

PARAMS = P.PARAMS

# Merge in parameters exported by the annotations pipeline; the
# "annotations_" prefix keeps them from clashing with local keys.
# NOTE(review): update_interface presumably rewrites that pipeline's
# interface paths — confirm against P.peekParameters.
PARAMS.update(
    P.peekParameters(PARAMS["annotations_annotations_dir"],
                     "pipeline_annotations.py",
                     prefix="annotations_",
                     update_interface=True))

# previously built genesets to compare against (defaults to "", i.e. none)
PREVIOUS = P.asList(PARAMS["genesets_previous"])


def connect():
    '''connect to database.

    This method also attaches to helper databases.

    Returns
    -------
    sqlite3.Connection
        Open handle with the annotations database attached under the
        schema name ``annotations``.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()

    # fix: the handle was created but never returned, so callers could
    # not use the connection
    return dbh
Example #6
0
     "../pipeline.ini",
     "pipeline.ini"],
    defaults={
        'annotations_dir': "",
        'paired_end': False})

PARAMS = P.PARAMS

# Parameters of the associated annotations pipeline.
# NOTE(review): peekParameters appears to read that pipeline's
# configuration without running it — confirm against its docstring.
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# get options that are to be tested.
# Each recognised cufflinks option maps to the grid of values it will
# be tested over; unrecognised options are silently ignored.
cufflinks_options = {}
if "cufflinks_test_options" in PARAMS:
    options = P.asList(PARAMS["cufflinks_test_options"])
    # these three options share one common grid of fraction values
    _fraction_options = ("--pre-mrna-fraction",
                         "--small-anchor-fraction",
                         "--max-multiread-fraction")
    _value_grid = {
        "--min-isoform-fraction": [0.05, 0.1, 0.5, 1],
        "--junc-alpha": [0.001, 0.01, 0.1],
        "--min-frags-per-transfrag": [1, 5, 10],
        "--overhang-tolerance": [0, 2, 5, 8],
        "--overlap-radius": [50, 100, 200],
    }
    for option in options:
        if option in _fraction_options:
            cufflinks_options[option] = [0, 0.5, 0.75, 1]
        elif option in _value_grid:
            cufflinks_options[option] = _value_grid[option]
Example #7
0
              --fdr=%(edger_fdr)f"
              | grep -v "warnings"
              | gzip
              > %(outfile)s '''

    P.run()


@follows(aggregateTiledReadCounts,
         mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
# one job per (count table, design file) pair, writing
# diff_methylation/<data>_<design>.deseq.gz
@files([((data, design), "diff_methylation/%s_%s.deseq.gz" %
         (P.snip(os.path.basename(data),
                 ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table. It is slightly edited such that
    it contains a similar output and similar fdr compared to cuffdiff.

    Delegates the actual work to ``runDE`` with method ``"deseq"``.
    '''

    runDE(infiles, outfile, "deseq")


#########################################################################
#########################################################################
#########################################################################

Example #8
0
    P.run()

    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; 
                | gzip 
                > %(outfile)s'''
    P.run()


###################################################################
###################################################################
###################################################################


@files([(x, "%s_%s.output.gz" % (x[:-len(".features.gz")], y), y)
        for x, y in itertools.product(glob.glob("*.features.gz"),
                                      P.asList(PARAMS["polyphen_models"]))])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''

    to_cluster = False

    # need to run in chunks for large feature files
    statement = """gunzip 
        < %(infile)s
        | %(cmd-farm)s
            --split-at-lines=10000
            --output-header
        "perl %(polyphen_home)s/bin/run_weka_cpp.pl 
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           -p 
Example #9
0
    statement = '''find %(resultsdir)s -not -name "*.err.*" -exec cat {} \; > %(outfile)s'''
    P.run()


###################################################################
###################################################################
###################################################################
# do not run in parallel. run_weka.pl creates a $testfile
# that is not unique. run_weka.pl and pph2arff.pl could either
# be patched or the following jobs run in sequence.


@jobs_limit(1, "polyphen")
@files([(buildPolyphenFeatures, "polyphen_%s.output.gz" % x, x)
        for x in P.asList(PARAMS["polyphen_models"])])
def runPolyphen(infile, outfile, model):
    '''run POLYPHEN on feature tables to classify SNPs.
    '''
    # options
    # -f: feature set, default is F11
    # -c: classifier, default is NBd (Naive Bayes with discretization)
    # -l: model name, default is HumDiv

    statement = '''
    %(polyphen_home)s/bin/run_weka.pl
           -l %(polyphen_home)s/models/%(model)s.UniRef100.NBd.f11.model
           %(infile)s
    | gzip
    > %(outfile)s
    2> %(outfile)s.log