Code example #1
def plotDETagStats(infiles, outfile):
    '''plot differential expression stats'''

    infile, composition_file = infiles
    Expression.plotDETagStats(
        infile, outfile,
        additional_file=composition_file,
        join_columns=("contig", "start", "end"),
        additional_columns=("CpG_density",
                            "length"))

    P.touch(outfile)
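
When an additional_file is supplied, Expression.plotDETagStats joins it to the differential-expression table on the window coordinates before plotting the extra columns. A minimal pandas-style sketch of that join, with hypothetical table contents (the actual implementation lives in the Expression module):

import pandas as pd

de = pd.DataFrame({"contig": ["chr1"], "start": [0], "end": [100],
                   "l2fold": [1.5]})
comp = pd.DataFrame({"contig": ["chr1"], "start": [0], "end": [100],
                     "CpG_density": [0.8], "length": [100]})
# match rows on the window coordinates, carrying over the extra columns
merged = de.merge(comp, on=["contig", "start", "end"])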
Code example #2
def plotRNASEQTagData(infiles, outfile):
    '''plot tag count statistics for an RNA-seq experiment.'''

    design_file = infiles[0]
    geneset_file = infiles[1]
    bamfiles = infiles[2]

    # IMS: now running on feature counts
    infile = os.path.join(
        "feature_counts.dir",
        P.snip(geneset_file, ".gtf.gz") + ".feature_counts.tsv.gz")
    Expression.plotTagStats(infile, design_file, outfile)

    P.touch(outfile)
Code example #3
def loadCuffdiff(infile, outfile):
    '''load results from differential expression analysis and produce
    summary plots.

    Note: converts from ln(fold change) to log2 fold change.

    The cuffdiff output is parsed.

    Pairwise comparisons in which one gene is not expressed
    (fpkm < fpkm_silent) are set to status 'NOCALL'. These transcripts
    might nevertheless be significant.
    '''

    Expression.loadCuffdiff(infile, outfile)
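
The ln-to-log2 conversion mentioned in the docstring is just a division by ln(2); a minimal sketch:

import math

def ln_to_log2(ln_fold_change):
    # log2(x) = ln(x) / ln(2)
    return ln_fold_change / math.log(2)

assert abs(ln_to_log2(math.log(8)) - 3.0) < 1e-9  # an 8-fold change is log2 == 3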
Code example #4
File: PipelineWindows.py Project: jmadzo/cgat
def outputRegionsOfInterest(infiles, outfile,
                            max_per_sample=10, sum_per_group=40):
    '''output windows according to various filters.

    The output is a mock analysis similar to a differential expression
    result.

    '''
    job_options = "-l mem_free=64G"

    design_file, counts_file = infiles

    design = Expression.readDesignFile(design_file)

    # remove tracks not included in the design
    design = dict([(x, y) for x, y in design.items() if y.include])

    # define the two groups
    groups = sorted(set([x.group for x in design.values()]))

    # build a filtering statement: select windows where one group is
    # uniformly low (every sample below max_per_sample) while the other
    # group is high in aggregate (total above sum_per_group)
    groupA, groupB = groups
    upper_levelA = "max( (%s) ) < %f" % (
        ",".join(
            ["int(r['%s'])" % x for x, y in design.items() if y.group == groupA]),
        max_per_sample)

    upper_levelB = "max( (%s) ) < %f" % (
        ",".join(
            ["int(r['%s'])" % x for x, y in design.items() if y.group == groupB]),
        max_per_sample)

    sum_levelA = "sum( (%s) ) > %f" % (
        ",".join(
            ["int(r['%s'])" % x for x, y in design.items() if y.group == groupA]),
        sum_per_group)

    sum_levelB = "sum( (%s) ) > %f" % (
        ",".join(
            ["int(r['%s'])" % x for x, y in design.items() if y.group == groupB]),
        sum_per_group)

    statement = '''
    zcat %(counts_file)s
    | python %(scriptsdir)s/csv_select.py
            --log=%(outfile)s.log
            "(%(upper_levelA)s and %(sum_levelB)s) or (%(upper_levelB)s and %(sum_levelA)s)"
    | python %(scriptsdir)s/runExpression.py
            --log=%(outfile)s.log          
            --filename-design=%(design_file)s
            --filename-tags=-
            --method=mock
            --filter-min-counts-per-sample=0 
    | gzip 
    > %(outfile)s
    '''

    P.run()
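
The strings assembled above are Python expressions that csv_select.py evaluates once per row, with the row available as the dictionary r. A minimal sketch of one generated condition and how it evaluates (sample names and counts are hypothetical):

r = {"treated-R1": "5", "treated-R2": "3",
     "control-R1": "60", "control-R2": "25"}

# group A uniformly low, group B high in aggregate
upper_levelA = "max( (int(r['treated-R1']),int(r['treated-R2'])) ) < 10.000000"
sum_levelB = "sum( (int(r['control-R1']),int(r['control-R2'])) ) > 40.000000"

print(eval("(%s) and (%s)" % (upper_levelA, sum_levelB)))  # True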
Code example #5
def runMEDIPSDMR(design_file, outfile):
    '''run differential methylation analysis using MEDIPS package.

    Arguments
    ---------
    design_file : string
        Filename with experimental design in :term:`tsv` format.
    outfile : string
        Output filename in :term:`tsv` format.
    '''

    job_memory = "30G"

    design = Expression.readDesignFile(design_file)

    # remove data tracks not needed
    design = [(x, y) for x, y in design.items() if y.include]

    # build groups
    groups = set([y.group for x, y in design])

    statements = []
    for pair1, pair2 in itertools.combinations(groups, 2):
        treatment = ["%s.bam" % x for x, y in design if y.group == pair1]
        control = ["%s.bam" % x for x, y in design if y.group == pair2]

        treatment = ",".join(treatment)
        control = ",".join(control)
        # outfile contains directory prefix
        statements.append(
            """python %(scriptsdir)s/runMEDIPS.py
            --ucsc-genome=%(medips_genome)s
            --treatment=%(treatment)s
            --control=%(control)s
            --toolset=dmr
            --shift=%(medips_shift)s
            --extend=%(medips_extension)s
            --window-size=%(medips_window_size)i
            --output-filename-pattern="%(outfile)s_%(pair1)s_vs_%(pair2)s_%%s"
            --fdr-threshold=%(medips_fdr)f
            --log=%(outfile)s.log
            > %(outfile)s.log2;
            checkpoint;
            zcat %(outfile)s_%(pair1)s_vs_%(pair2)s_data.tsv.gz
            | python %(scriptsdir)s/runMEDIPS.py
            --treatment=%(pair1)s
            --control=%(pair2)s
            --toolset=convert
            --fdr-threshold=%(medips_fdr)f
            --log=%(outfile)s.log
            | gzip
            > %(outfile)s
            """)

    P.run()
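
The DMR runs above are generated for every pair of groups with itertools.combinations; a short sketch with hypothetical group names:

import itertools

groups = set(["wildtype", "knockout", "rescue"])
for pair1, pair2 in itertools.combinations(sorted(groups), 2):
    print("%s vs %s" % (pair1, pair2))
# knockout vs rescue
# knockout vs wildtype
# rescue vs wildtype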
Code example #6
def runCuffdiff( infiles, outfile ):
    '''perform differential expression analysis using cuffdiff.'''
    
    design_file = infiles[0]
    geneset_file = infiles[1]
    bamfiles = infiles[2]

    if PARAMS["cuffdiff_include_mask"]:
        mask_file = os.path.abspath( "geneset_mask.gtf" )
    else:
        mask_file = None

    options = PARAMS["cuffdiff_options"] + " --library-type %s" % PARAMS["cufflinks_library_type"]

    Expression.runCuffdiff( bamfiles, 
                            design_file,
                            geneset_file,
                            outfile,
                            threads = PARAMS.get("cuffdiff_threads",4),
                            cuffdiff_options = options,
                            fdr = PARAMS["cuffdiff_fdr"],
                            mask_file = mask_file )
Code example #7
def plotDETagStats(infile, composition_file, outfile):
    '''plot differential expression statistics

    Arguments
    ---------
    infile : string
        Filename with :term:`tsv` formatted list of differential
        methylation results output from :doc:`scripts/runExpression`.
    composition_file : string
        Filename with :term:`tsv` formatted data about nucleotide
        compositions of windows tested.
    outfile : string
        Output filename, used as sentinel only.
    '''

    Expression.plotDETagStats(
        infile, outfile,
        additional_file=composition_file,
        join_columns=("contig", "start", "end"),
        additional_columns=("CpG_density",
                            "length"))

    P.touch(outfile)
Code example #8
def runCuffdiff(infiles, outfile):
    '''perform differential expression analysis using cuffdiff.'''

    design_file = infiles[0]
    geneset_file = infiles[1]
    bamfiles = infiles[2]

    if PARAMS["cuffdiff_include_mask"]:
        mask_file = os.path.abspath("geneset_mask.gtf")
    else:
        mask_file = None

    options = PARAMS["cuffdiff_options"] + \
        " --library-type %s" % PARAMS["cufflinks_library_type"]

    Expression.runCuffdiff(bamfiles,
                           design_file,
                           geneset_file,
                           outfile,
                           threads=PARAMS.get("cuffdiff_threads", 4),
                           cuffdiff_options=options,
                           fdr=PARAMS["cuffdiff_fdr"],
                           mask_file=mask_file)
Code example #9
def buildProbeset2Gene(infile, outfile):
    """build map relating a probeset to an ENSEMBL gene_id"""
    Expression.buildProbeset2Gene(infile, outfile)
Code example #10
File: runExpression.py Project: Q-KIM/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-o", "--outfile", dest="output_filename", type="string",
                      help="output filename [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "deseq", "edger", "deseq2",
                          "ttest",
                          "mock", "summary",
                          "dump", "spike",
                          "plottagstats",
                          "plotdetagstats"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type", type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option(
        "--edger-dispersion",
        dest="edger_dispersion", type="float",
        help="dispersion value for edgeR if there are no replicates "
        "[default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-p", "--pseudocounts", dest="pseudo_counts",
                      type="float",
                      help="pseudocounts to add for mock analyis "
                      "[default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=$default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")
    parser.add_option("--deseq2-design-formula",
                      dest="model",
                      type="string",
                      help="Design formula for DESeq2")
    parser.add_option("--deseq2-contrasts",
                      dest="contrasts",
                      type="string",
                      help=("contrasts for post-hoc testing writen"
                            " variable:control:treatment,..."))

    parser.set_defaults(
        input_filename_tags=None,
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=1,
        filter_min_counts_per_sample=10,
        filter_percentile_rowsums=0,
        pseudo_counts=0,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if options.input_filename_tags == "-":
        fh = tempfile.NamedTemporaryFile(delete=False)
        fh.write("".join([x for x in options.stdin]))
        fh.close()
        options.input_filename_tags = fh.name
    else:
        fh = None

    # load tag data and filter
    if options.method in ("deseq2", "deseq", "edger", "mock", "ttest"):
        assert options.input_filename_tags and os.path.exists(
            options.input_filename_tags)
        assert options.input_filename_design and os.path.exists(
            options.input_filename_design)

        Expression.loadTagData(options.input_filename_tags,
                               options.input_filename_design)

        nobservations, nsamples = Expression.filterTagData(
            filter_min_counts_per_row=options.filter_min_counts_per_row,
            filter_min_counts_per_sample=options.filter_min_counts_per_sample,
            filter_percentile_rowsums=options.filter_percentile_rowsums)

        if nobservations == 0:
            E.warn("no observations - no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        sample_names = R('''colnames(countsTable)''')
        E.info("%i samples to test at %i observations: %s" %
               (nsamples, nobservations,
                ",".join(sample_names)))

    try:
        if options.method == "deseq2":
            Expression.runDESeq2(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr,
                ref_group=options.ref_group,
                model=options.model,
                contrasts=options.contrasts,
            )

        elif options.method == "deseq":
            Expression.runDESeq(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr,
                dispersion_method=options.deseq_dispersion_method,
                fit_type=options.deseq_fit_type,
                sharing_mode=options.deseq_sharing_mode,
                ref_group=options.ref_group,
            )

        elif options.method == "edger":
            Expression.runEdgeR(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr,
                ref_group=options.ref_group,
                dispersion=options.edger_dispersion)

        elif options.method == "mock":
            Expression.runMockAnalysis(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                ref_group=options.ref_group,
                pseudo_counts=options.pseudo_counts,
            )

        elif options.method == "summary":
            Expression.outputTagSummary(
                options.input_filename_tags,
                options.stdout,
                options.output_filename_pattern,
                filename_design=options.input_filename_design
            )

        elif options.method == "dump":
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            Expression.dumpTagData(options.input_filename_tags,
                                   options.input_filename_design,
                                   outfile=options.stdout)

        elif options.method == "plottagstats":
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            Expression.plotTagStats(
                options.input_filename_tags,
                options.input_filename_design,
                outfile_prefix=options.output_filename_pattern)

        elif options.method == "plotdetagstats":
            assert options.input_filename_result and os.path.exists(
                options.input_filename_result)
            Expression.plotDETagStats(
                options.input_filename_result,
                outfile_prefix=options.output_filename_pattern)

        elif options.method == "spike":
            Expression.outputSpikeIns(
                options.input_filename_tags,
                options.stdout,
                options.output_filename_pattern,
                filename_design=options.input_filename_design,
                foldchange_max=options.spike_foldchange_max,
                expression_max=options.spike_expression_max,
                max_counts_per_bin=options.spike_max_counts_per_bin,
                expression_bin_width=options.spike_expression_bin_width,
                foldchange_bin_width=options.spike_foldchange_bin_width,
            )

        elif options.method == "ttest":
            Expression.runTTest(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr)

    except rpy2.rinterface.RRuntimeError:
        if options.save_r_environment:
            E.info("saving R image to %s" % options.save_r_environment)
            R['save.image'](options.save_r_environment)
        raise

    if fh and os.path.exists(fh.name):
        os.unlink(fh.name)

    if options.save_r_environment:
        R['save.image'](options.save_r_environment)

    E.Stop()
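
Assuming runExpression.py is importable as a module and that E.Start hands argv to the parser in the usual optparse fashion, the script above can be driven programmatically; file names here are hypothetical:

import runExpression

runExpression.main(argv=[
    "runExpression.py",
    "--method=deseq",
    "--tags-tsv-file=counts.tsv.gz",
    "--design-tsv-file=design.tsv",
    "--fdr=0.05",
])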
Code example #11
File: runMEDIPS.py Project: Q-KIM/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend", dest="extension", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extension=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return
Code example #12
def plotDETagStats(infile, outfile):
    '''plot differential expression stats'''
    Expression.plotDETagStats(infile, outfile)
    P.touch(outfile)
Code example #13
def outputRegionsOfInterest(design_file,
                            counts_file,
                            outfile,
                            max_per_sample=10,
                            sum_per_group=40):
    '''output windows according to various filters.

    The output is a mock analysis similar to a differential expression
    result.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per windows
    outfile : string
       Output filename in :term:`tsv` format
    max_per_sample : int
       Keep only windows in which every sample of one group is below
       this count.
    sum_per_group : int
       Keep only windows in which the other group has at least this
       many counts in total.

    '''
    job_memory = "64G"

    design = Expression.readDesignFile(design_file)

    # remove tracks not included in the design
    design = dict([(x, y) for x, y in list(design.items()) if y.include])
    # define the two groups
    groups = sorted(set([x.group for x in list(design.values())]))

    # build a filtering statement
    groupA, groupB = groups

    def _buildMax(g, threshold):

        selected = [x for x, y in list(design.items()) if y.group == g]
        if len(selected) > 1:
            return "max((%s)) < %f" % (",".join(
                ["int(r['%s'])" % x for x in selected]), threshold)
        elif len(selected) == 1:
            return "int(r['%s']) < %f" % (selected[0], threshold)
        else:
            raise ValueError("no groups found for 'g'" % g)

    def _buildSum(g, threshold):

        selected = [x for x, y in list(design.items()) if y.group == g]
        if len(selected) > 1:
            return "sum((%s)) > %f" % (",".join(
                ["int(r['%s'])" % x for x in selected]), threshold)
        elif len(selected) == 1:
            return "int(r['%s']) > %f" % (selected[0], threshold)
        else:
            raise ValueError("no groups found for 'g'" % g)

    upper_levelA = _buildMax(groupA, max_per_sample)
    upper_levelB = _buildMax(groupB, max_per_sample)
    sum_levelA = _buildSum(groupA, sum_per_group)
    sum_levelB = _buildSum(groupB, sum_per_group)

    statement = '''
    zcat %(counts_file)s
    | cgat csv_select
            --log=%(outfile)s.log
            "(%(upper_levelA)s and %(sum_levelB)s) or
             (%(upper_levelB)s and %(sum_levelA)s)"
    | cgat runExpression
            --log=%(outfile)s.log
            --design-tsv-file=%(design_file)s
            --tags-tsv-file=-
            --method=mock
            --filter-min-counts-per-sample=0
    | gzip
    > %(outfile)s
    '''

    P.run(statement)
Code example #14
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("filter", "spike", "normalize"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--spike-change-bin-min",
                      dest="min_cbin",
                      type="float",
                      help="minimum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-max",
                      dest="max_cbin",
                      type="float",
                      help="maximum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-width",
                      dest="width_cbin",
                      type="float",
                      help="bin width for change bins [default=%default].")

    parser.add_option("--spike-initial-bin-min",
                      dest="min_ibin",
                      type="float",
                      help="minimum bin for initial bins[default=%default].")

    parser.add_option("--spike-initial-bin-max",
                      dest="max_ibin",
                      type="float",
                      help="maximum bin for intitial bins[default=%default].")

    parser.add_option("--spike-initial-bin-width",
                      dest="width_ibin",
                      type="float",
                      help="bin width intitial bins[default=%default].")

    parser.add_option(
        "--spike-minimum",
        dest="min_spike",
        type="int",
        help="minimum number of spike-ins required within each bin\
                      [default=%default].")

    parser.add_option(
        "--spike-maximum",
        dest="max_spike",
        type="int",
        help="maximum number of spike-ins allowed within each bin\
                      [default=%default].")

    parser.add_option("--spike-difference-method",
                      dest="difference",
                      type="choice",
                      choices=("relative", "logfold", "abs_logfold"),
                      help="method to use for calculating difference\
                      [default=%default].")

    parser.add_option("--spike-iterations",
                      dest="iterations",
                      type="int",
                      help="number of iterations to generate spike-ins\
                      [default=%default].")

    parser.add_option("--spike-cluster-maximum-distance",
                      dest="cluster_max_distance",
                      type="int",
                      help="maximum distance between adjacent loci in cluster\
                      [default=%default].")

    parser.add_option("--spike-cluster-minimum-size",
                      dest="cluster_min_size",
                      type="int",
                      help="minimum number of loci required per cluster\
                      [default=%default].")

    parser.add_option("--spike-type",
                      dest="spike_type",
                      type="choice",
                      choices=("row", "cluster"),
                      help="spike in type [default=%default].")

    parser.add_option("--spike-subcluster-min-size",
                      dest="min_sbin",
                      type="int",
                      help="minimum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-max-size",
                      dest="max_sbin",
                      type="int",
                      help="maximum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-bin-width",
                      dest="width_sbin",
                      type="int",
                      help="bin width for subcluster size\
                      [default=%default].")

    parser.add_option("--spike-output-method",
                      dest="output_method",
                      type="choice",
                      choices=("append", "seperate"),
                      help="defines whether the spike-ins should be appended\
                      to the original table or seperately [default=%default].")

    parser.add_option("--spike-shuffle-column-suffix",
                      dest="shuffle_suffix",
                      type="string",
                      help="the suffix of the columns which are to be shuffled\
                      [default=%default].")

    parser.add_option("--spike-keep-column-suffix",
                      dest="keep_suffix",
                      type="string",
                      help="a list of suffixes for the columns which are to be\
                      keep along with the shuffled columns[default=%default].")

    parser.add_option("--normalization-method",
                      dest="normalization_method",
                      type="choice",
                      choices=("deseq-size-factors", "total-count",
                               "total-column", "total-row"),
                      help="normalization method to apply [%default]")

    parser.add_option("-t",
                      "--tags-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.set_defaults(input_filename_tags="-",
                        method="filter",
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        output_method="seperate",
                        difference="logfold",
                        spike_type="row",
                        min_cbin=0,
                        max_cbin=100,
                        width_cbin=100,
                        min_ibin=0,
                        max_ibin=100,
                        width_ibin=100,
                        max_spike=100,
                        min_spike=None,
                        iterations=1,
                        cluster_max_distance=100,
                        cluster_min_size=10,
                        min_sbin=1,
                        max_sbin=1,
                        width_sbin=1,
                        shuffle_suffix=None,
                        keep_suffix=None,
                        normalization_method="deseq-size-factors")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # load
    if options.keep_suffix:
        # if using suffix, loadTagDataPandas will throw an error as it
        # looks for column names which exactly match the design
        # "tracks" need to write function in Counts.py to handle
        # counts table and design table + suffix
        counts = pd.read_csv(options.stdin, sep="\t", comment="#")
        inf = IOTools.openFile(options.input_filename_design)
        design = pd.read_csv(inf, sep="\t", index_col=0)
        inf.close()
        design = design[design["include"] != 0]

        if options.method in ("filter", "spike"):
            if options.input_filename_design is None:
                raise ValueError("method '%s' requires a design file" %
                                 options.method)
    else:
        # create Counts object
        # TS if spike type is cluster, need to keep "contig" and "position"
        # columns out of index
        if options.spike_type == "cluster":
            index = None,
        else:
            index = 0
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(options.stdin,
                                       sep="\t",
                                       index_col=index,
                                       comment="#"))
        else:
            counts = Counts.Counts(
                pd.io.parsers.read_csv(
                    IOTools.openFile(options.input_filename_tags, "r"),
                    sep="\t",
                    index_col=index,
                    comment="#"))

        # TS normalization doesn't require a design table
        if not options.method == "normalize":

            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            # create Design object
            design = Expression.ExperimentalDesign(
                pd.read_csv(IOTools.openFile(options.input_filename_design,
                                             "r"),
                            sep="\t",
                            index_col=0,
                            comment="#"))

    if options.method == "filter":

        assert (options.filter_min_counts_per_sample is not None or
                options.filter_min_counts_per_row is not None or
                options.filter_percentile_rowsums is not None), \
            "no filtering parameters have been suplied"

        # filter
        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        nobservations, nsamples = counts.table.shape

        if nobservations == 0:
            E.warn("no observations remaining after filtering- no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "normalize":

        counts.normalise(method=options.normalization_method,
                         row_title="total")

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "spike":
        # check parameters are sensible and set parameters where they
        # are not explicitly set
        if not options.min_spike:
            E.info("setting minimum number of spikes per bin to equal "
                   "maximum number of spikes per bin (%s)" % options.max_spike)
            options.min_spike = options.max_spike

        if options.spike_type == "cluster":

            assert options.max_sbin <= options.cluster_min_size, \
                ("max size of subcluster: %s is greater than min size of "
                 "cluster: %s" % (options.max_sbin, options.cluster_min_size))

            counts_columns = set(counts.table.columns.values.tolist())

            assert ("contig" in counts_columns and
                    "position" in counts_columns), \
                ("cluster analysis requires columns named 'contig' and"
                 "'position' in the dataframe")

            counts.sort(sort_columns=["contig", "position"], reset_index=True)

        # restrict design table to first pair only

        design.firstPairOnly()

        # get dictionaries to map group members to column names
        # use different methods depending on whether suffixes are supplied
        if options.keep_suffix:
            g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix(
                options.shuffle_suffix, options.keep_suffix)
        else:
            # if no suffixes supplied, spike and keep tracks are the same
            g_to_track = design.getGroups2Samples()
            g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track)

        # set up numpy arrays for change and initial values
        change_bins = np.arange(options.min_cbin, options.max_cbin,
                                options.width_cbin)
        initial_bins = np.arange(options.min_ibin, options.max_ibin,
                                 options.width_ibin)

        E.info("Column boundaries are: %s" % str(change_bins))
        E.info("Row boundaries are: %s" % str(initial_bins))

        # shuffle rows/clusters
        if options.spike_type == "cluster":
            E.info("looking for clusters...")
            clusters_dict = Counts.findClusters(counts_sort,
                                                options.cluster_max_distance,
                                                options.cluster_min_size,
                                                g_to_spike_tracks, groups)
            if len(clusters_dict) == 0:
                raise Exception("no clusters were found, check parameters")

            E.info("shuffling subcluster regions...")
            output_indices, counts = Counts.shuffleCluster(
                initial_bins, change_bins, g_to_spike_tracks, groups,
                options.difference, options.max_spike, options.iterations,
                clusters_dict, options.max_sbin, options.min_sbin,
                options.width_sbin)

        elif options.spike_type == "row":

            E.info("shuffling rows...")
            output_indices, bin_counts = counts.shuffleRows(
                options.min_cbin, options.max_cbin, options.width_cbin,
                options.min_ibin, options.max_ibin, options.width_ibin,
                g_to_spike_tracks, design.groups, options.difference,
                options.max_spike, options.iterations)

        filled_bins = Counts.thresholdBins(output_indices, bin_counts,
                                           options.min_spike)

        assert len(filled_bins) > 0, "No bins contained enough spike-ins"

        # write out
        counts.outputSpikes(filled_bins,
                            g_to_keep_tracks,
                            design.groups,
                            output_method=options.output_method,
                            spike_type=options.spike_type,
                            min_cbin=options.min_cbin,
                            width_cbin=options.width_cbin,
                            max_cbin=options.max_cbin,
                            min_ibin=options.min_ibin,
                            width_ibin=options.width_ibin,
                            max_ibin=options.max_ibin,
                            min_sbin=options.min_sbin,
                            width_sbin=options.width_sbin,
                            max_sbin=options.max_sbin)

    E.Stop()
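
The change and initial bins above come straight from numpy.arange over the user-supplied min, max and width; a short sketch with illustrative, non-default parameters:

import numpy as np

# bins for the observed fold change and for the initial expression level
change_bins = np.arange(-5.0, 5.0, 0.5)
initial_bins = np.arange(0.0, 10.0, 0.5)
print("%i change bins, %i initial bins" %
      (len(change_bins), len(initial_bins)))  # 20 change bins, 20 initial bins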
Code example #15
File: runMEDIPS.py Project: zpeng1989/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend", dest="extension", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extension=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return
Code example #16
def outputRegionsOfInterest(design_file, counts_file, outfile,
                            max_per_sample=10, sum_per_group=40):
    '''output windows according to various filters.

    The output is a mock analysis similar to a differential expression
    result.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per windows
    outfile : string
       Output filename in :term:`tsv` format
    max_per_sample : int
       Keep only windows in which every sample of one group is below
       this count.
    sum_per_group : int
       Keep only windows in which the other group has at least this
       many counts in total.

    '''
    job_memory = "64G"

    design = Expression.readDesignFile(design_file)

    # remove tracks not included in the design
    design = dict([(x, y) for x, y in design.items() if y.include])
    # define the two groups
    groups = sorted(set([x.group for x in design.values()]))

    # build a filtering statement
    groupA, groupB = groups

    def _buildMax(g, threshold):

        selected = [x for x, y in design.items() if y.group == g]
        if len(selected) > 1:
            return "max((%s)) < %f" % (
                ",".join(
                    ["int(r['%s'])" % x for x in selected]),
                threshold)
        elif len(selected) == 1:
            return "int(r['%s']) < %f" % (selected[0], threshold)
        else:
            raise ValueError("no groups found for 'g'" % g)

    def _buildSum(g, threshold):

        selected = [x for x, y in design.items() if y.group == g]
        if len(selected) > 1:
            return "sum((%s)) > %f" % (
                ",".join(
                    ["int(r['%s'])" % x for x in selected]),
                threshold)
        elif len(selected) == 1:
            return "int(r['%s']) > %f" % (selected[0], threshold)
        else:
            raise ValueError("no groups found for 'g'" % g)

    upper_levelA = _buildMax(groupA, max_per_sample)
    upper_levelB = _buildMax(groupB, max_per_sample)
    sum_levelA = _buildSum(groupA, sum_per_group)
    sum_levelB = _buildSum(groupB, sum_per_group)

    statement = '''
    zcat %(counts_file)s
    | python %(scriptsdir)s/csv_select.py
            --log=%(outfile)s.log
            "(%(upper_levelA)s and %(sum_levelB)s) or
             (%(upper_levelB)s and %(sum_levelA)s)"
    | python %(scriptsdir)s/runExpression.py
            --log=%(outfile)s.log
            --design-tsv-file=%(design_file)s
            --tags-tsv-file=-
            --method=mock
            --filter-min-counts-per-sample=0
    | gzip
    > %(outfile)s
    '''

    P.run()
Code example #17
def runCuffdiff(bamfiles,
                design_file,
                geneset_file,
                outfile,
                cuffdiff_options="",
                job_threads=4,
                job_memory="4G",
                fdr=0.1,
                mask_file=None):
    '''estimate differential expression using cuffdiff.

    Replicates within each track are grouped.

    Arguments
    ---------
    bamfiles : list
        List of filenames in :term:`bam` format.
    design_file : string
        Filename with experimental design in :term:`tsv` format.
    geneset_file : string
        Filename with geneset of interest in :term:`gtf` format.
    outfile : string
        Output filename. The output is :term:`tsv` formatted.
    cuffdiff_options : string
        Options to pass on to cuffdiff
    job_threads : int
        Number of threads to use.
    job_memory : string
        Memory to reserve.
    fdr : float
        FDR threshold to apply.
    mask_file : string
        If given, ignore genes overlapping gene models in
        this :term:`gtf` formatted file.
    '''

    design = Expression.readDesignFile(design_file)

    outdir = outfile + ".dir"
    try:
        os.mkdir(outdir)
    except OSError:
        pass

    # replicates are separated by ","
    reps = collections.defaultdict(list)
    for bamfile in bamfiles:
        # .accepted.bam kept for legacy reasons (see rnaseq pipeline)
        track = P.snip(os.path.basename(bamfile), ".bam", ".accepted.bam")
        if track not in design:
            E.warn("bamfile '%s' not part of design - skipped" % bamfile)
            continue

        d = design[track]
        if not d.include:
            continue
        reps[d.group].append(bamfile)

    groups = sorted(reps.keys())
    labels = ",".join(groups)
    reps = "   ".join([",".join(reps[group]) for group in groups])

    # Nick - add mask gtf to not assess rRNA and ChrM
    extra_options = []

    if mask_file:
        extra_options.append(" -M %s" % os.path.abspath(mask_file))

    extra_options = " ".join(extra_options)

    # IMS added a checkpoint to catch cuffdiff errors
    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    # AH: compress output in outdir
    job_memory = "7G"
    statement = '''date > %(outfile)s.log;
    hostname >> %(outfile)s.log;
    cuffdiff --output-dir %(outdir)s
             --verbose
             --num-threads %(job_threads)i
             --labels %(labels)s
             --FDR %(fdr)f
             %(extra_options)s
             %(cuffdiff_options)s
             <(gunzip < %(geneset_file)s )
             %(reps)s
    2>&1
    | grep -v 'BAM record error'
    >> %(outfile)s.log;
    checkpoint;
    gzip -f %(outdir)s/*;
    checkpoint;
    date >> %(outfile)s.log;
    '''
    P.run()

    results = parseCuffdiff(os.path.join(outdir, "gene_exp.diff.gz"))

    Expression.writeExpressionResults(outfile, results)
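A hedged usage sketch for runCuffdiff as defined above. The design-file layout (track/include/group/pair columns) is an assumption inferred from how `d.include` and `d.group` are used; Expression.readDesignFile defines the authoritative format.

# design.tsv (tab-separated; columns assumed):
# track      include  group  pair
# liver-R1   1        liver  1
# brain-R1   1        brain  1
runCuffdiff(bamfiles=["liver-R1.bam", "brain-R1.bam"],
            design_file="design.tsv",
            geneset_file="geneset.gtf.gz",
            outfile="cuffdiff.tsv",
            fdr=0.05)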
Code example #19
def plotDETagStats(infile, outfile):
    '''plot differential expression stats'''
    Expression.plotDETagStats(infile, outfile)
    P.touch(outfile)
Code example #20
File: runExpression.py Project: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py "
        "2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--filename-tags",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "-d",
        "--filename-design",
        dest="input_filename_design",
        type="string",
        help="input file with experimental design [default=%default].")

    parser.add_option("-o",
                      "--outfile",
                      dest="output_filename",
                      type="string",
                      help="output filename [default=%default].")

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("deseq", "edger", "cuffdiff", "mock", "summary", "dump",
                 "spike"),
        help="differential expression method to apply [default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type",
                      dest="deseq_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("-f",
                      "--fdr",
                      dest="fdr",
                      type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-p",
                      "--pseudo-counts",
                      dest="pseudo_counts",
                      type="float",
                      help="pseudocounts to add for mock analysis "
                      "[default=%default].")

    parser.add_option("-R",
                      "--save-R",
                      dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r",
                      "--reference-group",
                      dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this number "
                      "of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove this percentage of rows with the "
                      "lowest total counts [default=%default].")

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=1,
        filter_min_counts_per_sample=10,
        filter_percentile_rowsums=0,
        pseudo_counts=0,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
    )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

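    # when tag counts arrive on stdin, spool them to a temporary file
    # first - the downstream code needs a real filename to open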
    if options.input_filename_tags == "-":
        fh = P.getTempFile()
        fh.write("".join([x for x in options.stdin]))
        fh.close()
        options.input_filename_tags = fh.name
    else:
        fh = None

    # load tag data and filter
    if options.method in ("deseq", "edger", "mock"):
        assert options.input_filename_tags and os.path.exists(
            options.input_filename_tags)
        assert options.input_filename_design and os.path.exists(
            options.input_filename_design)

        Expression.loadTagData(options.input_filename_tags,
                               options.input_filename_design)

        nobservations, nsamples = Expression.filterTagData(
            filter_min_counts_per_row=options.filter_min_counts_per_row,
            filter_min_counts_per_sample=options.filter_min_counts_per_sample,
            filter_percentile_rowsums=options.filter_percentile_rowsums)

        if nobservations == 0:
            E.warn("no observations - no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        sample_names = R('''colnames(countsTable)''')
        E.info("%i samples to test at %i observations: %s" %
               (nsamples, nobservations, ",".join(sample_names)))

    try:
        if options.method == "deseq":
            Expression.runDESeq(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr,
                dispersion_method=options.deseq_dispersion_method,
                fit_type=options.deseq_fit_type,
                sharing_mode=options.deseq_sharing_mode,
                ref_group=options.ref_group,
            )

        elif options.method == "edger":
            Expression.runEdgeR(outfile=options.output_filename,
                                outfile_prefix=options.output_filename_pattern,
                                fdr=options.fdr,
                                ref_group=options.ref_group)

        elif options.method == "mock":
            Expression.runMockAnalysis(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                ref_group=options.ref_group,
                pseudo_counts=options.pseudo_counts,
            )

        elif options.method == "summary":
            Expression.outputTagSummary(
                options.input_filename_tags,
                options.stdout,
                options.output_filename_pattern,
                filename_design=options.input_filename_design)

        elif options.method == "dump":
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            Expression.dumpTagData(options.input_filename_tags,
                                   options.input_filename_design,
                                   outfile=options.stdout)

        elif options.method == "spike":
            Expression.outputSpikeIns(
                options.input_filename_tags,
                options.stdout,
                options.output_filename_pattern,
                filename_design=options.input_filename_design,
                foldchange_max=options.spike_foldchange_max,
                expression_max=options.spike_expression_max,
                max_counts_per_bin=options.spike_max_counts_per_bin,
                expression_bin_width=options.spike_expression_bin_width,
                foldchange_bin_width=options.spike_foldchange_bin_width,
            )

    except rpy2.rinterface.RRuntimeError:
        if options.save_r_environment:
            E.info("saving R image to %s" % options.save_r_environment)
            R['save.image'](options.save_r_environment)
        raise
Code example #21
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This functions parses and loads the results of a cuffdiff differential
    expression analysis.
    Parsing is performed by the parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)

    # ignore promoters and splicing - no fold change column, but sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):

        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)

        results = parseCuffdiff(infile, min_fpkm=min_fpkm)
        Expression.writeExpressionResults(tmpname, results)
        P.load(tmpname, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=treatment_name "
               "--add-index=control_name "
               "--add-index=test_id")

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"
        infile = os.path.join(indir, fn)

        P.load(infile, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        is_first = True
        for line in inf:

            if is_first:
                is_first = False
                continue

            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block
            # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no CDS has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to denovo predicted genesets; in the meantime just skip if the
        # cds tracking file is empty

        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes:
            outf.write(gene + "\t")
            s = 0
            while s < len(samples) - 1:
                outf.write(genes[gene][samples[s]] + "\t")
                s += 1

            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        P.load(tmpf,
               outfile,
               tablename=tablename,
               options="--allow-empty-file "
               " --add-index=gene_id")

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
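Both this loader and parseCuffdiff depend on the ln-to-log2 conversion mentioned in the docstring. The conversion itself is a one-liner (a sketch; parseCuffdiff is not shown in this snippet):

import math

def ln_to_log2(ln_fold_change):
    # cuffdiff reports natural-log fold changes; the loaded tables
    # store log2 fold changes
    return ln_fold_change / math.log(2)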
Code example #24
def runCuffdiff(bamfiles,
                design_file,
                geneset_file,
                outfile,
                cuffdiff_options="",
                threads=4,
                fdr=0.1,
                mask_file=None):
    '''estimate differential expression using cuffdiff.

    bamfiles
       bam files

    geneset_file
       geneset to use for the analysis

    design_file
       design file describing which differential expression to test

    Replicates within each track are grouped.
    '''

    design = Expression.readDesignFile(design_file)

    outdir = outfile + ".dir"
    try:
        os.mkdir(outdir)
    except OSError:
        pass

    job_threads = threads

    # replicates are separated by ","
    reps = collections.defaultdict(list)
    for bamfile in bamfiles:
        # .accepted.bam kept for legacy reasons (see rnaseq pipeline)
        track = P.snip(os.path.basename(bamfile), ".bam", ".accepted.bam")
        if track not in design:
            E.warn("bamfile '%s' not part of design - skipped" % bamfile)
            continue

        d = design[track]
        if not d.include:
            continue
        reps[d.group].append(bamfile)

    groups = sorted(reps.keys())
    labels = ",".join(groups)
    reps = "   ".join([",".join(reps[group]) for group in groups])

    # Nick - add mask gtf to not assess rRNA and ChrM
    extra_options = []

    if mask_file:
        extra_options.append(" -M %s" % os.path.abspath(mask_file))

    extra_options = " ".join(extra_options)

    # IMS added a checkpoint to catch cuffdiff errors
    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    # AH: compress output in outdir
    statement = '''date > %(outfile)s.log;
    hostname >> %(outfile)s.log;
    cuffdiff --output-dir %(outdir)s
             --verbose
             --num-threads %(threads)i
             --labels %(labels)s
             --FDR %(fdr)f
             %(extra_options)s
             %(cuffdiff_options)s
             <(gunzip < %(geneset_file)s )
             %(reps)s
    2>&1
    | grep -v 'BAM record error'
    >> %(outfile)s.log;
    checkpoint;
    gzip -f %(outdir)s/*;
    checkpoint;
    date >> %(outfile)s.log;
    '''
    P.run()

    results = parseCuffdiff(os.path.join(outdir, "gene_exp.diff.gz"))

    Expression.writeExpressionResults(outfile, results)
Code example #25
File: runMEDIPS.py Project: logust79/cgat-apps
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend",
                      dest="extension",
                      type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size",
                      dest="shift",
                      type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations",
                      type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment", "dmr",
                               "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w",
                      "--bigwig-file",
                      dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment",
                      dest="treatment_files",
                      type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control",
                      dest="control_files",
                      type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input",
                      dest="input_files",
                      type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata",
                      dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file",
                      dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method",
                      dest="fdr_method",
                      type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa",
                      dest="bwa",
                      action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique",
                      dest="unique",
                      type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms",
                      dest="chroms",
                      type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" %
                        (line['chr'], int(line['start']), int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
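        # e.g. --chroms=chr1,chr2 yields ' chr.select=c("chr1","chr2"), ',
        # which is interpolated verbatim into the MEDIPS R calls below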
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x
                                    for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.open_file(E.get_output_file("enrichment.tsv.gz"),
                                    "w")
        slotnames = (("regions.CG", "regions_CG",
                      "%i"), ("regions.C", "regions_C",
                              "%s"), ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH",
                      "%i"), ("regions.GoGe", "regions_GoGe",
                              "%i"), ("genome.CG", "genome_CG",
                                      "%s"), ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"), ("genome.relH",
                                                      "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # missing slot in the R result - write an empty field
                    # instead of indexing into an empty tuple
                    outfile.write("\t")
                else:
                    outfile.write("\t" + pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' % ",".join([
                "treatment_R%i" % x
                for x in range(len(options.treatment_files))
            ]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' % ",".join([
                    "control_R%i" % x
                    for x in range(len(options.control_files))
                ]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')), str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')), str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
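The MEDIPS calls above interpolate a paired flag produced by an isPaired() helper that is not part of this snippet. A minimal sketch of such a helper using pysam, assuming it returns the strings "TRUE"/"FALSE" for direct interpolation into the R code:

import pysam

def isPaired(bamfile, nreads=1000):
    # peek at the first reads; report "TRUE" as soon as a paired
    # read is seen, otherwise "FALSE"
    with pysam.AlignmentFile(bamfile) as samfile:
        for n, read in enumerate(samfile):
            if read.is_paired:
                return "TRUE"
            if n >= nreads:
                break
    return "FALSE"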
Code example #26
File: runExpression.py Project: kathrinjansen/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-o", "--outfile", dest="output_filename", type="string",
                      help="output filename [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "deseq", "edger", "deseq2",
                          "ttest",
                          "mock", "summary",
                          "dump", "spike",
                          "plottagstats",
                          "plotdetagstats"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type", type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option(
        "--edger-dispersion",
        dest="edger_dispersion", type="float",
        help="dispersion value for edgeR if there are no replicates "
        "[default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-p", "--pseudocounts", dest="pseudo_counts",
                      type="float",
                      help="pseudocounts to add for mock analyis "
                      "[default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=$default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")
    parser.add_option("--deseq2-design-formula",
                      dest="model",
                      type="string",
                      help="Design formula for DESeq2")
    parser.add_option("--deseq2-contrasts",
                      dest="contrasts",
                      type="string",
                      help=("contrasts for post-hoc testing writen"
                            " variable:control:treatment,..."))
    parser.add_option("--deseq2-plot",
                      dest="plot",
                      type="int",
                      help=("draw plots during deseq2 analysis"))

    parser.set_defaults(
        input_filename_tags=None,
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=1,
        filter_min_counts_per_sample=10,
        filter_percentile_rowsums=0,
        pseudo_counts=0,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        plot=1
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if options.input_filename_tags == "-":
        fh = tempfile.NamedTemporaryFile(delete=False)
        fh.write("".join([x for x in options.stdin]))
        fh.close()
        options.input_filename_tags = fh.name
    else:
        fh = None

    # load tag data and filter
    if options.method in ("deseq2", "deseq", "edger", "mock", "ttest"):
        assert options.input_filename_tags and os.path.exists(
            options.input_filename_tags)
        assert options.input_filename_design and os.path.exists(
            options.input_filename_design)

        Expression.loadTagData(options.input_filename_tags,
                               options.input_filename_design)

        nobservations, nsamples = Expression.filterTagData(
            filter_min_counts_per_row=options.filter_min_counts_per_row,
            filter_min_counts_per_sample=options.filter_min_counts_per_sample,
            filter_percentile_rowsums=options.filter_percentile_rowsums)

        if nobservations == 0:
            E.warn("no observations - no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        sample_names = R('''colnames(countsTable)''')
        E.info("%i samples to test at %i observations: %s" %
               (nsamples, nobservations,
                ",".join(sample_names)))

    try:
        if options.method == "deseq2":
            Expression.runDESeq2(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr,
                ref_group=options.ref_group,
                model=options.model,
                contrasts=options.contrasts,
                plot=options.plot
            )

        elif options.method == "deseq":
            Expression.runDESeq(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr,
                dispersion_method=options.deseq_dispersion_method,
                fit_type=options.deseq_fit_type,
                sharing_mode=options.deseq_sharing_mode,
                ref_group=options.ref_group,
            )

        elif options.method == "edger":
            Expression.runEdgeR(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr,
                ref_group=options.ref_group,
                dispersion=options.edger_dispersion)

        elif options.method == "mock":
            Expression.runMockAnalysis(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                ref_group=options.ref_group,
                pseudo_counts=options.pseudo_counts,
            )

        elif options.method == "summary":
            Expression.outputTagSummary(
                options.input_filename_tags,
                options.stdout,
                options.output_filename_pattern,
                filename_design=options.input_filename_design
            )

        elif options.method == "dump":
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            Expression.dumpTagData(options.input_filename_tags,
                                   options.input_filename_design,
                                   outfile=options.stdout)

        elif options.method == "plottagstats":
            assert options.input_filename_tags and os.path.exists(
                options.input_filename_tags)
            Expression.plotTagStats(
                options.input_filename_tags,
                options.input_filename_design,
                outfile_prefix=options.output_filename_pattern)

        elif options.method == "plotdetagstats":
            assert options.input_filename_result and os.path.exists(
                options.input_filename_result)
            Expression.plotDETagStats(
                options.input_filename_result,
                outfile_prefix=options.output_filename_pattern)

        elif options.method == "spike":
            Expression.outputSpikeIns(
                options.input_filename_tags,
                options.stdout,
                options.output_filename_pattern,
                filename_design=options.input_filename_design,
                foldchange_max=options.spike_foldchange_max,
                expression_max=options.spike_expression_max,
                max_counts_per_bin=options.spike_max_counts_per_bin,
                expression_bin_width=options.spike_expression_bin_width,
                foldchange_bin_width=options.spike_foldchange_bin_width,
            )

        elif options.method == "ttest":
            Expression.runTTest(
                outfile=options.output_filename,
                outfile_prefix=options.output_filename_pattern,
                fdr=options.fdr)

    except rpy2.rinterface.RRuntimeError:
        if options.save_r_environment:
            E.info("saving R image to %s" % options.save_r_environment)
            R['save.image'](options.save_r_environment)
        raise

    if fh and os.path.exists(fh.name):
        os.unlink(fh.name)

    if options.save_r_environment:
        R['save.image'](options.save_r_environment)

    E.Stop()
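
The try/except around the R-backed methods above saves the complete R
workspace before re-raising, so a failed run can be inspected post mortem
from an interactive R session. A minimal sketch of that save-image-on-error
pattern in isolation; the function name and default file name are
illustrative, not part of the script:

import rpy2.rinterface
from rpy2.robjects import r as R


def run_r_with_postmortem(r_code, image_file="postmortem.RData"):
    # run an R snippet; if R raises an error, dump the workspace to
    # 'image_file' so it can later be load()-ed and inspected
    try:
        return R(r_code)
    except rpy2.rinterface.RRuntimeError:
        R['save.image'](image_file)
        raise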
Code example #27
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--tag-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock",
                               "dexseq"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq2-dispersion-method",
                      dest="deseq2_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq2 [default=%default].")

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq2 [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion",
                      type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f",
                      "--fdr",
                      dest="fdr",
                      type="float",
                      help="fdr to apply [default=%default].")

    # currently not implemented
    # parser.add_option("-R", "--output-R-code", dest="save_r_environment",
    #                  type="string",
    #                  help="save R environment to loc [default=%default]")

    parser.add_option("-r",
                      "--reference-group",
                      dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=$default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--reduced-model",
                      dest="reduced_model",
                      type="string",
                      help=("reduced model for LRT"))

    parser.add_option("--contrast",
                      dest="contrast",
                      type="string",
                      help=("contrast for differential expression testing"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing expression estimates"
                            "from sleuth. Sleuth expects counts"
                            "files to be called abundance.h5"))

    parser.add_option("--dexseq-counts-dir",
                      dest="dexseq_counts_dir",
                      type="string",
                      help=("directory containing counts for dexseq. DEXSeq "
                            "expects counts files to be called .txt and"
                            "to be generated by the DEXSeq_counts.py script"))

    parser.add_option("--dexseq-flattened-file",
                      dest="dexseq_flattened_file",
                      type="string",
                      help=("directory containing flat gtf for dexseq. DEXSeq "
                            "expects this to be generated by the"
                            "DEXSeq_prepare_annotations.py script"))

    parser.add_option(
        "--outfile-sleuth-count",
        dest="outfile_sleuth_count",
        type="string",
        help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option(
        "--sleuth-genewise",
        dest="sleuth_genewise",
        action="store_true",
        help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.add_option("--de-test",
                      dest="DEtest",
                      type="choice",
                      choices=("wald", "lrt"),
                      help=("Differential expression test"))

    parser.add_option("--Rhistory",
                      dest="Rhistory",
                      type="string",
                      help=("Outfile for R history"))

    parser.add_option("--Rimage",
                      dest="Rimage",
                      type="string",
                      help=("Outfile for R image"))

    parser.set_defaults(input_filename_tags="-",
                        input_filename_design=None,
                        output_filename=sys.stdout,
                        method="deseq2",
                        fdr=0.1,
                        deseq2_dispersion_method="pooled",
                        deseq2_fit_type="parametric",
                        edger_dispersion=0.4,
                        ref_group=False,
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        spike_foldchange_max=4.0,
                        spike_expression_max=5.0,
                        spike_expression_bin_width=0.5,
                        spike_foldchange_bin_width=0.5,
                        spike_max_counts_per_bin=50,
                        model=None,
                        contrast=None,
                        output_filename_pattern=None,
                        sleuth_counts_dir=None,
                        dexseq_counts_dir=None,
                        dexseq_flattened_file=None,
                        outfile_sleuth_count=None,
                        outfile_sleuth_tpm=None,
                        use_ihw=False,
                        sleuth_genewise=False,
                        gene_biomart=None,
                        DEtest="wald",
                        reduced_model=None,
                        Rhistory=None,
                        Rimage=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    RH = None
    if options.Rhistory or options.Rimage:
        RH = R.R_with_History()

    outfile_prefix = options.output_filename_pattern

    # Expression.py currently expects a reference group for edgeR and
    # sleuth, regardless of which test is used
    if not options.ref_group and options.method in ("edger", "sleuth"):
        raise ValueError(
            "Must provide a reference group ('--reference-group')")

    # create Design object
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t",
                    index_col=0,
                    comment="#"))

    if len(set(design.table[options.contrast])) > 2:

        if options.method == "deseq2" or options.method == "sleuth":
            if options.DEtest == "wald":
                raise ValueError(
                    "Factor must have exactly two levels for Wald Test. "
                    "If you have more than two levels in your factor, "
                    "consider LRT")
        else:
            E.info("There are more than 2 levels for the contrast "
                   "specified (%s:%s). The log2 fold changes in the "
                   "results table and MA plots will be for the first two "
                   "levels in the contrast. The p-value will be the "
                   "p-value for the overall significance of the contrast. "
                   "Hence, some genes will have a significant p-value but "
                   "0-fold change between the first two levels" %
                   (options.contrast, set(design.table[options.contrast])))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files "
            " (--sleuth-counts-dir)")

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart,
                                 DE_test=options.DEtest,
                                 ref_group=options.ref_group,
                                 reduced_model=options.reduced_model)

    # DEXSeq reads in data itself
    elif options.method == "dexseq":
        assert options.dexseq_counts_dir, (
            "need to specify the location of the .txt counts files")

        # the Design object created above is reused for dexseq; validation
        # against the model is currently not performed
        # design.validate(model=options.model)

        experiment = Expression.DEExperiment_DEXSeq()
        results = experiment.run(design,
                                 base_dir=options.dexseq_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 ref_group=options.ref_group,
                                 outfile_prefix=outfile_prefix,
                                 flattenedfile=options.dexseq_flattened_file,
                                 fdr=options.fdr)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(sys.stdin,
                                       sep="\t",
                                       index_col=0,
                                       comment="#"))
        else:
            counts = Counts.Counts(
                pd.io.parsers.read_csv(IOTools.openFile(
                    options.input_filename_tags, "r"),
                                       sep="\t",
                                       index_col=0,
                                       comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     ref_group=options.ref_group,
                                     fdr=options.fdr,
                                     dispersion=options.edger_dispersion)

        elif options.method == "deseq2":

            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group,
                                     DEtest=options.DEtest,
                                     R=RH)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in list(results.Summary.keys()):
        outf = IOTools.openFile(
            "_".join([outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n" %
                   results.Summary[test_group].asTable())
        outf.close()

    if options.Rhistory:
        RH.saveHistory(options.Rhistory)
    if options.Rimage:
        RH.saveImage(options.Rimage)

    E.Stop()
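
The restrict/filter/revalidate sequence above keeps the counts matrix and
the design table consistent with each other after filtering. A rough
equivalent of the idea in plain pandas, with illustrative file names and
thresholds (a sketch, not the CGAT Counts/ExperimentalDesign
implementation):

import pandas as pd

counts = pd.read_csv("counts.tsv", sep="\t", index_col=0, comment="#")
design = pd.read_csv("design.tsv", sep="\t", index_col=0, comment="#")

# restrict counts to the samples listed in the design table
counts = counts[[c for c in counts.columns if c in design.index]]

# drop samples whose maximum count falls below a threshold
counts = counts.loc[:, counts.max(axis=0) >= 10]

# drop rows (features) with too few counts in total
counts = counts[counts.sum(axis=1) >= 1]

# re-align the design table to the samples that survived filtering
design = design.loc[counts.columns]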
Code example #28
File: runExpression.py Project: yangjl/cgat
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-t", "--filename-tags", dest="input_filename_tags", type="string",
                      help="input file with tag counts [default=%default]."  )

    parser.add_option("-d", "--filename-design", dest="input_filename_design", type="string",
                      help="input file with experimental design [default=%default]."  )

    parser.add_option("-o", "--outfile", dest="output_filename", type="string",
                      help="output filename [default=%default]."  )

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices = ("deseq", "edger", "cuffdiff", "mock", "summary", "dump", "spike" ),
                      help="differential expression method to apply [default=%default]."  )

    parser.add_option( "--deseq-dispersion-method", dest="deseq_dispersion_method", type="choice",
                      choices = ("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default]."  )

    parser.add_option( "--deseq-fit-type", dest="deseq_fit_type", type="choice",
                      choices = ("parametric", "local"),
                      help="fit type for deseq [default=%default]."  )

    parser.add_option( "--deseq-sharing-mode", dest="deseq_sharing_mode", type="choice",
                      choices = ("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default]."  )

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default]."  )

    parser.add_option("-p", "--pseudo-counts", dest="pseudo_counts", type="float",
                      help="pseudocounts to add for mock analyis [default=%default]."  )

    parser.add_option("-R", "--save-R", dest="save_r_environment", type="string",
                      help="save R environment [default=%default]."  )

    parser.add_option("-r","--reference-group", dest="ref_group", type="string",
                      help="Group to use as reference to compute fold changes against [default=$default]")

    parser.add_option( "--filter-min-counts-per-row", dest="filter_min_counts_per_row", type="int",
                      help="remove rows with less than this number of counts in total [default=%default]."  )

    parser.add_option( "--filter-min-counts-per-sample", dest="filter_min_counts_per_sample", type="int",
                      help="remove samples with a maximum count per sample of "
                       "less than this numer   [default=%default]."  )

    parser.add_option( "--filter-percentile-rowsums", dest="filter_percentile_rowsums", type="int",
                      help="remove percent of rows with lowest total counts [default=%default]."  )

    parser.set_defaults(
        input_filename_tags = "-",
        input_filename_design = None,
        output_filename = sys.stdout,
        method = "deseq",
        fdr = 0.1,
        deseq_dispersion_method = "pooled",
        deseq_fit_type = "parametric",
        deseq_sharing_mode = "maximum",
        ref_group = None,
        save_r_environment = None,
        filter_min_counts_per_row = 1,
        filter_min_counts_per_sample = 10,
        filter_percentile_rowsums = 0,
        pseudo_counts = 0,
        spike_foldchange_max = 4.0,
        spike_expression_max = 5.0,
        spike_expression_bin_width = 0.5,
        spike_foldchange_bin_width = 0.5,
        spike_max_counts_per_bin = 50,
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv, add_output_options = True )

    if options.input_filename_tags == "-":
        fh = P.getTempFile()
        fh.write( "".join( [ x for x in options.stdin ] ) )
        fh.close()
        options.input_filename_tags = fh.name
    else:
        fh = None

    # load tag data and filter
    if options.method in ("deseq", "edger", "mock"):
        assert options.input_filename_tags and os.path.exists(options.input_filename_tags)
        assert options.input_filename_design and os.path.exists(options.input_filename_design)

        Expression.loadTagData( options.input_filename_tags, 
                                options.input_filename_design )
            
        nobservations, nsamples = Expression.filterTagData(                                  
            filter_min_counts_per_row = options.filter_min_counts_per_row,
            filter_min_counts_per_sample = options.filter_min_counts_per_sample,
            filter_percentile_rowsums = options.filter_percentile_rowsums )
        
        if nobservations == 0:
            E.warn( "no observations - no output" )
            return
        
        if nsamples == 0:
            E.warn( "no samples remain after filtering - no output" )
            return

        sample_names = R('''colnames(countsTable)''')
        E.info( "%i samples to test at %i observations: %s" % ( nsamples, nobservations,
                                                                ",".join( sample_names)))

    try:
        if options.method == "deseq":
            Expression.runDESeq( outfile = options.output_filename,
                                 outfile_prefix = options.output_filename_pattern,
                                 fdr = options.fdr,
                                 dispersion_method = options.deseq_dispersion_method,
                                 fit_type = options.deseq_fit_type,
                                 sharing_mode = options.deseq_sharing_mode,
                                 ref_group = options.ref_group,
                                 )

        elif options.method == "edger":
            Expression.runEdgeR( outfile = options.output_filename,
                                 outfile_prefix = options.output_filename_pattern,
                                 fdr = options.fdr,
                                 ref_group = options.ref_group)

        elif options.method == "mock":
            Expression.runMockAnalysis( outfile = options.output_filename,
                                        outfile_prefix = options.output_filename_pattern,
                                        ref_group = options.ref_group,
                                        pseudo_counts = options.pseudo_counts,
                                        )

        elif options.method == "summary":
            Expression.outputTagSummary( options.input_filename_tags,
                                         options.stdout,
                                         options.output_filename_pattern,
                                         filename_design = options.input_filename_design
                                         )

        elif options.method == "dump":
            assert options.input_filename_tags and os.path.exists(options.input_filename_tags)
            Expression.dumpTagData( options.input_filename_tags,
                                    options.input_filename_design,
                                    outfile = options.stdout )

        elif options.method == "spike":
            Expression.outputSpikeIns( options.input_filename_tags,
                                       options.stdout,
                                       options.output_filename_pattern,
                                       filename_design = options.input_filename_design,
                                       foldchange_max = options.spike_foldchange_max,
                                       expression_max = options.spike_expression_max,
                                       max_counts_per_bin = options.spike_max_counts_per_bin,
                                       expression_bin_width = options.spike_expression_bin_width,
                                       foldchange_bin_width = options.spike_foldchange_bin_width,
                                       )

    except rpy2.rinterface.RRuntimeError as msg:
        if options.save_r_environment:
            E.info("saving R image to %s" % options.save_r_environment)
            R['save.image']( options.save_r_environment )
        raise
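
This older version spools stdin to a temporary file via P.getTempFile so
that downstream code can re-read the tag counts by file name. The same
idiom in plain Python 3, for reference (the function name is illustrative):

import sys
import tempfile


def stdin_to_tempfile(stream=sys.stdin):
    # text mode ("w") is required to write str objects in Python 3
    fh = tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False)
    fh.write(stream.read())
    fh.close()
    return fh.name

# the caller is responsible for os.unlink()-ing the returned file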
Code example #29
File: counts2table.py Project: zpeng1989/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("sleuth", "edger", "deseq2", "mock"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type", type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=$default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--contrasts",
                      dest="contrasts",
                      action="append",
                      help=("contrasts for post-hoc testing writen as comma "
                            "seperated list `condition,replicate` etc"))

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="string",
                      help=("fit type used for observed dispersion mean "
                            "relationship in deseq2"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing counts for sleuth. Sleuth "
                            "expects counts files to be called abundance.h5"))

    parser.add_option("--outfile-sleuth-count",
                      dest="outfile_sleuth_count",
                      type="string",
                      help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option("--sleuth-genewise",
                      dest="sleuth_genewise",
                      action="store_true",
                      help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=False,
        save_r_environment=None,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrasts=None,
        output_filename_pattern=None,
        deseq2_fit_type="parametric",
        sleuth_counts_dir=None,
        outfile_sleuth_count=None,
        outfile_sleuth_tpm=None,
        use_ihw=False,
        sleuth_genewise=False,
        gene_biomart=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfile_prefix = options.output_filename_pattern + "_" + options.method

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                sys.stdin, sep="\t", index_col=0, comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=0, comment="#"))

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     dispersion=options.edger_dispersion,
                                     ref_group=options.ref_group,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix)

        elif options.method == "deseq2":

            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix)
        results.plotMA(contrast, outfile_prefix=outfile_prefix)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in results.Summary.keys():
        outf = IOTools.openFile("_".join(
            [outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n"
                   % results.Summary[test_group].asTable())
        outf.close()

    E.Stop()
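
The script writes the full results table to stdout with NA for missing
values. A hypothetical downstream step that consumes this table and counts
significant calls per contrast; the column names ("contrast",
"significant") are assumptions based on the plotting and summary calls
above, not a documented schema:

import pandas as pd

results = pd.read_csv("results.tsv", sep="\t", na_values="NA")
for contrast, sub in results.groupby("contrast"):
    n_sig = (sub["significant"] == 1).sum()
    print("%s: %i significant at the chosen FDR" % (contrast, n_sig))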
Code example #30
def loadCuffdiff(infile, outfile, min_fpkm=1.0):
    '''load results from differential expression analysis and produce
    summary plots.

    Note: converts from ln(fold change) to log2 fold change.

    The cuffdiff output is parsed.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    fpkm_silent) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    This requires the cummeRbund library to be present in R.

    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued

    dbhandle = sqlite3.connect(PARAMS["database"])

    tmpname = P.getTempFilename(".")

    # ignore promoters and splicing - no fold change column, but sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):

        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)
        results = parseCuffdiff(infile,
                                min_fpkm=min_fpkm)

        Expression.writeExpressionResults(tmpname, results)

        statement = '''cat %(tmpname)s
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --allow-empty-file
              --add-index=treatment_name
              --add-index=control_name
              --add-index=test_id
              --table=%(tablename)s
         >> %(outfile)s.log
         '''

        P.run()

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"

        statement = '''zcat %(indir)s/%(fn)s
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --allow-empty-file
              --add-index=tracking_id
              --table=%(tablename)s
         >> %(outfile)s.log
         '''

        P.run()

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        # skip the header line
        for line in inf[1:]:
            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block.
            # Please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no CDS has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to de novo predicted genesets; in the meantime just skip if
        # the cds tracking file is empty

        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes:
            outf.write(gene + "\t")
            x = 0
            while x < len(samples) - 1:
                outf.write(genes[gene][samples[x]] + "\t")
                x += 1

            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        statement = ("cat %(tmpf)s |"
                     " python %(scriptsdir)s/csv2db.py "
                     "  %(csv2db_options)s"
                     "  --allow-empty-file"
                     "  --add-index=gene_id"
                     "  --table=%(tablename)s"
                     " >> %(outfile)s.log")
        P.run()

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
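
The per-sample FPKM block above builds the gene x sample matrix by hand.
A hedged alternative using pandas, relying on the standard cuffdiff
read_group_tracking columns (tracking_id, condition, replicate, FPKM,
status); non-"OK" entries keep their status string, mirroring the loop
above, and the output file name is illustrative:

import pandas as pd

df = pd.read_csv("genes.read_group_tracking.gz", sep="\t")
df["sample_id"] = df["condition"] + "_" + df["replicate"].astype(str)
df["value"] = df["FPKM"].where(df["status"] == "OK", df["status"])
wide = df.pivot(index="tracking_id", columns="sample_id", values="value")
wide.to_csv("fpkms.tsv", sep="\t")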
Code example #31
File: pipeline_expression.py Project: santayana/cgat
def buildProbeset2Gene(infile, outfile):
    '''build map relating a probeset to an ENSEMBL gene_id'''
    Expression.buildProbeset2Gene(infile, outfile)
Code example #32
File: runMEDIPS.py Project: CGATOxford/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend", dest="extension", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
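    # e.g. options.chroms == "chr19,chr20" yields
    #   chrstring == ' chr.select=c("chr19","chr20"), '
    # and this fragment is spliced verbatim into the MEDIPS.* calls below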
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.getOutputFile("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.getOutputFile("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.openFile(
                E.getOutputFile("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write(
                "estimated_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write(
                "true_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write(
                "nreads\t%s\n" %
                ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.getOutputFile("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.getOutputFile("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.getOutputFile("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.openFile(E.getOutputFile("enrichment.tsv.gz"), "w")
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] +
                                [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    value = ""
                outfile.write("\t%s" % pattern % value[0])
            outfile.write("\n")
        outfile.close()
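    # Usage sketch (illustration only, not executed by this script): the
    # enrichment table written above can be screened for poorly enriched
    # samples, e.g. with pandas; the relH cut-off of 1.4 is a
    # hypothetical threshold, not one taken from this script.
    #
    #   import pandas
    #   df = pandas.read_csv("enrichment.tsv.gz", sep="\t")
    #   low = df.loc[df["enrichment_relH"] < 1.4, "sample"]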

    if options.input_rdata:
        E.info("loading saved R session from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
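            # (the coupling vector stores the local CpG density of every
            # genome-wide window; MEDIPS uses it to normalise MeDIP
            # enrichment for CpG density. It is computed once, relative
            # to the first treatment set.)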
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.getOutputFile("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0".
                # The advice is to set MeDIP=FALSE. See:
                # http://comments.gmane.org/gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())
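                # meth combines, per window, each sample's raw counts and
                # rms values with the edgeR test results (logFC, p-value,
                # adjusted p-value); exact column names depend on the
                # MEDIPS version. Note that this branch requires control
                # files, since MSet2 must be a valid set.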

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.getOutputFile("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.getOutputFile("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())
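        # with adj=T the p.value threshold above is applied to the
        # adjusted p-values (as chosen via p.adj in MEDIPS.meth), so
        # fdr_threshold acts as an FDR cut-off rather than a raw
        # p-value cut-off.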

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.getOutputFile("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.getOutputFile("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.getOutputFile("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.Stop()
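
# A minimal tail sketch (an assumption: the code above appears to be the
# end of a CGAT-style script's main(), which such scripts conventionally
# close with an entry point like the following):
#
# if __name__ == "__main__":
#     sys.exit(main(sys.argv))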