Example #1
def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tables:

    mapping_stats
    bam_stats
    '''

    tablename = P.toTable(outfile)
    # cannot create views across multiple databases, so use a table instead
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    '''

    Database.executewait(dbhandle, statement % locals())
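
These examples template SQL with Python's "%(name)s" % locals() idiom. A minimal sketch of the idiom on its own (function and names are hypothetical):

    def render_drop(view_type="TABLE", tablename="view_mapping"):
        # locals() exposes the function's variables to the template
        return "DROP %(view_type)s IF EXISTS %(tablename)s" % locals()

    print(render_drop())  # DROP TABLE IF EXISTS view_mapping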
Example #2
def getTableFromDb(database_url, table):
    '''
    Get a table from a database with pandas
    '''

    dbhandle = Database.connect(url=database_url)
    df = pandas.read_sql("SELECT * FROM {}".format(table), con=dbhandle)
    df.index = df["track"]
    df.drop(labels="track", inplace=True, axis=1)

    return df
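
A usage sketch, assuming an SQLAlchemy-style URL to an SQLite file and a table that carries a "track" column (both names hypothetical):

    df = getTableFromDb("sqlite:///./csvdb", "bam_stats")
    print(df.head())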
Example #3
def getModelCoverage(database_url, table_regex, model_type="transcript"):
    '''
    Compute transcript model coverage stats

    Arguments
    ---------
    database_url: string
      database containing transcript counts

    table_regex: string
      regular expression for transcript count table

    model_type: string
      calculate coverages over either transcripts or
      genes.  Default is transcript models.

    Returns
    -------
    coverage_df: Pandas.Core.DataFrame
      model coverage stats summarised for each cell
    '''

    # find all matching tables with a regex, one per sample;
    # fetchall returns a list of tuples
    dbhandle = Database.connect(database_url)
    cc = dbhandle.execute("SELECT name FROM sqlite_master WHERE type='table';")

    tab_reg = re.compile(table_regex)
    table_list = [tx[0] for tx in cc.fetchall() if re.search(tab_reg, tx[0])]

    # pull out counts for each cell and compute coverages
    bins = range(0, 101)
    cov_dict = {}
    for tab in table_list:
        covs = extractTranscriptCounts(dbhandle, tab)
        freq_array = summariseOverBins(covs, bins)
        cov_dict[tab] = freq_array

    coverage_df = pandas.DataFrame(cov_dict).T
    # create a regex group to remove superfluous characters
    # from the track names
    ix_re = re.compile(
        "_(?P<run>\d+)_(?P<plate>\d+)_(?P<well>\d+)_(?P<mapper>\S+)_transcript_counts"
    )
    re_matches = [re.match(ix_re, ix) for ix in coverage_df.index]
    indx = ["%s_%s-%s.%s" % rm.group(1, 2, 3, 4) for rm in re_matches]
    coverage_df.index = indx
    coverage_df.columns = ["Bin%i" % bx for bx in coverage_df.columns]
    return coverage_df
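
To illustrate the index rewrite at the end of getModelCoverage, here is the same regex applied to a hypothetical table name (note the raw string literal, which keeps \d and \S intact):

    import re

    ix_re = re.compile(
        r"_(?P<run>\d+)_(?P<plate>\d+)_(?P<well>\d+)_(?P<mapper>\S+)_transcript_counts")
    rm = re.match(ix_re, "_17_3_42_hisat_transcript_counts")
    print("%s_%s-%s.%s" % rm.group(1, 2, 3, 4))  # 17_3-42.hisat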
Example #4
def loadCodingPotential(infile, outfile):
    '''load annotations'''

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | cgat csv2db
              %(csv2db_options)s
              --allow-empty-file
              --add-index=gene_id
              --map=gene_id:str
              --table=%(table)s
    > %(outfile)s'''

    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database_name"])
    Database.executewait(
        dbhandle, '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle, '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals())
    dbhandle.commit()
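
The UPDATE works because SQLite evaluates the equality test to 0 or 1. A self-contained sketch against an in-memory database (schema hypothetical):

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE anno (gene_id TEXT, result TEXT)")
    db.executemany("INSERT INTO anno VALUES (?, ?)",
                   [("g1", "coding"), ("g2", "noncoding")])
    db.execute("ALTER TABLE anno ADD COLUMN is_coding INTEGER")
    db.execute("UPDATE anno SET is_coding = (result == 'coding')")
    print(db.execute("SELECT * FROM anno").fetchall())
    # [('g1', 'coding', 1), ('g2', 'noncoding', 0)]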
Example #5
def DumpGOFromDatabase(outfile, dbhandle, options):
    """read go assignments from database.

    and dump them into a flatfile.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information
    """

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.defaultdict(int)
    all_categories = collections.defaultdict(int)
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:

        genes = collections.defaultdict(int)
        categories = collections.defaultdict(int)
        ntotal = 0
        statement = GetGOStatement(go_type, options.database_url,
                                   options.species)

        results = Database.executewait(dbhandle, statement,
                                       retries=0).fetchall()

        for result in results:
            outfile.write("{}\t{}\n".format(go_type,
                                            "\t".join(map(str, result))))
            gene_id, goid, description, evidence = result
            genes[gene_id] += 1
            categories[goid] += 1
            ntotal += 1
            all_genes[gene_id] += 1
            all_categories[goid] += 1
            all_ntotal += 1

        E.info("%s\t%i\t%i\t%i" %
               (go_type, ntotal, len(genes), len(categories)))

    E.info("%s\t%i\t%i\t%i" %
           ("all", all_ntotal, len(all_genes), len(all_categories)))

    return
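
A sketch of reading the dump back into a one-to-many mapping (filename hypothetical; the layout matches the header written above):

    import collections

    gene2go = collections.defaultdict(list)
    with open("go_dump.tsv") as inf:
        next(inf)  # header: go_type gene_id go_id description evidence
        for line in inf:
            go_type, gene_id, go_id, description, evidence = \
                line.rstrip("\n").split("\t")
            gene2go[gene_id].append(go_id)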
Example #6
def connectToUCSC(host="genome-mysql.cse.ucsc.edu",
                  user="******",
                  database=None):
    """connect to UCSC database.

    Arguments
    ---------
    host : string
        Host to connect to
    user : string
        Username to connect with
    database : string
        database to use

    Returns
    -------
    Database handle

    """
    dbhandle = Database.connect(url="mysql://{user}@{host}/{database}".format(**locals()))

    return dbhandle
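
Usage sketch (requires network access to the public UCSC server; "hg38" is an example database name):

    dbhandle = connectToUCSC(database="hg38")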
Example #7
def ReadGene2GOFromDatabase(dbhandle, go_type, database, species):
    """read go assignments from ensembl database.

    returns a dictionary of lists.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information

    Note: assumes that external_db_id for GO is 1000
    """

    statement = GetGOStatement(go_type, database, species)
    result = Database.executewait(dbhandle, statement, retries=0).fetchall()

    gene2go = {}
    go2info = collections.defaultdict(GOInfo)
    for gene_id, goid, description, evidence in result:
        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if gene_id not in gene2go:
            gene2go[gene_id] = []
        gene2go[gene_id].append(gm)
        go2info[goid] = gi

    return gene2go, go2info
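
A stand-alone sketch of the one-to-many accumulation pattern used above, with a hypothetical namedtuple standing in for GOInfo and hypothetical identifiers:

    import collections

    GOInfo = collections.namedtuple("GOInfo", "goid go_type description")
    rows = [("ENSG00000000003", "GO:0008150", "biological_process", "IEA")]
    gene2go, go2info = {}, {}
    for gene_id, goid, description, evidence in rows:
        gene2go.setdefault(gene_id, []).append(goid)
        go2info[goid] = GOInfo(goid, "biol_process", description)
    print(gene2go)
    print(go2info)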
Example #8
def summarizeEffectsPerGene(infile, outfile):
    '''summarize effects on a per-gene level.'''

    tablename = outfile[:-len(".load")]
    track = infile[:-len("_effects.load")]

    dbhandle = connect()

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT
           gene_id,
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i,
         %(track)s_effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id
    ''' % locals()

    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %(tablename)s" % locals())
    Database.executewait(dbhandle, statement)
    Database.executewait(
        dbhandle,
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" %
        locals())
    dbhandle.commit()

    P.touch(outfile)
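
The CASE expressions encode a simple NMD heuristic: a transcript counts as affected when a premature stop leaves the truncated CDS ending upstream of the last exon start. The same test restated in Python (column names follow the *_effects table; values hypothetical):

    def is_nmd_candidate(stop_min, cds_len, last_exon_start):
        # mirrors: stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start
        return stop_min > 0 and cds_len - stop_min * 3 < last_exon_start

    print(is_nmd_candidate(stop_min=10, cds_len=300, last_exon_start=280))  # True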
Example #9
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.
    '''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]

    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND
        a.aligned >= %(rates_min_aligned)i AND
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id")

    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska")
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar
                     ON r.gene_id = ar.gene_id AND
                     ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski")
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir
                            ON r.gene_id = ir.gene_id AND
                            ir.aligned >= %(rates_min_aligned)i''')

    x = locals()
    x.update(PARAMS)
    stmt_select = ", ".join(stmt_select) % x
    stmt_from = " ".join(stmt_from) % x
    stmt_where = " AND ".join(stmt_where) % x

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals())

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
    CAST(r.gene_id AS TEXT) AS gene_id,
    r.exons_sum as length,
    r.exons_pGC as pgc,
    %(stmt_select)s
    FROM
    %(track)s_annotation AS r
    %(stmt_from)s
        WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait(dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
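
loadSummary assembles its CREATE TABLE statement incrementally from whichever upstream tables exist. A minimal sketch of the assembly pattern (fragments hypothetical):

    stmt_select = ["a.distance AS ks", "cov.mean AS meancoverage"]
    stmt_from = ["LEFT JOIN t_rates AS a ON r.gene_id = a.gene_id",
                 "LEFT JOIN t_coverage AS cov ON r.gene_id = cov.gene_id"]
    statement = "SELECT r.gene_id, %s FROM t_annotation AS r %s WHERE 1" % (
        ", ".join(stmt_select), " ".join(stmt_from))
    print(statement)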
Example #10
def create_view(dbhandle,
                tables,
                tablename,
                outfile,
                view_type="TABLE",
                ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    '''

    Database.executewait(
        dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle, "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append([
            x.lower() for x in Database.getColumnNames(dbhandle, table)
            if x != track
        ])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))
    if min(tracks) != max(tracks):
        raise ValueError("number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join([
        "t0.%s = t%i.%s" % (f, x + 1, y[1]) for x, y in enumerate(tables[1:])
    ])

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)
    statement = '''
    CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    WHERE %(where_statement)s
    ''' % locals()
    Database.executewait(dbhandle, statement)

    nrows = Database.executewait(
        dbhandle, "SELECT COUNT(*) FROM %(tablename)s" % locals()).fetchone()[0]

    if nrows == 0:
        raise ValueError("empty view mapping, check statement = %s" %
                         (statement % locals()))
    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))

    E.info("created view_mapping with %i rows" % nrows)
    touch_file(outfile)
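
A usage sketch matching the docstring above (dbhandle and names are hypothetical; each entry pairs a table with the column joining it to the first table):

    tables = (("reads_summary", "track"),
              ("bam_stats", "track"),
              ("context_stats", "track"))
    create_view(dbhandle, tables, "view_mapping", "view_mapping.log",
                view_type="TABLE")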
Example #11
def buildDMRStats(tables, method, outfile, dbhandle):
    '''build dmr summary statistics.

    This method counts the number of up/down and 2-fold up/down
    genes in the output of :mod:`scripts/runExpression`.

    This method also creates diagnostic plots in the
    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables : list
        List of tables with DMR output
    method : string
        Method name
    outfile : string
        Output filename for the tab-separated summary table.

    '''
    def togeneset(tablename):
        return re.match("([^_]+)_", tablename).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join((
        "tileset",
        "design",
        "track1",
        "track2",
        "tested",
        "\t".join(["status_%s" % x for x in keys_status]),
        "significant",
        "up",
        "down",
        "twofold",
        "twofold_up",
        "twofold_down",
    )) + "\n")

    all_tables = set(Database.getTables(dbhandle))
    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    for tablename in tables:

        prefix = P.snip(tablename, "_%s" % method)
        tileset, design = prefix.split("_")

        def toDict(vals, l=2):
            return collections.defaultdict(int, [(tuple(x[:l]), x[l])
                                                 for x in vals])

        E.info("collecting data from %s" % tablename)

        tested = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            GROUP BY treatment_name,control_name""" % locals()).fetchall())
        status = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, status,
            COUNT(*) FROM %(tablename)s 
            GROUP BY treatment_name,control_name,status""" %
                locals()).fetchall(), 3)
        signif = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name,
            COUNT(*) FROM %(tablename)s 
            WHERE significant
            GROUP BY treatment_name,control_name""" % locals()).fetchall())
        fold2 = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name,
            COUNT(*) FROM %(tablename)s
            WHERE (l2fold >= 1 or l2fold <= -1) AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())
        up = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold > 0 AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())

        down = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold < 0 AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())

        fold2up = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold > 1 AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())

        fold2down = toDict(
            Database.executewait(
                dbhandle, """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold < -1 AND significant
            GROUP BY treatment_name,control_name,significant""" %
                locals()).fetchall())

        groups = list(tested.keys())

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write("\t".join(
                map(str, (tileset, design, treatment_name, control_name,
                          tested[k], "\t".join([
                              str(status[(treatment_name, control_name, x)])
                              for x in keys_status
                          ]), signif[(k)], up[k], down[k], fold2[k],
                          fold2up[k], fold2down[k]))) + "\n")

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait(
            dbhandle, '''SELECT end - start, pvalue 
                             FROM %(tablename)s
                             WHERE significant''' % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = list(zip(*data))

            pngfile = ("%(outdir)s/%(tileset)s_%(design)s_%(method)s"
                       "_pvalue_vs_length.png") % locals()
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10(length)',
                            ylab='log10(pvalue)',
                            log="x",
                            pch=20,
                            cex=.1)

            R['dev.off']()

    outf.close()
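
A stand-alone illustration of the toDict helper used above to key counts by (treatment, control) tuples (rows hypothetical):

    import collections

    def toDict(vals, l=2):
        return collections.defaultdict(int,
                                       [(tuple(x[:l]), x[l]) for x in vals])

    print(toDict([("treated", "control", 42)]))
    # defaultdict(<class 'int'>, {('treated', 'control'): 42})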
Example #12
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [
        x[0] for x in Database.executewait(
            dbh, '''SELECT DISTINCT ontology FROM %s''' %
            tablename).fetchall()
    ]

    genelists = [
        x[0] for x in Database.executewait(
            dbh, '''SELECT DISTINCT genelist FROM %s''' %
            tablename).fetchall()
    ]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(tablename=tablename)

        statement = '''
        cgat combine_tables
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty-file')
Example #13
def run(infile, options, chunk_size=10000):

    # for backwards compatibility
    if options.retry:
        options.retries = 20
    else:
        options.retries = -1

    flavour = get_flavour(options.database_url)

    tablename = quote_tablename(options.tablename, flavour=flavour)

    dbhandle = Database.connect(url=options.database_url)

    if "tab" in options.dialect:
        separator = "\t"
    else:
        separator = ","

    if options.append:
        if_exists = "append"
    else:
        if_exists = "replace"

    # handle header logic up-front
    if options.replace_header:
        if options.header_names is None:
            raise ValueError("No replacement headers provided")
        header = 0
        names = options.header_names
    else:
        if options.header_names is None:
            header = 0
            names = None
        else:
            header = None
            names = options.header_names

    counter = E.Counter()
    try:
        for idx, df in enumerate(
                pandas.read_csv(infile,
                                header=header,
                                names=names,
                                sep=separator,
                                index_col=False,
                                comment="#",
                                chunksize=options.chunk_size)):

            if idx == 0 and len(df) == 0:
                if not options.allow_empty:
                    raise ValueError("table is empty")

            if idx > 0:
                if_exists = "append"

            columns = list(df.columns)

            if options.lowercase_columns:
                columns = [x.lower() for x in columns]

            if options.first_column:
                columns[0] = options.first_column

            if options.ignore_columns:
                df = df[[
                    x for x in df.columns if x not in options.ignore_columns
                ]]

            if options.ignore_empty:
                empty_list = df.columns[df.isna().all()].tolist()
                if idx == 0:
                    empty_columns = set(empty_list)
                else:
                    empty_columns = empty_columns.intersection(empty_list)

            df.to_sql(tablename,
                      con=dbhandle,
                      schema=options.database_schema,
                      index=False,
                      if_exists=if_exists)

            counter.input += len(df)
    except pandas.errors.EmptyDataError:
        if not options.allow_empty:
            raise
        else:
            return

    nindex = 0
    for index in options.indices:
        nindex += 1
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                tablename, nindex, tablename, index)
            cc = Database.executewait(dbhandle,
                                      statement,
                                      retries=options.retries)
            cc.close()
            E.info("added index on column %s" % (index))
            counter.indexes_created += 1
        except Exception as ex:
            E.info("adding index on column %s failed: %s" % (index, ex))

    if options.ignore_empty:
        counter.empty_columns = len(empty_columns)
        for column in empty_columns:
            try:
                statement = "ALTER TABLE %s DROP COLUMN %s".format(
                    tablename, column)
                cc = Database.executewait(dbhandle,
                                          statement,
                                          retries=options.retries)
                cc.close()
                E.info("removed empty column %s" % (column))
                counter.empty_columns_removed += 1
            except Exception as ex:
                E.info("removing empty column {} failed".format(column))

    statement = "SELECT COUNT(*) FROM %s" % (tablename)
    cc = Database.executewait(dbhandle, statement, retries=options.retries)
    result = cc.fetchone()
    cc.close()

    counter.output = result[0]

    E.info(counter)
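
At its core, run() streams chunks from pandas.read_csv into DataFrame.to_sql, replacing the table on the first chunk and appending thereafter. A self-contained sketch with an in-memory SQLite database (data hypothetical):

    import io
    import sqlite3
    import pandas

    con = sqlite3.connect(":memory:")
    data = io.StringIO("track\tnreads\na\t10\nb\t20\n")
    for idx, df in enumerate(pandas.read_csv(data, sep="\t", chunksize=1)):
        df.to_sql("mytable", con=con, index=False,
                  if_exists="replace" if idx == 0 else "append")
    print(con.execute("SELECT COUNT(*) FROM mytable").fetchone()[0])  # 2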
Example #14
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])

    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])

    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve maximumn of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s don't know which column"
                         " to sort on" % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
Example #15
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use [default=%default].")

    parser.add_option("-i",
                      "--slims",
                      dest="filename_slims",
                      type="string",
                      help="filename with GO SLIM categories "
                      "[default=%default].")

    parser.add_option("-g",
                      "--genes-tsv-file",
                      dest="filename_genes",
                      type="string",
                      help="filename with genes to analyse "
                      "[default=%default].")

    parser.add_option("-b",
                      "--background-tsv-file",
                      dest="filename_background",
                      type="string",
                      help="filename with background genes to analyse "
                      "[default=%default].")

    parser.add_option("-m",
                      "--min-counts",
                      dest="minimum_counts",
                      type="int",
                      help="minimum count - ignore all categories that have "
                      "fewer than # number of genes"
                      " [default=%default].")

    parser.add_option("-o",
                      "--sort-order",
                      dest="sort_order",
                      type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option("--ontology",
                      dest="ontology",
                      type="string",
                      action="append",
                      help="go ontologies to analyze. Ontologies are tested "
                      "separately [default=%default].")

    parser.add_option(
        "-t",
        "--threshold",
        dest="threshold",
        type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option("--filename-dump",
                      dest="filename_dump",
                      type="string",
                      help="dump GO category assignments into a flatfile "
                      "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file",
        dest="filename_gene2name",
        type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology",
        dest="filename_ontology",
        type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option("--filename-input",
                      dest="filename_input",
                      type="string",
                      help="read GO category assignments from a flatfile "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample",
                      type="int",
                      help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option("--fdr",
                      dest="fdr",
                      action="store_true",
                      help="calculate and filter by FDR default=%default].")

    parser.add_option(
        "--go2goslim",
        dest="go2goslim",
        action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option("--gene-pattern",
                      dest="gene_pattern",
                      type="string",
                      help="pattern to transform identifiers to GO gene names "
                      "[default=%default].")

    parser.add_option("--filename-map-slims",
                      dest="filename_map_slims",
                      type="string",
                      help="write mapping between GO categories and GOSlims "
                      "[default=%default].")

    parser.add_option(
        "--get-genes",
        dest="get_genes",
        type="string",
        help="list all genes in the with a certain GOID [default=%default].")

    parser.add_option(
        "--strict",
        dest="strict",
        action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q",
        "--fdr-method",
        dest="qvalue_method",
        type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise",
        dest="compute_pairwise",
        action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    # help="fdr computation: method for estimating pi0 [default=%default]."  )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies for GO
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"
            ]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = Database.connect(url=options.database_url)

        outfile = IOTools.open_file(options.filename_dump,
                                    "w",
                                    create_dir=True)
        GO.DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.open_file(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.open_file(options.filename_gene2name)
        gene2name = IOTools.read_map(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.open_file(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)

        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(go.mId,
                                                        go_type=go.mNameSpace,
                                                        description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for forground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:

        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background, gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join(
        ("genelist", "ontology", "significant", "threshold", "ngenes",
         "ncategories", "nmaps", "nforegound", "nforeground_mapped",
         "nbackground", "nbackground_mapped", "nsample_counts",
         "nbackground_counts", "psample_assignments",
         "pbackground_assignments", "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)
        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle = Database.connect(url=options.database_url)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle, test_ontology, options.database_url, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([
                x for x, y in counts_per_category.items()
                if y < options.minimum_counts
            ])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" % (ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug-fix: background included the foreground in a tuple;
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    IOTools.open_file(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims, len(go_slims), len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.open_file(options.filename_map_slims,
                                                    "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = GO.MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write("# genes in GO category %s\n" %
                                     options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % (x, "fg"))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % (x, "bg"))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % (x, "ng"))

                E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng)))

                E.stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results, foreground,
                                                       background, options,
                                                       test_ontology, gene2go,
                                                       go2info)
                for x, v in enumerate(pairs):
                    v[1].mQValue = fdrs[v[0]][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            #############################################################
            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            #############################################################
            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write("mapped_categories\t%i\tmapped categories\n" %
                          ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n"
                % (len(go_results.mSampleGenes)))
            outfile.write("genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n"
                % (len(go_results.mBackgroundGenes)))
            outfile.write("associations_in_fg\t%i\tassociations in sample\n" %
                          go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n"
                % (IOTools.pretty_percent(len(go_results.mSampleGenes),
                                          len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n"
                % (IOTools.pretty_percent(len(go_results.mBackgroundGenes),
                                          nbackground, "%5.2f")))
            outfile.write("significant\t%i\tsignificant results reported\n" %
                          nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n"
                % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant up-regulated results reported\n"
                % nselected_down)
            outfile.write("threshold\t%6.4f\tsignificance threshold\n" %
                          options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(
                map(str, (genelist_name, test_ontology, nselected,
                          options.threshold, ngenes, ncategories, nmaps,
                          len(foreground), len(go_results.mSampleGenes),
                          nbackground, len(go_results.mBackgroundGenes),
                          go_results.mSampleCountsTotal,
                          go_results.mBackgroundCountsTotal,
                          IOTools.pretty_percent(len(go_results.mSampleGenes),
                                                 len(foreground), "%5.2f"),
                          IOTools.pretty_percent(
                              len(go_results.mBackgroundGenes), nbackground,
                              "%5.2f"), ",".join(msgs)))) + "\n")

            #############################################################
            #############################################################
            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files
            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology, go2info, options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.stop()