def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tables:

    mapping_stats
    bam_stats
    '''

    tablename = P.toTable(outfile)
    # cannot create views across multiple databases, so use a table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    '''

    Database.executewait(dbhandle, statement % locals())
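
A minimal usage sketch, assuming the CGAT pipeline conventions above (P.toTable derives the table name from the output file); the filenames are illustrative only:

createViewMapping("bam_stats.load", "view_mapping.load")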
Example #3
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        statement = '''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | python %(scriptsdir)s/csv2db.py
        %(csv2db_options)s
        --table=%(tablename)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty')
Example #4
def loadCodingPotential(infile, outfile):
    '''load annotations'''

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s 
    | python %(scriptsdir)s/csv2db.py 
              %(csv2db_options)s 
              --allow-empty
              --index=gene_id 
              --map=gene_id:str 
              --table=%(table)s 
    > %(outfile)s'''

    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database"])
    Database.executewait(
        dbhandle,
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle,
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals())
    dbhandle.commit()
Example #5
def numberGenesDetectedFeatureCounts(infile, outfile):
    '''Count the number of genes detected by featureCounts at counts > 0 in each sample.'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()
    statement = '''select distinct h.*, gene_biotype from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id=i.gene_id
               ''' % locals()

    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)

    grouped_df = melted_df.groupby(["gene_biotype", "track"])
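
    # for each biotype/track group, count the genes with counts > 0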

    agg_df = grouped_df.agg(
        {"counts": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df,
                              index="track",
                              values="counts",
                              columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
Example #7
def ReadGene2GOFromDatabase(dbhandle, go_type, database, species):
    """read go assignments from ensembl database.

    returns a dictionary of lists.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information

    Note: assumes that external_db_id for GO is 1000
    """

    statement = GetGOStatement(go_type, database, species)
    result = Database.executewait(dbhandle, statement,
                                  retries=0).fetchall()

    gene2go = {}
    go2info = collections.defaultdict(GOInfo)
    for gene_id, goid, description, evidence in result:
        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if gene_id not in gene2go:
            gene2go[gene_id] = []
        gene2go[gene_id].append(gm)
        go2info[goid] = gi

    return gene2go, go2info
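
A minimal usage sketch, assuming a connected database handle and the GetGOStatement helper; the database and species values mirror the defaults used in the main() example further down:

gene2go, go2info = ReadGene2GOFromDatabase(
    dbhandle, "biol_process", "ensembl_mart_31", "dmelanogaster")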
Example #8
def numberGenesDetectedCufflinks(infile, outfile):
    '''Count the number of genes detected at copy number > 0 in each sample.'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct c.*, gene_biotype from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id=i.gene_id
                ''' % locals()

    df = DB.fetch_DataFrame(statement, DATABASE, attach)

    # snip off the cufflinks replicate field
    df.columns = [x[:-len("_0")] if x.endswith("_0") else x
                  for x in df.columns]

    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    grouped_df = melted_df.groupby(["gene_biotype", "variable"])

    agg_df = grouped_df.agg({"value": lambda x:
                             np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="variable",
                              values="value", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
Example #10
def importCodingPotential(infile, outfile):
    '''import annotations'''

    table = outfile[:-len(".import")]

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --allow-empty
              --index=gene_id
              --map=gene_id:str
              --table=%(table)s
    < %(infile)s
    > %(outfile)s'''

    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database"])
    Database.executewait(
        dbhandle,
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle,
        '''UPDATE %(table)s SET is_coding =
           (f_iscoding == 'coding') OR (r_iscoding == 'coding')''' % locals())
    dbhandle.commit()
def mergeEffectsPerGene(infile, outfile):
    '''summarize effects on a per-gene level.'''

    tablename = outfile[:-len(".load")]

    dbhandle = connect()
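
    # NMD (nonsense-mediated decay) heuristic: a transcript is counted
    # as a knockout/affected candidate when its premature stop codon
    # falls upstream of the last exon junction, the classical NMD trigger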

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT 
           track,
           gene_id, 
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i, effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id, track
    ''' % locals()
    
    Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % locals() )
    Database.executewait( dbhandle, statement )
    Database.executewait( dbhandle, "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" % locals())
    dbhandle.commit()

    P.touch(outfile)
Example #13
def DumpGOFromDatabase(outfile,
                       dbhandle,
                       options):
    """read go assignments from database.

    and dump them into a flatfile.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information
    """

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.defaultdict(int)
    all_categories = collections.defaultdict(int)
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:

        genes = collections.defaultdict(int)
        categories = collections.defaultdict(int)
        ntotal = 0
        statement = GetGOStatement(go_type, options.database_name,
                                   options.species)

        results = Database.executewait(
            dbhandle, statement, retries=0).fetchall()

        for result in results:
            outfile.write("\t".join(map(str, (go_type,) + result)) + "\n")
            gene_id, goid, description, evidence = result
            genes[gene_id] += 1
            categories[goid] += 1
            ntotal += 1
            all_genes[gene_id] += 1
            all_categories[goid] += 1
            all_ntotal += 1

        E.info("%s\t%i\t%i\t%i" % (go_type, ntotal,
                                   len(genes),
                                   len(categories)))

    E.info("%s\t%i\t%i\t%i" % ("all",
                               all_ntotal,
                               len(all_genes),
                               len(all_categories)))

    return
Example #15
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: analyze_go.py 309 2005-12-01 15:50:26Z andreas $")

    dbhandle = Database.Database()

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use.")

    parser.add_option("-p",
                      "--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to use for temporary files.")

    parser.set_defaults(species="dmelanogaster",
                        database="ensembl_mart_31",
                        prefix="dm_go_")

    (options, args) = E.Start(parser, add_mysql_options=True)

    dbhandle.Connect(options)

    WriteBackground("biol_process", options, "bp")
    WriteBackground("cell_location", options, "lm")
    WriteBackground("mol_function", options, "fm")
def summarizeEffectsPerGene(infile, outfile):
    '''summarize effects on a per-gene level.'''

    tablename = outfile[:-len(".load")]
    track = infile[:-len("_effects.load")]

    dbhandle = connect()

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT 
           gene_id, 
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i,
         %(track)s_effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id
    ''' % locals()

    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %(tablename)s" % locals())
    Database.executewait(dbhandle, statement)
    Database.executewait(
        dbhandle,
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" %
        locals())
    dbhandle.commit()

    P.touch(outfile)
Example #18
def buildExpressionStats(
        dbhandle,
        outfile,
        tablenames,
        outdir,
        regex_table="(?P<design>[^_]+)_"
        "(?P<geneset>[^_]+)_"
        "(?P<counting_method>[^_]+)_"
        "(?P<method>[^_]+)_"
        "(?P<level>[^_]+)_diff"):
    """compile expression summary statistics from database.

    This method outputs a table with the number of genes tested,
    failed, differentially expressed, etc. for a series of DE tests.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    tablenames : list
        List of tables to process.
    outfile : string
        Output filename in :term:`tsv` format.
    outdir : string
        Output directory for diagnostic plots.
    regex_table : string
        Regular expression to extract experimental information
        from table name.
    """

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(
        ("design",
         "geneset",
         "level",
         "counting_method",
         "treatment_name",
         "control_name",
         "tested",
         "\t".join(["status_%s" % x for x in keys_status]),
         "significant",
         "twofold")) + "\n")

    for tablename in tablenames:
        r = re.search(regex_table, tablename)
        if r is None:
            raise ValueError(
                "can't match tablename '%s' to regex" % tablename)
        geneset = r.group("geneset")
        design = r.group("design")
        level = r.group("level")
        counting_method = r.group("counting_method")
        geneset = r.group("geneset")

        def toDict(vals, l=2):
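            # turn rows of (key1, ..., keyl, value) into a defaultdict
            # keyed by the first l fields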
            return collections.defaultdict(
                int,
                [(tuple(x[:l]), x[l]) for x in vals])

        tested = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())
        status = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, status, "
            "COUNT(*) FROM %(tablename)s "
            "GROUP BY treatment_name,control_name,status"
            % locals()).fetchall(), 3)
        signif = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "WHERE significant "
            "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())

        fold2 = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
            "GROUP BY treatment_name,control_name,significant"
            % locals()).fetchall())

        for treatment_name, control_name in tested.keys():
            outf.write("\t".join(map(str, (
                design,
                geneset,
                level,
                counting_method,
                treatment_name,
                control_name,
                tested[(treatment_name, control_name)],
                "\t".join(
                    [str(status[(treatment_name, control_name, x)])
                     for x in keys_status]),
                signif[(treatment_name, control_name)],
                fold2[(treatment_name, control_name)]))) + "\n")

        # plot length versus P-Value
        data = Database.executewait(
            dbhandle,
            "SELECT i.sum, pvalue "
            "FROM %(tablename)s, "
            "%(geneset)s_geneinfo as i "
            "WHERE i.gene_id = test_id AND "
            "significant" % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = ("%(outdir)s/%(design)s_%(geneset)s_%(level)s"
                       "_pvalue_vs_length.png") % locals()
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)

            R['dev.off']()

    outf.close()
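
For illustration with the default regex_table: a (hypothetical) table named "design1_refcoding_featurecounts_deseq_gene_diff" would parse into design="design1", geneset="refcoding", counting_method="featurecounts", method="deseq" and level="gene".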
Example #20
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This function parses and loads the results of a cuffdiff differential
    expression analysis.
    Parsing is performed by the parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)

    # ignore promoters and splicing - no fold change column, but sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):

        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)

        results = parseCuffdiff(infile, min_fpkm=min_fpkm)
        Expression.writeExpressionResults(tmpname, results)
        P.load(tmpname, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=treatment_name "
               "--add-index=control_name "
               "--add-index=test_id")

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"
        infile = os.path.join(indir, fn)

        P.load(infile, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        is_first = True
        for line in inf:

            if is_first:
                is_first = False
                continue

            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block
            # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no CDS has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to denovo predicted genesets; in the meantime just skip if the
        # cds tracking file is empty

        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes.iterkeys():
            outf.write(gene + "\t")
            s = 0
            while s < len(samples) - 1:
                outf.write(genes[gene][samples[s]] + "\t")
                s += 1

            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        P.load(tmpf,
               outfile,
               tablename=tablename,
               options="--allow-empty-file "
               " --add-index=gene_id")

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
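
The ln-to-log2 conversion mentioned in the docstring is performed inside parseCuffdiff; mathematically it is just a division by ln(2). A minimal sketch (an illustrative helper, not the pipeline's actual function):

import math

def ln_to_log2(ln_fold_change):
    # log2(x) = ln(x) / ln(2)
    return ln_fold_change / math.log(2)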
Example #21
def createView(dbhandle, tables, tablename, outfile,
               view_type="TABLE",
               ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    '''

    Database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle,
            "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append(
            [x.lower() for x in Database.getColumnNames(dbhandle, table)
             if x != track])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))
    if min(tracks) != max(tracks):
        raise ValueError(
            "number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join(
        ["t0.%s = t%i.%s" % (f, x + 1, y[1])
         for x, y in enumerate(tables[1:])])

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)
    statement = '''
    CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    WHERE %(where_statement)s
    ''' % locals()

    Database.executewait(dbhandle, statement)

    nrows = Database.executewait(
        dbhandle,
        "SELECT COUNT(*) FROM %(tablename)s" % locals()).fetchone()[0]

    if nrows == 0:
        raise ValueError(
            "empty %s, check statement = %s" % (tablename, statement))
    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))

    E.info("created %s with %i rows" % (tablename, nrows))
    touchFile(outfile)
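
A minimal usage sketch; the table names are illustrative, and each tuple pairs a table with its join field as described in the docstring:

tables = (("bam_stats", "track"),
          ("context_stats", "track"))
createView(dbhandle, tables,
           tablename="view_mapping",
           outfile="view_mapping.log",
           view_type="TABLE")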
Example #22
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--mappability-file",
                      dest="mappability",
                      type="string",
                      help="Bigwig file with mappability")
    parser.add_option("-d",
                      "--database",
                      dest="db",
                      type="string",
                      help="Database containing intron chunck table")
    parser.add_option("-l",
                      "--read-length",
                      dest="rlen",
                      type="int",
                      default=50,
                      help="Read length")
    parser.add_option("-o",
                      "--overlap-length",
                      dest="olen",
                      type="int",
                      default=10,
                      help="Min overlap before read is counted")
    parser.add_option("-M",
                      "--multimap",
                      dest="mm",
                      action="store_true",
                      default=False,
                      help="Allow multimapping")
    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    introns = Database.fetch(
        """SELECT gene_id, exon_id
           FROM reference_chunks_introns
           WHERE intron > 0""",
        Database.connect(options.db))

    introns = [tuple(x) for x in introns]

    mappability = pyBigWig.open(options.mappability)
    options.stdout.write("\t".join(["gene_id", "exon_id", "efflen"]) + "\n")
    for exon in GTF.iterator(options.stdin):

        if (unicode(exon.gene_id), int(exon.exon_id)) not in introns:
            continue

        vals = mappability.values(
            exon.contig,
            int(exon.start) - (options.rlen - options.olen),
            int(exon.end) - options.olen)
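
        # vals covers every read start position that can overlap the
        # exon by at least olen bases; with --multimap the effective
        # length is the summed mappability, otherwise the count of
        # uniquely mappable start positions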
        if options.mm:
            eff_len = sum(vals)
        else:
            eff_len = (int(exon.end) - int(exon.start)
                       + options.rlen
                       - 2 * options.olen
                       - len([x for x in vals if x < 1]))

        options.stdout.write(
            "\t".join([exon.gene_id, exon.exon_id,
                       str(eff_len)]) + "\n")

    # write footer and output benchmark information.
    E.Stop()
Example #23
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement)
    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement)

    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve maximumn of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
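    # (in ENCODE narrowPeak format, columns 7, 8 and 9 hold
    # signalValue, -log10(pvalue) and -log10(qvalue), respectively)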
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s don't know which column"
                         " to sort on" % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
Example #25
def qcSummary(infiles, outfile):
    '''create a summary table of relevant QC metrics'''

    # Some QC metrics are specific to paired end data
    if PAIRED:
        exclude = []
        paired_columns = '''READ_PAIRS_EXAMINED as no_pairs,
                              PERCENT_DUPLICATION as pct_duplication,
                              ESTIMATED_LIBRARY_SIZE as library_size,
                              PCT_READS_ALIGNED_IN_PAIRS
                                       as pct_reads_aligned_in_pairs,
                              MEDIAN_INSERT_SIZE
                                       as median_insert_size,
                           '''
        pcat = "PAIR"

    else:
        exclude = ["qc_library_complexity", "qc_insert_size_metrics"]
        paired_columns = ''
        pcat = "UNPAIRED"

    tables = [P.toTable(x) for x in infiles if P.toTable(x) not in exclude]

    t1 = tables[0]

    name_fields = PARAMS["name_field_titles"].strip()

    stat_start = '''select distinct %(name_fields)s,
                                    sample_information.sample_id,
                                    fraction_spliced,
                                    fraction_spike,
                                    qc_no_genes_cufflinks.protein_coding
                                       as cufflinks_no_genes_pc,
                                    qc_no_genes_cufflinks.total
                                       as cufflinks_no_genes,
                                    qc_no_genes_featurecounts.protein_coding
                                       as featurecounts_no_genes_pc,
                                    qc_no_genes_featurecounts.total
                                       as featurecounts_no_genes,
                                    three_prime_bias
                                       as three_prime_bias,
                                    nreads_uniq_map_genome,
                                    nreads_uniq_map_spike,
                                    %(paired_columns)s
                                    PCT_MRNA_BASES
                                       as pct_mrna,
                                    PCT_CODING_BASES
                                       as pct_coding,
                                    PCT_PF_READS_ALIGNED
                                       as pct_reads_aligned,
                                    TOTAL_READS
                                       as total_reads,
                                    PCT_ADAPTER
                                       as pct_adapter,
                                    PF_HQ_ALIGNED_READS*1.0/PF_READS
                                       as pct_pf_reads_aligned_hq
                   from %(t1)s
                ''' % locals()

    join_stat = ""
    for table in tables[1:]:
        join_stat += "left join " + table + "\n"
        join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

    where_stat = '''where qc_alignment_summary_metrics.CATEGORY="%(pcat)s"
                 ''' % locals()

    statement = "\n".join([stat_start, join_stat, where_stat])

    df = DB.fetch_DataFrame(statement, PARAMS["database_name"])
    df.to_csv(outfile, sep="\t", index=False)
def loadSummary( infile, outfile ):
    '''load several rates into a single convenience table.
    '''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]

    tablename = "%s_evol" % track

    if os.path.exists( "%s_rates.load" % track ):
        stmt_select.append( "a.distance AS ks, a.aligned AS aligned" )
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
                     ON r.gene_id = a.gene_id AND 
                     a.aligned >= %(rates_min_aligned)i AND 
                     a.distance <= %(rates_max_rate)f''' )

    if os.path.exists( "%s_coverage.load" % track ):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage" )
        stmt_from.append("LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id" )

    if os.path.exists( "%s_repeats_gc.load" % track ):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc" )
        stmt_from.append("LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id" )

    if os.path.exists( "%s_repeats_rates.load" % track ):
        stmt_select.append("ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska" )
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar 
                     ON r.gene_id = ar.gene_id AND 
                     ar.exons_nval >= %(rates_min_repeats)i''' )

    if os.path.exists( "%s_introns_rates.load" % track ):
        stmt_select.append("ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski" )
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir 
                            ON r.gene_id = ir.gene_id AND 
                            ir.aligned >= %(rates_min_aligned)i''' )

    x = locals()
    x.update( PARAMS )
    stmt_select = ", ".join( stmt_select ) % x 
    stmt_from = " ".join( stmt_from ) % x
    stmt_where = " AND ".join( stmt_where ) % x

    dbhandle = sqlite3.connect( PARAMS["database"] )

    Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals() )

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
        CAST(r.gene_id AS TEXT) AS gene_id,
        r.exons_sum as length,
        r.exons_pGC as pgc,
        %(stmt_select)s
    FROM
        %(track)s_annotation AS r
        %(stmt_from)s
    WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait( dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    Creates also diagnostic plots in

    <exportdir>/<method> directory.
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    def _split(tablename):
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        try:
            design, geneset, counting_method = re.match(
                "([^_]+)_vs_([^_]+)_(.*)_%s" % method,
                tablename).groups()
        except AttributeError:
            try:
                design, geneset = re.match(
                    "([^_]+)_([^_]+)_%s" % method,
                    tablename).groups()
                counting_method = "na"
            except AttributeError:
                raise ValueError("can't parse tablename %s" % tablename)

        return design, geneset, counting_method

        # return re.match("([^_]+)_", tablename ).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(
        ("design",
         "geneset",
         "level",
         "treatment_name",
         "counting_method",
         "control_name",
         "tested",
         "\t".join(["status_%s" % x for x in keys_status]),
         "significant",
         "twofold")) + "\n")

    all_tables = set(Database.getTables(dbhandle))

    for level in CUFFDIFF_LEVELS:

        for tablename in tables:

            tablename_diff = "%s_%s_diff" % (tablename, level)
            tablename_levels = "%s_%s_levels" % (tablename, level)
            design, geneset, counting_method = _split(tablename_diff)
            if tablename_diff not in all_tables:
                continue

            def toDict(vals, l=2):
                return collections.defaultdict(
                    int,
                    [(tuple(x[:l]), x[l]) for x in vals])
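            # e.g. toDict([("treated", "control", 42)]) returns
            # defaultdict(int, {("treated", "control"): 42}); missing
            # (treatment, control) keys therefore count as zero below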

            tested = toDict(
                Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name" % locals()
                    ).fetchall())
            status = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, status, "
                "COUNT(*) FROM %(tablename_diff)s "
                "GROUP BY treatment_name,control_name,status"
                % locals()).fetchall(), 3)
            signif = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename_diff)s "
                "WHERE significant "
                "GROUP BY treatment_name,control_name" % locals()
                ).fetchall())

            fold2 = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename_diff)s "
                "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                "GROUP BY treatment_name,control_name,significant"
                % locals()).fetchall())

            for treatment_name, control_name in tested.keys():
                outf.write("\t".join(map(str, (
                    design,
                    geneset,
                    level,
                    counting_method,
                    treatment_name,
                    control_name,
                    tested[(treatment_name, control_name)],
                    "\t".join(
                        [str(status[(treatment_name, control_name, x)])
                         for x in keys_status]),
                    signif[(treatment_name, control_name)],
                    fold2[(treatment_name, control_name)]))) + "\n")

            ###########################################
            ###########################################
            ###########################################
            # plot length versus P-Value
            data = Database.executewait(
                dbhandle,
                "SELECT i.sum, pvalue "
                "FROM %(tablename_diff)s, "
                "%(geneset)s_geneinfo as i "
                "WHERE i.gene_id = test_id AND "
                "significant" % locals()).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter fails
            if len(data) > 10:
                data = zip(*data)

                pngfile = "%(outdir)s/%(design)s_%(geneset)s_%(level)s_pvalue_vs_length.png" % locals()
                R.png(pngfile)
                R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                R.log10(ro.FloatVector(data[1])),
                                xlab='log10( length )',
                                ylab='log10( pvalue )',
                                log="x", pch=20, cex=.1)

                R['dev.off']()

    outf.close()
Example #30
0
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.
    '''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]

    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND 
        a.aligned >= %(rates_min_aligned)i AND 
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id"
        )

    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska"
        )
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar 
                     ON r.gene_id = ar.gene_id AND 
                     ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski"
        )
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir 
                            ON r.gene_id = ir.gene_id AND 
                            ir.aligned >= %(rates_min_aligned)i''')

    x = locals()
    x.update(PARAMS)
    stmt_select = ", ".join(stmt_select) % x
    stmt_from = " ".join(stmt_from) % x
    stmt_where = " AND ".join(stmt_where) % x

    dbhandle = sqlite3.connect(PARAMS["database"])

    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %(tablename)s " % locals())

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
    CAST(r.gene_id AS TEXT) AS gene_id,
    r.exons_sum as length,
    r.exons_pGC as pgc,
    %(stmt_select)s
    FROM
    %(track)s_annotation AS r
    %(stmt_from)s
        WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait(dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
Example #31
0
def loadCuffdiff(infile, outfile, min_fpkm=1.0):
    '''load results from differential expression analysis and produce
    summary plots.

    Note: converts from ln(fold change) to log2 fold change.

    The cuffdiff output is parsed.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    fpkm_silent) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    This requires the cummeRbund library to be present in R.

    '''
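    # the ln -> log2 conversion mentioned above presumably happens inside
    # parseCuffdiff (called below); mathematically:
    #   log2_fold_change = ln_fold_change / ln(2)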

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued

    dbhandle = sqlite3.connect(PARAMS["database"])

    tmpname = P.getTempFilename(".")

    # ignore promoters and splicing - no fold change column, but  sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):

        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)
        results = parseCuffdiff(infile,
                                min_fpkm=min_fpkm)

        Expression.writeExpressionResults(tmpname, results)

        statement = '''cat %(tmpname)s
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --allow-empty-file
              --add-index=treatment_name
              --add-index=control_name
              --add-index=test_id
              --table=%(tablename)s
         >> %(outfile)s.log
         '''

        P.run()

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"

        statement = '''zcat %(indir)s/%(fn)s
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --allow-empty-file
              --add-index=tracking_id
              --table=%(tablename)s
         >> %(outfile)s.log
         '''

        P.run()

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        x = 0
        for line in inf:
            if x == 0:
                x += 1
                continue
            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block
        # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no CDS has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to de novo predicted genesets; in the meantime just skip if
        # the cds tracking file is empty

        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes.iterkeys():
            outf.write(gene + "\t")
            x = 0
            while x < len(samples) - 1:
                outf.write(genes[gene][samples[x]] + "\t")
                x += 1

            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        statement = ("cat %(tmpf)s |"
                     " python %(scriptsdir)s/csv2db.py "
                     "  %(csv2db_options)s"
                     "  --allow-empty-file"
                     "  --add-index=gene_id"
                     "  --table=%(tablename)s"
                     " >> %(outfile)s.log")
        P.run()

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
Example #32
0
def buildDMRStats( tables, method, outfile ):
    '''build dmr summary statistics.
    
    Creates some diagnostic plots in

    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    '''

    dbhandle = sqlite3.connect( PARAMS["database"] )

    def togeneset( tablename ):
        return re.match("([^_]+)_", tablename ).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile( outfile, "w" )
    outf.write( "\t".join( ("tileset", "design", "track1", "track2", "tested",
                            "\t".join( [ "status_%s" % x for x in keys_status ] ),
                            "significant",
                            "up", "down",
                            "twofold",
                            "twofold_up", "twofold_down",
                            ) ) + "\n" )

    all_tables = set(Database.getTables( dbhandle ))
    outdir = os.path.join( PARAMS["exportdir"], "diff_methylation" )

    for tablename in tables:

        prefix = P.snip( tablename, "_%s" % method )
        tileset, design = prefix.split("_")

        def toDict( vals, l = 2 ):
            return collections.defaultdict( int, [ (tuple( x[:l]), x[l]) for x in vals ] )

        E.info( "collecting data from %s" % tablename )
        
        tested = toDict( Database.executewait( dbhandle,
                                               """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                GROUP BY treatment_name,control_name""" % locals() ).fetchall() )
        status = toDict( Database.executewait( dbhandle,
                                               """SELECT treatment_name, control_name, status, COUNT(*) FROM %(tablename)s 
                                GROUP BY treatment_name,control_name,status""" % locals() ).fetchall(), 3 )
        signif = toDict( Database.executewait( dbhandle,
                                               """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE significant
                                GROUP BY treatment_name,control_name""" % locals() ).fetchall() )
        fold2 = toDict( Database.executewait( dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE (l2fold >= 1 or l2fold <= -1) AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )

        up = toDict( Database.executewait( dbhandle,
                                                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold > 0 AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )

        down = toDict( Database.executewait( dbhandle,
                                             """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold < 0 AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )

        fold2up = toDict( Database.executewait( dbhandle,
                                           """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold > 1 AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )

        fold2down = toDict( Database.executewait( dbhandle,
                                             """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold < -1 AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )
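
        # the l2fold cutoffs above are in log2 space: |l2fold| >= 1 means
        # at least a two-fold change, l2fold > 1 at least two-fold up,
        # l2fold < -1 at least two-fold down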
        
        groups = tested.keys()

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write("\t".join(map(str, (
                tileset,
                design,
                treatment_name,
                control_name,
                tested[k],
                "\t".join([str(status[(treatment_name, control_name, x)])
                           for x in keys_status]),
                signif[k],
                up[k], down[k],
                fold2[k],
                fold2up[k], fold2down[k]))) + "\n")

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait( dbhandle, 
                                     '''SELECT end - start, pvalue 
                             FROM %(tablename)s
                             WHERE significant'''% locals() ).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals()
            R.png( pngfile )
            R.smoothScatter( R.log10( ro.FloatVector(data[0]) ),
                             R.log10( ro.FloatVector(data[1]) ),
                             xlab = 'log10( length )',
                             ylab = 'log10( pvalue )',
                             log="x", pch=20, cex=.1 )

            R['dev.off']()

    outf.close()
Example #33
0
def main():

    parser = E.OptionParser(
        version="%prog version: $Id: GO.py 2883 2010-04-07 08:46:22Z andreas $",
        usage=globals()["__doc__"])

    dbhandle = Database.Database()

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use [default=%default].")

    parser.add_option(
        "-i",
        "--slims",
        dest="filename_slims",
        type="string",
        help="filename with GO SLIM categories [default=%default].")

    parser.add_option(
        "-g",
        "--genes",
        dest="filename_genes",
        type="string",
        help="filename with genes to analyse [default=%default].")

    parser.add_option(
        "-b",
        "--background",
        dest="filename_background",
        type="string",
        help="filename with background genes to analyse [default=%default].")

    parser.add_option(
        "-m",
        "--minimum-counts",
        dest="minimum_counts",
        type="int",
        help=
        "minimum count - ignore all categories that have fewer than # number of genes"
        " [default=%default].")

    parser.add_option("-o",
                      "--sort-order",
                      dest="sort_order",
                      type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option(
        "--ontology",
        dest="ontology",
        type="string",
        action="append",
        help="go ontologies to analyze. Ontologies are tested separately."
        " [default=%default].")

    parser.add_option(
        "-t",
        "--threshold",
        dest="threshold",
        type="float",
        help=
        "significance threshold [>1.0 = all ]. If --fdr is set, this refers to the fdr, otherwise it is a cutoff for p-values."
    )

    parser.add_option(
        "--filename-dump",
        dest="filename_dump",
        type="string",
        help="dump GO category assignments into a flatfile [default=%default]."
    )

    parser.add_option(
        "--filename-gene2name",
        dest="filename_gene2name",
        type="string",
        help=
        "optional filename mapping gene identifiers to gene names [default=%default]."
    )

    parser.add_option(
        "--filename-ontology",
        dest="filename_ontology",
        type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option(
        "--filename-input",
        dest="filename_input",
        type="string",
        help="read GO category assignments from a flatfile [default=%default]."
    )

    parser.add_option("--sample-size",
                      dest="sample",
                      type="int",
                      help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help=
        "pattern with output filename pattern (should contain: %(go)s and %(section)s ) [default=%default]"
    )

    parser.add_option(
        "--fdr",
        dest="fdr",
        action="store_true",
        help=
        "calculate and filter by FDR [ReadGene2GOFromFiledefault=%default].")

    parser.add_option(
        "--go2goslim",
        dest="go2goslim",
        action="store_true",
        help=
        "convert go assignments in STDIN to goslim assignments and write to STDOUT [default=%default]."
    )

    parser.add_option(
        "--gene-pattern",
        dest="gene_pattern",
        type="string",
        help=
        "pattern to transform identifiers to GO gene names [default=%default]."
    )

    parser.add_option(
        "--filename-map-slims",
        dest="filename_map_slims",
        type="string",
        help=
        "write mapping between GO categories and GOSlims [default=%default].")

    parser.add_option(
        "--get-genes",
        dest="get_genes",
        type="string",
        help="list all genes in the with a certain GOID [default=%default].")

    parser.add_option(
        "--strict",
        dest="strict",
        action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background [default=%default]."
    )

    parser.add_option(
        "-q",
        "--qvalue-method",
        dest="qvalue_method",
        type="choice",
        choices=("empirical", "storey", "BH"),
        help=
        "method to perform multiple testing correction by controlling the fdr [default=%default]."
    )

    parser.add_option(
        "--pairwise",
        dest="compute_pairwise",
        action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    #                    help="fdr computation: method for estimating pi0 [default=%default]."  )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    if options.go2goslim:
        convertGo2Goslim(options)
        E.Stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    ## dump GO
    if options.filename_dump:
        # set default GO ontologies
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"
            ]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle.Connect(options)

        outfile = IOTools.openFile(options.filename_dump, "w", create_dir=True)
        DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.Stop()
        sys.exit(0)

    #############################################################
    ## read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.openFile(options.filename_input)
        gene2gos, go2infos = ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.openFile(options.filename_gene2name)
        gene2name = IOTools.readMap(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        gene2name = None

    #############################################################
    ## read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.openFile(options.filename_ontology)
        ontology = readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GOInfo)

        go2infos = collections.defaultdict(_g)

        ## substitute go2infos
        for go in ontology.values():
            go2infos[go.mNameSpace][go.mId] = GOInfo(go.mId,
                                                     go_type=go.mNameSpace,
                                                     description=go.mName)

    #############################################################
    ## get foreground gene list
    input_foreground, genelists = ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for forground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    ## get background
    if options.filename_background:

        # nick - bug fix: background is the first tuple element from ReadGeneLists
        input_background = ReadGeneLists(options.filename_background,
                                         gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    ## sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = gene2gos.keys()

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join(
        ("genelist", "ontology", "significant", "threshold", "ngenes",
         "ncategories", "nmaps", "nforegound", "nforeground_mapped",
         "nbackground", "nbackground_mapped", "nsample_counts",
         "nbackground_counts", "psample_assignments",
         "pbackground_assignments")) + "\n")

    #############################################################
    ## get go categories for genes
    for test_ontology in options.ontology:

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)
        #############################################################
        ## get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle.Connect(options)
            gene2go, go2info = ReadGene2GOFromDatabase(dbhandle, test_ontology,
                                                       options.database,
                                                       options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn(
                "could not find information for terms - could be mismatch between ontologies"
            )

        ngenes, ncategories, nmaps, counts_per_category = CountGO(gene2go)
        E.info(
            "assignments found: %i genes mapped to %i categories (%i maps)" %
            (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([
                x for x, y in counts_per_category.iteritems()
                if y < options.minimum_counts
            ])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = CountGO(gene2go)
            E.info(
                "assignments after filtering: %i genes mapped to %i categories (%i maps)"
                % (ngenes, ncategories, nmaps))

        for genelist_name, foreground in genelists.iteritems():

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            ## build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug fix: background included the foreground in a tuple;
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in background "
                           "- added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)
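
            # i.e. unless --strict is set, the effective background is
            # always a superset of the foreground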

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            #############################################################
            ## sanity checks:
            ## are all of the foreground genes in the dataset
            ## missing = set(genes).difference( set(gene2go.keys()) )
            ## assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            ## read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GetGOSlims(
                    IOTools.openFile(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in go_slims.values():
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write( "# read go slims from %s: go=%i, slim=%i\n" %\
                                              ( options.filename_slims,
                                                len(go_slims),
                                                len( v ) ))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.openFile(options.filename_map_slims,
                                                   "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in go_slims.items():
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = CountGO(
                        gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to %i categories (%i maps)\n"
                        % (ngenes, ncategories, nmaps))

            #############################################################
            ## Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in gene2go.items():
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                ## skip to next GO class
                if not (bg or ng): continue

                options.stdout.write("# genes in GO category %s\n" %
                                     options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in fg:
                    options.stdout.write("%s\t%s\n" % ("fg", x))
                for x in bg:
                    options.stdout.write("%s\t%s\n" % ("bg", x))
                for x in ng:
                    options.stdout.write("%s\t%s\n" % ("ng", x))

                E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng)))

                E.Stop()
                sys.exit(0)

            #############################################################
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='foreground',
                                  set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='background',
                                  set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background[0]))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            ## do the analysis
            go_results = AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = go_results.mResults.items()

            #############################################################
            ## calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = computeFDRs(go_results, foreground,
                                                    background, options,
                                                    test_ontology, gene2go,
                                                    go2info)
                for x, v in enumerate(pairs):
                    v[1].mQValue = fdrs[v[0]][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(lambda x, y: cmp(x[1].mQValue, y[1].mQValue))
            elif options.sort_order == "ratio":
                pairs.sort(lambda x, y: cmp(x[1].mRatio, y[1].mRatio))
            elif options.sort_order == "pvalue":
                pairs.sort(lambda x, y: cmp(x[1].mPValue, y[1].mPValue))

            #############################################################
            #############################################################
            #############################################################
            ## output the full result
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='overall',
                                  set=genelist_name)

            outputResults(outfile,
                          pairs,
                          go2info,
                          options,
                          fdrs=fdrs,
                          samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='results',
                                  set=genelist_name)

            outputResults(outfile,
                          filtered_pairs,
                          go2info,
                          options,
                          fdrs=fdrs,
                          samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            #############################################################
            #############################################################
            ## output parameters
            ngenes, ncategories, nmaps, counts_per_category = CountGO(gene2go)

            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='parameters',
                                  set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write("mapped_categories\t%i\tmapped categories\n" %
                          ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n"
                % (len(go_results.mSampleGenes)))
            outfile.write("genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n"
                % (len(go_results.mBackgroundGenes)))
            outfile.write("associations_in_fg\t%i\tassociations in sample\n" %
                          go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n"
                % (IOTools.prettyPercent(len(go_results.mSampleGenes),
                                         len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes in background with GO assignments\n"
                % (IOTools.prettyPercent(len(go_results.mBackgroundGenes),
                                         nbackground, "%5.2f")))
            outfile.write("significant\t%i\tsignificant results reported\n" %
                          nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n"
                % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant down-regulated results reported\n"
                % nselected_down)
            outfile.write("threshold\t%6.4f\tsignificance threshold\n" %
                          options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append( "\t".join( map(str, ( \
                                                genelist_name,
                                                  test_ontology,
                                                  nselected,
                                                  options.threshold,
                                                  ngenes,
                                                  ncategories,
                                                  nmaps,
                                                  len(foreground),
                                                  len(go_results.mSampleGenes),
                                                  nbackground,
                                                  len(go_results.mBackgroundGenes),
                                                  go_results.mSampleCountsTotal,
                                                  go_results.mBackgroundCountsTotal,
                                                  IOTools.prettyPercent( len(go_results.mSampleGenes) , len(foreground), "%5.2f" ),
                                                  IOTools.prettyPercent( len(go_results.mBackgroundGenes), nbackground, "%5.2f" ),
                                                  ",".join( msgs) ) ) ) + "\n" )

            #############################################################
            #############################################################
            #############################################################
            ## output the fg patterns
            outfile = getFileName(options,
                                  go=test_ontology,
                                  section='withgenes',
                                  set=genelist_name)

            outputResults(outfile,
                          pairs,
                          go2info,
                          options,
                          fdrs=fdrs,
                          samples=samples,
                          gene2go=gene2go,
                          foreground=foreground,
                          gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ######################################################################
            ######################################################################
            ######################################################################
            ## output various summary files
            ## significant results
            outputMultipleGeneListResults(all_significant_results,
                                          all_genelists_with_results,
                                          test_ontology,
                                          go2info,
                                          options,
                                          section='significant')

            ## all results
            outputMultipleGeneListResults(all_results,
                                          all_genelists_with_results,
                                          test_ontology,
                                          go2info,
                                          options,
                                          section='all')

            if options.compute_pairwise:
                pairwiseGOEnrichment(all_results, all_genelists_with_results,
                                     test_ontology, go2info, options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.Stop()
Example #34
0
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])

    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])

    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve maximumn of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)
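    # nPeaks (largest inter-replicate IDR count) drives the conservative
    # peak set below; nPeaks_max (its maximum with the
    # pooled-pseudoreplicate count) drives the optimal set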

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s don't know which column"
                         " to sort on" % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
Example #35
0
def importLincRNA( infile, outfile ):
    '''build a linc RNA set.

        * no coding potential
        * unknown and intergenic transcripts
        * no overlap with ``linc_exclude`` (usually: human refseq)
        * at least ``linc_min_length`` bp in length
        * at least ``linc_min_reads`` reads in transcript

    '''

    table = outfile[:-len(".import")]
    track = table[:-len("Linc")]

    dbhandle = sqlite3.connect( PARAMS["database"] )

    Database.executewait( dbhandle, '''DROP TABLE IF EXISTS %(table)s''' % locals())
    Database.executewait( dbhandle, '''CREATE TABLE %(table)s (gene_id TEXT)''' % locals())
    Database.executewait( dbhandle, '''CREATE INDEX %(table)s_index1 ON %(table)s (gene_id)''' % locals())

    joins, wheres = [], ["1"]

    if PARAMS["linc_min_reads"] > 0:
        joins.append( ", %(track)s_coverage as cov" % locals() )
        wheres.append( "cov.gene_id = m.gene_id2 AND cov.nmatches >= %(i)" % PARAMS["linc_min_reads"] )

    if PARAMS["linc_exclude"] > 0:
        joins.append( "LEFT JOIN %s_vs_%s_ovl as ovl on ovl.gene_id2 = a.gene_id" %\
                      (PARAMS["linc_exclude"], track ) )
        wheres.append( "ovl.gene_id1 IS NULL" )

    wheres = " AND ".join( wheres )
    joins = " ".join( joins )

    statement = '''INSERT INTO %(table)s 
                   SELECT DISTINCT(a.gene_id) FROM 
                          %(track)s_annotation as a
                          %(joins)s
                          LEFT JOIN %(track)s_coding AS c on c.gene_id = a.gene_id 
                    WHERE is_unknown 
                          AND is_intergenic 
                          AND exons_sum >= %(linc_min_length)i
                          AND (c.is_coding IS NULL or not c.is_coding) 
                          AND %(wheres)s
                     ''' % dict( PARAMS.items() + locals().items() )

    E.debug( "statement to build lincRNA: %s" % statement)

    Database.executewait( dbhandle, statement % locals())

    dbhandle.commit()
    
    cc = dbhandle.cursor()
    result = cc.execute("SELECT COUNT(*) FROM %(table)s" % locals() ).fetchall()[0][0]
    E.info( "build lincRNA set for %s: %i entries" % ( track, result ))

    outgtf = "%s.gtf.gz" % table

    E.info( "creating gtf file `%s`" % outgtf )
    # output gtf file
    statement = '''%(cmd-sql)s %(database)s "SELECT g.* FROM %(track)s_gtf as g, %(table)s AS t
                          WHERE t.gene_id = g.gene_id" 
                | python %(scriptsdir)s/gtf2tsv.py --invert --log=%(outfile)s
                | gzip
                > %(outgtf)s'''
    
    P.run()
Example #36
0
def qcSummary(infiles, outfile):
    '''create a summary table of relevant QC metrics'''

    # Some QC metrics are specific to paired end data
    if PAIRED:
        exclude = []
        paired_columns = '''READ_PAIRS_EXAMINED as no_pairs,
                              PERCENT_DUPLICATION as pct_duplication,
                              ESTIMATED_LIBRARY_SIZE as library_size,
                              PCT_READS_ALIGNED_IN_PAIRS
                                       as pct_reads_aligned_in_pairs,
                              MEDIAN_INSERT_SIZE
                                       as median_insert_size,
                           '''
        pcat = "PAIR"

    else:
        exclude = ["qc_library_complexity", "qc_insert_size_metrics"]
        paired_columns = ''
        pcat = "UNPAIRED"

    tables = [P.toTable(x) for x in infiles
              if P.toTable(x) not in exclude]

    t1 = tables[0]

    name_fields = PARAMS["name_field_titles"].strip()

    stat_start = '''select distinct %(name_fields)s,
                                    sample_information.sample_id,
                                    fraction_spliced,
                                    fraction_spike,
                                    qc_no_genes_cufflinks.protein_coding
                                       as cufflinks_no_genes_pc,
                                    qc_no_genes_cufflinks.total
                                       as cufflinks_no_genes,
                                    qc_no_genes_featurecounts.protein_coding
                                       as featurecounts_no_genes_pc,
                                    qc_no_genes_featurecounts.total
                                       as featurecounts_no_genes,
                                    three_prime_bias
                                       as three_prime_bias,
                                    nreads_uniq_map_genome,
                                    nreads_uniq_map_spike,
                                    %(paired_columns)s
                                    PCT_MRNA_BASES
                                       as pct_mrna,
                                    PCT_CODING_BASES
                                       as pct_coding,
                                    PCT_PF_READS_ALIGNED
                                       as pct_reads_aligned,
                                    TOTAL_READS
                                       as total_reads,
                                    PCT_ADAPTER
                                       as pct_adapter,
                                    PF_HQ_ALIGNED_READS*1.0/PF_READS
                                       as pct_pf_reads_aligned_hq
                   from %(t1)s
                ''' % locals()

    join_stat = ""
    for table in tables[1:]:
        join_stat += "left join " + table + "\n"
        join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

    where_stat = '''where qc_alignment_summary_metrics.CATEGORY="%(pcat)s"
                 ''' % locals()

    statement = "\n".join([stat_start, join_stat, where_stat])

    df = DB.fetch_DataFrame(statement, PARAMS["database_name"])
    df.to_csv(outfile, sep="\t", index=False)
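The joins built in qcSummary follow one pattern: every subsequent QC table is
left-joined back to the first table's sample_id. A standalone sketch with a
shortened, partly hypothetical table list (qc_alignment_summary_metrics and
qc_no_genes_cufflinks appear above; qc_rna_metrics is a placeholder):

tables_sketch = ["qc_alignment_summary_metrics",
                 "qc_rna_metrics",
                 "qc_no_genes_cufflinks"]
t1_sketch = tables_sketch[0]
join_sketch = ""
for table in tables_sketch[1:]:
    # chain each table back to the first table's sample_id
    join_sketch += "left join " + table + "\n"
    join_sketch += "on " + t1_sketch + ".sample_id=" + table + ".sample_id\n"
print(join_sketch)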
def buildCuffdiffPlots(infile, outfile):
    '''create summaries of cufflinks results (including some diagnostic plots)

    Plots are created in the <exportdir>/cuffdiff directory.

    Plots are:

    <geneset>_<method>_<level>_<track1>_vs_<track2>_significance.png
        fold change against expression level
    '''
    ###########################################
    ###########################################
    # create diagnostic plots
    ###########################################
    outdir = os.path.join(PARAMS["exportdir"], "cuffdiff")

    dbhandle = sqlite3.connect(PARAMS["database"])

    prefix = P.snip(infile, ".load")

    geneset, method = prefix.split("_")

    for level in CUFFDIFF_LEVELS:
        tablename_diff = prefix + "_%s_diff" % level
        tablename_levels = prefix + "_%s_levels" % level

        # note that the ordering of EXPERIMENTS and the _diff table
        # needs to be the same, as only one triangle of the pairwise
        # results is stored (see the sketch after this function).
        # Do not plot "undefined" lfold values (where treatment_mean
        # or control_mean = 0) and do not plot lfold values where the
        # confidence bounds contain 0.
        for track1, track2 in itertools.combinations(EXPERIMENTS, 2):
            statement = """
            SELECT CASE WHEN d.treatment_mean < d.control_mean
            THEN d.treatment_mean
            ELSE d.control_mean END,
            d.l2fold, d.significant
            FROM %(tablename_diff)s AS d
            WHERE treatment_name = '%(track1)s' AND
            control_name = '%(track2)s' AND
            status = 'OK' AND
            treatment_mean > 0 AND
            control_mean > 0
            """ % locals()

            data = zip(*Database.executewait(dbhandle, statement))

            pngfile = "%(outdir)s/%(geneset)s_%(method)s_%(level)s_%(track1)s_vs_%(track2)s_significance.png" % locals(
            )

            # ian: Bug fix: moved R.png to after data check so that no
            #     plot is started if there is no data this was leading
            #     to R falling over from too many open devices

            if len(data) == 0:
                E.warn("no plot for %s - %s -%s vs %s" %
                       (pngfile, level, track1, track2))
                continue

            R.png(pngfile)
            R.plot(ro.FloatVector(data[0]),
                   ro.FloatVector(data[1]),
                   xlab='min(FPKM)',
                   ylab='log2fold',
                   log="x",
                   pch=20,
                   cex=.1,
                   col=R.ifelse(ro.IntVector(data[2]), "red", "black"))

            R['dev.off']()

    P.touch(outfile)
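Because only one triangle of the pairwise comparisons is stored, the query in
buildCuffdiffPlots relies on itertools.combinations yielding each unordered
pair exactly once, in list order. A small illustration with hypothetical
experiment labels:

import itertools

experiments_sketch = ["expA", "expB", "expC"]
for track1, track2 in itertools.combinations(experiments_sketch, 2):
    # yields (expA, expB), (expA, expC), (expB, expC) - each pair once,
    # ordered as in the input list, matching how the _diff table is keyed
    print(track1, track2)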
Example #38
def buildDMRStats(tables, method, outfile, dbhandle):
    """build dmr summary statistics.

    This method counts the number of up/down, 2fold up/down, etc.
    genes in output from (:mod:`scripts/runExpression`).

    This method also creates diagnostic plots in the
    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables : list
        List of tables with DMR output
    method : string
        Method name
    outfile : string
        Output filename. A tab-separated file summarizing the DMR
        counts is written to this file.

    """

    def togeneset(tablename):
        return re.match("([^_]+)_", tablename).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "\t".join(
            (
                "tileset",
                "design",
                "track1",
                "track2",
                "tested",
                "\t".join(["status_%s" % x for x in keys_status]),
                "significant",
                "up",
                "down",
                "twofold",
                "twofold_up",
                "twofold_down",
            )
        )
        + "\n"
    )

    all_tables = set(Database.getTables(dbhandle))
    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    for tablename in tables:

        prefix = P.snip(tablename, "_%s" % method)
        tileset, design = prefix.split("_")

        def toDict(vals, l=2):
            # collapse rows of (treatment, control[, status], count) into a
            # defaultdict keyed by the first l fields; missing keys count as 0
            return collections.defaultdict(int, [(tuple(x[:l]), x[l]) for x in vals])

        E.info("collecting data from %s" % tablename)

        tested = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            GROUP BY treatment_name,control_name"""
                % locals(),
            ).fetchall()
        )
        status = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, status,
            COUNT(*) FROM %(tablename)s 
            GROUP BY treatment_name,control_name,status"""
                % locals(),
            ).fetchall(),
            3,
        )
        signif = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name,
            COUNT(*) FROM %(tablename)s 
            WHERE significant
            GROUP BY treatment_name,control_name"""
                % locals(),
            ).fetchall()
        )
        fold2 = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name,
            COUNT(*) FROM %(tablename)s
            WHERE (l2fold >= 1 or l2fold <= -1) AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )
        up = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold > 0 AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )

        down = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold < 0 AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )

        fold2up = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold > 1 AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )

        fold2down = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold < -1 AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )

        groups = tested.keys()

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write(
                "\t".join(
                    map(
                        str,
                        (
                            tileset,
                            design,
                            treatment_name,
                            control_name,
                            tested[k],
                            "\t".join([str(status[(treatment_name, control_name, x)]) for x in keys_status]),
                            signif[k],
                            up[k],
                            down[k],
                            fold2[k],
                            fold2up[k],
                            fold2down[k],
                        ),
                    )
                )
                + "\n"
            )

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait(
            dbhandle,
            """SELECT end - start, pvalue 
                             FROM %(tablename)s
                             WHERE significant"""
            % locals(),
        ).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals()
            R.png(pngfile)
            R.smoothScatter(
                R.log10(ro.FloatVector(data[0])),
                R.log10(ro.FloatVector(data[1])),
                xlab="log10(length)",
                ylab="log10(pvalue)",
                log="x",
                pch=20,
                cex=0.1,
            )

            R["dev.off"]()

    outf.close()
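The count categories in buildDMRStats partition significant results by log2
fold change. A minimal sketch of the same bucketing for a single row, with
illustrative values (the helper name classify_l2fold is hypothetical):

def classify_l2fold(l2fold, significant):
    # mirror the SQL buckets: up/down by sign, twofold at |l2fold| >= 1,
    # twofold_up/down strictly beyond +1/-1
    buckets = []
    if not significant:
        return buckets
    buckets.append("up" if l2fold > 0 else "down")
    if l2fold >= 1 or l2fold <= -1:
        buckets.append("twofold")
    if l2fold > 1:
        buckets.append("twofold_up")
    if l2fold < -1:
        buckets.append("twofold_down")
    return buckets

print(classify_l2fold(1.5, True))   # ['up', 'twofold', 'twofold_up']
print(classify_l2fold(-0.5, True))  # ['down']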
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    Creates also diagnostic plots in

    <exportdir>/<method> directory.
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    def _split(tablename):
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
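        # e.g. "<design>_vs_<geneset>_feature_counts_<method>" parses to
        # ("<design>", "<geneset>", "feature_counts"); the placeholder
        # names here are illustrative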
        try:
            design, geneset, counting_method = re.match(
                "([^_]+)_vs_([^_]+)_(.*)_%s" % method, tablename).groups()
        except AttributeError:
            try:
                design, geneset = re.match("([^_]+)_([^_]+)_%s" % method,
                                           tablename).groups()
                counting_method = "na"
            except AttributeError:
                raise ValueError("can't parse tablename %s" % tablename)

        return design, geneset, counting_method

        # return re.match("([^_]+)_", tablename ).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(("design", "geneset", "level", "treatment_name",
                          "counting_method", "control_name", "tested",
                          "\t".join(["status_%s" % x for x in keys_status]),
                          "significant", "twofold")) + "\n")

    all_tables = set(Database.getTables(dbhandle))

    for level in CUFFDIFF_LEVELS:

        for tablename in tables:

            tablename_diff = "%s_%s_diff" % (tablename, level)
            tablename_levels = "%s_%s_levels" % (tablename, level)
            design, geneset, counting_method = _split(tablename_diff)
            if tablename_diff not in all_tables:
                continue

            def toDict(vals, l=2):
                return collections.defaultdict(int, [(tuple(x[:l]), x[l])
                                                     for x in vals])

            tested = toDict(
                Database.executewait(
                    dbhandle, "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name" %
                    locals()).fetchall())
            status = toDict(
                Database.executewait(
                    dbhandle, "SELECT treatment_name, control_name, status, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name,status" %
                    locals()).fetchall(), 3)
            signif = toDict(
                Database.executewait(
                    dbhandle, "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE significant "
                    "GROUP BY treatment_name,control_name" %
                    locals()).fetchall())

            fold2 = toDict(
                Database.executewait(
                    dbhandle, "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                    "GROUP BY treatment_name,control_name,significant" %
                    locals()).fetchall())

            for treatment_name, control_name in tested.keys():
                key = (treatment_name, control_name)
                outf.write("\t".join(map(str, (
                    design, geneset, level,
                    treatment_name, counting_method, control_name,
                    tested[key],
                    "\t".join([str(status[key + (x,)])
                               for x in keys_status]),
                    signif[key],
                    fold2[key]))) + "\n")

            ###########################################
            ###########################################
            ###########################################
            # plot length versus P-Value
            data = Database.executewait(
                dbhandle, "SELECT i.sum, pvalue "
                "FROM %(tablename_diff)s, "
                "%(geneset)s_geneinfo as i "
                "WHERE i.gene_id = test_id AND "
                "significant" % locals()).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter fails
            if len(data) > 10:
                data = zip(*data)

                pngfile = "%(outdir)s/%(design)s_%(geneset)s_%(level)s_pvalue_vs_length.png" % locals(
                )
                R.png(pngfile)
                R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                R.log10(ro.FloatVector(data[1])),
                                xlab='log10( length )',
                                ylab='log10( pvalue )',
                                log="x",
                                pch=20,
                                cex=.1)

                R['dev.off']()

    outf.close()
Example #41
def defineTads(infile, outfile):
    '''
    A motif is "forward" if present on the + strand and "reverse" if present on the - strand.
    Insulator pairs (from intersection) can therefore have motifs in the following orientations:
    1) convergent (F, R)
    2) divergent (R, F)
    3) same direction, + strand (F, F)
    4) same direction, - strand (R, R)

    Intervals generated from peak intersections with motifs in convergent
    orientation will represent TADs (or sub-TADs).
    '''
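    # For illustration, the four orientation classes as (left, right)
    # strand pairs: ("+", "-") convergent, ("-", "+") divergent,
    # ("+", "+") and ("-", "-") same direction.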

    db = PARAMS["database"]
    npeaks = PARAMS["tads_npeaks"]
    pwidth = PARAMS["tads_pwidth"]
    tmp_dir = "$SCRATCH_DIR"

    # fetch insulator peaks with fimo motifs
    table = "insulators_" + '_'.join([str(npeaks), str(pwidth)
                                      ]) + "_fimo_table"
    statement = '''select * from %(table)s''' % locals()
    motifs = DB.fetch_DataFrame(statement, db)

    # get most significant motif for each peak
    motifs = motifs.sort_values(["sequence_name", "q_value"],
                                0).drop_duplicates(subset="sequence_name",
                                                   keep="first")

    motifs.to_csv("insulators_fimoMotifs.txt", sep="\t",
                  header=True)  # save peaks w/ annotated motifs as df
    upload2csvdb(motifs, "insulators_fimoMotifs", db)  # upload to csvdb

    # get peaks (bed format) corresponding to fimo motifs
    statement = '''select b.contig, b.start, b.end, a.sequence_name, b.peak_score, a.strand, a.q_value
                   from insulators_fimoMotifs a inner join insulators b on a.sequence_name = b.peak_id'''
    motif_bed = DB.fetch_DataFrame(statement, db)
    motif_bed = motif_bed.sort_values(["sequence_name", "q_value"],
                                      0).drop_duplicates(
                                          subset="sequence_name", keep="first")
    motif_bed.to_csv("motif_bed.txt", sep="\t", header=True, index=False)

    # merge peaks
    # iterate over a range of distances (1mb - 1kb) within which insulator peaks are merged
    ### descending order of distances -> favours bigger TADs, and joins up remaining intervals down to a min. size of 1kb
    # merged insulators are selected with awk by "," in $6 (strand col); limited to merges of two peaks in F,R orientation ($6 == "\+,-")
    # merged insulators are written to a tmp file (tmp + str(counter))
    # for each successive merge after n=1, peaks from the previous merge are subtracted from the results with bedtools (the -A flag removes entire intervals)
    ### a few of the later TADs are very large and are merged over previous ones - how to correct for this? merge the final file (only overlapping TADs?)

    # n = 0

    # distances = range(0, 10100000, 10000) # 10mb to 1kb, 10kb decreases
    # distances = distances[::-1] # invert list -> descending order

    # for dist in distances:
    #     n = n +1
    #     tmp = "tmp" + str(n)

    #     if n == 1:
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                          sort -k1,1 -k2,2n - > $tmp; checkpoint;
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i $tmp |
    #                          awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > %(tmp)s''' % locals()

    #     elif n > 1 and n < len(distances):
    #         merge = tmp.replace(str(n), str(n-1))
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                          sort -k1,1 -k2,2n - |
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i - |
    #                          awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > $tmp; checkpoint;
    #                        subtractBed -A -a $tmp -b %(merge)s > %(tmp)s''' % locals()

    #     elif n == len(distances):
    #         merge = tmp.replace(str(n), str(n-1))
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                          sort -k1,1 -k2,2n - |
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i - |
    #                          awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > $tmp; checkpoint;
    #                        subtractBed -A -a $tmp -b %(merge)s > %(tmp)s; checkpoint;
    #                        awk 'BEGIN {OFS="\\t"} {if ($3-$2 > 1000) print $0}' <(cat tmp*) |
    #                          sort -k1,1 -k2,2n - |
    #                          mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -i - > %(outfile)s; checkpoint;
    #                        rm tmp*''' % locals()

    ### Instead of merging peaks with F/R motif orientation I could separate insulator peaks into F & R files,
    ### then use bedtools closest to intersect peaks up to a max distance of n & remove peaks with divergent motifs.

    # Ensure "closest" matches are on the same chromosome with awk, also remove features > 1mb wide

    statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
                   Fstrand=`mktemp -p %(tmp_dir)s`; checkpoint;
                   Rstrand=`mktemp -p %(tmp_dir)s`; checkpoint;
                   awk 'BEGIN {OFS="\\t"} {if ($6 == "+") print $0}' <(tail -n+2 motif_bed.txt | sort -k1,1 -k2,2n ) > $Fstrand; checkpoint;
                   awk 'BEGIN {OFS="\\t"} {if ($6 == "-") print $0}' <(tail -n+2 motif_bed.txt | sort -k1,1 -k2,2n ) > $Rstrand; checkpoint;
                   ~/devel/GIT/bedtools2/bin/closestBed -iu -D ref 
                         -a $Fstrand 
                         -b $Rstrand
                         > $tmp; checkpoint; 
                   awk 'BEGIN {OFS="\\t"} {if ($1 == $8 && $9-$2 < 1000000) print $1,$2,$9,$4"/"$11,($5+$12)/2,$6","$13,($7+$14)/2}' $tmp > %(outfile)s '''

    ### This works better!

    # Need to incorporate CTCF & cohesin coverage over candidate insulators, then filter out insulator pairs (candidate TADs) with large discrepancies in ChIP signal.
    # Czimmerer et al. use a cutoff of > 2-fold difference between the start & end peaks of TADs in ChIP signal.
    # Add ChIP coverage code for insulator peaks & save to db, then incorporate CTCF & cohesin signal into the awk filter at the end of this statement.

    print(statement)
    P.run()
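The awk filter at the end of the statement keeps only forward/reverse pairs on
the same chromosome spanning less than 1 Mb ($1 == $8 && $9 - $2 < 1000000). A
minimal pandas sketch of the same pairing filter, with illustrative column
names for the closestBed output (not the pipeline's actual schema):

import pandas as pd

# hypothetical closestBed output: forward peak (cols 1-7 of the awk filter)
# paired with its nearest downstream reverse peak (cols 8-14)
pairs = pd.DataFrame({
    "f_contig": ["chr1", "chr1", "chr2"],
    "f_start": [1000, 500000, 2000],
    "r_contig": ["chr1", "chr1", "chr2"],
    "r_start": [200000, 2000000, 90000],
})

# keep same-chromosome pairs spanning < 1 Mb, as in the awk filter
tads = pairs[(pairs["f_contig"] == pairs["r_contig"]) &
             (pairs["r_start"] - pairs["f_start"] < 1000000)]
print(tads)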