def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:

    mapping_stats
    bam_stats
    '''

    tablename = P.toTable(outfile)
    # can not create views across multiple database, so use table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    '''

    Database.executewait(dbhandle, statement % locals())
Esempio n. 2
0
def loadCodingPotential(infile, outfile):
    '''load annotations'''

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s 
    | python %(scriptsdir)s/csv2db.py 
              %(csv2db_options)s 
              --allow-empty
              --index=gene_id 
              --map=gene_id:str 
              --table=%(table)s 
    > %(outfile)s'''

    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database"])
    Database.executewait(
        dbhandle,
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle,
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals())
    dbhandle.commit()
Esempio n. 3
0
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        statement = '''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | python %(scriptsdir)s/csv2db.py
        %(csv2db_options)s
        --table=%(tablename)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty')
def createViewMapping(infile, outfile):
    """create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:

    mapping_stats
    bam_stats
    """

    tablename = P.toTable(outfile)
    # can not create views across multiple database, so use table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = """
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    """

    Database.executewait(dbhandle, statement % locals())
Esempio n. 5
0
def ReadGene2GOFromDatabase(dbhandle, go_type, database, species):
    """read go assignments from ensembl database.

    returns a dictionary of lists.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information

    Note: assumes that external_db_id for GO is 1000
    """

    statement = GetGOStatement(go_type, database, species)
    result = Database.executewait(dbhandle, statement,
                                  retries=0).fetchall()

    gene2go = {}
    go2info = collections.defaultdict(GOInfo)
    for gene_id, goid, description, evidence in result:
        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if gene_id not in gene2go:
            gene2go[gene_id] = []
        gene2go[gene_id].append(gm)
        go2info[goid] = gi

    return gene2go, go2info
Esempio n. 6
0
def importCodingPotential( infile, outfile ):
    '''import annotations'''
    
    table = outfile[:-len(".import")]

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --allow-empty
              --index=gene_id 
              --map=gene_id:str 
              --table=%(table)s 
    < %(infile)s 
    > %(outfile)s'''

    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect( PARAMS["database"] )
    Database.executewait( dbhandle, '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait( dbhandle, '''UPDATE %(table)s SET is_coding = (f_iscoding == 'coding') OR (r_iscoding == 'coding') ''' % locals())
    dbhandle.commit()
def mergeEffectsPerGene( infile, outfile ):
    '''summarize effects on a per-gene level.'''
    
    tablename = outfile[:-len(".load")]

    dbhandle = connect()

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT 
           track,
           gene_id, 
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i, effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id, track
    ''' % locals()
    
    Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % locals() )
    Database.executewait( dbhandle, statement )
    Database.executewait( dbhandle, "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" % locals())
    dbhandle.commit()

    P.touch(outfile)
def loadCodingPotential( infile, outfile ):
    '''load annotations'''
    
    table = P.toTable( outfile )

    statement = '''
    gunzip < %(infile)s 
    | python %(scriptsdir)s/csv2db.py 
              %(csv2db_options)s 
              --allow-empty
              --index=gene_id 
              --map=gene_id:str 
              --table=%(table)s 
    > %(outfile)s'''

    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect( PARAMS["database"] )
    Database.executewait( dbhandle, '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait( dbhandle, '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals())
    dbhandle.commit()
Esempio n. 9
0
def DumpGOFromDatabase(outfile,
                       dbhandle,
                       options):
    """read go assignments from database.

    and dump them into a flatfile.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information
    """

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.defaultdict(int)
    all_categories = collections.defaultdict(int)
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:

        genes = collections.defaultdict(int)
        categories = collections.defaultdict(int)
        ntotal = 0
        statement = GetGOStatement(go_type, options.database_name,
                                   options.species)

        results = Database.executewait(
            dbhandle, statement, retries=0).fetchall()

        for result in results:
            outfile.write("\t".join(map(str, (go_type,) + result)) + "\n")
            gene_id, goid, description, evidence = result
            genes[gene_id] += 1
            categories[goid] += 1
            ntotal += 1
            all_genes[gene_id] += 1
            all_categories[goid] += 1
            all_ntotal += 1

        E.info("%s\t%i\t%i\t%i" % (go_type, ntotal,
                                   len(genes),
                                   len(categories)))

    E.info("%s\t%i\t%i\t%i" % ("all",
                               all_ntotal,
                               len(all_genes),
                               len(all_categories)))

    return
Esempio n. 10
0
def DumpGOFromDatabase(outfile, dbhandle, options):
    """read go assignments from database.

    and dump them into a flatfile.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information
    """

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.defaultdict(int)
    all_categories = collections.defaultdict(int)
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:

        genes = collections.defaultdict(int)
        categories = collections.defaultdict(int)
        ntotal = 0
        statement = GetGOStatement(go_type, options.database_name,
                                   options.species)

        results = Database.executewait(dbhandle, statement,
                                       retries=0).fetchall()

        for result in results:
            outfile.write("\t".join(map(str, (go_type, ) + result)) + "\n")
            gene_id, goid, description, evidence = result
            genes[gene_id] += 1
            categories[goid] += 1
            ntotal += 1
            all_genes[gene_id] += 1
            all_categories[goid] += 1
            all_ntotal += 1

        E.info("%s\t%i\t%i\t%i" %
               (go_type, ntotal, len(genes), len(categories)))

    E.info("%s\t%i\t%i\t%i" %
           ("all", all_ntotal, len(all_genes), len(all_categories)))

    return
Esempio n. 11
0
def ReadGene2GOFromDatabase(dbhandle, go_type, database, species):
    """read go assignments from ensembl database.

    returns a dictionary of lists.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information

    Note: assumes that external_db_id for GO is 1000
    """

    statement = GetGOStatement(go_type, database, species)
    result = Database.executewait(dbhandle, statement, retries=0).fetchall()

    gene2go = {}
    go2info = collections.defaultdict(GOInfo)
    for gene_id, goid, description, evidence in result:
        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if gene_id not in gene2go:
            gene2go[gene_id] = []
        gene2go[gene_id].append(gm)
        go2info[goid] = gi

    return gene2go, go2info
Esempio n. 12
0
def summarizeEffectsPerGene(infile, outfile):
    '''summarize effects on a per-gene level.'''

    tablename = outfile[:-len(".load")]
    track = infile[:-len("_effects.load")]

    dbhandle = connect()

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT 
           gene_id, 
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i,
         %(track)s_effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id
    ''' % locals()

    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %(tablename)s" % locals())
    Database.executewait(dbhandle, statement)
    Database.executewait(
        dbhandle,
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" %
        locals())
    dbhandle.commit()

    P.touch(outfile)
Esempio n. 13
0
def buildDMRStats( tables, method, outfile ):
    '''build dmr summary statistics.
    
    Creates some diagnostic plots in

    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    '''

    dbhandle = sqlite3.connect( PARAMS["database"] )

    def togeneset( tablename ):
        return re.match("([^_]+)_", tablename ).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile( outfile, "w" )
    outf.write( "\t".join( ("tileset", "design", "track1", "track2", "tested",
                            "\t".join( [ "status_%s" % x for x in keys_status ] ),
                            "significant",
                            "up", "down",
                            "twofold",
                            "twofold_up", "twofold_down",
                            ) ) + "\n" )

    all_tables = set(Database.getTables( dbhandle ))
    outdir = os.path.join( PARAMS["exportdir"], "diff_methylation" )

    for tablename in tables:

        prefix = P.snip( tablename, "_%s" % method )
        tileset, design = prefix.split("_")

        def toDict( vals, l = 2 ):
            return collections.defaultdict( int, [ (tuple( x[:l]), x[l]) for x in vals ] )

        E.info( "collecting data from %s" % tablename )
        
        tested = toDict( Database.executewait( dbhandle,
                                               """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                GROUP BY treatment_name,control_name""" % locals() ).fetchall() )
        status = toDict( Database.executewait( dbhandle,
                                               """SELECT treatment_name, control_name, status, COUNT(*) FROM %(tablename)s 
                                GROUP BY treatment_name,control_name,status""" % locals() ).fetchall(), 3 )
        signif = toDict( Database.executewait( dbhandle,
                                               """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE significant
                                GROUP BY treatment_name,control_name""" % locals() ).fetchall() )
        fold2 = toDict( Database.executewait( dbhandle,
                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE (l2fold >= 1 or l2fold <= -1) AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )

        up = toDict( Database.executewait( dbhandle,
                                                """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold > 0 AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )

        down = toDict( Database.executewait( dbhandle,
                                             """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold < 0 AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )

        fold2up = toDict( Database.executewait( dbhandle,
                                           """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold > 1 AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )

        fold2down = toDict( Database.executewait( dbhandle,
                                             """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s 
                                WHERE l2fold < -1 AND significant
                                GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() )
        
        groups = tested.keys()

        for treatment_name, control_name in groups:
            k = (treatment_name,control_name)
            outf.write( "\t".join(map(str, (
                            tileset,
                            design,
                            treatment_name,
                            control_name,
                            tested[k],
                            "\t".join( [ str(status[(treatment_name,control_name,x)]) for x in keys_status]),
                            signif[(k)],
                            up[k], down[k],
                            fold2[k],
                            fold2up[k], fold2down[k] ) ) ) + "\n" )
                            

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait( dbhandle, 
                                     '''SELECT end - start, pvalue 
                             FROM %(tablename)s
                             WHERE significant'''% locals() ).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals()
            R.png( pngfile )
            R.smoothScatter( R.log10( ro.FloatVector(data[0]) ),
                             R.log10( ro.FloatVector(data[1]) ),
                             xlab = 'log10( length )',
                             ylab = 'log10( pvalue )',
                             log="x", pch=20, cex=.1 )

            R['dev.off']()

    outf.close()
Esempio n. 14
0
def buildExpressionStats(
        dbhandle,
        outfile,
        tablenames,
        outdir,
        regex_table="(?P<design>[^_]+)_"
        "(?P<geneset>[^_]+)_"
        "(?P<counting_method>[^_]+)_"
        "(?P<method>[^_]+)_"
        "(?P<level>[^_]+)_diff"):
    """compile expression summary statistics from database.

    This method outputs a table with the number of genes tested,
    failed, differentially expressed, etc. for a series of DE tests.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    tables : list
        List of tables to process.
    outfile : string
        Output filename in :term:`tsv` format.
    outdir : string
        Output directory for diagnostic plots.
    regex : string
        Regular expression to extract experimental information
        from table name.
    """

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(
        ("design",
         "geneset",
         "level",
         "counting_method",
         "treatment_name",
         "control_name",
         "tested",
         "\t".join(["status_%s" % x for x in keys_status]),
         "significant",
         "twofold")) + "\n")

    for tablename in tablenames:
        r = re.search(regex_table, tablename)
        if r is None:
            raise ValueError(
                "can't match tablename '%s' to regex" % tablename)
        geneset = r.group("geneset")
        design = r.group("design")
        level = r.group("level")
        counting_method = r.group("counting_method")
        geneset = r.group("geneset")

        def toDict(vals, l=2):
            return collections.defaultdict(
                int,
                [(tuple(x[:l]), x[l]) for x in vals])

        tested = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())
        status = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, status, "
            "COUNT(*) FROM %(tablename)s "
            "GROUP BY treatment_name,control_name,status"
            % locals()).fetchall(), 3)
        signif = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "WHERE significant "
            "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())

        fold2 = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
            "GROUP BY treatment_name,control_name,significant"
            % locals()).fetchall())

        for treatment_name, control_name in tested.keys():
            outf.write("\t".join(map(str, (
                design,
                geneset,
                level,
                counting_method,
                treatment_name,
                control_name,
                tested[(treatment_name, control_name)],
                "\t".join(
                    [str(status[(treatment_name, control_name, x)])
                     for x in keys_status]),
                signif[(treatment_name, control_name)],
                fold2[(treatment_name, control_name)]))) + "\n")

        # plot length versus P-Value
        data = Database.executewait(
            dbhandle,
            "SELECT i.sum, pvalue "
            "FROM %(tablename)s, "
            "%(geneset)s_geneinfo as i "
            "WHERE i.gene_id = test_id AND "
            "significant" % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = ("%(outdir)s/%(design)s_%(geneset)s_%(level)s"
                       "_pvalue_vs_length.png") % locals()
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)

            R['dev.off']()

    outf.close()
def loadSummary( infile, outfile ):
    '''load several rates into a single convenience table.
    '''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]

    tablename = "%s_evol" % track

    if os.path.exists( "%s_rates.load" % track ):
        stmt_select.append( "a.distance AS ks, a.aligned AS aligned" )
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
                     ON r.gene_id = a.gene_id AND 
                     a.aligned >= %(rates_min_aligned)i AND 
                     a.distance <= %(rates_max_rate)f''' )

    if os.path.exists( "%s_coverage.load" % track ):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage" )
        stmt_from.append("LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id" )

    if os.path.exists( "%s_repeats_gc.load" % track ):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc" )
        stmt_from.append("LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id" )

    if os.path.exists( "%s_repeats_rates.load" % track ):
        stmt_select.append("ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska" )
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar 
                     ON r.gene_id = ar.gene_id AND 
                     ar.exons_nval >= %(rates_min_repeats)i''' )

    if os.path.exists( "%s_introns_rates.load" % track ):
        stmt_select.append("ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski" )
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir 
                            ON r.gene_id = ir.gene_id AND 
                            ir.aligned >= %(rates_min_aligned)i''' )

    x = locals()
    x.update( PARAMS )
    stmt_select = ", ".join( stmt_select ) % x 
    stmt_from = " ".join( stmt_from ) % x
    stmt_where = " AND ".join( stmt_where ) % x

    dbhandle = sqlite3.connect( PARAMS["database"] )

    Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals() )

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
         CAST(r.gene_id AS TEXT) AS gene_id, 
				r.exons_sum as length, 
				r.exons_pGC as pgc,
                                %(stmt_select)s
	FROM 
          %(track)s_annotation AS r 
	  %(stmt_from)s
        WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait( dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
def buildCuffdiffPlots(infile, outfile):
    '''create summaries of cufflinks results (including some diagnostic plots)

    Plots are created in the <exportdir>/cuffdiff directory.

    Plots are:

    <geneset>_<method>_<level>_<track1>_vs_<track2>_significance.png
        fold change against expression level
    '''
    ###########################################
    ###########################################
    # create diagnostic plots
    ###########################################
    outdir = os.path.join(PARAMS["exportdir"], "cuffdiff")

    dbhandle = sqlite3.connect(PARAMS["database"])

    prefix = P.snip(infile, ".load")

    geneset, method = prefix.split("_")

    for level in CUFFDIFF_LEVELS:
        tablename_diff = prefix + "_%s_diff" % level
        tablename_levels = prefix + "_%s_levels" % level

        # note that the ordering of EXPERIMENTS and the _diff table
        # needs to be the same as only one triangle is stored of the
        # pairwise results.  do not plot "undefined" lfold values
        # (where treatment_mean or control_mean = 0) do not plot lfold
        # values where the confidence bounds contain 0.
        for track1, track2 in itertools.combinations(EXPERIMENTS, 2):
            statement = """
            SELECT CASE WHEN d.treatment_mean < d.control_mean
            THEN d.treatment_mean
            ELSE d.control_mean END,
            d.l2fold, d.significant
            FROM %(tablename_diff)s AS d
            WHERE treatment_name = '%(track1)s' AND
            control_name = '%(track2)s' AND
            status = 'OK' AND
            treatment_mean > 0 AND
            control_mean > 0
            """ % locals()

            data = zip(*Database.executewait(dbhandle, statement))

            pngfile = "%(outdir)s/%(geneset)s_%(method)s_%(level)s_%(track1)s_vs_%(track2)s_significance.png" % locals(
            )

            # ian: Bug fix: moved R.png to after data check so that no
            #     plot is started if there is no data this was leading
            #     to R falling over from too many open devices

            if len(data) == 0:
                E.warn("no plot for %s - %s -%s vs %s" %
                       (pngfile, level, track1, track2))
                continue

            R.png(pngfile)
            R.plot(ro.FloatVector(data[0]),
                   ro.FloatVector(data[1]),
                   xlab='min(FPKM)',
                   ylab='log2fold',
                   log="x",
                   pch=20,
                   cex=.1,
                   col=R.ifelse(ro.IntVector(data[2]), "red", "black"))

            R['dev.off']()

    P.touch(outfile)
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    Creates also diagnostic plots in

    <exportdir>/<method> directory.
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    def _split(tablename):
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        try:
            design, geneset, counting_method = re.match(
                "([^_]+)_vs_([^_]+)_(.*)_%s" % method, tablename).groups()
        except AttributeError:
            try:
                design, geneset = re.match("([^_]+)_([^_]+)_%s" % method,
                                           tablename).groups()
                counting_method = "na"
            except AttributeError:
                raise ValueError("can't parse tablename %s" % tablename)

        return design, geneset, counting_method

        # return re.match("([^_]+)_", tablename ).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(("design", "geneset", "level", "treatment_name",
                          "counting_method", "control_name", "tested",
                          "\t".join(["status_%s" % x for x in keys_status]),
                          "significant", "twofold")) + "\n")

    all_tables = set(Database.getTables(dbhandle))

    for level in CUFFDIFF_LEVELS:

        for tablename in tables:

            tablename_diff = "%s_%s_diff" % (tablename, level)
            tablename_levels = "%s_%s_diff" % (tablename, level)
            design, geneset, counting_method = _split(tablename_diff)
            if tablename_diff not in all_tables:
                continue

            def toDict(vals, l=2):
                return collections.defaultdict(int, [(tuple(x[:l]), x[l])
                                                     for x in vals])

            tested = toDict(
                Database.executewait(
                    dbhandle, "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name" %
                    locals()).fetchall())
            status = toDict(
                Database.executewait(
                    dbhandle, "SELECT treatment_name, control_name, status, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name,status" %
                    locals()).fetchall(), 3)
            signif = toDict(
                Database.executewait(
                    dbhandle, "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE significant "
                    "GROUP BY treatment_name,control_name" %
                    locals()).fetchall())

            fold2 = toDict(
                Database.executewait(
                    dbhandle, "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                    "GROUP BY treatment_name,control_name,significant" %
                    locals()).fetchall())

            for treatment_name, control_name in tested.keys():
                outf.write("\t".join(
                    map(str, (design, geneset, level, counting_method,
                              treatment_name, control_name, tested[
                                  (treatment_name, control_name)], "\t".join([
                                      str(status[(treatment_name, control_name,
                                                  x)]) for x in keys_status
                                  ]), signif[(treatment_name, control_name)],
                              fold2[(treatment_name, control_name)]))) + "\n")

            ###########################################
            ###########################################
            ###########################################
            # plot length versus P-Value
            data = Database.executewait(
                dbhandle, "SELECT i.sum, pvalue "
                "FROM %(tablename_diff)s, "
                "%(geneset)s_geneinfo as i "
                "WHERE i.gene_id = test_id AND "
                "significant" % locals()).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter fails
            if len(data) > 10:
                data = zip(*data)

                pngfile = "%(outdir)s/%(design)s_%(geneset)s_%(level)s_pvalue_vs_length.png" % locals(
                )
                R.png(pngfile)
                R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                R.log10(ro.FloatVector(data[1])),
                                xlab='log10( length )',
                                ylab='log10( pvalue )',
                                log="x",
                                pch=20,
                                cex=.1)

                R['dev.off']()

    outf.close()
Esempio n. 18
0
def importLincRNA( infile, outfile ):
    '''build a linc RNA set.

        * no coding potential
        * unknown and intergenic transcripts
        * no overlap with ``linc_exclude`` (usually: human refseq)
        * at least ``linc_min_length`` bp in length
        * at least ``linc_min_reads`` reads in transcript

    '''

    table = outfile[:-len(".import")]
    track = table[:-len("Linc")]

    dbhandle = sqlite3.connect( PARAMS["database"] )

    Database.executewait( dbhandle, '''DROP TABLE IF EXISTS %(table)s''' % locals())
    Database.executewait( dbhandle, '''CREATE TABLE %(table)s (gene_id TEXT)''' % locals())
    Database.executewait( dbhandle, '''CREATE INDEX %(table)s_index1 ON %(table)s (gene_id)''' % locals())

    joins, wheres = [], ["1"]

    if PARAMS["linc_min_reads"] > 0:
        joins.append( ", %(track)s_coverage as cov" % locals() )
        wheres.append( "cov.gene_id = m.gene_id2 AND cov.nmatches >= %(i)" % PARAMS["linc_min_reads"] )

    if PARAMS["linc_exclude"] > 0:
        joins.append( "LEFT JOIN %s_vs_%s_ovl as ovl on ovl.gene_id2 = a.gene_id" %\
                      (PARAMS["linc_exclude"], track ) )
        wheres.append( "ovl.gene_id1 IS NULL" )

    wheres = " AND ".join( wheres )
    joins = " ".join( joins )

    statement = '''INSERT INTO %(table)s 
                   SELECT DISTINCT(a.gene_id) FROM 
                          %(track)s_annotation as a
                          %(joins)s
                          LEFT JOIN %(track)s_coding AS c on c.gene_id = a.gene_id 
                    WHERE is_unknown 
                          AND is_intergenic 
                          AND exons_sum >= %(linc_min_length)i
                          AND (c.is_coding IS NULL or not c.is_coding) 
                          AND %(wheres)s
                     ''' % dict( PARAMS.items() + locals().items() )

    E.debug( "statement to build lincRNA: %s" % statement)

    Database.executewait( dbhandle, statement % locals())

    dbhandle.commit()
    
    cc = dbhandle.cursor()
    result = cc.execute("SELECT COUNT(*) FROM %(table)s" % locals() ).fetchall()[0][0]
    E.info( "build lincRNA set for %s: %i entries" % ( track, result ))

    outgtf = "%s.gtf.gz" % table

    E.info( "creating gtf file `%s`" % outgtf )
    # output gtf file
    statement = '''%(cmd-sql)s %(database)s "SELECT g.* FROM %(track)s_gtf as g, %(table)s AS t
                          WHERE t.gene_id = g.gene_id" 
                | python %(scriptsdir)s/gtf2tsv.py --invert --log=%(outfile)s
                | gzip
                > %(outgtf)s'''
    
    P.run()
Esempio n. 19
0
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.
    '''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]

    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND 
        a.aligned >= %(rates_min_aligned)i AND 
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id"
        )

    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska"
        )
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar 
                     ON r.gene_id = ar.gene_id AND 
                     ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski"
        )
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir 
                            ON r.gene_id = ir.gene_id AND 
                            ir.aligned >= %(rates_min_aligned)i''')

    x = locals()
    x.update(PARAMS)
    stmt_select = ", ".join(stmt_select) % x
    stmt_from = " ".join(stmt_from) % x
    stmt_where = " AND ".join(stmt_where) % x

    dbhandle = sqlite3.connect(PARAMS["database"])

    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %(tablename)s " % locals())

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
    CAST(r.gene_id AS TEXT) AS gene_id,
    r.exons_sum as length,
    r.exons_pGC as pgc,
    %(stmt_select)s
    FROM
    %(track)s_annotation AS r
    %(stmt_from)s
        WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait(dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
Esempio n. 20
0
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    Creates also diagnostic plots in

    <exportdir>/<method> directory.
    '''
    dbhandle = sqlite3.connect(PARAMS["database"])

    def _split(tablename):
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        try:
            design, geneset, counting_method = re.match(
                "([^_]+)_vs_([^_]+)_(.*)_%s" % method,
                tablename).groups()
        except AttributeError:
            try:
                design, geneset = re.match(
                    "([^_]+)_([^_]+)_%s" % method,
                    tablename).groups()
                counting_method = "na"
            except AttributeError:
                raise ValueError("can't parse tablename %s" % tablename)

        return design, geneset, counting_method

        # return re.match("([^_]+)_", tablename ).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(
        ("design",
         "geneset",
         "level",
         "treatment_name",
         "counting_method",
         "control_name",
         "tested",
         "\t".join(["status_%s" % x for x in keys_status]),
         "significant",
         "twofold")) + "\n")

    all_tables = set(Database.getTables(dbhandle))

    for level in CUFFDIFF_LEVELS:

        for tablename in tables:

            tablename_diff = "%s_%s_diff" % (tablename, level)
            tablename_levels = "%s_%s_diff" % (tablename, level)
            design, geneset, counting_method = _split(tablename_diff)
            if tablename_diff not in all_tables:
                continue

            def toDict(vals, l=2):
                return collections.defaultdict(
                    int,
                    [(tuple(x[:l]), x[l]) for x in vals])

            tested = toDict(
                Database.executewait(
                    dbhandle,
                    "SELECT treatment_name, control_name, "
                    "COUNT(*) FROM %(tablename_diff)s "
                    "GROUP BY treatment_name,control_name" % locals()
                    ).fetchall())
            status = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, status, "
                "COUNT(*) FROM %(tablename_diff)s "
                "GROUP BY treatment_name,control_name,status"
                % locals()).fetchall(), 3)
            signif = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename_diff)s "
                "WHERE significant "
                "GROUP BY treatment_name,control_name" % locals()
                ).fetchall())

            fold2 = toDict(Database.executewait(
                dbhandle,
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %(tablename_diff)s "
                "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                "GROUP BY treatment_name,control_name,significant"
                % locals()).fetchall())

            for treatment_name, control_name in tested.keys():
                outf.write("\t".join(map(str, (
                    design,
                    geneset,
                    level,
                    counting_method,
                    treatment_name,
                    control_name,
                    tested[(treatment_name, control_name)],
                    "\t".join(
                        [str(status[(treatment_name, control_name, x)])
                         for x in keys_status]),
                    signif[(treatment_name, control_name)],
                    fold2[(treatment_name, control_name)]))) + "\n")

            ###########################################
            ###########################################
            ###########################################
            # plot length versus P-Value
            data = Database.executewait(
                dbhandle,
                "SELECT i.sum, pvalue "
                "FROM %(tablename_diff)s, "
                "%(geneset)s_geneinfo as i "
                "WHERE i.gene_id = test_id AND "
                "significant" % locals()).fetchall()

            # require at least 10 datapoints - otherwise smooth scatter fails
            if len(data) > 10:
                data = zip(*data)

                pngfile = "%(outdir)s/%(design)s_%(geneset)s_%(level)s_pvalue_vs_length.png" % locals()
                R.png(pngfile)
                R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                                R.log10(ro.FloatVector(data[1])),
                                xlab='log10( length )',
                                ylab='log10( pvalue )',
                                log="x", pch=20, cex=.1)

                R['dev.off']()

    outf.close()
Esempio n. 21
0
def createView(dbhandle,
               tables,
               tablename,
               outfile,
               view_type="TABLE",
               ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                "reads_summary", "track",
                "bam_stats", "track",
                "context_stats", "track",
                "picard_stats_alignment_summary_metrics", "track")

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    '''

    Database.executewait(
        dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle, "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append([
            x.lower() for x in Database.getColumnNames(dbhandle, table)
            if x != track
        ])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))
    if min(tracks) != max(tracks):
        raise ValueError("number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join([
        "t0.%s = t%i.%s" % (f, x + 1, y[1]) for x, y in enumerate(tables[1:])
    ])

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)
    statement = '''
    CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    WHERE %(where_statement)s
    ''' % locals()

    Database.executewait(dbhandle, statement)

    nrows = Database.executewait(
        dbhandle, "SELECT COUNT(*) FROM view_mapping").fetchone()[0]

    if nrows == 0:
        raise ValueError("empty view mapping, check statement = %s" %
                         (statement % locals()))
    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))

    E.info("created view_mapping with %i rows" % nrows)
    touchFile(outfile)
Esempio n. 22
0
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [
        x[0] for x in Database.executewait(
            dbh, '''SELECT DISTINCT ontology FROM %s''' %
            tablename).fetchall()
    ]

    genelists = [
        x[0] for x in Database.executewait(
            dbh, '''SELECT DISTINCT genelist FROM %s''' %
            tablename).fetchall()
    ]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        statement = '''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | python %(scriptsdir)s/csv2db.py
        %(csv2db_options)s
        --table=%(tablename)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty')

        fn = os.path.join(infile + ".dir",
                          "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty')
Esempio n. 23
0
def buildDMRStats(tables, method, outfile, dbhandle):
    """build dmr summary statistics.

    This method counts the number of up/down, 2fold up/down, etc.
    genes in output from (:mod:`scripts/runExpression`).

    This method also creates diagnostic plots in the
    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables ; list
        List of tables with DMR output
    method : string
        Method name
    outfile : string
        Output filename. Tab separated file summarizing

    """

    def togeneset(tablename):
        return re.match("([^_]+)_", tablename).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "\t".join(
            (
                "tileset",
                "design",
                "track1",
                "track2",
                "tested",
                "\t".join(["status_%s" % x for x in keys_status]),
                "significant",
                "up",
                "down",
                "twofold",
                "twofold_up",
                "twofold_down",
            )
        )
        + "\n"
    )

    all_tables = set(Database.getTables(dbhandle))
    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    for tablename in tables:

        prefix = P.snip(tablename, "_%s" % method)
        tileset, design = prefix.split("_")

        def toDict(vals, l=2):
            return collections.defaultdict(int, [(tuple(x[:l]), x[l]) for x in vals])

        E.info("collecting data from %s" % tablename)

        tested = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            GROUP BY treatment_name,control_name"""
                % locals(),
            ).fetchall()
        )
        status = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, status,
            COUNT(*) FROM %(tablename)s 
            GROUP BY treatment_name,control_name,status"""
                % locals(),
            ).fetchall(),
            3,
        )
        signif = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name,
            COUNT(*) FROM %(tablename)s 
            WHERE significant
            GROUP BY treatment_name,control_name"""
                % locals(),
            ).fetchall()
        )
        fold2 = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name,
            COUNT(*) FROM %(tablename)s
            WHERE (l2fold >= 1 or l2fold <= -1) AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )
        up = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold > 0 AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )

        down = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold < 0 AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )

        fold2up = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold > 1 AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )

        fold2down = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
            FROM %(tablename)s 
            WHERE l2fold < -1 AND significant
            GROUP BY treatment_name,control_name,significant"""
                % locals(),
            ).fetchall()
        )

        groups = tested.keys()

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write(
                "\t".join(
                    map(
                        str,
                        (
                            tileset,
                            design,
                            treatment_name,
                            control_name,
                            tested[k],
                            "\t".join([str(status[(treatment_name, control_name, x)]) for x in keys_status]),
                            signif[(k)],
                            up[k],
                            down[k],
                            fold2[k],
                            fold2up[k],
                            fold2down[k],
                        ),
                    )
                )
                + "\n"
            )

        ###########################################
        ###########################################
        ###########################################
        # plot length versus P-Value
        data = Database.executewait(
            dbhandle,
            """SELECT end - start, pvalue 
                             FROM %(tablename)s
                             WHERE significant"""
            % locals(),
        ).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals()
            R.png(pngfile)
            R.smoothScatter(
                R.log10(ro.FloatVector(data[0])),
                R.log10(ro.FloatVector(data[1])),
                xlab="log10(length)",
                ylab="log10(pvalue)",
                log="x",
                pch=20,
                cex=0.1,
            )

            R["dev.off"]()

    outf.close()
Esempio n. 24
0
def createView(dbhandle, tables, tablename, outfile,
               view_type="TABLE",
               ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                "reads_summary", "track",
                "bam_stats", "track",
                "context_stats", "track",
                "picard_stats_alignment_summary_metrics", "track")

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``.  If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.

    '''

    Database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle,
            "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append(
            [x.lower() for x in Database.getColumnNames(dbhandle, table)
             if x != track])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))
    if min(tracks) != max(tracks):
        raise ValueError(
            "number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join(
        ["t0.%s = t%i.%s" % (f, x + 1, y[1])
         for x, y in enumerate(tables[1:])])

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]

        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)
    statement = '''
    CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    WHERE %(where_statement)s
    ''' % locals()

    Database.executewait(dbhandle, statement)

    nrows = Database.executewait(
        dbhandle, "SELECT COUNT(*) FROM view_mapping").fetchone()[0]

    if nrows == 0:
        raise ValueError(
            "empty view mapping, check statement = %s" %
            (statement % locals()))
    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))

    E.info("created view_mapping with %i rows" % nrows)
    touchFile(outfile)
Esempio n. 25
0
def buildCuffdiffPlots(infile, outfile):
    '''create summaries of cufflinks results (including some diagnostic plots)

    Plots are created in the <exportdir>/cuffdiff directory.

    Plots are:

    <geneset>_<method>_<level>_<track1>_vs_<track2>_significance.png
        fold change against expression level
    '''
    ###########################################
    ###########################################
    # create diagnostic plots
    ###########################################
    outdir = os.path.join(PARAMS["exportdir"], "cuffdiff")

    dbhandle = sqlite3.connect(PARAMS["database"])

    prefix = P.snip(infile, ".load")

    geneset, method = prefix.split("_")

    for level in CUFFDIFF_LEVELS:
        tablename_diff = prefix + "_%s_diff" % level
        tablename_levels = prefix + "_%s_levels" % level

        # note that the ordering of EXPERIMENTS and the _diff table
        # needs to be the same as only one triangle is stored of the
        # pairwise results.  do not plot "undefined" lfold values
        # (where treatment_mean or control_mean = 0) do not plot lfold
        # values where the confidence bounds contain 0.
        for track1, track2 in itertools.combinations(EXPERIMENTS, 2):
            statement = """
            SELECT CASE WHEN d.treatment_mean < d.control_mean
            THEN d.treatment_mean
            ELSE d.control_mean END,
            d.l2fold, d.significant
            FROM %(tablename_diff)s AS d
            WHERE treatment_name = '%(track1)s' AND
            control_name = '%(track2)s' AND
            status = 'OK' AND
            treatment_mean > 0 AND
            control_mean > 0
            """ % locals()

            data = zip(*Database.executewait(dbhandle, statement))

            pngfile = "%(outdir)s/%(geneset)s_%(method)s_%(level)s_%(track1)s_vs_%(track2)s_significance.png" % locals()

            # ian: Bug fix: moved R.png to after data check so that no
            #     plot is started if there is no data this was leading
            #     to R falling over from too many open devices

            if len(data) == 0:
                E.warn("no plot for %s - %s -%s vs %s" %
                       (pngfile, level, track1, track2))
                continue

            R.png(pngfile)
            R.plot(ro.FloatVector(data[0]),
                   ro.FloatVector(data[1]),
                   xlab='min(FPKM)',
                   ylab='log2fold',
                   log="x", pch=20, cex=.1,
                   col=R.ifelse(ro.IntVector(data[2]), "red", "black"))

            R['dev.off']()

    P.touch(outfile)
Esempio n. 26
0
def buildExpressionStats(
        dbhandle,
        outfile,
        tablenames,
        outdir,
        regex_table="(?P<design>[^_]+)_"
        "(?P<geneset>[^_]+)_"
        "(?P<counting_method>[^_]+)_"
        "(?P<method>[^_]+)_"
        "(?P<level>[^_]+)_diff"):
    """compile expression summary statistics from database.

    This method outputs a table with the number of genes tested,
    failed, differentially expressed, etc. for a series of DE tests.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    tables : list
        List of tables to process.
    outfile : string
        Output filename in :term:`tsv` format.
    outdir : string
        Output directory for diagnostic plots.
    regex : string
        Regular expression to extract experimental information
        from table name.
    """

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(
        ("design",
         "geneset",
         "level",
         "counting_method",
         "treatment_name",
         "control_name",
         "tested",
         "\t".join(["status_%s" % x for x in keys_status]),
         "significant",
         "twofold")) + "\n")

    for tablename in tablenames:
        r = re.search(regex_table, tablename)
        if r is None:
            raise ValueError(
                "can't match tablename '%s' to regex" % tablename)
        geneset = r.group("geneset")
        design = r.group("design")
        level = r.group("level")
        counting_method = r.group("counting_method")
        geneset = r.group("geneset")

        def toDict(vals, l=2):
            return collections.defaultdict(
                int,
                [(tuple(x[:l]), x[l]) for x in vals])

        tested = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())
        status = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, status, "
            "COUNT(*) FROM %(tablename)s "
            "GROUP BY treatment_name,control_name,status"
            % locals()).fetchall(), 3)
        signif = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "WHERE significant "
            "GROUP BY treatment_name,control_name" % locals()
            ).fetchall())

        fold2 = toDict(Database.executewait(
            dbhandle,
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %(tablename)s "
            "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
            "GROUP BY treatment_name,control_name,significant"
            % locals()).fetchall())

        for treatment_name, control_name in tested.keys():
            outf.write("\t".join(map(str, (
                design,
                geneset,
                level,
                counting_method,
                treatment_name,
                control_name,
                tested[(treatment_name, control_name)],
                "\t".join(
                    [str(status[(treatment_name, control_name, x)])
                     for x in keys_status]),
                signif[(treatment_name, control_name)],
                fold2[(treatment_name, control_name)]))) + "\n")

        # plot length versus P-Value
        data = Database.executewait(
            dbhandle,
            "SELECT i.sum, pvalue "
            "FROM %(tablename)s, "
            "%(geneset)s_geneinfo as i "
            "WHERE i.gene_id = test_id AND "
            "significant" % locals()).fetchall()

        # require at least 10 datapoints - otherwise smooth scatter fails
        if len(data) > 10:
            data = zip(*data)

            pngfile = ("%(outdir)s/%(design)s_%(geneset)s_%(level)s"
                       "_pvalue_vs_length.png") % locals()
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)

            R['dev.off']()

    outf.close()