def createViewMapping(infile, outfile):
    '''create view in database for alignment stats.

    This view aggregates all information on a per-track basis.

    The table is built from the following tracks:

    mapping_stats
    bam_stats
    '''

    tablename = P.toTable(outfile)

    # cannot create views across multiple databases, so use a table
    view_type = "TABLE"

    dbhandle = connect()
    Database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    statement = '''
    CREATE %(view_type)s %(tablename)s AS
    SELECT *
    FROM bam_stats AS b
    '''

    Database.executewait(dbhandle, statement % locals())
def getTableFromDb(database_url, table):
    '''Get a table from a database with pandas, indexed by the
    ``track`` column.'''

    dbhandle = Database.connect(url=database_url)
    df = pandas.read_sql("SELECT * FROM {}".format(table), con=dbhandle)
    df = df.set_index("track")

    return df
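# Example usage of getTableFromDb (a sketch; the sqlite URL and the table
# name "bam_stats" are illustrative, not fixed by this module):
#
#   df = getTableFromDb("sqlite:///./csvdb", "bam_stats")
#   print(df.head())  # one row per track, stats as columns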
def getModelCoverage(database_url, table_regex, model_type="transcript"):
    '''Compute transcript model coverage stats

    Arguments
    ---------
    database_url: string
      database containing transcript counts
    table_regex: string
      regular expression for transcript count table
    model_type: string
      calculate coverages over either transcripts or genes.
      Default is transcript models.

    Returns
    -------
    coverage_df: Pandas.Core.DataFrame
      model coverage stats summarised for each cell
    '''

    # need to regex for all the tables, one for each sample
    # fetchall returns a list of tuples
    dbhandle = Database.connect(database_url)
    cc = dbhandle.execute("SELECT name FROM sqlite_master WHERE type='table';")

    tab_reg = re.compile(table_regex)
    table_list = [tx[0] for tx in cc.fetchall() if re.search(tab_reg, tx[0])]

    # pull out counts for each cell and compute coverages
    bins = range(0, 101)
    cov_dict = {}
    for tab in table_list:
        covs = extractTranscriptCounts(dbhandle, tab)
        freq_array = summariseOverBins(covs, bins)
        cov_dict[tab] = freq_array

    coverage_df = pandas.DataFrame(cov_dict).T

    # create a regex group to remove superfluous characters
    # from the track names
    ix_re = re.compile(
        r"_(?P<run>\d+)_(?P<plate>\d+)_(?P<well>\d+)_(?P<mapper>\S+)"
        r"_transcript_counts")
    re_matches = [re.match(ix_re, ix) for ix in coverage_df.index]
    indx = ["%s_%s-%s.%s" % rm.group(1, 2, 3, 4) for rm in re_matches]

    coverage_df.index = indx
    coverage_df.columns = ["Bin%i" % bx for bx in coverage_df.columns]

    return coverage_df
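# Example usage of getModelCoverage (a sketch; the URL and the table-name
# pattern are illustrative and assume count tables named like the regex
# in the function above):
#
#   cov_df = getModelCoverage("sqlite:///./csvdb",
#                             table_regex=r".+_transcript_counts$")
#   # one row per cell, columns Bin0..Bin100 of coverage frequencies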
def loadCodingPotential(infile, outfile):
    '''load annotations'''

    table = P.toTable(outfile)

    statement = '''
    gunzip < %(infile)s
    | cgat csv2db
          %(csv2db_options)s
          --allow-empty-file
          --add-index=gene_id
          --map=gene_id:str
          --table=%(table)s
    > %(outfile)s'''

    P.run()

    # set the is_coding flag
    dbhandle = sqlite3.connect(PARAMS["database_name"])
    Database.executewait(
        dbhandle,
        '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
    Database.executewait(
        dbhandle,
        '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals())
    dbhandle.commit()
def DumpGOFromDatabase(outfile, dbhandle, options):
    """read GO assignments from the database and dump them into a
    flatfile (one-to-many mapping of genes to GO categories).

    Summary counts per ontology are written to the log.
    """

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.defaultdict(int)
    all_categories = collections.defaultdict(int)
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:

        genes = collections.defaultdict(int)
        categories = collections.defaultdict(int)
        ntotal = 0
        statement = GetGOStatement(go_type, options.database_url,
                                   options.species)

        results = Database.executewait(
            dbhandle, statement, retries=0).fetchall()

        for result in results:
            outfile.write("{}\t{}\n".format(
                go_type, "\t".join(map(str, result))))
            gene_id, goid, description, evidence = result
            genes[gene_id] += 1
            categories[goid] += 1
            ntotal += 1
            all_genes[gene_id] += 1
            all_categories[goid] += 1
            all_ntotal += 1

        E.info("%s\t%i\t%i\t%i" % (go_type, ntotal, len(genes),
                                   len(categories)))

    E.info("%s\t%i\t%i\t%i" % ("all", all_ntotal, len(all_genes),
                               len(all_categories)))

    return
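# Example usage of DumpGOFromDatabase (a sketch; the Namespace attributes
# mirror what the function reads from ``options`` -- ontology, database_url,
# species -- and the ENSEMBL URL and species are illustrative only):
#
#   import argparse
#   opts = argparse.Namespace(
#       ontology=["biol_process"],
#       database_url="mysql://anonymous@ensembldb.ensembl.org/my_core_db",
#       species="homo_sapiens")
#   dbhandle = Database.connect(url=opts.database_url)
#   with open("go_dump.tsv", "w") as outf:
#       DumpGOFromDatabase(outf, dbhandle, opts)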
def connectToUCSC(host="genome-mysql.cse.ucsc.edu",
                  user="******",
                  database=None):
    """connect to the UCSC database.

    Arguments
    ---------
    host : string
        Host to connect to
    user : string
        Username to connect with
    database : string
        Database to use

    Returns
    -------
    Database handle
    """
    dbhandle = Database.connect(
        url="mysql://{user}@{host}/{database}".format(**locals()))

    return dbhandle
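# Example usage of connectToUCSC (a sketch; "hg38" is an illustrative UCSC
# assembly name and the query assumes the standard UCSC ``refGene`` table):
#
#   dbhandle = connectToUCSC(database="hg38")
#   cc = dbhandle.execute(
#       "SELECT name, chrom, txStart, txEnd FROM refGene LIMIT 5")
#   for row in cc.fetchall():
#       print(row)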
def ReadGene2GOFromDatabase(dbhandle, go_type, database, species):
    """read GO assignments from an ENSEMBL database.

    Returns a dictionary of lists (one-to-many mapping of genes to GO
    categories) and a dictionary mapping GO terms to GO information.

    Note: assumes that the external_db_id for GO is 1000.
    """

    statement = GetGOStatement(go_type, database, species)
    result = Database.executewait(dbhandle, statement,
                                  retries=0).fetchall()

    gene2go = {}
    go2info = collections.defaultdict(GOInfo)
    for gene_id, goid, description, evidence in result:
        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if gene_id not in gene2go:
            gene2go[gene_id] = []
        gene2go[gene_id].append(gm)
        go2info[goid] = gi

    return gene2go, go2info
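# Example usage of ReadGene2GOFromDatabase (a sketch; the database and
# species names are illustrative):
#
#   gene2go, go2info = ReadGene2GOFromDatabase(
#       dbhandle, "biol_process", "my_core_db", "homo_sapiens")
#   # gene2go: {gene_id: [GOMatch, ...]}; go2info: {go_id: GOInfo}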
def summarizeEffectsPerGene(infile, outfile):
    '''summarize effects on a per-gene level.'''

    tablename = outfile[:-len(".load")]
    track = infile[:-len("_effects.load")]

    dbhandle = connect()

    # a transcript contributes to nmd_knockout (or nmd_affected) if it
    # carries a premature stop and the remaining coding sequence ends
    # upstream of the last exon start, i.e. the stop lies before the
    # last exon junction
    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT
        i.gene_id,
        COUNT(*) AS ntranscripts,
        MIN(e.nalleles) AS min_nalleles,
        MAX(e.nalleles) AS max_nalleles,
        MIN(e.stop_min) AS min_stop_min,
        MAX(e.stop_min) AS max_stop_min,
        MIN(e.stop_max) AS min_stop_max,
        MAX(e.stop_max) AS max_stop_max,
        SUM(CASE WHEN stop_min > 0
                  AND cds_len - stop_min * 3 < last_exon_start
                 THEN 1 ELSE 0 END) AS nmd_knockout,
        SUM(CASE WHEN stop_max > 0
                  AND cds_len - stop_max * 3 < last_exon_start
                 THEN 1 ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info AS i,
         %(track)s_effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id
    ''' % locals()

    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %(tablename)s" % locals())
    Database.executewait(dbhandle, statement)
    Database.executewait(
        dbhandle,
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" %
        locals())
    dbhandle.commit()

    P.touch(outfile)
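# Worked example of the NMD condition above (a sketch; the numbers are
# illustrative): for a transcript with cds_len=900, a premature stop after
# stop_min=200 codons and last_exon_start=700, the test is
# 900 - 200 * 3 = 300 < 700, so the transcript counts towards nmd_knockout.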
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.'''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]
    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND
        a.aligned >= %(rates_min_aligned)i AND
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc "
            "ON r.gene_id = ar_gc.gene_id")

    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, "
            "ar.exons_median AS ka, "
            "a.distance/ar.exons_median AS kska")
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar
        ON r.gene_id = ar.gene_id AND
        ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, "
            "ir.distance AS ki, "
            "a.distance/ir.distance AS kski")
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir
        ON r.gene_id = ir.gene_id AND
        ir.aligned >= %(rates_min_aligned)i''')

    x = locals()
    x.update(PARAMS)

    stmt_select = ", ".join(stmt_select) % x
    stmt_from = " ".join(stmt_from) % x
    stmt_where = " AND ".join(stmt_where) % x

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals())

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
        CAST(r.gene_id AS TEXT) AS gene_id,
        r.exons_sum AS length,
        r.exons_pGC AS pgc,
        %(stmt_select)s
    FROM %(track)s_annotation AS r
    %(stmt_from)s
    WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait(dbhandle, statement)
    dbhandle.commit()

    P.touch(outfile)
def create_view(dbhandle, tables, tablename, outfile,
                view_type="TABLE",
                ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``. If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.
    '''

    Database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = Database.executewait(
            dbhandle,
            "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append(
            [x.lower()
             for x in Database.getColumnNames(dbhandle, table)
             if x != track])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))

    if min(tracks) != max(tracks):
        raise ValueError(
            "number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join(
        ["t0.%s = t%i.%s" % (f, x + 1, y[1])
         for x, y in enumerate(tables[1:])])

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]
        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)
    statement = '''
    CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    WHERE %(where_statement)s
    ''' % locals()

    Database.executewait(dbhandle, statement)

    nrows = Database.executewait(
        dbhandle,
        "SELECT COUNT(*) FROM %(tablename)s" % locals()).fetchone()[0]

    if nrows == 0:
        raise ValueError(
            "empty view %s, check statement = %s" % (tablename, statement))
    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))
    E.info("created %s with %i rows" % (tablename, nrows))

    touch_file(outfile)
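# Example usage of create_view (a sketch; the table names follow the
# docstring example and are assumed to share a "track" column, and the
# output names are illustrative):
#
#   create_view(dbhandle,
#               tables=(("reads_summary", "track"),
#                       ("bam_stats", "track")),
#               tablename="view_mapping",
#               outfile="view_mapping.log",
#               view_type="TABLE")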
def buildDMRStats(tables, method, outfile, dbhandle):
    '''build dmr summary statistics.

    This method counts the number of up/down, 2fold up/down, etc.
    genes in output from (:mod:`scripts/runExpression`).

    This method also creates diagnostic plots in the
    <exportdir>/<method> directory.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables : list
        List of tables with DMR output
    method : string
        Method name
    outfile : string
        Output filename. Tab-separated file with summary statistics.
    '''

    def togeneset(tablename):
        return re.match("([^_]+)_", tablename).groups()[0]

    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join((
        "tileset", "design", "track1", "track2", "tested",
        "\t".join(["status_%s" % x for x in keys_status]),
        "significant", "up", "down",
        "twofold", "twofold_up", "twofold_down")) + "\n")

    all_tables = set(Database.getTables(dbhandle))
    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    for tablename in tables:

        prefix = P.snip(tablename, "_%s" % method)
        tileset, design = prefix.split("_")

        def toDict(vals, l=2):
            return collections.defaultdict(
                int, [(tuple(x[:l]), x[l]) for x in vals])

        E.info("collecting data from %s" % tablename)

        tested = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
                FROM %(tablename)s
                GROUP BY treatment_name, control_name""" %
                locals()).fetchall())
        status = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, status, COUNT(*)
                FROM %(tablename)s
                GROUP BY treatment_name, control_name, status""" %
                locals()).fetchall(), 3)
        signif = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
                FROM %(tablename)s
                WHERE significant
                GROUP BY treatment_name, control_name""" %
                locals()).fetchall())

        fold2 = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
                FROM %(tablename)s
                WHERE (l2fold >= 1 OR l2fold <= -1) AND significant
                GROUP BY treatment_name, control_name, significant""" %
                locals()).fetchall())

        up = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
                FROM %(tablename)s
                WHERE l2fold > 0 AND significant
                GROUP BY treatment_name, control_name, significant""" %
                locals()).fetchall())

        down = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
                FROM %(tablename)s
                WHERE l2fold < 0 AND significant
                GROUP BY treatment_name, control_name, significant""" %
                locals()).fetchall())

        fold2up = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
                FROM %(tablename)s
                WHERE l2fold > 1 AND significant
                GROUP BY treatment_name, control_name, significant""" %
                locals()).fetchall())

        fold2down = toDict(
            Database.executewait(
                dbhandle,
                """SELECT treatment_name, control_name, COUNT(*)
                FROM %(tablename)s
                WHERE l2fold < -1 AND significant
                GROUP BY treatment_name, control_name, significant""" %
                locals()).fetchall())

        groups = list(tested.keys())

        for treatment_name, control_name in groups:
            k = (treatment_name, control_name)
            outf.write("\t".join(map(str, (
                tileset, design, treatment_name, control_name,
                tested[k],
                "\t".join(
                    [str(status[(treatment_name, control_name, x)])
                     for x in keys_status]),
                signif[k],
                up[k], down[k],
                fold2[k], fold2up[k], fold2down[k]))) + "\n")

        ###########################################
        # plot length versus P-value of significant windows
        data = Database.executewait(
            dbhandle,
            '''SELECT end - start, pvalue
            FROM %(tablename)s
            WHERE significant''' % locals()).fetchall()

        # require at least 10 data points - otherwise smooth scatter fails
        if len(data) > 10:
            data = list(zip(*data))

            pngfile = ("%(outdir)s/%(tileset)s_%(design)s_%(method)s"
                       "_pvalue_vs_length.png") % locals()
            R.png(pngfile)
            R.smoothScatter(R.log10(ro.FloatVector(data[0])),
                            R.log10(ro.FloatVector(data[1])),
                            xlab='log10(length)',
                            ylab='log10(pvalue)',
                            log="x", pch=20, cex=.1)
            R['dev.off']()

    outf.close()
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(tablename=tablename)
        statement = '''
        cgat combine_tables
            --cat=track
            --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
            hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty-file')
def run(infile, options, chunk_size=10000):

    # for backwards compatibility
    if options.retry:
        options.retries = 20
    else:
        options.retries = -1

    flavour = get_flavour(options.database_url)
    tablename = quote_tablename(options.tablename, flavour=flavour)

    dbhandle = Database.connect(url=options.database_url)

    if "tab" in options.dialect:
        separator = "\t"
    else:
        separator = ","

    if options.append:
        if_exists = "append"
    else:
        if_exists = "replace"

    # handle header logic up-front
    if options.replace_header:
        if options.header_names is None:
            raise ValueError("No replacement headers provided")
        header = 0
        names = options.header_names
    else:
        if options.header_names is None:
            header = 0
            names = None
        else:
            header = None
            names = options.header_names

    counter = E.Counter()
    # make sure empty_columns is defined even if no chunks are read
    empty_columns = set()
    try:
        for idx, df in enumerate(
                pandas.read_csv(infile,
                                header=header,
                                names=names,
                                sep=separator,
                                index_col=False,
                                comment="#",
                                chunksize=options.chunk_size)):
            if idx == 0 and len(df) == 0:
                if not options.allow_empty:
                    raise ValueError("table is empty")
            if idx > 0:
                if_exists = "append"

            columns = list(df.columns)
            if options.lowercase_columns:
                columns = [x.lower() for x in columns]
            if options.first_column:
                columns[0] = options.first_column
            # apply the renamed columns before loading
            df.columns = columns

            if options.ignore_columns:
                df = df[[x for x in df.columns
                         if x not in options.ignore_columns]]

            if options.ignore_empty:
                empty_list = df.columns[df.isna().all()].tolist()
                if idx == 0:
                    empty_columns = set(empty_list)
                else:
                    empty_columns = empty_columns.intersection(empty_list)

            df.to_sql(tablename,
                      con=dbhandle,
                      schema=options.database_schema,
                      index=False,
                      if_exists=if_exists)
            counter.input += len(df)
    except pandas.errors.EmptyDataError:
        if not options.allow_empty:
            raise
        else:
            return

    nindex = 0
    for index in options.indices:
        nindex += 1
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                tablename, nindex, tablename, index)
            cc = Database.executewait(dbhandle, statement,
                                      retries=options.retries)
            cc.close()
            E.info("added index on column %s" % (index))
            counter.indexes_created += 1
        except Exception as ex:
            E.info("adding index on column %s failed: %s" % (index, ex))

    if options.ignore_empty:
        counter.empty_columns = len(empty_columns)
        for column in empty_columns:
            try:
                statement = "ALTER TABLE {} DROP COLUMN {}".format(
                    tablename, column)
                cc = Database.executewait(dbhandle, statement,
                                          retries=options.retries)
                cc.close()
                E.info("removed empty column %s" % (column))
                counter.empty_columns_removed += 1
            except Exception as ex:
                E.info("removing empty column {} failed: {}".format(
                    column, ex))

    statement = "SELECT COUNT(*) FROM %s" % (tablename)
    cc = Database.executewait(dbhandle, statement, retries=options.retries)
    result = cc.fetchone()
    cc.close()
    counter.output = result[0]

    E.info(counter)
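# Example usage of run (a sketch; the Namespace attributes mirror exactly
# those the function reads from ``options``, and all values -- URL, table
# name, input file -- are illustrative):
#
#   import argparse
#   opts = argparse.Namespace(
#       retry=False, database_url="sqlite:///./csvdb",
#       tablename="bam_stats", dialect="excel-tab", append=False,
#       replace_header=False, header_names=None, allow_empty=True,
#       lowercase_columns=True, first_column=None, ignore_columns=[],
#       ignore_empty=False, chunk_size=10000, database_schema=None,
#       indices=["track"])
#   with open("bam_stats.tsv") as inf:
#       run(inf, opts)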
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])
    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])
    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s don't know which column"
                         " to sort on" %
                         PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use [default=%default].")

    parser.add_option("-i", "--slims", dest="filename_slims", type="string",
                      help="filename with GO SLIM categories "
                      "[default=%default].")

    parser.add_option("-g", "--genes-tsv-file", dest="filename_genes",
                      type="string",
                      help="filename with genes to analyse "
                      "[default=%default].")

    parser.add_option("-b", "--background-tsv-file",
                      dest="filename_background",
                      type="string",
                      help="filename with background genes to analyse "
                      "[default=%default].")

    parser.add_option("-m", "--min-counts", dest="minimum_counts",
                      type="int",
                      help="minimum count - ignore all categories that have "
                      "fewer than # number of genes"
                      " [default=%default].")

    parser.add_option("-o", "--sort-order", dest="sort_order",
                      type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option("--ontology", dest="ontology", type="string",
                      action="append",
                      help="go ontologies to analyze. Ontologies are tested "
                      "separately [default=%default].")

    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option("--filename-dump", dest="filename_dump", type="string",
                      help="dump GO category assignments into a flatfile "
                      "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file", dest="filename_gene2name", type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology", dest="filename_ontology", type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option("--filename-input", dest="filename_input",
                      type="string",
                      help="read GO category assignments from a flatfile "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample", type="int",
                      help="do sampling (with # samples) "
                      "[default=%default].")

    parser.add_option(
        "--filename-output-pattern", "--output-filename-pattern",
        dest="output_filename_pattern", type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option("--fdr", dest="fdr", action="store_true",
                      help="calculate and filter by FDR "
                      "[default=%default].")

    parser.add_option(
        "--go2goslim", dest="go2goslim", action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option("--gene-pattern", dest="gene_pattern", type="string",
                      help="pattern to transform identifiers to GO gene "
                      "names [default=%default].")

    parser.add_option("--filename-map-slims", dest="filename_map_slims",
                      type="string",
                      help="write mapping between GO categories and GOSlims "
                      "[default=%default].")

    parser.add_option(
        "--get-genes", dest="get_genes", type="string",
        help="list all genes with a certain GOID [default=%default].")

    parser.add_option(
        "--strict", dest="strict", action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q", "--fdr-method", dest="qvalue_method", type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise", dest="compute_pairwise", action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]." )
    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method",
    #                   type="choice",
    #                   choices=("smoother", "bootstrap"),
    #                   help="fdr computation: method for estimating pi0 "
    #                   "[default=%default]." )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies to dump
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = Database.connect(url=options.database_url)

        outfile = IOTools.open_file(options.filename_dump, "w",
                                    create_dir=True)
        GO.DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.open_file(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.open_file(options.filename_gene2name)
        gene2name = IOTools.read_map(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.open_file(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)
        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(
                go.mId, go_type=go.mNameSpace, description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for foreground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:
        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background,
            gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join((
        "genelist",
        "ontology",
        "significant",
        "threshold",
        "ngenes",
        "ncategories",
        "nmaps",
        "nforeground",
        "nforeground_mapped",
        "nbackground",
        "nbackground_mapped",
        "nsample_counts",
        "nbackground_counts",
        "psample_assignments",
        "pbackground_assignments",
        "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)

        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], \
                go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle = Database.connect(url=options.database_url)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle, test_ontology,
                options.database_url, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([
                x for x, y in counts_per_category.items()
                if y < options.minimum_counts])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" %
                   (ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))

            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug fix: background included the foreground in a
            # tuple. background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    IOTools.open_file(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims, len(go_slims), len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.open_file(
                            options.filename_map_slims, "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = GO.MapGO2Slims(gene2go, go_slims,
                                         ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" % (
                            ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write(
                    "# genes in GO category %s\n" % options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % (x, "fg"))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % (x, "bg"))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % (x, "ng"))
                E.info("nfg=%i, nbg=%i, nng=%i" %
                       (len(fg), len(bg), len(ng)))

                E.stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results,
                                                       foreground,
                                                       background,
                                                       options,
                                                       test_ontology,
                                                       gene2go,
                                                       go2info)
                for goid, result in pairs:
                    result.mQValue = fdrs[goid][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(
                outfile, pairs, go2info, options, fdrs=fdrs, samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs,
                                                         options)

            nselected = len(filtered_pairs)
            nselected_up = len(
                [x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category "
                "'%s'\n" % (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write(
                "mapped_categories\t%i\tmapped categories\n" % ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground "
                "with GO assignments\n" % (len(go_results.mSampleGenes)))
            outfile.write(
                "genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background "
                "with GO assignments\n" %
                (len(go_results.mBackgroundGenes)))
            outfile.write(
                "associations_in_fg\t%i\tassociations in sample\n" %
                go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes "
                "in sample with GO assignments\n" % (
                    IOTools.pretty_percent(len(go_results.mSampleGenes),
                                           len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes "
                "in background with GO assignments\n" % (
                    IOTools.pretty_percent(len(go_results.mBackgroundGenes),
                                           nbackground, "%5.2f")))
            outfile.write(
                "significant\t%i\tsignificant results reported\n" %
                nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results "
                "reported\n" % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant down-regulated results "
                "reported\n" % nselected_down)
            outfile.write(
                "threshold\t%6.4f\tsignificance threshold\n" %
                options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(map(str, (
                genelist_name,
                test_ontology,
                nselected,
                options.threshold,
                ngenes, ncategories, nmaps,
                len(foreground),
                len(go_results.mSampleGenes),
                nbackground,
                len(go_results.mBackgroundGenes),
                go_results.mSampleCountsTotal,
                go_results.mBackgroundCountsTotal,
                IOTools.pretty_percent(len(go_results.mSampleGenes),
                                       len(foreground), "%5.2f"),
                IOTools.pretty_percent(len(go_results.mBackgroundGenes),
                                       nbackground, "%5.2f"),
                ",".join(msgs)))) + "\n")

            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile, pairs, go2info, options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files

            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology,
                                        go2info,
                                        options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.stop()