def meta(self, tbl, organism, db_version):
    """Returns the html description from the trackDb file for the specified organism."""
    try:
        trackdb = uscsreader.load_tabledata_dumpfiles(
            os.path.join(sett["data_dir"][db_version], organism, "trackDb"))
        html = trackdb[map(itemgetter('tableName'), trackdb).index(tbl)]['html']
        return html
    except Exception, e:
        return "<h3>(No data found for {}.)</h3>".format(tbl)
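
# --- Illustrative sketch (not part of the original module) -------------------
# Demonstrates the trackDb lookup pattern used in meta() above: map/itemgetter
# builds the list of table names and .index() locates the matching row. The toy
# trackdb rows below are hypothetical stand-ins for the dicts returned by
# uscsreader.load_tabledata_dumpfiles. Python 2 semantics are assumed here
# (map returns a list, so .index() is available).
def _demo_trackdb_lookup():
    from operator import itemgetter
    trackdb = [
        {"tableName": "knownGene", "html": "<p>UCSC known genes</p>"},
        {"tableName": "rmsk", "html": "<p>RepeatMasker annotations</p>"},
    ]
    tbl = "rmsk"
    # same expression as in meta(): find the row whose tableName matches tbl
    return trackdb[map(itemgetter("tableName"), trackdb).index(tbl)]["html"]
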
def get_heatmaps(self, run_id, organism):
    """Returns the clustered and PCC matrices if they exist.
    'organism' is used to load detailed labels for the GFs.
    """
    global results_dir, uploads_dir, sett
    cherrypy.response.headers['Content-Type'] = 'application/json'
    trackdb = uscsreader.load_tabledata_dumpfiles(
        os.path.join(sett["data_dir"], organism, "trackDb"))
    results = {}
    path = os.path.join(results_dir, run_id)
    matrix_path = os.path.join(path, "matrix.txt")
    # Load clustered matrix
    matrix_clust_path = os.path.join(path, "clustered.txt")
    if os.path.exists(matrix_path):
        with open(matrix_path) as f:
            results["matrix"] = f.read().replace("\"", "")
    if os.path.exists(matrix_clust_path):
        with open(matrix_clust_path) as f:
            d = f.read()
            d = d.replace("\"", "")
            if d[:6] != "ERROR:":
                results["matrix_data"] = d
                # d3 requires "gene_name" to be inserted into the first column
                tmp_data = results["matrix_data"].split("\n")
                tmp = tmp_data[:]  # this copy is passed on to the results page
                # insert the description column if it does not exist
                tmp_matrix = [x.split("\t") for x in tmp]
                if tmp_matrix[0][-1] != "Genomic Feature Description":
                    tmp_matrix[0] += ["Genomic Feature Description"]
                    for i in range(1, len(tmp)):
                        description = [x["longLabel"] for x in trackdb
                                       if x["tableName"] == tmp_matrix[i][0]]
                        if len(description) != 0:
                            description = description[0]
                        else:
                            description = ""
                        tmp_matrix[i] += [description]
                results["matrix_data"] = "\n".join(
                    ["\t".join(["gene_name", tmp[0]])] + tmp[1:])
                results["matrix_data_gf_description"] = "\t".join(
                    [x[-1] for x in tmp_matrix[1:]])
            else:
                results["matrix_data"] = d[6:]
    else:
        results["matrix_data"] = "Heatmap will be available after the analysis is complete."
        results["matrix_data_gf_description"] = ""
    # Pearson's matrix results
    matrix_cor_path = os.path.join(path, "pcc_matrix.txt")
    if os.path.exists(matrix_cor_path):
        with open(matrix_cor_path) as f:
            d = f.read()
            d = d.replace("\"", "")
            if d[:6] != "ERROR:":
                results["matrix_cor_data"] = d
                results["matrix_cor"] = d
                # d3 requires "gene_name" to be inserted into the first column
                tmp = results["matrix_cor"].split("\n")
                results["matrix_cor"] = "\n".join(
                    ["\t".join(["gene_name", tmp[0]])] + tmp[1:])
            else:
                results["matrix_cor"], results["matrix_cor_data"] = d[6:], ""
    else:
        results["matrix_cor_data"] = ""
        results["matrix_cor"] = "PCC heatmap will be available after the clustered matrix is created."
    pvalue_path = os.path.join(path, "pcc_matrix_pvalue.txt")
    if os.path.exists(pvalue_path):
        with open(pvalue_path) as f:
            d = f.read()
            results["matrix_cor_pvalues"] = d.replace("\"", "")
            # d3 requires "gene_name" to be inserted into the first column
            tmp = results["matrix_cor_pvalues"].split("\n")
            results["matrix_cor_pvalues"] = "\n".join(
                ["\t".join(["gene_name", tmp[0]])] + tmp[1:])
    else:
        results["matrix_cor_pvalues"] = ""
    return simplejson.dumps(results)
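
# --- Illustrative sketch (not part of the original module) -------------------
# Shows the "gene_name" header fix that get_heatmaps() applies before handing a
# matrix to the d3 heatmap: the header row is prefixed with "gene_name" so each
# data row lines up with a named first column. The tab-separated matrix below is
# hypothetical example data.
def _demo_gene_name_header():
    d = "GF1\tGF2\nfoi_a\t0.01\t0.5\nfoi_b\t0.3\t0.02"
    tmp = d.split("\n")
    # header row becomes: gene_name\tGF1\tGF2; data rows are unchanged
    return "\n".join(["\t".join(["gene_name", tmp[0]])] + tmp[1:])
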
def run_hypergeom(fois, gfs, bg_path, outdir, job_name="", zip_run_files=False,
                  bkg_overlaps_path="", gr_data_dir="", run_annotation=True,
                  run_randomization_test=False, padjust="None", pct_score="",
                  organism=""):
    global formatter
    global detailed_outpath, matrix_outpath, progress_outpath, curprog, progmax, run_files_dir
    if not os.path.exists(os.path.normpath(outdir)):
        os.mkdir(os.path.normpath(outdir))
    fh = logging.FileHandler(os.path.join(outdir, 'gr_log.txt'))
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    run_files_dir = outdir
    curprog, progmax = 0, 1
    logger.info("ORGANISM " + str(organism))
    logger.info("P_value adjustment used: {}".format(padjust))
    try:
        trackdb = []
        trackdb_path = os.path.join(gr_data_dir, organism, "trackDb")
        if os.path.exists(trackdb_path + ".txt.gz") and os.path.exists(trackdb_path + ".sql"):
            trackdb = bedfilecreator.load_tabledata_dumpfiles(trackdb_path)
        # set output settings
        detailed_outpath = os.path.join(outdir, "detailed.txt")
        matrix_outpath = os.path.join(outdir, "matrix.txt")
        progress_outpath = os.path.join(outdir, ".prog")
        f = open(matrix_outpath, 'wb')
        f.close()
        f = open(detailed_outpath, 'wb')
        f.close()
        hdlr = logging.FileHandler(logger_path)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.propagate = False
        # Read in the paths
        fois = [line for line in read_lines(fois) if not line.endswith(".tbi")]
        gfs = [line for line in read_lines(gfs) if not line.endswith(".tbi")]
        # check if there are spaces in invalid parts of the file name
        invalid_names = validate_filenames(fois + gfs + [bg_path])
        if len(invalid_names) != 0:
            logger.error("The following file(s) have invalid file names, cannot have space before/in file extension:\n" + "\n".join(invalid_names))
            _write_progress("ERROR: Files have invalid filenames. See log file. Terminating run. See Analysis Log.")
            return
        if bg_path.endswith(".tbi"):
            logger.error("Background has invalid extension (.tbi). Terminating run.")
            _write_progress("ERROR: Background has invalid extension (.tbi). Terminating run. See Analysis Log.")
            return

        # pre-process the FOIs
        fois = preprocess_fois(fois, run_files_dir, gr_data_dir, organism)
        if len(fois) == 0:
            logger.error('No valid FOIs supplied')
            _write_progress("ERROR: No valid FOI files supplied. Terminating run. See Analysis Log.")
            return

        # Validate FOIs against background. Also get the size of the background (n_bgs)
        foi_bg, good_fois = check_background_foi_overlap(bg_path, fois)
        write_output("\t".join(map(base_name, good_fois)) + "\n", matrix_outpath)
        write_output("\t".join(['foi_name', 'foi_obs', 'n_fois', 'bg_obs', 'n_bgs',
                                'odds_ratio', 'p_val', 'test_type',
                                'p_rand' if run_randomization_test else "",
                                'p_mod' if run_randomization_test else ""]) + "\n",
                     detailed_outpath)
        curprog, progmax = 0, len(gfs)
        # remove old detailed enrichment result files if they exist
        enr_path = os.path.join(run_files_dir, "enrichment")
        for f in good_fois:
            f_path = os.path.join(enr_path, base_name(f) + '.txt')
            if os.path.exists(f_path):
                os.remove(f_path)
        _write_progress("Performing calculations on the background.")
        for gf in gfs:
            current_gf = base_name(gf)
            _write_progress("Performing Hypergeometric analysis for {}".format(base_name(gf)))
            write_output("###" + base_name(gf) + "\t" + get_score_strand_settings(gf) +
                         "\t" + get_description(base_name(gf), trackdb) + "###" + "\n",
                         detailed_outpath)
            res = get_overlap_statistics(gf, good_fois)
            # calculate bg_obs
            bg_obs = get_bgobs(bg_path, gf, bkg_overlaps_path)
            if bg_obs is None:
                logger.error("Skipping {}".format(gf))
                continue
            n_bgs = foi_bg[0]["indexregions"]
            # run the enrichment analysis and output the matrix line for the current gf
            write_output("\t".join([base_name(gf)] +
                                   [str(p_value(res[i]["intersectregions"],
                                                res[i]["queryregions"],
                                                bg_obs, n_bgs, good_fois[i], gf,
                                                bg_path, run_randomization_test))
                                    for i in range(len(good_fois))]) + "\n",
                         matrix_outpath)
            curprog += 1

        # Adjust p values for the enrichment files
        list_enr = [x for x in os.listdir(enr_path)]
        for p in list_enr:
            adjust_detailed_pvalue(os.path.join(enr_path, p), padjust)

        if len(gfs) >= 1 and len(good_fois) >= 1:
            # Adjust the pvalues
            adjust_pvalue(matrix_outpath, padjust)
            # Cluster the matrix
            clust_path = cluster_matrix(matrix_outpath, os.path.join(outdir, "clustered.txt"))
            if len(gfs) > 4:
                pearsons_cor_matrix(clust_path, outdir)
            else:
                json_mat = []
                json_mat.append({"log": False, "neg": "Underrepresented",
                                 "pos": "Overrepresented", "name": "Pearsons", "alpha": "",
                                 "matrix": "ERROR:PCC matrix requires at least a 5 X 2 matrix."})
                json_mat.append({"log": False, "neg": "Underrepresented",
                                 "pos": "Overrepresented", "name": "P-value", "alpha": "",
                                 "matrix": "ERROR:PCC matrix requires at least a 5 X 2 matrix."})
                with open(os.path.join(outdir, "pcc_matrix.json"), 'wb') as f:
                    simplejson.dump(json_mat, f)
                with open(os.path.join(outdir, "pcc_matrix.txt"), "wb") as wb:
                    wb.write("ERROR:PCC matrix requires at least a 5 X 2 matrix.")
        else:
            with open(os.path.join(outdir, "clustered.txt"), "wb") as wb:
                wb.write("ERROR:Clustered matrix requires at least a 2 X 2 matrix.")

        if run_annotation:
            annot_outdir = os.path.join(outdir, "annotations")
            if not os.path.exists(annot_outdir):
                os.mkdir(annot_outdir)
            curprog, progmax = 0, len(fois)
            for f in fois:
                _write_progress("Running Annotation Analysis for {}.".format(base_name(f)))
                with open(os.path.join(annot_outdir, base_name(f) + ".txt"), "wb") as wr:
                    anot = get_annotation(f, gfs).split("\n")
                    anot[0] = anot[0].replace("Region\t\t", "Region\t")
                    # annotationAnalysis column order is the reverse of the input order
                    wr.write("Region" + "\t" +
                             "\t".join(base_name(x) for x in reversed(anot[0].split("\t")[1:])) +
                             "\tTotal")
                    for ind, a in enumerate(anot[1:]):
                        if a.strip() != "":
                            cur_row = a.split("\t")
                            wr.write("\n" + str(ind) + "|" +
                                     "\t".join(cur_row +
                                               [str(sum([int(x) for x in cur_row[1:] if x != ""]))]))
                curprog += 1

        if zip_run_files:
            _write_progress("Preparing run files for download")
            _zip_run_files(fois, gfs, bg_path, outdir, job_name)
        curprog, progmax = 1, 1
        _write_progress("Analysis Completed")
    except Exception, e:
        logger.error(traceback.format_exc())
        write_output(traceback.format_exc(), logger_path)
        _write_progress("Run crashed. See end of log for details.")
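
# --- Illustrative sketch (not part of the original module) -------------------
# A hypothetical invocation of run_hypergeom(). Every path, the job name, the
# organism assembly, and the adjustment method below are placeholders chosen for
# the example, not values from this repository; the module-level logger,
# formatter, and logger_path are assumed to be configured elsewhere.
if __name__ == "__main__":
    run_hypergeom(
        fois="uploads/run42/fois.txt",       # text file listing FOI paths, one per line
        gfs="uploads/run42/gfs.txt",         # text file listing GF paths, one per line
        bg_path="data/hg19/background.bed",  # background regions
        outdir="results/run42",              # created if it does not exist
        job_name="run42",
        gr_data_dir="data",                  # root containing <organism>/trackDb dumps
        organism="hg19",
        padjust="fdr",                       # p-value adjustment method (placeholder)
    )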