Example #1
	def meta(self, tbl,organism,db_version):
		"""Returns the html description from the trackDb file for the specified organism.
		"""
		try:
			trackdb = uscsreader.load_tabledata_dumpfiles(os.path.join(sett["data_dir"][db_version],organism,"trackDb"))
			html = trackdb[map(itemgetter('tableName'),trackdb).index(tbl)]['html']
		except Exception:
			return "<h3>(No data found for {}.)</h3>".format(tbl)
Example #2
    def meta(self, tbl, organism, db_version):
        """Returns the html description from the trackDb file for the specified organism.
		"""
        try:
            trackdb = uscsreader.load_tabledata_dumpfiles(
                os.path.join(sett["data_dir"][db_version], organism,
                             "trackDb"))
            html = trackdb[map(itemgetter('tableName'),
                               trackdb).index(tbl)]['html']
        except Exception:
            return "<h3>(No data found for {}.)</h3>".format(tbl)
Example #3
	def get_heatmaps(self, run_id, organism):
		"""	Returns clustered and PCC matrix if they exist.
		'organism': is used to load detailed labels for the GFs.
		"""
		cherrypy.response.headers['Content-Type'] = 'application/json'
		trackdb = uscsreader.load_tabledata_dumpfiles(os.path.join(sett["data_dir"],organism,"trackDb"))
		results = {}
		path = os.path.join(results_dir, run_id)
		matrix_path = os.path.join(path,"matrix.txt")
		# Load clustered matrix
		matrix_clust_path  = os.path.join(path,"clustered.txt")
		if os.path.exists(matrix_path):
			with open(matrix_path) as f:
				results["matrix"] = f.read().replace("\"","")
		if os.path.exists(matrix_clust_path):
			with open(matrix_clust_path) as f:
				d = f.read()
				d = d.replace("\"","")
				if d[:6] != "ERROR:":
					results["matrix_data"] = d.replace("\"","")
					# d3 requires "gene_name" to be inserted into the first column
					tmp_data =  results["matrix_data"].split("\n")
					tmp = tmp_data[:] # this copy is passed on to the results page
					# insert the description column if it does not exist
					tmp_matrix = [x.split("\t") for x in tmp]
					if tmp_matrix[0][-1] != "Genomic Feature Description":
						tmp_matrix[0] += ["Genomic Feature Description"]
						for i in range(1,len(tmp)):
							description = [x["longLabel"] for x in trackdb if x["tableName"] == tmp_matrix[i][0]]
							if len(description) != 0: description = description[0]
							else: description = ""
							tmp_matrix[i] += [description]					

					results["matrix_data"] = "\n".join(["\t".join(["gene_name",tmp[0]])]+tmp[1:])  
					results["matrix_data"] = results["matrix_data"]
					results["matrix_data_gf_description"] = "\t".join([x[-1] for x in tmp_matrix[1:]])
				else: results["matrix_data"] = d[6:]
		else: 
			results["matrix_data"] = "Heatmap will be available after the analysis is complete."
			results["matrix_data_gf_description"] = ""

		# Pearson's matrix results
		matrix_cor_path = os.path.join(path,"pcc_matrix.txt")
		if os.path.exists(matrix_cor_path):
			with open(matrix_cor_path) as f:
				d = f.read()
				d = d.replace("\"","")
				if d[:6] != "ERROR:":
					results["matrix_cor_data"] =  d.replace("\"","")
					results["matrix_cor"] = d.replace("\"","")
					# d3 requires "gene_name" to be inserted into the first column
					tmp =  results["matrix_cor"].split("\n")
					results["matrix_cor"] = "\n".join(["\t".join(["gene_name",tmp[0]])]+tmp[1:])  
					results["matrix_cor"] = results["matrix_cor"]
				else: results["matrix_cor"],results["matrix_cor_data"] = d[6:],""				

		else:
			results["matrix_cor_data"] = ""
			results["matrix_cor"] = "PCC heatmap will be available after the clustered matrix is created."
		pvalue_path = os.path.join(path,"pcc_matrix_pvalue.txt")
		if os.path.exists(pvalue_path):
			with open(pvalue_path) as f:
				d = f.read()
				results["matrix_cor_pvalues"] = d.replace("\"","")
				# d3 requires "gene_name" to be inserted into the first column
				tmp =  results["matrix_cor_pvalues"].split("\n")
				results["matrix_cor_pvalues"] = "\n".join(["\t".join(["gene_name",tmp[0]])]+tmp[1:])   
				results["matrix_cor_pvalues"] = results["matrix_cor_pvalues"]			
		else: 
			results["matrix_cor_pvalues"] = ""
		return simplejson.dumps(results)
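get_heatmaps() repeats the same header manipulation for each matrix file: strip literal quotes and prefix the header row with "gene_name" so d3 can treat the first column as row labels. A minimal sketch of that step factored into a helper, assuming the matrices are tab-delimited text as in the code above (prepend_gene_name is a hypothetical helper, not part of the original class):

def prepend_gene_name(matrix_text):
    # Strip stray quotes and add a "gene_name" label in front of the header
    # row so that d3 treats the first column as row labels.
    rows = matrix_text.replace('"', '').split("\n")
    rows[0] = "\t".join(["gene_name", rows[0]])
    return "\n".join(rows)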
Example #4
def run_hypergeom(fois, gfs, bg_path, outdir, job_name="", zip_run_files=False, bkg_overlaps_path="", gr_data_dir="", run_annotation=True, run_randomization_test=False, padjust="None", pct_score="", organism=""):
    global formatter
    global detailed_outpath,matrix_outpath, progress_outpath, curprog, progmax,run_files_dir
    if not os.path.exists(os.path.normpath(outdir)): os.mkdir(os.path.normpath(outdir))
    fh = logging.FileHandler(os.path.join(outdir,'gr_log.txt'))
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    run_files_dir = outdir
    curprog,progmax = 0,1

    logger.info("ORGANIS " + str(organism))
    logger.info("P_value adjustment used: {}".format(padjust))

    try:
        trackdb = []      

        trackdb_path = os.path.join(gr_data_dir,organism,"trackDb")
        if os.path.exists(trackdb_path+".txt.gz") and os.path.exists(trackdb_path + ".sql"):
            trackdb = bedfilecreator.load_tabledata_dumpfiles(trackdb_path)
        # set output settings
        detailed_outpath =  os.path.join(outdir, "detailed.txt") 
        matrix_outpath = os.path.join(outdir,"matrix.txt")
        progress_outpath = os.path.join(outdir,".prog")
        f = open(matrix_outpath,'wb') 
        f.close()
        f = open(detailed_outpath,'wb')
        f.close()
        hdlr = logging.FileHandler(logger_path)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.propagate = False

        # Read in the paths
        fois = [line for line in read_lines(fois) if not line.endswith(".tbi")]
        gfs = [line for line in read_lines(gfs) if not line.endswith(".tbi")]
        # check if there are spaces in invalid parts of the file name
        invalid_names = validate_filenames(fois + gfs + [bg_path])
        if len(invalid_names) != 0:
            logger.error("The following file(s) have invalid file names, cannot have space before/in file extension:\n" + "\n".join(invalid_names))
            _write_progress("ERROR: Files have invalid filenames. See log file. Terminating run. See Analysis Log.")
            return         
        if bg_path.endswith(".tbi"):
            logger.error("Background has invalid extension (.tbi). Terminating run.")
            _write_progress("ERROR: Background has invalid extension (.tbi). Terminating run. See Analysis Log.")
            return

        # pre-process the FOIs
        fois = preprocess_fois(fois,run_files_dir,gr_data_dir,organism)
        if len(fois) == 0:
            logger.error('No valid FOIs supplied')
            _write_progress("ERROR: No valid FOI files supplied. Terminating run. See Analysis Log.")
            return
        # Validate FOIs against background. Also get the size of the background (n_bgs)
        foi_bg,good_fois = check_background_foi_overlap(bg_path,fois)   
        write_output("\t".join(map(base_name,good_fois))+"\n", matrix_outpath)
        write_output("\t".join(['foi_name', 'foi_obs', 'n_fois', 'bg_obs', 'n_bgs', 'odds_ratio', 'p_val','test_type','p_rand' if run_randomization_test else "",'p_mod' if run_randomization_test else ""]) + "\n",detailed_outpath)
        curprog,progmax = 0,len(gfs)
        # remove old detailed enrichment result files if they exist
        enr_path =  os.path.join(run_files_dir,"enrichment")
        for f in good_fois:
            f_path = os.path.join(enr_path, base_name(f)+'.txt')
            if os.path.exists(f_path): os.remove(f_path)
        _write_progress("Performing calculations on the background.")
        for gf in gfs: 
            current_gf = base_name(gf)      
            _write_progress("Performing Hypergeometric analysis for {}".format(base_name(gf)))
            write_output("###"+base_name(gf)+"\t"+get_score_strand_settings(gf)+"\t"+get_description(base_name(gf),trackdb)+"###"+"\n",detailed_outpath)
            res = get_overlap_statistics(gf,good_fois) 

            # calculate bg_obs
            bg_obs = get_bgobs(bg_path,gf,bkg_overlaps_path)
            if bg_obs is None:
                logger.error("Skipping {}".format(gf))
                continue

            n_bgs = foi_bg[0]["indexregions"]  

            # run the enrichment analysis and output the matrix line for the current gf
            write_output("\t".join([base_name(gf)] + [str(p_value(res[i]["intersectregions"],res[i]["queryregions"],bg_obs,n_bgs ,good_fois[i],gf,bg_path,run_randomization_test)) for i in range(len(good_fois))])+"\n",matrix_outpath)
            curprog += 1
        # Adjust p values for the enrichment files
        list_enr = os.listdir(enr_path)

        for p in list_enr:
            adjust_detailed_pvalue(os.path.join(enr_path, p),padjust) 

        if len(gfs) >= 1 and len(good_fois) >= 1:
            # Adjust the pvalues
            adjust_pvalue(matrix_outpath,padjust)
            # Cluster the matrix
            clust_path =  cluster_matrix(matrix_outpath,os.path.join(outdir,"clustered.txt"))
            if len(gfs) > 4:               
                pearsons_cor_matrix(clust_path,outdir)
            else:
                json_mat = []
                json_mat.append({"log": False,"neg": "Underrepresented", "pos": "Overrepresented","name": "Pearsons","alpha":"","matrix": "ERROR:PCC matrix requires at least a 5 X 2 matrix."})      
                json_mat.append({"log": False,"neg": "Underrepresented", "pos": "Overrepresented","name": "P-value","alpha":"","matrix": "ERROR:PCC matrix requires at least a 5 X 2 matrix."})      
                with open(os.path.join(outdir,"pcc_matrix.json"),'wb') as f:   
                    simplejson.dump(json_mat,f)
                with open(os.path.join(outdir,"pcc_matrix.txt"),"wb") as wb:
                    wb.write("ERROR:PCC matrix requires at least a 5 X 2 matrix.")
        else:
            with open(os.path.join(outdir,"clustered.txt"),"wb") as wb:
                wb.write("ERROR:Clustered matrix requires at least a 2 X 2 matrix.")

        if run_annotation:
            annot_outdir = os.path.join(outdir,"annotations")
            if not os.path.exists(annot_outdir): os.mkdir(annot_outdir)
            curprog,progmax = 0,len(fois)
            for f in fois:                
                _write_progress("Running Annotation Analysis for {}.".format(base_name(f)))
                with open(os.path.join(annot_outdir,base_name(f) + ".txt"),"wb") as wr:
                    anot = get_annotation(f,gfs).split("\n")
                    anot[0] = anot[0].replace("Region\t\t","Region\t")
                    wr.write("Region"+"\t"+"\t".join(base_name(x) for x in reversed(anot[0].split("\t")[1:])) + "\tTotal") # annotationAnalysis column order is reverse of input order
                    for ind, a in enumerate(anot[1:]):
                        if a.strip() != "":
                            cur_row = a.split("\t")
                            wr.write("\n" + str(ind) + "|"+"\t".join(cur_row + [str(sum([int(x) for x in cur_row[1:] if x != ""]))]))                            
                curprog += 1

        if zip_run_files:
            _write_progress("Preparing run files for download")
            _zip_run_files(fois,gfs,bg_path,outdir,job_name)
        curprog,progmax = 1,1
        _write_progress("Analysis Completed")       
    except Exception:
        logger.error(traceback.format_exc())
        write_output(traceback.format_exc(),logger_path)
        _write_progress("Run crashed. See end of log for details.")
Example #5
    def get_heatmaps(self, run_id, organism):
        """	Returns clustered and PCC matrix if they exist.
		'organism': is used to load detailed labels for the GFs.
		"""
        global results_dir, uploads_dir, sett
        cherrypy.response.headers['Content-Type'] = 'application/json'
        trackdb = uscsreader.load_tabledata_dumpfiles(
            os.path.join(sett["data_dir"], organism, "trackDb"))
        results = {}
        path = os.path.join(results_dir, run_id)
        matrix_path = os.path.join(path, "matrix.txt")
        # Load clustered matrix
        matrix_clust_path = os.path.join(path, "clustered.txt")
        if os.path.exists(matrix_path):
            with open(matrix_path) as f:
                results["matrix"] = f.read().replace("\"", "")
        if os.path.exists(matrix_clust_path):
            with open(matrix_clust_path) as f:
                d = f.read()
                d = d.replace("\"", "")
                if d[:6] != "ERROR:":
                    results["matrix_data"] = d.replace("\"", "")
                    # d3 requires "gene_name" to be inserted into the first column
                    tmp_data = results["matrix_data"].split("\n")
                    tmp = tmp_data[:]  # this copy is passed on to the results page
                    # insert the description column if it does not exist
                    tmp_matrix = [x.split("\t") for x in tmp]
                    if tmp_matrix[0][-1] != "Genomic Feature Description":
                        tmp_matrix[0] += ["Genomic Feature Description"]
                        for i in range(1, len(tmp)):
                            description = [
                                x["longLabel"] for x in trackdb
                                if x["tableName"] == tmp_matrix[i][0]
                            ]
                            if len(description) != 0:
                                description = description[0]
                            else:
                                description = ""
                            tmp_matrix[i] += [description]

                    results["matrix_data"] = "\n".join(
                        ["\t".join(["gene_name", tmp[0]])] + tmp[1:])
                    results["matrix_data"] = results["matrix_data"]
                    results["matrix_data_gf_description"] = "\t".join(
                        [x[-1] for x in tmp_matrix[1:]])
                else:
                    results["matrix_data"] = d[6:]
        else:
            results["matrix_data"] = "Heatmap will be available after the analysis is complete."
            results["matrix_data_gf_description"] = ""

        # Pearson's matrix results
        matrix_cor_path = os.path.join(path, "pcc_matrix.txt")
        if os.path.exists(matrix_cor_path):
            with open(matrix_cor_path) as f:
                d = f.read()
                d = d.replace("\"", "")
                if d[:6] != "ERROR:":
                    results["matrix_cor_data"] = d.replace("\"", "")
                    results["matrix_cor"] = d.replace("\"", "")
                    # d3 requires "gene_name" to be inserted into the first column
                    tmp = results["matrix_cor"].split("\n")
                    results["matrix_cor"] = "\n".join(
                        ["\t".join(["gene_name", tmp[0]])] + tmp[1:])
                    results["matrix_cor"] = results["matrix_cor"]
                else:
                    results["matrix_cor"], results["matrix_cor_data"] = d[
                        6:], ""

        else:
            results["matrix_cor_data"] = ""
            results["matrix_cor"] = "PCC heatmap will be available after the clustered matrix is created."
        pvalue_path = os.path.join(path, "pcc_matrix_pvalue.txt")
        if os.path.exists(pvalue_path):
            with open(pvalue_path) as f:
                d = f.read()
                results["matrix_cor_pvalues"] = d.replace("\"", "")
                # d3 requires "gene_name" to be inserted into the first column
                tmp = results["matrix_cor_pvalues"].split("\n")
                results["matrix_cor_pvalues"] = "\n".join(
                    ["\t".join(["gene_name", tmp[0]])] + tmp[1:])
                results["matrix_cor_pvalues"] = results["matrix_cor_pvalues"]
        else:
            results["matrix_cor_pvalues"] = ""
        return simplejson.dumps(results)
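Both get_heatmaps() variants rely on a simple convention: a result file whose content starts with the literal prefix "ERROR:" carries an error message instead of a matrix, and only the text after the prefix is returned to the client. A minimal sketch of that convention as a helper (split_error_payload is hypothetical, not part of the original class):

def split_error_payload(text):
    # Follow the "ERROR:"-prefix convention used by the result files:
    # return (matrix_text, error_message), with the unused slot left empty.
    if text.startswith("ERROR:"):
        return "", text[len("ERROR:"):]
    return text, ""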
Example #6
def run_hypergeom(fois,
                  gfs,
                  bg_path,
                  outdir,
                  job_name="",
                  zip_run_files=False,
                  bkg_overlaps_path="",
                  gr_data_dir="",
                  run_annotation=True,
                  run_randomization_test=False,
                  padjust="None",
                  pct_score="",
                  organism=""):
    global formatter
    global detailed_outpath, matrix_outpath, progress_outpath, curprog, progmax, run_files_dir
    if not os.path.exists(os.path.normpath(outdir)):
        os.mkdir(os.path.normpath(outdir))
    fh = logging.FileHandler(os.path.join(outdir, 'gr_log.txt'))
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.setLevel(logging.INFO)
    run_files_dir = outdir
    curprog, progmax = 0, 1

    logger.info("ORGANIS " + str(organism))
    logger.info("P_value adjustment used: {}".format(padjust))

    try:
        trackdb = []

        trackdb_path = os.path.join(gr_data_dir, organism, "trackDb")
        if os.path.exists(trackdb_path +
                          ".txt.gz") and os.path.exists(trackdb_path + ".sql"):
            trackdb = bedfilecreator.load_tabledata_dumpfiles(trackdb_path)
        # set output settings
        detailed_outpath = os.path.join(outdir, "detailed.txt")
        matrix_outpath = os.path.join(outdir, "matrix.txt")
        progress_outpath = os.path.join(outdir, ".prog")
        f = open(matrix_outpath, 'wb')
        f.close()
        f = open(detailed_outpath, 'wb')
        f.close()
        hdlr = logging.FileHandler(logger_path)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.propagate = False

        # Read in the paths
        fois = [line for line in read_lines(fois) if not line.endswith(".tbi")]
        gfs = [line for line in read_lines(gfs) if not line.endswith(".tbi")]
        # check if there are spaces in invalid parts of the file name
        invalid_names = validate_filenames(fois + gfs + [bg_path])
        if len(invalid_names) != 0:
            logger.error(
                "The following file(s) have invalid file names, cannot have space before/in file extension:\n"
                + "\n".join(invalid_names))
            _write_progress(
                "ERROR: Files have invalid filenames. See log file. Terminating run. See Analysis Log."
            )
            return
        if bg_path.endswith(".tbi"):
            logger.error(
                "Background has invalid extension (.tbi). Terminating run.")
            _write_progress(
                "ERROR: Background has invalid extension (.tbi). Terminating run. See Analysis Log."
            )
            return

        # pre-process the FOIs
        fois = preprocess_fois(fois, run_files_dir, gr_data_dir, organism)
        if len(fois) == 0:
            logger.error('No valid FOIs supplied')
            _write_progress(
                "ERROR: No valid FOI files supplied. Terminating run. See Analysis Log."
            )
            return
        # Validate FOIs against background. Also get the size of the background (n_bgs)
        foi_bg, good_fois = check_background_foi_overlap(bg_path, fois)
        write_output("\t".join(map(base_name, good_fois)) + "\n",
                     matrix_outpath)
        write_output(
            "\t".join([
                'foi_name', 'foi_obs', 'n_fois', 'bg_obs', 'n_bgs',
                'odds_ratio', 'p_val', 'test_type',
                'p_rand' if run_randomization_test else "",
                'p_mod' if run_randomization_test else ""
            ]) + "\n", detailed_outpath)
        curprog, progmax = 0, len(gfs)
        # remove old detailed enrichment result files if they exist
        enr_path = os.path.join(run_files_dir, "enrichment")
        for f in good_fois:
            f_path = os.path.join(enr_path, base_name(f) + '.txt')
            if os.path.exists(f_path): os.remove(f_path)
        _write_progress("Performing calculations on the background.")
        for gf in gfs:
            current_gf = base_name(gf)
            _write_progress("Performing Hypergeometric analysis for {}".format(
                base_name(gf)))
            write_output(
                "###" + base_name(gf) + "\t" + get_score_strand_settings(gf) +
                "\t" + get_description(base_name(gf), trackdb) + "###" + "\n",
                detailed_outpath)
            res = get_overlap_statistics(gf, good_fois)

            # calculate bg_obs
            bg_obs = get_bgobs(bg_path, gf, bkg_overlaps_path)
            if bg_obs is None:
                logger.error("Skipping {}".format(gf))
                continue

            n_bgs = foi_bg[0]["indexregions"]

            # run the enrichment analysis and output the matrix line for the current gf
            write_output(
                "\t".join([base_name(gf)] + [
                    str(
                        p_value(res[i]["intersectregions"], res[i]
                                ["queryregions"], bg_obs, n_bgs, good_fois[i],
                                gf, bg_path, run_randomization_test))
                    for i in range(len(good_fois))
                ]) + "\n", matrix_outpath)
            curprog += 1
        # Adjust p values for the enrichment files
        list_enr = os.listdir(enr_path)

        for p in list_enr:
            adjust_detailed_pvalue(os.path.join(enr_path, p), padjust)

        if len(gfs) >= 1 and len(good_fois) >= 1:
            # Adjust the pvalues
            adjust_pvalue(matrix_outpath, padjust)
            # Cluster the matrix
            clust_path = cluster_matrix(matrix_outpath,
                                        os.path.join(outdir, "clustered.txt"))
            if len(gfs) > 4:
                pearsons_cor_matrix(clust_path, outdir)
            else:
                json_mat = []
                json_mat.append({
                    "log": False,
                    "neg": "Underrepresented",
                    "pos": "Overrepresented",
                    "name": "Pearsons",
                    "alpha": "",
                    "matrix": "ERROR:PCC matrix requires at least a 5 X 2 matrix."
                })
                json_mat.append({
                    "log": False,
                    "neg": "Underrepresented",
                    "pos": "Overrepresented",
                    "name": "P-value",
                    "alpha": "",
                    "matrix": "ERROR:PCC matrix requires at least a 5 X 2 matrix."
                })
                with open(os.path.join(outdir, "pcc_matrix.json"), 'wb') as f:
                    simplejson.dump(json_mat, f)
                with open(os.path.join(outdir, "pcc_matrix.txt"), "wb") as wb:
                    wb.write(
                        "ERROR:PCC matrix requires at least a 5 X 2 matrix.")
        else:
            with open(os.path.join(outdir, "clustered.txt"), "wb") as wb:
                wb.write(
                    "ERROR:Clustered matrix requires at least a 2 X 2 matrix.")

        if run_annotation:
            annot_outdir = os.path.join(outdir, "annotations")
            if not os.path.exists(annot_outdir): os.mkdir(annot_outdir)
            curprog, progmax = 0, len(fois)
            for f in fois:
                _write_progress("Running Annotation Analysis for {}.".format(
                    base_name(f)))
                with open(os.path.join(annot_outdir,
                                       base_name(f) + ".txt"), "wb") as wr:
                    anot = get_annotation(f, gfs).split("\n")
                    anot[0] = anot[0].replace("Region\t\t", "Region\t")
                    wr.write(
                        "Region" + "\t" + "\t".join(
                            base_name(x)
                            for x in reversed(anot[0].split("\t")[1:])) +
                        "\tTotal"
                    )  # annotationAnalysis column order is reverse of input order
                    for ind, a in enumerate(anot[1:]):
                        if a.strip() != "":
                            cur_row = a.split("\t")
                            wr.write(
                                "\n" + str(ind) + "|" + "\t".join(cur_row + [
                                    str(
                                        sum([
                                            int(x)
                                            for x in cur_row[1:] if x != ""
                                        ]))
                                ]))
                curprog += 1

        if zip_run_files:
            _write_progress("Preparing run files for download")
            _zip_run_files(fois, gfs, bg_path, outdir, job_name)
        curprog, progmax = 1, 1
        _write_progress("Analysis Completed")
    except Exception:
        logger.error(traceback.format_exc())
        write_output(traceback.format_exc(), logger_path)
        _write_progress("Run crashed. See end of log for details.")