def runPrismX(clusterCount: int) -> None:
    """Run the PrismX build/score/train pipeline and pickle the trained model.

    Stages, in order:

    1. ``px.createCorrelationMatrices`` on ``mouse_matrix.h5`` (read from the
       current working directory).
    2. ``px.correlation_scores`` for the gene set library at index 110 of
       ``px.list_libraries()``.
    3. ``px.trainModel``, whose result is pickled to
       ``gobp_model_<clusterCount>.pkl``.

    After each stage the wall-clock seconds elapsed since the start of the
    run are printed as ``T1``, ``T2`` and ``T3`` (cumulative, not per stage).

    Args:
        clusterCount: Forwarded as ``clusterCount`` to
            ``px.createCorrelationMatrices``; also embedded in the
            correlation/prediction folder names and the model file name.
    """
    # One-off download of the expression matrix, kept for reference:
    #urllib.request.urlretrieve("https://mssm-seq-matrix.s3.amazonaws.com/mouse_matrix.h5", "mouse_matrix.h5")
    start = time.time()

    correlationFolder = f"correlation_{clusterCount}_folder_q"
    predictionFolder = f"prediction_{clusterCount}_folder_q"

    libs = px.list_libraries()
    # NOTE(review): index 110 depends on the ordering of list_libraries();
    # presumably a GO Biological Process library given the "gobp" model
    # file name -- confirm.
    gmt_file = px.load_library(libs[110])

    px.createCorrelationMatrices(
        "mouse_matrix.h5",
        correlationFolder,
        clusterCount=clusterCount,
        sampleCount=5000,
        correlationSampleCount=5000,
        verbose=True,
    )
    t1 = time.time() - start
    print("T1: " + str(t1))

    px.correlation_scores(gmt_file, correlationFolder, predictionFolder, verbose=True)
    t2 = time.time() - start
    print("T2: " + str(t2))

    model = px.trainModel(
        predictionFolder,
        correlationFolder,
        gmt_file,
        training_size=300000,
        test_train_split=0.1,
        sample_positive=40000,
        sample_negative=200000,
        random_state=42,
        verbose=True,
    )
    # Use a context manager so the pickle is flushed and the handle closed
    # deterministically, even if pickle.dump raises.
    with open(f"gobp_model_{clusterCount}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
    t3 = time.time() - start
    print("T3: " + str(t3))
import prismx as px
import pickle
from memory_profiler import memory_usage
import os
import time
import matplotlib.pyplot as plt
import pandas as pd
import feather
from prismx.utils import load_json, get_config, get_data_path, read_gmt
from prismx.loaddata import get_genes
from progress.bar import Bar
import numpy as np

# Apply the pretrained 100-cluster model to one gene set library.
px.print_libraries()
libs = px.list_libraries()

# The library at index 28 is both loaded and used as the output name.
# NOTE(review): the index depends on the ordering of list_libraries().
outname = libs[28]
gmt_file = px.load_library(outname)

# Input/output locations for the 100-cluster run.
outfolder = "prismxresult_100"
predictionFolder = "prediction_100_folder"
correlationFolder = "correlation_100_folder"

px.predict_gmt(
    "gobp_model_100.pkl",
    gmt_file,
    correlationFolder,
    predictionFolder,
    outfolder,
    outname,
    step_size=1000,
    verbose=True,
)
# Train and pickle one PrismX model per gene set library for a fixed
# cluster count.
clusterCount = 50
correlationFolder = f"correlation_{clusterCount}_folder"
predictionFolder = f"prediction_{clusterCount}_folder"
outfolder = f"prismxresult_{clusterCount}"

# Correlation matrices are expected to exist already; the build step is
# kept for reference:
#px.createCorrelationMatrices("mouse_matrix.h5", correlationFolder, clusterCount=clusterCount, sampleCount=5000, correlationSampleCount=5000, verbose=True)

genesetlibs = [
    "ChEA_2016",
    "KEA_2013",
    "GWAS_Catalog_2019",
    "huMAP",
    "GO_Biological_Process_2018",
    "MGI_Mammalian_Phenotype_Level_4_2019",
]
genesetlibs.sort()

# Only the first two names in sorted order are processed, i.e.
# "ChEA_2016" and "GO_Biological_Process_2018".
for lib in genesetlibs[0:2]:
    gmt_file = px.load_library(lib)
    px.correlation_scores(gmt_file, correlationFolder, predictionFolder, verbose=True)
    model = px.trainModel(
        predictionFolder,
        correlationFolder,
        gmt_file,
        training_size=300000,
        test_train_split=0.1,
        sample_positive=40000,
        sample_negative=200000,
        random_state=42,
        verbose=True,
    )
    # Close each model file deterministically instead of leaking one open
    # handle per loop iteration.
    with open(f"{lib}_model_{clusterCount}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
# Benchmark the 300-cluster predictions for one gene set library.
nx = "GO_Biological_Process_2018"
# Bare lookups whose results are discarded; set_auc and gene_auc must
# already be defined. Presumably leftover interactive inspection -- confirm.
set_auc.loc[nx, :]
gene_auc.loc[nx, :]

p1 = pd.read_feather("prediction_folder_300_umap/prediction_0.f")
p1 = p1.set_index("index")

# Run configuration.
clustn = 300
outfolder = "prismxresult"
correlationFolder = "correlation_folder_300"
predictionFolder = "prediction_folder_300_umap"

libs = px.list_libraries()
# The library at index 111 is loaded and also names the result file.
# NOTE(review): the index depends on the ordering of list_libraries().
outname = libs[111]
gmt_file = px.load_library(outname, overwrite=True)

# Prediction step, kept for reference:
#px.predict_gmt("gobp_model_"+str(clustn)+".pkl", gmt_file, correlationFolder, predictionFolder, outfolder, libs[111], step_size=200, intersect=True, verbose=True)
gop = pd.read_feather("prismxresult/GO_Biological_Process_2018.f").set_index("index")

geneAUC, setAUC = px.benchmarkGMTfast(
    gmt_file,
    correlationFolder,
    predictionFolder,
    "/".join((outfolder, outname + ".f")),
    intersect=True,
    verbose=True,
)

# Second column minus first column for each AUC table.
diff_gene = geneAUC.iloc[:, 1] - geneAUC.iloc[:, 0]
diff_set = setAUC.iloc[:, 1] - setAUC.iloc[:, 0]
import time import shutil #sys.path.append('C:/prismx/') import prismx as px libs = px.list_libraries() clustn = 26 f = open("validationscore" + str(clustn) + ".txt", 'r') libraries = [x.split("\t")[0] for x in f.readlines()] newlibs = list(set(libs).difference(set(libraries))) for i in range(0, len(newlibs)): try: print(newlibs[i]) gmt_file = px.load_library(newlibs[i]) print("loaded") g1, g2, g3 = px.read_gmt(gmt_file) # set output configuration outname = newlibs[i] correlationFolder = "correlation_" + str(clustn) + "_folder" predictionFolder = "prediction_" + str(clustn) outfolder = "prismxresult_" + str(clustn) if len(g1) < 14000: # calculate PrismX predictions with pretrained model px.predict_gmt("gobp_model_" + str(clustn) + ".pkl", gmt_file, correlationFolder, predictionFolder, outfolder, outname,