Ejemplo n.º 1
0
def runPrismX(clusterCount: int):
    """Run the PrismX pipeline for one cluster count and pickle the model.

    Three stages are executed in order:

    1. ``px.createCorrelationMatrices`` on ``mouse_matrix.h5``.
    2. ``px.correlation_scores`` for the gene set library at index 110 of
       ``px.list_libraries()``.
    3. ``px.trainModel``, whose result is pickled to
       ``gobp_model_<clusterCount>.pkl`` in the working directory.

    After each stage the wall-clock time elapsed since the start of the
    function is printed as ``T1``, ``T2`` and ``T3`` (cumulative, in
    seconds; ``T3`` includes writing the pickle).

    Args:
        clusterCount: Forwarded as ``clusterCount`` to
            ``px.createCorrelationMatrices`` and embedded in the names of the
            correlation folder, the prediction folder and the model file.

    Returns:
        None.
    """
    # NOTE: "mouse_matrix.h5" must already exist in the working directory;
    # the download below is left disabled.
    #urllib.request.urlretrieve("https://mssm-seq-matrix.s3.amazonaws.com/mouse_matrix.h5", "mouse_matrix.h5")
    start = time.time()
    correlationFolder = "correlation_" + str(clusterCount) + "_folder_q"
    predictionFolder = "prediction_" + str(clusterCount) + "_folder_q"
    libs = px.list_libraries()
    # NOTE(review): index 110 is hard-coded; presumably the GO Biological
    # Process library given the "gobp" model name -- confirm against
    # px.list_libraries().
    gmt_file = px.load_library(libs[110])
    px.createCorrelationMatrices("mouse_matrix.h5",
                                 correlationFolder,
                                 clusterCount=clusterCount,
                                 sampleCount=5000,
                                 correlationSampleCount=5000,
                                 verbose=True)
    t1 = time.time() - start
    print("T1: " + str(t1))
    px.correlation_scores(gmt_file,
                          correlationFolder,
                          predictionFolder,
                          verbose=True)
    t2 = time.time() - start
    print("T2: " + str(t2))
    model = px.trainModel(predictionFolder,
                          correlationFolder,
                          gmt_file,
                          training_size=300000,
                          test_train_split=0.1,
                          sample_positive=40000,
                          sample_negative=200000,
                          random_state=42,
                          verbose=True)
    # Use a context manager so the pickle file is flushed and closed even if
    # serialization fails part-way through.
    with open("gobp_model_" + str(clusterCount) + ".pkl", 'wb') as model_file:
        pickle.dump(model, model_file)
    t3 = time.time() - start
    print("T3: " + str(t3))
Ejemplo n.º 2
0
import prismx as px
import pickle
from memory_profiler import memory_usage
import os
import time
import matplotlib.pyplot as plt
import pandas as pd
import feather
from prismx.utils import load_json, get_config, get_data_path, read_gmt
from prismx.loaddata import get_genes
from progress.bar import Bar
import numpy as np

# Print the available gene set libraries, then run predictions for one of
# them using the model stored in "gobp_model_100.pkl".
px.print_libraries()
libs = px.list_libraries()

# The library at index 28 is used both as the input gene set library and as
# the name of the output.
outname = libs[28]
gmt_file = px.load_library(outname)

# Folders read and written by the prediction step.
correlationFolder, predictionFolder, outfolder = (
    "correlation_100_folder",
    "prediction_100_folder",
    "prismxresult_100",
)

px.predict_gmt(
    "gobp_model_100.pkl",
    gmt_file,
    correlationFolder,
    predictionFolder,
    outfolder,
    outname,
    step_size=1000,
    verbose=True,
)
Ejemplo n.º 3
0
# Train one model per gene set library and pickle each to
# "<library>_model_<clusterCount>.pkl" in the working directory.
clusterCount = 50

correlationFolder = "correlation_" + str(clusterCount) + "_folder"
predictionFolder = "prediction_" + str(clusterCount) + "_folder"
outfolder = "prismxresult_" + str(clusterCount)
# NOTE: the correlation matrices are not rebuilt here; the call below is left
# disabled, so correlationFolder is expected to be populated already.
#px.createCorrelationMatrices("mouse_matrix.h5", correlationFolder, clusterCount=clusterCount, sampleCount=5000, correlationSampleCount=5000, verbose=True)

genesetlibs = [
    "ChEA_2016", "KEA_2013", "GWAS_Catalog_2019", "huMAP",
    "GO_Biological_Process_2018", "MGI_Mammalian_Phenotype_Level_4_2019"
]
genesetlibs.sort()

# Only the first two libraries in sorted order are processed
# ("ChEA_2016" and "GO_Biological_Process_2018").
for lib in genesetlibs[0:2]:
    gmt_file = px.load_library(lib)
    px.correlation_scores(gmt_file,
                          correlationFolder,
                          predictionFolder,
                          verbose=True)
    model = px.trainModel(predictionFolder,
                          correlationFolder,
                          gmt_file,
                          training_size=300000,
                          test_train_split=0.1,
                          sample_positive=40000,
                          sample_negative=200000,
                          random_state=42,
                          verbose=True)
    # Context manager guarantees the file is flushed and closed on every
    # iteration instead of leaking one handle per library.
    with open(lib + "_model_" + str(clusterCount) + ".pkl", 'wb') as model_file:
        pickle.dump(model, model_file)
Ejemplo n.º 4
0
import prismx as px
import pickle
from memory_profiler import memory_usage
import os
import time
import matplotlib.pyplot as plt
import pandas as pd
import feather
from prismx.utils import load_json, get_config, get_data_path, read_gmt
from prismx.loaddata import get_genes
from progress.bar import Bar
import numpy as np

# Print the available gene set libraries, then run predictions for one of
# them using the model stored in "gobp_model_100.pkl".
px.print_libraries()
libs = px.list_libraries()

# The library at index 28 is used both as the input gene set library and as
# the name of the output.
outname = libs[28]
gmt_file = px.load_library(outname)

# Folders read and written by the prediction step.
correlationFolder, predictionFolder, outfolder = (
    "correlation_100_folder",
    "prediction_100_folder",
    "prismxresult_100",
)

px.predict_gmt(
    "gobp_model_100.pkl",
    gmt_file,
    correlationFolder,
    predictionFolder,
    outfolder,
    outname,
    step_size=1000,
    verbose=True,
)
Ejemplo n.º 5
0
# Interactive-style inspection and benchmarking of stored predictions.
nx = "GO_Biological_Process_2018"

# Bare lookups: the results are discarded when run as a script (they are only
# displayed in an interactive session). set_auc and gene_auc must already be
# defined by earlier code.
set_auc.loc[nx, :]
gene_auc.loc[nx, :]

# Load the first prediction table and index it by its "index" column.
p1 = pd.read_feather("prediction_folder_300_umap/prediction_0.f")
p1 = p1.set_index("index")

correlationFolder, predictionFolder, outfolder = (
    "correlation_folder_300",
    "prediction_folder_300_umap",
    "prismxresult",
)

clustn = 300

libs = px.list_libraries()
# The library at index 111 is both the benchmarked gene set library and the
# stem of the prediction file name.
outname = libs[111]
gmt_file = px.load_library(libs[111], overwrite=True)
# NOTE: the prediction step is left disabled; the ".f" files read below are
# expected to exist already.
#px.predict_gmt("gobp_model_"+str(clustn)+".pkl", gmt_file, correlationFolder, predictionFolder, outfolder, libs[111], step_size=200, intersect=True, verbose=True)

gop = pd.read_feather(
    "prismxresult/GO_Biological_Process_2018.f").set_index("index")

geneAUC, setAUC = px.benchmarkGMTfast(
    gmt_file,
    correlationFolder,
    predictionFolder,
    "/".join((outfolder, outname + ".f")),
    intersect=True,
    verbose=True,
)

# Difference between the second and the first column of each AUC table.
diff_gene = geneAUC.iloc[:, 1] - geneAUC.iloc[:, 0]
diff_set = setAUC.iloc[:, 1] - setAUC.iloc[:, 0]
Ejemplo n.º 6
0
import time
import shutil
#sys.path.append('C:/prismx/')
import prismx as px

# Work out which libraries are not yet listed in the validation score file.
libs = px.list_libraries()
clustn = 26

# The first tab-separated field of every line in the score file is taken as a
# library name. The context manager closes the file once it has been read
# (the name `f` stays bound, now to the closed file object).
with open("validationscore" + str(clustn) + ".txt", 'r') as f:
    libraries = [x.split("\t")[0] for x in f.readlines()]

# Libraries known to PrismX that do not appear in the score file. The list
# comes from a set, so its order is arbitrary.
newlibs = list(set(libs).difference(set(libraries)))

for i in range(0, len(newlibs)):
    try:
        print(newlibs[i])
        gmt_file = px.load_library(newlibs[i])
        print("loaded")
        g1, g2, g3 = px.read_gmt(gmt_file)
        # set output configuration
        outname = newlibs[i]
        correlationFolder = "correlation_" + str(clustn) + "_folder"
        predictionFolder = "prediction_" + str(clustn)
        outfolder = "prismxresult_" + str(clustn)
        if len(g1) < 14000:
            # calculate PrismX predictions with pretrained model
            px.predict_gmt("gobp_model_" + str(clustn) + ".pkl",
                           gmt_file,
                           correlationFolder,
                           predictionFolder,
                           outfolder,
                           outname,