def cmf_cv_eval(method, dataset, output_dir, cv_data, X, D, T, cvs, para):
    """Grid-search CMF hyper-parameters and report the best-scoring setting.

    For K in [100] and lambda_l / lambda_d / lambda_t taken as powers of
    two, a CMF model is built and handed to ``train``.  Every run appends
    one line to ``<output_dir>/optPar/proc_<dataset>_<cvs>_<method>.txt``.
    Whenever the selection metric (mean nDCG + mean inverse nDCG) improves,
    the current best is rewritten to the matching ``res_...`` file.  The
    best setting is printed at the end.

    Args:
        method: Method name; only used to build the output file names.
        dataset: Dataset name; used in log text and output file names.
        output_dir: Directory that must already contain an ``optPar``
            sub-directory (it is not created here).
        cv_data, X, D, T: Forwarded unchanged to ``train``.
        cvs: Cross-validation setting id; used in log text and file names.
        para: Unused; kept so existing callers keep working.

    Returns:
        None.
    """
    opt_dir = os.path.join(output_dir, "optPar")
    proc_path = os.path.join(
        opt_dir, "proc_" + dataset + "_" + str(cvs) + "_" + method + ".txt")
    res_path = os.path.join(
        opt_dir, "res_" + dataset + "_" + str(cvs) + "_" + method + ".txt")

    max_metric, metric_opt, optArg = 0, [], []
    for d in [100]:
        for x in np.arange(-2, 1):
            for y in np.arange(-5, -1):
                for z in np.arange(-5, -1):
                    # time.clock() no longer exists on Python >= 3.8.
                    tic = time.time()
                    # Use a float base: NumPy refuses to raise an integer to
                    # a negative integer power.
                    model = CMF(K=d, lambda_l=2.0 ** x, lambda_d=2.0 ** y,
                                lambda_t=2.0 ** z, max_iter=100)
                    cmd = ("Dataset:" + dataset + " CVS: " + str(cvs) + "\n"
                           + str(model))
                    print(cmd)

                    aupr_vec, auc_vec, ndcg_vec, ndcg_inv_vec, _ = train(
                        model, cv_data, X, D, T)
                    aupr_avg, _ = mean_confidence_interval(aupr_vec)
                    auc_avg, _ = mean_confidence_interval(auc_vec)
                    ndcg_avg, _ = mean_confidence_interval(ndcg_vec)
                    ndcg_inv_avg, _ = mean_confidence_interval(ndcg_inv_vec)

                    summary = (
                        "auc:%.6f, aupr: %.6f,ndcg: %.6f,ndcg_inv: %.6f, "
                        "Time:%.6f\n"
                        % (auc_avg, aupr_avg, ndcg_avg, ndcg_inv_avg,
                           time.time() - tic))
                    with open(proc_path, "a") as procFile:
                        procFile.write(str(model) + ": ")
                        procFile.write(summary)
                    print(summary)

                    metric = ndcg_inv_avg + ndcg_avg
                    if metric > max_metric:
                        max_metric = metric
                        metric_opt = [cmd, auc_avg, aupr_avg, ndcg_avg,
                                      ndcg_inv_avg]
                        optArg = {"d": d, "x": x, "y": y, "z": z}
                        # Persist every improvement so an interrupted search
                        # still leaves the best setting found so far on disk.
                        with open(res_path, "w") as resFile:
                            resFile.write(str(optArg) + "\n" + str(metric_opt))

    if not metric_opt:
        # No setting scored above the initial threshold of 0.
        print("Optimal parameter setting:\nnone (no setting scored above 0)\n")
        return

    cmd = "Optimal parameter setting:\n%s\n" % metric_opt[0]
    cmd += "auc: %.6f, aupr: %.6f, ndcg:%.6f, ndcg_inv:%.6f\n" % (
        metric_opt[1], metric_opt[2], metric_opt[3], metric_opt[4])
    print(cmd)
def cmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, para):
    """Grid-search CMF hyper-parameters and print the setting with best AUPR.

    For K in [50, 100] and lambda_l / lambda_d / lambda_t taken as powers of
    two, a CMF model is built and handed to ``train``.  The averaged scores
    of every run are printed, and the run with the highest average AUPR is
    printed again at the end.

    Args:
        method: Unused; kept so existing callers keep working.
        dataset: Dataset name; used in the printed header.
        cv_data, X, D, T: Forwarded unchanged to ``train``.
        cvs: Cross-validation setting id; used in the printed header.
        para: Unused; kept so existing callers keep working.

    Returns:
        None.
    """
    max_aupr, aupr_opt = 0, []
    for d in [50, 100]:
        for x in np.arange(-2, -1):
            for y in np.arange(-3, -2):
                for z in np.arange(-3, -2):
                    # time.clock() no longer exists on Python >= 3.8.
                    tic = time.time()
                    # Use a float base: NumPy refuses to raise an integer to
                    # a negative integer power.
                    model = CMF(K=d, lambda_l=2.0 ** x, lambda_d=2.0 ** y,
                                lambda_t=2.0 ** z, max_iter=30)
                    cmd = ("Dataset:" + dataset + " CVS: " + str(cvs) + "\n"
                           + str(model))
                    print(cmd)

                    aupr_vec, auc_vec = train(model, cv_data, X, D, T)
                    aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
                    auc_avg, auc_conf = mean_confidence_interval(auc_vec)
                    print(
                        "auc:%.6f, aupr: %.6f, auc_conf:%.6f, "
                        "aupr_conf:%.6f, Time:%.6f\n"
                        % (auc_avg, aupr_avg, auc_conf, aupr_conf,
                           time.time() - tic))

                    if aupr_avg > max_aupr:
                        max_aupr = aupr_avg
                        aupr_opt = [cmd, auc_avg, aupr_avg, auc_conf,
                                    aupr_conf]

    if not aupr_opt:
        # No setting achieved an average AUPR above the initial 0.
        print("Optimal parameter setting:\nnone (no setting scored above 0)\n")
        return

    cmd = "Optimal parameter setting:\n%s\n" % aupr_opt[0]
    cmd += "auc: %.6f, aupr: %.6f, auc_conf:%.6f, aupr_conf:%.6f\n" % (
        aupr_opt[1], aupr_opt[2], aupr_opt[3], aupr_opt[4])
    print(cmd)
def cmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, para):
    """Grid-search CMF hyper-parameters and print the setting with best AUC.

    For K in [50, 100] and lambda_l / lambda_d / lambda_t taken as powers of
    two (float exponents), a CMF model is built and handed to ``train``.
    The run with the highest average AUC is printed at the end together
    with its average AUPR, accuracy, sensitivity and specificity.

    Args:
        method: Unused; kept so existing callers keep working.
        dataset: Dataset name; used in the printed header.
        cv_data, X, D, T: Forwarded unchanged to ``train``.
        cvs: Cross-validation setting id; used in the printed header.
        para: Unused; kept so existing callers keep working.

    Returns:
        None.
    """
    max_auc, auc_opt = 0, []
    for d in [50, 100]:
        for x in np.arange(-2, 2, dtype='float'):
            for y in np.arange(-3, 6, dtype='float'):
                for z in np.arange(-3, 6, dtype='float'):
                    # The former per-run timer was never reported and relied
                    # on time.clock(), which Python >= 3.8 no longer has.
                    model = CMF(K=d, lambda_l=2 ** x, lambda_d=2 ** y,
                                lambda_t=2 ** z, max_iter=30)
                    cmd = ("Dataset:" + dataset + " CVS: " + str(cvs) + "\n"
                           + str(model))
                    print(cmd)

                    aupr_vec, auc_vec, acc_vec, sen_vec, spec_vec = train(
                        model, cv_data, X, D, T)
                    # Only the first value returned for each metric is used.
                    aupr_avg, _ = mean_confidence_interval(aupr_vec)
                    auc_avg, _ = mean_confidence_interval(auc_vec)
                    acc_avg, _ = mean_confidence_interval(acc_vec)
                    sen_avg, _ = mean_confidence_interval(sen_vec)
                    spec_avg, _ = mean_confidence_interval(spec_vec)

                    if auc_avg > max_auc:
                        max_auc = auc_avg
                        auc_opt = [cmd, auc_avg, aupr_avg, acc_avg, sen_avg,
                                   spec_avg]

    if not auc_opt:
        # No setting achieved an average AUC above the initial 0.
        print("Optimal parameter setting:\nnone (no setting scored above 0)\n")
        return

    cmd = "Optimal parameter setting:\n%s\n" % auc_opt[0]
    cmd += "auc: %.6f, aupr: %.6f, acc:%.6f, sen:%.6f, spec:%.6f\n" % (
        auc_opt[1], auc_opt[2], auc_opt[3], auc_opt[4], auc_opt[5])
    print(cmd)
def thear(method, dataset, data_dir, output_dir, cvs, sp_arg, model_settings,
          predict_num, seeds, seedsOptPar, args):
    """Run one experiment: parameter search, cross-validation or prediction.

    The mode is selected by ``sp_arg`` and ``predict_num``:

    * ``sp_arg == 0`` and ``predict_num == 0``: call the method's
      ``cv_eval.<name>_cv_eval`` parameter-search routine.
    * ``sp_arg == 1`` and ``predict_num == 0``: build the model from
      ``args``, run ``train`` on the cross-validation folds, print the
      averaged AUC/AUPR and write both score vectors to ``output_dir``.
    * ``predict_num > 0``: fit the model on the full interaction matrix,
      score every zero entry and pass the ``predict_num`` highest-scoring
      pairs to ``novel_prediction_analysis``.

    Args:
        method: Name of the prediction method (e.g. ``'cmf'``, ``'nrlmf'``).
        dataset: Dataset name passed to the data loaders.
        data_dir: Directory containing ``datasets`` (and ``biodb``).
        output_dir: Directory the result files are written to.
        cvs: Cross-validation setting, one of 1, 2, 3 or 4.
        sp_arg: 0 for parameter search, 1 to use the values in ``args``.
        model_settings: Unused; kept so existing callers keep working.
        predict_num: Number of novel pairs to report; 0 runs CV instead.
        seeds: Seeds forwarded to ``cross_validation``.
        seedsOptPar: Unused; kept so existing callers keep working.
        args: Mapping of hyper-parameter names to values for ``method``.

    Raises:
        ValueError: If ``cvs`` is not supported, or ``method`` is unknown
            when a model has to be built.
    """
    datasets_dir = os.path.join(data_dir, 'datasets')
    intMat, drugMat, targetMat = load_data_from_file(dataset, datasets_dir)
    drug_names, target_names = get_drugs_targets_names(dataset, datasets_dir)

    invert = 1 if method == 'inv_brdti' else 0

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        elif cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        elif cvs == 3:  # CV setting CVS3
            X, D, T, cv, invert = intMat.T, targetMat, drugMat, 0, 1
        elif cvs == 4:
            X, D, T, cv = intMat, drugMat, targetMat, 2
        else:
            raise ValueError(
                "Unsupported cvs setting: %r (expected 1, 2, 3 or 4)" % (cvs,))
        cv_data = cross_validation(X, seeds, cv, invert, num=10)
        if invert:
            # The folds were built with invert set; training below always
            # receives the untransposed matrices.
            X, D, T = intMat, drugMat, targetMat

    if sp_arg == 0 and predict_num == 0:
        # method -> attribute of cv_eval implementing its parameter search.
        # The attribute is only looked up for the selected method.
        evaluators = {
            # NOTE(review): 'ensambledti' reuses the vbmklmf routine and
            # 'kronrlsmkl' maps to the spelling 'kronrismkl'; both are kept
            # exactly as they were -- confirm against cv_eval.
            'vbmklmf': 'vbmklmf_cv_eval',
            'ensambledti': 'vbmklmf_cv_eval',
            'netcbp': 'netcbp_cv_eval',
            'grmf': 'grmf_cv_eval',
            'pudt': 'pudt_cv_eval',
            'daspfind': 'daspfind_cv_eval',
            'dnilmf': 'dnilmf_cv_eval',
            'dthybrid': 'dthybrid_cv_eval',
            'kronrlsmkl': 'kronrismkl_cv_eval',
            # 'brdti' used to be dispatched twice, running the search twice.
            'brdti': 'brdti_cv_eval',
            'inv_brdti': 'brdti_cv_eval',
            'ddr': 'ddr_cv_eval',
            'nrlmf': 'nrlmf_cv_eval',
            'netlaprls': 'netlaprls_cv_eval',
            'blmnii': 'blmnii_cv_eval',
            'wnngip': 'wnngip_cv_eval',
            'kbmf': 'kbmf_cv_eval',
            'cmf': 'cmf_cv_eval',
        }
        evaluator_name = evaluators.get(method)
        if evaluator_name is not None:
            getattr(cv_eval, evaluator_name)(
                method, dataset, cv_data, X, D, T, cvs, args)

    if sp_arg == 1 or predict_num > 0:
        # time.clock() no longer exists on Python >= 3.8.
        tic = time.time()
        if method == 'netcbp':
            model = NetCBP()
        elif method == 'grmf':
            model = GRMF(cv=cvs)
        elif method == 'pudt':
            model = PUDT(dataset=dataset)
        elif method == 'vbmklmf':
            model = VBMKLMF(dataset=dataset, cvs=cvs)
        elif method == 'dnilmf':
            model = DNILMF(dataset=dataset)
        elif method == 'kronrlsmkl':
            model = KronRLsMKL(dataset=dataset)
        elif method == 'dthybrid':
            model = DTHYBRID(dataset=dataset)
        elif method == 'daspfind':
            model = DASPFIND(alpha=args['alpha'])
        elif method in ('brdti', 'inv_brdti'):
            model = BRDTI(args)
        elif method == 'nrlmf':
            model = NRLMF(cfix=args['c'], K1=args['K1'], K2=args['K2'],
                          num_factors=args['r'], lambda_d=args['lambda_d'],
                          lambda_t=args['lambda_t'], alpha=args['alpha'],
                          beta=args['beta'], theta=args['theta'],
                          max_iter=args['max_iter'])
        elif method == 'ddr':
            model = DDR(dataset=dataset, cv=cvs)
        elif method == 'netlaprls':
            # beta_d previously read args['beta_t'], ignoring beta_d.
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'],
                              beta_d=args['beta_d'], beta_t=args['beta_t'])
        elif method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'],
                           sigma=args['sigma'], avg=args['avg'])
        elif method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'],
                           alpha=args['alpha'])
        elif method == 'kbmf':
            model = KBMF(num_factors=args['R'])
        elif method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'],
                        lambda_d=args['lambda_d'], lambda_t=args['lambda_t'],
                        max_iter=args['max_iter'])
        elif method == 'ensambledti':
            model = EnsambleDTI(args=args, dataset=dataset)
        else:
            raise ValueError("Unknown method: %r" % (method,))
        cmd = str(model)

        if predict_num == 0:
            print("Dataset:" + dataset + " CVS:" + str(cvs) + "\n" + cmd)
            aupr_vec, auc_vec = train(model, cv_data, X, D, T, cvs, dataset)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            print(
                "auc:%.6f, aupr: %.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f"
                % (auc_avg, aupr_avg, auc_conf, aupr_conf, time.time() - tic))
            write_metric_vector_to_file(
                auc_vec,
                os.path.join(
                    output_dir,
                    method + "_auc_cvs" + str(cvs) + "_" + dataset + ".txt"))
            write_metric_vector_to_file(
                aupr_vec,
                os.path.join(
                    output_dir,
                    method + "_aupr_cvs" + str(cvs) + "_" + dataset + ".txt"))
        elif predict_num > 0:
            print("Dataset:" + dataset + "\n" + cmd)
            seed = 7771 if method == 'cmf' else 22
            model.fix_model(intMat, intMat, drugMat, targetMat, seed)
            x, y = np.where(intMat == 0)
            # Materialise the pairs: on Python 3 zip() is a one-shot iterator.
            scores = model.predict_scores(list(zip(x, y)), 5)
            ii = np.argsort(scores)[::-1]
            predict_pairs = [(drug_names[x[i]], target_names[y[i]], scores[i])
                             for i in ii[:predict_num]]
            new_dti_file = os.path.join(
                output_dir, "_".join([method, dataset, "new_dti.txt"]))
            novel_prediction_analysis(predict_pairs, new_dti_file,
                                      os.path.join(data_dir, 'biodb'))
def main(argv):
    """Command-line entry point: parse options and run one experiment.

    Recognised long options: ``--method``, ``--dataset``, ``--data-dir``,
    ``--output-dir``, ``--cvs``, ``--specify-arg``, ``--method-options`` and
    ``--predict-num``.  Depending on ``--specify-arg`` / ``--predict-num``
    the function runs a parameter search through ``cv_eval``, evaluates the
    model on the cross-validation folds, or scores the unknown pairs and
    appends ranking statistics to a CSV file.

    Args:
        argv: Command-line arguments without the program name.

    Exits with an error message when the options cannot be parsed, when
    ``--method`` / ``--dataset`` are missing, when the method is unknown, or
    when ``--cvs`` is not 1, 2 or 3.
    """
    try:
        # The short-option string had lost its quotes (a syntax error).
        # Short options are accepted by getopt, but only the long forms are
        # acted on in the loop below.
        opts, args = getopt.getopt(argv, "m:d:f:c:s:o:n:p", [
            "method=", "dataset=", "data-dir=", "cvs=", "specify-arg=",
            "method-options=", "predict-num=", "output-dir=",
        ])
    except getopt.GetoptError as err:
        # Previously exited silently with status 0.
        sys.exit("Invalid command-line options: %s" % err)

    method, dataset = None, None
    data_dir = 'data'
    output_dir = 'output'
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0

    seeds = [7771, 8367, 22, 1812, 4659]
    seedsOptPar = [156]
    for opt, arg in opts:
        if opt == "--method":
            method = arg
        elif opt == "--dataset":
            dataset = arg
        elif opt == "--data-dir":
            data_dir = arg
        elif opt == "--output-dir":
            output_dir = arg
        elif opt == "--cvs":
            cvs = int(arg)
        elif opt == "--specify-arg":
            sp_arg = int(arg)
        elif opt == "--method-options":
            model_settings = [s.split('=') for s in str(arg).split()]
        elif opt == "--predict-num":
            predict_num = int(arg)

    if method is None or dataset is None:
        sys.exit("Both --method and --dataset are required")

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    opt_par_dir = os.path.join(output_dir, "optPar")
    if not os.path.isdir(opt_par_dir):
        os.makedirs(opt_par_dir)

    # Default parameters for each method.
    brdti_defaults = {
        'D': 100,
        'learning_rate': 0.1,
        'max_iters': 100,
        'simple_predict': False,
        'bias_regularization': 1,
        'global_regularization': 10 ** (-2),
        "cbSim": "knn",
        'cb_alignment_regularization_user': 1,
        'cb_alignment_regularization_item': 1,
    }
    default_args = {
        'brdti': brdti_defaults,
        'inv_brdti': brdti_defaults,
        'netlaprls': {'gamma_d': 10, 'gamma_t': 10,
                      'beta_d': 1e-5, 'beta_t': 1e-5},
        'blmnii': {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False},
        'wnngip': {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8},
        'cmf': {'K': 100, 'lambda_l': 0.5, 'lambda_d': 0.125,
                'lambda_t': 0.125, 'max_iter': 100},
    }
    if method not in default_args:
        sys.exit("Unknown method: %s" % method)
    args = dict(default_args[method])

    for key, val in model_settings:
        # SECURITY NOTE(review): eval() executes arbitrary text taken from
        # the command line.  It is kept because callers may pass arithmetic
        # expressions; do not expose this to untrusted input.
        args[key] = float(eval(val))

    datasets_dir = os.path.join(data_dir, 'datasets')
    intMat, drugMat, targetMat = load_data_from_file(dataset, datasets_dir)
    drug_names, target_names = get_drugs_targets_names(dataset, datasets_dir)

    invert = 1 if method == 'inv_brdti' else 0

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        elif cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        elif cvs == 3:  # CV setting CVS3
            X, D, T, cv = intMat.T, targetMat, drugMat, 0
        else:
            sys.exit("Unsupported --cvs value: %s (expected 1, 2 or 3)" % cvs)
        cv_data = cross_validation(X, seeds, cv, invert)
        cv_data_optimize_params = cross_validation(
            X, seedsOptPar, cv, invert, num=5)

    if sp_arg == 0 and predict_num == 0:
        if method == 'brdti':
            cv_eval.brdti_cv_eval(method, dataset, output_dir,
                                  cv_data_optimize_params, X, D, T, cvs, args)
        elif method == 'inv_brdti':
            # The inverted variant is evaluated on the transposed problem.
            cv_eval.brdti_cv_eval(method, dataset, output_dir,
                                  cv_data_optimize_params, X.T, T, D, cvs,
                                  args)
        elif method == 'netlaprls':
            cv_eval.netlaprls_cv_eval(method, dataset, output_dir,
                                      cv_data_optimize_params, X, D, T, cvs,
                                      args)
        elif method == 'blmnii':
            cv_eval.blmnii_cv_eval(method, dataset, output_dir,
                                   cv_data_optimize_params, X, D, T, cvs, args)
        elif method == 'wnngip':
            cv_eval.wnngip_cv_eval(method, dataset, output_dir,
                                   cv_data_optimize_params, X, D, T, cvs, args)
        elif method == 'cmf':
            cv_eval.cmf_cv_eval(method, dataset, output_dir,
                                cv_data_optimize_params, X, D, T, cvs, args)

    if sp_arg == 1 or predict_num > 0:
        # time.clock() no longer exists on Python >= 3.8.
        tic = time.time()
        if method in ('brdti', 'inv_brdti'):
            model = BRDTI(args)
        elif method == 'netlaprls':
            # beta_d previously read args['beta_t'], ignoring beta_d.
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'],
                              beta_d=args['beta_d'], beta_t=args['beta_t'])
        elif method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'],
                           sigma=args['sigma'], avg=args['avg'])
        elif method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'],
                           alpha=args['alpha'])
        elif method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'],
                        lambda_d=args['lambda_d'], lambda_t=args['lambda_t'],
                        max_iter=args['max_iter'])
        cmd = str(model)

        if predict_num == 0:
            # Predict the hidden part of the current dataset.
            print("Dataset:" + dataset + " CVS:" + str(cvs) + "\n" + cmd)
            if method == 'inv_brdti':
                # Note the swapped order of the two nDCG vectors.
                aupr_vec, auc_vec, ndcg_inv_vec, ndcg_vec, results = train(
                    model, cv_data, X.T, T, D)
            else:
                aupr_vec, auc_vec, ndcg_vec, ndcg_inv_vec, results = train(
                    model, cv_data, X, D, T)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            ndcg_avg, ndcg_conf = mean_confidence_interval(ndcg_vec)
            ndcg_inv_avg, ndcg_inv_conf = mean_confidence_interval(
                ndcg_inv_vec)

            # NOTE(review): this path is fixed to 'output/rawResults' and
            # does not follow --output-dir; kept as is.
            raw_dir = os.path.join('output', 'rawResults')
            if not os.path.isdir(raw_dir):
                os.makedirs(raw_dir)
            resfile = os.path.join(
                raw_dir, method + "_res_" + str(cvs) + "_" + dataset + ".csv")
            # 'with' closes the file, which was previously left open.
            with open(resfile, "w") as outd:
                outd.write('drug;target;true;predict\n')
                for r in results:
                    outd.write('%s;%s;%s;%s\n' % (r[0], r[1], r[2], r[3]))

            print(
                "auc:%.6f, aupr: %.6f, ndcg: %.6f, ndcg_inv: %.6f, "
                "auc_conf:%.6f, aupr_conf:%.6f, ndcg_conf:%.6f, "
                "ndcg_inv_conf:%.6f, Time:%.6f"
                % (auc_avg, aupr_avg, ndcg_avg, ndcg_inv_avg, auc_conf,
                   aupr_conf, ndcg_conf, ndcg_inv_conf, time.time() - tic))
            for tag, vec in (("_auc_cvs", auc_vec),
                             ("_aupr_cvs", aupr_vec),
                             ("_ndcg_cvs", ndcg_vec),
                             ("_ndcg_inv_cvs", ndcg_inv_vec)):
                write_metric_vector_to_file(
                    vec,
                    os.path.join(
                        output_dir,
                        method + tag + str(cvs) + "_" + dataset + ".txt"))

        elif predict_num > 0:
            # Predict novel DTIs.
            print("Dataset:" + dataset + "\n" + cmd)
            seed = 376
            if invert:
                # Predicting drugs for targets: fit on the transposed data
                # and query the pairs as (target, drug).
                model.fix_model(intMat.T, intMat.T, targetMat, drugMat, seed)
                npa = newDTIPrediction()
                x, y = np.where(intMat == 0)
                scores = model.predict_scores(list(zip(y, x)), 1)
            else:
                # Predicting targets for drugs.
                model.fix_model(intMat, intMat, drugMat, targetMat, seed)
                npa = newDTIPrediction()
                x, y = np.where(intMat == 0)
                scores = model.predict_scores(list(zip(x, y)), 1)
            # list() keeps this working on Python 3, where zip() is lazy.
            sz = np.array(list(zip(x, y, scores)))

            ndcg_d, ndcg_t, recall_d, recall_t = npa.verify_novel_interactions(
                method, dataset, sz, predict_num, drug_names, target_names)

            # NOTE(review): fixed to 'output/newDTI' regardless of
            # --output-dir; kept as is.
            if not os.path.isdir('output/newDTI'):
                os.makedirs('output/newDTI')
            st_file = os.path.join(
                'output/newDTI',
                "_".join([dataset, str(predict_num), "stats.csv"]))
            with open(st_file, "a") as out:
                out.write('%s;%f;%f;%f;%f\n'
                          % (method, ndcg_d, ndcg_t, recall_d, recall_t))
def main(argv):
    """Command-line entry point: parse options and run one experiment.

    Recognised long options: ``--method``, ``--dataset``, ``--data-dir``,
    ``--output-dir``, ``--cvs``, ``--specify-arg``, ``--method-options`` and
    ``--predict-num``.  Depending on ``--specify-arg`` / ``--predict-num``
    the function runs a parameter search through ``cv_eval``, evaluates the
    model on the cross-validation folds and writes the AUC/AUPR vectors, or
    ranks the unknown pairs and hands the top ``--predict-num`` of them to
    ``novel_prediction_analysis``.

    Args:
        argv: Command-line arguments without the program name.

    Exits with an error message when the options cannot be parsed, when
    ``--method`` / ``--dataset`` are missing, when the method is unknown, or
    when ``--cvs`` is not 1, 2 or 3.
    """
    import ast  # local import: only needed to parse --method-options

    try:
        # Short options are accepted by getopt, but only the long forms are
        # acted on in the loop below.
        opts, args = getopt.getopt(argv, "m:d:f:c:s:o:n:p", [
            "method=", "dataset=", "data-dir=", "cvs=", "specify-arg=",
            "method-options=", "predict-num=", "output-dir=",
        ])
    except getopt.GetoptError as err:
        # Previously exited silently with status 0.
        sys.exit("Invalid command-line options: %s" % err)

    method, dataset = None, None
    data_dir = os.path.join(os.path.pardir, 'data')
    output_dir = os.path.join(os.path.pardir, 'output')
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0

    seeds = [7771, 8367, 22, 1812, 4659]
    for opt, arg in opts:
        if opt == "--method":
            method = arg
        elif opt == "--dataset":
            dataset = arg
        elif opt == "--data-dir":
            data_dir = arg
        elif opt == "--output-dir":
            output_dir = arg
        elif opt == "--cvs":
            cvs = int(arg)
        elif opt == "--specify-arg":
            sp_arg = int(arg)
        elif opt == "--method-options":
            model_settings = [s.split('=') for s in str(arg).split()]
        elif opt == "--predict-num":
            predict_num = int(arg)

    if method is None or dataset is None:
        sys.exit("Both --method and --dataset are required")

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Default parameters for each method.
    default_args = {
        'nrlmf': {'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125,
                  'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125,
                  'theta': 0.5, 'max_iter': 100},
        'netlaprls': {'gamma_d': 10, 'gamma_t': 10,
                      'beta_d': 1e-5, 'beta_t': 1e-5},
        'blmnii': {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False},
        'wnngip': {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8},
        'kbmf': {'R': 50},
        'cmf': {'K': 50, 'lambda_l': 0.5, 'lambda_d': 0.125,
                'lambda_t': 0.125, 'max_iter': 30},
    }
    if method not in default_args:
        sys.exit("Unknown method: %s" % method)
    args = dict(default_args[method])

    for key, val in model_settings:
        # Command-line values arrive as text; turn Python literals such as
        # "50", "0.125" or "False" into real values so the models receive
        # numbers, and keep anything that is not a literal as a string.
        try:
            args[key] = ast.literal_eval(val)
        except (ValueError, SyntaxError):
            args[key] = val

    datasets_dir = os.path.join(data_dir, 'datasets')
    intMat, drugMat, targetMat = load_data_from_file(dataset, datasets_dir)
    drug_names, target_names = get_drugs_targets_names(dataset, datasets_dir)

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        elif cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        elif cvs == 3:  # CV setting CVS3
            X, D, T, cv = intMat.T, targetMat, drugMat, 0
        else:
            sys.exit("Unsupported --cvs value: %s (expected 1, 2 or 3)" % cvs)
        cv_data = cross_validation(X, seeds, cv)

    if sp_arg == 0 and predict_num == 0:
        # default_args has already validated the method name, so the
        # matching <method>_cv_eval attribute is looked up directly.
        evaluator = getattr(cv_eval, method + '_cv_eval')
        evaluator(method, dataset, cv_data, X, D, T, cvs, args)

    if sp_arg == 1 or predict_num > 0:
        # time.clock() no longer exists on Python >= 3.8.
        tic = time.time()
        if method == 'nrlmf':
            model = NRLMF(cfix=args['c'], K1=args['K1'], K2=args['K2'],
                          num_factors=args['r'], lambda_d=args['lambda_d'],
                          lambda_t=args['lambda_t'], alpha=args['alpha'],
                          beta=args['beta'], theta=args['theta'],
                          max_iter=args['max_iter'])
        elif method == 'netlaprls':
            # beta_d previously read args['beta_t'], ignoring beta_d.
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'],
                              beta_d=args['beta_d'], beta_t=args['beta_t'])
        elif method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'],
                           sigma=args['sigma'], avg=args['avg'])
        elif method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'],
                           alpha=args['alpha'])
        elif method == 'kbmf':
            model = KBMF(num_factors=args['R'])
        elif method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'],
                        lambda_d=args['lambda_d'], lambda_t=args['lambda_t'],
                        max_iter=args['max_iter'])
        cmd = str(model)

        if predict_num == 0:
            print("Dataset:" + dataset + " CVS:" + str(cvs) + "\n" + cmd)
            aupr_vec, auc_vec = train(model, cv_data, X, D, T)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            print(
                "auc:%.6f, aupr: %.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f"
                % (auc_avg, aupr_avg, auc_conf, aupr_conf, time.time() - tic))
            write_metric_vector_to_file(
                auc_vec,
                os.path.join(
                    output_dir,
                    method + "_auc_cvs" + str(cvs) + "_" + dataset + ".txt"))
            write_metric_vector_to_file(
                aupr_vec,
                os.path.join(
                    output_dir,
                    method + "_aupr_cvs" + str(cvs) + "_" + dataset + ".txt"))
        elif predict_num > 0:
            print("Dataset:" + dataset + "\n" + cmd)
            seed = 7771 if method == 'cmf' else 22
            model.fix_model(intMat, intMat, drugMat, targetMat, seed)
            x, y = np.where(intMat == 0)
            # Materialise the pairs: on Python 3 zip() is a one-shot iterator.
            scores = model.predict_scores(list(zip(x, y)), 5)
            ii = np.argsort(scores)[::-1]
            predict_pairs = [(drug_names[x[i]], target_names[y[i]], scores[i])
                             for i in ii[:predict_num]]
            new_dti_file = os.path.join(
                output_dir, "_".join([method, dataset, "new_dti.txt"]))
            novel_prediction_analysis(predict_pairs, new_dti_file,
                                      os.path.join(data_dir, 'biodb'))
def main(argv):
    """Command-line entry point with file logging and external validation.

    Recognised long options: ``--method``, ``--dataset``, ``--data-dir``,
    ``--output-dir``, ``--cvs``, ``--external``, ``--specify-arg``,
    ``--method-opt``, ``--predict-num``, ``--scoring``, ``--gpmi``,
    ``--params``, ``--log`` and ``--workdir``.  A file handler writing to
    ``<workdir>/<log>`` is attached to the logger named ``"logger"``.

    Modes (selected by ``--specify-arg``, ``--external``, ``--predict-num``):

    * parameter search through ``cv_eval`` (``external == 0``) or
      ``ev_eval`` (``external == 1``, nrlmf / nrlmfb only);
    * cross-validated evaluation of the model built from the defaults plus
      ``--method-opt`` overrides, printed and logged;
    * ranking of the unknown pairs, passed on to
      ``novel_prediction_analysis``.

    Args:
        argv: Command-line arguments without the program name.

    Exits with an error message when the options cannot be parsed, when the
    method is unknown, when ``--cvs`` is not 1, 2 or 3, or when external
    validation is requested for a setting other than ``--cvs=1``.
    """
    try:
        # Short options are accepted by getopt, but only the long forms are
        # acted on in the loop below.
        opts, args = getopt.getopt(argv, "m:d:f:c:e:s:o:n:p:g:q:r:l:w", [
            "method=", "dataset=", "data-dir=", "cvs=", "external=",
            "specify-arg=", "method-opt=", "predict-num=", "scoring=",
            "gpmi=", "params=", "output-dir=", "log=", "workdir="
        ])
    except getopt.GetoptError as err:
        # Previously exited silently with status 0.
        sys.exit("Invalid command-line options: %s" % err)

    method = "nrlmf"
    dataset = "nr"
    data_dir = '.'
    output_dir = '.'
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0
    external = 0
    scoring = 'auc'
    gpmi = None
    params = None
    workdir = "./"
    logfile = 'job.log'

    seeds = [7771, 8367, 22, 1812, 4659]
    for opt, arg in opts:
        if opt == "--method":
            method = arg
        elif opt == "--dataset":
            dataset = arg
        elif opt == "--data-dir":
            data_dir = arg
        elif opt == "--output-dir":
            output_dir = arg
        elif opt == "--cvs":
            cvs = int(arg)
        elif opt == "--external":
            external = int(arg)
        elif opt == "--specify-arg":
            sp_arg = int(arg)
        elif opt == "--method-opt":
            model_settings = [s.split('=') for s in str(arg).split()]
        elif opt == "--predict-num":
            predict_num = int(arg)
        elif opt == "--scoring":
            scoring = str(arg)
        elif opt == "--gpmi":
            gpmi = dict()
            for s in str(arg).split():
                key, val = s.split('=')
                gpmi[key] = float(val)
        elif opt == "--params":
            params = read_params(str(arg))
        elif opt == "--log":
            logfile = str(arg)
        elif opt == "--workdir":
            workdir = str(arg)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Set up the file logger.
    logger = logging.getLogger("logger")
    logger.setLevel(logging.INFO)
    filename = logfile
    fh = logging.FileHandler(os.path.join(workdir, filename))
    fh.name = filename
    logger.addHandler(fh)

    # Default parameters for each method.
    nrlmf_defaults = {
        'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125,
        'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125, 'theta': 0.5,
        'max_iter': 100,
    }
    default_args = {
        'nrlmf': nrlmf_defaults,
        'nrlmfb': nrlmf_defaults,
        'netlaprls': {'gamma_d': 10, 'gamma_t': 10,
                      'beta_d': 1e-5, 'beta_t': 1e-5},
        'blmnii': {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False},
        'wnngip': {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8},
        'kbmf': {'R': 50},
        'cmf': {'K': 50, 'lambda_l': 0.5, 'lambda_d': 0.125,
                'lambda_t': 0.125, 'max_iter': 30},
    }
    if method not in default_args:
        sys.exit("Unknown method: %s" % method)
    args = dict(default_args[method])

    for key, val in model_settings:
        args[key] = float(val)

    dataset_dir = os.path.join(data_dir, 'dataset')
    intMat, drugMat, targetMat = load_data_from_file(dataset, dataset_dir)
    drug_names, target_names = get_drugs_targets_names(dataset, dataset_dir)

    ev_data = None
    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        elif cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        elif cvs == 3:  # CV setting CVS3
            X, D, T, cv = intMat.T, targetMat, drugMat, 0
        else:
            sys.exit("Unsupported --cvs value: %s (expected 1, 2 or 3)" % cvs)
        cv_data = cross_validation(X, seeds, cv)
        if cvs == 1:
            # External-validation splits are only built for CVS1.
            ev_data = external_validation(X, seeds, cv)

    if sp_arg == 0 and predict_num == 0 and external == 0:
        if method == 'nrlmf':
            cv_eval.nrlmf_cv_eval(method, dataset, cv_data, X, D, T, cvs,
                                  args, logger, scoring=scoring, gpmi=gpmi,
                                  params=params)
        elif method == 'nrlmfb':
            cv_eval.nrlmfb_cv_eval(method, dataset, cv_data, X, D, T, cvs,
                                   args, logger, scoring=scoring, gpmi=gpmi,
                                   params=params)
        elif method == 'netlaprls':
            cv_eval.netlaprls_cv_eval(method, dataset, cv_data, X, D, T, cvs,
                                      args)
        elif method == 'blmnii':
            cv_eval.blmnii_cv_eval(method, dataset, cv_data, X, D, T, cvs,
                                   args)
        elif method == 'wnngip':
            cv_eval.wnngip_cv_eval(method, dataset, cv_data, X, D, T, cvs,
                                   args, logger)
        elif method == 'kbmf':
            cv_eval.kbmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        elif method == 'cmf':
            cv_eval.cmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)

    if sp_arg == 0 and predict_num == 0 and external == 1:
        if method in ('nrlmf', 'nrlmfb'):
            if ev_data is None:
                # ev_data used to be undefined here, ending in a NameError.
                sys.exit("External validation (--external=1) is only "
                         "available with --cvs=1")
            if method == 'nrlmf':
                ev_eval.nrlmf_ev_eval(method, ev_data, X, D, T, logger,
                                      scoring=scoring, gpmi=gpmi,
                                      params=params)
            else:
                ev_eval.nrlmfb_ev_eval(method, ev_data, X, D, T, logger,
                                       scoring=scoring, gpmi=gpmi,
                                       params=params)

    if sp_arg == 1 or predict_num > 0:
        if method == 'nrlmf':
            model = NRLMF(cfix=args['c'], K1=args['K1'], K2=args['K2'],
                          num_factors=args['r'], lambda_d=args['lambda_d'],
                          lambda_t=args['lambda_t'], alpha=args['alpha'],
                          beta=args['beta'], theta=args['theta'],
                          max_iter=args['max_iter'])
        elif method == 'nrlmfb':
            # NOTE(review): 'eta1' and 'eta2' have no defaults above, so
            # they must be supplied through --method-opt.
            model = NRLMFb(cfix=args['c'], K1=args['K1'], K2=args['K2'],
                           num_factors=args['r'], lambda_d=args['lambda_d'],
                           lambda_t=args['lambda_t'], alpha=args['alpha'],
                           beta=args['beta'], theta=args['theta'],
                           max_iter=args['max_iter'], eta1=args['eta1'],
                           eta2=args['eta2'])
        elif method == 'netlaprls':
            # beta_d previously read args['beta_t'], ignoring beta_d.
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'],
                              beta_d=args['beta_d'], beta_t=args['beta_t'])
        elif method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'],
                           sigma=args['sigma'], avg=args['avg'])
        elif method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'],
                           alpha=args['alpha'])
        elif method == 'kbmf':
            model = KBMF(num_factors=args['R'])
        elif method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'],
                        lambda_d=args['lambda_d'], lambda_t=args['lambda_t'],
                        max_iter=args['max_iter'])
        cmd = str(model)

        if predict_num == 0:
            tic = time.time()
            print("Dataset:" + dataset + " CVS:" + str(cvs) + "\n" + cmd)
            aupr_vec, auc_vec = train(model, cv_data, X, D, T)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            # Format once so the console and the log report the same time.
            summary = (
                "auc:%.6f, aupr:%.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f"
                % (auc_avg, aupr_avg, auc_conf, aupr_conf, time.time() - tic))
            print(summary)
            logger.info(cmd + ', ' + summary)
        elif predict_num > 0:
            print("Dataset:" + dataset + "\n" + cmd)
            seed = 7771 if method == 'cmf' else 22
            model.fix_model(intMat, intMat, drugMat, targetMat, seed)
            x, y = np.where(intMat == 0)
            # Materialise the pairs: on Python 3 zip() is a one-shot iterator.
            scores = model.predict_scores(list(zip(x, y)), 5)
            ii = np.argsort(scores)[::-1]
            predict_pairs = [(drug_names[x[i]], target_names[y[i]], scores[i])
                             for i in ii[:predict_num]]
            new_dti_file = os.path.join(
                output_dir, "_".join([method, dataset, "new_dti.txt"]))
            novel_prediction_analysis(predict_pairs, new_dti_file,
                                      os.path.join(data_dir, 'biodb'))
def _default_method_args(method):
    """Return a fresh dict of default hyper-parameters for *method*.

    Returns None when no defaults are listed for the given method name, so
    the caller can leave its current ``args`` value untouched.
    """
    gip_defaults = {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8}
    nn_gip_defaults = {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8, 'NN': 2}
    kron_svm_defaults = {
        'C': 1., 'NbNeg': 10, 'PosNei': 10, 'NegNei': 2, 'n_proc': 1
    }
    kron_wnn_svm_defaults = {
        'C': 1., 't': 0.1, 'NbNeg': 10, 'PosNei': 10, 'NegNei': 2,
        'n_proc': 1
    }
    table = {
        'nrlmf': {
            'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125,
            'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125, 'theta': 0.5,
            'max_iter': 100
        },
        'netlaprls': {
            'gamma_d': 10, 'gamma_t': 10, 'beta_d': 1e-5, 'beta_t': 1e-5
        },
        'blmnii': {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False},
        'wnngip': gip_defaults,
        'gip': gip_defaults,
        'nnwnngip': nn_gip_defaults,
        'nngip': nn_gip_defaults,
        'kbmf': {'R': 50},
        'cmf': {
            'K': 50, 'lambda_l': 0.5, 'lambda_d': 0.125, 'lambda_t': 0.125,
            'max_iter': 30
        },
        'nnkronsvm': kron_svm_defaults,
        'nnkronsvmgip': kron_svm_defaults,
        'nnkronwnnsvmgip': kron_wnn_svm_defaults,
        'nnkronwnnsvm': kron_wnn_svm_defaults,
    }
    params = table.get(method)
    # Copy so that command-line overrides never mutate a shared template.
    return None if params is None else dict(params)


def _build_model(method, args, dataset):
    """Instantiate the model class selected by *method* from *args*.

    Raises:
        ValueError: if *method* is not one of the names handled below.
    """
    if method == 'nrlmf':
        return NRLMF(cfix=args['c'], K1=args['K1'], K2=args['K2'],
                     num_factors=args['r'], lambda_d=args['lambda_d'],
                     lambda_t=args['lambda_t'], alpha=args['alpha'],
                     beta=args['beta'], theta=args['theta'],
                     max_iter=args['max_iter'])
    if method == 'netlaprls':
        # beta_d must come from args['beta_d']; it used to be read from
        # args['beta_t'], which silently ignored the beta_d setting.
        return NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'],
                         beta_d=args['beta_d'], beta_t=args['beta_t'])
    if method == 'blmnii':
        return BLMNII(alpha=args['alpha'], gamma=args['gamma'],
                      sigma=args['sigma'], avg=args['avg'])
    if method == 'wnngip':
        return WNNGIP(T=args['T'], sigma=args['sigma'], alpha=args['alpha'])
    if method == 'gip':
        return GIP(T=args['T'], sigma=args['sigma'], alpha=args['alpha'])
    if method == 'nngip':
        return NNGIP(T=args['T'], sigma=args['sigma'], alpha=args['alpha'],
                     NN=args['NN'])
    if method == 'nnwnngip':
        return NNWNNGIP(T=args['T'], sigma=args['sigma'],
                        alpha=args['alpha'], NN=args['NN'])
    if method == 'kbmf':
        return KBMF(num_factors=args['R'])
    if method == 'cmf':
        return CMF(K=args['K'], lambda_l=args['lambda_l'],
                   lambda_d=args['lambda_d'], lambda_t=args['lambda_t'],
                   max_iter=args['max_iter'])
    if method == 'nnkronsvm':
        return NNKronSVM(C=args['C'], NbNeg=args['NbNeg'],
                         NegNei=args['NegNei'], PosNei=args['PosNei'],
                         dataset=dataset, n_proc=args['n_proc'])
    if method == 'nnkronsvmgip':
        return NNKronSVMGIP(C=args['C'], NbNeg=args['NbNeg'],
                            NegNei=args['NegNei'], PosNei=args['PosNei'],
                            dataset=dataset, n_proc=args['n_proc'])
    if method == 'nnkronwnnsvmgip':
        return NNKronWNNSVMGIP(C=args['C'], t=args['t'], NbNeg=args['NbNeg'],
                               NegNei=args['NegNei'], PosNei=args['PosNei'],
                               dataset=dataset, n_proc=args['n_proc'])
    if method == 'nnkronwnnsvm':
        return NNKronWNNSVM(C=args['C'], t=args['t'], NbNeg=args['NbNeg'],
                            NegNei=args['NegNei'], PosNei=args['PosNei'],
                            dataset=dataset, n_proc=args['n_proc'])
    raise ValueError("no model is defined for method %r" % (method,))


def main(argv):
    """Command-line entry point: parse options and run one evaluation mode.

    Recognised long options: --method, --dataset, --data-dir, --output-dir,
    --cvs, --specify-arg, --method-options, --predict-num, --cv_type,
    --i_param and --i_test.  Short options are accepted by the parser but are
    not acted upon, because only the long names are compared below.

    With --predict-num left at 0 the run mode is chosen by --specify-arg:
      * 0: call ``cv_eval.cv_eval`` for the method.
      * 1 (default): build the model from the default parameters (plus any
        --method-options overrides), call ``train`` and pickle
        ``(aupr_vec, auc_vec, cpu_seconds, wall_seconds)`` under ``results/``.
      * 2: call ``cv_eval.eval`` for the (i_param, i_test) pair, unless the
        corresponding result file already exists (then exit with status 1).

    Args:
        argv: list of command-line arguments, without the program name.

    Raises:
        SystemExit: on invalid or missing options, or when the result file
            for a --specify-arg 2 run already exists.
        ValueError: on an unsupported --cvs value, or when --specify-arg 1 is
            used with a method that has no model constructor here.
    """
    long_options = [
        "method=", "dataset=", "data-dir=", "cvs=", "specify-arg=",
        "method-options=", "predict-num=", "output-dir=", "cv_type=",
        "i_param=", "i_test="
    ]
    try:
        opts, args = getopt.getopt(argv, "m:d:f:c:s:o:n:p:C", long_options)
    except getopt.GetoptError as err:
        # Report the problem instead of exiting silently with status 0.
        sys.stderr.write("Invalid command line: %s\n" % err)
        sys.exit(2)

    data_dir = 'data'
    output_dir = 'results'
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0
    method = dataset = cv_type = None
    i_param = i_test = None
    # Fixed seeds handed to cross_validation.
    seeds = [7771, 8367, 22, 1812]

    for opt, arg in opts:
        print(opt, arg)
        if opt == "--method":
            method = arg.replace('"', '')
        elif opt == "--dataset":
            dataset = arg.replace('"', '')
        elif opt == "--data-dir":
            data_dir = arg
        elif opt == "--output-dir":
            output_dir = arg
        elif opt == "--cvs":
            cvs = int(arg)
        elif opt == "--specify-arg":
            sp_arg = int(arg)
        elif opt == "--i_param":
            i_param = int(arg)
        elif opt == "--i_test":
            i_test = int(arg)
        elif opt == "--method-options":
            # "k1=v1 k2=v2 ..."; split on the first '=' only so that values
            # may themselves contain '='.
            model_settings = [s.split('=', 1) for s in str(arg).split()]
        elif opt == "--predict-num":
            predict_num = int(arg)
        elif opt == "--cv_type":
            cv_type = arg.replace('"', '')

    # Fail early with a readable message instead of an UnboundLocalError
    # somewhere further down.
    if method is None or dataset is None:
        sys.exit("--method and --dataset are required")
    if predict_num == 0 and cv_type is None:
        sys.exit("--cv_type is required when --predict-num is 0")
    if sp_arg == 2 and predict_num == 0 and (i_param is None
                                             or i_test is None):
        sys.exit("--i_param and --i_test are required with --specify-arg 2")

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Default parameters for each method.  When the method has no defaults
    # listed, `args` keeps the value returned by getopt (leftover positional
    # arguments), as before.
    defaults = _default_method_args(method)
    if defaults is not None:
        args = defaults
    # NOTE(review): overrides are stored as the raw command-line strings;
    # presumably the model constructors convert them - confirm.
    for key, val in model_settings:
        args[key] = val

    # NOTE(review): result files are read from / written to the literal
    # 'results/' directory, independently of --output-dir.
    if sp_arg == 2 and predict_num == 0:
        print(2, 'bis')
        m = get_name_method(method)
        data_file = dataset + "_" + str(cvs) + "_" + cv_type + '_Model:' + m + '_' + \
            str(i_param) + '_' + str(i_test)
        result_path = 'results/' + data_file + '.data'
        if os.path.isfile(result_path):
            # This (i_param, i_test) combination already has a result file.
            print('found', result_path)
            sys.exit(1)

    intMat, drugMat, targetMat, limit = load_data_from_file(
        dataset, os.path.join(data_dir, ''))
    # The names are not used further down; the call is kept as it was.
    drug_names, target_names = get_drugs_targets_names(
        dataset, os.path.join(data_dir, ''))

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        elif cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        elif cvs == 3:  # CV setting CVS3: transposed matrix, roles swapped
            X, D, T, cv = intMat.T, targetMat, drugMat, 0
        else:
            raise ValueError("--cvs must be 1, 2 or 3, got %r" % (cvs,))
        cv_data = cross_validation(X, D, T, seeds, cv, limit, 10,
                                   cv_type=cv_type)
    print(intMat.shape)

    if sp_arg == 0 and predict_num == 0:
        print(0)
        cv_eval.cv_eval(method, dataset, cv_data, X, D, T, cvs, args, cv_type)

    if sp_arg == 2 and predict_num == 0:
        print(2)
        cv_eval.eval(method, dataset, cv_data, X, D, T, cvs, args, cv_type,
                     i_param, i_test)

    if sp_arg == 1:
        # NOTE(review): with --predict-num > 0 no cross-validation data was
        # built above, so this branch cannot complete in that case.
        # time.clock() was removed in Python 3.8; use process_time() where it
        # exists and fall back to clock() on older interpreters.
        cpu_clock = getattr(time, 'process_time', None) or time.clock
        cpu_start, wall_start = cpu_clock(), time.time()

        model = _build_model(method, args, dataset)
        cmd = str(model)
        print("Dataset:" + dataset + " CVS:" + str(cvs) + "\n" + cmd)
        aupr_vec, auc_vec = train(model, method, dataset, cv_data, X, D, T,
                                  cv_type)
        aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
        auc_avg, auc_conf = mean_confidence_interval(auc_vec)
        data_file = dataset + "_" + str(
            cvs) + "_" + cv_type + '_DefaultParam_Model:' + method

        # Elapsed = end - start, so the reported durations are positive.
        tic, toc = cpu_clock() - cpu_start, time.time() - wall_start
        print(np.mean(aupr_vec), np.std(aupr_vec), np.mean(auc_vec),
              np.std(auc_vec), tic, toc)
        # Context manager guarantees the result file is flushed and closed.
        with open('results/' + data_file + '.data', 'wb') as out_file:
            pickle.dump((aupr_vec, auc_vec, tic, toc), out_file)