def main(argv):
    try:
        opts, args = getopt.getopt(argv, "m:d:f:c:s:o:n:p", ["method=", "dataset=", "data-dir=", "cvs=", "specify-arg=", "method-options=", "predict-num=", "output-dir=", ])
    except getopt.GetoptError:
        sys.exit()
    data_dir = os.path.join(os.path.pardir, 'data')
    output_dir = os.path.join(os.path.pardir, 'output')
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0
    seeds = [7771, 8367, 22, 1812, 4659]
    # seeds = np.random.choice(10000, 5, replace=False)
    for opt, arg in opts:
        if opt == "--method":
            method = arg
        if opt == "--dataset":
            dataset = arg
        if opt == "--data-dir":
            data_dir = arg
        if opt == "--output-dir":
            output_dir = arg
        if opt == "--cvs":
            cvs = int(arg)
        if opt == "--specify-arg":
            sp_arg = int(arg)
        if opt == "--method-options":
            model_settings = [s.split('=') for s in str(arg).split()]
        if opt == "--predict-num":
            predict_num = int(arg)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # default parameters for each method
    if method == 'nrlmf':
        args = {'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125, 'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125, 'theta': 0.5, 'max_iter': 100}
        # args = {'c': 5, 'K1': 2, 'K2': 2, 'r': 3, 'lambda_d': 0.125, 'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125, 'theta': 0.5, 'max_iter': 100}
    if method == 'netlaprls':
        args = {'gamma_d': 10, 'gamma_t': 10, 'beta_d': 1e-5, 'beta_t': 1e-5}
    if method == 'blmnii':
        args = {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False}
    if method == 'wnngip':
        args = {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8}
    if method == 'kbmf':
        args = {'R': 50}
    if method == 'cmf':
        args = {'K': 50, 'lambda_l': 0.5, 'lambda_d': 0.125, 'lambda_t': 0.125, 'max_iter': 30}
    for key, val in model_settings:
        args[key] = val

    # ZINC test lines
    intMat, testMat, drugMat, targetMat = load_data_from_file_csv(dataset, os.path.join(data_dir))
    drug_names, target_names = get_drug_target_names_zinc()  # ZINC test lines
    # drug_names, target_names = get_drugs_targets_names(dataset, os.path.join(data_dir))

    # Demo lines
    # intMat, testMat, drugMat, targetMat = load_data_from_file_demo(dataset, os.path.join(data_dir))
    # drug_names, target_names = get_drug_target_names_demo()
    # demo lines

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        if cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        if cvs == 3:  # CV setting CVS3
            X, D, T, cv = intMat.T, targetMat, drugMat, 0
        cv_data = cross_validation(X, seeds, cv)

    if sp_arg == 0 and predict_num == 0:
        if method == 'nrlmf':
            cv_eval.nrlmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'netlaprls':
            cv_eval.netlaprls_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'blmnii':
            cv_eval.blmnii_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'wnngip':
            cv_eval.wnngip_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'kbmf':
            cv_eval.kbmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'cmf':
            cv_eval.cmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)

    if sp_arg == 1 or predict_num > 0:
        tic = time.clock()
        if method == 'nrlmf':
            model = NRLMF(cfix=args['c'], K1=args['K1'], K2=args['K2'], num_factors=args['r'], lambda_d=args['lambda_d'], lambda_t=args['lambda_t'], alpha=args['alpha'], beta=args['beta'], theta=args['theta'], max_iter=args['max_iter'])
        if method == 'netlaprls':
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'], beta_d=args['beta_d'], beta_t=args['beta_t'])
        if method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'], sigma=args['sigma'], avg=args['avg'])
        if method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'], alpha=args['alpha'])
        if method == 'kbmf':
            model = KBMF(num_factors=args['R'])
        if method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'], lambda_d=args['lambda_d'], lambda_t=args['lambda_t'], max_iter=args['max_iter'])
        cmd = str(model)
        if predict_num == 0:
            print "Dataset:"+dataset+" CVS:"+str(cvs)+"\n"+cmd
            aupr_vec, auc_vec = train(model, cv_data, X, D, T)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            print "auc:%.6f, aupr: %.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f" % (auc_avg, aupr_avg, auc_conf, aupr_conf, time.clock()-tic)
            write_metric_vector_to_file(auc_vec, os.path.join(output_dir, method+"_auc_cvs"+str(cvs)+"_"+dataset+".txt"))
            write_metric_vector_to_file(aupr_vec, os.path.join(output_dir, method+"_aupr_cvs"+str(cvs)+"_"+dataset+".txt"))
        elif predict_num > 0:
            print "Dataset:"+dataset+"\n"+cmd
            seed = 7771 if method == 'cmf' else 22
            model.fix_model(intMat, intMat, drugMat, targetMat, seed)
            # x, y = np.where(intMat == 0)
            x, y = np.where(intMat >= 0)  # pick all pairs, including train pairs
            scores = model.predict_scores_NaNtest(zip(x, y), 5)
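
# The loop over model_settings above applies "--method-options" overrides on top
# of the per-method defaults. A minimal standalone sketch of that parsing step
# (the helper name _parse_method_options is illustrative, not part of the
# original script):
def _parse_method_options(option_string, defaults):
    """Overlay space-separated 'key=value' pairs onto a copy of the defaults."""
    args = dict(defaults)
    for key, val in (s.split('=') for s in option_string.split()):
        args[key] = val  # values remain strings here, exactly as in main() above
    return args

# Example: _parse_method_options("c=10 r=100", {'c': 5, 'r': 50})
# returns {'c': '10', 'r': '100'}.
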
def thear(method, dataset, data_dir, output_dir, cvs, sp_arg, model_settings, predict_num, seeds, seedsOptPar, args):
    intMat, drugMat, targetMat = load_data_from_file(dataset, os.path.join(data_dir, 'datasets'))
    drug_names, target_names = get_drugs_targets_names(dataset, os.path.join(data_dir, 'datasets'))
    invert = 0
    if method == 'inv_brdti':
        invert = 1

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        if cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        if cvs == 3:  # CV setting CVS3
            X, D, T, cv, invert = intMat.T, targetMat, drugMat, 0, 1
        if cvs == 4:
            X, D, T, cv = intMat, drugMat, targetMat, 2
        cv_data = cross_validation(X, seeds, cv, invert, num=10)
        if invert:
            X, D, T = intMat, drugMat, targetMat
        # cv_data_optimize_params = cross_validation(X, seedsOptPar, cv, invert, num=5)

    if sp_arg == 0 and predict_num == 0:
        if method == "vbmklmf":
            cv_eval.vbmklmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == "ensambledti":
            cv_eval.vbmklmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'netcbp':
            cv_eval.netcbp_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        # if method == 'ndaf':
        #     cv_eval.ndaf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'grmf':
            cv_eval.grmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'pudt':
            cv_eval.pudt_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'daspfind':
            cv_eval.daspfind_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'dnilmf':
            cv_eval.dnilmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'dthybrid':
            cv_eval.dthybrid_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'kronrlsmkl':
            cv_eval.kronrismkl_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'brdti':
            cv_eval.brdti_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'ddr':
            cv_eval.ddr_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'inv_brdti':
            cv_eval.brdti_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'nrlmf':
            cv_eval.nrlmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'netlaprls':
            cv_eval.netlaprls_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'blmnii':
            cv_eval.blmnii_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'wnngip':
            cv_eval.wnngip_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'kbmf':
            cv_eval.kbmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'cmf':
            cv_eval.cmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)

    if sp_arg == 1 or predict_num > 0:
        tic = time.clock()
        if method == "netcbp":
            model = NetCBP()
        # if method == "ndaf":
        #     model = NDAF()
        if method == "grmf":
            model = GRMF(cv=cvs)
        if method == "pudt":
            model = PUDT(dataset=dataset)
        if method == "vbmklmf":
            model = VBMKLMF(dataset=dataset, cvs=cvs)
        if method == 'dnilmf':
            model = DNILMF(dataset=dataset)
        if method == 'kronrlsmkl':
            model = KronRLsMKL(dataset=dataset)
        if method == 'dthybrid':
            model = DTHYBRID(dataset=dataset)
        if method == 'daspfind':
            model = DASPFIND(alpha=args['alpha'])
        if (method == 'brdti') | (method == 'inv_brdti'):
            # model = BRDTI(D=args['D'], learning_rate=args['learning_rate'], max_iters=args['max_iters'], simple_predict=args['simple_predict'], bias_regularization=args['bias_regularization'], global_regularization=args['global_regularization'], cbSim=args['cbSim'], cb_alignment_regularization_user=args['cb_alignment_regularization_user'], cb_alignment_regularization_item=args['cb_alignment_regularization_item'])
            model = BRDTI(args)
        if method == 'nrlmf':
            model = NRLMF(cfix=args['c'], K1=args['K1'], K2=args['K2'], num_factors=args['r'], lambda_d=args['lambda_d'], lambda_t=args['lambda_t'], alpha=args['alpha'], beta=args['beta'], theta=args['theta'], max_iter=args['max_iter'])
        if method == 'ddr':
            model = DDR(dataset=dataset, cv=cvs)
        if method == 'netlaprls':
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'], beta_d=args['beta_d'], beta_t=args['beta_t'])
        if method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'], sigma=args['sigma'], avg=args['avg'])
        if method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'], alpha=args['alpha'])
        if method == 'kbmf':
            model = KBMF(num_factors=args['R'])
        if method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'], lambda_d=args['lambda_d'], lambda_t=args['lambda_t'], max_iter=args['max_iter'])
        if method == 'ensambledti':
            model = EnsambleDTI(args=args, dataset=dataset)
        cmd = str(model)
        if predict_num == 0:
            print("Dataset:" + dataset + " CVS:" + str(cvs) + "\n" + cmd)
            aupr_vec, auc_vec = train(model, cv_data, X, D, T, cvs, dataset)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            print("auc:%.6f, aupr: %.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f" % (auc_avg, aupr_avg, auc_conf, aupr_conf, time.clock() - tic))
            write_metric_vector_to_file(auc_vec, os.path.join(output_dir, method + "_auc_cvs" + str(cvs) + "_" + dataset + ".txt"))
            write_metric_vector_to_file(aupr_vec, os.path.join(output_dir, method + "_aupr_cvs" + str(cvs) + "_" + dataset + ".txt"))
        elif predict_num > 0:
            print("Dataset:" + dataset + "\n" + cmd)
            seed = 7771 if method == 'cmf' else 22
            model.fix_model(intMat, intMat, drugMat, targetMat, seed)
            x, y = np.where(intMat == 0)
            scores = model.predict_scores(zip(x, y), 5)
            ii = np.argsort(scores)[::-1]
            predict_pairs = [(drug_names[x[i]], target_names[y[i]], scores[i]) for i in ii[:predict_num]]
            new_dti_file = os.path.join(output_dir, "_".join([method, dataset, "new_dti.txt"]))
            novel_prediction_analysis(predict_pairs, new_dti_file, os.path.join(data_dir, 'biodb'))
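
# thear() selects the cross-validation layout from the CVS flag. A hedged,
# simplified sketch of that dispatch (the helper name _select_cv_setting is
# illustrative; CVS4 and the invert bookkeeping of the original are omitted):
def _select_cv_setting(cvs, intMat, drugMat, targetMat):
    """Return (X, D, T, cv) for the requested CV setting."""
    if cvs == 1:  # CVS1: hold out individual drug-target pairs
        return intMat, drugMat, targetMat, 1
    if cvs == 2:  # CVS2: hold out whole drug rows
        return intMat, drugMat, targetMat, 0
    if cvs == 3:  # CVS3: hold out whole target columns (transpose and swap sides)
        return intMat.T, targetMat, drugMat, 0
    raise ValueError("unsupported CVS setting: %r" % cvs)
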
def learn_hyperparameters(self, intMat, drugMat, targetMat, seed=500):
    cv_data_optimize_params = cross_validation(intMat, [seed], 1, 0, num=5)
    params = cv_eval.blmnii_cv_eval("blmnii", "", "", cv_data_optimize_params, intMat, drugMat, targetMat, 1, "")
    self.alpha = params["alpha"]
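
# Usage sketch for learn_hyperparameters (the calling convention below is
# assumed, not taken from the surrounding file): tune alpha on an internal
# split of the training interactions before fitting the final BLMNII model.
#
#   model = BLMNII(alpha=0.7, gamma=1.0, sigma=1.0, avg=False)
#   model.learn_hyperparameters(intMat, drugMat, targetMat, seed=500)
#   model.fix_model(intMat, intMat, drugMat, targetMat, seed=22)
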
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "m:d:f:c:s:o:n:p", ["method=", "dataset=", "data-dir=", "cvs=", "specify-arg=", "method-options=", "predict-num=", "output-dir=", ])
    except getopt.GetoptError:
        sys.exit()
    data_dir = 'data'
    output_dir = 'output'
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0
    seeds = [7771, 8367, 22, 1812, 4659]
    seedsOptPar = [156]
    # seeds = np.random.choice(10000, 5, replace=False)
    for opt, arg in opts:
        if opt == "--method":
            method = arg
        if opt == "--dataset":
            dataset = arg
        if opt == "--data-dir":
            data_dir = arg
        if opt == "--output-dir":
            output_dir = arg
        if opt == "--cvs":
            cvs = int(arg)
        if opt == "--specify-arg":
            sp_arg = int(arg)
        if opt == "--method-options":
            model_settings = [s.split('=') for s in str(arg).split()]
        if opt == "--predict-num":
            predict_num = int(arg)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if not os.path.isdir(os.path.join(output_dir, "optPar")):
        os.makedirs(os.path.join(output_dir, "optPar"))

    # default parameters for each method
    if (method == 'brdti') | (method == 'inv_brdti'):
        args = {'D': 100,
                'learning_rate': 0.1,
                'max_iters': 100,
                'simple_predict': False,
                'bias_regularization': 1,
                'global_regularization': 10**(-2),
                'cbSim': "knn",
                'cb_alignment_regularization_user': 1,
                'cb_alignment_regularization_item': 1}
    if method == 'netlaprls':
        args = {'gamma_d': 10, 'gamma_t': 10, 'beta_d': 1e-5, 'beta_t': 1e-5}
    if method == 'blmnii':
        args = {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False}
    if method == 'wnngip':
        args = {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8}
    if method == 'cmf':
        args = {'K': 100, 'lambda_l': 0.5, 'lambda_d': 0.125, 'lambda_t': 0.125, 'max_iter': 100}
    # print(model_settings)
    for key, val in model_settings:
        args[key] = float(eval(val))

    intMat, drugMat, targetMat = load_data_from_file(dataset, os.path.join(data_dir, 'datasets'))
    drug_names, target_names = get_drugs_targets_names(dataset, os.path.join(data_dir, 'datasets'))

    invert = 0
    if method == 'inv_brdti':
        invert = 1

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        if cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        if cvs == 3:  # CV setting CVS3
            X, D, T, cv = intMat.T, targetMat, drugMat, 0
        cv_data = cross_validation(X, seeds, cv, invert)
        cv_data_optimize_params = cross_validation(X, seedsOptPar, cv, invert, num=5)

    if sp_arg == 0 and predict_num == 0:
        if method == 'brdti':
            cv_eval.brdti_cv_eval(method, dataset, output_dir, cv_data_optimize_params, X, D, T, cvs, args)
        if method == 'inv_brdti':
            cv_eval.brdti_cv_eval(method, dataset, output_dir, cv_data_optimize_params, X.T, T, D, cvs, args)
        if method == 'netlaprls':
            cv_eval.netlaprls_cv_eval(method, dataset, output_dir, cv_data_optimize_params, X, D, T, cvs, args)
        if method == 'blmnii':
            cv_eval.blmnii_cv_eval(method, dataset, output_dir, cv_data_optimize_params, X, D, T, cvs, args)
        if method == 'wnngip':
            cv_eval.wnngip_cv_eval(method, dataset, output_dir, cv_data_optimize_params, X, D, T, cvs, args)
        if method == 'cmf':
            cv_eval.cmf_cv_eval(method, dataset, output_dir, cv_data_optimize_params, X, D, T, cvs, args)

    if sp_arg == 1 or predict_num > 0:
        tic = time.clock()
        if (method == 'brdti') | (method == 'inv_brdti'):
            model = BRDTI(args)
        if method == 'netlaprls':
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'], beta_d=args['beta_d'], beta_t=args['beta_t'])
        if method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'], sigma=args['sigma'], avg=args['avg'])
        if method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'], alpha=args['alpha'])
        if method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'], lambda_d=args['lambda_d'], lambda_t=args['lambda_t'], max_iter=args['max_iter'])
        cmd = str(model)

        # predict hidden part of the current datasets
        if predict_num == 0:
            print "Dataset:"+dataset+" CVS:"+str(cvs)+"\n"+cmd
            if method == 'inv_brdti':
                aupr_vec, auc_vec, ndcg_inv_vec, ndcg_vec, results = train(model, cv_data, X.T, T, D)
            else:
                aupr_vec, auc_vec, ndcg_vec, ndcg_inv_vec, results = train(model, cv_data, X, D, T)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            ndcg_avg, ndcg_conf = mean_confidence_interval(ndcg_vec)
            ndcg_inv_avg, ndcg_inv_conf = mean_confidence_interval(ndcg_inv_vec)

            resfile = os.path.join('output', 'rawResults', method+"_res_"+str(cvs)+"_"+dataset+".csv")
            outd = open(resfile, "w")
            outd.write('drug;target;true;predict\n')
            for r in results:
                outd.write('%s;%s;%s;%s\n' % (r[0], r[1], r[2], r[3]))

            print "auc:%.6f, aupr: %.6f, ndcg: %.6f, ndcg_inv: %.6f, auc_conf:%.6f, aupr_conf:%.6f, ndcg_conf:%.6f, ndcg_inv_conf:%.6f, Time:%.6f" % (auc_avg, aupr_avg, ndcg_avg, ndcg_inv_avg, auc_conf, aupr_conf, ndcg_conf, ndcg_inv_conf, time.clock()-tic)
            write_metric_vector_to_file(auc_vec, os.path.join(output_dir, method+"_auc_cvs"+str(cvs)+"_"+dataset+".txt"))
            write_metric_vector_to_file(aupr_vec, os.path.join(output_dir, method+"_aupr_cvs"+str(cvs)+"_"+dataset+".txt"))
            write_metric_vector_to_file(ndcg_vec, os.path.join(output_dir, method+"_ndcg_cvs"+str(cvs)+"_"+dataset+".txt"))
            write_metric_vector_to_file(ndcg_inv_vec, os.path.join(output_dir, method+"_ndcg_inv_cvs"+str(cvs)+"_"+dataset+".txt"))

        # predict novel DTIs
        elif predict_num > 0:
            print "Dataset:"+dataset+"\n"+cmd
            seed = 376
            if invert:  # predicting drugs for targets
                model.fix_model(intMat.T, intMat.T, targetMat, drugMat, seed)
                npa = newDTIPrediction()
                x, y = np.where(intMat == 0)
                scores = model.predict_scores(zip(y, x), 1)
                sz = np.array(zip(x, y, scores))
            else:  # predicting targets for drugs
                model.fix_model(intMat, intMat, drugMat, targetMat, seed)
                npa = newDTIPrediction()
                x, y = np.where(intMat == 0)
                scores = model.predict_scores(zip(x, y), 1)
                sz = np.array(zip(x, y, scores))
            ndcg_d, ndcg_t, recall_d, recall_t = npa.verify_novel_interactions(method, dataset, sz, predict_num, drug_names, target_names)
            st_file = os.path.join('output/newDTI', "_".join([dataset, str(predict_num), "stats.csv"]))
            out = open(st_file, "a")
            out.write('%s;%f;%f;%f;%f\n' % (method, ndcg_d, ndcg_t, recall_d, recall_t))
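
# The AUC/AUPR/nDCG summaries above report a mean plus a confidence half-width
# via mean_confidence_interval(). A hedged sketch of an equivalent helper,
# inferred from how its two return values are used (the real implementation
# lives elsewhere in the repo):
import numpy as np
import scipy.stats as st

def _mean_confidence_interval(values, confidence=0.95):
    """Return (mean, half-width) of a t-based confidence interval."""
    a = np.asarray(values, dtype=float)
    half_width = st.sem(a) * st.t.ppf((1.0 + confidence) / 2.0, len(a) - 1)
    return a.mean(), half_width
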
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "m:d:f:c:s:o:n:p", [
            "method=", "dataset=", "data-dir=", "cvs=", "specify-arg=",
            "method-options=", "predict-num=", "output-dir=", ])
    except getopt.GetoptError:
        sys.exit()
    data_dir = os.path.join(os.path.pardir, 'data')
    output_dir = os.path.join(os.path.pardir, 'output')
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0
    seeds = [7771, 8367, 22, 1812, 4659]
    # seeds = np.random.choice(10000, 5, replace=False)
    for opt, arg in opts:
        if opt == "--method":
            method = arg
        if opt == "--dataset":
            dataset = arg
        if opt == "--data-dir":
            data_dir = arg
        if opt == "--output-dir":
            output_dir = arg
        if opt == "--cvs":
            cvs = int(arg)
        if opt == "--specify-arg":
            sp_arg = int(arg)
        if opt == "--method-options":
            model_settings = [s.split('=') for s in str(arg).split()]
        if opt == "--predict-num":
            predict_num = int(arg)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # default parameters for each method
    if method == 'nrlmf':
        args = {'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125,
                'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125,
                'theta': 0.5, 'max_iter': 100}
    if method == 'netlaprls':
        args = {'gamma_d': 10, 'gamma_t': 10, 'beta_d': 1e-5, 'beta_t': 1e-5}
    if method == 'blmnii':
        args = {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False}
    if method == 'wnngip':
        args = {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8}
    if method == 'kbmf':
        args = {'R': 50}
    if method == 'cmf':
        args = {'K': 50, 'lambda_l': 0.5, 'lambda_d': 0.125,
                'lambda_t': 0.125, 'max_iter': 30}
    for key, val in model_settings:
        args[key] = val

    intMat, drugMat, targetMat = load_data_from_file(dataset, os.path.join(data_dir, 'datasets'))
    drug_names, target_names = get_drugs_targets_names(dataset, os.path.join(data_dir, 'datasets'))

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        if cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        if cvs == 3:  # CV setting CVS3
            X, D, T, cv = intMat.T, targetMat, drugMat, 0
        cv_data = cross_validation(X, seeds, cv)

    if sp_arg == 0 and predict_num == 0:
        if method == 'nrlmf':
            cv_eval.nrlmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'netlaprls':
            cv_eval.netlaprls_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'blmnii':
            cv_eval.blmnii_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'wnngip':
            cv_eval.wnngip_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'kbmf':
            cv_eval.kbmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'cmf':
            cv_eval.cmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)

    if sp_arg == 1 or predict_num > 0:
        tic = time.clock()
        if method == 'nrlmf':
            model = NRLMF(cfix=args['c'], K1=args['K1'], K2=args['K2'], num_factors=args['r'],
                          lambda_d=args['lambda_d'], lambda_t=args['lambda_t'], alpha=args['alpha'],
                          beta=args['beta'], theta=args['theta'], max_iter=args['max_iter'])
        if method == 'netlaprls':
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'],
                              beta_d=args['beta_d'], beta_t=args['beta_t'])
        if method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'], sigma=args['sigma'], avg=args['avg'])
        if method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'], alpha=args['alpha'])
        if method == 'kbmf':
            model = KBMF(num_factors=args['R'])
        if method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'], lambda_d=args['lambda_d'],
                        lambda_t=args['lambda_t'], max_iter=args['max_iter'])
        cmd = str(model)
        if predict_num == 0:
            print "Dataset:" + dataset + " CVS:" + str(cvs) + "\n" + cmd
            aupr_vec, auc_vec = train(model, cv_data, X, D, T)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            print "auc:%.6f, aupr: %.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f" % (
                auc_avg, aupr_avg, auc_conf, aupr_conf, time.clock() - tic)
            write_metric_vector_to_file(auc_vec, os.path.join(output_dir, method + "_auc_cvs" + str(cvs) + "_" + dataset + ".txt"))
            write_metric_vector_to_file(aupr_vec, os.path.join(output_dir, method + "_aupr_cvs" + str(cvs) + "_" + dataset + ".txt"))
        elif predict_num > 0:
            print "Dataset:" + dataset + "\n" + cmd
            seed = 7771 if method == 'cmf' else 22
            model.fix_model(intMat, intMat, drugMat, targetMat, seed)
            x, y = np.where(intMat == 0)
            scores = model.predict_scores(zip(x, y), 5)
            ii = np.argsort(scores)[::-1]
            predict_pairs = [(drug_names[x[i]], target_names[y[i]], scores[i]) for i in ii[:predict_num]]
            new_dti_file = os.path.join(output_dir, "_".join([method, dataset, "new_dti.txt"]))
            novel_prediction_analysis(predict_pairs, new_dti_file, os.path.join(data_dir, 'biodb'))
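
# Example invocation of this CLI (the script name PyDTI.py is assumed; adjust
# to the actual entry point). With --specify-arg=1 the model runs once with the
# given parameters; with --specify-arg=0 the per-method *_cv_eval grid search
# runs instead.
#
#   python PyDTI.py --method="nrlmf" --dataset="nr" --cvs=1 --specify-arg=1 \
#       --method-options="c=5 K1=5 K2=5 r=50" --predict-num=0
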
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "m:d:f:c:e:s:o:n:p:g:q:r:l:w", [
            "method=", "dataset=", "data-dir=", "cvs=", "external=",
            "specify-arg=", "method-opt=", "predict-num=", "scoring=",
            "gpmi=", "params=", "output-dir=", "log=", "workdir="])
    except getopt.GetoptError:
        sys.exit()
    # data_dir = os.path.join(os.path.pardir, 'data')
    # output_dir = os.path.join(os.path.pardir, 'output')
    method = "nrlmf"
    dataset = "nr"
    data_dir = '.'
    output_dir = '.'
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0
    external = 0
    scoring = 'auc'
    gpmi = None
    params = None
    workdir = "./"
    logfile = 'job.log'
    seeds = [7771, 8367, 22, 1812, 4659]
    # seeds = np.random.choice(10000, 5, replace=False)
    for opt, arg in opts:
        if opt == "--method":
            method = arg
        if opt == "--dataset":
            dataset = arg
        if opt == "--data-dir":
            data_dir = arg
        if opt == "--output-dir":
            output_dir = arg
        if opt == "--cvs":
            cvs = int(arg)
        if opt == "--external":
            external = int(arg)
        if opt == "--specify-arg":
            sp_arg = int(arg)
        if opt == "--method-opt":
            model_settings = [s.split('=') for s in str(arg).split()]
        if opt == "--predict-num":
            predict_num = int(arg)
        if opt == "--scoring":
            scoring = str(arg)
        if opt == "--gpmi":
            gpmi = dict()
            for s in str(arg).split():
                key, val = s.split('=')
                gpmi[key] = float(val)
        if opt == "--params":
            params = read_params(str(arg))
        if opt == "--log":
            logfile = str(arg)
        if opt == "--workdir":
            workdir = str(arg)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # set logger
    logger = logging.getLogger("logger")
    logger.setLevel(logging.INFO)
    filename = logfile
    fh = logging.FileHandler(workdir + "/" + filename)
    fh.name = filename
    logger.addHandler(fh)

    # default parameters for each method
    if method == 'nrlmf':
        args = {'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125,
                'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125,
                'theta': 0.5, 'max_iter': 100}
    if method == 'nrlmfb':
        args = {'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125,
                'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125,
                'theta': 0.5, 'max_iter': 100}
    if method == 'netlaprls':
        args = {'gamma_d': 10, 'gamma_t': 10, 'beta_d': 1e-5, 'beta_t': 1e-5}
    if method == 'blmnii':
        args = {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False}
    if method == 'wnngip':
        args = {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8}
    if method == 'kbmf':
        args = {'R': 50}
    if method == 'cmf':
        args = {'K': 50, 'lambda_l': 0.5, 'lambda_d': 0.125,
                'lambda_t': 0.125, 'max_iter': 30}
    for key, val in model_settings:
        args[key] = float(val)

    intMat, drugMat, targetMat = load_data_from_file(dataset, os.path.join(data_dir, 'dataset'))
    drug_names, target_names = get_drugs_targets_names(dataset, os.path.join(data_dir, 'dataset'))

    if predict_num == 0:
        if cvs == 1:  # CV setting CVS1
            X, D, T, cv = intMat, drugMat, targetMat, 1
        if cvs == 2:  # CV setting CVS2
            X, D, T, cv = intMat, drugMat, targetMat, 0
        if cvs == 3:  # CV setting CVS3
            X, D, T, cv = intMat.T, targetMat, drugMat, 0
        cv_data = cross_validation(X, seeds, cv)
        if cvs == 1:
            ev_data = external_validation(X, seeds, cv)

    if sp_arg == 0 and predict_num == 0 and external == 0:
        if method == 'nrlmf':
            cv_eval.nrlmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args, logger, scoring=scoring, gpmi=gpmi, params=params)
        if method == 'nrlmfb':
            cv_eval.nrlmfb_cv_eval(method, dataset, cv_data, X, D, T, cvs, args, logger, scoring=scoring, gpmi=gpmi, params=params)
        if method == 'netlaprls':
            cv_eval.netlaprls_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'blmnii':
            cv_eval.blmnii_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'wnngip':
            cv_eval.wnngip_cv_eval(method, dataset, cv_data, X, D, T, cvs, args, logger)
        if method == 'kbmf':
            cv_eval.kbmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
        if method == 'cmf':
            cv_eval.cmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)

    if sp_arg == 0 and predict_num == 0 and external == 1:
        if method == 'nrlmf':
            ev_eval.nrlmf_ev_eval(method, ev_data, X, D, T, logger, scoring=scoring, gpmi=gpmi, params=params)
        if method == 'nrlmfb':
            ev_eval.nrlmfb_ev_eval(method, ev_data, X, D, T, logger, scoring=scoring, gpmi=gpmi, params=params)

    if sp_arg == 1 or predict_num > 0:
        if method == 'nrlmf':
            model = NRLMF(cfix=args['c'], K1=args['K1'], K2=args['K2'], num_factors=args['r'],
                          lambda_d=args['lambda_d'], lambda_t=args['lambda_t'], alpha=args['alpha'],
                          beta=args['beta'], theta=args['theta'], max_iter=args['max_iter'])
        if method == 'nrlmfb':
            model = NRLMFb(cfix=args['c'], K1=args['K1'], K2=args['K2'], num_factors=args['r'],
                           lambda_d=args['lambda_d'], lambda_t=args['lambda_t'], alpha=args['alpha'],
                           beta=args['beta'], theta=args['theta'], max_iter=args['max_iter'],
                           eta1=args['eta1'], eta2=args['eta2'])
        if method == 'netlaprls':
            model = NetLapRLS(gamma_d=args['gamma_d'], gamma_t=args['gamma_t'],
                              beta_d=args['beta_d'], beta_t=args['beta_t'])
        if method == 'blmnii':
            model = BLMNII(alpha=args['alpha'], gamma=args['gamma'], sigma=args['sigma'], avg=args['avg'])
        if method == 'wnngip':
            model = WNNGIP(T=args['T'], sigma=args['sigma'], alpha=args['alpha'])
        if method == 'kbmf':
            model = KBMF(num_factors=args['R'])
        if method == 'cmf':
            model = CMF(K=args['K'], lambda_l=args['lambda_l'], lambda_d=args['lambda_d'],
                        lambda_t=args['lambda_t'], max_iter=args['max_iter'])
        cmd = str(model)
        if predict_num == 0:
            tic = time.time()
            print("Dataset:" + dataset + " CVS:" + str(cvs) + "\n" + cmd)
            aupr_vec, auc_vec = train(model, cv_data, X, D, T)
            aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
            auc_avg, auc_conf = mean_confidence_interval(auc_vec)
            print("auc:%.6f, aupr:%.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f" % (auc_avg, aupr_avg, auc_conf, aupr_conf, time.time() - tic))
            # write_metric_vector_to_file(auc_vec, os.path.join(output_dir, method+"_auc_cvs"+str(cvs)+"_"+dataset+".txt"))
            # write_metric_vector_to_file(aupr_vec, os.path.join(output_dir, method+"_aupr_cvs"+str(cvs)+"_"+dataset+".txt"))
            logger.info(cmd + ', ' + "auc:%.6f, aupr:%.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f" % (auc_avg, aupr_avg, auc_conf, aupr_conf, time.time() - tic))
        elif predict_num > 0:
            print("Dataset:" + dataset + "\n" + cmd)
            seed = 7771 if method == 'cmf' else 22
            model.fix_model(intMat, intMat, drugMat, targetMat, seed)
            x, y = np.where(intMat == 0)
            scores = model.predict_scores(zip(x, y), 5)
            ii = np.argsort(scores)[::-1]
            predict_pairs = [(drug_names[x[i]], target_names[y[i]], scores[i]) for i in ii[:predict_num]]
            new_dti_file = os.path.join(output_dir, "_".join([method, dataset, "new_dti.txt"]))
            novel_prediction_analysis(predict_pairs, new_dti_file, os.path.join(data_dir, 'biodb'))
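
# A hedged, simplified sketch of the novel-DTI ranking done in the
# predict_num > 0 branch above, assuming a precomputed dense score matrix in
# place of model.predict_scores(): score the unknown pairs, sort them by
# descending score, and keep the top-n (drug, target, score) triples.
import numpy as np

def _top_novel_pairs(score_matrix, int_matrix, drug_names, target_names, n):
    x, y = np.where(int_matrix == 0)          # candidate pairs: unknown interactions
    scores = score_matrix[x, y]
    order = np.argsort(scores)[::-1][:n]      # highest scores first
    return [(drug_names[x[i]], target_names[y[i]], scores[i]) for i in order]
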
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "m:d:f:c:s:o:n:p", ["method=", "dataset=", "data-dir=", "cvs=", "specify-arg=", "method-options=", "predict-num=", "output-dir=", ])
    except getopt.GetoptError:
        sys.exit()
    data_dir = os.path.join(os.path.pardir, 'data')
    output_dir = os.path.join(os.path.pardir, 'output')
    cvs, sp_arg, model_settings, predict_num = 1, 1, [], 0
    seeds = [7771, 8367, 22, 1812, 4659]
    # seeds = np.random.choice(10000, 5, replace=False)
    for opt, arg in opts:
        if opt == "--method":
            method = arg
        if opt == "--dataset":
            dataset = arg
        if opt == "--data-dir":
            data_dir = arg
        if opt == "--output-dir":
            output_dir = arg
        if opt == "--cvs":
            cvs = int(arg)
        if opt == "--specify-arg":
            sp_arg = int(arg)
        if opt == "--method-options":
            model_settings = [s.split('=') for s in str(arg).split()]
        if opt == "--predict-num":
            predict_num = int(arg)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # default parameters for each method
    if method == 'nrlmf':
        args = {'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125, 'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125, 'theta': 0.5, 'max_iter': 100}
        # args = {'c': 5, 'K1': 2, 'K2': 2, 'r': 3, 'lambda_d': 0.125, 'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125, 'theta': 0.5, 'max_iter': 100}
    if method == 'netlaprls':
        args = {'gamma_d': 10, 'gamma_t': 10, 'beta_d': 1e-5, 'beta_t': 1e-5}
    if method == 'blmnii':
        args = {'alpha': 0.7, 'gamma': 1.0, 'sigma': 1.0, 'avg': False}
    if method == 'wnngip':
        args = {'T': 0.8, 'sigma': 1.0, 'alpha': 0.8}
    if method == 'kbmf':
        args = {'R': 50}
    if method == 'cmf':
        args = {'K': 50, 'lambda_l': 0.5, 'lambda_d': 0.125, 'lambda_t': 0.125, 'max_iter': 30}
    for key, val in model_settings:
        args[key] = val

    # ZINC test lines
    drug_names, target_names = get_drug_target_names_zinc()

    tprs = []  # list storing TPR35 for each CV fold
    RCRS = []  # list storing rcrs for each CV fold; rcrs is a list of lists
    for testnum in range(1, 11):  # for 10-fold CV
        intMat, testMat, drugMat, targetMat = load_data_from_file_csv_10cv(testnum, os.path.join(data_dir))  # ZINC test lines
        # drug_names, target_names = get_drugs_targets_names(dataset, os.path.join(data_dir))

        # Demo lines
        # intMat, testMat, drugMat, targetMat = load_data_from_file_demo(dataset, os.path.join(data_dir))
        # drug_names, target_names = get_drug_target_names_demo()
        # demo lines

        if predict_num == 0:
            if cvs == 1:  # CV setting CVS1
                X, D, T, cv = intMat, drugMat, targetMat, 1
            if cvs == 2:  # CV setting CVS2
                X, D, T, cv = intMat, drugMat, targetMat, 0
            if cvs == 3:  # CV setting CVS3
                X, D, T, cv = intMat.T, targetMat, drugMat, 0
            cv_data = cross_validation(X, seeds, cv)

        if sp_arg == 0 and predict_num == 0:
            if method == 'nrlmf':
                cv_eval.nrlmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
            if method == 'netlaprls':
                cv_eval.netlaprls_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
            if method == 'blmnii':
                cv_eval.blmnii_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
            if method == 'wnngip':
                cv_eval.wnngip_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
            if method == 'kbmf':
                cv_eval.kbmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)
            if method == 'cmf':
                cv_eval.cmf_cv_eval(method, dataset, cv_data, X, D, T, cvs, args)

        if sp_arg == 1 or predict_num > 0:
            tic = time.clock()
            if method == 'nrlmf':
                model = NRLMF(cfix=int(args['c']), K1=int(args['K1']), K2=int(args['K2']), num_factors=int(args['r']), lambda_d=float(args['lambda_d']), lambda_t=float(args['lambda_t']), alpha=float(args['alpha']), beta=float(args['beta']), theta=float(args['theta']), max_iter=int(args['max_iter']))
            if method == 'netlaprls':
                model = NetLapRLS(gamma_d=float(args['gamma_d']), gamma_t=float(args['gamma_t']), beta_d=float(args['beta_d']), beta_t=float(args['beta_t']))
            if method == 'blmnii':
                model = BLMNII(alpha=args['alpha'], gamma=args['gamma'], sigma=args['sigma'], avg=args['avg'])
            if method == 'wnngip':
                model = WNNGIP(T=float(args['T']), sigma=float(args['sigma']), alpha=float(args['alpha']))
            if method == 'kbmf':
                model = KBMF(num_factors=int(args['R']))
            if method == 'cmf':
                model = CMF(K=int(args['K']), lambda_l=float(args['lambda_l']), lambda_d=float(args['lambda_d']), lambda_t=float(args['lambda_t']), max_iter=int(args['max_iter']))
            cmd = str(model)
            if predict_num == 0:
                print "Dataset:"+dataset+" CVS:"+str(cvs)+"\n"+cmd
                aupr_vec, auc_vec = train(model, cv_data, X, D, T)
                aupr_avg, aupr_conf = mean_confidence_interval(aupr_vec)
                auc_avg, auc_conf = mean_confidence_interval(auc_vec)
                print "auc:%.6f, aupr: %.6f, auc_conf:%.6f, aupr_conf:%.6f, Time:%.6f" % (auc_avg, aupr_avg, auc_conf, aupr_conf, time.clock()-tic)
                write_metric_vector_to_file(auc_vec, os.path.join(output_dir, method+"_auc_cvs"+str(cvs)+"_"+dataset+".txt"))
                write_metric_vector_to_file(aupr_vec, os.path.join(output_dir, method+"_aupr_cvs"+str(cvs)+"_"+dataset+".txt"))
            elif predict_num > 0:
                # print "Dataset:"+dataset+"\n"+cmd
                seed = 7771 if method == 'cmf' else 22
                model.fix_model(intMat, intMat, drugMat, targetMat, seed)
                # x, y = np.where(intMat == 0)
                x, y = np.where(intMat >= 0)  # pick all pairs, including train pairs
                scores = model.predict_scores(zip(x, y), 5)
                # ii = np.argsort(scores)[::-1]
                # predict_pairs = [(drug_names[x[i]], target_names[y[i]], scores[i]) for i in ii[:predict_num]]
                # print(predict_pairs)
                sarr = np.array(scores)
                r, c = np.where(testMat > 0)
                rcrs = get_rcrs(sarr, zip(r, c))
                RCRS = RCRS + rcrs  # extend RCRS with this fold's ranks
                tpr_top35 = TPR_by_cutRank(rcrs, 35)
                tprs.append(tpr_top35)

    # tpr35s = TPR_by_cutRank(RCRS, 350)
    print "Dataset: "+dataset+" Rank: "+str(args['r'])+" Iter: "+str(args['max_iter'])
    print "Avg. TPR35: "+str(np.average(tprs))
    print "S.E.M. TPR35: "+str(np.std(tprs)/math.sqrt(len(tprs)))
    print "TPR35 values:"
    print tprs
    print "Rank\tTPR"
    for rank in range(1, 351):
        tpr = TPR_by_cutRank(RCRS, rank)
        print "%s\t%s" % (str(rank), str(tpr))
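
# A heavily hedged sketch of the rank-based metric printed above, under the
# assumption (not verified against the original get_rcrs/TPR_by_cutRank
# helpers) that each held-out positive is assigned its rank among the predicted
# scores and that TPR35 is the fraction of those ranks at or below 35.
import numpy as np

def _ranks_of_positives(scores, positive_indices):
    """Rank of each positive pair when all pairs are sorted by descending score."""
    order = np.argsort(np.asarray(scores))[::-1]          # best score gets rank 1
    rank_of = {int(idx): r + 1 for r, idx in enumerate(order)}
    return [rank_of[int(i)] for i in positive_indices]

def _tpr_by_cut_rank(ranks, cutoff):
    """Fraction of positives recovered within the top `cutoff` ranks."""
    ranks = np.asarray(ranks, dtype=float)
    return float((ranks <= cutoff).sum()) / len(ranks) if len(ranks) else 0.0
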