def task(args):
    """Run the feature-selection drug-sensitivity experiment for one setup.

    Loads the gene-expression and drug-response matrices from HDF5 and then,
    for every drug, privacy budget, model seed and data seed, selects
    features with ``select_features``, prepares the data with
    ``dp.processData``, predicts with ``dp.predictMCMC`` or ``dp.predictL``
    and scores the prediction with ``dp.precision`` and ``dp.pc``.  One pair
    of ``.npy`` files per (drug, model seed) is written to ``drugsens_res/``.

    Args:
        args: Tuple ``(d, (gdsc_data_name, gdsc_data_type), drugids)``.
            ``d`` is the feature count requested from ``select_features``,
            ``gdsc_data_name`` is the stem of the HDF5 file under ``data/``,
            ``gdsc_data_type`` is the key read from that file, and
            ``drugids`` is an iterable of column indices into the
            drug-response matrix.

    The function also reads these module-level settings: ``n_npv``,
    ``n_test``, ``seeds``, ``eps``, ``model_seeds``, ``sparsity``,
    ``lasso_max_iter``, ``clipping_only`` and ``mcmc``.
    """
    (d, (gdsc_data_name, gdsc_data_type), drugids) = args
    logging.info("repr_dim = %s", d)
    logging.info("gdsc_data_name = %s", gdsc_data_name)

    import diffpri as dp
    import pandas

    # Import data
    logging.info("Loading gene expressions...")
    geneexpr = pandas.read_hdf("data/%s.h5" % (gdsc_data_name), gdsc_data_type)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on
    # both old and new releases.
    x_full = geneexpr.values
    n, _ = x_full.shape
    logging.info(" * size = %s x %s", *x_full.shape)

    logging.info("Loading drug sensitivity data...")
    drugres = pandas.read_hdf("data/GDSC_drugres.h5", 'drug_responses')
    y = drugres.values
    logging.info(" * size = %s x %s", *y.shape)
    assert x_full.shape[0] == y.shape[0]

    # Whatever is neither non-private nor held out for testing is private.
    n_pv = n - n_npv - n_test
    pv_max = n_pv

    logging.info("Running the tests...")
    for drugid in drugids:
        logging.info("drugid = %d", drugid)
        sd = np.nanstd(y[:, drugid], ddof=1)
        result_shape = (len(seeds), len(eps), len(model_seeds))
        S = np.zeros(result_shape, dtype=np.float64)
        R = np.zeros(result_shape, dtype=np.float64)

        for j, e in enumerate(eps):
            logging.info(" epsilon = %s", e)
            # The budget is split evenly between feature selection and
            # prediction.
            repr_eps = e / 2
            pred_eps = e / 2
            if np.isinf(e):
                w_x = np.inf
                w_y = np.inf
            else:
                # np.asscalar was removed in NumPy 1.23; ndarray.item() is
                # the documented replacement.
                w_x = np.loadtxt(
                    "drugsens_params/clipping/wx_n%d_d%d_e%s.txt"
                    % (n_pv, d, pred_eps)).item()
                w_y = np.loadtxt(
                    "drugsens_params/clipping/wy_n%d_d%d_e%s.txt"
                    % (n_pv, d, pred_eps)).item()

            # Result arrays are indexed by position, not by seed value, so
            # seed lists that are not range(k) land in the right slot.
            for m_idx, model_seed in enumerate(model_seeds):
                logging.info(" model seed = %d", model_seed)
                np.random.seed(model_seed)
                logging.info(" selecting features...")
                selected_features = select_features(
                    x_full, y[:, drugid], [d], sparsity, repr_eps,
                    lasso_max_iter=lasso_max_iter, fit_intercept=True)
                x = x_full[:, selected_features[d]]
                # NOTE(review): this rebinds d to the number of columns
                # actually selected; later iterations and the output file
                # names then use that value -- confirm this is intended.
                d = x.shape[1]

                for s_idx, seed in enumerate(seeds):
                    logging.info(" seed = %d", seed)

                    # Process data
                    (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                     x_test, y_test, B_x, B_y, n_train,
                     private) = dp.processData(
                        x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y,
                        drugid, seed)
                    if np.isinf(e) or clipping_only:
                        private = False

                    # Fit model
                    if mcmc:
                        pred = dp.predictMCMC(
                            n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                            nyy_pv, nyy_npv, B_x, B_y, pred_eps, x_test,
                            private)
                    else:
                        pred = dp.predictL(
                            nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y,
                            pred_eps, x_test, private)

                    # Evaluate
                    S[s_idx, j, m_idx] = dp.precision(pred, y_test)
                    R[s_idx, j, m_idx] = dp.pc(pred, y_test, sd)

        # Save results: one (seeds x eps) slice per model seed.
        for m_idx, model_seed in enumerate(model_seeds):
            dim_red = '%s-kifer_%d-%d' % (gdsc_data_name, d, model_seed)
            resname = "%s-pv%dnpv%dtst%d%s%s-%d" % (
                dim_red,
                n_pv,
                n_npv,
                n_test,
                ("-cliponly" if clipping_only else ""),
                ("-mcmc" if mcmc else "-fixed"),
                drugid,
            )
            filename = "drugsens_res/corr-%s.npy" % (resname)
            np.save(filename, S[:, :, m_idx])
            logging.info("saved %s", filename)
            filename = "drugsens_res/wpc-%s.npy" % (resname)
            np.save(filename, R[:, :, m_idx])
            logging.info("saved %s", filename)
# Cross-validation: every seed is passed to dp.processData and gets its own
# pair of result files.
n_cv = 50
for seed in range(n_cv):
    # One row per entry of pv_size; the single column keeps the CSV layout.
    S = np.zeros((len(pv_size), 1), dtype=np.float64)
    R = np.zeros((len(pv_size), 1), dtype=np.float64)

    for i, n_pv in enumerate(pv_size):
        w_x = WX[i, w_ind]
        w_y = WY[i, w_ind]

        # Process data
        (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
         x_test, y_test, B_x, B_y, n_train, private) = dp.processData(
            x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed)

        # Fit model
        if mcmc:
            pred = dp.predictMCMC(
                n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                B_x, B_y, e, x_test, private)
        else:
            pred = dp.predictL(
                nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y, e, x_test,
                private)

        # Evaluate
        S[i, 0] = dp.precision(pred, y_test)
        R[i, 0] = dp.pc(pred, y_test, sd)

    # Save results into file
    csvpath = (datapath + 'cliptest-drugsens-corr-' + stre + '-'
               + str(drugid) + '-' + str(seed) + '.csv')
    np.savetxt(csvpath, S, delimiter=',')
    csvpath = (datapath + 'cliptest-drugsens-wpc-' + stre + '-'
               + str(drugid) + '-' + str(seed) + '.csv')
    # The wpc path used to be built without ever writing R, so those scores
    # were discarded; write them the same way as the corr scores.
    np.savetxt(csvpath, R, delimiter=',')
R = np.zeros((len(pv_size), 1), dtype=np.float64) for i in range(len(pv_size)): n_pv = pv_size[i] w_x = WX[i, w_ind] w_y = WY[i, w_ind] # Process data nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test, y_test, B_x, B_y, n_train, private = dp.processData( x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed, clipdata=False) # modification: lr private = False # modification: lr # Fit model if mcmc: pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, B_x, B_y, e, x_test, private) else:
def task(args):
    """Run the drug-sensitivity experiment on a precomputed representation.

    Reads the representation matrix from ``data_repr/<dim_red>.csv`` and the
    drug-response matrix from HDF5, optionally transforms the representation
    with FastICA, and then, for every drug, data seed and privacy budget,
    prepares the data with ``dp.processData``, predicts with
    ``dp.predictMCMC`` or ``dp.predictL`` and scores the prediction with
    ``dp.precision`` and ``dp.pc``.  One pair of ``.npy`` files per drug is
    written to ``drugsens_res/``.

    Args:
        args: One-element tuple ``(dim_red,)``; ``dim_red`` is the stem of
            the CSV file under ``data_repr/`` and the prefix of the result
            file names.

    The function also reads these module-level settings: ``drugids``,
    ``n_npv``, ``n_test``, ``seeds``, ``eps``, ``ica``, ``clipping_only``
    and ``mcmc``.
    """
    (dim_red, ) = args
    logging.info("dim_red = %s", dim_red)

    import diffpri as dp
    import csv
    import pandas

    # Import data
    logging.info("Loading representation...")
    filename = "%s.csv" % (dim_red)
    # The context manager closes the file even when parsing fails.
    with open("data_repr/" + filename, 'rt') as f:
        x = np.array(list(csv.reader(f, delimiter=','))).astype(float)
    (n, d) = x.shape
    logging.info(" * size = %s x %s", n, d)

    logging.info("Loading drug sensitivity data...")
    drugres = pandas.read_hdf("data/GDSC_drugres.h5", 'drug_responses')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on
    # both old and new releases.
    y = drugres.values
    logging.info(" * size = %s x %s", *y.shape)
    assert x.shape[0] == y.shape[0]

    # Whatever is neither non-private nor held out for testing is private.
    n_pv = n - n_npv - n_test
    pv_max = n_pv

    if ica:
        logging.info("Running FastICA...")
        from sklearn.decomposition import FastICA
        x = FastICA(max_iter=2000).fit_transform(x)

    logging.info("Running the tests...")
    for drugid in drugids:
        logging.info("drugid = %d", drugid)
        S = np.zeros((len(seeds), len(eps)), dtype=np.float64)
        R = np.zeros((len(seeds), len(eps)), dtype=np.float64)
        # Depends only on the drug, so compute it once per drug.
        sd = np.nanstd(y[:, drugid], ddof=1)

        # Result arrays are indexed by position, not by seed value, so seed
        # lists that are not range(k) land in the right slot.
        for s_idx, seed in enumerate(seeds):
            logging.info("seed = %d", seed)

            for j, e in enumerate(eps):
                if np.isinf(e):
                    w_x = np.inf
                    w_y = np.inf
                else:
                    # np.asscalar was removed in NumPy 1.23; ndarray.item()
                    # is the documented replacement.
                    w_x = np.loadtxt(
                        "drugsens_params/clipping/wx_n%d_d%d_e%s.txt"
                        % (n_pv, d, e)).item()
                    w_y = np.loadtxt(
                        "drugsens_params/clipping/wy_n%d_d%d_e%s.txt"
                        % (n_pv, d, e)).item()

                # Process data
                (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                 x_test, y_test, B_x, B_y, n_train,
                 private) = dp.processData(
                    x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y,
                    drugid, seed)
                if np.isinf(e) or clipping_only:
                    private = False

                # Fit model
                if mcmc:
                    pred = dp.predictMCMC(
                        n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv,
                        nyy_npv, B_x, B_y, e, x_test, private)
                else:
                    pred = dp.predictL(
                        nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y, e,
                        x_test, private)

                # Evaluate
                S[s_idx, j] = dp.precision(pred, y_test)
                R[s_idx, j] = dp.pc(pred, y_test, sd)

        # Save results
        resname = "%s-pv%dnpv%dtst%d%s%s%s-%d" % (
            dim_red,
            n_pv,
            n_npv,
            n_test,
            ("-ica" if ica else ""),
            ("-cliponly" if clipping_only else ""),
            ("-mcmc" if mcmc else "-fixed"),
            drugid,
        )
        filename = "drugsens_res/corr-%s.npy" % (resname)
        np.save(filename, S)
        logging.info("saved %s", filename)
        filename = "drugsens_res/wpc-%s.npy" % (resname)
        np.save(filename, R)
        logging.info("saved %s", filename)
# Sweep every (pv_size entry, epsilon) pair, score each entry of
# suff_stats_all and store the score in res_all[m][i, j].
# NOTE(review): the nesting of the final dump and print was reconstructed
# from a collapsed line -- confirm against the original layout.
for i, requested_pv in enumerate(pv_size):
    n_pv = requested_pv
    d = pars['dim']

    for j, budget in enumerate(eps):
        pars['epsilon'] = budget
        pars['delta'] = delta_list[j]
        w_x = WX[i, j]
        w_y = WY[i, j]

        # check amount of data, use maximum amount if too few samples
        if n_pv + n_test > n_data:
            usable_pv = n_data - n_test
            print('Not enough non-missing data! Continuing with maximum amount of private data: ' + str(usable_pv))
            n_pv = usable_pv

        # Process data
        processed = dp.processData(
            x, y, d, n_test, n_pv, pv_max, w_x, w_y, drugid, seed, pars)
        (suff_stats_all, sigma_all, added_noise_dict,
         x_test, y_test, B_x, B_y, n_train) = processed

        # calculate predictions
        for m in suff_stats_all:
            stats = suff_stats_all[m]
            pred = dp.predictL(stats[0], stats[1], x_test)
            res_all[m][i, j] = dp.precision(pred, y_test)

# Persist the collected scores for this (drugid, seed).
result_path = ''.join(
    ['res/cliptest-drugsens-', str(drugid), '-', str(seed), '.pickle'])
with open(result_path, 'wb') as f:
    pickle.dump(res_all, f, pickle.HIGHEST_PROTOCOL)

print('Done.')