def task(args):
    """Run the feature-selection drug-sensitivity experiment for one setting.

    For every drug in ``drugids`` and every privacy budget in the module-level
    ``eps``, the budget is split evenly between feature selection and
    prediction.  Features are chosen with ``select_features`` once per model
    seed, the data is prepared with ``dp.processData`` once per data seed, a
    model is fitted with ``dp.predictMCMC`` or ``dp.predictL`` and the
    ``dp.precision`` / ``dp.pc`` scores are written as ``.npy`` files under
    ``drugsens_res/`` (one pair of files per drug and model seed).

    Args:
        args: Tuple ``(d, (gdsc_data_name, gdsc_data_type), drugids)`` where
            ``d`` is the requested representation dimension,
            ``gdsc_data_name`` names the HDF5 file under ``data/``,
            ``gdsc_data_type`` is the key read from that file, and
            ``drugids`` is an iterable of column indices into the drug
            response matrix.

    The function also reads these module-level names: ``n_npv``, ``n_test``,
    ``seeds``, ``model_seeds``, ``eps``, ``sparsity``, ``lasso_max_iter``,
    ``clipping_only``, ``mcmc`` and ``select_features``.
    """
    (d, (gdsc_data_name, gdsc_data_type), drugids) = args
    logging.info("repr_dim = %s", d)
    logging.info("gdsc_data_name = %s", gdsc_data_name)

    import diffpri as dp
    import pandas

    # Import data
    logging.info("Loading gene expressions...")
    geneexpr = pandas.read_hdf("data/%s.h5" % (gdsc_data_name), gdsc_data_type)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on both
    # old and new pandas versions.
    x_full = geneexpr.values
    logging.info(" * size = %s x %s" % x_full.shape)
    n = x_full.shape[0]

    logging.info("Loading drug sensitivity data...")
    drugres = pandas.read_hdf("data/GDSC_drugres.h5", 'drug_responses')
    y = drugres.values
    logging.info(" * size = %s x %s" % y.shape)

    assert x_full.shape[0] == y.shape[0]

    n_pv = n - n_npv - n_test
    pv_max = n_pv

    logging.info("Running the tests...")

    for drugid in drugids:
        logging.info("drugid = %d" % drugid)

        sd = np.nanstd(y[:, drugid], ddof=1)

        result_shape = (len(seeds), len(eps), len(model_seeds))
        S = np.zeros(result_shape, dtype=np.float64)
        R = np.zeros(result_shape, dtype=np.float64)

        for j, e in enumerate(eps):
            logging.info(" epsilon = %s", e)

            # Half of the budget goes to feature selection, half to prediction.
            repr_eps = e / 2
            pred_eps = e / 2

            if np.isinf(e):
                w_x = np.inf
                w_y = np.inf
            else:
                # np.asscalar was removed from NumPy; ndarray.item() is the
                # documented replacement.
                w_x = np.loadtxt(
                    "drugsens_params/clipping/wx_n%d_d%d_e%s.txt" %
                    (n_pv, d, pred_eps)).item()
                w_y = np.loadtxt(
                    "drugsens_params/clipping/wy_n%d_d%d_e%s.txt" %
                    (n_pv, d, pred_eps)).item()

            # Result arrays are addressed by position, not by seed value, so
            # seed lists other than 0..k-1 are stored correctly.
            for mi, model_seed in enumerate(model_seeds):
                logging.info("  model seed = %d" % model_seed)
                np.random.seed(model_seed)

                logging.info("   selecting features...")
                selected_features = select_features(
                    x_full,
                    y[:, drugid], [d],
                    sparsity,
                    repr_eps,
                    lasso_max_iter=lasso_max_iter,
                    fit_intercept=True)

                x = x_full[:, selected_features[d]]
                # Keep the requested dimension `d` untouched: it is reused for
                # the next selection, the clipping-parameter lookup and the
                # result file names.  Only the actual width of the selected
                # data is passed on to processData.
                d_sel = x.shape[1]

                for si, seed in enumerate(seeds):
                    logging.info("   seed = %d" % seed)

                    # Process data
                    (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv,
                     x_test, y_test, B_x, B_y, n_train,
                     private) = dp.processData(
                         x, y, d_sel, n_test, n_pv, n_npv, pv_max, w_x, w_y,
                         drugid, seed)

                    # An infinite budget or clipping-only mode overrides the
                    # privacy flag returned by processData.
                    if np.isinf(e) or clipping_only:
                        private = False

                    # Fit model
                    if mcmc:
                        pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv,
                                              nxy_npv, nyy_pv, nyy_npv, B_x,
                                              B_y, pred_eps, x_test, private)
                    else:
                        pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                                           B_x, B_y, pred_eps, x_test, private)

                    # Evaluate
                    S[si, j, mi] = dp.precision(pred, y_test)
                    R[si, j, mi] = dp.pc(pred, y_test, sd)

        # Save results: one (seeds x eps) slice per model seed.
        for mi, model_seed in enumerate(model_seeds):
            dim_red = '%s-kifer_%d-%d' % (gdsc_data_name, d, model_seed)
            resname = "%s-pv%dnpv%dtst%d%s%s-%d" % (
                dim_red,
                n_pv,
                n_npv,
                n_test,
                ("-cliponly" if clipping_only else ""),
                ("-mcmc" if mcmc else "-fixed"),
                drugid,
            )
            filename = "drugsens_res/corr-%s.npy" % (resname)
            np.save(filename, S[:, :, mi])
            logging.info("saved %s" % filename)
            filename = "drugsens_res/wpc-%s.npy" % (resname)
            np.save(filename, R[:, :, mi])
            logging.info("saved %s" % filename)
Example #2
0
# Cross-validation: for every seed, evaluate each private-data size in
# `pv_size` with the clipping thresholds WX[i, w_ind] / WY[i, w_ind] and write
# the two score vectors to CSV files under `datapath`.
n_cv = 50
for seed in range(n_cv):

    # One row per private-data size.
    S = np.zeros((len(pv_size), 1), dtype=np.float64)
    R = np.zeros((len(pv_size), 1), dtype=np.float64)

    for i, n_pv in enumerate(pv_size):

        w_x = WX[i, w_ind]
        w_y = WY[i, w_ind]

        # Process data
        (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test, y_test,
         B_x, B_y, n_train, private) = dp.processData(
             x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid, seed)

        # Fit model
        if mcmc:
            pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                                  nyy_pv, nyy_npv, B_x, B_y, e, x_test,
                                  private)
        else:
            pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x, B_y, e,
                               x_test, private)

        # Evaluate
        S[i, 0] = dp.precision(pred, y_test)
        R[i, 0] = dp.pc(pred, y_test, sd)

    # Save results into file
    csvpath = datapath+'cliptest-drugsens-corr-'+stre+'-'+str(drugid)+'-'+str(seed)+'.csv'
    np.savetxt(csvpath, S, delimiter=',')
    csvpath = datapath+'cliptest-drugsens-wpc-'+stre+'-'+str(drugid)+'-'+str(seed)+'.csv'
    # The wpc path was built but R was never written; save it like S above.
    np.savetxt(csvpath, R, delimiter=',')
Example #3
0
    # NOTE(review): this is an excerpt; the enclosing scope and the definitions
    # of pv_size, WX, WY, w_ind, x, y, d, e, seed, etc. are not visible here.
    # R holds one row per entry of pv_size.
    R = np.zeros((len(pv_size), 1), dtype=np.float64)

    for i in range(len(pv_size)):

        # Private-data size and clipping thresholds for this configuration.
        n_pv = pv_size[i]
        w_x = WX[i, w_ind]
        w_y = WY[i, w_ind]

        # Process data
        # clipdata=False is passed explicitly; presumably this turns clipping
        # off inside dp.processData -- confirm against that function.
        nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test, y_test, B_x, B_y, n_train, private = dp.processData(
            x,
            y,
            d,
            n_test,
            n_pv,
            n_npv,
            pv_max,
            w_x,
            w_y,
            drugid,
            seed,
            clipdata=False)  # modification: lr

        # The flag returned by processData is discarded and forced to False.
        private = False  # modification: lr

        # Fit model
        if mcmc:
            pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv, nxy_npv,
                                  nyy_pv, nyy_npv, B_x, B_y, e, x_test,
                                  private)
        else:
            # NOTE(review): the body of this branch is cut off in this excerpt.
def task(args):
    """Run the drug-sensitivity experiment on a precomputed representation.

    The representation is read from ``data_repr/<dim_red>.csv`` and, when the
    module-level ``ica`` flag is set, transformed with FastICA.  For every
    drug in the module-level ``drugids``, every data seed and every privacy
    budget in ``eps``, the data is prepared with ``dp.processData``, a model
    is fitted with ``dp.predictMCMC`` or ``dp.predictL`` and the
    ``dp.precision`` / ``dp.pc`` scores are collected.  One pair of ``.npy``
    files per drug is written under ``drugsens_res/``.

    Args:
        args: One-element tuple ``(dim_red,)`` holding the base name of the
            representation file.

    The function also reads these module-level names: ``n_npv``, ``n_test``,
    ``drugids``, ``seeds``, ``eps``, ``ica``, ``clipping_only`` and ``mcmc``.
    """
    (dim_red, ) = args
    logging.info("dim_red = %s", dim_red)

    import diffpri as dp
    import csv
    import pandas

    # Import data
    logging.info("Loading representation...")
    filename = "%s.csv" % (dim_red)
    # The context manager closes the file even if parsing fails.
    with open("data_repr/" + filename, 'rt') as f:
        reader = csv.reader(f, delimiter=',')
        x = np.array(list(reader)).astype(float)
    logging.info(" * size = %s x %s" % x.shape)
    (n, d) = x.shape

    logging.info("Loading drug sensitivity data...")
    drugres = pandas.read_hdf("data/GDSC_drugres.h5", 'drug_responses')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on both
    # old and new pandas versions.
    y = drugres.values
    logging.info(" * size = %s x %s" % y.shape)

    assert x.shape[0] == y.shape[0]

    n_pv = n - n_npv - n_test
    pv_max = n_pv

    if ica:
        logging.info("Running FastICA...")
        from sklearn.decomposition import FastICA
        x = FastICA(max_iter=2000).fit_transform(x)

    logging.info("Running the tests...")

    for drugid in drugids:
        logging.info("drugid = %d" % drugid)
        S = np.zeros((len(seeds), len(eps)), dtype=np.float64)
        R = np.zeros((len(seeds), len(eps)), dtype=np.float64)

        # Depends only on the drug, so compute it once per drug.
        sd = np.nanstd(y[:, drugid], ddof=1)

        # Rows are addressed by position, not by seed value, so seed lists
        # other than 0..k-1 are stored correctly.
        for si, seed in enumerate(seeds):
            logging.info("seed = %d" % seed)

            for j, e in enumerate(eps):
                if np.isinf(e):
                    w_x = np.inf
                    w_y = np.inf
                else:
                    # np.asscalar was removed from NumPy; ndarray.item() is
                    # the documented replacement.
                    w_x = np.loadtxt(
                        "drugsens_params/clipping/wx_n%d_d%d_e%s.txt" %
                        (n_pv, d, e)).item()
                    w_y = np.loadtxt(
                        "drugsens_params/clipping/wy_n%d_d%d_e%s.txt" %
                        (n_pv, d, e)).item()

                # Process data
                (nxx_pv, nxx_npv, nxy_pv, nxy_npv, nyy_pv, nyy_npv, x_test,
                 y_test, B_x, B_y, n_train, private) = dp.processData(
                     x, y, d, n_test, n_pv, n_npv, pv_max, w_x, w_y, drugid,
                     seed)

                # An infinite budget or clipping-only mode overrides the
                # privacy flag returned by processData.
                if np.isinf(e) or clipping_only:
                    private = False

                # Fit model
                if mcmc:
                    pred = dp.predictMCMC(n_train, nxx_pv, nxx_npv, nxy_pv,
                                          nxy_npv, nyy_pv, nyy_npv, B_x, B_y,
                                          e, x_test, private)
                else:
                    pred = dp.predictL(nxx_pv, nxx_npv, nxy_pv, nxy_npv, B_x,
                                       B_y, e, x_test, private)

                # Evaluate
                S[si, j] = dp.precision(pred, y_test)
                R[si, j] = dp.pc(pred, y_test, sd)

        # Save results
        resname = "%s-pv%dnpv%dtst%d%s%s%s-%d" % (
            dim_red,
            n_pv,
            n_npv,
            n_test,
            ("-ica" if ica else ""),
            ("-cliponly" if clipping_only else ""),
            ("-mcmc" if mcmc else "-fixed"),
            drugid,
        )
        filename = "drugsens_res/corr-%s.npy" % (resname)
        np.save(filename, S)
        logging.info("saved %s" % filename)
        filename = "drugsens_res/wpc-%s.npy" % (resname)
        np.save(filename, R)
        logging.info("saved %s" % filename)
Example #5
0
# Sweep every (private-data size, epsilon) pair, score each entry of the
# sufficient statistics returned by dp.processData, then pickle the result
# tables.
for i, n_pv in enumerate(pv_size):
    d = pars['dim']

    for j, eps_j in enumerate(eps):
        pars['epsilon'] = eps_j
        pars['delta'] = delta_list[j]

        w_x = WX[i, j]
        w_y = WY[i, j]

        # Fall back to all remaining samples when the requested private set
        # plus the test set exceeds the available data.
        if n_pv + n_test > n_data:
            print('Not enough non-missing data! Continuing with maximum amount of private data: ' + str(n_data-n_test))
            n_pv = n_data - n_test

        # Process data
        (suff_stats_all, sigma_all, added_noise_dict, x_test, y_test,
         B_x, B_y, n_train) = dp.processData(
             x, y, d, n_test, n_pv, pv_max, w_x, w_y, drugid, seed, pars)

        # Predict and score once per key of suff_stats_all.
        for m in suff_stats_all:
            suff_stats_m = suff_stats_all[m]
            pred = dp.predictL(suff_stats_m[0], suff_stats_m[1], x_test)
            res_all[m][i, j] = dp.precision(pred, y_test)

res_path = ''.join(
    ['res/cliptest-drugsens-', str(drugid), '-', str(seed), '.pickle'])
with open(res_path, 'wb') as f:
    pickle.dump(res_all, f, protocol=pickle.HIGHEST_PROTOCOL)

print('Done.')