def run_alg(x_pub, y_pub, x_priv, y_priv, params, full_model_id):
    ##################################
    # representation learning
    ##################################
    x = x_pub
    y = y_pub

    # separate validation set if needed
    val_x = None
    #val_y = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=validation_split, random_state=0)
        x, y = train_x, train_y
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    repr_dim = int(round(params.repr_dim))

    logging.info("Learning the representation on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    #alg = make_alg(data_dim, repr_dim)
    from models.vae_pytorch import VAE
    alg = VAE().init(
        input_dim=data_dim,
        latent_dim=repr_dim,
        #enc_dims=[],
        enc_dims=([int(10 ** params.hidden_layer_size_mul_log10) * repr_dim]
                  * int(params.n_hidden_layers)),
        dec_dims='same',
        enc_activations='relu',
        dec_activations='relu',
        prediction_mean_activation='sigmoid',
        prediction_var='gs',
        prediction_log_var_min=math.log(0.01 ** 2),
        normalize_input_type='quantiles',
        normalize_input_quantile=0.05,
        normalize_input_axis='global',
        normalize_input_target=(0, 1),
        normalize_input_clip=True,
        optimizer='Adam',
        optimizer_params={'lr': 10.0 ** params.learning_rate_log10},
        n_epochs=n_epochs,
        early_stopping=True,
        reduce_lr_on_plateau=False,
        batch_size=batch_size)

    # create output dir if does not exist
    #ensure_dir_exists('res')

    # define the progress saving function
    ensure_dir_exists('param_opt/progress')
    progress_filename = 'param_opt/progress/encdec-mse-%s.txt' % (full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'param_opt/progress/aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    if val_x is not None:
        val_progress_filename = ('param_opt/progress/encdec-validation-mse-%s.txt'
                                 % (full_model_id))
        val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
        #aux_val_progress_filename = 'param_opt/progress/aux-validation-ce-%s.txt' % (full_model_id)
        #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')

    def save_progress():
        # track the relative reconstruction error after each epoch
        x_pred = alg.decode(alg.encode(x))
        rel_mse = relative_mean_squared_error(x, x_pred)
        progress_file.write("%g\n" % rel_mse)
        #aux_pred = alg.predict_secondary(x)
        #aux_rel_ce = relative_cross_entropy(y, aux_pred)
        #aux_progress_file.write("%g\n" % aux_rel_ce)
        if val_x is not None:
            val_x_pred = alg.decode(alg.encode(val_x))
            rel_mse = relative_mean_squared_error(val_x, val_x_pred)
            val_progress_file.write("%g\n" % rel_mse)
            #val_aux_pred = alg.predict_secondary(val_x)
            #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
            #aux_val_progress_file.write("%g\n" % aux_rel_ce)

    # fit to the training data
    ensure_dir_exists("param_opt/log/")
    alg.learn(x, validation_data=val_x,
              log_file_prefix=("param_opt/log/%s" % (full_model_id)),
              per_epoch_callback_funs=[save_progress],
              deadline=None, max_duration=repr_max_duration)

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
                 rel_mse, val_rel_mse)
    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    ##################################
    # representation mapping
    ##################################
    x = x_priv
    y = y_priv

    # get the representation
    logging.info("Making the representation of private data...")
    x_repr = alg.encode(x)

    # test to predict the data itself
    x_pred = alg.decode(x_repr)
    rel_mse = relative_mean_squared_error(x, x_pred)
    logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)

    ##################################
    # prediction
    ##################################
    x = x_repr

    # private or non-private logistic regression
    private = True

    # test prediction with cross validation
    logging.info("Prediction with %d-fold cross validation...", pred_cv_folds)
    from sklearn.model_selection import StratifiedKFold
    cv = StratifiedKFold(n_splits=pred_cv_folds, shuffle=True, random_state=0)
    avg_test_acc = 0
    for fold, (train, test) in enumerate(cv.split(x, y)):
        logging.info("Fold %d...", fold)
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]

        # init rng
        #np.random.seed(seed0)

        logging.info("Bounding the data to 1-sphere...")
        if scale_fun == "norm_max":
            logging.info(" * scale by max norm")
            scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
        elif scale_fun == "dims_max":
            logging.info(" * scale each dimension by max absolute value")
            scale_factor = np.amax(np.abs(x_train), axis=0)
        elif scale_fun == "norm_avg":
            logging.info(" * scale by average norm")
            scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
        elif scale_fun == "dims_std":
            logging.info(" * scale each dimension by standard deviation")
            scale_factor = np.std(x_train, axis=0)
        elif scale_fun == "none":
            scale_factor = 1.0
        else:
            assert False
        x_train /= scale_factor * scale_const
        x_test /= scale_factor * scale_const

        if clip == "norm":
            logging.info(" * clip norms to max 1")
            x_train /= np.maximum(
                np.linalg.norm(x_train, axis=1, keepdims=True)
                * (1 + bounding_slack), 1)
            x_test /= np.maximum(
                np.linalg.norm(x_test, axis=1, keepdims=True)
                * (1 + bounding_slack), 1)
        elif clip == "dims":
            assert False, "not implemented"
        elif clip == "none":
            logging.info(" * no clipping -> no bounding")
            assert private == False  #or np.isinf(epsilon)
        else:
            assert False

        # fit
        logging.info("Fitting a model...")
        if private:
            logging.info(" * DP logistic regression: epsilon=%g, alpha=%g",
                         epsilon, regularizer_strength)
            from models.logistic_regression import DPLogisticRegression
            model = DPLogisticRegression().init(
                repr_dim, classes=np.unique(y),
                alpha=regularizer_strength, epsilon=epsilon)
        else:
            logging.info(" * logistic regression: alpha=%g",
                         regularizer_strength)
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression(C=1 / regularizer_strength)
        model.fit(x_train, y_train)
        #print(model.predict(x_test))

        # compute mean accuracy on test set
        logging.info("Testing the model...")
        #acc = model.score(x_test, y_test)
        from sklearn.metrics import accuracy_score
        train_acc = accuracy_score(y_train, model.predict(x_train))
        test_acc = accuracy_score(y_test, model.predict(x_test))
        logging.info(" * train accuracy = %.6f", train_acc)
        logging.info(" * test accuracy = %.6f", test_acc)
        avg_test_acc += test_acc

    avg_test_acc /= pred_cv_folds
    logging.info("Average test accuracy = %.6f", avg_test_acc)
    return avg_test_acc
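
# Note: relative_mean_squared_error and pretty_duration are imported from a
# shared module that is not shown here. For reference only, a minimal sketch
# of the metric, under the assumption that it normalizes the reconstruction
# MSE by the variance of the true data (the name and semantics below are an
# assumption, not the original implementation):
def _relative_mean_squared_error_sketch(x_true, x_pred):
    """Assumed semantics: MSE(x_true, x_pred) / Var(x_true)."""
    return np.mean((x_true - x_pred) ** 2) / np.var(x_true)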
def task(args):
    import pandas
    repr_dim, (alg_id, _, make_alg), seed = args
    logging.info("dataset = %s, algorithm = %s", data_set, alg_id)

    # read the data sets
    logging.info("Reading data...")
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % data.shape)
    #aux_target = pandas.read_hdf("data/TCGA_cancertype.h5", 'cancer_types')
    #logging.info(" * auxiliary target size: %d" % aux_target.shape)
    #common_samples = data.index.intersection(aux_target.index)
    #data = data.loc[common_samples]
    #aux_target = aux_target.loc[common_samples]
    #logging.info(" * number of common samples: %d" % common_samples.size)

    from common import categorical_to_binary
    x = data.values  # .as_matrix() was removed in pandas 1.0
    #y = categorical_to_binary(aux_target.values)
    #num_classes = y.shape[1]

    #x = x[:,0:2000]

    # normalize the input to _total_ unit variance and per-feature zero mean
    if normalize_data:
        x -= np.mean(x)
        x /= np.std(x)
        x -= np.mean(x, axis=0)  # FIXME!
        #x = (x - np.amin(x,axis=0)) / (np.amax(x,axis=0) - np.amin(x,axis=0))
        #x = (x - np.amin(x)) / (np.amax(x) - np.amin(x))

    # init rng
    np.random.seed(seed)
    import torch
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # separate validation set if needed
    val_x = None
    #val_y = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        m = x.shape[0]
        perm = np.random.permutation(m)
        x = x[perm, :]
        #y = y[perm,:]
        split_point = int(validation_split * m)
        (val_x, x) = (x[:split_point, :], x[split_point:, :])
        #(val_y, y) = (y[:split_point,:], y[split_point:,:])
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    logging.info("Running the algorithm...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    alg = make_alg(data_dim, repr_dim)

    # create output dir if does not exist
    ensure_dir_exists('res')

    full_model_id = "%s-%d-%s-s%d%s" % (data_set, repr_dim, alg_id, seed,
                                        id_suffix)

    # define the progress saving function
    progress_filename = 'res/progress-encdec-mse-%s.txt' % (full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'res/progress-aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    if val_x is not None:
        val_progress_filename = ('res/progress-encdec-validation-mse-%s.txt'
                                 % (full_model_id))
        val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
        #aux_val_progress_filename = 'res/progress-aux-validation-ce-%s.txt' % (full_model_id)
        #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')

    def save_progress():
        # track the relative reconstruction error after each epoch
        x_pred = alg.decode(alg.encode(x))
        rel_mse = relative_mean_squared_error(x, x_pred)
        progress_file.write("%g\n" % rel_mse)
        #aux_pred = alg.predict_secondary(x)
        #aux_rel_ce = relative_cross_entropy(y, aux_pred)
        #aux_progress_file.write("%g\n" % aux_rel_ce)
        if val_x is not None:
            val_x_pred = alg.decode(alg.encode(val_x))
            rel_mse = relative_mean_squared_error(val_x, val_x_pred)
            val_progress_file.write("%g\n" % rel_mse)
            #val_aux_pred = alg.predict_secondary(val_x)
            #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
            #aux_val_progress_file.write("%g\n" % aux_rel_ce)

    # fit to the training data
    alg.learn(x, validation_data=val_x,
              log_file_prefix=("log/%s" % (full_model_id)),
              per_epoch_callback_funs=[save_progress],
              deadline=deadline, max_duration=max_duration)

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
                 rel_mse, val_rel_mse)
    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    # save model
    logging.info("Saving the learned model...")
    ensure_dir_exists('repr_models')
    alg.save("repr_models/%s" % (full_model_id))
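
# Note: ensure_dir_exists comes from the shared common module and is used
# throughout these scripts. A minimal sketch, assuming it simply creates the
# directory (including parents) when it does not yet exist:
def _ensure_dir_exists_sketch(path):
    import os
    os.makedirs(path, exist_ok=True)  # no-op if the directory already exists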
def learn_repr(x, y, params, full_model_id):
    # separate validation set if needed
    val_x = None
    #val_y = None
    if params.repr_learn_validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=params.repr_learn_validation_split, random_state=0)
        x, y = train_x, train_y
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    repr_dim = int(round(params.repr_dim))

    logging.info("Learning the representation on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    (_, _, _, make_alg, _) = select_repr_alg(params.repr_alg)

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    #alg = make_alg(data_dim, repr_dim)
    alg = make_alg(data_dim, repr_dim, params)

    # create output dir if does not exist
    #ensure_dir_exists('res')

    # define the progress saving function
    ensure_dir_exists('param_opt/progress')
    progress_filename = 'param_opt/progress/encdec-mse-%s.txt' % (full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'param_opt/progress/aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    if val_x is not None:
        val_progress_filename = ('param_opt/progress/encdec-validation-mse-%s.txt'
                                 % (full_model_id))
        val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
        #aux_val_progress_filename = 'param_opt/progress/aux-validation-ce-%s.txt' % (full_model_id)
        #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')

    def save_progress():
        # track the relative reconstruction error after each epoch
        x_pred = alg.decode(alg.encode(x))
        rel_mse = relative_mean_squared_error(x, x_pred)
        progress_file.write("%g\n" % rel_mse)
        #aux_pred = alg.predict_secondary(x)
        #aux_rel_ce = relative_cross_entropy(y, aux_pred)
        #aux_progress_file.write("%g\n" % aux_rel_ce)
        if val_x is not None:
            val_x_pred = alg.decode(alg.encode(val_x))
            rel_mse = relative_mean_squared_error(val_x, val_x_pred)
            val_progress_file.write("%g\n" % rel_mse)
            #val_aux_pred = alg.predict_secondary(val_x)
            #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
            #aux_val_progress_file.write("%g\n" % aux_rel_ce)

    # fit to the training data
    ensure_dir_exists("param_opt/log/")
    alg.learn(x, validation_data=val_x,
              log_file_prefix=("param_opt/log/%s" % (full_model_id)),
              per_epoch_callback_funs=[save_progress],
              deadline=None, max_duration=params.repr_learn_max_duration)

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
                 rel_mse, val_rel_mse)
    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    return alg
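
# Hypothetical usage of learn_repr (illustration only: the attribute names
# below match those accessed in the function, but the values, data arrays,
# and model id are made up for the example):
def _learn_repr_usage_example(x_pub, y_pub, x_priv):
    from types import SimpleNamespace
    params = SimpleNamespace(repr_learn_validation_split=0.2,
                             repr_dim=10,
                             repr_alg='vae',
                             repr_learn_max_duration=3600)
    alg = learn_repr(x_pub, y_pub, params, full_model_id='example-run')
    return alg.encode(x_priv)  # representation of the private data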
def task(args):
    import pandas
    param_id, priv_cancertypes, seed = args
    logging.info("priv classes = %s, params_id = %s, seed = %d",
                 priv_cancertypes, param_id, seed)
    #repr_dim, (alg_id, _, make_alg), seed = args
    #logging.info("algorithm = %s, seed = %d", alg_id, seed)

    # read the data sets
    alg_id = param_id
    logging.info("Loading parameters...")
    params = np.load("run_parameters/params.npy")
    params = params[param_id]

    logging.info("Reading data...")
    gene_expr = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % gene_expr.shape)

    logging.info("Filtering out genes with low expressions...")
    low_expr = (np.median(gene_expr, axis=0) < 0.0)
    gene_expr = gene_expr.iloc[:, ~low_expr]
    logging.info(" * %d of %d remaining (%d removed)" %
                 (sum(~low_expr), low_expr.size, sum(low_expr)))

    logging.info("Loading cancer types...")
    cancer_type = pandas.read_hdf("data/%s.h5" % (target_set), target_type)
    assert np.array_equal(gene_expr.index, cancer_type.index)

    # split
    logging.info("Splitting...")
    priv = cancer_type.isin(priv_cancertypes)
    logging.info(" * %d private samples, %d public samples (of %d total)" %
                 (sum(priv), sum(~priv), priv.size))

    from common import categorical_to_binary
    # .values replaces the .as_matrix() that was removed in pandas 1.0
    x_pub = gene_expr[~priv].values
    y_pub = cancer_type[~priv].cat.codes.values
    x_priv = gene_expr[priv].values
    y_priv = cancer_type[priv].cat.codes.values
    #y = categorical_to_binary(aux_target.values)
    #num_classes = y.shape[1]

    data_name = '-'.join(priv_cancertypes).replace(' ', '_')

    # A hack to have a different seed if the algorithm is run multiple times
    # with the same parameters. Destroys reproducibility...
    import time
    seed0 = int(time.time() * 100) % (2**32)

    # init rng
    np.random.seed(seed0)
    import torch
    torch.manual_seed(seed0)
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        torch.cuda.manual_seed(seed0)

    ##################################
    # representation learning
    ##################################
    x = x_pub
    y = y_pub

    # separate validation set if needed
    val_x = None
    #val_y = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=validation_split, random_state=0)
        x, y = train_x, train_y
        #m = x.shape[0]
        #perm = np.random.permutation(m)
        #x = x[perm,:]
        #y = y[perm,:]
        #split_point = int(validation_split * m)
        #(val_x, x) = (x[:split_point,:], x[split_point:,:])
        #(val_y, y) = (y[:split_point,:], y[split_point:,:])
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    logging.info("Learning the representation on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    #alg = make_alg(data_dim, repr_dim)
    from models.vae_pytorch import VAE
    alg = VAE().init(
        input_dim=data_dim,
        latent_dim=repr_dim,
        #enc_dims=[],
        enc_dims=([int(10**params.hidden_layer_size_mul_log10) * repr_dim]
                  * int(params.n_hidden_layers)),
        dec_dims='same',
        enc_activations='relu',
        dec_activations='relu',
        prediction_mean_activation='sigmoid',
        prediction_var='gs',
        prediction_log_var_min=math.log(0.01**2),
        normalize_input_type='quantiles',
        normalize_input_quantile=0.05,
        normalize_input_axis='global',
        normalize_input_target=(0, 1),
        normalize_input_clip=True,
        optimizer='Adam',
        optimizer_params={'lr': 10.0**params.learning_rate_log10},
        n_epochs=n_epochs,
        early_stopping=True,
        reduce_lr_on_plateau=False,
        batch_size=batch_size)

    # create output dir if does not exist
    ensure_dir_exists('res')

    full_model_id = "%s-%d-%s-s%d%s" % (data_name, repr_dim, alg_id, seed,
                                        id_suffix)

    # define the progress saving function
    progress_filename = 'res/progress-encdec-mse-%s.txt' % (full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'res/progress-aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    if val_x is not None:
        val_progress_filename = ('res/progress-encdec-validation-mse-%s.txt'
                                 % (full_model_id))
        val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
        #aux_val_progress_filename = 'res/progress-aux-validation-ce-%s.txt' % (full_model_id)
        #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')

    def save_progress():
        # track the relative reconstruction error after each epoch
        x_pred = alg.decode(alg.encode(x))
        rel_mse = relative_mean_squared_error(x, x_pred)
        progress_file.write("%g\n" % rel_mse)
        #aux_pred = alg.predict_secondary(x)
        #aux_rel_ce = relative_cross_entropy(y, aux_pred)
        #aux_progress_file.write("%g\n" % aux_rel_ce)
        if val_x is not None:
            val_x_pred = alg.decode(alg.encode(val_x))
            rel_mse = relative_mean_squared_error(val_x, val_x_pred)
            val_progress_file.write("%g\n" % rel_mse)
            #val_aux_pred = alg.predict_secondary(val_x)
            #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
            #aux_val_progress_file.write("%g\n" % aux_rel_ce)

    # fit to the training data
    alg.learn(x, validation_data=val_x,
              log_file_prefix=("log/%s" % (full_model_id)),
              per_epoch_callback_funs=[save_progress],
              deadline=deadline, max_duration=max_duration)

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
                 rel_mse, val_rel_mse)
    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    # save model
    #logging.info("Saving the learned model...")
    #ensure_dir_exists('repr_models')
    #alg.save("repr_models/%s" % (full_model_id))

    ##################################
    # representation mapping
    ##################################
    x = x_priv
    y = y_priv

    # get the representation
    logging.info("Making the representation of private data...")
    x_repr = alg.encode(x)

    # test to predict the data itself
    x_pred = alg.decode(x_repr)
    rel_mse = relative_mean_squared_error(x, x_pred)
    logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)
    ensure_dir_exists("res")
    with open("res/private-encdec-rel_mse-%d-%s-%s-s%d%s.txt" %
              (repr_dim, data_name, alg_id, seed, id_suffix),
              'w', encoding='utf-8') as f:
        f.write("%.6f\n" % rel_mse)

    # save the representation
    #logging.info("Saving the representation...")
    #ensure_dir_exists("data_repr")
    #np.savetxt("data_repr/repr-%s-%d-%s-s%d%s.csv" %
    #           (data_name, repr_dim, alg_id, seed, id_suffix),
    #           x_repr, delimiter=',')

    ##################################
    # prediction
    ##################################
    x = x_repr

    # split train and test sets
    logging.info("Splitting to train and test sets...")
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=pred_test_size, random_state=0)
    logging.info(" * train samples: %d" % x_train.shape[0])
    logging.info(" * test samples: %d" % x_test.shape[0])

    # init rng
    np.random.seed(seed0)

    #print(np.amax(np.linalg.norm(x_train, axis=1)))
    #print(np.mean(np.linalg.norm(x_train, axis=1)))

    # private and/or non-private logistic regression settings to evaluate
    #private_settings = [False, True]
    private_settings = [True]

    logging.info("Bounding the data to 1-sphere...")
    if scale_fun == "norm_max":
        logging.info(" * scale by max norm")
        scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_max":
        logging.info(" * scale each dimension by max absolute value")
        scale_factor = np.amax(np.abs(x_train), axis=0)
    elif scale_fun == "norm_avg":
        logging.info(" * scale by average norm")
        scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_std":
        logging.info(" * scale each dimension by standard deviation")
        scale_factor = np.std(x_train, axis=0)
    elif scale_fun == "none":
        scale_factor = 1.0
    else:
        assert False
    x_train /= scale_factor * scale_const
    x_test /= scale_factor * scale_const
    #print(np.amax(np.linalg.norm(x_train, axis=1, keepdims=True)))

    if clip == "norm":
        logging.info(" * clip norms to max 1")
        x_train /= np.maximum(
            np.linalg.norm(x_train, axis=1, keepdims=True)
            * (1 + bounding_slack), 1)
        x_test /= np.maximum(
            np.linalg.norm(x_test, axis=1, keepdims=True)
            * (1 + bounding_slack), 1)
    elif clip == "dims":
        assert False, "not implemented"
    elif clip == "none":
        logging.info(" * no clipping -> no bounding")
        # unbounded data is only valid in the non-private setting
        assert True not in private_settings  #or np.isinf(epsilon)
    else:
        assert False

    for private in private_settings:
        # fit
        logging.info("Fitting a model...")
        if private:
            logging.info(" * DP logistic regression: epsilon=%g, alpha=%g",
                         epsilon, regularizer_strength)
            from models.logistic_regression import DPLogisticRegression
            model = DPLogisticRegression().init(
                repr_dim, classes=np.unique(y),
                alpha=regularizer_strength, epsilon=epsilon)
        else:
            logging.info(" * logistic regression: alpha=%g",
                         regularizer_strength)
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression(C=1 / regularizer_strength)
        model.fit(x_train, y_train)
        #print(model.predict(x_test))

        # compute mean accuracy on test set
        logging.info("Testing the model...")
        #acc = model.score(x_test, y_test)
        from sklearn.metrics import accuracy_score
        train_acc = accuracy_score(y_train, model.predict(x_train))
        test_acc = accuracy_score(y_test, model.predict(x_test))
        logging.info(" * train accuracy = %.6f", train_acc)
        logging.info(" * test accuracy = %.6f", test_acc)

        logging.info("Writing results to disk...")
        ensure_dir_exists("res")
        filename = (
            "res/cancertype-pred-accuracy-%d-%s-%s-s%d-%s-%d-%s%s.txt" %
            (repr_dim, data_name, alg_id, seed, scale_fun, scale_const, clip,
             ("-e%g" % (epsilon) if private else "-nonpriv")))
        logging.info(" * filename: %s", filename)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % test_acc)

        filename = "param_opt/opt_result%s-%s.txt" % (id_suffix, full_model_id)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % test_acc)
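
# Illustration only (not part of the pipeline): the norm-based bounding above
# guarantees that every sample ends up inside the unit ball, which the DP
# logistic regression presumably requires to bound each sample's sensitivity.
# A toy demonstration of the same clipping formula on made-up data:
def _demo_norm_clipping(bounding_slack=0.01):
    rng = np.random.RandomState(0)
    x = rng.randn(5, 3)
    # rows with norm above 1/(1 + bounding_slack) are rescaled, others kept
    x_clipped = x / np.maximum(
        np.linalg.norm(x, axis=1, keepdims=True) * (1 + bounding_slack), 1)
    assert np.all(np.linalg.norm(x_clipped, axis=1) <= 1.0)
    return x_clipped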