def main(_):
  experiment_string = utils.get_experiment_string(
      FLAGS.dataset,
      vec=FLAGS.vec,
      dim_red=FLAGS.dim_red,
      features=FLAGS.features,
      problem=FLAGS.problem,
      alternate=FLAGS.alternate)

  worker_dir = '{}/params/{}/worker_out'.format(FILE_PATH, experiment_string)

  logging.info('\n' * 3)
  logging.info('Parameter search search_idx=%d for %s and method=%s',
               FLAGS.search_idx, experiment_string, FLAGS.method)
  logging.info('\n' * 3)

  if FLAGS.debug:
    logging.warn('Running in debug mode')

  # setup methods
  methods = parse_method_flag(FLAGS.method)

  # setup datasets
  assert FLAGS.problem in ['classification', 'regression']
  if (FLAGS.dataset not in ['20news', 'sentiment_sentences'] and
      FLAGS.vec != 'none'):
    raise ValueError(
        'Should not be using text vectorization with {} dataset'.format(
            FLAGS.dataset))

  datasets = []
  if 'sim' in FLAGS.dataset:
    d = utils.get_sim(
        FLAGS.dataset,
        problem=FLAGS.problem,
        features=FLAGS.features,
        alternate=FLAGS.alternate)
    for _ in range(FLAGS.num_dataset):
      d.reset()
      # must use get() over generate() for consistency in random sampling calls
      x_train, y_train, _, _ = d.get()

      if FLAGS.debug:  # use smaller subset
        x_train, y_train = shuffle_coordinately(x_train, y_train)
        x_train, y_train = x_train[:500, :250], y_train[:500]

      datasets.append((x_train, y_train))
  else:
    assert FLAGS.num_dataset == 1
    x_train, y_train, _, _ = utils.load_nonsim_data(
        FLAGS.dataset, vec=FLAGS.vec, dim_red=FLAGS.dim_red)

    if FLAGS.debug:  # use smaller subset
      x_train, y_train = shuffle_coordinately(x_train, y_train)
      x_train, y_train = x_train[:500, :250], y_train[:500]

    datasets.append((x_train, y_train))

  for method in methods:
    # define methods and parameter grids here
    if method == 'l1_linear' and FLAGS.problem == 'regression':
      submodule = linear
      param_grid = LINEAR_REGR_PARAM_GRID.copy()
      param_grid['penalty'] = ['l1']
    elif method == 'l1_linear':
      submodule = linear
      param_grid = LINEAR_CLF_PARAM_GRID.copy()
      param_grid['penalty'] = ['l1']
    elif method == 'l2_linear' and FLAGS.problem == 'regression':
      submodule = linear
      param_grid = LINEAR_REGR_PARAM_GRID.copy()
      param_grid['penalty'] = ['l2']
    elif method == 'l2_linear':
      submodule = linear
      param_grid = LINEAR_CLF_PARAM_GRID.copy()
      param_grid['penalty'] = ['l2']
    elif method == 'random_forest':
      submodule = random_forest
      param_grid = RF_PARAM_GRID.copy()
    elif method == 'l1_gbdt':
      submodule = gbdt
      param_grid = GBDT_PARAM_GRID.copy()
      param_grid['reg_alpha'] = [0.0, 0.5, 1.0, 2.0, 4.0, 10.]
    elif method == 'l2_gbdt':
      submodule = gbdt
      param_grid = GBDT_PARAM_GRID.copy()
      param_grid['reg_lambda'] = [0.0, 0.5, 1.0, 2.0, 4.0, 10.]
    elif method == 'l1_dnn':
      submodule = dnn
      param_grid = DNN_PARAM_GRID.copy()
      param_grid['l1'] = [0.0, 1e-3, 1e-2, 1e-1]
    elif method == 'l2_dnn':
      submodule = dnn
      param_grid = DNN_PARAM_GRID.copy()
      param_grid['l2'] = [0.0, 1e-3, 1e-2, 1e-1]
    else:
      raise ValueError('Unknown learning method: {}'.format(method))

    params = generate_param_configs(
        param_grid, num_iteration=FLAGS.num_search, seed=SEED)
    if FLAGS.search_idx >= len(params):  # fewer configs than number of searches
      continue
    param_dict = params[FLAGS.search_idx]

    for dataset_idx, (x_train, y_train) in enumerate(datasets):
      # recursively make parent directory
      save_dir = '{}/dataset_idx={}_{}/{}'.format(worker_dir, dataset_idx,
                                                  FLAGS.num_dataset, method)
      tf.gfile.MakeDirs(save_dir)

      # skip search if already performed
      save_path = '{}/search_idx={}_{}.out'.format(save_dir, FLAGS.search_idx,
                                                   FLAGS.num_search)
      if tf.gfile.Exists(save_path) and not FLAGS.overwrite and not FLAGS.debug:
        logging.info(
            'Parameter search already completed for %s, dataset %d/%d', method,
            dataset_idx, FLAGS.num_dataset)
        continue

      # k-fold cross-validation
      start = time.time()
      tuning_scores = []
      kf = KFold(n_splits=FLAGS.k, shuffle=True, random_state=SEED)
      for cv_train_idx, cv_test_idx in kf.split(x_train):
        x_train_cv, y_train_cv = x_train[cv_train_idx], y_train[cv_train_idx]
        x_test_cv, y_test_cv = x_train[cv_test_idx], y_train[cv_test_idx]

        _, metrics = submodule.pipeline(
            x_train_cv,
            y_train_cv,
            x_test_cv,
            y_test_cv,
            param_dict,
            problem=FLAGS.problem)

        # assume that we maximize the score
        if FLAGS.problem == 'regression':
          tuning_scores.append(-metrics['test_mse'])
        else:
          tuning_scores.append(metrics['test_acc'])

        if 'dnn' in method:  # method names are 'l1_dnn'/'l2_dnn'
          dnn.clear_keras_session()

      mean_score = np.mean(tuning_scores)

      logging.info(
          'Worker result for method=%s, search %d/%d, dataset %d/%d '
          '(%.3f s)', method, FLAGS.search_idx, FLAGS.num_search, dataset_idx,
          FLAGS.num_dataset, time.time() - start)
      logging.info(param_dict)
      logging.info('%.4f', mean_score)
      logging.info('\n' * 2)

      if not FLAGS.debug:
        # save parameters and worker results to file
        with tf.gfile.GFile(save_path, 'w') as f:
          s = ','.join(['{}={}'.format(k, v) for k, v in param_dict.items()])
          f.write(s)
          f.write('\n')
          f.write('{:.8f}'.format(mean_score))
        logging.info('Saved results to %s', save_path)
        logging.info('\n\n')
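# Hypothetical sketch of the generate_param_configs helper used by the worker
# above. The real implementation lives elsewhere in the project and may differ;
# this assumed version only illustrates the contract the worker relies on: a
# pure function of (param_grid, num_iteration, seed) that returns the same list
# of configurations on every machine, so FLAGS.search_idx can index into it
# deterministically.
import itertools
import random


def generate_param_configs(param_grid, num_iteration=10, seed=0):
  """Returns a reproducible list of at most `num_iteration` parameter dicts."""
  keys = sorted(param_grid)
  combos = [
      dict(zip(keys, values))
      for values in itertools.product(*(param_grid[k] for k in keys))
  ]
  if len(combos) <= num_iteration:
    # Fewer combinations than searches; the caller skips out-of-range
    # search_idx values.
    return combos
  rng = random.Random(seed)
  return rng.sample(combos, num_iteration)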
def main(_):
  experiment_string = utils.get_experiment_string(
      FLAGS.dataset,
      vec=FLAGS.vec,
      dim_red=FLAGS.dim_red,
      features=FLAGS.features,
      problem=FLAGS.problem,
      alternate=FLAGS.alternate)

  worker_dir = '{}/params/{}/worker_out'.format(FILE_PATH, experiment_string)
  param_dir = '{}/params/{}'.format(FILE_PATH, experiment_string)

  logging.info('\n' * 3)
  logging.info('Collation (parameter search results) for %s', experiment_string)
  logging.info('\n' * 3)

  # worker out path format is {0}/dataset_idx={1}_{2}/{3}/search_idx={4}_{5}.out
  # where 0 = worker_out_dir, 1 = dataset_idx, 2 = num_dataset, 3 = method,
  # 4 = search_idx, 5 = num_search

  # gets idx from a str with format {text}={idx}_{text}
  extract_idx_func = lambda x: int(x.split('=')[1].split('_')[0])

  dataset_ids = sorted(tf.gfile.ListDir(worker_dir), key=extract_idx_func)
  assert dataset_ids  # check non-empty
  num_dataset = int(dataset_ids[0].split('_')[-1])
  if FLAGS.strict:
    assert num_dataset == FLAGS.num_dataset

  for d_idx, dataset_id in enumerate(dataset_ids):
    assert dataset_id == 'dataset_idx={}_{}'.format(d_idx, num_dataset)
    dataset_id_path = '{}/{}'.format(worker_dir, dataset_id)
    methods = tf.gfile.ListDir(dataset_id_path)

    for method in methods:
      # setup directories
      method_path = '{}/{}'.format(dataset_id_path, method)
      save_dir = '{}/{}'.format(param_dir, dataset_id)
      tf.gfile.MakeDirs(save_dir)

      # skip collation if already performed
      save_path = '{}/{}.param'.format(save_dir, method)
      if tf.gfile.Exists(save_path) and not FLAGS.overwrite:
        logging.info('Collation already completed for %s, dataset %d/%d',
                     method, d_idx, num_dataset)
        continue

      search_ids = sorted(tf.gfile.ListDir(method_path), key=extract_idx_func)
      num_search = int(search_ids[0].split('_')[-1].rstrip('.out'))

      start = time.time()

      # look for best and worst tuning scores and save related parameters
      best_score, worst_score = float('-inf'), float('inf')
      best_param_str, worst_param_str = None, None
      best_path = None
      for s_idx, search_id in enumerate(search_ids):
        if FLAGS.strict:
          assert search_id == 'search_idx={}_{}.out'.format(s_idx, num_search)

        read_path = '{}/{}'.format(method_path, search_id)
        with tf.gfile.GFile(read_path, 'r') as f:
          lines = f.read().splitlines()
          score = float(lines[1])

        # assume that scores should be maximized (e.g., negative MSE or acc)
        if score > best_score:
          best_param_str, best_path, best_score = lines[0], read_path, score
        if score < worst_score:  # track worst config for debugging
          worst_param_str, worst_score = lines[0], score

      # note: reports the number of worker results read, which can be less than
      # num_search, the *maximum* number of worker searches
      logging.info('Collation for method=%s, dataset %d/%d (%d reads, %.3f s)',
                   method, d_idx, num_dataset, len(search_ids),
                   time.time() - start)
      logging.info('best score=%.3f, %s', best_score, best_param_str)
      logging.info('worst score=%.3f, %s', worst_score, worst_param_str)
      logging.info('\n' * 2)

      # save best parameters
      tf.gfile.Copy(best_path, save_path, overwrite=True)
      logging.info('Saved collation results to %s', save_path)
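# Illustration of the worker-output contract that the collation above relies
# on: directory and file names carry their index before the underscore, and
# each .out file holds the parameter string on its first line and the mean
# cross-validation score on its second. The helper below mirrors
# extract_idx_func, and the example values are made up.
extract_idx = lambda x: int(x.split('=')[1].split('_')[0])
assert extract_idx('dataset_idx=3_10') == 3
assert extract_idx('search_idx=7_50.out') == 7

# Example worker file contents, as written by the search worker:
#   penalty=l2,alpha=0.001
#   0.87250000
# The first line is copied verbatim into the .param file for the best score;
# the second is the score to maximize (accuracy, or negative MSE for
# regression).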
def main(_):
  experiment_string = utils.get_experiment_string(
      FLAGS.dataset,
      vec=FLAGS.vec,
      dim_red=FLAGS.dim_red,
      which_features=FLAGS.which_features,
      problem=FLAGS.problem,
      alternate=FLAGS.alternate)

  file_path = os.path.expanduser(FILE_PATH)
  logging.info('DIRECTORY USED %s', file_path)
  param_dir = '{}/params/{}'.format(file_path, experiment_string)
  save_dir = '{}/logs/{}'.format(file_path, experiment_string)
  tf.gfile.MakeDirs(save_dir)

  logging.info('\n' * 3)
  logging.info('Experiment %s and method=%s', experiment_string, FLAGS.method)
  logging.info('\n' * 3)

  if FLAGS.debug:
    logging.warn('Running in debug mode')

  # setup methods
  methods = parse_method_flag(FLAGS.method)

  # setup datasets
  assert FLAGS.problem in ['classification', 'regression']
  if (FLAGS.dataset not in ['20news', 'sentiment_sentences'] and
      FLAGS.vec != 'none'):
    raise ValueError(
        'Should not be using text vectorization with {} dataset'.format(
            FLAGS.dataset))

  datasets = []
  num_inf_feature = None  # only available for simulation datasets
  if 'sim' in FLAGS.dataset:
    oracle_preds = []
    d = utils.get_sim(
        FLAGS.dataset,
        problem=FLAGS.problem,
        which_features=FLAGS.which_features,
        alternate=FLAGS.alternate)
    try:
      num_inf_feature = d._num_inf_feature  # pylint: disable=protected-access
    except:  # pylint: disable=bare-except
      num_inf_feature = None

    for _ in range(FLAGS.num_dataset):
      d.reset()
      # must use get() over generate() for consistency in random sampling calls
      x_train, y_train, x_test, y_test = d.get()

      if FLAGS.debug:  # use smaller subset
        x_train, y_train = shuffle_coordinately(x_train, y_train)
        x_train, y_train = x_train[:500, :], y_train[:500]
        x_test, y_test = shuffle_coordinately(x_test, y_test)
        x_test, y_test = x_test[:500, :], y_test[:500]

      datasets.append((x_train, y_train, x_test, y_test))
      oracle_preds.append((d.oracle_predict(x_train), d.oracle_predict(x_test)))
  else:
    x_train, y_train, x_test, y_test = utils.load_nonsim_data(
        FLAGS.dataset, vec=FLAGS.vec, dim_red=FLAGS.dim_red)

    if FLAGS.debug:  # use smaller subset
      x_train, y_train = shuffle_coordinately(x_train, y_train)
      x_train, y_train = x_train[:500, :250], y_train[:500]
      x_test, y_test = shuffle_coordinately(x_test, y_test)
      x_test, y_test = x_test[:500, :250], y_test[:500]

    datasets.append((x_train, y_train, x_test, y_test))

  # evaluate oracle if experiment involves a simulation dataset
  if 'sim' in FLAGS.dataset:
    if FLAGS.problem == 'regression':
      oracle_metrics = {'train_mse': [], 'test_mse': []}
    else:
      oracle_metrics = {'train_acc': [], 'test_acc': []}

    for ((_, y_train, _, y_test),
         (y_train_pred, y_test_pred)) in zip(datasets, oracle_preds):
      if FLAGS.problem == 'regression':
        oracle_metrics['train_mse'].append(
            mean_squared_error(y_train, y_train_pred))
        oracle_metrics['test_mse'].append(
            mean_squared_error(y_test, y_test_pred))
      else:
        oracle_metrics['train_acc'].append(accuracy(y_train, y_train_pred))
        oracle_metrics['test_acc'].append(accuracy(y_test, y_test_pred))

    logging.info('\n' * 3)
    logging.info('oracle_results')
    logging.info('---')

    oracle_metrics = sorted(oracle_metrics.items(), key=lambda x: x[0])
    print_out = '\n'.join([
        '{}={:.6f}'.format(metric, np.mean(values))
        for metric, values in oracle_metrics
    ])
    print_out += '\n\n'
    print_out += '\n'.join([
        '{}_SE={:.6f}'.format(
            metric, np.true_divide(np.std(values), np.sqrt(FLAGS.num_dataset)))
        for metric, values in oracle_metrics
    ])
    logging.info(print_out)

    if not FLAGS.debug and FLAGS.logtofile:
      save_path = '{}/{}.log'.format(save_dir, 'oracle')
      with tf.gfile.GFile(save_path, 'w') as f:  # save logs to file
        f.write(print_out)
      logging.info('Saved oracle results to %s', save_path)
  logging.info('\n' * 2)

  # evaluate learning methods
  for method in methods:
    if method in ['l1_linear', 'l2_linear']:
      submodule = linear
    elif method == 'random_forest':
      submodule = random_forest
    elif method in ['l1_gbdt', 'l2_gbdt']:
      submodule = gbdt
    elif method in ['l1_dnn', 'l2_dnn']:
      submodule = dnn
    else:
      raise ValueError('Unknown learning method: {}'.format(method))

    start = time.time()
    all_metrics = {}
    other_info = {}
    for d_idx, (x_train, y_train, x_test, y_test) in enumerate(datasets):
      load_path = '{}/dataset_idx={}_{}/{}.param'.format(
          param_dir, d_idx, FLAGS.num_dataset, method)
      if tf.gfile.Exists(load_path):
        # load best parameters from file
        with tf.gfile.GFile(load_path, 'r') as f:
          lines = f.read().splitlines()
          param_str = lines[0]
          param_dict = {
              i.split('=')[0]: parse_value(i.split('=')[1])
              for i in param_str.split(',')
          }
      else:
        if FLAGS.fast:
          logging.warn(
              'No tuned parameters found (at %s), but using default '
              'parameters since running in FAST mode.', load_path)
          param_dict = None
        else:
          raise RuntimeError('{} does not exist on Colossus'.format(load_path))

      model, metrics = submodule.pipeline(
          x_train,
          y_train,
          x_test,
          y_test,
          param_dict=param_dict,
          problem=FLAGS.problem)

      for k, v in metrics.items():
        if k not in all_metrics:
          all_metrics[k] = []
        all_metrics[k].append(v)

      if FLAGS.problem == 'classification':
        if 'class_props' not in other_info:
          other_info['class_props'] = []
        class_prop = np.true_divide(np.sum(y_test), np.size(y_test, 0))
        other_info['class_props'].append(class_prop)

      if (num_inf_feature is not None and FLAGS.which_features == 'all' and
          'dnn' not in method):
        if 'prop_inf_feat_importance' not in other_info:
          other_info['prop_inf_feat_importance'] = []
        other_info['prop_inf_feat_importance'].append(
            get_prop_inf_feature_importance(method, model, num_inf_feature))

      if 'dnn' in method:
        dnn.clear_keras_session()

    logging.info('Experiment results for method=%s, (%d datasets, %.3f s)',
                 method, FLAGS.num_dataset, time.time() - start)
    logging.info('---')

    all_metrics = sorted(all_metrics.items(), key=lambda x: x[0])
    print_out = '\n'.join([
        '{}={:.6f}'.format(metric, np.mean(values))
        for metric, values in all_metrics
    ])
    print_out += '\n\n'
    print_out += '\n'.join([
        '{}_SE={:.6f}'.format(
            metric, np.true_divide(np.std(values), np.sqrt(FLAGS.num_dataset)))
        for metric, values in all_metrics
    ])
    if other_info:
      print_out += '\n\n'
      other_info = sorted(other_info.items(), key=lambda x: x[0])
      print_out += '\n'.join([
          '{}={:.6f}'.format(metric, np.mean(values))
          for metric, values in other_info
      ])
    logging.info(print_out)

    if not FLAGS.debug and FLAGS.logtofile:
      save_path = '{}/{}.log'.format(save_dir, method)
      with tf.gfile.GFile(save_path, 'w') as f:  # save logs to file
        f.write(print_out)
      logging.info('Saved %s results to %s', method, save_path)

    logging.info('\n' * 2)
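# Hypothetical sketch of the parse_value helper used above when rebuilding
# param_dict from the comma-separated "key=value" string written by the search
# worker. The real helper is defined elsewhere in the project; this assumed
# version simply undoes str() for the value types that appear in the parameter
# grids (ints, floats, booleans, None, and plain strings).
import ast


def parse_value(value_str):
  """Best-effort conversion of a stringified parameter value back to Python."""
  try:
    # Handles ints, floats, booleans, None, and quoted literals.
    return ast.literal_eval(value_str)
  except (ValueError, SyntaxError):
    # Fall back to the raw string, e.g. 'l2' or 'relu'.
    return value_str


# Round trip of the worker's format:
#   'penalty=l2,alpha=0.001'  ->  {'penalty': 'l2', 'alpha': 0.001}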