def benchmark(dataset='100k', random_state=0, n_jobs=1):
    """Benchmark every estimator in ``estimator_grid`` on one dataset.

    Each estimator is configured from the stored hyper-parameter and
    cross-validation grids, fitted in parallel, and the resulting timings
    and RMSE are written to ``<trace_dir>/benches/results_<dataset>.json``.

    Parameters
    ----------
    dataset : str, default '100k'
        Name of the recommender-system dataset to load.
    random_state : int, default 0
        Seed forwarded to the train/test split and to each estimator.
    n_jobs : int, default 1
        Number of parallel workers used to run the fits.
    """
    results = copy.deepcopy(estimator_grid)
    hyperparams = _get_hyperparams()
    cvparams = _get_cvparams()
    X_tr, X_te = get_recsys_data(dataset, random_state)

    output_dir = expanduser(join(trace_dir, 'benches'))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    indices = sorted(results.keys())

    # Configure every estimator in place before dispatching the fits.
    for key in indices:
        estimator = results[key]['estimator']
        estimator.set_params(**hyperparams[key][dataset])
        estimator.set_params(**cvparams[key][dataset])
        estimator.set_params(random_state=random_state)

    res = Parallel(n_jobs=n_jobs, max_nbytes=None)(
        delayed(single_fit_bench)(results[key]['estimator'], X_tr, X_te)
        for key in indices)

    # Fold the (timing, rmse) pairs back into the results dict, dropping
    # the estimator objects so the dict becomes JSON-serializable.
    for key, (timing, rmse) in zip(indices, res):
        results[key]['timings'] = timing
        results[key]['rmse'] = rmse
        results[key].pop('estimator')

    with open(join(output_dir, 'results_%s.json' % dataset), 'w+') as f:
        json.dump(results, f)
def cross_val(dataset='100k', random_state=0, n_jobs=1):
    """Cross-validate the (alpha, beta) regularization grid for every
    estimator in ``estimator_grid`` and dump scores/best params to JSON.

    NOTE(review): an identical ``cross_val`` is defined again later in
    this file; that later definition shadows this one, so this copy is
    effectively dead code -- one of the two should be removed.
    """
    results = copy.deepcopy(estimator_grid)
    X_tr, X_te = get_recsys_data(dataset, random_state)
    subdir = 'cross_val'
    output_dir = expanduser(join(trace_dir, subdir))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    hyperparams = _get_hyperparams()
    for idx in results.keys():
        mf = results[idx]['estimator']
        # Base hyper-parameters for this estimator/dataset pair.
        mf.set_params(**hyperparams[idx][dataset])
        # Dictionary-learning variants are budgeted in epochs; the
        # others in iterations.
        if idx in ['dl', 'dl_partial']:
            mf.set_params(n_epochs=5)
        else:
            mf.set_params(max_iter=40)
        mf.set_params(random_state=random_state)
        # Cartesian product of regularization candidates.
        param_grid = [dict(alpha=alpha, beta=beta)
                      for alpha in alphas[dataset] for beta in betas]
        # Silence output and reset regularization before the grid search.
        mf.verbose = 0
        mf.alpha = 0
        mf.beta = 0
        if dataset == 'netflix':
            # We don't perform nested cross val here
            res = Parallel(n_jobs=n_jobs, verbose=10, max_nbytes=None)(
                delayed(single_fit)(mf, X_tr, X_te, params)
                for params in param_grid)
        else:
            # Nested cross-validation on the training split.
            cv = ShuffleSplit(n_iter=3, train_size=0.66, random_state=0)
            res = Parallel(n_jobs=n_jobs, verbose=10, max_nbytes=None)(
                delayed(single_fit_nested)(mf, X_tr, cv, params)
                for params in param_grid)
        scores, params = zip(*res)
        # Average over CV folds; lower score is better (argmin below).
        scores = np.array(scores).mean(axis=1)
        best_score_arg = scores.argmin()
        best_param = params[best_score_arg]
        best_score = scores[best_score_arg]
        results[idx]['params'] = params
        results[idx]['scores'] = scores.tolist()
        results[idx]['best_param'] = best_param
        results[idx]['best_score'] = best_score
        # Drop the estimator so the dict is JSON-serializable.
        results[idx].pop('estimator')
    with open(join(output_dir, 'results_%s.json' % dataset), 'w+') as f:
        json.dump(results, f)
def compare_learning_rate(version='100k', n_jobs=1, random_state=0):
    """Sweep the module-level ``learning_rates`` for the ``dl_partial``
    estimator and dump one result entry per learning rate to
    ``<trace_dir>/learning_rate/results_<version>.json``.

    Parameters
    ----------
    version : str, default '100k'
        Name of the recommender-system dataset to load.
    n_jobs : int, default 1
        Number of parallel workers used to run the fits.
    random_state : int, default 0
        Seed forwarded to the data split and to the estimator.
    """
    X_tr, X_te = get_recsys_data(version, random_state)

    # Work on a private copy so the shared grid entry is left untouched.
    estimator = copy.deepcopy(estimator_grid['dl_partial']['estimator'])
    hyperparams = _get_hyperparams()
    cvparams = _get_cvparams()
    estimator.set_params(**hyperparams['dl_partial'][version])
    estimator.set_params(**cvparams['dl_partial'][version])
    estimator.set_params(random_state=random_state)

    output_dir = join(trace_dir, 'learning_rate')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    res = Parallel(n_jobs=n_jobs, max_nbytes=None)(
        delayed(single_learning_rate)(estimator, lr, X_tr, X_te)
        for lr in learning_rates)

    # One result entry per learning rate, in sweep order.
    results = dict(zip(learning_rates, res))

    with open(join(output_dir, 'results_%s.json' % version), 'w+') as f:
        json.dump(results, f)
def cross_val(dataset='100k', random_state=0, n_jobs=1):
    """Cross-validate the (alpha, beta) regularization grid for every
    estimator in ``estimator_grid`` and dump scores/best params to JSON.

    For the 'netflix' dataset the grid is scored directly on the held-out
    split (no nested CV); otherwise a 3-iteration ShuffleSplit nested CV
    is run on the training data. Results are written to
    ``<trace_dir>/cross_val/results_<dataset>.json``.

    NOTE(review): an identical ``cross_val`` is defined earlier in this
    file; this later definition shadows it -- one copy should be removed.

    Parameters
    ----------
    dataset : str, default '100k'
        Name of the recommender-system dataset to load.
    random_state : int, default 0
        Seed forwarded to the data split and to each estimator.
    n_jobs : int, default 1
        Number of parallel workers for the grid evaluation.
    """
    results = copy.deepcopy(estimator_grid)
    X_tr, X_te = get_recsys_data(dataset, random_state)
    output_dir = expanduser(join(trace_dir, 'cross_val'))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    hyperparams = _get_hyperparams()

    # The (alpha, beta) grid depends only on the dataset: build it once
    # instead of rebuilding it for every estimator (loop-invariant hoist).
    param_grid = [dict(alpha=alpha, beta=beta)
                  for alpha in alphas[dataset]
                  for beta in betas]

    for idx in results:
        mf = results[idx]['estimator']
        mf.set_params(**hyperparams[idx][dataset])
        # Dictionary-learning variants are budgeted in epochs; the
        # others in iterations.
        if idx in ['dl', 'dl_partial']:
            mf.set_params(n_epochs=5)
        else:
            mf.set_params(max_iter=40)
        mf.set_params(random_state=random_state)
        # Silence output and reset regularization before the grid search.
        mf.verbose = 0
        mf.alpha = 0
        mf.beta = 0

        if dataset == 'netflix':
            # We don't perform nested cross val here
            res = Parallel(n_jobs=n_jobs, verbose=10, max_nbytes=None)(
                delayed(single_fit)(mf, X_tr, X_te, params)
                for params in param_grid)
        else:
            # Nested cross-validation on the training split.
            cv = ShuffleSplit(n_iter=3, train_size=0.66, random_state=0)
            res = Parallel(n_jobs=n_jobs, verbose=10, max_nbytes=None)(
                delayed(single_fit_nested)(mf, X_tr, cv, params)
                for params in param_grid)

        scores, params = zip(*res)
        # Average over CV folds; lower score is better (argmin below).
        scores = np.array(scores).mean(axis=1)
        best_score_arg = scores.argmin()
        results[idx]['params'] = params
        results[idx]['scores'] = scores.tolist()
        results[idx]['best_param'] = params[best_score_arg]
        results[idx]['best_score'] = scores[best_score_arg]
        # Drop the estimator so the dict is JSON-serializable.
        results[idx].pop('estimator')

    with open(join(output_dir, 'results_%s.json' % dataset), 'w+') as f:
        json.dump(results, f)