Beispiel #1
0
def benchmark(dataset='100k',
              random_state=0,
              n_jobs=1):
    """Run ``single_fit_bench`` on every estimator of the grid and dump results.

    A deep copy of the module-level ``estimator_grid`` is configured with the
    per-dataset entries returned by ``_get_hyperparams()`` and
    ``_get_cvparams()``; each configured estimator is then handed to
    ``single_fit_bench`` together with the train/test data.  The value pair
    returned for each estimator is stored under ``'timings'`` and ``'rmse'``
    and the whole mapping is written as JSON to
    ``<trace_dir>/benches/results_<dataset>.json``.

    Parameters
    ----------
    dataset : str
        Passed to ``get_recsys_data`` and used as key into the per-estimator
        hyper-parameter / cv-parameter tables and in the output file name.
    random_state : int
        Passed to ``get_recsys_data`` and set on every estimator.
    n_jobs : int
        Forwarded to ``Parallel``.
    """
    results = copy.deepcopy(estimator_grid)

    hyperparams = _get_hyperparams()
    cvparams = _get_cvparams()

    X_tr, X_te = get_recsys_data(dataset, random_state)

    output_dir = expanduser(join(trace_dir, 'benches'))

    # Create first, inspect on failure: checking for existence beforehand
    # leaves a window in which a concurrent run can create the directory and
    # make os.makedirs raise.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    # Sorted so that the order of jobs (and of their results) is fixed.
    indices = sorted(results.keys())
    for idx in indices:
        mf = results[idx]['estimator']
        # Three separate calls: cv-parameters are applied after (and may
        # override) the stored hyper-parameters.
        mf.set_params(**hyperparams[idx][dataset])
        mf.set_params(**cvparams[idx][dataset])
        mf.set_params(random_state=random_state)

    res = Parallel(n_jobs=n_jobs,
                   max_nbytes=None)(
        delayed(single_fit_bench)(results[idx]['estimator'],
                                  X_tr, X_te)
        for idx in indices)

    # Pair each result with its index directly; this also copes with an
    # empty grid, where transposing ``res`` would fail to unpack.
    for idx, (timings, rmse) in zip(indices, res):
        entry = results[idx]
        entry['timings'] = timings
        entry['rmse'] = rmse
        # The estimator object itself is dropped before serialisation.
        entry.pop('estimator')

    with open(join(output_dir, 'results_%s.json' % dataset), 'w+') as f:
        json.dump(results, f)
Beispiel #2
0
def benchmark(dataset='100k', random_state=0, n_jobs=1):
    """Run ``single_fit_bench`` on each grid estimator and save the outcome.

    Works on a deep copy of the module-level ``estimator_grid``.  Every
    estimator receives the parameters stored for ``dataset`` in
    ``_get_hyperparams()`` and ``_get_cvparams()`` plus ``random_state``, and
    is then passed to ``single_fit_bench`` with the train/test data.  The two
    values returned per estimator are recorded as ``'timings'`` and
    ``'rmse'``; the result mapping is dumped as JSON to
    ``<trace_dir>/benches/results_<dataset>.json``.

    Parameters
    ----------
    dataset : str
        Dataset key: given to ``get_recsys_data``, used to index the
        parameter tables, and embedded in the output file name.
    random_state : int
        Given to ``get_recsys_data`` and set on each estimator.
    n_jobs : int
        Forwarded to ``Parallel``.
    """
    results = copy.deepcopy(estimator_grid)

    hyperparams = _get_hyperparams()
    cvparams = _get_cvparams()

    X_tr, X_te = get_recsys_data(dataset, random_state)

    output_dir = expanduser(join(trace_dir, 'benches'))

    # Attempt the creation and only then look at the directory: an
    # exists()-then-makedirs() sequence can fail when another run creates
    # the directory in between.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    # Fixed (sorted) order so results can be matched back to their index.
    indices = sorted(results.keys())
    for idx in indices:
        mf = results[idx]['estimator']
        # Applied in sequence: cv-parameters come after the hyper-parameters.
        mf.set_params(**hyperparams[idx][dataset])
        mf.set_params(**cvparams[idx][dataset])
        mf.set_params(random_state=random_state)

    res = Parallel(n_jobs=n_jobs, max_nbytes=None)(
        delayed(single_fit_bench)(results[idx]['estimator'], X_tr, X_te)
        for idx in indices)

    # Unpack each (timings, rmse) pair next to its index; unlike transposing
    # ``res`` first, this does not break when the grid is empty.
    for idx, (timings, rmse) in zip(indices, res):
        entry = results[idx]
        entry['timings'] = timings
        entry['rmse'] = rmse
        # Remove the estimator object so the mapping can be dumped as JSON.
        entry.pop('estimator')

    with open(join(output_dir, 'results_%s.json' % dataset), 'w+') as f:
        json.dump(results, f)
Beispiel #3
0
def cross_val(dataset='100k', random_state=0, n_jobs=1):
    """Score every ``alpha``/``beta`` combination for each grid estimator.

    For each entry of a deep copy of ``estimator_grid`` the estimator is
    configured from ``_get_hyperparams()``, then every combination of
    ``alphas[dataset]`` and ``betas`` is evaluated: with ``single_fit`` on the
    train/test data when ``dataset == 'netflix'``, otherwise with
    ``single_fit_nested`` and a ``ShuffleSplit`` object.  Scores are averaged
    over axis 1 and the combination with the lowest mean is recorded as the
    best one.  Everything is written as JSON to
    ``<trace_dir>/cross_val/results_<dataset>.json``.

    Parameters
    ----------
    dataset : str
        Dataset key: given to ``get_recsys_data``, used to index
        ``alphas`` and the hyper-parameter table, and embedded in the output
        file name.
    random_state : int
        Given to ``get_recsys_data`` and set on each estimator.
    n_jobs : int
        Forwarded to ``Parallel``.
    """
    results = copy.deepcopy(estimator_grid)

    X_tr, X_te = get_recsys_data(dataset, random_state)

    output_dir = expanduser(join(trace_dir, 'cross_val'))

    # Create first, inspect on failure: an exists()-then-makedirs() sequence
    # can fail when a concurrent run creates the directory in between.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    hyperparams = _get_hyperparams()

    for idx in results:
        entry = results[idx]
        mf = entry['estimator']
        mf.set_params(**hyperparams[idx][dataset])
        # The 'dl' variants are limited by epochs, all others by iterations.
        if idx in ['dl', 'dl_partial']:
            mf.set_params(n_epochs=5)
        else:
            mf.set_params(max_iter=40)
        mf.set_params(random_state=random_state)
        param_grid = [
            dict(alpha=alpha, beta=beta) for alpha in alphas[dataset]
            for beta in betas
        ]

        mf.verbose = 0
        mf.alpha = 0
        mf.beta = 0
        if dataset == 'netflix':
            # We don't perform nested cross val here
            res = Parallel(n_jobs=n_jobs, verbose=10, max_nbytes=None)(
                delayed(single_fit)(mf, X_tr, X_te, grid_point)
                for grid_point in param_grid)
        else:
            cv = ShuffleSplit(n_iter=3, train_size=0.66, random_state=0)
            res = Parallel(n_jobs=n_jobs, verbose=10, max_nbytes=None)(
                delayed(single_fit_nested)(mf, X_tr, cv, grid_point)
                for grid_point in param_grid)

        # Each job yields a (scores, params) pair; transpose into two tuples.
        scores, params = zip(*res)
        scores = np.array(scores).mean(axis=1)
        # Lower is better: the smallest mean score selects the best params.
        best_score_arg = scores.argmin()

        entry['params'] = params
        entry['scores'] = scores.tolist()

        entry['best_param'] = params[best_score_arg]
        # Store a plain float, like ``scores.tolist()`` does: json can only
        # encode a NumPy scalar when it happens to be a float64.
        entry['best_score'] = float(scores[best_score_arg])
        # The estimator object itself is dropped before serialisation.
        entry.pop('estimator')

    with open(join(output_dir, 'results_%s.json' % dataset), 'w+') as f:
        json.dump(results, f)
Beispiel #4
0
def compare_learning_rate(version='100k', n_jobs=1, random_state=0):
    """Run ``single_learning_rate`` for every value in ``learning_rates``.

    A deep copy of the ``'dl_partial'`` estimator from ``estimator_grid`` is
    configured with the entries stored for ``version`` in
    ``_get_hyperparams()`` and ``_get_cvparams()``.  It is then passed, once
    per learning rate, to ``single_learning_rate`` together with the
    train/test data.  The results, keyed by learning rate, are written as
    JSON to ``<trace_dir>/learning_rate/results_<version>.json``.

    Parameters
    ----------
    version : str
        Dataset key: given to ``get_recsys_data``, used to index the
        parameter tables, and embedded in the output file name.
    n_jobs : int
        Forwarded to ``Parallel``.
    random_state : int
        Given to ``get_recsys_data`` and set on the estimator.
    """
    X_tr, X_te = get_recsys_data(version, random_state)
    mf = copy.deepcopy(estimator_grid['dl_partial']['estimator'])

    hyperparams = _get_hyperparams()
    cvparams = _get_cvparams()

    # Applied in sequence: cv-parameters come after the hyper-parameters.
    mf.set_params(**hyperparams['dl_partial'][version])
    mf.set_params(**cvparams['dl_partial'][version])
    mf.set_params(random_state=random_state)

    # Expand a leading '~' so that a home-relative ``trace_dir`` does not end
    # up as a literal '~' directory; a no-op for already absolute paths.
    output_dir = os.path.expanduser(join(trace_dir, 'learning_rate'))

    # Create first, inspect on failure: an exists()-then-makedirs() sequence
    # can fail when a concurrent run creates the directory in between.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    res = Parallel(n_jobs=n_jobs, max_nbytes=None)(
        delayed(single_learning_rate)(mf, learning_rate, X_tr, X_te) for
        learning_rate in learning_rates)

    # ``Parallel`` returns results in submission order, so they line up with
    # ``learning_rates``.
    results = dict(zip(learning_rates, res))
    with open(join(output_dir, 'results_%s.json' % version), 'w+') as f:
        json.dump(results, f)
Beispiel #5
0
def compare_learning_rate(version='100k', n_jobs=1, random_state=0):
    """Evaluate the ``'dl_partial'`` estimator for each learning rate.

    The ``'dl_partial'`` estimator of ``estimator_grid`` is deep-copied and
    configured from ``_get_hyperparams()`` and ``_get_cvparams()`` for
    ``version``.  ``single_learning_rate`` is then called once for every
    element of ``learning_rates`` with that estimator and the train/test
    data.  The outcome is stored per learning rate and dumped as JSON to
    ``<trace_dir>/learning_rate/results_<version>.json``.

    Parameters
    ----------
    version : str
        Dataset key: given to ``get_recsys_data``, used to index the
        parameter tables, and embedded in the output file name.
    n_jobs : int
        Forwarded to ``Parallel``.
    random_state : int
        Given to ``get_recsys_data`` and set on the estimator.
    """
    X_tr, X_te = get_recsys_data(version, random_state)
    mf = copy.deepcopy(estimator_grid['dl_partial']['estimator'])

    hyperparams = _get_hyperparams()
    cvparams = _get_cvparams()

    # Kept as separate calls so cv-parameters are applied after (and may
    # override) the stored hyper-parameters.
    mf.set_params(**hyperparams['dl_partial'][version])
    mf.set_params(**cvparams['dl_partial'][version])
    mf.set_params(random_state=random_state)

    # A leading '~' in ``trace_dir`` must be expanded, otherwise a literal
    # '~' directory would be created; already absolute paths are unchanged.
    output_dir = os.path.expanduser(join(trace_dir, 'learning_rate'))

    # Attempt the creation and only then look at the directory: checking for
    # existence beforehand leaves a window in which another run can create
    # it and make os.makedirs raise.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    res = Parallel(n_jobs=n_jobs, max_nbytes=None)(
        delayed(single_learning_rate)(mf, learning_rate, X_tr, X_te)
        for learning_rate in learning_rates)

    # Results come back in submission order, i.e. aligned with
    # ``learning_rates``.
    results = dict(zip(learning_rates, res))
    with open(join(output_dir, 'results_%s.json' % version), 'w+') as f:
        json.dump(results, f)
Beispiel #6
0
def cross_val(dataset='100k',
              random_state=0,
              n_jobs=1):
    """Evaluate the ``alpha``/``beta`` grid for each estimator and save it.

    Operates on a deep copy of ``estimator_grid``.  Each estimator is
    configured from ``_get_hyperparams()`` and evaluated on every combination
    of ``alphas[dataset]`` and ``betas``: ``single_fit`` on the train/test
    data is used when ``dataset == 'netflix'``, ``single_fit_nested`` with a
    ``ShuffleSplit`` object otherwise.  The scores are averaged over axis 1;
    the combination with the lowest mean is stored as ``'best_param'`` with
    its ``'best_score'``.  The mapping is dumped as JSON to
    ``<trace_dir>/cross_val/results_<dataset>.json``.

    Parameters
    ----------
    dataset : str
        Dataset key: given to ``get_recsys_data``, used to index
        ``alphas`` and the hyper-parameter table, and embedded in the output
        file name.
    random_state : int
        Given to ``get_recsys_data`` and set on each estimator.
    n_jobs : int
        Forwarded to ``Parallel``.
    """
    results = copy.deepcopy(estimator_grid)

    X_tr, X_te = get_recsys_data(dataset, random_state)

    output_dir = expanduser(join(trace_dir, 'cross_val'))

    # Attempt the creation and only then look at the directory: checking for
    # existence beforehand leaves a window in which another run can create
    # it and make os.makedirs raise.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    hyperparams = _get_hyperparams()

    for idx in results:
        entry = results[idx]
        mf = entry['estimator']
        mf.set_params(**hyperparams[idx][dataset])
        # 'dl' variants get an epoch budget, the rest an iteration budget.
        if idx in ['dl', 'dl_partial']:
            mf.set_params(n_epochs=5)
        else:
            mf.set_params(max_iter=40)
        mf.set_params(random_state=random_state)
        param_grid = [dict(alpha=alpha, beta=beta)
                      for alpha in alphas[dataset]
                      for beta in betas]

        mf.verbose = 0
        mf.alpha = 0
        mf.beta = 0
        if dataset == 'netflix':
            # We don't perform nested cross val here
            res = Parallel(n_jobs=n_jobs,
                           verbose=10,
                           max_nbytes=None)(
                delayed(single_fit)(mf, X_tr, X_te, grid_point)
                for grid_point in param_grid)
        else:
            cv = ShuffleSplit(n_iter=3,
                              train_size=0.66,
                              random_state=0)
            res = Parallel(n_jobs=n_jobs,
                           verbose=10,
                           max_nbytes=None)(
                delayed(single_fit_nested)(mf, X_tr, cv, grid_point)
                for grid_point in param_grid)

        # Transpose the per-job (scores, params) pairs into two tuples.
        scores, params = zip(*res)
        scores = np.array(scores).mean(axis=1)
        # The smallest mean score designates the best parameter set.
        best_score_arg = scores.argmin()

        entry['params'] = params
        entry['scores'] = scores.tolist()

        entry['best_param'] = params[best_score_arg]
        # Convert to a built-in float (as ``tolist()`` does for the scores):
        # json cannot encode NumPy scalars other than float64.
        entry['best_score'] = float(scores[best_score_arg])
        # Remove the estimator object so the mapping can be dumped as JSON.
        entry.pop('estimator')

    with open(join(output_dir, 'results_%s.json' % dataset), 'w+') as f:
        json.dump(results, f)