Example no. 1
def train_model_and_evaluate(lods, out_data, seed=10):
    """Train the KPCA + NN-regressor pipeline and evaluate it per test job.

    Pipeline: assemble a training set from ``lods`` (optionally subsampled via
    the ``N_TRAIN_PER_JOB`` / ``N_SHARED_TRAIN_PER_JOB`` module constants), fit
    a :class:`KernelPCA` on the Y block, derive per-job centroids in the
    embedded space, translate the feature matrix onto those centroids, fit an
    ``NNregressor``, then run ``evaluate`` on each observed-trace slice and
    persist the accumulated results.

    Args:
        lods: dataset container exposing ``config``, ``trainval``,
            ``shared_trainval``, ``traincomplement``, ``shared_traincomplement``,
            ``test`` and ``alias_to_id`` (project type; schema assumed from
            usage here — confirm against its definition).
        out_data: mutable dict accumulating results; this function appends to
            ``out_data['training_errs']`` and ``out_data['regressors']`` and
            persists a copy to ``DATA_FNAME``.
        seed: RNG seed applied to both NumPy and TF (v1 compat) for
            reproducibility.

    Returns:
        None. Results are reported through ``out_data`` side effects,
        ``persist_data`` and log output.
    """
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)
    n_knob_cols = len(lods.config['COLS_KNOBS'])

    nn_params = HYPER_PARAMS['nn_params']
    kpca_params = HYPER_PARAMS['kpca_params']

    tmp_trainval = lods.trainval
    tmp_shared_trainval = lods.shared_trainval

    # -1 means "use everything"; otherwise subsample per job.
    if N_TRAIN_PER_JOB != -1:
        tmp_trainval = lods.trainval.get_x(N_TRAIN_PER_JOB)
    if N_SHARED_TRAIN_PER_JOB != -1:
        tmp_shared_trainval = lods.shared_trainval.get_x(
            N_SHARED_TRAIN_PER_JOB)

    if tmp_trainval is not None:
        logging.info("shape of remaining trainval (X): {}".format(
            tmp_trainval.X.shape))
    else:
        logging.info("tmp_trainval is None (perhaps because of get_x(0))")

    if tmp_shared_trainval is not None:
        logging.info("shape of remaining shared trainval (X): {}".format(
            tmp_shared_trainval.X.shape))
    else:
        logging.info(
            "tmp_shared_trainval is None (perhaps because of get_x(0))")

    if tmp_trainval is None:
        # in case we're invoking dataset.get_x(0)
        ds_train = tmp_shared_trainval
    else:
        # NOTE(review): dataset "+" presumably concatenates the two splits;
        # if tmp_shared_trainval can also be None here, this relies on the
        # dataset __add__ handling it — confirm against the dataset class.
        ds_train = tmp_trainval + tmp_shared_trainval

    # Feature matrix: job alias column(s), knob/config features X, and the
    # raw trace block Y, stacked side by side.
    X_train = np.hstack([ds_train.a, ds_train.X, ds_train.Y])
    y_train = ds_train.targets.ravel()

    logging.info("Fitting KPCA on data of shape: {}".format(X_train.shape))

    # Make PCA and fit on loaded data
    fit_t = time.time()
    pca = KernelPCA(**kpca_params)
    # Attach a slot consumed downstream (e.g. by evaluate/translate helpers).
    pca.altered_centroids = None
    logging.info("Fitting KPCA on data of shape: {}".format(ds_train.Y.shape))

    if ENCODING_STRATEGY == 'shared':
        # Fit on the full training Y, but compute centroids only from the
        # shared observations.
        shared_train = lods.shared_trainval.get_x(N_OBS)
        pca.fit(ds_train.Y)
        encods_shared = pca.transform(shared_train.Y)
        centroids = compute_centroids(encods_shared, shared_train.a)
    else:
        encods = pca.fit_transform(ds_train.Y)
        centroids = compute_centroids(encods, ds_train.a)
    pca.centroids = centroids  # this is why I love Python! :-)
    fit_t = time.time() - fit_t
    # BUGFIX: seconds component was int(fit_t / 60) (== the minutes value);
    # the remainder modulo 60 is the correct seconds part.
    logging.info("KPCA fitting time is: {} minutes and {} seconds".format(
        int(fit_t // 60), int(fit_t % 60)))

    # Adjust the X vector by transforming Y into job's centroid
    X, y = translate_X_y(X_train, y_train, pca.centroids, n_knob_cols)

    # Make and fit a NN Regressor
    logging.info("Fitting regressor on data of shapes: {}, {}".format(
        X.shape, y.shape))
    reg = NNregressor(with_calibration=True,
                      **nn_params,
                      v1_compat_mode=True,
                      random_state=seed)
    reg.fit(X, y, log_time=True)
    training_mape = reg.MAPE(X, y)
    logging.info("Training Error: {:.2f}%".format(training_mape))
    out_data['training_errs'].append(training_mape)

    # Pick observed traces from the split matching the encoding strategy.
    if ENCODING_STRATEGY == 'shared':
        observed_traces = lods.shared_traincomplement.get_x(N_OBS)
    else:
        observed_traces = lods.traincomplement.get_x(N_OBS)

    logging.info("observed_traces description: ")
    observed_traces.describe()

    observed_traces_slices = observed_traces.slice_by_job_id(
        alias_to_id=lods.alias_to_id)

    test_aliases = sorted(list(set(lods.test.a.ravel())))

    # Evaluate the trained pipeline once per observed test job.
    for test_job in observed_traces_slices:
        evaluate(test_job, pca, reg, lods, observed_traces,
                 observed_traces_slices, out_data, test_aliases)

    out_data['regressors'].append(reg.get_persist_info())

    # Persist a deep copy so later mutations of out_data don't race the dump.
    persist_data(copyDict(out_data), DATA_FNAME)