def train_model_and_evaluate(lods, out_data, seed=10):
    """Fit a KernelPCA encoder + NN regressor on `lods` and evaluate per test job.

    Pipeline: seed RNGs -> select train/shared-train subsets -> fit KernelPCA
    on the Y matrix and derive per-job centroids -> translate the feature
    matrix via the centroids -> fit an NNregressor -> evaluate each test-job
    slice of the observed traces -> persist results.

    Args:
        lods: dataset container exposing .config, .trainval, .shared_trainval,
            .traincomplement, .shared_traincomplement, .test, .alias_to_id.
        out_data: dict accumulator; 'training_errs' and 'regressors' lists are
            appended to, and the whole dict is persisted to DATA_FNAME.
        seed: RNG seed for numpy, TF and the regressor (default 10).

    Relies on module-level config: HYPER_PARAMS, N_TRAIN_PER_JOB,
    N_SHARED_TRAIN_PER_JOB, N_OBS, ENCODING_STRATEGY, DATA_FNAME.
    """
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)
    n_knob_cols = len(lods.config['COLS_KNOBS'])
    nn_params = HYPER_PARAMS['nn_params']
    kpca_params = HYPER_PARAMS['kpca_params']

    # Optionally subsample per-job training data; -1 means "use everything".
    tmp_trainval = lods.trainval
    tmp_shared_trainval = lods.shared_trainval
    if N_TRAIN_PER_JOB != -1:
        tmp_trainval = lods.trainval.get_x(N_TRAIN_PER_JOB)
    if N_SHARED_TRAIN_PER_JOB != -1:
        tmp_shared_trainval = lods.shared_trainval.get_x(
            N_SHARED_TRAIN_PER_JOB)

    if tmp_trainval is not None:
        logging.info("shape of remaining trainval (X): {}".format(
            tmp_trainval.X.shape))
    else:
        logging.info("tmp_trainval is None (perhaps because of get_x(0))")
    if tmp_shared_trainval is not None:
        logging.info("shape of remaining shared trainval (X): {}".format(
            tmp_shared_trainval.X.shape))
    else:
        logging.info(
            "tmp_shared_trainval is None (perhaps because of get_x(0))")

    if tmp_trainval is None:
        # in case we're invoking dataset.get_x(0)
        ds_train = tmp_shared_trainval
    else:
        ds_train = tmp_trainval + tmp_shared_trainval

    X_train = np.hstack([ds_train.a, ds_train.X, ds_train.Y])
    y_train = ds_train.targets.ravel()
    logging.info("Fitting KPCA on data of shape: {}".format(X_train.shape))

    # Make PCA and fit on loaded data
    fit_t = time.time()
    pca = KernelPCA(**kpca_params)
    pca.altered_centroids = None
    # NOTE(review): message text kept as-is; it actually reports the Y shape
    # that KPCA is fit on (the previous log reported the full X_train shape).
    logging.info("Fitting KPCA on data of shape: {}".format(ds_train.Y.shape))
    if ENCODING_STRATEGY == 'shared':
        # Fit on the training Y, but derive centroids from the shared traces.
        shared_train = lods.shared_trainval.get_x(N_OBS)
        pca.fit(ds_train.Y)
        encods_shared = pca.transform(shared_train.Y)
        centroids = compute_centroids(encods_shared, shared_train.a)
    else:
        encods = pca.fit_transform(ds_train.Y)
        centroids = compute_centroids(encods, ds_train.a)
    pca.centroids = centroids  # attach ad-hoc attribute; this is why I love Python! :-)
    fit_t = time.time() - fit_t
    # BUGFIX: seconds were previously computed as int(fit_t / 60), i.e.
    # minutes again; use divmod to get the true minutes/seconds split.
    fit_mins, fit_secs = divmod(int(fit_t), 60)
    logging.info("KPCA fitting time is: {} minutes and {} seconds".format(
        fit_mins, fit_secs))

    # Adjust the X vector by transforming Y into job's centroid
    X, y = translate_X_y(X_train, y_train, pca.centroids, n_knob_cols)

    # Make and fit a NN Regressor
    logging.info("Fitting regressor on data of shapes: {}, {}".format(
        X.shape, y.shape))
    reg = NNregressor(with_calibration=True, **nn_params,
                      v1_compat_mode=True, random_state=seed)
    reg.fit(X, y, log_time=True)
    training_mape = reg.MAPE(X, y)
    logging.info("Training Error: {:.2f}%".format(training_mape))
    out_data['training_errs'].append(training_mape)

    # Pick the observed traces matching the encoding strategy.
    if ENCODING_STRATEGY == 'shared':
        observed_traces = lods.shared_traincomplement.get_x(N_OBS)
    else:
        observed_traces = lods.traincomplement.get_x(N_OBS)
    logging.info("observed_traces description: ")
    observed_traces.describe()
    observed_traces_slices = observed_traces.slice_by_job_id(
        alias_to_id=lods.alias_to_id)

    test_aliases = sorted(list(set(lods.test.a.ravel())))
    for test_job in observed_traces_slices:
        evaluate(test_job, pca, reg, lods, observed_traces,
                 observed_traces_slices, out_data, test_aliases)

    out_data['regressors'].append(reg.get_persist_info())
    persist_data(copyDict(out_data), DATA_FNAME)