def plot_learning_curve(model, x_train, y_train, x_dev, y_dev):
    # Generate a numpy array with 10 elements equally distributed in <0.2, 1.0>
    nr_train_sizes = 10
    train_sizes = np.linspace(start=0.2, stop=1., num=nr_train_sizes)

    # sklearn's learning_curve() internally uses cross-validation, so we join the train and dev sets
    x_all = np.concatenate([x_train, x_dev], axis=0)
    y_all = np.concatenate([y_train, y_dev], axis=0)

    # For each train size, train the model on that fraction of the joined set
    # and evaluate it by cross-validation
    train_sizes_abs, train_scores, dev_scores = learning_curve(
        model, x_all, y_all, train_sizes=train_sizes, scoring=rmse_scorer,
    )

    # xxx_scores is calculated for each train_size and each cross-validation fold
    # xxx_scores.shape = (nr_train_sizes, nr_cv_folds)
    # We want the average across all folds
    train_scores = np.mean(train_scores, axis=1)
    dev_scores = np.mean(dev_scores, axis=1)

    # Plot it
    plt.plot(train_sizes_abs, np.stack([train_scores, dev_scores], axis=1))
    plt.title("Learning curve")
    plt.xlabel("Nr train examples")
    plt.ylabel("RMSE")
    plt.legend(["trainset", "devset(cv)"])
    plt.savefig(flexp.get_file_path("learning_curve.png"))
    plt.close()
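# The learning-curve snippet above relies on an `rmse_scorer` that is not defined here.
# Below is only a minimal sketch of how such a scorer could be built with scikit-learn;
# the helper name `rmse` and its construction are assumptions, not part of the original code.
import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error


def rmse(y_true, y_pred):
    """Root mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False because RMSE is an error to be minimized; scikit-learn then reports
# the scores negated, so the sign may need flipping before plotting.
rmse_scorer = make_scorer(rmse, greater_is_better=False)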
def eval_feature_importance(model, feature_names):
    # Sort features by importance, highest first
    feature_importance = sorted(zip(feature_names, model.feature_importances_),
                                key=lambda x: x[1], reverse=True)

    header = ["feature name", "feature importance"]
    file = flexp.get_file_path("feature_importance.csv")
    csv_dump([header] + feature_importance, file)
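# `csv_dump` is a small helper used throughout these snippets but not shown. A minimal sketch
# of what it might look like; the signature is an assumption inferred from its usage here:
import csv


def csv_dump(rows, path):
    """Write an iterable of rows into a CSV file at the given path."""
    with open(path, "w", newline="") as fout:
        csv.writer(fout).writerows(rows)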
def process(self, data):
    """
    :param data: Data modified by the module
    :type data: dict|object
    """
    error = rmse(np.array(data['test'][1]), np.array(data['predictions']))
    with open(flexp.get_file_path("results.csv"), "w") as fout:
        print("RMSE: {}".format(error), file=fout)
def eval_rmse(y_train, y_train_pred, y_dev, y_dev_pred):
    rmse_train = rmse(y_train, y_train_pred)
    rmse_dev = rmse(y_dev, y_dev_pred)

    file = flexp.get_file_path("metrics.csv")
    header = ["metric", "trainset", "devset"]
    row = ["rmse", str(rmse_train), str(rmse_dev)]
    csv_dump([header, row], file)
    logging.info(", ".join(row))
def process(self, data):
    """
    :param data: Data modified by the module
    :type data: dict|object
    """
    self.regressor.fit(data["train"][0], data["train"][1])
    data['predictions'] = self.regressor.predict(data['test'][0])

    # Store predictions in the experiment folder
    with open(flexp.get_file_path("predictions.csv"), "w") as fout:
        fout.write("\n".join(str(row) for row in data['predictions']))
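# For context: process() above refers to self.regressor, so it lives on a flow module that was
# constructed with an unfitted regressor. A minimal sketch of such a wrapper, assuming any
# scikit-learn style estimator; the class name `TrainPredictModule` is illustrative only:
from sklearn.ensemble import RandomForestRegressor


class TrainPredictModule:

    def __init__(self, regressor):
        # Any estimator exposing fit()/predict() will do
        self.regressor = regressor

    # process(self, data) as defined above


# Usage: chain the module with the regressor it should train, e.g.
# TrainPredictModule(RandomForestRegressor(n_estimators=100))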
def eval_feature_importance(model, feature_names, x_dev, y_dev_true):
    if not hasattr(model, "feature_importances_"):
        logging.warning("Model doesn't have feature_importances_")
        return

    perm_importances = score_permutation_importance(model, x_dev, y_dev_true)

    # Sort feature names and importances by feature importance, in decreasing order
    feature_importance = sorted(
        zip(feature_names, model.feature_importances_, perm_importances),
        key=lambda x: x[1], reverse=True)

    header = ["feature name", "feature importance", "permutation importance"]
    file = flexp.get_file_path("feature_importance.csv")
    csv_dump([header] + feature_importance, file)
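# `score_permutation_importance` is not shown above. One way to obtain per-feature permutation
# importances is scikit-learn's inspection module; the sketch below is an assumption about what
# the helper might do, not the original implementation:
from sklearn.inspection import permutation_importance


def score_permutation_importance(model, x_dev, y_dev_true):
    """Return the mean permutation importance of each feature on the dev set."""
    result = permutation_importance(model, x_dev, y_dev_true, n_repeats=10, random_state=0)
    return result.importances_mean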
def main():
    flexp.setup("experiments/", "exp01", False)
    flog.setup("debug", path=flexp.get_file_path("experiment.log.txt"))  # , disable_stderr=not cfg.SHORT_RUN)
    log.debug('Starting.')

    data = FlowData()
    data.id = "a"

    # debug level 2 - all details will be printed
    data_chain = PickleCache("cached_pkls", "id",
                             chain=[TestModule(12, 14, 18)],
                             debug_level=2)
    data_chain.process(data)

    # hash of this chain and the previous one are the same
    data_chain = PickleCache("cached_pkls", "id",
                             chain=[TestModule(12, 14, 20)],
                             debug_level=1)
    data_chain.process(data)
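# main() above assumes the flexp machinery is already imported. The import paths below are an
# assumption based on the flexp package layout and may need adjusting for your installed version;
# FlowData and TestModule are example classes assumed to be defined elsewhere in this project.
import logging

from flexp import flexp
from flexp import flog
from flexp.flow import PickleCache

log = logging.getLogger(__name__)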
def eval_pdp(model, x_dev, feature_names):
    # https://www.kaggle.com/dansbecker/partial-plots
    # pdp_isolate requires the data to be a DataFrame, so wrap it
    df_x_dev = pd.DataFrame(x_dev, columns=feature_names)

    for feature in feature_names:
        # Create the data that we will plot
        pdp_values = pdp.pdp_isolate(model=model, dataset=df_x_dev,
                                     model_features=feature_names,
                                     feature=feature, num_grid_points=100)
        # Plot it
        pdp.pdp_plot(pdp_values, feature)
        plt.savefig(flexp.get_file_path("pdp_{}.png".format(feature)))
        plt.clf()
def plot_histograms(x_train, feature_names):
    for i, feature_name in enumerate(feature_names):
        plt.hist(x_train[:, i])
        plt.title("Histogram {}".format(feature_name))
        plt.savefig(flexp.get_file_path("histogram_{:02d}".format(i)))
        plt.clf()
    chunksize=100000, make_vowpal_test_input=True, use_positions=test_positions).read()

num_actions = 21
models = {
    "uniform": UniformPolicy(num_actions=num_actions),
}
for cb_type in args.cb_types:
    model_filename = "model_{}.vw".format(cb_type)
    models["vowpal_{}".format(cb_type)] = Vowpal(
        num_actions=num_actions,
        vowpal_binary_path=args.vowpal_path,
        cb_type=cb_type,
        model_path=flexp.get_file_path(model_filename),
        load_model_path=(os.path.join(args.load_model_dir, model_filename)
                         if args.load_model_dir else None))

if not args.skip_train:
    logging.info("Training...")
    t = time.time()
    for chunk in train:
        chunk[["vowpal_train_input"]].to_csv(
            flexp.get_file_path("vowpal_input.txt"),
            index=False, header=None, sep="\t", mode="a")
        logging.info("timestamp {}, chunk took {:.3f} s to load".format(
            chunk.timestamp.iloc[-1],