import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from flexp import flexp


def plot_learning_curve(model, x_train, y_train, x_dev, y_dev):
    # Generate a numpy array with 10 train-set fractions, equally spaced in [0.2, 1.0]
    nr_train_sizes = 10
    train_sizes = np.linspace(start=0.2, stop=1., num=nr_train_sizes)

    # sklearn's learning_curve() uses cross-validation internally, so we join the train and dev sets
    x_all = np.concatenate([x_train, x_dev], axis=0)
    y_all = np.concatenate((y_train, y_dev), axis=0)

    # For each train size, fit the model on that fraction of the training data
    # and evaluate it by cross-validation
    train_sizes_abs, train_scores, dev_scores = learning_curve(
        model,
        x_all,
        y_all,
        train_sizes=train_sizes,
        scoring=rmse_scorer,
    )

    # Scores are computed for each train size and each cross-validation fold:
    # xxx_scores.shape == (nr_train_sizes, nr_cv_folds)
    # We want the average across all folds
    train_scores = np.mean(train_scores, axis=1)
    dev_scores = np.mean(dev_scores, axis=1)

    # Plot it
    plt.plot(train_sizes_abs, np.stack([train_scores, dev_scores], axis=1))
    plt.title("Learning curve")
    plt.xlabel("Nr train examples")
    plt.ylabel("RMSE")
    plt.legend(["trainset", "devset(cv)"])
    plt.savefig(flexp.get_file_path("learning_curve.png"))
    plt.close()
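
# The rmse_scorer passed to learning_curve above is not defined in this
# snippet. A minimal sketch of how such a scorer could be built with
# sklearn's make_scorer (an assumption, not the original definition);
# greater_is_better=False makes learning_curve treat lower RMSE as better,
# which also negates the returned scores, so the original code may flip the
# sign before plotting.
from sklearn.metrics import make_scorer


def _rmse_metric(y_true, y_pred):
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))


rmse_scorer = make_scorer(_rmse_metric, greater_is_better=False)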
Example #2
def eval_feature_importance(model, feature_names):
    feature_importance = sorted(zip(feature_names, model.feature_importances_),
                                key=lambda x: x[1],
                                reverse=True)
    header = ["feature name", "feature importance"]
    file = flexp.get_file_path("feature_importance.csv")
    csv_dump([header] + feature_importance, file)
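
# csv_dump is a project-specific helper that these snippets do not define.
# A minimal stand-in with the same call shape (rows first, path second),
# assuming it simply writes a list of rows to a CSV file:
import csv


def csv_dump(rows, path):
    with open(path, "w", newline="") as fout:
        csv.writer(fout).writerows(rows)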
Example #3
    def process(self, data):
        """
        :param data: Data modified by the module
        :type data: dict|object
        """
        error = rmse(np.array(data['test'][1]), np.array(data['predictions']))

        with open(flexp.get_file_path("results.csv"), "w") as fout:
            print("RMSE: {}".format(error), file=fout)
def eval_rmse(y_train, y_train_pred, y_dev, y_dev_pred):
    rmse_train = rmse(y_train, y_train_pred)
    rmse_dev = rmse(y_dev, y_dev_pred)

    file = flexp.get_file_path("metrics.csv")
    header = ["metric", "trainset", "devset"]
    row = ["rmse", str(rmse_train), str(rmse_dev)]
    csv_dump([header, row], file)

    logging.info(", ".join(row))
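
# The rmse helper used by the snippets above is also project-specific; a
# minimal sketch, assuming plain root-mean-squared error on array-likes:
import numpy as np


def rmse(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))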
Example #5
    def process(self, data):
        """
        :param data: Data modified by the module
        :type data: dict|object
        """
        self.regressor.fit(data["train"][0], data["train"][1])
        data['predictions'] = self.regressor.predict(data['test'][0])

        # Store predictions in the experiment folder
        with open(flexp.get_file_path("predictions.csv"), "w") as fout:
            fout.write("\n".join(str(row) for row in data['predictions']))
def eval_feature_importance(model, feature_names, x_dev, y_dev_true):
    if not hasattr(model, "feature_importances_"):
        logging.warning("Model doesn't have feature_importances_")
        return

    perm_importances = score_permutation_importance(model, x_dev, y_dev_true)

    # Sort the (name, model importance, permutation importance) triples by model importance, in decreasing order
    feature_importance = sorted(zip(feature_names, model.feature_importances_,
                                    perm_importances),
                                key=lambda x: x[1],
                                reverse=True)
    header = ["feature name", "feature importance", "permutation importance"]
    file = flexp.get_file_path("feature_importance.csv")
    csv_dump([header] + feature_importance, file)
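
# score_permutation_importance is not shown in this excerpt either. A minimal
# sketch of permutation importance (the name and return shape are
# assumptions): shuffle one feature column at a time and record how much the
# dev-set RMSE degrades compared to the unshuffled baseline.
import numpy as np


def score_permutation_importance(model, x_dev, y_dev_true):
    def _rmse(y_true, y_pred):
        return np.sqrt(np.mean((y_true - y_pred) ** 2))

    baseline = _rmse(y_dev_true, model.predict(x_dev))
    importances = []
    for col in range(x_dev.shape[1]):
        x_perm = x_dev.copy()
        np.random.shuffle(x_perm[:, col])  # destroy this feature's signal
        importances.append(_rmse(y_dev_true, model.predict(x_perm)) - baseline)
    return importances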
Example #7
def main():
    flexp.setup("experiments/", "exp01", False)

    flog.setup("debug", path=flexp.get_file_path("experiment.log.txt"))  # , disable_stderr=not cfg.SHORT_RUN)
    log.debug('Starting.')

    data = FlowData()
    data.id = "a"

    # debug level 2 - all details will be printed
    data_chain = PickleCache("cached_pkls", "id", chain=[TestModule(12, 14, 18)], debug_level=2)
    data_chain.process(data)

    # The hash of this chain and the previous one are the same
    data_chain = PickleCache("cached_pkls", "id", chain=[TestModule(12, 14, 20)], debug_level=1)
    data_chain.process(data)
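
# TestModule is not included in these snippets. A hypothetical minimal module
# that follows the same process(data) protocol as the process() methods
# above, just to make the chain example self-contained (placeholder logic;
# the real class surely differs):
class TestModule:
    def __init__(self, a, b, c):
        self.a, self.b, self.c = a, b, c

    def process(self, data):
        data.result = self.a + self.b + self.c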
def eval_pdp(model, x_dev, feature_names):
    # https://www.kaggle.com/dansbecker/partial-plots

    # pdp_isolate requires the data to be a DataFrame, so wrap it
    df_x_dev = pd.DataFrame(x_dev, columns=feature_names)

    for feature in feature_names:
        # Create the data that we will plot
        pdp_values = pdp.pdp_isolate(model=model,
                                     dataset=df_x_dev,
                                     model_features=feature_names,
                                     feature=feature,
                                     num_grid_points=100)

        # plot it
        pdp.pdp_plot(pdp_values, feature)
        plt.savefig(flexp.get_file_path("pdp_{}.png".format(feature)))
        plt.clf()
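
# Imports assumed by eval_pdp above; pdp comes from the pdpbox package
# (the older 0.2.x API, which provides pdp_isolate and pdp_plot):
import pandas as pd
import matplotlib.pyplot as plt
from pdpbox import pdp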
def plot_histograms(x_train, feature_names):
    for i, feature_name in enumerate(feature_names):
        plt.hist(x_train[:, i])
        plt.title("Histogram {}".format(feature_name))
        plt.savefig(flexp.get_file_path("histogram_{:02d}.png".format(i)))
        plt.clf()
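
# A minimal usage sketch with synthetic data (illustration only; assumes
# flexp.setup() has been called so get_file_path resolves to a real folder):
import numpy as np

x_demo = np.random.randn(1000, 2)
plot_histograms(x_demo, ["feature_a", "feature_b"])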
Example #10
    chunksize=100000,
    make_vowpal_test_input=True,
    use_positions=test_positions).read()

num_actions = 21
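
# UniformPolicy and Vowpal are project wrappers that this excerpt does not
# define. A hypothetical minimal UniformPolicy, assuming a policy only needs
# to expose equal per-action probabilities (the real interface may differ):
class UniformPolicy:
    def __init__(self, num_actions):
        self.num_actions = num_actions

    def action_probabilities(self):
        return [1.0 / self.num_actions] * self.num_actions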

models = {
    "uniform": UniformPolicy(num_actions=num_actions),
}
for cb_type in args.cb_types:
    model_filename = "model_{}.vw".format(cb_type)
    models["vowpal_{}".format(cb_type)] = Vowpal(
        num_actions=num_actions,
        vowpal_binary_path=args.vowpal_path,
        cb_type=cb_type,
        model_path=flexp.get_file_path(model_filename),
        load_model_path=(os.path.join(args.load_model_dir, model_filename)
                         if args.load_model_dir else None))

if not args.skip_train:
    logging.info("Training...")
    t = time.time()
    for chunk in train:
        chunk[["vowpal_train_input"]].to_csv(
            flexp.get_file_path("vowpal_input.txt"),
            index=False,
            header=False,
            sep="\t",
            mode="a")
        logging.info("timestamp {}, chunk took {:.3f} s to load".format(
            chunk.timestamp.iloc[-1],