def main(regressor, X_train_df, X_test_df, y_train_df, y_test_df, nsplit=2):
    """Train `regressor` via split training and evaluate on the test set.

    Parameters
    ----------
    regressor : callable
        Zero-argument factory producing a fresh estimator; forwarded to
        ``split_train``.
    X_train_df, X_test_df : pandas.DataFrame
        Feature frames.
    y_train_df, y_test_df : pandas.DataFrame
        Target frames; must contain a ``"revenue"`` column.
    nsplit : int, optional
        Number of splits used by ``split_train`` (default 2).
    """
    # Train one estimator per split and ensemble them by averaging their
    # test-set predictions.  (The original also unpacked X_train/y_train
    # arrays here, but never used them — removed.)
    regs = split_train(regressor, X_train_df, y_train_df["revenue"], nsplit)

    X_test = X_test_df.to_numpy()  # hoisted: convert once, not once per model
    preds = [reg.predict(X_test) for reg in regs]
    pred = np.mean(preds, axis=0)

    print("-" * 5, "revenue evaluation", "-" * 5)
    y_test = y_test_df["revenue"].to_numpy()
    report = regression_report(y_test, pred, X_test_df.shape[1])
    print(report)

    # Convert per-order predictions to per-day labels and compare against
    # the ground-truth labels stored on disk.
    pred_df = X_test_df.copy()
    pred_df["pred_revenue"] = pred
    pred_label_df = data.to_label(pred_df)
    true_label_df = pd.read_csv("data/revenue_per_day.csv",
                                index_col="arrival_date")
    print("-" * 5, "label evaluation", "-" * 5)
    evaluate_by_label2(pred_label_df, true_label_df, "label")
    print("-" * 5, "revenue_per_day evaluation", "-" * 5)
    evaluate_by_label2(pred_label_df, true_label_df, "revenue")
    def regression_report(self, test_loader):
        """Evaluate the model on `test_loader`, print and return a regression report.

        Runs one inference pass over the loader, collects flattened
        predictions and targets, and delegates metric computation to the
        module-level ``regression_report`` function (the call below resolves
        to the global, not to this method).

        Parameters
        ----------
        test_loader : iterable
            Yields ``(inputs, labels)`` batches (e.g. a torch DataLoader).

        Returns
        -------
        The report object produced by the global ``regression_report``.
        """
        import torch  # local import: only needed for the no_grad guard

        print("-" * 10, "Regression Report", "-" * 10)
        print(f"loss: {self.validation(test_loader)}")
        model = self.model
        model.eval().to(self.device)

        y_pred, y_true = [], []
        # Inference only: no_grad avoids building autograd graphs per batch.
        with torch.no_grad():
            # Loop variable renamed from `data` so it no longer shadows the
            # module-level `data` helper used elsewhere in this file.
            for batch in test_loader:
                inputs, labels = batch
                inputs = inputs.to(self.device)
                labels = labels.to(self.device).long()
                predicted = model(inputs)

                # reshape(-1) always yields a 1-D tensor, so .tolist() returns
                # a list even when the batch has a single element (squeeze()
                # would yield a 0-d tensor there, and `list += float` raises
                # TypeError).
                y_true += labels.reshape(-1).cpu().tolist()
                y_pred += predicted.reshape(-1).cpu().tolist()
        # NOTE(review): `inputs` is the last batch — raises NameError if the
        # loader is empty (same as original behavior).
        report = regression_report(y_true, y_pred, inputs.shape[1])
        print(report)
        return report
def split_train(estimator_class, X_df, y_df, nsplit=2):
    """Train `nsplit` estimators, each holding out a different contiguous slice.

    The rows are cut into `nsplit` contiguous parts; for part ``i`` a fresh
    estimator is fitted on everything outside that part and evaluated on it
    (cross-validation style).  All fitted estimators are returned.

    Parameters
    ----------
    estimator_class : callable
        Zero-argument factory returning an unfitted estimator exposing
        ``fit`` / ``predict``.
    X_df : pandas.DataFrame
        Features, indexed by an ``ID`` column (the index is reset and the
        resulting ``ID`` column dropped).
    y_df : pandas.DataFrame or pandas.Series
        Targets aligned with ``X_df`` (a Series becomes a one-column frame
        after ``reset_index``).
    nsplit : int, optional
        Number of parts (default 2).  Trailing rows beyond
        ``nsplit * (nrow // nsplit)`` are never used as a validation part.

    Returns
    -------
    list
        The ``nsplit`` fitted estimators.
    """
    nrow = X_df.shape[0]
    # Floor division replaces int(nrow * (1 / nsplit)): same result without
    # the float round-trip (which can misbehave for large nrow).
    part_nrow = nrow // nsplit
    print(part_nrow)

    regs = []
    # Re-index to positional labels 0..nrow-1 so the .loc slices below are
    # effectively positional.
    X_df = X_df.copy().reset_index().drop("ID", axis=1)
    y_df = y_df.copy().reset_index().drop("ID", axis=1)
    for i in range(nsplit):
        test_start = i * part_nrow
        test_end = (i + 1) * part_nrow

        # Validation slice for this fold (.loc is end-inclusive, hence -1).
        X_test_df = X_df.loc[test_start:test_end - 1, :].copy()
        y_test_df = y_df.loc[test_start:test_end - 1, :].copy()
        # Training data = everything outside the validation slice.  For i == 0
        # the second slice (.loc[:-1]) is empty on the 0-based index.
        X_train_df = pd.concat(
            [X_df.loc[test_end:, :], X_df.loc[:test_start - 1, :]],
            axis=0,
        )
        y_train_df = pd.concat(
            [y_df.loc[test_end:, :], y_df.loc[:test_start - 1, :]],
            axis=0,
        )
        print(
            f"X_train shape: {X_train_df.shape}, y_train shape: {y_train_df.shape}"
        )
        print(
            f"X_test shape: {X_test_df.shape}, y_test shape: {y_test_df.shape}"
        )

        X_train, X_test, y_train, y_test = (
            X_train_df.to_numpy(),
            X_test_df.to_numpy(),
            np.squeeze(y_train_df.to_numpy()),
            np.squeeze(y_test_df.to_numpy()),
        )
        reg = estimator_class()
        reg.fit(X_train, y_train)
        regs.append(reg)
        report = regression_report(y_test, reg.predict(X_test),
                                   X_test.shape[1])
        print("-" * 10, f"revenue report ({i})", "-" * 10)
        print(report)

    return regs
 def run(self):
     """Fit the model, print an evaluation report, optionally append it to a file.

     The global ``work_slot`` semaphore is released in a ``finally`` block so
     a failed run cannot permanently consume a worker slot (the original only
     released it on the success path).

     Raises
     ------
     ValueError
         If ``self.model_type`` is neither ``"classifier"`` nor
         ``"regressor"`` (the original hit a NameError on ``report`` instead).
     """
     try:
         self.model.fit(self.train_X, self.train_y)
         y_pred = self.model.predict(self.test_X)
         if self.model_type == "classifier":
             report = classification_report(self.test_y, y_pred)
         elif self.model_type == "regressor":
             report = regression_report(self.test_y, y_pred,
                                        self.test_X.shape[1])
         else:
             raise ValueError(f"unknown model_type: {self.model_type}")
         if self.name is not None:
             print(f"Method: {self.name}")
         print(report)
         if self.save:
             print(f"*Append result to SKLearn_{self.model_type}s_Report.txt")
             with open(f"SKLearn_{self.model_type}s_Report.txt", "a") as ofile:
                 if self.name is not None:
                     ofile.write(f"Method: {self.name}\n")
                 ofile.write(f"finished time: {datetime.now()}\n")
                 ofile.write(report)
                 ofile.write("-" * 20 + "\n")
         print("-" * 20)
     finally:
         # Always free the worker slot, even when training/reporting raised.
         work_slot.release()
def main(regressor, X_train_df, X_test_df, y_train_df, y_test_df, nsplit=2):
    """Train via ``split_train``, ensemble-average predictions, build a report.

    Parameters
    ----------
    regressor : callable
        Estimator factory forwarded to ``split_train``.
    X_train_df, X_test_df : pandas.DataFrame
        Feature frames.
    y_train_df, y_test_df : pandas.DataFrame
        Target frames; must contain a ``"revenue"`` column.
    nsplit : int, optional
        Number of splits (default 2).

    Returns
    -------
    str
        Multi-section text report (revenue_per_order, label and
        revenue_per_day evaluations), newline-terminated.
    """
    # training — one (reg, models) pair per split.  (The original also
    # unpacked X_train/y_train numpy arrays here but never used them.)
    regs = split_train(regressor, X_train_df, y_train_df, nsplit)

    # evaluation on validation data: average the per-split predictions
    revenue_preds = []
    for reg, models in regs:
        # append_pred augments the test features with the helper models'
        # outputs before the final regressor predicts revenue.
        X_df = append_pred(models, X_test_df.copy())
        revenue_preds.append(reg.predict(X_df))
    revenue_pred = np.mean(revenue_preds, axis=0)

    # assemble the textual report
    report = ["[ revenue_per_order evaluation ]"]
    y_test = y_test_df["revenue"].to_numpy()
    report.append(regression_report(y_test, revenue_pred, X_test_df.shape[1]))

    pred_df = X_test_df.copy()
    pred_df["pred_revenue"] = revenue_pred
    pred_label_df = data.to_label(pred_df)
    true_label_df = pd.read_csv("data/revenue_per_day.csv",
                                index_col="arrival_date")

    report.append("[ label evaluation ]")
    report.append(evaluate_by_label2(pred_label_df, true_label_df, "label"))
    report.append("[ revenue_per_day evaluation ]")
    report.append(evaluate_by_label2(pred_label_df, true_label_df, "revenue"))
    return "\n".join(report) + "\n"
Ejemplo n.º 6
0
        ["revenue"], test_ratio=0.3)
    X_train, X_test, y_train, y_test = (
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["revenue"].to_numpy(),
        y_test_df["revenue"].to_numpy(),
    )
    print(f"X_train shape {X_train.shape}, y_train shape {y_train.shape}")
    print(f"X_test shape {X_test.shape}, y_test shape {y_test.shape}")

    #%% evaluate performance with training data
    eval_reg = HistGradientBoostingRegressor(random_state=1129)
    eval_reg.fit(X_train, y_train)

    print("-" * 10, "regression report", "-" * 10)
    report = regression_report(y_test, eval_reg.predict(X_test),
                               X_test.shape[1])
    print(report)

    print("-" * 10, "evaluation of label", "-" * 10)
    label_df = data.get_true_label(
        columns=["adr", "revenue", "is_canceled", "label"])
    pred_label_df = data.predict_label(eval_reg, X_test_df)

    print("[ label evaluation ]")
    report_label = evaluate_by_label(pred_label_df, label_df, target="label")
    print(report_label)
    print("[ revenue_per_day evaluation ]")
    report_revenue = evaluate_by_label(pred_label_df,
                                       label_df,
                                       target="revenue")
    print(report_revenue)
    # training
    regs = split_train(regressor, X_train_df, y_train_df, nsplit)

    # evaluation on validation data
    revenue_preds = []
    for reg, models in regs:
        X_df = append_pred(models, X_test_df.copy())
        revenue_pred = reg.predict(X_df)
        revenue_preds.append(revenue_pred)
    revenue_pred = np.sum(revenue_preds, axis=0) / len(revenue_preds)

    # print report
    report = []
    report.append("[ revenue_per_order evaluation ]")
    y_test = y_test_df["revenue"].to_numpy()
    reg_report = regression_report(y_test, revenue_pred, X_test_df.shape[1])
    report.append(reg_report)

    pred_df = X_test_df.copy()
    pred_df["pred_revenue"] = revenue_pred
    pred_label_df = data.to_label(pred_df)
    true_label_df = data.get_true_label(
        columns=["adr", "revenue", "is_canceled", "label"])

    report.append("[ label evaluation ]")
    report.append(evaluate_by_label(pred_label_df, true_label_df, "label"))
    report.append("[ revenue_per_day evaluation ]")
    report.append(evaluate_by_label(pred_label_df, true_label_df, "revenue"))
    report = "\n".join(report) + "\n"
    print(report)
Ejemplo n.º 8
0
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["adr"].to_numpy(),
        y_test_df["adr"].to_numpy(),
        y_train_df["is_canceled"].to_numpy(),
        y_test_df["is_canceled"].to_numpy(),
    )
    print(f"X_train shape {X_train.shape}, y_train shape {y_train_adr.shape}")
    print(f"X_test shape {X_test.shape}, y_test shape {y_test_adr.shape}")

    #%% evaluate performance with training data
    eval_reg = HistGradientBoostingRegressor(random_state=1129)
    eval_reg.fit(X_train.copy(), y_train_adr.copy())
    print("-" * 10, "regression report", "-" * 10)
    report = regression_report(
        y_test_adr.copy(), eval_reg.predict(X_test.copy()), X_test.shape[1]
    )
    print(report)

    # eval_clf = RandomForestClassifier(random_state=1129)
    eval_clf = HistGradientBoostingClassifier(random_state=1129)
    eval_clf.fit(X_train.copy(), y_train_canceled.copy())
    print("-" * 10, "classification report", "-" * 10)
    report = classification_report(
        y_test_canceled.copy(), eval_clf.predict(X_test.copy())
    )
    print(report)

    #%%
    pred_df = predict(eval_clf, eval_reg, X_test_df)
    pred_label_df = data.to_label(pred_df)