Example 1
if __name__ == "__main__":
    # Load training data
    training_encoded_data_path = "./Dataset/encoded_training_data_4362.json"
    X_train, y_train = FeatureTransformer.load_encoded_data(training_encoded_data_path)

    # Load test data
    test_data_path = "./Dataset/valid_data_1091.json"
    test_data = utils.load_data(test_data_path)
    df = pd.DataFrame(test_data)
    X_test = df.content.values
    y_test = df.label.values

    # Transform test data
    ft = FeatureTransformer()
    X_test = ft.fit_transform(X_test, y_train, vocab_path=VOCAB_PATH)

    # Define models
    mnb = MultinomialNB(alpha=0.004)

    rf = RandomForestClassifier(
        max_features=0.8,
        n_estimators=20,
        max_depth=80,
        class_weight="balanced",
        n_jobs=-1,
        random_state=RANDOM_STATE)

    etree = ExtraTreesClassifier(
        n_estimators=50,
        max_features=0.3,
        random_state=RANDOM_STATE)
Example 2
class EnsembleModel:
    def __init__(self, scoring, vocab_path, cv=3):
        self.cv = cv
        self.scoring = scoring
        self.models = {}
        self.vocab_path = vocab_path
        self.feature_transformer = FeatureTransformer()

    def add_model(self, name, estimator):
        self.models.update({
            name: {
                "estimator": estimator,
                "pred": [],
                "training_time": 0
            }
        })

    def remove_model(self, name):
        del self.models[name]
        print("Remove model {} done".format(name))

    def fit(self, X, y, is_encoded_data=True):
        if not is_encoded_data:
            # Transform raw documents into their feature representation
            X = self.feature_transformer.fit_transform(X, y, vocab_path=self.vocab_path)
            self.vocab = self.feature_transformer.get_tfidf_vocab()
            print("Vocabulary size : ", len(self.vocab))

        for name, model in self.models.items():
            start_time = time.time()
            model["estimator"].fit(X, y)
            finish_time = time.time()
            training_time = finish_time - start_time
            model["training_time"] = training_time
            print("Model {} fit done. Time : {:.4f} seconds".format(name, training_time))
        self.print_stat_fit()

    def predict(self, X):
        start_time = time.time()
        X = self.feature_transformer.transform(X)
        total_preds = [[] for _ in range(X.shape[0])]
        for name, model in self.models.items():
            model["pred"] = model["estimator"].predict(X)
            for i, pred in enumerate(model["pred"]):
                total_preds[i].append(pred)

        # Majority voting over the per-model predictions
        self.major_votings = []
        model_predict_rate = []
        for i, preds in enumerate(total_preds):
            major_label, num_model_predict_label = Counter(preds).most_common(1)[0]
            self.major_votings.append(major_label)
            model_predict_rate.append(num_model_predict_label / len(self.models))

        finish_time = time.time()
        print("Model predict {} docs done. Time : {:.4f} seconds".format(X.shape[0], finish_time - start_time))
        return self.major_votings, model_predict_rate

    def predict_proba(self, X):
        start_time = time.time()
        X = self.feature_transformer.transform(X)
        total_probs = []
        for name, model in self.models.items():
            classes = model["estimator"].classes_
            print("Model {} start to predict proba".format(name))
            model["prob"] = model["estimator"].predict_proba(X)
            model["pred"] = classes[np.argmax(model["prob"], axis=1)]
            total_probs.append(np.array(model["prob"]))

        # Soft voting: average the predicted probabilities across models
        total_prob_pred = total_probs[0]
        for i in range(1, len(total_probs)):
            total_prob_pred += total_probs[i]
        total_prob_pred /= len(total_probs)
        self.max_prob_pred = np.max(total_prob_pred, axis=1)
        index_pred = np.argmax(total_prob_pred, axis=1)

        # Note: this assumes every estimator exposes the same classes_ ordering
        # (`classes` here is taken from the last model in the loop above)
        self.label_pred = classes[index_pred]

        finish_time = time.time()
        print("Model predict proba {} docs done. Time : {:.4f} seconds".format(X.shape[0], finish_time - start_time))
        return self.label_pred, self.max_prob_pred

    def evaluate(self, X_test, y_test, metrics, is_predict_proba=False):
        # Predict X_test
        if is_predict_proba:
            major_pred, _ = self.predict_proba(X_test)
        else:
            major_pred, _ = self.predict(X_test)

        # Save result predict to debug
        # pred_df = {"Ensemble": major_pred}

        # Evaluate models on metrics
        result = []
        cf_mats = {}
        columns = sorted(list(metrics.keys()))
        for name, model in self.models.items():
            row = [name]
            y_pred = model["pred"]
            # pred_df.update({name: y_pred})
            for metric_name in columns:
                metric_fn = metrics.get(metric_name).get("fn")
                metric_params = metrics.get(metric_name).get("params")
                # print("Score : {}, Params : {}".format(metric_name, metric_params))
                # print(np.unique(y_test))
                # third_param = True if metric_name == "accuracy" else None
                if metric_params is None:
                    value_score = metric_fn(y_test, y_pred)
                else:
                    value_score = metric_fn(y_test, y_pred, **metric_params)
                row.append(value_score)
            result.append(row)

            # Calculate confusion matrix
            unique_label = np.unique(np.concatenate((y_test, y_pred)))
            cf_mat = confusion_matrix(y_test, y_pred, labels=unique_label)
            cf_mats.update({name: (cf_mat, unique_label)})

        # pred_df.update({"True_Label": y_test})
        # pred_df = pd.DataFrame(pred_df)
        # pred_df = pred_df[pred_df["Ensemble"] != pred_df["True_Label"]]
        # pred_df.to_csv("./Debug/pred.csv", index=False)

        # Evaluate ensemble model
        ensemble_model_name = "Ensemble"
        row = [ensemble_model_name]
        for metric_name in columns:
            metric_fn = metrics.get(metric_name).get("fn")
            metric_params = metrics.get(metric_name).get("params")
            # third_param = True if metric_name == "accuracy" else None
            if metric_params is None:
                value_score = metric_fn(y_test, major_pred)
            else:
                value_score = metric_fn(y_test, major_pred, **metric_params)
            row.append(value_score)
        result.append(row)
        unique_label = np.unique(np.concatenate((y_test, major_pred)))
        cf_mats.update({ensemble_model_name: (confusion_matrix(y_test, major_pred, labels=unique_label), unique_label)})

        # print("\nmodel::evaluate Accuracy", accuracy_score(y_test, major_pred))

        columns = ["Model"] + columns
        result = pd.DataFrame(result, columns=columns)

        return result, cf_mats

    def print_stat_fit(self):
        print("\n===============================")
        print("Statistic : ")
        for name, model in self.models.items():
            instance = model["estimator"]
            print("\nModel : ", name)
            print("Best params : ", instance.best_params_)
            print("Best valid {} score  : {}".format(self.scoring[0], instance.best_score_))
            best_index = instance.best_index_
            for score in self.scoring:
                if score != self.scoring[0]:
                    print("Mean valid {} score : {}".format(score, instance.cv_results_["mean_test_{}".format(score)][best_index]))
            print("Training time : {} seconds".format(model["training_time"]))
        print("===============================\n")

    def get_statistic_data(self):
        data_plot = []
        columns = ["Model", "Hyper_Parameter"] + self.scoring + ["Training_Time (Seconds)"]
        for name, model in self.models.items():
            row = [name]
            instance = model["estimator"]
            row.append(instance.best_params_)
            row.append(instance.best_score_)
            best_index = instance.best_index_
            for score in self.scoring:
                if score != self.scoring[0]:
                    row.append(instance.cv_results_["mean_test_{}".format(score)][best_index])
            row.append(model["training_time"])
            data_plot.append(row)

        data_plot = pd.DataFrame(data_plot, columns=columns)
        return data_plot

    def save_model(self, save_dir="./Model"):
        print("Start to save {} models to {} ...".format(len(self.models), save_dir))
        save_dir = os.path.join(save_dir, utils.get_format_time_now())
        utils.mkdirs(save_dir)
        meta_data = []

        for name, model in self.models.items():
            instance = model["estimator"]
            save_path = os.path.join(save_dir, "{}.joblib".format(name))
            joblib.dump(instance, save_path)
            meta_data.append({
                "model_name": name,
                "model_path": save_path,
                "model_params": instance.best_params_
            })
            print("Save model {} to {} done".format(name, save_path))

        # Save meta data about models
        meta_data_path = os.path.join(save_dir, "meta.txt")
        # print("\nMeta data : ", meta_data)
        with open(meta_data_path, 'w') as f:
            json.dump(meta_data, f, cls=utils.MyEncoder)

        print("Save {} models to {} done".format(len(self.models), save_dir))

        # Save figure about training result of models
        # Create data frame contains result
        statistic_data = self.get_statistic_data()

        # Save statistic data
        statistic_save_dir = os.path.join(save_dir, "Statistic")
        utils.mkdirs(statistic_save_dir)
        result_save_path = os.path.join(statistic_save_dir, "result.csv")
        statistic_data.to_csv(result_save_path, index=False)

        # Plot and save figure
        data_plot = statistic_data.drop("Hyper_Parameter", axis=1)
        self.plot_result(data_plot, statistic_save_dir, is_plot=False)

    def load_model(self, save_dir):
        print("Start to load models from ", save_dir)
        meta_data_path = os.path.join(save_dir, "meta.txt")
        # Load meta data about models
        with open(meta_data_path, 'r') as f:
            meta_data = json.load(f)
        self.models = {}
        for info_model in meta_data:
            model_name = info_model["model_name"]
            model_path = info_model["model_path"]
            estimator = joblib.load(model_path)
            self.models.update({
                model_name: {
                    "estimator": estimator,
                    "pred": []
                }
            })
        # Fit the transformer on placeholder input so it is initialized from the saved vocabulary
        self.feature_transformer.fit([""], [""], vocab_path=self.vocab_path)
        print("Load {} models from {} done".format(len(self.models), save_dir))

    def plot_result(self, data_plot, save_fig_dir, is_plot=True):
        utils.mkdirs(save_fig_dir)
        columns = list(data_plot.columns)
        print("Start to plot and save {} figures to {} ...".format(len(columns) - 1, save_fig_dir))

        print("Head of data plot")
        print(data_plot.head())
        x_offset = -0.07
        y_offset = 0.01
        mpl.style.use("seaborn")

        model_column = columns[0]
        for score_column in columns[1:]:
            # Sort by ascending score
            data_plot.sort_values(score_column, ascending=True, inplace=True)

            ax = data_plot.plot(kind="bar", x=model_column, y=score_column,
                                legend=None, color='C1', figsize=(len(self.models) + 1, 4), width=0.3)
            title = "Mean {} score - {}-fold cross-validation".format(score_column, self.cv)
            ax.set(title=title, xlabel=model_column, ylabel=score_column)
            ax.tick_params(axis='x', rotation=0)

            # Set lower and upper limit of y-axis
            min_score = data_plot.loc[:, score_column].min()
            max_score = data_plot.loc[:, score_column].max()
            y_lim_min = (min_score - 0.2) if min_score > 0.2 else 0
            y_lim_max = (max_score + 1) if max_score > 1 else 1
            ax.set_ylim([y_lim_min, y_lim_max])

            # Show the value of each bar so it can be read clearly
            for p in ax.patches:
                b = p.get_bbox()
                text_value = "{:.4f}".format(b.y1)
                ax.annotate(text_value, xy=(b.x0 + x_offset, b.y1 + y_offset))

            save_fig_path = os.path.join(save_fig_dir, "{}.png".format(score_column))
            plt.savefig(save_fig_path, dpi=800)

        print("Plot and save {} figures to {} done".format(len(columns) - 1, save_fig_dir))
        if is_plot:
            plt.show()
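
A minimal usage sketch of the EnsembleModel class above (not part of the original file): it assumes FeatureTransformer, VOCAB_PATH and the loaded train/test splits come from the surrounding project, and wraps each estimator in GridSearchCV because print_stat_fit and get_statistic_data read best_params_, best_score_ and cv_results_ from every added model.

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

# Assumed to exist in the surrounding project: EnsembleModel, VOCAB_PATH,
# X_train, y_train (raw documents + labels) and X_test, y_test.
scoring = ["accuracy", "f1_macro"]
ensemble = EnsembleModel(scoring=scoring, vocab_path=VOCAB_PATH, cv=3)

# Wrap the base estimator in a grid search so the statistics methods can
# report best_params_ / best_score_ / cv_results_.
mnb_search = GridSearchCV(
    MultinomialNB(),
    param_grid={"alpha": [0.004, 0.01, 0.1]},
    scoring=scoring,
    refit=scoring[0],
    cv=3,
    n_jobs=-1)
ensemble.add_model("MultinomialNB", mnb_search)

# Fit on raw documents so the FeatureTransformer builds its TF-IDF representation
ensemble.fit(X_train, y_train, is_encoded_data=False)

# Evaluate the individual models and the majority-vote ensemble
metrics = {
    "accuracy": {"fn": accuracy_score, "params": None},
    "f1_macro": {"fn": f1_score, "params": {"average": "macro"}},
}
result, cf_mats = ensemble.evaluate(X_test, y_test, metrics)
print(result)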