Example #1
import pandas as pd
from sklearn.metrics import precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB

import utils
from preprocessing import FeatureTransformer


def precision_micro_score(y_true, y_pred, labels):
    return precision_score(y_true, y_pred, labels=labels, average="micro")


def recall_macro_score(y_true, y_pred, labels):
    return recall_score(y_true, y_pred, labels=labels, average="macro")


def recall_micro_score(y_true, y_pred, labels):
    return recall_score(y_true, y_pred, labels=labels, average="micro")


if __name__ == "__main__":
    # Load training data
    training_encoded_data_path = "./Dataset/encoded_training_data_4362.json"
    X_train, y_train = FeatureTransformer.load_encoded_data(training_encoded_data_path)

    # Load test data
    test_data_path = "./Dataset/valid_data_1091.json"
    test_data = utils.load_data(test_data_path)
    df = pd.DataFrame(test_data)
    X_test = df.content.values
    y_test = df.label.values

    # Transform test data
    ft = FeatureTransformer()
    X_test = ft.fit_transform(X_test, y_train, vocab_path=VOCAB_PATH)

    # Define models
    mnb = MultinomialNB(alpha=0.004)
Example #2
import os, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils
from preprocessing import FeatureTransformer

if __name__ == "__main__":
    # Load data to explore
    training_file_path = "./Dataset/New_Data_v2/encoded_training_data_6751.json"
    # test_file_path = "./Dataset/data_sent.json"

    # training_data = utils.load_data(training_file_path)
    training_data, labels = FeatureTransformer.load_encoded_data(
        training_file_path)
    # training_size = len(training_data)
    # test_data = utils.load_data(test_file_path)
    # test_size = len(test_data)

    # print("Training data size : ", training_size)
    # print("Test data size : ", test_size)

    print("========================================")

    # training_df = utils.convert_original_data_to_df(training_data)

    # print(training_df.info())

    print("\nStatistic")
    # stats_by_label = training_df.label.value_counts().sort_index().reset_index()
    stats_by_label = pd.DataFrame(
Example #3
import numpy as np

from imblearn.over_sampling import SMOTE
from preprocessing import FeatureTransformer


def get_over_sampling_ratio(labels):
    unique_label = np.unique(labels)
    mean_sample_of_each_label = int(labels.shape[0] / len(unique_label)) + 1
    ratio = {}
    for label in unique_label:
        num_sample_of_label = np.sum(labels == label)
        if num_sample_of_label > mean_sample_of_each_label:
            desired_num_sample = mean_sample_of_each_label + \
                                 int(0.2 * (num_sample_of_label - mean_sample_of_each_label))
            ratio.update({label: desired_num_sample})

    return ratio


if __name__ == "__main__":
    # Load training data
    training_encoded_data_path = "./Dataset/encoded_training_data_4362.json"
    X_train, y_train = FeatureTransformer.load_encoded_data(
        training_encoded_data_path)

    unique_label = np.unique(y_train)
    print("Num distinct labels : ", len(unique_label))
    mean_sample_of_each_label = int(X_train.shape[0] / len(unique_label))
    print("Mean sample of each label : ", mean_sample_of_each_label)
    # ratio = {label: mean_sample_of_each_label for label in unique_label}
    # ratio = get_over_sampling_ratio(y_train)
    # print("Ratio size : ", len(ratio))
    # print(ratio)

    # Resampling
    # new_X_train, new_y_train = over_sampling(X_train, y_train)
    # Over sampling
    over_ratio = get_over_sampling_ratio(y_train)
    smt = SMOTE(random_state=RANDOM_STATE, ratio=over_ratio, k=4)
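
Note: the `ratio` and `k` keywords in the SMOTE call above belong to an older imbalanced-learn API; current releases name them `sampling_strategy` and `k_neighbors`. A minimal sketch of the equivalent call (same `RANDOM_STATE` and `over_ratio` as above):

    # Hypothetical rewrite of the call above for current imbalanced-learn releases.
    smt = SMOTE(random_state=RANDOM_STATE, sampling_strategy=over_ratio, k_neighbors=4)
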
Example #4
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from preprocessing import FeatureTransformer

if __name__ == "__main__":
    # training_data_path = "./Dataset/data_train.json"
    training_encoded_data_path = "./Dataset/encoded_smote-cc_training_data_5589.json"
    # training_data = utils.load_data(training_data_path)
    # X_train, y_train = utils.convert_orginal_data_to_list(training_data)
    X_train, y_train = FeatureTransformer.load_encoded_data(
        training_encoded_data_path)

    model = EnsembleModel(SCORING, VOCAB_PATH, CV)

    # 1. Multinomial Naive Bayes
    mnb_gs = GridSearchCV(
        MultinomialNB(),
        param_grid={
            "alpha": [0.005]
            # "alpha": np.arange(0.001, 0.02, 0.001)
        },
        scoring=SCORING,
        refit=SCORING[0],
        cv=CV,
        return_train_score=False)
    model.add_model("MultiNB", mnb_gs)
Example #5
    def __init__(self, scoring, vocab_path, cv=3):
        self.cv = cv
        self.scoring = scoring
        self.models = {}
        self.vocab_path = vocab_path
        self.feature_transformer = FeatureTransformer()
Example #6
import json
import os
import time
from collections import Counter

import joblib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

import utils
from preprocessing import FeatureTransformer


class EnsembleModel:
    def __init__(self, scoring, vocab_path, cv=3):
        self.cv = cv
        self.scoring = scoring
        self.models = {}
        self.vocab_path = vocab_path
        self.feature_transformer = FeatureTransformer()

    def add_model(self, name, estimator):
        self.models.update({
            name: {
                "estimator": estimator,
                "pred": [],
                "training_time": 0
            }
        })

    def remove_model(self, name):
        del self.models[name]
        print("Remove model {} done".format(name))

    def fit(self, X, y, is_encoded_data=True):
        if not is_encoded_data:
            # Transform raw document to document presentation
            X = self.feature_transformer.fit_transform(X, y, vocab_path=self.vocab_path)
            self.vocab = self.feature_transformer.get_tfidf_vocab()
            print("Vocabulary size : ", len(self.vocab))

        for name, model in self.models.items():
            start_time = time.time()
            model["estimator"].fit(X, y)
            finish_time = time.time()
            training_time = finish_time - start_time
            model["training_time"] = training_time
            print("Model {} fit done. Time : {:.4f} seconds".format(name, training_time))
        self.print_stat_fit()

    def predict(self, X):
        start_time = time.time()
        X = self.feature_transformer.transform(X)
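        # One slot per input document; each slot collects that document's
        # prediction from every model in the ensemble.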
        total_preds = [[] for _ in range(X.shape[0])]
        for name, model in self.models.items():
            model["pred"] = model["estimator"].predict(X)
            for i, pred in enumerate(model["pred"]):
                total_preds[i].append(pred)

        # Majority voting over the per-model predictions
        self.major_votings = []
        model_predict_rate = []
        for i, preds in enumerate(total_preds):
            major_label, num_model_predict_label = Counter(preds).most_common(1)[0]
            self.major_votings.append(major_label)
            model_predict_rate.append(num_model_predict_label / len(self.models))

        finish_time = time.time()
        print("Model predict {} docs done. Time : {:.4f} seconds".format(X.shape[0], finish_time - start_time))
        return self.major_votings, model_predict_rate

    def predict_proba(self, X):
        start_time = time.time()
        X = self.feature_transformer.transform(X)
        total_probs = []
        for name, model in self.models.items():
            classes = model["estimator"].classes_
            print("Model {} start to predict proba".format(name))
            model["prob"] = model["estimator"].predict_proba(X)
            model["pred"] = classes[np.argmax(model["prob"], axis=1)]
            total_probs.append(np.array(model["prob"]))

        # Soft voting: average the predicted class probabilities across models
        total_prob_pred = total_probs[0]
        model_predict_rate = []
        for i in range(1, len(total_probs)):
            total_prob_pred += total_probs[i]
        total_prob_pred /= len(total_probs)
        self.max_prob_pred = np.max(total_prob_pred, axis=1)
        index_pred = np.argmax(total_prob_pred, axis=1)

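        # `classes` here comes from the last model in the loop above; this assumes
        # every estimator exposes the same classes_ ordering.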
        self.label_pred = classes[index_pred]

        finish_time = time.time()
        print("Model predict proba {} docs done. Time : {:.4f} seconds".format(X.shape[0], finish_time - start_time))
        return self.label_pred, self.max_prob_pred

    def evaluate(self, X_test, y_test, metrics, is_predict_proba=False):
        # Predict X_test
        if is_predict_proba:
            major_pred, _ = self.predict_proba(X_test)
        else:
            major_pred, _ = self.predict(X_test)

        # Save result predict to debug
        # pred_df = {"Ensemble": major_pred}

        # Evaluate models on metrics
        result = []
        cf_mats = {}
        columns = sorted(list(metrics.keys()))
        for name, model in self.models.items():
            row = [name]
            y_pred = model["pred"]
            # pred_df.update({name: y_pred})
            for metric_name in columns:
                metric_fn = metrics.get(metric_name).get("fn")
                metric_params = metrics.get(metric_name).get("params")
                # print("Score : {}, Params : {}".format(metric_name, metric_params))
                # print(np.unique(y_test))
                # third_param = True if metric_name == "accuracy" else None
                if metric_params is None:
                    value_score = metric_fn(y_test, y_pred)
                else:
                    value_score = metric_fn(y_test, y_pred, **metric_params)
                row.append(value_score)
            result.append(row)

            # Calculate confusion matrix
            unique_label = np.unique(np.concatenate((y_test, y_pred)))
            cf_mat = confusion_matrix(y_test, y_pred, labels=unique_label)
            cf_mats.update({name: (cf_mat, unique_label)})

        # pred_df.update({"True_Label": y_test})
        # pred_df = pd.DataFrame(pred_df)
        # pred_df = pred_df[pred_df["Ensemble"] != pred_df["True_Label"]]
        # pred_df.to_csv("./Debug/pred.csv", index=False)

        # Evaluate ensemble model
        ensemble_model_name = "Ensemble"
        row = [ensemble_model_name]
        for metric_name in columns:
            metric_fn = metrics.get(metric_name).get("fn")
            metric_params = metrics.get(metric_name).get("params")
            # third_param = True if metric_name == "accuracy" else None
            if metric_params is None:
                value_score = metric_fn(y_test, major_pred)
            else:
                value_score = metric_fn(y_test, major_pred, **metric_params)
            row.append(value_score)
        result.append(row)
        unique_label = np.unique(np.concatenate((y_test, major_pred)))
        cf_mats.update({ensemble_model_name: (confusion_matrix(y_test, major_pred, labels=unique_label), unique_label)})

        # print("\nmodel::evaluate Accuracy", accuracy_score(y_test, major_pred))

        columns = ["Model"] + columns
        result = pd.DataFrame(result, columns=columns)

        return result, cf_mats

    def print_stat_fit(self):
        print("\n===============================")
        print("Statistic : ")
        for name, model in self.models.items():
            instance = model["estimator"]
            print("\nModel : ", name)
            print("Best params : ", instance.best_params_)
            print("Best valid {} score  : {}".format(self.scoring[0], instance.best_score_))
            best_index = instance.best_index_
            for score in self.scoring:
                if score != self.scoring[0]:
                    print("Mean valid {} score : {}".format(score, instance.cv_results_["mean_test_{}".format(score)][best_index]))
            print("Training time : {} seconds".format(model["training_time"]))
        print("===============================\n")

    def get_statistic_data(self):
        data_plot = []
        columns = ["Model", "Hyper_Parameter"] + self.scoring + ["Training_Time (Seconds)"]
        for name, model in self.models.items():
            row = [name]
            instance = model["estimator"]
            row.append(instance.best_params_)
            row.append(instance.best_score_)
            best_index = instance.best_index_
            for score in self.scoring:
                if score != self.scoring[0]:
                    row.append(instance.cv_results_["mean_test_{}".format(score)][best_index])
            row.append(model["training_time"])
            data_plot.append(row)

        data_plot = pd.DataFrame(data_plot, columns=columns)
        return data_plot

    def save_model(self, save_dir="./Model"):
        print("Start to save {} models to {} ...".format(len(self.models), save_dir))
        save_dir = os.path.join(save_dir, utils.get_format_time_now())
        utils.mkdirs(save_dir)
        meta_data = []

        for name, model in self.models.items():
            instance = model["estimator"]
            save_path = os.path.join(save_dir, "{}.joblib".format(name))
            joblib.dump(instance, save_path)
            meta_data.append({
                "model_name": name,
                "model_path": save_path,
                "model_params": instance.best_params_
            })
            print("Save model {} to {} done".format(name, save_path))

        # Save meta data about models
        meta_data_path = os.path.join(save_dir, "meta.txt")
        # print("\nMeta data : ", meta_data)
        with open(meta_data_path, 'w') as f:
            json.dump(meta_data, f, cls=utils.MyEncoder)

        print("Save {} models to {} done".format(len(self.models), save_dir))

        # Save figure about training result of models
        # Create data frame contains result
        statistic_data = self.get_statistic_data()

        # Save statistic data
        statistic_save_dir = os.path.join(save_dir, "Statistic")
        utils.mkdirs(statistic_save_dir)
        result_save_path = os.path.join(statistic_save_dir, "result.csv")
        statistic_data.to_csv(result_save_path, index=False)

        # Plot and save figure
        data_plot = statistic_data.drop("Hyper_Parameter", axis=1)
        self.plot_result(data_plot, statistic_save_dir, is_plot=False)

    def load_model(self, save_dir):
        print("Start to load models from ", save_dir)
        meta_data_path = os.path.join(save_dir, "meta.txt")
        # Load meta data about models
        with open(meta_data_path, 'r') as f:
            meta_data = json.load(f)
        self.models = {}
        for info_model in meta_data:
            model_name = info_model["model_name"]
            model_path = info_model["model_path"]
            estimator = joblib.load(model_path)
            self.models.update({
                model_name: {
                    "estimator": estimator,
                    "pred": []
                }
            })
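        # Presumably re-fits the transformer on dummy input so it loads the saved
        # vocabulary from vocab_path (depends on FeatureTransformer's behaviour).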
        self.feature_transformer.fit([""], [""], vocab_path=self.vocab_path)
        print("Load {} models from {} done".format(len(self.models), save_dir))

    def plot_result(self, data_plot, save_fig_dir, is_plot=True):
        utils.mkdirs(save_fig_dir)
        columns = list(data_plot.columns)
        print("Start to plot and save {} figures to {} ...".format(len(columns) - 1, save_fig_dir))

        print("Head of data plot")
        print(data_plot.head())
        x_offset = -0.07
        y_offset = 0.01
        mpl.style.use("seaborn")

        model_column = columns[0]
        for score_column in columns[1:]:
            # Sort by ascending score
            data_plot.sort_values(score_column, ascending=True, inplace=True)

            ax = data_plot.plot(kind="bar", x=model_column, y=score_column,
                                legend=None, color='C1', figsize=(len(self.models) + 1, 4), width=0.3)
            title = "Mean {} score - {} cross validation".format(score_solumn, self.cv)
            ax.set(title=title, xlabel=model_column, ylabel=score_solumn)
            ax.tick_params(axis='x', rotation=0)

            # Set lower and upper limit of y-axis
            min_score = data_plot.loc[:, score_column].min()
            max_score = data_plot.loc[:, score_column].max()
            y_lim_min = (min_score - 0.2) if min_score > 0.2 else 0
            y_lim_max = (max_score + 1) if max_score > 1 else 1
            ax.set_ylim([y_lim_min, y_lim_max])

            # Show value of each column to see clearly
            for p in ax.patches:
                b = p.get_bbox()
                text_value = "{:.4f}".format(b.y1)
                ax.annotate(text_value, xy=(b.x0 + x_offset, b.y1 + y_offset))

            save_fig_path = os.path.join(save_fig_dir, "{}.png".format(score_column))
            plt.savefig(save_fig_path, dpi=800)

        print("Plot and save {} figures to {} done".format(len(columns) - 1, save_fig_dir))
        if is_plot:
            plt.show()
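
A minimal end-to-end usage sketch of EnsembleModel (illustrative only: SCORING, VOCAB_PATH, CV, the raw document lists X_train_docs/X_test_docs and the label arrays y_train/y_test are placeholders following the earlier examples):

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

model = EnsembleModel(SCORING, VOCAB_PATH, CV)
model.add_model("MultiNB", GridSearchCV(
    MultinomialNB(),
    param_grid={"alpha": [0.005]},
    scoring=SCORING, refit=SCORING[0], cv=CV,
    return_train_score=False))

# Fit on raw documents so the internal FeatureTransformer is fitted as well.
model.fit(X_train_docs, y_train, is_encoded_data=False)

# Evaluate every member plus the majority-vote ensemble on a held-out set.
metrics = {
    "accuracy": {"fn": accuracy_score, "params": None},
    "f1_macro": {"fn": f1_score, "params": {"average": "macro"}},
}
result, cf_mats = model.evaluate(X_test_docs, y_test, metrics)
print(result)

# Persist the fitted members together with training statistics and figures.
model.save_model("./Model")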