if __name__ == "__main__": # Load trining data training_encoded_data_path = "./Dataset/encoded_training_data_4362.json" X_train, y_train = FeatureTransformer.load_encoded_data(training_encoded_data_path) # Load test data test_data_path = "./Dataset/valid_data_1091.json" test_data = utils.load_data(test_data_path) df = pd.DataFrame(test_data) X_test = df.content.values y_test = df.label.values # Transform test data ft = FeatureTransformer() X_test = ft.fit_transform(X_test, y_train, vocab_path=VOCAB_PATH) # Define models mnb = MultinomialNB(alpha=0.004) rf = RandomForestClassifier( max_features=0.8, n_estimators=20, max_depth=80, class_weight="balanced", n_jobs=-1, random_state=RANDOM_STATE) etree = ExtraTreesClassifier( n_estimators=50, max_features=0.3,
class EnsembleModel:
    def __init__(self, scoring, vocab_path, cv=3):
        self.cv = cv
        self.scoring = scoring
        self.models = {}
        self.vocab_path = vocab_path
        self.feature_transformer = FeatureTransformer()

    def add_model(self, name, estimator):
        self.models.update({
            name: {
                "estimator": estimator,
                "pred": [],
                "training_time": 0
            }
        })

    def remove_model(self, name):
        del self.models[name]
        print("Remove model {} done".format(name))

    def fit(self, X, y, is_encoded_data=True):
        if not is_encoded_data:
            # Transform raw documents into their feature representation
            X = self.feature_transformer.fit_transform(X, y, vocab_path=self.vocab_path)
            self.vocab = self.feature_transformer.get_tfidf_vocab()
            print("Vocabulary size : ", len(self.vocab))

        for name, model in self.models.items():
            start_time = time.time()
            model["estimator"].fit(X, y)
            finish_time = time.time()
            training_time = finish_time - start_time
            model["training_time"] = training_time
            print("Model {} fit done. Time : {:.4f} seconds".format(name, training_time))
        self.print_stat_fit()

    def predict(self, X):
        start_time = time.time()
        X = self.feature_transformer.transform(X)
        total_preds = [[] for _ in range(X.shape[0])]
        for name, model in self.models.items():
            model["pred"] = model["estimator"].predict(X)
            for i, pred in enumerate(model["pred"]):
                total_preds[i].append(pred)

        # Majority voting: each document gets the label predicted by most models
        self.major_votings = []
        model_predict_rate = []
        for i, preds in enumerate(total_preds):
            major_label, num_model_predict_label = Counter(preds).most_common(1)[0]
            self.major_votings.append(major_label)
            model_predict_rate.append(num_model_predict_label / len(self.models))

        finish_time = time.time()
        print("Model predict {} docs done. Time : {:.4f} seconds".format(X.shape[0], finish_time - start_time))
        return self.major_votings, model_predict_rate

    def predict_proba(self, X):
        start_time = time.time()
        X = self.feature_transformer.transform(X)
        total_probs = []
        for name, model in self.models.items():
            # NOTE: assumes every estimator exposes the same classes_ ordering;
            # the classes_ of the last model in the loop is reused below.
            classes = model["estimator"].classes_
            print("Model {} start to predict proba".format(name))
            model["prob"] = model["estimator"].predict_proba(X)
            model["pred"] = classes[np.argmax(model["prob"], axis=1)]
            total_probs.append(np.array(model["prob"]))

        # Soft voting: average the predicted probabilities across models
        total_prob_pred = total_probs[0]
        model_predict_rate = []
        for i in range(1, len(total_probs)):
            total_prob_pred += total_probs[i]
        total_prob_pred /= len(total_probs)
        self.max_prob_pred = np.max(total_prob_pred, axis=1)
        index_pred = np.argmax(total_prob_pred, axis=1)
        self.label_pred = classes[index_pred]

        finish_time = time.time()
        print("Model predict proba {} docs done. Time : {:.4f} seconds".format(X.shape[0], finish_time - start_time))
Time : {:.4f} seconds".format(X.shape[0], finish_time - start_time)) return self.label_pred, self.max_prob_pred def evaluate(self, X_test, y_test, metrics, is_predict_proba=False): # Predict X_test if is_predict_proba: major_pred, _ = self.predict_proba(X_test) else: major_pred, _ = self.predict(X_test) # Save result predict to debug # pred_df = {"Ensemble": major_pred} # Evaluate models on metrics result = [] cf_mats = {} columns = sorted(list(metrics.keys())) for name, model in self.models.items(): row = [name] y_pred = model["pred"] # pred_df.update({name: y_pred}) for metric_name in columns: metric_fn = metrics.get(metric_name).get("fn") metric_params = metrics.get(metric_name).get("params") # print("Score : {}, Params : {}".format(metric_name, metric_params)) # print(np.unique(y_test)) # third_param = True if metric_name == "accuracy" else None if metric_params is None: value_score = metric_fn(y_test, y_pred) else: value_score = metric_fn(y_test, y_pred, **metric_params) row.append(value_score) result.append(row) # Calculate confusion matrix unique_label = np.unique(np.concatenate((y_test, y_pred))) cf_mat = confusion_matrix(y_test, y_pred, unique_label) cf_mats.update({name: (cf_mat, unique_label)}) # pred_df.update({"True_Label": y_test}) # pred_df = pd.DataFrame(pred_df) # pred_df = pred_df[pred_df["Ensemble"] != pred_df["True_Label"]] # pred_df.to_csv("./Debug/pred.csv", index=False) # Evaluate ensemble model ensemble_model_name = "Ensemble" row = [ensemble_model_name] for metric_name in columns: metric_fn = metrics.get(metric_name).get("fn") metric_params = metrics.get(metric_name).get("params") # third_param = True if metric_name == "accuracy" else None if metric_params is None: value_score = metric_fn(y_test, major_pred) else: value_score = metric_fn(y_test, major_pred, **metric_params) row.append(value_score) result.append(row) unique_label = np.unique(np.concatenate((y_test, major_pred))) cf_mats.update({ensemble_model_name: (confusion_matrix(y_test, major_pred, unique_label), unique_label)}) # print("\nmodel::evaluate Accuracy", accuracy_score(y_test, major_pred)) columns = ["Model"] + columns result = pd.DataFrame(result, columns=columns) return result, cf_mats def print_stat_fit(self): print("\n===============================") print("Statistic : ") for name, model in self.models.items(): instance = model["estimator"] print("\nModel : ", name) print("Best params : ", instance.best_params_) print("Best valid {} score : {}".format(self.scoring[0], instance.best_score_)) best_index = instance.best_index_ for score in self.scoring: if score != self.scoring[0]: print("Mean valid {} score : {}".format(score, instance.cv_results_["mean_test_{}".format(score)][best_index])) print("Training time : {} seconds".format(model["training_time"])) print("===============================\n") def get_statistic_data(self): data_plot = [] columns = ["Model", "Hyper_Parameter"] + self.scoring + ["Training_Time (Seconds)"] for name, model in self.models.items(): row = [name] instance = model["estimator"] row.append(instance.best_params_) row.append(instance.best_score_) best_index = instance.best_index_ for score in self.scoring: if score != self.scoring[0]: row.append(instance.cv_results_["mean_test_{}".format(score)][best_index]) row.append(model["training_time"]) data_plot.append(row) data_plot = pd.DataFrame(data_plot, columns=columns) return data_plot def save_model(self, save_dir="./Model"): print("Start to save {} models to {} ...".format(len(self.models), save_dir)) save_dir = 
        utils.mkdirs(save_dir)
        meta_data = []
        for name, model in self.models.items():
            instance = model["estimator"]
            save_path = os.path.join(save_dir, "{}.joblib".format(name))
            joblib.dump(instance, save_path)
            meta_data.append({
                "model_name": name,
                "model_path": save_path,
                "model_params": instance.best_params_
            })
            print("Save model {} to {} done".format(name, save_path))

        # Save meta data about the models
        meta_data_path = os.path.join(save_dir, "meta.txt")
        # print("\nMeta data : ", meta_data)
        with open(meta_data_path, 'w') as f:
            json.dump(meta_data, f, cls=utils.MyEncoder)
        print("Save {} models to {} done".format(len(self.models), save_dir))

        # Save figures of the models' training results
        # Build a data frame containing the result statistics
        statistic_data = self.get_statistic_data()

        # Save statistic data
        statistic_save_dir = os.path.join(save_dir, "Statistic")
        utils.mkdirs(statistic_save_dir)
        result_save_path = os.path.join(statistic_save_dir, "result.csv")
        statistic_data.to_csv(result_save_path, index=False)

        # Plot and save figures
        data_plot = statistic_data.drop("Hyper_Parameter", axis=1)
        self.plot_result(data_plot, statistic_save_dir, is_plot=False)

    def load_model(self, save_dir):
        print("Start to load models from ", save_dir)
        meta_data_path = os.path.join(save_dir, "meta.txt")

        # Load meta data about the models
        with open(meta_data_path, 'r') as f:
            meta_data = json.load(f)

        self.models = {}
        for info_model in meta_data:
            model_name = info_model["model_name"]
            model_path = info_model["model_path"]
            estimator = joblib.load(model_path)
            self.models.update({
                model_name: {
                    "estimator": estimator,
                    "pred": []
                }
            })

        # Fit the transformer on dummy input only to load the vocabulary from vocab_path
        self.feature_transformer.fit([""], [""], vocab_path=self.vocab_path)
        print("Load {} models from {} done".format(len(self.models), save_dir))

    def plot_result(self, data_plot, save_fig_dir, is_plot=True):
        utils.mkdirs(save_fig_dir)
        columns = list(data_plot.columns)
        print("Start to plot and save {} figures to {} ...".format(len(columns) - 1, save_fig_dir))
        print("Head of data plot")
        print(data_plot.head())

        x_offset = -0.07
        y_offset = 0.01
        mpl.style.use("seaborn")
        model_column = columns[0]
        for score_column in columns[1:]:
            # Sort by ascending score
            data_plot.sort_values(score_column, ascending=True, inplace=True)
            ax = data_plot.plot(kind="bar", x=model_column, y=score_column, legend=None, color='C1',
                                figsize=(len(self.models) + 1, 4), width=0.3)
            title = "Mean {} score - {}-fold cross validation".format(score_column, self.cv)
            ax.set(title=title, xlabel=model_column, ylabel=score_column)
            ax.tick_params(axis='x', rotation=0)

            # Set lower and upper limits of the y-axis
            min_score = data_plot.loc[:, score_column].min()
            max_score = data_plot.loc[:, score_column].max()
            y_lim_min = (min_score - 0.2) if min_score > 0.2 else 0
            y_lim_max = (max_score + 1) if max_score > 1 else 1
            ax.set_ylim([y_lim_min, y_lim_max])

            # Annotate each bar with its exact value
            for p in ax.patches:
                b = p.get_bbox()
                text_value = "{:.4f}".format(b.y1)
                ax.annotate(text_value, xy=(b.x0 + x_offset, b.y1 + y_offset))

            save_fig_path = os.path.join(save_fig_dir, "{}.png".format(score_column))
            plt.savefig(save_fig_path, dpi=800)

        print("Plot and save {} figures to {} done".format(len(columns) - 1, save_fig_dir))
        if is_plot:
            plt.show()
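

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original source).  It shows how the
# EnsembleModel API above is meant to be combined: estimators registered with
# add_model() are assumed to be GridSearchCV objects, because print_stat_fit(),
# get_statistic_data() and save_model() read best_params_, best_score_ and
# cv_results_ from them.  The function name, parameter grids, scoring names and
# metric entries below are illustrative assumptions, not values taken from the
# original project.
def _example_usage(train_docs, train_labels, test_docs, test_labels):
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.model_selection import GridSearchCV

    scoring = ["f1_macro", "accuracy"]
    ensemble = EnsembleModel(scoring=scoring, vocab_path=VOCAB_PATH, cv=3)

    # Register grid-search wrapped base models (hypothetical parameter grids)
    ensemble.add_model(
        "MultinomialNB",
        GridSearchCV(MultinomialNB(), {"alpha": [0.002, 0.004, 0.01]},
                     scoring=scoring, refit=scoring[0], cv=3))
    ensemble.add_model(
        "RandomForest",
        GridSearchCV(RandomForestClassifier(class_weight="balanced",
                                            random_state=RANDOM_STATE),
                     {"n_estimators": [20, 50], "max_depth": [40, 80]},
                     scoring=scoring, refit=scoring[0], cv=3))

    # Fitting on raw documents lets the ensemble's own FeatureTransformer build
    # the vocabulary, which predict()/evaluate() reuse via transform().
    ensemble.fit(train_docs, train_labels, is_encoded_data=False)

    # Metric entries must follow the {"fn": callable, "params": dict-or-None}
    # layout that evaluate() expects.
    metrics = {
        "accuracy": {"fn": accuracy_score, "params": None},
        "f1_macro": {"fn": f1_score, "params": {"average": "macro"}},
    }
    result, cf_mats = ensemble.evaluate(test_docs, test_labels, metrics)
    print(result)

    ensemble.save_model("./Model")
    return result, cf_mats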