def main_preprocessing_mmc(conf, conf_model, dataset="train"):
    """Build the learning matrix X (and target y if present) for the
    multi-class model.

    Two cached stages, each guarded by a flag in
    ``conf_model["preprocessing_<dataset>"]``:
      1. text preprocessing  -> df_<dataset>_preprocessed.csv
      2. one-line-per-date expansion -> df_<dataset>_learning_cs<N>.csv
    When a flag is False the corresponding cached file is loaded instead
    of recomputed.

    Parameters
    ----------
    conf : dict
        Global configuration; reads conf["paths"]["dataprep"].
    conf_model : dict
        Model configuration; reads "context_size", "verbose", "path" and
        the two preprocessing flags.
    dataset : str
        Which split to process ("train" by default).

    Returns
    -------
    (X, y) when the dataframe has a "target" column, otherwise X alone.
    """
    # Safe replacement for eval(): the "txt" column is stored in the CSV as
    # the repr of a Python list, and literal_eval parses literals only.
    from ast import literal_eval

    context_size = conf_model["context_size"]
    v = conf_model["verbose"]

    # Stage 1: text preprocessing (compute-and-record, or load cache).
    path_model_df_prep = Path(conf_model["path"] + "df_%s_preprocessed.csv" % dataset)
    if conf_model["preprocessing_%s" % dataset][0]:
        path_dataprep = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)
        df = u.load_file(path_dataprep)
        df = preprocess_text_data(df, verbose=v)
        u.record_file(df, path_model_df_prep)
    else:
        df = u.load_file(path_model_df_prep)
        # Rebuild the tokenized lists from their string form on disk.
        # (Was map(eval) — literal_eval avoids arbitrary code execution.)
        df["txt"] = df["txt"].map(literal_eval)

    # Stage 2: building the learning matrix, one line per date with context.
    path_model_df_learn = conf_model["path"] + "df_%s_learning_cs%d.csv" % (
        dataset,
        context_size,
    )
    if conf_model["preprocessing_%s" % dataset][1]:
        df = create_dataframe_one_line_per_date_with_context(
            df, context_size, verbose=v
        )
        u.vprint("Recording data", v)
        u.record_file(df, path_model_df_learn)
    else:
        u.vprint("Loading data in one line per date", v)
        df = u.load_file(Path(path_model_df_learn))

    # CSV round-trips lose dtypes; force the numeric features back to float.
    df[["pos_moy", "part_moy", "nb_app"]] = df[
        ["pos_moy", "part_moy", "nb_app"]
    ].astype(float)
    X = df[["txt_id", "date", "context_date", "pos_moy", "part_moy", "nb_app"]].values

    # "target" is absent for unlabeled splits (e.g. a prediction set).
    if "target" in df.columns:
        y = df["target"].astype(int).values
        u.vprint("X shape : %s" % str(X.shape), v)
        u.vprint("y shape : %s" % str(y.shape), v)
        return X, y
    u.vprint("X shape : %s" % str(X.shape), v)
    return X
def save_info_dataset(self, info_datasets):
    """Record a small sample and a shape/path summary for each dataset.

    Parameters
    ----------
    info_datasets : iterable of (dataset, path)
        ``dataset`` is a pandas DataFrame or any sliceable sequence/array;
        ``path`` identifies where the dataset came from (its basename is
        used to tag the output files).

    Writes, per dataset, into ``self.exp_dir``:
      - dataset_<name>_sample.csv : first 5 rows
      - dataset_<name>_info.txt   : shape and origin path
    """
    for dataset, path in info_datasets:
        name = Path(path).name
        if isinstance(dataset, pd.DataFrame):
            sample = dataset.head(min(5, len(dataset)))
        else:
            # Sliceable non-DataFrame input (e.g. numpy array or list).
            sample = pd.DataFrame(dataset[0:5])
        # Robustness fix: plain sequences have no .shape attribute — fall
        # back to a 1-tuple of the length instead of raising AttributeError.
        shape = getattr(dataset, "shape", (len(dataset),))
        u.record_file(sample, self.exp_dir / Path("dataset_%s_sample.csv" % name))
        info = "shape : %s\n" % str(shape) + "path : %s" % str(path)
        u.record_file(info, self.exp_dir / Path("dataset_%s_info.txt" % name))
def main_ml_multi_classes_search_best_model(conf):
    """End-to-end pipeline for the multi-class model: preprocess, split,
    (optionally) grid-search, fit, assess, and append a session summary
    to the experiment directory.
    """
    conf_model = conf["models"]["ml_multi_classes"]
    v = conf_model["verbose"]

    # Session header for the appended summary file.
    header = [
        "\n******************************************* NEW SESSION *******************************************\n",
        str(datetime.now()) + "\n",
        str(conf_model) + "\n",
    ]
    summary = "".join(header)

    # Preprocessing -> feature matrix and targets.
    X, y = main_preprocessing_mmc(conf, conf_model, dataset="train")

    # Hold out a test split.
    u.vprint("Splitting data in train and test", v)
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Estimator setup.
    u.vprint("Initializing estimator", v)
    estimator = get_estimator(conf_model)
    exp_dir = get_experiment_directory(conf_model)

    # Optional hyper-parameter search; results are persisted for reuse.
    if conf_model["search_best_params"]:
        u.vprint("Performing best params search", v)
        best_params = search_best_params(conf_model, estimator, X_train, y_train)
        u.record_file(best_params, exp_dir / "best_params.json")

    estimator = set_estimator_params(estimator, conf_model, exp_dir)
    summary += str(estimator.get_params()) + "\n"

    # Fit on the training split.
    u.vprint("Learning phase", v)
    estimator.fit(X_train, y_train)

    # Evaluate and append the report to the running summary.
    u.vprint("Assessing phase", v)
    summary += main_assessing(
        conf, conf_model, estimator, X_train, X_test, y_train, y_test
    )
    u.record_file(summary, exp_dir / "summary_results.txt", mode="a")
def main_na_nc_classifier(conf):
    """Train (or reload) and evaluate the NA/NC classifier.

    Pipeline: cached text preprocessing, train/test split, estimator
    setup with optional grid search, fit-or-reload, then cross-validation
    and test-set evaluation.

    Parameters
    ----------
    conf : dict
        Global configuration; reads conf["models"]["na_nc_classifier"]
        and conf["paths"]["dataprep"].
    """
    conf_model = conf["models"]["na_nc_classifier"]
    v = conf_model["verbose"]

    # Preprocessing: compute and cache, or load the cached file.
    path_model_df_prep = Path(conf_model["path"] + "df_train_preprocessed.csv")
    if conf_model["preprocessing"]:
        path_dataprep = Path(conf["paths"]["dataprep"] + "df_train.csv")
        df_train = u.load_file(path_dataprep)
        df_train = preprocessing_data(df_train, verbose=v)
        u.record_file(df_train, path_model_df_prep)
    else:
        df_train = u.load_file(path_model_df_prep)

    X = df_train["txt"].values
    y = df_train["date_consolidation"].values

    # Hold out a test split (fixed seed for reproducibility).
    u.vprint("Splitting data in train and test", v)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Estimator setup.
    u.vprint("Initializing estimator", v)
    estimator = get_estimator(conf_model)
    exp_dir = get_experiment_directory(conf_model)

    # Optional hyper-parameter search; results are persisted for reuse.
    if conf_model["search_best_params"]:
        u.vprint("Performing best params search", v)
        best_params = search_best_params(conf_model, estimator, X_train, y_train)
        u.record_file(best_params, exp_dir / "best_params.json")

    estimator = set_estimator_params(estimator, conf_model, exp_dir)

    # Fit and pickle — or reload the previously pickled fit.
    path_pickle_model = exp_dir / "fitted_model.pkl"
    if conf_model["learning"]:
        estimator.fit(X_train, y_train)
        u.record_file(estimator, path_pickle_model)
    else:
        # Bug fix: previously nothing happened here, so when "learning" was
        # False the assessment below ran on an unfitted estimator. Reload
        # the fitted model recorded by a prior run (same compute-or-load
        # pattern as the preprocessing stage above).
        estimator = u.load_file(path_pickle_model)

    # Assessing.
    res1_train, res1_val = cross_validate_model(conf_model, estimator, X_train, y_train)
    u.vprint("Cross validation results : ", v)
    print(res1_train)
    print(res1_val)
    res_test = eval_model(estimator, X_train, y_train, X_test, y_test)
    u.vprint("Test results : ", v)
    print(res_test)
def save_grid_search_results(self, grid_search_results):
    """Pickle the grid-search results into the experiment directory."""
    # Removed a dead `pass` statement that followed the call.
    u.record_file(grid_search_results, self.exp_dir / "grid_search_results.pkl")
def save_prediction(self, predictions):
    """Write the predictions to prediction.csv in the experiment directory."""
    target_path = self.exp_dir / "prediction.csv"
    u.record_file(predictions, target_path)
def save_evaluation_results(self, results):
    """Record the evaluation results as results.csv, keeping the index.

    Idiom fix: the `/` operator on a Path accepts a plain string, so the
    redundant Path(...) wrapper is dropped (consistent with the other
    save_* methods in this class).
    """
    u.record_file(results, self.exp_dir / "results.csv", index=True)
def save_model(self, model):
    """Pickle the model into the experiment directory, tagged with the
    model name.

    Idiom fix: dropped the redundant Path(...) wrapper (Path `/` accepts
    strings) and the redundant str() inside the %s format, which already
    applies str() to its argument.
    """
    u.record_file(model, self.exp_dir / ("model_%s.pkl" % self.model_name))
def save_exp_context(self):
    """Record the experiment's attributes as JSON in context.json.

    Bug fix: str(self.__dict__) produces a Python repr (single quotes,
    non-JSON values), so context.json previously held invalid JSON.
    json.dumps emits real JSON; default=str stringifies non-serializable
    attributes such as Path objects. NOTE(review): if any downstream
    consumer parsed the old repr format, confirm it handles real JSON.
    """
    import json

    u.record_file(
        json.dumps(self.__dict__, default=str, indent=2),
        self.exp_dir / "context.json",
    )