def main_stacking_clf(conf):
    """Train and assess the stacked multi-class classifier.

    Reads the ``ml_multi_classes_ensembling`` model configuration, builds the
    learning matrix through ``mmc.main_preprocessing_mmc``, splits it, fits
    the estimator returned by ``get_stacked_estimator`` and runs
    ``mmc.main_assessing`` on it.

    Args:
        conf: Global configuration dictionary; must contain
            ``conf["models"]["ml_multi_classes_ensembling"]``.
    """
    conf_model = conf["models"]["ml_multi_classes_ensembling"]
    v = conf_model["verbose"]

    # Session header: banner, timestamp, then the model configuration.
    summary = "".join([
        "\n******************************************* NEW SESSION *******************************************\n",
        str(datetime.now()) + "\n",
        str(conf_model) + "\n",
        "***************************************************************************************************\n",
    ])
    u.vprint(summary, v)

    X, y = mmc.main_preprocessing_mmc(conf, conf_model, dataset="train")
    for label, data in (("X", X), ("y", y)):
        print("%s shape : %s" % (label, str(data.shape)))

    X_train, X_test, y_train, y_test = mmc.split_train_test(X, y)
    splits = (
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    )
    for label, data in splits:
        print("%s shape : %s" % (label, str(data.shape)))

    clf = get_stacked_estimator(conf_model)
    print("Classifier")
    print(clf)

    u.vprint("Fitting model", v)
    clf.fit(X_train, y_train)

    # Assessing
    assess_summary = mmc.main_assessing(conf, conf_model, clf, X_train,
                                        X_test, y_train, y_test)
    # NOTE(review): the summary is extended here but neither recorded nor
    # returned by this function.
    summary += "\n" + assess_summary + "\n"
def main_preprocessing_mmc(conf, conf_model, dataset="train"):
    """Build (or reload) the learning matrix for the multi-class model.

    Two stages, each switched by ``conf_model["preprocessing_<dataset>"]``:
    index 0 controls the text preprocessing, index 1 the construction of the
    "one line per date" dataframe. A disabled stage reloads the file that a
    previous run recorded.

    Args:
        conf: Global configuration (uses ``conf["paths"]["dataprep"]``).
        conf_model: Model configuration (``context_size``, ``verbose``,
            ``path`` and ``preprocessing_<dataset>``).
        dataset: Dataset name used in file names.

    Returns:
        ``(X, y)`` when the dataframe has a ``target`` column, otherwise
        ``X`` alone.
    """
    ctx = conf_model["context_size"]
    verbose = conf_model["verbose"]
    stage_key = "preprocessing_%s" % dataset

    # Stage 1: text preprocessing
    prep_csv = Path(conf_model["path"] + "df_%s_preprocessed.csv" % dataset)
    if not conf_model[stage_key][0]:
        df = u.load_file(prep_csv)
        # NOTE(review): eval() executes whatever the CSV cell contains; only
        # safe while this file is produced by this pipeline itself.
        df["txt"] = df["txt"].map(eval)
    else:
        raw_csv = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)
        df = preprocess_text_data(u.load_file(raw_csv), verbose=verbose)
        u.record_file(df, prep_csv)

    # Stage 2: building learning matrix
    learn_csv = conf_model["path"] + "df_%s_learning_cs%d.csv" % (dataset, ctx)
    if not conf_model[stage_key][1]:
        u.vprint("Loading data in one line per date", verbose)
        df = u.load_file(Path(learn_csv))
    else:
        df = create_dataframe_one_line_per_date_with_context(
            df, ctx, verbose=verbose)
        u.vprint("Recording data", verbose)
        u.record_file(df, learn_csv)

    numeric_cols = ["pos_moy", "part_moy", "nb_app"]
    df[numeric_cols] = df[numeric_cols].astype(float)
    X = df[["txt_id", "date", "context_date"] + numeric_cols].values

    if "target" not in df.columns:
        u.vprint("X shape : %s" % str(X.shape), verbose)
        return X

    y = df["target"].astype(int).values
    u.vprint("X shape : %s" % str(X.shape), verbose)
    u.vprint("y shape : %s" % str(y.shape), verbose)
    return X, y
def preprocess_data(conf, dataset="train"):
    """Return the ``(X, y)`` matrices of the simple-embedding model.

    When ``preprocessing_<dataset>`` is truthy in the model configuration the
    matrices are rebuilt with ``preprocess_data_general`` and
    ``creating_matrix`` and pickled; otherwise the pickles recorded by a
    previous run are loaded.

    Args:
        conf: Global configuration dictionary.
        dataset: Dataset name used in configuration keys and file names.

    Returns:
        Tuple ``(X, y)``.
    """
    conf_model = conf["models"]["simple_embedding_and_ml"]
    preprocessing = conf_model["preprocessing_%s" % dataset]
    v = conf_model["verbose"]
    context_size = conf_model["context_size"]

    suffix = "_%s_preprocessed_cs%d.pkl" % (dataset, context_size)
    x_pickle = Path(conf_model["path"] + "X" + suffix)
    y_pickle = Path(conf_model["path"] + "y" + suffix)

    if not preprocessing:
        u.vprint("Loading already prepared data...", v)
        # NOTE(review): pickle.load must only be used on files written by
        # this pipeline.
        with open(x_pickle, "rb") as handle:
            X = pickle.load(handle)
        with open(y_pickle, "rb") as handle:
            y = pickle.load(handle)
        return X, y

    X, y = creating_matrix(conf, preprocess_data_general(conf, dataset=dataset))
    u.vprint("Record...", v)
    with open(x_pickle, "wb") as handle:
        pickle.dump(X, handle)
    with open(y_pickle, "wb") as handle:
        pickle.dump(y, handle)
    return X, y
def preprocess_text_data(df, verbose=True):
    """Tokenize the ``txt`` column and normalize the date columns in place.

    Args:
        df: Dataframe with a ``txt`` column and, optionally,
            ``date_accident`` / ``date_consolidation`` columns.
        verbose: Forwarded to ``u.vprint``.

    Returns:
        The same dataframe object, modified.
    """
    u.vprint("Formating text", verbose)
    df["txt"] = df["txt"].map(convert_text_to_word_tokens)

    # Strip "-" and "." from whichever date columns are present.
    for column in ("date_accident", "date_consolidation"):
        if column not in df.columns:
            continue
        df[column] = df[column].map(lambda x: re.sub(r"[-.]", "", x))
    return df
def main_na_nc_classifier(conf):
    """Run the NA/NC classifier pipeline: preprocess, tune, fit, assess.

    Steps are switched by the ``na_nc_classifier`` model configuration keys
    ``preprocessing``, ``search_best_params`` and ``learning``. Results of
    the cross-validation and of the test evaluation are printed.

    Args:
        conf: Global configuration dictionary.
    """
    conf_model = conf["models"]["na_nc_classifier"]
    v = conf_model["verbose"]

    # Preprocessing
    prepared_csv = Path(conf_model["path"] + "df_train_preprocessed.csv")
    if not conf_model["preprocessing"]:
        df_train = u.load_file(prepared_csv)
    else:
        raw_csv = Path(conf["paths"]["dataprep"] + "df_train.csv")
        df_train = preprocessing_data(u.load_file(raw_csv), verbose=v)
        u.record_file(df_train, prepared_csv)

    X = df_train["txt"].values
    y = df_train["date_consolidation"].values

    # Split train, test
    u.vprint("Splitting data in train and test", v)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Get the estimator
    u.vprint("Initializing estimator", v)
    estimator = get_estimator(conf_model)

    # Grid search
    exp_dir = get_experiment_directory(conf_model)
    if conf_model["search_best_params"]:
        u.vprint("Performing best params search", v)
        found_params = search_best_params(conf_model, estimator, X_train,
                                          y_train)
        u.record_file(found_params, exp_dir / "best_params.json")

    # Set params
    estimator = set_estimator_params(estimator, conf_model, exp_dir)

    # Learning
    fitted_model_path = exp_dir / "fitted_model.pkl"
    if conf_model["learning"]:
        estimator.fit(X_train, y_train)
        u.record_file(estimator, fitted_model_path)

    # Assessing
    cv_train, cv_val = cross_validate_model(conf_model, estimator, X_train,
                                            y_train)
    u.vprint("Cross validation results : ", v)
    for line in (cv_train, cv_val):
        print(line)

    test_report = eval_model(estimator, X_train, y_train, X_test, y_test)
    u.vprint("Test results : ", v)
    print(test_report)
def main_rule_base_classifier(conf):
    """Search, train and assess the rule-based classifier.

    The three phases are switched by the ``search``, ``learn`` and ``assess``
    keys of the ``rule_base_classifier`` model configuration. The fitted
    model is pickled to ``<path>model.pkl`` and reloaded for assessment.

    Args:
        conf: Global configuration dictionary.
    """
    conf_model = conf["models"]["rule_base_classifier"]
    date_target = conf_model["date_target"]
    learn = conf_model["learn"]
    do_search = conf_model["search"]
    assess = conf_model["assess"]
    v = conf_model["verbose"]
    model_dir = conf_model["path"]
    pickle_path = Path(model_dir + "model.pkl")

    train_df = preprocess_data(conf, dataset="train")
    X_train, y_train = split_x_y(conf, date=date_target, df=train_df,
                                 dataset="train")

    # Get best params
    if not do_search:
        vprint("Reading best params in model folder", v)
        with open(Path(model_dir + "best_params.json"), "r") as f:
            best_params = json.load(f)
    else:
        vprint("Searching best param", v)
        best_params = search(conf, X_train, y_train)
        vprint("Best params are %s" % str(best_params), v)

    # Learning
    if learn:
        vprint("Learning phase", v)
        model = RuleBaseClassifier()
        model.set_params(best_params)
        model.fit(X_train, y_train)
        with open(pickle_path, "wb") as f:
            pickle.dump(model, f)

    # Assessing
    if not assess:
        return
    vprint("Assessing phase", v)
    test_df = preprocess_data(conf, dataset="test")
    X_test, y_test = split_x_y(conf, date=date_target, df=test_df,
                               dataset="test")
    # The model is always reloaded from disk so that assessment also works
    # when the learning phase is switched off.
    with open(pickle_path, "rb") as f:
        model = pickle.load(f)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    accuracy_train = compute_accuracy(y_train, train_pred)
    accuracy_test = compute_accuracy(y_test, test_pred)
    print("Accuracy train = %.2f" % accuracy_train)
    print("Accuracy test = %.2f" % accuracy_test)
def set_estimator_params(estimator, conf_model, exp_dir):
    """Configure ``estimator`` from the best available parameter source.

    Priority order:
      1. ``conf_model["params"]`` when it is non-empty and no parameter
         search was requested;
      2. ``best_params.json`` in ``exp_dir`` when that file exists;
      3. the estimator's current (default) parameters.

    Args:
        estimator: Object exposing ``set_params`` / ``get_params``.
        conf_model: Model configuration dictionary.
        exp_dir: Experiment directory (must support ``/`` and ``exists``).

    Returns:
        The estimator that was passed in.
    """
    verbose = conf_model["verbose"]
    best_params_file = exp_dir / "best_params.json"

    conf_params_win = (len(conf_model["params"]) != 0
                       and not conf_model["search_best_params"])
    if conf_params_win:
        u.vprint("Using param from conf file : \n %s" % (str(conf_model["params"])), verbose)
        estimator.set_params(**conf_model["params"])
        return estimator

    if best_params_file.exists():
        u.vprint("Loading best params", verbose)
        loaded = u.load_file(exp_dir / "best_params.json")
        u.vprint(str(loaded), verbose)
        estimator.set_params(**loaded)
        return estimator

    # Default parameters
    u.vprint("Using default params", verbose)
    u.vprint(str(estimator.get_params()), verbose)
    return estimator
def main_ml_multi_classes_search_best_model(conf):
    """Tune, fit and assess the multi-class model, then log a session summary.

    The summary (timestamp, configuration, final estimator parameters and
    assessment report) is appended to ``summary_results.txt`` in the
    experiment directory.

    Args:
        conf: Global configuration dictionary.
    """
    conf_model = conf["models"]["ml_multi_classes"]
    v = conf_model["verbose"]

    summary = "".join([
        "\n******************************************* NEW SESSION *******************************************\n",
        str(datetime.now()) + "\n",
        str(conf_model) + "\n",
    ])

    # Preprocessing
    X, y = main_preprocessing_mmc(conf, conf_model, dataset="train")

    # Split train, test
    u.vprint("Splitting data in train and test", v)
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Get the estimator
    u.vprint("Initializing estimator", v)
    estimator = get_estimator(conf_model)

    # Grid search
    exp_dir = get_experiment_directory(conf_model)
    if conf_model["search_best_params"]:
        u.vprint("Performing best params search", v)
        found_params = search_best_params(conf_model, estimator, X_train,
                                          y_train)
        u.record_file(found_params, exp_dir / "best_params.json")

    # Set params
    estimator = set_estimator_params(estimator, conf_model, exp_dir)
    summary += str(estimator.get_params()) + "\n"

    # Learning
    u.vprint("Learning phase", v)
    estimator.fit(X_train, y_train)

    # Assessing
    u.vprint("Assessing phase", v)
    summary += main_assessing(conf, conf_model, estimator, X_train, X_test,
                              y_train, y_test)
    u.record_file(summary, exp_dir / "summary_results.txt", mode="a")
def search(conf, X, y):
    """Exhaustively search the rule-based classifier's parameter grid.

    Each combination returned by ``get_all_combi_grid_search`` is fitted on
    ``(X, y)`` and scored with ``compute_accuracy`` on the same data. The
    first combination reaching the highest accuracy is kept and written to
    ``<path>best_params.json``.

    Args:
        conf: Global configuration; reads
            ``conf["models"]["rule_base_classifier"]``.
        X: Training inputs passed to ``RuleBaseClassifier.fit``.
        y: Training targets.

    Returns:
        The best parameter combination, or ``None`` when the grid is empty.
    """
    conf_model = conf["models"]["rule_base_classifier"]
    v = conf_model["verbose"]
    all_combi = get_all_combi_grid_search(conf_model["grid_search"])
    nb_combi = len(all_combi)

    rbclf = RuleBaseClassifier()
    best_params = None
    best_metric = None
    for rank, params in enumerate(all_combi, start=1):
        vprint("%d / %d" % (rank, nb_combi), v)
        vprint("\t%s" % (str(params)), v)
        rbclf.set_params(params)
        rbclf.fit(X, y)
        accuracy = compute_accuracy(y, rbclf.predict(X))
        # Strict comparison: on ties the earliest combination wins.
        if best_metric is None or best_metric < accuracy:
            best_metric = accuracy
            best_params = params
        # The verbose flag was missing on this call, unlike its siblings.
        vprint("\tAccuracy = %.2f" % accuracy, v)

    path_model_params = Path(conf_model["path"] + "best_params.json")
    with open(path_model_params, "w") as f:
        json.dump(best_params, f)
    return best_params
def main_assessing(conf, conf_model, estimator, X_train, X_test, y_train, y_test):
    """Run the assessments enabled in ``conf_model["assessing"]``.

    The three flags of ``conf_model["assessing"]`` enable, in order:
    cross-validation on the training data, evaluation on the test data at
    date level, and evaluation at text level. Each enabled step prints its
    report and appends it to the returned summary.

    Returns:
        The concatenated textual summary (empty string if nothing is enabled).
    """
    v = conf_model["verbose"]
    flags = conf_model["assessing"]
    assess_summary = ""

    if flags[0]:
        u.vprint("Cross validation", v)
        # cross validate model
        res1_val, res1_train, cvr = cross_validate_model(
            conf_model, estimator, X_train, y_train)
        print(res1_train)
        print(res1_val)
        assess_summary += res1_train + "\n" + res1_val

    if flags[1]:
        u.vprint("Assessing model on test data at 'date level'", v)
        # Eval on test data
        res_test, report, df0, df1, df2 = eval_model(estimator, X_test, y_test)
        assess_summary += "".join([
            "\n", res_test,
            "\nClassification report\n", report,
            "\nConfusion matrix\tClasse 0\n\t", str(df0),
            "\t Classe 1\n\t", str(df1),
            "\t Classe 2\n\t", str(df2),
        ])
        print(res_test)
        print("\nClassification report")
        print(report)
        print("\nConfusion matrix")
        for class_idx, matrix in enumerate((df0, df1, df2)):
            print("\t Classe %d" % class_idx)
            print(matrix)

    if flags[2]:
        u.vprint("Assessing model on text level", v)
        # Assessing at text level
        res3 = eval_model_text_level(conf_model, estimator, X_train, X_test,
                                     conf)
        print(res3)
        assess_summary += "\n" + res3 + "\n"

    return assess_summary
def _make_all_dirs(log_path, log_subpath, mkdir=True, verbose=False):
    """Ensure the log directory and its sub-directories exist.

    Args:
        log_path: Path of the main output directory.
        log_subpath: Mapping ``name -> path`` of sub-directories.
        mkdir: When false, nothing is created; missing directories are only
            reported.
        verbose: Forwarded to ``vprint`` as its first argument.
    """
    # create output dir
    if os.path.exists(log_path):
        vprint(verbose, f'Use exisiting dir: \n{log_path}')
    elif mkdir:
        os.makedirs(log_path, exist_ok=True)
        vprint(verbose, f'Dir created: \n{log_path}')
    else:
        # Previously reported as "existing" although the directory is absent.
        vprint(verbose, f'Dir missing (not created): \n{log_path}')

    for k, log_subpath_ in log_subpath.items():
        if os.path.exists(log_subpath_):
            vprint(verbose, f'- use exisiting sub dir: {k}')
        elif mkdir:
            os.makedirs(log_subpath_, exist_ok=True)
            vprint(verbose, f'- sub dir: {k}')
        else:
            vprint(verbose, f'- sub dir missing (not created): {k}')
def creating_matrix(conf, df):
    """Build the per-date learning matrix of the simple-embedding model.

    For every distinct date found in each text, one row is produced holding
    the surrounding context (joined tokens), the mean position of the date,
    its mean relative position in the text and its number of occurrences.
    The label is 1 when the date equals the text's target date, else 0.
    Dates that appear as the very first token are discarded.

    Args:
        conf: Global configuration; reads
            ``conf["models"]["simple_embedding_and_ml"]``.
        df: Dataframe with a ``txt`` column (token sequences) and the column
            named by ``date_target``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays.
    """
    conf_model = conf["models"]["simple_embedding_and_ml"]
    v = conf_model["verbose"]
    target_column = conf_model["date_target"]
    context_size = conf_model["context_size"]
    u.vprint("Creating numpy matrix", v)

    all_txt = df["txt"].values
    # Positional access (.values): label-based ``series[i]`` would break or
    # silently misalign on a dataframe whose index is not 0..n-1.
    all_targets = df[target_column].values

    X = []
    y = []
    for txt, target_date in zip(all_txt, all_targets):
        index_dates, dates_in_txt = get_dates_from_token_list(txt)
        for d in set(dates_in_txt):
            left_context, right_context = get_context_date(
                context_size, d, txt)
            tokens = [tok for chunk in left_context for tok in chunk]
            tokens += [tok for chunk in right_context for tok in chunk]
            s = " ".join(tokens)
            positions = index_dates[np.argwhere(dates_in_txt == d)].ravel()
            positions_mean = np.mean(positions)
            part_of_txt_mean = np.mean(positions / len(txt))
            nb_appearances = len(positions)
            # Reducing set with rules discovered with exploratory analysis:
            # the target date can't be the first word of the text.
            if 0 in positions:
                continue
            X.append([s, positions_mean, part_of_txt_mean, nb_appearances])
            y.append(1 if d == target_date else 0)

    return np.array(X), np.array(y)
def fill_buffer(self, model, limit=None):
    """Populates the buffer with the ids of the rows to index.

    Sets ``self.index_total`` to the row count (capped by ``limit`` when a
    non-zero limit is given) and ``self.index_buffer`` to the result of
    ``query.values(model.id)``.

    References:
    http://stackoverflow.com/questions/7389759/memory-efficient-built-in-sqlalchemy-iterator-generator
    http://www.sqlalchemy.org/trac/wiki/UsageRecipes/WindowedRangeQuery
    http://stackoverflow.com/questions/1078383/sqlalchemy-difference-between-query-and-query-all-in-for-loops
    http://www.mail-archive.com/[email protected]/msg12443.html
    http://stackoverflow.com/questions/1145905/scanning-huge-tables-with-sqlalchemy-using-the-orm
    """
    where = True
    has_limit = limit not in [None, 0]

    count_query = model.session.query(sqlalchemy.func.count(model.id))
    total = count_query.filter(where).scalar()
    if has_limit:
        self.index_total = min(total, limit)
    else:
        self.index_total = total
    vprint('Number of elements to index: ' + str(self.index_total))

    vprint('Populating the buffer...')
    query = model.filter(where)
    if has_limit:
        query = query.limit(limit)
    self.index_buffer = query.values(model.id)
    vprint('Buffer populated.')
## Validate model every so often if niter % VALFREQ == 0: ut.mprint("Validating model") val_iter = vset.ndata // BSZ vloss, vset.niter = [], 0 sess.run(vset.fetchOp,feed_dict=vset.fdict()) for its in range(val_iter): sess.run(swpV) outs = sess.run( lvals+[vset.fetchOp], feed_dict={**vset.fdict(), is_training: False} ) vloss.append(np.array(outs[:-1])) vloss = np.mean(np.stack(vloss, axis=0), axis=0) ut.vprint(niter, vnms, vloss.tolist()) ## Run training step and print losses sess.run(swpT) if niter % 100 == 0: outs = sess.run( lvals+[tStep, tset.fetchOp], feed_dict={**tset.fdict(), lr: get_lr(niter), is_training: True} ) ut.vprint(niter, tnms, outs[:-2]) ut.vprint(niter, ['lr'], [get_lr(niter)]) else: outs = sess.run( [loss, psnr, tStep, tset.fetchOp], feed_dict={**tset.fdict(), lr: get_lr(niter), is_training: True} )
def print_info(self):
    """Print the indexer settings through ``vprint``.

    The thread count is reported only when the instance is not in single
    process mode.
    """
    # Utility functions
    from utils.utils import vprint

    single = self.single_process_mode
    mode_label = 'Multi threaded'
    if single:
        mode_label = 'Single process'
    vprint('Process mode: {:s}'.format(mode_label))

    # (template, attribute name); attributes are read lazily, one per line.
    report = []
    if not single:
        report.append(('Threads: {:d}', 'threads'))
    report.extend([
        ('Index: {:s}', 'es_index'),
        ('Type: {:s}', 'es_type'),
        ('DB Queue size: {:d}', 'db_queue_size'),
        ('Read chunk size: {:d}', 'read_chunk_size'),
        ('Write chunk size: {:d}', 'write_chunk_size'),
    ])
    for template, attribute in report:
        vprint(template.format(getattr(self, attribute)))
def preprocess_data(conf, dataset="train"):
    """Preprocess the raw dataframe for the ``ml_model`` model and record it.

    The text is lower-cased, its dates are reformatted with ``format_date``,
    then it is tokenized with ``nltk.word_tokenize`` and normalized with
    ``normalize``. ``-`` and ``.`` are stripped from the ``date_accident``
    and ``date_consolidation`` columns. The result is written to
    ``<path>df_<dataset>_preprocessed_cs<context_size>.csv``.

    Args:
        conf: Global configuration dictionary.
        dataset: Dataset name used in file names.

    Returns:
        The preprocessed dataframe.
    """
    conf_model = conf["models"]["ml_model"]
    context_size = conf_model["context_size"]
    v = conf_model["verbose"]
    path_dataprep = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)

    u.vprint("Loading...", v)
    df = pd.read_csv(path_dataprep, sep=';', encoding="utf-8")

    u.vprint("Lowercase text...", v)
    df["txt"] = df["txt"].str.lower()

    u.vprint("Format dates in text...", v)
    df["txt"] = df["txt"].map(format_date)

    u.vprint("Format date accident and consolidation...", v)
    for column in ("date_accident", "date_consolidation"):
        df[column] = df[column].map(lambda x: re.sub(r"[-.]", "", x))

    u.vprint("Tokenize words...", v)
    df["txt"] = df["txt"].map(nltk.word_tokenize)

    u.vprint("Normalize text...", v)
    df["txt"] = df["txt"].map(normalize)

    # The verbose flag was missing on this call, unlike every other message.
    u.vprint("Record the dataframe", v)
    path_model_df_prep = Path(
        conf_model["path"]
        + "df_%s_preprocessed_cs%d.csv" % (dataset, context_size))
    df.to_csv(path_model_df_prep, sep=';', encoding="utf-8", index=False)

    u.vprint("Preprocessing dataframe completed", v)
    return df
def _format_cv_summary(cvr, split, label):
    """Format the mean cross-validation scores of one split as a report line."""
    means = tuple(
        np.mean(cvr["%s_%s" % (split, metric)])
        for metric in ("f1", "recall", "precision", "roc_auc", "accuracy"))
    return ("%s : f1 = %.2f (recall = %.2f, precision = %.2f, AUC = %.2f, "
            "accuracy = %.2f)" % ((label,) + means))


def main_ml(conf):
    """Run the ``ml_model`` pipeline: preprocess, search, learn and assess.

    Phases are switched by the ``preprocessing_train``, ``search``, ``learn``
    and ``assess`` keys of the model configuration. Assessment first
    cross-validates the underlying estimator on the per-date matrix, then
    measures accuracy at text level on the train and test splits, and
    records the four result lines with ``record``.

    Args:
        conf: Global configuration dictionary.
    """
    conf_model = conf["models"]["ml_model"]
    ml_model = conf_model["ml_model"]
    learn = conf_model["learn"]
    text_preprocessing = conf_model["text_preprocessing"]
    path_fasttext_model = conf_model["path_fasttext_model"]
    doc2vec_strategy = conf_model["doc2vec_strategy"]
    do_search = conf_model["search"]
    context_size = conf_model["context_size"]
    assess = conf_model["assess"]
    v = conf_model["verbose"]
    date_target = conf_model["date_target"]
    preprocessing_train = conf_model["preprocessing_train"]
    dir_exp = get_experiment_directory(conf_model)

    # Get the estimator
    model = MLModel(ml_model=ml_model,
                    text_preprocessing=text_preprocessing,
                    context_size=context_size,
                    path_fasttext_model=path_fasttext_model,
                    doc2vec_strategy=doc2vec_strategy)

    # Preprocess data
    if preprocessing_train:
        df_train = preprocess_data(conf, dataset="train")
    else:
        path_model_df_prep = Path(
            conf_model["path"]
            + "df_train_preprocessed_cs%d.csv" % (context_size))
        df_train = pd.read_csv(path_model_df_prep, sep=';', encoding="utf-8")
        # NOTE(review): eval() executes whatever the CSV cell contains; only
        # safe while this file is produced by this pipeline itself.
        df_train["txt"] = df_train["txt"].map(eval)

    # Splitting train and test
    df_train, df_test = train_test_split(df_train)

    # Splitting X and y
    df_X_train = df_train[["ID", "txt"]]
    df_y_train = df_train[date_target]
    df_X_test = df_test[["ID", "txt"]]
    df_y_test = df_test[date_target]

    # Get best params
    if do_search:
        u.vprint("Searching best param", v)
        X_train, y_train, ids_and_dates = model._creating_matrix(
            df_X_train, df_y_train)
        best_params = search(conf, model.estimator, X_train, y_train)
        u.vprint("Best params are %s" % str(best_params), v)
        u.vprint("Record best params", v)
        record(conf, best_params, "best_params")
    else:
        # search has already been done and we just read out best param
        # recorded previously
        u.vprint("Reading best params in model folder", v)
        path_file_best_param = dir_exp / Path("best_params.json")
        with open(path_file_best_param, "r") as f:
            best_params = json.load(f)
        u.vprint("Best params are %s" % str(best_params), v)

    # Learning
    if learn:
        u.vprint("Learning", v)
        model.estimator.set_params(**best_params)
        model.fit(df_X_train, df_y_train)

    # Assessing
    if assess:
        u.vprint("Assessing phase", v)
        model.estimator.set_params(**best_params)
        u.vprint("First cross validate the ML estimator", v)
        # The scorers must match the keys read from the result: multi-metric
        # cross_validate names them "train_<scorer>" / "test_<scorer>". The
        # previous micro/macro list made every lookup below raise KeyError.
        # NOTE(review): these scorers assume a binary target -- confirm.
        metrics_eval = ["f1", "recall", "precision", "accuracy", "roc_auc"]
        X_train, y_train, ids_and_dates = model._creating_matrix(
            df_X_train, df_y_train)
        cvr = cross_validate(model.estimator,
                             X_train,
                             y_train,
                             cv=2,
                             n_jobs=-1,
                             scoring=metrics_eval,
                             return_train_score=True,
                             verbose=0)
        res1_train = _format_cv_summary(cvr, "train", "Train")
        res1_val = _format_cv_summary(cvr, "test", "Validation")
        print(res1_train)
        print(res1_val)

        u.vprint("Now assess the model to the text level", v)
        model.fit(df_X_train, df_y_train)
        dates_pred_train = model.predict(df_X_train)
        date_true_train = df_y_train.values.ravel()
        accuracy_train = compute_accuracy(date_true_train, dates_pred_train)
        dates_pred_test = model.predict(df_X_test)
        date_true_test = df_y_test.values.ravel()
        accuracy_test = compute_accuracy(date_true_test, dates_pred_test)
        res2_train = "Accuracy train = %.2f" % accuracy_train
        res2_test = "Accuracy test = %.2f" % accuracy_test
        print(res2_train)
        print(res2_test)

        line = "\n".join([res1_train, res1_val, res2_train, res2_test])
        record(conf, line, "assess_results")
def main_ml_multi_classes_prod(conf):
    """Fit the multi-class model on the full train set and predict the test set.

    The estimator is configured with ``set_estimator_params``, fitted on the
    whole training matrix, and used by ``predict_text_level`` on the
    preprocessed test data.

    Args:
        conf: Global configuration dictionary.

    Returns:
        The value returned by ``predict_text_level`` for the test dataset.
    """
    conf_model = conf["models"]["ml_multi_classes"]
    v = conf_model["verbose"]
    exp_dir = get_experiment_directory(conf_model)
    u.vprint("Conf model : ", v)
    u.vprint(conf_model, v)
    # The verbose flag was missing here, unlike the surrounding calls.
    u.vprint("Exp dir : %s" % str(exp_dir), v)

    # First we get the estimator fitted on full data
    u.vprint("Run main preprocessing on train data", v)
    X_train, y_train = main_preprocessing_mmc(conf, conf_model,
                                              dataset="train")
    u.vprint("Fit estimator", v)
    estimator = get_estimator(conf_model)
    estimator = set_estimator_params(estimator, conf_model, exp_dir)
    estimator.fit(X_train, y_train)
    u.vprint(estimator, v)

    # Now we preprocess test data. Only X is expected back here, which is
    # what main_preprocessing_mmc returns when there is no target column.
    u.vprint("Preprocessing full test data", v)
    X_test = main_preprocessing_mmc(conf, conf_model, dataset="test")

    # Make prediction
    u.vprint("Doing prediction", v)
    # The key spelling ("classifer") is the one the configuration uses.
    adjust_with_nanc_classifier = conf_model["adjust_with_nanc_classifer"]
    df_y_pred_test = predict_text_level(conf_model,
                                        estimator,
                                        X_test,
                                        adjust_with_nanc_classifier,
                                        conf,
                                        dataset="test")
    u.vprint("Final prediction : ", v)
    # Same fix: the prediction dump now follows the verbose flag too.
    u.vprint(df_y_pred_test, v)
    return df_y_pred_test
def main(script, *args, **kwargs):
    """Index the ``categories`` table into Elasticsearch.

    Builds a queue of database connectors (round robin over the configured
    connections), switches the index to bulk-friendly settings, runs the
    indexer in single-process or multi-threaded mode, then restores the
    interactive settings and refreshes the index.

    Args:
        script: Unused by this function.
        *args: Unused.
        **kwargs: Unused.
    """
    start_time = time.time()

    # Config options
    config = Config()

    source_table = 'categories'
    source_relationships = {
        'one_to_many': {
            'alternative_languages': {'foreign_key': 'category_id'},
            'external_pages': {'foreign_key': 'category_id'},
            'news_groups': {'foreign_key': 'category_id'},
        },
        'one_to_one': {},
        'many_to_one': {},
        'many_to_many': {},
        'self_referential': {
            'parent_category': {
                'foreign_key': 'parent_id',
                'backref': 'child_categories',
            },
        },
    }
    # Alternative source kept for reference: source_table = 'external_pages'
    # with a single 'many_to_one' relationship to 'categories' through the
    # 'category_id' foreign key and every other relationship kind empty.

    document_map = {
        'alternative_languages': 'alternative_languages',
        'categories': 'categories',
        'external_pages': 'external_pages',
        'news_groups': 'news_groups',
        'parent_category': 'parent_category',
        'related_categories': 'related_categories',
    }

    # Threads list
    threads = []

    # Db connector queue
    read_queue = Queue()

    # Db connections list
    db_connections = config.db_connections

    # Populate db connector queue (round robin)
    for _ in range(config.db_queue_size):
        db_connection = db_connections.pop(0)
        db_connector = DbConnector(db_connection).build(
            source_table, source_relationships)
        read_queue.put(db_connector)
        db_connections.append(db_connection)

    # Elasticsearch connector
    es_connector = ES(server=config.es_connections,
                      bulk_size=config.write_chunk_size)

    # Create index if necessary
    es_connector.indices.create_index_if_missing(config.es_index)

    # Define mapping
    # es_connector.cluster.put_mapping(config.es_type, {'properties':gralSettings['mapping']}, config.indexName)

    # Update index settings to improve indexing speed.
    #
    # Disable refresh interval
    # Improve indexing speed by augmenting the merge factor (uses more RAM).
    # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-update-settings.html#bulk
    # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules.html#index-modules-settings
    #
    # http://blog.sematext.com/2013/07/08/elasticsearch-refresh-interval-vs-indexing-performance/
    # http://www.elasticsearch.org/blog/update-settings/
    # https://github.com/aparo/pyes/blob/master/docs/guide/reference/api/admin-indices-update-settings.rst
    # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules-merge.html#log-byte-size
    vprint('Optimizing for bulk indexing...')
    es_connector.indices.update_settings(config.es_index, {
        'index.refresh_interval': '-1',
        'index.merge.policy.merge_factor': '30'
    })

    try:
        # NOTE(review): db_connector is the last connector built by the loop
        # above; it is undefined when config.db_queue_size is 0.
        indexer = Indexer(
            db_connector=db_connector,
            read_queue=read_queue,
            es_connector=es_connector,
            es_index=config.es_index,
            es_type=config.es_type,
            document_map=document_map,
            limit=config.limit)

        # Start indexing
        if config.single_process_mode:
            indexer.index(start_time, read_chunk_size=config.read_chunk_size)
        else:
            # Create new threads
            for _ in range(config.threads):
                thread = Thread(
                    indexer.index,
                    start_time,
                    read_chunk_size=config.read_chunk_size,
                    autostart=config.autostart_threads)
                threads.append(thread)

            # Starts threads, by calling run()
            if not config.autostart_threads:
                for thread in threads:
                    thread.start()

            # Wait for threads to terminate
            for thread in threads:
                thread.join()
    finally:
        # Always restore the interactive settings: leaving refresh disabled
        # after a failed run would keep the index from refreshing at all.
        vprint('Optimizing for interactive indexing...')
        es_connector.indices.update_settings(config.es_index, {
            'index.refresh_interval': '1s',
            'index.merge.policy.merge_factor': '10'
        })

    vprint('Refreshing index...')
    es_connector.indices.refresh()

    vprint('Elapsed: {:f}'.format(time.time() - start_time))
def preprocess_data(conf, dataset="train"):
    """Return the preprocessed dataframe of the rule-based classifier.

    When ``preprocessing`` is truthy in the model configuration the raw CSV
    is lower-cased, date-formatted, tokenized, normalized and recorded to
    ``<path>df_<dataset>_preprocessed.csv``; otherwise that recorded file is
    loaded and its ``txt`` column is evaluated back into Python objects.

    Args:
        conf: Global configuration dictionary.
        dataset: Dataset name used in file names.

    Returns:
        The preprocessed dataframe.
    """
    conf_model = conf["models"]["rule_base_classifier"]
    v = conf_model["verbose"]
    preprocessing = conf_model["preprocessing"]

    if not preprocessing:
        recorded_csv = Path(conf_model["path"] +
                            "df_%s_preprocessed.csv" % dataset)
        df = pd.read_csv(recorded_csv, sep=';', encoding="utf-8")
        # NOTE(review): eval() executes whatever the CSV cell contains; only
        # safe while this file is produced by this pipeline itself.
        df["txt"] = df["txt"].map(eval)
        return df

    raw_csv = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)
    vprint("Loading...", v)
    df = pd.read_csv(raw_csv, sep=';', encoding="utf-8")

    vprint("Lowercase text...", v)
    df["txt"] = df["txt"].str.lower()

    vprint("Format dates in text...", v)
    df["txt"] = df["txt"].map(format_date)

    vprint("Format date accident and consolidation...", v)
    for column in ("date_accident", "date_consolidation"):
        df[column] = df[column].map(lambda x: re.sub(r"[-.]", "", x))

    vprint("Tokenize words...", v)
    df["txt"] = df["txt"].map(nltk.word_tokenize)

    vprint("Normalize text...", v)
    df["txt"] = df["txt"].map(normalize)

    vprint("Record...", v)
    recorded_csv = Path(conf_model["path"] +
                        "df_%s_preprocessed.csv" % dataset)
    df.to_csv(recorded_csv, sep=';', encoding="utf-8", index=False)
    vprint("Preprocessing completed", v)
    return df
def main_simple_embedding_ml(conf):
    """Search, train and assess the simple-embedding model.

    Phases are switched by the ``search``, ``learn`` and ``assess`` keys of
    the ``simple_embedding_and_ml`` configuration. The parameters that
    maximize the f1 score are used. Assessment cross-validates the model's
    estimator and prints the mean train and test scores.

    Args:
        conf: Global configuration dictionary.
    """
    conf_model = conf["models"]["simple_embedding_and_ml"]
    learn = conf_model["learn"]
    do_search = conf_model["search"]
    assess = conf_model["assess"]
    v = conf_model["verbose"]
    date_target = conf_model["date_target"]

    X_train, y_train = preprocess_data(conf, dataset="train")

    # Get best params
    if do_search:
        u.vprint("Searching best param", v)
        final_res = search(conf, X_train, y_train)
        best_params = final_res["f1"]["best_params"]
        u.vprint("Best params are %s" % str(best_params), v)
    else:
        # search has already been done and we just read out best param
        # recorded previously
        u.vprint("Reading best params in model folder", v)
        path_model_params = Path(conf_model["path"] +
                                 "best_params_%s.json" % date_target)
        with open(path_model_params, "r") as f:
            final_res = json.load(f)
        best_params = final_res["f1"]["best_params"]

    # Learning (also needed before assessing, see the message below)
    if assess or learn:
        if assess:
            # The verbose flag was missing on this call.
            u.vprint(
                "Learning phase since this model cannot be pickled once trained",
                v)
        else:
            u.vprint("Learning phase", v)
        model = SimpleEmbeddingAndML()
        model.set_params(best_params)
        model.fit(X_train, y_train)

    # Assessing
    if assess:
        u.vprint("Assessing phase", v)
        metrics_eval = ["f1", "recall", "precision", "accuracy", "roc_auc"]
        cvr = cross_validate(model.estimator,
                             X_train,
                             y_train,
                             cv=5,
                             n_jobs=-1,
                             scoring=metrics_eval,
                             return_train_score=True)
        # Order in which the scores appear in the report line.
        report_order = ("f1", "recall", "precision", "roc_auc", "accuracy")
        for label, split in (("Train", "train"), ("Test", "test")):
            means = tuple(
                np.mean(cvr["%s_%s" % (split, metric)])
                for metric in report_order)
            # The accuracy spec used to be ".%2f", which printed a stray dot
            # followed by an unrounded float.
            print(
                "%s : f1 = %.2f (recall = %.2f, precision = %.2f, AUC = %.2f, accuracy = %.2f)"
                % ((label,) + means))
def search(conf, X, y):
    """Cross-validate every parameter combination of the configured grid.

    For each combination the mean train and validation scores (f1, recall,
    precision, accuracy, ROC AUC; 5 folds) are appended as one ``;``
    separated line to ``<path>full_search_results_<date_target>.csv``. The
    best combination for each validation metric is written to
    ``<path>best_params_<date_target>.json``.

    Args:
        conf: Global configuration; reads
            ``conf["models"]["simple_embedding_and_ml"]``.
        X: Feature matrix passed to ``cross_validate``.
        y: Target vector.

    Returns:
        Dict keyed by ``f1``, ``recall``, ``precision``, ``acc`` and ``auc``;
        each value holds the best ``metric`` and its ``best_params``.

    Raises:
        ValueError: If the grid yields no parameter combination.
    """
    model_name = "simple_embedding_and_ml"
    conf_model = conf["models"][model_name]
    grid_search = conf_model["grid_search"]
    date_target = conf_model["date_target"]
    v = conf_model["verbose"]
    path_model_cv_res = Path(conf_model["path"] +
                             "full_search_results_%s.csv" % date_target)

    flat_grid_search = u.flat_dictionary(grid_search, [], "")
    all_combi = u.get_all_combi_grid_search(dict(flat_grid_search))
    nb_tot_combi = len(all_combi)
    if nb_tot_combi == 0:
        raise ValueError("grid_search yields no parameter combination")

    metrics_eval = ["f1", "recall", "precision", "accuracy", "roc_auc"]
    tried_params = []
    val_scores = []

    # Eval each set of params
    for i, combi in enumerate(all_combi):
        start = time()
        u.vprint("%d / %d" % (i + 1, nb_tot_combi), v)
        model = SimpleEmbeddingAndML()
        params = u.restruct_dict(combi, {})
        model.set_params(params)
        cv_result = cross_validate(model.estimator,
                                   X,
                                   y,
                                   cv=5,
                                   n_jobs=-1,
                                   scoring=metrics_eval,
                                   return_train_score=True)
        val = [np.mean(cv_result["test_" + m]) for m in metrics_eval]
        train = [np.mean(cv_result["train_" + m]) for m in metrics_eval]
        tried_params.append(params)
        val_scores.append(val)

        # One line per combination: params, train scores, validation scores.
        with open(path_model_cv_res, "a") as f:
            f.write(";".join(str(r) for r in [params] + train + val) + "\n")
        # The verbose flag was missing on this call.
        u.vprint("Iteration time %.2f" % (time() - start), v)

    # Get best params: one winner per metric column (first one on ties).
    # Scores live in a float array; parameters stay in a plain list instead
    # of sharing an object-dtype array with the numbers.
    scores = np.array(val_scores, dtype=float)
    best_rows = scores.argmax(axis=0)
    final_res = {}
    for col, key in enumerate(("f1", "recall", "precision", "acc", "auc")):
        row = int(best_rows[col])
        final_res[key] = {
            "metric": float(scores[row, col]),
            "best_params": tried_params[row],
        }

    # Record
    path_model_params = Path(conf_model["path"] +
                             "best_params_%s.json" % date_target)
    with open(path_model_params, "w") as f:
        json.dump(final_res, f)
    return final_res
def create_dataframe_one_line_per_date_with_context(df, context_size, verbose=False):
    """Explode a "one line per text" dataframe into one line per date.

    ``df`` is a preprocessed dataframe with one row per text (``txt``,
    ``ID`` and optionally ``date_accident`` / ``date_consolidation``). Each
    output row describes one distinct date of one text: its context, mean
    position, mean relative position and number of occurrences.

    When both date columns are present a ``target`` column is added (1 for
    the accident date, 2 for the consolidation date, 0 otherwise) and dates
    found as the very first token are dropped. Without them every date is
    kept and no target is produced.

    Returns:
        The per-date dataframe.
    """
    u.vprint(
        "Creating dataframe in one line per date with context size of %d" %
        context_size, verbose)

    labelled = ("date_accident" in df.columns) and ("date_consolidation"
                                                    in df.columns)
    texts = df["txt"].values
    text_ids = df["ID"].values
    if labelled:
        accident_dates = df["date_accident"].values
        conso_dates = df["date_consolidation"].values

    rows = []
    for i, (txt, txt_id) in enumerate(zip(texts, text_ids)):
        index_dates, dates_in_txt = get_dates_from_token_list(txt)
        for d in set(dates_in_txt):
            left_context, right_context = get_context_date(
                context_size, d, txt)
            words = [
                word for side in (left_context, right_context)
                for chunk in side for word in chunk
            ]
            positions = index_dates[np.argwhere(dates_in_txt == d)].ravel()
            row = [
                txt_id, d, " ".join(words),
                np.mean(positions),
                np.mean(positions / len(txt)),
                len(positions)
            ]

            if not labelled:
                rows.append(row)
                continue

            # Reducing set with rules discovered with exploratory analysis:
            # the target date can't be the first word of the text.
            if 0 in positions:
                continue
            if d == accident_dates[i]:
                y_target = 1
            elif d == conso_dates[i]:
                y_target = 2
            else:
                y_target = 0
            rows.append(row + [y_target])

    columns = [
        "txt_id", "date", "context_date", "pos_moy", "part_moy", "nb_app"
    ]
    if labelled:
        columns = columns + ["target"]
    df_out = pd.DataFrame(rows, columns=columns)
    u.vprint("Dataframe completed.", verbose)
    return df_out
def preprocess_data_general(conf, dataset="train"):
    """Load and preprocess the raw dataframe for the simple-embedding model.

    The text is lower-cased, its dates are reformatted with ``format_date``,
    then it is tokenized with ``nltk.word_tokenize`` and normalized with
    ``normalize``. ``-`` and ``.`` are stripped from the ``date_accident``
    and ``date_consolidation`` columns. Nothing is written to disk.

    Args:
        conf: Global configuration dictionary.
        dataset: Dataset name used in the input file name.

    Returns:
        The preprocessed dataframe.
    """
    conf_model = conf["models"]["simple_embedding_and_ml"]
    v = conf_model["verbose"]
    raw_csv = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)

    u.vprint("Loading...", v)
    df = pd.read_csv(raw_csv, sep=';', encoding="utf-8")

    u.vprint("Lowercase text...", v)
    df["txt"] = df["txt"].str.lower()

    u.vprint("Format dates in text...", v)
    df["txt"] = df["txt"].map(format_date)

    u.vprint("Format date accident and consolidation...", v)
    for column in ("date_accident", "date_consolidation"):
        df[column] = df[column].map(lambda x: re.sub(r"[-.]", "", x))

    # (progress message, transformation) applied to the text in this order
    text_steps = (
        ("Tokenize words...", nltk.word_tokenize),
        ("Normalize text...", normalize),
    )
    for message, transform in text_steps:
        u.vprint(message, v)
        df["txt"] = df["txt"].map(transform)

    u.vprint("Preprocessing dataframe completed", v)
    return df