Esempio n. 1
0
def main_stacking_clf(conf):
    """Fit the stacked multi-class estimator and assess it.

    Reads the ``ml_multi_classes_ensembling`` entry of ``conf["models"]``,
    builds the learning matrix through ``mmc.main_preprocessing_mmc``, splits
    it with ``mmc.split_train_test``, fits the estimator returned by
    ``get_stacked_estimator`` and appends the report produced by
    ``mmc.main_assessing`` to the session summary.

    :param conf: full project configuration (dict-like).
    """
    conf_model = conf["models"]["ml_multi_classes_ensembling"]
    verbose = conf_model["verbose"]

    # Session header: banner, timestamp, raw model configuration, closing rule.
    summary = "".join([
        "\n******************************************* NEW SESSION *******************************************\n",
        str(datetime.now()) + "\n",
        str(conf_model) + "\n",
        "***************************************************************************************************\n",
    ])
    u.vprint(summary, verbose)

    # Shapes are printed unconditionally (plain print, not vprint).
    X, y = mmc.main_preprocessing_mmc(conf, conf_model, dataset="train")
    print("X shape : %s" % str(X.shape))
    print("y shape : %s" % str(y.shape))

    X_train, X_test, y_train, y_test = mmc.split_train_test(X, y)
    print("X_train shape : " + str(X_train.shape))
    print("X_test shape : %s" % str(X_test.shape))
    print("y_train shape : %s" % str(y_train.shape))
    print("y_test shape : %s" % str(y_test.shape))

    stacked_clf = get_stacked_estimator(conf_model)
    print("Classifier")
    print(stacked_clf)
    u.vprint("Fitting model", verbose)
    stacked_clf.fit(X_train, y_train)

    # Assessing
    assess_summary = mmc.main_assessing(
        conf, conf_model, stacked_clf, X_train, X_test, y_train, y_test)
    summary += "\n" + assess_summary + "\n"
Esempio n. 2
0
def main_preprocessing_mmc(conf, conf_model, dataset="train"):
    """Build the learning matrix (one row per date) for a dataset.

    Two stages, each either recomputed or reloaded from the model folder
    depending on ``conf_model["preprocessing_<dataset>"]``:

    * flag ``[0]``: text preprocessing of ``df_<dataset>.csv``;
    * flag ``[1]``: expansion to one line per date with its context.

    :param conf: full project configuration; ``conf["paths"]["dataprep"]`` is
        read when the text preprocessing stage runs.
    :param conf_model: model configuration (``context_size``, ``verbose``,
        ``path`` and the ``preprocessing_<dataset>`` flags are read).
    :param dataset: dataset name used in file names and in the flag key.
    :return: ``(X, y)`` when the frame has a ``target`` column, otherwise
        ``X`` alone.
    """
    context_size = conf_model["context_size"]
    v = conf_model["verbose"]
    stage_flags = conf_model["preprocessing_%s" % dataset]
    numeric_cols = ["pos_moy", "part_moy", "nb_app"]

    # Both cache locations are Path objects so that record_file/load_file
    # always receive the same type.
    path_model_df_prep = Path(conf_model["path"] + "df_%s_preprocessed.csv" % dataset)
    path_model_df_learn = Path(
        conf_model["path"] + "df_%s_learning_cs%d.csv" % (dataset, context_size))

    if stage_flags[0]:
        path_dataprep = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)
        df = u.load_file(path_dataprep)
        df = preprocess_text_data(df, verbose=v)
        u.record_file(df, path_model_df_prep)
    else:
        df = u.load_file(path_model_df_prep)
        # NOTE(review): eval() executes whatever the CSV cell contains. This
        # is only acceptable because the file is written by this pipeline;
        # consider ast.literal_eval if the cells are plain Python literals.
        df["txt"] = df["txt"].map(eval)

    # Building learning matrix
    if stage_flags[1]:
        df = create_dataframe_one_line_per_date_with_context(df, context_size, verbose=v)
        u.vprint("Recording data", v)
        u.record_file(df, path_model_df_learn)
    else:
        u.vprint("Loading data in one line per date", v)
        df = u.load_file(path_model_df_learn)

    df[numeric_cols] = df[numeric_cols].astype(float)
    X = df[["txt_id", "date", "context_date"] + numeric_cols].values
    u.vprint("X shape : %s" % str(X.shape), v)

    # Frames without a target column yield features only.
    if "target" not in df.columns:
        return X

    y = df["target"].astype(int).values
    u.vprint("y shape : %s" % str(y.shape), v)
    return X, y
Esempio n. 3
0
def preprocess_data(conf, dataset="train"):
    """Return the ``(X, y)`` matrices of the simple-embedding model.

    When ``conf_model["preprocessing_<dataset>"]`` is truthy the matrices are
    rebuilt with ``preprocess_data_general`` + ``creating_matrix`` and pickled
    in the model folder; otherwise the previously pickled matrices are loaded.

    :param conf: full project configuration.
    :param dataset: dataset name used in the flag key and in file names.
    :return: tuple ``(X, y)``.
    """
    conf_model = conf["models"]["simple_embedding_and_ml"]
    preprocessing = conf_model["preprocessing_%s" % dataset]
    verbose = conf_model["verbose"]
    context_size = conf_model["context_size"]

    pickle_x = Path(
        conf_model["path"] + "X_%s_preprocessed_cs%d.pkl" % (dataset, context_size))
    pickle_y = Path(
        conf_model["path"] + "y_%s_preprocessed_cs%d.pkl" % (dataset, context_size))

    if not preprocessing:
        # Reuse the matrices pickled by an earlier run.
        u.vprint("Loading already prepared data...", verbose)
        with open(pickle_x, "rb") as handle:
            X = pickle.load(handle)
        with open(pickle_y, "rb") as handle:
            y = pickle.load(handle)
        return X, y

    X, y = creating_matrix(conf, preprocess_data_general(conf, dataset=dataset))
    u.vprint("Record...", verbose)
    with open(pickle_x, "wb") as handle:
        pickle.dump(X, handle)
    with open(pickle_y, "wb") as handle:
        pickle.dump(y, handle)
    return X, y
Esempio n. 4
0
def preprocess_text_data(df, verbose=True):
    """Tokenize the ``txt`` column and strip separators from date columns.

    The frame is modified in place and also returned.

    :param df: frame with a ``txt`` column and, optionally,
        ``date_accident`` / ``date_consolidation`` columns.
    :param verbose: forwarded to ``u.vprint``.
    :return: the same frame.
    """
    u.vprint("Formating text", verbose)
    df["txt"] = df["txt"].map(convert_text_to_word_tokens)

    # Remove '-' and '.' from whichever date columns are present.
    for date_col in ("date_accident", "date_consolidation"):
        if date_col in df.columns:
            df[date_col] = df[date_col].map(
                lambda raw: re.sub(r"[-.]", "", raw))
    return df
Esempio n. 5
0
def main_na_nc_classifier(conf):
    """Run the NA/NC classifier pipeline: prepare, tune, fit and assess.

    Every stage is driven by the ``na_nc_classifier`` entry of
    ``conf["models"]`` (``preprocessing``, ``search_best_params`` and
    ``learning`` flags). Results are printed; nothing is returned.

    :param conf: full project configuration.
    """
    conf_model = conf["models"]["na_nc_classifier"]
    verbose = conf_model["verbose"]

    # Preprocessing: rebuild the prepared frame or reload the cached one.
    prepared_csv = Path(conf_model["path"] + "df_train_preprocessed.csv")
    if not conf_model["preprocessing"]:
        df_train = u.load_file(prepared_csv)
    else:
        raw_csv = Path(conf["paths"]["dataprep"] + "df_train.csv")
        df_train = u.load_file(raw_csv)
        df_train = preprocessing_data(df_train, verbose=verbose)
        u.record_file(df_train, prepared_csv)
    X = df_train["txt"].values
    y = df_train["date_consolidation"].values

    # Split train, test (fixed seed keeps the split reproducible).
    u.vprint("Splitting data in train and test", verbose)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Estimator
    u.vprint("Initializing estimator", verbose)
    estimator = get_estimator(conf_model)

    # Optional hyper-parameter search; the result is stored in the experiment folder.
    exp_dir = get_experiment_directory(conf_model)
    if conf_model["search_best_params"]:
        u.vprint("Performing best params search", verbose)
        found_params = search_best_params(
            conf_model, estimator, X_train, y_train)
        u.record_file(found_params, exp_dir / "best_params.json")

    estimator = set_estimator_params(estimator, conf_model, exp_dir)

    # Learning: fit and persist only when requested.
    fitted_model_path = exp_dir / "fitted_model.pkl"
    if conf_model["learning"]:
        estimator.fit(X_train, y_train)
        u.record_file(estimator, fitted_model_path)

    # Assessing
    res1_train, res1_val = cross_validate_model(
        conf_model, estimator, X_train, y_train)
    u.vprint("Cross validation results : ", verbose)
    for cv_line in (res1_train, res1_val):
        print(cv_line)
    res_test = eval_model(estimator, X_train, y_train, X_test, y_test)
    u.vprint("Test results : ", verbose)
    print(res_test)
Esempio n. 6
0
def main_rule_base_classifier(conf):
    """Search, fit and assess the rule-base classifier as configured.

    The ``rule_base_classifier`` entry of ``conf["models"]`` supplies the
    ``search`` / ``learn`` / ``assess`` switches, the target date column and
    the model folder. Accuracies are printed; nothing is returned.

    :param conf: full project configuration.
    """
    conf_model = conf["models"]["rule_base_classifier"]
    date_target = conf_model["date_target"]
    do_learn = conf_model["learn"]
    do_search = conf_model["search"]
    do_assess = conf_model["assess"]
    verbose = conf_model["verbose"]
    model_dir = conf_model["path"]
    pickle_path = Path(model_dir + "model.pkl")

    df_train = preprocess_data(conf, dataset="train")
    X_train, y_train = split_x_y(
        conf, date=date_target, df=df_train, dataset="train")

    # Best params: either read back from the model folder or searched now.
    if not do_search:
        vprint("Reading best params in model folder", verbose)
        params_path = Path(conf_model["path"] + "best_params.json")
        with open(params_path, "r") as handle:
            best_params = json.load(handle)
    else:
        vprint("Searching best param", verbose)
        best_params = search(conf, X_train, y_train)
        vprint("Best params are %s" % str(best_params), verbose)

    # Learning: fit a fresh classifier and pickle it.
    if do_learn:
        vprint("Learning phase", verbose)
        model = RuleBaseClassifier()
        model.set_params(best_params)
        model.fit(X_train, y_train)
        with open(pickle_path, "wb") as handle:
            pickle.dump(model, handle)

    if not do_assess:
        return

    # Assessing: always evaluates the pickled model, not the in-memory one.
    vprint("Assessing phase", verbose)
    df_test = preprocess_data(conf, dataset="test")
    X_test, y_test = split_x_y(
        conf, date=date_target, df=df_test, dataset="test")
    with open(pickle_path, "rb") as handle:
        model = pickle.load(handle)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    accuracy_train = compute_accuracy(y_train, y_train_pred)
    accuracy_test = compute_accuracy(y_test, y_test_pred)
    print("Accuracy train = %.2f" % accuracy_train)
    print("Accuracy test = %.2f" % accuracy_test)
Esempio n. 7
0
def set_estimator_params(estimator, conf_model, exp_dir):
    """Configure ``estimator`` from the conf file, a saved search, or defaults.

    Priority order:

    1. ``conf_model["params"]`` when it is non-empty and no search was
       requested (``search_best_params`` falsy);
    2. ``best_params.json`` in ``exp_dir`` when that file exists;
    3. otherwise the estimator keeps its current parameters.

    :param estimator: object exposing ``set_params`` / ``get_params``.
    :param conf_model: model configuration (``verbose``, ``params`` and,
        when ``params`` is non-empty, ``search_best_params`` are read).
    :param exp_dir: experiment directory; must support ``/`` path joining.
    :return: the same estimator instance.
    """
    verbose = conf_model["verbose"]
    path_best_params = exp_dir / "best_params.json"
    conf_params = conf_model["params"]

    # Truthiness instead of len(): an empty dict and a null entry in the
    # configuration both mean "no explicit parameters".
    if conf_params and not conf_model["search_best_params"]:
        u.vprint("Using param from conf file : \n %s" % (str(conf_params)), verbose)
        estimator.set_params(**conf_params)
    elif path_best_params.exists():
        u.vprint("Loading best params", verbose)
        best_params = u.load_file(path_best_params)
        u.vprint(str(best_params), verbose)
        estimator.set_params(**best_params)
    else:
        # Default parameters
        u.vprint("Using default params", verbose)
        u.vprint(str(estimator.get_params()), verbose)
    return estimator
Esempio n. 8
0
def main_ml_multi_classes_search_best_model(conf):
    """Tune, fit and assess the multi-class model, then log a session summary.

    The summary (banner, timestamp, configuration, estimator parameters and
    assessment report) is appended to ``summary_results.txt`` in the
    experiment directory.

    :param conf: full project configuration.
    """
    conf_model = conf["models"]["ml_multi_classes"]
    verbose = conf_model["verbose"]

    # Pieces of the session summary, joined once at the end.
    summary_parts = [
        "\n******************************************* NEW SESSION *******************************************\n",
        str(datetime.now()) + "\n",
        str(conf_model) + "\n",
    ]

    # Preprocessing
    X, y = main_preprocessing_mmc(conf, conf_model, dataset="train")

    # Split train, test
    u.vprint("Splitting data in train and test", verbose)
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Estimator
    u.vprint("Initializing estimator", verbose)
    estimator = get_estimator(conf_model)

    # Optional search; its result is stored where set_estimator_params looks for it.
    exp_dir = get_experiment_directory(conf_model)
    if conf_model["search_best_params"]:
        u.vprint("Performing best params search", verbose)
        tuned = search_best_params(conf_model, estimator, X_train, y_train)
        u.record_file(tuned, exp_dir / "best_params.json")

    estimator = set_estimator_params(estimator, conf_model, exp_dir)
    summary_parts.append(str(estimator.get_params()) + "\n")

    # Learning
    u.vprint("Learning phase", verbose)
    estimator.fit(X_train, y_train)

    # Assessing
    u.vprint("Assessing phase", verbose)
    summary_parts.append(
        main_assessing(conf, conf_model, estimator, X_train, X_test, y_train, y_test))

    u.record_file("".join(summary_parts), exp_dir / "summary_results.txt", mode="a")
Esempio n. 9
0
def search(conf, X, y):
    """Grid-search the rule-base classifier and persist the best parameters.

    Every combination returned by ``get_all_combi_grid_search`` is fitted on
    ``(X, y)`` and scored with ``compute_accuracy`` on that same data (there
    is no held-out set here). The first combination reaching the highest
    accuracy wins and is written to ``best_params.json`` in the model folder.

    :param conf: full project configuration.
    :param X: training inputs passed to ``fit`` / ``predict``.
    :param y: training targets.
    :return: the best parameter combination, or ``None`` when the grid is
        empty.
    """
    conf_model = conf["models"]["rule_base_classifier"]
    grid_search = conf_model["grid_search"]
    v = conf_model["verbose"]

    all_combi = get_all_combi_grid_search(grid_search)
    nb_combi = len(all_combi)

    rbclf = RuleBaseClassifier()
    best_params = None
    best_metric = None
    for rank, params in enumerate(all_combi, start=1):
        vprint("%d / %d" % (rank, nb_combi), v)
        vprint("\t%s" % (str(params)), v)
        rbclf.set_params(params)
        rbclf.fit(X, y)
        accuracy = compute_accuracy(y, rbclf.predict(X))
        # Strict '<' keeps the earliest combination on ties.
        if best_metric is None or best_metric < accuracy:
            best_metric = accuracy
            best_params = params
        # The verbosity flag is passed here like on every other message.
        vprint("\tAccuracy = %.2f" % accuracy, v)

    path_model_params = Path(conf_model["path"] + "best_params.json")
    with open(path_model_params, "w") as f:
        json.dump(best_params, f)
    return best_params
Esempio n. 10
0
def main_assessing(conf, conf_model, estimator, X_train, X_test, y_train, y_test):
    """Run the enabled assessment steps and return their textual report.

    ``conf_model["assessing"]`` is a sequence of three switches:
    ``[0]`` cross validation, ``[1]`` evaluation on test data,
    ``[2]`` evaluation at text level. Each enabled step prints its results
    and contributes to the returned string.

    :return: concatenated report (empty string when every step is disabled).
    """
    verbose = conf_model["verbose"]
    steps = conf_model["assessing"]
    chunks = []

    if steps[0]:
        u.vprint("Cross validation", verbose)
        # NOTE(review): the validation string is unpacked first here; confirm
        # this matches the return order of cross_validate_model.
        res1_val, res1_train, cvr = cross_validate_model(
            conf_model, estimator, X_train, y_train)
        print(res1_train)
        print(res1_val)
        chunks.append(res1_train + "\n" + res1_val)

    if steps[1]:
        u.vprint("Assessing model on test data at 'date level'", verbose)
        res_test, report, df0, df1, df2 = eval_model(estimator, X_test, y_test)
        chunks.append(
            "\n" + res_test
            + "\nClassification report\n" + report
            + "\nConfusion matrix\tClasse 0\n\t" + str(df0)
            + "\t Classe 1\n\t" + str(df1)
            + "\t Classe 2\n\t" + str(df2))
        print(res_test)
        print("\nClassification report")
        print(report)
        print("\nConfusion matrix")
        # One confusion matrix per class, each under its own heading.
        for heading, matrix in (("\t Classe 0", df0),
                                ("\t Classe 1", df1),
                                ("\t Classe 2", df2)):
            print(heading)
            print(matrix)

    if steps[2]:
        u.vprint("Assessing model on text level", verbose)
        res3 = eval_model_text_level(conf_model, estimator, X_train, X_test, conf)
        print(res3)
        chunks.append("\n" + res3 + "\n")

    return "".join(chunks)
Esempio n. 11
0
def _ensure_dir(path, mkdir):
    """Return 'existing', 'created' or 'missing' for ``path``, creating it when ``mkdir`` is true."""
    if os.path.exists(path):
        return "existing"
    if not mkdir:
        return "missing"
    os.makedirs(path, exist_ok=True)
    return "created"


def _make_all_dirs(log_path, log_subpath, mkdir=True, verbose=False):
    """Create the log directory and its sub-directories, reporting each one.

    A directory that is absent while ``mkdir`` is false is reported as
    missing instead of being announced as an existing directory.

    :param log_path: root output directory.
    :param log_subpath: mapping ``name -> sub-directory path``.
    :param mkdir: when false, nothing is created.
    :param verbose: forwarded as first argument to ``vprint``.
    """
    # create output dir
    root_state = _ensure_dir(log_path, mkdir)
    if root_state == "created":
        vprint(verbose, f'Dir created: \n{log_path}')
    elif root_state == "existing":
        vprint(verbose, f'Use exisiting dir: \n{log_path}')
    else:
        vprint(verbose, f'Dir missing and not created: \n{log_path}')

    for k, log_subpath_ in log_subpath.items():
        sub_state = _ensure_dir(log_subpath_, mkdir)
        if sub_state == "created":
            vprint(verbose, f'- sub dir: {k}')
        elif sub_state == "existing":
            vprint(verbose, f'- use exisiting sub dir: {k}')
        else:
            vprint(verbose, f'- sub dir missing and not created: {k}')
Esempio n. 12
0
def creating_matrix(conf, df):
    """Build the ``(X, y)`` learning matrices, one row per distinct date of a text.

    For each text and each distinct date found in it, a row holds the joined
    left/right context tokens, the mean position of the date, the mean
    relative position and the number of appearances. The label is 1 when the
    date equals the text's target date, else 0. Dates that appear at
    position 0 are discarded.

    :param conf: full project configuration
        (``conf["models"]["simple_embedding_and_ml"]`` is read).
    :param df: frame with a ``txt`` column (token sequences) and the column
        named by ``date_target``.
    :return: tuple of numpy arrays ``(X, y)``.
    """
    model_name = "simple_embedding_and_ml"
    conf_model = conf["models"][model_name]
    v = conf_model["verbose"]
    target_column = conf_model["date_target"]
    context_size = conf_model["context_size"]
    u.vprint("Creating numpy matrix", v)
    X = []
    y = []
    all_txt = df["txt"].values
    # .values gives positional access, so targets stay aligned with all_txt
    # even when the frame's index is not 0..n-1 (e.g. after a split/filter).
    all_target_dates = df[target_column].values
    for txt, target_date in zip(all_txt, all_target_dates):
        index_dates, dates_in_txt = get_dates_from_token_list(txt)
        for d in set(dates_in_txt):
            left_context, right_context = get_context_date(
                context_size, d, txt)
            # Flatten the context chunks into a single token list.
            tokens = [tok for chunk in left_context for tok in chunk]
            tokens.extend(tok for chunk in right_context for tok in chunk)
            s = " ".join(tokens)
            positions = index_dates[np.argwhere(dates_in_txt == d)].ravel()
            # Reducing set with rules discovered with exploratory analysis:
            # the target date can't be the first word of the text.
            if 0 in positions:
                continue
            positions_mean = np.mean(positions)
            part_of_txt_mean = np.mean(positions / len(txt))
            nb_appearances = len(positions)
            X.append([s, positions_mean, part_of_txt_mean, nb_appearances])
            y.append(1 if d == target_date else 0)
    X = np.array(X)
    y = np.array(y)
    return X, y
Esempio n. 13
0
	def fill_buffer(self, model, limit=None):
		"""Count the rows to index and store the id query result in the buffer.

		Sets ``self.index_total`` to the row count of ``model`` (capped at
		``limit`` when a limit other than ``None``/``0`` is given) and
		``self.index_buffer`` to ``query.values(model.id)``.

		Background reading:

		http://stackoverflow.com/questions/7389759/memory-efficient-built-in-sqlalchemy-iterator-generator
		http://www.sqlalchemy.org/trac/wiki/UsageRecipes/WindowedRangeQuery

		http://stackoverflow.com/questions/1078383/sqlalchemy-difference-between-query-and-query-all-in-for-loops
		http://www.mail-archive.com/[email protected]/msg12443.html
		http://stackoverflow.com/questions/1145905/scanning-huge-tables-with-sqlalchemy-using-the-orm
		"""
		where = True
		# ``None`` and ``0`` both mean "no limit".
		capped = limit not in (None, 0)

		row_count = model.session.query(sqlalchemy.func.count(model.id)).filter(where).scalar()
		if capped:
			self.index_total = min(row_count, limit)
		else:
			self.index_total = row_count
		vprint('Number of elements to index: ' + str(self.index_total))

		vprint('Populating the buffer...')

		id_query = model.filter(where)
		if capped:
			id_query = id_query.limit(limit)
		self.index_buffer = id_query.values(model.id)

		vprint('Buffer populated.')
Esempio n. 14
0
    ## Validate model every so often
    # Runs whenever niter is a multiple of VALFREQ (this includes niter == 0).
    if niter % VALFREQ == 0:
        ut.mprint("Validating model")
        # Number of validation iterations (floor division).
        # NOTE(review): presumably BSZ is the batch size and ndata the sample count — confirm.
        val_iter = vset.ndata // BSZ
        # Start from an empty loss list and reset the validation set's counter.
        vloss, vset.niter = [], 0
        # NOTE(review): looks like this fetches the first validation batch before the loop — confirm.
        sess.run(vset.fetchOp,feed_dict=vset.fdict())
        for its in range(val_iter):
            # NOTE(review): swpV presumably swaps the fetched batch in — confirm.
            sess.run(swpV)
            # Evaluate the loss values and run the next fetch, with is_training off.
            outs = sess.run(
                lvals+[vset.fetchOp],
                feed_dict={**vset.fdict(), is_training: False}
            )
            # Drop the last output (the fetchOp result); keep the loss values only.
            vloss.append(np.array(outs[:-1]))
        # Average every loss value over the validation iterations.
        vloss = np.mean(np.stack(vloss, axis=0), axis=0)
        ut.vprint(niter, vnms, vloss.tolist())

    ## Run training step and print losses
    sess.run(swpT)
    # Every 100 iterations all loss values are evaluated and printed;
    # otherwise only loss and psnr are requested alongside the train step.
    if niter % 100 == 0:
        outs = sess.run(
            lvals+[tStep, tset.fetchOp],
            feed_dict={**tset.fdict(), lr: get_lr(niter), is_training: True}
        )
        # The last two outputs belong to tStep and fetchOp, not to the losses.
        ut.vprint(niter, tnms, outs[:-2])
        ut.vprint(niter, ['lr'], [get_lr(niter)])
    else:
        # The outputs of this call are not read within the lines shown here.
        outs = sess.run(
            [loss, psnr, tStep, tset.fetchOp],
            feed_dict={**tset.fdict(), lr: get_lr(niter), is_training: True}
        )
Esempio n. 15
0
	def print_info(self):
		"""Print the process mode and the index/queue/chunk settings via ``vprint``."""
		# Utility functions
		from utils.utils import vprint

		# The thread count is only reported in multi-threaded mode.
		if self.single_process_mode:
			vprint('Process mode: {:s}'.format('Single process'))
		else:
			vprint('Process mode: {:s}'.format('Multi threaded'))
			vprint('Threads: {:d}'.format(self.threads))

		# Attributes are read one at a time, right before each line is printed.
		for template, attr_name in (
			('Index: {:s}', 'es_index'),
			('Type: {:s}', 'es_type'),
			('DB Queue size: {:d}', 'db_queue_size'),
			('Read chunk size: {:d}', 'read_chunk_size'),
			('Write chunk size: {:d}', 'write_chunk_size'),
		):
			vprint(template.format(getattr(self, attr_name)))
Esempio n. 16
0
def preprocess_data(conf, dataset="train"):
    """Preprocess the raw ``df_<dataset>.csv`` for the ML model and save it.

    Steps: lowercase the text, format the dates found in it, strip ``-`` and
    ``.`` from the accident/consolidation dates, tokenize and normalize the
    text. The result is written to
    ``df_<dataset>_preprocessed_cs<context_size>.csv`` in the model folder.

    :param conf: full project configuration.
    :param dataset: dataset name used in the file names.
    :return: the preprocessed frame.
    """
    model_name = "ml_model"
    conf_model = conf["models"][model_name]
    context_size = conf_model["context_size"]
    v = conf_model["verbose"]
    path_dataprep = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)
    u.vprint("Loading...", v)
    df = pd.read_csv(path_dataprep, sep=';', encoding="utf-8")
    u.vprint("Lowercase text...", v)
    df["txt"] = df["txt"].str.lower()
    u.vprint("Format dates in text...", v)
    df["txt"] = df["txt"].map(format_date)
    u.vprint("Format date accident and consolidation...", v)
    for date_col in ("date_accident", "date_consolidation"):
        df[date_col] = df[date_col].map(lambda x: re.sub(r"[-.]", "", x))
    u.vprint("Tokenize words...", v)
    df["txt"] = df["txt"].map(nltk.word_tokenize)
    u.vprint("Normalize text...", v)
    df["txt"] = df["txt"].map(normalize)
    # Honour the verbosity flag like every other progress message here.
    u.vprint("Record the dataframe", v)
    path_model_df_prep = Path(conf_model["path"] +
                              "df_%s_preprocessed_cs%d.csv" %
                              (dataset, context_size))
    df.to_csv(path_model_df_prep, sep=';', encoding="utf-8", index=False)
    u.vprint("Preprocessing dataframe completed", v)
    return df
Esempio n. 17
0
def _format_cv_scores(label, cvr, prefix, metrics):
    """Render the mean of each cross-validated metric as ``label : m = 0.00, ...``."""
    rendered = ", ".join(
        "%s = %.2f" % (metric, np.mean(cvr["%s_%s" % (prefix, metric)]))
        for metric in metrics)
    return "%s : %s" % (label, rendered)


def main_ml(conf):
    """Run the ML model pipeline: preprocess, search, learn and assess.

    Every stage is switched by the ``ml_model`` entry of ``conf["models"]``
    (``preprocessing_train``, ``search``, ``learn``, ``assess``). Assessment
    results are printed and recorded through ``record``.

    :param conf: full project configuration.
    """
    conf_model = conf["models"]["ml_model"]
    ml_model = conf_model["ml_model"]
    learn = conf_model["learn"]
    text_preprocessing = conf_model["text_preprocessing"]
    path_fasttext_model = conf_model["path_fasttext_model"]
    doc2vec_strategy = conf_model["doc2vec_strategy"]
    do_search = conf_model["search"]
    context_size = conf_model["context_size"]
    assess = conf_model["assess"]
    v = conf_model["verbose"]
    date_target = conf_model["date_target"]
    preprocessing_train = conf_model["preprocessing_train"]

    dir_exp = get_experiment_directory(conf_model)

    # Get the estimator
    model = MLModel(ml_model=ml_model,
                    text_preprocessing=text_preprocessing,
                    context_size=context_size,
                    path_fasttext_model=path_fasttext_model,
                    doc2vec_strategy=doc2vec_strategy)

    # Preprocess data
    if preprocessing_train:
        df_train = preprocess_data(conf, dataset="train")
    else:
        path_model_df_prep = Path(conf_model["path"] +
                                  "df_train_preprocessed_cs%d.csv" %
                                  (context_size))
        df_train = pd.read_csv(path_model_df_prep, sep=';', encoding="utf-8")
        # NOTE(review): eval() runs whatever the CSV cell contains; acceptable
        # only because the file is produced by this pipeline.
        df_train["txt"] = df_train["txt"].map(eval)

    # Splitting train and test
    df_train, df_test = train_test_split(df_train)

    # Splitting X and y
    df_X_train = df_train[["ID", "txt"]]
    df_y_train = df_train[date_target]
    df_X_test = df_test[["ID", "txt"]]
    df_y_test = df_test[date_target]

    # Get best params
    if do_search:
        u.vprint("Searching best param", v)
        X_train, y_train, ids_and_dates = model._creating_matrix(
            df_X_train, df_y_train)
        best_params = search(conf, model.estimator, X_train, y_train)
        u.vprint("Best params are %s" % str(best_params), v)
        u.vprint("Record best params", v)
        record(conf, best_params, "best_params")
    else:  # search has already been done and we just read out best param recorded previously
        u.vprint("Reading best params in model folder", v)
        path_file_best_param = dir_exp / Path("best_params.json")
        with open(path_file_best_param, "r") as f:
            best_params = json.load(f)
        u.vprint("Best params are %s" % str(best_params), v)

    # Learning
    if learn:
        u.vprint("Learning", v)
        model.estimator.set_params(**best_params)
        model.fit(df_X_train, df_y_train)

    # Assessing
    if assess:
        u.vprint("Assessing phase", v)
        model.estimator.set_params(**best_params)

        u.vprint("First cross validate the ML estimator", v)
        metrics_eval = [
            "f1_micro", "f1_macro", "precision_micro", "precision_macro",
            "recall_micro", "recall_macro"
        ]
        X_train, y_train, ids_and_dates = model._creating_matrix(
            df_X_train, df_y_train)
        cvr = cross_validate(model.estimator,
                             X_train,
                             y_train,
                             cv=2,
                             n_jobs=-1,
                             scoring=metrics_eval,
                             return_train_score=True,
                             verbose=0)
        # With a list of scorers, cross_validate names its result keys
        # "train_<scorer>" / "test_<scorer>". The report is therefore built
        # from metrics_eval itself rather than from hard-coded key names
        # that are absent from the result.
        res1_train = _format_cv_scores("Train", cvr, "train", metrics_eval)
        res1_val = _format_cv_scores("Validation", cvr, "test", metrics_eval)
        print(res1_train)
        print(res1_val)

        u.vprint("Now assess the model to the text level", v)
        model.fit(df_X_train, df_y_train)
        dates_pred_train = model.predict(df_X_train)
        date_true_train = df_y_train.values.ravel()
        accuracy_train = compute_accuracy(date_true_train, dates_pred_train)

        dates_pred_test = model.predict(df_X_test)
        date_true_test = df_y_test.values.ravel()
        accuracy_test = compute_accuracy(date_true_test, dates_pred_test)

        res2_train = "Accuracy train = %.2f" % accuracy_train
        res2_test = "Accuracy test = %.2f" % accuracy_test

        print(res2_train)
        print(res2_test)

        line = "\n".join([res1_train, res1_val, res2_train, res2_test])
        record(conf, line, "assess_results")
Esempio n. 18
0
def main_ml_multi_classes_prod(conf):
    """Fit the multi-class estimator on the train set and predict the test set.

    The estimator is configured by ``set_estimator_params`` from the
    experiment directory, fitted on the full training matrix, then applied
    to the preprocessed test data through ``predict_text_level``.

    :param conf: full project configuration.
    :return: the text-level predictions returned by ``predict_text_level``.
    """
    conf_model = conf["models"]["ml_multi_classes"]
    v = conf_model["verbose"]
    exp_dir = get_experiment_directory(conf_model)
    u.vprint("Conf model : ", v)
    u.vprint(conf_model, v)
    # Gated by the verbosity flag like the surrounding messages.
    u.vprint("Exp dir : %s" % str(exp_dir), v)

    # First we get the estimator fitted on full data
    u.vprint("Run main preprocessing on train data", v)
    X_train, y_train = main_preprocessing_mmc(conf, conf_model, dataset="train")
    u.vprint("Fit estimator", v)
    estimator = get_estimator(conf_model)
    estimator = set_estimator_params(estimator, conf_model, exp_dir)
    estimator.fit(X_train, y_train)
    u.vprint(estimator, v)

    # Now we preprocess test data :
    u.vprint("Preprocessing full test data", v)
    X_test = main_preprocessing_mmc(conf, conf_model, dataset="test")

    # Make prediction :
    u.vprint("Doing prediction", v)
    # The key spelling ("classifer") is the one used by the configuration.
    adjust_with_nanc_classifier = conf_model["adjust_with_nanc_classifer"]
    df_y_pred_test = predict_text_level(conf_model, estimator, X_test, adjust_with_nanc_classifier, conf,
                                        dataset="test")
    u.vprint("Final prediction : ", v)
    u.vprint(df_y_pred_test, v)
    return df_y_pred_test
Esempio n. 19
0
def main(script, *args, **kwargs):
	"""Index the ``categories`` table and its relationships into Elasticsearch.

	Builds a queue of database connectors, switches the index to
	bulk-friendly settings, runs the indexer (in-process or in threads
	depending on the configuration) and restores the interactive settings
	afterwards, even when indexing fails.

	:param script: unused; kept for the caller's calling convention.
	"""
	start_time = time.time()

	# Config options
	config = Config()

	source_table = 'categories'
	source_relationships = {
		'one_to_many': {
			'alternative_languages': {
				'foreign_key': 'category_id'
			},
			'external_pages': {
				'foreign_key': 'category_id'
			},
			'news_groups': {
				'foreign_key': 'category_id'
			}
		},
		'one_to_one': {
		},
		'many_to_one': {
		},
		'many_to_many': {
		},
		'self_referential': {
			'parent_category': {
				'foreign_key': 'parent_id',
				'backref': 'child_categories'
			},
		}
	}
	# Alternative source configuration (external pages):
	# source_table = 'external_pages'
	# source_relationships = {
	# 	'one_to_many': {
	# 	},
	# 	'many_to_one': {
	# 		'categories': {
	# 			'foreign_key': 'category_id'
	# 		}
	# 	},
	# 	'one_to_one': {
	# 	},
	# 	'many_to_many': {
	# 	},
	# 	'self_referential': {
	# 	}
	# }
	document_map = {
		'alternative_languages': 'alternative_languages',
		'categories': 'categories',
		'external_pages': 'external_pages',
		'news_groups': 'news_groups',
		'parent_category': 'parent_category',
		'related_categories': 'related_categories',
	}

	# Threads list
	threads = []

	# Db connector queue
	read_queue = Queue()

	# Db connections list
	db_connections = config.db_connections

	# Populate db connector queue (round robin): the connection taken from the
	# front of the list is put back at the end, so the list is rotated in place.
	for _ in range(config.db_queue_size):
		db_connection = db_connections.pop(0)
		db_connector = DbConnector(db_connection).build(source_table, source_relationships)
		read_queue.put(db_connector)
		db_connections.append(db_connection)

	# Elasticsearch connector
	es_connector = ES(server=config.es_connections, bulk_size=config.write_chunk_size)

	# Create index if necessary
	es_connector.indices.create_index_if_missing(config.es_index)

	# Define mapping
	# es_connector.cluster.put_mapping(config.es_type, {'properties':gralSettings['mapping']}, config.indexName)

	# Update index settings to improve indexing speed.
	#
	# Disable refresh interval
	# Improve indexing speed by augmenting the merge factor (uses more RAM).
	# http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-update-settings.html#bulk
	# http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules.html#index-modules-settings
	#
	# http://blog.sematext.com/2013/07/08/elasticsearch-refresh-interval-vs-indexing-performance/
	# http://www.elasticsearch.org/blog/update-settings/
	# https://github.com/aparo/pyes/blob/master/docs/guide/reference/api/admin-indices-update-settings.rst
	# http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules-merge.html#log-byte-size
	vprint('Optimizing for bulk indexing...')
	es_connector.indices.update_settings(config.es_index, {
		'index.refresh_interval': '-1',
		'index.merge.policy.merge_factor': '30'
	})

	try:
		# The indexer receives the connector built last in the loop above.
		indexer = Indexer(
			db_connector=db_connector,
			read_queue=read_queue,
			es_connector=es_connector,
			es_index=config.es_index,
			es_type=config.es_type,
			document_map=document_map,
			limit=config.limit)

		# Start indexing
		if config.single_process_mode:
			indexer.index(start_time, read_chunk_size=config.read_chunk_size)
		else:
			# Create new threads
			for _ in range(config.threads):
				thread = Thread(
					indexer.index,
					start_time,
					read_chunk_size=config.read_chunk_size,
					autostart=config.autostart_threads)
				threads.append(thread)

			# Starts threads, by calling run()
			if not config.autostart_threads:
				for thread in threads:
					thread.start()

			# Wait for threads to terminate
			for thread in threads:
				thread.join()
	finally:
		# Restore the interactive settings even if indexing raised, so the
		# index is never left with refresh disabled.
		vprint('Optimizing for interactive indexing...')
		es_connector.indices.update_settings(config.es_index, {
			'index.refresh_interval': '1s',
			'index.merge.policy.merge_factor': '10'
		})

	vprint('Refreshing index...')
	es_connector.indices.refresh()

	vprint('Elapsed: {:f}'.format(time.time() - start_time))
Esempio n. 20
0
def preprocess_data(conf, dataset="train"):
    """Return the preprocessed frame used by the rule-base classifier.

    When ``conf_model["preprocessing"]`` is truthy the raw
    ``df_<dataset>.csv`` is cleaned (lowercase, date formatting, tokenizing,
    normalizing) and written to ``df_<dataset>_preprocessed.csv`` in the
    model folder; otherwise that file is read back and its ``txt`` column is
    evaluated from its string form.

    :param conf: full project configuration.
    :param dataset: dataset name used in the file names.
    :return: the preprocessed frame.
    """
    conf_model = conf["models"]["rule_base_classifier"]
    verbose = conf_model["verbose"]
    preprocessing = conf_model["preprocessing"]
    cached_csv = Path(conf_model["path"] + "df_%s_preprocessed.csv" % dataset)

    if not preprocessing:
        # Reload the frame written by an earlier run.
        df = pd.read_csv(cached_csv, sep=';', encoding="utf-8")
        df["txt"] = df["txt"].map(eval)
        return df

    raw_csv = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)
    vprint("Loading...", verbose)
    df = pd.read_csv(raw_csv, sep=';', encoding="utf-8")

    vprint("Lowercase text...", verbose)
    df["txt"] = df["txt"].str.lower()
    vprint("Format dates in text...", verbose)
    df["txt"] = df["txt"].map(format_date)
    vprint("Format date accident and consolidation...", verbose)
    for date_col in ("date_accident", "date_consolidation"):
        df[date_col] = df[date_col].map(
            lambda raw: re.sub(r"[-.]", "", raw))
    vprint("Tokenize words...", verbose)
    df["txt"] = df["txt"].map(nltk.word_tokenize)
    vprint("Normalize text...", verbose)
    df["txt"] = df["txt"].map(normalize)

    vprint("Record...", verbose)
    df.to_csv(cached_csv, sep=';', encoding="utf-8", index=False)
    vprint("Preprocessing completed", verbose)
    return df
Esempio n. 21
0
def main_simple_embedding_ml(conf):
    """Run the search / learn / assess pipeline of the simple-embedding model.

    The steps are toggled by entries of
    ``conf["models"]["simple_embedding_and_ml"]``:

    * ``"search"``: when truthy, run :func:`search` to find the best
      parameters; otherwise read them from ``best_params_<date_target>.json``
      in the model folder.
    * ``"learn"`` / ``"assess"``: when either is truthy, a fresh
      ``SimpleEmbeddingAndML`` is configured with the best F1 parameters and
      fitted on the training data.
    * ``"assess"``: when truthy, the fitted model's estimator is
      cross-validated (5 folds) and the mean train/test metrics are printed.

    :param conf: project configuration (nested dictionaries).
    :return: ``None``; results are only printed.
    """
    conf_model = conf["models"]["simple_embedding_and_ml"]
    learn = conf_model["learn"]
    do_search = conf_model["search"]
    assess = conf_model["assess"]
    v = conf_model["verbose"]
    date_target = conf_model["date_target"]

    # NOTE(review): this expects the preprocess_data in scope to return an
    # (X, y) pair -- confirm which preprocess_data this module imports.
    X_train, y_train = preprocess_data(conf, dataset="train")

    # Get best params
    if do_search:
        u.vprint("Searching best param", v)
        final_res = search(conf, X_train, y_train)
    else:
        # The search has already been done: read back the recorded result.
        u.vprint("Reading best params in model folder", v)
        path_model_params = Path(conf_model["path"] +
                                 "best_params_%s.json" % date_target)
        with open(path_model_params, "r") as f:
            final_res = json.load(f)
    best_params = final_res["f1"]["best_params"]
    if do_search:
        u.vprint("Best params are %s" % str(best_params), v)

    # Learning
    if assess or learn:
        if assess:
            # The verbosity flag was missing from this call in the original.
            u.vprint(
                "Learning phase since this model cannot be pickled once trained",
                v)
        else:
            u.vprint("Learning phase", v)
        model = SimpleEmbeddingAndML()
        model.set_params(best_params)
        model.fit(X_train, y_train)

    # Assessing
    if assess:
        u.vprint("Assessing phase", v)
        metrics_eval = ["f1", "recall", "precision", "accuracy", "roc_auc"]
        cvr = cross_validate(model.estimator,
                             X_train,
                             y_train,
                             cv=5,
                             n_jobs=-1,
                             scoring=metrics_eval,
                             return_train_score=True)

        # "%.2f" everywhere: the original used ".%2f" for accuracy, which
        # printed a literal dot followed by an unrounded float.
        report = ("%s : f1 = %.2f (recall = %.2f, precision = %.2f, "
                  "AUC = %.2f, accuracy = %.2f)")
        for label, prefix in (("Train", "train"), ("Test", "test")):
            print(report % (label,
                            np.mean(cvr[prefix + "_f1"]),
                            np.mean(cvr[prefix + "_recall"]),
                            np.mean(cvr[prefix + "_precision"]),
                            np.mean(cvr[prefix + "_roc_auc"]),
                            np.mean(cvr[prefix + "_accuracy"])))
Esempio n. 22
0
def search(conf, X, y):
    """Evaluate every combination of the configured parameter grid.

    The grid is read from
    ``conf["models"]["simple_embedding_and_ml"]["grid_search"]``. For each
    combination a ``SimpleEmbeddingAndML`` is configured and its estimator is
    cross-validated (5 folds) on ``(X, y)``. One ``;``-separated line per
    combination (params, mean train metrics, mean test metrics) is appended
    to ``full_search_results_<date_target>.csv`` in the model folder, and the
    final summary is written to ``best_params_<date_target>.json``.

    :param conf: project configuration (nested dictionaries).
    :param X: training features passed to ``cross_validate``.
    :param y: training targets passed to ``cross_validate``.
    :return: dict keyed by ``"f1"``, ``"recall"``, ``"precision"``, ``"acc"``
        and ``"auc"``; each value holds the best mean test ``"metric"`` and
        the ``"best_params"`` that reached it.
    :raises ValueError: if the grid yields no parameter combination.
    """
    conf_model = conf["models"]["simple_embedding_and_ml"]
    grid_search = conf_model["grid_search"]
    date_target = conf_model["date_target"]
    v = conf_model["verbose"]
    path_model_cv_res = Path(conf_model["path"] +
                             "full_search_results_%s.csv" % date_target)

    flat_grid_search = u.flat_dictionary(grid_search, [], "")
    all_combi = u.get_all_combi_grid_search(dict(flat_grid_search))
    nb_tot_combi = len(all_combi)
    if nb_tot_combi == 0:
        # Without this guard the selection below fails with an opaque
        # IndexError on an empty array.
        raise ValueError("grid_search produced no parameter combination")

    # Scorer names, in the same order as the keys of the returned dict.
    metrics_eval = ["f1", "recall", "precision", "accuracy", "roc_auc"]
    result_keys = ["f1", "recall", "precision", "acc", "auc"]

    tested_params = []
    test_scores = []

    # Eval each set of params
    for i, combi in enumerate(all_combi):
        start = time()
        u.vprint("%d / %d" % (i + 1, nb_tot_combi), v)
        model = SimpleEmbeddingAndML()
        params = u.restruct_dict(combi, {})
        model.set_params(params)
        cv_result = cross_validate(model.estimator,
                                   X,
                                   y,
                                   cv=5,
                                   n_jobs=-1,
                                   scoring=metrics_eval,
                                   return_train_score=True)

        test_means = [np.mean(cv_result["test_" + m]) for m in metrics_eval]
        train_means = [np.mean(cv_result["train_" + m]) for m in metrics_eval]
        tested_params.append(params)
        test_scores.append(test_means)

        # Append immediately so partial results survive an interrupted run.
        row = [params] + train_means + test_means
        with open(path_model_cv_res, "a") as f:
            f.write(";".join(str(r) for r in row) + "\n")
        # The verbosity flag was missing from this call in the original.
        u.vprint("Iteration time %.2f" % (time() - start), v)

    # Get best params: keep scores in a float matrix (one row per
    # combination, one column per metric) and params in a parallel list,
    # instead of a single object array mixing dicts and floats.
    score_matrix = np.array(test_scores, dtype=float)
    best_rows = np.argmax(score_matrix, axis=0)
    final_res = {
        key: {
            "metric": float(score_matrix[best_rows[col], col]),
            "best_params": tested_params[best_rows[col]]
        }
        for col, key in enumerate(result_keys)
    }

    # Record
    path_model_params = Path(conf_model["path"] +
                             "best_params_%s.json" % date_target)
    with open(path_model_params, "w") as f:
        json.dump(final_res, f)

    return final_res
Esempio n. 23
0
def create_dataframe_one_line_per_date_with_context(df,
                                                    context_size,
                                                    verbose=False):
    """Expand a one-line-per-text dataframe into one line per distinct date.

    ``df`` is a preprocessed dataframe with one line per text; it must have
    the ``"txt"`` and ``"ID"`` columns. When it also has both
    ``"date_accident"`` and ``"date_consolidation"``, a ``"target"`` column
    is produced: 1 when the date equals the text's accident date, 2 when it
    equals the consolidation date, 0 otherwise. In that labelled mode, dates
    appearing at position 0 of the text are dropped.

    Dates of a text are visited in sorted order so the row order is
    reproducible from one run to the next.

    :param df: preprocessed dataframe, one line per text.
    :param context_size: context size forwarded to ``get_context_date``.
    :param verbose: verbosity flag forwarded to ``u.vprint``.
    :return: dataframe with columns ``txt_id``, ``date``, ``context_date``,
        ``pos_moy``, ``part_moy``, ``nb_app`` and, in labelled mode,
        ``target``.
    """
    u.vprint(
        "Creating dataframe in one line per date with context size of %d" %
        context_size, verbose)

    # Labelled mode requires both reference-date columns.
    has_targets = ("date_accident" in df.columns) and ("date_consolidation"
                                                       in df.columns)
    columns = [
        "txt_id", "date", "context_date", "pos_moy", "part_moy", "nb_app"
    ]
    if has_targets:
        columns.append("target")
        date_accident = df["date_accident"].values
        date_conso = df["date_consolidation"].values

    rows = []
    all_txt = df["txt"].values
    all_txt_id = df["ID"].values
    for i, (txt, txt_id) in enumerate(zip(all_txt, all_txt_id)):
        index_dates, dates_in_txt = get_dates_from_token_list(txt)
        # sorted(): plain set iteration order depends on string hashing and
        # therefore changes between interpreter runs.
        for d in sorted(set(dates_in_txt)):
            left_context, right_context = get_context_date(
                context_size, d, txt)
            # Flatten every context window into one token list (left first).
            context_tokens = [tok for ctx in left_context for tok in ctx]
            context_tokens += [tok for ctx in right_context for tok in ctx]
            s = " ".join(context_tokens)

            positions = index_dates[np.argwhere(dates_in_txt == d)].ravel()
            positions_mean = np.mean(positions)
            # Positions relative to the text length.
            part_of_txt_mean = np.mean(positions / len(txt))
            nb_appearances = len(positions)

            row = [
                txt_id, d, s, positions_mean, part_of_txt_mean,
                nb_appearances
            ]
            if has_targets:
                # Reducing set with rules discovered with exploratory
                # analysis: the target date can't be the first word of the
                # text.
                if 0 in positions:
                    continue
                if d == date_accident[i]:
                    y_target = 1
                elif d == date_conso[i]:
                    y_target = 2
                else:
                    y_target = 0
                row.append(y_target)
            rows.append(row)

    df_out = pd.DataFrame(rows, columns=columns)
    u.vprint("Dataframe completed.", verbose)
    return df_out
Esempio n. 24
0
def preprocess_data_general(conf, dataset="train"):
    """Load ``df_<dataset>.csv`` and apply the generic text preprocessing.

    The file is read from ``conf["paths"]["dataprep"]``. The ``"txt"`` column
    is lowercased, passed through ``format_date``, tokenized with
    ``nltk.word_tokenize`` and finally passed through ``normalize``; the
    ``"date_accident"`` and ``"date_consolidation"`` columns have their
    ``-`` and ``.`` characters removed. Nothing is written to disk.

    :param conf: project configuration (nested dictionaries); verbosity comes
        from ``conf["models"]["simple_embedding_and_ml"]["verbose"]``.
    :param dataset: dataset name used to build the file name.
    :return: the preprocessed ``pandas.DataFrame``.
    """
    settings = conf["models"]["simple_embedding_and_ml"]
    verbose = settings["verbose"]
    source_csv = Path(conf["paths"]["dataprep"] + "df_%s.csv" % dataset)

    def strip_date_separators(value):
        # Remove the "-" and "." separators from a date string.
        return re.sub(r"[-.]", "", value)

    u.vprint("Loading...", verbose)
    frame = pd.read_csv(source_csv, sep=';', encoding="utf-8")

    u.vprint("Lowercase text...", verbose)
    frame["txt"] = frame["txt"].str.lower()
    u.vprint("Format dates in text...", verbose)
    frame["txt"] = frame["txt"].map(format_date)

    u.vprint("Format date accident and consolidation...", verbose)
    for column in ("date_accident", "date_consolidation"):
        frame[column] = frame[column].map(strip_date_separators)

    u.vprint("Tokenize words...", verbose)
    frame["txt"] = frame["txt"].map(nltk.word_tokenize)
    u.vprint("Normalize text...", verbose)
    frame["txt"] = frame["txt"].map(normalize)

    u.vprint("Preprocessing dataframe completed", verbose)
    return frame