# Imports reconstructed for this excerpt; the Keras callbacks may come from
# the standalone keras package rather than tensorflow.keras in the original
# project.
import sys
import uuid
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import (GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from tensorflow import keras
from tensorflow.keras.callbacks import (EarlyStopping, ReduceLROnPlateau,
                                        TerminateOnNaN)

# Project-local helpers referenced below are assumed importable from the
# surrounding package: build_keras_embedding_classifier,
# build_params_from_grid, create_model_ready, load_data,
# load_train_and_test_bow, binary_classification_metrics, utils, and the
# sklearn-style grid_search wrapper used in run_gridsearch().


def grid_search(
    embeddings,
    train_X,
    train_y,
    cv_X,
    cv_y,
    grid_params,
):

    results = list()
    total_p = len(grid_params)
    for i, params in enumerate(grid_params):
        # Reset the error flag per configuration; a single failure should not
        # mark every subsequent result as errored.
        err = False
        print("Running [%d/%d]:\n%s" % (i + 1, total_p, str(params)))
        m = build_keras_embedding_classifier(embeddings, **params)

        cb = [
            EarlyStopping(patience=17),
            TerminateOnNaN(),
            ReduceLROnPlateau(verbose=1)
        ]

        m.fit(train_X,
              train_y,
              epochs=100,
              batch_size=128,
              validation_data=(cv_X, cv_y),
              callbacks=cb)

        test_preds = m.predict(cv_X).round().astype(int)
        test_actual = cv_y

        # Overall metrics across the combined train + CV data, built from the
        # function's own arguments rather than module-level globals.
        pred = m.predict(np.concatenate([train_X, cv_X])).round().astype(int)
        actual = np.concatenate([train_y, cv_y])

        try:
            perf_metrics = binary_classification_metrics(actual, pred)
            test_metrics = binary_classification_metrics(
                test_actual, test_preds)
            test_metrics = {"test_%s" % k: v for k, v in test_metrics.items()}
            print("HOLDOUT PERFORMANCE: %s" % str(test_metrics))
        except ValueError as ve:
            print("VALUE ERROR: %s" % ve)
            perf_metrics = dict()
            test_metrics = dict()
            err = True
        m_id = str(uuid.uuid4())[:7]
        m.save('./saved_models/model_%s.keras' % m_id)
        results.append(
            dict(error=err,
                 model_id=m_id,
                 **params,
                 **perf_metrics,
                 **test_metrics))

    # embedding distance - embed from different sources with the same seed of
    # important words, say from GloVe; then tune all the others around those
    # and compare the resulting embeddings.

    return results
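

# grid_params above is a list of kwargs dicts, one per configuration. Below is
# a minimal sketch of a helper that expands keyword lists into that shape via
# a Cartesian product; the real build_params_from_grid() used further down is
# project-local, so this only illustrates its assumed behaviour, e.g.
# _build_params_from_grid_sketch(lr=[1e-3], depth=[2, 3])
#   -> [{'depth': 2, 'lr': 0.001}, {'depth': 3, 'lr': 0.001}]
def _build_params_from_grid_sketch(**param_lists):
    import itertools
    keys = sorted(param_lists)
    return [dict(zip(keys, values))
            for values in itertools.product(*(param_lists[k] for k in keys))]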


def tune_parameters(target_label, window_sizes, embed_dims, model=None):
    if model is None:
        model = GradientBoostingClassifier(n_estimators=400, verbose=1)

    results = list()
    for ws in window_sizes:
        for ed in embed_dims:
            print("Window size: %d, embed dim: %d" % (ws, ed))
            model_df, count_vec_model, model_features = create_model_ready(
                ixes=range(41),
                window_size=ws,
                target_type_label=target_label,
                neg_include_proba=.25,
                embed_dim=ed)

            cv_df, cv_vec_model, cv_features = create_model_ready(
                ixes=range(41, 53),
                window_size=ws,
                target_type_label=target_label,
                count_vec_model=count_vec_model,
                neg_include_proba=1.,
                embed_dim=ed)

            fm = model.fit(model_df[model_features], model_df.is_target)
            fm_preds = fm.predict_proba(cv_df[cv_features])

            cv_metrics = binary_classification_metrics(cv_df.is_target,
                                                       fm_preds[:, 1] > 0.5)
            cv_metrics['window_size'] = ws
            cv_metrics['embed_dim'] = ed

            print(classification_report(cv_df.is_target, fm_preds[:, 1] > 0.5))
            print(cv_metrics)
            results.append(cv_metrics)

    return results
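

# A hedged usage sketch for tune_parameters(): sweep a few settings and
# tabulate the results. The target label value here is a hypothetical
# placeholder, and the metric keys come from the project-local
# binary_classification_metrics().
def _demo_tune_parameters():
    results = tune_parameters(target_label='attack',
                              window_sizes=[3, 5, 7],
                              embed_dims=[50, 100])
    # Each entry is a metrics dict tagged with its window_size and embed_dim.
    print(pd.DataFrame(results))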


def compare_auto_label():
    file_ixs = list(range(39))
    cvec_X, Y, cvec_X_test, Y_test = load_train_and_test_bow(
        train_ixes=file_ixs[:23], test_ixes=file_ixs[23:31])
    preds = GradientBoostingClassifier().fit(cvec_X, Y).predict(cvec_X_test)
    #preds = DecisionTreeClassifier().fit(cvec_X, Y).predict(cvec_X_test)
    print("Labeled Only")
    metrics = utils.binary_classification_metrics(Y_test, preds)
    print(metrics)

    auto_label_p = 'top_10_auto_labeled_from_brown_external_att.txt'
    cvec_X, Y, cvec_X_test, Y_test = load_train_and_test_bow(
        train_ixes=file_ixs[:23],
        test_ixes=file_ixs[23:31],
        include_auto_labeled=auto_label_p)
    preds = GradientBoostingClassifier().fit(cvec_X, Y).predict(cvec_X_test)
    #preds = DecisionTreeClassifier().fit(cvec_X, Y).predict(cvec_X_test)
    print("AUTO LABELED")
    metrics = utils.binary_classification_metrics(Y_test, preds)
    print(metrics)


def eval_on_holdout(target_label,
                    window_size,
                    embed_dim,
                    embed_type,
                    model=None,
                    eval_holdout=False):
    if model is None:
        model = GradientBoostingClassifier(n_estimators=400, verbose=1)

    print("Window size: %d" % window_size)
    model_df, count_vec_model, model_features = create_model_ready(
        ixes=range(41),
        window_size=window_size,
        target_type_label=target_label,
        neg_include_proba=.35,
        embed_dim=embed_dim,
        embed_type=embed_type)

    cv_df, cv_vec_model, cv_features = create_model_ready(
        ixes=range(41, 53),
        window_size=window_size,
        target_type_label=target_label,
        count_vec_model=count_vec_model,
        neg_include_proba=1.,
        embed_dim=embed_dim,
        embed_type=embed_type)

    holdout_df, holdout_vec_model, holdout_features = create_model_ready(
        ixes=range(53, 65),
        window_size=window_size,
        target_type_label=target_label,
        count_vec_model=count_vec_model,
        neg_include_proba=1.,
        embed_dim=embed_dim,
        embed_type=embed_type)

    if eval_holdout:
        train_df = pd.concat([model_df, cv_df]).sample(frac=1.)
        test_df = holdout_df
    else:
        train_df = model_df
        test_df = cv_df

    fm = model.fit(train_df[model_features], train_df.is_target)
    fm_preds = fm.predict_proba(test_df[model_features])

    metrics = binary_classification_metrics(test_df.is_target,
                                            fm_preds[:, 1] > 0.5)
    metrics['window_size'] = window_size
    metrics['embed_dim'] = embed_dim
    metrics['embed_type'] = embed_type
    metrics['model'] = str(model.__class__.__name__)
    pprint(metrics)
    return metrics
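

# Hedged usage: score the dev split with a train-only fit, then refit on
# train + dev and score the holdout, for a single configuration. The label
# and embed_type values are hypothetical placeholders.
def _demo_eval_on_holdout():
    dev_metrics = eval_on_holdout('attack', window_size=5, embed_dim=100,
                                  embed_type='fasttext')
    holdout_metrics = eval_on_holdout('attack', window_size=5, embed_dim=100,
                                      embed_type='fasttext',
                                      eval_holdout=True)
    return dev_metrics, holdout_metrics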


def evaluate_models_on_holdout(models_to_test, top_n_to_inc=None):
    ixes = list(range(39))
    train_X, train_Y, test_X, test_Y = load_train_and_test_bow(
        train_ixes=ixes[:31], test_ixes=ixes[31:], top_n_to_inc=top_n_to_inc)
    metrics = dict()
    for m_name, m in models_to_test:
        print("Running %s" % str(m_name))
        fit_m = m.fit(train_X, train_Y)
        preds = fit_m.predict(test_X)
        metrics[m_name] = utils.binary_classification_metrics(
            test_Y, preds.round())
    return metrics
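

# Hedged usage: models_to_test is a list of (name, estimator) pairs, per the
# loop above; any sklearn-style classifier with fit/predict should work.
def _demo_evaluate_models_on_holdout():
    models = [('gb', GradientBoostingClassifier()),
              ('dt', DecisionTreeClassifier()),
              ('nb', MultinomialNB())]
    pprint(evaluate_models_on_holdout(models, top_n_to_inc=100))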


# The fragment below appears to come from a script entry point; it is wrapped
# in a main() here (function name assumed) so that it parses on its own.
def main(args):
    ret = load_data(embedding_dim=args.embed_dim, return_holdout=True)
    (embeddings, sequences, Y, test_sequences, Y_test,
     holdout_sequences, Y_holdout) = ret

    if args.model_load is not None:
        # Reloading saved Keras models is currently broken, so bail out;
        # everything below sys.exit() is intentionally unreachable.
        print("Keras model loading broken!")
        sys.exit(1)

        print("Loading model at %s" % str(args.model_load))
        m = keras.models.load_model(args.model_load)
        print("Running predictions on datasets")
        train_pred = m.predict(sequences)
        dev_pred = m.predict(test_sequences)
        holdout_pred = m.predict(holdout_sequences)

        train_metrics = binary_classification_metrics(Y, train_pred.round())
        dev_metrics = binary_classification_metrics(Y_test, dev_pred.round())
        holdout_metrics = binary_classification_metrics(
            Y_holdout, holdout_pred.round())

        print("Train: %s" % str(train_metrics))
        print("Dev: %s" % str(dev_metrics))
        print("Holdout: %s" % str(holdout_metrics))

    elif args.grid_search:
        grid_params = build_params_from_grid(
            activations=['tanh'],
            hidden_size=[15, 20, 25],
            depth=[4, 5, 6],  #range(1, 3),
            lr=[0.0000002],
            dropout=[.5])
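        # The original snippet is truncated at this point. A sketch of the
        # likely continuation, reusing the Keras grid_search() defined at the
        # top of this file (the argument pairing is an assumption):
        # results = grid_search(embeddings, sequences, Y,
        #                       test_sequences, Y_test, grid_params)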


def run_gridsearch(model_type='dt', n_jobs=2, resample_n=1500, top_n_to_inc=0):
    # Recurse into this if given a list of model types
    if isinstance(model_type, list):
        return {
            mt: run_gridsearch(mt,
                               n_jobs=n_jobs,
                               resample_n=resample_n,
                               top_n_to_inc=top_n_to_inc)
            for mt in model_type
        }

    file_ixs = list(range(65))
    top_n = top_n_to_inc if top_n_to_inc != 0 else None
    #auto_label_p =  'top_10_auto_labeled_from_brown_external_att.txt'
    cvec_X, Y, cvec_X_test, Y_test = load_train_and_test_bow(
        train_ixes=file_ixs[:41],
        test_ixes=file_ixs[41:53],
        top_n_to_inc=top_n,
        resample=resample_n)
    print("train X shape: %s" % str(cvec_X.shape))

    cv_kwargs = dict(n_jobs=n_jobs, X=cvec_X, Y=Y)

    # Best params from earlier runs:
    #   {'criterion': 'entropy', 'max_depth': None,
    #    'max_leaf_nodes': 12, 'min_samples_leaf': 2, 'min_samples_split': 2}
    #   {'criterion': 'gini', 'max_depth': 25,
    #    'max_leaf_nodes': None, 'min_samples_leaf': 3, 'min_samples_split': 4}
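    # NOTE: this grid_search() is a different, project-local helper from the
    # Keras grid_search() defined at the top of this file; judging by the
    # best_params_ / predict usage below, it presumably wraps sklearn's
    # GridSearchCV.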
    if model_type == 'dt':
        cv_m = grid_search(DecisionTreeClassifier(),
                           param_grid=dict(
                               criterion=['gini', 'entropy'],
                               max_depth=[None] + list(range(24, 30, 3)),
                               max_leaf_nodes=[None] + list(range(13, 20, 2)),
                               min_samples_leaf=list(range(2, 5, 2)),
                               min_samples_split=list(range(2, 5, 2))),
                           **cv_kwargs)

    elif model_type == 'gb':
        cv_m = grid_search(
            GradientBoostingClassifier(),
            param_grid=dict(
                max_depth=list(range(3, 7, 1)),
                max_leaf_nodes=[None],  #+ list(range(9, 22, 2)),
                min_samples_leaf=list(range(2, 4, 1)),
                min_samples_split=list(range(2, 5, 2)),
                learning_rate=np.arange(0.7, 1.5, 0.33),
                n_estimators=range(100, 251, 50)),
            **cv_kwargs)

    elif model_type == 'rf':
        cv_m = grid_search(RandomForestClassifier(),
                           param_grid=dict(max_depth=range(2, 35, 4),
                                           min_samples_split=range(2, 45, 4),
                                           n_estimators=range(25, 226, 25)),
                           **cv_kwargs)

    elif model_type == 'nb':
        cv_m = grid_search(
            MultinomialNB(),
            param_grid=dict(alpha=[10**a for a in range(-3, 4, 1)]),
            #[.5, 1.5, 3.5, 9, 17, 25]),#np.arange(.1, 3.5, .35)),
            **cv_kwargs)
    else:
        raise ValueError("No model type %s" % model_type)

    pred_Y = cv_m.predict(cvec_X)
    metrics = utils.binary_classification_metrics(Y, pred_Y)
    print("Best Params: %s" % str(cv_m.best_params_))
    print("Train + CV Overall Performance:")
    print("%s: %s" % (model_type, str(metrics)))

    pred_Y_test = cv_m.predict(cvec_X_test)
    test_metrics = utils.binary_classification_metrics(Y_test, pred_Y_test)
    print("Hold out performance:")
    print("%s: %s" % (model_type, str(test_metrics)))

    test_metrics = {"test_%s" % k: v for k, v in test_metrics.items()}
    metrics.update(test_metrics)
    metrics['best_params'] = dict(cv_m.best_params_)
    return metrics
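

# Hedged driver: run_gridsearch() accepts a list of model types and recurses,
# returning a dict of per-family metrics (see the isinstance check above).
if __name__ == '__main__':
    pprint(run_gridsearch(model_type=['dt', 'gb', 'rf', 'nb'], n_jobs=4))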