import sys
import uuid
from pprint import pprint

import numpy as np
import pandas as pd
import keras
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

import utils
# Project-local helpers (build_keras_embedding_classifier, create_model_ready,
# load_train_and_test_bow, load_data, build_params_from_grid, and the sklearn
# grid_search wrapper used in run_gridsearch) are assumed to be imported from
# project modules not shown here.


def grid_search(
        embeddings,
        train_X,
        train_y,
        cv_X,
        cv_y,
        grid_params,
):
    results = list()
    total_p = len(grid_params)
    for i, params in enumerate(grid_params):
        print("Running [%d/%d]:\n%s" % (i + 1, total_p, str(params)))
        err = False  # reset per run so one failure doesn't flag every later row
        m = build_keras_embedding_classifier(embeddings, **params)
        cb = [
            EarlyStopping(patience=17),
            TerminateOnNaN(),
            ReduceLROnPlateau(verbose=1)
        ]
        m.fit(train_X, train_y,
              epochs=100,
              batch_size=128,
              validation_data=(cv_X, cv_y),
              callbacks=cb)
        test_preds = m.predict(cv_X).round().astype(int)
        test_actual = cv_y  # was the module-level Y_test; use the passed-in labels
        # Overall performance on train + CV combined (was computed from the
        # module-level sequences/Y globals; use the arguments instead).
        pred = m.predict(np.concatenate([train_X, cv_X])).round().astype(int)
        actual = np.concatenate([train_y, cv_y])
        try:
            perf_metrics = binary_classification_metrics(actual, pred)
            test_metrics = binary_classification_metrics(test_actual, test_preds)
            test_metrics = {"test_%s" % k: v for k, v in test_metrics.items()}
            print("HOLDOUT PERFORMANCE: %s" % str(test_metrics))
        except ValueError:
            print("VALUE ERROR")
            perf_metrics = dict()
            test_metrics = dict()  # avoid a NameError in the results row below
            err = True
        m_id = str(uuid.uuid4())[:7]
        m.save('./saved_models/model_%s.keras' % m_id)
        results.append(
            dict(error=err, model_id=m_id, **params, **perf_metrics,
                 **test_metrics))
        # Idea: embed from different sources with the same seed of important
        # words (say from GloVe), tune all others around those, then compare
        # the embedding distances across sources.
    return results
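# The functions in this module lean on a binary_classification_metrics helper
# (it lives in utils in the real project, which is why some call sites use
# utils.binary_classification_metrics). A minimal standalone sketch of the
# assumed contract -- a flat dict of the usual binary metrics, so callers can
# splat it into result rows -- follows; the exact metric set is an assumption.
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)


def binary_classification_metrics(actual, predicted):
    # Flat dict so grid_search can merge it via **perf_metrics / **test_metrics.
    return dict(accuracy=accuracy_score(actual, predicted),
                precision=precision_score(actual, predicted),
                recall=recall_score(actual, predicted),
                f1=f1_score(actual, predicted))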
def tune_parameters(target_label, window_sizes, embed_dims, model=None):
    if model is None:
        model = GradientBoostingClassifier(n_estimators=400, verbose=1)
    results = list()
    for ws in window_sizes:
        for ed in embed_dims:
            print("Window size: %d, embed dim: %d" % (ws, ed))
            model_df, count_vec_model, model_features = create_model_ready(
                ixes=range(41),
                window_size=ws,
                target_type_label=target_label,
                neg_include_proba=.25,
                embed_dim=ed)
            cv_df, cv_vec_model, cv_features = create_model_ready(
                ixes=range(41, 53),
                window_size=ws,
                target_type_label=target_label,
                count_vec_model=count_vec_model,
                neg_include_proba=1.,
                embed_dim=ed)
            fm = model.fit(model_df[model_features], model_df.is_target)
            fm_preds = fm.predict_proba(cv_df[cv_features])
            cv_metrics = binary_classification_metrics(
                cv_df.is_target, fm_preds[:, 1] > 0.5)
            cv_metrics['window_size'] = ws
            cv_metrics['embed_dim'] = ed  # record both swept parameters
            print(classification_report(cv_df.is_target, fm_preds[:, 1] > 0.5))
            print(cv_metrics)
            results.append(cv_metrics)
    return results
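# Example sweep (editor sketch): the 'attack' label and the grids here are
# hypothetical, and picking a winner by 'f1' assumes that key is present in
# the metrics dict.
def _example_tune_parameters():
    results = tune_parameters('attack',
                              window_sizes=[3, 5, 7],
                              embed_dims=[50, 100])
    best = max(results, key=lambda r: r['f1'])
    print("Best: window_size=%(window_size)d embed_dim=%(embed_dim)d "
          "f1=%(f1).3f" % best)
    return best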
def compare_auto_label():
    file_ixs = list(range(39))
    cvec_X, Y, cvec_X_test, Y_test = load_train_and_test_bow(
        train_ixes=file_ixs[:23], test_ixes=file_ixs[23:31])
    preds = GradientBoostingClassifier().fit(cvec_X, Y).predict(cvec_X_test)
    #preds = DecisionTreeClassifier().fit(cvec_X, Y).predict(cvec_X_test)
    print("Labeled Only")
    metrics = utils.binary_classification_metrics(Y_test, preds)
    print(metrics)

    auto_label_p = 'top_10_auto_labeled_from_brown_external_att.txt'
    cvec_X, Y, cvec_X_test, Y_test = load_train_and_test_bow(
        train_ixes=file_ixs[:23],
        test_ixes=file_ixs[23:31],
        include_auto_labeled=auto_label_p)
    preds = GradientBoostingClassifier().fit(cvec_X, Y).predict(cvec_X_test)
    #preds = DecisionTreeClassifier().fit(cvec_X, Y).predict(cvec_X_test)
    print("AUTO LABELED")
    metrics = utils.binary_classification_metrics(Y_test, preds)
    print(metrics)
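# compare_auto_label prints both metric dicts but returns nothing; if the
# deltas are wanted programmatically, a small helper like this sketch works
# on any pair of metric dicts with shared keys.
def metrics_delta(before, after):
    # Per-metric change from the labeled-only run to the auto-labeled run.
    return {k: after[k] - before[k] for k in before if k in after}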
def eval_on_holdout(target_label, window_size, embed_dim, embed_type,
                    model=None, eval_holdout=False):
    if model is None:
        model = GradientBoostingClassifier(n_estimators=400, verbose=1)
    print("Window size: %d" % window_size)
    model_df, count_vec_model, model_features = create_model_ready(
        ixes=range(41),
        window_size=window_size,
        target_type_label=target_label,
        neg_include_proba=.35,
        embed_dim=embed_dim,
        embed_type=embed_type)
    cv_df, cv_vec_model, cv_features = create_model_ready(
        ixes=range(41, 53),
        window_size=window_size,
        target_type_label=target_label,
        count_vec_model=count_vec_model,
        neg_include_proba=1.,
        embed_dim=embed_dim,
        embed_type=embed_type)
    holdout_df, holdout_vec_model, holdout_features = create_model_ready(
        ixes=range(53, 65),
        window_size=window_size,
        target_type_label=target_label,
        count_vec_model=count_vec_model,
        neg_include_proba=1.,
        embed_dim=embed_dim,
        embed_type=embed_type)

    if eval_holdout:
        # Final evaluation: train on train + CV (shuffled), test on holdout.
        train_df = pd.concat([model_df, cv_df]).sample(frac=1.)
        test_df = holdout_df
    else:
        train_df = model_df
        test_df = cv_df

    fm = model.fit(train_df[model_features], train_df.is_target)
    fm_preds = fm.predict_proba(test_df[model_features])
    metrics = binary_classification_metrics(test_df.is_target,
                                            fm_preds[:, 1] > 0.5)
    metrics['window_size'] = window_size
    metrics['embed_dim'] = embed_dim
    metrics['embed_type'] = embed_type
    metrics['model'] = str(model.__class__.__name__)
    pprint(metrics)
    return metrics
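# Example final evaluation (editor sketch): compare embedding types on the
# dev split, then rerun the winner against the true holdout. The 'attack'
# label and the 'glove' / 'fasttext' embed_type values are hypothetical.
def _example_eval_on_holdout():
    dev_runs = [eval_on_holdout('attack', window_size=5, embed_dim=100,
                                embed_type=et)
                for et in ('glove', 'fasttext')]
    best = max(dev_runs, key=lambda r: r['f1'])
    return eval_on_holdout('attack', window_size=5, embed_dim=100,
                           embed_type=best['embed_type'], eval_holdout=True)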
def evaluate_models_on_holdout(models_to_test, top_n_to_inc=None):
    ixes = list(range(39))
    train_X, train_Y, test_X, test_Y = load_train_and_test_bow(
        train_ixes=ixes[:31], test_ixes=ixes[31:], top_n_to_inc=top_n_to_inc)
    metrics = dict()
    for m_name, m in models_to_test:
        print("Running %s" % str(m_name))
        fit_m = m.fit(train_X, train_Y)
        preds = fit_m.predict(test_X)
        metrics[m_name] = utils.binary_classification_metrics(
            test_Y, preds.round())
    return metrics
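# Example invocation (editor sketch): (name, estimator) pairs, as the loop
# above expects; the particular baselines chosen here are arbitrary.
def _example_evaluate_models():
    return evaluate_models_on_holdout([
        ('gb', GradientBoostingClassifier()),
        ('dt', DecisionTreeClassifier()),
        ('nb', MultinomialNB()),
    ])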
ret = load_data(embedding_dim=args.embed_dim, return_holdout=True)
embeddings, sequences, Y, test_sequences, Y_test, holdout_sequences, Y_holdout = ret

if args.model_load is not None:
    # Model loading is currently broken, so bail out before the (otherwise
    # dead) evaluation code below.
    print("Keras model loading broken!")
    sys.exit(1)

    print("Loading model at %s" % str(args.model_load))
    m = keras.models.load_model(args.model_load)
    print("Running predictions on datasets")
    train_pred = m.predict(sequences)
    dev_pred = m.predict(test_sequences)
    holdout_pred = m.predict(holdout_sequences)
    train_metrics = binary_classification_metrics(Y, train_pred.round())
    dev_metrics = binary_classification_metrics(Y_test, dev_pred.round())
    holdout_metrics = binary_classification_metrics(Y_holdout,
                                                    holdout_pred.round())
    print("Train: %s" % str(train_metrics))
    print("Dev: %s" % str(dev_metrics))
    print("Holdout: %s" % str(holdout_metrics))
elif args.grid_search:
    grid_params = build_params_from_grid(
        activations=['tanh'],
        hidden_size=[15, 20, 25],
        depth=[4, 5, 6],  # range(1, 3)
        lr=[0.0000002],
        dropout=[.5],
def run_gridsearch(model_type='dt', n_jobs=2, resample_n=1500, top_n_to_inc=0):
    # Recurse into this if given a list of model types
    if isinstance(model_type, list):
        return {
            mt: run_gridsearch(mt,
                               n_jobs=n_jobs,
                               resample_n=resample_n,
                               top_n_to_inc=top_n_to_inc)
            for mt in model_type
        }

    file_ixs = list(range(65))
    top_n = top_n_to_inc if top_n_to_inc != 0 else None
    #auto_label_p = 'top_10_auto_labeled_from_brown_external_att.txt'
    cvec_X, Y, cvec_X_test, Y_test = load_train_and_test_bow(
        train_ixes=file_ixs[:41],
        test_ixes=file_ixs[41:53],
        top_n_to_inc=top_n,
        resample=resample_n)
    print("train X shape: %s" % str(cvec_X.shape))

    cv_kwargs = dict(n_jobs=n_jobs, X=cvec_X, Y=Y)

    # Best params seen on earlier runs:
    #   {'criterion': 'entropy', 'max_depth': None,
    #    'max_leaf_nodes': 12, 'min_samples_leaf': 2, 'min_samples_split': 2}
    #   {'criterion': 'gini', 'max_depth': 25,
    #    'max_leaf_nodes': None, 'min_samples_leaf': 3, 'min_samples_split': 4}
    # NOTE: the grid_search called below is the project's sklearn cross-
    # validation wrapper, not the Keras grid_search defined earlier in this
    # section.
    if model_type == 'dt':
        cv_m = grid_search(DecisionTreeClassifier(),
                           param_grid=dict(
                               criterion=['gini', 'entropy'],
                               max_depth=[None] + list(range(24, 30, 3)),
                               max_leaf_nodes=[None] + list(range(13, 20, 2)),
                               min_samples_leaf=list(range(2, 5, 2)),
                               min_samples_split=list(range(2, 5, 2))),
                           **cv_kwargs)
    elif model_type == 'gb':
        cv_m = grid_search(
            GradientBoostingClassifier(),
            param_grid=dict(
                max_depth=list(range(3, 7, 1)),
                max_leaf_nodes=[None],  # + list(range(9, 22, 2))
                min_samples_leaf=list(range(2, 4, 1)),
                min_samples_split=list(range(2, 5, 2)),
                learning_rate=np.arange(0.7, 1.5, 0.33),
                n_estimators=range(100, 251, 50)),
            **cv_kwargs)
    elif model_type == 'rf':
        cv_m = grid_search(RandomForestClassifier(),
                           param_grid=dict(
                               max_depth=range(2, 35, 4),
                               min_samples_split=range(2, 45, 4),
                               n_estimators=range(25, 226, 25)),
                           **cv_kwargs)
    elif model_type == 'nb':
        cv_m = grid_search(
            MultinomialNB(),
            # alternatives tried: [.5, 1.5, 3.5, 9, 17, 25], np.arange(.1, 3.5, .35)
            param_grid=dict(alpha=[10**a for a in range(-3, 4, 1)]),
            **cv_kwargs)
    else:
        raise ValueError("No model type %s" % model_type)

    pred_Y = cv_m.predict(cvec_X)
    metrics = utils.binary_classification_metrics(Y, pred_Y)
    print("Best Params: %s" % str(cv_m.best_params_))
    print("Train + CV Overall Performance:")
    print("%s: %s" % (model_type, str(metrics)))

    pred_Y_test = cv_m.predict(cvec_X_test)
    test_metrics = utils.binary_classification_metrics(Y_test, pred_Y_test)
    print("Hold out performance:")
    print("%s: %s" % (model_type, str(test_metrics)))

    test_metrics = {"test_%s" % k: v for k, v in test_metrics.items()}
    metrics.update(test_metrics)
    metrics['best_params'] = dict(cv_m.best_params_)
    return metrics
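# Example fan-out (editor sketch): the list form returns {model_type: metrics};
# ranking by 'test_f1' assumes binary_classification_metrics reports an 'f1'
# that run_gridsearch prefixes with 'test_'.
def _example_run_all_gridsearches():
    all_metrics = run_gridsearch(['dt', 'gb', 'rf', 'nb'], n_jobs=4)
    best = max(all_metrics, key=lambda mt: all_metrics[mt].get('test_f1', 0))
    print("Best family on holdout: %s" % best)
    return all_metrics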