def do_reg():
    """Cross-validate a gradient-boosting classifier and report on the hold-out set.

    The training data is split into 10 folds plus a hold-out set. Features are
    generated once for every fold and for the hold-out set. For each fold a
    fresh classifier is trained on all the other folds and scored on the
    fold itself; the score is normalised by the best achievable score for
    that fold. The classifier with the highest normalised score is then run
    on the hold-out set and the result is passed to ``report_score``.

    Returns:
        None. Per-fold scores are printed; the hold-out result is reported
        through ``report_score``.
    """
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d, str(fold))

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        # Train on every fold except the one currently held out for testing.
        train_ids = [i for i in range(len(folds)) if i != fold]

        X_train = np.vstack([Xs[i] for i in train_ids])
        y_train = np.hstack([ys[i] for i in train_ids])

        X_test = Xs[fold]
        y_test = ys[fold]

        # The estimator must be bound to ``clf``: the fit/predict calls and the
        # best-fold bookkeeping below all refer to that name. (It was
        # previously assigned to ``clf_stage1``, leaving ``clf`` undefined.)
        # TODO: try a random forest as an alternative estimator.
        clf = GradientBoostingClassifier(
            n_estimators=200, random_state=14128, verbose=True)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        # Normalise by the score a perfect prediction would obtain on this fold.
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    # Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    report_score(actual, predicted)
def __init__(self, dataset, n_folds=10):
    """Split *dataset* into folds and prepare empty per-fold containers.

    Args:
        dataset: the data set object handed to ``kfold_split`` and
            ``get_stances_for_folds``.
        n_folds: number of folds requested from ``kfold_split``.
    """
    self.dataset = dataset

    # Fold split first, then the stances belonging to each fold / hold-out.
    split = kfold_split(dataset, n_folds=n_folds)
    self.folds, self.hold_out = split
    stance_split = get_stances_for_folds(dataset, self.folds, self.hold_out)
    self.fold_stances, self.hold_out_stances = stance_split

    # Every container below starts out as its own empty dict.
    empty_containers = (
        "ys",
        "Xcs",
        "ys_nb",
        "Xcs_nb",
        "fold_stances_nb",
        "ys_true",
        "Xbasenb",
        "Xtotalnb",
        "Xtotal",
        "X_baseline",
        "y_baseline",
    )
    for attr_name in empty_containers:
        setattr(self, attr_name, {})
X_overlap_pos, X_overlap_pos_sentence, X_tfidf, X_tfidf_max, X_overlap_bpe_SS]
return X, y
# NOTE(review): the two lines above are the closing lines of a function whose
# header is not visible here (presumably generate_features) - confirm their
# indentation against that function.


# Script entry point: obtains deep-learning predictions for the FNC-1 test
# set, then precomputes features for the competition set, the hold-out set
# and every training fold.
if __name__ == "__main__":
    check_version()

    print('Running Conditioned CNN on FNC1 Dataset')
    # Only the first element of the returned triple is kept under a real name;
    # the other two are bound to throwaway names.
    dl_model_pred, _unused1, _unused2 = get_predictions_from_FNC_1_Test(
        params.dl_weights_file, params.apply_pos_filter, DEVICE)

    #Load the training dataset and generate folds
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    # The competition stances are also kept as a pandas DataFrame.
    stances = pd.DataFrame(competition_dataset.stances)
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    # Per-fold feature matrices and label vectors, keyed by fold.
    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d, str(fold))
def run_stage(fn, d, competition_dataset):
    """Run one cross-validated training pass and write predictions back.

    Increments the module-level ``runpass`` counter, builds features with
    *fn* for the competition set, the hold-out set and every fold, trains
    one gradient-boosting classifier per fold, and keeps the classifier with
    the best normalised fold score. A ``'Predict'`` entry is then written
    into the stance records:

    * fold test items in ``d.stances`` receive their actual label,
    * hold-out items in ``d.stances`` receive the best classifier's prediction,
    * items in ``competition_dataset.stances`` receive the best classifier's
      prediction.

    Args:
        fn: feature generator called as ``fn(stances, dataset, name)`` and
            returning a ``(X, y, ids)`` triple.
        d: training data set.
        competition_dataset: data set providing ``get_unlabelled_stances()``.

    Returns:
        The ids returned by *fn* for the hold-out stances.
    """
    global runpass
    runpass += 1
    pass_tag = str(runpass)

    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load/Precompute all features now
    Xs = dict()
    ys = dict()
    ids = dict()

    comp_stances = competition_dataset.get_unlabelled_stances()
    X_comp, y_comp, id_comp = fn(
        comp_stances, competition_dataset, "competition_{}".format(pass_tag))
    X_holdout, y_holdout, id_holdout = fn(
        hold_out_stances, d, "holdout_{}".format(pass_tag))
    for fold in fold_stances:
        fold_name = "{}_{}".format(str(fold), pass_tag)
        Xs[fold], ys[fold], ids[fold] = fn(fold_stances[fold], d, fold_name)

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        # Indices of all folds except the one under test.
        other_folds = [i for i in range(len(fold_stances)) if i != fold]

        # id_train is assembled alongside X/y but is not read again below.
        id_train = np.hstack(tuple(ids[i] for i in other_folds))
        X_train = np.vstack(tuple(Xs[i] for i in other_folds))
        y_train = np.hstack(tuple(ys[i] for i in other_folds))

        id_test, X_test, y_test = ids[fold], Xs[fold], ys[fold]

        clf = GradientBoostingClassifier(
            n_estimators=200, random_state=14128, verbose=True)
        clf.fit(X_train, y_train)

        predicted_test = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual_test = [LABELS[int(a)] for a in y_test]

        # Data is known: store the actual label for the fold's test items.
        for pos, label in enumerate(actual_test):
            d.stances[id_test[pos]]['Predict'] = label

        # Normalise by the score of a perfect prediction on this fold.
        fold_score, _ = score_submission(actual_test, predicted_test)
        max_fold_score, _ = score_submission(actual_test, actual_test)
        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score, best_fold = score, clf

    # Run on Holdout set and report the final score on the holdout set
    predicted_hold = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual_hold = [LABELS[int(a)] for a in y_holdout]
    # Data is unknown: store the model's prediction.
    for pos, label in enumerate(predicted_hold):
        d.stances[id_holdout[pos]]['Predict'] = label

    # Run on competition dataset
    predicted_comp = [LABELS[int(a)] for a in best_fold.predict(X_comp)]
    actual_comp = [LABELS[int(a)] for a in y_comp]
    # Data is unknown: store the model's prediction.
    for pos, _ in enumerate(actual_comp):
        competition_dataset.stances[id_comp[pos]]['Predict'] = predicted_comp[pos]

    return id_holdout
# Script entry point for a two-step pipeline: step 1 loads (or first
# generates) a model for related vs. unrelated, step 2 does the same for a
# model over the related classes. Models are cached on disk under "models/".
if __name__ == "__main__":
    check_version()
    parse_params()

    #Load the training dataset and generate folds
    d = DataSet()

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    # generate_features returns three values here; the third is stored under
    # a *_bi name (presumably labels for the binary step - confirm).
    X_competition, y_competition, y_competition_bi = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    # step1 : classification model for related or unrelated
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    # Train only when no saved model file exists, then always load from disk.
    if not os.path.isfile("models/finalClassifier.1.model"):
        generate_model(fold_stances, 1)
    # NOTE(review): joblib.load unpickles the file - only load trusted models.
    best_fold = joblib.load("models/finalClassifier.1.model")

    # Load/Precompute all features now
    X_holdout, y_holdout, y_holdout_bi = generate_features(
        hold_out_stances, d, "holdout")

    # step2 : classification model for related (3 classes : Agree, Disagree, Discuss)
    # A second split is requested with biClass=True, and stances are fetched
    # with only_related=True.
    related_folds, related_hold_out = kfold_split(d, n_folds=10, biClass=True)
    related_fold_stances, related_hold_out_stances = get_stances_for_folds(
        d, related_folds, related_hold_out, only_related=True)
    # Same cache-or-train pattern as step 1, using model number 2.
    if not os.path.isfile("models/finalClassifier.2.model"):
        generate_model(related_fold_stances, 2)
    related_best_fold = joblib.load("models/finalClassifier.2.model")
X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
return X,y
# NOTE(review): the two lines above are the closing statements of a function
# whose header is not visible here (presumably generate_features) - confirm
# their indentation against that function. They column-stack the four feature
# groups into one matrix and return it together with y.


# Script entry point: checks the interpreter version, splits the data into
# folds and precomputes features for the hold-out set and every fold.
if __name__ == "__main__":
    # Refuse to run on Python 2: write a message to stderr and exit with 1.
    if sys.version_info.major < 3:
        sys.stderr.write('Please use Python version 3 and above\n')
        sys.exit(1)

    d = DataSet()
    folds,hold_out = kfold_split(d,n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

    # Per-fold feature matrices and label vectors, keyed by fold.
    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
    for fold in fold_stances:
        Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))

    # Best normalised fold score seen so far and the classifier achieving it.
    best_score = 0
    best_fold = None

    # Classifier for each fold