def evaluate(norm, ignored_feats=set()):
    """
    Evaluates the saved model against the eval file

    :param norm: Normalization to apply to the eval features ('l1', 'l2', or None)
    :param ignored_feats: Features to exclude when loading the eval data
    :return:
    """
    global log, eval_file, model_file, scores_file, meta_dict

    log.info("Loading model from file")
    logistic = cPickle.load(open(model_file, 'r'))

    log.info("Loading eval data")
    x_eval, y_eval, ids_eval = \
        data_util.load_very_sparse_feats(eval_file, meta_dict,
                                         ignored_feats)
    if norm is not None:
        normalize(x_eval, norm=norm, copy=False)
    #endif

    log.info("Evaluating")
    y_pred_probs = logistic.predict_log_proba(x_eval)

    # Though we don't evaluate against them here, we want
    # to store scores for both ij and ji pairs
    pred_scores = dict()
    for i in range(len(ids_eval)):
        pred_scores[ids_eval[i]] = y_pred_probs[i]
    pred_scores = induce_ji_predictions(pred_scores)

    # We evaluate here only on ij pairs, but since this script
    # is not our final evaluation (and because the score should be
    # identical anyway) that's fine; this is functionally an estimate
    # of the true score
    scores = ScoreDict()
    for i in range(len(y_eval)):
        scores.increment(y_eval[i], np.argmax(y_pred_probs[i]))

    log.info("---Confusion matrix---")
    scores.print_confusion()

    log.info("---Scores---")
    for label in scores.keys:
        print str(label) + "\t" + scores.get_score(label).to_string() + " - %d (%.2f%%)" % \
              (scores.get_gold_count(label), scores.get_gold_percent(label))
    print "Acc: " + str(scores.get_accuracy()) + "%"

    if scores_file is not None:
        log.info("Writing probabilities to " + scores_file)
        with open(scores_file, 'w') as f:
            for id in pred_scores.keys():
                line = list()
                line.append(id)
                for j in range(len(pred_scores[id])):
                    line.append(str(pred_scores[id][j]))
                f.write(','.join(line) + '\n')
            #endfor
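
# Illustrative sketch (not part of the original scripts): how the pieces used
# above fit together on toy data -- in-place row normalization of a sparse
# feature matrix followed by predict_log_proba / argmax decoding. The toy
# matrix and the name _demo_normalized_log_proba are assumptions made for
# illustration only.
def _demo_normalized_log_proba():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import normalize

    # Tiny sparse feature matrix (4 examples, 3 features) with binary labels
    x = csr_matrix(np.array([[1.0, 0.0, 2.0],
                             [0.0, 3.0, 0.0],
                             [4.0, 0.0, 0.0],
                             [0.0, 1.0, 1.0]]))
    y = np.array([0, 1, 0, 1])

    # copy=False rescales each row to unit norm without allocating a new
    # matrix, which is what evaluate() relies on for its much larger eval data
    normalize(x, norm='l2', copy=False)

    clf = LogisticRegression()
    clf.fit(x, y)

    # Log-probabilities per class; argmax over each row gives the prediction
    log_probs = clf.predict_log_proba(x)
    return np.argmax(log_probs, axis=1)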
def train(model, balance, max_iter=None, max_depth=None,
          num_estimators=None, warm_start=None, ignored_feats=set()):
    """
    Trains the specified classifier and saves it to model_file

    :param model: Classifier type ('svm', 'logistic', 'decision_tree',
                  or 'random_forest')
    :param balance: Whether to use balanced class weights
    :param max_iter: Maximum iterations (svm / logistic)
    :param max_depth: Maximum tree depth (decision_tree / random_forest)
    :param num_estimators: Number of trees (random_forest)
    :param warm_start: Whether to reuse the previous fit (random_forest)
    :param ignored_feats: Features to exclude when loading the training data
    :return:
    """
    global log, train_file, meta_dict, model_file

    log.tic('info', "Loading training data")
    x, y, ids = \
        data_util.load_very_sparse_feats(train_file, meta_dict,
                                         ignored_feats)
    log.toc('info')

    log.tic('info', "Training")
    class_weight = None
    if balance:
        class_weight = 'balanced'
    #endif
    learner = None
    if model == 'svm':
        learner = SVC(probability=True, class_weight=class_weight,
                      max_iter=max_iter)
    elif model == 'logistic':
        learner = LogisticRegression(class_weight=class_weight,
                                     max_iter=max_iter, n_jobs=-1)
    elif model == "decision_tree":
        learner = DecisionTreeClassifier(max_depth=max_depth,
                                         class_weight=class_weight)
    elif model == 'random_forest':
        learner = RandomForestClassifier(n_estimators=num_estimators,
                                         max_depth=max_depth, n_jobs=-1,
                                         warm_start=warm_start,
                                         class_weight=class_weight)
    #endif
    learner.fit(x, y)
    log.toc('info')

    log.info("Saving model")
    with open(model_file, 'wb') as pickle_file:
        cPickle.dump(learner, pickle_file)
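
# Illustrative sketch (not part of the original scripts): the save/load round
# trip that train() and evaluate() rely on -- a fitted learner pickled to disk
# and restored later. The temporary path and the name _demo_pickle_round_trip
# are assumptions made for illustration only.
def _demo_pickle_round_trip(path='/tmp/demo_model.pkl'):
    import cPickle
    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    x = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])
    learner = DecisionTreeClassifier(max_depth=2, class_weight='balanced')
    learner.fit(x, y)

    # train() writes the pickle with 'wb'; 'rb' is the matching read mode
    with open(path, 'wb') as pickle_file:
        cPickle.dump(learner, pickle_file)
    with open(path, 'rb') as pickle_file:
        restored = cPickle.load(pickle_file)
    return restored.predict(x)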
def evaluate(ignored_feats=set()):
    """
    Evaluates the model, optionally ignoring features and saving
    predicted class scores

    :param ignored_feats: Features to exclude when loading the eval data
    :return:
    """
    global log, eval_file, model_file, scores_file

    log.info("Loading model from file")
    learner = cPickle.load(open(model_file, 'r'))

    log.info("Loading eval data")
    x_eval, y_eval, ids_eval = \
        data_util.load_very_sparse_feats(eval_file, meta_dict,
                                         ignored_feats)

    log.info("Evaluating")
    y_pred_probs = learner.predict_log_proba(x_eval)
    scores = ScoreDict()
    for i in range(len(y_eval)):
        # The predicted label is the argmax over this row's log-probabilities
        y_pred = 0
        max_prob = -float('inf')
        for j in range(len(y_pred_probs[i])):
            if y_pred_probs[i][j] > max_prob:
                max_prob = y_pred_probs[i][j]
                y_pred = j
            #endif
        #endfor
        scores.increment(y_eval[i], y_pred)
    #endfor

    log.info("---Confusion matrix---")
    scores.print_confusion()

    log.info("---Scores---")
    for label in scores.keys:
        print str(label) + "\t" + scores.get_score(label).to_latex_string() + " & %.2f\\%%\\\\" % \
              (scores.get_gold_percent(label))

    # Average excess kurtosis of the per-example log-probability rows
    kurtoses = list()
    for log_proba in y_pred_probs:
        kurtoses.append(
            stats.kurtosis(log_proba, axis=0, fisher=True, bias=True))
    log.info(None, "Accuracy: %.2f%%; Kurtoses: %.2f",
             scores.get_accuracy(), sum(kurtoses) / len(kurtoses))

    log.info("Writing probabilities to file")
    with open(scores_file, 'w') as f:
        for i in range(len(ids_eval)):
            line = list()
            line.append(ids_eval[i])
            for j in range(len(y_pred_probs[i])):
                line.append(str(y_pred_probs[i][j]))
            f.write(','.join(line) + '\n')
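
# Illustrative sketch (not part of the original scripts): the inner scan above
# is an argmax over one row of log-probabilities, and the reported figure is
# scipy.stats.kurtosis over such a row. The toy row and the name
# _demo_argmax_and_kurtosis are assumptions made for illustration only.
def _demo_argmax_and_kurtosis():
    import numpy as np
    from scipy import stats

    log_proba_row = np.log(np.array([0.1, 0.7, 0.2]))

    # Manual scan, as in evaluate() above
    y_pred = 0
    max_prob = -float('inf')
    for j in range(len(log_proba_row)):
        if log_proba_row[j] > max_prob:
            max_prob = log_proba_row[j]
            y_pred = j
    assert y_pred == np.argmax(log_proba_row)

    # Fisher (excess) kurtosis of the same row, as aggregated in evaluate()
    return y_pred, stats.kurtosis(log_proba_row, axis=0, fisher=True, bias=True)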
def train(solver, max_iter, balance, norm, warm_start,
          multiclass_mode, ignored_feats=set()):
    """
    Trains the relation model as a multinomial logistic regression model

    :param solver: scikit-learn solver to use (e.g. 'lbfgs')
    :param max_iter: Maximum iterations
    :param balance: Whether to use balanced class weights
    :param norm: Normalization to apply to the training features
                 ('l1', 'l2', or None)
    :param warm_start: Whether to reuse the previous fit as initialization
    :param multiclass_mode: Multiclass handling ('ovr' or 'multinomial')
    :param ignored_feats: Features to exclude when loading the training data
    :return:
    """
    global log, meta_dict, train_file, model_file

    log.tic('info', "Loading training data")
    x, y, ids = \
        data_util.load_very_sparse_feats(train_file, meta_dict,
                                         ignored_feats)
    if norm is not None:
        normalize(x, norm=norm, copy=False)
    log.toc('info')

    log.tic('info', "Training")
    class_weight = None
    if balance:
        class_weight = 'balanced'
    #endif
    logistic = LogisticRegression(class_weight=class_weight, solver=solver,
                                  max_iter=max_iter,
                                  multi_class=multiclass_mode,
                                  n_jobs=-1, warm_start=warm_start)
    logistic.fit(x, y)
    log.toc('info')

    log.info("Saving model")
    with open(model_file, 'wb') as pickle_file:
        cPickle.dump(logistic, pickle_file)
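
# Illustrative sketch (not part of the original scripts): the multinomial
# configuration used above, applied to toy three-class data. The data and the
# name _demo_multinomial_logistic are assumptions made for illustration only;
# note that multi_class='multinomial' requires a compatible solver such as
# 'lbfgs', 'newton-cg', or 'sag'.
def _demo_multinomial_logistic():
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Three classes, two features
    x = np.array([[0.0, 0.1], [0.2, 0.0], [1.0, 1.1],
                  [1.2, 0.9], [2.0, 2.1], [2.2, 1.9]])
    y = np.array([0, 0, 1, 1, 2, 2])

    logistic = LogisticRegression(class_weight='balanced', solver='lbfgs',
                                  max_iter=200, multi_class='multinomial',
                                  warm_start=False)
    logistic.fit(x, y)
    return logistic.predict_proba(x)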
def train(max_iter, balance=False, warm_start=None, ignored_feats=set()):
    """
    Trains the cardinality classifier as a multinomial logistic regression
    model with max_iter iterations; optional parameters enable balanced
    class weights, warm start, and the ability to ignore features

    :param max_iter: Maximum iterations
    :param balance: Whether to use balanced class weights
    :param warm_start: Whether to reuse the previous fit as initialization
    :param ignored_feats: Features to exclude when loading the training data
    :return:
    """
    global log, train_file, meta_dict, model_file

    log.tic('info', "Loading training data")
    x, y, ids = \
        data_util.load_very_sparse_feats(train_file, meta_dict,
                                         ignored_feats)
    log.toc('info')

    log.tic('info', "Training")
    class_weight = None
    if balance:
        class_weight = 'balanced'
    #endif
    learner = LogisticRegression(class_weight=class_weight, solver='lbfgs',
                                 max_iter=max_iter, multi_class='multinomial',
                                 n_jobs=-1, warm_start=warm_start)
    #learner = mord.OrdinalRidge(max_iter=max_iter)
    learner.fit(x, y)
    log.toc('info')

    log.info("Saving model")
    with open(model_file, 'wb') as pickle_file:
        cPickle.dump(learner, pickle_file)
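
# Illustrative sketch (not part of the original scripts): what warm_start does
# in the configuration above -- a second call to fit() starts from the
# previous solution rather than from scratch. The toy data and the name
# _demo_warm_start are assumptions made for illustration only.
def _demo_warm_start():
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    x = np.array([[0.0], [0.5], [1.5], [2.0]])
    y = np.array([0, 0, 1, 1])

    learner = LogisticRegression(solver='lbfgs', max_iter=50,
                                 multi_class='multinomial', warm_start=True)
    learner.fit(x, y)
    first_coef = learner.coef_.copy()

    # With warm_start=True this second fit is initialized from first_coef,
    # which typically means fewer iterations when the data are similar
    learner.fit(x, y)
    return first_coef, learner.coef_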
def evaluate(lemma_file=None, hyp_file=None, ignored_feats=set(),
             save_scores=True):
    """
    Evaluates the model against the eval data

    :param lemma_file: Optional CSV file mapping mention IDs to lemmas
    :param hyp_file: Optional JSON file mapping mention IDs to hypernyms
    :param ignored_feats: Features to exclude when loading the eval data
    :param save_scores: Whether to write predicted log scores to scores_file
    :return:
    """
    global log, eval_file, model_file, scores_file

    log.info("Loading model from file")
    learner = cPickle.load(open(model_file, 'r'))

    log.info("Loading eval data")
    x_eval, y_eval, ids_eval = \
        data_util.load_very_sparse_feats(eval_file, meta_dict,
                                         ignored_feats)

    lemma_dict = dict()
    lemmas = set()
    if lemma_file is not None:
        log.info("Loading mention lemmas")
        with open(lemma_file, 'r') as f:
            for line in f:
                parts = line.replace('"', '').strip().split(",")
                lemma_dict[parts[0]] = parts[1]
                lemmas.add(parts[1])
            #endfor
        #endwith
    #endif

    hypernyms = set()
    if hyp_file is not None:
        log.info("Loading mention hypernyms")
        with open(hyp_file, 'r') as f:
            id_hyp_dict = json.load(f)
            for hyps in id_hyp_dict.values():
                if isinstance(hyps, list):
                    for h in hyps:
                        hypernyms.add(h)
                else:
                    hypernyms.add(hyps)
                #endif
            #endfor
    #endif

    log.info("Evaluating")

    # Per-lemma / per-hypernym score buckets
    lemma_scores = dict()
    for l in lemmas:
        lemma_scores[l] = ScoreDict()
    hyp_scores = dict()
    for h in hypernyms:
        hyp_scores[h] = ScoreDict()

    y_pred_eval = learner.predict_proba(x_eval)
    scores = ScoreDict()
    pred_scores = dict()
    for idx in range(len(y_pred_eval)):
        id = ids_eval[idx]
        pred_scores[id] = y_pred_eval[idx]
        pred = 0 if pred_scores[id][0] > pred_scores[id][1] else 1
        scores.increment(y_eval[idx], pred)
    #endfor

    log.info("---Confusion matrix---")
    scores.print_confusion()

    log.info("---Scores---")
    for label in scores.keys:
        print str(label) + "\t" + scores.get_score(label).to_string() + " - %d (%.2f%%)" % \
              (scores.get_gold_count(label), scores.get_gold_percent(label))
    log.info(None, "Accuracy: %.2f%%", scores.get_accuracy())

    if save_scores:
        log.info("Writing scores to " + scores_file)
        with open(scores_file, 'w') as f:
            for id in pred_scores.keys():
                score_line = list()
                score_line.append(id)
                for s in pred_scores[id]:
                    # Guard against log(0) before writing log scores
                    score = s
                    if score == 0:
                        score = np.nextafter(0, 1)
                    score = str(np.log(score))
                    score_line.append(score)
                #endfor
                f.write(','.join(score_line) + '\n')
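
# Illustrative sketch (not part of the original scripts): the log-score guard
# used when writing the scores file above -- exact zeros from predict_proba
# are nudged to the smallest positive float before taking the log, so no
# '-inf' reaches the output. The name _demo_safe_log_scores is an assumption
# made for illustration only.
def _demo_safe_log_scores(prob_row=(0.0, 0.25, 0.75)):
    import numpy as np

    safe_scores = list()
    for s in prob_row:
        score = s
        if score == 0:
            score = np.nextafter(0, 1)  # smallest positive double (~5e-324)
        safe_scores.append(np.log(score))
    # For the default row this yields roughly [-744.4, -1.39, -0.29]
    return safe_scores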