def main(
    training_set,
    language,
    outfile,
    model_class,
    model_param,
    extractor_class,
    extractor_param,
    gazetteer,
    folds,
    scoring,
    skip_majority,
    evaluate_gold,
):
    """ Trains the classifier and dumps it, together with its extractor, to `outfile`.

    :param training_set: Open file-like object with one JSON document per line;
     each document must provide the keys `sentence`, `lu` and `fes`. It is
     rewound with `seek(0)` after the optional gold evaluation, so it must
     be seekable when `evaluate_gold` is set, and its `name` is logged.
    :param language: Value passed to the extractor as its `language` parameter
    :param outfile: Destination handed to `joblib.dump`
    :param model_class: Model specification resolved through `initialize`
    :param model_param: Parameters for the model, resolved through `initialize`
    :param extractor_class: Extractor specification resolved through `initialize`
    :param extractor_param: Iterable of extra (name, value) extractor parameters,
     appended after the `language` one
    :param gazetteer: Open file-like object containing a JSON gazetteer, or a
     falsy value to use an empty gazetteer
    :param folds: K-fold evaluation is run before fitting only if this is > 1
    :param scoring: Forwarded to `kfolds_evaluation`
    :param skip_majority: Forwarded to `kfolds_evaluation`
    :param evaluate_gold: If truthy, run `gold_evaluation` on the training set
     before training
    """
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}
    model_cls, model_args = initialize(model_class, model_param, False)

    if evaluate_gold:
        gold_extractor = initialize(extractor_class, [("language", language)] + list(extractor_param), True)
        # gold_evaluation walks the sentences twice; a lazy `map` object
        # (Python 3) would already be exhausted on the second pass, so the
        # parsed rows are materialised into a list first.
        gold_sentences = [json.loads(row) for row in training_set]
        gold_evaluation(gold_sentences, gold_extractor, gazetteer, model_cls, model_args)
        # The file has been consumed above, rewind it for the actual training
        training_set.seek(0)

    # A fresh extractor is used for training, separate from the gold one
    extractor = initialize(extractor_class, [("language", language)] + list(extractor_param), True)

    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(
            data["sentence"], data["lu"], data["fes"], add_unknown=True, gazetteer=gazetteer
        )

    x, y = extractor.get_features(refit=True)
    logger.info("Got %d samples with %d features each", *x.shape)

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)
    if folds > 1:
        kfolds_evaluation(folds, model, scoring, skip_majority, x, y)

    logger.info("Fitting model ...")
    model.fit(x, y)

    # The extractor is stored next to the model in the same dump
    joblib.dump((model, {"extractor": extractor}), outfile)
    logger.info("Done, dumped model to '%s'", outfile)
def main(training_set, language, outfile, model_class, model_param, extractor_class,
         extractor_param, gazetteer, folds, scoring, skip_majority, evaluate_gold):
    """ Trains the classifier and dumps it, together with its extractor, to `outfile`.

    :param training_set: Open file-like object with one JSON document per line;
     each document must provide the keys `sentence`, `lu` and `fes`. It is
     rewound with `seek(0)` after the optional gold evaluation, so it must
     be seekable when `evaluate_gold` is set, and its `name` is logged.
    :param language: Value passed to the extractor as its `language` parameter
    :param outfile: Destination handed to `joblib.dump`
    :param model_class: Model specification resolved through `initialize`
    :param model_param: Parameters for the model, resolved through `initialize`
    :param extractor_class: Extractor specification resolved through `initialize`
    :param extractor_param: Iterable of extra (name, value) extractor parameters,
     appended after the `language` one
    :param gazetteer: Open file-like object containing a JSON gazetteer, or a
     falsy value to use an empty gazetteer
    :param folds: K-fold evaluation is run before fitting only if this is > 1
    :param scoring: Forwarded to `kfolds_evaluation`
    :param skip_majority: Forwarded to `kfolds_evaluation`
    :param evaluate_gold: If truthy, run `gold_evaluation` on the training set
     before training
    """
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}
    model_cls, model_args = initialize(model_class, model_param, False)

    if evaluate_gold:
        gold_extractor = initialize(
            extractor_class, [('language', language)] + list(extractor_param), True
        )
        # gold_evaluation iterates over the sentences twice: a lazy `map`
        # object (Python 3) would be empty on the second pass, hence the
        # parsed rows are collected into a list beforehand.
        gold_sentences = [json.loads(row) for row in training_set]
        gold_evaluation(
            gold_sentences, gold_extractor, gazetteer, model_cls, model_args
        )
        # The file was read until the end, go back to the start for training
        training_set.seek(0)

    # Training uses its own extractor instance, not the gold evaluation one
    extractor = initialize(extractor_class, [('language', language)] + list(extractor_param), True)

    logger.info("Building training set from '%s' ...", training_set.name)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['lu'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    x, y = extractor.get_features(refit=True)
    logger.info('Got %d samples with %d features each', *x.shape)

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)
    if folds > 1:
        kfolds_evaluation(folds, model, scoring, skip_majority, x, y)

    logger.info('Fitting model ...')
    model.fit(x, y)

    # Model and extractor are serialized together in a single dump
    joblib.dump((model, {
        'extractor': extractor
    }), outfile)
    logger.info("Done, dumped model to '%s'", outfile)
def gold_evaluation(sentences, extractor, gazetteer, model_cls, model_args):
    """ Trains on the sentences without gold annotations and logs the accuracy
    obtained on the sentences that have them.

    Nothing is returned, the outcome is only reported through the logger.

    :param sentences: Iterable of dicts with the keys `sentence`, `lu` and `fes`;
     the items with a non-empty `gold_fes` are held out for evaluation and
     all the others are used for training. One-shot iterators are accepted.
    :param extractor: Feature extractor, used for both training and gold samples
    :param gazetteer: Gazetteer forwarded to `extractor.process_sentence`
    :param model_cls: Model class handed to `FeatureSelectedClassifier`
    :param model_args: Model arguments handed to `FeatureSelectedClassifier`
    """
    logger.info("Evaluating on the gold sentences")

    # Two passes are made over the sentences (training first, gold later),
    # so generators and `map` objects must be materialised beforehand.
    sentences = list(sentences)

    for each in sentences:
        if not each.get("gold_fes"):
            extractor.process_sentence(
                each["sentence"], each["lu"], each["fes"], add_unknown=True, gazetteer=gazetteer
            )
    x_tr, y_tr = extractor.get_features(refit=True)

    # NOTE(review): presumably this clears the samples collected so far so that
    # only the gold ones are returned by the next get_features - confirm
    extractor.start()

    tagged_gold = [
        (
            each["gold_fes"],
            extractor.process_sentence(
                each["sentence"], each["lu"], each["fes"], add_unknown=False, gazetteer=gazetteer
            ),
        )
        for each in sentences
        if each.get("gold_fes")
    ]

    if not tagged_gold:
        logger.warning("asked to evaluate gold, but no gold sentences found")
        return

    x_gold, _ = extractor.get_features(refit=False)

    # For each chunk flagged as sample, collect the indexes of all the labels
    # accepted by the gold standard; a missing label is mapped to 'O'.
    y_gold = []
    for gold_fes, tagged in tagged_gold:
        for chunk, is_sample in tagged:
            if is_sample:
                y_gold.append([extractor.label_index[fe or "O"] for fe in gold_fes.get(chunk, [])])

    # There must be exactly one list of gold labels per extracted sample
    assert len(y_gold) == x_gold.shape[0]

    if not y_gold:
        # Gold sentences without samples would cause a division by zero below
        logger.warning("asked to evaluate gold, but gold sentences contain no samples")
        return

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)
    model.fit(x_tr, y_tr)
    y_pred = model.predict(x_gold)

    # A prediction is correct if it is one of the gold labels of its chunk
    correct = sum(1 for actual, predicted in zip(y_gold, y_pred) if predicted in actual)
    logger.info("Gold accuracy: %f (%d / %d roles)", float(correct) / len(y_gold), correct, len(y_gold))
def gold_evaluation(sentences, extractor, gazetteer, model_cls, model_args):
    """ Trains on the sentences without gold annotations and logs the accuracy
    obtained on the sentences that have them.

    Nothing is returned, the outcome is only reported through the logger.

    :param sentences: Iterable of dicts with the keys `sentence`, `lu` and `fes`;
     the items with a non-empty `gold_fes` are held out for evaluation and
     all the others are used for training. One-shot iterators are accepted.
    :param extractor: Feature extractor, used for both training and gold samples
    :param gazetteer: Gazetteer forwarded to `extractor.process_sentence`
    :param model_cls: Model class handed to `FeatureSelectedClassifier`
    :param model_args: Model arguments handed to `FeatureSelectedClassifier`
    """
    logger.info('Evaluating on the gold sentences')

    # The sentences are scanned twice (training first, gold later), thus
    # generators and `map` objects have to be turned into a list up front.
    sentences = list(sentences)

    for each in sentences:
        if not each.get('gold_fes'):
            extractor.process_sentence(each['sentence'], each['lu'], each['fes'],
                                       add_unknown=True, gazetteer=gazetteer)
    x_tr, y_tr = extractor.get_features(refit=True)

    # NOTE(review): presumably this discards the samples gathered so far so
    # that the next get_features only returns the gold ones - confirm
    extractor.start()

    tagged_gold = []
    for each in sentences:
        if each.get('gold_fes'):
            tagged_gold.append((each['gold_fes'], extractor.process_sentence(
                each['sentence'], each['lu'], each['fes'],
                add_unknown=False, gazetteer=gazetteer
            )))

    if not tagged_gold:
        logger.warning('asked to evaluate gold, but no gold sentences found')
        return

    x_gold, _ = extractor.get_features(refit=False)

    # Every chunk flagged as sample gets the indexes of all the labels that
    # the gold standard accepts for it; a missing label is mapped to 'O'.
    y_gold = []
    for gold_fes, tagged in tagged_gold:
        for chunk, is_sample in tagged:
            if is_sample:
                y_gold.append([extractor.label_index[fe or 'O']
                               for fe in gold_fes.get(chunk, [])])

    # Gold labels and extracted samples are expected to be aligned one to one
    assert len(y_gold) == x_gold.shape[0]

    if not y_gold:
        # Without any sample the accuracy below would divide by zero
        logger.warning('asked to evaluate gold, but gold sentences contain no samples')
        return

    model = FeatureSelectedClassifier(model_cls, extractor.lu_column(), model_args)
    model.fit(x_tr, y_tr)
    y_pred = model.predict(x_gold)

    # A prediction counts as correct when it is among the gold labels
    correct = sum(1 for actual, predicted in zip(y_gold, y_pred) if predicted in actual)
    logger.info('Gold accuracy: %f (%d / %d roles)',
                float(correct) / len(y_gold), correct, len(y_gold))