def train(train_corpus: str, dev_corpus: str, c1: float = 0.0, c2: float = 0.0, algorithm: str = 'lbfgs', max_iterations: int = 100, all_possible_transitions: bool = False, window_size: int = 1, model_filename: str = None, _run: Run = None, _log: logger = None): """ running crf experiment """ _run.add_resource(train_corpus) _run.add_resource(dev_corpus) train_sents, _ = get_tagged_sents_and_words(train_corpus) dev_sents, _ = get_tagged_sents_and_words(dev_corpus) X_train = [sent2features(s, window_size) for s in train_sents] y_train = [sent2labels(s) for s in train_sents] X_dev = [sent2features(s, window_size) for s in dev_sents] y_dev = [sent2labels(s) for s in dev_sents] crf = sklearn_crfsuite.CRF( algorithm=algorithm, c1=c1, c2=c2, max_iterations=max_iterations, all_possible_transitions=all_possible_transitions, model_filename=model_filename, ) crf.fit(X_train, y_train) y_pred = crf.predict(X_dev) overall, by_type = evaluate(y_dev, y_pred) _run.info[f'overall_f1'] = overall.f1_score _run.log_scalar('overall_f1', overall.f1_score) _run.info[f'overall_precision'] = overall.precision _run.log_scalar('overall_precision', overall.precision) _run.info[f'overall_recall'] = overall.recall _run.log_scalar('overall_recall', overall.recall) _log.info(f'Overall F1 score: {overall.f1_score}') for _, key in enumerate(sorted(by_type.keys())): for metric_key in by_type[key]._fields: metric_val = getattr(by_type[key], metric_key) _run.info[f'{key}-{metric_key}'] = metric_val _run.log_scalar(f'{key}-{metric_key}', metric_val) _log.info(f'{key}-{metric_key}: {metric_val}') if model_filename is not None: _log.info(f'saving to: {model_filename}.pkl') joblib.dump(crf, f'{model_filename}.pkl') _run.add_artifact(f'{model_filename}.pkl')
def test(model_filename: str, test_corpus: str, window_size: int = 5, _run: Run = None, _log: logger = None): _run.add_resource(test_corpus) _run.add_resource(f'{model_filename}.pkl') test_sents, _ = get_tagged_sents_and_words(test_corpus) X_test = [sent2features(s, window_size) for s in test_sents] y_test = [sent2labels(s) for s in test_sents] _log.info(f'load from: {model_filename}.pkl') crf = sklearn_crfsuite.CRF(model_filename=model_filename) y_pred = crf.predict(X_test) overall, by_type = evaluate(y_test, y_pred) _run.info[f'overall_f1'] = overall.f1_score _run.log_scalar('overall_f1', overall.f1_score) _run.info[f'overall_precision'] = overall.precision _run.log_scalar('overall_precision', overall.precision) _run.info[f'overall_recall'] = overall.recall _run.log_scalar('overall_recall', overall.recall) _log.info(f'Overall F1 score: {overall.f1_score}') for _, key in enumerate(sorted(by_type.keys())): for metric_key in by_type[key]._fields: metric_val = getattr(by_type[key], metric_key) _run.info[f'{key}-{metric_key}'] = metric_val _run.log_scalar(f'{key}-{metric_key}', metric_val) _log.info(f'{key}-{metric_key}: {metric_val}')