def train(
    train_corpus: str,
    dev_corpus: str,
    c1: float = 0.0,
    c2: float = 0.0,
    algorithm: str = 'lbfgs',
    max_iterations: int = 100,
    all_possible_transitions: bool = False,
    window_size: int = 1,
    model_filename: str = None,
    _run: Run = None,
    _log: logger = None,
):
    """Fit a CRF on the training corpus and score it on the dev corpus.

    Both corpus paths are registered as resources on ``_run``. Sentences are
    read with ``get_tagged_sents_and_words``, converted to features with
    ``sent2features`` (using ``window_size``) and to labels with
    ``sent2labels``. The dev predictions are passed to ``evaluate``; the
    overall precision/recall/F1 and every per-type metric are stored in
    ``_run.info``, logged as scalars on ``_run`` and (F1 and per-type values)
    written to ``_log``.

    Args:
        train_corpus: Path of the corpus used for fitting.
        dev_corpus: Path of the corpus used for evaluation.
        c1, c2, algorithm, max_iterations, all_possible_transitions:
            Forwarded unchanged to ``sklearn_crfsuite.CRF``.
        window_size: Second argument given to ``sent2features``.
        model_filename: Forwarded to ``sklearn_crfsuite.CRF``; when not
            ``None`` the fitted estimator is also dumped with joblib to
            ``"<model_filename>.pkl"`` and attached to ``_run`` as an artifact.
        _run: Run object receiving resources, metrics and artifacts.
        _log: Logger used for progress messages.
    """
    for corpus_path in (train_corpus, dev_corpus):
        _run.add_resource(corpus_path)

    train_sents, _ = get_tagged_sents_and_words(train_corpus)
    dev_sents, _ = get_tagged_sents_and_words(dev_corpus)

    X_train = [sent2features(sentence, window_size) for sentence in train_sents]
    y_train = [sent2labels(sentence) for sentence in train_sents]
    X_dev = [sent2features(sentence, window_size) for sentence in dev_sents]
    y_dev = [sent2labels(sentence) for sentence in dev_sents]

    crf_options = dict(
        algorithm=algorithm,
        c1=c1,
        c2=c2,
        max_iterations=max_iterations,
        all_possible_transitions=all_possible_transitions,
        model_filename=model_filename,
    )
    crf = sklearn_crfsuite.CRF(**crf_options)
    crf.fit(X_train, y_train)

    predictions = crf.predict(X_dev)
    overall, by_type = evaluate(y_dev, predictions)

    # Overall scores: (suffix of the reported name, attribute on `overall`).
    overall_metrics = (
        ('f1', 'f1_score'),
        ('precision', 'precision'),
        ('recall', 'recall'),
    )
    for suffix, attribute in overall_metrics:
        metric_name = f'overall_{suffix}'
        _run.info[metric_name] = getattr(overall, attribute)
        _run.log_scalar(metric_name, getattr(overall, attribute))
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type scores, reported in sorted key order.
    for type_key in sorted(by_type.keys()):
        for field_name in by_type[type_key]._fields:
            field_value = getattr(by_type[type_key], field_name)
            metric_name = f'{type_key}-{field_name}'
            _run.info[metric_name] = field_value
            _run.log_scalar(metric_name, field_value)
            _log.info(f'{metric_name}: {field_value}')

    if model_filename is None:
        return

    pickle_path = f'{model_filename}.pkl'
    _log.info(f'saving to: {pickle_path}')
    joblib.dump(crf, pickle_path)
    _run.add_artifact(pickle_path)
def tag_to_json(tagger, text, sep="\n", window_size=0):
    """Tag ``text`` segment by segment and collect the non-"O" spans.

    The text is split on the module-level ``SEP``; each non-empty segment is
    tokenised with ``word_tokenize``, converted with ``sent2features``,
    labelled by ``tagger.predict`` and turned into spans by ``tag``. Span
    offsets are shifted by the position of the segment inside ``text``.

    Args:
        tagger: Object exposing ``predict(features)``.
        text: Input to annotate, as ``bytes`` (decoded as UTF-8) or ``str``.
        sep: Separator whose length is added after every segment when
            computing the offset of the next one.
        window_size: Second argument given to ``sent2features``.

    Returns:
        dict mapping a running integer index to
        ``{'type': ..., 'offsets': ((start, end),), 'texts': (substring,)}``.
    """
    annotations = {}

    def _add_ann(start, end, _type):
        # Keys are consecutive integers in insertion order.
        annotations[len(annotations)] = {
            'type': _type,
            'offsets': ((start, end), ),
            'texts': ((text[start:end]), ),
        }

    print(text)
    # Only bytes need decoding; a str input previously raised AttributeError.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    print(SEP)

    # NOTE(review): the split uses the global SEP while offsets advance by
    # len(sep); offsets are only meaningful when both have the same length.
    offset = 0
    for sent in text.split(SEP):
        if sent:
            print("sent : ", sent)
            x_feat = sent2features(word_tokenize(sent), window_size)
            result = tagger.predict(x_feat)
            result = tag(sent, sent.split(), result)
            print(result)
            for span in result:
                if span["tagname"] != "O":
                    _add_ann(
                        offset + int(span["start"]),
                        offset + int(span["end"]),
                        span["tagname"],
                    )
        # Advance for empty segments too: their separator still occupies
        # characters in `text`, otherwise later offsets drift to the left.
        offset += len(sent) + len(sep)
    return annotations
def test(
    model_filename: str,
    test_corpus: str,
    window_size: int = 5,
    _run: Run = None,
    _log: logger = None,
):
    """Score a stored CRF model on a test corpus and report the metrics.

    The corpus path and ``"<model_filename>.pkl"`` are registered as
    resources on ``_run``. Sentences are read with
    ``get_tagged_sents_and_words`` and converted with ``sent2features`` /
    ``sent2labels``; predictions are passed to ``evaluate``. Overall
    precision/recall/F1 and every per-type metric are stored in
    ``_run.info``, logged as scalars on ``_run`` and (F1 and per-type values)
    written to ``_log``.

    Args:
        model_filename: Passed as ``model_filename`` to
            ``sklearn_crfsuite.CRF``.
        test_corpus: Path of the corpus to evaluate on.
        window_size: Second argument given to ``sent2features``.
        _run: Run object receiving resources and metrics.
        _log: Logger used for progress messages.
    """
    pickle_path = f'{model_filename}.pkl'
    _run.add_resource(test_corpus)
    _run.add_resource(pickle_path)

    test_sents, _ = get_tagged_sents_and_words(test_corpus)
    X_test = [sent2features(sentence, window_size) for sentence in test_sents]
    y_test = [sent2labels(sentence) for sentence in test_sents]

    # NOTE(review): the message and the registered resource name the .pkl
    # file, but the estimator is built from `model_filename` itself.
    _log.info(f'load from: {pickle_path}')
    crf = sklearn_crfsuite.CRF(model_filename=model_filename)

    predictions = crf.predict(X_test)
    overall, by_type = evaluate(y_test, predictions)

    # Overall scores: (suffix of the reported name, attribute on `overall`).
    overall_metrics = (
        ('f1', 'f1_score'),
        ('precision', 'precision'),
        ('recall', 'recall'),
    )
    for suffix, attribute in overall_metrics:
        metric_name = f'overall_{suffix}'
        _run.info[metric_name] = getattr(overall, attribute)
        _run.log_scalar(metric_name, getattr(overall, attribute))
    _log.info(f'Overall F1 score: {overall.f1_score}')

    # Per-type scores, reported in sorted key order.
    for type_key in sorted(by_type.keys()):
        for field_name in by_type[type_key]._fields:
            field_value = getattr(by_type[type_key], field_name)
            metric_name = f'{type_key}-{field_name}'
            _run.info[metric_name] = field_value
            _run.log_scalar(metric_name, field_value)
            _log.info(f'{metric_name}: {field_value}')
def print_corpus(corpus: List, labels: List, fileout: str, window_size: int = 0):
    """Write a corpus as tab-separated ``tag<TAB>name=value...`` rows.

    For every sentence the features come from ``sent2features`` and the tags
    from ``sent2partial_labels``; one row is printed per (tag, feature dict)
    pair, followed by an empty line after the sentence. A log message with
    ``fileout.name`` is emitted once everything has been written.

    Args:
        corpus: Iterable of tagged sentences.
        labels: Passed as ``labels`` to ``sent2partial_labels``.
        fileout: Used as the ``file=`` target of ``print`` and read for its
            ``.name`` attribute, i.e. an open writable file object despite
            the ``str`` annotation.
        window_size: Second argument given to ``sent2features``.
    """
    for sentence in corpus:
        feature_maps = sent2features(sentence, window_size)
        sentence_tags = sent2partial_labels(sentence, labels=labels)
        for label, feature_map in zip(sentence_tags, feature_maps):
            columns = '\t'.join(
                f'{name}={value}' for name, value in feature_map.items()
            )
            print(f'{label}\t{columns}', file=fileout)
        # Empty line terminates the sentence.
        print('', file=fileout)
    logging.info(f"print corpus to {fileout.name}")
help='window sizes') parser.add_argument('--features', default='default', choices=['default', 'stanford'], \ type=str, help='features prep, default, or stanford-ner') parser.add_argument('--encoding', default='utf-8', help='file encoding') args = parser.parse_args() pre = ['B', 'I'] ent = ['Person', 'Place', 'Organisation'] labels = [f'{x}-{y}' for x in pre for y in ent] + ['O'] # print(labels) corpus, _ = get_tagged_sents_and_words(args.file) for tagged_sent in corpus: if args.features == 'default': feats = sent2features(tagged_sent, args.window_size) tags = sent2partial_labels(tagged_sent, labels=labels) elif args.features == 'stanford': feats = sent2stanfordfeats(tagged_sent) tags = sent2stanford_partial(tagged_sent, labels=labels) for tag, feature in zip(tags, feats): # feature = ['{}={}'.format(k, v) for k, v in feature.items()] feat = [] for k, v in feature.items(): if k.split(':')[0].isdigit(): weight_name = k.split(':') feat.append('{}={}:{}'.format( ''.join(weight_name[1:]).strip(':'), v, weight_name[0])) else: feat.append('{}={}'.format(k.strip(':'), v))