def main(train_data_dir, test_data_dir=None, output_dir=None, lang='de', blank_model=False, verbose=False,
         epochs=100, init_batch_size=1, init_dropout_rate=0.5, batch_comp_rate=1.001):
    """Train the 'ner' pipe of a spaCy pipeline, then optionally evaluate and save it.

    Args:
        train_data_dir: Location handed to ``load_data`` to obtain the training data.
        test_data_dir: Optional location of held-out data; when not ``None`` the
            trained pipeline is evaluated on it via ``test``.
        output_dir: Optional destination; when truthy the pipeline is written
            there via ``save_model``.
        lang: Key into ``INSTALLED_MODELS``; also the language code used to
            create the pipeline when ``blank_model`` is set.
        blank_model: Start from an empty pipeline with a fresh 'ner' pipe
            instead of loading the installed model.
        verbose: Forwarded to ``train``.
        epochs: Forwarded to ``train``.
        init_batch_size: Forwarded to ``train``.
        init_dropout_rate: Forwarded to ``train``.
        batch_comp_rate: Forwarded to ``train``.

    Raises:
        Whatever the ``INSTALLED_MODELS[lang]`` lookup raises for an
        unsupported ``lang`` (a ``KeyError`` if it is a dict).
    """
    # Resolve the model name before branching so an unsupported ``lang`` is
    # rejected in both modes, exactly as before.
    model_name = INSTALLED_MODELS[lang]

    if blank_model:
        # spacy.blank() expects a language code such as 'de', not the name of
        # an installed model package, so pass ``lang`` rather than the
        # INSTALLED_MODELS entry that spacy.load() consumes below.
        nlp = spacy.blank(lang)
        add_pipe(nlp, 'ner')
    else:
        nlp = spacy.load(model_name)

    train_data = load_data(train_data_dir)
    # Register every label found in the training data on the NER pipe before training.
    add_labels(nlp.get_pipe('ner'), get_ner_labels(train_data))
    train(train_data, 'ner', epochs, nlp, blank_model, verbose,
          init_batch_size, init_dropout_rate, batch_comp_rate)

    if test_data_dir is not None:
        test_data = load_data(test_data_dir)
        print('\nPerformance on test data:')
        # The evaluation's third argument is fixed to False here, independent of ``verbose``.
        test(test_data, nlp, False)

    if output_dir:
        save_model(nlp, output_dir)
def main(test_data_dir, model='de', verbose=False):
    """Evaluate a spaCy pipeline on annotated test data.

    Args:
        test_data_dir: Location handed to ``load_data`` to obtain the test data.
        model: Either a key of ``INSTALLED_MODELS`` (resolved to the mapped
            model name) or anything else ``spacy.load`` accepts, which is
            passed through unchanged.
        verbose: Forwarded to ``test``.
    """
    if model in INSTALLED_MODELS:
        # Resolve the alias that was actually requested. The lookup used to be
        # hard-coded to 'de', so every other supported alias loaded the German model.
        model = INSTALLED_MODELS[model]

    nlp = spacy.load(model)
    test_data = load_data(test_data_dir)
    test(test_data, nlp, verbose)
def test_load_data(self):
    # The annotation fixture must load as two samples; the first sample's text
    # and its single ORG entity span are pinned exactly.
    expected_text = (
        'Die Revision des Klägers gegen das Urteil des 6. Zivilsenats des Oberlandesgerichts Köln '
        'vom 16. Dezember 2016 wird zurückgewiesen.'
    )
    expected_entities = [(65, 88, 'ORG')]

    samples = load_data(self.ANNOTATIONS_PATH)

    self.assertEqual(2, len(samples))
    first_sample = samples[0]
    self.assertEqual(expected_text, first_sample[0])
    self.assertEqual(expected_entities, first_sample[1]['entities'])
def test_train_overfit(self):
    # Training on the fixture and scoring on that same fixture must beat the
    # score the pretrained pipeline achieved before training.
    samples = load_data(self.ANNOTATIONS_PATH)
    pipeline = spacy.load('de_core_news_sm')

    baseline_score = test(samples, pipeline, False)

    ner_pipe = pipeline.get_pipe('ner')
    add_labels(ner_pipe, get_ner_labels(samples))
    train(samples, 'ner', 10, pipeline, True, True, 1, 0.0, 1.0)

    trained_score = test(samples, pipeline, False)
    self.assertGreater(trained_score, baseline_score)
def test_train_blank(self):
    # Smoke test: one training pass over a blank German pipeline with a newly
    # added 'ner' pipe must complete without raising. Nothing is asserted.
    samples = load_data(self.ANNOTATIONS_PATH)

    blank_pipeline = spacy.blank('de')
    add_pipe(blank_pipeline, 'ner')

    ner_pipe = blank_pipeline.get_pipe('ner')
    add_labels(ner_pipe, get_ner_labels(samples))

    train(samples, 'ner', 1, blank_pipeline, True, True, 1, 0.0, 1.0)