def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        # Skip documents whose subject file already exists, unless --force
        if os.path.exists(subjectfilename) and not force:
            click.echo("Not overwriting {} (use --force to override)".format(
                subjectfilename))
            continue
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            results = project.suggest(text, backend_params)
            for hit in hit_filter(results).as_list(project.subjects):
                # URI, label (and notation, when present) and score as TSV
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)
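# SuggestionFilter is imported from elsewhere; run_index() only relies on it
# being a callable that applies the limit/threshold cutoffs to a suggestion
# result. A minimal sketch of that assumed behaviour (hypothetical class, not
# the actual Annif implementation; the real filter returns a SuggestionResult
# whose as_list() is called above, whereas this sketch returns the list
# directly for brevity):
class SuggestionFilterSketch:
    def __init__(self, subjects, limit=10, threshold=0.0):
        self._subjects = subjects
        self._limit = limit
        self._threshold = threshold

    def __call__(self, result):
        # Drop hits below the threshold, then keep the top `limit` by score
        hits = [hit for hit in result.as_list(self._subjects)
                if hit.score >= self._threshold]
        hits.sort(key=lambda hit: hit.score, reverse=True)
        return hits[:self._limit]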
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        # Evaluate the same suggestions under every limit/threshold setting
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    # Defined before the loop so the best-scores report below also works
    # when there are no batches to consume
    metrics = ['Precision (doc avg)',
               'Recall (doc avg)',
               'F1 score (doc avg)']
    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way.
    # This way GC will have a chance to reclaim the memory.
    filter_batches = list(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.pop(0)
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(
                params[0], params[1],
                results['Precision (doc avg)'],
                results['Recall (doc avg)'],
                results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(
                metric,
                best_scores[metric],
                best_params[metric][0],
                best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
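# generate_filter_batches() is defined elsewhere in the module; the loops in
# run_optimize() only assume that it returns a mapping from each
# (limit, threshold) pair to a (SuggestionFilter, EvaluationBatch) tuple.
# A minimal sketch of that assumed contract -- the parameter ranges here are
# illustrative guesses, not the actual values used by Annif:
def generate_filter_batches_sketch(subjects):
    filter_batches = collections.OrderedDict()
    for limit in range(1, BATCH_MAX_LIMIT + 1):
        for threshold in [i * 0.05 for i in range(20)]:
            hit_filter = SuggestionFilter(subjects, limit, threshold)
            batch = annif.eval.EvaluationBatch(subjects)
            filter_batches[(limit, threshold)] = (hit_filter, batch)
    return filter_batches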
def test_project_suggest_combine(app):
    with app.app_context():
        project = annif.project.get_project('dummydummy')
        result = project.suggest('this is some text')
        assert len(result) == 1
        assert result[0].uri == 'http://example.org/dummy'
        assert result[0].label == 'dummy'
        assert result[0].score == 1.0
def test_project_suggest_combine(registry):
    project = registry.get_project('dummydummy')
    result = project.suggest('this is some text')
    assert len(result) == 1
    hits = result.as_list(project.subjects)
    assert hits[0].uri == 'http://example.org/dummy'
    assert hits[0].label == 'dummy'
    assert hits[0].score == 1.0
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)
    hits = hit_filter(project.suggest(text, backend_params))
    for hit in hits:
        click.echo("<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score))
def test_project_train_state_not_available(registry, caplog):
    project = registry.get_project('dummydummy')
    project.backend.is_trained = None
    with caplog.at_level(logging.WARNING):
        result = project.suggest('this is some text')
    assert project.is_trained is None
    assert len(result) == 1
    hits = result.as_list(project.subjects)
    assert hits[0].uri == 'http://example.org/dummy'
    assert hits[0].label == 'dummy'
    assert hits[0].score == 1.0
    assert 'Could not get train state information' in caplog.text
def test_project_learn(registry, tmpdir):
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.tsv').write('<http://example.org/key1>\tkey1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.tsv').write('<http://example.org/key2>\tkey2')
    docdir = annif.corpus.DocumentDirectory(str(tmpdir))

    project = registry.get_project('dummy-fi')
    project.learn(docdir)

    result = project.suggest('this is some text')
    assert len(result) == 1
    hits = result.as_list(project.subjects)
    assert hits[0].uri == 'http://example.org/key1'
    assert hits[0].label == 'key1'
    assert hits[0].score == 1.0
def test_project_learn(app, tmpdir):
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.tsv').write('<http://example.org/key1>\tkey1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.tsv').write('<http://example.org/key2>\tkey2')
    docdir = annif.corpus.DocumentDirectory(str(tmpdir))

    with app.app_context():
        project = annif.project.get_project('dummy-fi')
        project.learn(docdir)
        result = project.suggest('this is some text')
        assert len(result) == 1
        assert result[0].uri == 'http://example.org/key1'
        assert result[0].label == 'key1'
        assert result[0].score == 1.0
def suggest(project_id, text, limit, threshold):
    """suggest subjects for the given text and return a dict with results
    formatted according to Swagger spec"""

    try:
        # allow suggestions also for projects marked hidden (but not private)
        project = annif.project.get_project(
            project_id, min_access=Access.hidden)
    except ValueError:
        return project_not_found_error(project_id)

    hit_filter = SuggestionFilter(limit, threshold)
    try:
        result = project.suggest(text)
    except AnnifException as err:
        return server_error(err)
    hits = hit_filter(result)
    return {'results': [hit._asdict() for hit in hits]}
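# For reference, each hit is assumed to be a named tuple with uri, label and
# score fields, so hit._asdict() yields a plain dict. The payload returned by
# suggest() then has roughly this shape (values illustrative only):
EXAMPLE_SUGGEST_RESPONSE = {
    'results': [
        {'uri': 'http://example.org/dummy', 'label': 'dummy', 'score': 1.0},
    ]
}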
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    docs = open_documents(paths)
    for doc in docs.documents:
        results = project.suggest(doc.text, backend_params)
        hits = hit_filter(results)
        eval_batch.evaluate(hits,
                            annif.corpus.SubjectSet((doc.uris, doc.labels)))

    template = "{0:<20}\t{1}"
    for metric, score in eval_batch.results().items():
        click.echo(template.format(metric + ":", score))