def run_index(project_id, directory, suffix, force, limit, threshold,
              backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.

    For every document file yielded by ``annif.corpus.DocumentDirectory``
    the output file name is the document file name with a trailing
    ``.txt`` replaced by ``suffix``. Each output line has the form
    ``<uri>\\tlabel[\\tnotation]\\tscore``; a missing/empty label or
    notation is left out of the line.

    :param project_id: identifier passed to ``get_project``
    :param directory: directory scanned for documents
    :param suffix: literal text that replaces the ``.txt`` extension
    :param force: when false, existing output files are left untouched
    :param limit: passed to ``SuggestionFilter``
    :param threshold: passed to ``SuggestionFilter``
    :param backend_param: passed to ``parse_backend_params``
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, _unused_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # A callable replacement makes re.sub insert the suffix verbatim;
        # a plain string would have its backslashes interpreted as
        # escapes/group references.
        subjectfilename = re.sub(r'\.txt$', lambda _match: suffix,
                                 docfilename)

        # Decide whether to skip before touching the document, so that
        # skipped documents cost no I/O.
        if os.path.exists(subjectfilename) and not force:
            click.echo("Not overwriting {} (use --force to override)".format(
                subjectfilename))
            continue

        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()

        # Compute the suggestions before opening the output in 'w' mode:
        # if suggesting fails, an existing result file is not truncated
        # and no empty file is left behind.
        results = project.suggest(text, backend_params)
        hits = hit_filter(results).as_list(project.subjects)

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                line = "<{}>\t{}\t{}".format(
                    hit.uri,
                    '\t'.join(filter(None, (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)
def test_hitfilter_vector_suggestion_results_with_deprecated_subjects(
        subject_index):
    """A filter built from the subject index must drop the deprecated
    subject from a vector-based suggestion result."""
    deprecated_uri = 'http://example.org/deprecated'
    subject_index.append(deprecated_uri, None, None)

    # Every subject, including the deprecated one, gets a score of 1.0.
    scores = np.ones(len(subject_index))
    unfiltered = VectorSuggestionResult(scores)
    subject_filter = SuggestionFilter(subject_index)
    filtered = subject_filter(unfiltered)

    # The filtered result is shorter by exactly the deprecated subjects.
    deprecated_count = len(subject_index.deprecated_ids())
    assert len(unfiltered) == len(filtered) + deprecated_count

    expected = SubjectSuggestion(
        uri=deprecated_uri,
        label=None,
        notation=None,
        score=1.0)
    assert expected in unfiltered.as_list(subject_index)
    assert expected not in filtered.as_list(subject_index)
def test_hitfilter_zero_score(subject_index):
    """A suggestion with score 0.0 must not survive a default filter."""
    zero_hit = SubjectSuggestion(uri='uri', label='label', score=0.0)
    unfiltered = ListSuggestionResult([zero_hit], subject_index)

    default_filter = SuggestionFilter()
    filtered = default_filter(unfiltered)

    assert isinstance(filtered, SuggestionResult)
    assert len(filtered) == 0
def generate_filter_batches(subjects):
    """Build one (filter, evaluation batch) pair per parameter combination.

    The returned ordered dict is keyed by ``(limit, threshold)`` for limits
    1..15 and thresholds ``i * 0.05`` for i in 0..19, in that nesting
    order. Each value is a tuple of a ``SuggestionFilter`` and a fresh
    ``annif.eval.EvaluationBatch`` created from ``subjects``.
    """
    # The threshold grid is the same for every limit, so build it once.
    thresholds = [step * 0.05 for step in range(20)]

    batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                SuggestionFilter(limit, threshold),
                annif.eval.EvaluationBatch(subjects),
            )
    return batches
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.

    The whole of stdin is read as the document text; every hit that
    passes the filter is echoed as ``<uri>\\tlabel\\tscore``.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)

    suggestions = project.suggest(text, backend_params)
    row_template = "<{}>\t{}\t{}"
    for hit in hit_filter(suggestions):
        row = row_template.format(hit.uri, hit.label, hit.score)
        click.echo(row)
def test_hitfilter_list_suggestion_results_with_deprecated_subjects(
        subject_index):
    """A filter built from the subject index must drop the deprecated
    subject from a list-based suggestion result and keep the other two
    suggestions in their original order."""
    deprecated_uri = 'http://example.org/deprecated'
    subject_index.append(deprecated_uri, None, None)

    hits = [
        SubjectSuggestion(
            uri='http://www.yso.fi/onto/yso/p7141',
            label='sinetit',
            notation=None,
            score=1.0),
        SubjectSuggestion(
            uri='http://www.yso.fi/onto/yso/p6479',
            label='viikingit',
            notation=None,
            score=0.5),
        SubjectSuggestion(
            uri=deprecated_uri,
            label=None,
            notation=None,
            score=0.5),
    ]
    unfiltered = ListSuggestionResult(hits)
    subject_filter = SuggestionFilter(subject_index)
    filtered = subject_filter(unfiltered)

    assert isinstance(filtered, SuggestionResult)
    assert len(filtered) == 2
    # The two surviving suggestions match the first two originals.
    assert filtered.as_list(
        subject_index)[0] == unfiltered.as_list(subject_index)[0]
    assert filtered.as_list(
        subject_index)[1] == unfiltered.as_list(subject_index)[1]
def suggest(project_id, text, limit, threshold):
    """suggest subjects for the given text and return a dict with results
    formatted according to Swagger spec

    Returns the value of ``project_not_found_error`` when the project
    lookup raises ``ValueError``, and the value of ``server_error`` when
    building the filter or suggesting raises ``AnnifException``.
    """
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.hidden)
    except ValueError:
        return project_not_found_error(project_id)

    try:
        hit_filter = SuggestionFilter(project.subjects, limit, threshold)
        raw_result = project.suggest(text)
    except AnnifException as err:
        return server_error(err)

    # Filtering and serialisation happen outside the try block, as before.
    filtered_hits = hit_filter(raw_result).as_list(project.subjects)
    serialised = []
    for hit in filtered_hits:
        serialised.append(hit._asdict())
    return {'results': serialised}
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.

    Each metric reported by the evaluation batch is echoed on its own
    line as a left-aligned ``name:`` column followed by the score.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    corpus = open_documents(paths)
    for document in corpus.documents:
        suggestions = project.suggest(document.text, backend_params)
        filtered = hit_filter(suggestions)
        gold_standard = annif.corpus.SubjectSet(
            (document.uris, document.labels))
        eval_batch.evaluate(filtered, gold_standard)

    template = "{0:<20}\t{1}"
    for metric, score in eval_batch.results().items():
        report_line = template.format(metric + ":", score)
        click.echo(report_line)
def test_hitfilter_threshold(subject_index):
    """With threshold 0.5, two of the generated suggestions are expected
    to remain after filtering."""
    unfiltered = generate_suggestions(10, subject_index)

    threshold_filter = SuggestionFilter(threshold=0.5)
    filtered = threshold_filter(unfiltered)

    assert isinstance(filtered, SuggestionResult)
    assert len(filtered) == 2
def test_hitfilter_limit(subject_index):
    """With limit 5, five of the generated suggestions are expected to
    remain after filtering."""
    unfiltered = generate_suggestions(10, subject_index)

    limit_filter = SuggestionFilter(limit=5)
    filtered = limit_filter(unfiltered)

    assert isinstance(filtered, SuggestionResult)
    assert len(filtered) == 5