Example #1
0
def run_index(project_id, directory, suffix, force, limit, threshold,
              backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.

    The result file name is the document file name with its trailing
    ``.txt`` replaced by ``suffix``. An existing result file is left
    untouched unless ``force`` is true. Each output line has the form
    ``<uri>\\tlabel[\\tnotation]\\tscore``; a missing label or notation is
    omitted from the line.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        # Decide whether to skip before touching the document, so skipped
        # documents cost no read at all.
        if os.path.exists(subjectfilename) and not force:
            click.echo("Not overwriting {} (use --force to override)".format(
                subjectfilename))
            continue

        # 'utf-8-sig' transparently drops a leading byte order mark.
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()

        # Compute the suggestions BEFORE opening the output file: opening
        # with 'w' creates/truncates it, so a failure during suggestion
        # would otherwise leave an empty result file behind that blocks
        # later runs unless --force is given.
        results = project.suggest(text, backend_params)
        hits = hit_filter(results).as_list(project.subjects)

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                line = "<{}>\t{}\t{}".format(
                    hit.uri, '\t'.join(filter(None,
                                              (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)
Example #2
0
def test_hitfilter_vector_suggestion_results_with_deprecated_subjects(
        subject_index):
    """Filtering a vector result drops the subject appended without a label.

    A subject with neither label nor notation is appended to the index, an
    all-ones score vector is filtered, and the filtered result must be
    shorter than the unfiltered one by exactly the number of deprecated ids
    reported by the index.
    """
    deprecated_uri = 'http://example.org/deprecated'
    subject_index.append(deprecated_uri, None, None)

    unfiltered = VectorSuggestionResult(np.ones(len(subject_index)))
    filtered = SuggestionFilter(subject_index)(unfiltered)

    # The size difference equals the count of deprecated subjects.
    expected_total = len(filtered) + len(subject_index.deprecated_ids())
    assert len(unfiltered) == expected_total

    expected_entry = SubjectSuggestion(
        uri=deprecated_uri, label=None, notation=None, score=1.0)
    # Present before filtering, gone afterwards.
    assert expected_entry in unfiltered.as_list(subject_index)
    assert expected_entry not in filtered.as_list(subject_index)
Example #3
0
def test_hitfilter_zero_score(subject_index):
    """A lone suggestion scored 0.0 does not survive the default filter."""
    zero_scored = SubjectSuggestion(uri='uri', label='label', score=0.0)
    unfiltered = ListSuggestionResult([zero_scored], subject_index)

    filtered = SuggestionFilter()(unfiltered)

    assert isinstance(filtered, SuggestionResult)
    assert len(filtered) == 0
Example #4
0
File: cli.py Project: pelly/Annif
def generate_filter_batches(subjects):
    """Build one (filter, evaluation batch) pair per limit/threshold combo.

    Limits run from 1 to 15 and thresholds are ``i * 0.05`` for ``i`` in
    0..19. The returned ordered mapping is keyed by ``(limit, threshold)``,
    iterating limits in the outer loop and thresholds in the inner loop.
    """
    # The threshold grid is the same for every limit, so build it once.
    thresholds = [step * 0.05 for step in range(20)]

    batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                SuggestionFilter(limit, threshold),
                annif.eval.EvaluationBatch(subjects),
            )
    return batches
Example #5
0
File: cli.py Project: pelly/Annif
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.

    Prints one ``<uri>\\tlabel\\tscore`` line per suggestion that passes the
    limit/threshold filter.
    """
    project = get_project(project_id)
    document = sys.stdin.read()
    params = parse_backend_params(backend_param)
    keep = SuggestionFilter(limit, threshold)

    row_template = "<{}>\t{}\t{}"
    for suggestion in keep(project.suggest(document, params)):
        click.echo(row_template.format(
            suggestion.uri, suggestion.label, suggestion.score))
Example #6
0
def test_hitfilter_list_suggestion_results_with_deprecated_subjects(
        subject_index):
    """Filtering a list result drops the subject appended without a label.

    Three suggestions go in, the last one pointing at a subject that was
    appended to the index with neither label nor notation. Only the first
    two must come out, unchanged and in their original positions.
    """
    deprecated_uri = 'http://example.org/deprecated'
    subject_index.append(deprecated_uri, None, None)

    entries = [
        SubjectSuggestion(uri='http://www.yso.fi/onto/yso/p7141',
                          label='sinetit', notation=None, score=1.0),
        SubjectSuggestion(uri='http://www.yso.fi/onto/yso/p6479',
                          label='viikingit', notation=None, score=0.5),
        SubjectSuggestion(uri=deprecated_uri,
                          label=None, notation=None, score=0.5),
    ]
    suggestions = ListSuggestionResult(entries)
    filtered_suggestions = SuggestionFilter(subject_index)(suggestions)

    assert isinstance(filtered_suggestions, SuggestionResult)
    assert len(filtered_suggestions) == 2
    # The two surviving entries match the originals position by position.
    for position in (0, 1):
        kept = filtered_suggestions.as_list(subject_index)[position]
        original = suggestions.as_list(subject_index)[position]
        assert kept == original
Example #7
0
def suggest(project_id, text, limit, threshold):
    """Suggest subjects for the given text.

    Returns a dict with a ``results`` list formatted according to the
    Swagger spec, or the error response produced by
    ``project_not_found_error`` / ``server_error`` when the project lookup
    or the suggestion step fails.
    """
    try:
        project = annif.registry.get_project(
            project_id, min_access=Access.hidden)
    except ValueError:
        return project_not_found_error(project_id)

    try:
        keep = SuggestionFilter(project.subjects, limit, threshold)
        raw_result = project.suggest(text)
    except AnnifException as err:
        return server_error(err)
    else:
        # Filtering and conversion happen outside the guarded section.
        filtered = keep(raw_result).as_list(project.subjects)

    return {'results': [entry._asdict() for entry in filtered]}
Example #8
0
File: cli.py Project: pelly/Annif
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.

    One line per metric is printed, with the metric name left-aligned in a
    20-character column followed by a tab and its score.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param)

    keep = SuggestionFilter(limit=limit, threshold=threshold)
    batch = annif.eval.EvaluationBatch(project.subjects)

    for document in open_documents(paths).documents:
        filtered = keep(project.suggest(document.text, params))
        gold_standard = annif.corpus.SubjectSet(
            (document.uris, document.labels))
        batch.evaluate(filtered, gold_standard)

    for metric, score in batch.results().items():
        click.echo("{0:<20}\t{1}".format(metric + ":", score))
Example #9
0
def test_hitfilter_threshold(subject_index):
    """With threshold 0.5, two of the ten generated suggestions remain."""
    unfiltered = generate_suggestions(10, subject_index)
    threshold_filter = SuggestionFilter(threshold=0.5)

    filtered = threshold_filter(unfiltered)

    assert isinstance(filtered, SuggestionResult)
    assert len(filtered) == 2
Example #10
0
def test_hitfilter_limit(subject_index):
    """With limit 5, five of the ten generated suggestions remain."""
    unfiltered = generate_suggestions(10, subject_index)
    limit_filter = SuggestionFilter(limit=5)

    filtered = limit_filter(unfiltered)

    assert isinstance(filtered, SuggestionResult)
    assert len(filtered) == 5