Code example #1
File: cli.py Project: juhoinkinen/Annif
def run_index(project_id, directory, suffix, force, limit, threshold,
              backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)
    hit_filter = SuggestionFilter(project.subjects, limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        with open(docfilename, encoding='utf-8-sig') as docfile:
            text = docfile.read()
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        if os.path.exists(subjectfilename) and not force:
            click.echo("Not overwriting {} (use --force to override)".format(
                subjectfilename))
            continue
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            results = project.suggest(text, backend_params)
            for hit in hit_filter(results).as_list(project.subjects):
                line = "<{}>\t{}\t{}".format(
                    hit.uri, '\t'.join(filter(None,
                                              (hit.label, hit.notation))),
                    hit.score)
                click.echo(line, file=subjfile)
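The suggest-then-filter pattern above recurs throughout these examples. A minimal standalone sketch of it, assuming SuggestionFilter lives in annif.suggestion, with a hypothetical project ID and illustrative limit/threshold values:

from annif.suggestion import SuggestionFilter  # assumed import path

project = get_project('my-project')  # hypothetical project ID
results = project.suggest('some document text')
# Keep at most 10 hits scoring at least 0.2 (illustrative values).
hit_filter = SuggestionFilter(project.subjects, 10, 0.2)
for hit in hit_filter(results).as_list(project.subjects):
    print(hit.uri, hit.label, hit.score)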
Code example #2
File: cli.py Project: juhoinkinen/Annif
def run_optimize(project_id, paths, docs_limit, backend_param):
    """
    Analyze documents, testing multiple limits and thresholds.

    Evaluate the analysis results for a directory with documents against a
    gold standard given in subject files. Test different limit/threshold
    values and report the precision, recall and F-measure of each combination
    of settings.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    filter_batches = generate_filter_batches(project.subjects)

    ndocs = 0
    docs = open_documents(paths, docs_limit)
    for doc in docs.documents:
        raw_hits = project.suggest(doc.text, backend_params)
        hits = raw_hits.filter(project.subjects, limit=BATCH_MAX_LIMIT)
        assert isinstance(hits, ListSuggestionResult), \
            "Optimize should only be done with ListSuggestionResult " + \
            "as it would be very slow with VectorSuggestionResult."
        gold_subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        for hit_filter, batch in filter_batches.values():
            batch.evaluate(hit_filter(hits), gold_subjects)
        ndocs += 1

    click.echo("\t".join(('Limit', 'Thresh.', 'Prec.', 'Rec.', 'F1')))

    best_scores = collections.defaultdict(float)
    best_params = {}

    template = "{:d}\t{:.02f}\t{:.04f}\t{:.04f}\t{:.04f}"
    # Store the batches in a list that gets consumed along the way
    # This way GC will have a chance to reclaim the memory
    filter_batches = list(filter_batches.items())
    while filter_batches:
        params, filter_batch = filter_batches.pop(0)
        metrics = [
            'Precision (doc avg)', 'Recall (doc avg)', 'F1 score (doc avg)'
        ]
        results = filter_batch[1].results(metrics=metrics)
        for metric, score in results.items():
            if score >= best_scores[metric]:
                best_scores[metric] = score
                best_params[metric] = params
        click.echo(
            template.format(params[0], params[1],
                            results['Precision (doc avg)'],
                            results['Recall (doc avg)'],
                            results['F1 score (doc avg)']))

    click.echo()
    template2 = "Best {:>19}: {:.04f}\tLimit: {:d}\tThreshold: {:.02f}"
    for metric in metrics:
        click.echo(
            template2.format(metric, best_scores[metric],
                             best_params[metric][0], best_params[metric][1]))
    click.echo("Documents evaluated:\t{}".format(ndocs))
Code example #3
def test_project_suggest_combine(app):
    with app.app_context():
        project = annif.project.get_project('dummydummy')
    result = project.suggest('this is some text')
    assert len(result) == 1
    assert result[0].uri == 'http://example.org/dummy'
    assert result[0].label == 'dummy'
    assert result[0].score == 1.0
Code example #4
def test_project_suggest_combine(registry):
    project = registry.get_project('dummydummy')
    result = project.suggest('this is some text')
    assert len(result) == 1
    hits = result.as_list(project.subjects)
    assert hits[0].uri == 'http://example.org/dummy'
    assert hits[0].label == 'dummy'
    assert hits[0].score == 1.0
Code example #5
File: cli.py Project: pelly/Annif
def run_suggest(project_id, limit, threshold, backend_param):
    """
    Suggest subjects for a single document from standard input.
    """
    project = get_project(project_id)
    text = sys.stdin.read()
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)
    hits = hit_filter(project.suggest(text, backend_params))
    for hit in hits:
        click.echo("<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score))
Code example #6
def test_project_train_state_not_available(registry, caplog):
    project = registry.get_project('dummydummy')
    project.backend.is_trained = None
    with caplog.at_level(logging.WARNING):
        result = project.suggest('this is some text')
    assert project.is_trained is None
    assert len(result) == 1
    hits = result.as_list(project.subjects)
    assert hits[0].uri == 'http://example.org/dummy'
    assert hits[0].label == 'dummy'
    assert hits[0].score == 1.0
    assert 'Could not get train state information' in caplog.text
Code example #7
def test_project_learn(registry, tmpdir):
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.tsv').write('<http://example.org/key1>\tkey1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.tsv').write('<http://example.org/key2>\tkey2')
    docdir = annif.corpus.DocumentDirectory(str(tmpdir))

    project = registry.get_project('dummy-fi')
    project.learn(docdir)
    result = project.suggest('this is some text')
    assert len(result) == 1
    hits = result.as_list(project.subjects)
    assert hits[0].uri == 'http://example.org/key1'
    assert hits[0].label == 'key1'
    assert hits[0].score == 1.0
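As the two usages above suggest, DocumentDirectory pairs each .txt document with the .tsv subject file of the same stem. A small sketch of iterating such a corpus (the path is hypothetical, and whether subject files are required by default is an assumption):

docdir = annif.corpus.DocumentDirectory('/path/to/corpus')
for docfilename, subjectfilename in docdir:
    # e.g. ('/path/to/corpus/doc1.txt', '/path/to/corpus/doc1.tsv')
    print(docfilename, subjectfilename)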
Code example #8
def test_project_learn(app, tmpdir):
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.tsv').write('<http://example.org/key1>\tkey1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.tsv').write('<http://example.org/key2>\tkey2')
    docdir = annif.corpus.DocumentDirectory(str(tmpdir))

    with app.app_context():
        project = annif.project.get_project('dummy-fi')
        project.learn(docdir)
        result = project.suggest('this is some text')
        assert len(result) == 1
        assert result[0].uri == 'http://example.org/key1'
        assert result[0].label == 'key1'
        assert result[0].score == 1.0
Code example #9
def suggest(project_id, text, limit, threshold):
    """suggest subjects for the given text and return a dict with results
    formatted according to Swagger spec"""

    try:
        project = annif.project.get_project(project_id,
                                            min_access=Access.hidden)
    except ValueError:
        return project_not_found_error(project_id)

    hit_filter = SuggestionFilter(limit, threshold)
    try:
        result = project.suggest(text)
    except AnnifException as err:
        return server_error(err)
    hits = hit_filter(result)
    return {'results': [hit._asdict() for hit in hits]}
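For reference, the returned dict would look roughly like this. The values are invented; the fields are those asserted in the earlier tests, and hit._asdict() may include others (e.g. notation, as seen in code example #1):

{'results': [{'uri': 'http://example.org/dummy',
              'label': 'dummy',
              'score': 1.0}]}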
Code example #10
File: cli.py Project: pelly/Annif
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)

    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    docs = open_documents(paths)
    for doc in docs.documents:
        results = project.suggest(doc.text, backend_params)
        hits = hit_filter(results)
        eval_batch.evaluate(hits,
                            annif.corpus.SubjectSet((doc.uris, doc.labels)))

    template = "{0:<20}\t{1}"
    for metric, score in eval_batch.results().items():
        click.echo(template.format(metric + ":", score))
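The same evaluation flow can be driven outside the CLI as well; a minimal sketch assuming EvaluationBatch and SubjectSet behave as used above, with a made-up gold standard:

eval_batch = annif.eval.EvaluationBatch(project.subjects)
gold = annif.corpus.SubjectSet(
    (('http://example.org/key1',), ('key1',)))  # made-up gold standard
hits = hit_filter(project.suggest('some text'))
eval_batch.evaluate(hits, gold)
for metric, score in eval_batch.results().items():
    print("{0:<20}\t{1}".format(metric + ":", score))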