예제 #1
0
def test_count_items(germaparl):
    """Compare ``corpus.marginals`` with ``corpus.counts.mwus`` frequencies.

    The same items and their CQP queries are counted in three phases:
    on the whole corpus, after activating the named query result ``'und'``,
    and on the whole corpus again after re-activating it.

    Args:
        germaparl: mapping that provides ``'corpus_name'`` and
            ``'registry_path'`` (presumably a pytest fixture -- confirm
            against the project's conftest).
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()

    # Everything after start_cqp() runs under try/finally so the handle is
    # released even when an assertion fails or a call raises; previously the
    # cleanup was only reached on the all-green path.
    try:
        items = ["Helmut", "Kohl", "CDU"]
        queries = [formulate_cqp_query([item]) for item in items]

        # whole corpus: both counting routes must report the same frequencies
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus: restrict the cqp session to sentences matching "und"
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')

        # The test expects the marginal frequency of the first item to be
        # strictly larger than the query-based one here.
        # NOTE(review): presumably because marginals() ignores the activated
        # subcorpus while mwus() counts inside it -- confirm.
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert (counts1.loc[items[0], 'freq'] >
                counts2.loc[queries[0], 'freq'])

        # whole corpus: after re-activating it the frequencies agree again
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        cqp.__kill__()
예제 #2
0
def test_marginals_patterns(germaparl):
    """Check that ``corpus.marginals`` returns three rows for three items.

    The same three items (one of them containing ``*``) are passed once with
    default arguments and once with ``pattern=True``; in both cases the
    result is expected to have length 3.

    Args:
        germaparl: mapping that provides ``'corpus_name'`` and
            ``'registry_path'`` (presumably a pytest fixture -- confirm
            against the project's conftest).
    """
    requested = ("H*", "Kohl", "CDU")
    expected_rows = 3

    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )

    # A fresh list is built for every call so that neither call can observe
    # changes made to the argument by the other.
    default_counts = corpus.marginals(list(requested))
    assert len(default_counts) == expected_rows

    pattern_counts = corpus.marginals(list(requested), pattern=True)
    assert len(pattern_counts) == expected_rows