def test_count_items(germaparl):
    """Compare ``corpus.marginals`` with ``corpus.counts.mwus`` frequencies.

    The comparison is made three times: on the whole corpus, with a
    named query result ("und") activated as subcorpus, and again after
    re-activating the whole corpus.

    The CQP handle obtained from ``corpus.start_cqp()`` is killed in a
    ``finally`` clause so that it is cleaned up even when an assertion
    fails or a call raises.

    :param germaparl: fixture mapping providing ``'corpus_name'`` and
        ``'registry_path'``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()

    try:
        items = ["Helmut", "Kohl", "CDU"]
        queries = [formulate_cqp_query([item]) for item in items]

        # whole corpus: both counting strategies must agree
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus: sentences containing the lemma "und"
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        # NOTE(review): presumably marginals() ignores the activated
        # subcorpus while mwus() counts inside it -- confirm against the
        # implementation; the test only requires the former to be larger.
        assert counts1.loc[items[0], 'freq'] > counts2.loc[queries[0], 'freq']

        # whole corpus again: activating the corpus without a subcorpus
        # name must restore the original agreement
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        # always release the CQP handle, also on test failure
        cqp.__kill__()
def test_marginals_patterns(germaparl):
    """Check the size of the ``corpus.marginals`` result with and without
    ``pattern=True``.

    The same three items (one of them containing a ``*``) are requested
    twice; in both cases the result is expected to have three entries.

    :param germaparl: fixture mapping providing ``'corpus_name'`` and
        ``'registry_path'``.
    """
    name = germaparl['corpus_name']
    registry = germaparl['registry_path']
    corpus = Corpus(name, registry_path=registry)

    items = ("H*", "Kohl", "CDU")

    # first call uses the default arguments, second one enables patterns
    for extra in ({}, {"pattern": True}):
        # hand over a fresh list on every call, as the original did
        result = corpus.marginals(list(items), **extra)
        assert len(result) == 3