def test_count_items(germaparl):
    """Compare ``corpus.marginals`` with ``corpus.counts.mwus`` for single items.

    Three phases are checked with the same items / queries:

    1. before any NQR is activated, both frequency columns must be equal;
    2. after activating the NQR ``und``, the ``marginals`` frequency of the
       first item must be strictly greater than the ``mwus`` frequency of the
       corresponding query;
    3. after ``nqr_activate`` is called with the corpus name only, both
       frequency columns must be equal again.

    :param germaparl: mapping providing ``'corpus_name'`` and
        ``'registry_path'`` (presumably a pytest fixture -- defined outside
        this block).
    """
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path']
    )
    items = ["Helmut", "Kohl", "CDU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    # try/finally: the CQP session must be shut down even when an assertion
    # (or one of the calls) fails, otherwise it is left dangling.
    try:
        # whole corpus
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert counts1.loc[items[0], 'freq'] > counts2.loc[queries[0], 'freq']

        # whole corpus
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        cqp.__kill__()
def __init__(self, corpus, items, p_query, s_query, s_context,
             context=20, flags="%cd", escape=False):
    """Build a CQP query from ``items``, run it, and store the result.

    Attributes set on the instance:

    - ``items``: the ``items`` argument, stored unchanged
    - ``parameters``: dict of the query settings (``p_query``, ``s_query``,
      ``s_context``, ``context``, ``flags``, ``escape``)
    - ``dump``: the object returned by ``corpus.query``
    - ``idx``: ``dump.name_cache``
    - ``_context`` / ``_matches``: initialised to ``None``
      (presumably filled in later by other methods -- not visible here)

    :param corpus: object providing ``query(query, context, context_break=...)``
    :param items: items handed to ``formulate_cqp_query``
    :param p_query: forwarded to ``formulate_cqp_query``
    :param s_query: forwarded to ``formulate_cqp_query``
    :param s_context: passed to ``corpus.query`` as ``context_break``
    :param context: passed to ``corpus.query`` as the context argument
    :param flags: forwarded to ``formulate_cqp_query``
    :param escape: forwarded to ``formulate_cqp_query``
    """
    self.items = items
    self.parameters = dict(
        p_query=p_query,
        s_query=s_query,
        s_context=s_context,
        context=context,
        flags=flags,
        escape=escape,
    )

    # formulate the query string, then execute it against the corpus
    cqp_query = formulate_cqp_query(items, p_query, s_query, flags, escape)
    result = corpus.query(cqp_query, context, context_break=s_context)

    self.dump = result
    self.idx = result.name_cache

    self._context = None
    self._matches = None
def test_count_mwus_3(germaparl):
    """Check the ``strategy=3`` frequency that ``corpus.counts.mwus`` reports for ``CSU``.

    The queries are built from a list that contains ``"CSU"`` twice, a
    multi-token item, a parenthesised item and ``"WES324"``; the result is
    requested with ``fill_missing=False`` and the ``'freq'`` value at index
    ``'CSU'`` must be 635.

    :param germaparl: mapping providing ``'corpus_name'`` and
        ``'registry_path'`` (presumably a pytest fixture -- defined outside
        this block).
    """
    # whole corpus
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path']
    )
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    # try/finally: shut the CQP session down even if the count call raises.
    try:
        counts3 = corpus.counts.mwus(cqp, queries, strategy=3, fill_missing=False)
    finally:
        cqp.__kill__()

    assert counts3['freq']['CSU'] == 635
def test_count_mwus_strategies(germaparl):
    """Cross-check the three ``strategy`` settings of ``corpus.counts.mwus``.

    Asserted properties (all with ``fill_missing=False``):

    - the ``strategy=1`` result is indexed by query string
      (``'([word="CSU"])'`` must be in its index);
    - the ``strategy=2`` and ``strategy=3`` results are equal;
    - the ``strategy=1`` and ``strategy=2`` frequencies have the same sum.

    :param germaparl: mapping providing ``'corpus_name'`` and
        ``'registry_path'`` (presumably a pytest fixture -- defined outside
        this block).
    """
    # whole corpus
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path']
    )
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    # try/finally: the index assertion below sits between start and kill, so
    # without the guard a failure there would leave the CQP session open.
    try:
        counts1 = corpus.counts.mwus(cqp, queries, strategy=1, fill_missing=False)
        assert '([word="CSU"])' in counts1.index
        counts2 = corpus.counts.mwus(cqp, queries, strategy=2, fill_missing=False)
        counts3 = corpus.counts.mwus(cqp, queries, strategy=3, fill_missing=False)
    finally:
        cqp.__kill__()

    assert counts2.equals(counts3)
    assert sum(counts1['freq']) == sum(counts2['freq'])
def test_count_items_subcorpora(germaparl):
    """Run ``corpus.counts.mwus`` with the NQR ``presidency`` activated.

    The NQR is created from ``corpus.dump_from_s_att("text_role",
    ["presidency"])`` and activated before counting.  Asserted properties
    (all with ``fill_missing=False``):

    - the ``strategy=1`` frequencies sum to more than zero;
    - the ``strategy=2`` and ``strategy=3`` results are equal.

    :param germaparl: mapping providing ``'corpus_name'`` and
        ``'registry_path'`` (presumably a pytest fixture -- defined outside
        this block).
    """
    # subcorpus
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path']
    )

    cqp = corpus.start_cqp()
    # try/finally: the CQP session must be shut down even when an assertion
    # (or one of the calls) fails, otherwise it is left dangling.
    try:
        dump = corpus.dump_from_s_att("text_role", ["presidency"])
        cqp.nqr_from_dump(dump.df, 'presidency')
        cqp.nqr_activate(corpus.corpus_name, 'presidency')

        items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
        queries = [formulate_cqp_query([item]) for item in items]

        counts1 = corpus.counts.mwus(cqp, queries, strategy=1, fill_missing=False)
        assert sum(counts1['freq']) > 0

        counts2 = corpus.counts.mwus(cqp, queries, strategy=2, fill_missing=False)
        counts3 = corpus.counts.mwus(cqp, queries, strategy=3, fill_missing=False)
        assert counts2.equals(counts3)
    finally:
        cqp.__kill__()