def test_counts_matches_3(germaparl):
    """Check ``counts.matches`` with strategy 3 on a two-token query result.

    A named query result ``Last`` is created from a query for the lemmas
    "Helmut" and "Kohl", and its matches are counted three ways:

    * split on ``word`` with ``flags="%cd"`` -- "helmut" must be in the index;
    * split on ``word`` without flags -- "Helmut" must be in the index;
    * split on ``word`` and ``pos`` -- ("Helmut", "NE") must be in the index.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path``.
    """
    strategy = 3
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    try:
        cqp.nqr_from_query('[lemma="Helmut"%cd] [lemma="Kohl"%cd]',
                           name='Last')

        # flags given: the lower-cased surface form is expected
        df = corpus.counts.matches(cqp, 'Last', p_atts=['word'], split=True,
                                   flags="%cd", strategy=strategy)
        assert "helmut" in df.index

        # no flags: the original surface form is expected
        df = corpus.counts.matches(cqp, 'Last', p_atts=['word'], split=True,
                                   strategy=strategy)
        assert "Helmut" in df.index

        # two p-attributes: the index entries are (word, pos) tuples
        df = corpus.counts.matches(cqp, 'Last', p_atts=['word', 'pos'],
                                   split=True, strategy=strategy)
        assert ("Helmut", "NE") in df.index
    finally:
        # terminate the CQP process even when an assertion above fails
        cqp.__kill__()
def test_count_items(germaparl):
    """Compare ``corpus.marginals`` with ``counts.mwus`` for single items.

    The frequencies must agree while the whole corpus is active.  After
    activating the named query result ``und``, the marginal frequency of the
    first item must exceed the frequency reported by ``counts.mwus``.  After
    re-activating the corpus itself, both must agree again.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    try:
        items = ["Helmut", "Kohl", "CDU"]
        queries = [formulate_cqp_query([item]) for item in items]

        # whole corpus
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert counts1.loc[items[0], 'freq'] > counts2.loc[queries[0], 'freq']

        # whole corpus
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        # terminate the CQP process even when an assertion above fails
        cqp.__kill__()
def test_cwb_counts(germaparl):
    """Check ``counts.mwus`` with default settings on two CQP queries.

    The query ``[lemma="Horst"]`` is expected to have a frequency of 55 in
    the test corpus.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    try:
        queries = [
            '[lemma="Helmut"%cd & pos="NE"] [lemma="Kohl"]',
            '[lemma="Horst"]',
        ]
        df = corpus.counts.mwus(cqp, queries)
        # the result is looked up by the query string itself
        assert df['freq'][queries[1]] == 55
    finally:
        # terminate the CQP process even when the assertion fails
        cqp.__kill__()
def test_count_matches(brexit):
    """Check ``counts.matches`` on a saved query result.

    The query ``[lemma="nigel"]`` is run with ``save=True`` under the name
    ``Test``.  A CQP process is then started and the matches of ``Test`` are
    counted; "Nigel" must appear in the index of the result.

    :param brexit: fixture mapping providing ``corpus_name``.
    """
    corpus = Corpus(brexit['corpus_name'])
    corpus.query(cqp_query='[lemma="nigel"]', context=10,
                 context_break='tweet', name='Test', save=True)
    cqp = corpus.start_cqp()
    try:
        counts = corpus.counts.matches(cqp, 'Test')
        assert "Nigel" in counts.index
    finally:
        # previously this CQP process was never terminated
        cqp.__kill__()
def test_count_mwus_3(germaparl):
    """Check ``counts.mwus`` with strategy 3 on the whole corpus.

    The item list contains a duplicate ("CSU") and an item "WES324";
    ``fill_missing`` is switched off.  The frequency stored under "CSU" is
    expected to be 635.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path``.
    """
    # whole corpus
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )

    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = []
    for entry in items:
        queries.append(formulate_cqp_query([entry]))

    cqp = corpus.start_cqp()
    result = corpus.counts.mwus(
        cqp, queries, strategy=3, fill_missing=False
    )
    # the CQP process is no longer needed once the counts are retrieved
    cqp.__kill__()

    frequencies = result['freq']
    assert frequencies['CSU'] == 635
def test_counts_mwus(germaparl):
    """Check ``counts.mwus`` with strategies 1 and 3 on two CQP queries.

    * strategy 1: the result is looked up by query string; the first query
      is expected to have a frequency of 6;
    * strategy 3 with ``p_atts=['lemma', 'pos']``: the result is looked up
      by a (lemma, pos) tuple; ("Horst", "NE") is expected to occur 55 times;
    * strategy 3 with ``p_atts=['lemma']``: "Horst" is expected to occur
      55 times.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    try:
        queries = [
            '[lemma="Helmut"%cd & pos="NE"] [lemma="Kohl"]',
            '[lemma="Horst"]',
        ]

        df = corpus.counts.mwus(cqp, queries, strategy=1)
        assert df['freq'][queries[0]] == 6

        df = corpus.counts.mwus(cqp, queries, strategy=3,
                                p_atts=['lemma', 'pos'])
        assert df['freq'][('Horst', 'NE')] == 55

        df = corpus.counts.mwus(cqp, queries, strategy=3, p_atts=['lemma'])
        assert df['freq']['Horst'] == 55
    finally:
        # terminate the CQP process even when an assertion above fails
        cqp.__kill__()
def test_count_mwus_strategies(germaparl):
    """Compare the three ``counts.mwus`` strategies on the whole corpus.

    With ``fill_missing=False``:

    * the strategy-1 result must contain the query string
      ``([word="CSU"])`` in its index;
    * the strategy-2 and strategy-3 results must be equal;
    * the summed frequencies of strategy 1 and strategy 2 must be equal.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path``.
    """
    # whole corpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    try:
        counts1 = corpus.counts.mwus(cqp, queries, strategy=1,
                                     fill_missing=False)
        assert '([word="CSU"])' in counts1.index
        counts2 = corpus.counts.mwus(cqp, queries, strategy=2,
                                     fill_missing=False)
        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
    finally:
        # terminate the CQP process even when the assertion above fails
        cqp.__kill__()

    # the remaining comparisons only need the retrieved data frames
    assert counts2.equals(counts3)
    assert sum(counts1['freq']) == sum(counts2['freq'])
def test_count_items_subcorpora(germaparl):
    """Compare the ``counts.mwus`` strategies with a subcorpus activated.

    A dump is created from the s-attribute ``text_role`` with value
    "presidency", turned into the named query result ``presidency`` and
    activated.  With ``fill_missing=False``:

    * the summed frequencies of strategy 1 must be positive;
    * the strategy-2 and strategy-3 results must be equal.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path``.
    """
    # subcorpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    try:
        dump = corpus.dump_from_s_att("text_role", ["presidency"])
        cqp.nqr_from_dump(dump.df, 'presidency')
        cqp.nqr_activate(corpus.corpus_name, 'presidency')

        items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
        queries = [formulate_cqp_query([item]) for item in items]

        counts1 = corpus.counts.mwus(cqp, queries, strategy=1,
                                     fill_missing=False)
        assert sum(counts1['freq']) > 0

        counts2 = corpus.counts.mwus(cqp, queries, strategy=2,
                                     fill_missing=False)
        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
        assert counts2.equals(counts3)
    finally:
        # terminate the CQP process even when an assertion above fails
        cqp.__kill__()
def test_cwb_scan_corpus(germaparl):
    """Check ``cwb_scan_corpus`` on dumps made with and without a subcorpus.

    A named query result ``tmp`` is dumped to a temporary file by CQP and the
    file is passed to ``cwb_scan_corpus``.  This is done once for
    ``[lemma="Helmut"]`` on the initial CQP session and once for
    ``[lemma="Kohl"]`` after activating the named query result ``und``.
    The summed frequencies of the two scans must differ.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path``.
    """
    from tempfile import NamedTemporaryFile

    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']

    corpus = Corpus(corpus_name, registry_path=registry_path)
    cqp = corpus.start_cqp()

    def scan_tmp_dump():
        # CQP writes the dump of "tmp" to the temporary file's path; the
        # file must still exist while cwb_scan_corpus reads it
        with NamedTemporaryFile(mode="wt") as handle:
            cqp.Exec('dump tmp > "%s"' % handle.name)
            return cwb_scan_corpus(handle.name, corpus_name, registry_path)

    # first scan: matches of "Helmut"
    cqp.nqr_from_query('[lemma="Helmut"]', name='tmp')
    scan_helmut = scan_tmp_dump()

    # second scan: matches of "Kohl" after activating "und"
    cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
    cqp.nqr_activate(corpus.corpus_name, 'und')
    cqp.nqr_from_query('[lemma="Kohl"]', name='tmp')
    scan_kohl = scan_tmp_dump()

    cqp.__kill__()

    total_kohl = sum(scan_kohl['freq'])
    total_helmut = sum(scan_helmut['freq'])
    assert total_kohl != total_helmut