def test_counts_matches_3(germaparl):
    """Count matches of a named query result with strategy 3.

    Checks case/diacritic folding via flags, plain counts, and a
    two-attribute (word, pos) breakdown.
    """
    strategy = 3
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    cqp = corp.start_cqp()
    cqp.nqr_from_query('[lemma="Helmut"%cd] [lemma="Kohl"%cd]', name='Last')

    # with %cd flags: counts are folded to lower case / no diacritics
    folded = corp.counts.matches(cqp, 'Last', p_atts=['word'], split=True,
                                 flags="%cd", strategy=strategy)
    assert "helmut" in folded.index

    # without flags: surface forms are preserved
    plain = corp.counts.matches(cqp, 'Last', p_atts=['word'], split=True,
                                strategy=strategy)
    assert "Helmut" in plain.index

    # two p-attributes yield tuple index entries
    combo = corp.counts.matches(cqp, 'Last', p_atts=['word', 'pos'],
                                split=True, strategy=strategy)
    assert ("Helmut", "NE") in combo.index

    cqp.__kill__()
def test_counts_dump_2(germaparl):
    """Count a dump with strategy 2: split and unsplit, one and two p-atts."""
    strategy = 2
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    dump = corp.dump_from_query('[lemma="Helmut"%cd] [lemma="Kohl"%cd]')

    # split: one row per token
    df = corp.counts.dump(dump, p_atts=['word'], split=True, strategy=strategy)
    assert df["freq"]["Helmut"] == 6
    df = corp.counts.dump(dump, p_atts=['word', 'pos'], split=True,
                          strategy=strategy)
    assert df["freq"][("Helmut", "NE")] == 6

    # no split: one row per full match
    df = corp.counts.dump(dump, p_atts=['word'], split=False, strategy=strategy)
    assert "Helmut Kohl" in df.index
    assert df["freq"].iloc[0] == 6
    df = corp.counts.dump(dump, p_atts=['word', 'pos'], split=False,
                          strategy=strategy)
    assert ("Helmut Kohl", "NE NE") in df.index
    assert df["freq"].iloc[0] == 6
def test_marginals_patterns(germaparl):
    """Marginal frequencies return one row per item, with or without pattern mode."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])

    # default lookup
    counts = corp.marginals(["H*", "Kohl", "CDU"])
    assert len(counts) == 3

    # explicit pattern matching (glob-style items like "H*")
    counts = corp.marginals(["H*", "Kohl", "CDU"], pattern=True)
    assert len(counts) == 3
def test_concordance_options(germaparl):
    """Smoke-test every supported concordance output form."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    dump = corp.query('"SPD"')
    # each form must render without raising
    for form in ('raw', 'simple', 'kwic', 'dataframes', 'extended'):
        print(dump.concordance(form=form))
def test_cwb_counts(germaparl):
    """MWU counting with the default strategy yields the known frequency."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    cqp = corp.start_cqp()
    queries = [
        '[lemma="Helmut"%cd & pos="NE"] [lemma="Kohl"]',
        '[lemma="Horst"]',
    ]
    counts = corp.counts.mwus(cqp, queries)
    # "Horst" occurs 55 times in GermaParl
    assert counts['freq'][queries[1]] == 55
    cqp.__kill__()
def test_count_matches(brexit):
    """Count matches of a saved named query result.

    Fix: the CQP child process was never terminated, leaking a subprocess
    per test run; every sibling test that calls ``start_cqp()`` finishes
    with ``cqp.__kill__()``.
    """
    corpus = Corpus(brexit['corpus_name'])
    corpus.query(cqp_query='[lemma="nigel"]',
                 context=10,
                 context_break='tweet',
                 name='Test',
                 save=True)
    cqp = corpus.start_cqp()
    counts = corpus.counts.matches(cqp, 'Test')
    assert ("Nigel" in counts.index)
    # terminate the CQP subprocess (was missing: resource leak)
    cqp.__kill__()
def test_dumps(brexit):
    """Build a dump from a fixed set of tweet ids and render its concordance."""
    corp = Corpus(brexit['corpus_name'])
    tweet_ids = {
        't740982320711249920',
        't731037753241112576',
        't729363812802039814',
        't733648546881277953',
        't741216447595220992',
        't705780723018539012',
        't745930343627243520',
        't730870826178904065',
        't745691821477605377',
        't730419966818783232',
        't746069538693750784',
    }
    dump = corp.dump_from_s_att('tweet_id', tweet_ids)
    print(dump.concordance())
def test_count_mwus_3(germaparl):
    """MWU counting with strategy 3 over the whole corpus."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    # duplicate "CSU" and a non-occurring item are intentional
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corp.start_cqp()
    counts = corp.counts.mwus(cqp, queries, strategy=3, fill_missing=False)
    cqp.__kill__()

    assert counts['freq']['CSU'] == 635
def test_counts_mwus(germaparl):
    """MWU counting: strategy 1 keys by query, strategy 3 keys by p-atts."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    cqp = corp.start_cqp()
    queries = [
        '[lemma="Helmut"%cd & pos="NE"] [lemma="Kohl"]',
        '[lemma="Horst"]',
    ]

    # strategy 1: index is the raw query string
    df = corp.counts.mwus(cqp, queries, strategy=1)
    assert df['freq'][queries[0]] == 6

    # strategy 3 with two p-atts: index is a (lemma, pos) tuple
    df = corp.counts.mwus(cqp, queries, strategy=3, p_atts=['lemma', 'pos'])
    assert df['freq'][('Horst', 'NE')] == 55

    # strategy 3 with one p-att: plain string index
    df = corp.counts.mwus(cqp, queries, strategy=3, p_atts=['lemma'])
    assert df['freq']['Horst'] == 55

    cqp.__kill__()
def get_corpus(corpus_settings, data_path=DATA_PATH):
    """Create a Corpus from a settings dict.

    Required keys: 'corpus_name', 'registry_path'; 'lib_path' is optional.
    """
    name = corpus_settings['corpus_name']
    registry = corpus_settings['registry_path']
    lib = corpus_settings.get('lib_path')  # may be absent -> None
    return Corpus(name,
                  registry_path=registry,
                  lib_path=lib,
                  data_path=data_path)
def test_counts_dump_1_split(germaparl):
    """Strategy-1 split counts over one and two p-attributes."""
    strategy = 1
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    dump = corp.dump_from_query('[lemma="die" %cd] [pos="N.*"]')

    words = corp.counts.dump(dump, p_atts=['word'], split=True,
                             strategy=strategy)
    assert int(words["freq"]["der"]) == 3775

    pairs = corp.counts.dump(dump, p_atts=['word', 'lemma'], split=True,
                             strategy=strategy)
    assert int(pairs["freq"][("der", "die")]) == 3775
def test_counts_dump_1_no_split(germaparl):
    """Strategy-1 counts without splitting: whole matches become index entries."""
    strategy = 1
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    dump = corp.dump_from_query('[lemma="Helmut"%cd] [lemma="Kohl"%cd]')

    # single p-attribute: space-joined surface string
    words = corp.counts.dump(dump, p_atts=['word'], split=False,
                             strategy=strategy)
    assert "Helmut Kohl" in words.index

    # two p-attributes: tuple of space-joined strings
    pairs = corp.counts.dump(dump, p_atts=['word', 'pos'], split=False,
                             strategy=strategy)
    assert ("Helmut Kohl", "NE NE") in pairs.index
def test_count_mwus_strategies(germaparl):
    """All three MWU counting strategies agree on the whole corpus."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corp.start_cqp()
    by_query = corp.counts.mwus(cqp, queries, strategy=1, fill_missing=False)
    assert '([word="CSU"])' in by_query.index
    via_2 = corp.counts.mwus(cqp, queries, strategy=2, fill_missing=False)
    via_3 = corp.counts.mwus(cqp, queries, strategy=3, fill_missing=False)
    cqp.__kill__()

    # strategies 2 and 3 produce identical frames; totals match strategy 1
    assert via_2.equals(via_3)
    assert sum(by_query['freq']) == sum(via_2['freq'])
def test_count_items_subcorpora(germaparl):
    """MWU counting strategies on an activated subcorpus (presidency texts)."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    cqp = corp.start_cqp()

    # define and activate the subcorpus
    dump = corp.dump_from_s_att("text_role", ["presidency"])
    cqp.nqr_from_dump(dump.df, 'presidency')
    cqp.nqr_activate(corp.corpus_name, 'presidency')

    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    counts_s1 = corp.counts.mwus(cqp, queries, strategy=1, fill_missing=False)
    assert sum(counts_s1['freq']) > 0
    counts_s2 = corp.counts.mwus(cqp, queries, strategy=2, fill_missing=False)
    counts_s3 = corp.counts.mwus(cqp, queries, strategy=3, fill_missing=False)
    assert counts_s2.equals(counts_s3)

    cqp.__kill__()
def test_count_items(germaparl):
    """marginals and mwus agree on the whole corpus; marginals ignore subcorpora."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    cqp = corp.start_cqp()
    items = ["Helmut", "Kohl", "CDU"]
    queries = [formulate_cqp_query([item]) for item in items]

    # whole corpus: both counting paths agree
    marg = corp.marginals(items)
    mwus = corp.counts.mwus(cqp, queries)
    assert list(marg["freq"]) == list(mwus["freq"])

    # activated subcorpus: mwus shrink, marginals stay corpus-wide
    cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
    cqp.nqr_activate(corp.corpus_name, 'und')
    marg = corp.marginals(items)
    mwus = corp.counts.mwus(cqp, queries)
    assert marg.loc[items[0], 'freq'] > mwus.loc[queries[0], 'freq']

    # back on the whole corpus: agreement restored
    cqp.nqr_activate(corp.corpus_name)
    marg = corp.marginals(items)
    mwus = corp.counts.mwus(cqp, queries)
    assert list(marg["freq"]) == list(mwus["freq"])

    cqp.__kill__()
def test_argmin_query(brexit):
    """Run the pre-defined argmin query and render an extended concordance."""
    corp = Corpus(brexit['corpus_name'], lib_path=brexit['lib_path'])
    spec = brexit['query_argmin']

    dump = corp.query(
        cqp_query=spec['cqp'],
        context=spec.get('context'),
        context_break=spec.get('s_context'),
        corrections=spec['corrections'],
        match_strategy=spec['match_strategy'],
    )
    conc = dump.concordance(
        p_show=spec['p_show'],
        s_show=spec['s_show'],
        p_text=spec['p_text'],
        p_slots=spec['p_slots'],
        slots=spec['slots'],
        order='first',
        cut_off=None,
        form='extended',
    )
    print(conc)
    print(conc['df'].iloc[0])
def test_cwb_scan_corpus(germaparl):
    """cwb-scan-corpus totals differ between the full corpus and a subcorpus."""
    from tempfile import NamedTemporaryFile
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    cqp = corp.start_cqp()

    # scan on the whole corpus
    cqp.nqr_from_query('[lemma="Helmut"]', name='tmp')
    with NamedTemporaryFile(mode="wt") as f:
        cqp.Exec('dump tmp > "%s"' % f.name)
        df_full = cwb_scan_corpus(f.name, germaparl['corpus_name'],
                                  germaparl['registry_path'])

    # scan again with a subcorpus activated
    cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
    cqp.nqr_activate(corp.corpus_name, 'und')
    cqp.nqr_from_query('[lemma="Kohl"]', name='tmp')
    with NamedTemporaryFile(mode="wt") as f:
        cqp.Exec('dump tmp > "%s"' % f.name)
        df_sub = cwb_scan_corpus(f.name, germaparl['corpus_name'],
                                 germaparl['registry_path'])

    cqp.__kill__()
    assert sum(df_sub['freq']) != sum(df_full['freq'])
def test_collocates(germaparl):
    """Smoke-test collocate extraction with default settings."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    result = corp.query('"SPD"')
    print(result.collocates())
def test_collocates_options(germaparl):
    """Smoke-test collocate extraction with explicit ordering and cut-off."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    result = corp.query('"SPD"')
    print(result.collocates(order='log_likelihood', cut_off=200))
def test_count_cpos_combo(germaparl):
    """cpos counting over two p-attributes yields a DataFrame whose index
    levels are named after the requested attributes.

    Fix: ``type(x) == T`` rejects subclasses and is non-idiomatic; use
    ``isinstance`` for the type check.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    counts = corpus.counts.cpos(list(range(1, 1000)), p_atts=['lemma', 'pos'])
    assert isinstance(counts, pd.DataFrame)
    # FrozenList compares equal to a plain list of the same names
    assert list(counts.index.names) == ['lemma', 'pos']
def test_keywords_options(germaparl):
    """Smoke-test keyword extraction with explicit ordering and cut-off."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    result = corp.query('"SPD" expand to s')
    print(result.keywords(order='log_ratio', cut_off=200))
def test_keywords(germaparl):
    """Smoke-test keyword extraction with default settings."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    result = corp.query('"SPD" expand to s')
    print(result.keywords())
def test_count_cpos(germaparl):
    """Counting over a range of corpus positions returns a DataFrame.

    Fix: ``type(x) == T`` rejects subclasses and is non-idiomatic; use
    ``isinstance`` for the type check.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    counts = corpus.counts.cpos(list(range(1, 1000)), p_atts=['word'])
    assert isinstance(counts, pd.DataFrame)
def test_query2dump(germaparl):
    """A query returns a printable dump object."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    result = corp.query('"SPD"')
    print(result)
def test_breakdown(germaparl):
    """Smoke-test the frequency breakdown of a query result."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    result = corp.query('"SPD"')
    print(result.breakdown())
from ccc.cwb import Corpus
from ccc.concordances import read_query_json, run_query
from argparse import ArgumentParser
import os
import logging

logger = logging.getLogger(__name__)


def main():
    """Run a JSON-specified query against a corpus and write results as TSV.

    Positional CLI arguments: query_path (JSON spec), corpus (CWB corpus
    name), data_path (cache and output directory).
    """
    parser = ArgumentParser()
    parser.add_argument('query_path')
    parser.add_argument('corpus')
    parser.add_argument('data_path')
    args = parser.parse_args()

    # read the query spec and remember where it came from
    query = read_query_json(args.query_path)
    query['query_path'] = args.query_path

    # run the query
    corpus = Corpus(args.corpus, query['lib_path'], data_path=args.data_path)
    query, result = run_query(corpus, query)

    # serialize the nested per-match dataframes so the TSV stays flat
    path_out = os.path.join(args.data_path, query['name']) + ".tsv"
    result['df'] = result['df'].apply(lambda row: row.to_json())
    result.to_csv(path_out, sep="\t")


if __name__ == '__main__':
    # decomposed into main() so importing this module has no side effects
    main()
def test_context_matches(germaparl):
    """Smoke-test match positions and context regions of a query result."""
    corp = Corpus(germaparl['corpus_name'],
                  registry_path=germaparl['registry_path'])
    result = corp.query('"SPD"')
    print(result.matches())
    print(result.context())