def test_dump_from_query_lib(brexit):
    """Dump a library-defined query; result is a DataFrame with >99 matches."""
    corpus = Corpus(brexit['corpus_name'], lib_path=brexit['lib_path'])
    dump = corpus.dump_from_query(
        query=brexit['query_lib'],
        s_query=brexit['s_query'],
        match_strategy='longest'
    )
    assert type(dump) == pd.DataFrame
    assert dump.shape[0] > 99
def test_collocates_no_mws(germaparl):
    """Collocates with mws=None should still produce a result table."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query('[lemma="Armin"]? [lemma="Laschet"]')
    coll = Collocates(corpus, dump.df, p_query='word', mws=None)
    coll.show()
def test_concordance_lines(germaparl):
    """Exercise Concordance.lines() in raw, simple and kwic forms."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    dump = corpus.query(query, context_break='s', match_strategy='longest')
    concordance = Concordance(corpus, dump.df)

    # default form is 'raw'
    raw = concordance.lines()
    assert len(raw) > 10
    assert 'raw' in raw.columns
    assert all(key in raw.iloc[0]['raw'] for key in ['cpos', 'match', 'word'])

    # simple form, with cut-off
    simple = concordance.lines(form='simple', cut_off=10)
    assert 'text' in simple.columns
    assert len(simple) == 10

    # kwic form
    kwic = concordance.lines(form='kwic', cut_off=10)
    assert all(col in kwic.columns for col in ['left', 'node', 'right'])
    assert len(kwic) == 10

    # kwic with an additional s-attribute column
    kwic_s = concordance.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(kwic_s) == 10
    assert 'text_id' in kwic_s.columns
def test_disc_context(germaparl):
    """Smoke test: build a topic discourseme and print its context dump."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    topic = Disc(corpus, ["SPD", "CSU", "Grünen"], 'lemma', 's', 's')
    print(topic.dump.context())
def test_query_breakdown(germaparl):
    """Smoke test: frequency breakdown of the surface forms of a query."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query(r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"')
    print(dump.breakdown())
def test_collocates_speed_many():
    """Collocation profile on a frequent lemma; should return a DataFrame.

    NOTE(review): uses the hard-coded corpus "GERMAPARL_1114" (no fixture),
    so it only runs where that corpus is registered system-wide.
    """
    corpus = Corpus("GERMAPARL_1114")
    dump_df = corpus.query('[lemma="sagen"]', context=2, context_break='s').df
    coll = Collocates(corpus, dump_df, p_query='lemma')
    table = coll.show(window=2, cut_off=50)
    assert type(table) == pd.DataFrame
def test_concordance_persistence(germaparl):
    """A Concordance keeps showing the dump it was built from, even after new queries."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query_1 = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    query_2 = '"und"'

    # concordance bound to the dump of query_1
    dump = corpus.query(query_1, context_break='s')
    concordance = Concordance(corpus, dump.df)
    df_1 = concordance.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # running query_2 does not change what the existing concordance shows
    dump = corpus.query(query_2, context_break='s')
    df_2 = concordance.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # a fresh concordance on the new dump shows query_2 results
    concordance = Concordance(corpus, dump.df)
    df_3 = concordance.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    assert df_1.equals(df_2)
    assert not df_2.equals(df_3)
def test_query_satt_easy(brexit):
    """Smoke test: query a bare s-attribute and show a simple concordance."""
    corpus = Corpus(brexit['corpus_name'],
                    registry_path=brexit['registry_path'])
    dump = corpus.query_s_att('np')
    print(dump.concordance(form='simple'))
def test_query_satt(germaparl):
    """Smoke test: query an s-attribute restricted to a value set."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query_s_att('p_type', {'interjection'})
    print(dump.concordance(form='simple'))
def test_collocates_empty(germaparl):
    """Collocates on a query with no matches must not blow up."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    # "NAHH" is a nonsense lemma, so the dump is empty
    dump = corpus.query('[lemma="Armin"]? [lemma="NAHH"]')
    coll = Collocates(corpus, dump.df, p_query='word')
    coll.show()
def test_keywords_query(germaparl):
    """Smoke test: keyword analysis on the sentences containing "SPD"."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query('"SPD" expand to s')
    print(dump.keywords().head(50))
def test_query_keywords_collocates(germaparl):
    """The node lemma itself should rank first among its own keywords."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query('"Horst" expand to s')
    keywords = Keywords(corpus, df_dump=dump.df, p_query='lemma')
    top = keywords.show(order='log_likelihood').head(1).index[0]
    assert top == 'Horst'
def test_keywords_switch(germaparl):
    """Keyword tables are reproducible per subcorpus and differ across subcorpora."""
    name_all = 'test_all'
    corpus = Corpus(corpus_name=germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    df_all = corpus.query('"und" expand to s', name=name_all).df
    df_head = df_all.head(500)
    df_tail = df_all.tail(500)

    # two independent Keywords objects on the head yield identical tables
    head_first = Keywords(corpus, df_dump=df_head, p_query="lemma").show(order='log_likelihood')
    head_second = Keywords(corpus, df_dump=df_head, p_query="lemma").show(order='log_likelihood')
    assert head_second.equals(head_first)

    # same for the tail
    tail_first = Keywords(corpus, df_dump=df_tail, p_query="lemma").show(order='log_likelihood')
    tail_second = Keywords(corpus, df_dump=df_tail, p_query="lemma").show(order='log_likelihood')
    assert tail_second.equals(tail_first)

    # head and tail keywords must differ
    assert not tail_second.equals(head_second)
def test_filter_df_print(germaparl):
    """Smoke test: print collocates before and after stopword filtering.

    Renamed from ``test_filter_df``: a second function with that exact name
    appears later in this file and redefined it at import time, so this test
    was silently shadowed and never collected by pytest.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query(germaparl['query'])
    coll = dump.collocates()
    print(coll)
    # filter_df removes rows whose index is in the stopword list
    print(filter_df(coll, 'resources/stopwords-de.txt'))
def test_concordance_empty(germaparl):
    """Concordance.lines() on an empty dump returns None."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    # "NAHH" is a nonsense lemma, so the query yields no matches
    dump = corpus.query('[lemma="Gerhard"]? [lemma="NAHH"]')
    concordance = Concordance(corpus, dump.df)
    assert concordance.lines() is None
def test_concordance_last(germaparl):
    """Ordering by 'last' is supported and returns a DataFrame."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    dump = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, dump.df)
    assert type(concordance.lines(order='last')) == pd.DataFrame
def test_dump_from_query_1(brexit):
    """Anchored dump_from_query yields a DataFrame with >99 matches.

    NOTE(review): relies on the system-default registry (no registry_path),
    unlike the other brexit tests.
    """
    corpus = Corpus(brexit['corpus_name'])
    dump = corpus.dump_from_query(
        query='[lemma="angela"] @1[lemma="merkel"]',
        anchors=[1],
        match_strategy='longest'
    )
    assert type(dump) == pd.DataFrame
    assert dump.shape[0] > 99
def test_filter_df(germaparl):
    """filter_df drops stopword rows (here: the comma) from a collocate table."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query(germaparl['query'])
    coll = dump.collocates()
    assert ',' in coll.index
    filtered = filter_df(coll, 'resources/stopwords-de.txt')
    assert ',' not in filtered.index
def test_concordance_p_slots(germaparl):
    """Passing p_slots without matching slot setup yields None."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    dump = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, dump.df)
    assert concordance.lines(p_slots='lemma') is None
def test_keywords(germaparl):
    """Smoke test: keywords for the subcorpus defined by a party s-attribute."""
    parties = {"CDU", "CSU"}
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.dump_from_s_att('text_party', parties)
    print(dump.keywords().head(50))
def test_query_context(germaparl):
    """Smoke test: query with an explicit context size and s-attribute break."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    # NOTE(review): the space before '+' differs from the '("/"?)+' pattern
    # used elsewhere in this file — confirm CQP treats '(...) +' identically
    query = r'"\[" ([word="[A-Z]+"] "/"?) + "\]"'
    dump = corpus.query(cqp_query=query, context=20, context_break='s')
    print(dump)
    print(dump.df)
def test_query_logging(germaparl):
    """An invalid p_query ('fail') is logged and falls back; results still come."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump_df = corpus.query('[word="\\("] [lemma=".*"]+ [word="\\)"]').df
    # deliberately pass a non-existing p-attribute
    coll = Collocates(corpus, dump_df, 'fail')
    table = coll.show(order='log_likelihood', window=15)
    assert type(table) == pd.DataFrame
    assert 'Dr.' in table.index
def test_concordance_many(germaparl):
    """A high-frequency query is truncated to the default cut-off of 100 lines."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    dump = corpus.query('[lemma="oder"]')
    concordance = Concordance(corpus, dump.df)
    assert len(concordance.lines()) == 100
def test_concordance_anchors(germaparl):
    """Anchored query in 'dataframes' form with extra p-attributes shown."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    dump = corpus.query(query, context_break='s')
    concordance = Concordance(corpus, dump.df)
    lines = concordance.lines(p_show=['lemma', 'pos'], form='dataframes')
    assert len(lines) == 13
def test_anchor(germaparl):
    """Smoke test: multi-anchor query, print one line as a DataFrame."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = r'@1[pos="NE"]? @2[pos="NE"] "\[" (@3[word="[A-Z]+"]+ "/"?)+ "\]"'
    dump = corpus.query(query, context_break='s')
    lines = dump.concordance(form='dataframes')
    print()
    print(lines['df'].iloc[1])
def test_concordancing_dataframes(germaparl):
    """Smoke test: 'dataframes' concordance form, pretty-print one line."""
    from pprint import pprint
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query(r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"', context_break='s')
    lines = dump.concordance(form='dataframes')
    pprint(lines['df'].iloc[1])
def test_concordance_form_simple_kwic(germaparl):
    """Random-order kwic output still contains all matches (13 < cut-off)."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    dump = corpus.query(query, context_break='s')
    concordance = Concordance(corpus, dump.df)
    kwic = concordance.lines(order='random', cut_off=100, form='kwic')
    assert len(kwic) == 13
def test_macro(brexit):
    """Run a library macro in a raw CQP process and count its matches."""
    corpus = Corpus(brexit['corpus_name'],
                    lib_path=brexit['lib_path'],
                    registry_path=brexit['registry_path'])
    cqp = corpus.start_cqp()
    # define a named query via the /ap() macro from the library
    cqp.Exec("Last=/ap();")
    counts = corpus.counts.matches(cqp, name="Last")
    # terminate the CQP child process explicitly
    cqp.__kill__()
    print(counts)
def test_disc_concordance_form(germaparl):
    """Smoke test: discourseme concordance in kwic and extended forms."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    topic = Disc(corpus, ["SPD", "CSU", "Grünen"], 'lemma', 's', 's')
    print(topic.concordance(cut_off=None, form='kwic'))
    print(topic.concordance(matches=[148430], cut_off=None, form='extended'))
def test_concordance_order(germaparl):
    """An unknown ordering keyword raises NotImplementedError."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    dump = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, dump.df)
    with pytest.raises(NotImplementedError):
        concordance.lines(order='fail')