def test_collo_combo(germaparl):
    """Collocates over a combined ['lemma', 'pos'] annotation layer.

    Nodes are parenthesised spans; show() must return a DataFrame.
    """
    corpus = get_corpus(germaparl)
    query = ('[word="\\("] [lemma=".*"]+ [word="\\)"]')
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, ['lemma', 'pos'])
    c = collocates.show(order='log_likelihood')
    # isinstance is the idiomatic type check (was: type(c) == pd.DataFrame)
    assert isinstance(c, pd.DataFrame)
def test_collocates_empty(germaparl):
    """show() on a query with no matches should not raise."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    # second token ("NAHH") cannot match, so the dump is empty
    dump = corpus.query('[lemma="Armin"]? [lemma="NAHH"]')
    collocates = Collocates(corpus, dump.df, p_query='word')
    collocates.show()
def test_collocates_no_mws(germaparl):
    """show() must work when the maximum window size is disabled (mws=None)."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query('[lemma="Armin"]? [lemma="Laschet"]')
    collocates = Collocates(corpus, dump.df, p_query='word', mws=None)
    collocates.show()
def test_collocates_speed_many():
    """Speed smoke test: a high-frequency lemma on the full GERMAPARL_1114 corpus.

    NOTE(review): another test with this exact name exists in this module;
    pytest only collects the last definition, so one of the two never runs —
    confirm and rename one of them.
    """
    corpus = Corpus("GERMAPARL_1114")
    query = '[lemma="sagen"]'
    df_dump = corpus.query(query, context=2, context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    c2 = collocates.show(window=2, cut_off=50)
    # isinstance is the idiomatic type check (was: type(c2) == pd.DataFrame)
    assert isinstance(c2, pd.DataFrame)
def test_query_logging(germaparl):
    """Collocates created with an invalid p-attribute ('fail').

    Presumably this exercises the warning/fallback path — confirm; either
    way, show() must still yield a usable DataFrame.
    """
    corpus = get_corpus(germaparl)
    query = ('[word="\\("] [lemma=".*"]+ [word="\\)"]')
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, 'fail')
    c = collocates.show(order='log_likelihood', window=15)
    # isinstance is the idiomatic type check (was: type(c) == pd.DataFrame)
    assert isinstance(c, pd.DataFrame)
    assert ('Dr.' in c.index)
def test_collocates_speed_many(germaparl):
    """Large-window collocates of a very frequent lemma ("die").

    NOTE(review): another test with this exact name exists in this module;
    pytest only collects the last definition — confirm and rename one.
    """
    corpus = get_corpus(germaparl)
    query = '[lemma="die"]'
    df_dump = corpus.query(query, context_break='text').df
    collocates = Collocates(corpus, df_dump, p_query='lemma', mws=100)
    c2 = collocates.show(window=50, cut_off=50)
    # the comma is the top collocate by the default ordering
    assert c2.index[0] == ','
    # isinstance is the idiomatic type check (was: type(c2) == pd.DataFrame)
    assert isinstance(c2, pd.DataFrame)
def test_query_default(germaparl):
    """Default collocate retrieval on the 'lemma' layer returns a DataFrame
    containing expected items ('Dr.' next to parenthesised spans)."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = ('[word="\\("] [lemma=".*"]+ [word="\\)"]')
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, 'lemma')
    c = collocates.show(order='log_likelihood')
    # isinstance is the idiomatic type check (was: type(c) == pd.DataFrame)
    assert isinstance(c, pd.DataFrame)
    assert ('Dr.' in c.index)
def test_collo_single(germaparl):
    """Collocates on a single p-attribute ('lemma')."""
    corpus = get_corpus(germaparl)
    query = ('[word="\\("] [lemma=".*"]+ [word="\\)"]')
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, 'lemma')
    c = collocates.show(order='log_likelihood')
    # removed leftover debug print(c)
    # isinstance is the idiomatic type check (was: type(c) == pd.DataFrame)
    assert isinstance(c, pd.DataFrame)
    assert ('Dr.' in c.index)
def test_collocates_pp(germaparl):
    """Folding flags ("%cd") conflate case variants of collocate items."""
    corpus = get_corpus(germaparl)
    result = corpus.query('"SPD"')
    collocates = Collocates(corpus, result.df, p_query='word')
    # without flags, 'Die' and 'die' are counted separately
    unfolded = collocates.show(order='log_likelihood', cut_off=None)
    assert int(unfolded.loc['Die']['O11']) < int(unfolded.loc['die']['O11'])
    # with case/diacritics folding, only the folded form remains
    folded = collocates.show(order='log_likelihood', cut_off=None, flags="%cd")
    assert ('die' in folded.index and 'Die' not in folded.index)
def test_collocates_nodes(germaparl):
    """Node items themselves must not show up among their collocates."""
    corpus = get_corpus(germaparl)
    # nodes are punctuation marks: comma, full stop, parentheses
    dump = corpus.query('[lemma=","] | [lemma="\\."] | [lemma="\\)"] | [lemma="\\("]')
    collocates = Collocates(corpus, dump.df)
    df = collocates.show(cut_off=None)
    for node_item in (",", "("):
        assert node_item not in df.index
def test_collocates_mwu(germaparl):
    """Multi-word unit nodes ("CDU/CSU"): tokens inside the nodes are
    counted in 'in_nodes' rather than in the window frequency 'f'."""
    corpus = get_corpus(germaparl)
    query = ('[lemma="CDU"] "/"? [lemma="CSU"]?')
    result = corpus.query(query, match_strategy='longest')
    collocates = Collocates(corpus, result.df, 'lemma')
    c = collocates.show(order='log_likelihood', cut_off=None)
    # isinstance is the idiomatic type check (was: type(c) == pd.DataFrame)
    assert isinstance(c, pd.DataFrame)
    assert (len(c) > 9)
    assert ('CSU' in c.index)
    # 'CSU' appears more often inside nodes than in their windows
    assert (int(c.loc['CSU']['in_nodes']) > int(c.loc['CSU']['f']))
def collocates(self, window=5, order='f', cut_off=100, p_query="lemma",
               ams=None, min_freq=2, frequencies=True, flags=None):
    """Convenience wrapper: compute collocates of this object's dump.

    Builds a Collocates object on a copy of the underlying corpus and
    delegates all scoring/selection parameters to its show() method.
    """
    collocate_analysis = Collocates(self.dump.corpus.copy(), self.dump.df, p_query)
    return collocate_analysis.show(
        window=window, order=order, cut_off=cut_off, ams=ams,
        min_freq=min_freq, frequencies=frequencies, flags=flags
    )
def compare_counts(lemma, window, min_freq=0):
    """Compare ccc collocate counts for *lemma* against UCS gold counts.

    Loads the UCS table from tests/gold/ and checks two identities between
    the two counting strategies (see inline comments below).

    NOTE(review): *window* only sets the retrieval context here; scoring is
    always done with window=5 — confirm this asymmetry is intended.
    """
    # TODO: update to reproduceable example
    # CCC: query the node lemma and score its collocates
    corpus = Corpus("GERMAPARL_1114")
    query = '[lemma="' + lemma + '"]'
    df_dump = corpus.query(query, context=window, context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    col = collocates.show(window=5, cut_off=None, min_freq=min_freq)
    # UCS: load the gold counts (quoting=3 == csv.QUOTE_NONE; na_filter off
    # so literal strings like "NA" survive as items)
    ucs = pd.read_csv("tests/gold/ucs-germaparl1114-" + lemma + ".ds.gz",
                      sep="\t", index_col=2, comment="#",
                      quoting=3, na_filter=False)
    ucs.index.name = 'item'
    # UCS counts the node lemma among its own collocates; remove it and
    # remember its co-occurrence frequency (0 if absent)
    try:
        O11_ucs_node = ucs.loc[lemma]['f']
        ucs.drop(lemma, inplace=True)
    except KeyError:
        O11_ucs_node = 0
    # identities that should hold between counting strategies
    # (1) N_ccc + f1_ccc = N_ucs
    # (2) f1_infl_ccc = f1_infl_ucs - O11_ucs_node
    nr = {
        'f1_ccc': int(corpus.marginals([lemma], "lemma")[['freq']].values[0]),
        'N_ccc': int(col[['N']].values[0]),
        'f1_infl_ccc': int(col[['f1']].values[0]),
        'N_ucs': int(ucs[['N']].values[0]),
        'f1_infl_ucs': int(ucs[['f1']].values[0]),
        'O11_ucs_node': O11_ucs_node
    }
    # make dataframes comparable
    ucs = ucs[['f', 'f2']]
    ucs.columns = ['O11', 'f2']
    ucs.sort_values(by=['O11', 'item'], ascending=False, inplace=True)
    assert (nr['N_ccc'] + nr['f1_ccc'] == nr['N_ucs'])
    assert (nr['f1_infl_ccc'] == nr['f1_infl_ucs'] - nr['O11_ucs_node'])
def _assert_ucs_identities(corpus, ucs_counts, lemma, context, min_freq):
    """Check the four counting identities between ccc and UCS for one node.

    Identities that should hold between counting strategies:
    - O11 = f_ucs
    - O11 + O21 = f2_ucs
    - O11 + O12 + O21 + O22 + freq[node] = N_ucs
    - O11 + O12 = f1_ucs - O11_ucs[node]
    """
    df_dump = corpus.query('[lemma="%s"]' % lemma,
                           context=context, context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    counts = collocates.show(window=context, cut_off=None, min_freq=min_freq)[[
        'O11', 'O12', 'O21', 'O22', 'in_nodes'
    ]]
    counts = counts.join(ucs_counts[lemma])
    # UCS counts the node among its own collocates; ccc does not
    ucs_node_cooc = ucs_counts[lemma].loc[lemma]
    ccc_node_freq = corpus.marginals([lemma], "lemma")['freq'].values[0]
    assert (counts['O11'].equals(counts['f_ucs']))
    assert ((counts['O11'] + counts['O21']).equals(counts['f2_ucs']))
    assert ((counts['O11'] + counts['O12'] + counts['O21'] + counts['O22'] +
             ccc_node_freq).equals(counts['N_ucs']))
    assert ((counts['O11'] + counts['O12']).equals(
        counts['f1_ucs'] - ucs_node_cooc['f_ucs']))


def test_compare_counts(germaparl, ucs_counts):
    """Counting identities between ccc and UCS hold for two node lemmas.

    The two copy-pasted halves of the original test are factored into
    _assert_ucs_identities(); parameters are unchanged.
    """
    corpus = get_corpus(germaparl)
    # [lemma="Land"]
    _assert_ucs_identities(corpus, ucs_counts, "Land", context=10, min_freq=0)
    # [lemma="und"]
    _assert_ucs_identities(corpus, ucs_counts, "und", context=5, min_freq=2)
def test_collocates_persistence(germaparl):
    """A Collocates object keeps the dump it was created with: re-querying
    the corpus afterwards must not change its results."""
    corpus = get_corpus(germaparl)
    query_1 = ('"SPD"')
    query_2 = ('"CSU"')
    # collocates of query_1
    result = corpus.query(query_1, context_break='s').df
    collocates = Collocates(corpus, result, 'lemma')
    line_1 = collocates.show()
    # a new corpus query must not leak into the existing object
    result = corpus.query(query_2, context_break='s').df
    line_2 = collocates.show()
    # a fresh object picks up the new dump
    collocates = Collocates(corpus, result, 'lemma')
    line_3 = collocates.show()
    assert line_1.equals(line_2)
    assert not line_2.equals(line_3)
def test_collocates_no_mws(germaparl):
    """Disabling the maximum window size (mws=None) must not break show()."""
    corpus = get_corpus(germaparl)
    dump = corpus.query('[lemma="Armin"]? [lemma="Laschet"]')
    Collocates(corpus, dump.df, p_query='word', mws=None).show()
def test_collocates_empty(germaparl):
    """show() on an empty dump (unmatchable query) must not raise."""
    corpus = get_corpus(germaparl)
    # "NAHH" cannot match, so the query yields no hits
    dump = corpus.query('[lemma="Armin"]? [lemma="NAHH"]')
    Collocates(corpus, dump.df, p_query='word').show()