def test_concordance_anchors(germaparl):
    """Anchored query (@1, @2) gives 13 lines in 'dataframe' form."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    dump = cwb_corpus.query(cqp_query, context_break='s')

    conc = Concordance(cwb_corpus, dump.df)
    conc_lines = conc.lines(form='dataframe', p_show=['lemma', 'pos'])

    assert len(conc_lines) == 13
# NOTE(review): a second function with this name is visible further down;
# if both live in one module, the later definition shadows this one.
def test_concordance_many(germaparl):
    """lines() with default arguments returns 100 rows for [lemma="oder"]."""
    cwb_corpus = get_corpus(germaparl)
    dump = cwb_corpus.query('[lemma="oder"]')

    conc = Concordance(cwb_corpus, dump.df)
    conc_lines = conc.lines()

    assert len(conc_lines) == 100
def test_concordance_simple(germaparl):
    """simple() returns one row per dump row, with a column per p-attribute."""
    cwb_corpus = get_corpus(germaparl)
    dump_df = cwb_corpus.query('"CSU"').df

    conc = Concordance(cwb_corpus, dump_df)
    simple_lines = conc.simple(dump_df, p_show=['word', 'lemma'])

    assert len(simple_lines) == len(dump_df)
    for attribute in ('word', 'lemma'):
        assert attribute in simple_lines.columns
def test_concordance_form_kwic(germaparl):
    """lines(form='kwic') in random order with cut_off=100 returns 13 rows."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    dump = cwb_corpus.query(cqp_query, context_break='s')

    conc = Concordance(cwb_corpus, dump.df)
    kwic_lines = conc.lines(form='kwic', order='random', cut_off=100)

    assert len(kwic_lines) == 13
def test_concordance_dict(germaparl):
    """dict() stores a dict with a 'word' key in the 'dict' column."""
    cwb_corpus = get_corpus(germaparl)
    dump_df = cwb_corpus.query('[lemma="gehen"]', context=None).df

    conc = Concordance(cwb_corpus, dump_df)
    dict_lines = conc.dict(dump_df, p_show=['word'])

    first_entry = dict_lines['dict'].iloc[0]
    assert isinstance(first_entry, dict)
    assert 'word' in first_entry
def test_concordance_anchors_weird(germaparl):
    """A query using the anchors @9, @2 and @5 still yields 13 lines."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @9[lemma="CDU"] "/" @2".*" @5[word="\\]"]'
    dump = cwb_corpus.query(cqp_query, context_break='s')

    conc = Concordance(cwb_corpus, dump.df)
    conc_lines = conc.lines(form='dataframe', order='random', cut_off=100)

    assert len(conc_lines) == 13
def test_concordance_simple_nocontext(germaparl):
    """simple() works on a dump that was queried with context=None."""
    cwb_corpus = get_corpus(germaparl)
    dump_df = cwb_corpus.query('[lemma="gehen"]', context=None).df

    conc = Concordance(cwb_corpus, dump_df)
    simple_lines = conc.simple(dump_df, p_show=['word'])

    assert len(simple_lines) == len(dump_df)
    for attribute in ('word',):
        assert attribute in simple_lines.columns
def test_concordance_p_atts(germaparl):
    """Requested p-attributes appear as columns of each per-line dataframe."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] [lemma="CDU"] "/" ".*" [word="\\]"]'
    dump = cwb_corpus.query(cqp_query, context_break='s')

    conc = Concordance(cwb_corpus, dump.df)
    conc_lines = conc.lines(form='dataframe', p_show=['lemma', 'pos'])

    first_line_df = conc_lines.iloc[0]['dataframe']
    assert 'pos' in first_line_df.columns
    assert len(conc_lines) == 13
def test_concordance_p_slots(germaparl):
    """lines() called with only p_slots='lemma' is expected to return None."""
    cwb_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    query_dump = cwb_corpus.query(cqp_query)

    concordance = Concordance(cwb_corpus, query_dump.df)
    outcome = concordance.lines(p_slots='lemma')

    assert outcome is None
def test_concordance_last(germaparl):
    """lines(order='last') returns a pandas DataFrame."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)

    # isinstance() instead of an exact type() comparison, as in the
    # sibling tests of this module
    assert isinstance(conc.lines(order='last'), pd.DataFrame)
# NOTE(review): a second function with this name is visible further down;
# if both live in one module, the later definition shadows this one.
def test_concordance_fallback(germaparl):
    """lines(order='last', form='simple') with two p-attributes gives a DataFrame."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    query_dump = cwb_corpus.query(cqp_query)

    concordance = Concordance(cwb_corpus, query_dump.df)
    outcome = concordance.lines(
        order='last', form='simple', p_show=['word', 'lemma']
    )

    assert isinstance(outcome, pd.DataFrame)
def test_concordance_empty(germaparl):
    """lines() is expected to return None for the '[lemma="NAHH"]' query."""
    cwb_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    cqp_query = '[lemma="Gerhard"]? [lemma="NAHH"]'
    query_dump = cwb_corpus.query(cqp_query)

    concordance = Concordance(cwb_corpus, query_dump.df)
    outcome = concordance.lines()

    assert outcome is None
def test_concordance_order(germaparl):
    """lines(order='fail') must raise NotImplementedError."""
    cwb_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    query_dump = cwb_corpus.query(cqp_query)
    concordance = Concordance(cwb_corpus, query_dump.df)

    with pytest.raises(NotImplementedError):
        concordance.lines(order='fail')
def test_concordance_form_simple(germaparl):
    """lines(form='simple') in random order with cut_off=100 returns 13 rows."""
    cwb_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    query_dump = cwb_corpus.query(cqp_query, context_break='s')

    conc = Concordance(cwb_corpus, query_dump.df)
    simple_lines = conc.lines(form='simple', order='random', cut_off=100)

    assert len(simple_lines) == 13
# NOTE(review): a function with this name is also visible further up;
# if both live in one module, this definition shadows the earlier one.
def test_concordance_many(germaparl):
    """lines() with default arguments returns 100 rows for [lemma="oder"]."""
    cwb_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query_dump = cwb_corpus.query('[lemma="oder"]')

    conc = Concordance(cwb_corpus, query_dump.df)
    conc_lines = conc.lines()

    assert len(conc_lines) == 100
def test_concordance_dataframes(germaparl):
    """dataframe() adds a 'dataframe' column whose cells are DataFrames."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    dump = cwb_corpus.query(
        cqp_query, context_break='s', match_strategy='longest'
    )
    conc = Concordance(cwb_corpus, dump.df)

    # dataframe() is fed the output of dict(), both with the same p-attributes
    shown = ['word', 'lemma']
    dict_lines = conc.dict(dump.df, p_show=shown)
    df_lines = conc.dataframe(dict_lines, p_show=shown)

    assert 'dataframe' in df_lines.columns
    first_cell = df_lines['dataframe'].iloc[0]
    assert isinstance(first_cell, pd.DataFrame)
# NOTE(review): a function with this name is also visible further up;
# if both live in one module, this definition shadows the earlier one.
def test_concordance_fallback(germaparl):
    """lines(order='last', form='simple') with two p-attributes gives a DataFrame."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)

    lines = conc.lines(order='last', form='simple', p_show=['word', 'lemma'])

    # isinstance() instead of an exact type() comparison
    assert isinstance(lines, pd.DataFrame)
def test_concordance_kwic(germaparl):
    """kwic() yields left/node/right columns for every requested p-attribute."""
    cwb_corpus = get_corpus(germaparl)
    dump_df = cwb_corpus.query('[lemma="gehen"]').df

    conc = Concordance(cwb_corpus, dump_df)
    kwic_lines = conc.kwic(dump_df, p_show=['word', 'lemma'])

    assert len(kwic_lines) == len(dump_df)

    expected_columns = (
        'left_word', 'node_word', 'right_word',
        'left_lemma', 'node_lemma', 'right_lemma',
    )
    for column in expected_columns:
        assert column in kwic_lines.columns
def test_concordance_meta(germaparl):
    """s-attributes requested via s_show show up as columns of lines()."""
    cwb_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    cqp_query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    query_dump = cwb_corpus.query(cqp_query)

    conc = Concordance(cwb_corpus, query_dump.df)
    conc_lines = conc.lines(s_show=['text_name', 'p_type'])

    columns = conc_lines.columns
    assert 'text_name' in columns
    assert 'p_type' in columns
    assert len(conc_lines) == 13
def test_concordance_lines_dataframes(germaparl):
    """lines(form='dataframes') provides a 'df' column holding DataFrames."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query, context_break='s', match_strategy='longest')
    concordance = Concordance(corpus, result.df)

    lines = concordance.lines(form='dataframes', s_show=['text_id'],
                              cut_off=10)

    assert 'df' in lines.columns
    # isinstance() instead of an exact type() comparison
    assert isinstance(lines['df'].iloc[0], pd.DataFrame)
def test_concordance_export_dataframe(germaparl):
    """_export(..., form='dataframe') on the first dump row gives a DataFrame."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    dump = cwb_corpus.query(cqp_query, context_break='s')
    conc = Concordance(cwb_corpus, dump.df)

    # the row's name (its index label) is passed alongside the row itself
    first_row = dump.df.iloc[0]
    exported = conc._export(
        first_row.name, first_row, p_show=['word', 'pos'], form='dataframe'
    )

    assert isinstance(exported, pd.DataFrame)
def test_concordance_export_dict(germaparl):
    """_export(..., form='dict') returns a dict that contains 'cpos'."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    dump = cwb_corpus.query(cqp_query)
    conc = Concordance(cwb_corpus, dump.df)

    # the row's name (its index label) is passed alongside the row itself
    first_row = dump.df.iloc[0]
    exported = conc._export(
        first_row.name, first_row, p_show=['word', 'pos'], form='dict'
    )

    assert isinstance(exported, dict)
    assert 'cpos' in exported
def test_concordance_line(germaparl):
    """text_line() on the first dump row returns a dict that contains 'cpos'."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)

    # the row's name (its index label) is passed alongside the row itself
    line = result.df.iloc[0]
    text_line = concordance.text_line(line.name, line, p_show=['word', 'pos'])

    # isinstance() instead of an exact type() comparison
    assert isinstance(text_line, dict)
    assert 'cpos' in text_line
def test_concordance_line2df(germaparl):
    """line2df() turns a text line into a dict whose 'df' entry is a DataFrame."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)

    line = result.df.iloc[0]
    text_line = concordance.text_line(line.name, line, ['word'])
    res = line2df(text_line)

    # isinstance() instead of exact type() comparisons
    assert isinstance(res, dict)
    assert isinstance(res['df'], pd.DataFrame)
def test_concordance_slots_regions(germaparl):
    """slots() with two explicit regions yields one '<start>..<end>_word' column each."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = (
        r'[pos="NE"]? @1[pos="NE"] @2"\[" ([word="[A-Z]+"]+ "/"?)+ @3"\]"'
    )
    anchor_corrections = {2: +1, 3: -1}
    dump_df = cwb_corpus.query(
        cqp_query,
        context=10,
        context_break='s',
        match_strategy='longest',
        corrections=anchor_corrections,
    ).df

    conc = Concordance(cwb_corpus, dump_df)
    regions = [['match', 1], [2, 3]]
    slot_lines = conc.slots(dump_df, ['word'], slots=regions)

    expected_columns = {"word", "match..1_word", "2..3_word"}
    assert set(slot_lines.columns) == expected_columns
def test_concordance_persistence(germaparl):
    """A Concordance keeps showing the dump it was created with.

    Running a second query on the corpus must not change the output of an
    existing Concordance; only a newly created Concordance reflects the new
    dump.
    """
    cwb_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    first_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    second_query = '"und"'

    # concordance created on the dump of the first query
    first_dump = cwb_corpus.query(first_query, context_break='s')
    conc = Concordance(cwb_corpus, first_dump.df)
    df_first = conc.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # a new query is run, but the existing concordance is reused:
    # it should still show results for the first query
    second_dump = cwb_corpus.query(second_query, context_break='s')
    df_reused = conc.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # a fresh concordance on the second dump shows results for the second query
    conc = Concordance(cwb_corpus, second_dump.df)
    df_second = conc.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    assert df_first.equals(df_reused)
    assert not df_reused.equals(df_second)
def test_concordance_slots_singletons(germaparl):
    """slots() without explicit regions yields per-anchor and match..matchend columns."""
    cwb_corpus = get_corpus(germaparl)
    cqp_query = (
        r'[pos="NE"]? @1[pos="NE"] @2"\[" ([word="[A-Z]+"]+ "/"?)+ @3"\]"'
    )
    anchor_corrections = {2: +1, 3: -1}
    dump_df = cwb_corpus.query(
        cqp_query,
        context=2,
        context_break='s',
        match_strategy='longest',
        corrections=anchor_corrections,
    ).df

    conc = Concordance(cwb_corpus, dump_df)
    slot_lines = conc.slots(dump_df, ['word', 'lemma'])

    expected_columns = {
        "word", "lemma",
        "1_word", "1_lemma",
        "2_word", "2_lemma",
        "3_word", "3_lemma",
        "match..matchend_word", "match..matchend_lemma",
    }
    assert set(slot_lines.columns) == expected_columns
def test_concordance_line2simple(germaparl):
    """line2simple() returns a dict of strings, in plain and in kwic mode."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)

    line = result.df.iloc[0]
    text_line = concordance.text_line(line.name, line, ['word'])

    # isinstance() is used throughout instead of exact type() comparisons

    # simple: a single 'text' string
    res = line2simple(text_line)
    assert isinstance(res, dict)
    assert isinstance(res["text"], str)

    # kwic: separate 'left', 'node' and 'right' strings
    res = line2simple(text_line, kwic=True)
    assert isinstance(res, dict)
    for key in ("left", "node", "right"):
        assert isinstance(res[key], str)
def test_concordance_lines(germaparl):
    """Check the columns and row counts of lines() for several output forms."""
    cwb_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    query_dump = cwb_corpus.query(
        cqp_query, context_break='s', match_strategy='longest'
    )
    conc = Concordance(cwb_corpus, query_dump.df)

    # default form: a 'raw' column
    raw_lines = conc.lines()
    assert len(raw_lines) > 10
    assert 'raw' in raw_lines.columns
    for key in ('cpos', 'match', 'word'):
        assert key in raw_lines.iloc[0]['raw']

    # simple: a 'text' column, limited by cut_off
    simple_lines = conc.lines(form='simple', cut_off=10)
    assert 'text' in simple_lines.columns
    assert len(simple_lines) == 10

    # kwic: 'left', 'node' and 'right' columns
    kwic_lines = conc.lines(form='kwic', cut_off=10)
    for column in ('left', 'node', 'right'):
        assert column in kwic_lines.columns
    assert len(kwic_lines) == 10

    # kwic with an additional s-attribute column
    kwic_meta_lines = conc.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(kwic_meta_lines) == 10
    assert 'text_id' in kwic_meta_lines.columns
def concordance(self, context=None, matches=None, p_show=None, s_show=None,
                order='random', cut_off=100, form='dataframes'):
    """Create concordance lines for ``self.dump`` with an optional narrower context.

    :param context: context window around each match. ``None`` (default)
        uses ``self.parameters['context']``. A larger value triggers a
        warning and the dump is used unchanged; a smaller value narrows the
        ``context``/``contextend`` columns of the dump.
    :param matches: passed through to ``Concordance.lines``.
    :param p_show: passed through to ``Concordance.lines``;
        defaults to ``['word']``.
    :param s_show: passed through to ``Concordance.lines``;
        defaults to ``[]``.
    :param order: passed through to ``Concordance.lines``.
    :param cut_off: passed through to ``Concordance.lines``.
    :param form: passed through to ``Concordance.lines``.
    :return: the result of ``Concordance.lines``.
    """
    # None sentinels avoid sharing one mutable default list between calls
    p_show = ['word'] if p_show is None else p_show
    s_show = [] if s_show is None else s_show

    # NOTE(review): if self.parameters['context'] is itself None, the
    # comparisons below raise TypeError — confirm it is always a number.
    available = self.parameters['context']
    if context is None:
        context = available

    if context > available:
        # cannot widen beyond what the dump was created with
        logger.warning("out of context; falling back to context=%d",
                       available)
        df = self.dump.df
    elif context < available:
        # narrow the window to [match - context, matchend + context],
        # never exceeding the context boundaries already in the dump
        df = self.dump.df.reset_index()
        df['context_new'] = df['match'] - context
        df['contextend_new'] = df['matchend'] + context
        df['context'] = df[['context', 'context_new']].max(axis=1)
        df['contextend'] = df[['contextend', 'contextend_new']].min(axis=1)
        df = df.drop(['context_new', 'contextend_new'], axis=1)
        df = df.set_index(['match', 'matchend'])
    else:
        df = self.dump.df

    conc = Concordance(self.dump.corpus.copy(), df)
    return conc.lines(matches=matches, p_show=p_show, s_show=s_show,
                      p_text=None, p_slots=None, slots=[],
                      order=order, cut_off=cut_off, form=form)