def test_concordance_persistence(germaparl):
    """Check that a Concordance keeps showing the dump it was built from.

    Running a second query on the corpus must not change the lines of an
    already constructed Concordance; only a Concordance built from the new
    dump yields different lines.
    """
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    first_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    second_query = '"und"'

    def first_line_df(conc):
        # dataframe of the single line returned with cut_off=1
        return conc.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # concordance built on the first dump: shows results for the first query
    dump = corpus.query(first_query, context_break='s')
    concordance = Concordance(corpus, dump.df)
    df_1 = first_line_df(concordance)

    # new dump, but the old concordance object: still the first query
    dump = corpus.query(second_query, context_break='s')
    df_2 = first_line_df(concordance)

    # concordance rebuilt on the new dump: shows results for the second query
    concordance = Concordance(corpus, dump.df)
    df_3 = first_line_df(concordance)

    assert df_1.equals(df_2)
    assert not df_2.equals(df_3)
def test_concordance_lines(germaparl):
    """Exercise the 'raw' (default), 'simple' and 'kwic' output forms."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    dump = corpus.query(
        '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]',
        context_break='s',
        match_strategy='longest',
    )
    conc = Concordance(corpus, dump.df)

    # no arguments: the lines come back in a 'raw' column
    raw_lines = conc.lines()
    assert len(raw_lines) > 10
    assert 'raw' in raw_lines.columns
    first_raw = raw_lines.iloc[0]['raw']
    for key in ['cpos', 'match', 'word']:
        assert key in first_raw

    # simple
    simple_lines = conc.lines(form='simple', cut_off=10)
    assert 'text' in simple_lines.columns
    assert len(simple_lines) == 10

    # kwic
    kwic_lines = conc.lines(form='kwic', cut_off=10)
    for column in ['left', 'node', 'right']:
        assert column in kwic_lines.columns
    assert len(kwic_lines) == 10

    # kwic with s-attribute
    kwic_meta = conc.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(kwic_meta) == 10
    assert 'text_id' in kwic_meta.columns
def test_concordance_order(germaparl):
    """An unknown ``order`` value is expected to raise NotImplementedError."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    hits = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, hits.df)

    with pytest.raises(NotImplementedError):
        concordance.lines(order='fail')
def test_concordance_form_kwic(germaparl):
    """The 'kwic' form returns all 13 lines for the bracketed CDU query."""
    corpus = get_corpus(germaparl)
    dump = corpus.query(
        '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]',
        context_break='s',
    )
    kwic_lines = Concordance(corpus, dump.df).lines(
        order='random', cut_off=100, form='kwic'
    )
    assert len(kwic_lines) == 13
def test_concordance_anchors_weird(germaparl):
    """Non-consecutive anchor numbers (9, 2, 5) still yield all 13 lines."""
    corpus = get_corpus(germaparl)
    dump = corpus.query(
        '[word="\\["] @9[lemma="CDU"] "/" @2".*" @5[word="\\]"]',
        context_break='s',
    )
    frames = Concordance(corpus, dump.df).lines(
        order='random', cut_off=100, form='dataframe'
    )
    assert len(frames) == 13
def test_concordance_anchors(germaparl):
    """Anchored query rendered as dataframes with lemma and pos shown."""
    corpus = get_corpus(germaparl)
    dump = corpus.query(
        '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]',
        context_break='s',
    )
    frames = Concordance(corpus, dump.df).lines(
        p_show=['lemma', 'pos'], form='dataframe'
    )
    assert len(frames) == 13
def test_concordance_many(germaparl):
    """A frequent lemma yields exactly 100 lines when no arguments are given."""
    corpus = get_corpus(germaparl)
    dump = corpus.query('[lemma="oder"]')
    conc = Concordance(corpus, dump.df)

    # presumably capped by the default cut_off of lines() -- TODO confirm
    assert len(conc.lines()) == 100
def test_concordance_last(germaparl):
    """``order='last'`` must return a pandas DataFrame."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)

    # isinstance instead of an exact type comparison: DataFrame subclasses
    # are acceptable results as well
    assert isinstance(conc.lines(order='last'), pd.DataFrame)
def test_concordance_p_atts(germaparl):
    """Requested p-attributes show up as columns of each line's dataframe."""
    corpus = get_corpus(germaparl)
    dump = corpus.query(
        '[word="\\["] [lemma="CDU"] "/" ".*" [word="\\]"]',
        context_break='s',
    )
    frames = Concordance(corpus, dump.df).lines(
        p_show=['lemma', 'pos'], form='dataframe'
    )

    first_frame = frames.iloc[0]['dataframe']
    assert 'pos' in first_frame.columns
    assert len(frames) == 13
def test_concordance_fallback(germaparl):
    """'simple' form with two p-attributes still produces a DataFrame."""
    corpus = get_corpus(germaparl)
    hits = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, hits.df)

    result = concordance.lines(
        order='last', form='simple', p_show=['word', 'lemma']
    )
    assert isinstance(result, pd.DataFrame)
def test_concordance_p_slots(germaparl):
    """Passing only ``p_slots='lemma'`` makes ``lines`` return None."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    hits = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, hits.df)

    result = concordance.lines(p_slots='lemma')
    assert result is None
def test_concordance_empty(germaparl):
    """A query with a lemma that is expected not to match gives None lines."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    hits = corpus.query('[lemma="Gerhard"]? [lemma="NAHH"]')
    concordance = Concordance(corpus, hits.df)

    result = concordance.lines()
    assert result is None
def test_concordance_form_simple(germaparl):
    """The 'simple' form returns all 13 lines for the bracketed CDU query."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    dump = corpus.query(
        '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]',
        context_break='s',
    )
    simple_lines = Concordance(corpus, dump.df).lines(
        order='random', cut_off=100, form='simple'
    )
    assert len(simple_lines) == 13
def test_concordance_many(germaparl):
    """A frequent lemma yields exactly 100 lines when no arguments are given."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    dump = corpus.query('[lemma="oder"]')
    conc = Concordance(corpus, dump.df)

    # presumably capped by the default cut_off of lines() -- TODO confirm
    assert len(conc.lines()) == 100
def test_concordance_fallback(germaparl):
    """'simple' form with two p-attributes must still produce a DataFrame."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)

    lines = conc.lines(order='last', form='simple', p_show=['word', 'lemma'])
    # isinstance instead of an exact type comparison: DataFrame subclasses
    # are acceptable results as well
    assert isinstance(lines, pd.DataFrame)
def test_concordance_meta(germaparl):
    """Requested s-attributes are added as columns of the lines table."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    dump = corpus.query('[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]')
    conc = Concordance(corpus, dump.df)

    with_meta = conc.lines(s_show=['text_name', 'p_type'])
    columns = with_meta.columns
    assert 'text_name' in columns
    assert 'p_type' in columns
    assert len(with_meta) == 13
def test_concordance_lines_dataframes(germaparl):
    """The 'dataframes' form stores one DataFrame per line in column 'df'."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query, context_break='s', match_strategy='longest')
    concordance = Concordance(corpus, result.df)

    lines = concordance.lines(form='dataframes', s_show=['text_id'], cut_off=10)
    assert 'df' in lines.columns
    # isinstance instead of an exact type comparison: DataFrame subclasses
    # are acceptable cell values as well
    assert isinstance(lines['df'].iloc[0], pd.DataFrame)
def concordance(self, context=None, matches=None, p_show=['word'], s_show=[],
                order='random', cut_off=100, form='dataframes'):
    """Build concordance lines from ``self.dump``, optionally narrowing the context.

    :param context: context width around each match. ``None`` uses
        ``self.parameters['context']``. A larger value cannot be served and
        falls back to that value (a warning is logged); a smaller value
        shrinks the stored ``context`` / ``contextend`` boundaries.
    :param matches: forwarded unchanged to ``Concordance.lines``.
    :param p_show: forwarded unchanged to ``Concordance.lines``.
    :param s_show: forwarded unchanged to ``Concordance.lines``.
    :param order: forwarded unchanged to ``Concordance.lines``.
    :param cut_off: forwarded unchanged to ``Concordance.lines``.
    :param form: forwarded unchanged to ``Concordance.lines``.
    :return: whatever ``Concordance.lines`` returns for these arguments.
    """
    # NOTE: the list defaults are shared between calls; this method only
    # forwards them and never modifies them itself.
    max_context = self.parameters['context']
    if context is None:
        context = max_context

    if context > max_context:
        # lazy %-style arguments: the message is only formatted if emitted
        logger.warning("out of context; falling back to context=%d",
                       max_context)
        df = self.dump.df
    elif context < max_context:
        # reset_index() returns a new frame, so self.dump.df stays untouched
        df = self.dump.df.reset_index()
        df['context_new'] = df['match'] - context
        df['contextend_new'] = df['matchend'] + context
        # never extend beyond the boundaries already stored in the dump:
        # take the larger left boundary and the smaller right boundary
        df['context'] = df[['context', 'context_new']].max(axis=1)
        df['contextend'] = df[['contextend', 'contextend_new']].min(axis=1)
        df = df.drop(['context_new', 'contextend_new'], axis=1)
        df = df.set_index(['match', 'matchend'])
    else:
        df = self.dump.df

    conc = Concordance(self.dump.corpus.copy(), df)
    return conc.lines(matches=matches, p_show=p_show, s_show=s_show,
                      p_text=None, p_slots=None, slots=[],
                      order=order, cut_off=cut_off, form=form)
def test_concordance_lines_extended(germaparl):
    """The 'extended' form yields per-line dataframes, anchor and text columns."""
    corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl["registry_path"],
    )
    query = '[word="\\["] @1[lemma="CDU"] "/" @2 ".*" @3[word="\\]"]'
    result = corpus.query(query, context_break='s', match_strategy='longest')
    concordance = Concordance(corpus, result.df)

    p_slots = 'lemma'
    slots = {'test': [1, 3]}
    lines = concordance.lines(form='extended', p_show=['word', 'lemma'],
                              s_show=['text_id'], slots=slots,
                              p_text='word', p_slots=p_slots, cut_off=10)

    assert 'df' in lines.columns
    # isinstance instead of an exact type comparison: DataFrame subclasses
    # are acceptable cell values as well
    assert isinstance(lines['df'].iloc[0], pd.DataFrame)
    assert 3 in lines.columns
    assert 'text' in lines.columns
def test_concordance_lines(germaparl):
    """Exercise the default, 'kwic', 'slots', 'dict' and 'dataframe' forms."""
    corpus = get_corpus(germaparl)
    dump = corpus.query(
        '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]',
        context_break='s',
        match_strategy='longest',
    )
    conc = Concordance(corpus, dump.df)

    def has_columns(frame, names):
        return all(name in frame.columns for name in names)

    # default = simple
    default_lines = conc.lines()
    assert len(default_lines) > 10
    assert 'word' in default_lines.columns

    # kwic
    kwic_lines = conc.lines(form='kwic', cut_off=10)
    assert has_columns(kwic_lines, ['left_word', 'node_word', 'right_word'])
    assert len(kwic_lines) == 10

    # kwic with s-attribute
    kwic_meta = conc.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(kwic_meta) == 10
    assert 'text_id' in kwic_meta.columns

    # slots
    slot_lines = conc.lines(form='slots', s_show=['text_id'], cut_off=10)
    assert len(slot_lines) == 10
    assert has_columns(slot_lines, ['match..matchend_word', '1_word'])

    # dict
    dict_lines = conc.lines(form='dict', s_show=['text_id'], cut_off=10)
    assert len(dict_lines) == 10
    assert has_columns(dict_lines, ['dict', 'text_id'])

    # dataframe
    frame_lines = conc.lines(form='dataframe', s_show=['text_id'], cut_off=10)
    assert len(frame_lines) == 10
    assert has_columns(frame_lines, ['dataframe', 'text_id'])
def concordance(self, window=5, matches=None, p_show=['word'], s_show=[],
                order='random', cut_off=100, form='dataframes'):
    """Return concordance lines annotated with topic and discourseme hits.

    ``self.df_nodes`` has duplicate entries per match, so the method
    (1) collapses them to one row per match, indexed by (match, matchend),
        with one set of corpus positions per discourseme, and
    (2) converts each line to a dataframe and marks, per position, whether
        it belongs to the topic match or to a discourseme.

    :param window: key into ``self.df_nodes``; if it is missing, the nodes
        are obtained from ``self.slice_discs(window)`` instead.
    :param matches: forwarded unchanged to ``Concordance.lines``.
    :param p_show: forwarded unchanged to ``Concordance.lines``.
    :param s_show: forwarded unchanged to ``Concordance.lines``.
    :param order: forwarded unchanged to ``Concordance.lines``.
    :param cut_off: forwarded unchanged to ``Concordance.lines``.
    :param form: forwarded unchanged to ``Concordance.lines``; the
        post-processing below reads a ``'df'`` column from the result.
    :return: the lines table with its ``'df'`` column replaced by the
        annotated dataframes, or ``None`` if ``Concordance.lines``
        returned ``None``.
    """
    # NOTE: the list defaults are shared between calls; this method only
    # forwards them and never modifies them itself.

    # make sure we're having the right context
    if window not in self.df_nodes.keys():
        df_nodes = self.slice_discs(window).copy()
    else:
        df_nodes = self.df_nodes[window]

    # get ids of all discoursemes
    disc_ids = set(self.discoursemes.keys())

    logger.info("converting discourse nodes to regular dump")
    # TODO speed up
    match_column = df_nodes['match']
    rows = list()
    for match in set(match_column):
        df_loc = df_nodes.loc[match_column == match]
        # all rows of one match share these values; read them from the first
        first = df_loc.iloc[0]
        row = {
            'match': match,
            'matchend': first['matchend'],
            'context_id': first['context_id'],
            'context': first['context'],
            'contextend': first['contextend'],
        }
        for idx in disc_ids:
            # union of all positions covered by this discourseme
            positions = set()
            for start, end in zip(df_loc['match_' + idx],
                                  df_loc['matchend_' + idx]):
                positions.update(range(start, end + 1))
            row[idx] = positions
        rows.append(row)
    df = DataFrame(rows)
    df = df.set_index(["match", "matchend"])

    logger.info("converting each line to dataframe")
    conc = Concordance(self.corpus.copy(), df)
    lines = conc.lines(matches=matches, p_show=p_show, s_show=s_show,
                       p_text=None, p_slots=None, slots=[],
                       order=order, cut_off=cut_off, form=form)
    if lines is None:
        # nothing to annotate; avoid calling .iterrows() on None
        return None

    logger.info("inserting discourseme and window/context info")
    # TODO mark out of context
    dfs = list()
    for (match, matchend), line in lines.iterrows():
        line_df = line['df']

        # indicate topic matches
        line_df[self.topic.idx] = line_df.index.isin(
            set(range(match, matchend + 1))
        )

        # indicate discourseme matches
        for idx in disc_ids:
            line_df[idx] = line_df.index.isin(line[idx])

        line_df = line_df.drop(
            ['match', 'matchend', 'context', 'contextend'], axis=1
        )
        dfs.append(line_df)

    lines['df'] = dfs

    return lines
def test_concordance_order(germaparl):
    """Call ``lines`` with the order value 'fail'.

    There is no assertion: the test passes as long as the call completes
    without raising.
    """
    corpus = get_corpus(germaparl)
    hits = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, hits.df)
    concordance.lines(order='fail')
def test_concordance_empty(germaparl):
    """A query with a lemma that is expected not to match gives empty lines."""
    corpus = get_corpus(germaparl)
    hits = corpus.query('[lemma="Gerhard"]? [lemma="NAHH"]')
    concordance = Concordance(corpus, hits.df)

    result = concordance.lines()
    assert result.empty
def test_concordance_last(germaparl):
    """``order='last'`` returns a pandas DataFrame."""
    corpus = get_corpus(germaparl)
    hits = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, hits.df)

    result = concordance.lines(order='last')
    assert isinstance(result, pd.DataFrame)