Example #1
0
def test_dump_from_query_lib(brexit):
    """Dump the matches of a library-backed query and check the result size.

    ``brexit`` is expected to be a mapping with the keys ``corpus_name``,
    ``lib_path``, ``query_lib`` and ``s_query``.
    """
    corpus = Corpus(brexit['corpus_name'], lib_path=brexit['lib_path'])
    df_dump = corpus.dump_from_query(query=brexit['query_lib'],
                                     s_query=brexit['s_query'],
                                     match_strategy='longest')
    # isinstance rather than an exact type comparison, so DataFrame
    # subclasses are accepted as well
    assert isinstance(df_dump, pd.DataFrame)
    assert df_dump.shape[0] > 99
Example #2
0
def test_collocates_no_mws(germaparl):
    """Run ``Collocates.show`` with ``mws=None`` on the dump of a two-token query.

    ``germaparl`` is expected to be a mapping with the keys ``corpus_name``
    and ``registry_path``.
    """
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    cqp_query = '[lemma="Armin"]? [lemma="Laschet"]'
    matches = corpus.query(cqp_query)

    # the test only checks that show() runs; its result is discarded
    Collocates(corpus, matches.df, p_query='word', mws=None).show()
def test_concordance_lines(germaparl):
    """Check the output forms of ``Concordance.lines`` for one query.

    Covers the default form (``raw`` column), ``simple`` (``text`` column),
    ``kwic`` (``left``/``node``/``right`` columns) and ``kwic`` with an
    additional s-attribute column.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    dump = corpus.query(cqp_query, context_break='s', match_strategy='longest')
    conc = Concordance(corpus, dump.df)

    # default form: each line carries a 'raw' entry
    raw_lines = conc.lines()
    assert len(raw_lines) > 10
    assert 'raw' in raw_lines.columns
    for key in ['cpos', 'match', 'word']:
        assert key in raw_lines.iloc[0]['raw']

    # simple form, limited to ten lines
    simple_lines = conc.lines(form='simple', cut_off=10)
    assert 'text' in simple_lines.columns
    assert len(simple_lines) == 10

    # kwic form, limited to ten lines
    kwic_lines = conc.lines(form='kwic', cut_off=10)
    for column in ['left', 'node', 'right']:
        assert column in kwic_lines.columns
    assert len(kwic_lines) == 10

    # kwic form with an s-attribute shown as an extra column
    annotated_lines = conc.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(annotated_lines) == 10
    assert 'text_id' in annotated_lines.columns
Example #4
0
def test_disc_context(germaparl):
    """Build a ``Disc`` from three items and print the context of its dump."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    # init topic disc
    items = ["SPD", "CSU", "Grünen"]
    topic = Disc(corpus, items, 'lemma', 's', 's')

    context = topic.dump.context()
    print(context)
Example #5
0
def test_query_breakdown(germaparl):
    """Run a bracket-pattern query and print the breakdown of its dump."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    cqp_query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    breakdown = corpus.query(cqp_query).breakdown()
    print(breakdown)
Example #6
0
def test_collocates_speed_many():
    """Compute collocates for a single-lemma query on the ``GERMAPARL_1114`` corpus.

    The corpus is opened by name only, without an explicit registry path.
    The test passes when ``Collocates.show`` returns a pandas DataFrame.
    """
    corpus = Corpus("GERMAPARL_1114")
    query = '[lemma="sagen"]'
    df_dump = corpus.query(query, context=2, context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    c2 = collocates.show(window=2, cut_off=50)
    # isinstance rather than an exact type comparison, so DataFrame
    # subclasses are accepted as well
    assert isinstance(c2, pd.DataFrame)
def test_concordance_persistence(germaparl):
    """Check that a ``Concordance`` keeps showing the dump it was built from.

    Running a second query on the corpus must not change the lines of an
    already constructed ``Concordance``; a newly constructed one must
    produce different lines.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    first_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    second_query = '"und"'

    def first_line_df(conc):
        # data frame of the first (and only) concordance line
        return conc.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # concordance built from the first query's dump
    result = corpus.query(first_query, context_break='s')
    concordance = Concordance(corpus, result.df)
    df_1 = first_line_df(concordance)

    # second query is run, but the old concordance object is reused,
    # so this still shows results for the first query
    result = corpus.query(second_query, context_break='s')
    df_2 = first_line_df(concordance)

    # a fresh concordance on the new dump shows results for the second query
    concordance = Concordance(corpus, result.df)
    df_3 = first_line_df(concordance)

    assert df_1.equals(df_2)
    assert not df_2.equals(df_3)
Example #8
0
def test_query_satt_easy(brexit):
    """Query the ``np`` s-attribute and print its concordance in ``simple`` form."""
    corpus_name = brexit['corpus_name']
    registry_path = brexit['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    regions = corpus.query_s_att('np')
    print(regions.concordance(form='simple'))
Example #9
0
def test_query_satt(germaparl):
    """Query ``p_type`` with the value set ``{'interjection'}`` and print the concordance."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    regions = corpus.query_s_att('p_type', {'interjection'})
    print(regions.concordance(form='simple'))
Example #10
0
def test_collocates_empty(germaparl):
    """Run ``Collocates.show`` on the dump of a ``[lemma="NAHH"]`` query.

    Judging by the test name, the query is presumably expected to yield no
    matches; the test itself only checks that the calls complete.
    """
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    cqp_query = '[lemma="Armin"]? [lemma="NAHH"]'
    matches = corpus.query(cqp_query)

    # result of show() is intentionally discarded
    Collocates(corpus, matches.df, p_query='word').show()
Example #11
0
def test_keywords_query(germaparl):
    """Print the first 50 keyword rows for the dump of ``"SPD" expand to s``."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    keyword_table = corpus.query('"SPD" expand to s').keywords()
    print(keyword_table.head(50))
Example #12
0
def test_query_keywords_collocates(germaparl):
    """Check that ``Horst`` is the top lemma keyword for ``"Horst" expand to s``."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    matches = corpus.query('"Horst" expand to s')
    keywords = Keywords(corpus, df_dump=matches.df, p_query='lemma')

    # first index entry of the table ordered by log-likelihood
    ranked = keywords.show(order='log_likelihood')
    top_item = ranked.head(1).index[0]
    assert 'Horst' == top_item
Example #13
0
def test_keywords_switch(germaparl):
    """Compare ``Keywords`` tables built from the head and tail of one dump.

    Two ``Keywords`` objects built from the same regions must give equal
    tables; the head table and the tail table must differ.
    """
    # get some regions
    corpus = Corpus(corpus_name=germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    all_regions = corpus.query('"und" expand to s', name='test_all').df
    head_regions = all_regions.head(500)
    tail_regions = all_regions.tail(500)

    def keyword_table(regions):
        # fresh Keywords object each time, ordered by log-likelihood
        fresh = Keywords(corpus, df_dump=regions, p_query="lemma")
        return fresh.show(order='log_likelihood')

    # keywords for the head regions, computed twice
    head_first = keyword_table(head_regions)
    head_second = keyword_table(head_regions)
    assert head_second.equals(head_first)

    # keywords for the tail regions, computed twice
    tail_first = keyword_table(tail_regions)
    tail_second = keyword_table(tail_regions)
    assert tail_second.equals(tail_first)

    # different regions must lead to different keyword tables
    assert not tail_second.equals(head_second)
Example #14
0
def test_filter_df(germaparl):
    """Print collocates before and after ``filter_df`` with a stop word file.

    ``germaparl`` is expected to be a mapping with the keys ``corpus_name``,
    ``registry_path`` and ``query``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    collocate_table = corpus.query(germaparl['query']).collocates()
    print(collocate_table)

    filtered_table = filter_df(collocate_table, 'resources/stopwords-de.txt')
    print(filtered_table)
def test_concordance_empty(germaparl):
    """Check that ``Concordance.lines`` returns ``None`` for the ``NAHH`` query dump."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="NAHH"]'
    matches = corpus.query(cqp_query)
    concordance = Concordance(corpus, matches.df)

    lines = concordance.lines()
    assert lines is None
def test_concordance_last(germaparl):
    """Check that ``Concordance.lines(order='last')`` returns a DataFrame.

    ``germaparl`` is expected to be a mapping with the keys ``corpus_name``
    and ``registry_path``.
    """
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="Gerhard"]? [lemma="Schröder"]'
    )
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    # isinstance rather than an exact type comparison, so DataFrame
    # subclasses are accepted as well
    assert isinstance(conc.lines(order='last'), pd.DataFrame)
Example #17
0
def test_dump_from_query_1(brexit):
    """Dump the matches of an anchored two-token query and check the result size.

    ``brexit`` is expected to be a mapping with the key ``corpus_name``.
    The query marks its second token with anchor ``@1``, which is requested
    through ``anchors=[1]``.
    """
    corpus = Corpus(brexit['corpus_name'])
    df_dump = corpus.dump_from_query(
        query='[lemma="angela"] @1[lemma="merkel"]',
        anchors=[1],
        match_strategy='longest')
    # isinstance rather than an exact type comparison, so DataFrame
    # subclasses are accepted as well
    assert isinstance(df_dump, pd.DataFrame)
    assert df_dump.shape[0] > 99
Example #18
0
def test_filter_df(germaparl):
    """Check that ``filter_df`` removes ``','`` from the collocate index.

    ``germaparl`` is expected to be a mapping with the keys ``corpus_name``,
    ``registry_path`` and ``query``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    collocate_table = corpus.query(germaparl['query']).collocates()

    # the comma is present before filtering ...
    assert ',' in collocate_table.index

    # ... and absent after filtering with the stop word file
    filtered_table = filter_df(collocate_table, 'resources/stopwords-de.txt')
    assert ',' not in filtered_table.index
def test_concordance_p_slots(germaparl):
    """Check that ``Concordance.lines(p_slots='lemma')`` returns ``None``."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    matches = corpus.query(cqp_query)
    concordance = Concordance(corpus, matches.df)

    lines = concordance.lines(p_slots='lemma')
    assert lines is None
Example #20
0
def test_keywords(germaparl):
    """Print the first 50 keyword rows for ``text_party`` regions of CDU and CSU."""
    parties = {"CDU", "CSU"}
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    keyword_table = corpus.dump_from_s_att('text_party', parties).keywords()
    print(keyword_table.head(50))
Example #21
0
def test_query_context(germaparl):
    """Run a query with ``context=20`` and ``context_break='s'`` and print the dump."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    pattern = r'"\[" ([word="[A-Z]+"] "/"?) + "\]"'
    result = corpus.query(cqp_query=pattern, context=20, context_break='s')

    # first the dump object itself, then its underlying data frame
    for printable in (result, result.df):
        print(printable)
Example #22
0
def test_query_logging(germaparl):
    """Compute collocates with ``'fail'`` as third argument and check the result.

    ``germaparl`` is expected to be a mapping with the keys ``corpus_name``
    and ``registry_path``. The collocate table must be a DataFrame whose
    index contains ``'Dr.'``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = ('[word="\\("] [lemma=".*"]+ [word="\\)"]')
    df_dump = corpus.query(query).df
    # NOTE(review): 'fail' is presumably an invalid p_query meant to trigger
    # a logged fallback, as the test name suggests -- confirm against Collocates
    collocates = Collocates(corpus, df_dump, 'fail')
    c = collocates.show(order='log_likelihood', window=15)
    # isinstance rather than an exact type comparison, so DataFrame
    # subclasses are accepted as well
    assert isinstance(c, pd.DataFrame)
    assert 'Dr.' in c.index
def test_concordance_many(germaparl):
    """Check that ``Concordance.lines`` with default arguments returns 100 lines."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="oder"]'
    matches = corpus.query(cqp_query)

    line_count = len(Concordance(corpus, matches.df).lines())
    assert line_count == 100
def test_concordance_anchors(germaparl):
    """Check that an anchored query gives 13 lines in ``dataframes`` form."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    matches = corpus.query(cqp_query, context_break='s')

    conc = Concordance(corpus, matches.df)
    anchored_lines = conc.lines(p_show=['lemma', 'pos'], form='dataframes')
    assert len(anchored_lines) == 13
Example #25
0
def test_anchor(germaparl):
    """Print the data frame of the second concordance line of an anchored query."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    cqp_query = r'@1[pos="NE"]? @2[pos="NE"] "\[" (@3[word="[A-Z]+"]+ "/"?)+ "\]"'
    matches = corpus.query(cqp_query, context_break='s')
    conc_lines = matches.concordance(form='dataframes')

    # empty line first, then the line's data frame
    print()
    second_line_df = conc_lines['df'].iloc[1]
    print(second_line_df)
Example #26
0
def test_concordancing_dataframes(germaparl):
    """Pretty-print the data frame of the second concordance line of a query."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    cqp_query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    matches = corpus.query(cqp_query, context_break='s')
    conc_lines = matches.concordance(form='dataframes')

    # import stays local to the test and happens after the query, as before
    from pprint import pprint
    second_line_df = conc_lines['df'].iloc[1]
    pprint(second_line_df)
def test_concordance_form_simple_kwic(germaparl):
    """Check that randomly ordered ``kwic`` lines with ``cut_off=100`` number 13."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    matches = corpus.query(cqp_query, context_break='s')

    conc = Concordance(corpus, matches.df)
    kwic_lines = conc.lines(order='random', cut_off=100, form='kwic')
    assert len(kwic_lines) == 13
Example #28
0
def test_macro(brexit):
    """Execute the ``/ap()`` macro in a CQP session and print the match counts.

    ``brexit`` is expected to be a mapping with the keys ``corpus_name``,
    ``lib_path`` and ``registry_path``.
    """
    corpus = Corpus(brexit['corpus_name'],
                    lib_path=brexit['lib_path'],
                    registry_path=brexit['registry_path'])
    cqp = corpus.start_cqp()
    try:
        cqp.Exec("Last=/ap();")
        counts = corpus.counts.matches(cqp, name="Last")
    finally:
        # always terminate the CQP session, even if the macro or the
        # counting step raises, so a failing test does not leak it
        cqp.__kill__()
    print(counts)
Example #29
0
def test_disc_concordance_form(germaparl):
    """Print a ``Disc`` concordance in ``kwic`` form and, for one match, in ``extended`` form."""
    corpus_name = germaparl['corpus_name']
    registry_path = germaparl['registry_path']
    corpus = Corpus(corpus_name, registry_path=registry_path)

    # init topic disc
    items = ["SPD", "CSU", "Grünen"]
    topic = Disc(corpus, items, 'lemma', 's', 's')

    # all lines in kwic form
    kwic_lines = topic.concordance(cut_off=None, form='kwic')
    print(kwic_lines)

    # only the given match, in extended form
    extended_lines = topic.concordance(matches=[148430], cut_off=None,
                                       form='extended')
    print(extended_lines)
def test_concordance_order(germaparl):
    """Check that ``Concordance.lines(order='fail')`` raises ``NotImplementedError``."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    matches = corpus.query(cqp_query)
    concordance = Concordance(corpus, matches.df)

    with pytest.raises(NotImplementedError):
        concordance.lines(order='fail')