Example #1
0
def test_counts_matches_3(germaparl):
    """Count the tokens of a named query result with counting strategy 3.

    The named query result ``Last`` is counted three times: with the
    ``%cd`` flag (``helmut`` is expected in the index), without flags
    (``Helmut`` is expected), and on two p-attributes (the tuple
    ``("Helmut", "NE")`` is expected).
    """
    strategy = 3

    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # Terminate the CQP session even when a call or assertion below fails,
    # so that a failing test does not leave the session behind.
    try:
        cqp.nqr_from_query('[lemma="Helmut"%cd] [lemma="Kohl"%cd]',
                           name='Last')

        # with flags
        df = corpus.counts.matches(cqp,
                                   'Last',
                                   p_atts=['word'],
                                   split=True,
                                   flags="%cd",
                                   strategy=strategy)
        assert "helmut" in df.index

        # without flags
        df = corpus.counts.matches(cqp,
                                   'Last',
                                   p_atts=['word'],
                                   split=True,
                                   strategy=strategy)
        assert "Helmut" in df.index

        # two p-attributes: index entries are tuples
        df = corpus.counts.matches(cqp,
                                   'Last',
                                   p_atts=['word', 'pos'],
                                   split=True,
                                   strategy=strategy)
        assert ("Helmut", "NE") in df.index
    finally:
        cqp.__kill__()
Example #2
0
def test_counts_dump_2(germaparl):
    """Check ``counts.dump`` with strategy 2, both split and unsplit."""
    strategy = 2

    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    kohl_dump = germaparl_corpus.dump_from_query(
        '[lemma="Helmut"%cd] [lemma="Kohl"%cd]'
    )

    # split, one p-attribute
    split_words = germaparl_corpus.counts.dump(
        kohl_dump, p_atts=['word'], split=True, strategy=strategy
    )
    assert split_words["freq"]["Helmut"] == 6

    # split, two p-attributes
    split_words_pos = germaparl_corpus.counts.dump(
        kohl_dump, p_atts=['word', 'pos'], split=True, strategy=strategy
    )
    assert split_words_pos["freq"][("Helmut", "NE")] == 6

    # no split, one p-attribute
    joined_words = germaparl_corpus.counts.dump(
        kohl_dump, p_atts=['word'], split=False, strategy=strategy
    )
    assert "Helmut Kohl" in joined_words.index
    assert joined_words["freq"].iloc[0] == 6

    # no split, two p-attributes
    joined_words_pos = germaparl_corpus.counts.dump(
        kohl_dump, p_atts=['word', 'pos'], split=False, strategy=strategy
    )
    assert ("Helmut Kohl", "NE NE") in joined_words_pos.index
    assert joined_words_pos["freq"].iloc[0] == 6
Example #3
0
def test_marginals_patterns(germaparl):
    """Request marginals for three items, with and without ``pattern=True``.

    Both calls are expected to return a result of length 3.
    """
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )

    # default call
    plain = germaparl_corpus.marginals(["H*", "Kohl", "CDU"])
    assert len(plain) == 3

    # same items, passed with pattern=True
    as_patterns = germaparl_corpus.marginals(["H*", "Kohl", "CDU"],
                                             pattern=True)
    assert len(as_patterns) == 3
Example #4
0
def test_concordance_options(germaparl):
    """Print the concordance of ``"SPD"`` once per output form."""
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    spd_dump = germaparl_corpus.query('"SPD"')

    # same order as the individual calls this loop replaces
    for form in ('raw', 'simple', 'kwic', 'dataframes', 'extended'):
        print(spd_dump.concordance(form=form))
Example #5
0
def test_cwb_counts(germaparl):
    """Count two CQP queries with ``counts.mwus`` using its default strategy.

    The frequency stored under the second query string is expected to be 55.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # Terminate the CQP session even when the count or the assertion fails.
    try:
        queries = [
            '[lemma="Helmut"%cd & pos="NE"] [lemma="Kohl"]', '[lemma="Horst"]'
        ]
        df = corpus.counts.mwus(cqp, queries)
        assert df['freq'][queries[1]] == 55
    finally:
        cqp.__kill__()
Example #6
0
def test_count_matches(brexit):
    """Count the matches of the saved query result named ``Test``.

    ``Nigel`` is expected to appear in the index of the returned counts.
    """
    corpus = Corpus(brexit['corpus_name'])
    corpus.query(cqp_query='[lemma="nigel"]',
                 context=10,
                 context_break='tweet',
                 name='Test',
                 save=True)
    cqp = corpus.start_cqp()
    # Terminate the CQP session once done (also on failure), as the other
    # tests that call start_cqp() do.
    try:
        counts = corpus.counts.matches(cqp, 'Test')
        assert "Nigel" in counts.index
    finally:
        cqp.__kill__()
Example #7
0
def test_dumps(brexit):
    """Build a dump from a set of ``tweet_id`` values and print its concordance."""
    brexit_corpus = Corpus(brexit['corpus_name'])

    # values of the s-attribute 'tweet_id' to select
    tweet_ids = {
        't740982320711249920',
        't731037753241112576',
        't729363812802039814',
        't733648546881277953',
        't741216447595220992',
        't705780723018539012',
        't745930343627243520',
        't730870826178904065',
        't745691821477605377',
        't730419966818783232',
        't746069538693750784',
    }

    tweet_dump = brexit_corpus.dump_from_s_att('tweet_id', tweet_ids)
    print(tweet_dump.concordance())
Example #8
0
def test_count_mwus_3(germaparl):
    """Count several items on the whole corpus with ``counts.mwus`` strategy 3.

    The item list contains ``CSU`` twice; the frequency stored under
    ``CSU`` is expected to be 635.
    """
    # whole corpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    # Terminate the CQP session even when counting raises.
    try:
        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
    finally:
        cqp.__kill__()

    assert counts3['freq']['CSU'] == 635
Example #9
0
def test_counts_mwus(germaparl):
    """Count two CQP queries with ``counts.mwus`` under strategies 1 and 3.

    With strategy 1 the frequency is looked up by the query string; with
    strategy 3 it is looked up by the values of the requested p-attributes.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # Terminate the CQP session even when a count or an assertion fails.
    try:
        queries = [
            '[lemma="Helmut"%cd & pos="NE"] [lemma="Kohl"]', '[lemma="Horst"]'
        ]

        # strategy 1: result is indexed by query string
        df = corpus.counts.mwus(cqp, queries, strategy=1)
        assert df['freq'][queries[0]] == 6

        # strategy 3, two p-attributes: result is indexed by tuples
        df = corpus.counts.mwus(cqp, queries, strategy=3,
                                p_atts=['lemma', 'pos'])
        assert df['freq'][('Horst', 'NE')] == 55

        # strategy 3, one p-attribute
        df = corpus.counts.mwus(cqp, queries, strategy=3, p_atts=['lemma'])
        assert df['freq']['Horst'] == 55
    finally:
        cqp.__kill__()
Example #10
0
def get_corpus(corpus_settings, data_path=DATA_PATH):
    """Create a ``Corpus`` from a settings mapping.

    :param corpus_settings: mapping with the keys ``corpus_name`` and
        ``registry_path``; ``lib_path`` is optional and defaults to ``None``
    :param data_path: forwarded to ``Corpus`` as ``data_path``
    :return: the constructed ``Corpus``
    """
    name = corpus_settings['corpus_name']
    registry = corpus_settings['registry_path']
    lib = corpus_settings.get('lib_path')  # None when the key is absent

    return Corpus(name,
                  registry_path=registry,
                  lib_path=lib,
                  data_path=data_path)
Example #11
0
def test_counts_dump_1_split(germaparl):
    """Check split counts of a query dump with strategy 1."""
    strategy = 1

    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    article_noun_dump = germaparl_corpus.dump_from_query(
        '[lemma="die" %cd] [pos="N.*"]'
    )

    # one p-attribute: look up by a single string
    by_word = germaparl_corpus.counts.dump(
        article_noun_dump, p_atts=['word'], split=True, strategy=strategy
    )
    der_freq = int(by_word["freq"]["der"])
    assert der_freq == 3775

    # two p-attributes: look up by a (word, lemma) tuple
    by_word_lemma = germaparl_corpus.counts.dump(
        article_noun_dump, p_atts=['word', 'lemma'], split=True,
        strategy=strategy
    )
    der_die_freq = int(by_word_lemma["freq"][("der", "die")])
    assert der_die_freq == 3775
Example #12
0
def test_counts_dump_1_no_split(germaparl):
    """Check unsplit counts of a query dump with strategy 1."""
    strategy = 1

    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    kohl_dump = germaparl_corpus.dump_from_query(
        '[lemma="Helmut"%cd] [lemma="Kohl"%cd]'
    )

    # no split, one p-attribute: the whole match is one index entry
    joined_words = germaparl_corpus.counts.dump(
        kohl_dump, p_atts=['word'], split=False, strategy=strategy
    )
    assert "Helmut Kohl" in joined_words.index

    # no split, two p-attributes: one joined string per attribute
    joined_words_pos = germaparl_corpus.counts.dump(
        kohl_dump, p_atts=['word', 'pos'], split=False, strategy=strategy
    )
    assert ("Helmut Kohl", "NE NE") in joined_words_pos.index
Example #13
0
def test_count_mwus_strategies(germaparl):
    """Compare the three ``counts.mwus`` strategies on the whole corpus.

    Strategies 2 and 3 are expected to return equal frames; the summed
    frequencies of strategies 1 and 2 are expected to be equal.
    """
    # whole corpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    # Terminate the CQP session even when a count or the assertion fails.
    try:
        counts1 = corpus.counts.mwus(cqp, queries, strategy=1,
                                     fill_missing=False)
        assert '([word="CSU"])' in counts1.index

        counts2 = corpus.counts.mwus(cqp, queries, strategy=2,
                                     fill_missing=False)

        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
    finally:
        cqp.__kill__()

    assert counts2.equals(counts3)
    assert sum(counts1['freq']) == sum(counts2['freq'])
Example #14
0
def test_count_items_subcorpora(germaparl):
    """Compare the ``counts.mwus`` strategies with a subcorpus activated.

    The subcorpus is built from the ``text_role`` value ``presidency``.
    Strategy 1 is expected to find at least one hit; strategies 2 and 3
    are expected to return equal frames.
    """
    # subcorpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # Terminate the CQP session even when a call or an assertion fails.
    try:
        dump = corpus.dump_from_s_att("text_role", ["presidency"])
        cqp.nqr_from_dump(dump.df, 'presidency')
        cqp.nqr_activate(corpus.corpus_name, 'presidency')
        items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
        queries = [formulate_cqp_query([item]) for item in items]

        counts1 = corpus.counts.mwus(cqp, queries, strategy=1,
                                     fill_missing=False)
        assert sum(counts1['freq']) > 0

        counts2 = corpus.counts.mwus(cqp, queries, strategy=2,
                                     fill_missing=False)

        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
        assert counts2.equals(counts3)
    finally:
        cqp.__kill__()
Example #15
0
def test_count_items(germaparl):
    """Compare corpus marginals with ``counts.mwus`` on corpus and subcorpus.

    With no subcorpus activated both frequency lists are expected to be
    equal.  With the named query result ``und`` activated, the marginal
    frequency of the first item is expected to exceed its MWU count.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # Terminate the CQP session even when a call or an assertion fails.
    try:
        items = ["Helmut", "Kohl", "CDU"]
        queries = [formulate_cqp_query([item]) for item in items]

        # whole corpus
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')

        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert counts1.loc[items[0], 'freq'] > counts2.loc[queries[0], 'freq']

        # whole corpus again: activate without a subcorpus name
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        cqp.__kill__()
Example #16
0
def test_argmin_query(brexit):
    """Run the fixture's ``query_argmin`` query and print its concordance.

    Prints the extended concordance and the first entry of its ``df``
    column.
    """
    brexit_corpus = Corpus(brexit['corpus_name'], lib_path=brexit['lib_path'])
    spec = brexit['query_argmin']

    # 'context' and 's_context' are optional in the query specification
    argmin_dump = brexit_corpus.query(
        cqp_query=spec['cqp'],
        context=spec.get('context'),
        context_break=spec.get('s_context'),
        corrections=spec['corrections'],
        match_strategy=spec['match_strategy'],
    )

    # display settings taken verbatim from the query specification
    display = {
        key: spec[key]
        for key in ('p_show', 's_show', 'p_text', 'p_slots', 'slots')
    }
    lines = argmin_dump.concordance(order='first',
                                    cut_off=None,
                                    form='extended',
                                    **display)

    print(lines)
    print(lines['df'].iloc[0])
Example #17
0
def test_cwb_scan_corpus(germaparl):
    """Run ``cwb_scan_corpus`` on two CQP dump files and compare totals.

    The first dump holds the result of ``[lemma="Helmut"]``.  The second
    holds the result of ``[lemma="Kohl"]``, queried after the named query
    result ``und`` has been activated.  The summed frequencies of the two
    scans are expected to differ.
    """
    from tempfile import NamedTemporaryFile
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # Terminate the CQP session even when a dump or a scan raises.
    try:
        cqp.nqr_from_query('[lemma="Helmut"]', name='tmp')
        # CQP writes the dump to the temporary file by name; the file must
        # be scanned inside the ``with`` block, before it is removed.
        with NamedTemporaryFile(mode="wt") as f:
            cqp.Exec('dump tmp > "%s"' % f.name)
            df1 = cwb_scan_corpus(f.name, germaparl['corpus_name'],
                                  germaparl['registry_path'])

        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')

        cqp.nqr_activate(corpus.corpus_name, 'und')
        cqp.nqr_from_query('[lemma="Kohl"]', name='tmp')

        with NamedTemporaryFile(mode="wt") as f:
            cqp.Exec('dump tmp > "%s"' % f.name)
            df2 = cwb_scan_corpus(f.name, germaparl['corpus_name'],
                                  germaparl['registry_path'])
    finally:
        cqp.__kill__()

    assert sum(df2['freq']) != sum(df1['freq'])
Example #18
0
def test_collocates(germaparl):
    """Print the collocates of ``"SPD"`` using default options."""
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    spd_dump = germaparl_corpus.query('"SPD"')
    collocate_table = spd_dump.collocates()
    print(collocate_table)
Example #19
0
def test_collocates_options(germaparl):
    """Print the collocates of ``"SPD"`` with explicit ``order`` and ``cut_off``."""
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    spd_dump = germaparl_corpus.query('"SPD"')
    collocate_table = spd_dump.collocates(order='log_likelihood',
                                          cut_off=200)
    print(collocate_table)
Example #20
0
def test_count_cpos_combo(germaparl):
    """Count corpus positions 1..999 on two p-attributes.

    The result is expected to be a ``pandas.DataFrame`` whose index levels
    are named ``lemma`` and ``pos``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    counts = corpus.counts.cpos(list(range(1, 1000)), p_atts=['lemma', 'pos'])
    # isinstance is the idiomatic type check and also accepts subclasses
    assert isinstance(counts, pd.DataFrame)
    assert counts.index.names == ['lemma', 'pos']
Example #21
0
def test_keywords_options(germaparl):
    """Print the keywords of ``"SPD" expand to s`` with explicit options."""
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    spd_sentences = germaparl_corpus.query('"SPD" expand to s')
    keyword_table = spd_sentences.keywords(order='log_ratio', cut_off=200)
    print(keyword_table)
Example #22
0
def test_keywords(germaparl):
    """Print the keywords of ``"SPD" expand to s`` using default options."""
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    spd_sentences = germaparl_corpus.query('"SPD" expand to s')
    keyword_table = spd_sentences.keywords()
    print(keyword_table)
Example #23
0
def test_count_cpos(germaparl):
    """Count corpus positions 1..999 on the ``word`` p-attribute.

    The result is expected to be a ``pandas.DataFrame``.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    counts = corpus.counts.cpos(list(range(1, 1000)), p_atts=['word'])
    # isinstance is the idiomatic type check and also accepts subclasses
    assert isinstance(counts, pd.DataFrame)
Example #24
0
def test_query2dump(germaparl):
    """Query ``"SPD"`` and print the returned dump object."""
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    spd_dump = germaparl_corpus.query('"SPD"')
    print(spd_dump)
Example #25
0
def test_breakdown(germaparl):
    """Print the breakdown of the ``"SPD"`` query result."""
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    spd_dump = germaparl_corpus.query('"SPD"')
    breakdown_table = spd_dump.breakdown()
    print(breakdown_table)
Example #26
0
from ccc.cwb import Corpus
from ccc.concordances import read_query_json, run_query
from argparse import ArgumentParser
import os
import logging
logger = logging.getLogger(__name__)

if __name__ == '__main__':

    # command line: <query_path> <corpus> <data_path>, all positional
    cli = ArgumentParser()
    for positional in ('query_path', 'corpus', 'data_path'):
        cli.add_argument(positional)
    options = cli.parse_args()

    # load the query definition and record the path it was read from
    query_spec = read_query_json(options.query_path)
    query_spec['query_path'] = options.query_path

    # run the query on the requested corpus; run_query returns the
    # (possibly updated) query together with its result
    target_corpus = Corpus(options.corpus,
                           query_spec['lib_path'],
                           data_path=options.data_path)
    query_spec, outcome = run_query(target_corpus, query_spec)

    # output file: <data_path>/<query name>.tsv
    tsv_path = os.path.join(options.data_path, query_spec['name']) + ".tsv"

    # replace every element of the 'df' column by its to_json() output
    # before writing the tab-separated file
    outcome['df'] = outcome['df'].apply(lambda entry: entry.to_json())
    outcome.to_csv(tsv_path, sep="\t")
Example #27
0
def test_context_matches(germaparl):
    """Print the matches and the context of the ``"SPD"`` query result."""
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    spd_dump = germaparl_corpus.query('"SPD"')

    # matches first, then context
    for view in (spd_dump.matches, spd_dump.context):
        print(view())