Example #1
0
def test_counts_matches_3(germaparl):
    """Count the tokens of a named query result using counting strategy 3.

    A "Helmut Kohl" query (matched with the ``%cd`` flags) is stored as the
    named query result ``Last``.  Its matches are then counted, split into
    single tokens, in three configurations:

    * with ``flags="%cd"``   -> the index is expected to contain "helmut"
    * without flags          -> the index is expected to contain "Helmut"
    * with ``word`` + ``pos`` -> the index is expected to contain the
      tuple ``("Helmut", "NE")``
    """
    strategy = 3

    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # try/finally: the CQP session must be shut down even when an
    # assertion fails, otherwise it outlives the failing test.
    try:
        cqp.nqr_from_query('[lemma="Helmut"%cd] [lemma="Kohl"%cd]',
                           name='Last')

        df = corpus.counts.matches(cqp,
                                   'Last',
                                   p_atts=['word'],
                                   split=True,
                                   flags="%cd",
                                   strategy=strategy)
        assert "helmut" in df.index

        df = corpus.counts.matches(cqp,
                                   'Last',
                                   p_atts=['word'],
                                   split=True,
                                   strategy=strategy)
        assert "Helmut" in df.index

        df = corpus.counts.matches(cqp,
                                   'Last',
                                   p_atts=['word', 'pos'],
                                   split=True,
                                   strategy=strategy)
        assert ("Helmut", "NE") in df.index
    finally:
        cqp.__kill__()
Example #2
0
def test_count_items(germaparl):
    """Compare ``corpus.marginals`` with ``corpus.counts.mwus`` for single items.

    On the whole corpus both methods must report the same frequencies.
    After activating the subcorpus ``und`` (sentences containing the lemma
    "und"), the query-based count of the first item must be smaller than
    its marginal frequency.  Re-activating the whole corpus must restore
    the equality.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()

    # try/finally: shut the CQP session down even if an assertion fails.
    try:
        items = ["Helmut", "Kohl", "CDU"]
        queries = [formulate_cqp_query([item]) for item in items]

        # whole corpus
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')

        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert counts1.loc[items[0], 'freq'] > counts2.loc[queries[0], 'freq']

        # whole corpus
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        cqp.__kill__()
Example #3
0
def test_cwb_counts(germaparl):
    """Count two CQP queries with the default strategy of ``counts.mwus``.

    The resulting frame is looked up by query string; the query
    ``[lemma="Horst"]`` is expected to have a frequency of 55.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # try/finally: shut the CQP session down even if the assertion fails.
    try:
        queries = [
            '[lemma="Helmut"%cd & pos="NE"] [lemma="Kohl"]', '[lemma="Horst"]'
        ]
        df = corpus.counts.mwus(cqp, queries)
        assert df['freq'][queries[1]] == 55
    finally:
        cqp.__kill__()
Example #4
0
def test_count_matches(brexit):
    """Count the matches of a previously saved query result.

    The query ``[lemma="nigel"]`` is run with ``save=True`` under the name
    ``Test``; a fresh CQP session then counts the matches of ``Test`` and
    the surface form "Nigel" is expected in the index.
    """
    corpus = Corpus(brexit['corpus_name'])
    corpus.query(cqp_query='[lemma="nigel"]',
                 context=10,
                 context_break='tweet',
                 name='Test',
                 save=True)
    cqp = corpus.start_cqp()
    # The session was previously never shut down in this test; release it
    # unconditionally, as the other tests in this file do.
    try:
        counts = corpus.counts.matches(cqp, 'Test')
        assert "Nigel" in counts.index
    finally:
        cqp.__kill__()
Example #5
0
def test_count_mwus_3(germaparl):
    """Check the strategy-3 frequency of "CSU" on the whole corpus.

    The item list deliberately contains "CSU" twice plus an item wrapped
    in parentheses; counting is done without filling missing entries.
    """
    # whole corpus
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl['registry_path'])
    search_terms = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    cqp_queries = [formulate_cqp_query([term]) for term in search_terms]

    session = germaparl_corpus.start_cqp()
    frequencies = germaparl_corpus.counts.mwus(
        session, cqp_queries, strategy=3, fill_missing=False
    )
    # the session is no longer needed once the counts are in memory
    session.__kill__()

    csu_frequency = frequencies['freq']['CSU']
    assert csu_frequency == 635
Example #6
0
def test_counts_mwus(germaparl):
    """Check how ``counts.mwus`` indexes its result for different strategies.

    * ``strategy=1``: looked up by the query string itself
    * ``strategy=3`` with ``p_atts=['lemma', 'pos']``: looked up by a
      ``(lemma, pos)`` tuple
    * ``strategy=3`` with ``p_atts=['lemma']``: looked up by the lemma
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # try/finally: shut the CQP session down even if an assertion fails.
    try:
        queries = [
            '[lemma="Helmut"%cd & pos="NE"] [lemma="Kohl"]', '[lemma="Horst"]'
        ]
        df = corpus.counts.mwus(cqp, queries, strategy=1)
        assert df['freq'][queries[0]] == 6

        df = corpus.counts.mwus(cqp, queries, strategy=3,
                                p_atts=['lemma', 'pos'])
        assert df['freq'][('Horst', 'NE')] == 55

        df = corpus.counts.mwus(cqp, queries, strategy=3, p_atts=['lemma'])
        assert df['freq']['Horst'] == 55
    finally:
        cqp.__kill__()
Example #7
0
def test_count_mwus_strategies(germaparl):
    """Compare the three counting strategies of ``counts.mwus``.

    On the whole corpus, strategy 1 must index its result by query string,
    strategies 2 and 3 must return identical frames, and the total
    frequency of strategy 1 must equal that of strategy 2.
    """
    # whole corpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    # try/finally: the first assertion runs while the session is still
    # open, so make sure the session is shut down if it fails.
    try:
        counts1 = corpus.counts.mwus(cqp, queries, strategy=1,
                                     fill_missing=False)
        assert '([word="CSU"])' in counts1.index

        counts2 = corpus.counts.mwus(cqp, queries, strategy=2,
                                     fill_missing=False)

        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
    finally:
        cqp.__kill__()

    assert counts2.equals(counts3)
    assert sum(counts1['freq']) == sum(counts2['freq'])
Example #8
0
def test_count_items_subcorpora(germaparl):
    """Compare the counting strategies of ``counts.mwus`` on a subcorpus.

    The subcorpus is built from the s-attribute ``text_role`` with value
    "presidency" and activated before counting.  Strategy 1 must find at
    least one match; strategies 2 and 3 must return identical frames.
    """
    # subcorpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    # try/finally: shut the CQP session down even if an assertion fails.
    try:
        dump = corpus.dump_from_s_att("text_role", ["presidency"])
        cqp.nqr_from_dump(dump.df, 'presidency')
        cqp.nqr_activate(corpus.corpus_name, 'presidency')
        items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
        queries = [formulate_cqp_query([item]) for item in items]

        counts1 = corpus.counts.mwus(cqp, queries, strategy=1,
                                     fill_missing=False)
        assert sum(counts1['freq']) > 0

        counts2 = corpus.counts.mwus(cqp, queries, strategy=2,
                                     fill_missing=False)

        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
        assert counts2.equals(counts3)
    finally:
        cqp.__kill__()
Example #9
0
def test_cwb_scan_corpus(germaparl):
    """Run ``cwb_scan_corpus`` on dumps taken from two different queries.

    The first dump holds the matches of ``[lemma="Helmut"]`` on the whole
    corpus, the second those of ``[lemma="Kohl"]`` after activating the
    subcorpus ``und``.  The total frequencies of both scans must differ.
    """
    from tempfile import NamedTemporaryFile

    def scan_tmp_dump():
        # Dump the named query result "tmp" into a temporary file and scan
        # it while the file still exists.
        with NamedTemporaryFile(mode="wt") as dump_file:
            session.Exec('dump tmp > "%s"' % dump_file.name)
            return cwb_scan_corpus(dump_file.name,
                                   germaparl['corpus_name'],
                                   germaparl['registry_path'])

    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl['registry_path'])
    session = germaparl_corpus.start_cqp()

    session.nqr_from_query('[lemma="Helmut"]', name='tmp')
    helmut_scan = scan_tmp_dump()

    session.nqr_from_query(query='[lemma="und"] expand to s', name='und')
    session.nqr_activate(germaparl_corpus.corpus_name, 'und')
    session.nqr_from_query('[lemma="Kohl"]', name='tmp')
    kohl_scan = scan_tmp_dump()

    session.__kill__()

    kohl_total = sum(kohl_scan['freq'])
    helmut_total = sum(helmut_scan['freq'])
    assert kohl_total != helmut_total