def test_count_items(germaparl):
    """Compare ``corpus.marginals`` with ``corpus.counts.mwus`` on single items.

    Three phases share one CQP process:

    1. whole corpus: both methods must report the same frequencies;
    2. with the NQR ``und`` activated: the ``marginals`` frequency of the
       first item must exceed the ``mwus`` frequency of its query;
    3. whole corpus activated again: frequencies must be equal once more.

    The CQP process is killed in a ``finally`` clause so that a failing
    assertion (or a raising call) does not leave it running.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()

    try:
        items = ["Helmut", "Kohl", "CDU"]
        queries = [formulate_cqp_query([item]) for item in items]

        # whole corpus
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')

        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        # NOTE(review): presumably marginals ignores the activated NQR while
        # mwus counts inside it, hence the strict inequality -- confirm.
        assert counts1.loc[items[0], 'freq'] > counts2.loc[queries[0], 'freq']

        # whole corpus
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        # always release the CQP process, even when an assertion fails
        cqp.__kill__()
Exemple #2
0
    def __init__(self,
                 corpus,
                 items,
                 p_query,
                 s_query,
                 s_context,
                 context=20,
                 flags="%cd",
                 escape=False):
        """Build a CQP query from ``items``, run it and keep the result.

        :param corpus: object whose ``query`` method executes the query
        :param items: handed to ``formulate_cqp_query``; stored as ``.items``
        :param p_query: forwarded to ``formulate_cqp_query``
        :param s_query: forwarded to ``formulate_cqp_query``
        :param s_context: passed to ``corpus.query`` as ``context_break``
        :param context: passed to ``corpus.query`` after the query string
        :param flags: forwarded to ``formulate_cqp_query``
        :param escape: forwarded to ``formulate_cqp_query``

        Attributes set here:

        - ``.items``: the items as given
        - ``.parameters``: dict of the query settings listed above
        - ``.dump``: the object returned by ``corpus.query``
        - ``.idx``: the ``name_cache`` attribute of that object
        """

        self.items = items

        # remember the settings this instance was created with
        self.parameters = dict(
            p_query=p_query,
            s_query=s_query,
            s_context=s_context,
            context=context,
            flags=flags,
            escape=escape,
        )

        # formulate the query and execute it on the corpus
        cqp_query = formulate_cqp_query(items, p_query, s_query, flags, escape)
        result = corpus.query(cqp_query, context, context_break=s_context)
        self.dump = result
        self.idx = result.name_cache

        # placeholders, initialised empty
        self._context = self._matches = None
def test_count_mwus_3(germaparl):
    """Check the ``CSU`` frequency from ``counts.mwus`` with ``strategy=3``.

    Runs on the whole corpus with ``fill_missing=False`` and expects the
    ``freq`` entry indexed by ``'CSU'`` to be 635.  The CQP process is killed
    in a ``finally`` clause so it is released even if counting raises.
    """
    # whole corpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    try:
        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
    finally:
        cqp.__kill__()

    assert counts3['freq']['CSU'] == 635
def test_count_mwus_strategies(germaparl):
    """Check that the three ``counts.mwus`` strategies are consistent.

    On the whole corpus, with ``fill_missing=False``:

    - strategy 1 must have ``([word="CSU"])`` in its index;
    - strategies 2 and 3 must return equal results;
    - the summed frequencies of strategies 1 and 2 must match.

    The CQP process is killed in a ``finally`` clause so that a failing
    assertion or a raising call does not leave it running.
    """
    # whole corpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [formulate_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    try:
        counts1 = corpus.counts.mwus(cqp, queries, strategy=1,
                                     fill_missing=False)
        assert '([word="CSU"])' in counts1.index

        counts2 = corpus.counts.mwus(cqp, queries, strategy=2,
                                     fill_missing=False)

        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
    finally:
        cqp.__kill__()

    # comparisons only need the returned frames, not the CQP process
    assert counts2.equals(counts3)
    assert sum(counts1['freq']) == sum(counts2['freq'])
def test_count_items_subcorpora(germaparl):
    """Check ``counts.mwus`` strategies while a subcorpus is activated.

    A dump for ``text_role == "presidency"`` is turned into the NQR
    ``presidency`` and activated.  Then, with ``fill_missing=False``:

    - strategy 1 must yield a positive total frequency;
    - strategies 2 and 3 must return equal results.

    The CQP process is killed in a ``finally`` clause so that a failing
    assertion or a raising call does not leave it running.
    """
    # subcorpus
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()
    try:
        dump = corpus.dump_from_s_att("text_role", ["presidency"])
        cqp.nqr_from_dump(dump.df, 'presidency')
        cqp.nqr_activate(corpus.corpus_name, 'presidency')
        items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
        queries = [formulate_cqp_query([item]) for item in items]

        counts1 = corpus.counts.mwus(cqp, queries, strategy=1,
                                     fill_missing=False)
        assert sum(counts1['freq']) > 0

        counts2 = corpus.counts.mwus(cqp, queries, strategy=2,
                                     fill_missing=False)

        counts3 = corpus.counts.mwus(cqp, queries, strategy=3,
                                     fill_missing=False)
        assert counts2.equals(counts3)
    finally:
        # always release the CQP process, even when an assertion fails
        cqp.__kill__()