コード例 #1
0
def test_cwb_scan_corpus_subcorpora(germaparl):
    """Check that scanning a query dump gives different totals in a sub-corpus.

    The query ``[lemma="Helmut"]`` is dumped and scanned twice: once before
    and once after the named query result ``und`` has been activated. The
    summed ``freq`` columns of the two scans must differ.

    Args:
        germaparl: fixture that is passed to ``get_corpus`` and that provides
            the ``'corpus_name'`` and ``'registry_path'`` entries.
    """
    corpus = get_corpus(germaparl)
    cqp = corpus.start_cqp()

    # The CQP process must be terminated even if a step or assertion below
    # fails, otherwise it would outlive the failing test.
    try:
        # store the query result under the name 'Tmp'
        cqp.nqr_from_query('[lemma="Helmut"]', name='Tmp')

        # run cwb-scan-corpus on dump
        with NamedTemporaryFile(mode="wt") as f:
            cqp.Exec('dump Tmp > "%s"' % f.name)
            df1, _ = cwb_scan_corpus(germaparl['corpus_name'],
                                     germaparl['registry_path'], f.name)

        # activate a sub-corpus
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')

        # run same query on sub-corpus
        cqp.nqr_from_query('[lemma="Helmut"]', name='Tmp')

        # run cwb-scan-corpus on dump
        with NamedTemporaryFile(mode="wt") as f:
            cqp.Exec('dump Tmp > "%s"' % f.name)
            df2, _ = cwb_scan_corpus(germaparl['corpus_name'],
                                     germaparl['registry_path'], f.name)

        # check that results are different
        assert sum(df2['freq']) != sum(df1['freq'])
    finally:
        cqp.__kill__()
コード例 #2
0
def test_cwb_scan_corpus_marginal(germaparl):
    """Check index name and columns of ``cwb_scan_corpus`` without a dump.

    The scan is run once with default arguments and once with
    ``p_atts=['lemma', 'pos']``. Both frames must have an index named
    ``'item'``; their columns must be ``freq`` followed by ``word`` and by
    the requested attributes, respectively.

    Args:
        germaparl: fixture providing the ``'corpus_name'`` and
            ``'registry_path'`` entries.
    """
    default_counts, _ = cwb_scan_corpus(
        germaparl['corpus_name'],
        germaparl['registry_path'],
    )
    tagged_counts, _ = cwb_scan_corpus(
        germaparl['corpus_name'],
        germaparl['registry_path'],
        p_atts=['lemma', 'pos'],
    )

    # both results share the same index name
    for counts in (default_counts, tagged_counts):
        assert counts.index.name == 'item'

    # the frequency column comes first, followed by the scanned attributes
    assert list(default_counts.columns) == ['freq', 'word']
    assert list(tagged_counts.columns) == ['freq', 'lemma', 'pos']
コード例 #3
0
ファイル: test_07_counts.py プロジェクト: dokempf/cwb-ccc
def test_cwb_scan_corpus_marginal(germaparl):
    """Check the index layout of ``cwb_scan_corpus`` results without a dump.

    The scan is run with ``None`` as first argument (presumably in place of
    a dump file path -- confirm against ``cwb_scan_corpus``), once with
    default attributes and once with ``p_atts=['lemma', 'pos']``. Both
    results must be indexed by a ``pd.MultiIndex`` named after the scanned
    attributes and must expose the same columns.

    Args:
        germaparl: fixture providing the ``'corpus_name'`` and
            ``'registry_path'`` entries.
    """
    df1 = cwb_scan_corpus(None, germaparl['corpus_name'],
                          germaparl['registry_path'])
    df2 = cwb_scan_corpus(None,
                          germaparl['corpus_name'],
                          germaparl['registry_path'],
                          p_atts=['lemma', 'pos'])
    assert isinstance(df1.index, pd.MultiIndex)
    assert isinstance(df2.index, pd.MultiIndex)
    assert df1.index.names == ['word']
    assert df2.index.names == ['lemma', 'pos']
    # Compare as plain lists: ``Index == Index`` is element-wise, so its
    # truth value is ambiguous for more than one column and the comparison
    # itself raises when the lengths differ.
    assert list(df1.columns) == list(df2.columns)
コード例 #4
0
ファイル: test_06_counts.py プロジェクト: bingokorean/cwb-ccc
def test_cwb_scan_corpus(germaparl):
    """Check that scans of two different query dumps give different totals.

    First ``[lemma="Helmut"]`` is dumped and scanned. Then the named query
    result ``und`` is activated, ``[lemma="Kohl"]`` is dumped and scanned,
    and the summed ``freq`` columns of both scans are required to differ.

    Args:
        germaparl: fixture providing the ``'corpus_name'`` and
            ``'registry_path'`` entries.
    """
    from tempfile import NamedTemporaryFile
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    cqp = corpus.start_cqp()

    # Terminate the CQP process even if one of the steps below raises.
    try:
        cqp.nqr_from_query('[lemma="Helmut"]', name='tmp')
        with NamedTemporaryFile(mode="wt") as f:
            cqp.Exec('dump tmp > "%s"' % f.name)
            df1 = cwb_scan_corpus(f.name, germaparl['corpus_name'],
                                  germaparl['registry_path'])

        # activate a sub-corpus and run the second query on it
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')
        cqp.nqr_from_query('[lemma="Kohl"]', name='tmp')

        with NamedTemporaryFile(mode="wt") as f:
            cqp.Exec('dump tmp > "%s"' % f.name)
            df2 = cwb_scan_corpus(f.name, germaparl['corpus_name'],
                                  germaparl['registry_path'])
    finally:
        cqp.__kill__()

    assert sum(df2['freq']) != sum(df1['freq'])