コード例 #1
0
def test_one_gram_count():
    """Check ``WordNGram(1, ...).counts`` against hand-written tables.

    The expected value is a dict of dicts mapping an outer word to
    ``{inner key: count}``.  Judging by the fixture data, the inner key looks
    like the word seen immediately before the outer word -- an assumption to
    confirm against ``WordNGram`` itself.
    """
    # First sample: every (outer key, inner key) pair is expected exactly once.
    single_pairs = (
        ('the', 'over'),
        ('quick', 'the'),
        ('brown', 'quick'),
        ('fox', 'brown'),
        ('jumps', 'fox'),
        ('over', 'jumps'),
        ('lazy', 'the'),
        ('dog', 'lazy'),
    )
    model = WordNGram(1, sample[0])
    first_expected = {word: {context: 1} for word, context in single_pairs}
    assert model.counts == first_expected

    # Second sample: the expected table has two outer keys, one of which
    # carries a count greater than one.
    model = WordNGram(1, sample[1])
    second_expected = {
        'auni': {'auni': 1, 'the': 1},
        'the': {'auni': 1, 'the': 3},
    }
    assert model.counts == second_expected
コード例 #2
0
def test_zero_gram_count():
    """Check ``WordNGram(0, ...).counts`` against hand-written tables.

    With order 0 every expected inner dict has the single key ``''``, so the
    tables below list only ``(word, total)`` pairs and a helper wraps each
    total into that shape.
    """
    def with_empty_context(totals):
        # Turn (word, total) pairs into {word: {'': total}}.
        return {word: {'': total} for word, total in totals}

    model = WordNGram(0, sample[0])
    assert model.counts == with_empty_context([
        ('the', 2),
        ('quick', 1),
        ('brown', 1),
        ('fox', 1),
        ('jumps', 1),
        ('over', 1),
        ('lazy', 1),
        ('dog', 1),
    ])

    model = WordNGram(0, sample[1])
    assert model.counts == with_empty_context([
        ('auni', 2),
        ('the', 5),
    ])
コード例 #3
0
def test_prev_n():
    """Check ``WordNGram.prev_n`` on a one-word text and on ``sample[0]``."""
    def prev_words(text, c, n):
        # Build a model of order ``n`` over ``text`` and query ``prev_n(c)``.
        return WordNGram(n, text).prev_n(c)

    assert prev_words('abc', 3, 1) == []
    assert prev_words('abc', 3, 3) == ['abc']
    assert prev_words(sample[0], 3, 1) == ['brown']
    assert prev_words(sample[0], 3, 2) == ['quick', 'brown']

    # At position 8, asking for ``8 - skipped`` items should give the
    # lower-cased, space-split sample minus its first ``skipped`` words and
    # its last word.
    for skipped in (0, 1, 5):
        actual = prev_words(sample[0], 8, 8 - skipped)
        assert actual == sample[0].lower().split(' ')[skipped:-1]
コード例 #4
0
def test_three_gram_count():
    """Check ``WordNGram(3, sample[1]).counts`` against a hand-written table.

    Each inner key in the expected table is three words joined with ``^``,
    and every listed key is expected with a count of 1.
    """
    model = WordNGram(3, sample[1])

    expected = {
        'auni': dict.fromkeys(['the^the^the', 'the^the^auni'], 1),
        'the': dict.fromkeys(['the^the^the', 'the^auni^auni'], 1),
    }
    assert model.counts == expected
コード例 #5
0
def test_clean():
    """Check the ``text`` attribute that ``WordNGram`` builds from raw input.

    Per the cases below, ``text`` is expected to be a list of lower-cased
    words with the punctuation characters used here removed, including
    punctuation embedded inside a word.
    """
    def cleaned(raw):
        # Order 0 is used for every case; only the ``text`` attribute is read.
        return WordNGram(0, raw).text

    fox_words = ['the', 'quick', 'brown', 'fox']
    cases = [
        ('~~', []),
        ('abc~~', ['abc']),
        ('ABC~~', ['abc']),
        ('the quick brown fox ', fox_words),
        ('the quick brown fox ~~@!', fox_words),
        ('the quick@#% brown fox ', fox_words),
        ('th@#%!e quick brown fox ', fox_words),
    ]
    for raw, expected in cases:
        assert cleaned(raw) == expected
コード例 #6
0
ファイル: process.py プロジェクト: auni53/compulinguistics
def main():
    """Print sentences generated by order-0, order-1 and order-2 word models.

    Reads the whole of ``FILENAME``, then for each model order builds a
    ``WordNGram`` over that text and prints a heading followed by the result
    of ``generate_sentence`` for each argument from 6 through 10.

    Every ``print`` call passes exactly one argument, so the output is the
    same under Python 2 (where the parentheses are plain grouping) and
    Python 3 (where ``print`` is a function).
    """
    with open(FILENAME) as f:
        text = f.read()

    # (order passed to WordNGram, heading printed before its sentences)
    models = (
        (0, "sentences from monogram"),
        (1, "sentences from bigrams"),
        (2, "sentences from trigram"),
    )

    print("")  # leading blank line
    for order, heading in models:
        ngram = WordNGram(order, text)
        print(heading)
        # Argument is presumably the sentence length -- confirm in WordNGram.
        for size in range(6, 11):
            print(ngram.generate_sentence(size))
        print("")  # blank line after each group
コード例 #7
0
def test_empty_count():
    """Check ``empty_count`` has one distinct value object per column.

    The result's keys must equal the set of items from ``cols()``, and the
    number of distinct value objects (by ``id``) must equal ``len(cols())``,
    i.e. no value object may be shared between keys.
    """
    model = WordNGram(0, '')
    table = model.empty_count()

    assert set(table.keys()) == set(model.cols())

    distinct_values = {id(value) for value in table.values()}
    assert len(distinct_values) == len(model.cols())