Beispiel #1
0
def test_get_indices_1grams(unigram_indices):
    """Unigram indices contain no duplicates and equal the expected set."""
    found = list(get_indices(1))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == unigram_indices
Beispiel #2
0
def test_get_indices_4grams_coverage_1m(quadgram_indices_1m):
    """4-gram indices at "1M" coverage are unique and match the fixture."""
    found = list(get_indices(4, coverage="1M"))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == quadgram_indices_1m
Beispiel #3
0
def test_get_indices_5grams_coverage_1m(fivegram_indices_1m):
    """5-gram indices at "1M" coverage are unique and match the fixture."""
    found = list(get_indices(5, coverage="1M"))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == fivegram_indices_1m
Beispiel #4
0
def test_get_indices_2grams_coverage_1m(bigram_indices_1m):
    """Bigram indices at "1M" coverage are unique and match the fixture."""
    found = list(get_indices(2, coverage="1M"))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == bigram_indices_1m
Beispiel #5
0
def test_get_indices_3grams_coverage_1m(trigram_indices_1m):
    """Trigram indices at "1M" coverage are unique and match the fixture."""
    found = list(get_indices(3, coverage="1M"))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == trigram_indices_1m
Beispiel #6
0
def test_get_indices_5grams(bigrams_indices):
    """5-gram indices are the bigram indices with "qk" left out."""
    found = list(get_indices(5))

    # Exactly one entry ("qk") is expected to be missing.
    expected_size = len(set(bigrams_indices)) - 1
    assert len(found) == expected_size

    without_qk = bigrams_indices - {'qk'}
    assert set(found) == without_qk
Beispiel #7
0
def test_get_indices_1grams_coverage_1m(unigram_indices_1m):
    """Unigram indices at "1M" coverage are unique and match the fixture."""
    found = list(get_indices(1, coverage="1M"))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == unigram_indices_1m
Beispiel #8
0
def test_get_indices_manygrams(bigrams_indices):
    """Indices for length 2 are unique and equal the bigram fixture set."""
    found = list(get_indices(2))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == bigrams_indices
Beispiel #9
0
import csv
import os
import string
from google_ngram_downloader import readline_google_store, util

# Index names that the loop below skips.
# NOTE(review): presumably the digit and part-of-speech-tag index files --
# confirm against the google_ngram_downloader index list.
list_not = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_ADJ_', '_ADP_',
    '_ADV_', '_CONJ_', '_DET_', '_NOUN_', '_NUM_', '_PRON_', '_PRT_', '_VERB_'
]
# N-gram length; used both to list the indices and to request the records.
ngrams = 3
# Not referenced in the visible part of the script.
result = {}
# Index names for n-grams of the chosen length.
list_indices = util.get_indices(ngrams)
# Maps a lower-cased n-gram string to a dict that has at least the keys
# 'freq' and 'count' (both are read back in the loop below).
dict_ngram = {}
for item in list_indices:
    # Only process indices that are not in the skip list.
    if not (item in list_not):
        # readline_google_store is given the indices as a list, so wrap the
        # single index name in a one-element list.
        list_tmp = []
        list_tmp.append(item)
        try:
            # Take only the first (fnames, urls, records) triple produced for
            # this index.  NOTE(review): lang='spa' presumably selects the
            # Spanish corpus -- confirm.
            fnames, urls, records = next(
                readline_google_store(ngram_len=ngrams,
                                      indices=list_tmp,
                                      lang='spa'))
            for i in records:
                try:
                    # Lower-case so that case variants share one dict key.
                    ngram = str(i.ngram).lower()
                    # print(i)
                    # Skip any n-gram that contains an underscore.
                    if ngram.find('_') == -1:
                        if ngram in dict_ngram:
                            # Already seen: add this record's match_count to
                            # the stored 'freq' and bump the stored 'count'.
                            temp = dict_ngram.get(ngram)
                            freq = float(temp['freq'] + i.match_count)
                            count = temp['count'] + 1
                            # NOTE(review): the script is cut off here; the
                            # rest of the loop body and the except clauses of
                            # both try statements are not visible.
def test_get_indices_1grams(unigram_indices):
    """Unigram indices contain no duplicates and equal the expected set."""
    found = list(get_indices(1))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == unigram_indices
def test_get_indices_5grams(bigrams_indices):
    """5-gram indices are the bigram indices with "qk" left out."""
    found = list(get_indices(5))

    # Exactly one entry ("qk") is expected to be missing.
    expected_size = len(set(bigrams_indices)) - 1
    assert len(found) == expected_size

    without_qk = bigrams_indices - {'qk'}
    assert set(found) == without_qk
def test_get_indices_manygrams(bigrams_indices):
    """Indices for length 2 are unique and equal the bigram fixture set."""
    found = list(get_indices(2))
    distinct = set(found)

    # A size mismatch means some index was produced more than once.
    assert len(distinct) == len(found)
    assert distinct == bigrams_indices