Example #1
def test_ngrams():
    # bigrams
    strings = ["this is my favorite", "book on my bookshelf"]
    dstrings = nvstrings.to_device(strings)
    expected = [
        "this_is",
        "is_my",
        "my_favorite",
        "favorite_book",
        "book_on",
        "on_my",
        "my_bookshelf",
    ]
    tokens = nvtext.tokenize(dstrings)
    outcome = nvtext.ngrams(tokens, N=2, sep="_")
    assert outcome.to_host() == expected

    # trigrams
    strings = ["this is my favorite", "book on my bookshelf"]
    dstrings = nvstrings.to_device(strings)
    expected = [
        "this-is-my",
        "is-my-favorite",
        "my-favorite-book",
        "favorite-book-on",
        "book-on-my",
        "on-my-bookshelf",
    ]
    tokens = nvtext.tokenize(dstrings)
    outcome = nvtext.ngrams(tokens, N=3, sep="-")
    assert outcome.to_host() == expected
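
Note: nvtext.tokenize flattens the whole device column into a single token sequence, which is why the bigram "favorite_book" above spans the two input strings. A minimal sketch of that behavior (assuming the same RAPIDS 0.x nvstrings/nvtext stack the tests use):

import nvstrings
import nvtext

# tokenize returns one flat token list for the entire column
dstrings = nvstrings.to_device(["this is my favorite", "book on my bookshelf"])
tokens = nvtext.tokenize(dstrings)
print(tokens.to_host())
# ['this', 'is', 'my', 'favorite', 'book', 'on', 'my', 'bookshelf']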
Example #2
def test_tokenize():
    # default space delimiter
    strs = nvstrings.to_device(
        [
            "the quick fox jumped over the lazy dog",
            "the siamésé cat jumped under the sofa",
            None,
            "",
        ]
    )
    outcome = nvtext.tokenize(strs)
    expected = [
        "the",
        "quick",
        "fox",
        "jumped",
        "over",
        "the",
        "lazy",
        "dog",
        "the",
        "siamésé",
        "cat",
        "jumped",
        "under",
        "the",
        "sofa",
    ]
    assert outcome.to_host() == expected
def get_word_count(str_col):
    """
    Returns the per-word occurrence counts for the input string column.
    """
    # Tokenize: turn the sentences into one long list of words

    df = cudf.DataFrame()

    principales = []
    for data in str_col:
        principales.append(data)

    # Tokenize the strings with nvstrings, moving them to the GPU with to_device
    letras = nvstrings.to_device(principales)

    df['string'] = nvtext.tokenize(letras)

    # Use groupby to perform the word count per column;
    # this will be supported natively soon.
    # Track known issues at: https://github.com/rapidsai/cudf/issues/1951
    df['counts'] = np.dtype('int32').type(0)

    res = df.groupby('string').count()
    res = res.reset_index(drop=False).sort_values(by='counts', ascending=False)
    return res.rename({'index': 'string'})
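
A hedged usage sketch for get_word_count (assuming cudf, numpy, nvstrings, and nvtext from the same RAPIDS 0.x era are importable in the function's module; the sample sentences are hypothetical):

import cudf
import numpy as np
import nvstrings
import nvtext

sentences = [
    "the quick fox jumped over the lazy dog",
    "the siamésé cat jumped under the sofa",
]
word_counts = get_word_count(sentences)
print(word_counts)  # 'the' should rank first with a count of 4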