Example #1
def test_ngrams():
    """Check that nvtext.ngrams joins consecutive tokens with the separator.

    The two input strings are tokenized first, so the n-grams are built over
    the single flat token sequence and therefore span the boundary between
    the two original strings (e.g. "favorite" followed by "book").
    """
    # bigrams
    strings = ["this is my favorite", "book on my bookshelf"]
    dstrings = nvstrings.to_device(strings)
    # 8 tokens -> 7 adjacent pairs, joined with "_"
    expected = [
        "this_is",
        "is_my",
        "my_favorite",
        "favorite_book",
        "book_on",
        "on_my",
        "my_bookshelf",
    ]
    tokens = nvtext.tokenize(dstrings)
    outcome = nvtext.ngrams(tokens, N=2, sep="_")
    assert outcome.to_host() == expected

    # trigrams
    strings = ["this is my favorite", "book on my bookshelf"]
    dstrings = nvstrings.to_device(strings)
    # 8 tokens -> 6 adjacent triples, joined with "-"
    expected = [
        "this-is-my",
        "is-my-favorite",
        "my-favorite-book",
        "favorite-book-on",
        "book-on-my",
        "on-my-bookshelf",
    ]
    tokens = nvtext.tokenize(dstrings)
    outcome = nvtext.ngrams(tokens, N=3, sep="-")
    assert outcome.to_host() == expected
Example #2
def test_ngrams():
    """Check that nvtext.ngrams joins consecutive tokens with the separator.

    The two input strings are tokenized first, so the n-grams are built over
    the single flat token sequence and therefore span the boundary between
    the two original strings (e.g. 'favorite' followed by 'book').
    """
    # bigrams
    strings = ['this is my favorite', 'book on my bookshelf']
    dstrings = nvstrings.to_device(strings)
    # 8 tokens -> 7 adjacent pairs, joined with '_'
    expected = [
        'this_is',
        'is_my',
        'my_favorite',
        'favorite_book',
        'book_on',
        'on_my',
        'my_bookshelf',
    ]
    tokens = nvtext.tokenize(dstrings)
    outcome = nvtext.ngrams(tokens, N=2, sep='_')
    assert outcome.to_host() == expected

    # trigrams
    strings = ['this is my favorite', 'book on my bookshelf']
    dstrings = nvstrings.to_device(strings)
    # 8 tokens -> 6 adjacent triples, joined with '-'
    expected = [
        'this-is-my',
        'is-my-favorite',
        'my-favorite-book',
        'favorite-book-on',
        'book-on-my',
        'on-my-bookshelf',
    ]
    tokens = nvtext.tokenize(dstrings)
    outcome = nvtext.ngrams(tokens, N=3, sep='-')
    assert outcome.to_host() == expected
Example #3
def test_tokenize():
    """Check that nvtext.tokenize splits on whitespace by default.

    The result is one flat list of tokens covering every input string in
    order; non-ASCII characters ("siamésé") must survive the round trip.
    """
    # default space delimiter
    strs = nvstrings.to_device(
        [
            "the quick fox jumped over the lazy dog",
            "the siamésé cat jumped under the sofa",
        ]
    )
    outcome = nvtext.tokenize(strs)
    expected = [
        "the",
        "quick",
        "fox",
        "jumped",
        "over",
        "the",
        "lazy",
        "dog",
        "the",
        "siamésé",
        "cat",
        "jumped",
        "under",
        "the",
        "sofa",
    ]
    assert outcome.to_host() == expected
def get_word_count(str_col):
    """Return the count of each token found in the input strings.

    Args:
        str_col: Iterable of strings (presumably a string column; confirm
            against the caller) whose entries are tokenized on the GPU.

    Returns:
        The grouped result of counting rows per distinct token, sorted by
        the ``counts`` column in descending order.
    """
    # Tokenize: turn the sentences into one long list of words.

    # NOTE(review): the body of the original ``for data in str_col`` loop was
    # not visible; it is assumed to have collected every entry unchanged.
    # Confirm no per-item transformation is needed.
    principales = list(str_col)

    # Copy the strings to the GPU with to_device so nvtext can tokenize them.
    letras = nvstrings.to_device(principales)

    df = cudf.DataFrame()
    df['string'] = nvtext.tokenize(letras)

    # A group-by is used to do the per-token word count.
    # This should be supported natively soon.
    # See: https://github.com/rapidsai/cudf/issues/1951

    # Placeholder int32 column so that count() has a column to aggregate.
    df['counts'] = np.dtype('int32').type(0)
    res = df.groupby('string').count()
    res = res.reset_index(drop=False).sort_values(by='counts', ascending=False)
    # Kept from the original; presumably normalizes the column name produced
    # by reset_index on cudf versions that label it 'index' -- TODO confirm.
    return res.rename({'index':'string'})