import numpy as np

import cudf
import nvstrings
import nvtext


def test_ngrams():
    # bigrams
    strings = ["this is my favorite", "book on my bookshelf"]
    dstrings = nvstrings.to_device(strings)
    expected = [
        "this_is",
        "is_my",
        "my_favorite",
        "favorite_book",
        "book_on",
        "on_my",
        "my_bookshelf",
    ]
    tokens = nvtext.tokenize(dstrings)
    outcome = nvtext.ngrams(tokens, N=2, sep="_")
    assert outcome.to_host() == expected

    # trigrams
    strings = ["this is my favorite", "book on my bookshelf"]
    dstrings = nvstrings.to_device(strings)
    expected = [
        "this-is-my",
        "is-my-favorite",
        "my-favorite-book",
        "favorite-book-on",
        "book-on-my",
        "on-my-bookshelf",
    ]
    tokens = nvtext.tokenize(dstrings)
    outcome = nvtext.ngrams(tokens, N=3, sep="-")
    assert outcome.to_host() == expected
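# A pure-Python reference for the n-gram construction exercised above (a
# sketch for clarity, not the nvtext implementation). Note that the tokens
# from all input strings form one flat sequence, so n-grams such as
# "favorite_book" span the boundary between the two sentences.
def ngrams_reference(tokens, N, sep):
    return [sep.join(tokens[i : i + N]) for i in range(len(tokens) - N + 1)]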
def test_tokenize():
    # default space delimiter
    strs = nvstrings.to_device(
        [
            "the quick fox jumped over the lazy dog",
            "the siamésé cat jumped under the sofa",
            None,
            "",
        ]
    )
    outcome = nvtext.tokenize(strs)
    expected = [
        "the",
        "quick",
        "fox",
        "jumped",
        "over",
        "the",
        "lazy",
        "dog",
        "the",
        "siamésé",
        "cat",
        "jumped",
        "under",
        "the",
        "sofa",
    ]
    assert outcome.to_host() == expected
def get_word_count(str_col):
    """Return the per-word counts for the input strings."""
    # Tokenize: turn the sentences into one long list of words
    df = cudf.DataFrame()
    principales = []
    for data in str_col:
        principales.append(data)
    # Tokenize the strings with nvstrings, moving them to the GPU with to_device
    letras = nvstrings.to_device(principales)
    df['string'] = nvtext.tokenize(letras)
    # Use groupby to count the words per column;
    # this will be supported natively soon.
    # Watch for errors: https://github.com/rapidsai/cudf/issues/1951
    df['counts'] = np.dtype('int32').type(0)
    res = df.groupby('string').count()
    res = res.reset_index(drop=False).sort_values(by='counts', ascending=False)
    return res.rename({'index': 'string'})
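# A minimal usage sketch for get_word_count (an illustrative demo, not part of
# the original tests): the sample sentences are made up, and running this
# requires a GPU plus the legacy cudf/nvstrings/nvtext stack.
def demo_get_word_count():
    sentences = [
        "the quick fox jumped over the lazy dog",
        "the lazy dog slept",
    ]
    counts = get_word_count(sentences)
    # "the" appears three times across the two sentences, so it should sort
    # to the top of the descending counts.
    print(counts.head())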