def get_unique_tokens(words, words_hash=None):
    """Return the distinct tokens in *words*.

    Duplicates are detected by hashing each token on the GPU and keeping the
    first row per hash value, so the result preserves first-occurrence order
    of the underlying rows.

    Parameters
    ----------
    words : GPU string collection exposing ``size()`` and ``sublist()``
        (nvstrings-like — TODO confirm exact type against callers).
    words_hash : optional
        Precomputed hash column for *words*; when ``None`` the hashes are
        computed via ``on_gpu(words, 'hash')``.

    Returns
    -------
    A GPU string collection containing one entry per distinct hash.
    """
    df = gd.DataFrame()
    # Hash each token so duplicates can be detected by value on the GPU.
    df['hash'] = on_gpu(words, 'hash') if words_hash is None else words_hash
    # Row index used to map surviving hashes back to token positions.
    df['ID'] = np.arange(words.size()).astype(np.int32)
    # Keep only the first occurrence of each distinct hash.
    df = drop_duplicates(df, by='hash', keep='first')
    rows = df['ID'].to_array()
    res = words.sublist(rows.tolist())
    # Release the intermediate frame (and its GPU memory) before returning.
    del df
    return res
def get_token_counts(words, words_hash=None):
    """Count occurrences of each token in *words*.

    Tokens are grouped by their GPU-computed hash; one representative token
    per hash is pulled back to the host and paired with its group count.

    NOTE(review): counting is hash-based, so distinct tokens that collide on
    the same hash would be merged into one entry — presumably acceptable for
    this pipeline; confirm upstream.

    Parameters
    ----------
    words : GPU string collection exposing ``size()``, ``sublist()`` and
        ``to_host()`` (nvstrings-like — TODO confirm exact type).
    words_hash : optional
        Precomputed hash column for *words*; when ``None`` the hashes are
        computed via ``on_gpu(words, 'hash')``.

    Returns
    -------
    collections.Counter
        Maps each representative token to its occurrence count.
    """
    df = gd.DataFrame()
    df['hash'] = on_gpu(words, 'hash') if words_hash is None else words_hash
    # Row index used to map each distinct hash back to a token position.
    df['ID'] = np.arange(words.size()).astype(np.int32)
    # Per-hash occurrence counts; the aggregated column is read back below
    # under the name 'count_hash'.
    dg = df.groupby('hash').agg({'hash': 'count'})
    # One representative row per distinct hash, keeping its first position.
    df = drop_duplicates(df, by='hash', keep='first')
    # Attach each group's count to its representative row.
    df = df.merge(dg, on=['hash'], how='left')
    rows = df['ID'].to_array()
    # Pull the representative tokens back to the host.
    res = words.sublist(rows.tolist()).to_host()
    res = dict(zip(res, df['count_hash'].to_array().tolist()))
    return Counter(res)