Example #1
0
def get_unique_tokens(words, words_hash=None):
    """Return the entries of ``words`` that survive de-duplication by hash.

    Args:
        words: Token container; must provide ``size()`` and ``sublist()``
            (looks like an nvstrings object -- TODO confirm against callers).
        words_hash: Optional precomputed hash column for ``words``. When it
            is ``None`` the hashes are obtained via ``on_gpu(words, 'hash')``.

    Returns:
        Whatever ``words.sublist(...)`` yields for the row IDs kept by
        ``drop_duplicates(..., by='hash', keep='first')``.
    """
    frame = gd.DataFrame()
    if words_hash is None:
        frame['hash'] = on_gpu(words, 'hash')
    else:
        frame['hash'] = words_hash

    # Row positions travel with the hashes so the surviving rows can be
    # mapped back to entries of ``words`` after de-duplication.
    positions = np.arange(words.size())
    frame['ID'] = positions.astype(np.int32)

    # presumably keeps one row per distinct hash (the first seen) -- this is
    # the project's own helper, verify its semantics there.
    deduped = drop_duplicates(frame, by='hash', keep='first')

    kept_ids = deduped['ID'].to_array().tolist()
    return words.sublist(kept_ids)
def get_token_counts(words, words_hash=None):
    """Build a ``Counter`` mapping tokens in ``words`` to per-hash row counts.

    Args:
        words: Token container; must provide ``size()`` and ``sublist()``,
            and the sub-list must provide ``to_host()`` (looks like an
            nvstrings object -- TODO confirm against callers).
        words_hash: Optional precomputed hash column for ``words``. When it
            is ``None`` the hashes are obtained via ``on_gpu(words, 'hash')``.

    Returns:
        collections.Counter: keys are the host-side tokens selected after
        de-duplicating by hash; values come from the ``count_hash`` column
        produced by the group-by count on ``hash``.
    """
    frame = gd.DataFrame()
    if words_hash is None:
        frame['hash'] = on_gpu(words, 'hash')
    else:
        frame['hash'] = words_hash

    # Row positions travel with the hashes so the surviving rows can be
    # mapped back to entries of ``words`` after de-duplication.
    positions = np.arange(words.size())
    frame['ID'] = positions.astype(np.int32)

    # Count rows per hash before de-duplicating; the merged result is read
    # back below through a column named 'count_hash'.
    per_hash = frame.groupby('hash').agg({'hash': 'count'})

    # presumably keeps one row per distinct hash (the first seen) -- this is
    # the project's own helper, verify its semantics there.
    representatives = drop_duplicates(frame, by='hash', keep='first')
    joined = representatives.merge(per_hash, on=['hash'], how='left')

    # IDs and counts are read from the same merged frame, so the two lists
    # stay aligned row-for-row whatever order the merge produced.
    token_ids = joined['ID'].to_array().tolist()
    tokens = words.sublist(token_ids).to_host()
    counts = joined['count_hash'].to_array().tolist()

    return Counter(dict(zip(tokens, counts)))