Ejemplo n.º 1
0
def data_prepare(x, verbous=True):
    """Return a copy of ``x`` extended with joined text and word/token columns.

    Every value of the ``'Txt'`` column is joined with single spaces into a
    new ``'Text'`` column. ``text_preprocess`` is then called on each text;
    its two results are stored in ``'Words'`` and ``'Tokens'`` and their
    lengths in ``'WrdCnt'`` and ``'TokCnt'``.

    Rows are processed by position and the new columns are built on the
    frame's own index, so the input may carry any index (for example one
    left over from filtering), not only the default 0..n-1 labels.

    Args:
        x: DataFrame with a ``'Txt'`` column whose values are iterables of
            strings. The input itself is not modified.
        verbous: When true, print progress messages, drive a progress bar
            and draw histograms of the two count columns.

    Returns:
        The extended copy of ``x``.
    """
    X = x.copy()
    # For now, like this.
    if verbous:
        print('Склеиваем строки...')
    X['Text'] = X['Txt'].apply(' '.join)
    if verbous:
        print('Готово\nНормализиция текстов...')

    if verbous:
        pb = ProgressBar(max_value=len(X))
        pb.start()

    # Collect results positionally instead of writing cell by cell through
    # integer labels, which only works when the index is exactly 0..n-1.
    words = []
    tokens = []
    for pos, text in enumerate(X['Text']):
        if verbous:
            pb.update(pos)
        w, t = text_preprocess(text)
        words.append(w)
        tokens.append(t)
    if verbous:
        pb.finish()

    # Build each column on X.index so assignment never re-aligns to NaN;
    # dtype=object keeps every per-row result stored as a single cell value.
    X['Words'] = pd.Series(words, index=X.index, dtype=object)
    X['WrdCnt'] = pd.Series([len(w) for w in words], index=X.index, dtype='int64')
    X['Tokens'] = pd.Series(tokens, index=X.index, dtype=object)
    X['TokCnt'] = pd.Series([len(t) for t in tokens], index=X.index, dtype='int64')

    if verbous:
        X['TokCnt'].hist(bins=100)
        X['WrdCnt'].hist(bins=100)
    return X
Ejemplo n.º 2
0
    tid, data = pk.load(f)

# Nothing was loaded: stop the script with a failure status.
if not data:
    exit(-1)

# One accumulator list per record field.
msg_ids, creators, timestamps, src_links = [], [], [], []
del_reasons, texts, codes, quotes = [], [], [], []

# Record key -> destination list, in the order the keys are read per record.
_field_sinks = (
    ('MsgId', msg_ids),
    ('Creator', creators),
    ('Time', timestamps),
    ('SrcLink', src_links),
    ('DelReason', del_reasons),
    ('Txt', texts),
    ('Code', codes),
    ('Quotes', quotes),
)

pb = ProgressBar(max_value=len(data))
pb.start()
for i, rec in enumerate(data):
    for _key, _sink in _field_sinks:
        _sink.append(rec[_key])
    pb.update(i)
pb.finish()

# Drop this reference to the raw records now that they are split by field.
data = None

print('Создаю DataFrame...')