def data_prepare(x, verbous=True):
    """Return a copy of ``x`` extended with joined text, words, tokens and counts.

    The input frame is not modified. The following columns are added to the
    copy, in this order:

    * ``Text``   - the pieces of each ``Txt`` cell joined with single spaces;
    * ``Words``  - first element returned by ``text_preprocess`` for the text;
    * ``WrdCnt`` - ``len`` of the ``Words`` value;
    * ``Tokens`` - second element returned by ``text_preprocess``;
    * ``TokCnt`` - ``len`` of the ``Tokens`` value.

    Rows are processed by position, so the frame may carry any index
    (filtered, shuffled, non-integer, with duplicates), not only the default
    ``0..n-1`` range.

    Args:
        x: DataFrame with a ``Txt`` column whose cells are iterables of
            strings (they are passed to ``' '.join``).
        verbous: When true, print progress messages, show a progress bar while
            texts are processed and draw histograms of both count columns.

    Returns:
        The extended copy of ``x``.
    """
    X = x.copy()  # work on a copy for now, so the caller's frame stays untouched

    if verbous:
        print('Склеиваем строки...')
    X['Text'] = X['Txt'].apply(lambda parts: ' '.join(parts))

    if verbous:
        print('Готово\nНормализиция текстов...')
        pb = ProgressBar(max_value=len(X))
        pb.start()

    # Collect results positionally instead of writing cell by cell through
    # integer labels: label-based access only works for a default RangeIndex.
    words = []
    tokens = []
    for i, text in enumerate(X['Text']):
        if verbous:
            pb.update(i)
        w, t = text_preprocess(text)
        words.append(w)
        tokens.append(t)

    if verbous:
        pb.finish()

    # Passing index=X.index keeps the new columns aligned with the frame's own
    # labels; dtype=object stores each per-row sequence as a single cell.
    X['Words'] = pd.Series(words, index=X.index, dtype=object)
    X['WrdCnt'] = pd.Series([len(w) for w in words], index=X.index, dtype='int64')
    X['Tokens'] = pd.Series(tokens, index=X.index, dtype=object)
    X['TokCnt'] = pd.Series([len(t) for t in tokens], index=X.index, dtype='int64')

    if verbous:
        X['TokCnt'].hist(bins=100)
        X['WrdCnt'].hist(bins=100)

    return X
# Read the (tid, data) pair from the already opened handle ``f``.
# NOTE(review): ``pk`` is presumably the ``pickle`` module - if so, ``f`` must
# come from a trusted source, since unpickling can execute arbitrary code.
tid, data = pk.load(f)
if not data:
    # Nothing to process: stop the script with exit status -1.
    exit(-1)

# One list per record field; every list is filled in record order.
msg_ids, creators, timestamps, src_links = [], [], [], []
del_reasons, texts, codes, quotes = [], [], [], []

# Record key -> destination list, in the order the fields are read.
_field_columns = (
    ('MsgId', msg_ids),
    ('Creator', creators),
    ('Time', timestamps),
    ('SrcLink', src_links),
    ('DelReason', del_reasons),
    ('Txt', texts),
    ('Code', codes),
    ('Quotes', quotes),
)

pb = ProgressBar(max_value=len(data))
pb.start()
for i, rec in enumerate(data):
    for _key, _column in _field_columns:
        _column.append(rec[_key])
    pb.update(i)  # report progress after the record has been consumed
pb.finish()

# Drop the reference to the raw records; only the per-field lists are kept.
data = None
print('Создаю DataFrame...')