def Remove_stop_words(self, df, title='Remove stop_words'):
    """Strip English stop words (keeping 'not') from every text in *df*.

    Parameters
    ----------
    df : iterable of str-convertible items
        Texts to clean; each item is tokenized with ``word_tokenize``.
    title : str
        Label forwarded to ``EstimateFaster`` for progress reporting.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input item.
    """
    print(bcolors.BLUE)
    # 'not' is kept because it flips sentiment polarity downstream.
    # A set gives O(1) membership tests instead of O(n) on a list.
    stop_words = {w for w in stopwords.words('english') if w != 'not'}
    total = len(df)  # hoisted out of the loop
    cleaned = []
    for num, text in enumerate(df):
        tokens = word_tokenize(str(text))
        kept = [w for w in tokens if w not in stop_words]
        cleaned.append(" ".join(kept).strip())
        EstimateFaster(num, total, title)
    # BUG FIX: original printed `tcolors.RED`, but this block (and the rest
    # of the file) uses `bcolors` — `tcolors` would raise NameError here.
    print(bcolors.RED, " *** Done! ***")
    print(bcolors.ENDC)
    return cleaned
def ckeckspeller(self, df, label, name):
    """Spell-correct each text in *df* with TextBlob, checkpointing to CSV.

    Appends every corrected text to ``self.ListC`` (accumulates across
    calls) and rewrites ``temperray_clean_file{name}.csv`` after each item
    so a crash mid-run loses no completed work.

    Parameters
    ----------
    df : iterable of str-convertible items
        Texts to spell-correct.
    label : sequence
        Parallel labels; ``label[num]`` is paired with ``df``'s num-th text.
    name : str
        Suffix for the checkpoint CSV filename.

    Returns
    -------
    list of str
        ``self.ListC``, the accumulated corrected texts.
    """
    total = len(df)
    labels_so_far = []
    for num, text in enumerate(df):
        corrected = TextBlob(str(text)).correct()
        self.ListC.append(str(corrected))
        labels_so_far.append(label[num])
        # Rewrite the full checkpoint every iteration (slow but crash-safe).
        checkpoint = pd.DataFrame()
        checkpoint['text'] = pd.Series(self.ListC)
        checkpoint['label'] = labels_so_far
        checkpoint.to_csv('temperray_clean_file{}.csv'.format(name))
        # BUG FIX: original called EstimateFaster(df, num, ...); the sibling
        # Remove_stop_words calls it as (index, total, title) — match that.
        EstimateFaster(num, total, 'check_spell')
    print("%%%%% finshed %%%%")
    return self.ListC
def Normlize_nMonth(df,numday=90,Date="Date",price="Adj Close"):
    """Normalize a price series in fixed-size windows, then shuffle the rows.

    Splits ``df[price]`` into consecutive chunks of ``numday`` values
    (default 90 — presumably one quarter of daily data; TODO confirm),
    normalizes each chunk independently via ``normalize_df``, normalizes the
    leftover tail separately, concatenates everything into one Series, and
    returns it randomly shuffled.

    NOTE(review): ``.sample(frac=1)`` has no ``random_state``, so the row
    order of the result is non-deterministic across calls.

    Parameters:
        df: DataFrame holding a numeric column ``price`` (assumed — TODO
            confirm dtype against callers).
        numday: window length in rows.
        Date: name of the date column (only referenced by the commented-out
            month extraction below; currently unused).
        price: name of the price column to normalize.

    Returns:
        pd.Series of normalized values from all windows plus the tail,
        shuffled and re-indexed from 0.
    """
    #df['month'] = df[Date].apply(lambda x: datetime.datetime.strptime(str(x),"%Y-%m-%d").month)
    # Number of full windows, minus one so the final stretch (the last full
    # window plus any remainder) is handled by the tail path below.
    lenght = int(round(len(df[price])/numday))-1
    Plist = [[] for i in range(lenght)]
    new_df = pd.DataFrame()
    print(lenght)
    for num in range(0,lenght):
        # Copy window num of the price column into a plain list.
        Plist[num] = [x for x in (df[price][numday*num:(numday*(num+1))])]
        new_df[str(num)] = Plist[num]
        # NOTE(review): second argument is the window's data, while sibling
        # functions pass a total count here — verify EstimateFaster's signature.
        EstimateFaster(num,Plist[num],'Normalize')
        # Each window is normalized independently of the others.
        new_df['norm_{}'.format(num)] = normalize_df(new_df[str(num)])
    # Tail: everything past the last full window, normalized on its own.
    Plist2 = pd.Series([x for x in (df[price][numday*lenght:])])
    Plist2 = normalize_df(Plist2)
    # Stack all per-window normalized columns into one long Series...
    frames = [new_df['norm_{}'.format(num)] for num in range(lenght)]
    x = pd.concat(frames,axis=0)
    frames = [x, Plist2]
    # ...append the tail, shuffle the rows, and renumber the index from 0.
    result = pd.concat(frames, names=['index','norm']).sample(frac=1).reset_index(drop=True)
    print('*** Done! ***')
    return result