def Remove_stop_words(self, df, title='Remove stop_words'):
        """Remove English stop words from every text in ``df``.

        Each item is converted with ``str()``, tokenized with
        ``word_tokenize`` and re-joined with single spaces after dropping
        every token found in NLTK's English stop-word list. The word
        ``'not'`` is deliberately kept (presumably so negations survive
        the filtering).

        Args:
            df: Sized iterable of texts (``len(df)`` is used for the
                progress report).
            title: Label forwarded to ``EstimateFaster`` for the progress
                display.

        Returns:
            list[str]: One filtered string per input item, in input order.
        """
        print(bcolors.BLUE)
        # A set makes the per-token membership test O(1) instead of a
        # linear scan over the whole stop-word list.
        stop_words = set(stopwords.words('english')) - {'not'}

        total = len(df)
        cleaned = []
        for num, text in enumerate(df):
            kept = [w for w in word_tokenize(str(text)) if w not in stop_words]
            cleaned.append(" ".join(kept).strip())
            EstimateFaster(num, total, title)

        # NOTE(review): this line uses `tcolors` while the rest of the
        # function uses `bcolors` -- confirm both names exist.
        print(tcolors.RED, " *** Done! ***")
        print(bcolors.ENDC)
        return cleaned
Exemple #2
0
    def ckeckspeller(self, df, label, name):
        """Spell-correct every text in ``df`` with TextBlob.

        Each corrected text is appended to ``self.ListC``. After every
        item, the texts corrected so far in this call and their labels are
        written to ``temperray_clean_file<name>.csv`` as a checkpoint.

        Args:
            df: Iterable of texts; each item is converted with ``str()``.
            label: Labels indexed by position, looked up as ``label[num]``
                for the ``num``-th text.
            name: Suffix inserted into the checkpoint file name.

        Returns:
            list[str]: ``self.ListC``, i.e. everything accumulated on the
            instance, including entries present before this call.
        """
        checkpoint_path = 'temperray_clean_file{}.csv'.format(name)

        # Track this call's rows separately from self.ListC: the instance
        # list may already hold entries, and pairing it with a shorter
        # label list makes pandas raise a length-mismatch ValueError.
        corrected = []
        labels = []
        for num, li in enumerate(df):
            fixed = str(TextBlob(str(li)).correct())
            self.ListC.append(fixed)
            corrected.append(fixed)
            labels.append(label[num])

            # Rewritten on every iteration so partial progress is on disk.
            checkpoint = pd.DataFrame({'text': corrected, 'label': labels})
            checkpoint.to_csv(checkpoint_path)

            # NOTE(review): the argument order (df, num, title) differs
            # from Remove_stop_words, which passes (num, len(df), title)
            # -- confirm against EstimateFaster's signature.
            EstimateFaster(df, num, 'check_spell')
        print("%%%%% finshed %%%%")
        return self.ListC
Exemple #3
0
def Normlize_nMonth(df,numday=90,Date="Date",price="Adj Close"):
    """Normalize a price column window-by-window and return it shuffled.

    The ``price`` column is cut into consecutive windows of ``numday``
    rows. Each window is normalized on its own with ``normalize_df``; the
    rows left after the last full window are normalized together as one
    final window. All normalized pieces are concatenated, shuffled with
    ``sample(frac=1)`` and re-indexed from 0.

    Args:
        df: Table containing the ``price`` column.
        numday: Number of rows per window.
        Date: Accepted for interface compatibility; not used.
        price: Name of the column to normalize.

    Returns:
        The concatenated normalized values in random order with a fresh
        0..n-1 index (a different order on every call, because the
        shuffle is not seeded).
    """
    prices = df[price]

    # Clamp at zero: for inputs shorter than about 1.5 * numday the
    # formula gives 0 or -1 windows. Without the clamp there would be
    # nothing to concatenate; with it, the whole series becomes the tail.
    lenght = max(int(round(len(prices) / numday)) - 1, 0)
    print(lenght)

    new_df = pd.DataFrame()
    frames = []
    for num in range(lenght):
        window = list(prices[numday * num:numday * (num + 1)])
        new_df[str(num)] = window
        EstimateFaster(num, window, 'Normalize')
        norm_col = 'norm_{}'.format(num)
        new_df[norm_col] = normalize_df(new_df[str(num)])
        frames.append(new_df[norm_col])

    # Everything after the last full window is normalized as one piece.
    tail = normalize_df(pd.Series(list(prices[numday * lenght:])))
    frames.append(tail)

    # The index is dropped after shuffling, so no level names are needed.
    result = pd.concat(frames, axis=0).sample(frac=1).reset_index(drop=True)
    print('*** Done! ***')
    return result