def load_data_from_file1(self, bag_path='bag4000.txt'):
    """Load the bag of words from a file and align the count table to it.

    Reads one word per line from ``bag_path`` (UTF-8, surrounding whitespace
    stripped), obtains the count table and total from
    ``pretreatment.count_words_in_label(self.sizeofdata)``, and keeps only
    the words that are also columns of that table.

    Attributes set on ``self``:
        bag: list of retained words, in file order, without duplicates.
        set_bag: the same words as a set, for membership tests.
        df_count: the count table restricted to the columns in ``bag``,
            in exactly that order (presumably a pandas DataFrame -- it is
            only accessed via ``.columns`` and list-based column selection).
        count: second value returned by ``count_words_in_label``.

    :param bag_path: path of the bag-of-words file; defaults to the
        previously hard-coded ``'bag4000.txt'``.
    """
    print('Load BOW from file.')
    self.bag = []
    with open(bag_path, encoding='utf8') as f:
        self.bag.extend(line.strip() for line in f)
    # Reported size is the raw line count, before filtering/de-duplication.
    print('Get bag. Size:', len(self.bag))

    self.df_count, self.count = pretreatment.count_words_in_label(
        self.sizeofdata)

    # Keep file order instead of going through a set intersection: set
    # iteration order is arbitrary and changes between runs, which made the
    # feature order non-reproducible and let ``bag`` and the ``df_count``
    # columns end up in different orders.
    known_columns = set(self.df_count.columns)
    self.bag = [word for word in dict.fromkeys(self.bag)
                if word in known_columns]
    self.set_bag = set(self.bag)
    # Select with the ordered list so column i corresponds to bag[i].
    self.df_count = self.df_count[self.bag]
def load_data_from_file2(self, bag_path='bag4000.txt'):
    """Load the bag of words from a file and align the count table to it.

    Reads one word per line from ``bag_path`` (UTF-8, surrounding whitespace
    stripped), obtains the count table and total from
    ``pretreatment.count_words_in_label(self.sizeofdata)``, and keeps only
    the words that are also columns of that table. Progress messages are
    printed in Chinese.

    Attributes set on ``self``:
        bag: list of retained words, in file order, without duplicates.
        set_bag: the same words as a set, for membership tests.
        df_count: the count table restricted to the columns in ``bag``,
            in exactly that order (presumably a pandas DataFrame -- it is
            only accessed via ``.columns`` and list-based column selection).
        count: second value returned by ``count_words_in_label``.

    :param bag_path: path of the bag-of-words file; defaults to the
        previously hard-coded ``'bag4000.txt'``.
    """
    print('加载词袋...')
    self.bag = []
    with open(bag_path, encoding='utf8') as f:
        self.bag.extend(line.strip() for line in f)
    # Reported size is the raw line count, before filtering/de-duplication.
    print('词袋大小:', len(self.bag))

    self.df_count, self.count = pretreatment.count_words_in_label(
        self.sizeofdata)

    # Keep file order instead of going through a set intersection: set
    # iteration order is arbitrary and changes between runs, which made the
    # feature order non-reproducible and let ``bag`` and the ``df_count``
    # columns end up in different orders.
    known_columns = set(self.df_count.columns)
    self.bag = [word for word in dict.fromkeys(self.bag)
                if word in known_columns]
    self.set_bag = set(self.bag)
    # After this reduction, df_count holds the per-class occurrence counts
    # of the retained feature words only; column i corresponds to bag[i].
    self.df_count = self.df_count[self.bag]
def load_train_data(self):
    """Build the training state by running the pretreatment pipeline.

    Calls ``pretreatment.pre_treat`` with ``count=self.sizeofdata`` and
    ``sizeOfBOW=4000`` and stores its four results in ``self.df_count``,
    ``self.bag``, ``self.count`` and ``self.idf``. ``self.df_count`` is then
    replaced by the first result of
    ``pretreatment.count_words_in_label(self.sizeofdata)``, restricted to the
    columns listed in ``self.bag``. ``self.set_bag`` is set to the bag as a
    set.

    Takes no arguments besides ``self`` and returns ``None``.
    """
    print('Load train data...')

    treated = pretreatment.pre_treat(count=self.sizeofdata, sizeOfBOW=4000)
    self.df_count, self.bag, self.count, self.idf = treated

    # NOTE(review): the df_count from pre_treat is discarded right away in
    # favour of the table from count_words_in_label -- confirm this is
    # intended rather than redundant work.
    label_counts, _ = pretreatment.count_words_in_label(self.sizeofdata)
    self.df_count = label_counts

    self.set_bag = set(self.bag)
    # Keep only the bag's columns, in bag order.
    self.df_count = label_counts[self.bag]

    print('size of bag:', len(self.bag))
    print('Bag get.')