def pretreatment(self):
    """Build and persist every training artifact from the origin spreadsheet.

    Reads ``[title, content, result]`` from ``self.origin_data_file`` via
    ``self.DT.read_excel``, collapses every negative result to ``-1``, and
    then uses a fresh ``PreTreater`` to derive word-dictionary and
    score-dictionary encodings of both the content and the titles.  Each
    artifact is written with ``np.save`` to the matching ``self.*_file``
    path, and ``self.create_random_seed`` is finally called with the number
    of samples.

    The order of the ``PreTreater`` calls is kept deliberately: the
    dictionaries are requested only after keywords have been extracted,
    which presumably populates them -- confirm against ``PreTreater``.
    """
    # Read data.
    titles, contents, labels = self.DT.read_excel(self.origin_data_file)

    # Collapse every negative label to -1 (in place).
    for idx, label in enumerate(labels):
        if label < 0:
            labels[idx] = -1

    treater = PreTreater()

    # Content encoded against the word dictionary.
    content_keywords = treater.get_keywords(contents)
    word_dict = treater.getdict()
    content_features = treater.create_train_data_dict(word_dict, content_keywords)

    # Title encoding; once guarded by self.model_dict['lrTmodel'], the guard
    # is commented out in the original, so this section always runs.
    title_keywords = treater.get_keywords(titles, all_tag=True)
    title_features = treater.create_train_data_dict(word_dict, title_keywords)
    np.save(self.wd_dict_file, [word_dict])
    np.save(self.data_title_file, [title_features])

    # Score-dictionary encoding; once guarded by self.model_dict['scoreModel'],
    # likewise always executed now.
    word_id_dict, id_score_dict = treater.get_score_dict()
    content_score_features = treater.create_train_data_dict(
        word_id_dict, content_keywords
    )
    np.save(self.wd_id_dict_file, [word_id_dict])
    np.save(self.id_score_dict_file, [id_score_dict])
    np.save(self.data_score_file, [content_score_features])

    title_score_features = treater.create_train_data_dict(
        word_id_dict, title_keywords
    )
    np.save(self.data_score_title_file, [title_score_features])

    # Normalisation via self.normalize_data is disabled in the original;
    # the content features are stored as produced, next to the labels.
    np.save(self.data_file, [content_features, np.array(labels)])

    self.create_random_seed(len(labels))
def pre_data_treate(self, filename):
    """Encode the samples in ``filename`` with the saved training dictionaries.

    Args:
        filename: Path handed to ``self.DT.read_excel``; the first two
            returned values are used as titles and contents, the third is
            ignored.

    Returns:
        A four-element list
        ``[testdata, testdata_title, test_score_data, test_score_data_title]``:
        content and title encoded against the word dictionary, followed by
        content and title encoded against the score dictionary.

    Raises:
        ValueError: If either dictionary file does not contain exactly one
            element (from the single-element unpacking).
    """
    test_title, test_content, _unused = self.DT.read_excel(filename)

    # The dictionaries are saved by ``pretreatment`` as one-element lists of
    # Python objects, i.e. pickled object arrays.  NumPy >= 1.16.3 refuses
    # to load those unless allow_pickle=True is passed explicitly.
    # SECURITY: unpickling executes arbitrary code -- only load files that
    # this pipeline wrote itself.
    [wd_dict] = np.load(self.wd_dict_file, allow_pickle=True)
    [wd_score_dict] = np.load(self.wd_id_dict_file, allow_pickle=True)

    PT = PreTreater()

    keydata = PT.get_keywords(test_content)
    testdata = PT.create_train_data_dict(wd_dict, keydata)
    test_score_data = PT.create_train_data_dict(wd_score_dict, keydata)

    # NOTE(review): ``pretreatment`` extracts title keywords with
    # ``all_tag=True`` while this call uses the default -- confirm whether
    # the train/inference difference is intentional.
    keydata_title = PT.get_keywords(test_title)
    testdata_title = PT.create_train_data_dict(wd_dict, keydata_title)
    test_score_data_title = PT.create_train_data_dict(
        wd_score_dict, keydata_title
    )

    return [testdata, testdata_title, test_score_data, test_score_data_title]