def int_words(words):
    """Encode a raw sentence into a fixed-length list of integer word ids.

    The word -> id mapping is loaded from the pickled dictionary object at
    ``../Model/wordDictionary.pkl`` (its ``one_hot_dict`` attribute).
    Tokens not present in the dictionary are encoded as 0 and counted as
    dismissed; the result is padded/truncated to ``Meta.max_string_len``.

    Parameters:
        words: raw sentence string; tokenized via ``Helper.splitSentence``.

    Returns:
        The padded list of integer ids returned by ``pad_sequence``.
    """
    # NOTE(review): pickle.load assumes this file is trusted local data —
    # never point this at untrusted input.
    with open('../Model/wordDictionary.pkl', 'rb') as file:
        one_hot_dict = pickle.load(file).one_hot_dict
    str_words = Helper.splitSentence(words)
    # Renamed from `int_words`: the original local shadowed the function name.
    encoded = []
    dismiss_cnt = 0
    for word in str_words:
        if word not in one_hot_dict:
            # Out-of-vocabulary token: reserved id 0.
            encoded.append(0)
            dismiss_cnt += 1
        else:
            encoded.append(one_hot_dict[word])
    encoded = pad_sequence(encoded, Meta.max_string_len)
    # total count is just the number of tokens — no need for a hand-kept counter.
    total_cnt = len(str_words)
    Helper.debug('[WARNING] dismiss: %d\ttotal: %d' % (dismiss_cnt, total_cnt))
    return encoded
def load_data(self, file_list: list, max_len=None):
    """Build the encoded dataset from a list of per-person files.

    Each file's index becomes its person id and the hot position of its
    one-hot label vector; sentences are encoded via ``self.one_hot``, the
    word dictionary is persisted, and all samples are padded.

    Returns:
        Tuple ``(np.array(self.X), np.array(self.Y))``.
    """
    # One-hot encode every word seen, file by file.
    for person_id, path in enumerate(file_list):
        self.personDictionary.addPerson(person_id, path)
        label = [1 if pos == person_id else 0 for pos in range(len(file_list))]
        self.one_hot(path, label)
    # Persist the dictionary for later reuse.
    self.save_dict()
    # Report mean and variance of the raw (unpadded) sample lengths.
    lengths = [len(sample) for sample in self.X]
    mean, var = Helper.analysis(lengths)
    Helper.debug("[INFORMATION] MEAN: %f\tVAR: %f" % (mean, var))
    # Pad every sentence to a uniform length.
    self.pad_sequences(max_len, 0)
    Helper.debug("[SUCCESS] Load data from file")
    return np.array(self.X), np.array(self.Y)
def one_hot(self, filename, label):
    """Encode every row of a CSV file and append samples to self.X / self.Y.

    Each row's ``text`` column is tokenized; tokens are looked up (or
    inserted) in ``self.wordDictionary``. URLs are skipped, and once the
    dictionary reports full, unseen tokens are dropped with a warning.
    """
    with open(filename, "r", encoding='utf-8') as f:
        for row in csv.DictReader(f):
            features = []
            for token in Helper.splitSentence(row['text']):
                # Skip URL-like tokens entirely.
                if 'http' in token:
                    continue
                # Dictionary at capacity: drop the token rather than grow.
                if self.wordDictionary.isFull():
                    Helper.debug("[WARNING] Missing word:" + token)
                    continue
                features.append(self.wordDictionary.lookup(token))
            self.X.append(features)
            self.Y.append(label)
    Helper.debug("[INFORMATION] Total One Hot: %d" % self.wordDictionary.count())