Exemple #1
0
def int_words(words):
    """Encode a sentence as a padded sequence of integer word ids.

    The vocabulary (``one_hot_dict``) is read from the pickled word
    dictionary on every call. Words missing from the vocabulary are encoded
    as 0 and counted as dismissed; the dismissed and total counts are
    reported through ``Helper.debug``.

    :param words: sentence handed to ``Helper.splitSentence``.
    :return: whatever ``pad_sequence`` returns for the encoded ids and
        ``Meta.max_string_len``.
    """
    # NOTE: unpickling runs code stored in the file, so the model file must
    # come from a trusted location.
    with open('../Model/wordDictionary.pkl', 'rb') as handle:
        vocabulary = pickle.load(handle).one_hot_dict

    encoded = []
    unknown = 0
    for token in Helper.splitSentence(words):
        if token in vocabulary:
            encoded.append(vocabulary[token])
        else:
            # Every out-of-vocabulary word shares the id 0.
            encoded.append(0)
            unknown += 1

    # Exactly one id is appended per word, so the length is the word count.
    total = len(encoded)
    padded = pad_sequence(encoded, Meta.max_string_len)
    Helper.debug('[WARNING] dismiss: %d\ttotal: %d' % (unknown, total))
    return padded
Exemple #2
0
 def load_data(self, file_list: list, max_len=None):
     """Encode every file in ``file_list`` and return the data as arrays.

     Each file is registered with ``self.personDictionary`` under its
     position in ``file_list`` and encoded by ``self.one_hot`` together with
     a one-hot label marking that position. Afterwards the dictionary is
     saved, length statistics of ``self.X`` are logged, and the sequences
     are padded.

     :param file_list: files to load; the list length is the label width.
     :param max_len: forwarded to ``self.pad_sequences``.
     :return: tuple ``(np.array(self.X), np.array(self.Y))``.
     """
     # One-hot encode every word seen and build up the dictionary.
     class_count = len(file_list)
     for position, path in enumerate(file_list):
         self.personDictionary.addPerson(position, path)
         one_hot_label = [1 if slot == position else 0
                          for slot in range(class_count)]
         self.one_hot(path, one_hot_label)

     # Persist the dictionary.
     self.save_dict()

     # Mean and variance of the raw (unpadded) sequence lengths.
     lengths = list(map(len, self.X))
     mean, var = Helper.analysis(lengths)
     Helper.debug("[INFORMATION] MEAN: %f\tVAR: %f" % (mean, var))

     # Pad every sentence.
     self.pad_sequences(max_len, 0)
     Helper.debug("[SUCCESS] Load data from file")

     features = np.array(self.X)
     labels = np.array(self.Y)
     return features, labels
Exemple #3
0
 def load_data(self, file_list: list, max_len=None):
     """Load all files, build the word dictionary and return padded data.

     For the file at index ``i`` this calls
     ``self.personDictionary.addPerson(i, file)`` and then ``self.one_hot``
     with a label list of ``len(file_list)`` entries that is 1 at ``i`` and
     0 elsewhere. It then saves the dictionary, logs the mean and variance
     of the entry lengths in ``self.X``, and pads via
     ``self.pad_sequences(max_len, 0)``.

     :param file_list: list of files, one per label position.
     :param max_len: first argument passed to ``self.pad_sequences``.
     :return: ``self.X`` and ``self.Y`` converted with ``np.array``.
     """
     # Encode the words of each file, growing the dictionary as we go.
     for person_id, source in enumerate(file_list):
         self.personDictionary.addPerson(person_id, source)
         target = [0 for _ in file_list]
         target[person_id] = 1
         self.one_hot(source, target)

     # Save the dictionary.
     self.save_dict()

     # Report mean and variance of the original sequence lengths.
     sizes = [len(sample) for sample in self.X]
     stats = Helper.analysis(sizes)
     mean, var = stats
     Helper.debug("[INFORMATION] MEAN: %f\tVAR: %f" % (mean, var))

     # Pad each sentence.
     self.pad_sequences(max_len, 0)
     Helper.debug("[SUCCESS] Load data from file")
     return np.array(self.X), np.array(self.Y)
Exemple #4
0
 def one_hot(self, filename, label):
     """Encode every row of a CSV file and append it to the data set.

     The ``text`` column of each row is split with
     ``Helper.splitSentence``. Every kept word is replaced by the value of
     ``self.wordDictionary.lookup(word)``. One feature list per row is
     appended to ``self.X`` and ``label`` is appended to ``self.Y``.

     Words containing ``http`` are skipped. While the dictionary reports
     ``isFull()``, each remaining word is logged and skipped. Punctuation
     is not stripped from words.

     :param filename: path of a UTF-8 encoded CSV file with a header row.
     :param label: label stored for every row; the same object is appended
         each time (it is not copied).
     :raises KeyError: if a row has no ``text`` column.
     """
     # newline='' leaves line-ending handling to the csv module, as its
     # documentation requires; without it, newlines embedded in quoted
     # fields are not interpreted correctly.
     with open(filename, "r", encoding='utf-8', newline='') as f:
         for row in csv.DictReader(f):
             features = []
             for word in Helper.splitSentence(row['text']):
                 if 'http' in word:
                     continue
                 # NOTE(review): the full check runs before lookup, so once
                 # the dictionary is full even words it already contains
                 # appear to be dropped -- confirm this is intended.
                 if self.wordDictionary.isFull():
                     Helper.debug("[WARNING] Missing word:" + word)
                     continue
                 features.append(self.wordDictionary.lookup(word))
             self.X.append(features)
             self.Y.append(label)
     Helper.debug("[INFORMATION] Total One Hot: %d" % self.wordDictionary.count())
Exemple #5
0
 def one_hot(self, filename, label):
     """Read a CSV file and turn each row's text into a list of word ids.

     For every row, the ``text`` column is split by
     ``Helper.splitSentence`` and each kept word is mapped through
     ``self.wordDictionary.lookup``. The resulting list is appended to
     ``self.X`` and ``label`` to ``self.Y``, one pair per row.

     A word is skipped when it contains ``http``, or when
     ``self.wordDictionary.isFull()`` is true (in which case it is also
     logged). Punctuation is not stripped from words.

     :param filename: path of a UTF-8 encoded CSV file with a header row.
     :param label: label appended for every row; the object is shared, not
         copied.
     :raises KeyError: if a row has no ``text`` column.
     """
     # The csv module documentation requires newline='' so that the reader
     # performs its own line-ending handling; otherwise newlines embedded
     # in quoted fields are not interpreted correctly.
     with open(filename, "r", encoding='utf-8', newline='') as f:
         reader = csv.DictReader(f)
         for row in reader:
             words = Helper.splitSentence(row['text'])
             features = []
             for word in words:
                 if 'http' in word:
                     continue
                 # NOTE(review): isFull() is tested before lookup(), so a
                 # full dictionary seems to drop known words as well --
                 # confirm this is intended.
                 if self.wordDictionary.isFull():
                     Helper.debug("[WARNING] Missing word:" + word)
                     continue
                 features.append(self.wordDictionary.lookup(word))
             self.X.append(features)
             self.Y.append(label)
     Helper.debug("[INFORMATION] Total One Hot: %d" %
                  self.wordDictionary.count())