# Example 1
 def cal_weight_improve(self, key_words, class_label):
     """
     计算获取特征词后的权重信息
     :param key_words: [{'sentence': {}}, ...] or [{}, ...] 有可能是测试集数据有可能是训练集数据
     :return:
     """
     print "Cal Improve Weight: ", time.strftime('%Y-%m-%d %H:%M:%S')
     if not self.istrain:
         dir_ = os.path.join(TEXT_OUT, "key_words")
         filename = self.__class__.__name__ + ".txt" if self.subjective else self.__class__.__name__ + "_objective.txt"
         url = os.path.join(dir_, filename)
         train_key_words = FileUtil.read(url)
         train_class_label = [d.get("emotion-1-type") for d in train_key_words]
     else:
         train_key_words = key_words
         train_class_label = class_label
     train_key_words = [d.get("sentence") if "sentence" in d else d for d in train_key_words]
     key_words = [d.get("sentence") if "sentence" in d else d for d in key_words]
     # 获得 tf
     key_words = [{k: v / sum(d.values()) for k, v in d.items()} for d in key_words]
     fit_train_key_words = Feature_Hasher.transform(train_key_words)
     fit_key_words = Feature_Hasher.transform(key_words)
     tfidf = TfidfImprove()
     # 训练 idf
     tfidf.fit(fit_train_key_words, train_class_label)
     weight_matrix = tfidf.transform(fit_key_words, class_label)
     print "Cal Weight Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
     print
     return weight_matrix
 def fit_data(self, datas):
     """Return a hashed feature matrix for ``datas``.

     Input that is already a scipy sparse matrix is passed through
     untouched; otherwise each item is unwrapped from its optional
     "sentence" wrapper and fed to the shared ``Feature_Hasher``.
     """
     if sp.issparse(datas):
         return datas
     unwrapped = [d["sentence"] if "sentence" in d else d for d in datas]
     return Feature_Hasher.transform(unwrapped)
 def _collect(self, splited_words_list, sentence_size):
     print "Collection datas: ", time.strftime('%Y-%m-%d %H:%M:%S')
     data = [d.get("sentence") for d in splited_words_list[: sentence_size]]
     class_label = [d.get("emotion-1-type") for d in splited_words_list[: sentence_size]]
     fit_data = Feature_Hasher.transform(data)
     tfidf = TfidfTransformer()
     tfidf.fit(fit_data)
     a = tfidf.transform(fit_data)
     print "Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
     return a, class_label, []
 def _collect(self, splited_words_list, sentence_size):
     print "Collection datas: ", time.strftime('%Y-%m-%d %H:%M:%S')
     data = [d.get("sentence") for d in splited_words_list[:sentence_size]]
     class_label = [
         d.get("emotion-1-type") for d in splited_words_list[:sentence_size]
     ]
     fit_data = Feature_Hasher.transform(data)
     tfidf = TfidfTransformer()
     tfidf.fit(fit_data)
     a = tfidf.transform(fit_data)
     print "Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
     return a, class_label, []
 def fit_data(self, datas):
     """Hash ``datas`` into a feature matrix, unless it is already sparse.

     Each non-sparse item may be wrapped as ``{"sentence": {...}}``; the
     wrapper is stripped before hashing.  Sparse input is returned as-is.
     """
     if not sp.issparse(datas):
         stripped = []
         for item in datas:
             stripped.append(item.get("sentence", item))
         return Feature_Hasher.transform(stripped)
     return datas