def cal_weight_improve(self, key_words, class_label): """ 计算获取特征词后的权重信息 :param key_words: [{'sentence': {}}, ...] or [{}, ...] 有可能是测试集数据有可能是训练集数据 :return: """ print "Cal Improve Weight: ", time.strftime('%Y-%m-%d %H:%M:%S') if not self.istrain: dir_ = os.path.join(TEXT_OUT, "key_words") filename = self.__class__.__name__ + ".txt" if self.subjective else self.__class__.__name__ + "_objective.txt" url = os.path.join(dir_, filename) train_key_words = FileUtil.read(url) train_class_label = [d.get("emotion-1-type") for d in train_key_words] else: train_key_words = key_words train_class_label = class_label train_key_words = [d.get("sentence") if "sentence" in d else d for d in train_key_words] key_words = [d.get("sentence") if "sentence" in d else d for d in key_words] # 获得 tf key_words = [{k: v / sum(d.values()) for k, v in d.items()} for d in key_words] fit_train_key_words = Feature_Hasher.transform(train_key_words) fit_key_words = Feature_Hasher.transform(key_words) tfidf = TfidfImprove() # 训练 idf tfidf.fit(fit_train_key_words, train_class_label) weight_matrix = tfidf.transform(fit_key_words, class_label) print "Cal Weight Done: ", time.strftime('%Y-%m-%d %H:%M:%S') print return weight_matrix
def fit_data(self, datas):
    """Hash-vectorize raw samples; pass already-sparse input through as-is."""
    if sp.issparse(datas):
        return datas
    sentences = [d["sentence"] if "sentence" in d else d for d in datas]
    return Feature_Hasher.transform(sentences)
def _collect(self, splited_words_list, sentence_size): print "Collection datas: ", time.strftime('%Y-%m-%d %H:%M:%S') data = [d.get("sentence") for d in splited_words_list[: sentence_size]] class_label = [d.get("emotion-1-type") for d in splited_words_list[: sentence_size]] fit_data = Feature_Hasher.transform(data) tfidf = TfidfTransformer() tfidf.fit(fit_data) a = tfidf.transform(fit_data) print "Done: ", time.strftime('%Y-%m-%d %H:%M:%S') return a, class_label, []
def _collect(self, splited_words_list, sentence_size): print "Collection datas: ", time.strftime('%Y-%m-%d %H:%M:%S') data = [d.get("sentence") for d in splited_words_list[:sentence_size]] class_label = [ d.get("emotion-1-type") for d in splited_words_list[:sentence_size] ] fit_data = Feature_Hasher.transform(data) tfidf = TfidfTransformer() tfidf.fit(fit_data) a = tfidf.transform(fit_data) print "Done: ", time.strftime('%Y-%m-%d %H:%M:%S') return a, class_label, []
def fit_data(self, datas):
    """Return a hashed feature matrix for *datas*; sparse input is returned unchanged."""
    result = datas
    if not sp.issparse(datas):
        # Unwrap the 'sentence' dict when present; the element itself is
        # already the term-count mapping otherwise.
        extracted = []
        for d in datas:
            extracted.append(d.get("sentence", d))
        result = Feature_Hasher.transform(extracted)
    return result