    def get_classificator(self):
        sentences = self.read_train(ImageClassification.image_train_path)
        if not sentences:
            # no cached training set: read the raw weibo data (with images)
            sentences = collect.read_weibo(ImageClassification.weibo_path, isreadimg=True)
            pure_sentences = [sentence.get("sentence") for sentence in sentences]
            # predict emotion labels for the texts with the text classifier
            c_pred = self.__classifict(CHIFeature(), pure_sentences, incr=True)
            # attach the predicted labels back onto the weibo records
            sentences = self.__reconstruct(sentences, c_pred)
            # cache the labeled training set for next time
            self.__save_result(sentences)
        texts, imgs, labels = self.split(sentences)
        img_feature = self.__get_feature_from_img(imgs)
        self.nn.get_classificator(img_feature, labels)
        return self
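# A minimal stand-alone sketch of the reconstruct step used above. The real
# __reconstruct is not shown in this file; this is only a guess at its shape,
# using plain dicts in place of the project's weibo records (every field name
# except "sentence" is an assumption made for illustration):
def _demo_reconstruct(records, predictions):
    # pair each record with its predicted emotion and attach the label
    for record, label in zip(records, predictions):
        record["emotion-type"] = label
    return records

# _demo_reconstruct([{"sentence": "demo text"}], ["happiness"])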
"""<weibo emotion-type="%s"> <sentence emotion-1-type="%s" emotion-2-type="none" emotion-tag="%s"> %s </sentence> </weibo> """ % ("None", "None", "N", s + "\n Can't recognize because it has insufficient key_words")) else: print c_pred if __name__ == "__main__": if False: [collect.collect_weibo() for i in range(10)] if True: feature = CHIFeature() path = "collect" sentences = collect.read_weibo(path) sentences = [s.get("sentence") for s in sentences] classifict(feature, sentences, incr=True, out=True) # test_classification(feature, incr=True) if False: test_classification(CHIFeature(subjective=False)) # s1 = "寂寞人生爱无休,寂寞是爱永远的主题、我和我的影子独处、它说它有悄悄话想跟我说、" \ # "它说它很想念你,原来我和我的影子,都在想你。" # classifict(CHIFeature(), [s1,s1], out=True) # print # print
        minority_index = [np.argmin(label_count)]
    else:
        minority_index = [unique_label.index(target) for target in minority_target]
    majority = np.max(label_count)
    for i in minority_index:
        # oversampling percentage that brings class i up to roughly the
        # size implied by per relative to the majority class
        N = (int((majority * 1.0 / (1 - per) - majority) / label_count[i]) - 1) * 100
        safe, synthetic, danger = _smote._borderlineSMOTE(X, Y, unique_label[i], N, k=5)
        syn_label = np.array([unique_label[i]] * synthetic.shape[0])
        X = sp.vstack([X, synthetic])
        Y = np.concatenate([Y, syn_label])
    return X, Y


if __name__ == "__main__":
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    data_smote, target_smote = my_smote(train, class_label,
                                        minority_target=["fear", "surprise"])
    print(np.sum(target_smote == "fear"))
    print(np.sum(target_smote == "surprise"))

    # data = load_creditscoring1()
    # print(len((data.target == 0).nonzero()[0]))
    # print(len((data.target == 1).nonzero()[0]))
    # data_smote, target_smote = smote(data.data, data.target, per=0.7)
    # print(len((target_smote == 0).nonzero()[0]))
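# Sketch: how the oversampling amount N above behaves, assuming (as the
# formula suggests) that _borderlineSMOTE treats N as a SMOTE percentage,
# i.e. N/100 synthetic samples per minority sample. A concrete check with
# made-up class counts and the default-looking per = 0.5:
majority, minority, per = 1000, 100, 0.5
N = (int((majority * 1.0 / (1 - per) - majority) / minority) - 1) * 100
print(N)                          # -> 900: nine synthetic per real sample
print(minority * (1 + N / 100))   # -> 1000: minority now on par with majority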
:return: """ index = np.where(self.classes_ == c_pred)[0][0] # 获得与 c_pred 相对应的那一类的数据 copy_feature_count = self.feature_count_.copy() correct_row = copy_feature_count[index: index + 1, :] # l = [sentence] # fit_sentence = Feature_Hasher.transform(l).toarray() b_matrix = sentence.toarray() np.power(correct_row + b_matrix, 1, correct_row) np.power(copy_feature_count, 1, out) return copy_feature_count if __name__ == "__main__": # 加载情绪分类数据集 feature = CHIFeature() train_datas, class_label, _ = feature.get_key_words() train = train_datas if not sp.issparse(train_datas): train = feature.cal_weight_improve(train_datas, class_label) test = Load.load_test_balance() test_datas, test_label, _ = feature.get_key_words(test) test = test_datas # 构建适合 bayes 分类的数据集 if not sp.issparse(test_datas): test = feature.cal_weight_improve(test_datas, test_label) crossvalidate = False # 若不交叉验证 记得修改 load_sample.py 中加载 train 的比例 if crossvalidate: