""" if len(c_true) != len(c_pred): raise ValueError("the two lists have different size!") l = filter(lambda x: x[1] != "unknow", zip(c_true, c_pred)) temp = zip(*l) return temp[0], temp[1] if __name__ == "__main__": print # 加载情绪分类数据集 feature = CHIFeature() train_datas, class_label, _ = feature.get_key_words() train = train_datas if not sp.issparse(train_datas): train = feature.cal_weight_improve(train_datas, class_label) test = Load.load_test_balance() test_datas, test_label, _ = feature.get_key_words(test) test = test_datas # 构建适合 bayes 分类的数据集 if not sp.issparse(train_datas): test = feature.cal_weight_improve(test_datas, test_label) clf = Classification() crossvalidate = False # 若不交叉验证 记得修改 load_sample.py 中加载 train 的比例 if crossvalidate: out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt") if not FileUtil.isexist(out) or FileUtil.isempty(out): clf.cross_validation(train, class_label, score="recall")
def update_feature_count(self, index, sentence, out):
    """Return a copy of ``self.feature_count_`` with ``sentence`` folded into row ``index``.

    NOTE(review): the original method header was lost when this file was
    flattened onto a single physical line; the name and signature here are
    inferred from the surviving body. The method belongs to a classifier
    class (it reads ``self.feature_count_``) -- confirm placement and name
    against the original source.

    :param index: row of the per-class feature-count matrix to update
    :param sentence: sparse feature vector (anything with ``.toarray()``)
    :param out: preallocated array that also receives a copy of the result
    :return: the updated copy; ``self.feature_count_`` itself is untouched
    """
    copy_feature_count = self.feature_count_.copy()
    # Slicing with index:index+1 keeps a 2-D (1, n) view into the copy, so
    # writing through it mutates copy_feature_count in place.
    correct_row = copy_feature_count[index: index + 1, :]
    b_matrix = sentence.toarray()
    # np.power(x, 1, dst) is used as an in-place copy: it writes x ** 1
    # into dst. The first call adds the sentence counts into the selected
    # row; the second mirrors the whole updated matrix into ``out``.
    np.power(correct_row + b_matrix, 1, correct_row)
    np.power(copy_feature_count, 1, out)
    return copy_feature_count


if __name__ == "__main__":
    # Load the emotion classification dataset.
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # Build a dataset suited to the bayes classifier.
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    crossvalidate = False
    # When not cross-validating, remember to adjust the train ratio loaded
    # in load_sample.py.
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf0 = Classification()
            clf0.cross_validation(train, class_label, score="recall")
def my_smote(X, Y, per=0.5, minority_target=None):
    """Oversample the minority classes of (X, Y) with borderline-SMOTE.

    NOTE(review): the head of this function was lost when the file was
    flattened onto a single physical line. The derivation of
    ``unique_label`` / ``label_count`` / ``minority_index`` below is
    reconstructed from how the surviving tail uses them, and the ``per``
    default is likewise inferred -- confirm both against the original
    source.

    :param X: sparse sample matrix
    :param Y: 1-D array-like of class labels, one per row of X
    :param per: target fraction the minority class should reach after
        oversampling (drives the N percentage handed to borderline-SMOTE)
    :param minority_target: labels to oversample; when None, only the
        rarest label is oversampled
    :return: (X, Y) with synthetic minority rows stacked on the end
    """
    # --- reconstructed head (see NOTE above) ---
    unique_label = np.unique(Y)
    label_count = np.array([np.sum(Y == label) for label in unique_label])
    if minority_target is None:
        minority_index = [int(np.argmin(label_count))]
    else:
        minority_index = [i for i, label in enumerate(unique_label)
                          if label in minority_target]
    # --- surviving tail ---
    majority = np.max(label_count)
    for i in minority_index:
        # _borderlineSMOTE takes N as a percentage (a multiple of 100):
        # grow class i until it is roughly `per` of the rebalanced set.
        N = (int((majority * 1.0 / (1 - per) - majority) / label_count[i]) - 1) * 100
        safe, synthetic, danger = _smote._borderlineSMOTE(X, Y, unique_label[i], N, k=5)
        syn_label = np.array([unique_label[i]] * synthetic.shape[0])
        X = sp.vstack([X, synthetic])
        Y = np.concatenate([Y, syn_label])
    return X, Y


if __name__ == "__main__":
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    data_smote, target_smote = my_smote(train, class_label,
                                        minority_target=["fear", "surprise"])
    print(np.sum(target_smote == "fear"))
    print(np.sum(target_smote == "surprise"))