Code example #1
        """
        if len(c_true) != len(c_pred):
            raise ValueError("the two lists have different sizes!")

        # drop every (true, predicted) pair whose prediction is the "unknow"
        # label, then unzip the survivors back into two aligned sequences
        l = filter(lambda x: x[1] != "unknow", zip(c_true, c_pred))
        temp = zip(*l)
        return temp[0], temp[1]

if __name__ == "__main__":
    print
    # load the emotion classification dataset
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # build a dataset suitable for the Bayes classifier
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    clf = Classification()
    crossvalidate = False
    # if skipping cross-validation, remember to adjust the train split ratio loaded in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf.cross_validation(train, class_label, score="recall")
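
The helper fragment at the top of this example pairs c_true with c_pred, keeps only the pairs whose prediction is not the "unknow" label, and unzips the survivors into two aligned sequences. A minimal standalone sketch of the same idea (Python 2, as in the original; the function name here is made up for illustration):

def drop_unknown_predictions(c_true, c_pred):
    if len(c_true) != len(c_pred):
        raise ValueError("the two lists have different sizes!")
    pairs = filter(lambda x: x[1] != "unknow", zip(c_true, c_pred))
    kept_true, kept_pred = zip(*pairs)
    return kept_true, kept_pred

# drop_unknown_predictions(["joy", "fear", "anger"], ["joy", "unknow", "sadness"])
# -> (("joy", "anger"), ("joy", "sadness"))

Note that under Python 3 both filter and zip return iterators, so the filtered pairs would need to be materialized with list(...) before zip(*pairs); the snippets on this page assume Python 2.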
Code example #2
        copy_feature_count = self.feature_count_.copy()
        # view of the row at `index` (slicing keeps the 2-D shape)
        correct_row = copy_feature_count[index: index + 1, :]
#        l = [sentence]
#        fit_sentence = Feature_Hasher.transform(l).toarray()
        b_matrix = sentence.toarray()
        # exponent 1 leaves values unchanged, so np.power(..., 1, out) acts as an
        # in-place copy: write correct_row + b_matrix back through the row view
        np.power(correct_row + b_matrix, 1, correct_row)
        # same trick: copy the updated matrix into `out` (defined outside this fragment)
        np.power(copy_feature_count, 1, out)
        return copy_feature_count

if __name__ == "__main__":
    # load the emotion classification dataset
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # build a dataset suitable for the Bayes classifier
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    crossvalidate = False
    # if skipping cross-validation, remember to adjust the train split ratio loaded in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf0 = Classification()
            clf0.cross_validation(train, class_label, score="recall")
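
The two np.power(..., 1, ...) calls in this example rely on the out argument of NumPy ufuncs: raising to the power 1 leaves the values unchanged, so the call behaves as an in-place copy into its third argument. A small self-contained sketch of the idiom (arrays made up for illustration):

import numpy as np

counts = np.zeros((2, 3))
row = counts[0:1, :]             # a view of the first row, shape (1, 3)
delta = np.array([[1.0, 2.0, 3.0]])
np.power(row + delta, 1, row)    # exponent 1: effectively copies row + delta into the view
# counts now holds [1., 2., 3.] in its first row and zeros in the second

The same update could be written more directly as row += delta (or np.add(row, delta, row)); the power-of-one form simply reuses the ufunc's out parameter.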
Code example #3
    majority = np.max(label_count)
    for i in minority_index:
        # oversampling amount for borderline-SMOTE, as a percentage of the
        # existing samples of minority class i
        N = (int((majority * 1.0 / (1 - per) - majority) / label_count[i]) - 1) * 100
        safe, synthetic, danger = _smote._borderlineSMOTE(X, Y, unique_label[i], N, k=5)
        syn_label = np.array([unique_label[i]] * synthetic.shape[0])
        # stack the synthetic samples and their labels onto the training set
        X = sp.vstack([X, synthetic])
        Y = np.concatenate([Y, syn_label])

    return X, Y

if __name__ == "__main__":
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    data_smote, target_smote = my_smote(train, class_label, minority_target=["fear", "surprise"])

    print np.sum([target_smote == "fear"])
    print np.sum([target_smote == "surprise"])

#    data = load_creditscoring1()
#    print len((data.target == 0).nonzero()[0])
#    print len((data.target == 1).nonzero()[0])
#    data_smote, target_smote = smote(data.data, data.target, per=0.7)
#    print len((target_smote == 0).nonzero()[0])
#    print len((target_smote == 1).nonzero()[0])
#    print
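
In the loop above, N is the oversampling amount handed to _borderlineSMOTE, apparently expressed (per the usual SMOTE convention) as a percentage of the existing minority samples; the majority / (1 - per) term suggests each minority class is boosted to roughly per / (1 - per) times the majority count. A rough worked check with made-up counts (a sketch; per is a parameter of the enclosing my_smote, not shown in this fragment):

# made-up counts, assuming per plays the role suggested by the commented smote(..., per=0.7) call
majority, minority_count, per = 1000, 100, 0.5
N = (int((majority * 1.0 / (1 - per) - majority) / minority_count) - 1) * 100
# N == 900 -> 900% oversampling, i.e. 9 synthetic points per original sample,
# bringing the minority class from 100 up to roughly 1000 (the majority count)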