Example #1
    def get_classificator(self):
        sentences = self.read_train(ImageClassification.image_train_path)
        if not sentences:
            # read the Weibo data
            sentences = collect.read_weibo(ImageClassification.weibo_path,
                                           isreadimg=True)
            pure_sentences = [
                sentence.get("sentence") for sentence in sentences
            ]

            # predict
            c_pred = self.__classifict(CHIFeature(), pure_sentences, incr=True)

            # reconstruct sentences
            sentences = self.__reconstruct(sentences, c_pred)

            # save
            self.__save_result(sentences)

        texts, imgs, labels = self.split(sentences)
        img_feature = self.__get_feature_from_img(imgs)
        self.nn.get_classificator(img_feature, labels)
        return self
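Example #1 follows a cache-or-bootstrap pattern: reuse saved pseudo-labelled training data if it exists, otherwise label raw Weibo posts with the text classifier and persist the result before training the image network. A minimal self-contained sketch of that pattern (all names here are hypothetical, not the snippet's API):

import os
import json

def load_or_bootstrap(cache_path, fetch_raw, pseudo_label):
    """Return cached labelled data, or label the raw data once and cache it."""
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            return json.load(f)
    raw = fetch_raw()             # e.g. collect.read_weibo(..., isreadimg=True)
    labelled = pseudo_label(raw)  # e.g. predictions from a CHIFeature classifier
    with open(cache_path, "w") as f:
        json.dump(labelled, f)
    return labelled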
Example #2
                        """<weibo emotion-type="%s">
    <sentence emotion-1-type="%s" emotion-2-type="none" emotion-tag="%s">
        %s
    </sentence>
</weibo>
""" % ("None", "None", "N", s + "\n Can't recognize because it has insufficient key_words"))

    else:
        print(c_pred)

if __name__ == "__main__":
    if False:
        for i in range(10):
            collect.collect_weibo()

    if True:
        feature = CHIFeature()
        path = "collect"
        sentences = collect.read_weibo(path)
        sentences = [s.get("sentence") for s in sentences]
        classifict(feature, sentences, incr=True, out=True)

#        test_classification(feature, incr=True)

    if False:
        test_classification(CHIFeature(subjective=False))

#    s1 = "寂寞人生爱无休,寂寞是爱永远的主题、我和我的影子独处、它说它有悄悄话想跟我说、" \
#         "它说它很想念你,原来我和我的影子,都在想你。"
#    classifict(CHIFeature(), [s1,s1], out=True)
#    print()
#    print()
Example #3
        minority_index = [np.argmin(label_count)]
    else:
        minority_index = [unique_label.index(target) for target in minority_target]

    majority = np.max(label_count)
    for i in minority_index:
        N = (int((majority * 1.0 / (1 - per) - majority) / label_count[i]) - 1) * 100
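        # N is the oversampling rate (in percent) handed to borderline-SMOTE.
        # Worked example: majority == 1000, label_count[i] == 100, per == 0.5
        # gives majority / (1 - per) - majority == 1000 needed samples, so
        # N == (1000 / 100 - 1) * 100 == 900: nine synthetic samples per real
        # minority sample, leaving the minority at a fraction per of the total.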
        safe, synthetic, danger = _smote._borderlineSMOTE(X, Y, unique_label[i], N, k=5)
        syn_label = np.array([unique_label[i]] * synthetic.shape[0])
        X = sp.vstack([X, synthetic])
        Y = np.concatenate([Y, syn_label])

    return X, Y

if __name__ == "__main__":
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    data_smote, target_smote = my_smote(train, class_label, minority_target=["fear", "surprise"])

    print(np.sum(target_smote == "fear"))
    print(np.sum(target_smote == "surprise"))

#    data = load_creditscoring1()
#    print(len((data.target == 0).nonzero()[0]))
#    print(len((data.target == 1).nonzero()[0]))
#    data_smote, target_smote = smote(data.data, data.target, per=0.7)
#    print(len((target_smote == 0).nonzero()[0]))
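If the private _smote._borderlineSMOTE helper is unavailable, the maintained imbalanced-learn package ships the same algorithm. A rough equivalent (the parameter mapping onto my_smote's per/minority_target semantics is an assumption):

from imblearn.over_sampling import BorderlineSMOTE

# fit_resample accepts the same sparse matrix / label array as my_smote
smote = BorderlineSMOTE(k_neighbors=5, random_state=0)
data_smote, target_smote = smote.fit_resample(train, class_label)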
Example #4
        :return:
        """
        index = np.where(self.classes_ == c_pred)[0][0]
        # get the feature-count data for the class corresponding to c_pred
        copy_feature_count = self.feature_count_.copy()
        correct_row = copy_feature_count[index: index + 1, :]
#        l = [sentence]
#        fit_sentence = Feature_Hasher.transform(l).toarray()
        b_matrix = sentence.toarray()
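        # np.power(a, 1, out) evaluates a ** 1 and stores it in `out`, i.e. an
        # in-place copy: the first call folds the sentence's term counts into
        # the selected class row (a view into copy_feature_count), the second
        # mirrors the updated matrix into `out`.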
        np.power(correct_row + b_matrix, 1, correct_row)
        np.power(copy_feature_count, 1, out)
        return copy_feature_count

if __name__ == "__main__":
    # load the emotion classification dataset
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # build a dataset suitable for Bayes classification
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    crossvalidate = False
    # if not cross-validating, remember to adjust the train loading ratio in load_sample.py
    if crossvalidate:
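The np.power(x, 1, out) idiom used in Example #4 is just a copy into a preallocated array; a tiny standalone check in plain NumPy:

import numpy as np

src = np.array([[1.0, 2.0], [3.0, 4.0]])
dst = np.empty_like(src)
np.power(src, 1, dst)   # writes src ** 1 into dst
assert np.array_equal(src, dst)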