def serializeSVMModel():
    """Train an SVM on the bank data, print its held-out accuracy and
    persist the model (support vectors, labels, alphas, bias, kernel
    tuple) plus that accuracy via shelve.
    """
    # Prefer the pre-formatted cached data; fall back to the raw loader.
    try:
        dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel(
            "bank-addtional-format-svm")
    except Exception:
        dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional")
    # Balance the classes (positive label 1, negative label -1).
    dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, 1, -1)
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    kTup = ("lin", 1.3)
    alphas, b = SVMLib.realSMO(trainSet, trainLabel, 0.6, 0.01, kTup, 10)
    sv, svl = SVMLib.getSupportVectorandSupportLabel(trainSet, trainLabel,
                                                     alphas)
    # Count misclassifications on the held-out split.
    errorCount = sum(
        1 for data, label in zip(testSet, testLabel)
        if SVMLib.predictLabel(data, sv, svl, alphas, b, kTup) != label)
    ratio = errorCount / len(testLabel)
    print("the error ratio is %.3f, the correct ratio is %.3f" %
          (ratio, 1 - ratio))
    # Persist the trained model and its accuracy next to the script.
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    db['SVMModel'] = [sv, svl, alphas, b, kTup]
    db['SVMModelCorrectRatio'] = 1 - ratio
    db.close()
def loadDataSet(filename):
    """Load the RF/DT-format data set and balance its classes by
    undersampling ("yes" vs "no" labels)."""
    print("Loading data...")
    samples, labels = DataUtil.loadDataForRMOrDTModel(filename)
    print("Loaded data!")
    print("Undersampling data...")
    samples, labels = DataUtil.underSampling(samples, labels, "yes", "no")
    print("Undersampled data!")
    return samples, labels
Esempio n. 3
0
def main(visualize=True):
    """Fit a CART tree on example_data.csv, optionally render it, then
    evaluate on the training data and classify one hand-written sample.

    :param visualize: when True, draw the fitted tree via tree.view().
    """
    data_obj = DataUtil('example_data.csv')
    x, y = data_obj.get_dataSet()
    fit_time = time.time()
    # Four features, all treated as discrete.
    tree = CartTree(whether_continuous=[False] * 4)
    tree.fit(x, y, train_only=True)
    fit_time = time.time() - fit_time  # elapsed fitting time (seconds)
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x, y)
    # One hand-built sample (values in Chinese: purple / small / child / stomp).
    x2 = ['紫色', '小', '小孩', '用脚踩']
    tree.evaluat2(x2)  # NOTE(review): "evaluat2" looks like a typo for "evaluate2" — confirm against CartTree
Esempio n. 4
0
 def feed_data(self, x, y, sample_weight=None):
     """Quantize the raw data and distribute it between the discrete
     (multinomial) and continuous (Gaussian) sub-models.

     :param x: raw feature matrix
     :param y: raw label vector
     :param sample_weight: optional per-sample weights
     """
     if sample_weight is not None:
         sample_weight = np.array(sample_weight)
     # separate=True makes quantize_data return x split into
     # (discrete part, continuous part).
     x, y, wc, features, feat_dics, label_dic = DataUtil.quantize_data(
         x, y, wc=self._whether_continuous, separate=True)
     if self._whether_continuous is None:
         self._whether_continuous = wc
         # ~ flips the boolean mask: discrete == not continuous.
         self._whether_discrete = ~self._whether_continuous
     self._label_dic = label_dic
     discrete_x, continuous_x = x
     cat_counter = np.bincount(y)
     self._cat_counter = cat_counter
     # One boolean mask per category value.
     labels = [y == value for value in range(len(cat_counter))]
     # Train the discrete (multinomial) naive Bayes part
     # NOTE(review): assumes discrete_x supports boolean-mask indexing
     # (i.e. is a numpy array) — confirm against quantize_data.
     labelled_x = [discrete_x[ci].T for ci in labels]
     self._multinomial._x, self._multinomial._y = x, y
     self._multinomial._labelled_x, self._multinomial._label_zip = (
         labelled_x, list(zip(labels, labelled_x)))
     self._multinomial._cat_counter = cat_counter
     # Keep only the dictionaries/possibility counts of discrete features.
     self._multinomial._feat_dics = [
         _dic for i, _dic in enumerate(feat_dics)
         if self._whether_discrete[i]
     ]
     self._multinomial._n_possibilities = [
         len(feats) for i, feats in enumerate(features)
         if self._whether_discrete[i]
     ]
     self._multinomial._label_dic = label_dic
     # Train the continuous (Gaussian) naive Bayes part
     labelled_x = [continuous_x[label].T for label in labels]
     self._gaussian._x, self._gaussian._y = continuous_x.T, y
     self._gaussian._labelled_x, self._gaussian._label_zip = labelled_x, labels
     self._gaussian._cat_counter, self._gaussian._label_dic = cat_counter, label_dic
     # Handle sample weights
     self.feed_sample_weight(sample_weight)
def testRFModel(dataSet, labelSet, T=20):
    """Train a random forest of T trees on a fresh split of the data and
    print the error/correct ratio on the held-out part.

    :return: the error ratio on the test split
    """
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    forest = RFLib.generateRandomForest(trainSet, trainLabel, T)
    # Count misclassifications on the held-out split.
    mistakes = sum(
        1 for data, label in zip(testSet, testLabel)
        if RFLib.predictByRandomForest(forest, data) != label)
    RFratio = float(mistakes) / len(testLabel)
    print("RF:total error ratio is %.3f, correct ratio is %.3f" %
          (RFratio, 1 - RFratio))
    return RFratio
def serializeLRModel():
    """Train a logistic-regression model on the bank data, print its
    held-out accuracy and persist the weight vector via shelve.
    """
    # Prefer the pre-formatted cached data; fall back to the raw loader.
    try:
        dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel(
            "bank-addtional-format-lr")
    except Exception:
        dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional")
    # Balance the classes (positive label 1, negative label 0).
    dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, 1, 0)
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    weight, logList = LRLib.stocGradDescent(trainSet, trainLabel)
    # Count misclassifications on the held-out split.
    errorCount = sum(
        1 for data, label in zip(testSet, testLabel)
        if LRLib.classifyVector(data, weight) != label)
    ratio = errorCount / len(testLabel)
    print("the error ratio is %.3f, the correct ratio is %.3f" %
          (ratio, 1 - ratio))
    # Persist the weights and the achieved accuracy next to the script.
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    db["LRModel"] = weight
    db["LRModelCorrectRatio"] = 1 - ratio
    db.close()
Esempio n. 7
0
def testRFModel(filename="bank-additional"):
    """Load the persisted random-forest model and report its error and
    accuracy on the data set named *filename*."""
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    maxCorrectRatio = db["RFModelCorrectRatio"]
    model = db["RFModel"]
    db.close()
    dataSet, labelSet = DataUtil.loadDataForRMOrDTModel(filename)
    # Count misclassifications over the full data set.
    error = sum(
        1 for data, label in zip(dataSet, labelSet)
        if RFLib.predictByRandomForest(model, data) != label)
    errorRatio = error / len(dataSet)
    print(
        "RF:error ratio:%.3f, correct ratio:%.3f, correct ratio on trainSet:%.3f"
        % (errorRatio, 1 - errorRatio, maxCorrectRatio))
Esempio n. 8
0
def testLRModel(filename="bank-additional"):
    """Load the persisted logistic-regression weights and report their
    error and accuracy on the data set named *filename*."""
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    maxCorrectRatio = db["LRModelCorrectRatio"]
    weight = db["LRModel"]
    db.close()
    #     dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel("bank-addtional-format-lr")
    dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel(filename, "lr")
    # Count misclassifications over the full data set.
    error = sum(
        1 for data, label in zip(dataSet, labelSet)
        if LRLib.classifyVector(data, weight) != label)
    errorRatio = error / len(dataSet)
    print(
        "LR:error ratio:%.3f, correct ratio:%.3f, correct ratio on trainSet:%.3f"
        % (errorRatio, 1 - errorRatio, maxCorrectRatio))
Esempio n. 9
0
def testSVMModel(filename="bank-additional"):
    """Load the persisted SVM model and report its error and accuracy on
    the data set named *filename*."""
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    maxCorrectRatio = db["SVMModelCorrectRatio"]
    model = db["SVMModel"]
    db.close()
    #     dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel("bank-addtional-format-svm")
    dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel(filename, "svm")
    # Count misclassifications over the full data set.
    error = sum(
        1 for data, label in zip(dataSet, labelSet)
        if SVMLib.predictLabel(data, *model) != label)
    errorRatio = error / len(dataSet)
    print(
        "SVM:error ratio:%.3f, correct ratio:%.3f, correct ratio on trainSet:%.3f"
        % (errorRatio, 1 - errorRatio, maxCorrectRatio))
def serializeDTModel():
    """Train 100 decision trees on independent random splits and persist
    the best-performing one (by held-out accuracy) via shelve."""
    dataSet, labelSet = loadDataSet("bank-additional")
    tmp_lst = []
    maxRatio = 0
    finalModel = {}
    for _ in range(100):
        trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
            dataSet, labelSet)
        model = DTLib.createDecisionTree(trainSet, trainLabel)
        correct = 1 - DTLib.testDTModel(testSet, testLabel, model)
        tmp_lst.append(correct)
        # Keep the model with the best accuracy seen so far.
        if correct > maxRatio:
            maxRatio = correct
            finalModel = model
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    db["DTModel"] = finalModel
    db["DTModelCorrectRatio"] = maxRatio
    db.close()
Esempio n. 11
0
def main():
    """Fit an ID3 tree on generated XOR data, time the fit, and plot the
    resulting 2D decision boundary.
    """
    # (earlier mushroom-data experiment kept for reference)
    # _data, _x, _y = [], [], []
    # with open("Data/data.txt", "r") as file:
    #     for line in file:
    #         _data.append(line.strip().split(","))
    # np.random.shuffle(_data)
    # for line in _data:
    #     _y.append(line.pop(0))
    #     _x.append(line)
    # _x, _y = np.array(_x), np.array(_y)
    # train_num = 5000
    # x_train = _x[:train_num]
    # y_train = _y[:train_num]
    # x_test = _x[train_num:]
    # y_test = _y[train_num:]
    # _fit_time = time.time()
    # _tree = CartTree()
    # _tree.fit(x_train, y_train)
    # _fit_time = time.time() - _fit_time
    # _tree.view()
    # _estimate_time = time.time()
    # _tree.estimate(x_test, y_test)
    # _estimate_time = time.time() - _estimate_time
    # print("Fit      Process : {:8.6} s\n"
    #       "Estimate Process : {:8.6} s".format(_fit_time, _estimate_time))
    # _tree.visualize()

    from Util import DataUtil
    _x, _y = DataUtil.gen_xor()
    # Collapse one-hot labels to class indices.
    _y = np.argmax(_y, axis=1)
    _fit_time = time.time()
    _tree = ID3Tree()
    _tree.fit(_x, _y)
    _fit_time = time.time() - _fit_time
    # _tree.view()
    _estimate_time = time.time()
    # _tree.estimate(_x, _y)
    # NOTE: estimate is commented out, so _estimate_time is ~0.
    _estimate_time = time.time() - _estimate_time
    print("Fit      Process : {:8.6} s\n"
          "Estimate Process : {:8.6} s".format(_fit_time, _estimate_time))
    _tree.visualize2d(_x, _y)
def serializeRFModel():
    """Train 10 random forests on independent random splits and persist
    the best-performing one (by held-out accuracy) via shelve."""
    dataSet, labelSet = loadDataSet("bank-additional")
    maxRatio = 0
    finalModel = None
    for _ in range(10):
        trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
            dataSet, labelSet)
        forest = RFLib.generateRandomForest(trainSet, trainLabel, 20)
        # Count misclassifications on the held-out split.
        errorCount = sum(
            1 for data, label in zip(testSet, testLabel)
            if RFLib.predictByRandomForest(forest, data) != label)
        RFratio = float(errorCount) / len(testLabel)
        # Keep the forest with the best accuracy seen so far.
        if (1 - RFratio) > maxRatio:
            maxRatio = 1 - RFratio
            finalModel = forest
        print("RF:total error ratio is %.3f, correct ratio is %.3f" %
              (RFratio, 1 - RFratio))
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    db["RFModel"] = finalModel
    db["RFModelCorrectRatio"] = maxRatio
    db.close()
def loadDataSet(filename):
    """Load the RF/DT-format data and balance "yes"/"no" classes by
    undersampling, reporting progress on stdout."""
    print("Loading data...")
    features, targets = DataUtil.loadDataForRMOrDTModel(filename)
    print("Loaded data!")
    print("Undersampling data...")
    features, targets = DataUtil.underSampling(features, targets, "yes", "no")
    print("Undersampled data!")
    return features, targets


if __name__ == "__main__":
    # Run 100 independent decision-tree experiments and plot the accuracy
    # over runs.
    # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended monotonic replacement for elapsed-time measurement.
    start = time.perf_counter()
    dataSet, labelSet = loadDataSet("bank-additional")
    tmp_lst = []
    for i in range(100):
        # Fresh random train/test split per run.
        trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
            dataSet, labelSet)
        model = DTLib.createDecisionTree(trainSet, trainLabel)
        errorRatio = DTLib.testDTModel(testSet, testLabel, model)
        tmp_lst.append(1 - errorRatio)
    # FIX: np.float was removed in NumPy 1.24; the builtin float is the
    # documented replacement (same float64 dtype).
    y = np.array(tmp_lst, dtype=float)
    print("the avg correct ratio is %.3f, the std is %.3f" %
          (y.mean(), y.std()))
    x = np.arange(0, len(tmp_lst))
    fig = plt.figure("test")
    ax = fig.add_subplot(111)
    ax.plot(x, y)
    ax.set_ylim([0, 1])
    ax.set_ylabel("correct ratio of DT")
    ax.set_xlabel("count of exp")
    plt.show()
Esempio n. 14
0
                    x[i][d] = _feat_dics[idx][line[d]]
            if discrete:
                idx += 1
        return x

if __name__ == '__main__':
    import time
    # Features 0, 5, 9, 11, 12, 13, 14 are continuous; the other nine of
    # the 16 are discrete.
    _whether_discrete = [True] * 16
    _continuous_lst = [0, 5, 9, 11, 12, 13, 14]
    for _cl in _continuous_lst:
        _whether_discrete[_cl] = False

    train_num = 40000

    data_time = time.time()
    _data = DataUtil.get_dataset("bank1.0", "../../_Data/bank1.0.txt")
    np.random.shuffle(_data)
    train_x = _data[:train_num]
    test_x = _data[train_num:]
    # The label is the last column; pop() removes it from each row in place.
    train_y = [xx.pop() for xx in train_x]
    test_y = [xx.pop() for xx in test_x]
    data_time = time.time() - data_time

    # Fit the merged (discrete + continuous) naive Bayes model.
    learning_time = time.time()
    nb = MergedNB(_whether_discrete)
    nb.fit(train_x, train_y)
    learning_time = time.time() - learning_time

    # Score on both the training and the held-out split.
    estimation_time = time.time()
    nb.estimate(train_x, train_y)
    nb.estimate(test_x, test_y)
Esempio n. 15
0
def main(visualize=True):
    """Fit and time several decision-tree variants (CART, C4.5) on four
    data sets, printing build/estimation timings for each run.

    :param visualize: when True, render each fitted tree (and a dense 2D
        decision-boundary plot for the XOR data).
    """
    # --- 1) CART on a small test set (all four features discrete) ---
    # x, y = DataUtil.get_dataset("balloon1.0(en)", "../_Data/balloon1.0(en).txt")
    x, y = DataUtil.get_dataset("test", "../_Data/test.txt")
    fit_time = time.time()
    tree = CartTree(whether_continuous=[False] * 4)
    tree.fit(x, y, train_only=True)
    fit_time = time.time() - fit_time
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x, y)
    estimate_time = time.time() - estimate_time
    print("Model building  : {:12.6} s\n"
          "Estimation      : {:12.6} s\n"
          "Total           : {:12.6} s".format(fit_time, estimate_time,
                                               fit_time + estimate_time))
    if visualize:
        tree.visualize()

    # --- 2) C4.5 on the mushroom data (label in column 0) ---
    train_num = 6000
    (x_train,
     y_train), (x_test,
                y_test), *_ = DataUtil.get_dataset("mushroom",
                                                   "../_Data/mushroom.txt",
                                                   tar_idx=0,
                                                   n_train=train_num)
    fit_time = time.time()
    tree = C45Tree()
    tree.fit(x_train, y_train)
    fit_time = time.time() - fit_time
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x_train, y_train)
    tree.evaluate(x_test, y_test)
    estimate_time = time.time() - estimate_time
    print("Model building  : {:12.6} s\n"
          "Estimation      : {:12.6} s\n"
          "Total           : {:12.6} s".format(fit_time, estimate_time,
                                               fit_time + estimate_time))
    if visualize:
        tree.visualize()

    # --- 3) CART on generated XOR data ---
    x, y = DataUtil.gen_xor(one_hot=False)
    fit_time = time.time()
    tree = CartTree()
    tree.fit(x, y, train_only=True)
    fit_time = time.time() - fit_time
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x, y, n_cores=1)
    estimate_time = time.time() - estimate_time
    print("Model building  : {:12.6} s\n"
          "Estimation      : {:12.6} s\n"
          "Total           : {:12.6} s".format(fit_time, estimate_time,
                                               fit_time + estimate_time))
    if visualize:
        tree.visualize2d(x, y, dense=1000)
        tree.visualize()

    # --- 4) CART on the bank data (7 continuous features, quantized) ---
    wc = [False] * 16
    continuous_lst = [0, 5, 9, 11, 12, 13, 14]
    for _cl in continuous_lst:
        wc[_cl] = True

    train_num = 2000
    (x_train,
     y_train), (x_test,
                y_test), *_ = DataUtil.get_dataset("bank1.0",
                                                   "../_Data/bank1.0.txt",
                                                   n_train=train_num,
                                                   quantize=True)
    fit_time = time.time()
    tree = CartTree()
    tree.fit(x_train, y_train)
    fit_time = time.time() - fit_time
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x_test, y_test)
    estimate_time = time.time() - estimate_time
    print("Model building  : {:12.6} s\n"
          "Estimation      : {:12.6} s\n"
          "Total           : {:12.6} s".format(fit_time, estimate_time,
                                               fit_time + estimate_time))
    if visualize:
        tree.visualize()

    tree.show_timing_log()
0
def main():
    """Train and time a C4.5 tree on the mushroom data, then on bank data
    that has been numericalized through a MergedNB preprocessing pass.
    """
    # --- C4.5 on the mushroom data (label in column 0) ---
    _data = DataUtil.get_dataset("mushroom", "../_Data/mushroom.txt")
    np.random.shuffle(_data)
    _x, _y = [], []
    for line in _data:
        # pop(0) removes the label from the row in place.
        _y.append(line.pop(0))
        _x.append(line)
    _x, _y = np.array(_x), np.array(_y)
    train_num = 5000
    x_train = _x[:train_num]
    y_train = _y[:train_num]
    x_test = _x[train_num:]
    y_test = _y[train_num:]
    _fit_time = time.time()
    _tree = C45Tree()
    _tree.fit(x_train, y_train)
    _fit_time = time.time() - _fit_time
    _tree.view()
    _estimate_time = time.time()
    _tree.estimate(x_test, y_test)
    _estimate_time = time.time() - _estimate_time
    print("Fit      Process : {:8.6} s\n"
          "Estimate Process : {:8.6} s".format(_fit_time, _estimate_time))
    _tree.visualize()

    # (earlier XOR experiment kept for reference)
    # from Util import DataUtil
    # _x, _y = DataUtil.gen_xor()
    # _y = np.argmax(_y, axis=1)
    # _fit_time = time.time()
    # _tree = C45Tree()
    # _tree.fit(_x, _y)
    # _fit_time = time.time() - _fit_time
    # _tree.view()
    # _estimate_time = time.time()
    # _tree.estimate(_x, _y)
    # _estimate_time = time.time() - _estimate_time
    # print("Fit      Process : {:8.6} s\n"
    #       "Estimate Process : {:8.6} s".format(_fit_time, _estimate_time))
    # _tree.visualize2d(_x, _y)

    # --- C4.5 on bank data numericalized via a MergedNB fit ---
    # Features 0, 5, 9, 11, 12, 13, 14 are continuous; the rest discrete.
    _whether_discrete = [True] * 16
    _continuous_lst = [0, 5, 9, 11, 12, 13, 14]
    for _cl in _continuous_lst:
        _whether_discrete[_cl] = False
    _data = DataUtil.get_dataset("bank1.0", "../_Data/bank1.0.txt")
    np.random.shuffle(_data)
    # Label is the last column; pop() strips it from each row in place.
    _labels = [xx.pop() for xx in _data]
    nb = MergedNB(_whether_discrete)
    nb.fit(_data, _labels)
    # Pull the quantized discrete/continuous matrices back out of the NB
    # model and stitch them into one feature matrix.
    _dx, _cx = nb["multinomial"]["x"], nb["gaussian"]["x"]
    _labels = nb["multinomial"]["y"]
    _data = np.hstack((_dx, _cx.T))
    train_num = 1000
    x_train = _data[:train_num]
    y_train = _labels[:train_num]
    x_test = _data[train_num:]
    y_test = _labels[train_num:]
    _fit_time = time.time()
    _tree = C45Tree()
    _tree.fit(x_train, y_train)
    _fit_time = time.time() - _fit_time
    _tree.view()
    _estimate_time = time.time()
    _tree.estimate(x_test, y_test)
    _estimate_time = time.time() - _estimate_time
    print("Fit      Process : {:8.6} s\n"
          "Estimate Process : {:8.6} s".format(_fit_time, _estimate_time))
    _tree.visualize()
Esempio n. 17
0
if __name__ == '__main__':
    # (commented-out experiment variants kept for reference)
    # _x, _y = gen_random()
    # test(_x, _y, algorithm="RF", epoch=1)
    # test(_x, _y, algorithm="RF", epoch=10)
    # test(_x, _y, algorithm="RF", epoch=50)
    # test(_x, _y, algorithm="SKRandomForest")
    # test(_x, _y, epoch=1)
    # test(_x, _y, epoch=1)
    # test(_x, _y, epoch=10)
    # _x, _y = gen_xor()
    # test(_x, _y, algorithm="RF", epoch=1)
    # test(_x, _y, algorithm="RF", epoch=10)
    # test(_x, _y, algorithm="RF", epoch=1000)
    # test(_x, _y, algorithm="SKAdaBoost")
    # Spiral toy data, two classes; remap label 0 to -1
    # (presumably because the ensemble expects ±1 labels — confirm).
    _x, _y = DataUtil.gen_spiral(size=20, n=4, n_class=2, one_hot=False)
    _y[_y == 0] = -1
    # test(_x, _y, clf="SKTree", epoch=10)
    # test(_x, _y, clf="SKTree", epoch=1000)
    # test(_x, _y, algorithm="RF", epoch=10)
    test(_x, _y, algorithm="RF", epoch=30, n_cores=4)
    # test(_x, _y, algorithm="SKAdaBoost")

    # Mushroom data: label in column 0, quantized; same 0 -> -1 remap.
    train_num = 6000
    (x_train,
     y_train), (x_test, y_test), *_ = DataUtil.get_dataset("mushroom",
                                                           "data/mushroom.txt",
                                                           n_train=train_num,
                                                           quantize=True,
                                                           tar_idx=0)
    y_train[y_train == 0] = -1
Esempio n. 18
0
    def predict(self, x, get_raw_results=False, bound=None, **kwargs):
        """Predict labels by majority vote over the forest's trees
        (optionally only the first *bound* trees)."""
        if bound is None:
            trees = self._trees
        else:
            trees = self._trees[:bound]
        target = kwargs.get("target", "parallel")
        matrix = self._multi_clf(x, trees, rf_task, kwargs, target=target)
        # Majority vote per sample.
        votes = [RandomForest.most_appearance(row) for row in matrix]
        return np.array(votes)

    def evaluate(self, x, y, metrics=None, tar=0, prefix="Acc", **kwargs):
        """Evaluate via the parent class, forcing single-process dispatch
        (per-tree parallelism is handled inside predict instead)."""
        kwargs["target"] = "single"
        super(RandomForest, self).evaluate(x, y, metrics, tar, prefix, **kwargs)


if __name__ == '__main__':
    import time

    train_num = 100
    (x_train, y_train), (x_test, y_test) = DataUtil.get_dataset(
        "mushroom", "data/mushroom.txt", n_train=train_num, tar_idx=0)

    learning_time = time.time()
    forest = RandomForest()
    forest.fit(x_train, y_train)
    learning_time = time.time() - learning_time
    estimation_time = time.time()
    print(
        "===============================\n"
        "{}\n"
        "-------------------------------\n".format('mushroom'), end='\t')
    forest.evaluate(x_train, y_train)
    forest.evaluate(x_test, y_test)
    estimation_time = time.time() - estimation_time

    print(
Esempio n. 19
0
    def _transfer_x(self, x):
        """Numericalize one sample in place: map every raw feature value
        to its integer code via the feature dictionaries."""
        for idx, raw in enumerate(x):
            x[idx] = self._feat_dics[idx][raw]
        return x


if __name__ == '__main__':
    # 导入标准库 time 以计时,导入DataUtil 类以获取数据
    import time
    from Util import DataUtil

    #遍历1.0,1.5两个版本的气球数据集
    for dataset in ('balloon1.0', 'balloon1.5'):
        #读入数据
        _x, _y = DataUtil.get_dataset(name=dataset,
                                      path='Data/{}.txt'.format(dataset))
        #实例化模型并进行训练,同时记录整个过程花费的时间
        learning_time = time.time()
        nb = MultinomialNB()
        nb.fit(_x, _y)
        learning_time = time.time() - learning_time
        # 评估模型的表现,同时记录评估过程花费的时间
        estimation_time = time.time()
        nb.evluate(_x, _y)
        estimation_time = time.time() - estimation_time
        # 将记录下来的耗时输出
        print('Model buiding : {:12.6} s\n'
              'Estimation    : {:12.6} s\n'
              'Total         : {:12.6} s'.format(
                  learning_time, estimation_time,
                  learning_time + estimation_time))
Esempio n. 20
0
                        data[_j][_c, :],
                        width=0.35,
                        facecolor=colors[nb.label_dic[_c]],
                        edgecolor='white',
                        label='class: {}'.format(nb.label_dic[_c]))
            plt.xticks([i for i in range(sj + 2)],
                       [''] + [_rev_feat_dics[i] for i in range(sj)] + [''])
            plt.ylim(0, 1.0)
            plt.legend()
            plt.savefig('d{}'.format(_j + 1))


if __name__ == '__main__':
    import time
    from Util import DataUtil
    #for dataset in ('balloon1.0', 'balloon1.5'):
    dataset = 'mushroom'
    _x, _y = DataUtil.get_dataset(dataset, 'Data/{}.txt'.format(dataset))
    # Train the multinomial NB, timing the fit.
    learning_time = time.time()
    nb = MultinomialNB()
    nb.fit(_x, _y)
    learning_time = time.time() - learning_time
    # Evaluate on the training data, timing the evaluation.
    estimation_time = time.time()
    nb.evaluate(_x, _y)
    estimation_time = time.time() - estimation_time
    print('Model building : {:12.6} s\n'
          'Estimation : {:12.6} s\n'
          'Total : {:12.6} s'.format(learning_time, estimation_time,
                                     learning_time + estimation_time))
    nb.visualize()
Esempio n. 21
0
            plt.legend()
            if not save:
                plt.show()
            else:
                plt.savefig("d{}".format(j + 1))

    @staticmethod
    def _transfer_x(x):
        # Identity transfer: input is used as-is, no numericalization.
        return x


if __name__ == '__main__':
    import time

    # Quantize the mushroom data through MultinomialNB's feed_data, then
    # reuse the numericalized arrays to train a GaussianNB.
    xs, ys = DataUtil.get_dataset(
        "mushroom",
        "C:\\Users\\tangk\\PycharmProjects\Machine_Learning\\_Data\\mushroom.txt",
        tar_idx=0)
    nb = MultinomialNB()
    nb.feed_data(xs, ys)
    xs, ys = nb["x"].tolist(), nb["y"].tolist()

    # First 6000 samples train, the rest test.
    train_num = 6000
    x_train, x_test = xs[:train_num], xs[train_num:]
    y_train, y_test = ys[:train_num], ys[train_num:]

    # Train the Gaussian NB, timing the fit.
    learning_time = time.time()
    gb = GaussianNB()
    gb.fit(x_train, y_train)
    learning_time = time.time() - learning_time

    estimation_time = time.time()
Esempio n. 22
0
    hashtag_dics = {_l: hashtag_list.count(_l) for _l in hashtag_set}
    return hashtag_dics

    # fig = plt.figure()
    # plt.title(title)
    # plt.barh(hashtag_dics.keys(),hashtag_dics.values(),width = 0.35,facecolor = 'lightskyblue',edgecolor = 'white')
    # plt.show()


if __name__ == '__main__':
    t = ['#a b c', '#a #b c', '#a #b #c']
    print count_hashtag(t)
    import time
    from Util import DataUtil

    du = DataUtil()
    for dataset in ("balloon1.0(en)", "balloon1.5(en)"):
        _x, _y = du.get_dataset(dataset, "../_Data/{}.txt".format(dataset))
        learning_time = time.time()
        nb = MultinomialNB()
        nb.fit(_x, _y)
        learning_time = time.time() - learning_time

        estimation_time = time.time()
        nb.evaluate(_x, _y)
        estimation_time = time.time() - estimation_time

        print(
            "Model building : {:12.6}  s\n"
            "Estimation     : {:12.6}  s\n"
            "Total          : {:12.6}  s\n".format(
Esempio n. 23
0
        self._data = data
        def func(input_x, tar_category):
            rs = 1
            for d, xx in enumerate(input_x):
                rs *= data[d][tar_category](xx)
            return rs * p_category[tar_category]
        return func

    @staticmethod
    def _transfer_x(x):
        # Identity transfer: input is used as-is, no numericalization.
        return x

if __name__ == '__main__':
    import time
    # Load the data.
    _x, _y = DataUtil.get_dataset("name", "C:\Program Files\Git\MachineLearning\_Data\\bank2.0.txt")
    # Fit a Gaussian NB and score it on the training data.
    gnb = GaussianNB()
    gnb.fit(_x, _y)
    gnb.evaluate(_x, _y)
    # (alternative MultinomialNB experiment kept for reference)
    # nb = MultinomialNB()
    # nb.feed_data(_x, _y)
    # xs, ys = nb["x"].tolist(), nb["y"].tolist()
    # train_num = 6000
    # x_train, x_test = xs[:train_num], xs[train_num:]
    # y_train, y_test = ys[:train_num], ys[train_num:]
    # nb.fit(x_train, y_train)
    # nb.evaluate(x_train, y_train)
    # nb.evaluate(x_test, y_test)
    # gnb = GaussianNB()
    # gnb.fit(x_train, y_train)
    # gnb.evaluate(x_train, y_train)
Esempio n. 24
0
    #     "Estimation      : {:12.6} s\n"
    #     "Total           : {:12.6} s".format(
    #         learning_time, estimation_time,
    #         learning_time + estimation_time
    #     )
    # )

    whether_continuous = [False] * 16
    continuous_lst = [0, 5, 9, 11, 12, 13, 14]
    for cl in continuous_lst:
        whether_continuous[cl] = True

    train_num = 40000
    data_time = time.time()
    (x_train, y_train), (x_test, y_test) = DataUtil.get_dataset(
        "bank1.0",
        "C:/Users/tangk/Desktop/MachineLearning-master/MachineLearning-master/_Data/bank1.0.txt",
        n_train=train_num)
    data_time = time.time() - data_time
    learning_time = time.time()
    nb = MergedNB(whether_continuous=whether_continuous)
    nb.fit(x_train, y_train)
    learning_time = time.time() - learning_time
    estimation_time = time.time()
    nb.evaluate(x_train, y_train)
    nb.evaluate(x_test, y_test)
    estimation_time = time.time() - estimation_time
    print("Data cleaning   : {:12.6} s\n"
          "Model building  : {:12.6} s\n"
          "Estimation      : {:12.6} s\n"
          "Total           : {:12.6} s".format(
              data_time, learning_time, estimation_time,
Esempio n. 25
0
    # Helper that numericalizes a sample.
    def _transfer_x(self, x):
        """Convert each raw feature value to its integer code in place,
        using the per-feature conversion dictionaries."""
        for pos, value in enumerate(x):
            x[pos] = self._feat_dics[pos][value]
        return x


if __name__ == '__main__':
    # Import time for timing and DataUtil for data loading.
    import time
    from Util import DataUtil
    # Run both versions (1.0 and 1.5) of the balloon data set.
    for dataset in ("balloon1.0", "balloon1.5"):
        # Load the data.
        _x, _y = DataUtil.get_dataset(dataset,
                                      "../../_Data/{}.txt".format(dataset))
        # Instantiate and train the model, timing the whole process.
        learning_time = time.time()
        nb = MultinomialNB()
        # BUG FIX: the original called np.fit(_x, _y) — numpy has no fit()
        # and the freshly created classifier was never trained; the call
        # must go to the nb instance.
        nb.fit(_x, _y)
        learning_time = time.time() - learning_time
        # Evaluate the model, timing the evaluation.
        estimation_time = time.time()
        nb.evaluate(_x, _y)
        estimation_time = time.time() - estimation_time
        # Report the recorded timings.
        print("Model building : {:12.6} s\n"
              "Estimation : {:12.6} s\n"
              "Total : {:12.6} s".format(learning_time, estimation_time,
                                         learning_time + estimation_time))
Esempio n. 26
0
'''
Created on 2018年3月4日

@author: IL MARE
'''
import time
from Lib import SVMLib as SVMLib
from Util import DataUtil as DataUtil

if __name__ == "__main__":
    # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended monotonic replacement for elapsed-time measurement.
    start = time.perf_counter()
    # dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional", "svm")  # canonical loader
    dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel(
        "bank-addtional-format-svm")
    # Balance the classes (positive label 1, negative label -1).
    dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, 1, -1)
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    kTup = ("lin", 1.2)
    alphas, b = SVMLib.realSMO(trainSet, trainLabel, 0.6, 0.01, kTup, 10)
    errorCount = 0
    sv, svl = SVMLib.getSupportVectorandSupportLabel(trainSet, trainLabel,
                                                     alphas)
    # Count misclassifications on the held-out split.
    for data, label in zip(testSet, testLabel):
        predict_label = SVMLib.predictLabel(data, *[sv, svl, alphas, b, kTup])
        if predict_label != label:
            errorCount += 1
    ratio = errorCount / len(testLabel)
    print("the error ratio is %.3f, the correct ratio is %.3f -- %.3fs" %
          (ratio, 1 - ratio, time.perf_counter() - start))
Esempio n. 27
0
            rs = 1
            for d, xx in enumerate(input_x):
                rs *= data[d][tar_category][xx]
            return rs * p_category[tar_category]

        return func

    def _transfer_x(self, x):
        """Numericalize one sample in place via the feature dictionaries."""
        for position, token in enumerate(x):
            x[position] = self._feat_dics[position][token]
        return x


if __name__ == '__main__':
    import time
    _data = DataUtil.get_dataset("mushroom", "../../_Data/mushroom.txt")
    np.random.shuffle(_data)
    # First 6000 samples train, the rest test.
    train_num = 6000
    train_x = _data[:train_num]
    test_x = _data[train_num:]
    # Label is column 0; pop(0) removes it from each row in place.
    train_y = [xx.pop(0) for xx in train_x]
    test_y = [xx.pop(0) for xx in test_x]

    # Train the multinomial NB, timing the fit.
    learning_time = time.time()
    nb = MultinomialNB()
    nb.fit(train_x, train_y)
    learning_time = time.time() - learning_time

    # Score on both splits.
    estimation_time = time.time()
    nb.estimate(train_x, train_y)
    nb.estimate(test_x, test_y)
Esempio n. 28
0
'''
Created on 2018年3月4日

@author: IL MARE
'''
import Util.DataUtil as DataUtil
from lib import LogisticLib as LRLib
import time

if __name__ == "__main__":
    # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended monotonic replacement for elapsed-time measurement.
    start = time.perf_counter()
    # dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional")  # canonical loader
    dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel(
        "bank-addtional-format-lr")
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    weight, logList = LRLib.stocGradDescent(trainSet, trainLabel)
    errorCount = 0
    # Count misclassifications on the held-out split.
    for data, label in zip(testSet, testLabel):
        predict_label = LRLib.classifyVector(data, weight)
        if predict_label != label:
            errorCount += 1
    ratio = errorCount / len(testLabel)
    print("the error ratio is %.3f, the correct ratio is %.3f -- %.3fs" %
          (ratio, 1 - ratio, time.perf_counter() - start))
    # Plot how the first six weights evolved during training.
    LRLib.plotWeightFig(logList, [i for i in range(0, 6)])
Esempio n. 29
0
                        label=u"class: {}".format(self.label_dic[c]))
            plt.xticks([i for i in range(sj + 2)],
                       [""] + [rev_dic[i] for i in range(sj)] + [""])
            plt.ylim(0, 1.0)
            plt.legend()
            if not save:
                plt.show()
            else:
                plt.savefig("d{}".format(j + 1))


if __name__ == '__main__':

    # 读入数据
    _x, _y = DataUtil.get_dataset(
        "name",
        "C:\Program Files\Git\MachineLearning\_Data\mushroom.txt",
        tar_idx=0)
    # 实例化模型并进行训练、同时记录整个过程花费的时间
    learning_time = time.time()
    nb = MultinomialNB()

    nb.fit(_x, _y)
    learning_time = time.time() - learning_time
    # 评估模型的表现,同时记录评估过程花费的时间
    estimation_time = time.time()
    nb.evaluate(_x, _y)
    estimation_time = time.time() - estimation_time
    # 将记录下来的时间输出
    print("Model building : {:12.6} s\n"
          "Estimation     : {:12.6} s\n"
          "Total          : {:12.6} s".format(learning_time, estimation_time,