def serializeSVMModel():
    """Train an SVM on the bank-marketing data and persist the model
    (plus its held-out accuracy) into the shelve store ``MiningModel``.

    Fixes: the bound-but-unused exception variable is dropped, and the
    shelve database is now closed even if a write raises.
    """
    # Prefer the pre-formatted cached data; fall back to the raw data set
    # on any failure (missing cache file, parse error, ...).
    try:
        dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel(
            "bank-addtional-format-svm")
    except Exception:
        dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional")
    # Balance the classes; SVM labels are +1 / -1.
    dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, 1, -1)
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    kTup = ("lin", 1.3)  # linear kernel, parameter 1.3
    alphas, b = SVMLib.realSMO(trainSet, trainLabel, 0.6, 0.01, kTup, 10)
    sv, svl = SVMLib.getSupportVectorandSupportLabel(trainSet, trainLabel,
                                                     alphas)
    model = [sv, svl, alphas, b, kTup]
    # Score on the held-out split.
    errorCount = sum(
        1 for data, label in zip(testSet, testLabel)
        if SVMLib.predictLabel(data, *model) != label)
    ratio = errorCount / len(testLabel)
    print("the error ratio is %.3f, the correct ratio is %.3f" %
          (ratio, 1 - ratio))
    # Persist the model and its accuracy; close the store even on error.
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    try:
        db['SVMModel'] = model
        db['SVMModelCorrectRatio'] = 1 - ratio
    finally:
        db.close()
def loadDataSet(filename):
    """Load the RF/DT data set named *filename* and balance its classes
    ("yes" vs "no") by under-sampling; returns (data, labels)."""
    print("Loading data...")
    data, labels = DataUtil.loadDataForRMOrDTModel(filename)
    print("Loaded data!")
    print("Undersampling data...")
    data, labels = DataUtil.underSampling(data, labels, "yes", "no")
    print("Undersampled data!")
    return data, labels
def main(visualize=True):
    """Fit a CART tree on the example CSV data, optionally display the
    tree, evaluate it on the training data, and classify one sample."""
    util = DataUtil('example_data.csv')
    x, y = util.get_dataSet()
    fit_time = time.time()
    cart = CartTree(whether_continuous=[False] * 4)
    cart.fit(x, y, train_only=True)
    fit_time = time.time() - fit_time
    if visualize:
        cart.view()
    estimate_time = time.time()
    cart.evaluate(x, y)
    # NOTE(review): 'evaluat2' looks like a typo for a single-sample
    # evaluation helper — confirm against the CartTree API.
    sample = ['紫色', '小', '小孩', '用脚踩']
    cart.evaluat2(sample)
def feed_data(self, x, y, sample_weight=None):
    """Quantize the raw data and distribute the discrete part to the
    multinomial sub-model and the continuous part to the gaussian one.

    Directly assigns the sub-models' private attributes, so the exact
    statement order is significant.
    """
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)
    # separate=True makes quantize_data return x as the pair
    # (discrete_x, continuous_x).
    x, y, wc, features, feat_dics, label_dic = DataUtil.quantize_data(
        x, y, wc=self._whether_continuous, separate=True)
    if self._whether_continuous is None:
        self._whether_continuous = wc
        # discrete mask is the complement of the continuous mask
        self._whether_discrete = ~self._whether_continuous
    self._label_dic = label_dic
    discrete_x, continuous_x = x
    cat_counter = np.bincount(y)
    self._cat_counter = cat_counter
    # One boolean mask per class value.
    labels = [y == value for value in range(len(cat_counter))]
    # Feed the discrete (multinomial) naive Bayes.
    labelled_x = [discrete_x[ci].T for ci in labels]
    self._multinomial._x, self._multinomial._y = x, y
    self._multinomial._labelled_x, self._multinomial._label_zip = (
        labelled_x, list(zip(labels, labelled_x)))
    self._multinomial._cat_counter = cat_counter
    # Keep only the transfer dicts / value counts of discrete features.
    self._multinomial._feat_dics = [
        _dic for i, _dic in enumerate(feat_dics)
        if self._whether_discrete[i]
    ]
    self._multinomial._n_possibilities = [
        len(feats) for i, feats in enumerate(features)
        if self._whether_discrete[i]
    ]
    self._multinomial._label_dic = label_dic
    # Feed the continuous (gaussian) naive Bayes.
    labelled_x = [continuous_x[label].T for label in labels]
    self._gaussian._x, self._gaussian._y = continuous_x.T, y
    self._gaussian._labelled_x, self._gaussian._label_zip = labelled_x, labels
    self._gaussian._cat_counter, self._gaussian._label_dic = (
        cat_counter, label_dic)
    # Handle sample weights.
    self.feed_sample_weight(sample_weight)
def testRFModel(dataSet, labelSet, T=20):
    """Train a random forest of *T* trees on a random split and return the
    error ratio measured on the held-out part."""
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    forest = RFLib.generateRandomForest(trainSet, trainLabel, T)
    mistakes = sum(
        1 for sample, truth in zip(testSet, testLabel)
        if RFLib.predictByRandomForest(forest, sample) != truth)
    RFratio = float(mistakes) / len(testLabel)
    print("RF:total error ratio is %.3f, correct ratio is %.3f" %
          (RFratio, 1 - RFratio))
    return RFratio
def serializeLRModel():
    """Train a logistic-regression model on the bank-marketing data and
    persist the weights (plus held-out accuracy) via shelve.

    Fixes: the bound-but-unused exception variable is dropped, and the
    shelve database is now closed even if a write raises.
    """
    # Prefer the pre-formatted cached data; fall back to the raw data set.
    try:
        dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel(
            "bank-addtional-format-lr")
    except Exception:
        dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional")
    # Balance the classes; LR labels are 1 / 0.
    dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, 1, 0)
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    weight, logList = LRLib.stocGradDescent(trainSet, trainLabel)
    # Score on the held-out split.
    errorCount = sum(
        1 for data, label in zip(testSet, testLabel)
        if LRLib.classifyVector(data, weight) != label)
    ratio = errorCount / len(testLabel)
    print("the error ratio is %.3f, the correct ratio is %.3f" %
          (ratio, 1 - ratio))
    # Persist the weights and accuracy; close the store even on error.
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    try:
        db["LRModel"] = weight
        db["LRModelCorrectRatio"] = 1 - ratio
    finally:
        db.close()
def testRFModel(filename="bank-additional"):
    """Load the persisted random-forest model from the shelve store and
    score it against the data set *filename*."""
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    maxCorrectRatio = db["RFModelCorrectRatio"]
    model = db["RFModel"]
    db.close()
    dataSet, labelSet = DataUtil.loadDataForRMOrDTModel(filename)
    wrong = sum(
        1 for sample, truth in zip(dataSet, labelSet)
        if RFLib.predictByRandomForest(model, sample) != truth)
    errorRatio = wrong / len(dataSet)
    print(
        "RF:error ratio:%.3f, correct ratio:%.3f, correct ratio on trainSet:%.3f"
        % (errorRatio, 1 - errorRatio, maxCorrectRatio))
def testLRModel(filename="bank-additional"):
    """Load the persisted logistic-regression weights from the shelve store
    and score them against the data set *filename*."""
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    maxCorrectRatio = db["LRModelCorrectRatio"]
    weight = db["LRModel"]
    db.close()
    dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel(filename, "lr")
    wrong = sum(
        1 for sample, truth in zip(dataSet, labelSet)
        if LRLib.classifyVector(sample, weight) != truth)
    errorRatio = wrong / len(dataSet)
    print(
        "LR:error ratio:%.3f, correct ratio:%.3f, correct ratio on trainSet:%.3f"
        % (errorRatio, 1 - errorRatio, maxCorrectRatio))
def testSVMModel(filename="bank-additional"):
    """Load the persisted SVM model from the shelve store and score it
    against the data set *filename*."""
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    maxCorrectRatio = db["SVMModelCorrectRatio"]
    model = db["SVMModel"]
    db.close()
    dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel(filename, "svm")
    wrong = sum(
        1 for sample, truth in zip(dataSet, labelSet)
        if SVMLib.predictLabel(sample, *model) != truth)
    errorRatio = wrong / len(dataSet)
    print(
        "SVM:error ratio:%.3f, correct ratio:%.3f, correct ratio on trainSet:%.3f"
        % (errorRatio, 1 - errorRatio, maxCorrectRatio))
def serializeDTModel():
    """Train 100 decision trees on independent random splits and persist
    the best-performing one (by held-out accuracy) via shelve.

    Fixes: the ``tmp_lst`` accumulator was collected but never used and
    is removed; the shelve database is closed even if a write raises.
    """
    dataSet, labelSet = loadDataSet("bank-additional")
    maxRatio = 0
    finalModel = {}
    for _ in range(100):
        trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
            dataSet, labelSet)
        model = DTLib.createDecisionTree(trainSet, trainLabel)
        errorRatio = DTLib.testDTModel(testSet, testLabel, model)
        correct = 1 - errorRatio
        # Keep the best tree seen so far.
        if correct > maxRatio:
            maxRatio = correct
            finalModel = model
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    try:
        db["DTModel"] = finalModel
        db["DTModelCorrectRatio"] = maxRatio
    finally:
        db.close()
def main():
    """Fit an ID3 tree to the XOR toy data set, report the timings, and
    draw the resulting 2-D decision boundary."""
    from Util import DataUtil
    samples, targets = DataUtil.gen_xor()
    targets = np.argmax(targets, axis=1)  # one-hot -> class indices
    fit_elapsed = time.time()
    id3 = ID3Tree()
    id3.fit(samples, targets)
    fit_elapsed = time.time() - fit_elapsed
    # Estimation is currently disabled; the timer still brackets the spot.
    eval_elapsed = time.time()
    eval_elapsed = time.time() - eval_elapsed
    print("Fit Process : {:8.6} s\n"
          "Estimate Process : {:8.6} s".format(fit_elapsed, eval_elapsed))
    id3.visualize2d(samples, targets)
def serializeRFModel():
    """Train ten random forests on independent random splits, print each
    run's accuracy, and persist the best forest via shelve."""
    dataSet, labelSet = loadDataSet("bank-additional")
    maxRatio = 0
    finalModel = None
    for _ in range(10):
        trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
            dataSet, labelSet)
        forest = RFLib.generateRandomForest(trainSet, trainLabel, 20)
        wrong = sum(
            1 for sample, truth in zip(testSet, testLabel)
            if RFLib.predictByRandomForest(forest, sample) != truth)
        RFratio = float(wrong) / len(testLabel)
        # Keep the best forest seen so far.
        if (1 - RFratio) > maxRatio:
            maxRatio = 1 - RFratio
            finalModel = forest
        print("RF:total error ratio is %.3f, correct ratio is %.3f" %
              (RFratio, 1 - RFratio))
    db = shelve.open("{0}/MiningModel".format(sys.path[0]))
    db["RFModel"] = finalModel
    db["RFModelCorrectRatio"] = maxRatio
    db.close()
def loadDataSet(filename):
    """Load the RF/DT data set named *filename* and balance its classes
    ("yes" vs "no") by under-sampling; returns (dataSet, labelSet)."""
    print("Loading data...")
    dataSet, labelSet = DataUtil.loadDataForRMOrDTModel(filename)
    print("Loaded data!")
    print("Undersampling data...")
    dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, "yes", "no")
    print("Undersampled data!")
    return dataSet, labelSet


if __name__ == "__main__":
    # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement.
    start = time.perf_counter()
    dataSet, labelSet = loadDataSet("bank-additional")
    tmp_lst = []
    # Re-split and retrain 100 times to measure the spread of DT accuracy.
    for i in range(100):
        trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
            dataSet, labelSet)
        model = DTLib.createDecisionTree(trainSet, trainLabel)
        errorRatio = DTLib.testDTModel(testSet, testLabel, model)
        tmp_lst.append(1 - errorRatio)
    # FIX: np.float was removed from NumPy; the builtin float is equivalent.
    y = np.array(tmp_lst, dtype=float)
    print("the avg correct ratio is %.3f, the std is %.3f" %
          (y.mean(), y.std()))
    x = np.arange(0, len(tmp_lst))
    fig = plt.figure("test")
    ax = fig.add_subplot(111)
    ax.plot(x, y)
    ax.set_ylim([0, 1])
    ax.set_ylabel("correct ratio of DT")
    ax.set_xlabel("count of exp")
    plt.show()
# (fragment: tail of a quantization helper — its header is outside this view;
#  presumably maps raw feature values through per-feature transfer dicts,
#  advancing idx only over discrete features — TODO confirm against the full file)
x[i][d] = _feat_dics[idx][line[d]]
if discrete:
    idx += 1
return x


if __name__ == '__main__':
    import time
    # Mark which of the 16 bank1.0 features are discrete (the listed indices
    # are continuous).
    _whether_discrete = [True] * 16
    _continuous_lst = [0, 5, 9, 11, 12, 13, 14]
    for _cl in _continuous_lst:
        _whether_discrete[_cl] = False
    train_num = 40000
    data_time = time.time()
    _data = DataUtil.get_dataset("bank1.0", "../../_Data/bank1.0.txt")
    np.random.shuffle(_data)
    train_x = _data[:train_num]
    test_x = _data[train_num:]
    # The last column of each row is the label.
    train_y = [xx.pop() for xx in train_x]
    test_y = [xx.pop() for xx in test_x]
    data_time = time.time() - data_time
    learning_time = time.time()
    nb = MergedNB(_whether_discrete)
    nb.fit(train_x, train_y)
    learning_time = time.time() - learning_time
    estimation_time = time.time()
    nb.estimate(train_x, train_y)
    nb.estimate(test_x, test_y)
def main(visualize=True):
    """Benchmark decision trees (CART / C4.5) on four data sets, printing
    build/evaluation timings and optionally drawing each tree."""
    # --- CART on a tiny discrete data set (train_only: no train/test split) ---
    # x, y = DataUtil.get_dataset("balloon1.0(en)", "../_Data/balloon1.0(en).txt")
    x, y = DataUtil.get_dataset("test", "../_Data/test.txt")
    fit_time = time.time()
    tree = CartTree(whether_continuous=[False] * 4)
    tree.fit(x, y, train_only=True)
    fit_time = time.time() - fit_time
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x, y)
    estimate_time = time.time() - estimate_time
    print("Model building : {:12.6} s\n"
          "Estimation : {:12.6} s\n"
          "Total : {:12.6} s".format(fit_time, estimate_time,
                                     fit_time + estimate_time))
    if visualize:
        tree.visualize()
    # --- C4.5 on mushroom with a real train/test split ---
    train_num = 6000
    (x_train, y_train), (x_test, y_test), *_ = DataUtil.get_dataset(
        "mushroom", "../_Data/mushroom.txt", tar_idx=0, n_train=train_num)
    fit_time = time.time()
    tree = C45Tree()
    tree.fit(x_train, y_train)
    fit_time = time.time() - fit_time
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x_train, y_train)
    tree.evaluate(x_test, y_test)
    estimate_time = time.time() - estimate_time
    print("Model building : {:12.6} s\n"
          "Estimation : {:12.6} s\n"
          "Total : {:12.6} s".format(fit_time, estimate_time,
                                     fit_time + estimate_time))
    if visualize:
        tree.visualize()
    # --- CART on the synthetic XOR data set ---
    x, y = DataUtil.gen_xor(one_hot=False)
    fit_time = time.time()
    tree = CartTree()
    tree.fit(x, y, train_only=True)
    fit_time = time.time() - fit_time
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x, y, n_cores=1)
    estimate_time = time.time() - estimate_time
    print("Model building : {:12.6} s\n"
          "Estimation : {:12.6} s\n"
          "Total : {:12.6} s".format(fit_time, estimate_time,
                                     fit_time + estimate_time))
    if visualize:
        tree.visualize2d(x, y, dense=1000)
        tree.visualize()
    # --- CART on bank1.0 (mixed discrete/continuous features, quantized) ---
    # NOTE(review): wc is computed but never passed on — presumably intended
    # for CartTree(whether_continuous=wc); confirm against the original file.
    wc = [False] * 16
    continuous_lst = [0, 5, 9, 11, 12, 13, 14]
    for _cl in continuous_lst:
        wc[_cl] = True
    train_num = 2000
    (x_train, y_train), (x_test, y_test), *_ = DataUtil.get_dataset(
        "bank1.0", "../_Data/bank1.0.txt", n_train=train_num, quantize=True)
    fit_time = time.time()
    tree = CartTree()
    tree.fit(x_train, y_train)
    fit_time = time.time() - fit_time
    if visualize:
        tree.view()
    estimate_time = time.time()
    tree.evaluate(x_test, y_test)
    estimate_time = time.time() - estimate_time
    print("Model building : {:12.6} s\n"
          "Estimation : {:12.6} s\n"
          "Total : {:12.6} s".format(fit_time, estimate_time,
                                     fit_time + estimate_time))
    if visualize:
        tree.visualize()
    tree.show_timing_log()
def main():
    """Run C4.5 on the mushroom data set, then on bank1.0 after a MergedNB
    preprocessing pass that quantizes and splits the features."""
    # --- C4.5 on mushroom ---
    _data = DataUtil.get_dataset("mushroom", "../_Data/mushroom.txt")
    np.random.shuffle(_data)
    _x, _y = [], []
    for line in _data:
        _y.append(line.pop(0))  # first column holds the label
        _x.append(line)
    _x, _y = np.array(_x), np.array(_y)
    train_num = 5000
    x_train = _x[:train_num]
    y_train = _y[:train_num]
    x_test = _x[train_num:]
    y_test = _y[train_num:]
    _fit_time = time.time()
    _tree = C45Tree()
    _tree.fit(x_train, y_train)
    _fit_time = time.time() - _fit_time
    _tree.view()
    _estimate_time = time.time()
    _tree.estimate(x_test, y_test)
    _estimate_time = time.time() - _estimate_time
    print("Fit Process : {:8.6} s\n"
          "Estimate Process : {:8.6} s".format(_fit_time, _estimate_time))
    _tree.visualize()
    # (earlier XOR experiment kept for reference)
    # from Util import DataUtil
    # _x, _y = DataUtil.gen_xor()
    # _y = np.argmax(_y, axis=1)
    # _fit_time = time.time()
    # _tree = C45Tree()
    # _tree.fit(_x, _y)
    # _fit_time = time.time() - _fit_time
    # _tree.view()
    # _estimate_time = time.time()
    # _tree.estimate(_x, _y)
    # _estimate_time = time.time() - _estimate_time
    # print("Fit Process : {:8.6} s\n"
    #       "Estimate Process : {:8.6} s".format(_fit_time, _estimate_time))
    # _tree.visualize2d(_x, _y)
    # --- C4.5 on bank1.0 ---
    # Mark which of the 16 features are discrete (listed indices continuous).
    _whether_discrete = [True] * 16
    _continuous_lst = [0, 5, 9, 11, 12, 13, 14]
    for _cl in _continuous_lst:
        _whether_discrete[_cl] = False
    _data = DataUtil.get_dataset("bank1.0", "../_Data/bank1.0.txt")
    np.random.shuffle(_data)
    _labels = [xx.pop() for xx in _data]  # last column holds the label
    # MergedNB is used here only for its feature quantization/split.
    nb = MergedNB(_whether_discrete)
    nb.fit(_data, _labels)
    _dx, _cx = nb["multinomial"]["x"], nb["gaussian"]["x"]
    _labels = nb["multinomial"]["y"]
    _data = np.hstack((_dx, _cx.T))
    train_num = 1000
    x_train = _data[:train_num]
    y_train = _labels[:train_num]
    x_test = _data[train_num:]
    y_test = _labels[train_num:]
    _fit_time = time.time()
    _tree = C45Tree()
    _tree.fit(x_train, y_train)
    _fit_time = time.time() - _fit_time
    _tree.view()
    _estimate_time = time.time()
    _tree.estimate(x_test, y_test)
    _estimate_time = time.time() - _estimate_time
    print("Fit Process : {:8.6} s\n"
          "Estimate Process : {:8.6} s".format(_fit_time, _estimate_time))
    _tree.visualize()
if __name__ == '__main__':
    # (earlier experiments kept for reference)
    # _x, _y = gen_random()
    # test(_x, _y, algorithm="RF", epoch=1)
    # test(_x, _y, algorithm="RF", epoch=10)
    # test(_x, _y, algorithm="RF", epoch=50)
    # test(_x, _y, algorithm="SKRandomForest")
    # test(_x, _y, epoch=1)
    # test(_x, _y, epoch=1)
    # test(_x, _y, epoch=10)
    # _x, _y = gen_xor()
    # test(_x, _y, algorithm="RF", epoch=1)
    # test(_x, _y, algorithm="RF", epoch=10)
    # test(_x, _y, algorithm="RF", epoch=1000)
    # test(_x, _y, algorithm="SKAdaBoost")
    # Two-class spiral data; labels remapped from {0, 1} to {-1, 1}.
    _x, _y = DataUtil.gen_spiral(size=20, n=4, n_class=2, one_hot=False)
    _y[_y == 0] = -1
    # test(_x, _y, clf="SKTree", epoch=10)
    # test(_x, _y, clf="SKTree", epoch=1000)
    # test(_x, _y, algorithm="RF", epoch=10)
    test(_x, _y, algorithm="RF", epoch=30, n_cores=4)
    # test(_x, _y, algorithm="SKAdaBoost")
    # Mushroom data set, quantized, with the label in column 0.
    train_num = 6000
    (x_train, y_train), (x_test, y_test), *_ = DataUtil.get_dataset(
        "mushroom", "data/mushroom.txt", n_train=train_num, quantize=True,
        tar_idx=0)
    y_train[y_train == 0] = -1
def predict(self, x, get_raw_results=False, bound=None, **kwargs):
    """Predict labels for *x* by majority vote over the trees (optionally
    only the first *bound* trees)."""
    trees = self._trees if bound is None else self._trees[:bound]
    # Classify with every tree, then take the most frequent label per sample.
    matrix = self._multi_clf(x, trees, rf_task, kwargs,
                             target=kwargs.get("target", "parallel"))
    return np.array([RandomForest.most_appearance(rs) for rs in matrix])


def evaluate(self, x, y, metrics=None, tar=0, prefix="Acc", **kwargs):
    """Evaluate via the base-class routine, forcing single-target mode."""
    kwargs["target"] = "single"
    super(RandomForest, self).evaluate(x, y, metrics, tar, prefix, **kwargs)


if __name__ == '__main__':
    import time
    train_num = 100
    (x_train, y_train), (x_test, y_test) = DataUtil.get_dataset(
        "mushroom", "data/mushroom.txt", n_train=train_num, tar_idx=0)
    learning_time = time.time()
    forest = RandomForest()
    forest.fit(x_train, y_train)
    learning_time = time.time() - learning_time
    estimation_time = time.time()
    print(
        "===============================\n"
        "{}\n"
        "-------------------------------\n".format('mushroom'), end='\t')
    forest.evaluate(x_train, y_train)
    forest.evaluate(x_test, y_test)
    estimation_time = time.time() - estimation_time
    # NOTE(review): the source is truncated here — the final print call is
    # cut off mid-expression.
    print(
def _transfer_x(self, x):
    """Numericalize one sample in place: map each raw feature value through
    the per-feature transfer dictionary; returns the mutated sample."""
    for j, char in enumerate(x):
        x[j] = self._feat_dics[j][char]
    return x


if __name__ == '__main__':
    # time for timing; DataUtil for data loading.
    import time
    from Util import DataUtil
    # Iterate over the 1.0 / 1.5 balloon data sets.
    for dataset in ('balloon1.0', 'balloon1.5'):
        # Load the data.
        _x, _y = DataUtil.get_dataset(name=dataset,
                                      path='Data/{}.txt'.format(dataset))
        # Build and train the model, timing the whole process.
        learning_time = time.time()
        nb = MultinomialNB()
        nb.fit(_x, _y)
        learning_time = time.time() - learning_time
        # Evaluate the model, timing the evaluation.
        estimation_time = time.time()
        # FIX: was `nb.evluate`, which would raise AttributeError at runtime;
        # every sibling script in this file calls `evaluate`.
        nb.evaluate(_x, _y)
        estimation_time = time.time() - estimation_time
        # Print the recorded timings.
        print('Model buiding : {:12.6} s\n'
              'Estimation : {:12.6} s\n'
              'Total : {:12.6} s'.format(
                  learning_time, estimation_time,
                  learning_time + estimation_time))
# (fragment: tail of a per-feature bar-plot loop — the call and loop headers
#  are outside this view)
data[_j][_c, :], width=0.35,
    facecolor=colors[nb.label_dic[_c]], edgecolor='white',
    label='class: {}'.format(nb.label_dic[_c]))
# Tick labels: one per feature value, padded with blanks at both ends.
plt.xticks([i for i in range(sj + 2)],
           [''] + [_rev_feat_dics[i] for i in range(sj)] + [''])
plt.ylim(0, 1.0)
plt.legend()
plt.savefig('d{}'.format(_j + 1))


if __name__ == '__main__':
    import time
    from Util import DataUtil
    # for dataset in ('balloon1.0', 'balloon1.5'):
    dataset = 'mushroom'
    _x, _y = DataUtil.get_dataset(dataset, 'Data/{}.txt'.format(dataset))
    learning_time = time.time()
    nb = MultinomialNB()
    nb.fit(_x, _y)
    learning_time = time.time() - learning_time
    estimation_time = time.time()
    nb.evaluate(_x, _y)
    estimation_time = time.time() - estimation_time
    print('Model building : {:12.6} s\n'
          'Estimation : {:12.6} s\n'
          'Total : {:12.6} s'.format(learning_time, estimation_time,
                                     learning_time + estimation_time))
    nb.visualize()
# (fragment: tail of a visualization method — its header is outside this view)
plt.legend()
if not save:
    plt.show()
else:
    plt.savefig("d{}".format(j + 1))


@staticmethod
def _transfer_x(x):
    # Gaussian NB operates on numeric data directly; no transfer needed.
    return x


if __name__ == '__main__':
    import time
    xs, ys = DataUtil.get_dataset(
        "mushroom",
        "C:\\Users\\tangk\\PycharmProjects\Machine_Learning\\_Data\\mushroom.txt",
        tar_idx=0)
    # Use MultinomialNB only to quantize the raw data.
    nb = MultinomialNB()
    nb.feed_data(xs, ys)
    xs, ys = nb["x"].tolist(), nb["y"].tolist()
    train_num = 6000
    x_train, x_test = xs[:train_num], xs[train_num:]
    y_train, y_test = ys[:train_num], ys[train_num:]
    learning_time = time.time()
    gb = GaussianNB()
    gb.fit(x_train, y_train)
    learning_time = time.time() - learning_time
    # NOTE(review): the source appears truncated after this point — the
    # estimation timing is started but never reported here.
    estimation_time = time.time()
hashtag_dics = {_l: hashtag_list.count(_l) for _l in hashtag_set} return hashtag_dics # fig = plt.figure() # plt.title(title) # plt.barh(hashtag_dics.keys(),hashtag_dics.values(),width = 0.35,facecolor = 'lightskyblue',edgecolor = 'white') # plt.show() if __name__ == '__main__': t = ['#a b c', '#a #b c', '#a #b #c'] print count_hashtag(t) import time from Util import DataUtil du = DataUtil() for dataset in ("balloon1.0(en)", "balloon1.5(en)"): _x, _y = du.get_dataset(dataset, "../_Data/{}.txt".format(dataset)) learning_time = time.time() nb = MultinomialNB() nb.fit(_x, _y) learning_time = time.time() - learning_time estimation_time = time.time() nb.evaluate(_x, _y) estimation_time = time.time() - estimation_time print( "Model building : {:12.6} s\n" "Estimation : {:12.6} s\n" "Total : {:12.6} s\n".format(
# (fragment: tail of a fit/func-builder method — its header is outside this view)
self._data = data

def func(input_x, tar_category):
    # Naive-Bayes decision function: product of the per-dimension
    # densities times the class prior.
    rs = 1
    for d, xx in enumerate(input_x):
        rs *= data[d][tar_category](xx)
    return rs * p_category[tar_category]

return func


@staticmethod
def _transfer_x(x):
    # Gaussian NB operates on numeric data directly; no transfer needed.
    return x


if __name__ == '__main__':
    import time
    # Load the data.
    _x, _y = DataUtil.get_dataset("name", "C:\Program Files\Git\MachineLearning\_Data\\bank2.0.txt")
    gnb = GaussianNB()
    gnb.fit(_x, _y)
    gnb.evaluate(_x, _y)
    # (earlier experiments kept for reference)
    # nb = MultinomialNB()
    # nb.feed_data(_x, _y)
    # xs, ys = nb["x"].tolist(), nb["y"].tolist()
    # train_num = 6000
    # x_train, x_test = xs[:train_num], xs[train_num:]
    # y_train, y_test = ys[:train_num], ys[train_num:]
    # nb.fit(x_train, y_train)
    # nb.evaluate(x_train, y_train)
    # nb.evaluate(x_test, y_test)
    # gnb = GaussianNB()
    # gnb.fit(x_train, y_train)
    # gnb.evaluate(x_train, y_train)
# "Estimation : {:12.6} s\n" # "Total : {:12.6} s".format( # learning_time, estimation_time, # learning_time + estimation_time # ) # ) whether_continuous = [False] * 16 continuous_lst = [0, 5, 9, 11, 12, 13, 14] for cl in continuous_lst: whether_continuous[cl] = True train_num = 40000 data_time = time.time() (x_train, y_train), (x_test, y_test) = DataUtil.get_dataset( "bank1.0", "C:/Users/tangk/Desktop/MachineLearning-master/MachineLearning-master/_Data/bank1.0.txt", n_train=train_num) data_time = time.time() - data_time learning_time = time.time() nb = MergedNB(whether_continuous=whether_continuous) nb.fit(x_train, y_train) learning_time = time.time() - learning_time estimation_time = time.time() nb.evaluate(x_train, y_train) nb.evaluate(x_test, y_test) estimation_time = time.time() - estimation_time print("Data cleaning : {:12.6} s\n" "Model building : {:12.6} s\n" "Estimation : {:12.6} s\n" "Total : {:12.6} s".format( data_time, learning_time, estimation_time,
# Numericalization helper.
def _transfer_x(self, x):
    """Numericalize one sample in place: map each raw feature value through
    the per-feature transfer dictionary; returns the mutated sample."""
    for j, char in enumerate(x):
        x[j] = self._feat_dics[j][char]
    return x


if __name__ == '__main__':
    # time for timing; DataUtil for data loading.
    import time
    from Util import DataUtil
    # Iterate over the 1.0 / 1.5 balloon data sets.
    for dataset in ("balloon1.0", "balloon1.5"):
        # Load the data.
        _x, _y = DataUtil.get_dataset(dataset,
                                      "../../_Data/{}.txt".format(dataset))
        # Build and train the model, timing the whole process.
        learning_time = time.time()
        nb = MultinomialNB()
        # FIX: was `np.fit(_x, _y)` — calling fit on the numpy module would
        # raise AttributeError; the model instance `nb` is intended.
        nb.fit(_x, _y)
        learning_time = time.time() - learning_time
        # Evaluate the model, timing the evaluation.
        estimation_time = time.time()
        nb.evaluate(_x, _y)
        estimation_time = time.time() - estimation_time
        # Print the recorded timings.
        print("Model building : {:12.6} s\n"
              "Estimation : {:12.6} s\n"
              "Total : {:12.6} s".format(learning_time, estimation_time,
                                         learning_time + estimation_time))
'''
Created on 2018-03-04

@author: IL MARE
'''
import time

from Lib import SVMLib as SVMLib
from Util import DataUtil as DataUtil

if __name__ == "__main__":
    # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for wall-clock interval timing.
    start = time.perf_counter()
    # dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional", "svm")  # the canonical approach
    dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel(
        "bank-addtional-format-svm")
    # Balance the classes; SVM labels are +1 / -1.
    dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, 1, -1)
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    kTup = ("lin", 1.2)  # linear kernel, parameter 1.2
    alphas, b = SVMLib.realSMO(trainSet, trainLabel, 0.6, 0.01, kTup, 10)
    errorCount = 0
    sv, svl = SVMLib.getSupportVectorandSupportLabel(trainSet, trainLabel,
                                                     alphas)
    # Score on the held-out split.
    for data, label in zip(testSet, testLabel):
        predict_label = SVMLib.predictLabel(data, *[sv, svl, alphas, b, kTup])
        if predict_label != label:
            errorCount += 1
    ratio = errorCount / len(testLabel)
    print("the error ratio is %.3f, the correct ratio is %.3f -- %.3fs" %
          (ratio, 1 - ratio, time.perf_counter() - start))
# (fragment: tail of a decision-function builder — its header is outside
#  this view)
rs = 1
for d, xx in enumerate(input_x):
    rs *= data[d][tar_category][xx]
return rs * p_category[tar_category]
return func


def _transfer_x(self, x):
    # Numericalize one sample in place via the per-feature transfer dicts.
    for j, char in enumerate(x):
        x[j] = self._feat_dics[j][char]
    return x


if __name__ == '__main__':
    import time
    _data = DataUtil.get_dataset("mushroom", "../../_Data/mushroom.txt")
    np.random.shuffle(_data)
    train_num = 6000
    train_x = _data[:train_num]
    test_x = _data[train_num:]
    # The first column of each row is the label.
    train_y = [xx.pop(0) for xx in train_x]
    test_y = [xx.pop(0) for xx in test_x]
    learning_time = time.time()
    nb = MultinomialNB()
    nb.fit(train_x, train_y)
    learning_time = time.time() - learning_time
    estimation_time = time.time()
    nb.estimate(train_x, train_y)
    nb.estimate(test_x, test_y)
'''
Created on 2018-03-04

@author: IL MARE
'''
import Util.DataUtil as DataUtil
from lib import LogisticLib as LRLib
import time

if __name__ == "__main__":
    # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for wall-clock interval timing.
    start = time.perf_counter()
    # dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional")  # the canonical approach
    dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel(
        "bank-addtional-format-lr")
    trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(
        dataSet, labelSet)
    weight, logList = LRLib.stocGradDescent(trainSet, trainLabel)
    # Score on the held-out split.
    errorCount = 0
    for data, label in zip(testSet, testLabel):
        predict_label = LRLib.classifyVector(data, weight)
        if predict_label != label:
            errorCount += 1
    ratio = errorCount / len(testLabel)
    print("the error ratio is %.3f, the correct ratio is %.3f -- %.3fs" %
          (ratio, 1 - ratio, time.perf_counter() - start))
    # Plot the weight trajectories recorded during training.
    LRLib.plotWeightFig(logList, [i for i in range(0, 6)])
# (fragment: tail of a per-feature bar-plot loop — the call and loop headers
#  are outside this view)
label=u"class: {}".format(self.label_dic[c]))
# Tick labels: one per feature value, padded with blanks at both ends.
plt.xticks([i for i in range(sj + 2)],
           [""] + [rev_dic[i] for i in range(sj)] + [""])
plt.ylim(0, 1.0)
plt.legend()
if not save:
    plt.show()
else:
    plt.savefig("d{}".format(j + 1))


if __name__ == '__main__':
    # Load the data.
    _x, _y = DataUtil.get_dataset(
        "name", "C:\Program Files\Git\MachineLearning\_Data\mushroom.txt",
        tar_idx=0)
    # Build and train the model, timing the whole process.
    learning_time = time.time()
    nb = MultinomialNB()
    nb.fit(_x, _y)
    learning_time = time.time() - learning_time
    # Evaluate the model, timing the evaluation.
    estimation_time = time.time()
    nb.evaluate(_x, _y)
    estimation_time = time.time() - estimation_time
    # Print the recorded timings.
    # NOTE(review): the source is truncated inside this call.
    print("Model building : {:12.6} s\n"
          "Estimation : {:12.6} s\n"
          "Total : {:12.6} s".format(learning_time, estimation_time,