def adaboost(train, test, headers, fullTestData):
    """Train one model per outcome label, predict on ``test`` and log the results.

    For each label in ``['H', 'A', 'D']`` a ``Node`` is created from ``train``,
    ``buildTree`` is run on it, and ``predict`` is called on ``test`` with the
    node's ``splits``. For every test row the label with the largest score is
    taken as the final prediction. Each prediction is printed, appended to
    ``resultdata.csv`` and compared with ``column(test, -2)`` to compute the
    accuracy, which is printed at the end.

    Args:
        train: training data handed to ``Node``.
        test: test data; its second-to-last column is used as the actual label.
        headers: passed through to ``buildTree``.
        fullTestData: rows whose columns -2, -1 and -4 are reported as
            Home, Away and Bookie respectively.

    Returns:
        None. Results are printed and appended to ``resultdata.csv``.
    """
    ylabels = ['H', 'A', 'D']

    # Make predictions for each of the possible labels.
    results = []
    for y in ylabels:
        rootNode = Node(train)  # Initialize first decision stump
        # NOTE(review): the return value was never used by the original code;
        # buildTree presumably populates rootNode.splits in place - confirm.
        buildTree(rootNode, y, headers)
        results.append(predict(test, rootNode.splits, y))

    print("Now making predictions")
    prediction = []
    for r in range(len(results[0])):
        scores = list(column(results, r))
        # Take the label with corresponding max value of alpha as final prediction
        prediction.append(ylabels[scores.index(max(scores))])

    print("Now checking predictions")
    corr = 0
    print("Home\tAway\tPrediction\tActual\tBookie")

    # Extract each reported column once instead of once per row.
    # Assumes column() is a pure accessor - TODO confirm.
    home = column(fullTestData, -2)
    away = column(fullTestData, -1)
    bookie = column(fullTestData, -4)
    actual = column(test, -2)

    # The context manager closes the file even if a row fails to write.
    with open("resultdata.csv", 'a') as out:
        writer = csv.writer(out, quoting=csv.QUOTE_ALL)
        for p, guess in enumerate(prediction):
            print('\a')
            row = [home[p], away[p], guess, actual[p], bookie[p]]
            writer.writerow(row)
            print(row)
            if guess == actual[p]:
                corr += 1

    total = len(prediction)
    # With no predictions there is no accuracy to compute; report 0 as before.
    accuracy = str(float(corr) * 100 / total) if total else 0
    print("%s %s" % (accuracy, total))
    print("done")
def predict(self, X):
    """
    Predict a class for every example in X by majority vote over the trees.

    Input:
    ------
    X: [m x d] a d-dimensional test examples.

    Returns:
    -----------
    pred: list with one entry per example, each entry being the first
          element of ``stats.mode`` over the per-tree predictions
    """
    if self.scalefeat:
        X = self.applyScaling(X)

    # One row per tree; transposing gives one row per example.
    per_tree = [member.predict(X) for member in self.trees]
    per_example = np.array(per_tree).T

    return [stats.mode(votes)[0] for votes in per_example]
def get_predict(trees_result, trees_fiture, data_train):
    """Sum the per-tree predictions of a forest for every sample.

    For each tree, ``split_data`` is applied to ``data_train`` with that
    tree's feature selection, ``predict`` is called on every resulting row
    (without its last element), and the first key of the returned mapping
    is taken as that tree's prediction for the row.

    Args:
        trees_result: sequence of trained trees.
        trees_fiture: sequence of feature selections, one per tree
            (parameter name kept as-is for backward compatibility).
        data_train: samples to predict; ``np.shape(data_train)[0]`` rows
            are evaluated.

    Returns:
        ``np.sum`` over the tree axis of the collected predictions, i.e.
        one summed value per sample.
    """
    m_tree = len(trees_result)
    m = np.shape(data_train)[0]

    result = []
    for tree_idx in range(m_tree):
        clf = trees_result[tree_idx]
        feature = trees_fiture[tree_idx]
        data = split_data(data_train, feature)
        # list(...) is needed because dict views are not indexable on
        # Python 3; a separate index name avoids shadowing the tree index.
        result_i = [
            list(predict(data[row_idx][0:-1], clf).keys())[0]
            for row_idx in range(m)
        ]
        result.append(result_i)

    final_predict = np.sum(result, axis=0)
    return final_predict
def predict(self, data):
    """Predict with the model(s) whose weight is the median model weight.

    Models with too low a weight tend to underfit and models with too high
    a weight tend to overfit, so only the model(s) at the median weight are
    used. Their predictions are scaled by the model weight, summed, and the
    sign of the sum is returned.

    Args:
        data: iterable of feature rows; each row is passed to the
            module-level ``predict`` together with ``self.feature_list``.

    Returns:
        1-D integer array with the sign of the weighted vote per row.
        If no model has been trained, an all-zero array is returned.
    """
    # Assumes every entry of self.model_weight holds a single value
    # (scalar or 1x1 array) - TODO confirm against fit().
    weights = np.array(self.model_weight, dtype=float).reshape(-1)
    if weights.size == 0:
        return np.zeros(len(data), dtype=int)

    # Weights are floats, so "equal to the median" means "within a small
    # tolerance of it". The absolute difference is required: a signed
    # difference would also select every model *below* the median.
    # Measuring relative to the closest model keeps the selection non-empty
    # when the model count is even and the median falls between two weights.
    gap = np.abs(weights - np.median(weights))
    median_index = np.where(gap <= gap.min() + 1e-3)[0]

    result = []
    for index in median_index:
        # Each model's prediction is multiplied by its weight.
        pred = np.asarray([
            predict(self.model_list[index], row, self.feature_list)
            for row in data
        ])
        result.append(weights[index] * pred)

    # Add up the selected models' predictions to form the strong model.
    return np.sign(np.sum(result, axis=0)).astype(int).reshape(-1)
def predict(self, X):
    """
    Test the trained RF on the given set of examples X

    Input:
    ------
    X: [m x d] a d-dimensional test examples.

    Returns:
    -----------
    pclass: list with the predicted class for each example, i.e. the class
            in ``self.classes`` that most trees voted for (on a tie, the
            class that appears first in ``self.classes`` wins)
    """
    if self.scalefeat:
        X = self.applyScaling(X)

    nexamples = X.shape[0]

    # Row t holds the predictions of tree t for every example.
    predictions_np = np.array([tree.predict(X) for tree in self.trees])

    z = []
    for example in range(nexamples):
        ballots = predictions_np[:, example]
        tallies = [np.sum(ballots == label) for label in self.classes]
        # argmax returns the first maximum, which matches a strict
        # "greater than" scan over the classes in order.
        z.append(self.classes[int(np.argmax(tallies))])
    return z
def predict(self, x_pred):
    """Predict a label for every row of ``x_pred`` by majority vote.

    Each of the ``self.n_estimators`` trees is evaluated through the
    module-level ``predict`` on the subset of features selected by
    ``self.tree_feature[index]``. The votes are cast to ``int`` and the
    most frequent label wins (on a tie, the smallest label, because
    ``np.unique`` returns sorted labels and ``argmax`` takes the first
    maximum).

    Args:
        x_pred: iterable of feature vectors that support indexing by
            ``self.tree_feature[index]``.

    Returns:
        1-D float array with one winning label per row.
    """
    labels = []
    for feature_vec in x_pred:  # iterate over every row of features
        # Collect the votes in a list; np.append would copy the whole
        # array on every single vote.
        votes = []
        for index in range(self.n_estimators):
            selected = self.tree_feature[index]
            votes.append(
                predict(self.tree_list[index],
                        feature_vec[selected],
                        self.feature_list[selected]))
        vote_array = np.asarray(votes).reshape(-1)

        # The most frequent vote becomes the prediction for this row.
        label_class, counts = np.unique(vote_array.astype(int),
                                        return_counts=True)
        labels.append(label_class[np.argmax(counts)])

    # float dtype matches what repeated np.append onto an empty array gave.
    return np.array(labels, dtype=float)
def fit(self, data, label):
    """
    Fit the boosted model; see the linked article below for the theory.

    On every iteration a subsample is drawn according to the current data
    weights, a weak tree is trained on it with ``createTree``, the tree's
    weighted error on that subsample is turned into a model weight, and
    the weights of the sampled rows are updated and renormalised. Trees are
    appended to ``self.model_list`` and their weights to
    ``self.model_weight``.

    :param data: feature matrix (indexed by row with an index array)
    :param label: labels (indexed the same way as ``data``)
    :return: None
    """
    n_samples = data.shape[0]
    # Initial sampling weights of the data distribution: all equal.
    self.data_weight = np.ones((n_samples, 1)) / n_samples
    # Indices of the data set.
    index = np.arange(0, n_samples, 1)
    # Keeps the error rate away from 0 and 1 so the log below stays finite.
    eps = 1e-10

    for _ in range(self.n_iterates):
        # Sample according to the data weights. Note bagging samples with
        # replacement, boosting without.
        # https://zhuanlan.zhihu.com/p/47922595
        sub_samping = np.random.choice(
            index,
            int(self.data_weight.shape[0] * self.alpha),
            replace=False,
            p=self.data_weight.reshape(-1, ).tolist())
        train_x = data[sub_samping]
        train_y = label[sub_samping]

        dt = createTree(train_x, train_y, self.feature_list)  # train weak learner
        self.model_list.append(dt)  # store the weak learner

        pred = np.asarray(
            [predict(dt, row, self.feature_list) for row in train_x]
        ).reshape(-1)
        truth = np.asarray(train_y).reshape(-1)

        # Error rate on the subsample: sum of the weights of the wrongly
        # predicted samples (0 where equal, 1 where different).
        pred_error = (pred != truth).astype(float).reshape(-1, 1)
        et = pred_error.T.dot(self.data_weight[sub_samping])
        # A perfect (or fully wrong) learner would give log(inf) / log(0),
        # producing an infinite model weight and NaN data weights that break
        # the next np.random.choice call.
        et = np.clip(et, eps, 1 - eps)

        # Record the model weight.
        at = 0.5 * np.log((1 - et) / et)
        self.model_weight.append(at)

        # Update the weights of the sampled rows.
        self.data_weight[sub_samping] = self.data_weight[
            sub_samping] * np.exp(-at * truth * pred).reshape(-1, 1)
        # Normalise the weights.
        self.data_weight = self.data_weight / self.data_weight.sum()
def get_predict(trees_result, trees_feature, data_train):
    '''Predict samples with a trained random forest.

    :param trees_result: sequence of trained trees
    :param trees_feature: sequence of feature selections, one per tree
    :param data_train: samples to predict
    :return: per-sample sum of the individual tree predictions
    '''
    sample_count = np.shape(data_train)[0]

    votes = []
    for tree_idx, clf in enumerate(trees_result):
        subset = split_data(data_train, trees_feature[tree_idx])
        # For every sample, run it through this tree and keep the first
        # key of the mapping that predict() returns.
        tree_votes = []
        for sample_idx in range(sample_count):
            outcome = predict(subset[sample_idx][0:-1], clf)
            tree_votes.append(list(outcome.keys())[0])
        votes.append(tree_votes)

    return np.sum(votes, axis=0)
def ValidationMining(target, fn, ln):
    """Compare a decision-tree prediction with the stored value for a player.

    Looks the player up in ``validtree`` by first and last name, reads the
    real value of ``target``, fetches the player's feature row from
    ``treesource``, trains a decision tree on the data returned by
    ``AllTables().sql`` and predicts ``target`` for that row.

    Args:
        target: name of the column to predict (one of the columns read
            from ``validtree``).
        fn: player's first name.
        ln: player's last name.

    Returns:
        Tuple ``(pred, real)`` where each element is ``"Y"``, ``"N"`` or
        ``"Unknown"`` (when no matching record exists).
    """

    def _sql_text(value):
        # Double single quotes so names like O'Neil neither break the
        # statement nor let the input escape the string literal.
        # NOTE(review): switch to parameterized queries if
        # databaseconnection() can accept bind parameters.
        return value.replace("'", "''")

    print("target: %s, fn: %s, ln: %s" % (target, fn, ln))

    # get real value for input
    r_sql = ("select * from validtree where nameFirst = '" + _sql_text(fn) +
             "' and nameLast = '" + _sql_text(ln) + "' limit 1;")
    realdata = databaseconnection(r_sql)
    if realdata is None or len(realdata) == 0:
        print("No record exists for %s %s" % (fn, ln))
        return "Unknown", "Unknown"

    r_df = pd.DataFrame(list(realdata))
    r_df.columns = [
        'playerID', 'nameFirst', 'nameLast', 'nom', 'hof', 'man'
    ]
    r_df.fillna(value=0, inplace=True)
    real = r_df[target].iloc[0]
    real = "Y" if int(real) == 1 else "N"
    print("real value is ", real)

    # get corresponding row
    playerid = r_df['playerID'].iloc[0]
    tables = AllTables()
    dfcols = tables.cols
    row_sql = ("select * from treesource where playerID = '" +
               _sql_text(playerid) + "'")
    rowdata = databaseconnection(row_sql)
    if rowdata is None or len(rowdata) == 0:
        # Without a feature row there is nothing to predict on.
        print("No treesource row exists for playerID %s" % playerid)
        return "Unknown", real
    rowdf = pd.DataFrame(list(rowdata))
    rowdf.columns = dfcols
    rowdf.fillna(value=0, inplace=True)

    # decision tree data
    sql = tables.sql
    results = databaseconnection(sql)
    print("get result from db..")
    df = pd.DataFrame(list(results))
    df.columns = dfcols
    df.fillna(value=0, inplace=True)
    df = removezero(df)

    # Keep the labels aside while the feature columns are discretised.
    y = df[target].values.astype(int)
    df.drop(columns=['playerID', 'nom', 'hof', 'man'], inplace=True)
    cols = list(df.columns.values)

    # Cast to integers and round to the nearest ten, for both the training
    # data and the row being predicted.
    df = df[cols].applymap(np.int64)
    df = df[cols].round(decimals=-1)
    rowdf = rowdf[cols].applymap(np.int64)
    rowdf = rowdf[cols].round(decimals=-1)
    row = rowdf.iloc[0]

    df[target] = y.tolist()
    train, _ = tree.train_test_split(df)  # held-out part is not used here

    attributes = cols
    print("Generating decision tree..")
    root = tree.build_tree(train, attributes, target)

    print("Start to predict..")
    pred = str(tree.predict(root, row))
    pred = "Y" if int(pred) == 1 else "N"
    return pred, real
def predict_forest_predict(forest, x):
    """Return the average of ``tree.predict`` over every member of ``forest``.

    Each tree's prediction for ``x`` is divided by the forest size before
    being added, so the running total is the mean. An empty forest
    yields ``0.0``.
    """
    size = len(forest)
    shares = (tree.predict(member, x) / size for member in forest)
    return sum(shares, 0.0)
['sunny','33','high','FALSE',25], ['sunny','32','high','TRUE',30], ['overcast','31','high','FALSE',46], ['rainy','22','high','FALSE',45], ['rainy','13','normal','FALSE',52], ['rainy','15','normal','TRUE',23], ['overcast','12','normal','TRUE',43], ['sunny','25','high','FALSE',35], ['sunny','13','normal','FALSE',35], ['rainy','23','normal','FALSE',38], ['sunny','24','normal','TRUE',46], ['overcast','25','high','TRUE',48], ['overcast','24','normal','FALSE',52], ['rainy','21','high','TRUE',44] ] """ datamatrix = [['1', '33', '90', '0', 25], ['1', '32', '90', '1', 30], ['50', '31', '90', '0', 46], ['100', '22', '90', '0', 45], ['100', '13', '50', '0', 52], ['100', '15', '50', '1', 23], ['50', '12', '50', '1', 43], ['1', '25', '90', '0', 35], ['1', '13', '50', '0', 35], ['100', '23', '50', '0', 38], ['1', '24', '50', '1', 46], ['50', '25', '90', '1', 48], ['50', '24', '50', '0', 52], ['100', '21', '90', '1', 44]] datamatrix.sort() tree = tree.Tree(tableheader, datamatrix) #, mode="between") tree.showTrees() print 'PREDICTION' #tree.predict(['sunny','hot','high','FALSE']) tree.predict(['sunny', 'hot', 'high', 'FALSE']) #tree.predict(['sunny','33','high','FALSE']) #print tree.predict(['1','24','50','1'])
def predict(forest, x):
    """Return the mean of ``tree.predict`` over all members of ``forest``.

    ``forest`` must expose ``.size`` and integer indexing (e.g. a 1-D
    numpy array of trees). The per-tree outputs are stored in an
    object-dtype array before ``np.mean`` is applied.
    """
    tree_count = forest.size
    outputs = [tree.predict(forest[idx], x) for idx in range(tree_count)]

    # Fill an object array slot by slot so each output is stored as-is.
    collected = np.empty(tree_count, dtype=object)
    for slot, value in enumerate(outputs):
        collected[slot] = value
    return np.mean(collected)