def setUp(self):
    """Prepare the random fixtures shared by the feature-histogram tests.

    Creates a ``FeatureHistogram`` instance, initialises eggroll, and builds:

    * 1000 sparse instances with 10 features each; every feature value is
      drawn from ``randint(0, 5)`` and zeros are left out of the sparse
      vector. Each instance is paired with ``(1, randint(0, 3))``.
    * 1000 random ``(grad, hess)`` pairs, kept both as a plain list and as
      an eggroll table.
    * 10 identical split-point arrays ``[0, 1, 2, 3, 4]`` and an all-zero
      ``bin_sparse`` list of length 10.
    """
    n_instances = 1000
    n_features = 10

    self.feature_histogram = FeatureHistogram()
    eggroll.init("test_feature_histogram")

    instances = []
    for _ in range(n_instances):
        # Draw every feature value first, then keep only the non-zero ones.
        draws = [random.randint(0, 5) for _ in range(n_features)]
        nonzero = [(col, val) for col, val in enumerate(draws) if val != 0]
        indices = [col for col, _ in nonzero]
        data = [val for _, val in nonzero]

        features = SparseVector(indices, data, shape=n_features)
        instance = Instance(features=features)
        dispatch = (1, random.randint(0, 3))
        instances.append((instance, dispatch))

    self.node_map = {node_id: node_id for node_id in range(4)}
    self.data_insts = instances
    self.data_bin = eggroll.parallelize(instances, include_key=False)

    self.grad_and_hess_list = [
        (random.random(), random.random()) for _ in range(n_instances)
    ]
    self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list,
                                             include_key=False)

    self.bin_split_points = np.array(
        [np.array(list(range(5))) for _ in range(n_features)]
    )
    self.bin_sparse = [0] * n_features
def get_histograms(self, node_map=None):
    """Compute and accumulate the histograms for the given nodes.

    Calls ``FeatureHistogram.calculate_histogram`` on the node-dispatched
    binned data and the gradient/hessian table held on ``self``, then feeds
    the result to ``FeatureHistogram.accumulate_histogram``.

    Args:
        node_map: mapping forwarded to ``calculate_histogram``
            (presumably node id -> histogram index; confirm against the
            callee). Defaults to a fresh empty dict.

    Returns:
        Whatever ``FeatureHistogram.accumulate_histogram`` returns.
    """
    # A None sentinel replaces the former ``{}`` default so that one dict
    # object is never shared between calls.
    if node_map is None:
        node_map = {}

    LOGGER.info("start to get node histograms")
    histograms = FeatureHistogram.calculate_histogram(
        self.data_bin_with_node_dispatch, self.grad_and_hess,
        self.bin_split_points, self.bin_sparse_points,
        self.valid_features, node_map)

    acc_histograms = FeatureHistogram.accumulate_histogram(histograms)
    return acc_histograms
def get_histograms(self, node_map=None):
    """Compute and accumulate the histograms for the given nodes.

    Calls ``FeatureHistogram.calculate_histogram`` on the position-tagged
    binned data and the gradient/hessian table held on ``self`` (forwarding
    the ``use_missing`` / ``zero_as_missing`` settings), then feeds the
    result to ``FeatureHistogram.accumulate_histogram``.

    Args:
        node_map: mapping forwarded to ``calculate_histogram``
            (presumably node id -> histogram index; confirm against the
            callee). Defaults to a fresh empty dict.

    Returns:
        Whatever ``FeatureHistogram.accumulate_histogram`` returns; its
        length is logged.
    """
    # A None sentinel replaces the former ``{}`` default so that one dict
    # object is never shared between calls.
    if node_map is None:
        node_map = {}

    LOGGER.info("start to get node histograms")
    histograms = FeatureHistogram.calculate_histogram(
        self.data_bin_with_position, self.grad_and_hess,
        self.bin_split_points, self.bin_sparse_points,
        self.valid_features, node_map,
        self.use_missing, self.zero_as_missing)

    LOGGER.info("begin to accumulate histograms")
    acc_histograms = FeatureHistogram.accumulate_histogram(histograms)
    LOGGER.info("acc histogram shape is {}".format(len(acc_histograms)))
    return acc_histograms
def get_histograms(self, node_map=None):
    """Compute and accumulate the histograms for the given nodes.

    Calls ``FeatureHistogram.calculate_histogram`` on the position-tagged
    binned data and the gradient/hessian table held on ``self``, then feeds
    the result to ``FeatureHistogram.accumulate_histogram``.

    Args:
        node_map: mapping forwarded to ``calculate_histogram``
            (presumably node id -> histogram index; confirm against the
            callee). Defaults to a fresh empty dict.

    Returns:
        Whatever ``FeatureHistogram.accumulate_histogram`` returns; its
        length is logged.
    """
    # A None sentinel replaces the former ``{}`` default so that one dict
    # object is never shared between calls.
    if node_map is None:
        node_map = {}

    LOGGER.info("start to get node histograms")
    histograms = FeatureHistogram.calculate_histogram(
        self.data_bin_with_position, self.grad_and_hess,
        self.bin_split_points, self.bin_sparse_points,
        self.valid_features, node_map)

    LOGGER.info("begin to accumulate histograms")
    acc_histograms = FeatureHistogram.accumulate_histogram(histograms)
    LOGGER.info("acc histogram shape is {}".format(len(acc_histograms)))
    return acc_histograms
def get_left_node_local_histogram(self, cur_nodes: List[Node], tree: List[Node], g_h, table_with_assign,
                                  split_points, sparse_point, valid_feature):
    """Build local histogram bags for the left nodes of the current layer.

    A node map restricted to left nodes is obtained from
    ``self.get_node_map``; the histograms computed for it are wrapped in
    ``HistogramBag`` objects. Each bag is then paired, in order, with the
    nodes from ``cur_nodes`` that are left nodes or have id 0, and receives
    that node's ``id`` as ``hid`` and ``parent_nodeid`` as ``p_hid``.

    ``tree`` is accepted but not used by this method.

    Returns:
        list of ``HistogramBag`` objects, one per computed histogram.
    """
    node_map = self.get_node_map(cur_nodes, left_node_only=True)

    LOGGER.info("start to get node histograms")
    raw_histograms = FeatureHistogram.calculate_histogram(
        table_with_assign, g_h,
        split_points, sparse_point,
        valid_feature, node_map,
        self.use_missing, self.zero_as_missing)

    hist_bags = [HistogramBag(raw_hist) for raw_hist in raw_histograms]

    # Node 0 is kept together with the left nodes.
    left_nodes = [
        candidate for candidate in cur_nodes
        if candidate.is_left_node or candidate.id == 0
    ]

    # Tag every bag with its own node id and its parent's node id.
    for left_node, bag in zip(left_nodes, hist_bags):
        bag.hid = left_node.id
        bag.p_hid = left_node.parent_nodeid

    return hist_bags
def get_histograms(self, node_map=None):
    """Compute the histograms for the given nodes in ``"tb"`` return mode.

    Calls ``FeatureHistogram.calculate_histogram`` on the position-tagged
    binned data and the gradient/hessian table held on ``self``, forwarding
    the ``use_missing`` / ``zero_as_missing`` settings and ``ret="tb"``.

    Args:
        node_map: mapping forwarded to ``calculate_histogram``
            (presumably node id -> histogram index; confirm against the
            callee). Defaults to a fresh empty dict.

    Returns:
        The value returned by ``FeatureHistogram.calculate_histogram``,
        unchanged.
    """
    # A None sentinel replaces the former ``{}`` default so that one dict
    # object is never shared between calls.
    if node_map is None:
        node_map = {}

    LOGGER.info("start to get node histograms")
    acc_histograms = FeatureHistogram.calculate_histogram(
        self.data_bin_with_position, self.grad_and_hess,
        self.bin_split_points, self.bin_sparse_points,
        self.valid_features, node_map,
        self.use_missing, self.zero_as_missing, ret="tb")
    return acc_histograms
def get_local_histogram(self, cur_to_split: List[Node], g_h, table_with_assign,
                        split_points, sparse_point, valid_feature):
    """Build local histogram bags for the nodes that are about to be split.

    The node map for ``cur_to_split`` comes from ``self.get_node_map``; the
    histograms computed for it by ``FeatureHistogram.calculate_histogram``
    are each wrapped in a ``HistogramBag``.

    Returns:
        list of ``HistogramBag`` objects, one per computed histogram.
    """
    LOGGER.info("start to get node histograms")
    node_map = self.get_node_map(nodes=cur_to_split)

    raw_histograms = FeatureHistogram.calculate_histogram(
        table_with_assign, g_h,
        split_points, sparse_point,
        valid_feature, node_map,
        self.use_missing, self.zero_as_missing)

    return [HistogramBag(raw_hist) for raw_hist in raw_histograms]
class TestFeatureHistogram(unittest.TestCase):
    """Unit tests for FeatureHistogram's accumulate / calculate / aggregate helpers."""

    def setUp(self):
        """Build the random instances, gradients and bin metadata used by the tests.

        1000 sparse instances with 10 features (values drawn from 0..5, zeros
        omitted) are each paired with ``(1, randint(0, 3))``; the second
        element of that pair is looked up in ``node_map`` by
        ``test_calculate_histogram``.
        """
        self.feature_histogram = FeatureHistogram()
        eggroll.init("test_feature_histogram")

        data_insts = []
        for _ in range(1000):
            indices = []
            data = []
            for j in range(10):
                x = random.randint(0, 5)
                if x != 0:
                    data.append(x)
                    indices.append(j)
            sparse_vec = SparseVector(indices, data, shape=10)
            data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))

        self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
        self.data_insts = data_insts
        self.data_bin = eggroll.parallelize(data_insts, include_key=False)

        self.grad_and_hess_list = [(random.random(), random.random()) for _ in range(1000)]
        self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list, include_key=False)

        # Every feature uses the same split points [0, 1, 2, 3, 4].
        self.bin_split_points = np.array([np.array(list(range(5))) for _ in range(10)])
        self.bin_sparse = [0 for _ in range(10)]

    @staticmethod
    def _random_histograms():
        """Return a random 5 x 4 x 3 x 2 nested list of ints in [0, 10]."""
        return [[[[random.randint(0, 10) for _ in range(2)]
                  for _ in range(3)]
                 for _ in range(4)]
                for _ in range(5)]

    def test_accumulate_histogram(self):
        """accumulate_histogram must match a running sum along the third axis."""
        data = self._random_histograms()
        # The callee gets a copy so the expectation is built from pristine data.
        histograms = self.feature_histogram.accumulate_histogram(copy.deepcopy(data))

        for i, node_hist in enumerate(data):
            for j, feature_hist in enumerate(node_hist):
                for k in range(1, len(feature_hist)):
                    for r in range(len(feature_hist[k])):
                        # Running sum: entry k absorbs the accumulated entry k - 1.
                        feature_hist[k][r] += feature_hist[k - 1][r]
                        self.assertEqual(feature_hist[k][r], histograms[i][j][k][r])

    def test_calculate_histogram(self):
        """calculate_histogram must match a brute-force per-node/feature/bin tally."""
        histograms = self.feature_histogram.calculate_histogram(
            self.data_bin, self.grad_and_hess,
            self.bin_split_points, self.bin_sparse,
            node_map=self.node_map)

        # Expected layout: [node][feature][bin] -> [sum_grad, sum_hess, count].
        his2 = [[[[0 for _ in range(3)]
                  for _ in range(6)]
                 for _ in range(10)]
                for _ in range(4)]
        for (instance, dispatch), (grad, hess) in zip(self.data_insts, self.grad_and_hess_list):
            node_idx = self.node_map[dispatch[1]]
            for fid, bid in instance.features.get_all_data():
                cell = his2[node_idx][fid][bid]
                cell[0] += grad
                cell[1] += hess
                cell[2] += 1

        for i, node_hist in enumerate(his2):
            for j, feature_hist in enumerate(node_hist):
                for k, bin_stats in enumerate(feature_hist):
                    for r, expected in enumerate(bin_stats):
                        self.assertLess(np.fabs(expected - histograms[i][j][k][r]),
                                        consts.FLOAT_ZERO)

    def test_aggregate_histogram(self):
        """aggregate_histogram must equal the element-wise sum of its inputs."""
        data1 = self._random_histograms()
        data2 = self._random_histograms()
        # Copies are passed so that, should aggregate_histogram work in place
        # and return its first argument, the comparison below is still made
        # against an independently computed sum.
        agg_histograms = self.feature_histogram.aggregate_histogram(
            copy.deepcopy(data1), copy.deepcopy(data2))

        for i, node_hist in enumerate(data1):
            for j, feature_hist in enumerate(node_hist):
                for k, bin_stats in enumerate(feature_hist):
                    for r in range(len(bin_stats)):
                        bin_stats[r] += data2[i][j][k][r]
                        self.assertEqual(bin_stats[r], agg_histograms[i][j][k][r])