def __init__(self, min_instances_slice=50, min_features_slice=0, alpha=0.001,
             row_cluster_method='KMeans', ind_test_method="pairwise_treeglm",
             sub_sample_rows=1000, cluster_penalty=2.0, n_cluster_splits=2,
             n_iters=1000, n_restarts=2, sklearn_args=None, cltree_leaves=False,
             poisson_leaves=True, rand_gen=None, cluster_prep_method="sqrt",
             family="poisson", cluster_first=True, cache=None):
    """
    Store the structure-learning hyperparameters and reset the slice ids.
    """
    self._min_instances_slice = min_instances_slice
    self._min_features_slice = min_features_slice
    self._alpha = alpha
    self._row_cluster_method = row_cluster_method
    self._ind_test_method = ind_test_method
    self._cluster_penalty = cluster_penalty
    self._n_cluster_splits = n_cluster_splits
    self._n_iters = n_iters
    self._n_restarts = n_restarts
    # avoid sharing a mutable default argument across instances
    self._sklearn_args = sklearn_args if sklearn_args is not None else {}
    self._cltree_leaves = cltree_leaves
    self.poisson_leaves = poisson_leaves
    self._cluster_prep_method = cluster_prep_method
    self.family = family
    self._sub_sample_rows = sub_sample_rows
    self._cluster_first = cluster_first
    self._rand_gen = rand_gen if rand_gen is not None \
        else numpy.random.RandomState(RND_SEED)

    if cache is not None:
        self.fit_structure = cache.cache(self.fit_structure)

    self.config = {
        "min_instances": min_instances_slice,
        "alpha": alpha,
        "cluster_method": row_cluster_method,
        "cluster_n_clusters": n_cluster_splits,
        "cluster_iters": n_iters,
        "cluster_prep_method": cluster_prep_method,
        "family": self.family
    }

    #
    # resetting the data slice ids (just in case)
    DataSlice.reset_id_counter()
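# Usage sketch (not part of the original module): the __init__ above belongs to
# a structure learner whose class name is not shown in this excerpt; it is
# called `PoissonStructureLearner` below purely for illustration, and the count
# matrix is synthetic. Only names that appear in this file (the constructor
# parameters, fit_structure, numpy) are relied upon.
#
#   import numpy
#
#   rand_gen = numpy.random.RandomState(17)
#   counts = rand_gen.poisson(lam=2.0, size=(200, 8))
#   learner = PoissonStructureLearner(min_instances_slice=30,
#                                     family="poisson",
#                                     row_cluster_method='KMeans',
#                                     rand_gen=rand_gen)
#   spn = learner.fit_structure(counts)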
def test_whole_slice():
    n_cols = 15
    n_rows = 10
    data_slice = DataSlice.whole_slice(n_rows, n_cols)
    assert data_slice.id == 0

    row_ids_t = data_slice.instance_ids == [i for i in range(n_rows)]
    print(data_slice.instance_ids, row_ids_t, numpy.all(row_ids_t))
    assert numpy.all(row_ids_t)

    assert (data_slice.feature_ids == [i for i in range(n_cols)]).all()
def test_greedy_feature_split():
    #
    # on synthetic data first
    # (`data`, `feature_vals` and `rand_gen` are assumed to be module-level
    # fixtures of this test file)
    g_factor = 2
    s_instance_ids = numpy.array([0, 2, 8, 4, 3])
    s_feature_ids = numpy.array([2, 1, 4, 0, 3])
    data_slice = DataSlice(s_instance_ids, s_feature_ids)

    feat_comp_1, feat_comp_2 = algo.learnspn.greedy_feature_split(
        data, data_slice, feature_vals, g_factor, rand_gen)
    print(feat_comp_1, feat_comp_2)
    assert set(list(s_feature_ids)) == set(list(feat_comp_1) +
                                           list(feat_comp_2))

    #
    # loading the dataset (using only the training portion)
    dataset_name = 'nltcs'
    print('Loading dataset', dataset_name)
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
def estimate_kernel_density_spn(data_slice,
                                feature_sizes,
                                data,
                                alpha,
                                node_id_assoc,
                                building_stack,
                                slices_to_process):
    """
    A mixture with one component for each instance
    """

    instance_ids = data_slice.instance_ids
    feature_ids = data_slice.feature_ids
    current_id = data_slice.id

    n_instances = len(instance_ids)
    n_features = len(feature_ids)

    logging.info('Adding a kernel density estimation ' +
                 'over a slice {0} X {1}'.format(n_instances, n_features))

    #
    # create sum node
    root_sum_node = SumNode(var_scope=frozenset(feature_ids))
    data_slice.type = SumNode
    building_stack.append(data_slice)

    root_sum_node.id = current_id
    node_id_assoc[current_id] = root_sum_node

    #
    # for each instance
    for i in instance_ids:
        #
        # create a slice
        instance_slice = DataSlice(numpy.array([i]), feature_ids)
        slices_to_process.append(instance_slice)
        #
        # linking with appropriate weight
        data_slice.add_child(instance_slice, 1.0 / n_instances)

    return root_sum_node, node_id_assoc, building_stack, slices_to_process
def make_naive_factorization(self,
                             current_slice,
                             slices_to_process,
                             building_stack,
                             node_id_assoc):

    logging.info('into a naive factorization')

    #
    # retrieving info from current slice
    current_instances = current_slice.instance_ids
    current_features = current_slice.feature_ids
    current_id = current_slice.id

    #
    # putting them in queue
    child_slices = [DataSlice(current_instances, [feature_id])
                    for feature_id in current_features]
    slices_to_process.extend(child_slices)
    children_ids = [child.id for child in child_slices]

    #
    # storing the children links
    for child_slice in child_slices:
        current_slice.add_child(child_slice)
    current_slice.type = ProductNode
    building_stack.append(current_slice)

    #
    # creating the product node
    prod_node = ProductNode(var_scope=frozenset(current_features))
    prod_node.id = current_id

    node_id_assoc[current_id] = prod_node
    logging.debug('\tCreated Prod Node %s (with children %s)',
                  prod_node, children_ids)

    return current_slice, slices_to_process, building_stack, node_id_assoc
def fit_structure(self, data):

    #
    # a queue containing the data slices to process
    slices_to_process = deque()
    # a stack for building nodes
    building_stack = deque()
    # a dict to keep track of id->nodes
    node_id_assoc = {}

    # creating the first slice
    whole_slice = DataSlice.whole_slice(data.shape[0], data.shape[1])
    slices_to_process.append(whole_slice)

    cluster_first = self._cluster_first

    #
    # iteratively process & split slices
    #
    while slices_to_process:

        # process a slice
        current_slice = slices_to_process.popleft()

        # pointers to the current data slice
        current_instances = current_slice.instance_ids
        current_features = current_slice.feature_ids
        current_id = current_slice.id
        n_features = len(current_features)

        # if n_features > 1:
        #     # print("removing Zeros")
        #     datarowsIdx = numpy.sum(data[current_instances, :][:, current_features], 1) > 0
        #     if not any(datarowsIdx):
        #         datarowsIdx[0] = True
        #     current_instances = current_slice.instance_ids[datarowsIdx]

        n_instances = len(current_instances)

        # if n_instances == 0:
        #     # too strong, cutting the zeroes
        #     current_instances = [current_slice.instance_ids[0]]
        #     n_instances = len(current_instances)

        slice_data_rows = data[current_instances, :]
        current_slice_data = slice_data_rows[:, current_features]

        # is this a leaf node or can we split?
        if n_features == 1 and (current_slice.doNotCluster or
                                n_instances <= self._min_instances_slice):

            (feature_id, ) = current_features

            if self.family == "poisson":
                leaf_node = PoissonNode(data, current_instances,
                                        current_features)
            elif self.family == "gaussian":
                leaf_node = GaussianNode(data, current_instances,
                                         current_features)

            # storing links
            # input_nodes.append(leaf_node)
            leaf_node.id = current_id
            node_id_assoc[current_id] = leaf_node

        # elif (current_slice_data.shape[0] < self._min_instances_slice):
        # elif ((n_instances <= self._min_instances_slice and n_features > 1) and current_slice_data.shape[0] < self._min_instances_slice):
        # elif ((n_instances <= self._min_instances_slice and n_features > 1)):
        elif n_features > 1 and (current_slice.doNotCluster or
                                 n_instances <= self._min_instances_slice):
            # print('into naive factorization')
            child_slices = [DataSlice(current_instances, [feature_id])
                            for feature_id in current_features]
            slices_to_process.extend(child_slices)
            # children_ids = [child.id for child in child_slices]

            for child_slice in child_slices:
                child_slice.doNotCluster = current_slice.doNotCluster
                current_slice.add_child(child_slice)

            current_slice.type = ProductNode
            building_stack.append(current_slice)
            prod_node = ProductNode(data, current_instances, current_features)
            prod_node.id = current_id

            node_id_assoc[current_id] = prod_node

        else:

            split_on_features = False
            # first_run = False

            #
            # first run is a split on rows
            if n_features == 1 or cluster_first:
                cluster_first = False
            else:
                if self._ind_test_method == "pairwise_treeglm" or \
                        self._ind_test_method == "subsample":

                    fcdata = current_slice_data

                    if self._ind_test_method == "subsample":
                        # sampled_rows = 2000
                        # sampled_rows = math.floor(current_slice_data.shape[0] * 10 / 100)
                        sampled_rows = self._sub_sample_rows
                        if sampled_rows < current_slice_data.shape[0]:
                            fcdata = current_slice_data[numpy.random.choice(
                                current_slice_data.shape[0], sampled_rows,
                                replace=False)]
                        else:
                            fcdata = current_slice_data

                    # Using R
                    # from pdn.independenceptest import getIndependentGroups
                    # feature_clusters = retrieve_clustering(getIndependentGroups(fcdata, alpha=self._alpha, family=self.family), current_features)
                    feature_clusters = retrieve_clustering(
                        getIndependentGroupsStabilityTest(fcdata,
                                                          alpha=self._alpha),
                        current_features)

                elif self._ind_test_method == "KMeans":
                    feature_clusters = retrieve_clustering(
                        cluster_rows(
                            (data[current_instances, :][:, current_features]).T,
                            n_clusters=2,
                            cluster_method=self._row_cluster_method,
                            n_iters=self._n_iters,
                            n_restarts=self._n_restarts,
                            cluster_prep_method="sqrt",
                            cluster_penalty=self._cluster_penalty,
                            rand_gen=self._rand_gen,
                            sklearn_args=self._sklearn_args),
                        current_instances)

                split_on_features = len(feature_clusters) > 1

            #
            # have dependent components been found?
            if split_on_features:
                #
                # splitting on columns
                # print('---> Splitting on features')
                # print(feature_clusters)
                slices = [DataSlice(current_instances, cluster)
                          for cluster in feature_clusters]
                slices_to_process.extend(slices)

                current_slice.type = ProductNode
                building_stack.append(current_slice)
                for child_slice in slices:
                    current_slice.add_child(child_slice)

                prod_node = ProductNode(data, current_instances,
                                        current_features)
                prod_node.id = current_id
                node_id_assoc[current_id] = prod_node

            else:
                # print('---> Splitting on rows')
                k_row_clusters = min(self._n_cluster_splits, n_instances - 1)

                if n_features == 1:
                    # do one kmeans run with K large enough to split into
                    # N min instances
                    k_row_clusters = math.floor(
                        n_instances / self._min_instances_slice) + 1
                    k_row_clusters = min(k_row_clusters, n_instances - 1)

                clustering = retrieve_clustering(
                    cluster_rows(
                        data[current_instances, :][:, current_features],
                        n_clusters=k_row_clusters,
                        cluster_method=self._row_cluster_method,
                        n_iters=self._n_iters,
                        n_restarts=self._n_restarts,
                        cluster_prep_method=self._cluster_prep_method,
                        cluster_penalty=self._cluster_penalty,
                        rand_gen=self._rand_gen,
                        sklearn_args=self._sklearn_args),
                    current_instances)

                cluster_slices = [DataSlice(cluster, current_features)
                                  for cluster in clustering]

                if len(clustering) < k_row_clusters:
                    for cluster_slice in cluster_slices:
                        cluster_slice.doNotCluster = True

                n_instances_clusters = sum([len(cluster)
                                            for cluster in clustering])
                cluster_weights = [len(cluster) / n_instances_clusters
                                   for cluster in clustering]

                slices_to_process.extend(cluster_slices)

                current_slice.type = SumNode
                building_stack.append(current_slice)
                for child_slice, child_weight in zip(cluster_slices,
                                                     cluster_weights):
                    current_slice.add_child(child_slice, child_weight)

                sum_node = SumNode(data, current_instances, current_features)
                sum_node.id = current_id
                node_id_assoc[current_id] = sum_node

    root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc,
                                                  building_stack, True)

    spn = SpnFactory.layered_linked_spn(root_node, data, self.config)

    return spn
def test_pruned_spn_from_slices():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    node_assoc = {}
    building_stack = deque()

    slice_1 = DataSlice.whole_slice(rows, cols)
    slice_1.type = SumNode
    node_1 = SumNode()
    node_1.id = slice_1.id
    node_assoc[node_1.id] = node_1
    building_stack.append(slice_1)

    slice_2 = DataSlice.whole_slice(rows, cols)
    slice_2.type = ProductNode
    node_2 = ProductNode()
    node_2.id = slice_2.id
    node_assoc[node_2.id] = node_2
    building_stack.append(slice_2)

    slice_3 = DataSlice.whole_slice(rows, cols)
    slice_3.type = SumNode
    node_3 = SumNode()
    node_3.id = slice_3.id
    node_assoc[node_3.id] = node_3
    building_stack.append(slice_3)

    # adding first level
    slice_1.add_child(slice_2, 0.8)
    slice_1.add_child(slice_3, 0.2)

    slice_4 = DataSlice.whole_slice(rows, cols)
    slice_4.type = ProductNode
    node_4 = ProductNode()
    node_4.id = slice_4.id
    node_assoc[node_4.id] = node_4
    building_stack.append(slice_4)

    leaf_5 = CategoricalSmoothedNode(var, values)
    slice_5 = DataSlice.whole_slice(rows, cols)
    leaf_5.id = slice_5.id
    node_assoc[leaf_5.id] = leaf_5
    # not adding the slice to the stack

    slice_2.add_child(slice_4)
    slice_2.add_child(slice_5)

    slice_6 = DataSlice.whole_slice(rows, cols)
    slice_6.type = SumNode
    node_6 = SumNode()
    node_6.id = slice_6.id
    node_assoc[node_6.id] = node_6
    building_stack.append(slice_6)

    slice_7 = DataSlice.whole_slice(rows, cols)
    slice_7.type = SumNode
    node_7 = SumNode()
    node_7.id = slice_7.id
    node_assoc[node_7.id] = node_7
    building_stack.append(slice_7)

    slice_3.add_child(slice_6, 0.4)
    slice_3.add_child(slice_7, 0.6)

    slice_8 = DataSlice.whole_slice(rows, cols)
    slice_8.type = ProductNode
    node_8 = ProductNode()
    node_8.id = slice_8.id
    node_assoc[node_8.id] = node_8
    building_stack.append(slice_8)

    leaf_15 = CategoricalSmoothedNode(var, values)
    slice_15 = DataSlice.whole_slice(rows, cols)
    leaf_15.id = slice_15.id
    node_assoc[leaf_15.id] = leaf_15

    slice_4.add_child(slice_8)
    slice_4.add_child(slice_15)

    leaf_13 = CategoricalSmoothedNode(var, values)
    slice_13 = DataSlice.whole_slice(rows, cols)
    leaf_13.id = slice_13.id
    node_assoc[leaf_13.id] = leaf_13

    leaf_14 = CategoricalSmoothedNode(var, values)
    slice_14 = DataSlice.whole_slice(rows, cols)
    leaf_14.id = slice_14.id
    node_assoc[leaf_14.id] = leaf_14

    slice_8.add_child(slice_13)
    slice_8.add_child(slice_14)

    slice_9 = DataSlice.whole_slice(rows, cols)
    slice_9.type = ProductNode
    node_9 = ProductNode()
    node_9.id = slice_9.id
    node_assoc[node_9.id] = node_9
    building_stack.append(slice_9)

    leaf_16 = CategoricalSmoothedNode(var, values)
    slice_16 = DataSlice.whole_slice(rows, cols)
    leaf_16.id = slice_16.id
    node_assoc[leaf_16.id] = leaf_16

    leaf_17 = CategoricalSmoothedNode(var, values)
    slice_17 = DataSlice.whole_slice(rows, cols)
    leaf_17.id = slice_17.id
    node_assoc[leaf_17.id] = leaf_17

    slice_9.add_child(slice_16)
    slice_9.add_child(slice_17)

    slice_10 = DataSlice.whole_slice(rows, cols)
    slice_10.type = ProductNode
    node_10 = ProductNode()
    node_10.id = slice_10.id
    node_assoc[node_10.id] = node_10
    building_stack.append(slice_10)

    leaf_18 = CategoricalSmoothedNode(var, values)
    slice_18 = DataSlice.whole_slice(rows, cols)
    leaf_18.id = slice_18.id
    node_assoc[leaf_18.id] = leaf_18

    leaf_19 = CategoricalSmoothedNode(var, values)
    slice_19 = DataSlice.whole_slice(rows, cols)
    leaf_19.id = slice_19.id
    node_assoc[leaf_19.id] = leaf_19

    slice_10.add_child(slice_18)
    slice_10.add_child(slice_19)

    slice_6.add_child(slice_9, 0.1)
    slice_6.add_child(slice_10, 0.9)

    slice_11 = DataSlice.whole_slice(rows, cols)
    slice_11.type = ProductNode
    node_11 = ProductNode()
    node_11.id = slice_11.id
    node_assoc[node_11.id] = node_11
    building_stack.append(slice_11)

    leaf_20 = CategoricalSmoothedNode(var, values)
    slice_20 = DataSlice.whole_slice(rows, cols)
    leaf_20.id = slice_20.id
    node_assoc[leaf_20.id] = leaf_20

    leaf_21 = CategoricalSmoothedNode(var, values)
    slice_21 = DataSlice.whole_slice(rows, cols)
    leaf_21.id = slice_21.id
    node_assoc[leaf_21.id] = leaf_21

    slice_11.add_child(slice_20)
    slice_11.add_child(slice_21)

    slice_12 = DataSlice.whole_slice(rows, cols)
    slice_12.type = ProductNode
    node_12 = ProductNode()
    node_12.id = slice_12.id
    node_assoc[node_12.id] = node_12
    building_stack.append(slice_12)

    leaf_22 = CategoricalSmoothedNode(var, values)
    slice_22 = DataSlice.whole_slice(rows, cols)
    leaf_22.id = slice_22.id
    node_assoc[leaf_22.id] = leaf_22

    leaf_23 = CategoricalSmoothedNode(var, values)
    slice_23 = DataSlice.whole_slice(rows, cols)
    leaf_23.id = slice_23.id
    node_assoc[leaf_23.id] = leaf_23

    slice_12.add_child(slice_22)
    slice_12.add_child(slice_23)

    slice_7.add_child(slice_11, 0.2)
    slice_7.add_child(slice_12, 0.7)

    root_node = SpnFactory.pruned_spn_from_slices(node_assoc, building_stack)
    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)
    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12
def fit_structure(self, data, feature_sizes):
    """
    data is a numpy array of size {n_instances X n_features}
    feature_sizes is an array of integers representing feature ranges
    """

    #
    # resetting the data slice ids (just in case)
    DataSlice.reset_id_counter()

    tot_n_instances = data.shape[0]
    tot_n_features = data.shape[1]

    logging.info('Learning SPN structure on a (%d X %d) dataset',
                 tot_n_instances, tot_n_features)
    learn_start_t = perf_counter()

    #
    # a queue containing the data slices to process
    slices_to_process = deque()
    # a stack for building nodes
    building_stack = deque()
    # a dict to keep track of id->nodes
    node_id_assoc = {}

    # creating the first slice
    whole_slice = DataSlice.whole_slice(tot_n_instances, tot_n_features)
    slices_to_process.append(whole_slice)

    first_run = True

    #
    # iteratively process & split slices
    #
    while slices_to_process:

        # process a slice
        current_slice = slices_to_process.popleft()

        # pointers to the current data slice
        current_instances = current_slice.instance_ids
        current_features = current_slice.feature_ids
        current_id = current_slice.id

        n_instances = len(current_instances)
        n_features = len(current_features)

        logging.info('\n*** Processing slice %d (%d X %d)',
                     current_id, n_instances, n_features)
        logging.debug('\tinstances:%s\n\tfeatures:%s',
                      current_instances, current_features)

        #
        # is this a leaf node or can we split?
        if n_features == 1:
            logging.info('---> Adding a leaf (just one feature)')

            (feature_id, ) = current_features
            feature_size = feature_sizes[feature_id]

            # slicing from the original dataset
            slice_data_rows = data[current_instances, :]
            current_slice_data = slice_data_rows[:, current_features]

            # create the node
            leaf_node = CategoricalSmoothedNode(var=feature_id,
                                                var_values=feature_size,
                                                data=current_slice_data,
                                                instances=current_instances,
                                                alpha=self._alpha)
            # print('lnvf', leaf_node._var_freqs)

            # storing links
            # input_nodes.append(leaf_node)
            leaf_node.id = current_id
            node_id_assoc[current_id] = leaf_node

            logging.debug('\tCreated Smooth Node %s', leaf_node)

        elif (n_instances <= self._min_instances_slice and n_features > 1):
            #
            # splitting the slice on each feature
            logging.info('---> Few instances (%d), decompose all features',
                         n_instances)
            #
            # shall we put a Chow-Liu tree here?
            if self._cltree_leaves:
                logging.info('into a Chow-Liu tree')
                #
                # slicing data
                slice_data_rows = data[current_instances, :]
                current_slice_data = slice_data_rows[:, current_features]

                current_feature_sizes = [feature_sizes[i]
                                         for i in current_features]
                #
                # creating a Chow-Liu tree as leaf
                leaf_node = CLTreeNode(vars=current_features,
                                       var_values=current_feature_sizes,
                                       data=current_slice_data,
                                       alpha=self._alpha)
                #
                # storing links
                leaf_node.id = current_id
                node_id_assoc[current_id] = leaf_node

                logging.debug('\tCreated Chow-Liu Tree Node %s', leaf_node)

            elif self._kde and n_instances > 1:
                estimate_kernel_density_spn(current_slice, feature_sizes,
                                            data, self._alpha, node_id_assoc,
                                            building_stack, slices_to_process)

            # elif n_instances == 1:  # FIXME: there is a bug here
            else:
                current_slice, slices_to_process, building_stack, node_id_assoc = \
                    self.make_naive_factorization(current_slice,
                                                  slices_to_process,
                                                  building_stack,
                                                  node_id_assoc)
        else:
            #
            # slicing from the original dataset
            slice_data_rows = data[current_instances, :]
            current_slice_data = slice_data_rows[:, current_features]

            split_on_features = False

            #
            # first run is a split on rows
            if first_run:
                logging.info('-- FIRST RUN --')
                first_run = False
            else:
                #
                # try clustering on cols
                # logging.debug('...trying to split on columns')
                split_start_t = perf_counter()
                print(data.shape)
                dependent_features, other_features = greedy_feature_split(
                    data, current_slice, feature_sizes, self._g_factor,
                    self._rand_gen)
                split_end_t = perf_counter()
                logging.info('...tried to split on columns in {}'.format(
                    split_end_t - split_start_t))

                if len(other_features) > 0:
                    split_on_features = True

            #
            # have dependent components been found?
            if split_on_features:
                #
                # splitting on columns
                logging.info('---> Splitting on features' +
                             ' {} -> ({}, {})'.format(len(current_features),
                                                      len(dependent_features),
                                                      len(other_features)))

                #
                # creating two new data slices and putting them on queue
                first_slice = DataSlice(current_instances, dependent_features)
                second_slice = DataSlice(current_instances, other_features)
                slices_to_process.append(first_slice)
                slices_to_process.append(second_slice)
                children_ids = [first_slice.id, second_slice.id]

                #
                # storing link parent -> children
                current_slice.type = ProductNode
                building_stack.append(current_slice)
                current_slice.add_child(first_slice)
                current_slice.add_child(second_slice)

                #
                # creating product node
                prod_node = ProductNode(var_scope=frozenset(current_features))
                prod_node.id = current_id
                node_id_assoc[current_id] = prod_node
                logging.debug('\tCreated Prod Node %s (with children %s)',
                              prod_node, children_ids)

            else:
                #
                # clustering on rows
                logging.info('---> Splitting on rows')

                #
                # at most n_rows clusters, for sklearn
                k_row_clusters = min(self._n_cluster_splits, n_instances - 1)

                clustering = cluster_rows(
                    data,
                    current_slice,
                    n_clusters=k_row_clusters,
                    cluster_method=self._row_cluster_method,
                    n_iters=self._n_iters,
                    n_restarts=self._n_restarts,
                    cluster_penalty=self._cluster_penalty,
                    rand_gen=self._rand_gen,
                    sklearn_args=self._sklearn_args)

                if len(clustering) < 2:
                    logging.info('\n\n\nLess than 2 clusters\n\n (%d)',
                                 len(clustering))
                    logging.info('forcing a naive factorization')
                    current_slice, slices_to_process, building_stack, node_id_assoc = \
                        self.make_naive_factorization(current_slice,
                                                      slices_to_process,
                                                      building_stack,
                                                      node_id_assoc)
                else:
                    # logging.debug('obtained clustering %s', clustering)
                    logging.info('clustered into %d parts (min %d)',
                                 len(clustering), k_row_clusters)

                    # splitting
                    cluster_slices = [DataSlice(cluster, current_features)
                                      for cluster in clustering]
                    cluster_slices_ids = [slice.id
                                          for slice in cluster_slices]

                    # cluster_prior = 5.0
                    # cluster_weights = [(slice.n_instances() + cluster_prior) /
                    #                    (n_instances + cluster_prior * len(cluster_slices))
                    #                    for slice in cluster_slices]
                    cluster_weights = [slice.n_instances() / n_instances
                                       for slice in cluster_slices]

                    #
                    # appending for processing
                    slices_to_process.extend(cluster_slices)

                    #
                    # storing links
                    # current_slice.children = cluster_slices_ids
                    # current_slice.weights = cluster_weights
                    current_slice.type = SumNode
                    building_stack.append(current_slice)
                    for child_slice, child_weight in zip(cluster_slices,
                                                         cluster_weights):
                        current_slice.add_child(child_slice, child_weight)

                    #
                    # building a sum node
                    SCOPES_DICT[frozenset(current_features)] += 1
                    sum_node = SumNode(var_scope=frozenset(current_features))
                    sum_node.id = current_id
                    node_id_assoc[current_id] = sum_node
                    logging.debug('\tCreated Sum Node %s (with children %s)',
                                  sum_node, cluster_slices_ids)

    learn_end_t = perf_counter()
    logging.info('\n\n\tStructure learned in %f secs',
                 (learn_end_t - learn_start_t))

    #
    # linking the spn graph (parent -> children)
    #
    logging.info('===> Building tree')

    link_start_t = perf_counter()
    root_build_node = building_stack[0]
    root_node = node_id_assoc[root_build_node.id]
    logging.debug('root node: %s', root_node)

    root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc,
                                                  building_stack)
    link_end_t = perf_counter()
    logging.info('\tLinked the spn in %f secs (root_node %s)',
                 (link_end_t - link_start_t), root_node)

    #
    # building layers
    #
    logging.info('===> Layering spn')
    layer_start_t = perf_counter()
    spn = SpnFactory.layered_linked_spn(root_node)
    layer_end_t = perf_counter()
    logging.info('\tLayered the spn in %f secs',
                 (layer_end_t - layer_start_t))

    logging.info('\nLearned SPN\n\n%s', spn.stats())

    # logging.info('%s', SCOPES_DICT.most_common(30))

    return spn
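# Usage sketch (not part of the original source): assuming the fit_structure
# above is a method of the LearnSPN-style learner configured by the earlier
# __init__, here called `SpnLearner` only for illustration. `feature_sizes`
# gives the number of values per (categorical) column, all binary here; only
# names appearing in this excerpt (fit_structure, spn.stats, numpy) are used.
#
#   import numpy
#
#   binary_data = numpy.random.RandomState(1337).randint(2, size=(500, 16))
#   feature_sizes = [2] * binary_data.shape[1]
#   learner = SpnLearner()  # hypothetical class name
#   spn = learner.fit_structure(binary_data, feature_sizes)
#   print(spn.stats())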