def split_cols_RDC_py(local_data, ds_context, scope):
    """Split the columns of ``local_data`` using RDC-based independence groups.

    The column grouping comes from ``getIndependentRDCGroups_py``. When the
    slice has more than ``max_sampling_threshold_cols`` rows, the grouping is
    estimated on a random row sample (indices drawn with ``np.random.randint``,
    so rows may repeat); the resulting clusters are always applied to the full
    ``local_data``.

    ``threshold``, ``k``, ``s``, ``non_linearity``, ``n_jobs``, ``rand_gen``
    and ``max_sampling_threshold_cols`` are not defined in this block --
    presumably captured from an enclosing scope; TODO confirm.

    Returns whatever ``split_data_by_clusters(..., rows=False)`` returns.
    """
    meta_types = ds_context.get_meta_types_by_scope(scope)
    domains = ds_context.get_domains_by_scope(scope)

    # Only the data handed to the RDC test differs between the large and
    # the small case; everything else is shared.
    n_rows = local_data.shape[0]
    if n_rows > max_sampling_threshold_cols:
        sampled_rows = np.random.randint(n_rows, size=max_sampling_threshold_cols)
        rdc_input = local_data[sampled_rows, :]
    else:
        rdc_input = local_data

    clusters = getIndependentRDCGroups_py(
        rdc_input,
        threshold,
        meta_types,
        domains,
        k=k,
        s=s,
        non_linearity=non_linearity,
        n_jobs=n_jobs,
        rand_gen=rand_gen,
    )
    return split_data_by_clusters(local_data, clusters, scope, rows=False)
def split_cols_RDC_py(local_data, ds_context, scope, l_rfft=None, is_pair=False):
    """Split the columns of ``local_data`` using RDC-based independence groups.

    Variant that forwards ``scope``, ``l_rfft`` and ``is_pair`` positionally
    to ``getIndependentRDCGroups_py``. ``local_data`` is passed through
    unchanged (it is not sliced by ``scope`` here).

    ``threshold``, ``k``, ``s``, ``non_linearity``, ``n_jobs`` and
    ``rand_gen`` are not defined in this block -- presumably captured from an
    enclosing scope; TODO confirm.

    Returns whatever ``split_data_by_clusters(..., rows=False)`` returns.
    """
    scope_meta_types = ds_context.get_meta_types_by_scope(scope)
    scope_domains = ds_context.get_domains_by_scope(scope)

    rdc_options = dict(
        k=k,
        s=s,
        non_linearity=non_linearity,
        n_jobs=n_jobs,
        rand_gen=rand_gen,
    )
    clusters = getIndependentRDCGroups_py(
        local_data,
        threshold,
        scope_meta_types,
        scope_domains,
        scope,
        l_rfft,
        is_pair,
        **rdc_options,
    )
    return split_data_by_clusters(local_data, clusters, scope, rows=False)
def split_rows_KMeans(local_data, ds_context, scope):
    """Split the rows of ``local_data`` by KMeans cluster membership.

    Clustering runs on the output of ``preproc``; the split itself is applied
    to the untouched ``local_data``. ``pre_proc``, ``ohe``, ``n_clusters`` and
    ``seed`` are not defined in this block -- presumably captured from an
    enclosing scope; TODO confirm.
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)
    clusterer = KMeans(n_clusters=n_clusters, random_state=seed)
    row_labels = clusterer.fit_predict(features)
    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_rows_RDC(local_data, ds_context, scope):
    """Split the rows of ``local_data`` by KMeans clustering of RDC features.

    The data is first mapped through ``get_RDC_transform`` using the meta
    types of ``scope``; KMeans labels computed on that representation are then
    used to split the original ``local_data`` row-wise.

    ``ohe``, ``k``, ``s``, ``n_clusters`` and ``seed`` are not defined in this
    block -- presumably captured from an enclosing scope; TODO confirm.

    Returns whatever ``split_data_by_clusters(..., rows=True)`` returns.
    """
    data = get_RDC_transform(
        local_data,
        ds_context.get_meta_types_by_scope(scope),
        ohe,
        k=k,
        s=s,
    )

    # ``n_jobs`` is intentionally not passed: KMeans deprecated it in
    # scikit-learn 0.23 and removed it in 1.0 (passing it raises TypeError).
    # The previous value (1) matched the single-job default anyway.
    clusters = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(data)

    return split_data_by_clusters(local_data, clusters, scope, rows=True)
def split_rows_naive_mle_conditioning(local_data):
    """Pick the column whose conditioning maximises the weighted naive score.

    For every column that is neither all-zero nor all-one (judged by its
    sum), the rows are split on ``column == 1``, the conditioning column is
    removed from both slices, and ``naive_ll`` is evaluated on each slice.
    The two scores (each shifted by the log of its slice proportion) are
    combined as a row-count-weighted average; the column with the highest
    combined score wins.

    Parameters
    ----------
    local_data : 2-D array
        Presumably binary (0/1) data -- the split tests ``== 1`` and expects
        exactly two slices back; TODO confirm.

    Returns
    -------
    (best_col_conditioning, found) : tuple
        ``best_col_conditioning`` is the winning column index, or ``None``
        when no column qualified; ``found`` is ``True`` iff a column was
        selected.
    """
    n_rows, n_cols = local_data.shape
    scope = list(range(n_cols))

    best_col_conditioning = None
    best_conditioning_ll = -np.inf

    for col_conditioning in range(n_cols):
        ones = np.sum(local_data[:, col_conditioning])
        # A constant column cannot split the rows into two groups.
        if ones == 0 or ones == n_rows:
            continue

        clusters = (local_data[:, col_conditioning] == 1).astype(int)
        data_slices = split_data_by_clusters(local_data, clusters, scope, rows=True)
        left_data_slice, _, left_proportion = data_slices[0]
        right_data_slice, _, right_proportion = data_slices[1]

        # Drop the conditioning column before scoring each slice.
        left_data_slice = np.delete(left_data_slice, col_conditioning, axis=1)
        right_data_slice = np.delete(right_data_slice, col_conditioning, axis=1)

        left_ll = naive_ll(left_data_slice)
        right_ll = naive_ll(right_data_slice)

        conditioning_ll = (
            (left_ll + np.log(left_proportion)) * left_data_slice.shape[0]
            + (right_ll + np.log(right_proportion)) * right_data_slice.shape[0]
        ) / n_rows

        if conditioning_ll > best_conditioning_ll:
            best_conditioning_ll = conditioning_ll
            best_col_conditioning = col_conditioning

    return best_col_conditioning, best_col_conditioning is not None
def split_cols_RDC(local_data, ds_context, scope):
    """Split the columns of ``local_data`` from an RDC adjacency matrix.

    ``get_RDC_adjacency_matrix`` is evaluated on the slice, and
    ``clusters_by_adjacency_matrix`` turns it into column clusters given
    ``threshold`` and the number of columns.

    ``ohe``, ``linear`` and ``threshold`` are not defined in this block --
    presumably captured from an enclosing scope; TODO confirm.
    """
    scope_meta_types = ds_context.get_meta_types_by_scope(scope)
    adjacency = get_RDC_adjacency_matrix(local_data, scope_meta_types, ohe, linear)

    n_columns = local_data.shape[1]
    column_clusters = clusters_by_adjacency_matrix(adjacency, threshold, n_columns)

    return split_data_by_clusters(local_data, column_clusters, scope, rows=False)
def split_rows_KMeans(local_data, ds_context, scope):
    """Split rows by KMeans and also hand back the fitted model.

    Returns a 2-tuple: the result of ``split_data_by_clusters`` on the
    original ``local_data`` and the fitted ``KMeans`` instance.

    ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed`` are not defined in this
    block -- presumably captured from an enclosing scope; TODO confirm.
    """
    # Imported locally, as in the original implementation.
    from sklearn.cluster import KMeans

    features = preproc(local_data, ds_context, pre_proc, ohe)
    km_model = KMeans(n_clusters=n_clusters, random_state=seed)
    row_labels = km_model.fit_predict(features)
    slices = split_data_by_clusters(local_data, row_labels, scope, rows=True)
    return slices, km_model
def split_conditional_rows_KMeans(local_data, ds_context, scope):
    """Split rows by KMeans clustering of the first part returned by ``get_YX``.

    ``get_YX(local_data, ds_context.feature_size)`` returns a ``(y, x)`` pair;
    only ``y`` is preprocessed and clustered, while the row split is applied
    to the full ``local_data``.

    ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed`` are not defined in this
    block -- presumably captured from an enclosing scope; TODO confirm.

    Returns whatever ``split_data_by_clusters(..., rows=True)`` returns.
    """
    y, _ = get_YX(local_data, ds_context.feature_size)
    data = preproc(y, ds_context, pre_proc, ohe)

    # ``precompute_distances`` is intentionally not passed: KMeans deprecated
    # it in scikit-learn 0.23 (where it stopped having any effect) and removed
    # it in 1.0, so passing it raises TypeError on current releases.
    clusters = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(data)

    return split_data_by_clusters(local_data, clusters, scope, rows=True)
def split_rows_KMeans(local_data, ds_context, scope):
    """Split rows by KMeans run on a 3-component TSNE embedding.

    The preprocessed data is embedded with ``TSNE(n_components=3)`` and the
    embedding is clustered with KMeans; the labels split the original
    ``local_data`` row-wise.

    ``pre_proc``, ``ohe``, ``verbose``, ``ncpus``, ``seed`` and ``n_clusters``
    are not defined in this block -- presumably captured from an enclosing
    scope; TODO confirm.
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)

    embedder = TSNE(n_components=3, verbose=verbose, n_jobs=ncpus, random_state=seed)
    embedded = embedder.fit_transform(features)

    clusterer = KMeans(n_clusters=n_clusters, random_state=seed)
    row_labels = clusterer.fit_predict(embedded)

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_cols_random_partitions(local_data, ds_context, scope):
    """Split the columns into two halves by scope order, or refuse to split.

    When ``rand_gen.random_sample() < fail`` the data is returned as one
    unsplit slice ``[(local_data, scope, 1.0)]``. Otherwise the positions of
    ``scope`` are ordered with ``np.argsort`` and cut into two parts with
    ``np.array_split``: the first part is cluster 0, the second cluster 1.

    ``rand_gen`` and ``fail`` are not defined in this block -- presumably
    captured from an enclosing scope; TODO confirm.
    """
    if rand_gen.random_sample() < fail:
        return [(local_data, scope, 1.0)]

    ordered_positions = np.argsort(scope)
    _first_half, second_half = np.array_split(ordered_positions, 2)

    # Everything starts in cluster 0; only the second half needs relabelling.
    clusters = np.zeros_like(scope)
    clusters[second_half] = 1

    return split_data_by_clusters(local_data, clusters, scope, rows=False)
def split_rows_binary_random_partition(local_data, ds_context, scope):
    """Randomly assign each row to one of two clusters and split on that.

    The probability of cluster 0 is drawn from ``rand_gen.beta(beta_a,
    beta_b)``; each row's label is then sampled independently with
    ``rand_gen.choice``. ``ds_context`` is not used.

    ``rand_gen``, ``beta_a`` and ``beta_b`` are not defined in this block --
    presumably captured from an enclosing scope; TODO confirm.
    """
    # Share of rows expected to land in cluster 0.
    first_share = rand_gen.beta(a=beta_a, b=beta_b)

    n_rows = local_data.shape[0]
    row_labels = rand_gen.choice(2, size=n_rows, p=[first_share, 1 - first_share])

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_rows_Gower(local_data, ds_context, scope):
    """Split rows using the R function ``mixedclustering``.

    The data is preprocessed (with the last ``preproc`` argument fixed to
    ``False``), converted through R's ``as.data.frame`` and clustered by
    ``mixedclustering`` with ``ds_context.distribution_family``,
    ``n_clusters`` and ``seed``.

    On any exception the raw ``local_data`` is dumped to
    ``/tmp/errordata.txt``, the exception is printed, and it is re-raised.

    ``pre_proc``, ``n_clusters``, ``seed`` and ``robjects`` are not defined in
    this block -- presumably provided by the enclosing module/scope; TODO
    confirm.
    """
    data = preproc(local_data, ds_context, pre_proc, False)

    as_data_frame = robjects.r["as.data.frame"]
    try:
        frame = as_data_frame(data)
        mixed_clustering = robjects.r["mixedclustering"]
        r_labels = mixed_clustering(frame, ds_context.distribution_family, n_clusters, seed)
        clusters = np.asarray(r_labels)
    except Exception as err:
        # Keep the offending input around for post-mortem inspection.
        np.savetxt("/tmp/errordata.txt", local_data)
        print(err)
        raise err

    return split_data_by_clusters(local_data, clusters, scope, rows=True)
def split_rows_GMM(local_data, ds_context, scope):
    """Split rows by the component assignment of a fitted Gaussian mixture.

    A ``GaussianMixture`` is fitted on the preprocessed data and then used to
    predict a component for each row of that same data; the labels split the
    original ``local_data`` row-wise.

    ``pre_proc``, ``ohe``, ``n_clusters``, ``covariance_type``, ``max_iter``,
    ``n_init`` and ``seed`` are not defined in this block -- presumably
    captured from an enclosing scope; TODO confirm.
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)

    mixture = GaussianMixture(
        n_components=n_clusters,
        covariance_type=covariance_type,
        max_iter=max_iter,
        n_init=n_init,
        random_state=seed,
    )
    fitted = mixture.fit(features)
    row_labels = fitted.predict(features)

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_cols_RDC_py(local_data, ds_context, scope):
    """Split the columns of ``local_data`` using RDC-based independence groups.

    Meta types and domains for ``scope`` are looked up on ``ds_context`` and
    passed, together with the whole slice, to ``getIndependentRDCGroups_py``.

    ``threshold``, ``k``, ``s``, ``non_linearity``, ``n_jobs`` and
    ``rand_gen`` are not defined in this block -- presumably captured from an
    enclosing scope; TODO confirm.
    """
    scope_meta_types = ds_context.get_meta_types_by_scope(scope)
    scope_domains = ds_context.get_domains_by_scope(scope)

    rdc_options = dict(
        k=k,
        s=s,
        non_linearity=non_linearity,
        n_jobs=n_jobs,
        rand_gen=rand_gen,
    )
    column_clusters = getIndependentRDCGroups_py(
        local_data, threshold, scope_meta_types, scope_domains, **rdc_options
    )

    return split_data_by_clusters(local_data, column_clusters, scope, rows=False)
def split_rows_KMeans(local_data, ds_context, scope):
    """Split rows by KMeans, fitting on a row sample when the data is large.

    If the preprocessed data has more than ``max_sampling_threshold_rows``
    rows, KMeans is fitted on a random sample (indices from
    ``np.random.randint``, so rows may repeat) and then used to label every
    row; otherwise it is fitted and applied on all rows at once.

    Returns a 2-tuple: the result of ``split_data_by_clusters`` on the
    original ``local_data`` and the cluster centers as nested lists.

    ``pre_proc``, ``ohe``, ``n_clusters``, ``seed`` and
    ``max_sampling_threshold_rows`` are not defined in this block --
    presumably captured from an enclosing scope; TODO confirm.
    """
    data = preproc(local_data, ds_context, pre_proc, ohe)
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)

    n_rows = data.shape[0]
    if n_rows > max_sampling_threshold_rows:
        sampled_rows = np.random.randint(n_rows, size=max_sampling_threshold_rows)
        clusters = kmeans.fit(data[sampled_rows, :]).predict(data)
    else:
        clusters = kmeans.fit_predict(data)

    centers = kmeans.cluster_centers_
    slices = split_data_by_clusters(local_data, clusters, scope, rows=True)
    return slices, centers.tolist()
def split_rows_RDC_py(local_data, ds_context, scope):
    """Split rows by KMeans clustering of the RDC-transformed data.

    ``rdc_transformer`` is called with ``return_matrix=True`` and its output
    is clustered with KMeans (seeded with ``rand_gen``); the labels split the
    original ``local_data`` row-wise.

    ``k``, ``s``, ``non_linearity``, ``rand_gen``, ``n_clusters`` and
    ``n_jobs`` are not defined in this block -- presumably captured from an
    enclosing scope; TODO confirm.
    """
    scope_meta_types = ds_context.get_meta_types_by_scope(scope)
    scope_domains = ds_context.get_domains_by_scope(scope)

    rdc_features = rdc_transformer(
        local_data,
        scope_meta_types,
        scope_domains,
        k=k,
        s=s,
        non_linearity=non_linearity,
        return_matrix=True,
        rand_gen=rand_gen,
    )

    # NOTE(review): KMeans' ``n_jobs`` argument was removed in scikit-learn
    # 1.0 -- confirm the pinned scikit-learn version still accepts it.
    clusterer = KMeans(n_clusters=n_clusters, random_state=rand_gen, n_jobs=n_jobs)
    row_labels = clusterer.fit_predict(rdc_features)

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_rows_RuleClustering(
    local_data,
    ds_context,
    scope,
):
    """Split rows with a rule learned from a decision tree over KMeans labels.

    Steps: preprocess the data, cluster it with KMeans, fit a
    ``DecisionTreeClassifier`` on the KMeans labels (each sample weighted by
    ``(N - cluster_size) / N``), convert the tree into a rule via
    ``tree_to_rule``, and finally split the *preprocessed* data by applying
    the negated rule.

    ``model`` selects the tree configuration: ``'stump'`` (depth 1) or
    ``'tree'`` (unbounded depth with pruning options). ``'m-estimate'`` and
    any other value raise ``ValueError``.

    Returns ``(split, (left_rule, right_rule))``; the split is asserted to
    contain exactly two slices.

    ``pre_proc``, ``ohe``, ``k``, ``rand_state`` and ``model`` are not defined
    in this block -- presumably captured from an enclosing scope; TODO
    confirm.
    """
    data = preproc(local_data, ds_context, pre_proc, ohe)

    # https://stackoverflow.com/a/39772170/5595684
    km = KMeans(k, random_state=rand_state)
    km.fit_predict(data)  # the labels are read back from km.labels_ below

    # inverse weight classes, todo test if this works ok
    cluster_ids, cluster_sizes = np.unique(km.labels_, return_counts=True)
    n_samples = len(data)
    weight_by_cluster = {
        cluster_id: (n_samples - size) / n_samples
        for cluster_id, size in zip(cluster_ids, cluster_sizes)
    }
    sample_weights = [weight_by_cluster[label] for label in km.labels_]

    if model == 'stump':
        tree_options = dict(random_state=rand_state, max_depth=1)
    elif model == 'tree':
        # NOTE(review): ``min_impurity_split`` was removed in scikit-learn
        # 1.0 -- confirm the pinned scikit-learn version still accepts it.
        tree_options = dict(
            random_state=rand_state,
            max_depth=None,
            ccp_alpha=0.05,
            min_impurity_split=0.01,
        )
    elif model == 'm-estimate':
        raise ValueError('Not implemented')
    else:
        raise ValueError(str(model) + ' unknown model type')

    dtc = DecisionTreeClassifier(**tree_options).fit(
        data, km.labels_, sample_weight=sample_weights
    )
    left_rule = tree_to_rule(dtc, scope, ds_context)

    # todo try out rule clusters
    right_rule = left_rule.negate()
    rule_clusters = right_rule.apply(data, scope_partial_data=scope).astype(int)

    split = split_data_by_clusters(data, rule_clusters, scope, rows=True)
    assert len(split) == 2
    return split, (left_rule, right_rule)
def split_cols_binary_random_partitions(local_data, ds_context, scope):
    """Randomly assign each column to one of two clusters, or to a single one.

    A value ``p`` is drawn with ``rand_gen.rand()``. If ``p > threshold`` the
    cluster-0 probability is drawn from a Beta distribution and every column
    is labelled independently; otherwise all columns get label 0, so that no
    column split happens (the original comments note this lets row
    partitioning take over). ``ds_context`` is not used.

    ``rand_gen``, ``threshold``, ``beta_a`` and ``beta_b`` are not defined in
    this block -- presumably captured from an enclosing scope; TODO confirm.
    """
    n_columns = local_data.shape[1]

    if rand_gen.rand() > threshold:
        # Share of columns expected to land in cluster 0.
        first_share = rand_gen.beta(a=beta_a, b=beta_b)
        column_labels = rand_gen.choice(
            2, size=n_columns, p=[first_share, 1 - first_share]
        )
    else:
        column_labels = np.zeros(n_columns)

    return split_data_by_clusters(local_data, column_labels, scope, rows=False)
def split_rows_Gower(local_data, ds_context, scope):
    """Split rows using the R function ``mixedclustering`` on the ``y`` part.

    ``get_YX`` returns a ``(y, x)`` pair; only ``y`` is preprocessed (with the
    last ``preproc`` argument fixed to ``False``) and clustered. Each scope
    entry is mapped to a feature-type string from its meta type: BINARY ->
    ``"categorical"``, DISCRETE -> ``"discrete"``, anything else ->
    ``"continuous"``.

    On any exception the raw ``local_data`` is dumped to
    ``/tmp/errordata.txt`` and the exception is re-raised.

    ``pre_proc``, ``n_clusters``, ``seed`` and ``robjects`` are not defined in
    this block -- presumably provided by the enclosing module/scope; TODO
    confirm.
    """

    def _feature_type(meta_type):
        # Translate a meta type into the label passed to the R routine.
        if meta_type == MetaType.BINARY:
            return "categorical"
        if meta_type == MetaType.DISCRETE:
            return "discrete"
        return "continuous"

    y, _ = get_YX(local_data, ds_context.feature_size)
    data = preproc(y, ds_context, pre_proc, False)

    feature_types = [_feature_type(ds_context.meta_types[idx]) for idx in scope]

    try:
        frame = robjects.r["as.data.frame"](data)
        r_labels = robjects.r["mixedclustering"](frame, feature_types, n_clusters, seed)
        clusters = np.asarray(r_labels)
    except Exception as err:
        # Keep the offending input around for post-mortem inspection.
        np.savetxt("/tmp/errordata.txt", local_data)
        raise err

    return split_data_by_clusters(local_data, clusters, scope, rows=True)
def split_rows_DBScan(local_data, ds_context, scope):
    """Split the rows of ``local_data`` by DBSCAN cluster labels.

    Clustering runs on the output of ``preproc``; the labels returned by
    ``DBSCAN.fit_predict`` are passed unchanged to ``split_data_by_clusters``.

    ``pre_proc``, ``ohe``, ``eps`` and ``min_samples`` are not defined in this
    block -- presumably captured from an enclosing scope; TODO confirm.
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)
    scanner = DBSCAN(eps=eps, min_samples=min_samples)
    row_labels = scanner.fit_predict(features)
    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_cols_random_partitions(local_data, ds_context, scope):
    """Split columns with one random plane applied to the transposed data.

    The data is transposed before preprocessing, so each original column
    becomes a row. ``make_planes(1, n_dims, rand_gen)`` builds a single plane
    and ``above`` is evaluated against it; the first column of that result
    supplies the column labels.

    ``ohe`` and ``rand_gen`` are not defined in this block -- presumably
    captured from an enclosing scope; TODO confirm.
    """
    transposed = preproc(local_data.T, ds_context, None, ohe)

    planes = make_planes(1, transposed.shape[1], rand_gen)
    side_of_plane = above(planes, transposed)
    column_labels = side_of_plane[:, 0]

    return split_data_by_clusters(local_data, column_labels, scope, rows=False)
def split_rows_random_partitions(local_data, ds_context, scope):
    """Split rows with one random plane.

    ``make_planes(1, n_dims, rand_gen)`` builds a single plane, sized from the
    column count of the *unprocessed* ``local_data``; ``above`` is evaluated
    against the preprocessed data and the first column of its result supplies
    the row labels.

    ``ohe`` and ``rand_gen`` are not defined in this block -- presumably
    captured from an enclosing scope; TODO confirm.
    """
    features = preproc(local_data, ds_context, None, ohe)

    n_dims = local_data.shape[1]
    planes = make_planes(1, n_dims, rand_gen)
    row_labels = above(planes, features)[:, 0]

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_cols_GTest(local_data, ds_context, scope):
    """Split the columns of ``local_data`` via ``gtest_greedy_feature_split``.

    The full ``ds_context.domains`` (not restricted to ``scope``) is handed to
    the G-test routine along with ``threshold`` and ``rand_gen``.

    ``threshold`` and ``rand_gen`` are not defined in this block -- presumably
    captured from an enclosing scope; TODO confirm.
    """
    column_clusters = gtest_greedy_feature_split(
        local_data, ds_context.domains, threshold, rand_gen
    )
    return split_data_by_clusters(local_data, column_clusters, scope, rows=False)
def split_rows_RuleClustering(
    local_data,
    ds_context,
    scope,
):
    """Split rows by KMeans and describe the split with a decision-tree rule.

    Steps: preprocess the data, cluster it with KMeans, fit a
    ``DecisionTreeClassifier`` on the KMeans labels (each sample weighted by
    ``(N - cluster_size) / N``) and convert the tree into a rule via
    ``tree_to_rule``. The returned split is made on the *preprocessed* data
    using the KMeans labels (not the rule), and is asserted to have exactly
    two slices.

    ``model`` selects the tree configuration: ``'stump'`` (depth 1) or
    ``'tree'`` (unbounded depth with pruning options). ``'m-estimate'`` and
    any other value raise ``ValueError``.

    When ``debug`` is truthy the tree is printed as text and, for
    2-dimensional data, a scatter plot with the rule thresholds is shown.

    Returns ``(split, (left_rule, right_rule))`` where ``right_rule`` is
    ``left_rule.negate()``.

    ``pre_proc``, ``ohe``, ``k``, ``rand_state``, ``model`` and ``debug`` are
    not defined in this block -- presumably captured from an enclosing scope;
    TODO confirm.
    """
    data = preproc(local_data, ds_context, pre_proc, ohe)

    # https://stackoverflow.com/a/39772170/5595684
    km = KMeans(k, random_state=rand_state)
    km_clusters = km.fit_predict(data)
    labels, counts = np.unique(km.labels_, return_counts=True)

    # inverse weight classes, todo test if this works ok
    N = len(data)
    lab_wgt = {label: (N - count) / N for label, count in zip(labels, counts)}
    W = [lab_wgt[label] for label in km.labels_]

    if model == 'stump':
        dtc = DecisionTreeClassifier(
            random_state=rand_state,
            max_depth=1,
        ).fit(data, km.labels_, sample_weight=W)
        left_rule = tree_to_rule(dtc, scope, ds_context)
    elif model == 'tree':
        # NOTE(review): ``min_impurity_split`` was removed in scikit-learn
        # 1.0 -- confirm the pinned scikit-learn version still accepts it.
        dtc = DecisionTreeClassifier(
            random_state=rand_state,
            max_depth=None,
            ccp_alpha=0.05,
            min_impurity_split=0.01,
        ).fit(data, km.labels_, sample_weight=W)
        left_rule = tree_to_rule(dtc, scope, ds_context)
    elif model == 'm-estimate':
        raise ValueError('Not implemented')
    else:
        raise ValueError(str(model) + ' unknown model type')

    if debug:
        # todo remove when everything is working
        # pyplot (not the top-level matplotlib package) provides
        # subplots() and show().
        import matplotlib.pyplot as plt

        dt_labels = dtc.predict(data)
        # export_text expects the fitted estimator, not its raw value array.
        print(export_text(dtc))

        if data.shape[1] == 2:
            _, ax = plt.subplots()

            # blue/green: tree prediction; black: tree disagrees with KMeans.
            colors = np.full(dt_labels.shape, 'blue', dtype=object)
            np.putmask(colors, dt_labels.astype(bool), 'green')
            colors[km.labels_ != dt_labels] = 'black'

            # plot rule:
            assert len(left_rule) <= 2
            for cond in left_rule:
                # NOTE(review): the key is spelled 'threshhold' here --
                # confirm it matches what tree_to_rule actually emits.
                if cond['feature'] == 0:
                    ax.axvline(cond['threshhold'])
                else:
                    ax.axhline(cond['threshhold'])

            ax.scatter(data[:, 0], data[:, 1], c=colors)
            plt.show()

    # todo try out rule clusters
    split = split_data_by_clusters(data, km_clusters, scope, rows=True)
    assert len(split) == 2

    right_rule = left_rule.negate()
    return split, (left_rule, right_rule)
def learn_structure_cnet(
    dataset,
    ds_context,
    conditioning,
    create_leaf,
    next_operation_cnet=get_next_operation_cnet(),
    initial_scope=None,
    data_slicer=default_slicer,
):
    """Learn a structure by repeatedly conditioning on single columns.

    Work items ``(data, parent, child position, scope)`` are processed from a
    FIFO queue. For each item ``next_operation_cnet(local_data, scope)``
    selects the operation:

    * ``Operation.CONDITIONING`` -- ``conditioning(local_data)`` proposes a
      column. If none is found a leaf is created. Otherwise the rows are
      split on ``column == 1``; a ``Sum`` node weights the slices, and under
      each slice a ``Product`` node gets two pending children: the slice
      without the conditioning column, and the conditioning column alone.
    * ``Operation.CREATE_LEAF`` -- ``create_leaf(local_data, ds_context,
      scope)`` fills the slot.

    After the queue is drained, the structure is given ids, validated,
    pruned, and validated again.

    Parameters
    ----------
    dataset : 2-D array
        Training data; columns are indexed by position within ``scope``.
    ds_context : object
        Passed through to ``create_leaf``.
    conditioning : callable
        ``conditioning(local_data) -> (column_index, found)``.
    create_leaf : callable
        ``create_leaf(local_data, ds_context, scope) -> node``.
    next_operation_cnet : callable
        ``next_operation_cnet(local_data, scope) -> (operation, op_params)``.
        The default is built once, when this function is defined.
        ``op_params`` is currently not used.
    initial_scope : list, optional
        Defaults to ``list(range(dataset.shape[1]))``.
    data_slicer : callable
        Accepted for interface compatibility; not used in this function.

    Returns
    -------
    The pruned root node of the learned structure.

    Raises
    ------
    AssertionError
        If a required argument is ``None``, a scope slice is not a list, or
        the resulting structure is invalid.
    Exception
        If ``next_operation_cnet`` returns an unsupported operation.
    """
    assert dataset is not None
    assert ds_context is not None
    assert create_leaf is not None
    assert next_operation_cnet is not None

    # Placeholder root: its only child slot receives the real root node.
    root = Product()
    root.children.append(None)

    if initial_scope is None:
        initial_scope = list(range(dataset.shape[1]))

    tasks = deque()
    tasks.append((dataset, root, 0, initial_scope))

    while tasks:
        local_data, parent, children_pos, scope = tasks.popleft()

        operation, op_params = next_operation_cnet(local_data, scope)

        logging.debug("OP: {} on slice {} (remaining tasks {})".format(
            operation, local_data.shape, len(tasks)))

        if operation == Operation.CONDITIONING:
            from spn.algorithms.splitting.Base import split_data_by_clusters

            conditioning_start_t = perf_counter()
            col_conditioning, found_conditioning = conditioning(local_data)

            if not found_conditioning:
                # Nothing to condition on: close this branch with a leaf.
                node = create_leaf(local_data, ds_context, scope)
                parent.children[children_pos] = node
                continue

            clusters = (local_data[:, col_conditioning] == 1).astype(int)
            data_slices = split_data_by_clusters(local_data, clusters, scope, rows=True)

            node = Sum()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            for data_slice, scope_slice, proportion in data_slices:
                assert isinstance(scope_slice, list), "slice must be a list"

                node.weights.append(proportion)
                product_node = Product()
                node.children.append(product_node)
                node.children[-1].scope.extend(scope)

                # Child 1: every column except the conditioning one.
                right_data_slice = np.hstack(
                    (data_slice[:, :col_conditioning],
                     data_slice[:, (col_conditioning + 1):])
                ).reshape(data_slice.shape[0], data_slice.shape[1] - 1)
                product_node.children.append(None)
                tasks.append((
                    right_data_slice,
                    product_node,
                    len(product_node.children) - 1,
                    scope_slice[:col_conditioning] + scope_slice[col_conditioning + 1:],
                ))

                # Child 2: the conditioning column on its own.
                left_data_slice = data_slice[:, col_conditioning].reshape(
                    data_slice.shape[0], 1)
                product_node.children.append(None)
                tasks.append((
                    left_data_slice,
                    product_node,
                    len(product_node.children) - 1,
                    [scope_slice[col_conditioning]],
                ))

            conditioning_end_t = perf_counter()
            logging.debug("\t\tconditioning (in {:.5f} secs)".format(
                conditioning_end_t - conditioning_start_t))
            continue

        elif operation == Operation.CREATE_LEAF:
            node = create_leaf(local_data, ds_context, scope)
            parent.children[children_pos] = node

        else:
            # str() keeps the message usable when ``operation`` is not a
            # plain string (e.g. an enum member); bare concatenation would
            # raise TypeError instead of this error.
            raise Exception("Invalid operation: " + str(operation))

    node = root.children[0]
    assign_ids(node)
    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err
    node = Prune(node)
    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err

    return node