def getCIGroups(local_data, ds_context=None, scope=None, alpha=0.001, families=None):
    """
    Group the output columns by connectivity of their ``testRcoT`` p-values
    and split ``local_data`` accordingly.

    The first ``len(scope)`` columns of the preprocessed data are treated as
    outputs, all remaining columns as conditionals. Entries of the p-value
    matrix above ``alpha`` are zeroed, the matrix is turned into a graph and
    every connected component becomes one group (labelled from 1).

    ``ohe`` is not a parameter; it is read from the enclosing scope.

    BE CAREFUL WITH SPARSE DATA!

    :param local_data: np array
    :param ds_context: forwarded unchanged to ``preproc``
    :param scope: a list of indices of the output variables (must not be None)
    :param alpha: cutoff applied to the p-values before building the graph
    :param families: obsolete, unused
    :return: whatever ``split_conditional_data_by_clusters`` returns for the
        computed column labels (called with ``rows=False``)
    """
    prepared = preproc(local_data, ds_context, None, ohe)
    n_rows = prepared.shape[0]

    # todo check scope and node.scope again
    is_output = np.zeros(prepared.shape, dtype=bool)
    is_output[:, np.arange(len(scope))] = True

    outputs = prepared[is_output].reshape(n_rows, -1)
    conditionals = prepared[~is_output].reshape(n_rows, -1)

    # NOTE(review): len() of a 2-D array is its row count, so these checks do
    # not detect an empty set of columns -- confirm that is what is intended.
    assert len(conditionals) > 0
    assert len(outputs) > 0

    p_values = testRcoT(outputs, conditionals)
    p_values[p_values > alpha] = 0

    # NOTE(review): if this is networkx's from_numpy_matrix, it was removed in
    # networkx 3.0 -- confirm the pinned version.
    graph = from_numpy_matrix(p_values)

    labels = np.zeros(outputs.shape[1])
    for label, component in enumerate(connected_components(graph), start=1):
        labels[list(component)] = label

    return split_conditional_data_by_clusters(local_data, labels, scope, rows=False)
def split_rows_KMeans(local_data, ds_context, scope):
    """
    Label the rows of ``local_data`` with KMeans and split by those labels.

    ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed`` are not parameters;
    they are read from the enclosing scope.

    :param local_data: data to cluster and to split
    :param ds_context: forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=True``
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    row_labels = kmeans.fit_predict(features)
    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_rows_KMeans(local_data, ds_context, scope):
    """
    Label the rows of ``local_data`` with KMeans, split by those labels and
    also hand back the fitted model.

    ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed`` are not parameters;
    they are read from the enclosing scope.

    :param local_data: data to cluster and to split
    :param ds_context: forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: tuple ``(split, km_model)`` where ``split`` is the result of
        ``split_data_by_clusters`` (``rows=True``) and ``km_model`` is the
        fitted ``KMeans`` instance
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)

    from sklearn.cluster import KMeans

    km_model = KMeans(n_clusters=n_clusters, random_state=seed)
    row_labels = km_model.fit_predict(features)

    split = split_data_by_clusters(local_data, row_labels, scope, rows=True)
    return split, km_model
def split_conditional_rows_KMeans(local_data, ds_context, scope):
    """
    Label rows with KMeans fitted on the ``y`` part of ``local_data`` only,
    then split the full ``local_data`` by those labels.

    ``get_YX(local_data, ds_context.feature_size)`` yields ``(y, x)``; only
    ``y`` is preprocessed and clustered.

    ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed`` are not parameters;
    they are read from the enclosing scope.

    :param local_data: data to split
    :param ds_context: supplies ``feature_size`` and is forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=True``
    """
    y, _ = get_YX(local_data, ds_context.feature_size)
    data = preproc(y, ds_context, pre_proc, ohe)

    # ``precompute_distances`` is intentionally not passed: it was deprecated
    # in scikit-learn 0.23 and removed in 1.0 (where it raises TypeError), and
    # it only traded memory for speed without changing the resulting labels.
    clusters = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(data)

    return split_data_by_clusters(local_data, clusters, scope, rows=True)
def split_rows_KMeans(local_data, ds_context, scope):
    """
    Embed the preprocessed data with TSNE (3 components), label the embedded
    rows with KMeans and split ``local_data`` by those labels.

    ``pre_proc``, ``ohe``, ``verbose``, ``ncpus``, ``n_clusters`` and ``seed``
    are not parameters; they are read from the enclosing scope.

    :param local_data: data to cluster and to split
    :param ds_context: forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=True``
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)

    embedder = TSNE(n_components=3, verbose=verbose, n_jobs=ncpus, random_state=seed)
    embedded = embedder.fit_transform(features)

    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    row_labels = kmeans.fit_predict(embedded)

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_rows_Gower(local_data, ds_context, scope):
    """
    Label rows with the R function ``mixedclustering`` and split
    ``local_data`` by those labels.

    The data is preprocessed with one-hot encoding explicitly disabled
    (``False`` is passed to ``preproc``), converted with R's
    ``as.data.frame`` and handed to ``mixedclustering`` together with
    ``ds_context.distribution_family``, ``n_clusters`` and ``seed``.

    If anything in the R round trip fails, ``local_data`` is dumped to
    ``/tmp/errordata.txt``, the exception is printed and then re-raised.

    ``pre_proc``, ``n_clusters`` and ``seed`` are not parameters; they are
    read from the enclosing scope.

    :param local_data: data to cluster and to split
    :param ds_context: supplies ``distribution_family``; forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=True``
    """
    prepared = preproc(local_data, ds_context, pre_proc, False)

    try:
        to_data_frame = robjects.r["as.data.frame"]
        frame = to_data_frame(prepared)
        mixed_clustering = robjects.r["mixedclustering"]
        r_labels = mixed_clustering(frame, ds_context.distribution_family, n_clusters, seed)
        row_labels = np.asarray(r_labels)
    except Exception as err:
        # keep the offending input around for post-mortem inspection
        np.savetxt("/tmp/errordata.txt", local_data)
        print(err)
        raise err

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_rows_GMM(local_data, ds_context, scope):
    """
    Label rows with a Gaussian mixture model and split ``local_data`` by
    those labels.

    The mixture is fitted on the preprocessed data and then asked to predict
    a component for each row of that same data.

    ``pre_proc``, ``ohe``, ``n_clusters``, ``covariance_type``, ``max_iter``,
    ``n_init`` and ``seed`` are not parameters; they are read from the
    enclosing scope.

    :param local_data: data to cluster and to split
    :param ds_context: forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=True``
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)

    mixture = GaussianMixture(
        n_components=n_clusters,
        covariance_type=covariance_type,
        max_iter=max_iter,
        n_init=n_init,
        random_state=seed,
    )
    fitted = mixture.fit(features)
    row_labels = fitted.predict(features)

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_rows_KMeans(local_data, ds_context, scope):
    """
    Label rows with KMeans (fitted on a random sub-sample when the data is
    large), split ``local_data`` by those labels and return the centers.

    When the preprocessed data has more than ``max_sampling_threshold_rows``
    rows, KMeans is fitted on ``max_sampling_threshold_rows`` row indices
    drawn with ``np.random.randint`` (so indices may repeat) and then used to
    predict labels for all rows. Otherwise it is fitted on all rows.

    ``pre_proc``, ``ohe``, ``n_clusters``, ``seed`` and
    ``max_sampling_threshold_rows`` are not parameters; they are read from
    the enclosing scope.

    :param local_data: data to cluster and to split
    :param ds_context: forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: tuple ``(split, centers)`` where ``split`` is the result of
        ``split_data_by_clusters`` (``rows=True``) and ``centers`` is
        ``cluster_centers_`` converted to nested lists
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)

    n_rows = features.shape[0]
    if n_rows <= max_sampling_threshold_rows:
        row_labels = kmeans.fit_predict(features)
    else:
        sampled_rows = np.random.randint(n_rows, size=max_sampling_threshold_rows)
        row_labels = kmeans.fit(features[sampled_rows, :]).predict(features)

    centers = kmeans.cluster_centers_
    split = split_data_by_clusters(local_data, row_labels, scope, rows=True)
    return split, centers.tolist()
def split_rows_RuleClustering(local_data, ds_context, scope):
    """
    Derive a rule from a decision tree trained on KMeans labels and split the
    data by the negation of that rule.

    Steps:
      1. KMeans labels the preprocessed rows.
      2. Each row gets the weight ``(N - count) / N`` of its label, so rarer
         labels weigh more.
      3. A ``DecisionTreeClassifier`` (a depth-1 stump for ``model == 'stump'``,
         a pruned tree for ``model == 'tree'``) is fitted on those labels.
      4. ``tree_to_rule`` converts the tree into ``left_rule``; its negation
         is ``right_rule``.
      5. ``right_rule.apply`` yields the row labels used for the split.

    ``pre_proc``, ``ohe``, ``k``, ``rand_state`` and ``model`` are not
    parameters; they are read from the enclosing scope.

    :param local_data: data to preprocess
    :param ds_context: forwarded to ``preproc`` and ``tree_to_rule``
    :param scope: forwarded to ``tree_to_rule``, ``apply`` and
        ``split_data_by_clusters``
    :return: tuple ``(split, (left_rule, right_rule))``
    :raises ValueError: for ``model == 'm-estimate'`` (not implemented) and
        for any unknown model type
    """
    data = preproc(local_data, ds_context, pre_proc, ohe)

    # https://stackoverflow.com/a/39772170/5595684
    km = KMeans(k, random_state=rand_state)
    km.fit_predict(data)

    # inverse weight classes, todo test if this works ok
    labels, counts = np.unique(km.labels_, return_counts=True)
    n_rows = len(data)
    weight_by_label = {
        label: (n_rows - label_count) / n_rows
        for label, label_count in zip(labels, counts)
    }
    sample_weights = [weight_by_label[label] for label in km.labels_]

    if model == 'stump':
        classifier = DecisionTreeClassifier(random_state=rand_state, max_depth=1)
    elif model == 'tree':
        # NOTE(review): min_impurity_split was removed in scikit-learn 1.0 --
        # confirm the pinned version still accepts it.
        classifier = DecisionTreeClassifier(
            random_state=rand_state,
            max_depth=None,
            ccp_alpha=0.05,
            min_impurity_split=0.01,
        )
    elif model == 'm-estimate':
        raise ValueError('Not implemented')
    else:
        raise ValueError(str(model) + ' unknown model type')

    dtc = classifier.fit(data, km.labels_, sample_weight=sample_weights)
    left_rule = tree_to_rule(dtc, scope, ds_context)

    # todo try out rule clusters
    right_rule = left_rule.negate()
    applied = right_rule.apply(data, scope_partial_data=scope)
    rule_clusters = applied.astype(int)

    # NOTE(review): the preprocessed ``data`` is split here, not ``local_data``
    # -- confirm this is intentional.
    split = split_data_by_clusters(data, rule_clusters, scope, rows=True)
    assert len(split) == 2
    return split, (left_rule, right_rule)
def split_rows_Gower(local_data, ds_context, scope):
    """
    Label rows with the R function ``mixedclustering`` applied to the ``y``
    part of ``local_data`` and split the full ``local_data`` by those labels.

    ``get_YX(local_data, ds_context.feature_size)`` yields ``(y, x)``; only
    ``y`` is preprocessed (with one-hot encoding explicitly disabled) and
    clustered. For each index in ``scope`` the meta type is mapped to a
    string for R: BINARY -> "categorical", DISCRETE -> "discrete", anything
    else -> "continuous".

    If the R round trip fails, ``local_data`` is dumped to
    ``/tmp/errordata.txt`` and the exception is re-raised.

    ``pre_proc``, ``n_clusters`` and ``seed`` are not parameters; they are
    read from the enclosing scope.

    :param local_data: data to split
    :param ds_context: supplies ``feature_size`` and ``meta_types``;
        forwarded to ``preproc``
    :param scope: indices into ``ds_context.meta_types``; also forwarded to
        ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=True``
    """
    y, _ = get_YX(local_data, ds_context.feature_size)
    prepared = preproc(y, ds_context, pre_proc, False)

    feature_types = [
        "categorical" if meta_type == MetaType.BINARY
        else "discrete" if meta_type == MetaType.DISCRETE
        else "continuous"
        for meta_type in (ds_context.meta_types[idx] for idx in scope)
    ]

    try:
        to_data_frame = robjects.r["as.data.frame"]
        frame = to_data_frame(prepared)
        mixed_clustering = robjects.r["mixedclustering"]
        r_labels = mixed_clustering(frame, feature_types, n_clusters, seed)
        row_labels = np.asarray(r_labels)
    except Exception as err:
        # keep the offending input around for post-mortem inspection
        np.savetxt("/tmp/errordata.txt", local_data)
        raise err

    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_rows_DBScan(local_data, ds_context, scope):
    """
    Label the rows of ``local_data`` with DBSCAN and split by those labels.

    ``pre_proc``, ``ohe``, ``eps`` and ``min_samples`` are not parameters;
    they are read from the enclosing scope.

    :param local_data: data to cluster and to split
    :param ds_context: forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=True``
    """
    features = preproc(local_data, ds_context, pre_proc, ohe)
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    row_labels = dbscan.fit_predict(features)
    return split_data_by_clusters(local_data, row_labels, scope, rows=True)
def split_cols_random_partitions(local_data, ds_context, scope):
    """
    Assign each column of ``local_data`` to one of two groups using a single
    plane from ``make_planes`` and split column-wise.

    The data is transposed before preprocessing, so each row of the
    preprocessed matrix corresponds to a column of ``local_data``. One plane
    is created with as many dimensions as the preprocessed matrix has
    columns; the first column of ``above(...)`` is used as the labels.

    ``ohe`` and ``rand_gen`` are not parameters; they are read from the
    enclosing scope.

    :param local_data: data to split
    :param ds_context: forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=False``
    """
    transposed = preproc(local_data.T, ds_context, None, ohe)
    plane = make_planes(1, transposed.shape[1], rand_gen)
    sides = above(plane, transposed)
    col_labels = sides[:, 0]
    return split_data_by_clusters(local_data, col_labels, scope, rows=False)
def split_rows_random_partitions(local_data, ds_context, scope):
    """
    Assign each row of ``local_data`` to one of two groups using a single
    plane from ``make_planes`` and split row-wise.

    One plane is created and passed to ``above`` together with the
    preprocessed data; the first column of the result is used as the labels.

    ``ohe`` and ``rand_gen`` are not parameters; they are read from the
    enclosing scope.

    :param local_data: data to split
    :param ds_context: forwarded to ``preproc``
    :param scope: forwarded to ``split_data_by_clusters``
    :return: result of ``split_data_by_clusters`` called with ``rows=True``
    """
    data = preproc(local_data, ds_context, None, ohe)

    # Size the plane from the preprocessed matrix, because that is the array
    # it is applied to. Using local_data.shape[1] only matches when preproc
    # leaves the column count unchanged (presumably not the case when one-hot
    # encoding is enabled).
    planes = make_planes(1, data.shape[1], rand_gen)
    clusters = above(planes, data)[:, 0]

    return split_data_by_clusters(local_data, clusters, scope, rows=True)
def split_rows_RuleClustering(local_data, ds_context, scope):
    """
    Split the data by KMeans labels and derive a describing rule pair from a
    decision tree trained on those labels.

    Steps:
      1. KMeans labels the preprocessed rows.
      2. Each row gets the weight ``(N - count) / N`` of its label, so rarer
         labels weigh more.
      3. A ``DecisionTreeClassifier`` (a depth-1 stump for ``model == 'stump'``,
         a pruned tree for ``model == 'tree'``) is fitted on those labels.
      4. ``tree_to_rule`` converts the tree into ``left_rule``; its negation
         is ``right_rule``.
      5. The split itself uses the KMeans labels, not the rule.

    When ``debug`` is truthy, the tree is printed as text and, for
    two-column data, a scatter plot with the rule thresholds is shown.

    ``pre_proc``, ``ohe``, ``k``, ``rand_state``, ``model`` and ``debug`` are
    not parameters; they are read from the enclosing scope.

    :param local_data: data to preprocess
    :param ds_context: forwarded to ``preproc`` and ``tree_to_rule``
    :param scope: forwarded to ``tree_to_rule`` and ``split_data_by_clusters``
    :return: tuple ``(split, (left_rule, right_rule))``
    :raises ValueError: for ``model == 'm-estimate'`` (not implemented) and
        for any unknown model type
    """
    data = preproc(local_data, ds_context, pre_proc, ohe)

    # https://stackoverflow.com/a/39772170/5595684
    km = KMeans(k, random_state=rand_state)
    km_clusters = km.fit_predict(data)

    # inverse weight classes, todo test if this works ok
    labels, counts = np.unique(km.labels_, return_counts=True)
    n_rows = len(data)
    weight_by_label = {
        label: (n_rows - label_count) / n_rows
        for label, label_count in zip(labels, counts)
    }
    sample_weights = [weight_by_label[label] for label in km.labels_]

    if model == 'stump':
        dtc = DecisionTreeClassifier(
            random_state=rand_state,
            max_depth=1,
        ).fit(data, km.labels_, sample_weight=sample_weights)
        left_rule = tree_to_rule(dtc, scope, ds_context)
    elif model == 'tree':
        # NOTE(review): min_impurity_split was removed in scikit-learn 1.0 --
        # confirm the pinned version still accepts it.
        dtc = DecisionTreeClassifier(
            random_state=rand_state,
            max_depth=None,
            ccp_alpha=0.05,
            min_impurity_split=0.01,
        ).fit(data, km.labels_, sample_weight=sample_weights)
        left_rule = tree_to_rule(dtc, scope, ds_context)
    elif model == 'm-estimate':
        raise ValueError('Not implemented')
    else:
        raise ValueError(str(model) + ' unknown model type')

    if debug:
        # todo remove when everything is working
        # pyplot (not the top-level matplotlib package) provides
        # subplots() and show().
        import matplotlib.pyplot as plt

        dt_labels = dtc.predict(data)
        # export_text expects the fitted estimator itself, not tree_.value.
        print(export_text(dtc))

        if data.shape[1] == 2:
            _fig, ax = plt.subplots()

            # blue/green by tree prediction, black where tree and KMeans disagree
            colors = np.full(dt_labels.shape, 'blue', dtype=object)
            np.putmask(colors, dt_labels.astype(bool), 'green')
            colors[km.labels_ != dt_labels] = 'black'

            # plot rule: vertical line for feature 0, horizontal otherwise
            assert len(left_rule) <= 2
            for cond in left_rule:
                if cond['feature'] == 0:
                    ax.axvline(cond['threshhold'])
                else:
                    ax.axhline(cond['threshhold'])

            ax.scatter(data[:, 0], data[:, 1], c=colors)
            plt.show()

    # todo try out rule clusters
    # NOTE(review): the preprocessed ``data`` is split here, not ``local_data``
    # -- confirm this is intentional.
    split = split_data_by_clusters(data, km_clusters, scope, rows=True)
    assert len(split) == 2

    right_rule = left_rule.negate()
    return split, (left_rule, right_rule)