def Matryoshka(data, merge_cutoff=0.1, max_k=10, max_ndim=2, bic='bic'):
    """Recursively bipartition ``data`` into a binary tree of ``BTree`` nodes.

    At every level ``HiScanFeatures`` is asked for separable feature sets.
    The feature set with the highest score becomes the key of the current
    node and its bipartition mask splits the rows between the two children.
    The part with the higher mean on the chosen feature(s) goes to the right
    child.

    Recursion stops, returning a node keyed ``('leaf',)``, when

    * ``data`` has 20 rows or fewer,
    * ``HiScanFeatures`` reports no separable features, or
    * the best bipartition leaves one side empty (splitting would only hand
      the same rows to one child again).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; the row labels are stored on each node as ``indices``.
    merge_cutoff, max_k, max_ndim, bic
        Forwarded unchanged to ``HiScanFeatures`` and to the recursive calls.

    Returns
    -------
    BTree
        Node carrying ``indices`` and ``all_clustering_dic``; internal nodes
        additionally carry ``where_dominant``, ``left`` and ``right``.
    """
    row_labels = data.index.values.tolist()

    def make_leaf(clustering_dic):
        # All terminal nodes share the same shape: leaf key, row labels and
        # whatever clustering summary is available at that point.
        leaf = BTree(('leaf', ))
        leaf.indices = row_labels
        leaf.all_clustering_dic = clustering_dic
        return leaf

    if data.shape[0] <= 20:
        return make_leaf(_set_small_leaf(data))

    separable_features, bipartitions, scores, all_clustering_dic = HiScanFeatures(
        data, merge_cutoff, max_k, max_ndim, bic)

    if len(separable_features) == 0:
        return make_leaf(all_clustering_dic)

    best_feature = separable_features[np.argmax(scores)]

    # Use a plain boolean ndarray so the same mask is valid for both label
    # based (.loc) and positional (.iloc) selection.
    in_first = np.asarray(bipartitions[best_feature], dtype=bool)
    in_second = ~in_first

    # A one-sided partition cannot make progress: one child would be empty
    # and the other would be this very data set.
    if not in_first.any() or not in_second.any():
        return make_leaf(all_clustering_dic)

    # Construct the current (internal) node.
    root = BTree(best_feature)
    root.indices = row_labels
    root.all_clustering_dic = all_clustering_dic

    # Branch cells: the component with the higher mean goes right.
    first_mean = data.loc[in_first, best_feature].mean(0)
    second_mean = data.loc[in_second, best_feature].mean(0)
    if len(first_mean) == 1:
        first_is_dominant = bool(first_mean.iloc[0] > second_mean.iloc[0])
    else:
        # Multi-feature key: compare sum(mean) / ||mean||_2 of each part.
        first_vec = first_mean.values
        second_vec = second_mean.values
        first_score = first_vec.sum() / np.sqrt((first_vec ** 2).sum())
        second_score = second_vec.sum() / np.sqrt((second_vec ** 2).sum())
        first_is_dominant = bool(first_score > second_score)

    # where_dominant records which child receives the rows selected by the
    # bipartition mask.
    if first_is_dominant:
        right_mask, left_mask = in_first, in_second
        root.where_dominant = 'right'
    else:
        right_mask, left_mask = in_second, in_first
        root.where_dominant = 'left'

    child_left = data.iloc[left_mask, :]
    child_right = data.iloc[right_mask, :]

    # Recursion (left first, then right, as before).
    root.left = Matryoshka(child_left, merge_cutoff, max_k, max_ndim, bic)
    root.right = Matryoshka(child_right, merge_cutoff, max_k, max_ndim, bic)

    return root
def ReSplit(data, merge_cutoff=0.1, weight=1, max_k=10, max_ndim=2, bic='bic'):
    """Recursively bipartition ``data`` into a weighted binary ``BTree``.

    Each node first fits a single-component full-covariance
    ``GaussianMixture`` to its rows and stores ``weight * lower_bound_`` as
    ``ll`` and the model's BIC as ``bic``. ``HiScanFeatures`` then proposes
    separable feature sets with one score each; the best-scoring set becomes
    the node key and its bipartition mask splits the rows. The part with the
    higher mean on the chosen feature(s) goes to the right child, and each
    child's weight is ``weight`` times its share of the rows.

    The node stays keyed ``('leaf',)`` and ``stop`` records the reason when

    * ``data`` has fewer than 2 rows (``'small size'``),
    * no separable features are reported (``'no separable features'``),
    * the best score is below 0.001 (``'spliting increases bic'``), or
    * the best bipartition leaves one side empty (``'degenerate partition'``).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; the row labels are stored on each node as ``indices``.
    weight : float
        Weight of this node, stored on the node and used to scale ``ll``.
    merge_cutoff, max_k, max_ndim, bic
        Forwarded unchanged to ``HiScanFeatures`` and to the recursive calls.

    Returns
    -------
    BTree
        Node carrying ``indices``, ``weight`` and ``all_clustering_dic``,
        plus ``ll``/``bic`` when a model was fitted, ``stop`` on terminal
        nodes, and ``key``/``where_dominant``/``left``/``right`` on internal
        nodes.
    """
    root = BTree(('leaf', ))
    root.indices = data.index.values.tolist()
    root.weight = weight

    if data.shape[0] < 2:
        root.all_clustering_dic = _set_small_leaf(data)
        root.stop = 'small size'
        return root

    unimodal = GaussianMixture(1, covariance_type='full').fit(data)
    root.ll = root.weight * unimodal.lower_bound_
    root.bic = unimodal.bic(data)

    # The fourth return value (per-feature BIC list) is not used here.
    (separable_features, bipartitions, scores_ll, _bic_list,
     all_clustering_dic) = HiScanFeatures(
        data, root, merge_cutoff, max_k, max_ndim, bic)

    # Store the scan result once so that every exit below - including the
    # low-score stop - leaves the node with its clustering summary.
    root.all_clustering_dic = all_clustering_dic

    if len(separable_features) == 0:
        root.stop = 'no separable features'
        return root

    idx_best = np.argmax(scores_ll)
    if np.max(scores_ll) < 0.001:
        # Spelling of the stop value is kept as-is: it is a runtime string.
        root.stop = 'spliting increases bic'
        return root

    best_feature = separable_features[idx_best]

    # Use a plain boolean ndarray so the same mask is valid for both label
    # based (.loc) and positional (.iloc) selection.
    in_first = np.asarray(bipartitions[best_feature], dtype=bool)
    in_second = ~in_first
    n_first = in_first.sum()
    n_second = in_first.size - n_first

    # A one-sided partition cannot make progress: one child would be empty
    # and the other would be this very data set with the same weight.
    if n_first == 0 or n_second == 0:
        root.stop = 'degenerate partition'
        return root

    # Construct the current (internal) node.
    root.key = best_feature

    # Branch cells: the component with the higher mean goes right.
    first_mean = data.loc[in_first, best_feature].mean(0)
    second_mean = data.loc[in_second, best_feature].mean(0)
    if len(first_mean) == 1:
        first_is_dominant = bool(first_mean.iloc[0] > second_mean.iloc[0])
    else:
        # Multi-feature key: compare sum(mean) / ||mean||_2 of each part.
        first_vec = first_mean.values
        second_vec = second_mean.values
        first_score = first_vec.sum() / np.sqrt((first_vec ** 2).sum())
        second_score = second_vec.sum() / np.sqrt((second_vec ** 2).sum())
        first_is_dominant = bool(first_score > second_score)

    # Share of this node's rows that falls into each part.
    frac_first = n_first / in_first.size
    frac_second = n_second / in_first.size

    # where_dominant records which child receives the rows selected by the
    # bipartition mask.
    if first_is_dominant:
        right_mask, w_r = in_first, frac_first
        left_mask, w_l = in_second, frac_second
        root.where_dominant = 'right'
    else:
        right_mask, w_r = in_second, frac_second
        left_mask, w_l = in_first, frac_first
        root.where_dominant = 'left'

    child_left = data.iloc[left_mask, :]
    child_right = data.iloc[right_mask, :]

    # Recursion (left first, then right, as before).
    root.left = ReSplit(child_left, merge_cutoff, weight * w_l, max_k,
                        max_ndim, bic)
    root.right = ReSplit(child_right, merge_cutoff, weight * w_r, max_k,
                         max_ndim, bic)

    return root