예제 #1
0
def Matryoshka(data, merge_cutoff=0.1, max_k=10, max_ndim=2, bic='bic'):
    """Recursively bipartition ``data`` into a binary tree of ``BTree`` nodes.

    At every level ``HiScanFeatures`` is asked for candidate feature sets,
    one boolean bipartition per candidate and one score per candidate.  The
    candidate with the highest score becomes the node key and the rows are
    split by its bipartition.  The half that ranks higher on the chosen
    feature(s) is sent to the right child.

    Recursion stops, producing a node keyed ``('leaf',)``, when ``data`` has
    20 rows or fewer or when no separable feature set is reported.

    Args:
        data: pandas DataFrame to split (the code relies on ``.shape``,
            ``.index`` and ``.loc``).
        merge_cutoff: forwarded unchanged to ``HiScanFeatures``.
        max_k: forwarded unchanged to ``HiScanFeatures``.
        max_ndim: forwarded unchanged to ``HiScanFeatures``.
        bic: forwarded unchanged to ``HiScanFeatures``.

    Returns:
        The ``BTree`` node for ``data``.  Every node gets ``indices`` (the
        row labels of ``data`` as a list) and ``all_clustering_dic``.
        Internal nodes also get ``left``, ``right`` and ``where_dominant``,
        which is ``'right'`` when the rows selected by the winning
        bipartition went to the right child and ``'left'`` otherwise.
    """
    row_labels = data.index.values.tolist()

    # Small inputs are never split.
    if data.shape[0] <= 20:
        root = BTree(('leaf', ))
        root.indices = row_labels
        root.all_clustering_dic = _set_small_leaf(data)
        return root

    separable_features, bipartitions, scores, all_clustering_dic = HiScanFeatures(
        data, merge_cutoff, max_k, max_ndim, bic)

    # Explicit length test: the container type returned by HiScanFeatures is
    # not visible here, and truthiness is ambiguous for NumPy arrays.
    if len(separable_features) == 0:
        root = BTree(('leaf', ))
        root.indices = row_labels
        root.all_clustering_dic = all_clustering_dic
        return root

    best_feature = separable_features[np.argmax(scores)]
    in_part = bipartitions[best_feature]
    out_part = ~in_part

    # Construct the current (internal) node.
    root = BTree(best_feature)
    root.indices = row_labels
    root.all_clustering_dic = all_clustering_dic

    # Decide which half goes right: the one with the higher mean on the
    # selected feature(s).
    p1_mean = data.loc[in_part, best_feature].mean(0)
    p2_mean = data.loc[out_part, best_feature].mean(0)
    v1 = p1_mean.values
    v2 = p2_mean.values

    if len(p1_mean) == 1:
        # Single feature: compare the two means directly.
        part_is_higher = bool(v1[0] > v2[0])
    else:
        # Several features: compare sum(mean) / ||mean||_2 of each half.
        p1_cosine = v1.sum() / np.sqrt((v1 ** 2).sum())
        p2_cosine = v2.sum() / np.sqrt((v2 ** 2).sum())
        part_is_higher = bool(p1_cosine > p2_cosine)

    if part_is_higher:
        right_mask, left_mask = in_part, out_part
        root.where_dominant = 'right'
    else:
        right_mask, left_mask = out_part, in_part
        root.where_dominant = 'left'

    # `.loc` is used here, as for the means above, so that the same boolean
    # mask works whether it is a NumPy array or an index-aligned Series;
    # `.iloc` rejects a boolean Series.
    child_left = data.loc[left_mask, :]
    child_right = data.loc[right_mask, :]

    # Recurse into both halves, left first.
    root.left = Matryoshka(child_left, merge_cutoff, max_k, max_ndim, bic)
    root.right = Matryoshka(child_right, merge_cutoff, max_k, max_ndim, bic)

    return root
예제 #2
0
def ReSplit(data, merge_cutoff=0.1, weight=1, max_k=10, max_ndim=2, bic='bic'):
    """Recursively bipartition ``data`` into a weighted tree of ``BTree`` nodes.

    Each call creates a node keyed ``('leaf',)``, fits a one-component
    ``GaussianMixture`` to ``data`` and asks ``HiScanFeatures`` for candidate
    feature sets, one boolean bipartition per candidate and one ``scores_ll``
    entry per candidate.  If the best score is at least 0.001 the node key is
    replaced by the winning feature set, the rows are split by its
    bipartition and both halves are processed recursively.  The half that
    ranks higher on the chosen feature(s) is sent to the right child.

    A node stays a leaf, with the reason stored in ``root.stop``, when:

    * ``data`` has fewer than 2 rows (``'small size'``);
    * no separable feature set is reported (``'no separable features'``);
    * the best ``scores_ll`` entry is below 0.001
      (``'spliting increases bic'``).  NOTE: despite the label, the test is
      on ``scores_ll``, not on BIC, and this path leaves
      ``all_clustering_dic`` unassigned on the node.

    Args:
        data: pandas DataFrame to split (the code relies on ``.shape``,
            ``.index`` and ``.loc``).
        merge_cutoff: forwarded unchanged to ``HiScanFeatures``.
        weight: weight of this node, stored as ``root.weight``.  Each child
            receives ``weight`` times the fraction of this node's rows it
            contains.
        max_k: forwarded unchanged to ``HiScanFeatures``.
        max_ndim: forwarded unchanged to ``HiScanFeatures``.
        bic: forwarded unchanged to ``HiScanFeatures``.

    Returns:
        The ``BTree`` node for ``data``.  Every node gets ``indices`` and
        ``weight``; nodes with at least 2 rows also get ``ll`` (``weight``
        times the fitted mixture's ``lower_bound_``) and ``bic``.  Internal
        nodes additionally get ``left``, ``right`` and ``where_dominant``,
        which is ``'right'`` when the rows selected by the winning
        bipartition went to the right child and ``'left'`` otherwise.
    """
    root = BTree(('leaf', ))
    root.indices = data.index.values.tolist()
    root.weight = weight

    # A single row (or none) cannot be modelled or split.
    if data.shape[0] < 2:
        root.all_clustering_dic = _set_small_leaf(data)
        root.stop = 'small size'
        return root

    # Baseline: describe the whole node with a single Gaussian component.
    unimodal = GaussianMixture(1, covariance_type='full').fit(data)
    root.ll = root.weight * unimodal.lower_bound_
    root.bic = unimodal.bic(data)

    # The returned BIC list is not used by this function.
    separable_features, bipartitions, scores_ll, _bic_list, all_clustering_dic = HiScanFeatures(
        data, root, merge_cutoff, max_k, max_ndim, bic)

    # Explicit length test: the container type returned by HiScanFeatures is
    # not visible here, and truthiness is ambiguous for NumPy arrays.
    if len(separable_features) == 0:
        root.all_clustering_dic = all_clustering_dic
        root.stop = 'no separable features'
        return root

    # Stop when even the best candidate scores below the 0.001 threshold.
    idx_best = np.argmax(scores_ll)
    if np.max(scores_ll) < 0.001:
        root.stop = 'spliting increases bic'
        return root

    best_feature = separable_features[idx_best]
    in_part = bipartitions[best_feature]
    out_part = ~in_part

    # Promote the node from leaf to internal node.
    root.key = best_feature
    root.all_clustering_dic = all_clustering_dic

    # Decide which half goes right: the one with the higher mean on the
    # selected feature(s).
    p1_mean = data.loc[in_part, best_feature].mean(0)
    p2_mean = data.loc[out_part, best_feature].mean(0)
    v1 = p1_mean.values
    v2 = p2_mean.values

    if len(p1_mean) == 1:
        # Single feature: compare the two means directly.
        part_is_higher = bool(v1[0] > v2[0])
    else:
        # Several features: compare sum(mean) / ||mean||_2 of each half.
        p1_cosine = v1.sum() / np.sqrt((v1 ** 2).sum())
        p2_cosine = v2.sum() / np.sqrt((v2 ** 2).sum())
        part_is_higher = bool(p1_cosine > p2_cosine)

    # Fraction of this node's rows on each side of the bipartition.
    n_rows = len(in_part)
    n_in = int(np.count_nonzero(in_part))
    w_in = n_in / n_rows
    w_out = (n_rows - n_in) / n_rows

    if part_is_higher:
        right_mask, w_r = in_part, w_in
        left_mask, w_l = out_part, w_out
        root.where_dominant = 'right'
    else:
        right_mask, w_r = out_part, w_out
        left_mask, w_l = in_part, w_in
        root.where_dominant = 'left'

    # `.loc` is used here, as for the means above, so that the same boolean
    # mask works whether it is a NumPy array or an index-aligned Series;
    # `.iloc` rejects a boolean Series.
    child_left = data.loc[left_mask, :]
    child_right = data.loc[right_mask, :]

    # Recurse into both halves, left first, scaling the weight by each
    # half's share of the rows.
    root.left = ReSplit(child_left, merge_cutoff, weight * w_l, max_k,
                        max_ndim, bic)
    root.right = ReSplit(child_right, merge_cutoff, weight * w_r, max_k,
                         max_ndim, bic)

    return root