Example #1
    def __init__(self, spts, raw_data_spts):

        rgp = regionprops_table(
            spts,
            raw_data_spts,
            properties=["label", "intensity_image", "area"])

        lbl_vol_ints_avints = []
        for k in range(len(rgp["label"])):
            lbl_vol_ints_avints.append([
                rgp["label"][k], rgp["area"][k],
                rgp["intensity_image"][k].sum(),
                rgp["intensity_image"][k].sum() / rgp["area"][k]
            ])

        lbl_vol_ints_avints = np.asarray(lbl_vol_ints_avints)

        clusters_vol, ctrds = kmeans1d.cluster(lbl_vol_ints_avints[:, 1], 3)
        clusters_ints, ctrds = kmeans1d.cluster(lbl_vol_ints_avints[:, 2], 3)
        clusters_avints, ctrds = kmeans1d.cluster(lbl_vol_ints_avints[:, 3], 3)

        clusters_vol = np.asarray(clusters_vol)
        clusters_ints = np.asarray(clusters_ints)
        clusters_avints = np.asarray(clusters_avints)

        idxs = np.where(clusters_vol + clusters_ints + clusters_avints == 6)[0]

        ts_mtx = np.zeros(spts.shape, dtype=np.int32)
        for k in idxs:
            ts_mtx += int(lbl_vol_ints_avints[k, 0]) * (spts == int(
                lbl_vol_ints_avints[k, 0]))

        self.ts_mtx = ts_mtx
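
kmeans1d returns its centroids in ascending order (see the unit test in Example #12), so with k=3 the cluster label 2 always marks the highest group; the "sum == 6" test above therefore keeps only the labels that land in the top cluster of volume, integrated intensity, and average intensity at once. Below is a minimal standalone sketch of that selection step, using made-up feature rows instead of regionprops output:

import numpy as np
import kmeans1d

# Made-up per-label features: volume, integrated intensity, average intensity.
feats = np.array([[10.0, 100.0, 10.0],
                  [12.0, 130.0, 10.8],
                  [300.0, 9000.0, 30.0],
                  [320.0, 9100.0, 28.4],
                  [40.0, 500.0, 12.5]])

# Cluster each 1-D feature into 3 groups; label 2 is the group with the largest centroid.
labels = []
for j in range(3):
    clusters, _ = kmeans1d.cluster(feats[:, j], 3)
    labels.append(np.asarray(clusters))

# Keep the rows that sit in the top cluster of every feature (2 + 2 + 2 == 6).
top_rows = np.where(labels[0] + labels[1] + labels[2] == 6)[0]
print(top_rows)  # [2 3]
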
Example #2
def Clustering(Fasta, Coverage, Size, cpu):
    Bin = []
    Fa_Size = {}
    with open(Fasta, 'r') as in_handle:
        for title, seq in SimpleFastaParser(in_handle):
            Bin.append(title.split()[0])
            Fa_Size[title.split()[0]] = len(seq)
    df = pd.read_table(Coverage, delimiter='\t', index_col=0)
    df = df.loc[Bin, :]
    if int(Size) > len(df.index):
        return {}, 0
    # start clustering
    if len(df.columns) == 1:  # 1-d kmeans
        df1 = df.iloc[:, 0].to_list()
        df_name = df.index
        labels, centroids = kmeans1d.cluster(df1, int(Size))
    else:  # kmeans
        df1 = df.values
        df_name = df.index
        # Number of clusters
        kmeans = KMeans(n_clusters=int(Size), n_init=30, n_jobs=cpu)
        # Fitting the input data
        kmeans = kmeans.fit(df1)
        # Getting the cluster labels
        labels = kmeans.predict(df1)
        # Centroid values
        centroids = kmeans.cluster_centers_
    Sep_fa = {}
    for i, j in enumerate(df_name):
        Sep_fa.setdefault(str(labels[i]), []).append(j)
    return Sep_fa, Fa_Size
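
When the coverage table has a single column, the snippet falls back to kmeans1d and then buckets contig names by cluster label with setdefault. A small self-contained sketch of that grouping step, with invented names and coverage values:

import kmeans1d

# Invented 1-D coverage values for five contigs.
names = ["c1", "c2", "c3", "c4", "c5"]
coverage = [5.0, 5.2, 40.1, 39.7, 5.1]

labels, centroids = kmeans1d.cluster(coverage, 2)

# Group names by cluster label, as the snippet does with Sep_fa.
groups = {}
for name, label in zip(names, labels):
    groups.setdefault(str(label), []).append(name)
print(groups)  # {'0': ['c1', 'c2', 'c5'], '1': ['c3', 'c4']}
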
Example #3
    def set_mask_matrix(self):
        # torch.set_printoptions(threshold=500000)
        self.var_matrix = self.var_matrix / self.count_var_cov
        var_flatten = torch.flatten(self.var_matrix)

        if self.margin == 0:  # kmeans1d clustering setting for ISW
            clusters, centroids = kmeans1d.cluster(
                var_flatten, self.clusters)  # 50 clusters
            num_sensitive = var_flatten.size()[0] - clusters.count(
                0)  # 1: Insensitive Cov, 2~50: Sensitive Cov
            print("num_sensitive, centroids =", num_sensitive, centroids)
            _, indices = torch.topk(var_flatten, k=int(num_sensitive))
        else:  # do not use
            num_sensitive = self.num_off_diagonal - self.margin
            print("num_sensitive = ", num_sensitive)
            _, indices = torch.topk(var_flatten, k=int(num_sensitive))
        mask_matrix = torch.flatten(torch.zeros(self.dim, self.dim).cuda())
        mask_matrix[indices] = 1

        if self.mask_matrix is not None:
            self.mask_matrix = (self.mask_matrix.int() & mask_matrix.view(
                self.dim, self.dim).int()).float()
        else:
            self.mask_matrix = mask_matrix.view(self.dim, self.dim)
        self.num_sensitive = torch.sum(self.mask_matrix)
        print("Check whether two ints are same", num_sensitive,
              self.num_sensitive)

        self.var_matrix = None
        self.count_var_cov = 0

        if torch.cuda.current_device() == 0:
            print("Covariance Info: (CXC Shape, Num_Off_Diagonal)",
                  self.mask_matrix.shape, self.num_off_diagonal)
            print("Selective (Sensitive Covariance)", self.num_sensitive)
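
The mask construction above uses kmeans1d only to decide how many covariance entries count as sensitive (everything outside cluster 0), and then selects exactly that many entries with torch.topk. A minimal CPU-only sketch of those two steps on a toy variance vector (values are illustrative):

import torch
import kmeans1d

# Toy flattened variance values.
var_flatten = torch.tensor([0.01, 0.02, 0.90, 0.85, 0.03, 0.95])

# Two clusters: label 0 = insensitive (small variance), label 1 = sensitive.
clusters, centroids = kmeans1d.cluster(var_flatten.tolist(), 2)
num_sensitive = var_flatten.numel() - clusters.count(0)

# Keep the num_sensitive largest entries, as the snippet's topk step does.
_, indices = torch.topk(var_flatten, k=int(num_sensitive))
mask = torch.zeros_like(var_flatten)
mask[indices] = 1
print(num_sensitive, mask)  # 3 tensor([0., 0., 1., 1., 0., 1.])
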
Example #4
    def pca_init(self):
        from sklearn.decomposition import PCA
        import kmeans1d

        #Get the codebook inputs for the init_dataset:
        codebook_layer_in = self.codebook_layer.input
        n_batches = self.estimation_points // self.data.batch_size
        pred_fn = tf.keras.backend.function(inputs=self.model.inputs,
                                            outputs=[codebook_layer_in])
        encoded_data = [
            pred_fn(self.init_dataset.__getitem__(i)[0])[0]
            for i in range(n_batches)
        ]
        encoded_data = np.concatenate(encoded_data)
        encoded_data = np.reshape(encoded_data, (-1, encoded_data.shape[-1]))

        #Apply PCA:
        pca_model = PCA(n_components=self.codebook_layer.groups)
        encoded_low = pca_model.fit_transform(encoded_data)

        #Kmeans over each principal component
        groups_centroids = []
        for component_values in encoded_low.T:
            clusters, centroids = kmeans1d.cluster(
                component_values, self.codebook_layer.codes_per_group)
            groups_centroids.append(centroids)
        groups_centroids = np.array(groups_centroids)[:, :, np.newaxis]
        codebook = groups_centroids * np.transpose(
            pca_model.components_[:, :, np.newaxis], (0, 2, 1))

        return codebook
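
The idea above is to build one small codebook per principal component: project the encoder outputs down to a few scalars, run 1-D k-means on each scalar independently, and lift the resulting centroids back along the corresponding PCA direction. A hedged sketch with random data; groups and codes_per_group here are stand-ins for the codebook layer's attributes:

import numpy as np
import kmeans1d
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
encoded = rng.normal(size=(256, 8))   # stand-in for the concatenated encoder outputs
groups, codes_per_group = 2, 4        # stand-ins for the codebook layer's sizes

pca = PCA(n_components=groups)
low = pca.fit_transform(encoded)      # shape (256, groups)

# One 1-D k-means per principal component.
groups_centroids = []
for col in low.T:
    _, cents = kmeans1d.cluster(col, codes_per_group)
    groups_centroids.append(cents)
groups_centroids = np.array(groups_centroids)   # (groups, codes_per_group)

# Lift each component's centroids back to the original space along its PCA direction.
codebook = groups_centroids[:, :, np.newaxis] * pca.components_[:, np.newaxis, :]
print(codebook.shape)  # (2, 4, 8)
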
Example #5
def groupBlocks(blocks_in, prev_word, pg_num):

    # Cluster blocks horizontally by the following 5 categories:
    # left word, left cont., center, right word, right cont.
    clusters, centroids = kmeans1d.cluster(
        [block['bbox'][0] for block in blocks_in], 5)

    # add the cluster component to the blocks
    blocks = [{
        **block, 'cluster': cluster
    } for block, cluster in zip(blocks_in, clusters)]
    # sort in vertical direction
    blocks.sort(key=lambda block: (block['bbox'][1] + block['bbox'][3]) / 2 +
                (1e5 if block['bbox'][0] > centroids[2] else 0))

    left_words = []  # words on left side of page
    right_words = []  # words on right side of page
    split_words = []  # words that are split across sides/pages
    left_word = []  # holds data for left-side words
    right_word = []  # holds data for right-side words
    insert_word = None  # used to hold data from a split word
    for block in blocks:
        cluster = block['cluster']
        if cluster == 0:  # start of phrase (left side)
            if prev_word:
                insert_word = prev_word
                prev_word = False
            if left_word:
                add_word(left_words, left_word)
            left_word = add_group({}, add_block({}, block))
        elif cluster == 1:  # continuation of phrase (left side)
            if prev_word:
                split_word = add_group(prev_word, add_block({}, block))
                add_word(split_words, split_word)
                prev_word = None
                continue
            assert (left_word)
            add_block(left_word['groups'], block)
        elif cluster == 3:  # start of phrase (right side)
            if right_word:
                add_word(right_words, right_word)
            elif left_word:
                add_word(left_words, left_word)
                left_word = None
            right_word = add_group({}, add_block({}, block))
        elif cluster == 4:  # continuation of phrase (right side)
            if right_word:
                add_block(right_word['groups'], block)
            else:
                assert (left_word)
                split_word = add_group(left_word, add_block({}, block))
                add_word(split_words, split_word)
                left_word = None

    # make sure we terminate with the last item on either side
    last_word = right_word if right_word else left_word
    return left_words + right_words, split_words, last_word, insert_word
Example #6
def shrink_categorical(cat, n=4):
    '''
    Reduces the categorical distribution to distribution of size n using k-means clustering
    :param cat: categorical distribution
    :param n: number of bins/clusters to be reduced to
    :return:
    '''
    if (not hasattr(cat, "sample")) or (len(cat.vals) < n):
        return cat
    clusters, centroids = kmeans1d.cluster(cat.vals, n)
    probs = [0 for _ in range(n)]
    for cluster, prob in zip(clusters, cat.probs):
        probs[cluster] += prob
    return Categorical(centroids, probs=probs)
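
shrink_categorical pools the probability mass of every value that lands in the same cluster and re-attaches it to that cluster's centroid. The Categorical class comes from the surrounding codebase; here is a minimal sketch of the same reduction with plain lists and toy numbers:

import kmeans1d

vals = [0.0, 0.1, 0.9, 1.0, 5.0, 5.2]   # toy support points
probs = [0.1, 0.1, 0.2, 0.2, 0.2, 0.2]
n = 3

clusters, centroids = kmeans1d.cluster(vals, n)
merged = [0.0] * n
for c, p in zip(clusters, probs):
    merged[c] += p
print(centroids)  # one centroid per cluster, ascending
print(merged)     # [0.2, 0.4, 0.4] -- pooled probability of each cluster
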
Example #7
    def save_quantized_bin(self, path):
        import kmeans1d  # Not required for general pastiche usage, just for generating a quantized model.
        k = 2 ** 8
        q_state = {}  # quantized state
        layer_names = [layer_name for layer_name in VGG19.LAYER_NAMES if re.match(r'^block\d+_conv\d+$', layer_name)]
        for layer_name in layer_names:
            layer = getattr(self, layer_name)
            bias = layer.bias
            shape = layer.weight.shape
            weight = layer.weight.flatten()
            clusters, centroids = kmeans1d.cluster(weight, k)
            q_state[layer_name + '_W_q'] = torch.tensor(clusters, dtype=torch.uint8).reshape(shape)
            q_state[layer_name + '_W_table'] = torch.tensor(centroids, dtype=torch.float32)
            q_state[layer_name + '_b'] = bias.detach().to('cpu', copy=True)
        torch.save(q_state, path)
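
The quantization above stores, per layer, a uint8 code per weight plus a 256-entry float table of centroids; dequantizing then amounts to a table lookup. A standalone sketch on a random tensor (a smaller k keeps the toy example fast; the snippet itself uses k = 2 ** 8):

import torch
import kmeans1d

weight = torch.randn(8, 8)   # stand-in for a conv layer's weight
k = 2 ** 4                   # 16 centroids for the toy example

clusters, centroids = kmeans1d.cluster(weight.flatten().tolist(), k)
codes = torch.tensor(clusters, dtype=torch.uint8).reshape(weight.shape)
table = torch.tensor(centroids, dtype=torch.float32)

# Dequantize by looking each code up in the centroid table.
restored = table[codes.long()]
print(torch.max(torch.abs(restored - weight)))  # worst-case quantization error
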
Example #8
def cluster(bins, flattened_val, val, inertia=None):
    if USE_KMEANS_CUDA and kmeans_cuda:
        invalids = None
        int_bins = bins
        while invalids is None or int_bins - invalids < bins:
            if invalids:
                int_bins = bins + invalids
            codebook, _ = kmeans_cuda(flattened_val.reshape((-1, 1)),
                                      int_bins,
                                      device=1)
            invalids = np.count_nonzero(
                np.isnan(codebook).any(axis=1)) + np.count_nonzero(
                    np.isneginf(codebook).any(axis=1)) + np.count_nonzero(
                        np.isposinf(codebook).any(axis=1))
        # This is not a good solution since it may end up with the wrong number of bins;
        # however, kmeans_cuda is not easy to install, so it is probably not the right choice anyway.
        codebook = codebook[~np.isnan(codebook).any(axis=1)]
        codebook = codebook[~np.isneginf(codebook).any(axis=1)]
        codebook = codebook[~np.isposinf(codebook).any(axis=1)]
        last_inertia = None
    elif USE_KMEANS_1D and kmeans1d:
        clustered = kmeans1d.cluster(flattened_val, bins)
        codebook = np.array(clustered.centroids)
        encoded = codebook[clustered.clusters]
        last_inertia = np.sum(
            np.power(flattened_val, 2) - np.power(encoded, 2))
    else:
        # scipy is horribly slow
        kmeans = KMeans(n_clusters=bins)
        kmeans.fit(flattened_val.reshape((-1, 1)))
        codebook = kmeans.cluster_centers_
        last_inertia = kmeans.inertia_
    codebook = codebook.astype(val.dtype).flatten()
    compressed_val, codes = codes_and_compressed(flattened_val, codebook,
                                                 val.shape)
    if last_inertia is not None and inertia is not None:
        inertia.append(last_inertia)
    return compressed_val, codes, codebook
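
In the kmeans1d branch above, the centroids become the codebook and every value is replaced by the centroid of its cluster. A minimal sketch of that encode step with invented values (the codes_and_compressed helper belongs to the surrounding module and is not reproduced here):

import numpy as np
import kmeans1d

flattened_val = np.array([0.1, 0.12, 0.5, 0.52, 0.9], dtype=np.float32)
bins = 3

clustered = kmeans1d.cluster(flattened_val, bins)
codebook = np.array(clustered.centroids)   # ascending centroid table
encoded = codebook[clustered.clusters]     # each value replaced by its centroid
print(codebook)
print(encoded)
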
Example #9
l_m[1, 0] = 53.29799633
l_m[2, 0] = 22.6235733
df1 = pd.read_csv("kine_features1.csv")
print(df1.head())
data_12 = df1.values
norm = summarize_cls(data_12)
#print(data_12[1])
#print(class_separation(data_12))

d = list()
df = pd.read_csv("kine_class_1.csv")
#print(df.head())
data_1 = df["dist"]
data_2 = df.values
## Creating clusters from the workspace for sampling theta_initial
clusters, centroids = kmeans1d.cluster(data_1, 5)
# assign each sample to a cluster
#km.fit(x.reshape(-1,1))
for i in centroids:
    d.append(i)
##def Reverse(Ist):
#Ist.reverse()
#return Ist

# Driver Code
#lst = [10, 11, 12, 13, 14, 15]
#print(Reverse(d))
d_1 = d[::-1]
f1 = {0: d_1[0], 1: d_1[1], 2: d_1[2], 3: d_1[3], 4: d_1[4]}
l_m_1 = sqrt(pow(l_m[0, 0], 2) + pow(l_m[1, 0], 2) + pow(l_m[2, 0], 2))
d2 = []
Example #10
sentiment_list = []
objectivity_list = []
retweet_list = []

for tweet in tweets:
    favorites_list.append(tweet['favorites'])
    listed_list.append(tweet['listed'])
    followers_list.append(tweet['followers'])
    friends_list.append(tweet['friends'])
    statuses_list.append(tweet['statuses'])
    time_list.append(tweet['created_at'])
    sentiment_list.append(tweet['sentiment'])
    objectivity_list.append(tweet['objectivity'])
    retweet_list.append(tweet['retweets'])

favorites_clusters, favorites_centroids = kmeans1d.cluster(favorites_list, 16)
listed_clusters, listed_centroids = kmeans1d.cluster(listed_list, 7)
followers_clusters, followers_centroids = kmeans1d.cluster(followers_list, 11)
friends_clusters, friends_centroids = kmeans1d.cluster(friends_list, 16)
statuses_clusters, statuses_centroids = kmeans1d.cluster(statuses_list, 11)
time_clusters, time_centroids = kmeans1d.cluster(statuses_list, 24)
sentiment_clusters, sentiment_centroids = kmeans1d.cluster(sentiment_list, 5)
objectivity_clusters, objectivity_centroids = kmeans1d.cluster(
    objectivity_list, 5)

print(time_list)
print(time_clusters)
print(time_centroids)

x = []
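
Each tweet attribute above is bucketized independently; the cluster label then acts as a small categorical feature and the centroid as the bucket's representative value. A toy sketch for a single attribute with invented counts:

import kmeans1d

favorites = [0, 2, 3, 250, 260, 5000]   # invented favorite counts

clusters, centroids = kmeans1d.cluster(favorites, 3)
print(clusters)   # [0, 0, 0, 1, 1, 2] -- bucket index per tweet
print(centroids)  # bucket centers, ascending
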
Example #11
def sbl(A, y, eps=1e-3, thresholding_method='tau'):
    '''
    Refer to Slide 31 of http://math.iisc.ernet.in/~nmi/Chandra%20Murthy.pdf for the algorithm.
    Inputs:
    A = measurement matrix
    y = measurements
    sigval = standard deviation of the measurement noise
    tau = threshold on the signal
    '''
    # Doing this preprocessing inside sbl() function itself
    tau = 0.  #0.01 * np.min(y/np.sum(A, axis=-1))

    y_max = np.max(y)
    assert y_max >= 0
    if y_max > 0:
        A = A / y_max
        y = y / y_max

        pos_y = y[y > 0.]
        pos_A = A[y > 0.]
        sigval = np.std(pos_y / np.sum(pos_A, axis=-1))
    else:
        # sigma should be 0 but this will mess with the algo. Set it to some small
        # value
        sigval = 0.1
    y = np.array(y, dtype=np.float64)
    A = np.array(A, dtype=np.float64)

    [m, n] = A.shape

    # Pre-Processing
    # Since all A_ij and x_j are non-negative, y_i = 0 implies x_j = 0 for every j with A_ij = 1. This reduces the problem dimension.
    #nz_index = (np.where(y != 0))[0]
    #z_index = (np.where(y == 0))[0]
    #red_y = y[nz_index]

    #[r,c] = np.where(A[z_index,:] != 0)
    #ind_zero_x = np.unique(c)
    #ind = np.arange(0, n)
    #ind_nonzero_x = np.setxor1d(ind,ind_zero_x)
    #red_x = x[ind_nonzero_x]
    #red_A = (A[nz_index,:])[:,ind_nonzero_x]
    #red_n = (ind_nonzero_x.shape)[0]

    red_A = A
    red_n = n
    red_y = y
    ind_nonzero_x = np.arange(n)

    # Sparse Bayesian Learning
    # Corner cases are when 0 samples or all ys are 0
    if red_n == 0 or np.all(red_y == 0):
        x_est = np.zeros(n)
    else:
        #E-step
        #   mu is estimated mean of posterior distribution x|y, and so is the estimated red_x computed iteratively
        #   Sigma is variance of posterior distribution of x|y
        # M-step
        #   Gamma is the prior variance of x, inv_Gamma is saved as E-step only requires the inverse
        inv_Gamma = np.identity(red_n)
        gamma = np.ones(n)
        mu_old = np.ones(red_n)
        mu = np.zeros(red_n)
        variance = sigval * sigval
        #print('inside else')
        while np.sum(mu) == 0 or np.linalg.norm(
                mu_old - mu, 2) / np.linalg.norm(mu, 2) > eps:
            #print('inside loop')
            mu_old = mu
            inv_Sigma = np.matmul(np.transpose(red_A),
                                  red_A) / (variance) + inv_Gamma
            Sigma = np.linalg.inv(inv_Sigma)
            mu = np.matmul(Sigma, np.matmul(np.transpose(red_A),
                                            red_y)) / (variance)
            #mu[mu<0] = 0
            err = red_y - np.dot(red_A, mu)
            variance = (np.sum(err * err) +
                        variance * np.sum(1.0 -
                                          (1.0 / gamma) * np.diag(Sigma))) / m
            gamma = np.square(mu) + np.diag(Sigma)
            inv_Gamma = np.diag(1 / gamma)

        #pos_mu = mu[mu>0]
        #min_pos_mu = np.min(pos_mu)
        #mu[mu<=0] = min_pos_mu
        #log_mu = np.log(mu)
        #clusters, centroids = kmeans1d.cluster(log_mu, 2)
        #lower_centroid = centroids[0]
        #clusters = np.array(clusters)
        #lower_cluster = mu[clusters == 0]
        #lower_cluster_std = np.std(lower_cluster)
        #tau = lower_centroid - lower_cluster_std
        #log_tau = lower_centroid - lower_cluster_std
        #mu[log_mu < log_tau] = 0
        #print('\nmu = ', mu)
        #print('\nlog_mu = ', np.log(mu))
        #print('clusters = ', clusters)
        #print('centroids = ', centroids)

        assert thresholding_method in ['tau', 'cluster']

        if thresholding_method == 'cluster':
            # Cluster with mu_square instead of with mu to get even more precise results!
            # Still catch 40% of positives on avg
            mu1 = np.array(mu)
            mu1[mu1 < 0] = 0
            mu_square = mu1 * mu1
            #mu_square = mu
            clusters, centroids = kmeans1d.cluster(mu_square, 2)
            mu = mu * np.array(clusters)

        x_est = np.zeros(n)
        x_est[ind_nonzero_x] = mu
        if thresholding_method == 'tau':
            x_est[x_est < tau] = 0

    #print(x_est)
    return x_est
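
With thresholding_method == 'cluster', the estimate is cleaned up by splitting the squared posterior means into two 1-D clusters and zeroing everything in the lower one (label 0), so only the clearly large entries survive. A small sketch of that step on made-up values:

import numpy as np
import kmeans1d

mu = np.array([0.01, -0.02, 0.03, 0.9, 1.1, 0.02])   # made-up posterior means

mu1 = np.clip(mu, 0, None)                            # negative means contribute nothing
clusters, centroids = kmeans1d.cluster(mu1 * mu1, 2)  # label 0 = small, label 1 = large
x_est = mu * np.array(clusters)                       # zero out the small cluster
print(x_est)  # only the two large entries remain nonzero
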
Example #12
    def test_cluster(self):
        x = [4.0, 4.1, 4.2, -50, 200.2, 200.4, 200.9, 80, 100, 102]
        k = 4
        clusters, centroids = cluster(x, k)
        self.assertEqual(clusters, [1, 1, 1, 0, 3, 3, 3, 2, 2, 2])
        self.assertEqual(centroids, [-50.0, 4.1, 94.0, 200.5])
Example #13
Data_I_Arr = pd.Series(Data_I).array
Data_ID_Arr = pd.Series(Data_ID).array
Data_PR_Arr = pd.Series(Data_PR).array
Data_PRD_Arr = pd.Series(Data_PRD).array
Data_BR_Arr = pd.Series(Data_BR).array
Data_ER_Arr = pd.Series(Data_ER).array
Data_PV_Arr = pd.Series(Data_PV).array
Data_SD_Arr = pd.Series(Data_SD).array
Data_OS_Arr = pd.Series(Data_OS).array
Data_B_Arr = pd.Series(Data_B).array
Data_R_Arr = pd.Series(Data_R).array
Data_TT_Arr = pd.Series(Data_TT).array
Data_W_Arr = pd.Series(Data_W).array
Data_Label_Arr = pd.Series(Data_Label).array

clusters, centroids_A = kmeans1d.cluster(Data_A_Arr, 5)
clusters, centroids_AD = kmeans1d.cluster(Data_AD_Arr, 5)
clusters, centroids_I = kmeans1d.cluster(Data_I_Arr, 5)
clusters, centroids_ID = kmeans1d.cluster(Data_ID_Arr, 5)
clusters, centroids_PR = kmeans1d.cluster(Data_PR_Arr, 5)
clusters, centroids_PRD = kmeans1d.cluster(Data_PRD_Arr, 5)
clusters, centroids_BR = kmeans1d.cluster(Data_BR_Arr, 5)
clusters, centroids_ER = kmeans1d.cluster(Data_ER_Arr, 5)
clusters, centroids_PV = kmeans1d.cluster(Data_PV_Arr, 5)
clusters, centroids_SD = kmeans1d.cluster(Data_SD_Arr, 5)


def find_Centroid(arr):
    Out = np.array([0.0, 0.0, 0.0, 0.0])
    Out[0] = (arr[0] + arr[1]) / 2
    Out[1] = (arr[1] + arr[2]) / 2