def kmeans_function(theta, w_min, w_max):
    # Cluster theta into three groups and reweight the outermost centers.
    kmeans = KMeans(n_clusters=3, random_state=1).fit(theta)

    ######### weighted_Kmeans ##############
    # Sort the centers, then scale the lowest one by w_min and the highest
    # one by w_max before re-assigning the samples.
    kmeans.cluster_centers_ = numpy.ndarray.tolist(kmeans.cluster_centers_)
    kmeans.cluster_centers_.sort()
    kmeans.cluster_centers_ = numpy.array(kmeans.cluster_centers_)

    kmeans.cluster_centers_[0] = kmeans.cluster_centers_[0] * w_min
    kmeans.cluster_centers_[2] = kmeans.cluster_centers_[2] * w_max

    # Re-assign every sample to its nearest (reweighted) center.
    w_l = scipy.cluster.vq.vq(theta, kmeans.cluster_centers_)

    kmeans_label = w_l[0] - 1
    kmeans_centers = kmeans.cluster_centers_

    return kmeans_label, kmeans_centers
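
A minimal usage sketch for the function above (hypothetical data; assumes kmeans_function and its numpy/scipy/sklearn imports are available):

import numpy as np

# Three well-separated 1-D groups, shaped (n, 1) as KMeans expects.
theta = np.concatenate([
    np.random.normal(0.1, 0.02, 50),
    np.random.normal(0.5, 0.02, 50),
    np.random.normal(0.9, 0.02, 50),
]).reshape(-1, 1)

labels, centers = kmeans_function(theta, w_min=1.2, w_max=0.8)
# labels take values in {-1, 0, 1}: scipy's vq codes (0, 1, 2) shifted down by one.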
Example #2
def reset_openings(shape, ends, starts, openings):
    exits = openings["exits"]
    entrances = openings["entrances"]

    data_exits = []
    data_entrances = []
    for exit in exits:
        data_exits.append(((exit[0] + exit[2]) / 2, (exit[1] + exit[3]) / 2))

    for entrance in entrances:
        data_entrances.append(((entrance[0] + entrance[2]) / 2, (entrance[1] + entrance[3]) / 2))

    exit_kmeans = KMeans(n_clusters=(len(exits)))
    entrance_kmeans = KMeans(n_clusters=(len(entrances)))

    exit_kmeans.cluster_centers_ = np.array(data_exits, dtype=int)
    entrance_kmeans.cluster_centers_ = np.array(data_entrances, dtype=int)

    y_vals = exit_kmeans.predict(ends)
    sy_vals = entrance_kmeans.predict(starts)

    buckets = [0] * len(exits)
    s_buckets = [0] * len(entrances)

    for i in range(len(y_vals)):
        buckets[y_vals[i]] += 1

    for i in range(len(sy_vals)):
        s_buckets[sy_vals[i]] += 1

    return buckets, s_buckets
Example #3
def quantize_k_means(param,
                     k=16,
                     codebook=None,
                     guess='k-means++',
                     update_labels=False,
                     re_quantize=False,
                     **unused):
    """
    quantize using k-means clustering
    :param param:
    :param codebook: sklearn.cluster.KMeans, codebook of quantization, default=None
    :param k: int, the number of quantization level, default=16
    :param guess: str, initial quantization centroid generation method,
                       choose from 'linear', 'random', 'k-means++'
                  numpy.ndarray of shape (num_el, 1)
    :param update_labels: bool, whether to re-allocate the param elements to the latest centroids
    :param re_quantize: bool, whether to re-quantize the param
    :param unused: unused options
    :return:
        sklearn.cluster.KMeans, codebook of quantization
    """
    param_shape = param.size()
    num_el = param.numel()
    param_1d = param.view(num_el)

    if codebook is None or re_quantize:
        param_numpy = param_1d.view(num_el, 1).cpu().numpy()

        if guess == 'linear':
            guess = np.linspace(np.min(param_numpy), np.max(param_numpy), k)
            guess = guess.reshape(guess.size, 1)
        codebook = KMeans(n_clusters=k, init=guess, n_jobs=-1).fit(param_numpy)
        codebook.cluster_centers_ = torch.from_numpy(
            codebook.cluster_centers_).float()
        codebook.labels_ = torch.from_numpy(codebook.labels_).long()
        if param.is_cuda:
            codebook.cluster_centers_ = codebook.cluster_centers_.cuda(
                param.device)

    else:
        if update_labels:
            sorted_centers, indices = torch.sort(codebook.cluster_centers_,
                                                 dim=0)
            boundaries = (sorted_centers[1:] + sorted_centers[:-1]) / 2
            sorted_labels = torch.ge(param_1d - boundaries,
                                     0).long().sum(dim=0)
            codebook.labels_ = indices.index_select(0,
                                                    sorted_labels).view(num_el)
        for i in range(k):
            codebook.cluster_centers_[i, 0] = param_1d[codebook.labels_ ==
                                                       i].mean()

    param_quantize = codebook.cluster_centers_[codebook.labels_].view(
        param_shape)
    if param.is_contiguous():
        param_quantize = param_quantize.contiguous()
    param.set_(param_quantize)

    return codebook
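
A hypothetical usage sketch for quantize_k_means (assumes PyTorch and a scikit-learn version that still accepts KMeans(n_jobs=...)):

import torch

weight = torch.randn(64, 32)                               # toy parameter tensor
codebook = quantize_k_means(weight, k=16, guess='linear')  # first pass: fit codebook, quantize in place
codebook = quantize_k_means(weight, codebook=codebook,     # later passes: reuse the codebook,
                            update_labels=True)            # refresh labels and centroids
print(weight.unique().numel())                             # at most 16 distinct values now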
    def transform(self, X):
        """ Cluster the operational conditons of PHM08 dataset according to Wang et al (2008).
        
        Parameters
        ----------
        X: data.

        Reference
        ----------
        Wang, Tianyi, et al. "A similarity-based prognostics approach for remaining useful life
        estimation of engineered systems." Prognostics and Health Management, 2008.
        PHM 2008. International Conference on. IEEE, 2008.
        """
        kmeans = KMeans(n_clusters=6)
        kmeans.cluster_centers_ = self.op_centers

        operational_readings = np.array(X[[
            'operational_setting_1', 'operational_setting_2',
            'operational_setting_3'
        ]])

        if operational_readings.ndim == 1:
            operational_readings = operational_readings.reshape(1, -1)

        operational_conditions = kmeans.predict(operational_readings) + 1

        return operational_conditions
Example #5
    def test_basic(self, Xl_blobs_easy):
        X, _ = Xl_blobs_easy

        # make it super easy to cluster
        a = DKKMeans(n_clusters=3, random_state=0)
        b = SKKMeans(n_clusters=3, random_state=0)
        a.fit(X)
        b.fit(X)
        assert_estimator_equal(
            a,
            b,
            exclude=["n_iter_", "inertia_", "cluster_centers_", "labels_"])
        assert abs(a.inertia_ - b.inertia_) < 0.01
        # order is arbitrary, so align first
        a_order = np.argsort(a.cluster_centers_, 0)[:, 0]
        b_order = np.argsort(b.cluster_centers_, 0)[:, 0]
        a_centers = a.cluster_centers_[a_order]
        b_centers = b.cluster_centers_[b_order]
        np.testing.assert_allclose(a_centers, b_centers, rtol=1e-3)
        b_labels = replace(b.labels_, [0, 1, 2],
                           a_order[b_order]).astype(b.labels_.dtype)
        assert_eq(a.labels_.compute(), b_labels)
        assert a.n_iter_
        # this is hacky
        b.cluster_centers_ = b_centers
        a.cluster_centers_ = a_centers
        assert_eq(a.transform(X), b.transform(X), rtol=1e-3)

        yhat_a = a.predict(X)
        yhat_b = b.predict(X)
        assert_eq(yhat_a.compute(), yhat_b)
Example #6
def apply_weight_sharing(param, bits, codebook):
    """
    apply weight sharing to the parameter
    :param param: tensor, weight parameter of the model
    :param bits: int, number of bits for quantization
    :param codebook: dict, codebook for clustering param
    """
    num_el = param.numel()
    param_numpy = param.view(num_el).cpu().numpy()
    param_nz = param_numpy[param_numpy != 0]
    param_nz = param_nz.reshape(param_nz.size, 1)

    k = 2**bits
    if codebook is None:
        # Fit the codebook on the non-zero weights only, then prepend 0.0 as a
        # fixed extra center so pruned weights keep their own level.
        codebook = KMeans(n_clusters=k).fit(param_nz)
        centers = codebook.cluster_centers_
        centers = np.append(0.0, centers)
        codebook.cluster_centers_ = centers.reshape(centers.size, 1)

    codebook.labels_ = codebook.predict(
        param_numpy.reshape(num_el, 1).astype('float'))

    return codebook
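
A hypothetical usage sketch for apply_weight_sharing, assuming a pruned PyTorch weight tensor (the function only returns the codebook and leaves param itself unchanged):

import torch

weight = torch.randn(128, 64)
weight[weight.abs() < 0.5] = 0.0                  # simulate pruning
codebook = apply_weight_sharing(weight, bits=4, codebook=None)

# Rebuild the shared-weight tensor: 2**4 learned levels plus the fixed 0.0 level.
shared = codebook.cluster_centers_[codebook.labels_].reshape(tuple(weight.shape))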
Example #7
 def get_kmeans_object(self):
     kmeans_obj = KMeans()
     kmeans_obj.cluster_centers_ = self.cluster_centers_
     if hasattr(self, 'count_'):
         kmeans_obj.count_ = self.count_
     kmeans_obj.inverse_transform = lambda cluster_idx: kmeans_obj.cluster_centers_[cluster_idx, :]
     return kmeans_obj
Example #8
def run():
    cluster_centers = load_prediction()
    test_data = load_test_data()
    k = KMeans(n_clusters=200)
    k.cluster_centers_ = cluster_centers
    score = k.score(test_data)
    print("Score: %f" % (score / len(test_data) * -1))
Example #9
def get_prediction_and_truth(training_data, single_word_cats, multi_word_cats, algorithm, kwargs):
    train_embeddings = np.stack([embedding for embedding, _, _ in training_data]).astype("float64")
    eval_embeddings = np.stack([embedding for embedding, _, _ in single_word_cats]).astype("float64")
    if algorithm == "kmeans":
        cluster_model = KMeans(random_state=0, n_jobs=-1, **kwargs)
    elif algorithm == "attention":
        centroids = kwargs["centroids"]
        cluster_model = KMeans(random_state=0, n_jobs=-1, n_clusters=len(centroids), init=centroids)
        cluster_model.cluster_centers_ = centroids
    elif algorithm == "affinity":
        cluster_model = AffinityPropagation(damping=0.9, **kwargs)
    else:
        raise AssertionError("Clustering algorithm {} is not supported".format(algorithm))
    if algorithm != "attention":
        train_prediction = cluster_model.fit_predict(train_embeddings)
    prediction = cluster_model.predict(eval_embeddings)
    multiword_predictions = get_multiword_predictions(cluster_model, multi_word_cats)
    prediction = np.append(prediction, multiword_predictions)
    if algorithm == "attention":
        train_prediction = prediction[:len(single_word_cats)]

    print_clusters(prediction, [w for _, w, _ in single_word_cats + multi_word_cats])

    truth = [cat for _, _, cat in single_word_cats + multi_word_cats]

    assert len(truth) == len(prediction)

    return prediction, truth, train_prediction
Example #10
    def getState_r3_list(self):
        # Generate the BeeDescription from states_list
        from readData.feedState_r2 import getState_r2_list

        self.state_r2_list = getState_r2_list(states_list=self.states_list)
        state_r2_values = np.array([i.value for i in self.state_r2_list
                                    ]).reshape(-1, 1)
        # print(state_r2_values)

        kmeans = KMeans(n_clusters=len(self.centers_list), random_state=0)
        centers_array = np.array([[center] for center in self.centers_list
                                  ]).reshape(-1, 1)
        kmeans.cluster_centers_ = centers_array
        label_idxs = kmeans.predict(state_r2_values)
        # print(kmeans.transform(state_r2_values))
        # Build the aggregated state_r3 list, following the order of centers_list
        state_r3_list = []
        for center_idx, center in enumerate(self.centers_list):
            state_r3 = State_r3(value=center)
            mems_idx = list(np.where(label_idxs == center_idx)[0])
            r3_state_r2_list = [
                self.state_r2_list[state_r2_idx] for state_r2_idx in mems_idx
            ]
            if (len(r3_state_r2_list) == 0): continue
            state_r3_list.append(state_r3)
            state_r3.set_state_r2_list(r3_state_r2_list)
        return state_r3_list
Example #11
def compress_image(src_image):
    file = BytesIO(src_image)

    image_file = Image.open(file).convert("L")
    original_image = np.array(image_file)
    original_shape = original_image.shape
    original_image = skimage.measure.block_reduce(original_image, (3, 3),
                                                  np.average)
    shape = original_image.shape
    image = original_image.reshape(-1, 1)

    #means = [8, 66, 211, 252]
    #_means = list(range(256))
    _means = [22, 58, 95, 136, 174, 206, 234, 254]
    #_means = [0, 10, 23, 46, 56, 64, 73, 81, 89, 102, 109, 119, 129, 140, 152, 160, 168, 174, 179, 188, 198, 204, 211, 217, 221, 226, 230, 233, 236, 244, 250, 254]
    means = np.array(_means).reshape(-1, 1)

    X = np.array(list(range(8))).reshape(-1, 1)
    kmeans = KMeans().fit(X)
    kmeans.cluster_centers_ = means

    image = kmeans.predict(image).reshape(shape)

    result = BytesIO()
    io.imsave(result, image)
    result = base64.b64encode(result.getvalue())
    return (result, ) + original_shape
Example #12
    def create_codebook(self, features, _class='label'):

        if self.debug:
            print '\t- creating visual codebook for {0} ...'.format(_class)
            print '\t- features.shape', features.shape
            sys.stdout.flush()

        n_feats, n_cuboids, cuboid_depth = features.shape
        features = features.reshape(-1, cuboid_depth)

        if self.codebook_selection == self.cs_dict["kmeans"]:

            codebook = KMeans(init='k-means++', n_clusters=self.codebook_size, n_init=50,
                              tol=1e-10, max_iter=1000, random_state=self.seed, n_jobs=self.n_jobs)

            codebook.fit(features)

            return codebook

        else:

            codebook = KMeans(init='random', n_clusters=self.codebook_size, n_init=1,
                              tol=1e-10, max_iter=1, random_state=self.seed, n_jobs=self.n_jobs)

            codebook.cluster_centers_ = _init_centroids(features, k=self.codebook_size, init='random', random_state=self.seed)

            return codebook
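
_init_centroids is a private helper from older scikit-learn releases and is no longer importable from newer ones; on recent versions a random codebook can be drawn directly, for example with a hypothetical helper like:

import numpy as np

def random_codebook(features, codebook_size, seed):
    # Hypothetical stand-in for _init_centroids(features, k=codebook_size, init='random', ...):
    # sample codebook_size distinct rows of features as the initial centers.
    rng = np.random.RandomState(seed)
    idx = rng.choice(len(features), size=codebook_size, replace=False)
    return features[idx]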
    def find_patches_score_from_centroids(self, list):
        with open(
                CENTROIDS_PATH + str(CLUSTER_NUMBER) + "/centroids_" +
                str(CLUSTER_NUMBER) + ".txt", "rb") as fp:
            centroids = pickle.load(fp)
        k_means = KMeans(n_clusters=CLUSTER_NUMBER)
        k_means.cluster_centers_ = centroids

        for i in list:
            entries = sorted(os.scandir(SPLIT_IMAGES_PATH + str(i) + "/"),
                             key=lambda x: (x.is_dir(), x.name))
            significant_feature_number = []
            for entry in entries:
                tmp = []
                tmp.append(
                    np.asarray(
                        Image.open(SPLIT_IMAGES_PATH + str(i) + "/" +
                                   entry.name)))
                image = np.asarray(tmp)
                bottle_neck = self.__encoder.get_predict(image)
                tmp = k_means.predict(bottle_neck) + 1
                significant_feature_number.append(tmp[0])

            with open(
                    CENTROIDS_PATH + str(CLUSTER_NUMBER) + "/" + str(i) +
                    ".txt", "wb") as fp:
                pickle.dump(significant_feature_number, fp)
Example #14
def get_vlad_feat(img_list,grid_spacing,patch_size,bow_model):
    raw_feat_extractor=dsift.DsiftExtractor(grid_spacing,patch_size,1)    
    num_words,dim_feat=bow_model.shape
    dim_vlad=num_words*dim_feat
    vlad_feat=npy.zeros((len(img_list),dim_vlad),dtype=npy.float32)
    obj_kmeans=KMeans(num_words,'k-means++',3,500,0.001)
    obj_kmeans.cluster_centers_=bow_model
    eps_float32=npy.finfo(npy.float32).eps
    for kk in range(len(img_list)):
        print("Extracting VLAD feature,"+str(kk)+"/"+str(len(img_list)))
        img=imread(img_list[kk])
        if img.ndim==3:
            img=npy.mean(img,axis=2)
        raw_feat,pos_feat=raw_feat_extractor.process_image(img,False,False)
        label_feat=obj_kmeans.predict(raw_feat)
        vlad_feat_kk=npy.zeros(dim_vlad,dtype=npy.float32)
        for ii in range(label_feat.shape[0]):
            label_ii=label_feat[ii]
            res_ii=raw_feat[ii,:]-bow_model[label_ii,:]
            res_ii_norm=npy.sqrt(npy.sum(res_ii*res_ii))
            res_ii=res_ii/(res_ii_norm+eps_float32)
            res_ii=res_ii+vlad_feat_kk[label_ii*dim_feat:(label_ii+1)*dim_feat]
            vlad_feat_kk[label_ii*dim_feat:(label_ii+1)*dim_feat]=res_ii
        vlad_feat_kk_ssr=npy.sqrt(npy.abs(vlad_feat_kk))
        idx_temp=vlad_feat_kk>0
        vlad_feat_kk[idx_temp]=vlad_feat_kk_ssr[idx_temp]
        idx_temp=npy.logical_not(idx_temp)
        vlad_feat_kk[idx_temp]=-vlad_feat_kk_ssr[idx_temp]
        vlad_feat[kk,:]=vlad_feat_kk/(npy.sqrt(npy.sum(vlad_feat_kk*vlad_feat_kk)+eps_float32))
    return vlad_feat
Example #15
 def __init__(self,
              data,
              labels,
              cluster_per_class=2,
              threshold=None,
              verbose=0):
     classes = np.unique(labels)
     if verbose: print("Clustering...")
     kmeans = KMeans(n_clusters=int(len(classes) * cluster_per_class),
                     verbose=verbose).fit(data)
     if threshold is not None:
         if verbose: print("Removing anomalies...")
         cluster_cnts = np.histogram(kmeans.labels_,
                                     bins=len(np.unique(kmeans.labels_)))[0]
         print("Samples in clusters: {}".format(cluster_cnts))
         kmeans.cluster_centers_ = kmeans.cluster_centers_[
             cluster_cnts > threshold]
         elim_labels = np.where(cluster_cnts <= threshold)
         if type(elim_labels) is tuple and len(elim_labels) == 1:
             elim_labels = elim_labels[0]
         print("Clusters eliminated: {}".format(elim_labels))
         for label in elim_labels:
             # print(kmeans.labels_, label)
             # print(sum(kmeans.labels_ == label))
             # print(data[kmeans.labels_ == label, :].shape)
             kmeans.labels_[kmeans.labels_ == label] = kmeans.predict(
                 data[kmeans.labels_ == label, :])
     self.encoders = [
         kmeans,
     ]
     self._labels = labels.copy()
Example #16
def k_maedoids_centers(X, k, init="k-means++"):
    # kmeans = KMeans(n_clusters=k, max_iter=1, n_init=1, init=init).fit(X)
    kmean = KMeans(n_clusters=k).fit(X)
    centers = []
    index_score = []
    score_mean = -2
    score_std = -2
    index = -1
    indexes = []
    for i in range(10):
        for center in kmean.cluster_centers_:
            new_center = X[0]
            dist = np.linalg.norm(center - new_center)
            for j in range(1, len(X)):
                tmp_dist = np.linalg.norm(center - X[j])
                if tmp_dist < dist:
                    index = j
                    dist = tmp_dist
                    new_center = X[j]
            centers.append(new_center)
            indexes.append(index)
        predicted = kmean.predict(X)
        index_score.append(davies_bouldin_score(X, predicted))
        score_mean = np.mean(index_score)
        score_std = np.std(index_score)
    kmean.cluster_centers_ = np.array(centers)
    return kmean, score_mean, score_std, np.array(indexes)
Example #17
    def sample_points(self):
        k = KMeans(n_clusters=self.no_clusters)
        k.cluster_centers_ = np.array(self.cluster_center_points)
        assigned_clusters = k.predict(np.array(self.data))

        self.cluster_centers = [
            ClusterCenter(c) for c in self.cluster_center_points
        ]
        self.data_points = [
            DataPoint(self.data[i], self.cluster_centers[assigned_clusters[i]])
            for i in range(len(self.data))
        ]

        dp_sum = np.sum([dp.calc_sampling_weight()
                         for dp in self.data_points]) / self.out_per_mapper

        for dp in self.data_points:
            dp.dp_sum = dp_sum

        #logging.warn("Tot!")
        #logging.warn(sum([dp.calc_sampling_probability() for dp in self.data_points]))
        #logging.error(len(self.data_points))

        while self.can_write_more_features():
            np.random.shuffle(self.data_points)
            for dp in self.data_points:
                if not self.can_write_more_features():
                    return

                dp.dp_sum = dp_sum
                if np.random.sample() < dp.calc_sampling_probability():
                    self.write_feature(dp.point,
                                       dp.calc_weight(self.out_per_mapper))
    def reset_openings(self, ends, exits):
        exit_kmeans = KMeans(n_clusters=(len(exits)))

        exit_kmeans.cluster_centers_ = np.array(exits, dtype=int)

        y_vals = exit_kmeans.predict(ends)

        return y_vals
Example #19
def adjacent(centers,index1, index2, X):
    n_clusters = len(centers)
    kmeans1 = KMeans(n_clusters)
    kmeans1.cluster_centers_ = centers
    labels = kmeans1.predict(X)
    if index2 in np.argsort(kmeans1.transform(X[labels == index1]), axis = 1)[:,1]:
        return True
    return False
Example #20
def Gain(centers, X, alpha=0.75):
    SSEDM_Arr = []
    n_clusters = len(centers)
    kmeans2 = KMeans(n_clusters)
    kmeans2.cluster_centers_ = centers
    labels = kmeans2.predict(X)
    for i in range(n_clusters):
        SSEDM_Arr.append(np.sum(kmeans2.transform(X[labels == i]), axis=0)[i])
    SSEDM_Arr = [SSEDM_Arr[i] * alpha for i in range(len(SSEDM_Arr))]
    return SSEDM_Arr
Example #21
def restore_codebook(codebook_filename):
    """
    reads the cluster_centers from the codebook file
    and restores a kmeans model to use for prediction
    """
    cluster_centers = np.fromfile(codebook_filename).reshape(-1, 128)
    n_clusters = len(cluster_centers)
    codebook = KMeans(n_clusters=n_clusters, random_state=42)
    codebook.cluster_centers_ = cluster_centers
    return codebook
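
The matching write step is not shown here; a hypothetical counterpart consistent with the np.fromfile(...).reshape(-1, 128) call above would be:

import numpy as np

def save_codebook(codebook, codebook_filename):
    # Hypothetical inverse of restore_codebook: dump the raw float64 centers so
    # that np.fromfile(codebook_filename).reshape(-1, 128) recovers them.
    np.asarray(codebook.cluster_centers_, dtype=np.float64).tofile(codebook_filename)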
def init_constant(x, k):
    mu = np.array([20, 120, 200])
    sigma = np.empty(k)
    kmeans = KMeans(n_clusters=k)
    kmeans.cluster_centers_ = mu.reshape(-1, 1)
    nn = kmeans.predict(x.reshape(-1, 1))
    z = np.bincount(nn)
    pi = z / np.sum(z)
    for i in range(k):
        sigma[i] = np.mean((x[nn == i] - mu[i])**2)
    return mu, sigma, pi
Example #23
def RGBToIndex(img, color_classes):
    # reconstruct the kmeans from center information
    kmeans = KMeans(n_clusters=n_indexed_colors, random_state=0)
    kmeans.cluster_centers_ = color_classes
    # Reshape the image into a vector of pixels
    pixel_vector = img.reshape(-1, 3)
    # Get the nearest class for each pixel
    labels = kmeans.predict(pixel_vector)
    # Reshape the indexed image to the height and width of the original
    return_img = labels
    rows, cols, channels = img.shape
    return return_img.reshape(rows, cols)
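
The inverse mapping is not shown; a hypothetical IndexToRGB consistent with the function above would simply index back into the palette:

import numpy as np

def IndexToRGB(indexed_img, color_classes):
    # Hypothetical inverse of RGBToIndex: replace each class index with its RGB center.
    palette = np.asarray(color_classes).round().astype(np.uint8)
    return palette[indexed_img]                  # (rows, cols) -> (rows, cols, 3)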
Example #24
def Cost(centers, X):
    cost_Arr = []
    n_clusters = len(centers)
    kmeans = KMeans(n_clusters)
    kmeans.cluster_centers_ = centers
    labels = kmeans.predict(X)
    for i in range(n_clusters):
        dis = np.sort(kmeans.transform(X[labels == i]), axis=1)
        sub_SSEDM = np.sum(dis[:, 0])
        ccp_SSEDM = np.sum(dis[:, 1])
        cost_Arr.append(sub_SSEDM - ccp_SSEDM)
    return cost_Arr
Example #25
    def remove_half_nearest_points(self, center_points, data):
        k = KMeans(n_clusters=self.no_clusters)
        k.cluster_centers_ = np.array(center_points)
        assigned_clusters = k.predict(np.array(data))
        clusters = [ClusterCenter(c) for c in center_points]
        for i in range(0, len(assigned_clusters)):
            clusters[assigned_clusters[i]].add_point(data[i])

        ret = []
        for c in clusters:
            ret += c.get_half_farthest_points()
        return ret
Example #26
def ikmeans(km, X, min_improvement=.01):
    ''' incremental kmeans; split worst cluster based on the silhouette score '''
    K = len(km.cluster_centers_)

    labels = km.labels_
    # compute label distribution
    labels_ratio = np.histogram(labels, bins=len(set(labels)))[0]
    labels_ratio = np.array(labels_ratio, dtype=float) / len(labels)
    scores = metrics.silhouette_samples(X, labels, metric='euclidean')
    score = scores.mean()

    # measure global performance of each cluster
    k_score = np.zeros(K)
    for k in range(K):
        idx = np.where(labels == k)
        k_score[k] = scores[idx].mean()

    # identify the cluster to split where population is higher than min_population_ratio
    idx = np.where(labels_ratio > 0.01)[0]
    worst_score = k_score[idx].max()
    worst_idx = np.where(k_score == worst_score)[0]
    if len(worst_idx) > 1:
        print("several worst k (%i)" % len(worst_idx))
    worst_k = worst_idx[0]
    print("worsk cluster -> %i" % (worst_k))

    # split worst cluster
    idx = np.where(labels == worst_k)[0]
    X_k = X[idx]
    if len(X_k) <= 2:
        print("not enought data point to split")
        return
    skm = KMeans(n_clusters=2, random_state=1).fit(X_k)

    # measure improvement with the 2 new clusters
    ikm = KMeans(n_clusters=K + 1, random_state=1)
    new_centers = np.array(km.cluster_centers_).tolist()
    new_centers.remove(new_centers[worst_k])
    new_centers.extend(skm.cluster_centers_.tolist())
    ikm.cluster_centers_ = np.array(new_centers)
    ilabels = ikm.predict(X)
    ikm.labels_ = ilabels
    new_score = metrics.silhouette_score(X, ilabels, metric='euclidean')

    improvement = (score - new_score) / score

    if improvement > min_improvement:
        print("increase k (%2.2f%%)" % (improvement * 100))
        return ikmeans(ikm, X, min_improvement)
    else:
        print("improvement %s->%s = %2.2f%%" % (K, K + 1, improvement * 100))
        print("best k = %i" % K)
        return km
Example #27
def deserialize_kmeans_clustering(model_dict):
    model = KMeans(model_dict["params"])

    model.cluster_centers_ = np.array(model_dict["cluster_centers_"])
    model.labels_ = np.array(model_dict["labels_"])
    model.inertia_ = model_dict["inertia_"]
    model.n_features_in_ = model_dict["n_features_in_"]
    model.n_iter_ = model_dict["n_iter_"]
    model._n_threads = model_dict["_n_threads"]
    model._tol = model_dict["_tol"]

    return model
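
The corresponding serializer is not included in this snippet; a sketch producing the dict layout that deserialize_kmeans_clustering reads would look roughly like:

def serialize_kmeans_clustering(model):
    # Hypothetical inverse of deserialize_kmeans_clustering above.
    return {
        "params": model.get_params(),
        "cluster_centers_": model.cluster_centers_.tolist(),
        "labels_": model.labels_.tolist(),
        "inertia_": model.inertia_,
        "n_features_in_": model.n_features_in_,
        "n_iter_": model.n_iter_,
        "_n_threads": model._n_threads,
        "_tol": model._tol,
    }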
Example #29
def init_sklearn_kmeans_from_checkpoint(checkpoint_path):
    checkpoint_path = pathlib.Path(checkpoint_path)
    with open(checkpoint_path / 'clusters.json', 'rb') as jf:
        clusters = np.array(json.load(jf))

    clusters = drop_nan_packets(clusters)
    # make KMeans think it was fitted
    quantizer = KMeans(n_clusters=clusters.shape[0])
    quantizer._n_threads = 1
    quantizer.cluster_centers_ = clusters
    logger.info(f'init sklearn KMeans from checkpoint: {checkpoint_path}')
    return quantizer
def get_bow_feat(img_list,grid_spacing,patch_size,bow_model):
    raw_feat_extractor=dsift.DsiftExtractor(grid_spacing,patch_size,1)
    num_words=bow_model.shape[0]
    obj_kmeans=KMeans(num_words,'k-means++',3,500,0.001)
    obj_kmeans.cluster_centers_=bow_model
    bow_feat=npy.zeros((len(img_list),num_words),dtype=npy.float32)
    for kk in range(len(img_list)):
        img=imread(img_list[kk])
        if img.ndim==3:
            img=npy.mean(img,axis=2)
        raw_feat=raw_feat_extractor.process_image(img,False,False)[0]
        label_feat=obj_kmeans.predict(raw_feat)
        bow_feat[kk,:]=get_hist(label_feat,npy.array([0,num_words-1]),num_words,True)
    return bow_feat
Example #32
 def __init__(self, centers=None, n_clusters=256, dim=16):
     """Product quantization for better cluster
     args:
         n_clusters (int): number clusters on each product
         dim (int): dimension of each cluster
     """
     self.n_clusters = n_clusters
     self.dim = dim
     self.centers = centers
     self.kmeans = []
     for i in range(0, self.centers.shape[1], self.dim):
         kmeans = KMeans(n_clusters=self.n_clusters)
         kmeans.cluster_centers_ = self.centers[:, i:i+self.dim]
         self.kmeans.append(kmeans)
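
Only the constructor is shown above; a hypothetical encode method consistent with it would quantize every dim-wide block of a vector against its own codebook:

 def encode(self, x):
     # Hypothetical: return one code per dim-wide block of x via nearest-center lookup.
     import numpy as np
     codes = []
     for block_idx, kmeans in enumerate(self.kmeans):
         start = block_idx * self.dim
         block = np.asarray(x[start:start + self.dim], dtype=float)
         dists = np.linalg.norm(kmeans.cluster_centers_ - block, axis=1)
         codes.append(int(np.argmin(dists)))
     return codes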
def get_spm_feat(img_list,grid_spacing,patch_size,bow_model,pyramid_level):
    raw_feat_extractor=dsift.DsiftExtractor(grid_spacing,patch_size,1)
    num_words=bow_model.shape[0]
    dim_spm=num_words*(4**(pyramid_level+1)-1)//3
    obj_kmeans=KMeans(num_words,'k-means++',3,500,0.001)
    obj_kmeans.cluster_centers_=bow_model
    spm_feat=npy.zeros((len(img_list),dim_spm),dtype=npy.float32)
    for kk in range(len(img_list)):
        img=imread(img_list[kk])
        if img.ndim==3:
            img=npy.mean(img,axis=2)
        raw_feat,pos_feat=raw_feat_extractor.process_image(img,False,False)
        label_feat=obj_kmeans.predict(raw_feat)
        spm_feat[kk,:]=get_spm_hist(label_feat,pos_feat,num_words,pyramid_level,img.shape)
    return spm_feat   
Example #34
def extract_features(data):
    kmeans = KMeans()
    kmeans.cluster_centers_ = vCenters

    bovw = []
    for idx, image in enumerate(data):
        image_feature_descriptors = extract_HOG_descriptors_per_image(image)
        Y = kmeans.predict(image_feature_descriptors.T)

        vFeatures = np.zeros(vCenters.shape[0])
        for vfeature in Y:
            vFeatures[vfeature] += 1
        bovw.append(vFeatures)


    return np.asarray(bovw)
Example #35
def readBespokeFile(infile):
    """Returns a Model namedtuple with all the model parts"""
    with open(infile, 'r') as modelfile:
        lines = iter(modelfile.read().splitlines())
    n_params = int(lines.next())
    metric_names = [lines.next() for i in range(n_params)]
    means = _stringToArray(lines.next())
    stdevs = _stringToArray(lines.next())
    rotation_matrix = _stringToArray(lines.next())
    models = []
    centroids = []
    try:
        while True:
            name = lines.next() # kill a line
            centroids.append(_stringToArray(lines.next()))
            weights = _stringToArray(lines.next())
            functions = [LinearRegression.stringToFunction(lines.next()) 
                         for i in range(weights.shape[0])]
            models.append(LinearRegression.Model(functions, weights))
    except StopIteration:
        pass
    kmeans = KMeans(len(centroids))
    kmeans.cluster_centers_ = np.array(centroids)
    return Model(metric_names, means, stdevs, rotation_matrix, kmeans, models)
Example #36
        # run doughnut and regular k-means cluster alg and store metrics
        clus = KMeans(n_clusters =k-1)
        clus_reg = KMeans(n_clusters = k)

        #   run  lloyds alg on regular and doughnuted data. Uses KMeans++ 
        #   method: max centroid distance.
        clus.fit(data[clustered])
        clus_reg.fit(data)

        #------------ Deal with Labels
        # Method 1: need to classify the held out according to closest centroids
        held_labels = []
        
         # append the centroid of heldout points
        centroid = np.mean(data[heldout],axis=0)
        clus.cluster_centers_=np.append(clus.cluster_centers_,[centroid],axis=0)
        
        # assign to cluster with closest centroid
        for h in heldout:
            held_labels.append(np.linalg.norm(np.subtract(data[h], clus.cluster_centers_),axis=1).argmin())
                
        # assign the heldouts according to held_labels to stitch labels back together
        stitched_label = np.zeros(len(data), dtype=int)
        for b in range(len(heldout)):   
            stitched_label[heldout[b]]=held_labels[b]
           
        for b in range(len(clustered)):
            stitched_label[clustered[b]]=clus.labels_[b]
        #------------ at this point the labels of our doughnut method are titles stitched_label
        
        # retrieve the prop of clusters and rsq (ratio between/within var)
Example #37
def run(args):
    """
    Interface into Eiger model generation/polling/serialization/printing/etc.
    """
    if args['input'] == None:
        print "Loading training data..."
        training_DC = database.DataCollection(args['training_datacollection'], 
                                              args['db'])
        if args['dump_csv'] is not None:
            header = ','.join([met[0] for met in training_DC.metrics])
            np.savetxt(args['dump_csv'], training_DC.profile, delimiter=',', 
                       header=header, comments='')
            return
        for idx,metric in enumerate(training_DC.metrics):
            if(metric[0] == args['performance_metric']):
                performance_metric_id = idx
        try:
            training_performance = training_DC.profile[:,performance_metric_id]
        except UnboundLocalError:
            print "Unable to find performance metric '%s', " \
            "please specify a valid one: " % (args['performance_metric'],)
            for (my_name,my_desc,my_type) in training_DC.metrics:
                if my_type == 'result':
                    print "\t%s" % (my_name,)
            return
        if(args['predictor_metrics'] is not None):
            metric_ids = training_DC.metricIndexByName(args['predictor_metrics'])
        else:
            metric_ids = training_DC.metricIndexByType('deterministic', 
                                                       'nondeterministic')
        try:
            metric_ids.remove(performance_metric_id)
        except ValueError:
            pass
        metric_names = [training_DC.metrics[mid][0] for mid in metric_ids]
        try:
            training_profile = training_DC.profile[:,metric_ids]
        except IndexError:
            print "Unable to make model for empty data collection. Aborting..."
            return

        #pca
        training_pca = PCA.PCA(training_profile)
        nonzero_components = training_pca.nonzeroComponents()
        rotation_matrix = training_pca.components[:,nonzero_components]
        rotated_training_profile = np.dot(training_profile, rotation_matrix)

        print "Visualizing PCA..."
        if(args['plot_scree']):
            print training_pca.loadings
            PCA.PlotScree(training_pca.loadings, log=False, 
                              title="PCA Scree Plot")
        if(args['plot_pcs_per_metric']):
            PCA.PlotPCsPerMetric(rotation_matrix, metric_names, 
                                 title="PCs Per Metric")
        if(args['plot_metrics_per_pc']):
            PCA.PlotMetricsPerPC(rotation_matrix, metric_names, 
                                 title="Metrics Per PC")
        #kmeans
        n_clusters = args['clusters']
        kmeans = KMeans(n_clusters)
        means = np.mean(rotated_training_profile, axis=0)
        stdevs = np.std(rotated_training_profile - means, axis=0, ddof=1)
        stdevs[stdevs==0.0] = 1.0
        clusters = kmeans.fit_predict((rotated_training_profile - means)/stdevs)

        # reserve a vector for each model created per cluster
        models = [0] * n_clusters

        print "Modeling..."
        # for printing the json file
        json_root = {}
        with tempfile.NamedTemporaryFile(delete=False) as modelfile:
            # For printing the original model file encoding 
            modelfile.write("%s\n%s\n" % (len(metric_names), '\n'.join(metric_names)))
            modelfile.write("[%s](%s)\n" % 
                    (len(means), ','.join([str(mean) for mean in means.tolist()])))
            modelfile.write("[%s](%s)\n" % 
                    (len(stdevs), ','.join([str(stdev) for stdev in stdevs.tolist()])))
            modelfile.write("[%s,%s]" % rotation_matrix.shape)
            modelfile.write("(%s)\n" % 
                            ','.join(["(%s)" % 
                                ','.join([str(elem) for elem in row]) 
                                for row in rotation_matrix.tolist()]))
            # for printing the json file
            json_root["metric_names"] = [name for name in metric_names]
            json_root["means"] = [mean for mean in means.tolist()]
            json_root["std_devs"] = [stdev for stdev in stdevs.tolist()]
            json_root["rotation_matrix"] = [[elem for elem in row] for row in rotation_matrix.tolist()]
            json_root["clusters"] = []

            for i in range(n_clusters):
                cluster_profile = rotated_training_profile[clusters==i,:]
                cluster_performance = training_performance[clusters==i]
                regression = LinearRegression.LinearRegression(cluster_profile,
                                                               cluster_performance)
                pool = [LinearRegression.identityFunction()]
                for col in range(cluster_profile.shape[1]):
                    if('inv_quadratic' in args['regressor_functions']):
                        pool.append(LinearRegression.powerFunction(col, -2))
                    if('inv_linear' in args['regressor_functions']):
                        pool.append(LinearRegression.powerFunction(col, -1))
                    if('inv_sqrt' in args['regressor_functions']):
                        pool.append(LinearRegression.powerFunction(col, -.5))
                    if('sqrt' in args['regressor_functions']):
                        pool.append(LinearRegression.powerFunction(col, .5))
                    if('linear' in args['regressor_functions']):
                        pool.append(LinearRegression.powerFunction(col, 1))
                    if('quadratic' in args['regressor_functions']):
                        pool.append(LinearRegression.powerFunction(col, 2))
                    if('log' in args['regressor_functions']):
                        pool.append(LinearRegression.logFunction(col))
                    if('cross' in args['regressor_functions']):
                        for xcol in range(col, cluster_profile.shape[1]):
                            pool.append(LinearRegression.crossFunction(col, xcol))
                    if('div' in args['regressor_functions']):
                        for xcol in range(col, cluster_profile.shape[1]):
                            pool.append(LinearRegression.divFunction(col,xcol))
                            pool.append(LinearRegression.divFunction(xcol,col))
                (models[i], r_squared, r_squared_adj) = regression.select(pool, 
                        threshold=args['threshold'],
                        folds=args['nfolds'])
                
                # dump model to original file encoding
                modelfile.write('Model %s\n' % i)
                modelfile.write("[%s](%s)\n" % (rotation_matrix.shape[1],
                                                ','.join([str(center) for center in
                                                    kmeans.cluster_centers_[i].tolist()])))
                modelfile.write(repr(models[i]))
                modelfile.write('\n') # need a trailing newline

                # dump model for json encoding
                json_cluster = {}
                json_cluster["center"] = [center for center in kmeans.cluster_centers_[i].tolist()]
                # get models in json format
                json_cluster["regressors"] = models[i].toJSONObject()
                json_root["clusters"].append(json_cluster)

                print "Index\tMetric Name"
                print '\n'.join("%s\t%s" % metric for metric in enumerate(metric_names))
                print "PCA matrix:"
                print rotation_matrix 
                print "Model:\n" + str(models[i])

                print "Finished modeling cluster %s:" % (i,)
                print "r squared = %s" % (r_squared,)
                print "adjusted r squared = %s" % (r_squared_adj,)
           
        # if we want to save the model file, copy it now
        if args['output'] == True:
            if args['json'] == True:
                with open(training_DC.name + '.model', 'w') as outfile:
                    json.dump(json_root, outfile, indent=4)
            else:
                shutil.copy(modelfile.name, training_DC.name + '.model')
    else:
        lines = iter(open(args['input'],'r').read().splitlines())
        n_params = int(lines.next())
        metric_names = [lines.next() for i in range(n_params)]
        means = _stringToArray(lines.next())
        stdevs = _stringToArray(lines.next())
        rotation_matrix = _stringToArray(lines.next())
        models = []
        centroids = []
        try:
            while True:
                name = lines.next() # kill a line
                centroids.append(_stringToArray(lines.next()))
                weights = _stringToArray(lines.next())
                functions = [LinearRegression.stringToFunction(lines.next()) 
                             for i in range(weights.shape[0])]
                models.append(LinearRegression.Model(functions, weights))
        except StopIteration:
            pass
        kmeans = KMeans(len(centroids))
        kmeans.cluster_centers_ = np.array(centroids)

    if(args['experiment_datacollection'] or args['test_fit']):
        DC = args['experiment_datacollection'] if \
            args['experiment_datacollection'] else args['training_datacollection']
        print "Running experiment on data collection %s..." % \
              (DC,)
        experiment_DC = database.DataCollection(DC, 
                                                args['db'])
        _runExperiment(kmeans, means, stdevs, models, rotation_matrix,
                       experiment_DC, args, metric_names)
    print "Done!"
    def _train(self, phi, data, y=None):
        assert y is None
        n_samples = self._settings['n_samples']
        n_per_image = self._settings['n_per_image']

        n_images = n_samples // n_per_image

        # TODO: This always processes all the images
        X = phi(data[:n_images])
        ag.info('Extracting patches')
        #patches = self._get_patches(X)

        # Get patches
        gen = ag.image.extract_patches(X, self._part_shape,
                                       samples_per_image=n_per_image,
                                       seed=self._settings['seed'])


        ag.info('Extracting patches 2')
        # Filter
        th = self._settings['std_thresh']
        if th > 0:
            gen = (x for x in gen if x.std() >= th)

        rs = np.random.RandomState(0)

        # Now request the patches and convert them to floats
        #patches = np.asarray(list(itr.islice(gen, n_samples)), dtype=np.float64) / 255
        patches = np.asarray(list(itr.islice(gen, n_samples)))
        ag.info('Extracting patches 3')

        from vzlog.default import vz

        # Flatten the patches
        pp = patches.reshape((patches.shape[0], -1))

        C = X.shape[-1]
        sh = (-1,) + self._part_shape + (C,)

        if C <= 3:
            def plot(title):
                vz.section(title)
                grid = ag.plot.ColorImageGrid(pp[:1000].reshape(sh), rows=15)
                grid.save(vz.impath(), scale=3)
        else:
            def plot(title): return

        plot('Original patches')

        # Standardize the patches
        if self._settings['standardize']:
            pp = self._standardize_patches(pp)

        plot('Standardized patches')

        # Determine whitening coefficients
        sigma = np.dot(pp.T, pp) / len(pp)

        self._extra['sigma'] = sigma

        if self._settings['whiten']:
            U, S, _ = np.linalg.svd(sigma)

            shrinker = np.diag(1 / np.sqrt(S + self.w_epsilon))

            #self._whitening_matrix = U @ shrinker @ U.T
            self._whitening_matrix = np.dot(U, np.dot(shrinker, U.T))

            # Now whiten the training patches
            pp = self.whiten_patches(pp)
        else:
            self._whitening_matrix = None

        plot('Whitened patches')

        if self._settings['random_centroids']:
            rs = np.random.RandomState(self._settings['seed'])
            sh = (self._num_parts,) + self._part_shape
            self._parts = rs.normal(0, 1, size=sh)
            #self._parts /= ag.apply_once(np.mean, self._parts, [1, 2])
            return
        else:
            # Run K-means
            from sklearn.cluster import KMeans, MiniBatchKMeans

            #cl = MiniBatchKMeans(
            cl = KMeans(
                        n_clusters=self._num_parts,
                        n_init=self._settings['n_init'],
                        max_iter=self._settings['max_iter'],
                        random_state=self._settings['seed'],
                        #batch_size=50000,
                        n_jobs=self._settings['n_jobs'],
                        )

            ag.info('Training', self._num_parts, 'K-means parts')
            cl.fit(pp)
            ag.info('Done.')

            counts = np.bincount(cl.labels_, minlength=self._num_parts)
            ww = counts / counts.sum()
            HH = np.sum(-ww * np.log(ww))
            print('entropy', HH)

            II = np.argsort(counts)[::-1]
            cl.cluster_centers_ = cl.cluster_centers_[II]
            counts = counts[II]

            ag.info('counts', counts)

            self._parts = cl.cluster_centers_.reshape((-1,) + patches.shape[1:])

            vz.section('Parts')

        self._preprocess()
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 13 20:26:56 2015

@author: felix
"""

from sklearn.cluster import KMeans
import numpy as np


C = np.array([[1,1], [1,2], [2,1], [2,2], [5,1], [6,1], [5,2]])
centers = [[3,0], [5,0]]

clf = KMeans(init='k-means++', n_clusters=2, n_init=5)
clf.cluster_centers_ = centers

clf.fit(C)

print "centros: ", clf.cluster_centers_
def main():
	start_date = datetime.datetime.now()
	search_date = start_date + datetime.timedelta(-30) 
	week1_query ='''SELECT T1.uid_i as uid_i,ave as ave_f, special_crystal as special_crystal_f, 
	    pve_consumable as pve_consumable_f, upgrade as upgrade_f, premium_hero as premium_hero_f, n_transactions_i,age_i

	FROM


	(SELECT uid_i, s_ave/total as ave, s_special_crystal/total as special_crystal, s_pve_consumable/total as pve_consumable, 
	s_upgrade/total as upgrade, s_premium_hero/total as premium_hero, n_transactions_i
	FROM

	(SELECT uid_i, SUM(ave) as s_ave, SUM(special_crystal) as s_special_crystal, sum(pve_consumable) as s_pve_consumable, sum(upgrade) as s_upgrade,
	sum(premium_hero) as s_premium_hero,
	(SUM(ave) + SUM(special_crystal) +sum(pve_consumable)+sum(upgrade)+sum(premium_hero)) as total, COUNT(*) as n_transactions_i
	FROM

	(SELECT uid_i, data_reason_desc_s,data_reason_pricing_id_s, 
	(case when left(data_reason_pricing_id_s,4) ='ave_' then data_item_q_i else 0 end) as ave,

	(case when data_reason_pricing_id_s LIKE('%crystal%') and data_reason_pricing_id_s not LIKE('%golden%') then data_item_q_i 
	when data_reason_pricing_id_s LIKE('%upsale%') then data_item_q_i 
	when data_reason_pricing_id_s like('rocket%') then data_item_q_i
	else 0 end) as special_crystal,

	(case when data_reason_pricing_id_s LIKE('%golden%') then data_item_q_i 
	when data_reason_pricing_id_s LIKE('%upgrade%') then data_item_q_i 
	when data_reason_pricing_id_s LIKE('%regen%') then data_item_q_i
	when data_reason_pricing_id_s LIKE('%arena%') then data_item_q_i
	when data_reason_pricing_id_s LIKE('%duel%') then data_item_q_i
	when data_reason_pricing_id_s LIKE('%key%') then data_item_q_i 
	when data_reason_pricing_id_s is null then data_item_q_i
	else 0 end) upgrade,


	(case when data_reason_pricing_id_s LIKE('health_potion%') then data_item_q_i
	when data_reason_pricing_id_s LIKE('revive%') then data_item_q_i
	when data_reason_pricing_id_s LIKE('team%') then data_item_q_i
	when data_reason_pricing_id_s LIKE('%questing_pack%') then data_item_q_i
	when data_reason_pricing_id_s LIKE('%booster%') then data_item_q_i
	when data_reason_pricing_id_s LIKE('%pve_refill%') then data_item_q_i
	else 0 end) as pve_consumable,


	(case when data_reason_pricing_id_s LIKE('%premium_hero%') then data_item_q_i else 0 end) premium_hero,


	FROM table_date_range(marvel_production_view.redeemer_transactions,timestamp(\''''+str(search_date)+'''\'),timestamp(\''''+str(start_date)+'''\'))
	where counter_s = 'spend'
	and data_item_n_s ='hc'
	and data_reason_desc_s !='buyGift'
	and data_reason_pricing_id_s !='fte_guaranteed'
	and data_reason_pricing_id_s not LIKE('hero_crystal%')
	and data_reason_pricing_id_s !='alliance_create_cost_b')
	GROUP EACH BY 1)) T1
	JOIN EACH
	(SELECT uid_i, DATEDIFF(timestamp(\''''+str(start_date)+'''\'),time_join_t) as age_i
	FROM marvel_production_view.users

	where time_join_t < timestamp(\''''+str(search_date)+'''\')) T2
	ON T1.uid_i = T2.uid_i

	'''
	print('performing query ...')
	df_dimensions_collapsed_w1 = gbq_large.read_gbq(week1_query,project_id='mcoc-bi',destination_table='datascience_view.clusters_tmp')
	df_dimensions_collapsed_w1=df_dimensions_collapsed_w1.fillna(0)
	df_dimensions = df_dimensions_collapsed_w1[['ave_f','special_crystal_f','pve_consumable_f','upgrade_f','premium_hero_f']]
	est_c = KMeans(n_clusters=10)
	print('clustering ...')
	est_c.cluster_centers_ = np.asarray([[ 0.02694769,  0.06531768,  0.06121219,  0.82539261,  0.02112983],
	       [ 0.05772959,  0.37772436,  0.09730477,  0.40487444,  0.06236684],
	       [ 0.08125626,  0.29389585,  0.42306508,  0.12683245,  0.07495037],
	       [ 0.01135739,  0.08087575,  0.0494629 ,  0.0646581 ,  0.79364585],
	       [ 0.51941725,  0.14303638,  0.15421209,  0.14783146,  0.03550281],
	       [ 0.00832494,  0.91744861,  0.02002415,  0.03100689,  0.02319541],
	       [ 0.06583563,  0.62194053,  0.09732582,  0.12262572,  0.0922723 ],
	       [ 0.08316859,  0.09417081,  0.33420608,  0.44578459,  0.04266993],
	       [ 0.03944744,  0.05819858,  0.79046582,  0.09186975,  0.02001841],
	       [ 0.04018328,  0.35265425,  0.08800917,  0.11709595,  0.40205735]])
	labels_c=est_c.predict(df_dimensions)
	print('post processing ...')
	df_dimensions_collapsed_w1['cluster_label_i'] = labels_c
	df_write = df_dimensions_collapsed_w1
	df_write['ave_f'] = df_write.ave_f.apply(lambda x: np.fabs(x))
	df_write['special_crystal_f'] = df_write.special_crystal_f.apply(lambda x: np.fabs(x))
	df_write['pve_consumable_f'] = df_write.pve_consumable_f.apply(lambda x: np.fabs(x))
	df_write['upgrade_f'] = df_write.upgrade_f.apply(lambda x: np.fabs(x))
	df_write['premium_hero_f'] = df_write.premium_hero_f.apply(lambda x: np.fabs(x))
	df_write['_ts_t'] = start_date.strftime('%Y-%m-%d %H:%M:%S')
	filename_str = 'segmentation.csv'
	table_write = 'mcoc-bi:marvel_bi.user_segmentation_historical'+ start_date.strftime('%Y%m%d')
	print('writing csv ...')
	df_write.to_csv(filename_str,index=False)
	print('bq loading ...')
	subprocess.call("bq load --source_format=CSV --skip_leading_rows=1 "+table_write+ " " + filename_str + " uid_i:integer,ave_f:float,special_crystal_f:float,pve_consumable_f:float,upgrade_f:float,premium_hero_f:float,n_transactions_i:integer,age_i:integer,cluster_label_i:integer,_ts_t:timestamp",shell=True)