def plot_GPLVM_data_cluster(results_dir, n_clusters=None, VB=False):
    # Load relevant datasets
    data_array = np.genfromtxt(os.path.join(results_dir, 'summary.csv'), delimiter=',')
    X = (np.genfromtxt(os.path.join(results_dir, 'GPLVM-datasets-2.csv'), delimiter=','))
    datasets = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'datasets.csv'), 'r').readlines()]
    methods = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'methods.csv'), 'r').readlines()]
    # Fit a mixture model
    if n_clusters is None:
        m = DPGMM()
    elif VB:
        m = VBGMM(alpha = 10, n_components=n_clusters)
    else:
        m = GMM(n_components=n_clusters, n_init=100)
    m.fit(data_array.T)
    clusters = m.predict(data_array.T)
    # Plot
    #clf()
    figure(1)
    pretty_scatter(X[:,0], X[:,1], clusters, 200*np.ones(X[:,0].shape), datasets)
    xlabel('Dimension 1')
    ylabel('Dimension 2')
    if n_clusters is None:
        title('CRP MoG')
    elif VB:
        title('%d clusters with VB' % n_clusters)
    else:
        title('%d clusters with EM' % n_clusters)
    show()
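A minimal sketch, not from the source, of the three sklearn.mixture variants the function above switches between (assuming the pre-0.20 API that still ships GMM, VBGMM and DPGMM; the toy data and component counts are made up):

import numpy as np
from sklearn.mixture import GMM, VBGMM, DPGMM

rng = np.random.RandomState(0)
toy = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 4])  # two synthetic blobs

em = GMM(n_components=2, n_init=10).fit(toy)     # plain EM, fixed number of components
vb = VBGMM(n_components=2, alpha=10).fit(toy)    # variational Bayes, fixed truncation
dp = DPGMM(n_components=10, alpha=1.0).fit(toy)  # Dirichlet process, n_components is only an upper bound

for name, m in [('EM', em), ('VB', vb), ('CRP', dp)]:
    print('%s used %d clusters' % (name, len(np.unique(m.predict(toy)))))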
Example No. 2
    def _fit_dpgmm(self, x):
        # clustering
        k = max(self.crange)
        for r in xrange(self.repeats):
            # info
            if self.debug is True:
                print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),

            # fit and evaluate model
            model_kwargs = {}
            if 'alpha' in self.clus_kwargs:
                model_kwargs.update(alpha=self.clus_kwargs['alpha'])
            if 'conv_thresh' in self.clus_kwargs:
                model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
            if 'max_iter' in self.clus_kwargs:
                model_kwargs.update(n_iter=self.clus_kwargs['max_iter'])

            model = DPGMM(n_components=k,
                          covariance_type=self.cvtype,
                          **model_kwargs)
            model.fit(x)
            self._labels[r] = model.predict(x)
            self._parameters[r] = model.means_
            self._ll[r] = model.score(x).sum()

            # evaluate goodness of fit for this run
            #self._gof[r] = self.gof(x, self._ll[r], k)
            if self.gof_type == 'aic':
                self._gof[r] = model.aic(x)
            if self.gof_type == 'bic':
                self._gof[r] = model.bic(x)

            # debug
            if self.debug is True:
                print self._gof[r], model.n_components, model.weights_.shape[0]
Example No. 3
def try_covar(type_str, x_words):
    clf = DPGMM(n_components=20, covariance_type=type_str, alpha=30, n_iter=1000)
    clf.fit(x_data)
    y_ = clf.predict(x_data)
    print type_str
    print_centers(x_words, y_, clf)
    print
Example No. 4
    def _fit_dpgmm(self, x):
        # clustering
        k = max(self.crange)
        for r in xrange(self.repeats):
            # info
            if self.debug is True:
                print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),

            # fit and evaluate model
            model_kwargs = {}
            if 'alpha' in self.clus_kwargs:
                model_kwargs.update(alpha=self.clus_kwargs['alpha'])
            if 'conv_thresh' in self.clus_kwargs:
                model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
            if 'max_iter' in self.clus_kwargs:
                model_kwargs.update(n_iter=self.clus_kwargs['max_iter'])

            model = DPGMM(n_components=k, covariance_type=self.cvtype,
                          **model_kwargs)
            model.fit(x)
            self._labels[r] = model.predict(x)
            self._parameters[r] = model.means_
            self._ll[r] = model.score(x).sum()

            # evaluate goodness of fit for this run
            #self._gof[r] = self.gof(x, self._ll[r], k)
            if self.gof_type == 'aic':
                self._gof[r] = model.aic(x)
            if self.gof_type == 'bic':
                self._gof[r] = model.bic(x)

            # debug
            if self.debug is True:
                print self._gof[r], model.n_components, model.weights_.shape[0]
Example No. 5
def Dirichlet(cluster_data, identification, iteration_number = 1):
    print "In Dirichlet"
    for i in range(0,iteration_number):
        print "On iteration number ", i
        dirichlet = DPGMM(n_components = len(cluster_data)).fit(cluster_data)
        # parameters = dirichlet.get_params  # returns parameters of the algorithm as a whole from the fit
        predict = dirichlet.predict(cluster_data)
        n_clusters_ = len(set(predict)) - (1 if -1 in predict else 0)
        print('Estimated number of clusters with Dirichlet: %d' % n_clusters_)

    return _make_final_list(identification, predict)
Example No. 6
def Dirichlet(cluster_data, identification, iteration_number=1):
    print "In Dirichlet"
    for i in range(0, iteration_number):
        print "On iteration number ", i
        dirichlet = DPGMM(n_components=len(cluster_data)).fit(cluster_data)
        # parameters = dirichlet.get_params  # returns parameters of the algorithm as a whole from the fit
        predict = dirichlet.predict(cluster_data)
        n_clusters_ = len(set(predict)) - (1 if -1 in predict else 0)
        print('Estimated number of clusters with Dirichlet: %d' % n_clusters_)

    return _make_final_list(identification, predict)
Example No. 7
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    # Keep the run (out of n_init restarts) with the lowest BIC.
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec = np.zeros(X.shape[0])
    prob_vec = np.zeros(X.shape[0])
    log_prob_dpgmm = None
    for i in xrange(n_init):
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                      alpha=alpha, n_iter=iters, random_state=rand_state)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            best_bic = b
            bic_dpgmm = b
            lbl_vec = dpgmm.predict(X)
            prob_vec = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))
    return [lbl_vec, prob_vec, bic_dpgmm, log_prob_dpgmm]
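A hypothetical call for the helper above; X_feats and the parameter values are placeholders, not from the source:

labels, probs, best_bic, logprob = get_best_dpgmm(
    X_feats, num_c=30, cv_type='diag', alpha=1.0, iters=100, n_init=5, rand_state=0)
print('best BIC %.1f, clusters used: %d' % (best_bic, len(np.unique(labels))))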
Example No. 8
def _Dirichlet(cluster_data, identification):
    print "In Dirichlet"
    for i in range(0, 3):
        print "i is ", i
        dirichlet = DPGMM(n_components=len(cluster_data)).fit(cluster_data)
        # parameters = dirichlet.get_params  # returns parameters of the algorithm as a whole from the fit
        predict = dirichlet.predict(cluster_data)
        n_clusters_ = len(set(predict)) - (1 if -1 in predict else 0)
        print('Estimated number of clusters with Dirichlet: %d' % n_clusters_)

    final = []
    for x in range(0, len(identification)):
        final.append([identification[x], predict[x]])

    print "this is what final sort of looked like"
    print final[:3]

    return final
Example No. 9
def _Dirichlet(cluster_data, identification):
    print "In Dirichlet"
    for i in range(0, 3):
        print "i is ", i
        dirichlet = DPGMM(n_components=len(cluster_data)).fit(cluster_data)
        # parameters = dirichlet.get_params  # returns parameters of the algorithm as a whole from the fit
        predict = dirichlet.predict(cluster_data)
        n_clusters_ = len(set(predict)) - (1 if -1 in predict else 0)
        print ("Estimated number of clusters with Dirichlet: %d" % n_clusters_)

    final = []
    for x in range(0, len(identification)):
        final.append([identification[x], predict[x]])

    print "this is what final sort of looked like"
    print final[:3]

    return final
Example No. 10
def dpgmm_segmenter(factors, width=MEDIAN_WIDTH):
    factors = median_filter(factors, size=(MEDIAN_WIDTH, 1), mode='mirror')
    factors = pre.scale(factors, axis=1)
    best_boundaries = [0, factors.shape[0] - 1]
    best_n_types = 1

    dpgmm = DPGMM(n_components=10, covariance_type='diag', alpha=10, n_iter=100)
    dpgmm.fit(np.tile(factors, (10, 1)))
    labels = dpgmm.predict(factors)
    boundaries, labels = find_boundaries(labels, width)

    if len(np.unique(labels)) > 1:
        best_boundaries = boundaries
        best_n_types = len(np.unique(labels))

    if len(best_boundaries) < best_n_types + 1:
        best_n_types = len(best_boundaries) - 1

    best_labels = segment_labeling(factors, best_boundaries, c_method='kmeans', k=best_n_types)
    best_boundaries = np.array(best_boundaries)

    return best_boundaries, best_labels
Example No. 11
def main(method,cluster_num=30,alpha=.5):
    f ='/Users/davidgreenfield/Downloads/features_csv_tmp.csv'
    #f ='/Users/davidgreenfield/Downloads/features_f500.csv'
    cols=range(1,4096)
    feats =np.loadtxt(open(f,"rb"),delimiter=",",skiprows=1,usecols=(cols))
    asins = np.loadtxt(open(f,"rb"),delimiter=",",skiprows=1,usecols=([0]),dtype=str)
    if method == 'kmeans':
        k_means=cluster.KMeans(n_clusters=cluster_num)
        k_means.fit(feats)
        y = k_means.labels_
        if MAKE_GRAPH==1:
            print "hello 1"
        create_graph(k_means)
    elif method == 'GMM_VB':
        gmm_vb = VBGMM(n_components=50, alpha=.5).fit(feats)
        y = gmm_vb.predict(feats)
        cluster_no = len(np.unique(y))
    elif method == 'GMM_DP':
        gmm_dp = DPGMM(n_components=50,alpha=alpha)
        gmm_dp.fit(feats)
        y = gmm_dp.predict(feats)
        cluster_no = len(np.unique(y))


    clusters=[]
    groups={}
    data=load_data('./data/boots_aws.csv')

    for i in range(0,cluster_num):
        groups[i]=np.where(y==i)
        ids=asins[groups[i]]
        clusters.append(ids)
        links=[data[x]['url'] for x in ids]
        create_html(links,"templates/groups/group"+str(i)+".html")

    output_clusters(clusters,"outputs/clusters.csv")
Example No. 12
#X = StandardScaler().fit_transform(fn)

fold = 3
kf = StratifiedKFold(label, n_folds=fold, shuffle=True)
#kf = KFold(len(label), n_folds=fold, shuffle=True)
clf = RFC(n_estimators=100, criterion='entropy')
rounds = 1
acc_sum = [[] for i in range(fold)]
for train, test in kf:
    train_fn = fn[train]
    #n_class = len(np.unique(label[train]))

    d = DPGMM(n_components=50, covariance_type='spherical',alpha=10)
    d.fit(train_fn)
    #print 'mixture mean', d.means_
    preds = d.predict(train_fn)
    print '# of M by DP', len(np.unique(preds))
    acc_sum[0].append(ARI(label[train], preds))
    #acc_sum[0].append(SS(train_fn, preds))

    #n_class = len(np.unique(preds))
    n_class = 32
    g = GMM(n_components=n_class, covariance_type='spherical', init_params='wmc', n_iter=100)
    g.fit(train_fn)
    #g.means_ = np.array([x_train[y_train == i].mean(axis=0) for i in np.unique(y_train)])
    preds = g.predict(train_fn)
    #prob = np.sort(g.predict_proba(train_fd))
    acc_sum[1].append(ARI(label[train], preds))
    #acc_sum[1].append(SS(train_fn, preds))

    k = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
Example No. 13
class Clustering:
    def __init__(self, X, scale=False, features=None):
        self.random_seed = 12345  # set to None for random
        self.X = X
        self.k = None

        self.centers = None
        self.h = None
        self.Y = None

        self.scaler = None
        self.ids = None

        # Visualize().plot( zip(*self.X) ) # FOR DEBUG

        # Reduce the number of features
        if features is not None:
            if isinstance(features, (int, float)):
                variances = VarianceThreshold().fit(self.X).variances_
                self.ids = sorted(
                    range(len(variances)), key=lambda i: variances[i]
                )[-int(features
                       ):]  # indexes of the top n_features values in variances

            elif type(features) in [list, tuple]:
                self.ids = features

            self.X = self.reduceFeatures(self.X)
            print("Selected features", self.ids, "on a total of",
                  len(X[0]))  # FOR DEBUG

        # Visualize().plot( zip(*self.X) ) # FOR DEBUG

        if scale:
            self.scaler = StandardScaler(
            )  # MinMaxScaler() can also be used instead of StandardScaler()
            self.X = self.scaler.fit_transform(self.X)

    #---------------------------------------
    def reduceFeatures(self, X):
        if self.ids is None:
            return X
        else:
            return [[v for iv, v in enumerate(x) if iv in self.ids] for x in X]

    #---------------------------------------
    def affinity(self, k=2):  # K is not used here
        self.h = AffinityPropagation(damping=0.75,
                                     preference=k,
                                     max_iter=200,
                                     convergence_iter=15,
                                     copy=True,
                                     affinity='euclidean').fit(self.X)
        self.Y = self.h.labels_
        self.k = k
        self.centers = self.getCenters()

        return self

    #---------------------------------------
    def meanshift(self, k=2):  # K is not used here
        self.h = MeanShift(bandwidth=None,
                           seeds=None,
                           bin_seeding=False,
                           min_bin_freq=1,
                           cluster_all=True).fit(self.X)
        self.Y = self.h.labels_
        self.k = k
        self.centers = self.getCenters()

        return self

    #---------------------------------------
    def kmeans(self, k=2):
        self.h = KMeans(n_clusters=k,
                        init='k-means++',
                        n_init=10,
                        max_iter=1000,
                        tol=0.00001,
                        random_state=self.random_seed).fit(self.X)
        self.Y = self.h.labels_
        self.k = k
        self.centers = self.getCenters()

        return self

    #---------------------------------------
    def gmm(self, k=2):
        self.h = GMM(n_components=k, random_state=self.random_seed).fit(self.X)
        self.Y = self.h.predict(self.X)
        self.k = k
        self.centers = self.getCenters()

        #TODO
        # posterior = self.h.predict_proba( self.X[:5] )
        # likelihood = self.h.score( self.X[:5] )

        return self

    #---------------------------------------
    '''A Dirichlet process is as likely to start a new cluster for a point as it is
    to add that point to a cluster with alpha elements (0 < alpha < inf). A higher
    alpha means more clusters, as the expected number of clusters is alpha*log(N).'''

    def dpgmm(self, k=10, alpha=1.0):
        self.h = DPGMM(n_components=k,
                       alpha=alpha,
                       random_state=self.random_seed).fit(self.X)
        self.Y = self.h.predict(self.X)
        self.k = k  # this is the max number of components in dpgmm
        self.centers = self.getCenters()

        #TODO
        # posterior = self.h.predict_proba( self.X[:5] )
        # likelihood = self.h.score( self.X[:5] )

        return self

    #---------------------------------------
    def done(self):
        if self.h is None:
            print("Clustering is not yet done !")
            return False
        else:
            return True

    #---------------------------------------
    def getCenters(self):
        if not self.done(): return

        try:
            return self.h.cluster_centers_

        # If the clustering has no centers, compute them based on clusters
        except AttributeError:
            unique_labels = np.unique(self.Y)
            clusters = {ul: [] for ul in unique_labels}

            for i in range(len(self.X)):
                clusters[self.Y[i]].append(self.X[i])

            centers = []
            for label in clusters:
                centers.append(
                    [np.mean(col) for col in list(zip(*clusters[label]))])

            return centers

    #---------------------------------------
    def predict(self, x):
        if not self.done(): return
        x_processed = x
        x_processed = self.reduceFeatures([x_processed])[0]
        x_processed = x_processed if self.scaler is None else self.scaler.transform(
            x_processed)

        return self.h.predict(x_processed)[0]

    #---------------------------------------
    def predictAll(self, X):
        if not self.done(): return
        X_processed = X
        X_processed = self.reduceFeatures(X_processed)
        X_processed = X_processed if self.scaler is None else self.scaler.transform(
            X_processed)

        return list(self.h.predict(X_processed))

    #---------------------------------------
    def quality(self, X=None):
        if not self.done(): return

        if X is None:  # if X not provided then use the training data and resulting labels
            X = self.X
            Y = self.Y
        else:  # if X is provided then use it with the predicted labels (clusters)
            Y = self.predictAll(X)

        indexs = range(len(X))
        shuffle(indexs)
        X = np.array([X[i] for i in indexs[:5000]])
        Y = np.array([Y[i] for i in indexs[:5000]])

        if len(set(Y)) < 2: return 0.  # FIXME

        return silhouette_score(X, Y, metric='euclidean')

    #---------------------------------------
    def plot(self, fig=None):
        if not self.done(): return

        viz = Visualize()
        if len(self.X[0]) > 3:
            X = viz.PCA_Transform(list(zip(*self.X)))
        else:
            X = self.X

        unique_labels = np.unique(self.Y)
        clusters = {ul: [] for ul in unique_labels}

        for i in range(len(X)):
            clusters[self.Y[i]].append(X[i])

        centers_for_plot = [
        ]  # Not the real centers because dimension was reduced using PCA
        for label in clusters:
            centers_for_plot.append(
                [np.mean(col) for col in list(zip(*clusters[label]))])

        viz.do_plot(list(zip(*centers_for_plot)), marker='o', color='m')
        viz.plot_groups(clusters, fig)
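A rough sketch, not from the source, of the alpha behaviour described in the Clustering.dpgmm docstring above (assuming the old sklearn.mixture.DPGMM API and synthetic data): a larger alpha tends to leave more of the available components in use.

import numpy as np
from sklearn.mixture import DPGMM

rng = np.random.RandomState(0)
X = rng.randn(500, 2)  # a single Gaussian blob

for alpha in (0.1, 1.0, 10.0):
    m = DPGMM(n_components=20, alpha=alpha, n_iter=100, random_state=0).fit(X)
    used = len(np.unique(m.predict(X)))
    print('alpha=%4.1f -> %d of %d components used' % (alpha, used, m.n_components))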
Example No. 14
File: DC.py Project: clouizos/AIR
    """if you want clustering on the dissimilarity space uncomment
       below and change accordingly"""
    # print 'Calculating dissimilarity space for training queries...'
    # data_cluster_train_ds = sc.pdist(data_cluster_train, 'euclidean')
    # data_cluster_train_ds = sc.squareform(data_cluster_train_ds)

    # # plt.figure(1)
    # # plt.imshow(data_cluster_train_ds)
    # # plt.colorbar()
    # # plt.title('Initial dissimilarity')

    print 'Training a Dirichlet Process Gaussian Mixture model...'
    dpgmm = DPGMM(alpha=1.0, n_iter=100, n_components=50)
    dpgmm.fit(data_cluster_train_ds)
    prediction = dpgmm.predict(data_cluster_train_ds)
    clusters = np.unique(prediction)

    print 'Found %i clusters!' % clusters.shape[0]
    print clusters

    """create the reordered input data according to the clusters
      it is only needed if you want to visualize the clustering
      afterwards"""
    #data_cluster = np.zeros((1, data_cluster_train.shape[1]))

    # each cluster is a list of lists that contains the indices
    # of the queries for each cluster
    each_cluster = []
    for i in xrange(clusters.shape[0]):
        cluster = data_cluster_train[prediction == clusters[i], :]
Example No. 15
    data_cluster_train_ds = data_cluster_train
    """if you want clustering on the dissimilarity space uncomment
       below and change accordingly"""
    # print 'Calculating dissimilarity space for training queries...'
    # data_cluster_train_ds = sc.pdist(data_cluster_train, 'euclidean')
    # data_cluster_train_ds = sc.squareform(data_cluster_train_ds)

    # # plt.figure(1)
    # # plt.imshow(data_cluster_train_ds)
    # # plt.colorbar()
    # # plt.title('Initial dissimilarity')

    print 'Training a Dirichlet Process Gaussian Mixture model...'
    dpgmm = DPGMM(alpha=1.0, n_iter=100, n_components=50)
    dpgmm.fit(data_cluster_train_ds)
    prediction = dpgmm.predict(data_cluster_train_ds)
    clusters = np.unique(prediction)

    print 'Found %i clusters!' % clusters.shape[0]
    print clusters
    """create the reordered input data according to the clusters
      it is only needed if you want to visualize the clustering
      afterwards"""
    #data_cluster = np.zeros((1, data_cluster_train.shape[1]))

    # each cluster is a list of lists that contains the indices
    # of the queries for each cluster
    each_cluster = []
    for i in xrange(clusters.shape[0]):
        cluster = data_cluster_train[prediction == clusters[i], :]
        each_cluster.append(np.where(prediction == clusters[i])[0])
Example No. 16
class DPGMMClusterModel(BaseEstimator, TransformerMixin):

    def __init__(self, w2v_model=None, n_components=None, no_above=0.9, no_below=8, dataname="", stoplist=None,
                 dictionary=None, recluster_thresh=1000, alpha=5):
        self.w2v_model = w2v_model
        self.no_above = no_above
        self.no_below = no_below
        self.alpha = alpha
        self.n_components = n_components
        self.n_sub_components = int(n_components / 2)
        self.stoplist = stoplist
        self.dataname = dataname
        self.dictionary = dictionary
        self.dpgmm = None
        self.scaler = None
        self.cluster_info = None
        # a list of sub-clusterer
        self.feature_crd = {}
        self.subdpgmms = []
        self.reclustered = []
        self.recluster_thresh = recluster_thresh

    def should_cluster_word(self, word):
        return (word in self.dictionary.token2id) and (len(word) > 1) and \
               (self.w2v_model is None or word in self.w2v_model) and \
               (self.stoplist is None or word not in self.stoplist)

    # constructs a dictionary and a DPGMM model on 9000 middle frequency words from X
    # X is a sequence of texts
    def fit(self, X, y=None):
        # either construct a dictionary from X and trim it
        if self.dictionary is None:
            self.dictionary = corpora.Dictionary(X)
        # or use an existing dictionary and trim the given set of words
        self.dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above, keep_n=9000)

        if self.w2v_model is None:
            w2v_corpus = [[word for word in text if self.should_cluster_word(word)] for text in X]
            self.w2v_model = w2v_models.build_word2vec(w2v_corpus, size=100, window=10, min_count=self.no_below,
                                                       dataname=self.dataname+"_dpgmm")

        word_list = np.array([word for word in self.dictionary.token2id.iterkeys() if self.should_cluster_word(word)])

        # This was a reclustering clause - I need to re-write this
        # else:
        #    # note the double loop here!!
        #    word_list = np.array([word for text in X for word in text if self.should_cluster_word(word)])

        # construct a list of words to cluster
        # remove rare and frequent words
        # remove words of length 1
        # remove stopwords
        vec_list = [self.w2v_model[word] for word in word_list]

        logging.info("DPGMM received %i words" % len(vec_list))

        # save word representations
        filename = "w2v_vocab_%s_%.1f_%.0f.lcsv" % (self.dataname, self.no_above, self.no_below)
        io.save_words_representations(filename, word_list, vec_list)

        self.scaler = StandardScaler()
        vecs = self.scaler.fit_transform(np.array(vec_list))

        self.dpgmm = DPGMM(n_components=self.n_components, covariance_type='diag', alpha=self.alpha,
                           n_iter=1000, tol=0.0001)
        self.dpgmm.fit(vecs)
        logging.info("DPGMM converged: %s" % self.dpgmm.converged_)


        # save information about found clusters
        self.cluster_info = []
        y_ = self.dpgmm.predict(vecs)

        for i, cluster_center in enumerate(self.dpgmm.means_):
            cluster_words = word_list[y_ == i]
            cluster_size = len(cluster_words)
            if cluster_size > self.recluster_thresh and self.recluster_thresh > 0:
                logging.info("DPGMM: reclustering %i words for cluster %i" % (len(cluster_words), i))
                sub_dpgmm = DPGMMClusterModel(w2v_model=self.w2v_model,
                                              n_components=self.n_sub_components,
                                              dictionary=self.dictionary,
                                              dataname="%s-%i" % (self.dataname, i), stoplist=self.stoplist)
                # recluster words.  Note the double array
                sub_dpgmm.fit([cluster_words])
                self.subdpgmms.append(sub_dpgmm)
                self.reclustered.append(i)
            if cluster_size > 0:
                #cluster_center_original = self.scaler.inverse_transform(cluster_center)
                #similar_words = self.w2v_model.most_similar_cosmul(positive=[cluster_center_original], topn=cluster_size)
                #central_words = [word for word, _ in similar_words if word in cluster_words]
                central_words = cluster_words[0:10]
            else:
                central_words = []
            self.cluster_info.append({'cnt': i, 'size': cluster_size, 'words': central_words})

        filename = "clusters_%s_%i_%.1f_%.0f.txt" % (self.dataname, self.n_components, self.no_above, self.no_below)
        io.save_cluster_info(filename, self.cluster_info)

        # setting up the coordinates for the features
        self.feature_crd = {'global': range(0, self.n_components),
                            'reclustered': [i for i in range(0, self.n_components + self.n_sub_components*len(self.reclustered))
                                            if i not in self.reclustered]}

        return self

    # calculate cluster counts for one text
    def clusterize(self, text):
        word_list = [word for word in text if self.should_cluster_word(word)]
        vec_list = np.array([self.w2v_model[word] for word in word_list])
        bincounts = np.zeros((self.n_components+self.n_sub_components*len(self.reclustered),))

        if len(vec_list) > 0:
            # assign words to clusters
            predictions = self.dpgmm.predict(self.scaler.transform(np.array(vec_list)))
            global_bincount = np.bincount(predictions, minlength=self.n_components)
            # re-assign words in large clusters
            bincounts[0:self.n_components] = global_bincount #reshape((1,len(global_bincount)))
            start = self.n_components
            for i, subdpgmm in zip(self.reclustered, self.subdpgmms):
                # if words in respective clusters exists - recluster them
                vecs_torecluster = vec_list[predictions == i]
                if len(vecs_torecluster) > 0:
                    predictions = subdpgmm.dpgmm.predict(subdpgmm.scaler.transform(np.array(vecs_torecluster)))
                    bincounts[start:start+subdpgmm.dpgmm.n_components] = \
                        np.bincount(predictions, minlength=subdpgmm.dpgmm.n_components) #.reshape((1, subdpgmm.n_components))
                start += subdpgmm.dpgmm.n_components
                # erase the count in the global counts

        # returns a vector of cluster bin counts: [ global, reclustered1, reclustered2, ...]
        return bincounts.reshape((1, len(bincounts)))


    # for a text, constructs a bincount of clusters present in the sentence
    # X is a list of texts.  One text is one string! Not tokenized
    def transform(self, X):

        # Text pre-processing
        x_clean = [tu.normalize_punctuation(text).split() for text in X]
        logging.info("DPGMM: text preprocessed")

        # Vectorize using W2V model
        if self.dpgmm is not None:
            logging.info("Vectorizing a corpus")
            size = self.w2v_model.layer1_size
            if len(X) > 0:
                vecs = np.concatenate([self.clusterize(z) for z in x_clean], axis=0)
            else:
                vecs = np.zeros(size).reshape((1, size))
            logging.info("DPGMM: returning pre-processed data of shape %s" % (vecs.shape, ))
        else:
            logging.info("W2V Averaged: no model was provided.")
            vecs = np.zeros((len(X), 1))

        return vecs
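A hedged usage sketch for DPGMMClusterModel above; the toy corpus is hypothetical and far too small for the word2vec/DPGMM defaults, so it only illustrates the intended fit/transform flow:

docs_tokenized = [['cheap', 'red', 'boots'], ['leather', 'ankle', 'boots']]  # toy tokenized corpus
clusterer = DPGMMClusterModel(n_components=30, dataname='toy', recluster_thresh=0)
clusterer.fit(docs_tokenized)                     # builds the dictionary, word2vec model and DPGMM
feats = clusterer.transform(['cheap red boots'])  # one row of cluster bin counts per input text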
Example No. 17
    return X, Y

def test1():
    print 'test1'
    model = VDPGMM(T = 10, alpha = 1, max_iter = 50)
    X, Y = getXY('iris')
    model.fit(X)
    y = model.predict(X)
    print 'VDPGMM'
    print len(np.unique(y)), np.unique(y)
    print [np.sum(y == label) for label in np.unique(y)]

    from sklearn.mixture import DPGMM
    model = DPGMM(n_components = 10, alpha = 1, n_iter = 50)
    model.fit(X)
    y = model.predict(X)
    print 'DPGMM'
    print len(np.unique(y)), np.unique(y)
    print [np.sum(y == label) for label in np.unique(y)]

def test2():
    print 'test2'
    np.random.seed(1)
    X = np.concatenate((2 + np.random.randn(100, 2), 5 + np.random.randn(100, 2),  10 + np.random.randn(100, 2)))
    T = 10
    model = VDPGMM(T=T, alpha=.5, max_iter=100, thresh=1e-5)
    model.fit(X)
    
    plt.clf()
    h = plt.subplot()
    color = 'rgbcmykw'
        img.reshape((1, img.shape[0], img.shape[1], img.shape[2])), -1, 1)
img = vgg16.preprocess_input(img.astype('float32'))
""" Scaling activations to fit random initialization scheme"""
actvs = get_activations(model, layer, img).squeeze()
actvs /= np.max(actvs) * 0.1
""" Clustering with dirichlet process Gaussian Mixture Model"""
dpgmm = DPGMM(n_components=50,
              alpha=1,
              verbose=2,
              tol=0.01,
              n_iter=250,
              min_covar=1e-6)
#dpgmm = BayesianGaussianMixture(n_components=50, covariance_type="diag", reg_covar = 1e-6,
#                                weight_concentration_prior_type="dirichlet_process",
#                                weight_concentration_prior=1, verbose=2,
#                                tol=0.01, max_iter=250, init_params='random',
#                                mean_precision_prior=actvs.std(),
#                                mean_prior=np.repeat(actvs.max()/5,actvs.shape[0]))

dpgmm.fit(
    np.transpose(actvs.reshape(actvs.shape[0],
                               actvs.shape[1] * actvs.shape[2])))
labels = dpgmm.predict(
    np.transpose(actvs.reshape(actvs.shape[0],
                               actvs.shape[1] * actvs.shape[2])))
labels = labels.reshape((actvs.shape[1], actvs.shape[2]))

plt.subplot(1, 2, 2)
plt.imshow(labels, interpolation="nearest")
plt.title('Labelmap from layer ' + str(layer))
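For context, DPGMM was deprecated in scikit-learn 0.18 and removed in 0.20; the commented-out block above already points at the replacement. A hedged sketch of a roughly equivalent call with the newer class (the parameter mapping is approximate, not from the source):

from sklearn.mixture import BayesianGaussianMixture

bgm = BayesianGaussianMixture(n_components=50,
                              weight_concentration_prior_type='dirichlet_process',
                              weight_concentration_prior=1.0,  # plays the role of alpha
                              covariance_type='diag',
                              tol=0.01,
                              max_iter=250)
# bgm.fit(...) and bgm.predict(...) then behave much like the old DPGMM object.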
Example No. 19
    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title(u'GMM', fontsize=20)
    plt.grid(True)

    # DPGMM
    n_components = 3
    dpgmm = DPGMM(n_components=n_components, alpha=1, covariance_type='full', random_state=0)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm._get_covars()
    print 'DPGMM means = \n', centers
    print 'DPGMM covariances = \n', covs
    y_hat = dpgmm.predict(x)
    # print y_hat

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')

    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:
            continue
        center, cov = cc
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])
Example No. 20
def dpgmm_simple(X, init_numC, random_state):
    model = DPGMM(n_components = init_numC, n_iter=100, tol=0.000001, random_state=random_state)
    model.fit(X)
    y = model.predict(X)
    cluster_num = len(np.unique(y))
    return cluster_num, y
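A hypothetical call for dpgmm_simple above; X and the initial component count are placeholders:

cluster_num, y = dpgmm_simple(X, init_numC=20, random_state=0)
print('DPGMM kept %d clusters' % cluster_num)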
Example No. 21
  # Sample the specified number of points from X_unlabeled
  size = np.cumsum(chunk_sizes[:chunks])[-1]
  
  # Fit a Dirichlet process mixture of Gaussians using up to ten components
  dpgmm = DPGMM(n_components=10, alpha=10.0, covariance_type='full')
  indices = np.arange(X_unlabeled.shape[0])
  np.random.shuffle(indices)
  X = X_unlabeled[indices[:size],]
  
  print("fitting a model with", size, "data points")
  with timeit():
    dpgmm.fit(X)
  print("Done!")
  print("AIC for this model & data: ", dpgmm.aic(X))
  print("BIC for this model & data: ", dpgmm.bic(X))
  Y_hat = dpgmm.predict(X)
  print("Model assigned points to", len(np.unique(Y_hat)), "components")
  

# How can I best check this out? 
#color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
#for i, (clf, title) in enumerate([(gmm, 'GMM'),
                                  #(dpgmm, 'Dirichlet Process GMM')]):
    #splot = plt.subplot(2, 1, 1 + i)
    #Y_ = clf.predict(X)
    #for i, (mean, covar, color) in enumerate(zip(
            #clf.means_, clf._get_covars(), color_iter)):
        #v, w = linalg.eigh(covar)
        #u = w[0] / linalg.norm(w[0])
        ## as the DP will not use every component it has access to
        ## unless it needs it, we shouldn't plot the redundant
Example No. 22
class Clustering:
	def __init__(self, X, scale=False, features=None):
		self.random_seed = 12345 # set to None for random
		self.X = X
		self.k = None
		
		self.centers = None
		self.h = None
		self.Y = None
		
		self.scaler = None
		self.ids = None
		
		# Visualize().plot( zip(*self.X) ) # FOR DEBUG
		
		# Reduce the number of features
		if features is not None:
			if isinstance( features, (int, float) ):
				variances = VarianceThreshold().fit(self.X).variances_
				self.ids = sorted(range(len(variances)), key=lambda i: variances[i])[-int(features):] # indexes of the top n_features values in variances
				
			elif type(features) in [list,tuple]:
				self.ids = features
			
			self.X = self.reduceFeatures(self.X)
			print("Selected features", self.ids, "on a total of", len(X[0]))  # FOR DEBUG
			
			
		# Visualize().plot( zip(*self.X) ) # FOR DEBUG
    
		if scale:
			self.scaler = StandardScaler() # MinMaxScaler() can also be used instead of StandardScaler()
			self.X = self.scaler.fit_transform(self.X)

	#---------------------------------------
	def reduceFeatures(self, X):
		if self.ids is None:
			return X
		else:
			return [ [v for iv,v in enumerate(x) if iv in self.ids] for x in X ]
	
	#---------------------------------------
	def affinity(self, k=2): # K is not used here
		self.h = AffinityPropagation(damping=0.75, preference=k, max_iter=200, convergence_iter=15, copy=True, affinity='euclidean').fit( self.X )
		self.Y = self.h.labels_
		self.k = k
		self.centers = self.getCenters()
		
		return self
	#---------------------------------------
	def meanshift(self, k=2): # K is not used here
		self.h = MeanShift(bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True).fit( self.X )
		self.Y = self.h.labels_
		self.k = k
		self.centers = self.getCenters()
		
		return self
	#---------------------------------------
	def kmeans(self, k=2):
		self.h = KMeans(n_clusters = k, init = 'k-means++', n_init = 10, max_iter = 1000, tol = 0.00001, random_state = self.random_seed).fit( self.X )
		self.Y = self.h.labels_
		self.k = k
		self.centers = self.getCenters()
		
		return self
		
	#---------------------------------------
	def gmm(self, k=2):
		self.h = GMM(n_components=k, random_state = self.random_seed).fit( self.X )
		self.Y = self.h.predict( self.X )
		self.k = k
		self.centers = self.getCenters()
		
		#TODO
		# posterior = self.h.predict_proba( self.X[:5] )
		# likelihood = self.h.score( self.X[:5] )
		
		return self
		
	#---------------------------------------
	'''A Dirichlet process is as likely to start a new cluster for a point as it is to add that point
	to a cluster with alpha elements (0 < alpha < inf). A higher alpha means more clusters,
	as the expected number of clusters is alpha*log(N).'''
	def dpgmm(self, k=10, alpha=1.0):
		self.h = DPGMM(n_components=k, alpha=alpha, random_state = self.random_seed).fit( self.X )
		self.Y = self.h.predict( self.X )
		self.k = k # this is the max number of components in dpgmm
		self.centers = self.getCenters()
		
		#TODO
		# posterior = self.h.predict_proba( self.X[:5] )
		# likelihood = self.h.score( self.X[:5] )
		
		return self
		
	#---------------------------------------
	def done(self):
		if self.h is None:
			print("Clustering is not yet done !")
			return False
		else:
			return True
			
	#---------------------------------------
	def getCenters(self):
		if not self.done(): return
		
		try:
			return self.h.cluster_centers_
		
		# If the clustering has no centers, compute them based on clusters
		except AttributeError: 
			unique_labels = np.unique(self.Y)
			clusters = { ul:[] for ul in unique_labels }
			
			for i in range( len(self.X) ):
				clusters[ self.Y[i] ].append( self.X[i] )
			
			centers = []
			for label in clusters:
				centers.append( [np.mean(col) for col in list(zip(* clusters[label] )) ] )

			return centers
		
	#---------------------------------------
	def predict(self, x):
		if not self.done(): return
		x_processed = x
		x_processed = self.reduceFeatures([x_processed])[0]
		x_processed = x_processed if self.scaler is None else self.scaler.transform(x_processed)

		return self.h.predict(x_processed)[0]
	
	#---------------------------------------
	def predictAll(self, X):
		if not self.done(): return
		X_processed = X
		X_processed = self.reduceFeatures(X_processed)
		X_processed = X_processed if self.scaler is None else self.scaler.transform(X_processed)
		
		return list(self.h.predict(X_processed))
	
	#---------------------------------------
	def quality(self, X=None):
		if not self.done(): return
		
		if X is None: # if X not provided then use the training data and resulting labels
			X = self.X
			Y = self.Y
		else: # if X is provided then use it with the predicted labels (clusters)
			Y = self.predictAll(X)
		
		indexs = range(len(X)); shuffle(indexs)
		X = np.array([ X[i] for i in indexs[:5000] ])
		Y = np.array([ Y[i] for i in indexs[:5000] ])
		
		if len(set(Y)) < 2: return 0. # FIXME
		
		return silhouette_score(X, Y, metric='euclidean')
		
	#---------------------------------------
	def plot(self, fig=None):
		if not self.done(): return
		
		viz = Visualize()
		if len(self.X[0]) > 3:
			X = viz.PCA_Transform( list(zip(*self.X)) )
		else:
			X = self.X
		
		unique_labels = np.unique(self.Y)
		clusters = { ul:[] for ul in unique_labels }
		
		for i in range( len(X) ):
			clusters[ self.Y[i] ].append( X[i] )
		
		centers_for_plot = [] # Not the real centers because dimension was reduced using PCA
		for label in clusters:
			centers_for_plot.append( [np.mean(col) for col in list(zip(* clusters[label] )) ] )
		
		viz.do_plot(list(zip(*centers_for_plot)), marker='o', color='m')
		viz.plot_groups(clusters, fig)
Example No. 23
    plt.ylim((x2_min, x2_max))
    plt.title(u'GMM', fontsize=20)
    plt.grid(True)

    # DPGMM
    n_components = 3
    dpgmm = DPGMM(n_components=n_components,
                  alpha=1,
                  covariance_type='full',
                  random_state=0)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm._get_covars()
    print 'DPGMM means = \n', centers
    print 'DPGMM covariances = \n', covs
    y_hat = dpgmm.predict(x)
    # print y_hat

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')

    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:
            continue
        center, cov = cc
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])