Esempio n. 1
0
    def _fit_vbgmm(self, x):
        # clustering
        for c in xrange(len(self.crange)):
            k = self.crange[c]
            for r in xrange(self.repeats):
                # info
                if self.debug is True:
                    print '\t[%s][c:%d][r:%d]' % (
                        self.clus_type, self.crange[c], r + 1),
                idx = c * self.repeats + r

                # fit and evaluate model
                model_kwargs = {}
                if 'alpha' in self.clus_kwargs:
                    model_kwargs.update(alpha=self.clus_kwargs['alpha'])
                if 'conv_thresh' in self.clus_kwargs:
                    model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
                model = VBGMM(n_components=k, covariance_type=self.cvtype,
                              **model_kwargs)
                model.n_features = self.input_dim
                fit_kwargs = {}
                if 'max_iter' in self.clus_kwargs:
                    fit_kwargs.update(n_iter=self.clus_kwargs['max_iter'])
                model.fit(x, params='wmc', init_params='wmc', **fit_kwargs)
                self._labels[idx] = model.predict(x)
                self._parameters[idx] = model.means
                self._ll[idx] = model.score(x).sum()

                # evaluate goodness of fit
                self._gof[idx] = self.gof(x, self._ll[idx], k)

                # debug
                if self.debug is True:
                    print self._gof[idx], model.converged_
Esempio n. 2
0
    def _fit_vbgmm(self, x):
        # clustering
        for c in xrange(len(self.crange)):
            k = self.crange[c]
            for r in xrange(self.repeats):
                # info
                if self.debug is True:
                    print '\t[%s][c:%d][r:%d]' % (self.clus_type,
                                                  self.crange[c], r + 1),
                idx = c * self.repeats + r

                # fit and evaluate model
                model_kwargs = {}
                if 'alpha' in self.clus_kwargs:
                    model_kwargs.update(alpha=self.clus_kwargs['alpha'])
                if 'conv_thresh' in self.clus_kwargs:
                    model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
                model = VBGMM(n_components=k,
                              covariance_type=self.cvtype,
                              **model_kwargs)
                model.n_features = self.input_dim
                fit_kwargs = {}
                if 'max_iter' in self.clus_kwargs:
                    fit_kwargs.update(n_iter=self.clus_kwargs['max_iter'])
                model.fit(x, params='wmc', init_params='wmc', **fit_kwargs)
                self._labels[idx] = model.predict(x)
                self._parameters[idx] = model.means
                self._ll[idx] = model.score(x).sum()

                # evaluate goodness of fit
                self._gof[idx] = self.gof(x, self._ll[idx], k)

                # debug
                if self.debug is True:
                    print self._gof[idx], model.converged_
Esempio n. 3
0
def test_vbgmm_no_modify_alpha():
    """Check ``alpha`` is stored as given and ``alpha_`` is alpha / n_components.

    The constructor argument must be readable back unchanged from ``alpha``;
    after one fitting iteration ``alpha_`` must equal ``alpha`` divided by
    the number of components.
    """
    n_components = 3
    alpha = 2.
    X, _ = make_blobs(random_state=1)

    model = VBGMM(alpha=alpha, n_components=n_components, n_iter=1)
    assert_equal(model.alpha, alpha)

    fitted = model.fit(X)
    expected_alpha_ = float(alpha) / n_components
    assert_equal(fitted.alpha_, expected_alpha_)
Esempio n. 4
0
def test_vbgmm_no_modify_alpha():
    """Verify the user-supplied ``alpha`` is kept and ``alpha_`` is derived.

    ``alpha`` on the estimator must equal the constructor argument, and
    ``alpha_`` on the object returned by ``fit`` must equal
    ``alpha / n_components``.
    """
    alpha, n_components = 2., 3
    X, _ = make_blobs(random_state=1)
    estimator = VBGMM(n_components=n_components, n_iter=1, alpha=alpha)

    # the constructor argument is exposed untouched
    assert_equal(estimator.alpha, alpha)

    # the fitted value is the per-component share of alpha
    fitted_estimator = estimator.fit(X)
    assert_equal(fitted_estimator.alpha_, float(alpha) / n_components)
Esempio n. 5
0
def main(method,cluster_num=30,alpha=.5):
    """Cluster the feature rows of the features CSV and write the groups out.

    Column 0 of the CSV is read as the item id and columns 1..4095 as the
    feature vector.  After clustering, one HTML page is created per cluster
    label in ``range(cluster_num)`` and all clusters are written to
    ``outputs/clusters.csv``.

    :param method: clustering algorithm, one of ``'kmeans'``, ``'GMM_VB'``
        or ``'GMM_DP'``
    :param cluster_num: number of k-means clusters, and the number of cluster
        labels that are exported for every method
    :param alpha: ``alpha`` passed to the VBGMM / DPGMM models
    :raises ValueError: if ``method`` is not one of the supported names
    """
    f ='/Users/davidgreenfield/Downloads/features_csv_tmp.csv'
    #f ='/Users/davidgreenfield/Downloads/features_f500.csv'
    cols=list(range(1,4096))

    # the file is read twice (numeric features, then string ids); use context
    # managers so both handles are closed instead of being leaked
    with open(f,"rb") as handle:
        feats = np.loadtxt(handle,delimiter=",",skiprows=1,usecols=cols)
    with open(f,"rb") as handle:
        asins = np.loadtxt(handle,delimiter=",",skiprows=1,usecols=[0],dtype=str)

    if method == 'kmeans':
        k_means=cluster.KMeans(n_clusters=cluster_num)
        k_means.fit(feats)
        y = k_means.labels_
        if MAKE_GRAPH==1:
            print("hello 1")
        # NOTE(review): create_graph runs regardless of MAKE_GRAPH, exactly as
        # in the original code -- confirm whether it belongs inside the `if`.
        create_graph(k_means)
    elif method == 'GMM_VB':
        # fit() must be called on an instance: the model options belong to
        # the constructor, and the caller's alpha is honoured like in GMM_DP
        gmm_vb = VBGMM(n_components=50,alpha=alpha)
        gmm_vb.fit(feats)
        y = gmm_vb.predict(feats)
    elif method == 'GMM_DP':
        gmm_dp = DPGMM(n_components=50,alpha=alpha)
        gmm_dp.fit(feats)
        y = gmm_dp.predict(feats)
    else:
        # fail early with a clear message instead of a NameError on `y` below
        raise ValueError("unknown clustering method: %r" % (method,))

    clusters=[]
    groups={}
    data=load_data('./data/boots_aws.csv')

    # NOTE(review): the GMM models are built with 50 components but only
    # labels below cluster_num are exported here -- confirm this is intended.
    for i in range(0,cluster_num):
        groups[i]=np.where(y==i)
        ids=asins[groups[i]]
        clusters.append(ids)
        links=[data[x]['url'] for x in ids]
        create_html(links,"templates/groups/group"+str(i)+".html")

    output_clusters(clusters,"outputs/clusters.csv")