Example #1
    def _fit_dpgmm(self, x):
        # clustering
        k = max(self.crange)
        for r in xrange(self.repeats):
            # info
            if self.debug is True:
                print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),

            # fit and evaluate model
            model_kwargs = {}
            if 'alpha' in self.clus_kwargs:
                model_kwargs.update(alpha=self.clus_kwargs['alpha'])
            if 'conv_thresh' in self.clus_kwargs:
                model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
            if 'max_iter' in self.clus_kwargs:
                model_kwargs.update(n_iter=self.clus_kwargs['max_iter'])

            model = DPGMM(n_components=k,
                          covariance_type=self.cvtype,
                          **model_kwargs)
            model.fit(x)
            self._labels[r] = model.predict(x)
            self._parameters[r] = model.means_
            self._ll[r] = model.score(x).sum()

            # evaluate goodness of fit for this run
            #self._gof[r] = self.gof(x, self._ll[r], k)
            if self.gof_type == 'aic':
                self._gof[r] = model.aic(x)
            elif self.gof_type == 'bic':
                self._gof[r] = model.bic(x)

            # debug
            if self.debug is True:
                print self._gof[r], model.n_components, model.weights_.shape[0]
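Note: `DPGMM` was deprecated in scikit-learn 0.18 and removed in 0.20. A minimal sketch of the same fit/label/score loop on the replacement API (`BayesianGaussianMixture`, which exposes no `aic`/`bic`; the total log-likelihood from `score` is used as a stand-in here):

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

def fit_dpgmm(x, k, covariance_type='full', alpha=1.0, max_iter=100):
    """Dirichlet-process mixture fit with the post-0.20 API."""
    model = BayesianGaussianMixture(
        n_components=k,                      # truncation level, not the final k
        covariance_type=covariance_type,
        weight_concentration_prior=alpha,    # plays the role of DPGMM's alpha
        weight_concentration_prior_type='dirichlet_process',
        max_iter=max_iter)
    model.fit(x)
    labels = model.predict(x)
    total_ll = model.score(x) * len(x)       # score() returns the mean per-sample ll
    return model, labels, total_ll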
Example #2
def fit_vel_profile_dpgmm(vel_profile, n_comps=5, dp=False):
    """
    fit a velocity profile with DP-GMM
    """
    N = 1000    # 1000 samples to fit
    integral = np.sum(vel_profile)
    #vel_profile is a 1D array, try to convert it to samples
    t = np.linspace(0, 1, len(vel_profile))
    data = np.array([])
    for i in range(len(t)):
        n_samples = int(round(vel_profile[i] / integral * N))
        if n_samples > 0:
            #add n_samples copies of the time stamp t[i]
            samples = np.ones(n_samples) * t[i]
            data = np.concatenate([data, samples])
    fit_data = np.array([data]).transpose()
    #fit Dirichlet-Process Gaussian Mixture Model, 
    #something wrong with the module? The clusters seem merged...
    if dp:
        model = DPGMM(n_components=n_comps, n_iter=1000, alpha=10)
    else:
        model = GMM(n_components=n_comps)
    
    model.fit(fit_data)

    return model
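The sampling loop above can be vectorized; a minimal sketch with `np.repeat` (same `vel_profile` and sample budget assumed, rounding instead of truncating the per-bin counts):

import numpy as np

def profile_to_samples(vel_profile, n_total=1000):
    """Turn a nonnegative 1-D profile into ~n_total points on [0, 1] whose
    density is proportional to the profile height."""
    vel_profile = np.asarray(vel_profile, dtype=float)
    t = np.linspace(0, 1, len(vel_profile))
    counts = np.rint(vel_profile / vel_profile.sum() * n_total).astype(int)
    return np.repeat(t, counts).reshape(-1, 1)   # column vector, ready for fit()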
Example #3
    def cluster(self,
                dim,
                method='dpgmm',
                max_n_clusters=80,
                max_iter=300,
                refresh=True):
        '''
        dim is the index (or list of indices) of the feature columns to cluster on;
        only method='dpgmm' is implemented, so the method argument is currently unused
        '''
        print('clustering DPGMM')
        from sklearn.mixture import BayesianGaussianMixture as DPGMM
        dpgmm = DPGMM(n_components=max_n_clusters,
                      covariance_type='full',
                      weight_concentration_prior=1e-3,
                      weight_concentration_prior_type='dirichlet_process',
                      init_params="kmeans",
                      max_iter=max_iter,
                      random_state=0,
                      verbose=1,
                      verbose_interval=10)  # init can be "kmeans" or "random"
        dpgmm.fit(self.fet[:, dim])
        label = dpgmm.predict(self.fet[:, dim])
        self.clu.membership = label
        self.clu.__construct__()
        self.clu.emit('cluster')

        if refresh is True:
            self.set_data(self.fet, self.clu)
        return label
Example #4
def clustering_algorithm(lengths,
                         covs,
                         kmers,
                         algorithm='dirichlet',
                         K=300,
                         max_epoch=25,
                         t=0,
                         seed=None,
                         mu_pkl=None):
    """Clusters using given algorithm

  Takes as argument cluster names, lengths, and coverage/kmer matrices.
  """

    # create matrix for clustering
    logging.info('Creating data matrix')
    X = create_matrix(lengths, covs, kmers)

    # project down dimension
    logging.info('Performing dimensionality reduction')
    X = reduce_dimensionality(X)

    # do the clustering
    logging.info('Starting clustering algorithm')
    if algorithm == 'sk-gmm':
        gmm = GMM(n_components=K, covariance_type='full', n_iter=500)
        gmm.fit(X)
        z = gmm.predict(X)
        return z
    elif algorithm == 'sk-dpgmm':
        gmm = DPGMM(n_components=K, covariance_type='full', n_iter=500)
        gmm.fit(X)
        z = gmm.predict(X)
        return z
    elif algorithm == 'dirichlet':
        n_data = X.shape[0]
        mu_pred_dem, Sigma_pred_dem, asgn_dem, llik = dirichlet_em(
            X.T, K=K, n_minibatch=n_data, max_epoch=max_epoch, seed=seed)

        if mu_pkl:
            with open(mu_pkl, 'wb') as f:
                pickle.dump((mu_pred_dem, asgn_dem), f)

        # further compress stuff
        compressed_clusters = agglomerative(mu_pred_dem.T, t=t)
        transl_dict = {i: c for i, c in enumerate(compressed_clusters)}
        asgn_dem_agg = np.array([transl_dict[i] for i in asgn_dem])

        return asgn_dem_agg
    elif algorithm == 'ard':
        n_data = X.shape[0]
        _, _, asgn, _ = variational_em(X.T,
                                       K=K,
                                       n_minibatch=n_data,
                                       max_epoch=max_epoch)
        return asgn
    else:
        raise ValueError("Invalid algorithm name")
Example #5
def Dirichlet(cluster_data, identification, iteration_number=1):
    print "In Dirichlet"
    for i in range(0, iteration_number):
        print "On iteration number ", i
        dirichlet = DPGMM(n_components=len(cluster_data)).fit(cluster_data)
        #parameters = dirichlet.get_params()  # returns the parameters of the fitted model
        predict = dirichlet.predict(cluster_data)
        # DPGMM labels are never -1; the -1 check was a DBSCAN idiom, so this
        # is simply the number of distinct labels
        n_clusters_ = len(set(predict))
        print('Estimated number of clusters with Dirichlet: %d' % n_clusters_)

    return _make_final_list(identification, predict)
Example #6
    def dpgmm(self, k=10, alpha=1.0):
        self.h = DPGMM(n_components=k,
                       alpha=alpha,
                       random_state=self.random_seed).fit(self.X)
        self.Y = self.h.predict(self.X)
        self.k = k  # this is the max number of components in dpgmm
        self.centers = self.getCenters()

        #TODO
        # posterior = self.h.predict_proba( self.X[:5] )
        # likelihood = self.h.score( self.X[:5] )

        return self
Example #7
 def _dpgmm(fet, n_comp=8, max_iter=400):
     from sklearn.mixture import BayesianGaussianMixture as DPGMM
     dpgmm = DPGMM(n_components=n_comp,
                   covariance_type='full',
                   weight_concentration_prior=1e-3,
                   weight_concentration_prior_type='dirichlet_process',
                   init_params="kmeans",
                   max_iter=max_iter,
                   random_state=0,
                   verbose=0,
                   verbose_interval=10)  # init can be "kmeans" or "random"
     dpgmm.fit(fet)
     label = dpgmm.predict(fet)
     return label
Example #8
def test1():
    print 'test1'
    model = VDPGMM(T=10, alpha=1, max_iter=50)
    X, Y = getXY('iris')
    model.fit(X)
    y = model.predict(X)
    print 'VDPGMM'
    print len(np.unique(y)), np.unique(y)
    print [np.sum(y == label) for label in np.unique(y)]

    from sklearn.mixture import DPGMM
    model = DPGMM(n_components=10, alpha=1, n_iter=50)
    model.fit(X)
    y = model.predict(X)
    print 'DPGMM'
    print len(np.unique(y)), np.unique(y)
    print [np.sum(y == label) for label in np.unique(y)]
Example #9
 def dpgmm_cluster(self, max_n_clusters=30, max_iter=300, verbose=False):
     from sklearn.mixture import BayesianGaussianMixture as DPGMM
     dpgmm = DPGMM(n_components=max_n_clusters,
                   covariance_type='full',
                   weight_concentration_prior=1e-3,
                   weight_concentration_prior_type='dirichlet_process',
                   init_params="kmeans",
                   max_iter=max_iter,
                   random_state=0,
                   verbose=verbose,
                   verbose_interval=10)  # init can be "kmeans" or "random"
     dpgmm.fit(self.fet)
     label = dpgmm.predict(self.fet)
     self.clu.membership = label
     self.clu.__construct__()
     self.clu.emit('cluster')
     return dpgmm
Example #10
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec = np.zeros(X.shape[0])
    prob_vec = np.zeros(X.shape[0])
    log_prob_dpgmm = None
    for i in xrange(n_init):
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type, \
                        alpha=alpha, n_iter=iters, random_state=rand_state)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            best_bic = b
            bic_dpgmm = b
            lbl_vec = dpgmm.predict(X)
            prob_vec = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))
    return [lbl_vec, prob_vec, bic_dpgmm, log_prob_dpgmm]
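Since the replacement `BayesianGaussianMixture` has no `bic` method, the same best-model-by-BIC selection is usually done with `GaussianMixture`; a sketch under that assumption (restarts are handled by its built-in `n_init`, which keeps the best of the runs):

import numpy as np
from sklearn.mixture import GaussianMixture

def best_gmm_by_bic(X, max_components=10, cv_type='full', n_init=5, seed=0):
    """Fit 1..max_components and return the fit with the lowest BIC."""
    fits = [GaussianMixture(n_components=k, covariance_type=cv_type,
                            n_init=n_init, random_state=seed).fit(X)
            for k in range(1, max_components + 1)]
    bics = [g.bic(X) for g in fits]
    best = int(np.argmin(bics))
    return fits[best], bics[best]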
Example #11
def train_DPGMM(d, max_n_comp=100, max_n_iter=500):
    '''Imports Data, Trains a DPGMM, Generates predictions testing'''

    print "Training Model..."
    gmm = DPGMM(n_components=max_n_comp, n_iter=max_n_iter)

    start = timeit.default_timer()
    gmm.fit(d)
    end = timeit.default_timer()

    print "Training completed in %f seconds" % (end-start)

    print
    print "Converged: "
    print gmm.converged_
    print

    return gmm
Example #12
def select_model(model_key):
    model = None
    if model_key == 'b':
        model = GradientBoostingClassifier()
    elif model_key == 'svc':
        model = SVC(probability=True, gamma='auto')
    elif model_key == 'nusvc':
        print 'selecting NuSVC'
        model = NuSVC(probability=True)
    elif model_key == 'r':
        model = RandomForestClassifier(class_weight={'buy': 1, 'stay': .75})
    elif model_key == 'e':
        model = ExtraTreesClassifier()
    elif model_key == 'nn':
        model = KNeighborsClassifier()
    elif model_key == 'gmm':
        model = DPGMM()
    return model, model_key
Example #13
def _Dirichlet(cluster_data, identification):
    print "In Dirichlet"
    for i in range(0, 3):
        print "i is ", i
        dirichlet = DPGMM(n_components=len(cluster_data)).fit(cluster_data)
        #parameters = dirichlet.get_params()  # returns the parameters of the fitted model
        predict = dirichlet.predict(cluster_data)
        # DPGMM labels are never -1; the -1 check was a DBSCAN idiom, so this
        # is simply the number of distinct labels
        n_clusters_ = len(set(predict))
        print('Estimated number of clusters with Dirichlet: %d' % n_clusters_)

    final = []
    for x in range(0, len(identification)):
        final.append([identification[x], predict[x]])

    print "this is what final sort of looked like"
    print final[:3]

    return final
Example #14
def plot_num_iters_dpgmm(X, num_c, cv_type, alpha, max_iters, n_init):
    bic = []
    for iters in np.arange(1, max_iters):
        best_bic = np.inf
        for j in xrange(n_init):
            dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type, \
                                                alpha=alpha, n_iter=iters)
            dpgmm.fit(X)
            b = dpgmm.bic(X)
            if b < best_bic:
                best_bic = b
        bic.append(best_bic)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(np.arange(1, max_iters), bic)
    ax.set_title('BIC vs. Number of Iterations DPGMM')
    ax.set_xlabel('Number of iterations')
    ax.set_ylabel('BIC score')
    return fig
Example #15
def plot_alpha_dpgmm(X, num_c, cv_type, alphas, iters, n_init):
    bic = []
    for a in alphas:
        best_bic = np.inf
        for j in xrange(n_init):
            dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type, \
                                                alpha=a, n_iter=iters)
            dpgmm.fit(X)
            b = dpgmm.bic(X)
            if b < best_bic:
                best_bic = b
        bic.append(best_bic)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(alphas, bic, 'bo-', lw=2)
    ax.set_title('BIC vs. Alpha DPGMM')
    ax.set_xlabel('Alpha')
    ax.set_ylabel('BIC score')
    return fig
Example #16
def plotClustering(fullpath,
                   order=1,
                   sr=4,
                   cutoff=.1,
                   n_singv=3,
                   feature='chroma',
                   dim_red='SVD',
                   round_to=0,
                   normalize=1,
                   scale=1,
                   length=4,
                   clustering='KMEANS'):
    feat = {}
    print(
        'Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
        'n_singv {}, scale {} normalize {}, round_to {}'.format(
            fullpath, feature, order, sr, cutoff, n_singv, scale, normalize,
            round_to))
    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(filename,
                                               file_ext,
                                               feature,
                                               scale,
                                               round_to,
                                               normalize,
                                               beat_sync=True,
                                               save=True)

    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat['{}(LPF)'.format(dim_red)] = dim_red_fn(dim_red, feat['LPF'], n_singv)
    feat['LPF({})'.format(dim_red)] = lpf(feat[dim_red], cutoff, sr, order)
    feat['{}-LPF'.format(feature)] = feat[feature] - feat['LPF']
    feat['LPF({}-LPF)'.format(feature)] = lpf(feat['{}-LPF'.format(feature)],
                                              cutoff, sr, order)
    feat['{}(LPF({}-LPF))'.format(dim_red, feature)] = dim_red_fn(
        dim_red, feat['LPF({}-LPF)'.format(feature)], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))

    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])
    i = 0
    print "\tPlot data and pre-processing"
    for name in (feature, '{}-LPF'.format(feature), '{}(LPF)'.format(dim_red),
                 'LPF({})'.format(dim_red), 'LPF({}-LPF)'.format(feature),
                 '{}(LPF({}-LPF))'.format(dim_red, feature)):
        data = feat[name]

        data_wide = np.array([
            feat[name][m:m + length, :]
            for m in xrange(len(feat[name]) - length)
        ])
        data_wide = data_wide.reshape(data_wide.shape[0],
                                      data_wide.shape[1] * data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            K_MIN, K_MAX = 2, 16
            KM = [
                KMeans(n_clusters=l, init='k-means++').fit(data_wide)
                for l in xrange(K_MIN, K_MAX + 1)
            ]

            # compute scores to assess fit
            scores_bic = [
                computeBic(KM[x], data_wide) for x in xrange(len(KM))
            ]
            scores_inertia = [KM[x].inertia_ for x in xrange(len(KM))]
            scores_silhouette = [
                silhouette_score(data_wide, KM[x].labels_, metric='euclidean')
                for x in xrange(len(KM))
            ]

            # get best clusters
            idx_best_bic = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_bic))[0])
            idx_best_inertia = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_silhouette))[0])
            idx_best = int(
                np.median(
                    (idx_best_bic, idx_best_inertia, idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = KM[idx_best].cluster_centers_
            centroid_idx = KM[idx_best].labels_
        elif clustering == 'DPGMM':
            n_components = 12
            dpgmm = DPGMM(n_components=n_components,
                          tol=1e-3,
                          n_iter=32,
                          alpha=1000,
                          covariance_type='diag',
                          verbose=True)
            dpgmm.fit(data_wide)

            # compute scores to assess fit
            scores_bic = dpgmm.bic(data_wide)
            # silhouette_score needs per-sample labels (not centroids), which
            # aren't computed at this point, so a placeholder is used instead
            scores_silhouette = [0.0]

            # get clusters and cluster allocations given best K
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)
        # plot data
        if data.shape[1] == 3:
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T

        ax = fig.add_subplot(gs[i, :])
        ax.set_title(name)
        ax.imshow(data,
                  interpolation='nearest',
                  origin='lower',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        xlabels = [
            "{}:{}".format(int(x / 60), int(x % 60))
            for x in beat_times[::step_size]
        ]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        for c in xrange(changes.shape[0] - 1):
            if changes[c] and changes[c + 1]:
                changes[c] = False
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[i + 1, 0])
        ax.set_title(name)

        if centroids.shape[1] == 3:
            centroids = centroids.reshape(1, centroids.shape[0],
                                          centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            centroids = centroids.reshape(1, centroids.shape[0] * length,
                                          centroids.shape[1] / length)
        else:
            centroids = centroids.reshape(centroids.shape[0] * length,
                                          centroids.shape[1] / length).T
        ax.imshow(centroids,
                  interpolation='nearest',
                  origin='lower',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        ax.set_xticks(xrange(0, centroids.shape[1], 4))
        ax.set_xticklabels(xrange(k_best))
        ax.grid(False)

        # plot elbow curve (these scores/indices come from the KMEANS branch;
        # the DPGMM branch does not set scores_inertia or the idx_best_* values)
        c = 1
        for k, v, idx in (('BIC', scores_bic, idx_best_bic),
                          ('INERTIA', scores_inertia,
                           idx_best_inertia), ('SILHOUETTE', scores_silhouette,
                                               idx_best_silhouette)):
            ax = fig.add_subplot(gs[i + 1, c])
            ax.set_title('{}, {} best K {}'.format(name, k, idx + K_MIN))
            ax.plot(xrange(K_MIN, K_MAX + 1), v, 'b*-')
            ax.set_xlim((K_MIN, K_MAX + 1))
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('Score')
            ax.grid(True)
            ax.axvline(idx + K_MIN, color='r')
            c += 1
        i += 2
        """
        if 'SVD' in name:
            # scikit-image clustering
            segments_slic = slic(
                data, n_segments=10, compactness=10, sigma=1)
            segments_quickshift = quickshift(
                data, kernel_size=3, max_dist=6, ratio=0.5)

            ax = fig.add_subplot(gs[k, 0])
            ax.set_title('{} with quickshift'.format(name))
            ax.imshow(mark_boundaries(data, segments_quickshift, mode='outer'),
                      interpolation='nearest',
                      origin='low',
                      aspect='auto',
                      cmap=plt.cm.Oranges)
            ax.set_xticks(ts[::step_size])
            ax.set_xticklabels(beat_times[::step_size], rotation=60)
            ax.grid(False)

            ax = fig.add_subplot(gs[k, 1])

            ax.set_title('{} with slic'.format(name))
            ax.imshow(mark_boundaries(data, segments_slic, mode='outer'),
                      interpolation='nearest',
                      origin='low',
                      aspect='auto',
                      cmap=plt.cm.Oranges)
            ax.set_xticks(ts[::step_size])
            ax.set_xticklabels(beat_times[::step_size], rotation=60)
            ax.grid(False)
            k += 1
        """

    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))

    # save with large size
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))
    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}_small.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))

    plt.close(fig)
Example #17
# (snippet truncated in the source: the opening axes[...].imshow(...) call is missing)
               aspect='auto',
               origin='lower',
               interpolation='nearest',
               cmap=plt.cm.plasma)
axes[2].imshow(feats_log_normed,
               aspect='auto',
               origin='lower',
               interpolation='nearest',
               cmap=plt.cm.plasma)
fig.tight_layout()

# Clustering with DP-GMM
n_components = 32
dpgmm = DPGMM(n_components=n_components,
              tol=1e-3,
              n_iter=32,
              alpha=1000,
              covariance_type='diag',
              verbose=True)
dpgmm.fit(feats_log.T)
preds_proba = dpgmm.predict_proba(feats_log.T)
preds = np.argmax(preds_proba, axis=1)
np.unique(preds)
# resynthesis by sampling from clusters
resynthesis = dpgmm.means_[preds.astype(int), :]

fig, axes = plt.subplots(4, 1, figsize=(18, 8))
axes[0].set_title(feature)
axes[1].set_title('Prediction Probability')
axes[2].set_title('Resynthesis')
axes[3].set_title('Max(Prediction Probability)')
Example #18
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.mixture import GMM, DPGMM, BayesianGaussianMixture, VBGMM
from sklearn.svm import NuSVC, SVC

# Useful for seeing all sklearn estimators that have a `predict_proba` attribute
from sklearn.utils.testing import all_estimators
estimators = all_estimators()
for name, class_ in estimators:
    if hasattr(class_, 'predict_proba'):
        print(name)

# Now pick and choose the ones you like
estimators = {
    AdaBoostClassifier(): 'AdaBoost',
    BayesianGaussianMixture(): 'BayesianGaussianMixture',
    BernoulliNB(): 'BernoulliNB',
    DPGMM(): 'DPGMM',
    ExtraTreesClassifier(): 'ExtraTreesClassifier',
    GMM(): 'GMM',
    GaussianNB(): 'GaussianNB',
    GaussianProcessClassifier(): 'GaussianProcessClassifier',
    GradientBoostingClassifier(): 'GradientBoostingClassifier',
    KNeighborsClassifier(): 'KNeighborsClassifier',
    LabelPropagation(): 'LabelPropagation',
    LabelSpreading(): 'LabelSpreading',
    LinearDiscriminantAnalysis(): 'LinearDiscriminantAnalysis',
    LogisticRegression(): 'LogisticRegression',
    MLPClassifier(): 'MLPClassifier',
    NuSVC(): 'NuSVC',
    QuadraticDiscriminantAnalysis(): 'QuadraticDiscriminantAnalysis',
    RandomForestClassifier(): 'RandomForestClassifier',
    SGDClassifier(): 'SGDClassifier',
}  # (remaining entries truncated in the source)
Example #19
# Choose a max number of components for the algorithm
max_components = 8

# Count the number of clusters the DPGMM chooses
num_clusters = []
size_sample = []

# Try clustering at different sample sizes
for iteration in range(int(np.floor(len(gaussian_data) / 10)) - 2):
    # Number of samples to use
    max_sample_value = ((iteration + 2) * 10) 
    sample_set = gaussian_data[0:max_sample_value]
    size_sample.append(max_sample_value)
    
    # Fit Dirichlet Process Gaussian Mixture Model
    dpgmm_model = DPGMM(n_components=max_components, n_iter=1000, alpha=1.0)
    fitted_dpgmm = dpgmm_model.fit(sample_set)
    dpgmm_predictions = fitted_dpgmm.predict(gaussian_data)
    num_clusters.append(len(set(dpgmm_predictions)))
    
    # Append predicted labels to dataframe
    gaussian_data['predicted'] = dpgmm_predictions

# Give a unique color to each category
unique_categories = list(set(gaussian_data['predicted']))
color_labels = ['r', 'y', 'g', 'b', 'c', 'm', 'k', 'w']
colors = [color_labels[unique_categories.index(i)] for i in gaussian_data['predicted']]

# Plot predicted data
plt.scatter(gaussian_data['x'], gaussian_data['y'], c=colors)
plt.xlim([-12,12])
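A compact way to look at the experiment above; variable names are the snippet's, and a matplotlib colormap replaces the manual color table (which supports at most 8 labels):

import matplotlib.pyplot as plt

# clusters found as a function of how much data the DPGMM saw
plt.figure()
plt.plot(size_sample, num_clusters, 'o-')
plt.xlabel('Number of samples')
plt.ylabel('Clusters chosen by DPGMM')

# scatter of the last fit, colored by predicted label
plt.figure()
plt.scatter(gaussian_data['x'], gaussian_data['y'],
            c=gaussian_data['predicted'], cmap='tab10')
plt.xlim([-12, 12])
plt.show()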
Example #20
    data_cluster_train = query_features(training, 15, 10, 23, data)
    data_cluster_test = query_features(testing, 15, 10, 23, data)
    data_cluster_train_ds = data_cluster_train
    """if you want clustering on the dissimilarity space uncomment
       below and change accordingly"""
    # print 'Calculating dissimilarity space for training queries...'
    # data_cluster_train_ds = sc.pdist(data_cluster_train, 'euclidean')
    # data_cluster_train_ds = sc.squareform(data_cluster_train_ds)

    # # plt.figure(1)
    # # plt.imshow(data_cluster_train_ds)
    # # plt.colorbar()
    # # plt.title('Initial dissimilarity')

    print 'Training a Dirichlet Process Gaussian Mixture model...'
    dpgmm = DPGMM(alpha=1.0, n_iter=100, n_components=50)
    dpgmm.fit(data_cluster_train_ds)
    prediction = dpgmm.predict(data_cluster_train_ds)
    clusters = np.unique(prediction)

    print 'Found %i clusters!' % clusters.shape[0]
    print clusters
    """create the reordered input data according to the clusters
      it is only needed if you want to visuallize the clustering
      afterwards"""
    #data_cluster = np.zeros((1, data_cluster_train.shape[1]))

    # each cluster is a list of lists that contains the indices
    # of the queries for each cluster
    each_cluster = []
    for i in xrange(clusters.shape[0]):
Example #21
def fit_dirichlet_gmm_to_points(points,
                      n_components,
                      mdl,
                      ps=[],
                      num_iter=100,
                      covariance_type='full',
                      mass_multiplier=1.0):
    """fit a GMM to some points. Will return core::Gaussians.
    if no particles are provided, they will be created

    points:            list of coordinates (python)
    n_components:      number of gaussians to create
    mdl:               IMP Model
    ps:                list of particles to be decorated. if empty, will add
    num_iter:          number of EM iterations
    covariance_type:   covar type for the gaussians. options: 'full', 'diagonal', 'spherical'
    init_centers:      initial coordinates of the GMM
    force_radii:       fix the radii (spheres only)
    force_weight:      fix the weights
    mass_multiplier:   multiply the weights of all the gaussians by this value
    """


    new_sklearn = True
    try:
        from sklearn.mixture import BayesianGaussianMixture
    except ImportError:
        from sklearn.mixture import DPGMM
        new_sklearn = False

    ### create and fit GMM
    print('using dirichlet prior')
    if new_sklearn:
        gmm = BayesianGaussianMixture(
                weight_concentration_prior_type='dirichlet_process',
                n_components=n_components, max_iter=num_iter,
                covariance_type=covariance_type)
    else:
        gmm = DPGMM(n_components=n_components, n_iter=num_iter,
                    covariance_type=covariance_type)

    gmm.fit(points)

    #print('>>> GMM score',gmm.score(points))

    #print gmm.covars_
    #print gmm.weights_
    #print gmm.means_
    ### convert format to core::Gaussian
    if not new_sklearn:
        gmm.precisions_ = gmm.precs_
    for ng in range(n_components):
        invcovar = gmm.precisions_[ng]
        covar = np.linalg.inv(invcovar)
        if covar.size == 3:
            covar = np.diag(covar).tolist()
        else:
            covar = covar.tolist()
        center = list(gmm.means_[ng])
        weight = mass_multiplier * gmm.weights_[ng]
        if ng >= len(ps):
            ps.append(IMP.Particle(mdl))
        shape = IMP.algebra.get_gaussian_from_covariance(covar, IMP.algebra.Vector3D(center))
        g = IMP.core.Gaussian.setup_particle(ps[ng], shape)
        IMP.atom.Mass.setup_particle(ps[ng], weight)
        IMP.core.XYZR.setup_particle(ps[ng], sqrt(max(g.get_variances())))
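In the new-sklearn branch the matrix inversion is avoidable: `BayesianGaussianMixture` exposes `covariances_` directly, so the extraction loop can read them as-is (a sketch, same variables as above and covariance_type='full' assumed):

# new-sklearn path only: read covariances directly instead of inverting precisions
for ng in range(n_components):
    covar = gmm.covariances_[ng].tolist()    # no np.linalg.inv needed
    center = list(gmm.means_[ng])
    weight = mass_multiplier * gmm.weights_[ng]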
Example #22
def dpgmm_simple(X, init_numC, random_state):
    model = DPGMM(n_components=init_numC, n_iter=100, tol=0.000001, random_state=random_state)
    model.fit(X)
    y = model.predict(X)
    cluster_num = len(np.unique(y))
    return cluster_num, y
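Counting unique hard labels, as above, can count near-empty components or miss ones that keep weight but win no points; an alternative sketch counts components with non-trivial mixture weight (the threshold is arbitrary; `weights_` exists on both the old and new API):

import numpy as np

def effective_components(model, min_weight=0.01):
    """Number of mixture components whose fitted weight is non-trivial."""
    return int(np.sum(model.weights_ > min_weight))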
Example #23
# In[4]:

train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1]
y = y.astype('int')
test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# In[5]:
# df must already exist from an earlier notebook cell; this keeps only the ID column
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})

print('dirichlet process gaussian mixture begins****************')
dpgmm = DPGMM(n_components=3)
print('fitting****************')
dpgmm_train = dpgmm.fit(X, y)
print('predicting on train****************')
dpgmm_X_prediction = dpgmm.predict_proba(X)[:, 1]
print('predicting on test****************')
dpgmm_X_test_prediction = dpgmm.predict_proba(X_test)[:, 1]
tr_te_concatenated = np.concatenate(
    [dpgmm_X_prediction, dpgmm_X_test_prediction])
df['dirichlet_process_gaussian_mixture'] = tr_te_concatenated

print('final tr_te shape', df.shape)
print(df.head())

df.to_csv('dirichlet_process_gaussian_mixture_tr_te.csv', index=False)
Example #24
def run_all_classifiers(X_train, X_test, y_train, y_test, print_output_scores_to_csv=False, output_scores_csv_file_suffix='', print_only_table=False):
    """
    The list of all classifiers was generated by running the following commented code.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_output_scores_to_csv: If True the Precision, Recall, F1-Score and Support for both classes will
        be printed to a file with the current date and time.
        output_scores_csv_file_suffix: Suffix to be added to the csv file just before the .csv extension. Normally
        describing the run that is being performed.

    Returns:
        dataset: Returns output scores dataset.

    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test,  pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test,  pd.core.frame.Series)
    assert isinstance(print_output_scores_to_csv, bool)
    assert isinstance(output_scores_csv_file_suffix, object)

    import time

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from sklearn.utils.testing import all_estimators
    #estimators = all_estimators()
    #for name, class_ in estimators:
    #    log_print(name)

    from sklearn.calibration           import CalibratedClassifierCV
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble              import AdaBoostClassifier
    from sklearn.ensemble              import BaggingClassifier
    from sklearn.ensemble              import ExtraTreesClassifier
    from sklearn.ensemble              import GradientBoostingClassifier
    from sklearn.ensemble              import RandomForestClassifier
    from sklearn.gaussian_process      import GaussianProcessClassifier
    from sklearn.linear_model          import LogisticRegression
    from sklearn.linear_model          import LogisticRegressionCV
    from sklearn.linear_model          import SGDClassifier

    from sklearn.mixture               import BayesianGaussianMixture
    from sklearn.mixture               import DPGMM
    from sklearn.mixture               import GaussianMixture
    from sklearn.mixture               import GMM
    from sklearn.mixture               import VBGMM
    from sklearn.naive_bayes           import BernoulliNB
    from sklearn.naive_bayes           import GaussianNB
    from sklearn.neighbors             import KNeighborsClassifier
    from sklearn.neural_network        import MLPClassifier
    from sklearn.semi_supervised       import LabelPropagation
    from sklearn.semi_supervised       import LabelSpreading
    from sklearn.svm                   import SVC
    from sklearn.tree                  import DecisionTreeClassifier
    #from xgboost                       import XGBClassifier

    models = []
    models.append(('AdaBoostClassifier',            AdaBoostClassifier()))
    models.append(('BaggingClassifier',             BaggingClassifier()))
    models.append(('BayesianGaussianMixture',       BayesianGaussianMixture()))
    models.append(('BernoulliNB',                   BernoulliNB()))
    models.append(('CalibratedClassifierCV',        CalibratedClassifierCV()))
    models.append(('DPGMM',                         DPGMM()))
    models.append(('DecisionTreeClassifier',        DecisionTreeClassifier(random_state=SEED)))
    models.append(('ExtraTreesClassifier',          ExtraTreesClassifier(random_state=SEED)))
    models.append(('GMM',                           GMM()))
    models.append(('GaussianMixture',               GaussianMixture()))
    models.append(('GaussianNB',                    GaussianNB()))
    models.append(('GaussianProcessClassifier',     GaussianProcessClassifier()))
    models.append(('GradientBoostingClassifier',    GradientBoostingClassifier()))
    models.append(('KNeighborsClassifier',          KNeighborsClassifier()))
    models.append(('LabelPropagation',              LabelPropagation()))
    models.append(('LabelSpreading',                LabelSpreading()))
    models.append(('LinearDiscriminantAnalysis',    LinearDiscriminantAnalysis()))
    models.append(('LogisticRegression',            LogisticRegression()))
    models.append(('LogisticRegressionCV',          LogisticRegressionCV()))
    models.append(('MLPClassifier',                 MLPClassifier()))
    #models.append(('MultinomialNB', MultinomialNB()))
    #models.append(('NuSVC', NuSVC()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('RandomForestClassifier',        RandomForestClassifier(random_state=SEED)))
    models.append(('SGDClassifier',                 SGDClassifier()))
    models.append(('SVC',                           SVC()))
    models.append(('VBGMM',                         VBGMM()))
    #models.append(('XGBClassifier',                 XGBClassifier()))
    
    output_scores_df = fit_predict_plot(X_train, X_test, y_train, y_test, models, print_only_table)

    if print_output_scores_to_csv:
        output_scores_df.to_csv(time.strftime('output_scores' + str(output_scores_csv_file_suffix) + '.csv'))

    return output_scores_df

def run_all_classifiers(X_train, X_test, y_train, y_test, print_details=True):
    """
    Run all classifiers of sklearn

    Args:
        X_train, X_test, y_train, y_test: The train and tests datasets.
        print_details: if true, print details of all models and save csv table ;
                       if false, print only table with summary of the models
    Returns:
        dataset: Returns output scores dataset.

    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_details, bool)

    log_method_execution_time(log_funcname())

    from sklearn.utils.testing import all_estimators
    import sklearn.metrics
    import time
    from src.util.acq_util import RANDOM_SEED

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from xgboost import XGBClassifier
    #models.append(('XGBClassifier', XGBClassifier()))

    models = all_estimators(type_filter='classifier')
    output_scores_dataset = pd.DataFrame(index=['Precision 0', 'Recall 0', 'F1-Score 0', 'Support 0',
                                                'Precision 1', 'Recall 1', 'F1-Score 1', 'Support 1'],
                                         columns=list(zip(*models))[0])

    for name, model in models:
        if print_details is True:
            print('------------------------------------------------------------------------------')
            print(name)
            print('------------------------------------------------------------------------------')

        if (name == 'MultinomialNB' or name == 'NuSVC' or name == 'RadiusNeighborsClassifier' or name == 'GaussianProcessClassifier'):
            continue

        model = model()
        if 'random_state' in model.get_params():
            model.random_state = RANDOM_SEED

        #Fitting the model.
        model.fit(X_train, y_train)

        #Measuring accuracy.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        output_scores_dataset = class_compute_accuracy(y_train, y_train_pred, output_scores_dataset,
                                                       ['Accuracy on the train set', name], print_details)
        output_scores_dataset = class_compute_accuracy(y_test, y_test_pred, output_scores_dataset,
                                                       ['Accuracy on the test set', name], print_details)

        #Plotting confusion matrix.
        output_scores_dataset = class_compute_plot_confusion_matrix(y_test, y_test_pred, output_scores_dataset, name, print_details)

        #Showing classification report.
        if print_details is True:
            print(sklearn.metrics.classification_report(y_test, y_test_pred))

        # Printing scores to output dataset.
        output_scores_dataset = class_compute_recall_precision_f1(y_test, y_test_pred, output_scores_dataset, name)

    # Can use idxmax with axis=1 to find the column with the greatest value on each row.
    output_scores_dataset['Max Value'] = output_scores_dataset.apply(max, axis=1)
    #output_scores_dataset['Max Classifier'] = output_scores_dataset.idxmax(axis=1)

    if print_details is True:
        output_scores_dataset.to_csv('output_scores' + '.csv')

    return output_scores_dataset

def train_test_split_for_classification(dataset, label, test_size, random_state=SEED):
    """
    Selects X and y, considering that y has been renamed to label.
    """
    from sklearn.model_selection import train_test_split

    assert isinstance(dataset, pd.core.frame.DataFrame)
    assert isinstance(test_size, float)
    assert isinstance(random_state, int)

    X = dataset.loc[:, dataset.columns != label]
    y = dataset[label]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    log_print('X_train: {}'.format(X_train.shape))
    log_print('y_train: {}'.format(y_train.shape))
    log_print('X_test:  {}'.format(X_test.shape))
    log_print('y_test:  {}'.format(y_test.shape))
    return(X_train, X_test, y_train, y_test)
Example #25
# print(pca.n_components_)
# print(pca.explained_variance_ratio_[0:3])
print(pca.reconstruction_err_)

fig = plt.figure(figsize=(20, 20))

ax = fig.add_subplot(111, projection='3d')

plt.scatter(datat[:, 0], datat[:, 1], zs=datat[:, 2], c=labels, marker='o')

plt.show()
plt.close()

#km = KMeans(n_clusters=5)

km = DPGMM(n_components=7, covariance_type='tied')

clabels = km.fit_predict(datat)

# for ex, lab in zip(exid, clabels):
#     print(ex, lab)

fig = plt.figure(figsize=(20, 20))

ax = fig.add_subplot(111, projection='3d')

plt.scatter(datat[:, 0], datat[:, 1], zs=datat[:, 2], c=clabels, marker='o')

plt.show()
plt.close()
Example #26
                    angle=angle,
                    color='m',
                    alpha=0.5,
                    clip_box=ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title(u'GMM', fontsize=20)
    plt.grid(True)

    # DPGMM
    n_components = 3
    dpgmm = DPGMM(n_components=n_components,
                  alpha=1,
                  covariance_type='full',
                  random_state=0)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm._get_covars()
    print 'DPGMM means = \n', centers
    print 'DPGMM covariances = \n', covs
    y_hat = dpgmm.predict(x)
    # print y_hat

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
Example #27
plt.figure()
plt.subplot(1, 2, 1)
plt.imshow(img)
plt.title('Original Image')
if K.image_dim_ordering() == "th":
    img = np.moveaxis(
        img.reshape((1, img.shape[0], img.shape[1], img.shape[2])), -1, 1)
img = vgg16.preprocess_input(img.astype('float32'))
""" Scaling activations to fit random initialization scheme"""
actvs = get_activations(model, layer, img).squeeze()
actvs /= np.max(actvs) * 0.1
""" Clustering with dirichlet process Gaussian Mixture Model"""
dpgmm = DPGMM(n_components=50,
              alpha=1,
              verbose=2,
              tol=0.01,
              n_iter=250,
              min_covar=1e-6)
#dpgmm = BayesianGaussianMixture(n_components=50, covariance_type="diag", reg_covar = 1e-6,
#                                weight_concentration_prior_type="dirichlet_process",
#                                weight_concentration_prior=1, verbose=2,
#                                tol=0.01, max_iter=250, init_params='random',
#                                mean_precision_prior=actvs.std(),
#                                mean_prior=np.repeat(actvs.max()/5,actvs.shape[0]))

dpgmm.fit(
    np.transpose(actvs.reshape(actvs.shape[0],
                               actvs.shape[1] * actvs.shape[2])))
labels = dpgmm.predict(
    np.transpose(actvs.reshape(actvs.shape[0],
                               actvs.shape[1] * actvs.shape[2])))
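The reshape-and-transpose of the activations is repeated for `fit` and `predict`; computing it once is clearer (a sketch; `actvs` and `dpgmm` as in the snippet, channels-first activations assumed):

# flatten (C, H, W) activations into (H*W, C) feature rows, once
feats = actvs.reshape(actvs.shape[0], -1).T
dpgmm.fit(feats)
labels = dpgmm.predict(feats)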
Example #28
    def __init__(self,
                 cluster_method=2,
                 cluster_tag=False,
                 train_path=None,
                 event_info_path=None,
                 city_id=None):
        self.loss_choice = 0  # 0:reg; 1:pairwise ranking
        self.ndim = 20
        self.tr_method = 0  # 0:SGD1; 1:SGD2
        self.cluster_method = cluster_method  # 0:DPGMM; 1:GMM; 2:K-means
        self.n_components = 20
        self.city_id = city_id

        # SGD
        self.niters1 = 10
        self.lr1 = 0.01
        self.lambda1 = 0.001
        self.neg_num1 = 5
        self.beta1 = 1
        self.alpha1 = 1
        self.ins_weight = [self.beta1, self.alpha1]

        pois = []
        if cluster_tag:
            events = set(
                [entry[1] for entry in csv.reader(open(train_path, "r"))])
            for entry in csv.reader(open(event_info_path, "r")):
                event = entry[0]
                if event in events:
                    poi = map(float, entry[3].split(" "))
                    pois.append(poi)
                    if not checkGeoScope(poi, self.city_id):
                        print 'Invalid location'
                        sys.exit(1)
            if self.cluster_method == 0:
                cluster = DPGMM(n_components=500,
                                covariance_type='diag',
                                alpha=1,
                                n_iter=50)
                cluster.fit(pois)
                centers = removeDup(cluster.means_)
                outputCenterforVis(centers)
                self.n_components = len(centers)
                cluster_fd = open(settings["DPGMM_CLUSTER"], "wb")
                pickle.dump([centers, None], cluster_fd)
                self.model_path = settings["GEOMF"]
                outputCenterforVis(centers)
            elif self.cluster_method == 1:
                cluster = GMM(n_components=self.n_components,
                              covariance_type='diag',
                              min_covar=1e-7,
                              n_init=10,
                              random_state=0,
                              n_iter=100)
                cluster.fit(pois)
                outputCenterforVis(cluster.means_)
                labels = deterClusterRel(pois, cluster.means_)
                #showNumInEachCluster(labels, self.n_components)
                dis_variances = calDisVariance(self.n_components, labels, pois)
                dis_variances = smoothVar(dis_variances)
                covars = smoothVar(cluster.covars_)
                cluster_fd = open(settings["GMM_CLUSTER"], "wb")
                pickle.dump([cluster.means_, covars, dis_variances],
                            cluster_fd)
            elif self.cluster_method == 2:
                cluster = KMeans(n_clusters=self.n_components,
                                 max_iter=300,
                                 init='k-means++')
                cluster.fit(pois)
                means, variances = calCenterCov(self.n_components,
                                                cluster.labels_, pois)
                outputCenterforVis(means)
                dis_variances = calDisVariance(self.n_components,
                                               cluster.labels_, pois)
                variances = smoothVar(variances)
                dis_variances = smoothVar(dis_variances)
                cluster_fd = open(settings["KMEANS_CLUSTER"], "wb")
                pickle.dump([means, variances, dis_variances], cluster_fd)
            else:
                print 'Invalid choice of clustering method'
                sys.exit(1)
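`removeDup` is not shown in the snippet; a plausible sketch, under the assumption that it drops near-duplicate component means (a DPGMM truncated at 500 components typically leaves many centers that collapse onto each other):

import numpy as np

def remove_dup(means, tol=1e-3):
    """Keep only means farther than tol from every previously kept mean.
    Hypothetical stand-in for the snippet's removeDup helper."""
    kept = []
    for m in np.asarray(means):
        if all(np.linalg.norm(m - k) > tol for k in kept):
            kept.append(m)
    return np.array(kept)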
Example #29
    def em_stereo(self, n_component=1, dp=True, thresh_hold=0.4):
        self.num_params = 0
        #The range of len(params)
        _step = 0
        for var_idx in tqdm(range(len(self.merge_var[0]))):

            for x_v in range(len(self.merge_var[0][var_idx])):
                print('Step %d'%_step,end='\r')
                _step += 1
                try:
                    
                    for y_v in range(len(self.merge_var[0][var_idx][x_v])):
                        #print('cluster weights ....%d'%var_idx)
                        dist = []
                        for task_idx in range(len(self.merge_var)):
                            nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v][y_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v][y_v])),200)
                            dist.append(nor)
                        
                        dist = np.array(np.asmatrix(np.concatenate(dist)).T)
                        if dp:
                            print('Initializing DPGMM%d ... '%_step,end='\r')
                            gmm = DPGMM( max_iter=1000,  n_components=n_component, covariance_type='spherical')
                        else:
                            gmm = GMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                        gmm.fit(dist)
                        new_idx_list = []
                        for task_idx in range(len(self.merge_var)):
                            #if dp:
                            #Strategy 1. Set threshold
                            predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1))
                            f_ = True
                            
                            while f_:
                                #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)):
                                if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                                    new_idx = np.argmax(predict_probability)
                                    f_ = False
                                else:
                                    predict_probability[0][np.argmax(predict_probability)] = 0.0
                                    self.num_params += 1
                            

                        #else:
                        #    new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1))
                            # count a parameter as merged when the chosen component repeats
                            if new_idx in new_idx_list:
                                self.num_params += 1
                            new_idx_list.append(new_idx)
                            self.merge_var[task_idx][var_idx][x_v][y_v] = gmm.means_[new_idx]
                            self.merge_uncertainty[task_idx][var_idx][x_v][y_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)


                except TypeError:
                    dist = []
                    
                    
                    for task_idx in range(len(self.merge_var)):
                        nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v])),200)
                        dist.append(nor)
                    dist = np.array(np.asmatrix(np.concatenate(dist)).T)
                    if dp:
                        print('Initializing DPGMM%d ... '%_step,end='\r')
                        gmm = DPGMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                    else:
                        gmm = GMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                    gmm.fit(dist)
                    new_idx_list = []
                    for task_idx in range(len(self.merge_var)):
                        #if dp:
                        #Strategy 1. Set threshold
                        predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1))
                        f_ = True
                        while f_:
                            #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)):
                            if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                                new_idx = np.argmax(predict_probability)
                                f_ = False
                            else:
                                predict_probability[0][np.argmax(predict_probability)] = 0.0
                                self.num_params += 1

                    #else:
                    #    new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1))
                    #    if new_idx in new_idx_list:
                    #        self.num_params += 1
                        new_idx_list.append(new_idx)
                        self.merge_var[task_idx][var_idx][x_v] = gmm.means_[new_idx]
                        self.merge_uncertainty[task_idx][var_idx][x_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)
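The `while f_:` retry loop in both branches means "take the most probable component whose mixture weight clears the threshold"; a loop-free restatement (hypothetical helper; note the original loop never terminates when no weight clears `thresh_hold`, so a fallback is added here):

import numpy as np

def pick_component(weights, proba_row, thresh):
    """Index of the highest-probability component whose mixture weight
    exceeds thresh; falls back to the overall argmax if none qualifies."""
    order = np.argsort(proba_row)[::-1]    # component indices, most probable first
    for idx in order:
        if weights[idx] > thresh:
            return int(idx)
    return int(order[0])                   # fallback: no weight cleared thresh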
Example #30
    def _st_smooth(self,
                   var_idx,
                   x_v,
                   y_v=None,
                   n_component=1,
                   thresh_hold=0.3,
                   dp=False):
        mixture_dist = []
        for task_idx in range(self.num_task):
            if y_v is not None:
                mean = self.params_mean[task_idx][var_idx][x_v][y_v]
                var = self.transform_var(
                    self.params_var[task_idx][var_idx][x_v][y_v])
            else:
                mean = self.params_mean[task_idx][var_idx][x_v]
                var = self.transform_var(
                    self.params_var[task_idx][var_idx][x_v])
            mixture_dist.append({'kwargs': {'loc': mean, 'scale': var}})

        alpha = 0.3
        alpha_list = [(1 - alpha) / (self.num_task - 1)] * (self.num_task - 1)
        alpha_list.append(alpha)
        sample = create_mixture(mixture_dist, alpha_list=alpha_list)
        if dp:
            gmm = DPGMM(max_iter=1000,
                        n_components=n_component,
                        covariance_type='spherical')
        else:
            gmm = GMM(max_iter=500,
                      n_components=n_component,
                      covariance_type='spherical')
        gmm.fit(sample)

        new_idx_list = []
        for task_idx in range(self.num_task):
            if y_v is not None:
                predict_probability = gmm.predict_proba(
                    np.array(
                        self.params_mean[task_idx][var_idx][x_v][y_v]).reshape(
                            -1, 1))
            else:
                predict_probability = gmm.predict_proba(
                    np.array(self.params_mean[task_idx][var_idx][x_v]).reshape(
                        -1, 1))
            f_ = True
            while f_:
                if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                    new_idx = np.argmax(predict_probability)
                    f_ = False
                else:
                    predict_probability[0][np.argmax(
                        predict_probability)] = 0.0
                    #self.num_merged_params += 1
            if new_idx in new_idx_list:
                self.num_merged_params += 1
            new_idx_list.append(new_idx)
            if y_v is not None:
                self.params_mean[task_idx][var_idx][x_v][y_v] = gmm.means_[
                    new_idx]
                self.params_var[task_idx][var_idx][x_v][
                    y_v] = self.retransform_var(gmm.covariances_[new_idx])
            else:
                self.params_mean[task_idx][var_idx][x_v] = gmm.means_[new_idx]
                self.params_var[task_idx][var_idx][x_v] = self.retransform_var(
                    gmm.covariances_[new_idx])
        """