Example No. 1
    def _fit_dpgmm(self, x):
        # clustering
        k = max(self.crange)
        for r in xrange(self.repeats):
            # info
            if self.debug is True:
                print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),

            # fit and evaluate model
            model_kwargs = {}
            if 'alpha' in self.clus_kwargs:
                model_kwargs.update(alpha=self.clus_kwargs['alpha'])
            if 'conv_thresh' in self.clus_kwargs:
                model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
            if 'max_iter' in self.clus_kwargs:
                model_kwargs.update(n_iter=self.clus_kwargs['max_iter'])

            model = DPGMM(n_components=k, covariance_type=self.cvtype,
                          **model_kwargs)
            model.fit(x)
            self._labels[r] = model.predict(x)
            self._parameters[r] = model.means_
            self._ll[r] = model.score(x).sum()

            # evaluate goodness of fit for this run
            #self._gof[r] = self.gof(x, self._ll[r], k)
            if self.gof_type == 'aic':
                self._gof[r] = model.aic(x)
            if self.gof_type == 'bic':
                self._gof[r] = model.bic(x)

            # debug
            if self.debug is True:
                print self._gof[r], model.n_components, model.weights_.shape[0]
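The snippets on this page use the legacy sklearn.mixture.DPGMM class, which was deprecated in scikit-learn 0.18 and removed in 0.20. As a rough, non-authoritative sketch, the call above maps onto the current BayesianGaussianMixture API roughly as follows; the data array x and the hyper-parameter values are placeholders, not taken from the original project.

# Rough modern equivalent of the DPGMM usage above (assumes scikit-learn >= 0.18;
# x is stand-in data, hyper-parameter values are illustrative only).
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

x = np.random.randn(500, 3)                     # placeholder for the data being clustered

model = BayesianGaussianMixture(
    n_components=10,                            # upper bound, like DPGMM's n_components
    covariance_type='diag',
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=1.0,             # plays the role of DPGMM's alpha
    max_iter=100,                               # DPGMM's n_iter
    tol=1e-3)                                   # DPGMM's thresh / tol
model.fit(x)
labels = model.predict(x)
log_likelihood = model.score_samples(x).sum()   # analogue of model.score(x).sum()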
Example No. 2
def fit_vel_profile_dpgmm(vel_profile, n_comps=5, dp=False):
    """
    fit a velocity profile with DP-GMM
    """
    N = 1000    # 1000 samples to fit
    integral = np.sum(vel_profile)
    #vel_profile is a 1D array, try to convert it to samples
    t = np.linspace(0, 1, len(vel_profile))
    data = np.array([])
    for i in range(len(t)):
        # number of samples is proportional to the velocity at this time step
        n_samples = int(round(vel_profile[i] / integral * N))
        if n_samples > 0:
            # repeat the time stamp n_samples times
            samples = np.ones(n_samples) * t[i]
            data = np.concatenate([data, samples])
    fit_data = np.array([data]).transpose()
    # fit a Dirichlet-process Gaussian mixture model
    # (something wrong with the module? the clusters seem merged...)
    if dp:
        model = DPGMM(n_components=n_comps, n_iter=1000, alpha=10)
    else:
        model = GMM(n_components=n_comps)
    
    model.fit(fit_data)

    return model
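A hypothetical call to the function above on a synthetic bell-shaped velocity profile; the profile and parameter values are made up for illustration.

# Hypothetical usage of fit_vel_profile_dpgmm; the Gaussian-shaped profile is synthetic.
import numpy as np

t = np.linspace(0, 1, 200)
vel_profile = np.exp(-0.5 * ((t - 0.5) / 0.1) ** 2)   # synthetic bell-shaped velocity profile

model = fit_vel_profile_dpgmm(vel_profile, n_comps=3, dp=True)
print(model.weights_)          # mixture weights
print(model.means_.ravel())    # component means on the normalized time axis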
Example No. 3
def plot_GPLVM_data_cluster(results_dir, n_clusters=None, VB=False):
    # Load relevant datasets
    data_array = np.genfromtxt(os.path.join(results_dir, 'summary.csv'), delimiter=',')
    X = (np.genfromtxt(os.path.join(results_dir, 'GPLVM-datasets-2.csv'), delimiter=','))
    datasets = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'datasets.csv'), 'r').readlines()]
    methods = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'methods.csv'), 'r').readlines()]
    # Fit a mixture model
    if n_clusters is None:
        m = DPGMM()
    elif VB:
        m = VBGMM(alpha = 10, n_components=n_clusters)
    else:
        m = GMM(n_components=n_clusters, n_init=100)
    m.fit(data_array.T)
    clusters = m.predict(data_array.T)
    # Plot
    #clf()
    figure(1)
    pretty_scatter(X[:,0], X[:,1], clusters, 200*np.ones(X[:,0].shape), datasets)
    xlabel('Dimension 1')
    ylabel('Dimension 2')
    if n_clusters is None:
        title('CRP MoG')
    elif VB:
        title('%d clusters with VB' % n_clusters)
    else:
        title('%d clusters with EM' % n_clusters)
    show()
Example No. 5
def try_covar(type_str, x_words):
    clf = DPGMM(n_components=20, covariance_type=type_str, alpha=30, n_iter=1000)
    clf.fit(x_data)
    y_ = clf.predict(x_data)
    print type_str
    print_centers(x_words, y_, clf)
    print
Example No. 6
def load_build_dpggm(dpggm_model_name, x_data):
    if os.path.isfile(dpggm_model_name):
        clf = load_dpggm(dpggm_model_name)
    else:
        clf = DPGMM(n_components=30, covariance_type='diag', alpha=5,  n_iter=1000)
        logging.info("Fitting with DPGMM")
        clf.fit(x_data)
        pickle.dump(clf, open(dpggm_model_name, 'wb'))
        logging.info("Fitted")
        print clf.converged_
    return clf
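An illustrative call to the helper above. The model filename and x_data are placeholders, and load_dpggm is assumed to be the project's own unpickling helper referenced in the snippet.

# Placeholder usage; 'dpgmm_30.pkl' and x_data are invented for illustration.
import numpy as np

x_data = np.random.randn(200, 10)
clf = load_build_dpggm('dpgmm_30.pkl', x_data)
print(clf.predict(x_data[:5]))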
Example No. 7
def main():
    if len(sys.argv) != 4:
        print(__doc__)
        return 1

    infile = sys.argv[1]
    N = int(sys.argv[2])    
    num_random = int(sys.argv[3])

    print("Reading in", infile)
    fullarr = np.loadtxt(fileinput.input(infile), delimiter = '\t')[:,:-7]

    stds = np.apply_along_axis(np.std, 0, fullarr)[:,np.newaxis].T
    means = np.apply_along_axis(np.mean, 0, fullarr)[:,np.newaxis].T
    stds[stds == 0] = 1.0

    num_lines = num_random
    fullarr = fullarr[np.random.choice(fullarr.shape[0], num_lines, replace=True),:]

    fullarr = (fullarr - means) / stds

    output = ''

    print("Parameter searching...")
    igmm = None
    best_score = -100000
    best_alpha = -1
    best_model = None
    for alpha in [0.01,0.1,1,10]: 
        print("Learning infinite GMM with N={}, alpha={}".format(N, alpha))
        output += "Learning infinite GMM with N={}, alpha={}\n".format(N, alpha)
        igmm = DPGMM(covariance_type='diag', n_components=N, alpha=alpha, init_params='wmc')
        igmm.fit(fullarr)
        score = igmm.score(fullarr)
        score = sum(score)/len(score)
        print('{}: {} with {} clusters'.format(alpha, score, igmm.n_components))
        output += '{}: {} with {} clusters\n'.format(alpha, score, igmm.n_components)

        if score > best_score:
            best_score = score
            best_alpha = alpha
            best_model = igmm

    print('Best alpha={}, score={}'.format(best_alpha, best_score))
    output += 'Best alpha={}, score={}\n'.format(best_alpha, best_score)
    with open('parameter_search_results.txt', 'a+') as outf:
        outf.write(output)
    
    return 0
Example No. 8
def train_dpgmm(X, n_components=3, covariance_type='diag', alpha=1.0,
                random_state=None, thresh=None, tol=0.001, verbose=False,
                min_covar=None, n_iter=10, params='wmc', init_params='wmc'):
    """
    This function trains a Infinite Gaussian Mixture Model for clustering
    :param X:
    :param n_components:
    :param covariance_type:
    :param alpha:
    :param random_state:
    :param thresh:
    :param tol:
    :param verbose:
    :param min_covar:
    :param n_iter:
    :param params:
    :param init_params:
    :return: a trained DPGMM clustering model
    """
    model = DPGMM(n_components=n_components,
                  covariance_type=covariance_type,
                  alpha=alpha,
                  random_state=random_state,
                  thresh=thresh,
                  tol=tol,
                  verbose=verbose,
                  min_covar=min_covar,
                  n_iter=n_iter,
                  params=params,
                  init_params=init_params)
    model = model.fit(X)
    return model
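A small usage sketch for train_dpgmm on synthetic two-cluster data; the data and hyper-parameter values are illustrative, not from the original project.

# Illustrative call to train_dpgmm; the data below is synthetic.
import numpy as np

X = np.vstack([np.random.randn(100, 2), np.random.randn(100, 2) + 5.0])
model = train_dpgmm(X, n_components=10, covariance_type='diag', alpha=1.0, n_iter=100)
labels = model.predict(X)
print(np.bincount(labels))   # points assigned to each mixture component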
Example No. 9
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec_dpgmm = np.zeros(X.shape[0])
    prob_vec_dpgmm = np.zeros(X.shape[0])
    log_prob_dpgmm = None
    for i in xrange(n_init):
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type, \
                        alpha=alpha, n_iter=iters, random_state=rand_state)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            best_bic = b
            bic_dpgmm = b
            lbl_vec_dpgmm = dpgmm.predict(X)
            prob_vec_dpgmm = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))
    return [lbl_vec_dpgmm, prob_vec_dpgmm, bic_dpgmm, log_prob_dpgmm]
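An illustrative call to get_best_dpgmm; the synthetic data and hyper-parameter values below are placeholders.

# Placeholder usage of get_best_dpgmm on two synthetic Gaussian blobs.
import numpy as np

X = np.vstack([np.random.randn(150, 2), np.random.randn(150, 2) + 4.0])
labels, probs, bic, log_prob = get_best_dpgmm(X, num_c=8, cv_type='diag',
                                              alpha=1.0, iters=100, n_init=3)
print(bic)
print(len(np.unique(labels)))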
Example No. 10
def train_DPGMM(d, max_n_comp=100, max_n_iter=500):
    '''Imports Data, Trains a DPGMM, Generates predictions testing'''

    print "Training Model..."
    gmm = DPGMM(max_n_comp, n_iter=max_n_iter)

    start = timeit.default_timer()
    gmm.fit(d)
    end = timeit.default_timer()

    print "Training completed in %f seconds" % (end-start)

    print
    print "Converged: "
    print gmm.converged_
    print

    return gmm
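A minimal usage sketch for train_DPGMM; the 2-D array d below is synthetic stand-in data.

# Illustrative call; d is random placeholder data.
import numpy as np

d = np.random.randn(500, 4)
gmm = train_DPGMM(d, max_n_comp=20, max_n_iter=100)
print(gmm.means_.shape)      # (n_components, n_features) of the fitted model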
Example No. 11
def plot_num_iters_dpgmm(X, num_c, cv_type, alpha, max_iters, n_init):
    bic = []
    for iters in np.arange(1, max_iters):
        best_bic = np.inf
        for j in xrange(n_init):
            dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type, \
                                                alpha=alpha, n_iter=iters)
            dpgmm.fit(X)
            b = dpgmm.bic(X)
            if b < best_bic:
                best_bic = b
        bic.append(best_bic)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(np.arange(1, max_iters), bic)
    ax.set_title('BIC vs. Number of Iterations DPGMM')
    ax.set_xlabel('Number of iterations')
    ax.set_ylabel('BIC score')
    return fig
Example No. 12
def plot_alpha_dpgmm(X, num_c, cv_type, alphas, iters, n_init):
    bic = []
    for a in alphas:
        best_bic = np.inf
        for j in xrange(n_init):
            dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type, \
                                                alpha=a, n_iter=iters)
            dpgmm.fit(X)
            b = dpgmm.bic(X)
            if b < best_bic:
                best_bic = b
        bic.append(best_bic)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(alphas, bic, 'bo-', lw=2)
    ax.set_title('BIC vs. Alpha DPGMM')
    ax.set_xlabel('Alpha')
    ax.set_ylabel('BIC score')
    return fig
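A hypothetical driver for the two diagnostic helpers above (plot_num_iters_dpgmm from Example No. 11 and plot_alpha_dpgmm); the data, parameter grids, and output filenames are assumptions for illustration.

# Placeholder driver for the BIC diagnostics; data and parameter grids are invented.
import numpy as np

X = np.random.randn(300, 2)
fig_iters = plot_num_iters_dpgmm(X, num_c=10, cv_type='diag',
                                 alpha=1.0, max_iters=30, n_init=3)
fig_alpha = plot_alpha_dpgmm(X, num_c=10, cv_type='diag',
                             alphas=np.logspace(-2, 2, 9), iters=50, n_init=3)
fig_iters.savefig('bic_vs_iterations.png')
fig_alpha.savefig('bic_vs_alpha.png')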
Example No. 13
def dpgmm_segmenter(factors, width=MEDIAN_WIDTH):
    factors = median_filter(factors, size=(MEDIAN_WIDTH, 1), mode='mirror')
    factors = pre.scale(factors, axis=1)
    best_boundaries = [0, factors.shape[0] - 1]
    best_n_types = 1

    dpgmm = DPGMM(n_components=10, covariance_type='diag', alpha=10, n_iter=100)
    dpgmm.fit(np.tile(factors, (10, 1)))
    labels = dpgmm.predict(factors)
    boundaries, labels = find_boundaries(labels, width)

    if len(np.unique(labels)) > 1:
        best_boundaries = boundaries
        best_n_types = len(np.unique(labels))

    if len(best_boundaries) < best_n_types + 1:
        best_n_types = len(best_boundaries) - 1

    best_labels = segment_labeling(factors, best_boundaries, c_method='kmeans', k=best_n_types)
    best_boundaries = np.array(best_boundaries)

    return best_boundaries, best_labels
Example No. 14
def main():
    if len(sys.argv) != 5:
        print(__doc__)
        return 1

    infiles = glob(sys.argv[1])
    outfile = sys.argv[2]
    N = int(sys.argv[3])
    alpha = float(sys.argv[4])

    print("Reading in", len(infiles), "files")
    fullarr = np.loadtxt(fileinput.input(infiles), delimiter = '\t')[:,:-7]


    stds = np.apply_along_axis(np.std, 0, fullarr)[:,np.newaxis].T
    means = np.apply_along_axis(np.mean, 0, fullarr)[:,np.newaxis].T
    stds[stds == 0] = 1.0

    num_lines = 10000
    fullarr = fullarr[np.random.choice(fullarr.shape[0], num_lines, replace=True),:]

    fullarr = (fullarr - means) / stds


    print("Learning infinite GMM with N={}, alpha={}".format(N, alpha))

    igmm = DPGMM(covariance_type='diag', n_components=N, alpha=alpha, init_params='wmc')
    igmm.fit(fullarr)

    print("Infinite GMM trained, saving")

    with open(outfile + '_' + str(num_lines), 'wb') as out_model:
        pickle.dump(igmm, out_model)

    print("Score:", igmm.score(fullarr))
    print("Num Components:", igmm.n_components)
    
    return 0
Example No. 15
def main(method,cluster_num=30,alpha=.5):
    f ='/Users/davidgreenfield/Downloads/features_csv_tmp.csv'
    #f ='/Users/davidgreenfield/Downloads/features_f500.csv'
    cols=range(1,4096)
    feats =np.loadtxt(open(f,"rb"),delimiter=",",skiprows=1,usecols=(cols))
    asins = np.loadtxt(open(f,"rb"),delimiter=",",skiprows=1,usecols=([0]),dtype=str)
    if method == 'kmeans':
        k_means=cluster.KMeans(n_clusters=cluster_num)
        k_means.fit(feats)
        y = k_means.labels_
        if MAKE_GRAPH==1:
            print "hello 1"
        create_graph(k_means)
    elif method == 'GMM_VB':
        gmm_vb = VBGMM(n_components=50, alpha=.5)
        gmm_vb.fit(feats)
        y = gmm_vb.predict(feats)
        cluster_no = len(np.unique(y))
    elif method == 'GMM_DP':
        gmm_dp = DPGMM(n_components=50,alpha=alpha)
        gmm_dp.fit(feats)
        y = gmm_dp.predict(feats)
        cluster_no = len(np.unique(y))


    clusters=[]
    groups={}
    data=load_data('./data/boots_aws.csv')

    for i in range(0,cluster_num):
        groups[i]=np.where(y==i)
        ids=asins[groups[i]]
        clusters.append(ids)
        links=[data[x]['url'] for x in ids]
        create_html(links,"templates/groups/group"+str(i)+".html")

    output_clusters(clusters,"outputs/clusters.csv")
Example No. 16
    def __init__(self, cluster_method=2, cluter_tag=False, train_path=None, event_info_path=None, city_id=None):
        self.loss_choice = 0      # 0:reg; 1:pairwise ranking
        self.ndim = 20
        self.tr_method = 0        # 0:SGD1; 1:SGD2
        self.cluster_method = cluster_method   # 0:DPGMM; 1:GMM; 2:K-means
        self.n_components = 20
        self.city_id = city_id

        # SGD
        self.niters1 = 10
        self.lr1 = 0.01
        self.lambda1 = 0.001
        self.neg_num1 = 5
        self.beta1 = 1
        self.alpha1 = 1
        self.ins_weight = [self.beta1, self.alpha1]

        pois = []
        if cluter_tag == True:
            events = set([entry[1] for entry in csv.reader(open(train_path, "r"))])
            for entry in csv.reader(open(event_info_path, "r")):
                event = entry[0]
                if event in events:
                    poi = map(float, entry[3].split(" "))
                    pois.append(poi)
                    if not checkGeoScope(poi, self.city_id):
                        print 'Invalid location'
                        sys.exit(1)
            if self.cluster_method == 0:
                cluster = DPGMM(n_components=500,
                                covariance_type='diag',
                                alpha=1,
                                n_iter=50)
                cluster.fit(pois)
                centers = removeDup(cluster.means_)
                outputCenterforVis(centers)
                self.n_components = len(centers)
                cluster_fd = open(settings["DPGMM_CLUSTER"], "wb")
                pickle.dump([centers, None], cluster_fd)
                self.model_path = settings["GEOMF"]
                outputCenterforVis(centers)
            elif self.cluster_method == 1:
                cluster = GMM(n_components = self.n_components,
                              covariance_type='diag',
                              min_covar=1e-7,
                              n_init=10,
                              random_state=0,
                              n_iter=100)
                cluster.fit(pois)
                outputCenterforVis(cluster.means_)
                labels = deterClusterRel(pois, cluster.means_)
                #showNumInEachCluster(labels, self.n_components)
                dis_variances = calDisVariance(self.n_components, labels, pois)
                dis_variances = smoothVar(dis_variances)
                covars = smoothVar(cluster.covars_)
                cluster_fd = open(settings["GMM_CLUSTER"], "wb")
                pickle.dump([cluster.means_, covars, dis_variances], cluster_fd)
            elif self.cluster_method == 2:
                cluster = KMeans(n_clusters = self.n_components,
                                 max_iter=300,
                                 init='k-means++')
                cluster.fit(pois)
                means, variances= calCenterCov(self.n_components, cluster.labels_, pois)
                outputCenterforVis(means)
                dis_variances = calDisVariance(self.n_components, cluster.labels_, pois)
                variances = smoothVar(variances)
                dis_variances = smoothVar(dis_variances)
                cluster_fd = open(settings["KMEANS_CLUSTER"], "wb")
                pickle.dump([means, variances, dis_variances], cluster_fd)
            else:
                print 'Invalid choice of clustering method'
                sys.exit(1)
Example No. 17
#print 'feature num', len(feature_idx)
#fn = fn[:, feature_idx]
#X = StandardScaler().fit_transform(fn)

fold = 3
kf = StratifiedKFold(label, n_folds=fold, shuffle=True)
#kf = KFold(len(label), n_folds=fold, shuffle=True)
clf = RFC(n_estimators=100, criterion='entropy')
rounds = 1
acc_sum = [[] for i in range(fold)]
for train, test in kf:
    train_fn = fn[train]
    #n_class = len(np.unique(label[train]))

    d = DPGMM(n_components=50, covariance_type='spherical',alpha=10)
    d.fit(train_fn)
    #print 'mixture mean', d.means_
    preds = d.predict(train_fn)
    print '# of M by DP', len(np.unique(preds))
    acc_sum[0].append(ARI(label[train], preds))
    #acc_sum[0].append(SS(train_fn, preds))

    #n_class = len(np.unique(preds))
    n_class = 32
    g = GMM(n_components=n_class, covariance_type='spherical', init_params='wmc', n_iter=100)
    g.fit(train_fn)
    #g.means_ = np.array([x_train[y_train == i].mean(axis=0) for i in np.unique(y_train)])
    preds = g.predict(train_fn)
    #prob = np.sort(g.predict_proba(train_fd))
    acc_sum[1].append(ARI(label[train], preds))
    #acc_sum[1].append(SS(train_fn, preds))
Example No. 18
axes[1].imshow(feats_log,
               aspect='auto', origin='low', interpolation='nearest',
               cmap=plt.cm.plasma)
axes[2].imshow(feats_log_normed,
               aspect='auto', origin='low', interpolation='nearest',
               cmap=plt.cm.plasma)
fig.tight_layout()


# Clustering with DP-GMM
n_components = 32
dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32, alpha=1000,
              covariance_type='diag', verbose=True)
dpgmm.fit(feats_log.T)
preds_proba = dpgmm.predict_proba(feats_log.T)
preds = np.argmax(preds_proba, axis=1)
np.unique(preds)
# resynthesis by sampling from clusters
resynthesis = dpgmm.means_[preds.astype(int), :]

fig, axes = plt.subplots(4, 1, figsize=(18, 8))
axes[0].set_title(feature)
axes[1].set_title('Prediction Probability')
axes[2].set_title('Resynthesis')
axes[3].set_title('Max(Prediction Probability)')

axes[0].imshow(feats_log,
               aspect='auto', origin='low', interpolation='nearest',
               cmap=plt.cm.plasma)
Example No. 19
def plotClustering(fullpath,
                   order=1,
                   sr=4,
                   cutoff=.1,
                   n_singv=3,
                   feature='chroma',
                   dim_red='SVD',
                   round_to=0,
                   normalize=1,
                   scale=1,
                   length=4,
                   clustering='KMEANS'):
    feat = {}
    print(
        'Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
        'n_singv {}, scale {} normalize {}, round_to {}'.format(
            fullpath, feature, order, sr, cutoff, n_singv, scale, normalize,
            round_to))
    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(filename,
                                               file_ext,
                                               feature,
                                               scale,
                                               round_to,
                                               normalize,
                                               beat_sync=True,
                                               save=True)

    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat['{}(LPF)'.format(dim_red)] = dim_red_fn(dim_red, feat['LPF'], n_singv)
    feat['LPF({})'.format(dim_red)] = lpf(feat[dim_red], cutoff, sr, order)
    feat['{}-LPF'.format(feature)] = feat[feature] - feat['LPF']
    feat['LPF({}-LPF)'.format(feature)] = lpf(feat['{}-LPF'.format(feature)],
                                              cutoff, sr, order)
    feat['{}(LPF({}-LPF))'.format(dim_red, feature)] = dim_red_fn(
        dim_red, feat['LPF({}-LPF)'.format(feature)], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))

    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])
    i = 0
    print "\tPlot data and pre-processing"
    for name in (feature, '{}-LPF'.format(feature), '{}(LPF)'.format(dim_red),
                 'LPF({})'.format(dim_red), 'LPF({}-LPF)'.format(feature),
                 '{}(LPF({}-LPF))'.format(dim_red, feature)):
        data = feat[name]

        data_wide = np.array([
            feat[name][m:m + length, :]
            for m in xrange(len(feat[name]) - length)
        ])
        data_wide = data_wide.reshape(data_wide.shape[0],
                                      data_wide.shape[1] * data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            K_MIN, K_MAX = 2, 16
            KM = [
                KMeans(n_clusters=l, init='k-means++').fit(data_wide)
                for l in xrange(K_MIN, K_MAX + 1)
            ]

            # compute scores to assess fit
            scores_bic = [
                computeBic(KM[x], data_wide) for x in xrange(len(KM))
            ]
            scores_inertia = [KM[x].inertia_ for x in xrange(len(KM))]
            scores_silhouette = [
                silhouette_score(data_wide, KM[x].labels_, metric='euclidean')
                for x in xrange(len(KM))
            ]

            # get best clusters
            idx_best_bic = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_bic))[0])
            idx_best_inertia = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_silhouette))[0])
            idx_best = int(
                np.median(
                    (idx_best_bic, idx_best_inertia, idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = KM[idx_best].cluster_centers_
            centroid_idx = KM[idx_best].labels_
        elif clustering == 'DPGMM':
            n_components = 12
            dpgmm = DPGMM(n_components=n_components,
                          tol=1e-3,
                          n_iter=32,
                          alpha=1000,
                          covariance_type='diag',
                          verbose=True)
            dpgmm.fit(data_wide)

            # compute scores to assess fit
            scores_bic = dpgmm.bic(data_wide)
            # silhouette is not meaningful for a single DPGMM fit, use a placeholder
            scores_silhouette = [0.0]

            # get clusters and cluster allocations given best K
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)
        # plot data
        if data.shape[1] == 3:
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T

        ax = fig.add_subplot(gs[i, :])
        ax.set_title(name)
        ax.imshow(data,
                  interpolation='nearest',
                  origin='low',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        xlabels = [
            "{}:{}".format(int(x / 60), int(x % 60))
            for x in beat_times[::step_size]
        ]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        for c in xrange(changes.shape[0] - 1):
            if changes[c] and changes[c + 1]:
                changes[c] = False
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[i + 1, 0])
        ax.set_title(name)

        if centroids.shape[1] == 3:
            centroids = centroids.reshape(1, centroids.shape[0],
                                          centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            centroids = centroids.reshape(1, centroids.shape[0] * length,
                                          centroids.shape[1] / length)
        else:
            centroids = centroids.reshape(centroids.shape[0] * length,
                                          centroids.shape[1] / length).T
        ax.imshow(centroids,
                  interpolation='nearest',
                  origin='low',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        ax.set_xticks(xrange(0, centroids.shape[1], 4))
        ax.set_xticklabels(xrange(k_best))
        ax.grid(False)

        # plot elbow curve
        c = 1
        for k, v, idx in (('BIC', scores_bic, idx_best_bic),
                          ('INERTIA', scores_inertia,
                           idx_best_inertia), ('SILHOUETTE', scores_silhouette,
                                               idx_best_silhouette)):
            ax = fig.add_subplot(gs[i + 1, c])
            ax.set_title('{}, {} best K {}'.format(name, k, idx + K_MIN))
            ax.plot(xrange(K_MIN, K_MAX + 1), v, 'b*-')
            ax.set_xlim((K_MIN, K_MAX + 1))
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('Score')
            ax.grid(True)
            ax.axvline(idx + K_MIN, color='r')
            c += 1
        i += 2
        """
        if 'SVD' in name:
            # scikit-image clustering
            segments_slic = slic(
                data, n_segments=10, compactness=10, sigma=1)
            segments_quickshift = quickshift(
                data, kernel_size=3, max_dist=6, ratio=0.5)

            ax = fig.add_subplot(gs[k, 0])
            ax.set_title('{} with quickshift'.format(name))
            ax.imshow(mark_boundaries(data, segments_quickshift, mode='outer'),
                      interpolation='nearest',
                      origin='low',
                      aspect='auto',
                      cmap=plt.cm.Oranges)
            ax.set_xticks(ts[::step_size])
            ax.set_xticklabels(beat_times[::step_size], rotation=60)
            ax.grid(False)

            ax = fig.add_subplot(gs[k, 1])

            ax.set_title('{} with slic'.format(name))
            ax.imshow(mark_boundaries(data, segments_slic, mode='outer'),
                      interpolation='nearest',
                      origin='low',
                      aspect='auto',
                      cmap=plt.cm.Oranges)
            ax.set_xticks(ts[::step_size])
            ax.set_xticklabels(beat_times[::step_size], rotation=60)
            ax.grid(False)
            k += 1
        """

    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))

    # save with large size
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))
    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}_small.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))

    plt.close(fig)
Example No. 21
File: DC.py Project: clouizos/AIR
    data_cluster_train_ds = data_cluster_train

    """if you want clustering on the dissimilarity space uncomment
       below and change accordingly"""
    # print 'Calculating dissimilarity space for training queries...'
    # data_cluster_train_ds = sc.pdist(data_cluster_train, 'euclidean')
    # data_cluster_train_ds = sc.squareform(data_cluster_train_ds)

    # # plt.figure(1)
    # # plt.imshow(data_cluster_train_ds)
    # # plt.colorbar()
    # # plt.title('Initial dissimilarity')

    print 'Training a Dirichlet Process Gaussian Mixture model...'
    dpgmm = DPGMM(alpha=1.0, n_iter=100, n_components=50)
    dpgmm.fit(data_cluster_train_ds)
    prediction = dpgmm.predict(data_cluster_train_ds)
    clusters = np.unique(prediction)

    print 'Found %i clusters!' % clusters.shape[0]
    print clusters

    """create the reordered input data according to the clusters
      it is only needed if you want to visuallize the clustering
      afterwards"""
    #data_cluster = np.zeros((1, data_cluster_train.shape[1]))

    # each cluster is a list of lists that contains the indices
    # of the queries for each cluster
    each_cluster = []
    for i in xrange(clusters.shape[0]):
Example No. 23
class DPGMMClusterModel(BaseEstimator, TransformerMixin):

    def __init__(self, w2v_model=None, n_components=None, no_above=0.9, no_below=8, dataname="", stoplist=None,
                 dictionary=None, recluster_thresh=1000, alpha=5):
        self.w2v_model = w2v_model
        self.no_above = no_above
        self.no_below = no_below
        self.alpha = alpha
        self.n_components = n_components
        self.n_sub_components = int(n_components / 2)
        self.stoplist = stoplist
        self.dataname = dataname
        self.dictionary = dictionary
        self.dpgmm = None
        self.scaler = None
        self.cluster_info = None
        # a list of sub-clusterers
        self.feature_crd = {}
        self.subdpgmms = []
        self.reclustered = []
        self.recluster_thresh = recluster_thresh

    def should_cluster_word(self, word):
        return (word in self.dictionary.token2id) and (len(word) > 1) and \
               (self.w2v_model is None or word in self.w2v_model) and \
               (self.stoplist is None or word not in self.stoplist)

    # constructs a dictionary and a DPGMM model on 9000 middle frequency words from X
    # X is a sequence of texts
    def fit(self, X, y=None):
        # either construct a dictionary from X, trim it
        if self.dictionary is None:
            self.dictionary = corpora.Dictionary(X)
        # or use an existing dictionary and trim the given set of words
        self.dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above, keep_n=9000)

        if self.w2v_model is None:
            w2v_corpus = [[word for word in text if self.should_cluster_word(word)] for text in X]
            self.w2v_model = w2v_models.build_word2vec(w2v_corpus, size=100, window=10, min_count=self.no_below,
                                                       dataname=self.dataname+"_dpgmm")

        word_list = np.array([word for word in self.dictionary.token2id.iterkeys() if self.should_cluster_word(word)])

        # This was  reclustering clause - I need to re-write this
        # else:
        #    # note the double loop here!!
        #    word_list = np.array([word for text in X for word in text if self.should_cluster_word(word)])

        # construct a list of words to cluster
        # remove rare and frequent words
        # remove words of length 1
        # remove stopwords
        vec_list = [self.w2v_model[word] for word in word_list]

        logging.info("DPGMM received %i words" % len(vec_list))

        # save word representations
        filename = "w2v_vocab_%s_%.1f_%.0f.lcsv" % (self.dataname, self.no_above, self.no_below)
        io.save_words_representations(filename, word_list, vec_list)

        self.scaler = StandardScaler()
        vecs = self.scaler.fit_transform(np.array(vec_list))

        self.dpgmm = DPGMM(n_components=self.n_components, covariance_type='diag', alpha=self.alpha,
                           n_iter=1000, tol=0.0001)
        self.dpgmm.fit(vecs)
        logging.info("DPGMM converged: %s" % self.dpgmm.converged_)


        # save information about found clusters
        self.cluster_info = []
        y_ = self.dpgmm.predict(vecs)

        for i, cluster_center in enumerate(self.dpgmm.means_):
            cluster_words = word_list[y_ == i]
            cluster_size = len(cluster_words)
            if cluster_size > self.recluster_thresh and self.recluster_thresh > 0:
                logging.info("DPGMM: reclustering %i words for cluster %i" % (len(cluster_words), i))
                sub_dpgmm = DPGMMClusterModel(w2v_model=self.w2v_model,
                                              n_components=self.n_sub_components,
                                              dictionary=self.dictionary,
                                              dataname="%s-%i" % (self.dataname, i), stoplist=self.stoplist)
                # recluster words.  Note the double array
                sub_dpgmm.fit([cluster_words])
                self.subdpgmms.append(sub_dpgmm)
                self.reclustered.append(i)
            if cluster_size > 0:
                #cluster_center_original = self.scaler.inverse_transform(cluster_center)
                #similar_words = self.w2v_model.most_similar_cosmul(positive=[cluster_center_original], topn=cluster_size)
                #central_words = [word for word, _ in similar_words if word in cluster_words]
                central_words = cluster_words[0:10]
            else:
                central_words = []
            self.cluster_info.append({'cnt': i, 'size': cluster_size, 'words': central_words})

        filename = "clusters_%s_%i_%.1f_%.0f.txt" % (self.dataname, self.n_components, self.no_above, self.no_below)
        io.save_cluster_info(filename, self.cluster_info)

        # setting up the coordinates for the features
        self.feature_crd = {'global': range(0, self.n_components),
                            'reclustered': [i for i in range(0, self.n_components + self.n_sub_components*len(self.reclustered))
                                            if i not in self.reclustered]}

        return self

    # calculate cluster counts for one text
    def clusterize(self, text):
        word_list = [word for word in text if self.should_cluster_word(word)]
        vec_list = np.array([self.w2v_model[word] for word in word_list])
        bincounts = np.zeros((self.n_components+self.n_sub_components*len(self.reclustered),))

        if len(vec_list) > 0:
            # assign words to clusters
            predictions = self.dpgmm.predict(self.scaler.transform(np.array(vec_list)))
            global_bincount = np.bincount(predictions, minlength=self.n_components)
            # re-assign words in large clusters
            bincounts[0:self.n_components] = global_bincount #reshape((1,len(global_bincount)))
            start = self.n_components
            for i, subdpgmm in zip(self.reclustered, self.subdpgmms):
                # if words in respective clusters exists - recluster them
                vecs_torecluster = vec_list[predictions == i]
                if len(vecs_torecluster) > 0:
                    predictions = subdpgmm.dpgmm.predict(subdpgmm.scaler.transform(np.array(vecs_torecluster)))
                    bincounts[start:start+subdpgmm.dpgmm.n_components] = \
                        np.bincount(predictions, minlength=subdpgmm.dpgmm.n_components) #.reshape((1, subdpgmm.n_components))
                start += subdpgmm.dpgmm.n_components
                # erase the count in the global counts

        # returns a vector of cluster bin counts: [ global, reclustered1, reclustered2, ...]
        return bincounts.reshape((1, len(bincounts)))


    # for a  text, constructs a bincount of clusters present in the sentence
    # X is a list of texts.  One text is one string! Not tokenized
    def transform(self, X):

        # Text pre-processing
        x_clean = [tu.normalize_punctuation(text).split() for text in X]
        logging.info("DPGGM: Text prepocessed")

        # Vectorize using W2V model
        if self.dpgmm is not None:
            logging.info("Vectorizing a corpus")
            size = self.w2v_model.layer1_size
            if len(X) > 0:
                vecs = np.concatenate([self.clusterize(z) for z in x_clean], axis=0)
            else:
                vecs = np.zeros(size).reshape((1, size))
            logging.info("DPGMM: returning pre-processed data of shape %s" % (vecs.shape, ))
        else:
            logging.info("W2V Averaged: no model was provided.")
            vecs = np.zeros((len(X), 1))

        return vecs
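A hypothetical end-to-end use of DPGMMClusterModel. The toy corpus is invented, and the project-specific helper modules the class relies on (w2v_models, tu, io) are assumed to be importable.

# Sketch only: the corpus is a toy placeholder and the original project's helpers
# (w2v_models, tu, io) must be available for fit/transform to run.
texts = [["gaussian", "mixture", "clustering"],
         ["deep", "learning", "features"]] * 300

clusterer = DPGMMClusterModel(n_components=30, no_below=1, no_above=1.0,
                              dataname="toy", recluster_thresh=0)
clusterer.fit(texts)
features = clusterer.transform(["gaussian mixture clustering of features"])
print(features.shape)    # (1, n_components) cluster bin-counts for the text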
Example No. 24
max_components = 8

# Count the number of clusters the DPGMM chooses
num_clusters = []
size_sample = []

# Try clustering at different sample sizes
for iteration in range(int(np.floor(len(gaussian_data) / 10)) - 2):
    # Number of samples to use
    max_sample_value = ((iteration + 2) * 10) 
    sample_set = gaussian_data[0:max_sample_value]
    size_sample.append(max_sample_value - 0)
    
    # Fit Dirichlet Process Gaussian Mixture Model
    dpgmm_model = DPGMM(n_components = max_components, n_iter=1000, alpha=1.0)
    fitted_dpgmm = dpgmm_model.fit(sample_set)
    dpgmm_predictions = fitted_dpgmm.predict(gaussian_data)
    num_clusters.append(len(set(dpgmm_predictions)))
    
    # Append predicted labels to dataframe
    gaussian_data['predicted'] = dpgmm_predictions

# Give a unique color to each category
unique_categories = list(set(gaussian_data['predicted']))
color_labels = ['r', 'y', 'g', 'b', 'c', 'm', 'k', 'w']
colors = [color_labels[unique_categories.index(i)] for i in gaussian_data['predicted']]

# Plot predicted data
plt.scatter(gaussian_data['x'], gaussian_data['y'], c=colors)
plt.xlim([-12,12])
plt.ylim([-12,12])
Example No. 25
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180* np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color='m', alpha=0.5, clip_box = ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title(u'GMM', fontsize=20)
    plt.grid(True)

    # DPGMM
    n_components = 3
    dpgmm = DPGMM(n_components=n_components, alpha=1, covariance_type='full', random_state=0)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm._get_covars()
    print 'DPGMM means = \n', centers
    print 'DPGMM covariances = \n', covs
    y_hat = dpgmm.predict(x)
    # print y_hat

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')

    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:
Example No. 26
    data_cluster_test = query_features(testing, 15, 10, 23, data)
    data_cluster_train_ds = data_cluster_train
    """if you want clustering on the dissimilarity space uncomment
       below and change accordingly"""
    # print 'Calculating dissimilarity space for training queries...'
    # data_cluster_train_ds = sc.pdist(data_cluster_train, 'euclidean')
    # data_cluster_train_ds = sc.squareform(data_cluster_train_ds)

    # # plt.figure(1)
    # # plt.imshow(data_cluster_train_ds)
    # # plt.colorbar()
    # # plt.title('Initial dissimilarity')

    print 'Training a Dirichlet Process Gaussian Mixture model...'
    dpgmm = DPGMM(alpha=1.0, n_iter=100, n_components=50)
    dpgmm.fit(data_cluster_train_ds)
    prediction = dpgmm.predict(data_cluster_train_ds)
    clusters = np.unique(prediction)

    print 'Found %i clusters!' % clusters.shape[0]
    print clusters
    """create the reordered input data according to the clusters
      it is only needed if you want to visuallize the clustering
      afterwards"""
    #data_cluster = np.zeros((1, data_cluster_train.shape[1]))

    # each cluster is a list of lists that contains the indices
    # of the queries for each cluster
    each_cluster = []
    for i in xrange(clusters.shape[0]):
        cluster = data_cluster_train[prediction == clusters[i], :]
Example No. 27
#labeled_datafile.close()
unlabeled_datafile.close()

for chunks in np.arange(1, opts.size, step = 3):
  # Sample the specified number of points from X_unlabeled
  size = np.cumsum(chunk_sizes[:chunks])[-1]
  
  # Fit a Dirichlet process mixture of Gaussians using up to  ten components
  dpgmm = DPGMM(n_components=10, alpha=10.0, covariance_type='full')
  indices = np.arange(X_unlabeled.shape[0])
  np.random.shuffle(indices)
  X = X_unlabeled[indices[:size],]
  
  print("fitting a model with", size, "data points")
  with timeit():
    dpgmm.fit(X)
  print("Done!")
  print("AIC for this model & data: ", dpgmm.aic(X))
  print("BIC for this model & data: ", dpgmm.bic(X))
  Y_hat = dpgmm.predict(X)
  print ("Model assigned points to", np.max(Y_hat), "components")
  

# How can I best check this out? 
#color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
#for i, (clf, title) in enumerate([(gmm, 'GMM'),
                                  #(dpgmm, 'Dirichlet Process GMM')]):
    #splot = plt.subplot(2, 1, 1 + i)
    #Y_ = clf.predict(X)
    #for i, (mean, covar, color) in enumerate(zip(
            #clf.means_, clf._get_covars(), color_iter)):
Example No. 29
def dpgmm_simple(X, init_numC, random_state):
    model = DPGMM(n_components = init_numC, n_iter=100, tol=0.000001, random_state=random_state)
    model.fit(X)
    y = model.predict(X)
    cluster_num = len(np.unique(y))
    return cluster_num, y
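An example call to dpgmm_simple on three synthetic Gaussian blobs; the data and seed are illustrative.

# Placeholder usage of dpgmm_simple; X is synthetic.
import numpy as np

X = np.vstack([np.random.randn(100, 2) + offset for offset in (0.0, 5.0, 10.0)])
cluster_num, labels = dpgmm_simple(X, init_numC=10, random_state=0)
print(cluster_num)               # number of clusters actually used
print(np.bincount(labels))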
Example No. 30
        Y = ds.target
    return X, Y

def test1():
    print 'test1'
    model = VDPGMM(T = 10, alpha = 1, max_iter = 50)
    X, Y = getXY('iris')
    model.fit(X)
    y = model.predict(X)
    print 'VDPGMM'
    print len(np.unique(y)), np.unique(y)
    print [np.sum(y == label) for label in np.unique(y)]

    from sklearn.mixture import DPGMM
    model = DPGMM(n_components = 10, alpha = 1, n_iter = 50)
    model.fit(X)
    y = model.predict(X)
    print 'DPGMM'
    print len(np.unique(y)), np.unique(y)
    print [np.sum(y == label) for label in np.unique(y)]

def test2():
    print 'test2'
    np.random.seed(1)
    X = np.concatenate((2 + np.random.randn(100, 2), 5 + np.random.randn(100, 2),  10 + np.random.randn(100, 2)))
    T = 10
    model = VDPGMM(T=T, alpha=.5, max_iter=100, thresh=1e-5)
    model.fit(X)
    
    plt.clf()
    h = plt.subplot()
Example No. 31
img = vgg16.preprocess_input(img.astype('float32'))
""" Scaling activations to fit random initialization scheme"""
actvs = get_activations(model, layer, img).squeeze()
actvs /= np.max(actvs) * 0.1
""" Clustering with dirichlet process Gaussian Mixture Model"""
dpgmm = DPGMM(n_components=50,
              alpha=1,
              verbose=2,
              tol=0.01,
              n_iter=250,
              min_covar=1e-6)
#dpgmm = BayesianGaussianMixture(n_components=50, covariance_type="diag", reg_covar = 1e-6,
#                                weight_concentration_prior_type="dirichlet_process",
#                                weight_concentration_prior=1, verbose=2,
#                                tol=0.01, max_iter=250, init_params='random',
#                                mean_precision_prior=actvs.std(),
#                                mean_prior=np.repeat(actvs.max()/5,actvs.shape[0]))

dpgmm.fit(
    np.transpose(actvs.reshape(actvs.shape[0],
                               actvs.shape[1] * actvs.shape[2])))
labels = dpgmm.predict(
    np.transpose(actvs.reshape(actvs.shape[0],
                               actvs.shape[1] * actvs.shape[2])))
labels = labels.reshape((actvs.shape[1], actvs.shape[2]))

plt.subplot(1, 2, 2)
plt.imshow(labels, interpolation="nearest")
plt.title('Labelmap from layer ' + str(layer))
Example No. 32
train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1]
y = y.astype('int')
test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# In[5]:
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})

print('dirichlet process gaussian mixture begins****************')
dpgmm = DPGMM(n_components=3)
print('fitting****************')
dpgmm_train = dpgmm.fit(X, y)
print('predicting on train****************')
dpgmm_X_prediction = dpgmm.predict_proba(X)[:, 1]
print('predicting on test****************')
dpgmm_X_test_prediction = dpgmm.predict_proba(X_test)[:, 1]
tr_te_concatenated = np.concatenate(
    [dpgmm_X_prediction, dpgmm_X_test_prediction])
df['dirichlet_process_gaussian_mixture'] = tr_te_concatenated

print('final tr_te shape', df.shape)
print(df.head())

df.to_csv('dirichlet_process_gaussian_mixture_tr_te.csv', index=False)

print(df.head())