Example #1
        
        while iter_ < int(np.floor(x_valid.shape[0] / batch_size)):
    
            batch_x = x_valid[iter_*batch_size: (iter_+1)*batch_size, :].T.reshape(1, sequence_len, batch_size)
            batch_y = y_valid[np.newaxis, iter_*batch_size: (iter_+1)*batch_size]
                
            errors_valid[iter_] = sess.run(prediction-batch_y, feed_dict={input_: batch_x,
                                                                          target: batch_y})

            iter_ +=  1
        
        # estimate the mean and standard deviation of the error vector;
        #  because the batch size may differ from 1 and the array was
        #   pre-allocated per batch, trim the unused zero entries first
        errors_valid = errors_valid[:iter_].flatten()
        gaussian_mixture = mixture.GaussianMixture(n_components=n_mixtures)
        gm = gaussian_mixture.fit(errors_valid.reshape(-1, 1))
        means_valid = gm.means_[:,0]
        stds_valid = gm.covariances_[:,0,0]**.5  # take the square root: covariances_ holds variances
        weights_valid = gm.weights_
                
        # test
        predictions = np.zeros(shape=(int(np.floor(x_test.shape[0] / batch_size)), batch_size))
        y_test = y_test[:x_test.shape[0]]

        # anomalies' statistics
        gaussian_error_statistics = np.zeros(shape=(len(predictions), batch_size))
        errors_test = np.zeros(shape=(len(predictions), batch_size))
        threshold = [scistats.norm.pdf(mean-sigma_threshold*std, mean, std) for (mean, std) in zip(means_valid, stds_valid)]
        anomalies = np.array([np.array([False for _ in range(batch_size)]) for _ in range(len(y_test))])
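A distilled sketch of the thresholding idea above: fit a GaussianMixture to a vector of prediction errors and derive a per-component density threshold. The synthetic errors and the sigma_threshold value are assumptions for illustration, not part of the original example.

import numpy as np
import scipy.stats as scistats
from sklearn import mixture

errors_valid = np.random.randn(500)          # stand-in for validation errors
n_mixtures, sigma_threshold = 2, 3           # assumed hyperparameters

gm = mixture.GaussianMixture(n_components=n_mixtures).fit(errors_valid.reshape(-1, 1))
means_valid = gm.means_[:, 0]
stds_valid = gm.covariances_[:, 0, 0] ** .5  # sqrt of the 1-D variances
# density at mean - k*sigma is used as the per-component anomaly threshold
threshold = [scistats.norm.pdf(m - sigma_threshold * s, m, s)
             for m, s in zip(means_valid, stds_valid)]
print(threshold)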
        
Example #2
def make_graphical_experiments(algorithms=[],
                               n_samples=1500,
                               run_scikit_algorithms=True,
                               SAVE_PLOTS=False,
                               results_file_name=''):
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    noisy_square = np.random.rand(n_samples, 2), None
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    anisotropic_blobs = (X_aniso, y)
    varied_blobs = make_blobs(n_samples=n_samples,
                              cluster_std=[1.0, 2.5, 0.5],
                              random_state=random_state)

    if run_scikit_algorithms:
        scikit_algorithms = range(9)
    else:
        scikit_algorithms = []
    plt.figure(figsize=((len(scikit_algorithms) + len(algorithms)) * 2 + 3,
                        12.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)
    plot_num = 1

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 50,
        'n_clusters': 3
    }
    datasets = [(noisy_circles, {
        'damping': .77,
        'preference': -240,
        'quantile': .2,
        'n_clusters': 2
    }), (noisy_moons, {
        'damping': .75,
        'preference': -220,
        'n_clusters': 2
    }), (varied_blobs, {
        'eps': .18,
        'n_neighbors': 2
    }), (anisotropic_blobs, {
        'eps': .15,
        'n_neighbors': 2
    }), (blobs, {}), (noisy_square, {})]

    for i_dataset, (dataset, algo_params) in enumerate(datasets):
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

        # connectivity matrix for structured Ward
        G, pos, labels = generate_dataset_from_euclidean_points(
            X,
            similarity_measure=lambda p, q: np.exp(-(np.linalg.norm(p - q) / 1.
                                                     )**2),
            threshold=.8)
        G, pos, labels = connect_dataset_connected_components(G, pos, labels)
        connectivity = nx.to_scipy_sparse_matrix(G)
        print("Dataset: ", i_dataset)

        if run_scikit_algorithms:
            ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
            two_means = cluster.MiniBatchKMeans(
                n_clusters=params['n_clusters'])
            ward = cluster.AgglomerativeClustering(
                n_clusters=params['n_clusters'],
                linkage='ward',
                connectivity=connectivity)
            spectral = cluster.SpectralClustering(
                n_clusters=params['n_clusters'],
                eigen_solver='arpack',
                affinity="nearest_neighbors")
            dbscan = cluster.DBSCAN(eps=params['eps'])
            affinity_propagation = cluster.AffinityPropagation(
                damping=params['damping'], preference=params['preference'])
            average_linkage = cluster.AgglomerativeClustering(
                linkage="average",
                affinity="cityblock",
                n_clusters=params['n_clusters'],
                connectivity=connectivity)
            birch = cluster.Birch(n_clusters=params['n_clusters'])
            gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                          covariance_type='full')

            scikit_algorithms = [('MiniBatchKMeans', two_means),
                                 ('AffinityProp', affinity_propagation),
                                 ('MeanShift', ms),
                                 ('SpectralClustering', spectral),
                                 ('Ward', ward),
                                 ('AggloClustering', average_linkage),
                                 ('DBSCAN', dbscan), ('Birch', birch),
                                 ('GaussianMixture', gmm)]

            for name, algorithm in scikit_algorithms:

                t0 = time()
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message="the number of connected components of the " +
                        "connectivity matrix is [0-9]{1,2}" +
                        " > 1. Completing it to avoid stopping the tree early.",
                        category=UserWarning)
                    warnings.filterwarnings(
                        "ignore",
                        message=
                        "Graph is not fully connected, spectral embedding" +
                        " may not work as expected.",
                        category=UserWarning)
                    algorithm.fit(X)
                t1 = time()

                if hasattr(algorithm, 'labels_'):
                    y_pred = algorithm.labels_.astype(int)
                else:
                    y_pred = algorithm.predict(X)

                plt.subplot(len(datasets),
                            len(scikit_algorithms) + len(algorithms), plot_num)
                if i_dataset == 0:
                    plt.title(name, size=18)

                colors = np.array(
                    list(
                        islice(
                            cycle([
                                '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                                '#a65628', '#984ea3', '#999999', '#e41a1c',
                                '#dede00'
                            ]), int(max(y_pred) + 1))))
                plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
                plt.xlim(-2.5, 2.5)
                plt.ylim(-2.5, 2.5)
                plt.xticks(())
                plt.yticks(())
                # plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                #          transform=plt.gca().transAxes, size=15,
                #          horizontalalignment='right')
                plot_num += 1

        for name, algorithm in algorithms:

            t0 = time()
            clusters = algorithm(G)
            t1 = time()
            y_pred = clusters_list2clusters_dict(clusters).values()

            plt.subplot(len(datasets),
                        len(scikit_algorithms) + len(algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)
            colors = np.array(
                list(
                    islice(
                        cycle([
                            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                            '#a65628', '#984ea3', '#999999', '#e41a1c',
                            '#dede00'
                        ]), int(max(y_pred) + 1))))
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            # plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
            #          transform=plt.gca().transAxes, size=15,
            #          horizontalalignment='right')
            plot_num += 1

    if SAVE_PLOTS:
        plt.savefig(results_file_name + ".pdf", bbox_inches='tight')
        plt.savefig(results_file_name + ".png", bbox_inches='tight')
    else:
        plt.show()
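A minimal invocation of the harness above might look like the following sketch; passing an empty algorithms list exercises only the scikit-learn reference algorithms (the output file name is an arbitrary assumption).

make_graphical_experiments(algorithms=[],
                           n_samples=1500,
                           run_scikit_algorithms=True,
                           SAVE_PLOTS=True,
                           results_file_name='clustering_comparison')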
Example #3
                           M=64,
                           num_gpus=arguments.num_gpus,
                           arguments=arguments)
            if arguments.cuda:
                mdl.cuda()
            mdl.load_state_dict(torch.load(path))

            # train the base distribution
            # note: '0 & ...' always evaluates to 0 (falsy), so this
            # cached-GMM branch is effectively disabled
            if 0 & os.path.exists(path2 + 'Kmog{}.gmm'.format(Kcomps)):
                GMM = pickle.load(
                    open(path2 + 'Kmog{}.gmm'.format(Kcomps), 'rb'))
            else:
                if use_gmms[0]:
                    GMM = mix.GaussianMixture(n_components=Kcomps,
                                              verbose=1,
                                              n_init=3,
                                              max_iter=200,
                                              covariance_type='diag')
                    GMM.fit(all_hhats.data.cpu().numpy())
                    pickle.dump(
                        GMM, open(path2 + 'Kmog{}.gmm'.format(Kcomps), 'wb'))
                    mdl.initialize_GMMparams(GMM=GMM)

                if use_gmms[1]:
                    BGMM = mix.GaussianMixture(n_components=Kcomps,
                                               verbose=1,
                                               n_init=3,
                                               max_iter=200,
                                               covariance_type='full')
                    BGMM.fit(all_hhats.data.cpu().numpy())
                    pickle.dump(
Example #4
    y.append(int(l.split(",")[-2]))

X = np.array(X, dtype=np.float32)
scaler = StandardScaler()
scaler.fit(X)

X = scaler.transform(X)
pca = PCA(n_components=2)
pca.fit(X)
dr_X = pca.transform(X)

#plot_bic(X)

gmm = mixture.GaussianMixture(n_components=2, covariance_type='tied')
gmm.fit(dr_X)

newX = []

for pt in dr_X:
    newX.append(gmm.predict(pt.reshape(1, -1))[0])

newX = np.array(newX)

newX = to_categorical(newX)
y = to_categorical(y)

INIT_LR = 5E-4
EPOCHS = 500
BS = 50
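The pattern in this example, feeding GMM cluster assignments back in as a categorical feature, can be shown in isolation. A sketch on synthetic data; the batched predict call replaces the original per-point loop, and np.eye stands in for keras' to_categorical.

import numpy as np
from sklearn import mixture
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = np.random.rand(200, 10).astype(np.float32)
dr_X = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))

gmm = mixture.GaussianMixture(n_components=2, covariance_type='tied').fit(dr_X)
labels = gmm.predict(dr_X)      # one vectorized call instead of a per-point loop
one_hot = np.eye(2)[labels]     # same effect as to_categorical(labels)
print(one_hot[:5])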
Example #5
    def init_std(self,
                 x,
                 gmm_mu=None,
                 gmm_cv=None,
                 weights=None,
                 inv_maxstd=1e-1,
                 beta_constant=0.5,
                 component_overwrite=None,
                 beta_override=None,
                 n_samples=2,
                 z_override=None,
                 sigma=None):
        if component_overwrite is not None:
            self.num_components = component_overwrite
        if z_override is None:
            with torch.no_grad():
                mu, lv = torch.chunk(self.encoder(x.to(self.device)),
                                     chunks=2,
                                     dim=-1)
                z = td.Normal(loc=mu, scale=lv.mul(0.5).exp() + 1e-10).sample(
                    [n_samples])
                z = z.reshape(int(x.shape[0] * n_samples), z.shape[-1])
        else:
            z = z_override
        N, D = x.shape
        d = z.shape[1]
        inv_maxstd = inv_maxstd  # 1.0 / x.std(dim=0).mean() # x.std(dim=0).mean() #D*x.var(dim=0).mean()

        if gmm_mu is None and gmm_cv is None and weights is None:
            from sklearn import mixture
            clf = mixture.GaussianMixture(n_components=self.num_components,
                                          covariance_type='spherical')
            clf.fit(z.cpu().numpy())
            self.gmm_means = clf.means_
            self.gmm_covariances = clf.covariances_
            self.clf_weights = clf.weights_
        else:
            print('loading weights...')
            self.gmm_means = gmm_mu
            self.gmm_covariances = gmm_cv
            self.clf_weights = weights
        if beta_override is None:
            beta = beta_constant.cpu() / torch.tensor(
                self.gmm_covariances, dtype=torch.float, requires_grad=False)
        else:
            beta = beta_override
        self.beta = beta.to(self.device)
        self.dec_std = nnj.Sequential(
            nnj.RBF(d,
                    self.num_components,
                    points=torch.tensor(self.gmm_means,
                                        dtype=torch.float,
                                        requires_grad=False),
                    beta=self.beta),  # d --> num_components
            nnj.PosLinear(self.num_components, 1,
                          bias=False),  # num_components --> 1
            nnj.Reciprocal(inv_maxstd),  # 1 --> 1
            nnj.PosLinear(1, D)).to(self.device)  # 1 --> D
        if sigma is not None:
            self.dec_std[0] = nnj.RBF_variant(
                d,
                self.gmm_means.shape[0],
                points=torch.tensor(self.gmm_means,
                                    dtype=torch.float,
                                    requires_grad=False),
                beta=self.beta.requires_grad_(False),
                boxwidth=sigma).to(self.device)
        with torch.no_grad():
            self.dec_std[1].weight[:] = (
                (torch.tensor(self.clf_weights, dtype=torch.float).exp() -
                 1.0).log()).to(self.device)
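Stripped of the decoder wiring, the GMM step in init_std reduces to fitting a spherical mixture to encoded latent samples and reading off means, variances and weights. A sketch under the assumption of a 2-D latent space and 10 components:

import numpy as np
from sklearn import mixture

z = np.random.randn(1000, 2)    # stand-in for samples drawn from the encoder
clf = mixture.GaussianMixture(n_components=10, covariance_type='spherical')
clf.fit(z)
# 'spherical' stores one variance per component: shapes (10, 2), (10,), (10,)
print(clf.means_.shape, clf.covariances_.shape, clf.weights_.shape)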
Example #6
def gmm(k):
    model = mixture.GaussianMixture(n_components=k,
                                    covariance_type='full',
                                    random_state=100)
    return model
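The fixed random_state makes this factory reproducible across runs. A typical use on assumed synthetic data (fit returns the estimator, so the calls chain):

import numpy as np

X = np.random.rand(300, 4)
labels = gmm(3).fit(X).predict(X)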
Example #7
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    dbscan = cluster.DBSCAN(eps=params['eps'])
    optics = cluster.OPTICS(min_samples=params['min_samples'],
                            xi=params['xi'],
                            min_cluster_size=params['min_cluster_size'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')

    clustering_algorithms = (('My_KMeans', my_kmeans), ('My_GMM', my_gmm),
                             ('My_SpectralClustering',
                              my_spectral), ('MiniBatchKMeans', two_means),
                             ('AffinityPropagation', affinity_propagation),
                             ('MeanShift', ms), ('Ward', ward),
                             ('AgglomerativeClustering',
                              average_linkage), ('DBSCAN', dbscan), ('OPTICS',
                                                                     optics),
                             ('Birch', birch), ('GaussianMixture', gmm))

    # inner loop: iterate over each clustering algorithm
    for name, algorithm in clustering_algorithms:
        t0 = time.time()
Example #8
def mixture_gaussian(param, n_samples, components=0, name=None, analyze=False):
    if path.exists(f'{base_dir}/gm_{name}_samples.pkl'):
        best_gmm = load_mixture_gaussian(name)
        if not analyze and global_data.seed == 0:
            print(f'Load samples from file {name}')
            pickle_in = open(f'{base_dir}/gm_{name}_samples.pkl', "rb")
            payload = pickle.load(pickle_in)
            samples = payload['samples']
            if samples.shape[0] == n_samples:
                return samples
            else:
                name = f'{name}_{n_samples}'
                if path.exists(f'{base_dir}/gm_{name}_samples.pkl'):
                    print(f'Load samples from file {name}')
                    pickle_in = open(f'{base_dir}/gm_{name}_samples.pkl', "rb")
                    payload = pickle.load(pickle_in)
                    return payload['samples']
        else:
            print('Load distribution')
            best_gmm = load_mixture_gaussian(name)
    else:
        bic = []
        lowest_bic = np.infty
        max_components = param.shape[1] if param.shape[1] < 15 else 15
        if components != 0:
            gmm = mixture.GaussianMixture(n_components=components,
                                          covariance_type='full',
                                          max_iter=5000,
                                          tol=1e-15,
                                          n_init=20)
            gmm.fit(param)
            print(
                f'Lowest bic with number of components {components}: {gmm.bic(param)}'
            )
            best_gmm = gmm
        else:
            for n_components in range(1, max_components):
                # Fit a Gaussian mixture with EM
                gmm = mixture.GaussianMixture(n_components=n_components,
                                              covariance_type='full',
                                              max_iter=5000,
                                              tol=1e-15,
                                              n_init=20)
                gmm.fit(param)
                bic.append(gmm.bic(param))
                if bic[-1] < lowest_bic:
                    components = n_components
                    lowest_bic = bic[-1]
                    print(
                        f'Lowest bic with number of components {n_components}: {lowest_bic}'
                    )
                    best_gmm = gmm
    samples = best_gmm.sample(n_samples)[0]
    if name is not None and not analyze and global_data.seed == 0:
        print(f'Save samples and mixture gaussian in file {name}')
        payload = {'samples': samples}
        pickle_out = open(f'{base_dir}/gm_{name}_samples.pkl', "wb")
        pickle.dump(payload, pickle_out)
        payload = {
            'comp': components,
            'weights': best_gmm.weights_,
            'means': best_gmm.means_,
            'cov': best_gmm.covariances_,
            'precision': best_gmm.precisions_cholesky_
        }
        pickle_out = open(f'{base_dir}/gm_{name}_dist.pkl', "wb")
        pickle.dump(payload, pickle_out)
    if analyze:
        centers = best_gmm.means_
        if centers.shape[-1] == 9:
            centers = centers.reshape(centers.shape[0], 3, 3)
            centers = centers.reshape(1, centers.shape[0], 3, 3)
            print(best_gmm.weights_)
            # plot_weights(centers, name)
        # mixture_analysis(best_gmm.weights_, best_gmm.means_, best_gmm.covariances_, name)
        return best_gmm
        # return samples
    return samples
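Without the caching and pickling, the BIC search at the heart of this example is just a loop that keeps the lowest-scoring fit. A condensed sketch on synthetic data (the smaller n_init and component range are assumptions to keep it quick):

import numpy as np
from sklearn import mixture

param = np.random.randn(500, 3)              # stand-in for the parameter matrix
lowest_bic, best_gmm = np.inf, None
for n_components in range(1, 8):
    gmm = mixture.GaussianMixture(n_components=n_components,
                                  covariance_type='full', n_init=3).fit(param)
    if gmm.bic(param) < lowest_bic:
        lowest_bic, best_gmm = gmm.bic(param), gmm
samples, _ = best_gmm.sample(100)            # draw from the selected mixture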
Example #9
        # np.savetxt("Data_train_rand_seed="+str(0)+".csv", complete_D_train,delimiter=",")
        ##Stack the test
        complete_D_test = np.zeros([len(test_idx), num_stacked * n])
        len_test = len(test_idx)

        for i in range(len(sorted_test_idx)):
            idx = sorted_test_idx[i]
            idx_left = idx - 1
            while idx_left not in sorted_training_idx:
                idx_left -= 1
            point_tr = sorted_training_idx.index(idx_left)
            complete_D_test[i] = complete_D_train[point_tr]
            complete_D_test[i][0:n] = Data[idx][0:n]
        # np.savetxt("Data_test_rand_seed="+str(0)+".csv", complete_D_test,delimiter=",")
        #####INITIALIZATION!!!
        gmm = mixture.GaussianMixture(n_components=num_clusters,
                                      covariance_type="full")
        gmm.fit(complete_D_train)
        clustered_points = gmm.predict(complete_D_train)
        clustered_points_test = gmm.predict(complete_D_test)
        gmm_clustered_pts_test = gmm.predict(complete_D_test)
        gmm_clustered_pts = clustered_points + 0

        gmm_covariances = gmm.covariances_
        gmm_means = gmm.means_

        ##USE K-means
        kmeans = KMeans(n_clusters=num_clusters,
                        random_state=0).fit(complete_D_train)
        clustered_points_kmeans = kmeans.labels_
        clustered_points_test_kmeans = kmeans.predict(complete_D_test)
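Comparing the two initializations above in isolation is straightforward; the adjusted Rand index (an addition here, not part of the original) quantifies how much the GMM and k-means labelings agree:

import numpy as np
from sklearn import mixture
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

D = np.random.rand(400, 8)                   # stand-in for the stacked data
gmm_labels = mixture.GaussianMixture(n_components=3,
                                     covariance_type="full").fit_predict(D)
km_labels = KMeans(n_clusters=3, random_state=0).fit_predict(D)
print(adjusted_rand_score(gmm_labels, km_labels))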
Example #10
    def fit(self, input_file):
        """
        Main method for TICC solver.
        Parameters:
            - input_file: location of the data file
        """
        assert self.maxIters > 0  # must have at least one iteration
        self.log_parameters()

        # Get data into proper format
        times_series_arr, time_series_rows_size, time_series_col_size = self.load_data(
            input_file)

        ############
        # The basic folder to be created
        str_NULL = self.prepare_out_directory()

        # Train test split
        training_indices = getTrainTestSplit(
            time_series_rows_size, self.num_blocks,
            self.window_size)  # indices of the training samples
        num_train_points = len(training_indices)

        # Stack the training data
        complete_D_train = self.stack_training_data(times_series_arr,
                                                    time_series_col_size,
                                                    num_train_points,
                                                    training_indices)

        print("here")
        # Initialization
        # Gaussian Mixture
        gmm = mixture.GaussianMixture(n_components=self.number_of_clusters,
                                      covariance_type="full")
        print("here maybe")
        gmm.fit(complete_D_train)
        print("here past gmmfit")
        clustered_points = gmm.predict(complete_D_train)
        gmm_clustered_pts = clustered_points + 0
        # K-means
        print("here at kmeans")
        kmeans = KMeans(n_clusters=self.number_of_clusters,
                        random_state=0).fit(complete_D_train)
        clustered_points_kmeans = kmeans.labels_  # todo, is there a difference between these two?
        kmeans_clustered_pts = kmeans.labels_

        print("here again")

        train_cluster_inverse = {}
        log_det_values = {}  # log dets of the thetas
        computed_covariance = {}
        cluster_mean_info = {}
        cluster_mean_stacked_info = {}
        old_clustered_points = None  # points from last iteration

        empirical_covariances = {}

        # PERFORM TRAINING ITERATIONS
        pool = Pool(processes=self.num_proc)  # multiprocessing worker pool
        for iters in range(self.maxIters):
            print("\n\n\nITERATION ###", iters)

            # Get the train and test points
            train_clusters_arr = collections.defaultdict(
                list)  # {cluster: [point indices]}
            for point, cluster_num in enumerate(clustered_points):
                train_clusters_arr[cluster_num].append(point)

            len_train_clusters = {
                k: len(train_clusters_arr[k])
                for k in range(self.number_of_clusters)
            }

            # train_clusters holds the indices in complete_D_train
            # for each of the clusters
            opt_res = self.train_clusters(
                cluster_mean_info, cluster_mean_stacked_info, complete_D_train,
                empirical_covariances, len_train_clusters,
                time_series_col_size, pool, train_clusters_arr)

            self.optimize_clusters(computed_covariance, len_train_clusters,
                                   log_det_values, opt_res,
                                   train_cluster_inverse)

            # update old computed covariance
            old_computed_covariance = computed_covariance

            print("UPDATED THE OLD COVARIANCE")

            self.trained_model = {
                'cluster_mean_info': cluster_mean_info,
                'computed_covariance': computed_covariance,
                'cluster_mean_stacked_info': cluster_mean_stacked_info,
                'complete_D_train': complete_D_train,
                'time_series_col_size': time_series_col_size
            }
            clustered_points = self.predict_clusters()

            # recalculate lengths
            new_train_clusters = collections.defaultdict(
                list)  # {cluster: [point indices]}
            for point, cluster in enumerate(clustered_points):
                new_train_clusters[cluster].append(point)

            len_new_train_clusters = {
                k: len(new_train_clusters[k])
                for k in range(self.number_of_clusters)
            }

            before_empty_cluster_assign = clustered_points.copy()

            if iters != 0:
                cluster_norms = [(np.linalg.norm(
                    old_computed_covariance[self.number_of_clusters, i]), i)
                                 for i in range(self.number_of_clusters)]
                norms_sorted = sorted(cluster_norms, reverse=True)
                # clusters that are not 0 as sorted by norm
                valid_clusters = [
                    cp[1] for cp in norms_sorted
                    if len_new_train_clusters[cp[1]] != 0
                ]

                # Add a point to the empty clusters
                # assuming more non empty clusters than empty ones
                counter = 0
                for cluster_num in range(self.number_of_clusters):
                    if len_new_train_clusters[cluster_num] == 0:
                        cluster_selected = valid_clusters[
                            counter]  # a cluster that is not len 0
                        counter = (counter + 1) % len(valid_clusters)
                        print("cluster that is zero is:", cluster_num,
                              "selected cluster instead is:", cluster_selected)
                        start_point = np.random.choice(
                            new_train_clusters[cluster_selected]
                        )  # random point number from that cluster
                        for i in range(0, self.cluster_reassignment):
                            # put cluster_reassignment points from point_num in this cluster
                            point_to_move = start_point + i
                            if point_to_move >= len(clustered_points):
                                break
                            clustered_points[point_to_move] = cluster_num
                            computed_covariance[
                                self.number_of_clusters,
                                cluster_num] = old_computed_covariance[
                                    self.number_of_clusters, cluster_selected]
                            cluster_mean_stacked_info[
                                self.number_of_clusters,
                                cluster_num] = complete_D_train[
                                    point_to_move, :]
                            cluster_mean_info[self.number_of_clusters, cluster_num] \
                                = complete_D_train[point_to_move, :][
                                  (self.window_size - 1) * time_series_col_size:self.window_size * time_series_col_size]

            for cluster_num in range(self.number_of_clusters):
                print("length of cluster #", cluster_num, "-------->",
                      sum([x == cluster_num for x in clustered_points]))

            self.write_plot(clustered_points, str_NULL, training_indices)

            # TEST SETS STUFF
            # LLE + switching_penalty
            # Segment length
            # Create the F1 score from the graphs from k-means and GMM
            # Get the train and test points
            train_confusion_matrix_EM = compute_confusion_matrix(
                self.number_of_clusters, clustered_points, training_indices)
            train_confusion_matrix_GMM = compute_confusion_matrix(
                self.number_of_clusters, gmm_clustered_pts, training_indices)
            train_confusion_matrix_kmeans = compute_confusion_matrix(
                self.number_of_clusters, kmeans_clustered_pts,
                training_indices)
            ###compute the matchings
            matching_EM, matching_GMM, matching_Kmeans = self.compute_matches(
                train_confusion_matrix_EM, train_confusion_matrix_GMM,
                train_confusion_matrix_kmeans)

            print("\n\n\n")

            if np.array_equal(old_clustered_points, clustered_points):
                print("\n\n\n\nCONVERGED!!! BREAKING EARLY!!!")
                break
            old_clustered_points = before_empty_cluster_assign
            # end of training
        if pool is not None:
            pool.close()
            pool.join()
        train_confusion_matrix_EM = compute_confusion_matrix(
            self.number_of_clusters, clustered_points, training_indices)
        train_confusion_matrix_GMM = compute_confusion_matrix(
            self.number_of_clusters, gmm_clustered_pts, training_indices)
        train_confusion_matrix_kmeans = compute_confusion_matrix(
            self.number_of_clusters, clustered_points_kmeans, training_indices)

        self.compute_f_score(matching_EM, matching_GMM, matching_Kmeans,
                             train_confusion_matrix_EM,
                             train_confusion_matrix_GMM,
                             train_confusion_matrix_kmeans)

        if self.compute_BIC:
            bic = computeBIC(self.number_of_clusters, time_series_rows_size,
                             clustered_points, train_cluster_inverse,
                             empirical_covariances)
            print("this is the val,", bic)
            if iters > 998:
                bic = 999999999
                return clustered_points, train_cluster_inverse, bic
            return clustered_points, train_cluster_inverse, bic

        return clustered_points, train_cluster_inverse
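The GMM here only seeds the cluster assignments on stacked windows before the TICC iterations refine them. That initialization step in isolation, with an assumed synthetic series and window construction, looks like:

import numpy as np
from sklearn import mixture

series = np.random.randn(300, 5)             # T x n multivariate time series
w = 3                                        # window_size
stacked = np.hstack([series[i:len(series) - w + 1 + i] for i in range(w)])
gmm = mixture.GaussianMixture(n_components=4, covariance_type="full")
clustered_points = gmm.fit(stacked).predict(stacked)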
Example #11
    def __init__(self,
                 data,
                 n_labels,
                 beta_init=1,
                 stencil=None,
                 normalize=True):
        """

        Args:
            data (:obj:`np.ndarray`): Multidimensional data array containing all observations (features) in the
                following shape:

                    1D = (Y, F)
                    2D = (Y, X, F)
                    3D = (Y, X, Z, F)

            n_labels (int): Number of labels representing the number of clusters to be segmented.
            beta_init (float): Initial penalty value for Gibbs energy calculation.
            stencil (int): Number specifying the stencil of the neighborhood system used in the Gibbs energy
                calculation.

        """
        # TODO: [DOCS] Main object description

        # store initial data
        self.data = data
        # get shape for physical and feature dimensions
        self.shape = np.shape(data)
        self.phys_shp = np.array(self.shape[:-1])

        # get number of features
        self.n_feat = self.shape[-1]

        # GRAPH COLORING
        self.stencil = stencil
        self.colors = pseudocolor(self.shape, self.stencil)

        # ************************************************************************************************
        # fetch dimensionality, coordinate and feature vector from input data

        # 1D
        if len(self.shape) == 2:
            # 1d case
            self.dim = 1
            # create coordinate vector
            # self.coords = np.array([np.arange(self.shape[0])]).T
            # feature vector
            self.feat = self.data

        # 2D
        elif len(self.shape) == 3:
            # 2d case
            self.dim = 2
            # create coordinate vector
            # y, x = np.indices(self.shape[:-1])
            # print(y, x)
            # self.coords = np.array([y.flatten(), x.flatten()]).T

            # feature vector
            self.feat = np.array(
                [self.data[:, :, f].ravel() for f in range(self.n_feat)]).T

        # 3D
        elif len(self.shape) == 4:
            # 3d case
            raise Exception("3D segmentation not yet supported.")

        # mismatch
        else:
            raise Exception(
                "Data format appears to be wrong (neither 1-, 2- or 3-D).")

        if normalize:
            self.normalize_feature_vectors()

        # ************************************************************************************************
        # INIT GAUSSIAN MIXTURE MODEL
        self.n_labels = n_labels
        self.gmm = mixture.GaussianMixture(n_components=n_labels,
                                           covariance_type="full")
        self.gmm.fit(self.feat)
        # do initial prediction based on fit and observations, store as first entry in labels

        # ************************************************************************************************
        # INIT LABELS, MU and COV based on GMM
        # TODO: [GENERAL] storage variables from lists to numpy ndarrays
        self.labels = np.array([self.gmm.predict(self.feat)])
        # INIT MU (mean from initial GMM)
        self.mus = np.array([self.gmm.means_])
        # INIT COV (covariances from initial GMM)
        self.covs = np.array([self.gmm.covariances_])

        self.labels_probability = np.zeros(
            (1, self.labels.shape[1], self.n_labels))
        self.storage_gibbs_e = np.zeros(
            [1, self.labels.shape[1], self.n_labels])
        self.storage_like_e = np.zeros(
            [1, self.labels.shape[1], self.n_labels])
        self.storage_te = np.zeros([1, self.labels.shape[1], self.n_labels])

        self.beta_acc_ratio = np.array([])
        self.cov_acc_ratio = np.array([])
        self.mu_acc_ratio = np.array([])

        # ************************************************************************************************
        # Initialize PRIOR distributions for beta, mu and covariance
        # BETA
        if self.dim == 1:
            self.prior_beta = norm(beta_init, np.eye(1) * 100)
            self.betas = [beta_init]
        elif self.dim == 2:
            if self.stencil == "4p":
                beta_dim = 2
            elif self.stencil == "8p" or self.stencil is None:
                beta_dim = 4

            self.betas = [[beta_init for i in range(beta_dim)]]
            self.prior_beta = multivariate_normal(
                [beta_init for i in range(beta_dim)],
                np.eye(beta_dim) * 100)

        elif self.dim == 3:
            raise Exception("3D not yet supported.")

        # MU
        # generate distribution means for each label
        prior_mu_means = [self.mus[0][label] for label in range(self.n_labels)]
        # generate distribution covariances for each label
        prior_mu_stds = [
            np.eye(self.n_feat) * 100 for label in range(self.n_labels)
        ]
        # use the above to generate multivariate normal distributions for each label
        self.priors_mu = [
            multivariate_normal(prior_mu_means[label], prior_mu_stds[label])
            for label in range(self.n_labels)
        ]

        # COV
        # generate b_sigma
        self.b_sigma = np.zeros((self.n_labels, self.n_feat))
        for l in range(self.n_labels):
            self.b_sigma[l, :] = np.log(
                np.sqrt(np.diag(self.gmm.covariances_[l, :, :])))
        # generate kesi
        self.kesi = np.ones((self.n_labels, self.n_feat)) * 100
        # generate nu
        self.nu = self.n_feat + 1
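The GMM's role in this constructor is only to seed the labels, means and covariances before the MCMC steps refine them. The core of that step for a 2-D grid with F features, under assumed shapes, is:

import numpy as np
from sklearn import mixture

data = np.random.rand(64, 64, 3)             # (Y, X, F) grid with 3 features
feat = data.reshape(-1, data.shape[-1])      # flatten to (Y*X, F)
gmm = mixture.GaussianMixture(n_components=4, covariance_type="full").fit(feat)
labels = gmm.predict(feat).reshape(64, 64)   # initial segmentation
mus, covs = gmm.means_, gmm.covariances_     # seeds for the samplers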
Example #12
def fit_gaus(masked_array, ras_fn, ncomp, sampleStep):

    # http://stackoverflow.com/questions/10143905/python-two-curve-gaussian-fitting-with-non-linear-least-squares/19182915#19182915
    X_compress = masked_array.compressed()
    X_reshape = np.reshape(X_compress, (masked_array.compressed().size, 1))

    clf = mixture.GaussianMixture(n_components=ncomp, covariance_type='full')
    clf.fit(X_reshape)

    ml = clf.means_
    wl = clf.weights_
    cl = clf.covariances_
    ms = [m[0] for m in ml]
    cs = [np.sqrt(c[0][0]) for c in cl]
    ws = [w for w in wl]
    i = 0

    sampleStep_str = "%03d" % (sampleStep)

    histo = matplotlib.pyplot.hist(masked_array.compressed(),
                                   300,
                                   normed=True,
                                   color='gray',
                                   alpha=0.5)
    fig_name = ras_fn.split('/')[-1].strip('.tif') + "_" + str(
        ncomp
    ) + "_" + sampleStep_str + '.png'  ##'_pks' + str(ncomp) + '_' + 'hist' + str(sampleStep_str) +'.png'

    # Delete out_peaksCSV if exists
    out_dir = os.path.split(ras_fn)[0]
    out_peaks_csv = os.path.join(out_dir, fig_name.strip('.png') + '.csv')

    if os.path.isfile(out_peaks_csv):
        os.remove(out_peaks_csv)

    print "\tOutput gaussian peaks csv: %s" % (out_peaks_csv)

    with open(out_peaks_csv, 'w') as outpk:

        # Write hdr if new
        outpk.write(
            'ras_fn,gaus1_mean,gaus1_sd,gaus2_mean,gaus2_sd,gaus3_mean,gaus3_sd\n'
        )
        i = 0
        gauss_num = ''
        outpk.write(ras_fn)  # Start writing the line
        for w, m, c in zip(ws, ms, cs):
            i += 1
            matplotlib.pyplot.plot(
                histo[1],
                w * matplotlib.mlab.normpdf(histo[1], m, np.sqrt(c)),
                linewidth=3)
            matplotlib.pyplot.axis([-5, 15, 0, 1])

            gauss_num = 'Gaussian peak #%s' % (i)
            print '\t' + gauss_num + ' mean: ', m, ' std dev:', c

            outpk.write(',' + str(m) + ',' + str(c))  # Finish writing the line
            if i == ncomp:
                outpk.write('\n')

        matplotlib.pyplot.savefig(os.path.join(out_dir, fig_name))
        matplotlib.pyplot.clf()

        return (out_peaks_csv)
Example #13
    if pc_name == "vision-pc26-Ubuntu":
        data_path = "/home/z2228wan/data/BSDS300/images"
    else:
        data_path = "BSDS300/images"

    # load data
    print("[*] Loading data ...\t", end="")
    start = time.time()
    train_data = read_data(os.path.join(data_path, "train"), num_samples)
    test_data = read_data(os.path.join(data_path, "test"), num_samples)
    np.save(f"train_gmm/train_data_{num_samples}.npy", train_data)
    np.save(f"train_gmm/test_data_{num_samples}.npy", test_data)
    sio.savemat(f"train_gmm/data_{num_samples}.mat", dict(train_data=train_data, test_data=test_data))
    print(f"{time.time() - start:.3f} s")

    # fit a GMM model with EM
    gmm = mixture.GaussianMixture(n_components=n_components, covariance_type='full', max_iter=500, tol=1e-6, verbose=2, verbose_interval=1)
    if is_train:
        print("[*] Fitting ...")
        start = time.time()
        gmm.fit(train_data)
        print(f"Fitting takes {time.time() - start:.3f} s")
        joblib.dump(gmm, f"train_gmm/gmm_{n_components}.joblib")
    else:
        gmm = joblib.load(f"train_gmm/gmm_{n_components}.joblib")

    # testing this model
    log_prob = gmm.score_samples(test_data)
    neg_log_prob = - np.mean(log_prob)
    print(f"[*] Negative log likeliohood on testing set: {neg_log_prob:.3f}")
Example #14
print(df.shape, df.dtype)
dat = df[:, 0:64]
tar1 = df[:, 64]
X = dat
y = tar1

x1 = []
a1 = []
a2 = []
t1 = []
t2 = []
i1 = []

for k in range(2, 16, 2):
    gmm = mixture.GaussianMixture(n_components=k,
                                  covariance_type='full',
                                  random_state=777)
    gmm.fit(X)
    cluster_labels = gmm.predict(X)
    l1 = np.reshape(cluster_labels, (5619, 1))
    print(cluster_labels.shape, l1.shape, X.shape)
    print(cluster_labels)
    X1 = np.hstack((l1, X))
    X_train, X_test, y_train, y_test = train_test_split(X1,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=20)
    clf = MLPClassifier(solver='sgd',
                        activation='relu',
                        alpha=0.03,
                        momentum=0.9,
Example #15
trimData = np.array(trimData[1:],dtype=np.float)
# %%

# %%
#labels.remove('STID')
labels = np.array(labels)
labels = np.delete(labels, 0) # remove first element in array
#labels = labels.astype(np.float)
# %% View data thus far
for i in range(3):
    print "data " + str(data[i+1]) + "\n"
    print "labels " + str(labels[i]) + "\n"
    print "trim" + str(trimData[i]) + "\n"
# %%
start = time.time()
gmix = mixture.GaussianMixture(n_components=6, covariance_type='full')
gmix.fit(trimData)
end = time.time()
print(end - start)

print gmix.means_
predictions = gmix.predict(trimData)
# %%view data
kylabels = pd.DataFrame({'a':labels})
kylabels.head(10)
kylabels.tail(10)
kylabels['a'].value_counts()
# %%

# %%
cllabels = pd.DataFrame({'a':predictions})
Example #16
                if Fs == all_emotion_Fs:
                    features = extract_MFCCs(x, Fs, window * Fs,
                                             window_overlap * Fs,
                                             voiced_threshold_mul,
                                             voiced_threshold_range,
                                             calc_deltas)
                    all_emotion_data.append(features)
                else:
                    print sample_file + " skipped due to mismatch in frame rate"

            all_emotion_data = np.concatenate(all_emotion_data, 0)
            #print all_emotion_data.shape

            try:
                gmm = mixture.GaussianMixture(
                    n_components=n_mixtures,
                    covariance_type='diag',
                    max_iter=max_iterations).fit(all_emotion_data)
            except:
                print "ERROR : Error while training model for file " + emotion

            try:
                joblib.dump(gmm, 'train_models/' + emotion_name + '.pkl')
            except:
                print "ERROR : Error while saving model for " + emotion_name

            spct += 1

        print "Training Completed"

        confusion_matrix = np.zeros((total_sp, total_sp))
        tct = 0
Example #17
def cal(algo, labels_true, labels_pred):
    print('%-30s\t%.3f\t%.3f\t%.3f' % (
        algo,
        metrics.normalized_mutual_info_score(
            labels_true, labels_pred, average_method='arithmetic'),
        metrics.homogeneity_score(labels_true, labels_pred),
        metrics.completeness_score(labels_true, labels_pred),
    ))


labels_pred = cluster.KMeans(n_clusters=np.unique(tar).shape[0],
                             random_state=30).fit_predict(Data)
cal('K-Means', labels, labels_pred)
labels_pred = cluster.AffinityPropagation(damping=0.6,
                                          preference=-2000).fit_predict(Data)
cal('AffinityPropagation', labels, labels_pred)
labels_pred = cluster.MeanShift(bandwidth=0.0005,
                                bin_seeding=True).fit_predict(Data)
cal('Mean-Shift', labels, labels_pred)
labels_pred = cluster.SpectralClustering(
    n_clusters=np.unique(tar).shape[0]).fit_predict(Data)
cal('SpectralClustering', labels, labels_pred)
labels_pred = cluster.AgglomerativeClustering(
    n_clusters=np.unique(tar).shape[0]).fit_predict(Data)
cal('AgglomerativeClustering', labels, labels_pred)
labels_pred = cluster.DBSCAN(eps=0.004, min_samples=6).fit_predict(Data)
cal('Dbscan', labels, labels_pred)
labels_pred = mixture.GaussianMixture(
    n_components=np.unique(tar).shape[0]).fit_predict(Data)
cal('GaussianMixtures', labels, labels_pred)
Example #18
        angle = 180. * angle / np.pi  # convert to degrees
        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.5)
        splot.add_artist(ell)

    plt.xticks(())
    plt.yticks(())
    plt.title(title)

compnum = [2, 3, 4, 6, 8, 10]
for each in compnum: 
    t0 = time.perf_counter()

    # Fit a Gaussian mixture with EM using n components
    gmm = mixture.GaussianMixture(n_components= each, covariance_type='full')
    
    gmm = gmm.fit(traindata)
    # print(gmm.means_)
    print(gmm.converged_)
    print("Lower Bound: ")
    print(gmm.lower_bound_)
    t1 = time.perf_counter()
    timetaken = str(t1-t0)
    print("Computation Time: " + timetaken)
    plot_results(traindata, gmm.predict(traindata), gmm.means_, gmm.covariances_, 0,
                'Gaussian Mixture')

    dpgmm = mixture.BayesianGaussianMixture(n_components=each,
                                        covariance_type='full').fit(traindata)
    plot_results(traindata, dpgmm.predict(traindata), dpgmm.means_, dpgmm.covariances_, 1,
Example #19
def create_graph_with_weight(points, normCount):
    '''
    Returns a graph created from cell coordinates.
    Edge weights are set by normalized counts.
    
    :param points: shape (n,2); normCount: shape (n)
    :rtype: ndarray shape (n,4); rows are [node0, node1, energy, dist]
    
    '''
    edges = {}   
    var = normCount.var()
    delauny = Delaunay(points)
#    cellGraph = np.zeros((delauny.simplices.shape[0]*delauny.simplices.shape[1], 4))
    cellGraph = np.zeros((points.shape[0]*10, 4))

    for simplex in delauny.simplices:
        simplex.sort()
        edge0 = str(simplex[0]) + " " + str(simplex[1])
        edge1 = str(simplex[0]) + " " + str(simplex[2])
        edge2 = str(simplex[1]) + " " + str(simplex[2])
        edges[edge0] = 1
        edges[edge1] = 1
        edges[edge2] = 1
    ## the dict keys de-duplicate edges shared between adjacent triangles
        
    i = 0
    for kk in edges.keys():  
        node0 = int(kk.split(sep=" ")[0])
        node1 = int(kk.split(sep=" ")[1])
        edgeDiff = normCount[node0] - normCount[node1]
        energy = np.exp((0 - edgeDiff**2)/(2*var))
        dist = distance.euclidean(points[node0,:], points[node1,:])
        cellGraph[i] = [node0, node1, energy, dist]       
        i = i + 1
    
    tempGraph = cellGraph[0:i]
    n_components_range = range(1,5)
    best_component = 1
    lowest_bic=np.infty
    temp_data = tempGraph[:,3].reshape(-1,1)  ## GMM of dist 
    for n_components in n_components_range:
        gmm = mixture.GaussianMixture(n_components = n_components)
        gmm.fit(temp_data)
        gmm_bic = gmm.bic(temp_data)
        if gmm_bic < lowest_bic:
            best_gmm = gmm
            lowest_bic = gmm_bic
            best_component = n_components  
    
    mIndex = np.where(best_gmm.weights_ == max(best_gmm.weights_))[0]
    cutoff = best_gmm.means_[mIndex] + 2*np.sqrt(best_gmm.covariances_[mIndex])

    for simplex in delauny.simplices:
        simplex.sort()          
        dist0 = distance.euclidean(points[simplex[0],:], points[simplex[1],:])
        dist1 = distance.euclidean(points[simplex[0],:], points[simplex[2],:])
        dist2 = distance.euclidean(points[simplex[1],:], points[simplex[2],:])
        tempArray = np.array((dist0, dist1, dist2))
        badIndex = np.where(tempArray == max(tempArray))[0][0]  ## index of the longest edge in the simplex triangle
        if tempArray[badIndex] > cutoff:
            edge0 = str(simplex[0]) + " " + str(simplex[1])  
            edge1 = str(simplex[0]) + " " + str(simplex[2])       
            edge2 = str(simplex[1]) + " " + str(simplex[2])
            edgeCount = 0
            if edge0 in edges and edge1 in edges and edge2 in edges:
                if badIndex == 0:
                    del edges[edge0]
                elif badIndex == 1:
                    del edges[edge1]
                elif badIndex == 2:
                    del edges[edge2]     ## remove longest edges from edges

    i = 0
    for kk in edges.keys():         ## recreate cellGraph with the new edge set
        node0 = int(kk.split(sep=" ")[0])
        node1 = int(kk.split(sep=" ")[1])
        edgeDiff = normCount[node0] - normCount[node1]
        energy = np.exp((0 - edgeDiff**2)/(2*var))
        dist = distance.euclidean(points[node0,:], points[node1,:])
        cellGraph[i] = [node0, node1, energy, dist]       
        i = i + 1   
      
    tempGraph = cellGraph[0:i]
    temp_data = tempGraph[:,3].reshape(-1,1)    
    gmm = mixture.GaussianMixture(n_components = 1)
    gmm.fit(temp_data)    
    cutoff = gmm.means_[0] + 2*np.sqrt(gmm.covariances_[0])
    finalGraph = tempGraph.copy()
    j=0
    for i in np.arange(tempGraph.shape[0]):    
        if tempGraph[i, 3] < cutoff:     ### re-test that all edge distances follow a similar distribution
            finalGraph[j] = tempGraph[i]
            j = j + 1
         
    return finalGraph
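A usage sketch for the function above on random cell coordinates and counts (synthetic inputs whose shapes follow the docstring):

import numpy as np

points = np.random.rand(100, 2)              # cell coordinates, shape (n, 2)
normCount = np.random.rand(100)              # normalized counts, shape (n,)
finalGraph = create_graph_with_weight(points, normCount)
print(finalGraph.shape)                      # rows of [node0, node1, energy, dist]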
Example #20
myML.plotML.plotparam_cluster(X,labels_true,"cluster.AgglomerativeClustering()",drawParam=1,n_clusters=nums)

# Test how AgglomerativeClustering results vary with the linkage method
nums=range(1,50)
linkages=['ward','complete','average']
myML.plotML.plotparam_cluster(X,labels_true,"cluster.AgglomerativeClustering()",drawParam=2,n_clusters=nums,linkage=linkages)


# ---GMM
centers=[[1,1],[2,2],[1,2],[10,20]] # center points used to generate the clusters
X, labels_true = myML.DataPre.make_datasets("blobs", n_samples=1000, centers=centers, cluster_std=0.5 )
from sklearn import mixture
from sklearn.metrics import adjusted_rand_score

# Basic usage of GMM
clst=mixture.GaussianMixture()
clst.fit(X)
predicted_labels=clst.predict(X)
print("ARI:%s"% adjusted_rand_score(labels_true,predicted_labels))

# Test how GMM clustering varies with the n_components parameter
nums=range(1,20)
myML.plotML.plotparam_cluster(X,labels_true,"mixture.GaussianMixture()",n_components=nums)

# Test how GMM clustering varies with the covariance type
nums=range(1,20)
cov_types=['spherical','tied','diag','full']
myML.plotML.plotparam_cluster(X,labels_true,"mixture.GaussianMixture()",drawParam=2,n_components=nums,covariance_type=cov_types)


Example #21
def LSTM_MYAP_TRAIN(Xtrain,ytrain):
 
    everya, everyb = add_data(Xtrain,ytrain)
    
    num_class = 2
    num_features = 10
    n_epoch = 20
    n_batch = 10
    look_back = 2
    
    gmm1 = mixture.GaussianMixture(n_components = 2,covariance_type='full').fit(Xtrain)
    nm1 = gmm1.predict(Xtrain)
    #kmeans = KMeans(n_clusters=2, random_state=0).fit(Xtrain)
    #nm1 = kmeans.labels_
    nm1 = nm1.reshape(len(nm1),1)
    Xtrain = np.concatenate((Xtrain, nm1),axis = 1);
    Xtrainn = Xtrain ###
    
    ytrainn = Xtrain[:,10 ]
    Xtrain = Xtrain[:,0:Xtrain.shape[1]-1]
    ytraina = ytrain[ytrainn==0]
    ytrainb = ytrain[ytrainn==1]
  
    
################################################################

    
    Xtraina = Xtrainn[Xtrainn[:,10]==0];
    Xtraina = Xtraina[:,0:Xtraina.shape[1]-1]

    Xtrainb = Xtrainn[Xtrainn[:,10]==1];
    Xtrainb = Xtrainb[:,0:Xtrainb.shape[1]-1]
 

    Xtraina = np.concatenate((everya,Xtraina),axis = 0)
    Xtrainb = np.concatenate((everya,Xtrainb),axis = 0)
    ytraina = np.concatenate((everyb,ytraina),axis = 0)
    ytrainb = np.concatenate((everyb,ytrainb),axis = 0)
    
    num_class = 4
    num_features = 10
    n_epoch = 20
    n_batch = 10
    look_back = 2
    
    
    
    nb_samples = Xtraina.shape[0] - look_back
    Xtrain2 = np.zeros((nb_samples,look_back,num_features))
    
    y_train_reshaped2 = np.zeros((nb_samples,1,num_class))
    one_hot_labels2 = np.zeros((nb_samples,1,num_class))
    ytra = np.array(pd.get_dummies(np.array(ytraina.astype(int).reshape(-1))))
    
    for i in range(nb_samples):
        y_position = i + look_back
        Xtrain2[i] = Xtraina[i:y_position]
        one_hot_labels2[i] = ytra[y_position,:4]
    
    
    model = Sequential()
    opt = Adam(lr=0.001)
    model.add(LSTM(4,input_shape=(None, num_features), return_sequences=True))
    model.add(TimeDistributed(Dense(num_class,activation = 'tanh')))
    model.add(Activation('softmax'))
   

    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    filepath="weights-improvement1-{epoch:02d}-{categorical_accuracy:.2f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='categorical_accuracy', verbose=2, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]
    cm1 = model.fit(Xtrain2,one_hot_labels2,epochs=n_epoch,batch_size=n_batch,verbose=2)
  

    clf1 = model

    
    nb_samples = Xtrainb.shape[0] - look_back
    Xtrain2 = np.zeros((nb_samples,look_back,num_features))

    y_train_reshaped2 = np.zeros((nb_samples,1,num_class))
    one_hot_labels2 = np.zeros((nb_samples,1,num_class))
    ytra = np.array(pd.get_dummies(np.array(ytrainb.astype(int).reshape(-1))))
    
    for i in range(nb_samples):
        y_position = i + look_back
        Xtrain2[i] = Xtrainb[i:y_position]
        one_hot_labels2[i] = ytra[y_position,:4]
    
    model = Sequential()
    opt = Adam(lr=0.001)
    model.add(LSTM(4, input_shape=(None, num_features), return_sequences=True,kernel_initializer='random_uniform'))
    model.add(TimeDistributed(Dense(num_class,activation = 'tanh')))
    model.add(Activation('softmax'))

    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    filepath="weights-improvement1-{epoch:02d}-{categorical_accuracy:.2f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='categorical_accuracy', verbose=2, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]



    n_epoch = 20
    cm2 = model.fit(Xtrain2,one_hot_labels2, epochs=n_epoch, batch_size=n_batch, verbose=2)
    clf2 = model
    
    po = ([len(list(group)) for key, group in groupby(np.sort(ytraina))])

    pn = ([len(list(group)) for key, group in groupby(np.sort(ytrainb))])
    

    return (clf1, clf2, po, pn, Xtraina, ytraina, Xtrainb, ytrainb)
Example #22
 def __init__(self, K, random_state=42):
     self.gmm = mixture.GaussianMixture(n_components=K,
                                        covariance_type='full')
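The excerpt stores the mixture but shows no other methods. A plausible completion (an assumption, not the original class; note it also forwards random_state, which the excerpt accepts but does not use) would delegate fit and predict:

from sklearn import mixture

class GMMCluster:
    # hypothetical wrapper completing the excerpt above
    def __init__(self, K, random_state=42):
        self.gmm = mixture.GaussianMixture(n_components=K,
                                           covariance_type='full',
                                           random_state=random_state)

    def fit(self, X):
        self.gmm.fit(X)
        return self

    def predict(self, X):
        return self.gmm.predict(X)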
Example #23
    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError(
                "Column {0} not found in the experiment".format(self.ychannel))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError(
                "If num_components == 1, all posteriors are 1.")

        if subset:
            try:
                experiment = experiment.query(subset)
            except:
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda x: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        self._xscale = util.scale_factory(self.xscale,
                                          experiment,
                                          channel=self.xchannel)
        self._yscale = util.scale_factory(self.yscale,
                                          experiment,
                                          channel=self.ychannel)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])

            # drop data that isn't in the scale range
            x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))]
            x = x.values

            gmm = mixture.GaussianMixture(n_components=self.num_components,
                                          covariance_type="full",
                                          random_state=1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError("Estimator didn't converge"
                                           " for group {0}".format(group))

            # in the 1D version, we sort the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.  that doesn't work in a 2D area,
            # obviously.

            # instead, we assume that the clusters are likely (?) to be
            # arranged along *one* of the axes, so we take the |norm| of the
            # x,y mean of each cluster and sort that way.

            norms = (gmm.means_[:, 0]**2 + gmm.means_[:, 1]**2)**0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms
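
The norm-based reordering above keeps component indices comparable across
groups. A self-contained illustration on synthetic data (everything here is
local to the sketch; note that predict() also consults precisions_cholesky_,
which this reordering trick leaves untouched):

import numpy as np
from sklearn import mixture

rng = np.random.RandomState(0)
X = np.vstack([rng.normal([0., 0.], 0.3, (100, 2)),
               rng.normal([4., 4.], 0.3, (100, 2))])
gmm = mixture.GaussianMixture(n_components=2, random_state=1).fit(X)
order = np.argsort(np.linalg.norm(gmm.means_, axis=1))
gmm.means_ = gmm.means_[order]          # component 0 now has the smallest |mean|
gmm.weights_ = gmm.weights_[order]
gmm.covariances_ = gmm.covariances_[order]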
Example #24
MFCC_GIRL = mfcc(SIG_GIRL, RATE_GIRL, numcep=16)
DELTA1_GIRL = delta(MFCC_GIRL, 2)
DELTA2_GIRL = delta(DELTA1_GIRL, 2)
GIRL_FEATURES = pd.concat([
    pd.DataFrame(MFCC_GIRL),
    pd.DataFrame(DELTA1_GIRL),
    pd.DataFrame(DELTA2_GIRL)
],
                          axis=1)
GIRL_FEATURES = preprocessing.scale(GIRL_FEATURES)

##########################################################################################################################################

BOY_MODEL = mixture.GaussianMixture(n_components=20,
                                    max_iter=1000,
                                    tol=.01,
                                    warm_start=True,
                                    covariance_type='diag')
BOY_MODEL.fit(BOY_FEATURES)

GIRL_MODEL = mixture.GaussianMixture(n_components=20,
                                     max_iter=1000,
                                     tol=.01,
                                     warm_start=True,
                                     covariance_type='diag')
GIRL_MODEL.fit(GIRL_FEATURES)
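
With both models fit, classifying a new recording reduces to running it
through the same feature pipeline and comparing average log-likelihoods via
score(). A sketch (TEST_PATH is a hypothetical placeholder):

(RATE_T, SIG_T) = wav.read(TEST_PATH)
MFCC_T = mfcc(SIG_T, RATE_T, numcep=16)
DELTA1_T = delta(MFCC_T, 2)
DELTA2_T = delta(DELTA1_T, 2)
FEATURES_T = preprocessing.scale(pd.concat([pd.DataFrame(MFCC_T),
                                            pd.DataFrame(DELTA1_T),
                                            pd.DataFrame(DELTA2_T)], axis=1))
LABEL = "BOY" if BOY_MODEL.score(FEATURES_T) > GIRL_MODEL.score(FEATURES_T) else "GIRL"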

##########################################################################################################################################

(RATE_INPUT, SIG_INPUT) = wav.read(
    "/Users/abhishaikemahajan/Documents/VOICE/WORKSHOPTEST/SeshaTest.wav")
Example #25
        splot.add_artist(ell)

    plt.xlim(-9., 5.)
    plt.ylim(-3., 6.)
    plt.xticks(())
    plt.yticks(())
    plt.title(title)


# Number of samples per component
n_samples = 500

# Generate random sample, two components
np.random.seed(0)
C = np.array([[0., -0.1], [1.7, .4]])
X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
          .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]

# Fit a Gaussian mixture with EM using five components
gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X)
plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0,
             'Gaussian Mixture')

# Fit a Dirichlet process Gaussian mixture using five components
dpgmm = mixture.BayesianGaussianMixture(n_components=5,
                                        covariance_type='full').fit(X)
plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1,
             'Bayesian Gaussian Mixture with a Dirichlet process prior')

plt.show()
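
One point worth checking after the fit: the plain mixture is forced to spend
all five components, while the Dirichlet-process prior can park surplus
components at near-zero weight (a quick sanity check, not part of the
original example):

print(gmm.weights_.round(3))    # EM spreads mass across all 5 components
print(dpgmm.weights_.round(3))  # DP prior drives unused components toward 0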
Example #26
    plt.suptitle(("Silhouette analysis for KMeans clustering on the tic-tac-toe "
                  "dataset with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()

## EM: Gaussian mixture model selection via BIC

lowest_bic = np.inf  # np.infty was removed in NumPy 2.0
bic = []
n_components_range = range(2, 14, 2)
cv_types = ['spherical', 'tied', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components,
                                      covariance_type=cv_type, random_state=777)
        gmm.fit(X12)
        gmm_labels = gmm.predict(X12)
        bic.append(gmm.bic(X12))
        if bic[-1] < lowest_bic:  # lower BIC is better; comparing abs() is wrong
            lowest_bic = bic[-1]
            best_gmm = gmm

bic = np.array(bic)
color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
                              'darkorange'])
clf = best_gmm
bars = []

# Plot the BIC scores
plt.figure(figsize=(8, 6))
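
The bar chart itself is cut off here; a minimal version of the standard BIC
plot this appears to be building toward (a sketch following the scikit-learn
BIC example, reusing the variables above):

spl = plt.subplot(1, 1, 1)
xs = np.array(list(n_components_range))
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
    xpos = xs + .2 * (i - 1)                  # offset bars per covariance type
    vals = bic[i * len(xs):(i + 1) * len(xs)]  # BIC values for this cv_type
    bars.append(plt.bar(xpos, vals, width=.2, color=color))
plt.xticks(xs)
plt.title('BIC score per model')
spl.legend([b[0] for b in bars], cv_types)
plt.show()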
Example #27

from sklearn import mixture
import itertools

Xgmm = Xmat

lowest_bic = np.inf  # np.infty was removed in NumPy 2.0
bic = []
n_components_range = range(1, 6)
cv_types = ['tied']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components,
                                      covariance_type=cv_type)
        gmm.fit(Xgmm)
        bic.append(gmm.bic(Xgmm))
        if n_components == 5:
            # hack: inflate the BIC at the top of the range so that five
            # components can never be selected as the best model
            bic[-1] = bic[-2] + 1000

        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

bic = np.array(bic)
color_iter = itertools.cycle(['cornflowerblue'])
clf = best_gmm
bars = []

# Plot the BIC scores
Example #28
def maintask(task):
    # allow_pickle=True is required on modern NumPy to load pickled objects
    data = np.load('lab2_data.npz', allow_pickle=True)['data']
    phoneHMMs = np.load('lab2_models_onespkr.npz', allow_pickle=True)['phoneHMMs'].item()
    phoneHMMs_all = np.load('lab2_models_all.npz', allow_pickle=True)['phoneHMMs'].item()
    if task == '4':
        hmm1 = phoneHMMs['ah']
        hmm2 = phoneHMMs['ao']
        twohmm = concatTwoHMMs(hmm1, hmm2)
    """5 HMM Likelihood and Recognition"""
    example = np.load('lab2_example.npz', allow_pickle=True)['example'].item()
    isolated = {}
    for digit in prondict.keys():
        isolated[digit] = ['sil'] + prondict[digit] + ['sil']
    wordHMMs = {}
    wordHMMs_all = {}
    for digit in prondict.keys():
        wordHMMs[digit] = concatHMMs(phoneHMMs, isolated[digit])
    # for 11 digits
    for digit in prondict.keys():
        wordHMMs_all[digit] = concatHMMs(phoneHMMs_all, isolated[digit])
    # example
    lpr = log_multivariate_normal_density_diag(example['lmfcc'],
                                               wordHMMs['o']['means'],
                                               wordHMMs['o']['covars'])
    diff = example['obsloglik'] - lpr  # 0
    # same digit 'o'
    lpr_o = log_multivariate_normal_density_diag(data[22]['lmfcc'],
                                                 wordHMMs_all['o']['means'],
                                                 wordHMMs_all['o']['covars'])
    if task == '5.1':
        plt.figure()
        plt.subplot(2, 1, 1)
        plt.pcolormesh(lpr.T)
        plt.title('example "o" ')
        plt.colorbar()
        plt.subplot(2, 1, 2)
        plt.pcolormesh(lpr_o.T)
        plt.title('test "o" from data22')
        plt.colorbar()
        plt.show()
    """
    5.2
    """
    lalpha = forward(lpr, np.log(wordHMMs['o']['startprob']),
                     np.log(wordHMMs['o']['transmat']))
    diff1 = example['logalpha'] - lalpha  # 0
    # log-likelihood
    loglike = logsumexp(lalpha[-1])
    diff0 = example['loglik'] - loglike  # 0

    # score all the 44 utterances in the data array with each of the 11 HMM
    # models in wordHMMs.
    scores_1 = np.zeros((44, 11))
    scores_2 = np.zeros((44, 11))
    labels_ori = []
    labels_pre = []
    labels_pre2 = []
    keys = list(prondict.keys())
    acc_1 = 0
    acc_2 = 0
    if task == '5.2':
        for i in range(44):
            for j, key in enumerate(keys):
                lpr = log_multivariate_normal_density_diag(
                    data[i]['lmfcc'], wordHMMs_all[key]['means'],
                    wordHMMs_all[key]['covars'])
                alpha = forward(lpr, np.log(wordHMMs_all[key]['startprob']),
                                np.log(wordHMMs_all[key]['transmat']))
                scores_2[i, j] = logsumexp(alpha[-1])
                lpr_1 = log_multivariate_normal_density_diag(
                    data[i]['lmfcc'], wordHMMs[key]['means'],
                    wordHMMs[key]['covars'])
                alpha_1 = forward(lpr_1, np.log(wordHMMs[key]['startprob']),
                                  np.log(wordHMMs[key]['transmat']))
                scores_1[i, j] = logsumexp(alpha_1[-1])
            ori = data[i]['digit']
            pre_1 = keys[int(np.argmax(scores_1[i, :]))]
            pre_2 = keys[int(np.argmax(scores_2[i, :]))]
            #labels_ori.append(ori)
            labels_pre.append(pre_1)
            labels_pre2.append(pre_2)
            if ori == pre_1:
                acc_1 += 1
            if ori == pre_2:
                acc_2 += 1
        print("Accuracy (trained on all speakers): {0}/44; "
              "(trained on one speaker): {1}/44".format(acc_2, acc_1))
        print(labels_pre, labels_pre2)
    """
    5.3 Viterbi
    """
    viterbi_loglik, viterbi_path = viterbi(lpr,
                                           np.log(wordHMMs['o']['startprob']),
                                           np.log(wordHMMs['o']['transmat']))

    if task == '5.3':
        plt.pcolormesh(lalpha.T)
        plt.plot(viterbi_path, 'r')
        plt.title(
            'alpha array overlaid with best path obtained by Viterbi decoding '
        )
        plt.colorbar()
        plt.show()
        diff3 = example['vloglik'] - viterbi_loglik.T  # 0

        # Score all 44 utterances in the data with each of the 11 HMM models in wordHMMs
        for i in range(44):
            for j, key in enumerate(keys):
                lpr = log_multivariate_normal_density_diag(
                    data[i]['lmfcc'], wordHMMs_all[key]['means'],
                    wordHMMs_all[key]['covars'])
                viterbi_2, viterbi_path_2 = viterbi(
                    lpr, np.log(wordHMMs_all[key]['startprob']),
                    np.log(wordHMMs_all[key]['transmat']))
                scores_2[i, j] = viterbi_2
                lpr_1 = log_multivariate_normal_density_diag(
                    data[i]['lmfcc'], wordHMMs[key]['means'],
                    wordHMMs[key]['covars'])
                viterbi_1, viterbi_path_1 = viterbi(
                    lpr_1, np.log(wordHMMs[key]['startprob']),
                    np.log(wordHMMs[key]['transmat']))
                scores_1[i, j] = viterbi_1
            ori = data[i]['digit']
            pre_1 = keys[int(np.argmax(scores_1[i, :]))]
            pre_2 = keys[int(np.argmax(scores_2[i, :]))]
            #labels_ori.append(ori)
            labels_pre.append(pre_1)
            labels_pre2.append(pre_2)
            if ori == pre_1:
                acc_1 += 1
            if ori == pre_2:
                acc_2 += 1
        print("Accuracy (trained on all speakers): {0}/44; "
              "(trained on one speaker): {1}/44".format(acc_2, acc_1))
        print(labels_pre, labels_pre2)
    """
    5.4
    """
    lbeta = backward(lpr, np.log(wordHMMs['o']['startprob']),
                     np.log(wordHMMs['o']['transmat']))
    diff2 = example['logbeta'] - lbeta
    # log-likelihood
    loglike = logsumexp(lbeta[0])
    diff4 = example['loglik'] - loglike  # 0
    if task == '5.4':
        plt.figure()
        plt.subplot(1, 3, 1)
        plt.pcolormesh(lbeta)
        plt.title('log-beta')
        plt.subplot(1, 3, 2)
        plt.pcolormesh(example['logbeta'])
        plt.title('example')
        plt.subplot(1, 3, 3)
        plt.pcolormesh(example['logalpha'])
        plt.title('log-alpha')
        plt.show()
    """6 HMM Retraining(emission probability distributions)"""
    """
    6.1
    """
    lgamma = statePosteriors(lalpha, lbeta)
    N = lgamma.shape[0]
    K = 9
    lgamma_gmm = np.zeros((N, K))
    total = log_multivariate_normal_density_diag(example['lmfcc'],
                                                 wordHMMs['o']['means'],
                                                 wordHMMs['o']['covars'])
    if task == '6.1':
        print('HMM posteriors')
        print('each time step sum along state axis',
              np.sum(np.exp(lgamma), axis=1))  #=1
        print('each state sum along time axis',
              np.sum(np.exp(lgamma) / N, axis=0))  # N frames, not a hard-coded 71
        print('sum over both states and time steps',
              np.sum(np.sum(
                  np.exp(lgamma))))  # =length of obs sequence/time steps
        print('length of observation sequence', lalpha.shape[0])
        print('GMM posteriors')
        # for k in range(K):
        #lgamma_gmm[:, k] = 1 / K * total[:, k] / np.sum(total[:, k])
        gmm = mixture.GaussianMixture(n_components=9)
        gmm.fit(example['lmfcc'])
        gmm_post = gmm.predict_proba(example['lmfcc'])
        plt.subplot(2, 1, 1)
        plt.pcolormesh(gmm_post.T)
        plt.title('GMM posteriors')
        plt.colorbar()
        plt.subplot(2, 1, 2)
        plt.pcolormesh(lgamma.T)
        plt.title('HMM posteriors')
        plt.colorbar()
        plt.show()
    """6.2"""
    if task == '6.2':
        plt.figure()
        L = {}
        for d in prondict:
            # initialization
            log_pi = np.log(wordHMMs_all[d]['startprob'])
            log_tr = np.log(wordHMMs_all[d]['transmat'])
            means = wordHMMs_all[d]['means']
            covars = wordHMMs_all[d]['covars']
            l = []
            # repetition (EM iterations):
            for i in range(20):
                lpr = log_multivariate_normal_density_diag(
                    data[10]['lmfcc'], means, covars)
                # Expectation
                lalpha = forward(lpr, log_pi, log_tr)
                lbeta = backward(lpr, log_pi, log_tr)
                log_gamma = statePosteriors(lalpha, lbeta)
                # Maximization
                means, covars = updateMeanAndVar(data[10]['lmfcc'], log_gamma)
                # Estimate likelihood
                log_like = logsumexp(lalpha[-1])
                if i > 2 and log_like - l[-1] < 0.1:
                    l.append(log_like)
                    L[d] = l
                    break
                else:
                    l.append(log_like)
                    L[d] = l

            plt.plot(l, label=d)

        plt.legend()
        plt.title('log-likelihood (data[10] with different wordHMMs)')
        plt.show()
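
statePosteriors is called throughout but not defined in this excerpt. Under
the standard forward-backward definitions the log state posteriors are
log gamma[t, i] = log alpha[t, i] + log beta[t, i] - log P(X), with log P(X)
recovered from the last forward row, so a sketch consistent with the calls
above is simply:

from scipy.special import logsumexp  # as used elsewhere in this example

def statePosteriors(log_alpha, log_beta):
    # log P(X) is the logsumexp over states of the final forward row
    return log_alpha + log_beta - logsumexp(log_alpha[-1])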
Example #29
    def compute_paa(self, targets, target_labels, anchors, labels_all,
                    loss_all, matched_idx_all):
        """
        criteria: 'PAA' or 'GMM'

        Args:
            labels_all (batch_size x num_anchors): assigned labels
            loss_all (batch_size x num_anchors): calculated loss
        """
        cls_labels = []
        reg_targets = []
        matched_gts = []
        for im_i in range(len(targets)):
            targets_per_im = targets[im_i].tensor
            bboxes_per_im = targets_per_im
            labels_per_im = target_labels[im_i]
            anchors_per_im = Boxes.cat(anchors[im_i]).tensor
            num_gt = bboxes_per_im.shape[0]

            labels = labels_all[im_i]
            loss = loss_all[im_i]
            matched_idx = matched_idx_all[im_i]
            assert labels.shape == matched_idx.shape

            num_anchors_per_level = [
                len(anchors_per_level) for anchors_per_level in anchors[im_i]
            ]

            # select candidates based on IoUs between anchors and GTs
            candidate_idxs = []
            for gt in range(num_gt):
                candidate_idxs_per_gt = []
                star_idx = 0
                for level, anchors_per_level in enumerate(anchors[im_i]):
                    end_idx = star_idx + num_anchors_per_level[level]
                    loss_per_level = loss[star_idx:end_idx]
                    labels_per_level = labels[star_idx:end_idx]
                    matched_idx_per_level = matched_idx[star_idx:end_idx]
                    match_idx = ((matched_idx_per_level == gt) &
                                 (labels_per_level > 0)).nonzero()[:, 0]
                    if match_idx.numel() > 0:
                        _, topk_idxs = loss_per_level[match_idx].topk(
                            min(match_idx.numel(), self.cfg.MODEL.PAA.TOPK),
                            largest=False)
                        topk_idxs_per_level_per_gt = match_idx[topk_idxs]
                        candidate_idxs_per_gt.append(
                            topk_idxs_per_level_per_gt + star_idx)
                    star_idx = end_idx
                if candidate_idxs_per_gt:
                    candidate_idxs.append(torch.cat(candidate_idxs_per_gt))
                else:
                    candidate_idxs.append(None)

            # fit 2-mode GMM per GT box
            n_labels = anchors_per_im.shape[0]
            cls_labels_per_im = torch.zeros(n_labels, dtype=torch.long).cuda()
            matched_gts_per_im = torch.zeros_like(anchors_per_im)
            fg_inds = matched_idx >= 0
            matched_gts_per_im[fg_inds] = bboxes_per_im[matched_idx[fg_inds]]
            is_grey = None
            for gt in range(num_gt):
                if candidate_idxs[gt] is not None:
                    if candidate_idxs[gt].numel() > 1:
                        candidate_loss = loss[candidate_idxs[gt]]
                        candidate_loss, inds = candidate_loss.sort()
                        candidate_loss = candidate_loss.view(-1, 1).cpu().numpy()
                        min_loss, max_loss = candidate_loss.min(), candidate_loss.max()
                        means_init = [[min_loss], [max_loss]]
                        weights_init = [0.5, 0.5]
                        precisions_init = [[[1.0]], [[1.0]]]
                        gmm = skm.GaussianMixture(
                            2,
                            weights_init=weights_init,
                            means_init=means_init,
                            precisions_init=precisions_init)
                        gmm.fit(candidate_loss)
                        components = gmm.predict(candidate_loss)
                        scores = gmm.score_samples(candidate_loss)
                        components = torch.from_numpy(components).to("cuda")
                        scores = torch.from_numpy(scores).to("cuda")
                        fgs = components == 0
                        bgs = components == 1
                        if fgs.nonzero().numel() > 0:
                            fg_max_score = scores[fgs].max().item()
                            fg_max_idx = (
                                fgs &
                                (scores == fg_max_score)).nonzero().min()
                            is_neg = inds[fgs | bgs]
                            is_pos = inds[:fg_max_idx + 1]
                        else:
                            # just treat all samples as positive for high recall.
                            is_pos = inds
                            is_neg = is_grey = None
                    else:
                        is_pos = 0
                        is_neg = None
                        is_grey = None
                    if is_grey is not None:
                        grey_idx = candidate_idxs[gt][is_grey]
                        cls_labels_per_im[grey_idx] = -1
                    if is_neg is not None:
                        neg_idx = candidate_idxs[gt][is_neg]
                        cls_labels_per_im[neg_idx] = 0
                    pos_idx = candidate_idxs[gt][is_pos]
                    cls_labels_per_im[pos_idx] = labels_per_im[gt].view(-1, 1)
                    matched_gts_per_im[pos_idx] = bboxes_per_im[gt].view(-1, 4)

            reg_targets_per_im = self.box_coder.get_deltas(
                anchors_per_im, matched_gts_per_im)
            cls_labels.append(cls_labels_per_im)
            reg_targets.append(reg_targets_per_im)
            matched_gts.append(matched_gts_per_im)

        return cls_labels, reg_targets, matched_gts
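
Stripped of the detection plumbing, the core of compute_paa is a 1-D,
two-component GMM over per-anchor losses, seeded so that component 0 sits on
the low-loss (positive) mode. A self-contained sketch on synthetic losses
(all numbers here are hypothetical):

import numpy as np
from sklearn import mixture as skm

rng = np.random.RandomState(0)
losses = np.sort(np.concatenate([rng.rand(30) * 0.2,          # easy anchors
                                 0.8 + rng.rand(10) * 0.2]))  # hard anchors
losses = losses.reshape(-1, 1)
gmm = skm.GaussianMixture(2,
                          weights_init=[0.5, 0.5],
                          means_init=[[losses.min()], [losses.max()]],
                          precisions_init=[[[1.0]], [[1.0]]]).fit(losses)
positive = gmm.predict(losses) == 0  # component 0 was seeded at the low-loss mode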
Example #30
    for i in range(0, len(idx_positive[0])):
        positive_features[i] = gen_features[idx_positive[0][i]]

    print("positive features: ", type(positive_features),
          positive_features.shape)

    for i in range(0, len(idx_negative[0])):
        negative_features[i] = gen_features[idx_negative[0][i]]

    print("negative features: ", type(negative_features),
          negative_features.shape)

    n_components = np.arange(1, 10)
    models = [
        mixture.GaussianMixture(n, covariance_type='full', random_state=0)
        for n in n_components
    ]
    # fit each model once, then read off both information criteria
    fitted = [model.fit(positive_features) for model in models]
    aics = [model.aic(positive_features) for model in fitted]
    bics = [model.bic(positive_features) for model in fitted]
    plt.plot(n_components, aics, label="AIC-positive")
    plt.plot(n_components, bics, label="BIC-positive")
    plt.legend(loc='best')
    plt.xlabel('n_components')
    plt.savefig("/path/to/save/results/positive_data.png")
    print("Plot saved")
    gmm_positive = mixture.GaussianMixture(4,