Example #1
import numpy as np
from sklearn.mixture import GaussianMixture


def gmm_scale(gmm, shift=None, scale=None, reverse=False, params=None):
    """
    Apply scaling factors to GMM instances.

    Parameters
    ----------
    gmm : GaussianMixture
        GMM instance to be scaled.
    shift : int or float, optional
        Shift applied to the means of all components. Default is None (no shift).
    scale : int or float, optional
        Scale applied to all components. Default is None (no scaling).
    reverse : bool, optional
        Whether the GMM should be mirrored about the origin (means negated).
    params : dict, optional
        GaussianMixture parameters used to initialize the new instance.
        Fetched from `gmm` if not supplied.

    Returns
    -------
    GaussianMixture
        Modified GMM instance.

    """

    # Fetch parameters if not supplied
    if params is None:
        # noinspection PyUnresolvedReferences
        params = gmm.get_params()

    # Instantiate new GMM
    gmm_new = GaussianMixture(**params)

    # Copy the fitted weights
    gmm_new.weights_ = gmm.weights_.copy()

    # Apply shift if set (copy so the original model is never modified in place)
    gmm_new.means_ = gmm.means_ + shift if shift is not None else gmm.means_.copy()

    # Apply scale
    if scale is not None:
        gmm_new.means_ /= scale

    gmm_new.covariances_ = gmm.covariances_ / scale ** 2 if scale is not None else gmm.covariances_
    gmm_new.precisions_ = np.linalg.inv(gmm_new.covariances_) if scale is not None else gmm.precisions_
    gmm_new.precisions_cholesky_ = np.linalg.cholesky(gmm_new.precisions_) if scale is not None \
        else gmm.precisions_cholesky_

    # Mirror the model about the origin if requested
    if reverse:
        gmm_new.means_ *= -1

    # Carry over the converged flag if the original model has one
    if hasattr(gmm, "converged_"):
        gmm_new.converged_ = gmm.converged_

    # Return scaled GMM
    return gmm_new
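
A minimal usage sketch for gmm_scale on synthetic 1-D data (the data and values below are illustrative):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
X = np.concatenate([rng.normal(-2, 0.5, 500), rng.normal(3, 1.0, 500)]).reshape(-1, 1)

gmm = GaussianMixture(n_components=2, random_state=0).fit(X)

# Shift the model by +10, then shrink it by a factor of 2
gmm_scaled = gmm_scale(gmm, shift=10, scale=2)
print(gmm.means_.ravel(), gmm_scaled.means_.ravel())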
Example #2
from sklearn.mixture import GaussianMixture as GMM


def update_GMM(g_x, N_samples, chunk):
    """Incrementally update the GMM g_x (fitted on N_samples points) with a new chunk of data."""
    M_samples = chunk.shape[0]
    # Fit a single-component model to the new chunk (EM_using_mono, merge and
    # modify_gaussian are helpers defined elsewhere in this project)
    a_x = EM_using_mono(chunk)
    m = g_x.weights_.shape[0]  # number of components in the original model
    C = merge(g_x, N_samples, a_x, M_samples)
    g_x2 = GMM(n_components=1, covariance_type='spherical')
    g_x2.fit(chunk)
    modify_gaussian(g_x2, C)
    g_x2.converged_ = True
    return N_samples + M_samples, g_x2
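
EM_using_mono, merge and modify_gaussian are project-specific helpers, so the snippet is not runnable on its own. As a purely hypothetical sketch of the underlying idea (not this project's actual merge), the mixture weights of two fitted models could be combined in proportion to their sample counts:

import numpy as np

def merge_weights_sketch(gmm_old, N, gmm_new, M):
    # Hypothetical: weight each model's mixture weights by its share of all samples
    total = float(N + M)
    w = np.concatenate([gmm_old.weights_ * (N / total),
                        gmm_new.weights_ * (M / total)])
    return w / w.sum()  # guard against rounding error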
Example #3
  def _sample_rows_same(self, X):
    """Uses the efficient sklearn implementation to sample from a Gaussian mixture.
    Only works if all rows of X are the same."""
    weights, locs, scales = self._get_mixture_components(np.expand_dims(X[0], axis=0))

    # normalize the weights so they sum to 1
    weights = weights.astype(np.float64)
    weights = weights / np.sum(weights)

    gmm = GaussianMixture(n_components=self.n_centers, covariance_type='diag', max_iter=5, tol=1e-1)
    gmm.fit(np.random.normal(size=(100, self.ndim_y)))  # dummy fit so sklearn treats the model as fitted
    # override the fitted GMM parameters with our own
    gmm.converged_ = True
    gmm.weights_ = weights[0]
    gmm.means_ = locs[0]
    gmm.covariances_ = scales[0]
    y_sample, _ = gmm.sample(X.shape[0])
    assert y_sample.shape == (X.shape[0], self.ndim_y)
    return X, y_sample
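
The same "dummy fit, then override" pattern also works outside a class. A self-contained sketch with illustrative parameters (diagonal covariances):

import numpy as np
from sklearn.mixture import GaussianMixture

n_centers, ndim = 3, 2
gmm = GaussianMixture(n_components=n_centers, covariance_type='diag')
gmm.fit(np.random.normal(size=(100, ndim)))  # dummy fit to mark the model as fitted

# Override with parameters of our own choosing
gmm.weights_ = np.array([0.5, 0.3, 0.2])
gmm.means_ = np.array([[-3.0, 0.0], [0.0, 0.0], [3.0, 0.0]])
gmm.covariances_ = np.ones((n_centers, ndim))  # per-dimension variances

samples, labels = gmm.sample(1000)
print(samples.shape)  # (1000, 2)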
Example #4
import numpy
from sklearn.mixture import GaussianMixture


def test_once_by_random_features():
    Xtrain = numpy.random.random_sample((5000)).reshape(-1, 10)
    Xtest = numpy.random.random_sample((500)).reshape(-1, 10)

    gmm_orig = GaussianMixture(n_components=8, random_state=1)
    gmm_copy = GaussianMixture()

    gmm_orig.fit(Xtrain)

    # Copy every fitted attribute; score_samples only reads these arrays,
    # so gmm_copy's constructor arguments do not matter
    gmm_copy.weights_ = gmm_orig.weights_
    gmm_copy.means_ = gmm_orig.means_
    gmm_copy.covariances_ = gmm_orig.covariances_
    gmm_copy.precisions_ = gmm_orig.precisions_
    gmm_copy.precisions_cholesky_ = gmm_orig.precisions_cholesky_
    gmm_copy.converged_ = gmm_orig.converged_
    gmm_copy.n_iter_ = gmm_orig.n_iter_
    gmm_copy.lower_bound_ = gmm_orig.lower_bound_

    y_orig = gmm_orig.score_samples(Xtest)
    y_copy = gmm_copy.score_samples(Xtest)

    return all(y_orig == y_copy)
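
A quick check, assuming the test function above is available in the current module:

if __name__ == "__main__":
    # The copied model should reproduce the original scores exactly
    assert test_once_by_random_features()
    print("OK: copied GMM matches the original")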
Example #5
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

def fit_markov_chain(y, plot=False):
	y_0 = y[:-1]
	y_1 = y[1:]
	grad_0 = np.gradient(y_0)
	grad_1 = np.gradient(y_1)
	state_1 = grad_1[np.where(grad_0 < 0)] # instances where previous gradient was negative
	state_2 = grad_1[np.where(grad_0 > 0)] # instances where previous gradient was positive
	mean_1,std_1 = stats.norm.fit(state_1)
	mean_2,std_2 = stats.norm.fit(state_2)
	# Reshape parameters to be suitable for sklearn GaussianMixture
	means = np.array([mean_1, mean_2]).reshape(2, 1)
	variances = np.array([std_1**2, std_2**2])
	GM = GaussianMixture(n_components=2, covariance_type='spherical')
	GM.weights_ = np.array([0.5, 0.5])
	GM.means_ = means
	# For 'spherical' covariance, sklearn stores variances; precisions are
	# 1/variance and the precision Cholesky factors are 1/std
	GM.covariances_ = variances
	GM.precisions_ = 1.0 / variances
	GM.precisions_cholesky_ = 1.0 / np.array([std_1, std_2])
	GM.converged_ = True
	if(plot):
		samples = GM.sample(5000)[0]
		fig,ax_list = plt.subplots(3,1)
		fig.set_size_inches(20,20)
		ax_list[0].hist(state_1,bins=70)
		ax_list[1].hist(state_2,bins=70)
		lnspc_1 = np.linspace(state_1.min(),state_1.max(),y.shape[0])
		gauss_1 = stats.norm.pdf(lnspc_1, mean_1, std_1)
		lnspc_2 = np.linspace(state_2.min(),state_2.max(),y.shape[0])
		gauss_2 = stats.norm.pdf(lnspc_2, mean_2, std_2)
		ax_list[0].plot(lnspc_1,gauss_1)
		ax_list[1].plot(lnspc_2,gauss_2)
		ax_list[0].scatter(mean_1,30)
		ax_list[1].scatter(mean_2,30)
		ax_list[2].hist(samples,bins=100)
		plt.show()
	return GM
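
A minimal usage sketch on a synthetic oscillating series (the data and seed below are illustrative):

import numpy as np

rng = np.random.default_rng(0)
t = np.linspace(0, 20 * np.pi, 2000)
y = np.sin(t) + rng.normal(scale=0.1, size=t.shape)

GM = fit_markov_chain(y)  # two-state model of the gradient dynamics
samples, components = GM.sample(10)
print(samples.ravel())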
Example #6
def main(dataset_name, pca, cluster_method, lm_type, document_repr_type,
         random_state):
    save_dict_data = {}

    # pca = 0 means no pca
    do_pca = pca != 0

    save_dict_data["dataset_name"] = dataset_name
    save_dict_data["pca"] = pca
    save_dict_data["cluster_method"] = cluster_method
    save_dict_data["lm_type"] = lm_type
    save_dict_data["document_repr_type"] = document_repr_type
    save_dict_data["random_state"] = random_state

    naming_suffix = f"pca{pca}.clus{cluster_method}.{lm_type}.{document_repr_type}.{random_state}"
    print(naming_suffix)

    data_dir = os.path.join(INTERMEDIATE_DATA_FOLDER_PATH, dataset_name)
    print(data_dir)

    with open(os.path.join(data_dir, "dataset.pk"), "rb") as f:
        dictionary = pk.load(f)
        class_names = dictionary["class_names"]
        num_classes = len(class_names)
        print(class_names)

    with open(
            os.path.join(
                data_dir,
                f"document_repr_lm-{lm_type}-{document_repr_type}.pk"),
            "rb") as f:
        dictionary = pk.load(f)
        document_representations = dictionary["document_representations"]
        class_representations = dictionary["class_representations"]
        repr_prediction = np.argmax(cosine_similarity_embeddings(
            document_representations, class_representations),
                                    axis=1)
        save_dict_data["repr_prediction"] = repr_prediction

    if do_pca:
        _pca = PCA(n_components=pca, random_state=random_state)
        document_representations = _pca.fit_transform(document_representations)
        class_representations = _pca.transform(class_representations)
        print(f"Explained variance: {sum(_pca.explained_variance_ratio_)}")

    if cluster_method == 'gmm':
        cosine_similarities = cosine_similarity_embeddings(
            document_representations, class_representations)
        document_class_assignment = np.argmax(cosine_similarities, axis=1)
        document_class_assignment_matrix = np.zeros(
            (document_representations.shape[0], num_classes))
        for i in range(document_representations.shape[0]):
            document_class_assignment_matrix[i][
                document_class_assignment[i]] = 1.0

        gmm = GaussianMixture(n_components=num_classes,
                              covariance_type='tied',
                              random_state=random_state,
                              n_init=999,
                              warm_start=True)
        # With warm_start=True, the mere presence of a converged_ attribute
        # makes fit() skip re-initialization, so the parameters seeded below
        # are used as-is (hence the "HACK" marker)
        gmm.converged_ = "HACK"

        # Seed the model from the cosine-similarity class assignments
        # (_initialize is a private sklearn API)
        gmm._initialize(document_representations,
                        document_class_assignment_matrix)
        gmm.lower_bound_ = -np.inf
        gmm.fit(document_representations)

        documents_to_class = gmm.predict(document_representations)
        centers = gmm.means_
        save_dict_data["centers"] = centers
        distance = -gmm.predict_proba(document_representations) + 1
    elif cluster_method == 'kmeans':
        kmeans = KMeans(n_clusters=num_classes,
                        init=class_representations,
                        random_state=random_state)
        kmeans.fit(document_representations)

        documents_to_class = kmeans.predict(document_representations)
        centers = kmeans.cluster_centers_
        save_dict_data["centers"] = centers
        distance = np.zeros(
            (document_representations.shape[0], centers.shape[0]), dtype=float)
        for i, _emb_a in enumerate(document_representations):
            for j, _emb_b in enumerate(centers):
                distance[i][j] = np.linalg.norm(_emb_a - _emb_b)

    save_dict_data["documents_to_class"] = documents_to_class
    save_dict_data["distance"] = distance

    with open(os.path.join(data_dir, f"data.{naming_suffix}.pk"), "wb") as f:
        pk.dump(save_dict_data, f)
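
The warm-start seeding trick used in the 'gmm' branch can be isolated into a self-contained sketch. The data and one-hot responsibilities below are illustrative, and _initialize is a private sklearn API that may change between versions:

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(-3, 1, (100, 5)), rng.normal(3, 1, (100, 5))])

# One-hot responsibility matrix from some prior assignment
resp = np.zeros((X.shape[0], 2))
resp[:100, 0] = 1.0
resp[100:, 1] = 1.0

gmm = GaussianMixture(n_components=2, covariance_type='tied', warm_start=True)
gmm.converged_ = True     # with warm_start, suppresses re-initialization in fit()
gmm._initialize(X, resp)  # seed means/covariances from the assignments
gmm.lower_bound_ = -np.inf
gmm.fit(X)
print(gmm.means_)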