def sample(self):
    """Draw a single sample from this density.

    Validates the density first (``_check_is_valid_density`` raises when the
    parameters are not valid), then delegates the draw to ``sample_vMF``.

    Returns:
        ndarray: one sample from the von Mises-Fisher distribution defined by
        this object's normalized mean and concentration.
    """
    self._check_is_valid_density()
    mean_direction = self.get_normalized_mean()
    concentration = self.get_concentration()
    draws = sample_vMF(mu=mean_direction, kappa=concentration, num_samples=1)
    return draws[0]
def test_maximization():
    """Check that _maximization recovers mu and kappa on single-cluster data.

    Draws one vMF cluster with known parameters, gives every point full
    responsibility, and asserts the M-step estimates are close to the truth.
    """
    n_samples = 5000
    dim = 500
    # Single component: every point belongs to it with weight 1.
    resp = np.ones((1, n_samples))
    for true_kappa in (5000, 8000, 16400):
        direction = np.random.randn(dim)
        direction /= np.linalg.norm(direction)
        data = sample_vMF(direction, true_kappa, n_samples)
        centers, weights, concentrations = (
            von_mises_fisher_mixture._maximization(data, resp))
        center_err = np.linalg.norm(centers[0, :] - direction)
        kappa_rel_err = np.abs(true_kappa - concentrations[0]) / true_kappa
        print('center estimate error', center_err)
        print('kappa estimate', kappa_rel_err, true_kappa, concentrations[0])
        assert_almost_equal(1., weights[0])
        assert_almost_equal(0.0, kappa_rel_err, decimal=2)
        assert_almost_equal(0.0, center_err, decimal=2)
def sample_mix_vMF(center, kappa, weight, num_doc):
    """Draw samples from a mixture of von Mises-Fisher distributions.

    Args:
        center: sequence of per-component mean directions.
        kappa: sequence of per-component concentration parameters.
        weight: mixture weights (must sum to 1), one per component.
        num_doc: number of samples to draw.

    Returns:
        ndarray of shape (num_doc, dim): one vMF sample per row, with each
        row's component chosen according to ``weight``.
    """
    # Choose a mixture component for every sample in one vectorized draw;
    # np.random.choice accepts an int directly, no need for range(len(...)).
    component_idx = np.random.choice(len(center), num_doc, p=weight)
    # sample_vMF(..., 1) yields one sample; vstack the per-sample results
    # into a (num_doc, dim) matrix (replaces append/np.array/reshape).
    samples = np.vstack([sample_vMF(center[idx], kappa[idx], 1)
                         for idx in component_idx])
    return samples
def generateDataset(mu_s, kappa_s, num_samples):
    """Sample a vMF cluster per mean direction and return points with labels.

    Args:
        mu_s: (num_clusters, dim) array of mean directions, one row per cluster.
        kappa_s: per-cluster concentration parameters.
        num_samples: number of points drawn for each cluster.

    Returns:
        tuple: (X, Y) where X has shape (num_clusters, num_samples, dim) and
        Y has shape (num_clusters, num_samples), filled with cluster indices.
    """
    n_clusters, n_dims = mu_s.shape
    X = np.zeros((n_clusters, num_samples, n_dims))
    Y = np.zeros((n_clusters, num_samples))
    for cluster_id, (mu, kappa) in enumerate(zip(mu_s, kappa_s)):
        X[cluster_id] = sample_vMF(mu, kappa, num_samples)
        Y[cluster_id] = cluster_id
    return X, Y
def sample(self):
    """Draw one (mean, concentration) pair from this density.

    The concentration is drawn from a Gamma distribution parameterized by the
    stored natural parameters; the mean is then drawn via ``sample_vMF`` using
    that concentration.

    Returns:
        dict: with keys 'mean' (ndarray) and 'concentration' (scalar).
    """
    self._check_is_valid_density()
    params = self.natural_parameters
    shape = params['alpha_minus_one'] + 1.0  # natural param stores alpha - 1
    rate = params['beta']
    # scipy's gamma takes a scale parameter, i.e. the inverse rate.
    concentration = scipy.stats.gamma.rvs(a=shape, scale=1.0 / rate, size=1)[0]
    mean = sample_vMF(mu=self.get_normalized_mean(),
                      kappa=concentration,
                      num_samples=1)[0]
    return {'mean': mean, 'concentration': concentration}
def create_data(seed, vmeans, vkappas, num_per_class=500):
    """Generate two-view vMF mixture data with shared component labels.

    Args:
        seed: value passed to ``np.random.seed`` for reproducibility.
        vmeans: per-view component mean directions, indexed vmeans[view][comp].
        vkappas: per-view component concentrations, same layout as vmeans.
        num_per_class: samples drawn for each component.

    Returns:
        tuple: (data, labels) where data is a list of two stacked sample
        arrays (one per view) and labels holds the component index of each row.
    """
    np.random.seed(seed)
    n_components = len(vmeans[0])
    data = []
    for view in range(2):
        # Draw every component for this view, then stack into one matrix.
        view_samples = [sample_vMF(vmeans[view][comp],
                                   vkappas[view][comp],
                                   num_per_class)
                        for comp in range(n_components)]
        data.append(np.vstack(view_samples))
    labels = np.concatenate([comp * np.ones(num_per_class, )
                             for comp in range(n_components)])
    return data, labels
def sample_observation(self, parameters):
    """Draw a single observation from a von Mises-Fisher distribution.

    Args:
        parameters (Map):
            normalized_mean (ndarray): length num_dim, l2 norm = 1
            concentration (double): positive

    Returns:
        obs (ndarray): one sample drawn by ``sample_vMF``.

    Raises:
        ValueError: if the mean is not unit-norm or the concentration is
            not positive.
    """
    from spherecluster import sample_vMF
    mean = parameters.normalized_mean
    kappa = parameters.concentration
    if not np.isclose(np.linalg.norm(mean), 1.0):
        raise ValueError("normalized_mean must have l2 norm = 1.0")
    if kappa < 1e-16:
        raise ValueError("concentration must be positive")
    return sample_vMF(mu=mean, kappa=kappa, num_samples=1)[0]
def pseudodocs(word_sup_array, total_num, background_array, sequence_length,
               len_avg, len_std, num_doc, interp_weight, vocabulary_inv,
               embedding_mat, centers, kappa, model, save_dir=None):
    """Generate pseudo documents and soft labels for each class.

    For every class, "discourse" vectors are sampled from a von Mises-Fisher
    distribution centered at that class's center vector. Each discourse
    induces a softmax word distribution over the vocabulary, which is
    truncated to its ``total_num`` most probable words and interpolated with
    a background word distribution before words are drawn.

    Args:
        word_sup_array: per-class seed-word arrays; only its length (the
            number of classes) is used in this function.
        total_num: number of top-probability words kept per discourse.
        background_array: background word distribution over the vocabulary.
        sequence_length: document length for 'cnn'; a (num_sents, sent_cols)
            pair for 'rnn'.
        len_avg: average document length for 'cnn'; a (doc_len, sent_len)
            pair for 'rnn'.
        len_std: unused here.  # NOTE(review): presumably kept for interface parity — confirm
        num_doc: number of pseudo documents generated per class.
        interp_weight: weight of the background distribution; also used for
            label smoothing.
        vocabulary_inv: index -> word mapping; used to locate the '.' token.
        embedding_mat: word embedding matrix; L2-normalized in place below.
        centers: per-class discourse centers (vMF mean directions).
        kappa: vMF concentration shared by all classes.
        model: 'cnn' (flat documents) or 'rnn' (sentence-structured).
        save_dir: unused here (only referenced by commented-out code).

    Returns:
        (docs, label): int32 word-id documents and the soft label matrix.
    """
    # Normalize embeddings in place so the dot products below act as
    # cosine similarities.
    for i in range(len(embedding_mat)):
        embedding_mat[i] = embedding_mat[i] / np.linalg.norm(embedding_mat[i])
    # _, centers, kappas = \
    #     label_expansion(word_sup_array, save_dir, vocabulary_inv, embedding_mat)
    print("Pseudo documents generation...")
    background_vec = interp_weight * background_array
    if model == 'cnn':
        docs = np.zeros((num_doc*len(word_sup_array), sequence_length), dtype='int32')
        label = np.zeros((num_doc*len(word_sup_array), len(word_sup_array)))
        for i in range(len(word_sup_array)):
            docs_len = len_avg*np.ones(num_doc)
            center = centers[i]
            # kappa = kappas[i]
            discourses = sample_vMF(center, kappa, num_doc)
            for j in range(num_doc):
                discourse = discourses[j]
                # Softmax over word/discourse similarity.
                prob_vec = np.dot(embedding_mat, discourse)
                prob_vec = np.exp(prob_vec)
                # Zero out everything but the total_num most probable words.
                sorted_idx = np.argsort(prob_vec)[::-1]
                delete_idx = sorted_idx[total_num:]
                prob_vec[delete_idx] = 0
                prob_vec /= np.sum(prob_vec)
                # Interpolate with the background distribution.
                prob_vec *= 1 - interp_weight
                prob_vec += background_vec
                doc_len = int(docs_len[j])
                docs[i*num_doc+j][:doc_len] = np.random.choice(len(prob_vec), size=doc_len, p=prob_vec)
                # Smoothed one-hot label: extra mass 1 - interp_weight on class i.
                label[i*num_doc+j] = interp_weight/len(word_sup_array)*np.ones(len(word_sup_array))
                label[i*num_doc+j][i] += 1 - interp_weight
    elif model == 'rnn':
        docs = np.zeros((num_doc*len(word_sup_array), sequence_length[0], sequence_length[1]), dtype='int32')
        label = np.zeros((num_doc*len(word_sup_array), len(word_sup_array)))
        doc_len = int(len_avg[0])
        sent_len = int(len_avg[1])
        # Find the vocabulary id of the period token, used to end sentences.
        for period_idx in vocabulary_inv:
            if vocabulary_inv[period_idx] == '.':
                break
        for i in range(len(word_sup_array)):
            center = centers[i]
            # kappa = kappas[i]
            discourses = sample_vMF(center, kappa, num_doc)
            for j in range(num_doc):
                discourse = discourses[j]
                # Same truncated-softmax word distribution as the cnn branch.
                prob_vec = np.dot(embedding_mat, discourse)
                prob_vec = np.exp(prob_vec)
                sorted_idx = np.argsort(prob_vec)[::-1]
                delete_idx = sorted_idx[total_num:]
                prob_vec[delete_idx] = 0
                prob_vec /= np.sum(prob_vec)
                prob_vec *= 1 - interp_weight
                prob_vec += background_vec
                # Each sentence: sent_len sampled words followed by a period.
                for k in range(doc_len):
                    docs[i*num_doc+j][k][:sent_len] = np.random.choice(len(prob_vec), size=sent_len, p=prob_vec)
                    docs[i*num_doc+j][k][sent_len] = period_idx
                label[i*num_doc+j] = interp_weight/len(word_sup_array)*np.ones(len(word_sup_array))
                label[i*num_doc+j][i] += 1 - interp_weight
    print("Finished Pseudo documents generation.")
    return docs, label
def bow_pseudodocs(relevant_nodes, expand_num, background_array, sequence_length,
                   len_avg, len_std, num_doc, interp_weight, vocabulary_inv,
                   embedding_mat, class_emb, kappa, save_dir=None, total_num=50):
    """Generate bag-of-words pseudo documents for classes in a hierarchy.

    For each node in ``relevant_nodes``, discourse vectors are drawn from a
    vMF distribution centered at the class embedding (or, for internal nodes,
    at the children's embeddings, splitting ``num_doc`` across children).
    Words are then sampled from a truncated softmax over word/discourse
    similarity, interpolated with a background distribution.

    Args:
        relevant_nodes: class nodes; each has ``.name`` and ``.children``.
        expand_num: unused here.  # NOTE(review): presumably kept for interface parity — confirm
        background_array: background word distribution over the vocabulary.
        sequence_length: length of each generated document.
        len_avg: average document length (used for every document).
        len_std: unused here.
        num_doc: pseudo documents generated per class.
        interp_weight: background interpolation weight / label smoothing.
        vocabulary_inv: unused here.
        embedding_mat: word embedding matrix; L2-normalized in place below.
        class_emb: mapping from class name to its embedding (vMF mean).
        kappa: vMF concentration shared by all classes.
        save_dir: unused here.
        total_num: number of top-probability words kept per discourse.

    Returns:
        (docs, label): int32 word-id documents and the soft label matrix.
    """
    n_classes = len(relevant_nodes)
    # Normalize embeddings in place so dot products act as cosine scores.
    for i in range(len(embedding_mat)):
        embedding_mat[i] = embedding_mat[i] / np.linalg.norm(embedding_mat[i])
    background_vec = interp_weight * background_array
    docs = np.zeros((num_doc * n_classes, sequence_length), dtype='int32')
    label = np.zeros((num_doc * n_classes, n_classes))
    for i in range(n_classes):
        docs_len = len_avg * np.ones(num_doc)
        relevant_node = relevant_nodes[i]
        if relevant_node.children:
            # Internal node: split num_doc evenly across the children; the
            # remainder (rm_doc) is drawn from the first child up front.
            children_nodes = relevant_node.children
            child_doc = num_doc // len(children_nodes)
            rm_doc = num_doc % len(children_nodes)
            children_node = children_nodes[0]
            center = class_emb[children_node.name]
            discourses = sample_vMF(center, kappa, rm_doc)
            for children_node in children_nodes:
                center = class_emb[children_node.name]
                discourses_child = sample_vMF(center, kappa, child_doc)
                discourses = np.concatenate((discourses, discourses_child), axis=0)
        else:
            # Leaf node: all discourses come from its own class embedding.
            center = class_emb[relevant_node.name]
            discourses = sample_vMF(center, kappa, num_doc)
        for j in range(num_doc):
            discourse = discourses[j]
            # Softmax over word/discourse similarity.
            prob_vec = np.dot(embedding_mat, discourse)
            prob_vec = np.exp(prob_vec)
            # Zero out everything but the total_num most probable words.
            sorted_idx = np.argsort(-prob_vec)
            delete_idx = sorted_idx[total_num:]
            prob_vec[delete_idx] = 0
            prob_vec /= np.sum(prob_vec)
            # Interpolate with the background distribution.
            prob_vec *= 1 - interp_weight
            prob_vec += background_vec
            doc_len = int(docs_len[j])
            docs[i * num_doc + j][:doc_len] = np.random.choice(len(prob_vec), size=doc_len, p=prob_vec)
            # Smoothed one-hot label: extra mass 1 - interp_weight on class i.
            label[i * num_doc + j] = interp_weight / n_classes * np.ones(n_classes)
            label[i * num_doc + j][i] += 1 - interp_weight
    return docs, label
    # NOTE(review): tail of a helper defined above this chunk; ``raw_input``
    # is the Python 2 input function, so this is presumably a py2 fallback
    # path — confirm against the full file. Indentation assumed 4 spaces.
    return raw_input(val)


###############################################################################
# Generate small-mix dataset: two vMF clusters on the unit sphere in R^3.
mu_0 = np.array([-0.251, -0.968, -0.105])
mu_0 = mu_0 / np.linalg.norm(mu_0)  # normalize to a unit direction
mu_1 = np.array([0.399, 0.917, 0.713])
mu_1 = mu_1 / np.linalg.norm(mu_1)
mus = [mu_0, mu_1]
kappa_0 = 8  # concentration parameter
kappa_1 = 2  # concentration parameter
kappas = [kappa_0, kappa_1]
num_points_per_class = 300

# Draw both clusters and stack them into one (600, 3) dataset.
X_0 = sample_vMF(mu_0, kappa_0, num_points_per_class)
X_1 = sample_vMF(mu_1, kappa_1, num_points_per_class)
X = np.zeros((2 * num_points_per_class, 3))
X[:num_points_per_class, :] = X_0
X[num_points_per_class:, :] = X_1

# Ground-truth labels: first half class 0, second half class 1.
labels = np.zeros((2 * num_points_per_class, ))
labels[num_points_per_class:] = 1

###############################################################################
# K-Means clustering
km = KMeans(n_clusters=2, init='k-means++', n_init=20)
km.fit(X)

# Distance from each estimated center to the true mu_0, presumably to match
# estimated clusters against the ground-truth labeling.
cdists = []
for center in km.cluster_centers_:
    cdists.append(np.linalg.norm(mus[0] - center))
import seaborn # NOQA from spherecluster import sample_vMF plt.ion() n_clusters = 3 mus = np.random.randn(3, n_clusters) mus, r = np.linalg.qr(mus, mode='reduced') kappas = [15, 15, 15] num_points_per_class = 250 Xs = [] for nn in range(n_clusters): new_X = sample_vMF(mus[nn], kappas[nn], num_points_per_class) Xs.append(new_X.T) fig = plt.figure(figsize=(8, 6)) ax = fig.add_subplot(1, 1, 1, aspect='equal', projection='3d', adjustable='box-forced', xlim=[-1.1, 1.1], ylim=[-1.1, 1.1], zlim=[-1.1, 1.1]) colors = ['b', 'r', 'g'] for nn in range(n_clusters):