else:
						# Fallback branch; the matching `if` lies above this excerpt.
						# Build a vector for the token with w2v_model.seeded_vector, remember
						# it in unknown_words, and add it to the vectors that get scored below.
						# NOTE(review): presumably this handles tokens missing from the
						# word2vec vocabulary -- confirm against the condition above.
						unknown_vec = w2v_model.seeded_vector(token)
						unknown_words[token] = unknown_vec
						test_vectors.append(unknown_vec)

				# Train GMM: fit a diagonal-covariance mixture with num_topics components
				# on w2v_model.syn0, running num_gmm_iterations iterations (n_iter).
				print 'Starting GMM training'
				# NOTE(review): `words` is not referenced again in the visible code.
				words = w2v_model.vocab.keys()
				word_vectors = w2v_model.syn0
				gmm_model = GMM(n_components=num_topics, n_iter=num_gmm_iterations, covariance_type='diag')
				gmm_model.fit(word_vectors)
				# joblib.dump(gmm_model, gmm_output_file) 
				print 'Done GMM training'

				# Score the collected test vectors under the fitted mixture and sum the
				# result into a single log-likelihood value `ll`.
				# NOTE(review): with the legacy sklearn GMM, score() is expected to return
				# one log-probability per sample under the whole mixture (not one per
				# component) -- confirm against the installed sklearn version.
				scores = gmm_model.score(test_vectors)
				print scores
				ll = sum(scores)
				print "LL:   "+str(ll)

				# Print topics if desired
				if print_topics:
					# Log density of every training word vector under each fitted component;
					# the result is unpacked below as a 2-D array with one column per component.
					log_probs = log_multivariate_normal_density(word_vectors, gmm_model.means_, gmm_model.covars_, gmm_model.covariance_type)
					print np.min(log_probs)
					_, num_col = log_probs.shape
					for col in xrange(num_col):
						# Number of highest-scoring words kept for this component.
						top_n = 10
						# Column of log densities for this component. Assuming log_probs is a
						# 2-D ndarray, the slice is 1-D and .T leaves it unchanged.
						log_component_probs = (log_probs[:,col]).T
						# argsort is ascending, so reverse it and keep the first top_n indexes
						# (largest log density first).
						sorted_indexes = np.argsort(log_component_probs)[::-1][:top_n]
						# Pair each selected row index with its word via w2v_model.index2word.
						ordered_word_probs = [(w2v_model.index2word[idx], log_component_probs[idx]) for idx in sorted_indexes]
Beispiel #2
0
                        # Tail of a branch that starts above this excerpt (unknown_vec is
                        # assigned there): remember the vector in unknown_words and add it
                        # to the vectors that get scored below.
                        unknown_words[token] = unknown_vec
                        test_vectors.append(unknown_vec)

                # Train GMM: fit a diagonal-covariance mixture with num_topics components
                # on w2v_model.syn0, running num_gmm_iterations iterations (n_iter).
                print 'Starting GMM training'
                # NOTE(review): `words` is not referenced again in the visible code.
                words = w2v_model.vocab.keys()
                word_vectors = w2v_model.syn0
                gmm_model = GMM(n_components=num_topics,
                                n_iter=num_gmm_iterations,
                                covariance_type='diag')
                gmm_model.fit(word_vectors)
                # joblib.dump(gmm_model, gmm_output_file)
                print 'Done GMM training'

                # Score the collected test vectors under the fitted mixture and sum the
                # result into a single log-likelihood value `ll`.
                # NOTE(review): with the legacy sklearn GMM, score() is expected to return
                # one log-probability per sample under the whole mixture (not one per
                # component) -- confirm against the installed sklearn version.
                scores = gmm_model.score(test_vectors)
                print scores
                ll = sum(scores)
                print "LL:   " + str(ll)

                # Print topics if desired
                if print_topics:
                    # Log density of every training word vector under each fitted
                    # component; the result is unpacked below as a 2-D array with one
                    # column per component.
                    log_probs = log_multivariate_normal_density(
                        word_vectors, gmm_model.means_, gmm_model.covars_,
                        gmm_model.covariance_type)
                    print np.min(log_probs)
                    _, num_col = log_probs.shape
                    for col in xrange(num_col):
                        # Number of highest-scoring words kept for this component.
                        top_n = 10
                        # Column of log densities for this component. Assuming log_probs
                        # is a 2-D ndarray, the slice is 1-D and .T leaves it unchanged.
                        log_component_probs = (log_probs[:, col]).T
                        sorted_indexes = np.argsort(