Example #1
def test_lda_default_prior_params():
    # default prior parameter should be `1 / topics`
    # and verbose params should not affect result
    n_topics, X = _build_sparse_mtx()
    prior = 1. / n_topics
    lda_1 = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=prior,
                                      topic_word_prior=prior, random_state=0)
    lda_2 = LatentDirichletAllocation(n_topics=n_topics, random_state=0)

    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2)
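The test snippets here and below call a helper `_build_sparse_mtx()` that is not shown. In scikit-learn's test suite it builds a small block-diagonal document-term matrix with one block of words per topic; a sketch close to that helper, assuming three topics with three exclusive words each, looks like this:

import numpy as np
from scipy.linalg import block_diag
from scipy.sparse import csr_matrix

def _build_sparse_mtx():
    # Three topics, each with three words that occur only in that topic,
    # arranged as a block-diagonal document-term count matrix.
    n_topics = 3
    block = n_topics * np.ones((3, 3))
    X = csr_matrix(block_diag(*([block] * n_topics)))
    return n_topics, X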
Example #2
def test_lda_score():
    # Test LDA score for batch and online training
    # score should be higher after each iteration
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit_transform(X)
        score_1 = lda_1.score(X)

        lda_2.fit_transform(X)
        score_2 = lda_2.score(X)
        assert_greater_equal(score_2, score_1)
Example #3
def basic_lda(df, n_topics=200, max_df=0.5, min_df=5):
    '''
    Basic LDA model for album recommendations

    Args:
        df: dataframe with Pitchfork reviews
        n_topics: number of lda topics
        max_df: max_df in CountVectorizer
        min_df: min_df in CountVectorizer
    Returns:
        cv: sklearn fitted CountVectorizer
        cv_trans: sparse matrix with count-vectorized data
        lda: sklearn fitted LatentDirichletAllocation
        lda_trans: dense array with lda transformed data

    '''

    X = df['review']
    cv = CountVectorizer(stop_words='english',
                         min_df=min_df,
                         max_df=max_df)
    cv_trans = cv.fit_transform(X)

    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=7)
    lda_trans = lda.fit_transform(cv_trans)

    return cv, cv_trans, lda, lda_trans
Example #4
 def _get_model_LDA(self, corpus):
     #lda = models.LdaModel(corpus, id2word=self.corpus.dictionary, num_topics=5, alpha='auto', eval_every=50)
     lda = LatentDirichletAllocation(n_topics=self.num_of_clusters, max_iter=20,
                                     learning_method='online',
                                     learning_offset=50.,
                                     random_state=1)
     return lda.fit_transform(corpus)
Example #5
def produceLDATopics():
    '''
    Takes description of each game and uses sklearn's latent dirichlet allocation and count vectorizer
    to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i,j] > 1.0/float(n_topics):
    #             gameTopics.append(j)
    #     print gameNames[i], gameTopics
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print(topicsByGame)

    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

    return topicsByGame
Example #6
def score_lda(src, dst):
	##read sentence pairs to two lists
	b1 = []
	b2 = []
	lines = 0
	with open(src) as p:
		for i, line in enumerate(p):
			s = line.split('\t')
			b1.append(s[0])
			b2.append(s[1][:-1]) #remove \n
			lines = i + 1

	vectorizer = CountVectorizer()
	vectors=vectorizer.fit_transform(b1 + b2)

	lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
	X = lda.fit_transform(vectors)
	print(X.shape)
	b1_v = vectorizer.transform(b1)
	b2_v = vectorizer.transform(b2)
	b1_vecs = lda.transform(b1_v)
	b2_vecs = lda.transform(b2_v)

	res = [round(5*(1 - spatial.distance.cosine(b1_vecs[i], b2_vecs[i])),2) for i in range(lines)]
	with open(dst, 'w') as thefile:
		thefile.write("\n".join(str(i) for i in res))
Example #7
    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # Calculates the n most important topics of the bodies. Each topic contains all words, ordered by importance.
        # The more important topic words of a certain topic a body contains, the higher its value for this topic.
        lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation_cos: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation_cos: transform head")
        # use the lda trained on the body topics for the headlines => if the headlines and bodies share topics,
        # their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
Example #8
def latdirall(content):
    lda = LatentDirichletAllocation(n_topics=10)
    tf_vectorizer = TfidfVectorizer(max_df=0.99, min_df=1,
                                stop_words='english')
    tf = tf_vectorizer.fit_transform(content)
    lolz = lda.fit_transform(tf)
    tfidf_feature_names = tf_vectorizer.get_feature_names()
    return top_topics(lda, tfidf_feature_names, 10)
Example #9
def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())
Example #10
def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative and should be normalized
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())
    assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))
Example #11
def test_lda_perplexity():
    # Test LDA perplexity for batch and online training
    # perplexity should be lower after each iteration
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        distr_1 = lda_1.fit_transform(X)
        perp_1 = lda_1.perplexity(X, distr_1, sub_sampling=False)

        distr_2 = lda_2.fit_transform(X)
        perp_2 = lda_2.perplexity(X, distr_2, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)

        perp_1_subsampling = lda_1.perplexity(X, distr_1, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, distr_2, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example #12
def test_lda_fit_transform(method):
    # Test LDA fit_transform & transform
    # fit_transform and transform result should be the same
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(50, 20))
    lda = LatentDirichletAllocation(n_components=5, learning_method=method,
                                    random_state=rng)
    X_fit = lda.fit_transform(X)
    X_trans = lda.transform(X)
    assert_array_almost_equal(X_fit, X_trans, 4)
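Note that Example #12 uses `n_components` while most of the other snippets use `n_topics`: the `n_topics` parameter of LatentDirichletAllocation was deprecated in scikit-learn 0.19 in favor of `n_components` and removed in 0.21, so on a current release the older snippets need the rename:

# Equivalent constructor call on a current scikit-learn release.
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)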
Example #13
def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                    random_state=0)
    distr = lda.fit_transform(X)
    perplexity_1 = lda.perplexity(X, distr, sub_sampling=False)

    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
Example #14
def test_doc_topic_distr_deprecation():
    # Test that the appropriate warning message is displayed when a user
    # attempts to pass the doc_topic_distr argument to the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch',
                                    total_samples=100, random_state=0)
    distr1 = lda.fit_transform(X)
    distr2 = None
    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
    assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
Example #15
def test_perplexity_input_format():
    # Test LDA perplexity for sparse and dense input
    # score should be the same for both dense and sparse input
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method='batch',
                                    total_samples=100, random_state=0)
    distr = lda.fit_transform(X)
    perp_1 = lda.perplexity(X)
    perp_2 = lda.perplexity(X, distr)
    perp_3 = lda.perplexity(X.toarray(), distr)
    assert_almost_equal(perp_1, perp_2)
    assert_almost_equal(perp_1, perp_3)
Example #16
def fit_lda(tf, vectorizer):
    n_topics = 20
    n_top_words = 20
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0)
    tf_lda = lda.fit_transform(tf)

    f_print = True
    if f_print:
        tf_feature_names = vectorizer.get_feature_names()
        print_top_words(lda, tf_feature_names, n_top_words)

    return [tf_lda, lda]
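Example #16 calls a `print_top_words` helper that is not included in the snippet. The scikit-learn topic-extraction example defines it roughly as follows, which matches the signature assumed above:

def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted terms of every topic.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[i]
                       for i in topic.argsort()[:-n_top_words - 1:-1]))
    print()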
Example #17
def latdirall(content):
    lda = LatentDirichletAllocation(n_topics=5)
    tf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
    tf = tf_vectorizer.fit_transform(content)
    lolz = lda.fit_transform(tf)
    tfidf_feature_names = tf_vectorizer.get_feature_names()
    tops = top_topics(lda, tfidf_feature_names, 10)
    wordlist = []
    for topic in tops:
        wordlist += topic
    return wordlist
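Examples #8 and #17 both rely on a `top_topics` helper that is not part of scikit-learn and is not shown here. From the way its result is consumed (a list of word lists, one per topic), a plausible, purely hypothetical stand-in is:

def top_topics(model, feature_names, n_top_words):
    # One list of the n_top_words highest-weighted feature names per topic.
    return [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
            for topic in model.components_]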
Example #18
 def infer_topics(self, num_topics=10, algorithm='variational', **kwargs):
     self.nb_topics = num_topics
     lda_model = None
     topic_document = None
     if algorithm == 'variational':
         lda_model = LDA(n_topics=num_topics, learning_method='batch')
         topic_document = lda_model.fit_transform(self.corpus.sklearn_vector_space)
     elif algorithm == 'gibbs':
         lda_model = lda.LDA(n_topics=num_topics, n_iter=500)
         topic_document = lda_model.fit_transform(self.corpus.sklearn_vector_space)
     else:
         raise ValueError("algorithm must be either 'variational' or 'gibbs', got '%s'" % algorithm)
     self.topic_word_matrix = []
     self.document_topic_matrix = []
     vocabulary_size = len(self.corpus.vocabulary)
     row = []
     col = []
     data = []
     for topic_idx, topic in enumerate(lda_model.components_):
         for i in range(vocabulary_size):
             row.append(topic_idx)
             col.append(i)
             data.append(topic[i])
     self.topic_word_matrix = coo_matrix((data, (row, col)),
                                         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     row = []
     col = []
     data = []
     doc_count = 0
     for doc in topic_document:
         topic_count = 0
         for topic_weight in doc:
             row.append(doc_count)
             col.append(topic_count)
             data.append(topic_weight)
             topic_count += 1
         doc_count += 1
     self.document_topic_matrix = coo_matrix((data, (row, col)),
                                             shape=(self.corpus.size, self.nb_topics)).tocsr()
Example #19
def text_transformation(initial_data, y, categorical_feature=['project_description']):
    tf = CountVectorizer(token_pattern='[a-zA-Z]{3,}',max_df=0.95, min_df=0.002,
                         max_features=2000, stop_words='english')
    serie = initial_data[categorical_feature[0]]
    articles_words = tf.fit_transform(serie.to_dict().values(), y)
    word_index = tf.get_feature_names()
    K = 20
    lda = LatentDirichletAllocation(n_topics=K, max_iter=10, learning_method='online', learning_offset=10.,
                                    random_state=0, n_jobs=-1)
    t0 = time()
    new_feature = lda.fit_transform(articles_words)
    print("done in %0.3fs." % (time() - t0))
    new_feature = pd.DataFrame(new_feature)
    return re_assemble_dataset(initial_data, new_feature, categorical_feature), lda
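Example #19 ends by calling `re_assemble_dataset`, which is not shown. A hypothetical sketch, assuming it simply drops the raw text column and joins the LDA topic features back onto the original dataframe, could look like this:

import pandas as pd

def re_assemble_dataset(initial_data, new_feature, categorical_feature):
    # Drop the raw text column(s) and append the LDA topic columns.
    base = initial_data.drop(columns=categorical_feature).reset_index(drop=True)
    new_feature.columns = ['topic_%d' % i for i in range(new_feature.shape[1])]
    return pd.concat([base, new_feature.reset_index(drop=True)], axis=1)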
Example #20
def lda_topics_modeling(n_topics: int, id2text, corpus, id2word, n_top_features=10, dump_to_db=True):
    """
    Возвращает вероятностное распределение документов по темам

    :param n_topics: Число возможных тем (топиков)
    :param id2text: Список имен (индексов) текстов
    :param corpus: текстов
    :param id2word: Список слов
    :param n_top_features: Число выводимых слов, характеризующих кластер
    :param dump_to_db: True - записывает топики в базу данных, False - не записывает топики в базуданных
    :return: кортеж с распределений слов по темам и документов по темам
    """

    t0 = time()
    lda = LatentDirichletAllocation(n_topics=n_topics)
    logging.info('LDA created in {:.3} sec'.format(time() - t0))

    t0 = time()
    doc_topic_dist = lda.fit_transform(corpus)
    logging.info('LDA model fit-transformed in {:.3} sec'.format(time() - t0))

    # Dump the resulting topics to the database
    if dump_to_db:
        lda_topics = connect_to_db()['lda_clusters']
        lda_topics.drop()
        for topic_idx, topic_dist in enumerate(lda.components_):
            doc = {'_id': int(topic_idx),
                   'terms': [id2word[i] for i in np.argsort(topic_dist)[:-n_top_features - 1:-1]]}
            lda_topics.insert(doc)
            logging.info('Topic {} dumped to database'.format(topic_idx))

    # Normalize the weights (to obtain probabilities)
    topic_word_dist = np.apply_along_axis(_normalize_weights, 1, lda.components_)
    doc_topic_dist = np.apply_along_axis(_normalize_weights, 1, doc_topic_dist)

    topic_word_dist = pd.DataFrame(topic_word_dist, index=id2word)
    doc_topic_dist = pd.DataFrame(doc_topic_dist, columns=id2text)

    return topic_word_dist, doc_topic_dist, lda
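The `_normalize_weights` helper applied row-wise in Example #20 is not shown. Since it is used to turn weights into probabilities, a minimal sketch under that assumption is:

import numpy as np

def _normalize_weights(weights):
    # Rescale a non-negative weight vector so it sums to one.
    weights = np.asarray(weights, dtype=float)
    return weights / weights.sum()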
Example #21
    def __init__(self, path, corpusName, query=None):
        self.query = query
        documents = (line.lower().split() for line in codecs.open(
            corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
        self.corpus = [' '.join(i) for i in documents]
        if self.query is not None:
            self.corpus.append(' '.join(query.getTokens()))

        # Make models
        t0 = time()
        print "Creating SciKit TF-IDF Model"
        self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LSA Model"
        t0 = time()
        lsa = TruncatedSVD(n_components=300)
        self.lsaModel = lsa.fit_transform(self.tfidfModel)
        self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LDA Model"
        # Use tf (raw term count) features for LDA.
        print("Extracting tf features for LDA")
        tf_vectorizer = CountVectorizer(max_features=2000)
        t0 = time()
        tf = tf_vectorizer.fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))
        print("Fitting LDA model")
        lda = LatentDirichletAllocation(n_topics=300, max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        t0 = time()
        self.ldaModel = lda.fit_transform(tf)
        self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
        print("Done in %0.3fs." % (time() - t0))
Example #22
def lda_viz(docs, lengths, n_features, n_topics, n_top_words):
    n_samples = len(docs)

    norm = lambda data: pandas.DataFrame(data).div(data.sum(1),axis=0).values
    
    vect = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                    stop_words='english')
    vected = vect.fit_transform(docs)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    doc_topic_dists = norm(lda.fit_transform(vected))
    
    prepared = pyLDAvis.prepare(
                        doc_lengths = lengths,
                        vocab = vect.get_feature_names(),
                        term_frequency = vected.sum(axis=0).tolist()[0],
                        topic_term_dists = norm(lda.components_),
                        doc_topic_dists = doc_topic_dists,
                        )

    #print(doc_topic_dists)
    #print(n_samples)
    return prepared, doc_topic_dists
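The object returned by pyLDAvis.prepare in Example #22 can be written out as a standalone HTML page, as Example #33 below also does. A usage sketch, where `docs` and `lengths` stand for whatever the caller of lda_viz supplies:

# `docs` and `lengths` are the raw documents and their lengths (caller-provided).
prepared, doc_topic_dists = lda_viz(docs, lengths, n_features=1000,
                                    n_topics=10, n_top_words=15)
pyLDAvis.save_html(prepared, 'lda_vis.html')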
Example #23
def extended_lda(df, n_topics=200):
    '''
    Trains an extended LDA model with custom text preprocessor
    and custom tokenizer

    Args:
        df: dataframe with Pitchfork reviews
        n_topics: number of topics in the LDA model
    Returns:
        tfidf: sklearn fitted TfidfVectorizer
        tfidf_trans: sparse matrix with tfidf transformed data
        lda: sklearn fitted LatentDirichletAllocation
        lda_trans: dense array with lda transformed data
    '''

    print('Starting TfIdf')
    # for LDA, use raw counts; that is, tfidf with appropriate parameters
    tfidf, tfidf_trans = extended_tfidf(df, use_idf=False, norm=None)

    print('Starting LDA')
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5)
    lda_trans = lda.fit_transform(tfidf_trans)

    return tfidf, tfidf_trans, lda, lda_trans
Example #24
class Featurizer():
	def __init__(self, plot_vectorizer = 'count', tokenizer = None, lda = False, use_genre_vecs = False):
		t = None
		if tokenizer == 'named_entity':
			t = NETokenizer()
		elif tokenizer == 'lemma':
			t = LemmaTokenizer()
		self.use_genre_vecs = use_genre_vecs
		self.binary = plot_vectorizer == 'binary'
		if plot_vectorizer == 'tfidf':
			self.vectorizer = TfidfVectorizer(analyzer = "word",   \
				tokenizer = t,    \
				preprocessor = None, \
				stop_words = 'english')
		elif plot_vectorizer == 'binary':
			self.vectorizer = CountVectorizer(analyzer = "word",	\
				tokenizer = t,	\
				preprocessor = None, \
				stop_words = 'english', \
				binary = True)
		else:
			self.vectorizer = CountVectorizer(analyzer = "word",   \
				tokenizer = t,    \
				preprocessor = None, \
				stop_words = 'english')
		if lda:
			self.lda = LatentDirichletAllocation(n_topics=20, max_iter=2,	\
				learning_method='online', learning_offset=10.,	\
				random_state=0)
		else:
			self.lda = None

	def find_movie(self, title, year = None):
		""" Finds a movie with the given name substring. """
		return [movie for movie in self.movies.keys() if title in movie[0] and (year is None or year == movie[1])]

	def load(self, path):
		""" Loads the data into memory. """
		with io.open(path, 'r', encoding = 'latin-1') as f:
			movies = json.load(f)
			od = OrderedDict({(movie['title'],movie['year']):{'plot':movie['plot'],'cast':set(movie['cast']), \
				'genres':set(movie['genres'])} \
				for movie in movies}.items())
			return od

	def train(self, movies):
		""" Trains the featurizer. """
		movie_keys = list(movies.keys())
		self.movies = dict(zip(movie_keys, range(0, len(movie_keys))))
		self.movie_indices = dict([reversed(i) for i in self.movies.items()])
		plots = [movie['plot'] for movie in movies.values()]
		self.plots = self.vectorizer.fit_transform(plots)
		self.casts = [movie['cast'] for movie in movies.values()]
		self.genres = [movie['genres'] for movie in movies.values()]
		if self.lda is not None:
			self.plot_topics = self.lda.fit_transform(self.plots)
		else:
			self.plot_topics = None

		if self.use_genre_vecs:
			genre_lis = set([])
			for g in self.genres:
				genre_lis.update(g)
			self.genre_lis = dict(zip(genre_lis, range(0, len(genre_lis))))
			self.genre_indices = dict([reversed(i) for i in self.genre_lis.items()])
			genre_plots = np.zeros((len(genre_lis),self.plots.shape[1]))
			for i in range(len(self.genres)):
				gl = self.genres[i]
				for g in gl:
					genre_plots[self.genre_lis[g],:] += self.plots[i,:]
			if self.binary:
				genre_plots = np.minimum(np.ones((len(genre_lis),self.plots.shape[1])),genre_plots)
			self.genre_plots = cosine_simil(self.plots, genre_plots)

	def load_train(self, path):
		""" Loads the data into memory and trains the featurizer. """
		self.train(self.load(path))


	def plot_features(self, base_movie, plots, plot_topics = None):
		""" Returns a feature matrix derived from the plots.
		The # of rows returned matches the length of the parameter plots.
		"""
		if self.use_genre_vecs:
			plot = self.genre_plots[self.movies[base_movie]]
			pv = cosine_simil(plots, plot)
			return pv
		else:
			plot = self.plots[self.movies[base_movie]]
			pv = cosine_simil(plots, plot)
			return pv

	def cast_features(self, base_movie, casts):
		""" Returns a feature matrix derived from the casts.
		The # of rows returned matches the length of the parameter casts.
		"""
		cv = np.array([jaccard(cast_set, self.casts[self.movies[base_movie]]) for cast_set in casts])
		return cv.reshape((cv.shape[0],1)) # Reshape into column vector

	def genre_features(self, base_movie, genres):
		""" Returns a feature matrix derived from the genres.
		The # of rows returned matches the length of the parameter genres.
		"""
		gv = np.array([jaccard(genre_set, self.genres[self.movies[base_movie]]) for genre_set in genres])
		return gv.reshape((gv.shape[0],1)) # Reshape into column vector

	def single_features(self, base_movie, trial_movie):
		""" Returns a feature matrix for a single movie. """
		ind = self.movies[trial_movie]
		return self.features(base_movie, movies = ((self.genre_plots[ind] if self.use_genre_vecs else self.plots[ind], self.plot_topics[ind] if self.lda is not None else None), [self.casts[ind]], [self.genres[ind]]))

	def features(self, base_movie, movies = None):
		""" Returns the feature set for the given movies,
		when compared to the base movie. When movies is None,
		uses the whole list of movies.

		Parameter movies must be a 3-tuple, representing the plots,
		casts and genres. The # of rows of each should match.

		Returns an AxB matrix where A is the # of rows for plots
		and B is the total number of features.
		"""
		plots = (self.genre_plots if self.use_genre_vecs else self.plots) if movies is None else movies[0][0]
		plot_topics = self.plot_topics if movies is None else movies[0][1]
		casts = self.casts if movies is None else movies[1]
		genres = self.genres if movies is None else movies[2]
		pv = self.plot_features(base_movie, plots, plot_topics)
		cv = self.cast_features(base_movie, casts)
		gv = self.genre_features(base_movie, genres)
		return hstack((pv,cv,gv)) if issparse(pv) else np.hstack((pv,cv,gv))

	def similar_movies(self, weights, base_movie, movies = None, n = 6):
		""" Gets the n similar movies to a base movie. """
		fv = self.features(base_movie, movies = movies)
		wv = weights.reshape((weights.shape[1],1))
		scores = fv.dot(wv)
		inds = np.argpartition(scores,-n, axis = 0)[-n:].reshape(n)
		return [self.movie_indices[i]for i in inds]
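The Featurizer in Example #24 assumes two similarity helpers, `cosine_simil` and `jaccard`, that are not defined in the snippet. These are plausible stand-ins rather than the original code:

import numpy as np
from scipy.sparse import issparse
from sklearn.metrics.pairwise import cosine_similarity

def cosine_simil(rows, target):
    # Row-wise cosine similarity between every row of `rows` and `target`.
    if not issparse(target):
        target = np.atleast_2d(target)
    return cosine_similarity(rows, target)

def jaccard(a, b):
    # Jaccard similarity of two sets; defined as 0 when both are empty.
    union = a | b
    return len(a & b) / len(union) if union else 0.0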
Example #25
class LDA():

    def __init__(self, n_topics=10, n_features=5000, max_df=.75, min_df=2, max_iter=5, alpha=None, eta=None):
        '''
        '''

        self.n_topics = n_topics
        self.n_features = n_features
        self.max_df = max_df
        self.min_df = min_df
        self.max_iter = max_iter
        self.lda = None
        self.tf = None
        self.topics = None
        self.alpha = alpha
        self.eta = eta

    def vectorizecounts(self, docs):
        '''
        '''

        # Use tf (raw term count) features for LDA.
        print("Extracting tf features for LDA...")
        self.tf_vectorizer = CountVectorizer(max_df=self.max_df, min_df=self.min_df, max_features=self.n_features)
        t0 = time()
        self.tf = self.tf_vectorizer.fit_transform(docs)
        self.n_samples = len(docs)
        print("done in %0.3fs." % (time() - t0))


    def fitLDA(self):
        '''
        '''
        print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
              % (self.n_samples, self.n_features))
        self.lda = LatentDirichletAllocation(doc_topic_prior=self.alpha, topic_word_prior=self.eta, n_topics=self.n_topics, max_iter=self.max_iter,
                                        learning_method='online', learning_offset=10.,
                                        random_state=0, n_jobs=6)
        t0 = time()
        self.topics = self.lda.fit(self.tf)
        print("done in %0.3fs." % (time() - t0))

    def print_top_words(self, n_top_words):
        '''
        '''

        tf_feature_names = self.tf_vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(self.lda.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([tf_feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

    def get_topic_content(self, topic):
        '''
        Parameters
        --------------
        topic: int 
            Topic index


        Returns
        -----------
        feature_names : list
            Array of words corresponding to the given feature. 

        topic_content : np.array(n_features)
            Topic vector over the feature space
        '''

        return self.tf_vectorizer.get_feature_names(), self.lda.components_[topic]

    def get_doc_topics(self, docs):

        # Convert the documents into feature space with the already-fitted vectorizer.
        feature_vec = self.tf_vectorizer.transform(docs)
        return self.lda.transform(feature_vec)
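A minimal usage sketch of the LDA wrapper in Example #25. The documents are made up, and the snippet assumes the older scikit-learn API in which LatentDirichletAllocation still accepts `n_topics`:

docs = ["the cat sat on the mat",
        "dogs and cats make good pets",
        "stock markets fell sharply today"]

model = LDA(n_topics=2, n_features=100, max_df=1.0, min_df=1, max_iter=5)
model.vectorizecounts(docs)   # builds model.tf_vectorizer and model.tf
model.fitLDA()                # fits the LatentDirichletAllocation estimator
model.print_top_words(n_top_words=5)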
Example #26
def preprocess_data(data, method):
    print "[EDEN I/O -- preprocess_data] Preprocessing data..."

    def format_entities(norm_ent):
        ents = []
        for ent in norm_ent:
            try:
                ents.append(porter.stem(ent['surface-form'].lower()))
            except:
                continue
        return " ".join(ents)

    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    def nlp_prepro(doc, porter, stop_words):
        return " ".join([porter.stem(i.lower()) for i in wordpunct_tokenize(doc) if i.lower() not in stop_words])

    d = [{"id": doc["_id"],
          "first-published": doc["_source"]["first-published"],
          "title": doc["_source"]["title"],
          "summary": doc["_source"]["title"],
          "content": doc["_source"]["content"],
          "entities": format_entities(doc["_source"]["normalised-entities"]),
          "content_prepro": nlp_prepro(doc["_source"]["content"], porter, stop_words)} for doc in data]

    df_story = pd.DataFrame(d)
    df_story['first-published-epoch'] = df_story['first-published'].apply(
        lambda x: int(datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").strftime("%s")))
    df_story = df_story.sort_values(by='first-published-epoch')
    df_story = df_story.reset_index()

    if method == 'ltc':
        vect = TfidfVectorizer(sublinear_tf=True, use_idf=True, norm='l2')
        vsm = vect.fit_transform(df_story['content_prepro'].values)
        vsm_arr = vsm.toarray()
        print "[EDEN I/O -- preprocess_data] VSM shape: ", vsm.shape
        print "[EDEN I/O -- preprocess_data] VSM type: ", type(vsm)
        df_story['vsm'] = [r for r in vsm_arr]

    elif method == 'ltc_ent':
        vect = TfidfVectorizer(use_idf=True, norm='l2', sublinear_tf=True)
        vsm = vect.fit_transform(df_story['entities'].values)
        vsm_arr = vsm.toarray()
        print "[EDEN I/O -- preprocess_data] VSM shape: ", vsm.shape
        print "[EDEN I/O -- preprocess_data] VSM type: ", type(vsm)
        df_story['vsm'] = [r for r in vsm_arr]

    elif method == 'word2vec':
        with open('../datasets/word2vec_signal/word2vec_signal.p', 'rb') as fin:
            word2vec_signal = pickle.load(fin)
        vecs = [word2vec_signal[id_] for id_ in df_story['id']]
        df_story['vsm'] = vecs

    elif method == 'LatentDirichlet':
        vect = CountVectorizer(max_df=0.90, min_df=2).fit_transform(
            df_story['content_prepro'].values)
        lda = LatentDirichletAllocation(n_topics=10, max_iter=5,
                                        learning_method='online', learning_offset=50.,
                                        random_state=0)
        vsm_arr = lda.fit_transform(vect, None)
        df_story['vsm'] = [r for r in vsm_arr]

    return df_story
Example #27
	  % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)

print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)
'''
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
	  % (n_samples, n_features))

lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,learning_method='batch')
t0 = time()
result = lda.fit_transform(tf)
print("done in %0.3fs." % (time() - t0))


print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)



#number of clusters
clusters = 2


km = KMeans(n_clusters=clusters, init='k-means++', max_iter=100, n_init=10,verbose=opts.verbose)
Example #28
    token_pattern='[A-Za-z]{3,}|[A-Z]{2,}',
    stop_words='english')

tf_matrix = tf_vectorizer.fit_transform(df['ProcessedContents'])

print "lda model done now..."
lda = LatentDirichletAllocation(n_topics=20,
                                max_iter=15,
                                doc_topic_prior=0.4,
                                topic_word_prior=0.4,
                                learning_method='online',
                                learning_offset=50.,
                                verbose=1,
                                random_state=1)

lda_result = lda.fit_transform(tf_matrix)

print "dbscan model done now..."
dbscan_model = DBSCAN(eps=0.1, min_samples=3)
dbscan_model.fit(lda_result)
df['dbscan_labels'] = dbscan_model.labels_

max_cluster = max(df['dbscan_labels'])

print "There are total of " + str(max_cluster + 1) + " clusters..."

for cluster_id in range(0, max_cluster + 1):
    #    cluster_id = 1
    print "Cluster ID: " + str(cluster_id) + "..."
    subset_df = df[df['dbscan_labels'] == cluster_id]
    subset_df = subset_df.reset_index()
Example #29
df = pd.read_csv("movie_data.csv", encoding="utf-8")

# Use CountVectorizer to create the bag-of-words matrix as input to the LDA.
count = CountVectorizer(stop_words="english",
                        max_df=0.1,
                        max_features=5000)
X = count.fit_transform(df["review"].values)

lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method="batch",
                                n_jobs=-1)
# Let the lda estimator do its estimation based on all the available training
# data (bag-of-words matrix) in one iteration.

X_topics = lda.fit_transform(X)  # fit to the data, then transform it.

components = lda.components_
print(components[0, 0: 10])

n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic {:.0f}:".format(topic_idx + 1))
    print(" ".join([feature_names[i] for i in
                    topic.argsort()[-1: -n_top_words - 1: -1]]))
    print("\n")

music = X_topics[:, 7].argsort()[::-1]
for iter_idx, movie_idx in enumerate(music[:3]):
    print("\nMusic Movie {:.0f}:".format(iter_idx + 1))
Example #30
def main(filename):
    global data_vectorized
    global lda_output
    global plot_df
    df = pd.read_csv(filename)  # CHANGE THIS
    df = df.sample(frac=0.2, replace=False, random_state=1)
    N_NGRAM_RANGE = 2  # CHANGE HERE
    my_additional_stop_words = pd.read_csv(
        r'C:\Users\noel.alexander\Documents\Fullstack\Topic Modelling\Stopwords\custom_stopwords.csv'
    ).values.flatten().tolist()  #CHANGE THIS
    stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)
    data = df.content.values.tolist()

    # Remove Emails
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

    # Remove new line characters
    data = [re.sub(r'\s+', ' ', sent) for sent in data]

    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]
    data_words = list(sent_to_words(data))
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Do lemmatization keeping only Noun, Adj, Verb, Adverb
    data_lemmatized = lemmatization(
        n=nlp,
        texts=data_words,
        allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    vectorizer = CountVectorizer(
        analyzer='word',
        min_df=0.05,  # ignore terms that appear in less than 5% of the documents
        stop_words=stop_words,  # remove stop words
        lowercase=True,  # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
        ngram_range=(1, N_NGRAM_RANGE))

    data_vectorized = vectorizer.fit_transform(data_lemmatized)
    space = {
        'n_topics': hp.quniform("n_topics", 6, 10,
                                1),  # search n_topics from 6-10
        'learning_decay':
        hp.uniform('learning_decay', 0.5,
                   0.9),  # search learning_decay from 0.5-0.9
    }

    trials = Trials()

    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=5,
                trials=trials)

    LEARNING_DECAY = best['learning_decay']  #0.84529 #best['learning_decay']
    N_TOPICS = best['n_topics']  #9 #best['n_topics']
    print('starting lda')
    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=int(N_TOPICS),  # number of topics
        learning_decay=LEARNING_DECAY,  # controls the learning rate in the online method
        max_iter=10,  # max learning iterations
        learning_method='online',  # use mini-batch of training data
        batch_size=128,  # n docs in each learning iter
        n_jobs=-1,  # use all available CPUs
    )

    lda_output = lda_model.fit_transform(data_vectorized)
    lda_output = lda_model.transform(data_vectorized)

    # column names
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

    # index names
    docnames = ["Doc" + str(i) for i in range(len(data))]

    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                     columns=topicnames,
                                     index=docnames)

    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic
    # Apply Style
    df_document_topics = df_document_topic.head(15).style.applymap(
        color_green).applymap(make_bold)
    df_topic_distribution = df_document_topic['dominant_topic'].value_counts(
    ).reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    df_topic_distribution['Percent of Total'] = round(
        df_topic_distribution['Num Documents'] /
        np.sum(df_topic_distribution['Num Documents'].values), 2)
    topic_keywords = show_topics(vectorizer=vectorizer,
                                 lda_model=lda_model,
                                 n_words=15)

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = [
        'Word ' + str(i) for i in range(df_topic_keywords.shape[1])
    ]
    df_topic_keywords.index = [
        'Topic ' + str(i) for i in range(df_topic_keywords.shape[0])
    ]
    #pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(lda_model,
                                     data_vectorized,
                                     vectorizer,
                                     mds='tsne')
    """
    topics_dic ={}
    for i in range(int(N_TOPICS)):
        topics_dic[i] = 'topic ' + str(i)
    plot_df = pd.DataFrame({'topics':labels})
    plot_df['topics'] = plot_df['topics'].map(topics_dic)
    labels = []
    for doc in lda_output:
        labels.append(np.argmax(doc))
    labels = np.array(labels)

    embedding = umap.UMAP(n_neighbors=100, min_dist=0.9).fit_transform(lda_output)

    plot_df['axis_1'] = embedding[:, 0]
    plot_df['axis_2'] = embedding[:, 1]
    """
    html = pyLDAvis.prepared_data_to_html(panel)
    Html_file = open("html_output", "w")
    Html_file.write(html)
    Html_file.close()
    return html
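Example #30 passes an `objective` function to hyperopt's fmin that is not shown. Given the search space, a hypothetical sketch that fits an LDA with the sampled parameters and reports its perplexity as the loss could be:

from hyperopt import STATUS_OK

def objective(params):
    # Fit an LDA with the sampled hyperparameters on the global
    # data_vectorized matrix and report perplexity (lower is better).
    model = LatentDirichletAllocation(
        n_components=int(params['n_topics']),
        learning_decay=params['learning_decay'],
        learning_method='online',
        max_iter=5,
        n_jobs=-1)
    model.fit(data_vectorized)
    return {'loss': model.perplexity(data_vectorized), 'status': STATUS_OK}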
Example #31
#print("\n\nmyinput : ", myinput)

vectorizer = CountVectorizer(min_df=1,
                             max_df=0.95,
                             stop_words='english',
                             lowercase=True,
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}',
                             ngram_range=(1, 3))
data_vectorized = vectorizer.fit_transform(myinput)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=4,
                                      max_iter=50,
                                      learning_method='online',
                                      random_state=0)
lda_Z = lda_model.fit_transform(data_vectorized, num_titles)

print("\n\nNO_DOCUMENTS, NO_TOPICS (n_components) : ",
      lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Let's see how the first document in the corpus looks like in different topic spaces
print("\n\nlda_z : ", lda_Z[0])

model = (vectorizer, data_vectorized, lda_model.components_,
         lda_model.exp_dirichlet_component_, lda_model.doc_topic_prior_)

print("Start pickling LDA Model")
import pickle
pickle.dump(model, open("LDAModel_Pickle.p", "wb"))
print("Done pickling LDA Model")
Example #32
from infant_pipe import pdf_extract, process
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import mglearn
import numpy as np
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
circular_content = process(pdf_extract('001.pdf'))
dtm = vect.fit_transform([circular_content])

#print(pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names()))
print(type(circular_content))
lda = LatentDirichletAllocation(n_components=5)

lda_dtf = lda.fit_transform(dtm)

sorting = np.argsort(lda.components_)[:, ::-1]
features = np.array(vect.get_feature_names())
#print(mglearn.tools.print_topics(topics = range(5), feature_names=features, sorting=sorting, topics_per_chunk=5, n_words=10))
#str_circular_content = str(circular_content)
Example #33
# documentId, similarity = similarities[0]
# print(data[documentId][0:1000])

vectorizer = CountVectorizer(min_df=5,
                             max_df=0.9,
                             stop_words='english',
                             lowercase=True,
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
dataVectorized = vectorizer.fit_transform(data)

# Build LDA Model : Sklearn

ldaModel1 = LatentDirichletAllocation(n_components=numOfTopics,
                                      max_iter=10,
                                      learning_method='online')
ldaZ = ldaModel1.fit_transform(dataVectorized)

# x = ldaModel1.transform(vectorizer.transform([text]))[0]
# print(x, x.sum())

# Visualize LDA Sklearn Results

panel = pyLDAvis.sklearn.prepare(ldaModel1,
                                 dataVectorized,
                                 vectorizer,
                                 mds="tsne")

pyLDAvis.save_html(
    panel,
    "C:/xampp/htdocs/SpeechArt/LDA_visualizations/" + doc_name + ".html")
Example #34
sentences = pd.DataFrame(sentenses, columns=['Sentences'])

countVectorizer = CountVectorizer(strip_accents='unicode',
                                  analyzer='word',
                                  token_pattern=r'\w{1,}',
                                  stop_words='english',
                                  ngram_range=(1, 1))

vectorizedText = countVectorizer.fit_transform(sentences['Sentences'])

ldaModel = LatentDirichletAllocation(n_components=10,
                                     learning_method='online',
                                     random_state=0,
                                     verbose=0)
lda_topics = ldaModel.fit_transform(vectorizedText)

from collections import Counter

lda_keys = lda_topics.argmax(axis=1)
lda_categories, lda_counts = zip(*Counter(lda_keys).items())

tsne_Model = TSNE(n_components=2,
                  perplexity=50,
                  learning_rate=100,
                  n_iter=2000,
                  verbose=1,
                  random_state=0,
                  angle=0.75)
tsne_vector = tsne_Model.fit_transform(lda_topics)
Example #35
term_idf_scores = []
for i in range(len(terms)):
    term_idf_scores.append([terms[i], term_idf_sums[0,i]]) 
print("The Term/Frequency matrix has", tf.shape[0], " rows, and", tf.shape[1], " columns.") 
print("The Term list has", len(terms), " terms.") 
term_idf_scores.sort(key=sortSecond, reverse=True) 
print("\nTerms with Highest TF-IDF Scores:") 
for i in range(10):
    j = i
    print('{:<15s}{:>8.2f}'.format(term_idf_scores[j][0],  term_idf_scores[j][1]))
    
uv = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,\
                               learning_method=learning_method, \
                               learning_offset=learning_offset, \
                                random_state=12345) 
U = uv.fit_transform(tf) 

# Display the topic selections 
print("\n********** GENERATED TOPICS **********") 
TextAnalytics.display_topics(uv.components_, terms, n_terms=15, mask=None)
# Store topic selection for each doc in topics[] 
topics = [0] * n_reviews 
for i in range(n_reviews):
    max = abs(U[i][0])
    topics[i] = 0
    for j in range(n_topics):
        x = abs(U[i][j])
        if x > max:
            max = x
            topics[i] = j
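
The explicit dominant-topic loop at the end of Example #35 can be collapsed into a single vectorized call; a sketch, assuming U is the document-topic matrix returned by fit_transform above:

import numpy as np

# For each document (row of U), pick the column with the largest absolute weight.
topics = np.argmax(np.abs(U), axis=1).tolist()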
            
Example #36
    for indx, freq in line:
        rows.append(i)
        cols.append(indx)
        data.append(freq)
dtm = csr_matrix((data, (rows, cols)), shape=(Nrow, Ncol), dtype=int)

# Materialize the sparse data
# data_dense = dtm.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
# print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")
n_topics = list(range(
    50, 150, 10))  # + list(range(50, 200, 50)) + list(range(200, 500, 100))
for NTopic in n_topics:
    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=NTopic,  # Number of topics
        max_iter=10,  # Max learning iterations
        learning_method='online',
        batch_size=500000,  # n docs in each learning iter
        evaluate_every=-1,  # compute perplexity every n iters, default: Don't
        n_jobs=-1,  # Use all available CPUs
        verbose=1)
    lda_output = lda_model.fit_transform(dtm)
    from joblib import dump, load
    model_fname = './sklearnlda/lda_n_' + str(NTopic) + '.joblib'
    dump(lda_model, model_fname)
    X_fname = './sklearnlda/transformedX_n_' + str(NTopic) + '.joblib'
    dump(lda_output, X_fname)
    print(lda_model)  # Model attributes
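The models and transformed matrices dumped in Example #36 can be reloaded later with joblib; a usage sketch reusing the same file paths:

from joblib import load

lda_50 = load('./sklearnlda/lda_n_50.joblib')
doc_topics_50 = load('./sklearnlda/transformedX_n_50.joblib')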
Example #37
# instantiate the vectorizer
vect_LDA = CountVectorizer(stop_words = stop_words, strip_accents = 'ascii')
# vectorize the documents - get dtm
data_vect = vect_LDA.fit_transform(documents)
data_vect.shape
# create a dataframe
corpus_df_LDA = pd.DataFrame(data_vect.toarray(), columns = vect_LDA.get_feature_names())
corpus_df_LDA.shape
# filter digits from column names
corpus_df_LDA = corpus_df_LDA[corpus_df_LDA.columns.drop(list(corpus_df_LDA.filter(regex = r'(\d+)')))]
corpus_df_LDA

# LDA MODEL n_components = number of topics
lda_model = LatentDirichletAllocation(n_components = NUM_TOPICS, max_iter = 10, learning_method = 'online')
# fit the model to the vectorized data (dtm)
lda_Z = lda_model.fit_transform(data_vect)
print(lda_Z.shape)  # 218 docs, 5 topics
# fit the model to the dataframe
lda_Z_DF = lda_model.fit_transform(corpus_df_LDA)


# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components = NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(corpus_df_LDA)
print(nmf_Z.shape)# 218 docs, 5 topics
 
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components = NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(corpus_df_LDA)
print(lsi_Z.shape)  # 40 docs, 10 topics
 
Example #38
def run_kmeans(n, save=False, filename=""):
    tf_data_features = count_vectorizer.fit_transform([review[1] for review in reviews])
    #tf_data_features_array = tf_data_features.toarray()
    tf_vocab = count_vectorizer.get_feature_names() #to check that has same vocab
    
    
    from sklearn.decomposition import LatentDirichletAllocation
    lda = LatentDirichletAllocation(n_topics=17, max_iter=2,
                                    learning_method='online',learning_offset=10.,
                                    random_state=5)
    topic_transformed_features = lda.fit_transform(tf_data_features) #topic_transformed_features is array of topic composition of reviews
    
    #code taken from example on scikit learn to print
    for topic_idx, topic in enumerate(lda.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([tf_vocab[i] for i in topic.argsort()[:-50 - 1:-1]]))
        
    #normalize LDA topic score vectors
    topic_transformed_features = Normalizer(copy=False).fit_transform(topic_transformed_features)
    
    num_clusters = n
    k_means = cluster.KMeans(n_clusters=num_clusters)
    #run k means clustering algorithm
    k_means.fit(topic_transformed_features)
    
    original_space_centroids = k_means.cluster_centers_
    
    #print out the centroid topic scores for topics that scored above 0.10
    for i in range(num_clusters):
        print("Cluster %d: " % i, end='\n')
        for x in range(0,original_space_centroids[i].size):
            if original_space_centroids[i,x] > .1:
                print("Topic ",x,": ",original_space_centroids[i,x])
        print("")
        print("")
        
    #reduce dimensionality for visualization and silhouette score calculation
    svd = TruncatedSVD(2)
    data_features = svd.fit_transform(topic_transformed_features)
    space_centroids = svd.transform(k_means.cluster_centers_)
    
    data_features_array = data_features.tolist()
    firstArray = []
    secondArray = []
    thirdArray = []
    fourthArray = []
    for i in range(len(data_features_array)):
        #print(coord_pair)
        if k_means.labels_[i] == 0:
            firstArray.append((data_features_array[i][0],data_features_array[i][1]))
        elif k_means.labels_[i] == 1:
            secondArray.append((data_features_array[i][0],data_features_array[i][1]))
        elif k_means.labels_[i] == 2:
            thirdArray.append((data_features_array[i][0],data_features_array[i][1]))
        else:
            fourthArray.append((data_features_array[i][0],data_features_array[i][1]))
    plt.plot([x[0] for x in firstArray],[y[1] for y in firstArray], 'ro',label="Cluster 0")
    if n >= 2:
        plt.plot([x[0] for x in secondArray],[y[1] for y in secondArray], 'go',label="Cluster 1")
    if n >= 3:
        plt.plot([x[0] for x in thirdArray],[y[1] for y in thirdArray], 'bo',label="Cluster 2")
    if n >= 4:
        plt.plot([x[0] for x in fourthArray],[y[1] for y in fourthArray], 'mo',label="Cluster 3")
    
    
    plt.plot([centroid[0] for centroid in space_centroids], [centroid[1] for centroid in space_centroids],'ko')
    plt.title('K-Means Clustering with LDA Feature Vectorization ('+str(n)+' Clusters)')
    plt.legend(loc='upper right',shadow=True, fontsize='medium')    
    figure = plt.gcf()
    figure.set_size_inches(8,6)
    if save is True and filename != "":
        plt.savefig(filename+'.png', dpi=100)
    plt.show()
    
    #calculate silhouette score
    silhouette_score = metrics.silhouette_score(data_features,k_means.labels_,metric='euclidean',sample_size=len(reviews))
    print(num_clusters, ': silhouette score: ',silhouette_score)
    return silhouette_score
Example #39
tfidf = transformer.fit_transform(cntTf)
word = vectorizer.get_feature_names()
weight = tfidf.toarray()
df_weight = pd.DataFrame(weight)
feature = df_weight.columns
df_weight['sum'] = 0
for f in tqdm(feature):
    df_weight['sum'] += df_weight[f]
deviceid_packages['tfidf_sum'] = df_weight['sum']

# In[10]:

lda = LatentDirichletAllocation(n_topics=5,
                                learning_offset=50.,
                                random_state=666)
docres = lda.fit_transform(cntTf)

# In[11]:

deviceid_packages = pd.concat(
    [deviceid_packages, pd.DataFrame(docres)], axis=1)

# In[12]:

temp = deviceid_packages.drop('apps', axis=1)
deviceid_train = pd.merge(deviceid_train, temp, on='device_id', how='left')

# In[13]:

# Parse out all device_app_pair entries
device_id_arr = []
Example #40

# X = document-term matrix
X = vectorizer.fit_transform(total_text)

t2 = time.time()


print('time for count vectorizer: ' + str((t2-t1)))

#vocab = vectorizer.get_feature_names()

n_top_words = 5

lda_model = LatentDirichletAllocation(n_components=17, random_state=100)
lda_model.fit_transform(X)

t3 = time.time()

print('time for LDA: ' + str((t3-t2)))

prepared_data = prepare(lda_model,X,vectorizer, mds = 'tsne', plot_opts={'xlab': '', 'ylab': ''})

 
t4 = time.time()



print('time for pyLDAvis: ' + str((t4-t3)))
print('total time: ' + str((t4-t0)))
 
Example #41
def main(data_samples, lang, n_features, n_topics, n_top_words):

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")    

    # Get the top 1000 tokens in order to correct and lemmatize them
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                        max_features=n_features,
                                        stop_words=stopword_spec(lang))

    t0 = time.time()
    tf = tf_vectorizer.fit_transform(data_samples)

    print("done in %0.3fs." % (time.time() - t0))
    
    # extract the top 1000 words for later use
    words_list = list(tf_vectorizer.vocabulary_.keys())
    
    print("Initialization of the spell checker on tokens...")
    
    t0 = time.time()
    
    # Check spelling of the top 1000 words
    corrected_words = spell_checker(words_list)

    print("done in %0.3fs." % (time.time() - t0))
    
    print("Initialization of the Lemmatizer...")
    
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    words_lemmatized = []
    for word in corrected_words:
        lemma = treetaggerwrapper.make_tags(tagger.tag_text(word),exclude_nottags=False)[0].lemma
        words_lemmatized.append(lemma)
    
    # Dict containing {unmodified words: words corrected and lemmatized}
    word_to_lemma_dict = dict(zip(words_list, words_lemmatized))
    
    # Transform the matrix to take into account spell check and lemmatization
    # 1 - Convert sparse matrix to dataframe so the column names can be changed
    tf_df = pd.DataFrame(tf.A, columns=tf_vectorizer.get_feature_names())
    
    # 2 - Change the name of the columns by the corrected and lemmatized words
    tf_df.rename(index=str, columns=word_to_lemma_dict, inplace=True)

    # 3 - Groupby columns with same name and sum of counts
    tf_df = tf_df.groupby(by=tf_df.columns, axis=1).sum()
    
    # 4 - Convert df back to sparse matrix
    tf = sps.csr_matrix(tf_df)
    
    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (len(data_samples), n_features))
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    t0 = time.time()
    doc_topics = lda.fit_transform(tf)
    print("done in %0.3fs." % (time.time() - t0))
    
    tf_feature_names = list(tf_df.columns)

    Topics = pd.DataFrame()

    
    for topic_idx, topic in enumerate(lda.components_):
        topic_words = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]

        for count, word in enumerate(topic_words):
         
            probabilite=sorted(lda.components_[topic_idx][:-n_top_words - 1:-1], reverse=True)
            total=sum(probabilite)
            #print(probabilite)
            topic_words[count] = word + " ( %0.3f )"  % (probabilite[count]/total*100) 
            
            #topic_words[count] = word + " ( %0.3f )"  % (lda.components_[topic_idx, count]/lda.components_[topic_idx][:-n_top_words - 1:-1].sum()*100) 
            #topic_words[count] = word + " (" + str(lda.components_[topic_idx, count]/lda.components_[topic_idx].sum()*100) + ")" 
			 #print(topic_words[i])
        Topics[topic_idx] = topic_words
       
    Topics = Topics.transpose()
	# frequence = pd.DataFrame(lda.components_)
	 #frequence = frequence.transpose()
    print("end LDA")
    return doc_topics, Topics
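Example #41 calls a `stopword_spec(lang)` helper that is not shown. A hypothetical sketch, assuming it just maps a language code to a stop-word list (for example via NLTK):

from nltk.corpus import stopwords

def stopword_spec(lang):
    # Map a short language code to the corresponding NLTK stop-word list.
    names = {'fr': 'french', 'en': 'english'}
    return stopwords.words(names.get(lang, 'english'))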
Example #42
class Cluster:

    """cluster input data using K-means, Minibatch-Kmeans or LDA. Input to clustering algorithms must be either
    a Tf-Idf vector or a hashing vector. tuning parameters can be configured in default.cfg file."""

    def __init__(self, config):
        self.config = config
        self.model = None
        self.svd = None

        # log_file = self.config.LOG_DIR + self.config.LOGFILE
        # logging.basicConfig(format='%(asctime)s::%(levelname)s::%(message)s', level=logging.INFO, filename=log_file)

    def do_kmeans(self, dataset):
        """vanilla k-means - Llyod's algorithm.
            Input:
                :parameter dataset: input data in the form of a term document matrix

            Output:
                :returns labels_: a list of cluster identifiers - 1 per input document
                :rtype list"""

        # # normalization
        # self.svd = TruncatedSVD(self.config.NCLUSTERS)
        # normalizer = Normalizer(copy=False)
        # lsa = make_pipeline(self.svd, normalizer)
        # dataset = lsa.fit_transform(dataset)

        # finish normalization,start k-means
        self.model = KMeans(n_clusters=self.config.NCLUSTERS, n_init=self.config.NINIT, n_jobs=self.config.INIT_PCNT)
        self.model.fit_transform(dataset)
        return self.model.labels_

    def do_minibatch_kmeans(self, dataset):
        """scalable version of k-means. used for large datasets. same input/output as k-means function
            Input:
                :parameter dataset: input data in the form of a term document matrix

            Output:
                :returns labels_: a list of cluster identifiers - 1 per input document
                :rtype list"""

        self.model = MiniBatchKMeans(n_clusters=self.config.NCLUSTERS, n_init=self.config.NINIT,
                                     batch_size=self.config.BATCHSIZE, max_iter=self.config.NITER, verbose=self.config)
        self.model.fit(dataset)
        return self.model.predict(dataset)

    def print_top_terms(self, features, model='kmeans'):
        """print top 'n' features(cluster centers) of each cluster
            Inputs:
                :parameter features: list of features returned by the vectorizer
                :parameter model: name of the model. default - kmeans"""

        if model == 'kmeans':
            for ind, term in enumerate(self.get_top_cluster_terms(features, model='kmeans')):
                print("Cluster #: {0}   Top terms: {1}".format(ind, term))
        elif model == 'lda':
            for ind, term in enumerate(self.get_top_cluster_terms(features, model='lda')):
                print("Topic #: {0}   Top terms: {1}".format(ind, term))

    def get_top_cluster_terms(self, features, model='kmeans', num_terms=15):
        """get top 'n' cluster features that constitute cluster centroids
            Input:
                :parameter features: list of features returned by the vectorizer
                :parameter model: name of the model. default - kmeans
                :parameter num_terms: # of terms to return. default - 15

            Output:
                :returns cluster centroids
                :rtype list"""

        top_terms = []
        if model == 'kmeans':
            # original_space_centroids = self.svd.inverse_transform(self.model.cluster_centers_)
            # order_centroids = original_space_centroids.argsort()[:, ::-1]
            order_centroids = self.model.cluster_centers_.argsort()[:, ::-1]
            for cluster_num in range(self.config.NCLUSTERS):
                top_terms.append(", ".join([features[i] for i in order_centroids[cluster_num, :num_terms]]))
        elif model == 'lda':
            for topic in self.model.components_:
                top_terms.append(", ".join([features[i] for i in topic.argsort()[:-num_terms - 1:-1]]))
        return top_terms

    def do_lda(self, dataset):
        """Latent Dirichlet Allocation
            Input:
                :parameter dataset: input data in the form of a term-document matrix

            Output:
                :return components_: topic-word matrix, one row of term weights per topic
                :rtype numpy.ndarray"""

        self.model = LatentDirichletAllocation(n_topics=self.config.NTOPICS, max_iter=self.config.NITER)
        self.model.fit(dataset)
        return self.model.components_

    def do_h2o_kmeans(self, dataset, server_url):
        """use the h2o module to perform k-means clustering.
            This method delegates clustering to a H2O server instance(local or remote). A connection attempt will be
            made to the provided server_url before clustering is initiated.
            input:
                :param dataset: input data - term document matrix
                :param server_url: URL of the H2O server instance on which clustering would run
            output:
                labels_: a list of cluster identifiers - 1 per input document
            :raises ConnectionError"""

        # establish connection to H2O server
        try:
            h2o.connect(url=server_url, verbose=False)
            logging.info("connected to H2O server")
            h2o_dataframe = h2o.H2OFrame(python_obj=dataset)
            self.model = H2OKMeansEstimator(max_iterations=self.config.NITER, k=self.config.NCLUSTERS, init="PlusPlus",
                                            standardize=False)
            self.model.train(training_frame=h2o_dataframe)
            logging.info("modelling complete. predicting cluster membership")
            return self.model.predict(h2o_dataframe)["predict"].as_data_frame(use_pandas=False, header=False)
        except H2OConnectionError:
            logging.error("unable to connect to H2O server @ {0}".format(server_url))
            raise ConnectionError("unable to connect to H2O server. check if server is running at specified URL")
Ejemplo n.º 43
0
#test_doc = [doc7, doc8]

NUM_TOPICS = 1

vectorizer = CountVectorizer(min_df=1,
                             max_df=6,
                             stop_words='english',
                             lowercase=True,
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(documents)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Let's see what the first document in the corpus looks like in different topic spaces
print("LDA")
print(lda_Z[0])
Ejemplo n.º 44
0
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,  # Number of topics
                                      max_iter=10,
                                      # Max learning iterations
                                      learning_method='online',
                                      random_state=100,
                                      # Random state
                                      batch_size=128,
                                      # n docs in each learning iter
                                      evaluate_every=-1,
                                      # compute perplexity every n iters, default: Don't
                                      n_jobs=-1,
                                      # Use all available CPUs
                                      )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

# LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
#                           evaluate_every=-1, learning_decay=0.7,
#                           learning_method="online", learning_offset=10.0,
#                           max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
#                           n_components=10, n_jobs=-1, n_topics=20, perp_tol=0.1,
#                           random_state=100, topic_word_prior=None,
#                           total_samples=1000000.0, verbose=0)

# Log Likelihood: the higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
Ejemplo n.º 45
0
km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)


# # Topic models

# In[11]:

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_topics=2, max_iter=100, random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features


# ## Show topics and their weights

# In[12]:

tt_matrix = lda.components_
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    topic = sorted(topic, key=lambda x: -x[1])
    topic = [item for item in topic if item[1] > 0.6]
    print(topic)
    print()
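# A natural follow-up (not in the original notebook cell): tag each document with its
# dominant topic taken from the document-topic matrix computed above.
features['dominant_topic'] = dt_matrix.argmax(axis=1)
features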
Ejemplo n.º 46
0
    'foo bar bar bar baz foo', 'foo foo foo bar baz', 'blah banana',
    'cookies candy', 'more text please', 'hey there are more words here',
    'bananas', 'i am a real boy', 'boy', 'girl'
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)

vocab = vectorizer.get_feature_names()

n_top_words = 5
k = 2

model = LatentDirichletAllocation(n_topics=k, random_state=100)

id_topic = model.fit_transform(X)

topic_words = {}

for topic, comp in enumerate(model.components_):
    # for the n-dimensional array "arr":
    # argsort() returns a ranked n-dimensional array of arr, call it "ranked_array"
    # which contains the indices that would sort arr in a descending fashion
    # for the ith element in ranked_array, ranked_array[i] represents the index of the
    # element in arr that should be at the ith index in ranked_array
    # ex. arr = [3,7,1,0,3,6]
    # np.argsort(arr) -> [3, 2, 0, 4, 5, 1]
    # word_idx contains the indices in "topic" of the top num_top_words most relevant
    # to a given topic ... it is sorted ascending to begin with and then reversed (desc. now)
    word_idx = np.argsort(comp)[::-1][:n_top_words]
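    # A plausible completion (the original snippet is cut off here): map each topic id
    # to its top words so the topic_words dictionary can be inspected or printed.
    topic_words[topic] = [vocab[i] for i in word_idx]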
Ejemplo n.º 47
0
# First Create Term-Frequency/Inverse Doc Frequency by Review Matrix
# This requires constructing Term Freq. x Doc. matrix first
tf_idf = TfidfTransformer()
print("\nTF-IDF Parameters\n", tf_idf.get_params(), "\n")
tf_idf = tf_idf.fit_transform(tf)
# Or you can construct the TF/IDF matrix from the data
tfidf_vect = TfidfVectorizer(max_df=max_df, min_df=2, max_features=m_features,\
                             analyzer=my_analyzer, ngram_range=ngram)
tf_idf = tfidf_vect.fit_transform(discussions)
print("\nTF_IDF Vectorizer Parameters\n", tfidf_vect, "\n")

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,\
                                learning_method=learning_method, \
                                learning_offset=learning_offset, \
                                random_state=12345)
lda.fit_transform(tf_idf)
print('{:.<22s}{:>6d}'.format("Number of Reviews", tf.shape[0]))
print('{:.<22s}{:>6d}'.format("Number of Terms", tf.shape[1]))
print("\nTopics Identified using LDA with TF_IDF")
tf_features = cv.get_feature_names()
max_words = 15
desc = []
for topic_idx, topic in enumerate(lda.components_):
    message = "Topic #%d: " % topic_idx
    message += " ".join(
        [tf_features[i] for i in topic.argsort()[:-max_words - 1:-1]])
    print(message)
    print()
    desc.append([tf_features[i] for i in topic.argsort()[:-max_words - 1:-1]])

# Extract topic probabilities
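# A possible continuation (assumption, not from the original source): the per-review
# topic probabilities are the rows of the document-topic matrix of the fitted model.
rev_topic_prob = lda.transform(tf_idf)   # shape (n_reviews, n_topics); rows sum to 1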
Ejemplo n.º 48
0
class TopicModeller(object):
    '''
    Wrapper for NMF and LDA topic models
    '''
    def __init__(self,
                 model_type='NMF',
                 vectorizer_type='tfidf',
                 k_topics=8,
                 max_vocab_size=5000,
                 min_df=20,
                 max_df=1.0,
                 ngram_range=(1, 1),
                 **kwargs):
        '''
        Input:
            max_vocab_size - upper bound on the number of features/terms
            min_df - vectorizer: min document frequency
            max_df - vectorizer: ignore words with a document frequency above %
            ngram_range - number of ngrams to search for, starting from 1.
                          The lower and upper boundary of the range of n-values
                          for different n-grams to be extracted
        '''
        # self.x_train = X_train  # document set as Pandas Series
        self.vectorizer = None
        self.vectorizer_type = vectorizer_type.lower()
        self.document_term_mat = None
        self.model = None
        self.model_type = model_type.lower()
        self.W = None  # Populated only for NMF model
        self.H = None  # Populated only for NMF model
        self.d2v_model = None

        token_pattern = nlp_utils.get_token_pattern()
        stop_words = nlp_utils.get_stop_words()

        for key in ('vectorizer', 'model_type'):
            if key in kwargs:
                setattr(self, key, kwargs[key])

        # for NMF
        if self.model_type == 'nmf':
            if self.vectorizer_type == 'tfidf':
                self.vectorizer = TfidfVectorizer(token_pattern=token_pattern,
                                                  min_df=min_df,
                                                  max_df=max_df,
                                                  max_features=max_vocab_size,
                                                  stop_words=stop_words,
                                                  ngram_range=ngram_range)
            else:
                self.vectorizer = nlp_utils.LocalwiseVectorizer(
                    max_features=max_vocab_size,
                    min_df=min_df,
                    max_df=max_df,
                    ngram_range=ngram_range)

        # For LDA
        elif self.model_type == 'lda':
            # Use tf (raw term count) features for LDA.
            self.vectorizer = CountVectorizer(token_pattern=token_pattern,
                                              max_df=max_df,
                                              min_df=min_df,
                                              max_features=max_vocab_size,
                                              stop_words=stop_words,
                                              ngram_range=ngram_range)

    def vectorize(self, docs):
        '''
        Vectorize the document content and fit the NMF

        Input
            train_docs - Training document set
        Output:
            the fit model
        '''
        # list of document content
        # eg, resume content for each user or job posting description content
        print('Number of documents to process: %s\n' % docs.shape)

        print("Extracting Vectorizer features...")
        t1 = time.time()
        self.document_term_mat = self.vectorizer.fit_transform(docs)
        print("- Time: %0.3fs.\n" % (time.time() - t1))

    def fit(self, docs, k_topics):
        '''
        Input
            docs - Documents to topic model
            k_topics - k number of topics to generate
        '''
        if self.document_term_mat is None:
            print('Vectorizer wasn\'t fitted.  ' \
                  'Call your TopicModeller.vectorize first.')
            return

        print("Fitting %s model with %d documents.  " \
              "Vectorizer: \n%s" % (self.model_type, docs.shape[0],
                                    self.vectorizer))

        # for NMF
        if self.model_type == 'nmf':
            self.model = NMF(n_components=k_topics,
                             alpha=.1,
                             l1_ratio=.5,
                             init='nndsvd')
        # For LDA
        elif self.model_type == 'lda':
            self.model = LatentDirichletAllocation(n_components=k_topics,
                                                   max_iter=5,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0)
        else:
            print('Unsupported model type \'%s\'' % self.model_type)
            return

        t1 = time.time()
        W = self.model.fit_transform(self.document_term_mat)
        H = self.model.components_
        if self.model.__class__.__name__.upper() == 'NMF':
            self.W = W
            self.H = H
        print("- Time: %0.3fs.\n" % (time.time() - t1))
        self.describe_matrix_factorization_results(self.document_term_mat,
                                                   W,
                                                   H,
                                                   n_top_words=20)

        if self.d2v_model is None:
            self.d2v_model = nlp_utils.get_doc2vec_model(docs)

    # # Place holder
    # def fit_lda():
    #     t1 = time.time()
    #     lda.fit(document_term_mat)
    #     print "- Time: %0.3fs.\n" % (time.time() - t1)
    #
    #     print "Topics in LDA model:"
    #     tf_feature_names = self.vectorizer.get_feature_names()
    #     self.print_top_words(lda, tf_feature_names)

    def document_term_mat_toframe(self):
        all_feature_names = self.vectorizer.get_feature_names()
        dtm = self.document_term_mat.todense()
        dfv = pd.DataFrame(dtm, columns=all_feature_names)
        return dfv

    # def print_top_words(self, model, feature_names, n_top=15):
    #     for topic_idx, topic in enumerate(model.components_):
    #         message = 'Topic #%d: ' % topic_idx
    #         message += ' '.join([feature_names[i]
    #                              for i in topic.argsort()[:-n_top - 1:-1]])
    #         print(message)
    #     print()

    def reconst_mse(self, target, left, right):
        '''
        Calculate the mean squared error between the source matrix and
        its reconstruction W.dot(H)
        '''
        return (np.array(target - left.dot(right))**2).mean()

    def describe_matrix_factorization_results(self,
                                              document_term_mat,
                                              W,
                                              H,
                                              n_top_words=15):
        '''
        For each latent topic print the top n words associated with that topic

        TODO: print probabilities
        '''
        feature_words = self.vectorizer.get_feature_names()
        print("Reconstruction mse: %f" %
              (self.reconst_mse(document_term_mat, W, H)))
        for topic_num, topic in enumerate(H):
            top_features = ', '.join([
                feature_words[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ])
            print("Topic %d: %s\n" % (topic_num, top_features))

        return

    def rank_terms(self):
        # get the sums over each column/term
        sums = self.document_term_mat.sum(axis=0)
        terms = self.vectorizer.get_feature_names()
        # map weights to the terms
        weights = {}
        for col, term in enumerate(terms):
            weights[term] = sums[0, col]
        # rank the terms by their weight over all documents
        return sorted(list(weights.items()),
                      key=operator.itemgetter(1),
                      reverse=True)

    def get_doc_terms_and_scores(self, doc_index):
        '''
        Return the tfidf values for vectorized terms for a document.

        Input:
            doc_index:  The row index number for the fitted document_term_matrix
        Output:
            dictionary of {doc terms: tfidf scores}

        Hint: A sorted print to use:
            for key, value in sorted(tfidf_scores.items(),
                                     key=lambda kv: (kv[1], kv[0]), reverse=True):
                print("{:<10}: {:<10}".format(key, value))

        '''
        all_feature_names = self.vectorizer.get_feature_names()
        dtm = self.document_term_mat.todense()

        doc_terms_indicies = dtm[doc_index, :].nonzero()[1]
        tfidf_scores = {
            all_feature_names[term_idx]: dtm[doc_index, term_idx]
            for term_idx in doc_terms_indicies
        }

        return tfidf_scores

    def print_W_probs(self, W):
        '''
        Input
            W NMF matrix
        '''
        probs = (W / W.sum(axis=1, keepdims=True)).flatten()
        ordered = np.argsort(probs)[::-1]
        for idx in ordered:
            print('Topic %s: %0.3f' % (idx, probs[idx]))

    def get_normalized_probs(self, topic_weights):
        '''
        Return the normalized topic cluster weights for a given row vector
        '''
        topic_weights = topic_weights.flatten()
        probs = (topic_weights / topic_weights.sum())
        return probs

    def get_top_topics_and_topic_probs(self):
        '''
        Generate the probability of each topic for each row (eg, job posting)
        in W, and add the top topic and probability and return each as a list,
        (for example to be used as new columns added to a dataframe)
        '''
        # For each row, get the topic weights, normalize, order by
        # weight value, and store in a list to add to the dataframe
        top_topics = []
        top_topic_weights = []
        for row_idx in range(self.W.shape[0]):
            W = self.W[row_idx]
            probs = self.get_normalized_probs(W)

            ordered_idxs = np.argsort(probs)[::-1]
            top_topics.append(ordered_idxs[0])
            top_topic_weights.append(probs[ordered_idxs[0]])

        return (top_topics, top_topic_weights)

    def custom_nmf(self,
                   document_term_mat,
                   k_topics=15,
                   n_iterations=50,
                   max_rows=20000,
                   eps=1e-6):
        '''
        Build the W and H matrix with least squares, clip negative values to 0

        k_topics is also said as number of components
        '''
        # n_rows = document_term_mat.shape[0]
        n_rows = max_rows
        n_cols = document_term_mat.shape[1]

        W = rand(n_rows * k_topics).reshape([n_rows, k_topics])
        H = rand(k_topics * n_cols).reshape([k_topics, n_cols])

        # linalg.lstsq doesn't work on sparse mats
        dense_document_term_mat = document_term_mat[0:n_rows].todense()
        print('dense_document_term_mat shape: ', dense_document_term_mat.shape)

        for i in range(n_iterations):
            print('iteration', i)
            H = np.linalg.lstsq(W, dense_document_term_mat)[0].clip(eps)
            W = np.linalg.lstsq(H.T, dense_document_term_mat.T)[0].clip(eps).T
        return np.array(W), np.array(H)

    def classify_training_docs(self, doc, display=True):
        '''
        Using the trained model to label each source doc used in training
        '''
        pass

    def classify_new_doc(self, doc, display=True):
        '''
        Classify a new document using the fit model (NMF, LDA, or other).

        Input
            doc - string
        Output
            Dictionary of topics and their weights
            Optional output on (True) by default
        '''
        if not self.model:
            print('A model has not been fit yet.')
            return None
        if not isinstance(doc, str):
            print('Input document must be a string')
            return None

        # Using NMF
        # TODO word2Vec
        document_term_mat = self.vectorizer.transform([doc])
        W = self.model.transform(document_term_mat)
        # H = self.model.components_

        probs = (W / W.sum(axis=1, keepdims=True)).flatten()
        ordered = np.argsort(probs)[::-1]
        topic_dict = {}
        for idx in ordered:
            topic_dict[idx] = probs[idx]
            if display:
                print('Topic %s: %0.3f' % (idx, probs[idx]))

        return topic_dict
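# A hypothetical call sequence for the TopicModeller class above, shown as comments
# because the class depends on the project's nlp_utils module, which is not part of
# this snippet; method and argument names are taken from the definitions above.
#
#   tm = TopicModeller(model_type='lda', k_topics=10, max_vocab_size=5000)
#   tm.vectorize(docs)             # docs: pandas Series of raw document text
#   tm.fit(docs, k_topics=10)      # fits the model and prints top words per topic
#   topic_weights = tm.classify_new_doc("text of an unseen document")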
Ejemplo n.º 49
0
    lda.fit(corpBodySpaCy)
    score = lda.score(corpBodySpaCy)
    perplexity = lda.perplexity(corpBodySpaCy)
    print(n, score, perplexity)
    lda_eval.append({'topics': n, 'score': score, 'perplexity': perplexity})

for item in lda_eval:
    print(item)


#Best number of topics from the best vectorizer

lda15 = LatentDirichletAllocation(n_topics=15, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
tf_trans = lda15.fit_transform(corpBodySpaCy)

topics = pd.DataFrame(tf_trans)

ldaTopics = topics.idxmax(axis=1)

blogs['ldaTopics'] = ldaTopics

topics.iloc[1]

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

topics.iloc[1]
Ejemplo n.º 50
0
    def compute_latent_vectors(self, col2, df) -> np.ndarray:
        document_term_matrix = self.create_document_term_matrix(df, col2)
        transformer = LatentDirichletAllocation(n_components=5,
                                                learning_method="online",
                                                random_state=99)
        return transformer.fit_transform(document_term_matrix)
Ejemplo n.º 51
0
def gene_var_text_relation():
    print('Loading gene/variation text...')
    train_gene_text = pd.read_csv('data/intermediate/train_gene_text', sep='|')
    train_var_text = pd.read_csv('data/intermediate/train_variation_text',
                                 sep='|')

    test_gene_text = pd.read_csv('data/intermediate/test_gene_text', sep='|')
    test_var_text = pd.read_csv('data/intermediate/test_variation_text',
                                sep='|')
    print('train_gene_text.shape:', train_gene_text.shape,
          'train_var_text.shape:', train_var_text.shape,
          'test_gene_text.shape:', test_gene_text.shape,
          'test_var_text.shape:', test_var_text.shape)

    gene_text = pd.concat((train_gene_text, test_gene_text),
                          axis=0,
                          ignore_index=True)
    gene_text.columns = ['Entity', 'Text']
    gps = gene_text.groupby('Entity')
    entity = []
    text = []
    for val, gp in gps:
        if gp.shape[0] != 1:
            entity.append(gp['Entity'].values[0])
            text.append(gp['Text'].sum())
    for e in entity:
        gene_text = gene_text[gene_text['Entity'] != e]
    gene_text = gene_text.append(pd.DataFrame({
        'Entity': entity,
        'Text': text
    }),
                                 ignore_index=True)
    del entity, text, gps

    var_text = pd.concat((train_var_text, test_var_text),
                         axis=0,
                         ignore_index=True)
    var_text.columns = ['Entity', 'Text']
    gps = var_text.groupby('Entity')
    entity = []
    text = []
    for val, gp in gps:
        if gp.shape[0] != 1:
            entity.append(gp['Entity'].values[0])
            text.append(gp['Text'].sum())
    for e in entity:
        var_text = var_text[var_text['Entity'] != e]
    var_text = var_text.append(pd.DataFrame({
        'Entity': entity,
        'Text': text
    }),
                               ignore_index=True)
    del entity, text, gps

    gene_var_text = pd.concat((gene_text, var_text), axis=0, ignore_index=True)
    print(gene_text.shape, var_text.shape, gene_var_text.shape)

    print('Applying document level tfidf + svd...')
    tfidf_word_vector = TfidfVectorizer(strip_accents='unicode',
                                        ngram_range=(1, 3),
                                        stop_words='english')
    tfidf_svd = TruncatedSVD(n_components=50, n_iter=25, random_state=12)

    text_tfidf = tfidf_word_vector.fit_transform(gene_var_text['Text'].values)
    print('text_tfidf:', text_tfidf.shape)
    text_doc_svd = tfidf_svd.fit_transform(text_tfidf)
    print('text_doc_svd:', text_doc_svd.shape)
    pd.concat(
        (gene_var_text['Entity'], pd.DataFrame(data=text_doc_svd)),
        axis=1,
        ignore_index=True).to_csv('data/intermediate/gv_text_doc_svd.csv',
                                  header=False,
                                  index=False)

    print('Applying document level tfidf + nmf...')
    tfidf_nmf = NMF(n_components=60)
    text_doc_nmf = tfidf_nmf.fit_transform(text_tfidf)
    print('text_doc_nmf:', text_doc_nmf.shape)
    pd.concat(
        (gene_var_text['Entity'], pd.DataFrame(data=text_doc_nmf)),
        axis=1,
        ignore_index=True).to_csv('data/intermediate/gv_text_doc_nmf.csv',
                                  header=False,
                                  index=False)
    del text_tfidf

    print('Applying sentence level tfidf(word/char) + svd...')
    sent_win = np.zeros((gene_var_text.shape[0], ), dtype=object)
    for i, text in enumerate(gene_var_text['Text'].tolist()):
        sent_win[i] = ' '.join([sent for sent in sent_tokenize(text)])

    tfidf_char_vector = TfidfVectorizer(strip_accents='unicode',
                                        analyzer='char',
                                        ngram_range=(1, 8),
                                        stop_words='english')

    word = tfidf_word_vector.fit_transform(sent_win)
    print('word.shape:', word.shape)
    word_svd = tfidf_svd.fit_transform(word)
    print('word_svd.shape:', word_svd.shape)
    del word

    char = tfidf_char_vector.fit_transform(sent_win)
    print('char.shape:', char.shape)
    del sent_win
    char_svd = tfidf_svd.fit_transform(char)
    print('char_svd.shape:', char_svd.shape)
    del char

    sent_tfidf_word_char_svd = np.concatenate((word_svd, char_svd), axis=1)
    del word_svd, char_svd
    print('sent_tfidf_word_char_svd:', sent_tfidf_word_char_svd.shape)
    pd.concat(
        (gene_var_text['Entity'], pd.DataFrame(data=sent_tfidf_word_char_svd)),
        axis=1,
        ignore_index=True).to_csv(
            'data/intermediate/gv_sent_tfidf_word_char_svd.csv',
            header=False,
            index=False)

    print('Extracting tf features on gene text for LDA...')
    count_vector = CountVectorizer(analyzer='word', stop_words='english')
    gene_train_tf = count_vector.fit_transform(train_gene_text['Text'].values)
    gene_test_tf = count_vector.transform(test_gene_text['Text'].values)
    print('gene_train_tf:', gene_train_tf.shape)
    print('gene_test_tf:', gene_test_tf.shape)

    print('Applying Latent Dirichlet Allocation on gene text...')
    lda_vector = LatentDirichletAllocation(n_components=50)
    gene_train_lda = lda_vector.fit_transform(gene_train_tf)
    gene_test_lda = lda_vector.transform(gene_test_tf)
    print('gene_train_lda:', gene_train_lda.shape)
    print('gene_test_lda:', gene_test_lda.shape)
    del gene_train_tf, gene_test_tf
    gene_lda = np.concatenate((gene_train_lda, gene_test_lda), axis=0)
    del gene_train_lda, gene_test_lda
    gene_lda_df = pd.concat(
        (pd.concat([train_gene_text['Gene'], test_gene_text['Gene']],
                   axis=0,
                   ignore_index=True), pd.DataFrame(data=gene_lda)),
        axis=1,
        ignore_index=True)

    # merge same entities
    gps = gene_lda_df.groupby(0)
    entity = []
    vec = []
    for val, gp in gps:
        if gp.shape[0] != 1:
            entity.append(gp[0].values[0])
            vec.append(sum(gp.values[:, 1:].astype(float)))
    for e in entity:
        gene_lda_df = gene_lda_df[gene_lda_df[0] != e]
    gene_lda_df = gene_lda_df.append(
        pd.concat([pd.DataFrame(data=entity),
                   pd.DataFrame(data=vec)],
                  axis=1,
                  ignore_index=True))
    del entity, vec, gps

    gene_lda_df.to_csv('data/intermediate/gv_gene_tf_lda50.csv',
                       header=False,
                       index=False)
    del gene_lda

    print('Extracting tf features on variation text for LDA...')
    var_train_tf_feats = count_vector.fit_transform(
        train_var_text['Text'].values)
    var_test_tf_feats = count_vector.transform(test_var_text['Text'].values)
    print('var_train_tf_feats:', var_train_tf_feats.shape)
    print('var_test_tf_feats:', var_test_tf_feats.shape)

    print('Applying Latent Dirichlet Allocation on variation text...')
    var_train_lda_feats = lda_vector.fit_transform(var_train_tf_feats)
    var_test_lda_feats = lda_vector.transform(var_test_tf_feats)
    print('var_train_lda_feats:', var_train_lda_feats.shape)
    print('var_test_lda_feats:', var_test_lda_feats.shape)
    del var_train_tf_feats, var_test_tf_feats
    var_lda = np.concatenate((var_train_lda_feats, var_test_lda_feats), axis=0)
    del var_train_lda_feats, var_test_lda_feats
    var_lda_df = pd.concat(
        (pd.concat([train_var_text['Variation'], test_var_text['Variation']],
                   axis=0,
                   ignore_index=True), pd.DataFrame(data=var_lda)),
        axis=1,
        ignore_index=True)
    # merge same entities
    gps = var_lda_df.groupby(0)
    entity = []
    vec = []
    for val, gp in gps:
        if gp.shape[0] != 1:
            entity.append(gp[0].values[0])
            vec.append(sum(gp.values[:, 1:].astype(float)))
    for e in entity:
        var_lda_df = var_lda_df[var_lda_df[0] != e]
    var_lda_df = var_lda_df.append(
        pd.concat([pd.DataFrame(data=entity),
                   pd.DataFrame(data=vec)],
                  axis=1,
                  ignore_index=True))
    del entity, vec, gps
    var_lda_df.to_csv('data/intermediate/gv_var_tf_lda50.csv',
                      header=False,
                      index=False)
    del var_lda

    print('Applying TF custom idf feature on gene text...')
    gene_dic = _get_tf_dic(train_gene_text['Text'].values,
                           test_gene_text['Text'].values,
                           flag='gene')
    _, gene_idf_list = document_mining._word_occur_cls(gene_dic)
    gene_tfidf = document_mining._get_tfidf(gene_text['Text'].values, gene_dic,
                                            gene_idf_list)
    pd.concat((gene_text['Entity'], pd.DataFrame(data=gene_tfidf)),
              axis=1,
              ignore_index=True).to_csv(
                  'data/intermediate/gv_gene_tf_custom_idf.csv',
                  header=False,
                  index=False)

    print('Applying TF custom idf feature on variation text...')
    var_dic = _get_tf_dic(train_var_text['Text'].values,
                          test_var_text['Text'].values,
                          flag='variation')
    _, var_idf_list = document_mining._word_occur_cls(var_dic)
    var_tfidf = document_mining._get_tfidf(var_text['Text'].values, var_dic,
                                           var_idf_list)
    pd.concat(
        (var_text['Entity'], pd.DataFrame(data=var_tfidf)),
        axis=1,
        ignore_index=True).to_csv('data/intermediate/gv_var_tf_custom_idf.csv',
                                  header=False,
                                  index=False)
    del gene_dic, var_dic, gene_idf_list, var_idf_list, gene_tfidf, var_tfidf

    print('Applying TF custom idf feature on built gene/var dictionary...')
    gene_dic = set([
        line.rstrip('\n')
        for line in open('data/intermediate/gene_tf_unique_dict_all.txt', 'r')
    ])
    var_dic = set([
        line.rstrip('\n') for line in open(
            'data/intermediate/variation_tf_unique_dict_all.txt', 'r')
    ])
    gene_var_dic_intxn = set(gene_dic).intersection(set(var_dic))

    gene_unique_dic = list(gene_dic - gene_var_dic_intxn)
    _, gene_idf_list = document_mining._word_occur_cls(gene_unique_dic)
    gene_tfidf = document_mining._get_tfidf(gene_text['Text'].values,
                                            gene_unique_dic, gene_idf_list)
    pd.concat((gene_text['Entity'], pd.DataFrame(data=gene_tfidf)),
              axis=1,
              ignore_index=True).to_csv(
                  'data/intermediate/gv_gene_unique_tf_custom_idf.csv',
                  header=False,
                  index=False)

    var_unique_dic = list(var_dic - gene_var_dic_intxn)
    _, var_idf_list = document_mining._word_occur_cls(var_unique_dic)
    var_tfidf = document_mining._get_tfidf(var_text['Text'].values,
                                           var_unique_dic, var_idf_list)
    pd.concat((var_text['Entity'], pd.DataFrame(data=var_tfidf)),
              axis=1,
              ignore_index=True).to_csv(
                  'data/intermediate/gv_var_unique_tf_custom_idf.csv',
                  header=False,
                  index=False)

    _, idf_list = document_mining._word_occur_cls(list(gene_var_dic_intxn))
    tfidf = document_mining._get_tfidf(gene_var_text['Text'].values,
                                       list(gene_var_dic_intxn), idf_list)
    pd.concat((gene_var_text['Entity'], pd.DataFrame(data=tfidf)),
              axis=1,
              ignore_index=True).to_csv(
                  'data/intermediate/gv_gene_var_intxn_tf_custom_idf.csv',
                  header=False,
                  index=False)
Ejemplo n.º 52
0
        if len(line) == 0:
            print("File read finished")
            break
        line = re.sub("[^a-zA-Z ]", "", line)
        tokenized = word_tokenize(line)
        # filter in one pass; removing items from a list while iterating over it skips elements
        tokenized = [word for word in tokenized if word not in ("CNN", "highlight")]
        cnnTokenized.append(tokenized)

        sent = ""
        for w in tokenized:
            sent += (w + " ")
        cnnDoc.append(sent)

    cnnTF_IDF = vectorizer.fit_transform(cnnDoc)
    with open("cnn-tf-idf.pkl", 'wb') as handle:
        pickle.dump(cnnTF_IDF, handle)

    vocabs = vectorizer.get_feature_names()
    with open("cnn-terms.pkl", 'wb') as handle:
        pickle.dump(vectorizer.get_feature_names(), handle)

lda = LatentDirichletAllocation(n_components=20)
lda.fit_transform(cnnTF_IDF)
topics = lda.components_

for index, topic in enumerate(topics):
    print("Topic %d : " % (index + 1), end="")
    print([(vocabs[i], topic[i].round(5)) for i in topic.argsort()[:-6:-1]])
Ejemplo n.º 53
0
def generate_lda_feature(x, topic) -> pd.DataFrame:
    log.info(f"Generating lda features from x:{x.shape} topics:{topic}")
    lda = LatentDirichletAllocation(n_components=topic, max_iter=10, random_state=0)
    dt_matrix = lda.fit_transform(x)
    return pd.DataFrame(dt_matrix)
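# Illustrative use of the helper above; the toy documents are an assumption, and x can be
# any non-negative document-term matrix, e.g. the output of CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
toy_docs = ["lda finds topics", "topics group words", "words form documents"]
toy_counts = CountVectorizer().fit_transform(toy_docs)
toy_lda_features = generate_lda_feature(toy_counts, topic=2)   # DataFrame of shape (3, 2)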
Ejemplo n.º 54
0
train_svd = svd2.fit_transform(train_tfidf)
train_svd = pd.DataFrame(train_svd)
train_svd = sc.fit_transform(train_svd)

test_svd = svd2.transform(test_tfidf)
test_svd = pd.DataFrame(test_svd)
test_svd = sc.transform(test_svd)  # reuse the scaler fitted on the training split

# Words - LDiA
Counter = CountVectorizer(tokenizer=italian_tokenizer)
cv_bow = pd.DataFrame(Counter.fit_transform(raw_documents=cv_text).toarray())
dev_bow = pd.DataFrame(Counter.transform(raw_documents=dev_text).toarray())

ldia = LDiA(n_components=32, learning_method="batch")
cv_ldia = ldia.fit_transform(cv_bow)
cv_ldia = pd.DataFrame(cv_ldia)
cv_ldia = sc.fit_transform(cv_ldia)

dev_ldia = ldia.transform(dev_bow)
dev_ldia = pd.DataFrame(dev_ldia)
dev_ldia = sc.transform(dev_ldia)  # reuse the scaler fitted on the cv split

Counter2 = CountVectorizer(tokenizer=italian_tokenizer)

train_bow = pd.DataFrame(
    Counter2.fit_transform(raw_documents=df_train).toarray())
test_bow = pd.DataFrame(Counter2.transform(raw_documents=test_text).toarray())

ldia2 = LDiA(n_components=32, learning_method="batch")
Ejemplo n.º 55
0
                                                stop_words='english')
                tf = tf_vectorizer.fit_transform(text)
                # todo rationalize this heuristic: the expected cluster count is the
                # inverse density of the term-document matrix (total cells / non-zero cells)
                true_k = tf.shape[0] * tf.shape[1] // tf.nnz
                logger.debug(
                    'using the TF/CountVectorizer data we expect %d clusters' %
                    true_k)
                n_topics = true_k
                lda = LatentDirichletAllocation(learning_method='online',
                                                learning_offset=50.,
                                                max_iter=5,
                                                n_topics=n_topics,
                                                random_state=random_state)
                lda.fit(tf)

                lda_results = lda.fit_transform(tf)
                tf_feature_names = tf_vectorizer.get_feature_names()
                for topic_idx, topic in enumerate(lda.components_):
                    logger.debug('Topic #%d:' % topic_idx)
                    logger.debug(' '.join([
                        '[' + tf_feature_names[i] + ']'
                        for i in topic.argsort()[:-n_top_words - 1:-1]
                    ]))
                # let's make a grid of topics and words
                if False:
                    values = lda.components_.copy()
                    # todo find a good threshold
                    # threshold = 0.75
                    # values[values < threshold] = 0
                    t4 = values.min()
                    t5 = values.max()
Ejemplo n.º 56
0
fr = open("Med5Sept.csv", "r")
if fr.mode == 'r':
    contenidofiltrado = fr.readlines()
    print(contenidofiltrado)

count_vect = CountVectorizer(max_df=0.1,
                             min_df=0,
                             stop_words=spanish_stopwords)  # 'spanish')
# doc_term_matrix = count_vect.fit_transform(reviews_datasets['description'].values.astype('U'))
# doc_term_matrix = count_vect.fit_transform(reviews_datasets['text'].values.astype('U'))
doc_term_matrix = count_vect.fit_transform(contenidofiltrado)

NUM_TOPICS = 6

LDA = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=42)
lda_Z = LDA.fit_transform(doc_term_matrix)

sentencia = []


def print_topics(model, vectorizer, top_n=11):  # 11 phrases (sintagmas) per topic
    for idx, topic in enumerate(model.components_):
        # print("Topic %d:" % (idx))
        oracion = ' '.join([(vectorizer.get_feature_names()[i])
                            for i in topic.argsort()[:-top_n - 1:-1]])
        # print([(vectorizer.get_feature_names()[i] )
        #      for i in topic.argsort()[:-top_n - 1:-1]])
        # print(oracion)
        sentencia.append(oracion)
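# The snippet stops before print_topics is invoked; a plausible call (an assumption) that
# fills `sentencia` with one line of top terms per topic would be:
print_topics(LDA, count_vect)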

Ejemplo n.º 57
0
#%% Import libs and frameworks
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#%% Latent Dirichlet Allocation
df = pd.read_csv("movie_data.csv", encoding="utf-8")
count = CountVectorizer(stop_words="english", max_df=0.1, max_features=5000)
X = count.fit_transform(df["review"].values)

lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method="online")
X_topics = lda.fit_transform(X)
print(lda.components_.shape)

n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d" % (topic_idx + 1))
    print(" ".join(
        [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
Ejemplo n.º 58
0
    print("")
    
    print("==> DFS", prefix)
    print_results_for_field(dataset, Xt_full, "DFS", prefix)
    print("")
    print("")
    
    print("==> posOutcome", prefix)
    print_results_for_field(dataset, Xt_full, "posOutcome", prefix)
    print("")
    print("")


    

treat_dataset  = read_treat_dataset()
combat_dataset = read_combat_dataset()

X_full, _ = prepare_full_dataset(drop_trea(combat_dataset))


pam_types_cat_dataset = read_pam_types_cat_dataset()

assert all(pam_types_cat_dataset['patient_ID'] == combat_dataset['patient_ID'])

    
for n_cluster in [1, 5, 10, 20, 100, 200]:
    lda = LatentDirichletAllocation(n_components=n_cluster)
    # shift the data so every value is non-negative, as LDA expects count-like input
    Xt_full = lda.fit_transform(X_full - np.min(X_full))
    print_results(pam_types_cat_dataset, Xt_full, "nc" + str(n_cluster))
Ejemplo n.º 59
0
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(textVector)

from sklearn.decomposition import LatentDirichletAllocation

n_topics = 9
lda = LatentDirichletAllocation(n_topics=n_topics)
lda.fit(textVector)

topicWordMatrix = lda.components_

import numpy

prefixMatrix = numpy.where(topicWordMatrix >= 0, "", "^")

sort = numpy.argsort(-1 * numpy.abs(topicWordMatrix), axis=1)[:, 0:10]

prefixs = []
for i in range(n_topics):
    prefixs.append(prefixMatrix[i, sort[i]])

keywords = pandas.Index(countVectorizer.get_feature_names())[sort].values

print(prefixs + keywords)

textTopicMatrix = lda.fit_transform(textVector)

corpos['topic'] = textTopicMatrix.argmax(axis=1)

pandas.crosstab(corpos['class'], corpos['topic'])
Ejemplo n.º 60
0
df["content_cutted"] = df.content.apply(chinese_word_cut)
print(type(df.content_cutted))
#numpy.savetxt('new1.csv',df.content_cutted, delimiter = ',')
n_features = 10000  # number of vocabulary terms (features)
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(df.content_cutted)

n_topics = 30  # number of topics
lda = LatentDirichletAllocation(n_topics=n_topics,
                                max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
cm = lda.fit_transform(tf)
n_top_words = 30  # number of top words to print per topic
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

numpy.savetxt('new2.csv', cm, delimiter=',')
# K-means clustering of the document-topic matrix into 32 clusters
kmeans = KMeans(n_clusters=32, random_state=0).fit(cm)
kresults = pd.DataFrame(data=numpy.array(kmeans.labels_))
print(type(names), type(kresults))
newre = pd.concat([names, kresults], axis=1)
#newre = names.append(kresults)
newre.to_csv('new3.csv', encoding="utf-8", index=False)