def greene_metric(self, min_num_topics=10, step=5, max_num_topics=50, top_n_words=10, tao=10):
    """
    Compute the Greene stability metric for selecting the optimal number of topics.

    Based on "How Many Topics? Stability Analysis for Topic Models",
    Greene et al., 2014. For each candidate k, a reference model trained on
    the full corpus is compared with `tao` models trained on samples of the
    corpus; the mean agreement between top-word rankings is the stability.

    :param min_num_topics: Minimum number of topics to test
    :param step: Increment between two successive tested numbers of topics
    :param max_num_topics: Maximum number of topics to test
    :param top_n_words: Number of top words per topic used when comparing rankings
    :param tao: Number of sampled models to build per tested k
    :return: A list with the mean stability score for each tested k
    """
    stability = []
    for k in np.arange(min_num_topics, max_num_topics + 1, step):
        # Reference topic model trained on the full corpus.
        self.infer_topics(k)
        # top_words(i, n) yields (word, weight) pairs; keep only the ranked
        # words. A comprehension is used instead of zip(*...)[0], which is
        # Python-2-only (zip() returns a non-subscriptable iterator on Py3).
        reference_rank = [
            [word for word, _ in self.top_words(i, top_n_words)]
            for i in range(k)
        ]
        agreement_score_list = []
        # Build tao topic models, each on a fresh sample of the corpus.
        for _ in range(tao):
            tao_corpus = Corpus(
                source_file_path=self.corpus._source_file_path,
                language=self.corpus._language,
                vectorization=self.corpus._vectorization,
                max_relative_frequency=self.corpus._max_relative_frequency,
                min_absolute_frequency=self.corpus._min_absolute_frequency,
                preprocessor=self.corpus._preprocessor,
                sample=True)
            # type(self) so subclasses (LDA, NMF, ...) rebuild their own kind.
            tao_model = type(self)(tao_corpus)
            tao_model.infer_topics(k)
            tao_rank = [
                [word for word, _ in tao_model.top_words(i, top_n_words)]
                for i in range(k)
            ]
            agreement_score_list.append(
                stats.agreement_score(reference_rank, tao_rank))
        stability.append(np.mean(agreement_score_list))
    return stability
# NOTE(review): this chunk begins mid-method — the enclosing `def` (and the
# origin of `tokenizer`, `document` and `window`) lies outside this view.
# Builds a word-by-word co-occurrence count matrix over a sliding window.
terms = tokenizer.tokenize(document)
nb_terms = len(terms)
for i in range(nb_terms):
    # Row index of the focal word; id_for_word returns -1 when the word
    # is out of vocabulary, in which case the position is skipped.
    row_index = self.corpus.id_for_word(terms[i])
    if row_index != -1:
        # Clamp the symmetric context window to the document boundaries.
        start = i - window
        if start < 0:
            start = 0
        end = i + window
        if end >= nb_terms:
            end = nb_terms - 1
        # Left context plus right context, excluding the focal word itself.
        context0 = terms[start:i]
        context1 = terms[i + 1:end + 1]
        context0.extend(context1)
        for term in context0:
            column_index = self.corpus.id_for_word(term)
            if column_index != -1:
                # Count one co-occurrence of (focal word, context word).
                self.word_context_matrix[row_index][
                    column_index] += 1


if __name__ == '__main__':
    # Demo: build a word-context matrix from the lemmatized EGC corpus
    # with a window of 5 words on each side.
    corpus = Corpus(source_file_path='../input/egc_lemmatized.csv',
                    language='french',
                    vectorization='tfidf',
                    max_relative_frequency=0.8,
                    min_absolute_frequency=4,
                    preprocessor=None)
    model = LanguageModel(corpus)
    model.compute_word_context_matrix(5)
__email__ = "*****@*****.**"

# Flask Web server
app = Flask(__name__)

# Parameters
max_tf = 0.8             # ignore words whose relative frequency is > max_tf
min_tf = 4               # ignore words whose absolute frequency is < min_tf
lemmatizer = None        # no pre-processing of the documents
num_topics = 20
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='../input/egc.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
# print() call form (single formatted argument) runs identically on
# Python 2 and Python 3; the original `print x, y` statement is a
# syntax error on Python 3.
print('corpus size: %s' % corpus.size)
print('vocabulary size: %s' % len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')
# coding: utf-8

# Demo: load a corpus, train an NMF topic model and estimate the optimal
# number of topics with the Greene stability metric.

from nlp.topic_model import LatentDirichletAllocation, LatentSemanticAnalysis, NonNegativeMatrixFactorization
from nlp.preprocessor import FrenchLemmatizer, EnglishStemmer, EnglishLemmatizer
from structure.corpus import Corpus
from visualization.visualization import Visualization
import utils

__author__ = "Adrien Guille, Pavel Soriano"
__email__ = "*****@*****.**"

# Load and prepare a corpus.
# print() call form (single formatted argument) runs identically on
# Python 2 and Python 3; the original `print x, y` statement is a
# syntax error on Python 3.
print('Load documents from CSV')
corpus = Corpus(source_file_path='input/egc.csv',
                language='french',                # language for stop words
                vectorization='tfidf',            # 'tf' (term-frequency) or 'tfidf' (term-frequency inverse-document-frequency)
                max_relative_frequency=0.8,       # ignore words which relative frequency is > than max_relative_frequency
                min_absolute_frequency=4,         # ignore words which absolute frequency is < than min_absolute_frequency
                preprocessor=FrenchLemmatizer())  # pre-process documents
print('corpus size: %s' % corpus.size)
print('vocabulary size: %s' % len(corpus.vocabulary))
print('Vector representation of document 0:\n%s' % corpus.vector_for_document(0))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics
viz = Visualization(topic_model)
viz.plot_greene_metric(min_num_topics=10,
                       max_num_topics=30,
                       tao=10,
                       step=1,
                       top_n_words=10)
__email__ = "*****@*****.**"

# Flask Web server
app = Flask(__name__)

# Parameters
max_tf = 0.8             # ignore words whose relative frequency is > max_tf
min_tf = 4               # ignore words whose absolute frequency is < min_tf
lemmatizer = None        # no pre-processing of the documents
num_topics = 20
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='../input/elysee.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
# print() call form (single formatted argument) runs identically on
# Python 2 and Python 3; the original `print x, y` statement is a
# syntax error on Python 3.
print('corpus size: %s' % corpus.size)
print('vocabulary size: %s' % len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')
def _str2bool(value):
    """Parse a textual boolean CLI value ('True'/'False', case-insensitive).

    argparse's ``type=bool`` is a known trap: ``bool('False')`` is True
    because any non-empty string is truthy, so ``--dw False`` would have
    enabled decreasing weighting. Parse the text explicitly instead;
    raising ValueError makes argparse report an invalid-value error.
    """
    lowered = value.lower()
    if lowered in ('true', '1', 'yes'):
        return True
    if lowered in ('false', '0', 'no'):
        return False
    raise ValueError('Boolean value expected, got %r' % value)


# NOTE(review): `p` (the ArgumentParser) is created before this chunk.
p.add_argument('--nf', metavar='nb_features', type=int,
               help='Vocabulary size (default to 50000)', default=50000)
p.add_argument('--ws', metavar='window_size', type=int,
               help='Context window size (default to 5)', default=5)
p.add_argument('--dw', metavar='decreasing_weighting', type=_str2bool,
               help='Decreasing weighting (True or False, default to False)',
               default=False)
args = p.parse_args()
print('Arguments:\n   Input file: %s\n   Output file: %s\n   Max number of features: %d\n   Window size: %d\n   Decreasing weighting: %s'
      % (args.i, args.o, args.nf, args.ws, args.dw))

# Build the co-occurrence corpus, timing the operation.
print('Loading corpus...')
start_time = timeit.default_timer()
my_corpus = Corpus(args.i,
                   nb_features=args.nf,
                   window_size=args.ws,
                   decreasing_weighting=args.dw)
elapsed = timeit.default_timer() - start_time
print('Corpus loaded in %f seconds.' % elapsed)

# Serialize the co-occurrence matrix; `with` closes the file even on error
# (the original `open()` inside pickle.dump leaked the handle).
with open(args.o, 'wb') as output_file:
    pickle.dump(my_corpus.X, output_file)
# coding: utf-8 from structure.corpus import Corpus from nlp.semantic_model import PPMI_SVD, COALS, GloVe import timeit __authors__ = "Adrien Guille" __email__ = "*****@*****.**" print('Loading corpus...') start_time = timeit.default_timer() my_corpus = Corpus('data/messages3.csv', nb_features=50000, window_size=5, decreasing_weighting=True) elapsed = timeit.default_timer() - start_time print('Corpus loaded in %f seconds.' % elapsed) method = input('Select a method (either PPMI+SVD, COALS or GloVe): ') my_semantic_model = None if method == 'PPMI+SVD': print('Learning vector space with PPMI+SVD...') start_time = timeit.default_timer() my_semantic_model = PPMI_SVD(my_corpus) my_semantic_model.learn_vector_space(dimensions=100) elapsed = timeit.default_timer() - start_time print('Vector space learned in %f seconds.' % elapsed) elif method == 'COALS': print('Learning vector space with COALS...') start_time = timeit.default_timer() my_semantic_model = COALS(my_corpus)