Example #1
 def greene_metric(self,
                   min_num_topics=10,
                   step=5,
                   max_num_topics=50,
                   top_n_words=10,
                   tao=10):
     """
     Implements Greene metric to compute the optimal number of topics. Taken from How Many Topics?
     Stability Analysis for Topic Models from Greene et al. 2014.
     :param step:
     :param min_num_topics: Minimum number of topics to test
     :param max_num_topics: Maximum number of topics to test
     :param top_n_words: Top n words for topic to use
     :param tao: Number of sampled models to build
     :return: A list of len (max_num_topics - min_num_topics) with the stability of each tested k
     """
     stability = []
     for k in np.arange(min_num_topics, max_num_topics + 1, step):
         # Build the reference topic model on the full corpus
         self.infer_topics(k)
         reference_rank = [
             [word for word, weight in self.top_words(i, top_n_words)]
             for i in range(k)
         ]
         agreement_score_list = []
         # Generate tao topic models, each trained on a random sample of the corpus
         for t in range(tao):
             tao_corpus = Corpus(
                 source_file_path=self.corpus._source_file_path,
                 language=self.corpus._language,
                 vectorization=self.corpus._vectorization,
                 max_relative_frequency=self.corpus._max_relative_frequency,
                 min_absolute_frequency=self.corpus._min_absolute_frequency,
                 preprocessor=self.corpus._preprocessor,
                 sample=True)
             tao_model = type(self)(tao_corpus)
             tao_model.infer_topics(k)
             tao_rank = [
                 [word for word, weight in tao_model.top_words(i, top_n_words)]
                 for i in range(k)
             ]
             agreement_score_list.append(
                 stats.agreement_score(reference_rank, tao_rank))
         stability.append(np.mean(agreement_score_list))
     return stability
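For orientation, a minimal usage sketch of this method, assuming a corpus and an NMF model set up as in Example #4 (the parameter values below are illustrative, not prescribed):

topic_model = NonNegativeMatrixFactorization(corpus)
stability = topic_model.greene_metric(min_num_topics=10,
                                      max_num_topics=30,
                                      step=5,
                                      top_n_words=10,
                                      tao=10)
print('Stability per tested number of topics:', stability)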
Example #2
            terms = tokenizer.tokenize(document)
            nb_terms = len(terms)
            for i in range(nb_terms):
                row_index = self.corpus.id_for_word(terms[i])
                if row_index != -1:
                    # Clip the context window to the document boundaries
                    start = max(i - window, 0)
                    end = min(i + window, nb_terms - 1)
                    # Gather the words to the left and right of the focus word
                    context = terms[start:i] + terms[i + 1:end + 1]
                    for term in context:
                        column_index = self.corpus.id_for_word(term)
                        if column_index != -1:
                            self.word_context_matrix[row_index][column_index] += 1


if __name__ == '__main__':
    corpus = Corpus(source_file_path='../input/egc_lemmatized.csv',
                    language='french',
                    vectorization='tfidf',
                    max_relative_frequency=0.8,
                    min_absolute_frequency=4,
                    preprocessor=None)
    model = LanguageModel(corpus)
    model.compute_word_context_matrix(5)
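To make the windowing logic above concrete, here is a self-contained toy sketch of the same co-occurrence counting (the token list and window size are hypothetical, and a plain dict stands in for the class's word-context matrix):

from collections import defaultdict

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
window = 2
cooc = defaultdict(int)
for i, focus in enumerate(tokens):
    # Same boundary clipping as compute_word_context_matrix above
    start = max(i - window, 0)
    end = min(i + window, len(tokens) - 1)
    for term in tokens[start:i] + tokens[i + 1:end + 1]:
        cooc[(focus, term)] += 1

print(cooc[('cat', 'sat')])  # 1: 'sat' occurs once within 2 words of 'cat'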
Example #3
import os
import shutil

from flask import Flask
from structure.corpus import Corpus
from nlp.topic_model import NonNegativeMatrixFactorization

__email__ = "*****@*****.**"

# Flask Web server
app = Flask(__name__)

# Parameters
max_tf = 0.8
min_tf = 4
lemmatizer = None
num_topics = 20
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='../input/egc.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')
Example #4
# coding: utf-8
from nlp.topic_model import LatentDirichletAllocation, LatentSemanticAnalysis, NonNegativeMatrixFactorization
from nlp.preprocessor import FrenchLemmatizer, EnglishStemmer, EnglishLemmatizer
from structure.corpus import Corpus
from visualization.visualization import Visualization
import utils

__author__ = "Adrien Guille, Pavel Soriano"
__email__ = "*****@*****.**"

# Load and prepare a corpus
print('Loading documents from CSV')
corpus = Corpus(source_file_path='input/egc.csv',
                language='french',  # language for stop words
                vectorization='tfidf',  # 'tf' (term frequency) or 'tfidf' (term frequency-inverse document frequency)
                max_relative_frequency=0.8,  # ignore words whose relative frequency is above this threshold
                min_absolute_frequency=4,  # ignore words whose absolute frequency is below this threshold
                preprocessor=FrenchLemmatizer())  # pre-process documents
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))
print('Vector representation of document 0:\n', corpus.vector_for_document(0))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics
viz = Visualization(topic_model)
viz.plot_greene_metric(min_num_topics=10,
                       max_num_topics=30,
                       tao=10, step=1,
                       top_n_words=10)
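As a hedged follow-up, the number of topics is typically chosen where the stability curve peaks; a sketch assuming greene_metric (Example #1) returns one score per tested value of k:

import numpy as np

# Hypothetical: pick the k with the highest mean stability score.
ks = np.arange(10, 30 + 1, 1)  # mirrors min_num_topics=10, max_num_topics=30, step=1
scores = topic_model.greene_metric(min_num_topics=10, max_num_topics=30,
                                   step=1, top_n_words=10, tao=10)
best_k = int(ks[np.argmax(scores)])
print('Most stable number of topics:', best_k)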
Example #5
import os
import shutil

from flask import Flask
from structure.corpus import Corpus
from nlp.topic_model import NonNegativeMatrixFactorization

__email__ = "*****@*****.**"

# Flask Web server
app = Flask(__name__)

# Parameters
max_tf = 0.8
min_tf = 4
lemmatizer = None
num_topics = 20
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='../input/elysee.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf,
                preprocessor=None)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory
if os.path.exists('static/data'):
    shutil.rmtree('static/data')
os.makedirs('static/data')
Example #6
    p.add_argument('--nf',
                   metavar='nb_features',
                   type=int,
                   help='Vocabulary size (default to 50000)',
                   default=50000)
    p.add_argument('--ws',
                   metavar='window_size',
                   type=int,
                   help='Context window size (default to 5)',
                   default=5)
    p.add_argument('--dw',
                   action='store_true',
                   help='Decreasing weighting (flag; defaults to False)')
    args = p.parse_args()

    print(
        'Arguments:\n   Input file: %s\n   Output file: %s\n   Max number of features: %d\n   Window size: %d\n   Decreasing weighting: %s'
        % (args.i, args.o, args.nf, args.ws, args.dw))
    print('Loading corpus...')
    start_time = timeit.default_timer()
    my_corpus = Corpus(args.i,
                       nb_features=args.nf,
                       window_size=args.ws,
                       decreasing_weighting=args.dw)
    elapsed = timeit.default_timer() - start_time
    print('Corpus loaded in %f seconds.' % elapsed)
    with open(args.o, 'wb') as output_file:
        pickle.dump(my_corpus.X, output_file)
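A hedged round-trip check (args.o is whatever output path was passed on the command line; 'matrix.pkl' below is purely illustrative):

import pickle

# Hypothetical: reload the pickled word-context matrix written above.
with open('matrix.pkl', 'rb') as f:
    X = pickle.load(f)
print('Matrix loaded, shape:', X.shape)  # assumes a numpy/scipy matrix exposing .shape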
Example #7
# coding: utf-8
from structure.corpus import Corpus
from nlp.semantic_model import PPMI_SVD, COALS, GloVe
import timeit

__authors__ = "Adrien Guille"
__email__ = "*****@*****.**"

print('Loading corpus...')
start_time = timeit.default_timer()
my_corpus = Corpus('data/messages3.csv',
                   nb_features=50000,
                   window_size=5,
                   decreasing_weighting=True)
elapsed = timeit.default_timer() - start_time
print('Corpus loaded in %f seconds.' % elapsed)

method = input('Select a method (either PPMI+SVD, COALS or GloVe): ')
my_semantic_model = None

if method == 'PPMI+SVD':
    print('Learning vector space with PPMI+SVD...')
    start_time = timeit.default_timer()
    my_semantic_model = PPMI_SVD(my_corpus)
    my_semantic_model.learn_vector_space(dimensions=100)
    elapsed = timeit.default_timer() - start_time
    print('Vector space learned in %f seconds.' % elapsed)
elif method == 'COALS':
    print('Learning vector space with COALS...')
    start_time = timeit.default_timer()
    my_semantic_model = COALS(my_corpus)
    my_semantic_model.learn_vector_space(dimensions=100)
    elapsed = timeit.default_timer() - start_time
    print('Vector space learned in %f seconds.' % elapsed)