Example #1
def topic_model(docs):
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    # Set training parameters.
    num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    print("Training LDA Model ...")
    model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     chunksize=chunksize,
                     alpha='auto',
                     eta='auto',
                     iterations=iterations,
                     num_topics=num_topics,
                     passes=passes,
                     eval_every=eval_every)

    return model.top_topics(corpus)
Example #2
    def train_lda(self, data, dictionary, n_topics, n_iter):

        # Set training parameters.
        num_topics = n_topics
        chunksize = 1000
        passes = 1
        iterations = n_iter
        eval_every = None
        # Set random seed
        random_seed = 135
        state = np.random.RandomState(random_seed)

        # Train LDA model.
        model = LdaModel(corpus=data,
                         id2word=dictionary,
                         chunksize=chunksize,
                         alpha='auto',
                         eta='auto',
                         iterations=iterations,
                         num_topics=num_topics,
                         passes=passes,
                         eval_every=eval_every,
                         random_state=state)

        top_topics = model.top_topics(data)

        return model, top_topics
Example #3
def load_top_topics(docs=[], cnt_topics=5):

	docs = [remove_stopwords(doc) for doc in docs] #remove stopwords
	tokenizer = RegexpTokenizer(r'\w+') #=> https://www.kite.com/python/docs/nltk.RegexpTokenizer
	for i in range(len(docs)):
		docs[i] = docs[i].lower() #lower strings
		docs[i] = tokenizer.tokenize(docs[i]) #split strings into tokens
	docs = [[token for token in doc if not token.isnumeric() and not token[0].isnumeric()] for doc in docs] #exclude numbers
	docs = [[token for token in doc if len(token) > 1] for doc in docs] #exclude too short tokens
	lemmatizer = WordNetLemmatizer() #=> https://www.nltk.org/_modules/nltk/stem/wordnet.html
	docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] #group similar words

	dictionary = Dictionary(docs) #create dictionary
	corpus = [dictionary.doc2bow(doc) for doc in docs] #create corpus

	model = LdaModel( #=> https://radimrehurek.com/gensim/models/ldamodel.html
		corpus=corpus,
		id2word=dictionary,
		# chunksize=2000,
		# alpha='auto',
		# eta='auto',
		iterations=200,
		# passes=20,
		# eval_every=None,
		num_topics=cnt_topics
	)
		
	top_topics = model.top_topics(corpus) # [([(weight, word), ...], coherence), ...]

	return top_topics
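A minimal usage sketch for load_top_topics (the sample documents below are invented, and the gensim/NLTK imports and NLTK WordNet data used by the function are assumed to be available):

# Hypothetical call; each returned entry pairs a list of (weight, word) tuples with the topic's coherence.
sample_docs = [
    "Topic models group words that tend to co-occur across documents.",
    "Latent Dirichlet Allocation is a common topic modeling algorithm.",
    "Word co-occurrence statistics drive the topics that LDA discovers.",
]
for topic_terms, coherence in load_top_topics(sample_docs, cnt_topics=2):
    words = [word for weight, word in topic_terms[:5]]
    print("coherence=%.4f  top words: %s" % (coherence, ", ".join(words)))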
Example #4
def get_topics(data, filepath='./data/spam_topics.pkl'):
    if not os.path.exists(filepath):
        import pyLDAvis.gensim
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel, CoherenceModel

        texts = [sample['lemmas'] for sample in data]

        dictionary = Dictionary(texts)
        dictionary.filter_extremes(no_below=20, no_above=0.4)
        corpus = [dictionary.doc2bow(text) for text in texts]

        chunksize = 500
        passes = 5
        iterations = 400
        eval_every = None

        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        best_coherence = 0
        best_model_filepath = ''
        for num_topics in list(range(2, 20)):
            for alpha in ['asymmetric', 'symmetric']:
                for eta in ['symmetric', 'auto']:
                    run_filepath = 'out/topics/{}_{}_{}'.format(num_topics, alpha, eta)
                    # Use the grid's alpha/eta values, and don't shadow the cache `filepath` argument.
                    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, alpha=alpha, eta=eta, iterations=iterations, num_topics=num_topics, passes=passes, eval_every=eval_every)
                    coherence = float(CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence())
                    run_filepath += '_{:.4f}'.format(coherence)
                    model.save(run_filepath + '_model.pkl')

                    prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary)
                    pyLDAvis.save_html(prepared, run_filepath + '_plot.html')

                    if coherence > best_coherence:
                        best_coherence = coherence
                        best_model_filepath = run_filepath + '_model.pkl'

        model = LdaModel.load(best_model_filepath)
        print('Best model: {}'.format(best_model_filepath))

        topics = [x[0] for x in model.top_topics(corpus=corpus, texts=texts, dictionary=dictionary, topn=100)]

        data_topics = []
        for i, text in enumerate(texts):
            data_topics.append({k: v for k, v in model.get_document_topics(dictionary.doc2bow(text), minimum_probability=0.0)})

        pickle.dump([topics, data_topics], open(filepath, 'wb'))
    else:
        [topics, data_topics] = pickle.load(open(filepath, 'rb'))

    for i in range(len(data_topics)):
        data[i]['topics'] = data_topics[i]

    return topics, data
Example #5
def train_lda_model():
    """ Train the LDA model with topcoder selected challenges requirements."""
    print('Start processing doc.')
    clean_docs = [
        clean_and_tokenize(doc)
        for doc in TC.get_filtered_requirements().requirements.tolist()
    ]
    dictionary = Dictionary(clean_docs)

    corpus = [dictionary.doc2bow(doc) for doc in clean_docs]

    print('Training LDA...')
    lda = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, passes=50)
    lda.save('./baseline/ptma_lda.model')

    pprint(lda.top_topics(corpus))
Example #6
def word_modeling(tokens):
    from gensim.corpora import Dictionary
    from gensim.models import phrases, LdaModel

    bigram = phrases.Phraser(phrases.Phrases(tokens, min_count=2))
    for i, ts in enumerate(tokens):
        for btoken in bigram[ts]:
            if '_' in btoken and btoken not in tokens[i]:
                tokens[i].append(btoken)

    token_dict = Dictionary(tokens)
    corpus = [token_dict.doc2bow(t) for t in tokens]

    _ = token_dict[0]
    model = LdaModel(corpus=corpus, id2word=token_dict.id2token, chunksize=len(tokens), alpha="auto",
                     eta="auto", iterations=400, num_topics=20, passes=20, eval_every=None)
    pprint.pprint(model.top_topics(corpus))
Example #7
def coherence_lda_model(corpus, dictionary):
    num_topics = 10
    chunksize = 2000
    passes = 40
    iterations = 50
    eval_every = None  # Don't evaluate model perplexity, takes too much time.
    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                     alpha='auto', eta='auto',
                     iterations=iterations, num_topics=num_topics,
                     passes=passes, eval_every=eval_every)
    top_topics = model.top_topics(corpus, num_words=20)
    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)
    pprint(top_topics)
Example #8
def modelBuilder(num_topics=NUMBER_OF_TOPICS):
    docs = []
    with open('outputs/corpus.txt', mode='r') as lyrics:
        for line in lyrics:
            docs.append(line.split())

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    # Set training parameters.
    chunksize = 2000
    passes = 15
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    coherenceModel = CoherenceModel(
        model=model, texts=docs, dictionary=dictionary, coherence='c_v')

    top_topics = model.top_topics(corpus)  # , num_words=20)
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average UMass topic coherence for %d topics: %.4f.' %
          (num_topics, avg_topic_coherence))
    print('Average C_V topic coherence for %d topics: %.4f.' %
          (num_topics, coherenceModel.get_coherence()))
    return(model)
Example #9
class MyLda:
    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(self.myDictionary.doc2bows, \
         id2word=self.myDictionary.dictionary, \
         num_topics=num_topics)
        self.topic2ids, self.id2topics = self.get_mappings()
        self.coherenceModel = None
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        topic2ids, id2topics = defaultdict(list), defaultdict(list)
        for i, doc2bow in enumerate(self.myDictionary.doc2bows):
            topic_pairs = self.model.get_document_topics(doc2bow)
            for j, (topic, prob) in enumerate(topic_pairs):
                if prob >= self.topic_threshold or j == 0:
                    topic2ids[topic].append(i)
                    id2topics[i].append(topic)
        return topic2ids, id2topics

    def get_topic_terms(self, topic):
        terms = self.model.get_topic_terms(topic)
        return terms

    def get_top_topic(self):
        top_topics = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        average = sum([t[1] for t in top_topics]) / self.num_topics
        return top_topics, average

    def get_perplexity(self):
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(model=self.model, \
             corpus=self.myDictionary.doc2bows, \
             dictionary=self.myDictionary.dictionary, \
             coherence='u_mass')
        return self.coherenceModel.get_coherence()
Example #10
def get_topics(cv, train_data):
    '''
    Uses gensim to perform topic modeling.
    
    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.
    
    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    '''
    #Create the gensim corpus from train data
    td_gensim = Sparse2Corpus(train_data, documents_columns=False)  # rows are documents
    #Create vocab dictionary
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)
    #Create LDA model with specified parameters
    lda_gs = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda_gs.top_topics(corpus=td_gensim, num_words=5)
    return topics
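For context, a rough sketch of how the cv and train_data arguments might be produced with scikit-learn before calling get_topics; the toy documents are invented:

from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = [
    "the cat sat on the mat",
    "dogs and cats are popular pets",
    "the stock market fell sharply today",
    "investors worry about market volatility",
]

cv = TfidfVectorizer(stop_words='english')
train_data = cv.fit_transform(toy_docs)  # scipy CSR matrix with one row per document

for terms, coherence in get_topics(cv, train_data):
    print([word for _, word in terms], coherence)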
Example #11
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    """

    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)

    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)

    return topics
Example #12
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    """

    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)

    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)

    return topics
Example #13
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

model = LdaModel(
    corpus=wiki,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

top_topics = model.top_topics(wiki)


np.save("wiki_topics.npy", model.get_topics())
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

pickle.dump(top_topics, open("top_topics.p", "wb"))
Example #14

t, d, c = preprocess(tweets)

# Create the model
temp = d[0]
id2word = d.id2token

numTopics = 20
chunkSize = 8000
passes = 10
iterations = 100

model = LdaModel(corpus=c,
                 id2word=id2word,
                 chunksize=chunkSize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=numTopics,
                 passes=passes,
                 eval_every=None)

print("Completed LDA model training...")

# Model Analysis
topics = model.top_topics(corpus=c, topn=5)
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in topics]) / numTopics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(topics)
Example #15
    def run(self, overwrite=False):
        if self.is_calculated and overwrite is False:
            print(
                f"{self} has already saved a model at {self.created_model}. Run with `overwrite=True` to overwrite the model"
            )
            return None

        # Wipe any topics formerly associated with this tm
        Topic.objects.filter(topic_model=self).delete()

        docs = list(self.documents.all().values_list("fulltext", flat=True))

        # Capture ordered document IDs when generating the model
        doc_ids = list(self.documents.all().values_list("id", flat=True))
        self.document_ids = doc_ids
        self.save()

        # Split the documents into tokens.
        tokenizer = RegexpTokenizer(r"\w+")
        for idx in range(len(docs)):
            docs[idx] = docs[idx].lower()  # Convert to lowercase.
            docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

        # Remove numbers, but not words that contain numbers.
        docs = [[token for token in doc if not token.isnumeric()]
                for doc in docs]

        # Remove words that are only one character.
        docs = [[token for token in doc if len(token) > 1] for doc in docs]

        # Remove stopwords
        english_stops = set(stopwords.words("english"))
        docs = [[token for token in doc if token not in english_stops]
                for doc in docs]

        # Lemmatize the documents.

        lemmatizer = WordNetLemmatizer()
        docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

        #  Compute bigrams.

        # Add bigrams and trigrams to docs (only ones that appear at least `min_count` times).
        bigram = Phrases(docs, min_count=self.min_count)
        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if "_" in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)

        # Create a dictionary representation of the documents.
        dictionary = Dictionary(docs)

        # Filter out words that occur in fewer than `min_count` documents, or in more than `no_above` of the documents.
        dictionary.filter_extremes(no_below=self.min_count,
                                   no_above=self.no_above)

        # Bag-of-words representation of the documents.
        corpus = [dictionary.doc2bow(doc) for doc in docs]

        # Set training parameters.
        num_topics = self.n_topics
        chunksize = self.chunksize
        passes = self.passes
        iterations = self.iterations

        # Make an index-to-word dictionary.
        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            chunksize=chunksize,
            alpha="auto",
            eta="auto",
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
        )
        instance_path = f"{self.MODEL_PATH}/{self.id}.mm"
        model.save(instance_path)
        self.created_model = instance_path
        self.save()

        predictions = [model[doc] for doc in corpus]

        topic_objects = []
        for i, t in enumerate(model.top_topics(corpus)):
            terms = [term[1] for term in t[0]]
            topic_objects.append(
                Topic.objects.create(topic_model=self, terms=terms, index=i))

        for i, pred in enumerate(predictions):
            for ti, t in enumerate(pred):
                DocumentTopic.objects.create(
                    document=Document.objects.get(id=doc_ids[i]),
                    topic=topic_objects[ti],
                    log=t[1],
                )

        return self.created_model
Example #16
class LDA_result(object):
    
    def __init__(self, abtract_complete_true, num_topics=4, chunksize=1000, passes=60, iterations=600, eval_every=None):
        self.num_journal = len(abtract_complete_true)
        self.abtract_complete_true = abtract_complete_true
        self.abtract_complete = self.abtract_complete_combination()
        self.dictionary = corpora.Dictionary(self.abtract_complete)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.abtract_complete]    
        self.temp = self.dictionary[0]
        self.id2word = self.dictionary.id2token
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.passes = passes
        self.iterations = iterations
        self.eval_every = eval_every
        self.model = LdaModel(corpus=self.corpus, id2word=self.id2word, chunksize=self.chunksize,
                              alpha='auto', eta='auto',
                              iterations=self.iterations, num_topics=self.num_topics,
                              passes=self.passes, eval_every=self.eval_every)  # build the model
    
    # combine the abstract token lists
    def abtract_complete_combination(self):
        abtract_complete = []
        for journal_word_list in self.abtract_complete_true.values():
            abtract_complete.append(journal_word_list)
        return abtract_complete
                
    ## describe the corpus
    def description(self):
        print('Number of unique tokens: %d' % len(self.dictionary))
        print('Number of documents: %d' % len(self.corpus))
    
    # convert to tf-idf vectors
    def word2tfidf(self):
        tfidf = models.TfidfModel(self.corpus)
        corpusTfidf = tfidf[self.corpus]
        return corpusTfidf
    
    # print the keywords of each topic
    def key_words(self):
        top_topics = self.model.top_topics(self.corpus)
        pprint(top_topics) 
        
    # each line contains the topic words and their weights
    def key_weight(self):
        print(self.model.print_topic(0,10))
        print(self.model.print_topic(1,10))  
    
    # determine which topic the first training document belongs to (mostly illustrative)
    def topic_belong(self):
        for index, score in sorted(self.model[self.corpus[0]], key=lambda tup: -1*tup[1]):
            print("Score: {}\n Topic: {}".format(score, self.model.print_topic(index, 10)))
    
    # visualize the LDA result with pyLDAvis
    def visible(self):
        vis_wrapper = pyLDAvis.gensim.prepare(self.model,self.corpus,self.dictionary)
        pyLDAvis.display(vis_wrapper)
        pyLDAvis.save_html(vis_wrapper,"lda%dtopics.html"%self.num_topics)
        pyLDAvis.show(vis_wrapper)
    
    # output each training document's probability of belonging to the different topics
    def community_belong(self):
        journal_community = {}
        for i, element in enumerate(self.abtract_complete_true):
            journal_community[element] = []
            for index, score in sorted(self.model[self.corpus[i]], key=lambda tup: -1*tup[1]):
                if score > 0.2:
                    journal_community[element].append(str(index))
                print(index, score)
        return journal_community

    # given a new corpus
#    @staticmethod
#    def word_corpus(abtract_complete):
#        dictionary = corpora.Dictionary(abtract_complete)
#        corpus = [dictionary.doc2bow(text) for text in abtract_complete]  
#        return corpus
    
    # determine the topic membership of a new document
    def identify_community(self, abtract_complete):
        corpus = self.dictionary.doc2bow(abtract_complete)
        community = []
        for index, score in sorted(self.model[corpus], key=lambda tup: -1*tup[1]):
            if score > 0.2:
                community.append(str(index)) 
        return community
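A hedged usage sketch for the LDA_result class above; the journal-to-token mapping is invented, and the gensim/pprint imports used by the class are assumed to be in place:

abtract_complete_true = {
    'journal_a': ['topic', 'model', 'inference', 'dirichlet', 'allocation'],
    'journal_b': ['neural', 'network', 'training', 'gradient', 'descent'],
    'journal_c': ['topic', 'coherence', 'evaluation', 'model', 'corpus'],
}

result = LDA_result(abtract_complete_true, num_topics=2, passes=10, iterations=100)
result.description()      # number of unique tokens and documents
result.key_words()        # keywords of each topic
print(result.identify_community(['topic', 'model', 'corpus']))  # topics with score > 0.2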
Example #17
def get_topics_from_tweets(nltk_data_path,
                           cleaned_tweets_texts,
                           n_topics=1,
                           n_words_per_topic=10,
                           n_passes=2,
                           force_download=False):
    """Retrieves topics from cleaned_tweets_texts using LDA algorithm.

    Args:
        nltk_data_path (str): path to NLTK data
        cleaned_tweets_texts (list of str): List of tweets texts from which topics are learned
        n_topics (int, optional): Number of topics to learn. Defaults to 1.
        n_words_per_topic (int, optional): Number of words per topic. Defaults to 10.
        n_passes (int, optional): Passes to train LDA. See LDA doc for more details. Defaults to 2.
        force_download (bool, optional): If True, NLTK data will be downloaded. Defaults to False.

    Returns:
        pandas.DataFrame: DataFrame containing topic words and their probabilities (weights)

    """
    # Check NLTK data
    nltk_data_available = os.path.isdir(nltk_data_path) and len(
        os.listdir(nltk_data_path)) != 0

    if force_download or not nltk_data_available:
        nltk.download('wordnet', download_dir=nltk_data_path)
        nltk.download('averaged_perceptron_tagger',
                      download_dir=nltk_data_path)
    else:
        nltk.data.path.append(nltk_data_path)

    # Get dictionary from downloaded tweets
    custom_filters = [
        lambda x: x.lower(), strip_tags, strip_punctuation,
        strip_multiple_whitespaces, strip_numeric, remove_stopwords,
        strip_short
    ]
    preprocessed_tweets = [
        preprocess_string(d, custom_filters) for d in cleaned_tweets_texts
    ]
    preprocessed_tweets = lemmatize_sentences(preprocessed_tweets)
    dictionary = corpora.Dictionary(preprocessed_tweets)

    # Train LDA to reveal topics
    vectorized_tweets = [
        dictionary.doc2bow(p_tweet) for p_tweet in preprocessed_tweets
    ]
    lda = LdaModel(vectorized_tweets,
                   num_topics=n_topics,
                   passes=n_passes,
                   id2word=dictionary,
                   alpha="auto",
                   eta="auto")
    top_topics = lda.top_topics(vectorized_tweets, topn=n_words_per_topic)

    # Convert top_topics to dataframe
    top_topics_words = []
    top_topics_proba = []
    top_topics_indexes = []
    for i, topic in enumerate(top_topics):
        topic_words = [w_tuple[1] for w_tuple in topic[0]]
        topic_proba = [w_tuple[0] for w_tuple in topic[0]]
        top_topics_indexes.extend([i] * len(topic[0]))
        top_topics_words.extend(topic_words)
        top_topics_proba.extend(topic_proba)

    top_topics_df = pd.DataFrame({
        "words": top_topics_words,
        "topics": top_topics_indexes,
        "proba": top_topics_proba
    })

    return top_topics_df
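A hedged usage sketch for get_topics_from_tweets; the tweet strings and the nltk_data path are placeholders, and helpers such as lemmatize_sentences are assumed to come from the same module:

cleaned_tweets = [
    "new phone battery lasts all day",
    "battery life on this phone is great",
    "traffic downtown was terrible this morning",
]

topics_df = get_topics_from_tweets(
    nltk_data_path="./nltk_data",          # assumed local NLTK data directory
    cleaned_tweets_texts=cleaned_tweets,
    n_topics=2,
    n_words_per_topic=5,
    n_passes=2,
)
print(topics_df.head())  # columns: words, topics, proba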
Example #18
def run_tm(topics, below, above, chunksize, passes, iterations):

    m, valid = arevalid(topics, below, above, chunksize, passes, iterations)
    if not valid:

        fehlerfenster = Toplevel()
        fehlerfenster.title('Fehler')
        fehlerfenster.geometry('300x300')
        # Label with the error message
        labelfehler = Label(master=fehlerfenster, text=m)
        labelfehler.place(x=10, y=10, width=300, height=300)

    else:

        with open('../data/docs', 'rb') as f:
            docs = pickle.load(f)

        tweet_dictionary = Dictionary(docs)
        tweet_dictionary.filter_extremes(no_below=int(below),
                                         no_above=float(above))
        tweet_dictionary.save('../data/tweet_dictionary')

        ngram_docs = ngrams(input_docs=docs)
        corpus = make_bow_corpus(tweet_dictionary, ngram_docs)
        with open('../data/bow_corpus', 'wb') as f:
            pickle.dump(corpus, f)
        print('Number of unique tokens: %d' % len(tweet_dictionary))
        print('Number of documents: %d' % len(corpus))
        """Training parameters."""
        num_topics = int(
            topics
        )  # Number of topics, here relatively low so we can interpret them more easily -> can be set higher
        chunk_size = int(
            chunksize
        )  # Number of documents fed into the training algorithm (we have 7)
        passes = int(passes)  # Number of times trained on the entire corpus
        iterations = int(iterations)  # Number of loops over each document
        eval_every = None  # Don't evaluate model perplexity, takes too much time.
        """ Make a index to word dictionary."""
        temp = tweet_dictionary[0]  # This is only to "load" the dictionary.
        id2word = tweet_dictionary.id2token
        """Create model
        We set alpha = 'auto' and eta = 'auto'. Again this is somewhat technical, but essentially we are automatically learning
        two parameters in the model that we usually would have to specify explicitly."""
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         chunksize=chunk_size,
                         alpha='auto',
                         eta='auto',
                         iterations=iterations,
                         num_topics=num_topics,
                         passes=passes,
                         eval_every=eval_every)
        model_file = '../data/model/LDA_model_v1'
        model.save(model_file)
        """ Tests """
        # Top topics
        top_topics = model.top_topics(
            corpus
        )  # , num_words=20) Default value = 20, input is our corpus in BOW format

        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        """Topic Coherence measures score a single topic by measuring the degree of semantic similarity between high scoring 
        words in the topic. These measurements help distinguish between topics that are semantically interpretable topics and 
        topics that are artifacts of statistical inference """
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)

        pprint(top_topics)
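A minimal sketch of scoring the model above with gensim's CoherenceModel, which implements both the intrinsic u_mass measure and the sliding-window c_v measure mentioned in the comment; model, corpus, ngram_docs and tweet_dictionary are assumed to be the objects built inside run_tm:

from gensim.models import CoherenceModel

umass_cm = CoherenceModel(model=model, corpus=corpus,
                          dictionary=tweet_dictionary, coherence='u_mass')
cv_cm = CoherenceModel(model=model, texts=ngram_docs,
                       dictionary=tweet_dictionary, coherence='c_v')
print('u_mass coherence: %.4f' % umass_cm.get_coherence())
print('c_v coherence: %.4f' % cv_cm.get_coherence())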
Example #19
        print(df.shape)

    if method == 'LDA':
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel
        from gensim import models
        from sklearn.utils import shuffle
        data = shuffle(pd.read_excel("dataAll.xlsx"))

        print(data.shape)

        train_set = []
        lines = data['content'].values

        for line in lines:
            train_set.append([i for i in line.split()])

        dictionary = Dictionary(train_set)
        corpus = [dictionary.doc2bow(text) for text in train_set]  # build sparse bag-of-words vectors
        tfidf = models.TfidfModel(corpus)  # compute tf-idf statistics
        corpus_tfidf = tfidf[corpus]  # tf-idf vector of each text (sparse matrix)
        num_topics = 12
        lda_model = LdaModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, iterations=10)
        top_topics = lda_model.top_topics(corpus, coherence='u_mass', topn=12)
        print(top_topics)

        saves = []
        averages = []
        print_topics = []
        fw = open('lda model topicRestlts.txt', 'w', encoding='utf-8')
Example #20
def do(originfile):
    keywords = helper.getKeywords(originfile)
    for emotion in ['Good', 'Bad']:
        print("begin " + emotion)
        for keyword in keywords.keys():
            start_time = time.time()
            print(keyword)
            raw_corpus = helper.getRawCorpus(csv_file=open(
                'resources/csvs/' + keyword + '_' + emotion.lower() + '.csv',
                mode='r',
                encoding="utf8",
                newline='\n'),
                                             id_and_country=True)
            print("starting preprocessing")
            stopwords = getStopwords(stopset)
            stwfromtfidf = list(
                TfidfVectorizer(stop_words='english').get_stop_words())
            stopwords = set(list(stopwords) + stwfromtfidf)
            for w in negationstopset:
                stopwords.add(w)
            bow, dictionary, corpus, raw_corpus = documentprocessor.fullpreprocessrawcorpustobow(
                raw_corpus, stopwords, min_count_bigrams=20)

            ###############################################################################
            # Let's see how many tokens and documents we have to train on.
            #

            print('Number of unique tokens: %d' % len(dictionary))
            print('Number of documents: %d' % len(bow))

            ###############################################################################
            # Training
            # --------
            #
            # We are ready to train the LDA model. We will first discuss how to set some of
            # the training parameters.
            #
            # First of all, the elephant in the room: how many topics do I need? There is
            # really no easy answer for this, it will depend on both your data and your
            # application. I have used 10 topics here because I wanted to have a few topics
            # that I could interpret and "label", and because that turned out to give me
            # reasonably good results. You might not need to interpret all your topics, so
            # you could use a large number of topics, for example 100.
            #
            # ``chunksize`` controls how many documents are processed at a time in the
            # training algorithm. Increasing chunksize will speed up training, at least as
            # long as the chunk of documents easily fit into memory. I've set ``chunksize =
            # 2000``, which is more than the amount of documents, so I process all the
            # data in one go. Chunksize can however influence the quality of the model, as
            # discussed in Hoffman and co-authors [2], but the difference was not
            # substantial in this case.
            #
            # ``passes`` controls how often we train the model on the entire corpus.
            # Another word for passes might be "epochs". ``iterations`` is somewhat
            # technical, but essentially it controls how often we repeat a particular loop
            # over each document. It is important to set the number of "passes" and
            # "iterations" high enough.
            #
            # I suggest the following way to choose iterations and passes. First, enable
            # logging (as described in many Gensim tutorials), and set ``eval_every = 1``
            # in ``LdaModel``. When training the model look for a line in the log that
            # looks something like this::
            #
            #    2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations
            #
            # If you set ``passes = 20`` you will see this line 20 times. Make sure that by
            # the final passes, most of the documents have converged. So you want to choose
            # both passes and iterations to be high enough for this to happen.
            #
            # We set ``alpha = 'auto'`` and ``eta = 'auto'``. Again this is somewhat
            # technical, but essentially we are automatically learning two parameters in
            # the model that we usually would have to specify explicitly.
            #

            # Train LDA model.
            from gensim.models import LdaModel

            bestacc = -1
            bestmodel = None
            if len(bow) > 0:
                print(
                    "starting training and checking with different number of topics"
                )
                for numt in range(2, 21):

                    # Set training parameters.
                    num_topics = numt
                    chunksize = 2000
                    passes = 20
                    iterations = 400
                    eval_every = None  # Don't evaluate model perplexity, takes too much time.

                    # Make an index-to-word dictionary.
                    temp = dictionary[0]  # This is only to "load" the dictionary.
                    id2word = dictionary.id2token

                    model = LdaModel(corpus=bow,
                                     id2word=id2word,
                                     chunksize=chunksize,
                                     alpha='auto',
                                     eta='auto',
                                     iterations=iterations,
                                     num_topics=num_topics,
                                     passes=passes,
                                     eval_every=eval_every)

                    ###############################################################################
                    # We can compute the topic coherence of each topic. Below we display the
                    # average topic coherence and print the topics in order of topic coherence.
                    #
                    # Note that we use the "Umass" topic coherence measure here (see
                    # :py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently
                    # obtained an implementation of the "AKSW" topic coherence measure (see
                    # accompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).
                    #
                    # If you are familiar with the subject of the articles in this dataset, you can
                    # see that the topics below make a lot of sense. However, they are not without
                    # flaws. We can see that there is substantial overlap between some topics,
                    # others are hard to interpret, and most of them have at least some terms that
                    # seem out of place. If you were able to do better, feel free to share your
                    # methods on the blog at http://rare-technologies.com/lda-training-tips/ !
                    #

                    top_topics = model.top_topics(bow)  # , num_words=20)
                    acc = computetopacc(top_topics)
                    if acc > bestacc:
                        print("found better model with number of topics: " +
                              str(model.num_topics))
                        bestacc = acc
                        bestmodel = copy.deepcopy(model)
                    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
                    avg_topic_coherence = sum([t[1] for t in top_topics
                                               ]) / num_topics
                    cc.append(avg_topic_coherence)
                    print('Average topic coherence: %.4f.' %
                          avg_topic_coherence)
                savemodel(bestmodel, keyword, emotion, bow)
                print(
                    str(time.time() - start_time) + ' seconds to compute ' +
                    keyword + ' ' + emotion)
Example #21
    minimum_probability=0.0,
    # iterations=100,
    # gamma_threshold=0.001,
    passes=10,
    per_word_topics=True)

lda.get_document_topics(bow=corpus, per_word_topics=True)
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)

t2 = time.time()
print("Time to train LDA model on", len(df), "articles:", (t2 - t1) / 60,
      "min")

top_k_topics = lda.top_topics(corpus,
                              topn=5,
                              dictionary=dct,
                              texts=train_df['tokenized'])
indx = [i + 1 for i in range(6)]
contrib = np.transpose(contrib)
#%%
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)
DTdist = pd.DataFrame(
    contrib,
    columns=[
        "Top 5 words that contribute to each topic with associated probability"
    ],
    index=indx)

distLatex = DTdist.to_latex(index=True, index_names="Topics")
# document distribution
Example #22
            ])
        doc_topics.to_csv(model_path / f'doc_topics_{key}_{n_topics}.csv',
                          index=False)

        model_file = datapath((model_path / f'{key}_{n_topics}').resolve())
        lda.save(model_file)
        train_lda = LdaModel(corpus=train_corpus,
                             num_topics=n_topics,
                             id2word=pd.Series(train_tokens).to_dict())

        # see https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.log_perplexity
        test_perplexity = 2**(-train_lda.log_perplexity(test_corpus))

        # https://markroxor.github.io/gensim/static/notebooks/topic_coherence_tutorial.html
        u_mass = np.mean([
            c[1] for c in lda.top_topics(
                corpus=corpus, coherence='u_mass', topn=n_topics)
        ])

        # extrinsic - need to provide external corpus
        # cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_uci')
        # uci = cm.get_coherence()

        result_ = [
            vocab_size, test_vocab, max_features, n_topics, test_perplexity,
            u_mass
        ]
        topic_coherence.append(result_)
        result = pd.DataFrame(topic_coherence,
                              columns=cols).sort_values('u_mass')
    elapsed = time() - start
    print(
Example #23
model = LdaModel(corpus=corpus,
                 id2word=id2word,
                 chunksize=chunk_size,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)
""" Save model so we can load it later - only needed if you need to train the model from anew """
model_file = '.././data/model/LDA_model_v1'
model.save(model_file)
""" Tests """
# Top topics
top_topics = model.top_topics(
    corpus
)  # , num_words=20) Default value = 20, input is our corpus in BOW format

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
"""Topic Coherence measures score a single topic by measuring the degree of semantic similarity between high scoring 
words in the topic. These measurements help distinguish between topics that are semantically interpretable topics and 
topics that are artifacts of statistical inference """
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)
""" Things to experiment with: 
1. no_above and no_below parameters in filter_extremes method.
2. Adding bi-, trigrams or even higher order n-grams.
3. Consider whether using a hold-out set or cross-validation is the way to go for you (a rough sketch follows below)."""
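A rough sketch of point 3 above: hold out part of the corpus and compare models by held-out perplexity. corpus, id2word and the training parameters are assumed to be the ones defined earlier in this file:

import random

random.seed(0)
shuffled = list(corpus)
random.shuffle(shuffled)
split = int(0.9 * len(shuffled))
train_corpus, heldout_corpus = shuffled[:split], shuffled[split:]

heldout_model = LdaModel(corpus=train_corpus, id2word=id2word,
                         chunksize=chunk_size, alpha='auto', eta='auto',
                         iterations=iterations, num_topics=num_topics,
                         passes=passes, eval_every=eval_every)

# log_perplexity returns a per-word bound; lower perplexity is better.
perplexity = 2 ** (-heldout_model.log_perplexity(heldout_corpus))
print('Held-out perplexity: %.2f' % perplexity)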
Example #24
def getMusicRecommendationLDA(fileName, email, query, fileRatings,
                              alreadyLiked):
    textCorpus = [query]  # Contains the list of genres for a user
    artistdf = []

    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:

            # Skip the header row
            if 'artist_mb' in row[1]:
                continue

            if row[2]:
                pp_genre = row[2].split(';')
                pp_genre = pp.preprocess_string(
                    row[2], CUSTOM_FILTERS
                )  # Pre-process the artist's genre
                pp_genre.append(row[1])

                if len(pp_genre) > 2:
                    textCorpus.append(pp_genre)
                    artistdf.append(row)

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(textCorpus)

    # Remove words that appear in fewer than 10 documents or in more than 60% of the documents
    dictionary.filter_extremes(no_below=10, no_above=0.6)

    # Bag-of-words representation of the documents.
    dictionary.save('musica.dict')

    dictionary = dictionary.load('musica.dict')
    corpus = [dictionary.doc2bow(doc) for doc in textCorpus]

    # Train LDA Model
    # Set training parameters.
    num_topics = 100
    chunksize = 10
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    NUM_PASSES = 10
    NUM_TOPICS = 100
    RANDOM_STATE = 1

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    '''
    lda = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )'''

    lda = LdaModel(corpus,
                   id2word=dictionary,
                   random_state=RANDOM_STATE,
                   num_topics=NUM_TOPICS,
                   passes=NUM_PASSES)
    # corpusLDA = corpora.MmCorpus("corpus.mm")
    index = gensim.similarities.MatrixSimilarity(lda[corpus])
    index.save("simIndex.index")

    top_topics = lda.top_topics(corpus)  # num_words=20
    # print(top_topics)

    vec_bow = dictionary.doc2bow(query)

    vec_lda = lda[vec_bow]
    sims = index[vec_lda]

    for s in sorted(enumerate(sims), key=lambda item: -item[1])[:10]:

        if not (artistdf[s[0]][1].lower() in alreadyLiked):
            with io.open(fileRatings, "a", encoding="utf-8") as myfile:
                myfile.write(email + ";")
                myfile.write(artistdf[s[0]][1] + ";")
                myfile.write('{} \n'.format(s[1]))

            myfile.close()
    file.close()

    print('Scrittura in ' + fileRatings + ' avvenuta per ' + email + '!')

    return ''
Example #25
# transform to vectorized form to put in model
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Finds how many unique tokens we've found and how many docs we have
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# index to word dictionary
temp = dictionary[0]
id2word = dictionary.id2token

model = LdaModel(corpus=corpus,
                 id2word=id2word,
                 chunksize=chunksize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

avg_topic_coherence = sum(
    [t[1] for t in model.top_topics(corpus)]
) / num_topics  # sum of topic coherences of all topics, divided by the number of topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
model.print_topics()
visualisation = pyLDAvis.gensim.prepare(model, corpus, dictionary)
full_output_path = path_output + "/visualization/LDA_Visualization_" + param_changes + ".html"
pprint(model.print_topics())
pyLDAvis.save_html(visualisation, full_output_path)
model.save(path_output + "/LDA_" + param_changes + ".model")
Example #26
def do():
    docs = list(extract_documents())

    ###############################################################################
    # So we have a list of 1740 documents, where each document is a Unicode string.
    # If you're thinking about using your own corpus, then you need to make sure
    # that it's in the same format (list of Unicode strings) before proceeding
    # with the rest of this tutorial.
    #
    print(len(docs))
    print(docs[0][:500])

    ###############################################################################
    # Pre-process and vectorize the documents
    # ---------------------------------------
    #
    # As part of preprocessing, we will:
    #
    # * Tokenize (split the documents into tokens).
    # * Lemmatize the tokens.
    # * Compute bigrams.
    # * Compute a bag-of-words representation of the data.
    #
    # First we tokenize the text using a regular expression tokenizer from NLTK. We
    # remove numeric tokens and tokens that are only a single character, as they
    # don't tend to be useful, and the dataset contains a lot of them.
    #
    # .. Important::
    #
    #    This tutorial uses the nltk library for preprocessing, although you can
    #    replace it with something else if you want.
    #

    # Tokenize the documents.
    from nltk.tokenize import RegexpTokenizer

    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 1] for doc in docs]

    ###############################################################################
    # We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a
    # stemmer in this case because it produces more readable words. Output that is
    # easy to read is very desirable in topic modelling.
    #

    # Lemmatize the documents.
    from nltk.stem.wordnet import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    ###############################################################################
    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
    # Using bigrams we can get phrases like "machine_learning" in our output
    # (spaces are replaced with underscores); without bigrams we would only get
    # "machine" and "learning".
    #
    # Note that in the code below, we find bigrams and then add them to the
    # original data, because we would like to keep the words "machine" and
    # "learning" as well as the bigram "machine_learning".
    #
    # .. Important::
    #     Computing n-grams of large dataset can be very computationally
    #     and memory intensive.
    #

    # Compute bigrams.
    from gensim.models import Phrases

    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    ###############################################################################
    # We remove rare words and common words based on their *document frequency*.
    # Below we remove words that appear in less than 20 documents or in more than
    # 50% of the documents. Consider trying to remove words only based on their
    # frequency, or maybe combining that with this approach.
    #

    # Remove rare and common tokens.
    from gensim.corpora import Dictionary

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    ###############################################################################
    # Finally, we transform the documents to a vectorized form. We simply compute
    # the frequency of each word, including the bigrams.
    #

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    ###############################################################################
    # Let's see how many tokens and documents we have to train on.
    #

    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    ###############################################################################
    # Training
    # --------
    #
    # We are ready to train the LDA model. We will first discuss how to set some of
    # the training parameters.
    #
    # First of all, the elephant in the room: how many topics do I need? There is
    # really no easy answer for this, it will depend on both your data and your
    # application. I have used 10 topics here because I wanted to have a few topics
    # that I could interpret and "label", and because that turned out to give me
    # reasonably good results. You might not need to interpret all your topics, so
    # you could use a large number of topics, for example 100.
    #
    # ``chunksize`` controls how many documents are processed at a time in the
    # training algorithm. Increasing chunksize will speed up training, at least as
    # long as the chunk of documents easily fit into memory. I've set ``chunksize =
    # 2000``, which is more than the amount of documents, so I process all the
    # data in one go. Chunksize can however influence the quality of the model, as
    # discussed in Hoffman and co-authors [2], but the difference was not
    # substantial in this case.
    #
    # ``passes`` controls how often we train the model on the entire corpus.
    # Another word for passes might be "epochs". ``iterations`` is somewhat
    # technical, but essentially it controls how often we repeat a particular loop
    # over each document. It is important to set the number of "passes" and
    # "iterations" high enough.
    #
    # I suggest the following way to choose iterations and passes. First, enable
    # logging (as described in many Gensim tutorials), and set ``eval_every = 1``
    # in ``LdaModel``. When training the model look for a line in the log that
    # looks something like this::
    #
    #    2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations
    #
    # If you set ``passes = 20`` you will see this line 20 times. Make sure that by
    # the final passes, most of the documents have converged. So you want to choose
    # both passes and iterations to be high enough for this to happen
    # (a logging sketch follows after this example).
    #
    # We set ``alpha = 'auto'`` and ``eta = 'auto'``. Again this is somewhat
    # technical, but essentially we are automatically learning two parameters in
    # the model that we usually would have to specify explicitly.
    #

    # Train LDA model.
    from gensim.models import LdaModel

    # Set training parameters.
    num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     chunksize=chunksize,
                     alpha='auto',
                     eta='auto',
                     iterations=iterations,
                     num_topics=num_topics,
                     passes=passes,
                     eval_every=eval_every)

    ###############################################################################
    # We can compute the topic coherence of each topic. Below we display the
    # average topic coherence and print the topics in order of topic coherence.
    #
    # Note that we use the "Umass" topic coherence measure here (see
    # :py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently
    # obtained an implementation of the "AKSW" topic coherence measure (see
    # accompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).
    #
    # If you are familiar with the subject of the articles in this dataset, you can
    # see that the topics below make a lot of sense. However, they are not without
    # flaws. We can see that there is substantial overlap between some topics,
    # others are hard to interpret, and most of them have at least some terms that
    # seem out of place. If you were able to do better, feel free to share your
    # methods on the blog at http://rare-technologies.com/lda-training-tips/ !
    #

    top_topics = model.top_topics(corpus)  # , num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    from pprint import pprint

    pprint(top_topics)
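A sketch of the logging and eval_every suggestion in the comments above (not part of the original example): enable logging and let a diagnostic model report convergence while training. corpus and id2word are assumed to be the objects built inside do():

import logging
from gensim.models import LdaModel

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.DEBUG)

diagnostic_model = LdaModel(corpus=corpus,
                            id2word=id2word,
                            chunksize=2000,
                            alpha='auto',
                            eta='auto',
                            iterations=400,
                            num_topics=10,
                            passes=20,
                            eval_every=1)  # log perplexity after every update
# Watch the log for lines like "... documents converged within 400 iterations"
# and raise passes/iterations if too few documents converge by the final pass.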
Example #27
File: LDA_r.py  Project: mainla/LDA
                            eta='auto',
                            iterations=iterations,
                            num_topics=num_topics,
                            passes=passes,
                            eval_every=eval_every
                        )
    lda_model.save('lda_model.model')
    print("LDA...'s been created!")
    print("LDA...'s saved!")
    
    #ldamodel.LdaModel
    #filename = os.path.join(output_dir, ) % (model.K, model.M))
    #lda_model.save_result("Kd_Md.json")

print("stop LDA... ")
top_topics = lda_model.top_topics(corpus)
pprint(top_topics)
 
lda_model.print_topics()
pprint(lda_model.print_topics())

# Display the dominant topic
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
Example #28
# average topic coherence and print the topics in order of topic coherence.
#
# Note that we use the "Umass" topic coherence measure here (see
# :py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently
# obtained an implementation of the "AKSW" topic coherence measure (see
# accompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).
#
# If you are familiar with the subject of the articles in this dataset, you can
# see that the topics below make a lot of sense. However, they are not without
# flaws. We can see that there is substantial overlap between some topics,
# others are hard to interpret, and most of them have at least some terms that
# seem out of place. If you were able to do better, feel free to share your
# methods on the blog at http://rare-technologies.com/lda-training-tips/ !
#

top_topics = model.top_topics(corpus)  #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

###############################################################################
# Things to experiment with
# -------------------------
#
# * ``no_above`` and ``no_below`` parameters in ``filter_extremes`` method.
# * Adding trigrams or even higher order n-grams (see the sketch after this list).
# * Consider whether using a hold-out set or cross-validation is the way to go for you.
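A sketch of the trigram suggestion above: apply Phrases twice so that frequent bigrams can themselves combine into trigrams. docs is assumed to be the list of token lists used earlier in the tutorial this snippet comes from:

from gensim.models import Phrases

bigram = Phrases(docs, min_count=20)
trigram = Phrases(bigram[docs], min_count=20)
for idx in range(len(docs)):
    for token in trigram[bigram[docs[idx]]]:
        if '_' in token:
            docs[idx].append(token)  # keep unigrams and add the bi-/trigrams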
Example #29
id2word = tokenizer.decoder
######################################
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

model = LdaModel(corpus=topical_dataset,
                 id2word=id2word,
                 chunksize=chunksize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

top_topics = model.top_topics(topical_dataset)

np.save("topical_dataset_topics.npy", model.get_topics())
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint

pprint(top_topics)

pickle.dump(top_topics, open("top_topical_dataset_topics.p", "wb"))
Example #30
passes = 20
iterations = 400
eval_every = None

temp = dictionary[0]

id2word = dictionary.id2token

model = LdaModel(corpus=corpus,
                 id2word=id2word,
                 chunksize=chunksize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

top_topics = model.top_topics(corpus)

from pprint import pprint
pprint(top_topics)

import pyLDAvis.gensim

lda_display = pyLDAvis.gensim.prepare(model,
                                      corpus,
                                      dictionary,
                                      sort_topics=True)
pyLDAvis.display(lda_display)
Example #31
def getTopicForQuery(question):
    newsdf = []
    docs = []
    #docs.append(question)
    fileName = 'newsEn2.csv'
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        tokenizer = RegexpTokenizer(r'\w+')
        # create English stop words list
        en_stop = set(stopwords.words('english'))
        # Create p_stemmer of class PorterStemmer
        p_stemmer = nltk.stem.porter.PorterStemmer()
        for row in reader:

            if row[5] and len(row[5]) > 10:
                raw = row[5].lower()
                tokens = tokenizer.tokenize(raw)
                # remove stop words from tokens
                stopped_tokens = [i for i in tokens if not i in en_stop]
                # stem tokens
                stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
                #stemmed_tokens = stopped_tokens
                docs.append(stemmed_tokens)
                newsdf.append(row)

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 1] for doc in docs]

    # Lemmatize the documents.
    from nltk.stem.wordnet import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    from gensim.corpora import Dictionary

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur in fewer than 2 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # Bag-of-words representation of the documents.
    dictionary.save('musica.dict')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Train LDA model.
    from gensim.models import LdaModel

    # Set training parameters.
    num_topics = 200
    chunksize = 50
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    lda = LdaModel(corpus=corpus,
                   id2word=id2word,
                   chunksize=chunksize,
                   alpha='auto',
                   eta='auto',
                   iterations=iterations,
                   num_topics=num_topics,
                   passes=passes,
                   eval_every=eval_every)
    #corpusLDA = corpora.MmCorpus("corpus.mm")
    index = gensim.similarities.MatrixSimilarity(lda[corpus])
    index.save("simIndex.index")

    top_topics = lda.top_topics(corpus)  # , num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    from pprint import pprint
    #pprint(top_topics)
    important_words = []
    temp = question.lower()

    tokens = tokenizer.tokenize(temp)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    doc = [p_stemmer.stem(i) for i in stopped_tokens]

    important_words = [lemmatizer.lemmatize(token) for token in doc]
    print(important_words)

    dictionary = Dictionary.load('musica.dict')

    vec_bow = dictionary.doc2bow(important_words)
    vec_lda = lda[vec_bow]

    sims = index[vec_lda]

    for s in sorted(enumerate(sims), key=lambda item: -item[1])[:10]:
        print(s)
        print(newsdf[s[0]][1] + ";")
        print('{} \n'.format(s[1]))

    return ''