Example #1
def lda():
    # remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt',
                            mode='r',
                            encoding='utf8').readlines()
    stopwords = [w.strip() for w in stopwords]

    fp = codecs.open('D:\\nlp\\corpora\\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([w for w in line if w not in stopwords])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

    lda.print_topics(30)
    # print topic id=20
    lda.print_topic(20)

    # save/load model
    lda.save('D:\\nlp\\corpora\\news.model')
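
A small companion sketch (hedged): reload the model persisted above with gensim's LdaModel.load and print one topic without retraining. The path is assumed to match the save call above.

from gensim.models import LdaModel

# reload the saved model and inspect topic 20
lda = LdaModel.load('D:\\nlp\\corpora\\news.model')
print(lda.print_topic(20))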
Example #2
def lda_vector(dataset: list, refer_dictionary=None, refer_lda_model=None):

    if refer_dictionary is None:
        refer_docs = [
          [token for (i, token) in enumerate(sample['essay_lemma']) if sample['essay_is_stop'][i] is False
           and token not in [',', '.', '?']] for sample in dataset
        ]
        refer_dictionary = Dictionary(refer_docs)
        refer_doc2bow = [refer_dictionary.doc2bow(text) for text in refer_docs]
        refer_lda_model = LdaModel(corpus=refer_doc2bow, id2word=refer_dictionary, num_topics=10, dtype=np.float64, passes=10, minimum_probability=0.0)

    doc = [
        [token for (i, token) in enumerate(sample['essay_lemma']) if sample['essay_is_stop'][i] is False
         and token not in [',', '.', '?']] for sample in dataset
    ]
    doc_bow_s = [refer_dictionary.doc2bow(text) for text in doc]
    doc_vecs = [refer_lda_model[doc_bow] for doc_bow in doc_bow_s]

    for (sample, doc_vec) in zip(dataset, doc_vecs):
        for topic_prob in doc_vec:
            sample['topic'+str(topic_prob[0] + 1)] = topic_prob[1]

    return refer_dictionary, refer_lda_model
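
A hedged usage sketch for lda_vector (the tiny dataset below is invented for illustration; the keys 'essay_lemma' and 'essay_is_stop' follow the function's expectations, and the gensim/numpy imports used inside the function are assumed to be in scope):

train_set = [
    {'essay_lemma': ['student', 'write', 'clear', 'essay'],
     'essay_is_stop': [False, False, False, False]},
    {'essay_lemma': ['topic', 'model', 'group', 'word'],
     'essay_is_stop': [False, False, False, False]},
]
test_set = [
    {'essay_lemma': ['essay', 'about', 'topic', 'model'],
     'essay_is_stop': [False, True, False, False]},
]
# the first call fits the dictionary and LDA model and annotates train_set in place;
# the second call reuses them so test_set is projected into the same topic space
dictionary, lda_model = lda_vector(train_set)
lda_vector(test_set, refer_dictionary=dictionary, refer_lda_model=lda_model)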
Example #3
def train_model(dictionary, corpus, topics):  # topics: the number of LDA topics to fit
  chunksize = int(math.ceil(len(corpus) / 1000.0)) * 1000
  passes = 20
  iterations = 400
  eval_every = None  # Don't evaluate model perplexity, takes too much time.

  # Make an index-to-word dictionary.
  temp = dictionary[0]  # this is only to "load" the dictionary
  id2word = dictionary.id2token
  model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=topics,
    passes=passes,
    eval_every=eval_every,
    per_word_topics=True
  )
  
  return model
Example #4
def find_topic():
    """
    LdaModel params
        passes: number of passes through the entire corpus
        chunksize: how many documents to load into memory at a time
        update_every: number of chunks to process before each M step of EM
    """
    with gzip.open(config['fun2vec']['corpus'], 'rb') as f:
        words = pickle.load(f)
    # build the dictionary
    dictionary = corpora.Dictionary(words)
    dictionary.filter_extremes(no_below=30, no_above=0.3)

    # build the corpus
    corpus = [dictionary.doc2bow(_words) for _words in words]
    # corpora.MmCorpus.serialize('cop.mm', corpus)
    lda = LdaModel(corpus,
                   num_topics=10,
                   chunksize=10000,
                   update_every=2,
                   id2word=dictionary)
    lda.save(config['topic_model'])
    pprint(lda.show_topics(num_words=20))
Example #5
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Function to display num_topics - LDA graph using c_v coherence

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of tokenized input texts
    limit : topic limit

    Returns:
    -------
    lda_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    c_v = []
    lda_list = []
    for num_topics in range(1, limit + 1):
        print("Topic %d" % num_topics)
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lda_list.append(lm)
        cm = CoherenceModel(model=lm,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')
        # cm = CoherenceModel(model=lm, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    x = range(1, limit + 1)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(("c_v"), loc='best')
    plt.show()

    return lda_list, c_v
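
A hedged sketch of how the returned values might be used (toy corpus invented for illustration; the LdaModel, CoherenceModel and matplotlib imports used inside evaluate_graph are assumed to be in scope):

from gensim.corpora import Dictionary

toy_texts = [['cat', 'dog', 'pet'], ['dog', 'bone', 'play'], ['stock', 'market', 'trade']]
toy_dictionary = Dictionary(toy_texts)
toy_corpus = [toy_dictionary.doc2bow(t) for t in toy_texts]
lda_list, c_v = evaluate_graph(toy_dictionary, toy_corpus, toy_texts, limit=3)
# keep the model whose c_v coherence is highest
best_model = lda_list[c_v.index(max(c_v))]
print(best_model.print_topics())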
Example #6
def create_lda_model(project,
                     corpus,
                     id2word,
                     name,
                     use_level=True,
                     force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lda.gz'

    if not os.path.exists(model_fname) or project.force or force:
        if corpus:
            update_every = None  # run in batch if we have a pre-supplied corpus
        else:
            update_every = 1

        model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            alpha=project.alpha,
            eta=project.eta,
            chunksize=project.chunksize,
            passes=project.passes,
            num_topics=project.num_topics,
            iterations=project.iterations,
            eval_every=None,  # disable perplexity tests for speed
            update_every=update_every,
        )

        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
Example #7
def LDA(tokens, start, stop, step=1):
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(text) for text in tokens]
    model_list = []
    coherence_values = []
    max_topic_num = 0
    for i in range(start, stop, step):
        print('steps  ', i)
        model = LdaModel(corpus, id2word=dictionary,
                         num_topics=i + 1)  #LDA model
        model_list.append(model)
        coherence_model_lda = CoherenceModel(model,
                                             texts=tokens,
                                             dictionary=dictionary,
                                             coherence='c_v')  #Coherence
        coherence_lda = coherence_model_lda.get_coherence()  # calculate the coherence score
        if i != start and coherence_lda > max(coherence_values):
            max_topic_num = i
        coherence_values.append(coherence_lda)

    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()  #show graph of coherence score by pyplot
    max_ind = coherence_values.index(max(coherence_values))
    model_list[max_ind].save("result_model")
    prepared_data = gensimvis.prepare(model_list[max_ind],
                                      corpus=corpus,
                                      dictionary=dictionary)
    pyLDAvis.save_html(prepared_data,
                       'res.html')  #save the result of LDA by html file
    pyLDAvis.save_json(prepared_data,
                       'res.json')  #save the result of LDA by JSON file
    return model_list[max_ind], coherence_values[max_ind], max_topic_num
Example #8
    def fit_lda(self):
        '''
        Read in serialized cards from disk. Fit the LdaModel for the class.

        Only operates on class fields.
        '''
        print('\nRun Gensim LdaModel on serialized documents')
        start = datetime.now()

        # Feed params from built_corpus into the LDA model
        self.lda_model = LdaModel(corpus=self.corpus_cards,
                                  num_topics=self.n_topics,
                                  id2word=self.built_corpus.vocabulary_,
                                  distributed=False,
                                  chunksize=2000,
                                  passes=10,
                                  update_every=1,
                                  alpha='symmetric',
                                  eta=None,
                                  decay=0.7,
                                  offset=10.0,
                                  eval_every=10,
                                  iterations=self.max_iter,
                                  gamma_threshold=0.001,
                                  minimum_probability=0.01,
                                  random_state=None,
                                  ns_conf=None,
                                  minimum_phi_value=0.01,
                                  per_word_topics=False,
                                  callbacks=None)

        end = datetime.now()
        print("   Time taken: {} on {} topics, max iterations: {}".format(
            end - start, self.n_topics, self.max_iter))

        end = datetime.now()
        print("   Time taken: {}".format(end - start))
Example #9
    def make_lda_model(self, num_topics=11):
        '''
        Build an optimized LDA model.
        Prints a coherence score for sanity checking (EDA has revealed the target coherence to be ~0.39).
        '''
        print('  - Building LDA model with {} topics'.format(num_topics))

        dictionary = corpora.Dictionary(self.token_list)
        corpus = [dictionary.doc2bow(text) for text in self.token_list]

        #set up mallet path
        # os.environ.update({'MALLET_HOME':r'anaconda3/lib/python3.7/site-packages/mallet-2.0.8/'})
        # mallet_path = '/anaconda3/lib/python3.7/site-packages/mallet-2.0.8/bin/mallet' # update this path
        #
        # #Make Model:
        # ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        ldamodel = LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            passes=20)
        #Get Coherence Score:
        coherence_score = CoherenceModel(model=ldamodel,
                                         texts=self.token_list,
                                         dictionary=dictionary,
                                         coherence='c_v').get_coherence()
        # model_topics = optimal_model.show_topics(formatted=False)

        # print topics
        pp.pprint(ldamodel.print_topics(num_words=6))
        print("  - Num Topics: {}. Coherence Value of: {:2.3f}".format(
            num_topics, coherence_score))

        self.all_topics = ldamodel.print_topics(num_words=6)
        self.ldamodel = ldamodel
        self.corpus = corpus
        self.dictionary = dictionary
        self.coherence_score = coherence_score
Example #10
def fine_tune_lda(corpus, dictionary, texts, limit, start=2, step=2):
    """
    Compute c_v coherence for various number of topics
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    n_topics : number of topics
    """
    coherence_values = []
    model_list = []
    n_topics = []
    for num_topics in range(start, limit, step):
        print('\nTraining with n_topics = {}, training sample = {}.'.format(num_topics, len(corpus)))
        model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         random_state=2,
                         alpha='auto',
                         eta='auto',
                         num_topics=num_topics)
                         # distributed=True  # alpha='auto' is not implemented in distributed LDA
        model_list.append(model)
        print('Calculating coherence score based on {} samples.'.format(len(texts)))
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        n_topics.append(num_topics)
        print("{}: {}".format(num_topics,coherence_values[-1]))
        

    return model_list, coherence_values,n_topics
Example #11
def initiate_LDA(docs=None,no_topics=100):
    '''
    Initiates an LDA model over the given documents and returns the per-document
    topic distributions, the fitted dictionary and the trained model.
    '''
    finalDict = dictbuild()
    stringSplit = re.compile('\\s')
    count =0
    stop = stopwords.words('english')
    for_corpus=[]
    for line in docs:
        lines = stringSplit.split(line.lower())
        for_corpus.append(lines)
        finalDict.add_documents([lines])
        count+=1

    # filter_tokens expects token ids, so map the stop words to their ids first
    stop_ids = [finalDict.token2id[w] for w in stop if w in finalDict.token2id]
    finalDict.filter_tokens(stop_ids)
    once_ids = [tokenid for tokenid, docfreq in finalDict.dfs.items() if docfreq < 30]  #5642
    finalDict.filter_tokens(once_ids)
    finalDict.compactify() 
    
    
    corpus = [finalDict.doc2bow(line) for line in for_corpus]
    
    n=no_topics
    lda_model = LdaModel(corpus,num_topics=n,id2word=finalDict)
    topicsDist=[]
    for x in corpus:
        topics = lda_model[x]
        temp=[0]*n
        for t in topics:
            temp[t[0]]=t[1]
        topicsDist.append(temp)
    
    print("topics generated")    
    topicsDist = np.asarray(topicsDist)
    return topicsDist, finalDict,lda_model
Example #12
def compute_coherence_values(dictionary, corpus, texts, id2word, topics_limit,
                             topics_start, topics_step):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    topics_limit : Max num of topics (swept from topics_start in steps of topics_step)

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(topics_start, topics_limit, topics_step):
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics,
                         alpha='auto',
                         eta='auto',
                         random_state=203495)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        print('num_topics:', num_topics, 'coherence:', coherence_values[-1])
    return model_list, coherence_values
Example #13
def fit_lda(documents, corpus, vocab, K: int, alpha: float = None, eta: float = None, eval=False):
    """
    Fit an LDA model on the given bag-of-words corpus.
    :param documents: the tokenized documents (used by the coherence callback)
    :param corpus: a gensim bag-of-words corpus
    :param vocab: a dictionary over the words
    :param K: number of topics
    :param alpha: the alpha prior weight of topics in documents
    :param eta: the eta prior weight of words in topics
    :param eval: if True, train a single-core LdaModel with logging callbacks; otherwise use LdaMulticore
    :return: an LDA model trained on the data and vocab
    """
    if eval:
        logging.basicConfig(filename='logstuff.log', format="%(asctime)s:%(levelname)s:%(message)s",
                            level=logging.NOTSET)
        perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
        convergence_logger = ConvergenceMetric(logger='shell')
        coherence_cv_logger = CoherenceMetric(corpus=corpus, logger='shell', coherence='c_v', texts=documents)

        print("fitting lda...")
        model = LdaModel(corpus=corpus,
                         id2word=vocab,
                         num_topics=K,
                         eval_every=1,
                         passes=20,
                         iterations=100,
                         random_state=100,
                         update_every=1,
                         callbacks=[convergence_logger, perplexity_logger, coherence_cv_logger])
    else:
        print("fitting lda...")
        model = LdaMulticore(corpus=corpus,
                             id2word=vocab,
                             num_topics=K,
                             alpha=alpha,
                             eta=eta,
                             passes=20,
                             iterations=100)
    return model
Example #14
def evaluate_graph_for_max_label(dictionary, corpus, texts, limit):
    """
    Function to display num_topics - LDA graph using c_v coherence
    
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of tokenized input texts
    limit : topic limit
    
    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    c_v = []
    lm_list = []
    warnings.filterwarnings("ignore")
    for num_topics in range(2, limit):
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    x = range(2, limit)
    best_idx = c_v.index(max(c_v))
    print(max(c_v), best_idx)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(["c_v"], loc='best')
    plt.show()

    return lm_list, c_v
Example #15
def get_topics(chapters, nb_topics=10, nb_words=10):
    
    chapters_token = [[line.split(' ') for line in c] for c in chapters]
    
    dictionary = [corpora.Dictionary(c) for c in chapters_token]

    corpus_chapters = [[dictionary[i].doc2bow(token) for token in c] for i, c in enumerate(chapters_token)]

    lda_chapters = [LdaModel(corpus_c, num_topics=nb_topics, passes=3) for corpus_c in corpus_chapters]

    topics = list()
    for i, lda_c in enumerate(lda_chapters):
        topics.append([])
        for j in range(nb_topics):
            words = []
            # look at the top nb_words words most associated with the topic
            for x in lda_c.show_topic(j, topn=nb_words):
                words.append(dictionary[i][int(x[0])])
                
            #print(words)
            topics[i].append(words)

    
    return topics
Example #16
    def LDAmodel(words, num_topics=5, num_words=5):
        """
        1. the number of words
        2. the mixture of topics ex: 1/2 the topic “health” and 1/2 the topic “vegetables" etc..
        3. the probability of topic depends on their dominancy
        """
        dictionary = corpora.Dictionary(words)
        # bag-of-words counts per document (term frequencies)
        corpus = [dictionary.doc2bow(word) for word in words]
        # save it!
        pickle.dump(corpus, open('corpus.pkl', 'wb'))
        dictionary.save('dictionary.gensim')
        # Train model
        ldamodel = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20)
        # lda_model = LdaModel(corpus=corpus,id2word=id2word,num_topics=20, random_state=100,update_every=1,chunksize=100,passes=10,alpha='auto',per_word_topics=True)
        topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
        # Validation
        # Perplexity: a measure of how good the model is (lower is better).
        val_perplexity = ldamodel.log_perplexity(corpus)
        # Coherence score
        coherence_ldamodel = CoherenceModel(model=ldamodel, texts=words, dictionary=dictionary, coherence='c_v')
        val_coherence = coherence_ldamodel.get_coherence()

        return topics, val_perplexity, val_coherence
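
A hedged companion sketch: reload the artifacts persisted above ('corpus.pkl' and 'dictionary.gensim') so the same corpus and dictionary can be reused later without re-tokenizing.

import pickle
from gensim import corpora

with open('corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)
dictionary = corpora.Dictionary.load('dictionary.gensim')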
Example #17
                        for num_topics in NT:
                            run += 1
                            print("Run " + str(run) + " out of " + str(runs))
                            writer.writerows([[
                                "Data size", "Topics", "no_above", "Chunksize",
                                "Passes", "Iteration"
                            ],
                                              [
                                                  size, num_topics, no_above,
                                                  chunksize, passes, iterations
                                              ], []])

                            lda = LdaModel(mm_used,
                                           num_topics=num_topics,
                                           chunksize=chunksize,
                                           id2word=dictionary,
                                           passes=passes,
                                           iterations=iterations,
                                           eval_every=eval_every)

                            lst = []
                            for topic in lda.print_topics(-1, 10):
                                terms = [
                                    x[0] for x in lda.get_topic_terms(
                                        topic[0], topn=10)
                                ]
                                term_strings = [
                                    str(dictionary[term]) for term in terms
                                ]
                                str_topic = []
                                str_topic.append("Topic " + str(topic[0] + 1))
Example #18
    return quran_MI, quran_Chi, ot_MI, ot_Chi, nt_MI, nt_Chi


"""
Part 2: Text Analysis ----> execute writing method 
"""
if not os.path.exists('MI_&_ChiSquare.txt'):
    write_results('MI_&_ChiSquare.txt')
"""
Part 2: Text Analysis ----> LDA
"""
common_texts = quran_D + ot_D + nt_D
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
lda = LdaModel(common_corpus,
               num_topics=20,
               random_state=1000,
               id2word=common_dictionary)
"""
Part 2: Text Analysis ----> method of finding top 3 average score for each topic
                            and top 10 tokens for the top 3
"""


def Top3_and_Top10(corpora_doc_list):
    all_list = []
    for q in corpora_doc_list:
        each = common_dictionary.doc2bow(q)
        all_list.append(
            lda.get_document_topics(bow=each, minimum_probability=0.00))
    flatten = itertools.chain.from_iterable
    score_collection = list(flatten(all_list))
Example #19
# Log to file (you'll probably want to delete this after)

import logging
f_path = os.path.join(os.getenv('DATA_PATH'), 'interim', 'onsite_search_gensim.log')
logging.basicConfig(filename=f_path, format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Running a test first...

# Latent Dirichlet Allocation (LDA) model

from gensim.models import LdaModel
ldamodel = LdaModel(
    corpus=df_search_terms.corpus.dropna().tolist()[:10],
    num_topics=5,
    id2word=dictionary,
)


f_path


# (shell) cat '/Volumes/GoogleDrive/My Drive/ga-data-mining/data/interim/onsite_search_gensim.log'


# Training on the full dataset (running externally `onsite_search_gensim_lda.py`)

# Latent Dirichlet Allocation (LDA) model

# from gensim.models import LdaModel
Example #20
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values


# Code starts here
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary,
    corpus=doc_term_matrix,
    texts=doc_clean,
    limit=41,
    start=1,
    step=5)
print(topic_list)

max_index = coherence_value_list.index(max(coherence_value_list))
print(max_index)
opt_topic = topic_list[max_index]
print(opt_topic)

lda_model = LdaModel(corpus=doc_term_matrix,
                     num_topics=opt_topic,
                     id2word=dictionary,
                     iterations=10,
                     passes=30,
                     random_state=0)
# printing the topics
pprint(lda_model.print_topics(5))
Example #21
# Run LDA on the word-segmented text
import codecs
from gensim import corpora
from gensim.models import LdaModel

train = []

# load the word segmentation results
fp = codecs.open('fenci_result.txt', 'r', encoding='utf-8')
for line in fp:
    line = line.split()
    train.append([w for w in line])

dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=20)

for topic in lda.print_topics(num_words=2):
    termNumber = topic[0]
    print(topic[0], ':', sep='')
    listOfTerms = topic[1].split('+')
    for term in listOfTerms:
        listItems = term.split('*')
        print('  ', listItems[1], '(', listItems[0], ')', sep='')
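
An alternative hedged sketch that avoids parsing the formatted strings from print_topics: show_topic() returns (word, probability) pairs directly.

for topic_id in range(lda.num_topics):
    print(topic_id, ':', sep='')
    for word, prob in lda.show_topic(topic_id, topn=2):
        print('  ', word, '(', prob, ')', sep='')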
Example #22
import pandas as pd
import pickle
import numpy as np
import gensim
from gensim.models import LdaModel

# ## Load cleaned text, dictionary, corpus
CLEAN_DATA_PATH = '/Users/richardkuzma/coding/analysis/monster/data/cleaned/'
jobs_cleaned_filename = 'monster_jobs_cleaned_text.pkl'
with open(CLEAN_DATA_PATH + jobs_cleaned_filename, 'rb') as f:
    jobs_cleaned = pickle.load(f)

jobs_dict_filename = 'monster_jobs_dict.pkl'
with open(CLEAN_DATA_PATH + jobs_dict_filename, 'rb') as f:
    dictionary = pickle.load(f)

jobs_corpus_filename = 'monster_jobs_corpus.pkl'
with open(CLEAN_DATA_PATH + jobs_corpus_filename, 'rb') as f:
    jobs_corpus = pickle.load(f)
"""Select number of topics"""
num_topics = 20

### make and save model
print('Making LDA model with np version {}'.format(np.__version__))
model = LdaModel(corpus=jobs_corpus, num_topics=num_topics, id2word=dictionary)

print('Saving model..')
MODEL_PATH = '/Users/richardkuzma/coding/analysis/monster/models/'
filename = 'LDA_' + str(num_topics) + '_topics.model'
model.save(MODEL_PATH + filename)
print('Saved model.\nPath: ' + MODEL_PATH + '\nname: ' + filename)
Example #23
'''
LDA using gensim
'''

# Count words in the 'objective', keeping only those that occur at least 5 times
vectorizer = fe.text.CountVectorizer(stop_words='english', min_df=5)
X = vectorizer.fit_transform(h2020.objective)

# Convert to gensim format
corpus = Sparse2Corpus(X, documents_columns=False)

# Create mapping from word IDs (integers) to words (strings)
id2word = dict(enumerate(vectorizer.get_feature_names()))

# Fit LDA model with 10 topics
lda = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)

# Show top 5 words for each of the 10 topics
lda.show_topics(num_topics=10, num_words=5)
'''
word2vec using gensim
'''

# Convert adjectives and verbs to corresponding lemmas using spaCy
objectives = [ \
    [ x.lemma_ if x.pos == spacy.parts_of_speech.ADJ or \
                  x.pos == spacy.parts_of_speech.VERB \
      else x.text \
      for x in en(text) ] \
    for text in h2020.objective ]
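
A hedged sketch of the training step the "word2vec using gensim" heading refers to (gensim 4 names the dimensionality parameter vector_size; the hyperparameters and the query word 'research' are illustrative only):

from gensim.models import Word2Vec

w2v = Word2Vec(sentences=objectives, vector_size=100, window=5, min_count=5, workers=4)
# nearest neighbours of a query word, assuming it survived min_count filtering
print(w2v.wv.most_similar('research', topn=5))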
Example #24
    def topic_mining(self, _active_dataset):
        """
        Internal function to run topic mining and save the trained models
        @params:
            _active_dataset - Required  : name of active dataset (Str)
        """

        # TODO: move the saving to files into the models layer

        print("topic minning start..")
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=500,
                                     min_df=2, stop_words='english',
                                     use_idf=True)

        _cousines, _reviews = self.data_service.get_reviews_for_cousines()

        print("text uploaded")
        text = _reviews

        X = vectorizer.fit_transform(text)
        print("text transformed")

        # mapping from feature id to actual word
        id2words = {}
        for i, word in enumerate(vectorizer.get_feature_names()):
            id2words[i] = word

        corpus = matutils.Sparse2Corpus(X, documents_columns=False)

        print("train LDA models")
        #####################################################################
        _model_name = "LDA10"
        self.modelLDA_10 = LdaModel(corpus, num_topics=10, id2word=id2words)
        self.model_save(self.modelLDA_10, _model_name, _active_dataset)

        _cousines2topics = self.modelLDA_10.get_document_topics(
            corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)

        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:

            _cousines = self.data_service.get_cousines_for_rest(_rest[0])

            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (_rest_vector[_t][1] + float(_w)) / 2

            _rests_topics.append([_rest, _rest_vector, _cousines])

        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)

        #####################################################################
        _model_name = "LDA15"
        self.modelLDA_15 = LdaModel(corpus, num_topics=15, id2word=id2words)
        self.model_save(self.modelLDA_15, _model_name, _active_dataset)

        _cousines, _reviews = self.data_service.get_reviews_for_cousines()

        _cousines2topics = self.modelLDA_15.get_document_topics(
            corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)

        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:

            _cousines = self.data_service.get_cousines_for_rest(_rest[0])

            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (
                                    _rest_vector[_t][1] + float(_w)) / 2

            _rests_topics.append([_rest, _rest_vector, _cousines])

        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)

        #####################################################################
        _model_name = "LDA20"
        self.modelLDA_20 = LdaModel(corpus, num_topics=20, id2word=id2words)
        self.model_save(self.modelLDA_20, _model_name, _active_dataset)

        _cousines, _reviews = self.data_service.get_reviews_for_cousines()

        _cousines2topics = self.modelLDA_20.get_document_topics(
            corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)

        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:

            _cousines = self.data_service.get_cousines_for_rest(_rest[0])

            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (
                                    _rest_vector[_t][1] + float(_w)) / 2

            _rests_topics.append([_rest, _rest_vector, _cousines])

        with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)

        ###################################################################

        print("TRAIN MODELS DONE")
        return self.modelLDA_10
Example #25
from gensim.models import LdaModel
num_topics = 10
chunksize = 1500
passes = 20
iterations = 400
eval_every = None

temp = dictionary[0]

id2word = dictionary.id2token

model = LdaModel(corpus=corpus,
                 id2word=id2word,
                 chunksize=chunksize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

top_topics = model.top_topics(corpus)

from pprint import pprint
pprint(top_topics)
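
A short hedged addition: top_topics() returns (topic, coherence) pairs, so the average topic coherence can be computed directly from the list printed above.

avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
print('Average topic coherence: %.4f' % avg_topic_coherence)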

import pyLDAvis.gensim

lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary)
Example #26
                prev_word = word
                
    processed_texts.append(processed_text)

# LDA MODEL

dictionary = Dictionary(processed_texts)

corpus = [dictionary.doc2bow(doc) for doc in processed_texts]

# MODEL TRAINING
num_topics = 1
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    iterations=5,
    passes=10,
    alpha='auto'
)

# WRITE DATA OUT TO FILES

word_dict = {}
today = date.today()
today_path = '../data/topic_today_EN.csv'
hist_path = '../data/topic_history_EN.csv'

for i in range(num_topics):
    words = lda_model.show_topic(i, topn = 10)
    word_dict['date'] = today
    word_dict['Topic'] = [w for w, _ in words]
Example #27
    # for doc in corpus:
    #     for word, freq in doc:
    #         if word not in idf:
    #             idf[word] = 0
    #         idf[word] = idf[word] + freq
    # print(sorted([(id2word[wid], fr) for wid, fr in idf.items()], key=lambda x: x[1], reverse=True))

    # FINAL LDA MODEL
    num_topics = 46
    num_keywords = 100
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics,
                         update_every=1,
                         eval_every=1,
                         chunksize=4000,
                         passes=20,
                         iterations=100,
                         alpha='auto',
                         eta='auto',
                         random_state=42)

    print("Run for %d topics in %.2f mins" % (num_topics,
                                              (perf_counter() - start) / 60))
    start = perf_counter()
    # pprint(lda_model.print_topics(num_topics=num_topics, num_words=num_keywords))
    print("\n\n")

    topics_shown = lda_model.show_topics(num_topics=num_topics,
                                         num_words=num_keywords,
                                         formatted=False)
Example #28
def return_suggested_articles(request):
    """
    returns suggested articles based on topic of one currently being viewed

    Parameters
    ----------
    request : request (flask.Request): The request object

    Returns
    -------
    JSON of google search queries for articles to read

    """

    # get the requested json for the webpage
    request_json = request.get_json(silent=True)

    # get the headline and article
    headline = request_json['headline']
    article = request_json['article']
    print('requested json headline and article text')

    # make into one text file
    combined_article = headline + '. ' + article

    # set to 1 for single doc lda, 0 for tfidf
    do_single_document_LDA = 1

    # number of query words to return
    n_search_words = 5

    # can identify ngrams, but slows down performance
    do_ngrams = 1

    ### SINGLE DOC LDA PARAMS

    # set the number of topics to generate (5 seems to work pretty well)
    num_lda_topics = 5

    # set the number of passes
    n_passes = 10

    # if avoiding repeated words (only relevant if num_lda_topics > 1)
    do_unique_search_words = 0

    print('Downloading stop words')
    # download stopwords list
    # if use_bucket:
    download_blob('debiaser_data', 'sw1k.csv', '/tmp/sw1k.csv')

    # load stop words into pandas and then into list
    stop_words = pd.read_csv('/tmp/sw1k.csv')

    # delete the downloaded temp file
    os.remove('/tmp/sw1k.csv')

    stop_words = stop_words['term']
    stop_words = [word for word in stop_words]

    # # adding some custom words
    stop_words.append('said')
    stop_words.append('youre')
    stop_words.append('mph')
    stop_words.append('inc')
    stop_words.append('cov')
    stop_words.append('jr')
    stop_words.append('dr')
    stop_words.append('ads')
    stop_words.append('cookies')
    stop_words.append('factset')

    print('Downloading news organizations from AllSidesMedia')
    # download all_sides_media list
    # if use_bucket:
    download_blob('debiaser_data',
                  'allsides_final_plus_others_with_domains.csv',
                  '/tmp/allsides_final_plus_others_with_domains.csv')

    # load the AllSides domain names into a dataframe
    all_sides = pd.read_csv('/tmp/allsides_final_plus_others_with_domains.csv')

    # delete the downloaded temp file
    os.remove('/tmp/allsides_final_plus_others_with_domains.csv')

    # get the domain
    # all_sides_names = all_sides['name']
    all_sides_domains = all_sides['domain']
    # all_sides_names_domains = pd.concat([all_sides_names,all_sides_domains],axis=1)

    # get dictionary of entities in article
    # entity_dict = entity_recognizer(combined_article,nlp)

    if do_single_document_LDA:

        print('splitting article into sentences')
        # break up into sentences
        combined_article = tokenize.sent_tokenize(combined_article)

    else:

        # make into one element list for downstream processing
        combined_article = [combined_article]

    print('pre processing article text')
    # process article
    article_processed = process_all_articles(combined_article, nlp)

    print('removing stopwords')
    # remove stopwords
    article_processed = remove_stopwords(article_processed, stop_words)

    # floor for the frequency of words to remove
    # word_frequency_threshold = 1

    # get corpus, dictionary, bag of words
    # processed_corpus, processed_dictionary, bow_corpus = get_simple_corpus_dictionary_bow(article_processed,
    #                                                                                       word_frequency_threshold)

    if do_single_document_LDA:

        if do_ngrams:
            # load bigram trigram quadgram models
            bigram_mod_fname = '/tmp/bigram_mod.pkl'
            trigram_mod_fname = '/tmp/trigram_mod.pkl'
            quadgram_mod_fname = '/tmp/quadgram_mod.pkl'

            download_blob('debiaser_data', 'bigram_mod.pkl', bigram_mod_fname)
            download_blob('debiaser_data', 'trigram_mod.pkl',
                          trigram_mod_fname)
            download_blob('debiaser_data', 'quadgram_mod.pkl',
                          quadgram_mod_fname)

            with open(bigram_mod_fname, 'rb') as pickle_file:
                bigram_mod = pickle.load(pickle_file)

            with open(trigram_mod_fname, 'rb') as pickle_file:
                trigram_mod = pickle.load(pickle_file)

            with open(quadgram_mod_fname, 'rb') as pickle_file:
                quadgram_mod = pickle.load(pickle_file)

            print('FINDING QUADGRAMS')

            # make up to quad grams
            article_processed = make_quadgrams(article_processed, bigram_mod,
                                               trigram_mod, quadgram_mod)

            # delete the downloaded temp files
            os.remove(bigram_mod_fname)
            os.remove(trigram_mod_fname)
            os.remove(quadgram_mod_fname)

        print('generating dictionary and bag of words vector...')
        start = time.process_time()
        processed_corpus, processed_dictionary, bow_corpus = get_simple_corpus_dictionary_bow(
            article_processed)
        print('TIME FOR GENERATING DICTIONARY AND BOW VECTOR')
        print(time.process_time() - start)

        print('generating lda model...')
        start = time.process_time()
        # generate the LDA model
        lda = LdaModel(corpus=bow_corpus,
                       num_topics=num_lda_topics,
                       id2word=processed_dictionary,
                       passes=n_passes)
        print('TIME FOR GENERATING LDA MODEL')
        print(time.process_time() - start)

        # get the topics from the lda model
        lda_topics = lda.show_topics(formatted=False)

        # ALL INTERESTING BUT DEPRECATED FOR NOW
        # WILL FOLLOW SIMPLER APPROACH:
        # Just take top word in each generated topic

        # get top words per topic
        lda_top_topic_words_string, lda_top_topic_words_list = get_lda_top_topic_words(
            lda_topics, num_lda_topics, do_unique_search_words, n_search_words)

    # doing tfidf
    else:

        # specify file name
        tfidf_matrix_filename = '/tmp/tfidf_matrix.pkl'

        # download the tfidf matrix
        print('DOWNLOADING TFIDF MODEL')
        download_blob('debiaser_data', 'tfidf_matrix.pkl',
                      tfidf_matrix_filename)

        with open(tfidf_matrix_filename, 'rb') as pickle_file:
            tfidf = pickle.load(pickle_file)

        # delete the downloaded temp file
        os.remove(tfidf_matrix_filename)

        if do_ngrams:
            # load bigram trigram quadgram models
            bigram_mod_fname = '/tmp/bigram_mod.pkl'
            trigram_mod_fname = '/tmp/trigram_mod.pkl'
            quadgram_mod_fname = '/tmp/quadgram_mod.pkl'

            download_blob('debiaser_data', 'bigram_mod.pkl', bigram_mod_fname)
            download_blob('debiaser_data', 'trigram_mod.pkl',
                          trigram_mod_fname)
            download_blob('debiaser_data', 'quadgram_mod.pkl',
                          quadgram_mod_fname)

            with open(bigram_mod_fname, 'rb') as pickle_file:
                bigram_mod = pickle.load(pickle_file)

            with open(trigram_mod_fname, 'rb') as pickle_file:
                trigram_mod = pickle.load(pickle_file)

            with open(quadgram_mod_fname, 'rb') as pickle_file:
                quadgram_mod = pickle.load(pickle_file)

            # make up to quad grams
            combined_article = make_quadgrams(combined_article, bigram_mod,
                                              trigram_mod, quadgram_mod)

            # delete the downloaded temp files
            os.remove(bigram_mod_fname)
            os.remove(trigram_mod_fname)
            os.remove(quadgram_mod_fname)

        # download dictionary
        id2word_fname = '/tmp/id2word.pkl'
        download_blob('debiaser_data', 'id2word_ec2.pkl', id2word_fname)

        with open(id2word_fname, 'rb') as pickle_file:
            processed_dictionary = pickle.load(pickle_file)

        # delete the downloaded temp file
        os.remove(id2word_fname)

        print('GENERATING BOW VECTOR FOR ARTICLE')
        # get bag of words representation
        bow_corpus_article = [
            processed_dictionary.doc2bow(text) for text in combined_article
        ]

        print('GETTING TF IDF SCORE')
        tfidf_vector = tfidf[bow_corpus_article[0]]

        # sort the tfidf vector
        tfidf_vector = sorted(tfidf_vector, key=getKey, reverse=True)

        # if there are fewer words than search words, then just use how many words there are
        if len(tfidf_vector) < n_search_words:
            n_search_words = len(tfidf_vector)

        top_tfidf_values = [
            tfidf_vector[i][0] for i in range(0, n_search_words)
        ]
        print(top_tfidf_values)

        top_words_list = [
            processed_dictionary[i].replace("_", " ") for i in top_tfidf_values
        ]

        top_words_string = ' '
        for word in top_words_list:
            if word not in top_words_string:
                top_words_string += ' ' + word

    # get dictionary of google queries
    queries_dict = {}

    for domain in all_sides_domains:

        # if this is single document lda
        if do_single_document_LDA:
            query = 'www.news.google.com/search?q=site:' + domain + lda_top_topic_words_string

        # if this is tfidf
        else:
            query = 'www.news.google.com/search?q=site:' + domain + top_words_string

        queries_dict[domain] = query

    return json.dumps(queries_dict)
Example #29
## Remove unfrequent words
dictionary.filter_extremes(
    no_below=5, no_above=0.75
)  # keep words that appear in at least 5 reviews (no_below) and in no more than 75% of them (no_above)
corpus = [dictionary.doc2bow(preprocess(review)) for review in reviews]

#### LDA ####
lda_model = LdaModel(
    corpus=corpus,  # This code runs your lda
    id2word=dictionary,
    random_state=100,
    num_topics=10,
    passes=5,
    chunksize=10000,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True)

## See the topics
lda_model.print_topics(-1)  # this lists all the topics
lda_model.get_topic_terms(0, topn=10)  # this provides the top 10 words in topic 0
lda_model.log_perplexity(corpus)  # this computes the log perplexity
lda_model.get_document_topics(
    corpus[0]
)  # topic distribution of the first review
Example #30
    with open(os.path.join(path, 'data.tsv'), encoding='utf8') as f:
        reader = csv.reader(f, delimiter="\t")
        for line in reader:
            labels = line[0].split(', ')
            multi_hot_labels.append(labels)
            c = line[1:]
            c = clean_data(c)
            context.extend(c)
    #convert to multi-hot encoding
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(multi_hot_labels)
    label_list = list(mlb.classes_)

    token_context = [word_tokenize(x) for x in context]
    token_list = []
    for x in token_context:
        temp = [i for i in x if i not in stop_words]
        token_list.append(temp)
    token_context = [clean_data(x) for x in token_list]
    del token_list
    common_dictionary = Dictionary(token_context)
    common_corpus = [common_dictionary.doc2bow(text) for text in token_context]
    # Train the model on the corpus.
    lda = LdaModel(common_corpus,
                   id2word=common_dictionary,
                   alpha='auto',
                   num_topics=3,
                   passes=5)
    print(lda.show_topic(2, 20))