Example #1
def get_doc_topic_dist(OUT_DIR=OUT_DIR):
    lda_dict = corpora.Dictionary.load(OUT_DIR + TV_SHOW + '.dict')
    lda_corpus = corpora.MmCorpus(OUT_DIR + TV_SHOW + '.mm')
    lda = LdaMulticore.load(OUT_DIR + TV_SHOW + '.lda')
    return _extract_data(topic_model=lda,
                         dictionary=lda_dict,
                         corpus=lda_corpus)
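The `_extract_data` helper is not part of this snippet. A minimal sketch of what it plausibly does, assuming it should return each document's topic distribution (only the signature is taken from the call above; the body is a guess):

def _extract_data(topic_model, dictionary, corpus):
    # Per-document topic distribution; minimum_probability=0 keeps
    # every topic. The dictionary is accepted only to mirror the
    # call site and is unused in this sketch.
    return [topic_model.get_document_topics(bow, minimum_probability=0)
            for bow in corpus]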
Example #2
 def build_model(self, fname=None, save_to=None):
     id2word = self.id2word or self.build_id2word()
     corpus = self.corpus or self.build_corpus()
     # read model.lda file
     if not fname:
         fname = click.prompt('model file name',
                              type=str,
                              default='model.lda')
     fname = self.__dest(fname)
     # if there is no model file or the user wants to rebuild, build .model
     if not os.path.isfile(fname) or click.confirm(
             'There already is %s. Do you want to re-run lda?' % fname):
         num_procs = click.prompt('Number of processes to launch',
                                  type=int,
                                  default=multiprocessing.cpu_count())
         num_epochs = click.prompt('Number of epochs to run',
                                   type=int,
                                   default=20)
         num_topics = click.prompt('Number of topics',
                                   type=int,
                                   default=100)
         print('start building model')
         start = time()
         model = LdaMulticore(corpus,
                              id2word=id2word,
                              num_topics=num_topics,
                              workers=num_procs,
                              passes=num_epochs)
         model.save(fname)  # persist the trained model
         print('building model takes: %s' % LdaUtils.human_readable_time(
             time() - start))
     self.model = LdaMulticore.load(fname)
     return self.model
Example #3
 def _load(self):
     modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
     if not modeldir.exists():
         return False
     self._lda = LdaMulticore.load(str(modeldir))
     self._dictionary = Dictionary.load(
         str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
Example #4
def getTopics(jobs_):
  
    bigram_model = Phrases.load('data/bigram_model_all')
    trigram_model = Phrases.load('data/trigram_model_all')
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')
    lda = LdaMulticore.load('data/lda_model_all')

    topic_names = {0:u'Risk Management Bank', 
                   1:u'Big Data Report', 
                   2:u'Automotive SAP', 
                   3:u'Microsoft Java Scrum', 
                   4:u'Medical Consultant', 
                   5:u'Java Engineer', 
                   6:u'Computer Vision Developer', 
                   7:u'Data Analyst', 
                   8:u'BI SAP BW', 
                   9:u'IOT Reporting R', 
                   10:u'Global Project Presentation',
                   11:u'Cloud Engineer IOT', 
                   12:u'Industry 4.0', 
                   13:u'Risk Consulting', 
                   14:u'Machine Learning Data Science'}
    
    topics_ = []
    
    for job_ in jobs_:
        if job_ is not None:
            # print(job_[0])
            topics_.append(lda_description(bigram_model, trigram_model,
                                           trigram_dictionary, lda,
                                           topic_names, job_[1], job_[0]))
    return topics_
Example #5
def lda(corpus, num_topics=5, save_as=None, load=None, verbose=True):
    module_path = os.path.dirname(__file__)
    model_path = module_path + "/models"

    if verbose:
        print("prepare data")
    corpus = corpus.apply(lambda x: x.split(" "))
    dictionary = Dictionary(corpus)
    bow = [dictionary.doc2bow(doc) for doc in corpus]

    if isinstance(load, str):
        if verbose:
            print("loading lda")
        lda = LdaMulticore.load(model_path + "/" + load)
    else:
        if verbose:
            print("training lda")
        lda = LdaMulticore(bow, num_topics=num_topics)
        if save_as:
            try:
                os.mkdir(model_path)
            except OSError:
                # the models directory already exists
                pass

            lda.save(model_path + "/" + save_as)
    if verbose:
        print("generate visualization")
    vis = pyLDAvis.gensim.prepare(lda, bow, dictionary)
    return lda, vis
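This function expects `corpus` to support `.apply`, i.e. a pandas Series of space-separated strings. A hypothetical call (the data and filename are invented for illustration):

import pandas as pd

docs = pd.Series(["topic models group words", "lda assigns words to topics"])
model, vis = lda(docs, num_topics=2, save_as="demo.lda")
pyLDAvis.display(vis)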
Example #6
    def get_model(self,
                  n_topics=50,
                  n_workers=6,
                  recalculate=False,
                  from_scratch=True):

        filepath = self.paths.get_lda_filepath(n_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError(
                    'No LDA file exists but from_scratch is False')

            trigram_dictionary = self.get_corpus_dict()
            trigram_bow_corpus = self.get_trigram_bow_corpus(
                trigram_dictionary)

            print('Building LDA model...')
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=n_topics,
                               id2word=trigram_dictionary,
                               workers=n_workers)

            lda.save(filepath)
            print('LDA model (n_topics={}) written to {}'.format(
                n_topics, filepath))
        else:
            print('Loading LDA model (n_topics={})...'.format(n_topics))
            lda = LdaMulticore.load(filepath)

        return lda
Example #7
def test():
    import pickle
    # from sqlalchemy.dialects.mssql import BINARY

    ## Load the trained LDA model that will be stored in the database
    listToPickle = LdaMulticore.load(model_dir)

    ## Pickle the model into a bytes object
    pickledList = pickle.dumps(listToPickle, pickle.HIGHEST_PROTOCOL)

    connection = engine.connect()

    ## Create a cursor for interacting

    # cursor = connection.cursor()

    ## Add the information to the database table pickleTest
    connection.execute(
        """INSERT INTO dbo.model_test(id, binary_model) VALUES (?, ?)""",
        (1, pickledList))

    ## Select what we just added
    result = connection.execute(
        """SELECT binary_model FROM dbo.model_test WHERE id = 1""")

    ## Fetch all matching rows
    rows = result.fetchall()

    ## Get the results
    for each in rows:
        ## The result is also in a tuple
        for pickledStoredList in each:
            ## Unpickle the stored string
            unpickledList = pickle.loads(pickledStoredList)
            print(unpickledList)
Example #8
def test_lda(sentence):
    """Tests the trained LDA model on an example sentence, i.e. returns the topics of that
    sentence.
    May only be called after train_lda().

    Args:
        sentence: A sentence to test on as string.
    """
    # validate and process the sentence
    if sentence is None or len(sentence) < 1:
        raise Exception("Missing or empty 'sentence' argument.")

    if isinstance(sentence, bytes):
        sentence = sentence.decode("utf-8")
    sentence = sentence.lower().strip().split(" ")
    if len(sentence) != cfg.LDA_WINDOW_SIZE:
        print("[INFO] the token size of your sentence does not match the defined window " \
              "size (%d vs %d)." % (len(sentence), cfg.LDA_WINDOW_SIZE))

    # load dictionary and trained model
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)
    lda_model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)

    # sentence to bag of words
    bow = dictionary.doc2bow(sentence)

    # print topics of sentence
    print(lda_model[bow])
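A hypothetical call, assuming train_lda() has already produced the files named in cfg:

# prints the (topic_id, probability) pairs for the sentence
test_lda("the quick brown fox jumps over the lazy dog")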
Example #9
def get_required_models(algorithm, best_topic_model_path, topic_modeling_path):
    with open(topic_modeling_path + "dictionary", 'rb') as pickle_file:
        dictionary = pickle.load(pickle_file)
    with open(topic_modeling_path + "tfidf_model", 'rb') as pickle_file:
        tfidf = pickle.load(pickle_file)
    topic_model = LdaMulticore.load(best_topic_model_path + algorithm +
                                    "/model/" + algorithm + ".model")
    return dictionary, tfidf, topic_model
Example #10
 def load(self):
     """
     Load previously saved ldaprocessor results.
     """
     try:
         return LdaMulticore.load(self.lda_out_file_name)
     except Exception:
         return None
Example #11
 def explore_topic(self, topic_number, topn=20):
     """
     Accept a user-supplied topic number and
     print out a formatted list of the top terms.
     """
     lda = LdaMulticore.load(self.lda_model_filepath)
     print("{:20} {} \n".format("term", "frequency"))
     for term, frequency in lda.show_topic(topic_number, topn):
         print("{:20} {:.5f}".format(term, frequency))
Example #12
    def fit_universal_models(self):

        vec = CountVectorizer(stop_words='english', max_features=10000)
        vec_t = vec.fit_transform(' '.join(x) for x in self.all_sentences)

        id2word = {v: k for k, v in vec.vocabulary_.items()}
        vec_corpus = gensim.matutils.Sparse2Corpus(vec_t.T)

        if os.path.isfile('lda.modl'):
            lda = LdaMulticore.load('lda.modl')
        else:
            lda = LdaMulticore(corpus=vec_corpus,
                               id2word=id2word,
                               iterations=200,
                               num_topics=2,
                               passes=10,
                               workers=4)
            lda.save('lda.modl')

        all_counts = vec.transform(' '.join(x) for x in self.all_sentences)
        self.d['all']['_probas'] = np.array(
            lda.inference(gensim.matutils.Sparse2Corpus(all_counts.T))[0])
        labeled_counts = vec.transform(' '.join(x) for x in self.X)
        self.d['labeled']['_probas'] = np.array(
            lda.inference(gensim.matutils.Sparse2Corpus(labeled_counts.T))[0])

        w2vmodel = Word2Vec(self.all_sentences,
                            size=100,
                            window=5,
                            min_count=3,
                            workers=4)

        best_centroids = None
        best_score = None
        # todo -- implement kmeans++ instead of best-of-10 restarts
        for _ in range(10):
            km = Kmeans(50)
            km.fit(w2vmodel.syn0)
            score = km.compute_sse(w2vmodel.syn0)
            if best_score is None or score < best_score:
                best_score = score
                best_centroids = km.centroids
        km.centroids = best_centroids

        self.tfidf = TfidfVectorizer(stop_words=set(stopwords.words()))
        self.d['all']['_t'] = self.tfidf.fit_transform(
            ' '.join(x) for x in self.all_sentences)
        self.d['labeled']['_t'] = self.tfidf.transform(' '.join(x)
                                                       for x in self.X)

        self.d['all']['_kmeans'] = np.array(
            kmeans_word2vecify(self.all_sentences, w2vmodel, km,
                               self.d['all']['_t'], self.tfidf))
        self.d['labeled']['_kmeans'] = np.array(
            kmeans_word2vecify(self.X, w2vmodel, km, self.d['labeled']['_t'],
                               self.tfidf))
Example #13
def lda_show_topic(i=(1,)):
    # take an iterable of topic ids; return topic names and top-term lists
    lda = LdaMulticore.load('../Models/lda_model_all_30')
    name = get_topic_name()
    lst = []
    for x in i:
        print('subtopic = {}'.format(name[x]))
        print(lda.show_topic(x, topn=25))
        lst.append(lda.show_topic(x, topn=25))
    return lst
Example #14
def LDA_IO(lda_model_name):
    with open('dictionary.pkl', 'rb') as f:
        dictionary = pickle.load(f)

    with open('doc2idx.pkl', 'rb') as f:
        doc2idx = pickle.load(f)

    lda_model = LdaMulticore.load(lda_model_name)
    token2id = dictionary.token2id
    return doc2idx, lda_model, token2id
Example #15
def get_lda_model(corpus, dictionary, num_topics, SAVE_FILE=OUT_FILE, passes=20, iterations=100):
    if not os.path.exists(SAVE_FILE + '.lda'):
        print('creating lda model for the {} file..'.format(SAVE_FILE))
        print('num_topics: {}'.format(num_topics))
        lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                                 passes=passes, iterations=iterations, chunksize=2500)
        lda_model.save(SAVE_FILE + '.lda')
    else:
        print('LDA model for the file:{} already exists.. loading..'.format(SAVE_FILE))
        lda_model = LdaMulticore.load(SAVE_FILE + '.lda')
    return lda_model
Example #16
 def __init__(self, examples, vocab, lda_vocab_path, lda_model_path, args):
     self.data = examples
     self.vocab = vocab
     self.args = args
     self.item_vocab = load_item_vocab(args)
     self.lda_vocab = Dictionary.load(lda_vocab_path)
     self.lda_model = LdaMulticore.load(lda_model_path)
     self.sent_lim = [
         self.args.cp_sentNum, self.args.desc_sentNum,
         self.args.require_sentNum, self.args.benefit_sentNum
     ]
Example #17
 def __init__(self, lda_filepath, dictionary_filepath, cache_filepath=None):
     """Initialize the LDA wrapper.
     Args:
         lda_filepath: Filepath to the trained LDA model.
         dictionary_filepath: Filepath to the dictionary of the LDA.
         cache_filepath: Optional filepath to a shelve cache for the LDA results.
     """
     self.lda = LdaMulticore.load(lda_filepath)
     self.dictionary = gensim.corpora.dictionary.Dictionary.load(dictionary_filepath)
     self.cache_synch_prob = 2 # in percent, 1 to 100
     self.cache_filepath = cache_filepath
     self.cache = shelve.open(cache_filepath) if cache_filepath is not None else None
Example #18
 def display_data(self):
     lda = LdaMulticore.load(self.lda_model_filepath)
     trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
     trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)
     LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                               trigram_dictionary)
     with open(self.LDAvis_data_filepath, 'w') as f:
         f.write(str(LDAvis_prepared))
         # json.dump(LDAvis_prepared.to_json(), f)
     # display the prepared data directly; re-reading the file handle
     # (as the original did) would pass a closed file to pyLDAvis
     pyLDAvis.display(LDAvis_prepared)
Example #19
    def _load_model(self):
        """This function is used to load a gensim LdaModel from the models
        folder. Or `None` if one does not exist.

        Returns:
            :obj:`gensim.models.ldamodel.LdaModel`: The model found
            in ucla_topic_analysis/model/lda.model or None if there was
            no lda model saved or the number of topics does not match.
        """

        if os.path.isfile(self.file_path):
            return LdaMulticore.load(self.file_path)
        return None
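The docstring promises None when "the number of topics does not match", but the snippet never performs that check. A sketch of the missing guard, assuming the requested count lives in a hypothetical self.num_topics attribute:

if os.path.isfile(self.file_path):
    model = LdaMulticore.load(self.file_path)
    if model.num_topics == self.num_topics:  # hypothetical attribute
        return model
return None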
Example #20
def load_all(modeldesc, sourcedesc):
    modelfilename = model_file('ldamodel-%s' % modeldesc)
    ldamodel = LdaMulticore.load(modelfilename)

    corpus = file_read_json(model_file('ldacorpus-%s.json' % modeldesc))

    prep_items = file_read_json(data_source_file(sourcedesc + '.prep.json'))

    sourcefilename = data_source_file(sourcedesc + '.csv')
    reader = csv_reader(sourcefilename)
    source_texts = {row['id']: row['text'] for row in reader}

    return ldamodel, corpus, prep_items, source_texts
Example #21
 def load_topicmodel(self, model_path):
     print("--- Loading Model ---\n")
     if self.algo == 'gensim':
         self.model_path = model_path
         self.ldamodel = LdaMulticore.load(model_path + "/model_obj",
                                           mmap='r')
         self.dictionary = Dictionary.load(model_path + "/dictionary_obj",
                                           mmap='r')
         self.num_topics = self.ldamodel.num_topics
     else:
         # pickle.load expects an open file object, not a path
         with open(model_path + "/model_obj.pk", 'rb') as f:
             self.ldamodel = pickle.load(f)
         with open(model_path + "/vectorizer_obj.pk", 'rb') as f:
             self.lda_vectorizer = pickle.load(f)
         self.num_topics = self.ldamodel.n_components
Example #22
 def __init__(self, lda_filepath, dictionary_filepath, cache_filepath=None):
     """Initialize the LDA wrapper.
     Args:
         lda_filepath: Filepath to the trained LDA model.
         dictionary_filepath: Filepath to the dictionary of the LDA.
         cache_filepath: Optional filepath to a shelve cache for the LDA results.
     """
     self.lda = LdaMulticore.load(lda_filepath)
     self.dictionary = gensim.corpora.dictionary.Dictionary.load(
         dictionary_filepath)
     self.cache_synch_prob = 2  # in percent, 1 to 100
     self.cache_filepath = cache_filepath
     self.cache = shelve.open(
         cache_filepath) if cache_filepath is not None else None
Example #23
 def loadModel(self, filename):
     self.util.logDebug('LDA', 'Loading model from ' + filename)
     self.model = LdaMulticore.load(fname=filename)
     self.dictionary = Dictionary.load(fname=filename + '.dict')
     self.corpus = MmCorpus(filename + '.corpus')
     print(self.dictionary)
     print(self.model.print_topic(0, topn=5))
     print(self.model.print_topic(1, topn=5))
     print(self.model.print_topic(2, topn=5))
     print(self.model.print_topic(3, topn=5))
     self.loaded = True
     self.util.logDebug('LDA',
                        'Model loaded in ' + self.util.stopTimeTrack())
     self.labelTopics(filename)
Example #24
def show_topics():
    """Shows all topics of the trained LDA model.
    May only be called after train_lda().
    """
    # load trained model
    lda_model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)

    # list the topics
    topics = lda_model.show_topics(num_topics=cfg.LDA_COUNT_TOPICS, num_words=10, log=False,
                                   formatted=True)

    print("List of topics:")
    for i, topic in enumerate(topics):
        # not adding topic to the tuple here prevents unicode errors
        print("%3d:" % (i,), topic)
Example #25
def main(coursesList):
    lda = LDA.load("./best_model.lda")
    dictionary = Dictionary.load("best_model.lda.id2word")
    bigrams = Phraser.load("./bigram_model.pkl")
    trigrams = Phraser.load("./trigram_model.pkl")
    text_clean = [doc.split(' ') for doc in coursesList['description']]
    corpus = [dictionary.doc2bow(text) for text in text_clean]
    create_vector_topics(lda, corpus, dictionary, coursesList)
    courses_topic = config.matrix_courses_topic.to_numpy()

    #lda, dictionary, bigrams, trigrams = create_LDA_model(coursesList)
    #courses_topic = config.matrix_courses_topic.to_numpy()

    cursor.execute("select id from auth_group")
    id_groups = cursor.fetchall()
    for i in id_groups:
        cursor.execute(
            "select distinct studyplan_id from students where group_id = %(id)s ",
            {'id': i[0]})
        studyplan_id = cursor.fetchall()
        for j in studyplan_id:
            subject_list = pd.DataFrame(columns=['id_subject', 'description'])
            subject_list = WordProcessing.word_processing(
                get_work_program(j[0], subject_list))
            #for k in subject_list:
            token_stud_prog = [
                program.split(' ') for program in subject_list['description']
            ]
            #token_stud_prog = add_n_grams(token_stud_prog, bigrams, trigrams)
            prog_corp = [
                dictionary.doc2bow(program) for program in token_stud_prog
            ]
            topic_prog = lda.get_document_topics(prog_corp)
            for l in range(0, len(topic_prog)):
                profile_student = np.zeros(config.num_lda_topic)
                dense_topic_prog = np.zeros(config.num_lda_topic)
                for m in topic_prog[l]:
                    dense_topic_prog[m[0]] += m[1]
                #mask = np.argsort(dense_topic_prog)[::-1][:1]
                #profile_student[mask] += 1
                profile_student = dense_topic_prog
                cosine_similarities = linear_kernel(
                    profile_student.reshape(1, -1), courses_topic).flatten()
                top_courses = np.where(cosine_similarities >= 0.2)[0]
                print(subject_list.loc[l, 'id_subject'])
                #print(top_courses)
                print(coursesList.loc[top_courses, 'name':'link'])
Example #26
    def visualizeLDA(self, filename):

        dictionary = Dictionary.load(filename + '.dict')
        corpus = MmCorpus(filename + '.corpus')
        lda = LdaMulticore.load(filename)
        self.util.logDebug('LDA', 'Preparing HTML ')
        ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        self.util.logDebug('LDA',
                           'HTML prepared in ' + self.util.stopTimeTrack())
        pyLDAvis.save_html(ldavis, filename + '.html')
        self.util.logDebug('LDA', 'HTML saved in ' + self.util.stopTimeTrack())


#
# lda = LDA(logfilename='/home/kah1/test.log')
# lda.loadModel('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# lda.labelTopics()
Example #27
def train_lda(corpus, dictionary, lda_model_filepath, num_topics,
              run_or_load_flag):
    if run_or_load_flag:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            # workers => sets the parallelism, and should be
            # set to your number of physical cores minus one
            lda = LdaMulticore(corpus,
                               num_topics=num_topics,
                               id2word=dictionary,
                               workers=3)
        lda.save(lda_model_filepath)
    else:
        lda = LdaMulticore.load(lda_model_filepath)

    return lda
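A hypothetical call: with run_or_load_flag=True the model is trained and saved to lda_model_filepath, with False it is loaded from there (the topic count is invented):

lda = train_lda(corpus, dictionary, 'lda.model',
                num_topics=50, run_or_load_flag=True)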
Example #28
    def __init__(self, examples, tokenizer, lda_vocab_path, lda_model_path,
                 args):
        self.data = examples
        self.tokenizer = tokenizer
        # add new special token
        self.spec_tokens = load_special_tokens(args)
        self.tokenizer.additional_special_tokens = self.spec_tokens
        self.tokenizer.add_tokens(self.spec_tokens)
        self.args = args
        self.item_vocab = load_item_vocab(args)
        self.lda_vocab = Dictionary.load(lda_vocab_path)
        self.lda_model = LdaMulticore.load(lda_model_path)

        self.sent_lim = [
            self.args.cp_sentNum, self.args.desc_sentNum,
            self.args.require_sentNum, self.args.benefit_sentNum
        ]
        self.text_fields = self.data[0]._fields[:4]
Example #29
def main():
    options = {
        'corpus_file': 'data\\origtweets_dtm.pkl',
        'id_file': 'data\\row_origtweets.csv',
        'model_file': 'data\\orig_10topics.lda',
        'meta_file': 'data\\origtweets_meta.csv',
        'output_file': 'data\\origtweets_topics.csv'
    }

    start_time = time.time()
    id_df = pd.read_csv(options['id_file'], usecols=['row'], dtype='float')
    meta_df = pd.read_csv(options['meta_file'])

    with open(options['corpus_file'], 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    lda = LdaMulticore.load(options['model_file'])

    if len(meta_df) != len(corpus):
        print('Warning: Some documents may have been deleted during processing.\n')
        print('metadata size - corpus size = ' + str(len(meta_df) - len(corpus)))

    topic_features = [to_dense(lda[bow], lda.num_topics) for bow in corpus]

    topic_colname = 'topic{0}'.format
    topic_colnames = [topic_colname(t+1) for t in range(lda.num_topics)]
    topic_df = pd.DataFrame.from_records(topic_features, columns=topic_colnames)
    with open('data\\topic_df.pkl', 'wb') as pkl_file:
        pickle.dump(topic_df, pkl_file)


    print('topic size - id size = ' + str(len(id_df) - len(topic_df)))
    if len(id_df) != len(topic_df):
        raise Exception('id_df and topic_df row counts do not match')

    topic_df = pd.concat([id_df, topic_df], axis=1)
    
    merged_df = pd.merge(meta_df, topic_df, on='row', how='right', sort=False)
    merged_df.to_csv(options['output_file'], index=False)

    end_time = time.time()
    print('running time: ' + str((end_time - start_time) / 60) + ' minutes')
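The to_dense helper is not shown anywhere in this example. A minimal sketch, assuming it densifies gensim's sparse (topic_id, probability) pairs into a fixed-length row (only the name and arguments come from the call above):

def to_dense(doc_topics, num_topics):
    # Topics below the model's probability threshold are absent
    # from doc_topics and stay at 0.0 in the dense vector.
    dense = [0.0] * num_topics
    for topic_id, prob in doc_topics:
        dense[topic_id] = prob
    return dense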
Example #30
 def build_model(self, fname=None, save_to=None):
     id2word = self.id2word or self.build_id2word()
     corpus = self.corpus or self.build_corpus()
     # read model.lda file
     if not fname:
         fname = click.prompt('model file name', type=str, default='model.lda')
     fname = self.__dest(fname)
     # if there is no model file or the user wants to rebuild, build .model
     if not os.path.isfile(fname) or click.confirm('There already is %s. Do you want to re-run lda?' % fname):
         num_procs = click.prompt('Number of processes to launch',
                                  type=int,
                                  default=multiprocessing.cpu_count())
         num_epochs = click.prompt('Number of epochs to run', type=int, default=20)
         num_topics = click.prompt('Number of topics', type=int, default=100)
         print('start building model')
         start = time()
         model = LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, workers=num_procs, passes=num_epochs)
         model.save(fname)  # persist the trained model
         print('building model takes: %s' % LdaUtils.human_readable_time(time() - start))
     self.model = LdaMulticore.load(fname)
     return self.model
Example #31
def get_topics():
    '''Computes distribution over topics for each abstract'''

    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore.load('lda.gensim')

    base = 'datasets/dspace'
    new_base = 'datasets/dspace_topics'
    for filename in tqdm(os.listdir(base)):
        path = os.path.join(base, filename)
        with open(path, 'r') as f:
            d = json.load(f)
            abstract = d['abstract']
            if abstract is not None:
                words = tokenize(abstract.split())
                bow = dictionary.doc2bow(words)
                topics = lda.get_document_topics(bow, minimum_probability=0)
                topics = to_vec(topics)
                d['topics'] = topics
                new_path = os.path.join(new_base, filename)
                with open(new_path, 'w') as new_f:
                    json.dump(d, new_f)
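Like to_dense in Example #29, the to_vec helper is assumed rather than shown. Since get_document_topics is called with minimum_probability=0, every topic id should be present in order, so a sketch can simply keep the probabilities:

def to_vec(topics):
    # topics is a list of (topic_id, probability) pairs covering
    # all topics; keep just the probabilities as a dense vector.
    return [float(prob) for _, prob in topics]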
Example #32
    def generate_lda_topics(self):
        from gensim.corpora import Dictionary, MmCorpus
        from gensim.models.ldamulticore import LdaMulticore
        import pyLDAvis
        import pyLDAvis.gensim
        import warnings
        import _pickle as pickle

        trigram_sentences = LineSentence(self.trigram_sentences_filepath)
        trigram_dictionary = Dictionary(trigram_sentences)
        # trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
        trigram_dictionary.compactify()
        trigram_dictionary.save(self.trigram_dictionary_filepath)

        def trigram_bow_generator(filepath):
            for sentence in LineSentence(filepath):
                yield trigram_dictionary.doc2bow(sentence)

        MmCorpus.serialize(
            self.trigram_bow_filepath,
            trigram_bow_generator(self.trigram_sentences_filepath))
        trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=3,
                               id2word=trigram_dictionary,
                               workers=3)
            lda.save(self.lda_model_filepath)
        lda = LdaMulticore.load(self.lda_model_filepath)
        lda.show_topic(0)
        lda.show_topic(1)
        lda.show_topic(2)
        LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                                  trigram_dictionary)
        pyLDAvis.save_html(LDAvis_prepared, self.LDAvis_html_filepath)
Example #33
if 0 == 1:  # deliberately skipped; flip this guard to re-train instead of loading

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=5,
                           id2word=trigram_dictionary,
                           workers=3)

    lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

explore_topic(topic_number=0)

topic_names = {
    0: 'looking_at_websites_for_info',
    1: 'doesnt_have_the_negative_exercise_effect',
    2: 'spend_time_looking_on_websites',
    3: 'games_and_information',
    4: 'bad_if_kids_spend_too_much_time'
}

topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')

with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)
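Reading the mapping back later is the mirror image (a sketch, reusing the same filepath):

with open(topic_names_filepath, 'rb') as f:
    topic_names = pickle.load(f)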
Example #34
 def load_model(self, path):
     self.lda_model = LdaMulticore.load(path)
Example #35
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(["coherence_values"], loc='best')
plt.show()
coherence_values

# for m, cv in zip(x, coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

# optimal_model = model_list[1]
# for i, row in enumerate(optimal_model[corpus]):
#     print(i, row)

# optimal_model.save('lda.model')
# model.save('lda.model')
model = LdaMulticore.load('lda.model')


model_topics = model.show_topics(formatted=False)
pprint(model.print_topics(num_words=10))

# def format_topics_sentences(ldamodel, corpus=corpus, texts=texts):
#     # Init output
#     sent_topics_df = pd.DataFrame()

#     # Get main topic in each document
#     for i, row in enumerate(ldamodel[corpus]):
#         row = sorted(row, key=lambda x: (x[1]), reverse=True)
#         # Get the Dominant topic, Perc Contribution and Keywords for each document
#         for j, (topic_num, prop_topic) in enumerate(row):
#             if j == 0:  # => dominant topic