Example #1
 def build_model(self, fname=None, save_to=None):
     id2word = self.id2word or self.build_id2word()
     corpus = self.corpus or self.build_corpus()
     # read model.lda file
     if not fname:
         fname = click.prompt('model file name',
                              type=str,
                              default='model.lda')
     fname = self.__dest(fname)
     # if there is no model file or the user wants to rebuild, build .model
     if not os.path.isfile(fname) or click.confirm(
             '%s already exists. Do you want to re-run LDA?' % fname):
         num_procs = click.prompt('Number of processes to launch',
                                  type=int,
                                  default=multiprocessing.cpu_count())
         num_epochs = click.prompt('Number of epochs to run',
                                   type=int,
                                   default=20)
         num_topics = click.prompt('Number of topics',
                                   type=int,
                                   default=100)
         print('start building model')
         start = time()
         model = LdaMulticore(corpus,
                              id2word=id2word,
                              num_topics=num_topics,
                              workers=num_procs,
                              passes=num_epochs)
         model.save(fname)
         print('building model took: %s' % LdaUtils.human_readable_time(
             time() - start))
     self.model = LdaMulticore.load(fname)
     return self.model
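
LdaUtils.human_readable_time is an external helper that is not shown here. A minimal sketch of one plausible implementation, assuming it formats an elapsed duration given in seconds:

def human_readable_time(seconds):
    # split an elapsed duration in seconds into hours/minutes/seconds
    m, s = divmod(int(seconds), 60)
    h, m = divmod(m, 60)
    return '%dh %02dm %02ds' % (h, m, s)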
Example #2
 def create_lda_model(self,
                      no_topics=10,
                      random_state=42,
                      passes=5,
                      alpha='auto',
                      eta=None,
                      workers=None,
                      chunksize=2000):
     """
     :param no_topics: Number of topics that are to be explored by lda model
     :param random_state: Random state for reproducible results (default 42, gensim default is None)
     :param passes: Number of times the whole corpus is processed.
     :param alpha: set topic-document distribution prior alpha to "symmetric" or "asymmetric"
     (gensim default is "symmetric")
     :param eta: Word-topic distribution prior eta (beta)
     :param workers: number of workers to use. Defaulting to one as there seems to be a bug in gensim. 1 already
     uses all available cores. Higher number of workers results in a load bigger than the number of cores.
     :param chunksize: chunsize parameter of gensim
     """
     if eta is None:
         eta = 1 / no_topics
     if workers is None:
         workers = self.processes
     if self.bag_of_words is None:
         self.create_bag_of_words()
     self.lda_model = LdaMulticore(corpus=self.bag_of_words,
                                   id2word=self.id2word,
                                   num_topics=no_topics,
                                   eta=eta,
                                   workers=workers,
                                   random_state=random_state,
                                   alpha=alpha,
                                   passes=passes,
                                   chunksize=chunksize)
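
create_lda_model is written as a method of a wrapper class that owns bag_of_words, id2word and processes. A hedged usage sketch, with TopicPipeline as a hypothetical name for that owner class:

pipeline = TopicPipeline(documents)  # hypothetical owner class, not shown in the snippet
pipeline.create_lda_model(no_topics=20, passes=10, workers=1)
for topic in pipeline.lda_model.print_topics(num_words=8):
    print(topic)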
Example #3
    def __init__(self, corpora, num_topics, print_topics=True):
        self.num_topics = num_topics

        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()
        self.stemmer = nltk.stem.snowball.RussianStemmer()

        corpora_tokenized = [
            self.tokenizer.tokenize(
                (self._keep_only_russian_chars(str(doc).lower())))
            for doc in corpora
        ]

        corpora_stemmed = []
        for doc in corpora_tokenized:
            # drop stopwords both before and after stemming, since a stem
            # may itself collide with a stopword
            stemmed_doc = [
                self.stemmer.stem(token) for token in doc
                if token not in ru_stopwords
            ]
            stemmed_doc = [
                token for token in stemmed_doc if token not in ru_stopwords
            ]
            corpora_stemmed.append(stemmed_doc)

        self.dictionary = gensim.corpora.Dictionary(corpora_stemmed)
        corpora_bow = [self.dictionary.doc2bow(doc) for doc in corpora_stemmed]
        # self.tfidf = gensim.models.TfidfModel(corpora_bow)
        # corpora_tfidf = self.tfidf[corpora_bow]

        self.lda = LdaMulticore(num_topics=self.num_topics,
                                corpus=corpora_bow,
                                id2word=self.dictionary)

        if print_topics:
            for s in self.lda.print_topics():
                print(s)
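
The helper _keep_only_russian_chars is not defined in this snippet. A minimal sketch, assuming it blanks out everything outside lowercase Cyrillic (the text is lower-cased before the call):

import re

def _keep_only_russian_chars(self, text):
    # replace every character that is not a lowercase Cyrillic letter or a space
    return re.sub(r'[^а-яё ]', ' ', text)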
Example #4
    def get_model(self,
                  n_topics=50,
                  n_workers=6,
                  recalculate=False,
                  from_scratch=True):

        filepath = self.paths.get_lda_filepath(n_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError(
                    'No LDA file exists but from_scratch is False')

            trigram_dictionary = self.get_corpus_dict()
            trigram_bow_corpus = self.get_trigram_bow_corpus(
                trigram_dictionary)

            print('Building LDA model...')
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=n_topics,
                               id2word=trigram_dictionary,
                               workers=n_workers)

            lda.save(filepath)
            print('LDA model (n_topics={}) written to {}'.format(
                n_topics, filepath))
        else:
            print('Loading LDA model (n_topics={})...'.format(n_topics))
            lda = LdaMulticore.load(filepath)

        return lda
Example #5
def createlda(num_topics, filename):
    dumppick(filename)
    texts, texts_tf_idf, dictionary = loadpcik()
    # Case: topic modelling with LSI (kept for reference)
    """
    print("**************LSI*************")
    lsi = models.lsimodel.LsiModel(corpus=texts, id2word=dictionary, num_topics=20)    # initialise an LSI transformation
    texts_lsi = lsi[texts_tf_idf]                # transform the corpus in the LSI vector space
    print(lsi.print_topics(num_topics=20, num_words=10))
    """
    # Case: topic modelling with LDA
    print("**************LDA*************")
    #ppl = []
    #for i in range(1,50,1):
    #texts = shuffle(texts)
    #texts_train = texts[:int(24012*(0.9))]
    #texts_vad = texts[int(24012*(0.9)):]
    lda = LdaMulticore(corpus=texts,
                       iterations=1000,
                       id2word=dictionary,
                       num_topics=num_topics,
                       passes=200,
                       per_word_topics=True)
    #texts_lda = lda[texts_tf_idf]
    out = open("./ldamd/{}tpc-tpc".format(num_topics),
               mode="w",
               encoding="utf8")
    print(lda.print_topics(num_topics=num_topics, num_words=10), file=out)
    lda.save("./ldamd/{}tpc+{}".format(num_topics, filename[9:18]))
    #ppl.append(np.exp2(-lda.log_perplexity(texts_vad))/i)
    return lda, texts, texts_tf_idf, dictionary
Example #6
def create_LDA_dict():
    # one-time use: create and save the LDA dictionary and model
    trigram_dictionary_filepath = '../Dataset/trigram_dict_all.dict'
    trigram_reviews = LineSentence(
        '../Dataset/trigram_transformed_reviews_all.txt')
    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
    print('LDA dict saved.')
    trigram_bow_filepath = '../Models/trigram_bow_corpus_all.mm'
    MmCorpus.serialize(
        trigram_bow_filepath,
        trigram_bow_generator(
            '../Dataset/trigram_transformed_reviews_all.txt'))
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    lda_model_filepath = '../Models/lda_model_all'  #lda_model_all_30, lda_model_10topic
    # created LDA model with 10, 30, 50 topics, found 30 has best result
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(
            trigram_bow_corpus,
            num_topics=30,  #10, 30, 50
            id2word=trigram_dictionary,
            workers=8)
    lda.save(lda_model_filepath)
    print('LDA model saved.')
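
trigram_bow_generator is called here (and again in Example #23) but never defined. A minimal sketch, assuming it streams one bag-of-words vector per line of the transformed-reviews file so that MmCorpus.serialize never has to hold the whole corpus in memory; trigram_dictionary must be available in the enclosing scope:

def trigram_bow_generator(filepath):
    # stream the reviews and convert each one to BOW with the learned dictionary
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)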
Example #7
def train_topics(datap):
    # build or load topic model
    dictp = './topic/twitter.dict'
    tmodelp = './topic/twitter.model'
    text_idx = 2  # the column idx of tweets in tsv file
    topic_ns = [5, 10, 15, 20]  # number of topics, to choose the best topic k

    # load corpus
    corpus = []
    with open(datap) as dfile:
        dfile.readline()
        for line in dfile:
            line = line.strip().split('\t')
            corpus.append(line[text_idx].split())

    print('Build Dictionary for topic model...')
    if os.path.exists(dictp):
        dictionary = Dictionary.load(dictp)
    else:
        dictionary = Dictionary(corpus)
        dictionary.save(dictp)

    print('Training Topic Models......')
    if os.path.exists(tmodelp):
        best_m = LdaModel.load(tmodelp)
    else:
        # document to indices
        doc_matrix = [dictionary.doc2bow(doc) for doc in corpus]

        # find the best number of topics
        best_s = -10
        best_m = None
        for idx in range(len(topic_ns)):
            print('Trying topic number: ', topic_ns[idx])
            ldamodel = LdaMulticore(doc_matrix,
                                    id2word=dictionary,
                                    num_topics=topic_ns[idx],
                                    passes=1000,
                                    alpha='symmetric',
                                    eta=None)

            cm = CoherenceModel(
                model=ldamodel,
                corpus=doc_matrix,
                coherence='c_npmi',
                texts=corpus,
            )
            if cm.get_coherence() > best_s:
                best_s = cm.get_coherence()
                best_m = ldamodel
                best_m.save(tmodelp)

            print(
                'Topic number ' + str(topic_ns[idx]) + \
                ', get coherence: ' + str(cm.get_coherence())
            )

        del corpus  # release memory
    return dictionary, best_m
Example #8
def createLDAModel(docs, dictionary, num_topics = 100, iterations = NUM_ITERATIONS, 
                   passes = NUM_PASSES, workers = 3, output = 'lda_model'):
    """Creates the LDA model for the given documents.  

    Args:
        docs (lst): List of tokenized documents
        dictionary (lst): The dictionary
        num_topics (int): The number of topics to discover
        iterations (int): The number of iterations of the LDA method
        passes (int): The number of passes of the LDA method
        workers (int): The number of workers employed in the creation of the model
        output (str): Prefix used to store the model in a set of files

    Returns:
        ldamodel: The LDA model
    """
    
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in docs]
    
    # generate LDA model
    ldamodel = LdaMulticore(corpus, id2word = dictionary, num_topics = num_topics, 
                            iterations = iterations, passes = passes, workers = workers)
    ldamodel.save(output + '_i' + str(iterations) + '_p' + str(passes) + '_T' + str(num_topics) + '.lda')
    
    return ldamodel
Example #9
def fit_LdaMulticore(gensim_df,
                     id2word,
                     num_topics,
                     alpha,
                     workers=None,
                     passes=1,
                     iterations=1000,
                     update_every=1000,  # unused: LdaMulticore has no update_every parameter
                     chunksize=1000,
                     minimum_topic_probability=0.05,
                     forget_weight=0.5,
                     random_state=0):
    model = LdaMulticore(
        corpus=gensim_df,
        id2word=id2word,
        num_topics=num_topics,
        alpha=alpha,
        workers=workers,
        passes=passes,  # epochs
        iterations=iterations,
        chunksize=chunksize,  #batch size
        minimum_probability=minimum_topic_probability,
        decay=forget_weight,
        per_word_topics=True,
        random_state=random_state)
    return model
Example #10
    def fit(self, X, y=None):
        tokens = list(X[self.token_column].values)
        dictionary = corpora.Dictionary(tokens)
        self.dictionary = dictionary
        self.dictionary.filter_extremes(
            no_below=self.no_below,
            no_above=self.no_above,
            keep_n=1000000,
        )
        print('Number of unique tokens after filtering for LDA: %d' %
              len(dictionary))
        if not self.inplace:
            X = X.copy()
        X['bow'] = X[self.token_column].apply(dictionary.doc2bow)
        from gensim.models.ldamulticore import LdaMulticore
        eval_every = int(self.iterations / 20) + 1
        temp = dictionary[0]  # force gensim to build dictionary.id2token (it is lazy)
        id2word = dictionary.id2token
        corpus = list(X['bow'].values)

        model = LdaMulticore(corpus=corpus, id2word=id2word, chunksize=750,
                             eta='auto',
                             iterations=self.iterations, num_topics=self.num_topics,
                             passes=self.passes, eval_every=eval_every, workers=self.cpus)
        self.model = model
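
The seemingly useless temp = dictionary[0] line above is a standard gensim idiom: Dictionary.id2token is built lazily and stays empty until the dictionary is indexed at least once. A short demonstration:

from gensim.corpora import Dictionary

d = Dictionary([['a', 'b'], ['b', 'c']])
print(d.id2token)  # {} -- the mapping has not been built yet
_ = d[0]           # any lookup triggers the id2token build
print(d.id2token)  # {0: 'a', 1: 'b', 2: 'c'} (exact ids may vary)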
Example #11
def lda_trainer(sentences, modelPath=None, nb_topics=190, multicore=False):
    '''
    @return: lda_model: the LDA model trained by gensim; dictionary: dictionary of all terms in the model
    '''

    # load doc2bow
    dictionary = corpora.Dictionary(sentences)
    print('finish load dictionary!')
    corpus = [dictionary.doc2bow(text) for text in sentences]
    print('finish load doc2bow corpus!')
    # train lda_model model
    print('training lda_model model...')
    if multicore:
        # Linux-only and CPU-intensive; use with caution
        lda_model = LdaMulticore(corpus=corpus,
        lda_model = LdaMulticore(corpus=corpus,
                                 num_topics=nb_topics,
                                 id2word=dictionary)
    else:
        lda_model = LdaModel(corpus=corpus,
                             num_topics=nb_topics,
                             id2word=dictionary)
    print('finished lda_model model training, nb terms: %d' %
          lda_model.num_terms)

    # save lda_model model on disk
    if modelPath is not None:
        lda_model.save(fname=modelPath)
        dictionary.save(fname_or_handle=modelPath.replace('.topic', '.dict'))
        print('LDA model & dictionary saved to {0} (.dict)'.format(modelPath))

    return lda_model, dictionary
Example #12
def train_lda(corpus, params, dictionary):
    """Train LDA model according to provided params"""
    # Set training parameters.
    num_topics = params.num_topics
    chunksize = params.chunksize
    passes = params.passes
    iterations = params.iterations
    decay = params.decay
    offset = params.offset

    # Make an index to word dictionary.
    logging.info("Mapping ids to words...")
    temp = dictionary[0]
    id2word = dictionary.id2token
    logging.info("Done mapping ids to words.")

    logging.info("Making the LDA model...")
    lda = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        workers=3,  # Allows algorithm to run more efficiently
        chunksize=chunksize,
        alpha='asymmetric',  # if low, each document is represented by only a few topics
        eta='auto',  # if low, each topic is represented by only a few words
        decay=decay,
        offset=offset,
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=None,
        random_state=230,
        per_word_topics=True)
    logging.info("Done making the LDA model.")
    return lda
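
train_lda expects a params object exposing six attributes. A minimal sketch using a dataclass, with illustrative values only:

from dataclasses import dataclass

@dataclass
class LdaParams:
    num_topics: int = 20
    chunksize: int = 2000
    passes: int = 5
    iterations: int = 400
    decay: float = 0.5
    offset: float = 64.0

lda = train_lda(corpus, LdaParams(), dictionary)  # corpus and dictionary as built elsewhere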
Example #13
def pipeline_lda(que: pd.DataFrame,
                 dim: int) -> (Dictionary, TfidfModel, LdaMulticore):
    """
    Pipeline for training embeddings for questions via LDA algorithm
    on question titles and bodies

    :param que: raw questions.csv dataset
    :param dim: number of LDA topics, i.e. the dimension of the question embeddings
    :return: trained Dictionary, TfidfModel and LdaMulticore model
    """
    lda_tokens = que['questions_whole'].apply(lambda x: x.split())

    # create Dictionary and train it on text corpus
    lda_dic = Dictionary(lda_tokens)
    lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000)
    lda_corpus = [lda_dic.doc2bow(doc) for doc in lda_tokens]

    # create TfidfModel and train it on text corpus
    lda_tfidf = TfidfModel(lda_corpus)
    lda_corpus = lda_tfidf[lda_corpus]

    # create LDA Model and train it on text corpus
    lda_model = LdaMulticore(lda_corpus,
                             num_topics=dim,
                             id2word=lda_dic,
                             workers=4,
                             passes=20,
                             chunksize=1000,
                             random_state=0)

    return lda_dic, lda_tfidf, lda_model
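
The three returned objects can embed a new question by chaining the same transformations used during training. A hedged usage sketch, given the raw questions dataframe que:

lda_dic, lda_tfidf, lda_model = pipeline_lda(que, dim=20)
bow = lda_dic.doc2bow('how do i become a data scientist'.split())
embedding = lda_model[lda_tfidf[bow]]  # list of (topic_id, probability) pairs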
Example #14
def test_lda_streaming():
    # materialise the token stream: a plain generator would be exhausted after building the Dictionary
    documents = list(token_stream(NOVELS_DIRPATH))
    dictionary = Dictionary(documents)
    bags_of_words = [dictionary.doc2bow(tokens) for tokens in documents]
    lda = LdaMulticore(corpus=bags_of_words,
                       id2word=dictionary,
                       random_state=723812,
                       passes=10,
                       workers=4)
    parsed_topics = parse_topics(lda)
    assert len(parsed_topics) == 20
    #print(parsed_topics[0])
    #assert parsed_topics[0] == {'upshe': 0.001, 'jane': 0.001, 'think': 0.001, 'regular': 0.001, 'facile': 0.001, 'her': 0.001, 'power': 0.001, 'intimate': 0.001, 'saythat': 0.001, 'manyacquaintance': 0.001}
    #assert parsed_topics[0] == {'quite': 0.001, 'pomps': 0.001, 'inutility': 0.001, 'counts': 0.001, 'brought': 0.001, 'repent': 0.001, 'dayabout': 0.001, 'professor': 0.001, 'upward': 0.001, 'been': 0.001}
    # results not repeatable unless you set the random_state param!
    assert parsed_topics[0] == {
        'doctor': 0.001,
        'companion': 0.001,
        'lucky': 0.001,
        'somewhat': 0.001,
        'ofchildhood': 0.001,
        'rub': 0.001,
        'idea': 0.001,
        'pleasure': 0.001,
        'ofexistence': 0.001,
        'disposition': 0.001
    }
Example #15
    def train(self):
        split_archives = [article.tokens for article in self.articles]

        # create dictionary and corpus
        dictionary = corpora.Dictionary(split_archives)
        dictionary.filter_extremes(no_above=self.words_no_above)
        corpus = [dictionary.doc2bow(article) for article in split_archives]
        logger.info('Created dictionary and corpus')

        # get eta to force topics
        eta = get_eta(self.num_topics, dictionary)

        # create lda model with gensim
        lda_progress = LDAProgress(self.passes)
        ldamodel = LdaMulticore(corpus,
                                num_topics=self.num_topics,
                                id2word=dictionary,
                                passes=self.passes,
                                per_word_topics=True,
                                iterations=self.iterations,
                                eta=eta,
                                workers=cpu_count())
        lda_progress.close()

        logger.info('Created Topics model')

        # print the topics (debug)
        logger.debug('Topics:')
        topics = ldamodel.print_topics(num_words=5)
        for topic in topics:
            logger.debug(topic)
        self.model = ldamodel
        self.dictionary = dictionary
Example #16
def lda(corpus, num_topics=5, save_as=None, load=None, verbose=True):
    module_path = os.path.dirname(__file__)
    model_path = module_path + "/models"

    if verbose:
        print("prepare data")
    corpus = corpus.apply(lambda x: x.split(" "))
    dictionary = Dictionary(corpus)
    bow = [dictionary.doc2bow(doc) for doc in corpus]

    if isinstance(load, str):
        if verbose:
            print("loading lda")
        lda = LdaMulticore.load(model_path + "/" + load)
    else:
        if verbose:
            print("training lda")
        # pass id2word so saved/visualised topics show words rather than ids
        lda = LdaMulticore(bow, num_topics=num_topics, id2word=dictionary)
        if save_as:
            os.makedirs(model_path, exist_ok=True)

            lda.save(model_path + "/" + save_as)
    if verbose:
        print("generate visualization")
    vis = pyLDAvis.gensim.prepare(lda, bow, dictionary)
    return lda, vis
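
The returned vis object is a standard pyLDAvis PreparedData value and can be saved as a standalone interactive page. A short usage sketch, assuming texts is a pandas Series of space-separated documents:

model, vis = lda(texts, num_topics=5, save_as='my_lda')
pyLDAvis.save_html(vis, 'lda_vis.html')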
Example #17
 def learn_lda_model(self, corpus, dictionary, k, iterations=100):
     """
     learning LDA model
     :param corpus: corpus created by gensim
     :param dictionary: dictionary created by gensim
     :param k: number of topics
     :param iterations: number of iterations
     :return:
     """
     if not self.use_mallet:
         lda = LdaMulticore(corpus,
                            id2word=dictionary,
                            workers=self.cpu_count,
                            num_topics=k,
                            random_state=42,
                            iterations=iterations,
                            per_word_topics=False,
                            eval_every=None)
     else:
         lda = LdaMallet(self.path_to_mallet_binary,
                         corpus=corpus,
                         id2word=dictionary,
                         workers=self.cpu_count,
                         num_topics=k,
                         random_seed=42,
                         iterations=iterations,
                         optimize_interval=10)
     cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
     coherence = cm.get_coherence()
     print('{}: {}'.format(k, coherence))
     return coherence, lda
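
Because learn_lda_model returns (coherence, lda), sweeping the number of topics reduces to a comprehension. A hedged sketch, assuming the enclosing object is available as trainer:

results = {k: trainer.learn_lda_model(corpus, dictionary, k) for k in (10, 20, 50)}
best_k = max(results, key=lambda k: results[k][0])  # u_mass: closer to zero is better
best_lda = results[best_k][1]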
Example #18
def train_classifier(papers: list, num_topics: int) -> tuple:
    """
    Trains the Lda model with selected documents.
    Training is done by cleaning the documents, index the words
    and train the model with a given number of topics
    Args:
        papers: list of papers, each item containing the corpus of a document
        num_topics: amount of topics that need to be trained

    Returns: Trained LDA model and the dictionary used to train it

    """
    papers_clean = [clean(paper) for paper in papers]
    dictionary = corpora.Dictionary(papers_clean)
    doc_term_matrix = [dictionary.doc2bow(paper) for paper in papers_clean]
    models = []
    print("Start generating models")
    for x in range(13, 14):  # note: currently sweeps a single candidate topic count; num_topics is unused
        ldamodel = LdaMulticore(doc_term_matrix, num_topics=x, id2word=dictionary, passes=50)
        topic_words = [w[0] for x in range(ldamodel.num_topics) for w in ldamodel.show_topic(x)]
        unique_words = set(topic_words)
        models.append(ldamodel)
        print(x, len(unique_words), len(unique_words)/float(len(topic_words)))
    x = 1
    while True:
        try:
            x = int(input("Enter the model you want to train labels for:\n"))
        except ValueError:
            print("not an integer")
            continue
        if x > len(models) or x < 1:
            print("Model does not exist")
        else:
            break
    return models[x-1], dictionary
Example #19
 def run_model(self,
               collection_name,
               num_topics,
               save_dir=None,
               save_file=None,
               alpha=0.1,
               beta=0.01,
               iterations=800,
               passes=1):
     model = LdaMulticore(corpus=self.corpus,
                          id2word=self.dictionary,
                          num_topics=num_topics,
                          alpha=alpha,
                          eta=beta,
                          iterations=iterations,
                          passes=passes)
     if save_dir is None:
         save_dir = Constants.SAVE_DIR.format(
             collection_name.lower().replace(' ', '_'))
     if not os.path.isdir(save_dir):
         os.makedirs(save_dir)
     if save_file is None:
         save_file = Constants.SAVE_FILE_FORMAT.format(
             collection_name.lower().replace(' ', '_'), num_topics, alpha,
             beta, iterations)
     logging.info(save_dir)
     model.save(os.path.join(save_dir, save_file))
     return model
Example #20
 def infer_model(self, timeline: dict, exec_key, verbose: bool = False):
     bow, dictionary = self.prepare_data(timeline)
     if self.__class__.model_is_already_inferred(timeline, exec_key):
         logger.info('Model is already inferred')
         model = pickle.loads(timeline['models'][exec_key])
     else:
         logger.info('Inferring LDA...')
         try:
             model = LdaMulticore(
                 bow,
                 id2word=dictionary,
                 num_topics=self.n_topics,
                 passes=self.n_passes,
                 random_state=0,
             )
         except ValueError as e:
             error = 'cannot compute LDA over an empty collection (no terms)'
             if str(e) == error:
                 logger.error(
                     'Cannot compute LDA; there are not enough terms. '
                     'Maybe you need to decrease the LDA_MIN_DF setting')
             return None
     if verbose:
         self.print_terms(model)
     self.generate_html(model, bow, dictionary, timeline['user'])
     return model
Example #21
def generate_tags(tokens: list) -> list:
    """Perform LDA Topic Modelling to aquire tags.

    Args:
        tokens (list): List of tokens

    Returns:
        tags_list (list) List of appropriate tags for
        given tokens.
    """
    id2word = Dictionary(tokens)
    corpus = [id2word.doc2bow(d) for d in tokens]
    model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        random_state=42,
        num_topics=10,
        passes=2,
        workers=1
    )
    words = [re.findall(r'"([^"]*)"', t[1]) for t in model.print_topics()]
    # count word occurrences across the first five of the ten topics
    wordcount = Counter(words[0] + words[1] + words[2] + words[3] + words[4])
    tags = pd.DataFrame.from_dict(
        wordcount, orient='index', columns=['number']
    )
    tags = tags.drop(tags[tags['number'] <= 1].index)
    tags = tags.sort_values(by=['number'], ascending=False).T
    tags_list = [word for word in tags.columns]
    return tags_list
Example #22
    def train(self,
              corpus,
              dictionary,
              num_topics=5,
              per_word_topics=False,
              passes=30,
              workers=4,
              iterations=10,
              chunksize=200,
              save=False):

        model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             per_word_topics=per_word_topics,
                             minimum_phi_value=0.005,
                             passes=passes,
                             workers=workers,
                             iterations=iterations,
                             chunksize=chunksize,
                             random_state=93)
        if save:
            self.save_model(lda_model=model)

        return model
Example #23
def guidedLDA_Model(topics, cores=11):
    """
    Topics represents desired LDA topics,
    cores should be physical cores minus one.
    Both should be integers.
    """

    # load finished dictionary from disk
    trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize('./models2/trigram_bow_corpus.nm',
                       trigram_bow_generator('./models2/trigram_transformed_reviews.txt'))

    # load finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')


    # Pass the bag-of-words matrix and Dictionary from previous steps to LdaMulticore as inputs,
    # along with the number of topics the model should learn

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=topics,
                       id2word=trigram_dictionary,
                       workers=cores)

    lda.save('./models2/lda_model')

    # load the finished LDA model from disk
    #lda = LdaMulticore.load('./models/lda_model_neg')

    return trigram_bow_corpus, lda
Example #24
 def train(self, num_topics, passes, iterations, workers):
     ldamodel = LdaMulticore(self.corpus,
                             num_topics=num_topics,
                             id2word=self.dictionary,
                             passes=passes,
                             workers=workers,
                             iterations=iterations)
     self.model = ldamodel
Example #25
def train_lda(args):
    print "[LDA > n_topics: %d ]" % args.dim
    lda_reader = LDAReader(args.ds, max_sent=args.max_sent)
    ldazito = LdaMulticore(lda_reader,
                           id2word=lda_reader.idx2wrd,
                           num_topics=args.dim,
                           workers=args.workers)
    ldazito.save(args.out)
Example #26
 def _train_model(self):
     self._lda = LdaMulticore(corpus=self._corpus,
                              id2word=self._id2word,
                              num_topics=self.num_topics,
                              workers=1,
                              chunksize=10000,
                              passes=1)
     self._save_model()
Example #27
def fit_lda(X, vocab, num_topics=5, passes=20):
    """ Fit LDA from a scipy CSR matrix (X), with documents as rows and
    vocab mapping column indices to words. """
    # requires: from gensim import matutils
    print('fitting lda...')
    return LdaMulticore(matutils.Sparse2Corpus(X, documents_columns=False),
                        num_topics=num_topics,
                        id2word=dict(enumerate(vocab)),
                        passes=passes,
                        eval_every=5,
                        workers=5)
Example #28
 def train(self, vecs):
     """
     Build the topic model.
     """
     corp = Scipy2Corpus(vecs)
     self.m = LdaMulticore(corp,
                           num_topics=self.n_topics,
                           iterations=1000,
                           workers=3)
Example #29
def get_lda_model(index_tokens, num_topics=6, passes=3):
    print('Getting gensim LDA topic model')
    dictionary = gensim.corpora.Dictionary(index_tokens)
    corpus = [dictionary.doc2bow(text) for text in index_tokens]
    lda_model = LdaMulticore(corpus,
                             num_topics=num_topics,
                             id2word=dictionary,
                             passes=passes)
    return lda_model, dictionary, corpus
Example #30
    def topics_by_lda(self,
                      tokenized_corpus_path,
                      num_topics=20,
                      num_words=10,
                      max_lines=10000,
                      split=r"\s+",
                      max_df=100):
        """
        Read a pre-tokenized corpus file and train an LDA model on it.

        Arguments:
        tokenized_corpus_path -> string -- path to the tokenized corpus file
        num_topics -> integer -- number of topics
        num_words -> integer -- number of words shown per topic
        max_lines -> integer -- maximum number of lines to read in
        split -> string -- separator between words within a document
        max_df -> integer -- filter out words occurring in more than this many documents, to avoid overly common terms
        """

        # holds the entire corpus
        corpus = []

        with open(tokenized_corpus_path, 'r',
                  encoding='utf-8') as tokenized_corpus:

            flag = 0

            for document in tokenized_corpus:

                # stop once enough lines have been read
                if (flag > max_lines):
                    break

                # add the parsed document to the corpus
                corpus.append(re.split(split, document))

                flag = flag + 1

        # build the BOW representation of the corpus
        (vocab, DTM) = self.corpus2dtm(corpus, max_df=max_df)

        # train the LDA model

        lda = LdaMulticore(matutils.Sparse2Corpus(DTM,
                                                  documents_columns=False),
                           num_topics=num_topics,
                           id2word=dict([(i, s) for i, s in enumerate(vocab)]),
                           workers=4)

        # print the topic data
        topics = lda.show_topics(num_topics=num_topics,
                                 num_words=num_words,
                                 formatted=False,
                                 log=False)

        for ti, topic in enumerate(topics):
            print("Topic", ti, ":", " ".join(word[0] for word in topic[1]))