Example 1
    def get_model(self,
                  n_topics=50,
                  n_workers=6,
                  recalculate=False,
                  from_scratch=True):

        filepath = self.paths.get_lda_filepath(n_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError(
                    'No LDA file exists but from_scratch is False')

            trigram_dictionary = self.get_corpus_dict()
            trigram_bow_corpus = self.get_trigram_bow_corpus(
                trigram_dictionary)

            print('Building LDA model...')
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=n_topics,
                               id2word=trigram_dictionary,
                               workers=n_workers)

            lda.save(filepath)
            print('LDA model (n_topics={}) written to {}'.format(
                n_topics, filepath))
        else:
            print('Loading LDA model (n_topics={})...'.format(n_topics))
            lda = LdaMulticore.load(filepath)

        return lda
Example 2
def create_LDA_dict():
    # ONE-TIME USE: create and save the LDA model
    trigram_dictionary_filepath = '../Dataset/trigram_dict_all.dict'
    trigram_reviews = LineSentence(
        '../Dataset/trigram_transformed_reviews_all.txt')
    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
    print('LDA dict saved.')
    trigram_bow_filepath = '../Models/trigram_bow_corpus_all.mm'
    MmCorpus.serialize(
        trigram_bow_filepath,
        trigram_bow_generator(
            '../Dataset/trigram_transformed_reviews_all.txt'))
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    lda_model_filepath = '../Models/lda_model_all'  #lda_model_all_30, lda_model_10topic
    # tried LDA models with 10, 30 and 50 topics; 30 gave the best result
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(
            trigram_bow_corpus,
            num_topics=30,  #10, 30, 50
            id2word=trigram_dictionary,
            workers=8)
    lda.save(lda_model_filepath)
    print('LDA model saved.')
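Several of these examples call a trigram_bow_generator helper without defining it; Example 30 below defines it inline. A minimal standalone sketch, assuming one whitespace-tokenized review per line and a trigram_dictionary already in scope:

from gensim.models.word2vec import LineSentence

def trigram_bow_generator(filepath):
    # stream reviews one line at a time and convert each to bag-of-words
    # using the previously built trigram_dictionary
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)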
Example 3
def createlda(num_topics, filename):
    dumppick(filename)
    texts, texts_tf_idf, dictionary = loadpcik()
    # Using LSI for topic classification
    """
    print("**************LSI*************")
    lsi = models.lsimodel.LsiModel(corpus=texts, id2word=dictionary, num_topics=20)    # initialize an LSI transformation
    texts_lsi = lsi[texts_tf_idf]                # transform the corpus in the LSI vector space
    print(lsi.print_topics(num_topics=20, num_words=10))
    """
    # Using LDA for topic classification
    print("**************LDA*************")
    #ppl = []
    #for i in range(1,50,1):
    #texts = shuffle(texts)
    #texts_train = texts[:int(24012*(0.9))]
    #texts_vad = texts[int(24012*(0.9)):]
    lda = LdaMulticore(corpus=texts,
                       iterations=1000,
                       id2word=dictionary,
                       num_topics=num_topics,
                       passes=200,
                       per_word_topics=True)
    #texts_lda = lda[texts_tf_idf]
    out = open("./ldamd/{}tpc-tpc".format(num_topics),
               mode="w",
               encoding="utf8")
    print(lda.print_topics(num_topics=num_topics, num_words=10), file=out)
    out.close()
    lda.save("./ldamd/{}tpc+{}".format(num_topics, filename[9:18]))
    #ppl.append(np.exp2(-lda.log_perplexity(texts_vad))/i)
    return lda, texts, texts_tf_idf, dictionary
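The dumppick and loadpcik helpers (the misspelling is the original identifier) are not shown in Example 3. A plausible sketch, assuming they pickle and unpickle the (texts, texts_tf_idf, dictionary) triple; the pickle path and the one-tokenized-document-per-line input format are assumptions:

import pickle
from gensim import corpora, models

PICKLE_PATH = "./ldamd/corpus.pkl"  # hypothetical location

def dumppick(filename):
    # read one whitespace-tokenized document per line, build the dictionary,
    # the bag-of-words corpus and a TF-IDF view, then pickle all three
    with open(filename, encoding="utf8") as f:
        docs = [line.split() for line in f]
    dictionary = corpora.Dictionary(docs)
    texts = [dictionary.doc2bow(doc) for doc in docs]
    tfidf = models.TfidfModel(texts)
    texts_tf_idf = [tfidf[doc] for doc in texts]
    with open(PICKLE_PATH, "wb") as f:
        pickle.dump((texts, texts_tf_idf, dictionary), f)

def loadpcik():
    with open(PICKLE_PATH, "rb") as f:
        return pickle.load(f)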
Example 4
 def run_model(self,
               collection_name,
               num_topics,
               save_dir=None,
               save_file=None,
               alpha=0.1,
               beta=0.01,
               iterations=800,
               passes=1):
     model = LdaMulticore(corpus=self.corpus,
                          id2word=self.dictionary,
                          num_topics=num_topics,
                          alpha=alpha,
                          eta=beta,
                          iterations=iterations,
                          passes=passes)
     if save_dir is None:
         save_dir = Constants.SAVE_DIR.format(
             collection_name.lower().replace(' ', '_'))
     if not os.path.isdir(save_dir):
         os.makedirs(save_dir)
     if save_file is None:
         save_file = Constants.SAVE_FILE_FORMAT.format(
             collection_name.lower().replace(' ', '_'), num_topics, alpha,
             beta, iterations)
     logging.info(save_dir)
     model.save(os.path.join(save_dir, save_file))
     return model
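Constants is external to Example 4; a hypothetical definition consistent with the two format() calls above (both format strings are assumptions, not the project's actual values):

class Constants:
    # one directory per collection, e.g. models/my_collection
    SAVE_DIR = 'models/{}'
    # matches the five arguments passed above:
    # collection name, num_topics, alpha, beta, iterations
    SAVE_FILE_FORMAT = '{}_k{}_a{}_b{}_i{}.lda'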
Example 5
 def build_model(self, fname=None, save_to=None):
     id2word = self.id2word or self.build_id2word()
     corpus = self.corpus or self.build_corpus()
     # read model.lda file
     if not fname:
         fname = click.prompt('model file name',
                              type=str,
                              default='model.lda')
     fname = self.__dest(fname)
     # if there is no model file or the user wants to rebuild, build .model
     if not os.path.isfile(fname) or click.confirm(
             'There already is %s. Do you want to re-run LDA?' % fname):
         num_procs = click.prompt('Number of processes to launch',
                                  type=int,
                                  default=multiprocessing.cpu_count())
         num_epochs = click.prompt('Number of epochs to run',
                                   type=int,
                                   default=20)
         num_topics = click.prompt('Number of topics',
                                   type=int,
                                   default=100)
         print('start building model')
         start = time()
         model = LdaMulticore(corpus,
                              id2word=id2word,
                              num_topics=num_topics,
                              workers=num_procs,
                              passes=num_epochs)
         model.save(fname)  #save
         print('building model takes: %s' % LdaUtils.human_readable_time(
             time() - start))
     self.model = LdaMulticore.load(fname)
     return self.model
Example 6
def lda_trainer(sentences, modelPath=None, nb_topics=190, multicore=False):
    '''
    @return: lda_model: the LDA model trained by gensim; dictionary: the dictionary of all terms in the LDA model
    '''

    # build the dictionary and doc2bow corpus
    dictionary = corpora.Dictionary(sentences)
    print('finished building dictionary!')
    corpus = [dictionary.doc2bow(text) for text in sentences]
    print('finished building doc2bow corpus!')
    # train the LDA model
    print('training LDA model...')
    if multicore:
        # multicore mode may only work on Linux
        # and is very CPU-intensive, so use it cautiously
        lda_model = LdaMulticore(corpus=corpus,
                                 num_topics=nb_topics,
                                 id2word=dictionary)
    else:
        lda_model = LdaModel(corpus=corpus,
                             num_topics=nb_topics,
                             id2word=dictionary)
    print('finished LDA model training, nb terms: %d' %
          lda_model.num_terms)

    # save the LDA model and dictionary to disk
    if modelPath is not None:
        lda_model.save(fname=modelPath)
        dictionary.save(fname_or_handle=modelPath.replace('.topic', '.dict'))
        print(
            'saved lda_model & dictionary ... ok! model stored in {0} (.dict)'
            .format(modelPath))

    return lda_model, dictionary
Example 7
def train_lda(args):
	print "[LDA > n_topics: %d ]" % args.dim	
	lda_reader = LDAReader(args.ds, max_sent=args.max_sent)		
	ldazito = LdaMulticore(lda_reader, id2word=lda_reader.idx2wrd,
									   num_topics=args.dim, 
									   workers=args.workers)
	ldazito.save(args.out)	
Example 8
def guidedLDA_Model(topics, cores=11):
    """
    Topics represents desired LDA topics,
    cores should be physical cores minus one.
    Both should be integers.
    """

    # load finished dictionary from disk
    trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize('./models2/trigram_bow_corpus.nm',
                        trigram_bow_generator('./models2/trigram_transformed_reviews.txt'))

    # load finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')


    # Pass the bag-of-words matrix and Dictionary from previous steps to LdaMulticore as inputs,
    # along with the number of topics the model should learn

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=topics,
                       id2word=trigram_dictionary,
                       workers=cores)

    lda.save('./models2/lda_model')

    # load the finished LDA model from disk
    #lda = LdaMulticore.load('./models/lda_model_neg')

    return trigram_bow_corpus, lda
Example 9
def lda(corpus, num_topics=5, save_as=None, load=None, verbose=True):
    module_path = os.path.dirname(__file__)
    model_path = module_path + "/models"

    if verbose:
        print("prepare data")
    corpus = corpus.apply(lambda x: x.split(" "))
    dictionary = Dictionary(corpus)
    bow = [dictionary.doc2bow(doc) for doc in corpus]

    if isinstance(load, str):
        if verbose:
            print("loading lda")
        lda = LdaMulticore.load(model_path + "/" + load)
    else:
        if verbose:
            print("training lda")
        lda = LdaMulticore(bow, num_topics=num_topics)
        if save_as:
            # create the models directory if it does not already exist
            os.makedirs(model_path, exist_ok=True)

            lda.save(model_path + "/" + save_as)
    if verbose:
        print("generate visualization")
    vis = pyLDAvis.gensim.prepare(lda, bow, dictionary)
    return lda, vis
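A usage sketch for Example 9, assuming corpus is a pandas Series of space-separated token strings (the file and column names are hypothetical):

import pandas as pd
import pyLDAvis

df = pd.read_csv('reviews.csv')
model, vis = lda(df['text'], num_topics=5, save_as='reviews.lda')
pyLDAvis.save_html(vis, 'lda_vis.html')  # write the visualization to disk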
Example 10
def createLDAModel(docs, dictionary, num_topics = 100, iterations = NUM_ITERATIONS, 
                   passes = NUM_PASSES, workers = 3, output = 'lda_model'):
    """Creates the LDA model for the given documents.  

    Args:
        docs (lst): List of tokenized documents
        dictionary (lst): The dictionary
        num_topics (int): The number of topics to discover
        iterations (int): The number of iterations of the LDA method
        passes (int): The number of passes of the LDA method
        workers (int): The number of workers employed in the creation of the model
        output (str): Prefix used to store the model in a set of files

    Returns:
        ldamodel: The LDA model
    """
    
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in docs]
    
    # generate LDA model (use the num_topics parameter, not the module-level
    # NUM_TOPICS constant, so the argument actually takes effect)
    ldamodel = LdaMulticore(corpus, id2word = dictionary, num_topics = num_topics, 
                            iterations = iterations, passes = passes, workers = workers)
    ldamodel.save(output + '_i' + str(iterations) + '_p' + str(passes) + '_T' + str(num_topics) + '.lda')
    
    return ldamodel
Example 11
def train_lda(args):
    print "[LDA > n_topics: %d ]" % args.dim
    lda_reader = LDAReader(args.ds, max_sent=args.max_sent)
    ldazito = LdaMulticore(lda_reader,
                           id2word=lda_reader.idx2wrd,
                           num_topics=args.dim,
                           workers=args.workers)
    ldazito.save(args.out)
Example 12
    def fit_universal_models(self):

        vec = CountVectorizer(stop_words='english', max_features=10000)
        vec_t = vec.fit_transform(' '.join(x) for x in self.all_sentences)

        id2word = {v: k for k, v in vec.vocabulary_.items()}
        vec_corpus = gensim.matutils.Sparse2Corpus(vec_t.T)

        if os.path.isfile('lda.modl'):
            lda = LdaMulticore.load('lda.modl')
        else:
            lda = LdaMulticore(corpus=vec_corpus,
                               id2word=id2word,
                               iterations=200,
                               num_topics=2,
                               passes=10,
                               workers=4)
            lda.save('lda.modl')

        all_counts = vec.transform(' '.join(x) for x in self.all_sentences)
        self.d['all']['_probas'] = np.array(
            lda.inference(gensim.matutils.Sparse2Corpus(all_counts.T))[0])
        labeled_counts = vec.transform(' '.join(x) for x in self.X)
        self.d['labeled']['_probas'] = np.array(
            lda.inference(gensim.matutils.Sparse2Corpus(labeled_counts.T))[0])

        w2vmodel = Word2Vec(self.all_sentences,
                            size=100,
                            window=5,
                            min_count=3,
                            workers=4)

        best_centroids = None
        best_score = None
        for _ in range(
                10):  # todo -- implement kmeans++ instead of best of 10
            km = Kmeans(50)
            km.fit(w2vmodel.syn0)
            score = km.compute_sse(w2vmodel.syn0)
            if best_score is None or score < best_score:
                best_score = score
                best_centroids = km.centroids
        km.centroids = best_centroids

        self.tfidf = TfidfVectorizer(stop_words=set(stopwords.words()))
        self.d['all']['_t'] = self.tfidf.fit_transform(
            ' '.join(x) for x in self.all_sentences)
        self.d['labeled']['_t'] = self.tfidf.transform(' '.join(x)
                                                       for x in self.X)

        self.d['all']['_kmeans'] = np.array(
            kmeans_word2vecify(self.all_sentences, w2vmodel, km,
                               self.d['all']['_t'], self.tfidf))
        self.d['labeled']['_kmeans'] = np.array(
            kmeans_word2vecify(self.X, w2vmodel, km, self.d['labeled']['_t'],
                               self.tfidf))
Example 13
def train_lda(args):
	print "[LDA > n_topics: %d ]" % args.dim	
	lda_reader = LDAReader(args.input, max_sent=args.max_sent)		
	lda_reader.compute_vocabulary()	
	lda_model = LdaMulticore(lda_reader, id2word=lda_reader.idx2wrd,
									   num_topics=args.dim, 
									   workers=args.workers)
	lda_model.save(args.output)	
	idx_path =  os.path.splitext(args.output)[0]+"_idx.pkl"
	lda_reader.save_vocabulary(idx_path)
Example 14
def get_lda_model(corpus, dictionary, num_topics, SAVE_FILE=OUT_FILE, passes=20, iterations=100):
    if not os.path.exists(SAVE_FILE + '.lda'):
        print('creating lda model for the {} file..'.format(SAVE_FILE))
        print('num_topics: {}'.format(num_topics))
        lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                                 passes=passes, iterations=iterations, chunksize=2500)
        lda_model.save(SAVE_FILE + '.lda')
    else:
        print('LDA model for the file:{} already exists.. loading..'.format(SAVE_FILE))
        lda_model = LdaMulticore.load(SAVE_FILE + '.lda')
    return lda_model
Example 15
def train_lda():
    """
    Train the LDA model.
    generate_dictionary() must be called before this method.
    """
    print("------------------")
    print("Training LDA model")
    print("------------------")

    # load dictionary, as generated by generate_dictionary()
    print("Loading dictionary...")
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)

    # generate a mapping from word id to word
    print("Generating id2word...")
    id2word = {}
    for word in dictionary.token2id:
        id2word[dictionary.token2id[word]] = word

    # initialize LDA
    print("Initializing LDA...")
    lda_model = LdaMulticore(corpus=None, num_topics=cfg.LDA_COUNT_TOPICS, id2word=id2word,
                             workers=LDA_COUNT_WORKERS, chunksize=LDA_CHUNK_SIZE)

    # Train the LDA model
    print("Training...")
    examples = []
    update_every_n_windows = 25000
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.LDA_WINDOW_SIZE,
                           only_labeled_windows=True)
    for i, window in enumerate(windows):
        tokens_str = [token.word.lower() for token in window.tokens]
        bow = dictionary.doc2bow(tokens_str) # each window as bag of words
        examples.append(bow)
        if len(examples) >= update_every_n_windows:
            print("Updating (at window %d of max %d)..." % (i, COUNT_EXAMPLES_FOR_LDA))
            # this is where the LDA model is trained
            lda_model.update(examples)
            examples = []
        if i >= COUNT_EXAMPLES_FOR_LDA:
            print("Reached max of %d windows." % (COUNT_EXAMPLES_FOR_LDA,))
            break

    # I don't update here with the remainder of the windows, because I'm not sure
    # whether each update step's results are heavily influenced/skewed by the
    # number of examples.
    #if len(examples) > 0:
    #    print("Updating with remaining windows...")
    #    lda_model.update(examples)

    # save trained model to HDD
    print("Saving...")
    lda_model.save(cfg.LDA_MODEL_FILEPATH)
Example 16
def run_model(modelname, opts, corpus, outdir):
    """Run an LDA model with specified options.

    Run an LDA model using ``LdaMulticore`` with options ``opts`` on
    ``corpus`` and save results to directory ``outdir``. Call the model
    ``modelname``, which will be used to prefix the result filenames.

    """
    LOGGER.info(f"Running model {modelname}")
    mod = LdaMulticore(ShuffledCorpus(corpus['corpus']),
                       **opts,
                       id2word=corpus['id2word'])
    filename = os.path.join(outdir, modelname + ".pickle")
    LOGGER.info(f"Saving to {filename}")
    mod.save(filename)
    model_to_csv(mod, corpus, modelname, outdir)
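ShuffledCorpus is not defined in Example 16; a minimal sketch of a wrapper that yields the documents of an indexable corpus (such as MmCorpus) in random order on each pass; the implementation details are assumptions:

import random

class ShuffledCorpus:
    """Iterate over an indexable corpus in a freshly shuffled order each pass."""

    def __init__(self, corpus, seed=None):
        self.corpus = corpus
        self.rng = random.Random(seed)

    def __len__(self):
        return len(self.corpus)

    def __iter__(self):
        order = list(range(len(self.corpus)))
        self.rng.shuffle(order)  # reshuffle on every pass
        for i in order:
            yield self.corpus[i]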
Example 17
def create_topics(lda_model_filepath, trigram_bow_corpus, trigram_dictionary):
    """
    creates and saves topic to file called lda
    """

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=3)

    lda.save(lda_model_filepath)
Example 18
def train_lda(corpus, dictionary, lda_model_filepath, num_topics,
              run_or_load_flag):
    if run_or_load_flag:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            # workers => sets the parallelism, and should be
            # set to your number of physical cores minus one
            lda = LdaMulticore(corpus,
                               num_topics=num_topics,
                               id2word=dictionary,
                               workers=3)
        lda.save(lda_model_filepath)
    else:
        lda = LdaMulticore.load(lda_model_filepath)

    return lda
Example 19
    def perform(self, option="load"):
        """
        Perform LDA analysis to generate topics
        and topic distribution for each app
        """
        logging.info("Start Lda analysis")

        ldamodel = LdaMulticore(self.corpus, num_topics=self.ntopic, id2word=self.dictionary, passes=self.iteration)
        logging.info("LDA multicore modeling done")

        ldamodel.save(self.lda_out_file_name)

        self.topics = {}
        for i in range(self.ntopic):
            self.topics["topic{}".format(i)] = ldamodel.show_topic(i, topn=self.nword)
            logging.info("Topic{}".format(i))
            words = [w[1] for w in self.topics["topic{}".format(i)]]
            logging.info(words)
Example 20
def LDAmulticoreModel(df, num_topics=10):

    import warnings
    # silence deprecation warnings emitted during training
    warnings.filterwarnings('ignore', category=DeprecationWarning)
    dictionary = Dictionary(df['AbstractNarration'].apply(get_tokens))
    dictionary_from_nlpAbstract = Dictionary(df['nlp_abstract'])
    dictionary_from_nlpAbstract.save('gensim_dict_fromNLPAbstract.gensim')
    corpus = [dictionary.doc2bow(text) for text in df['nlp_abstract']]
    # multicore model
    np.random.seed(44)
    lda_multicore = LdaMulticore(corpus, num_topics, id2word=dictionary, workers=4)
    lda_multicore.save('../models/lda_multicoremodel.gensim')
    #print('Topics from LDA Multicore model', lda_multicore.print_topics())
    return lda_multicore
Example 21
def main(argv):
    cli_parser = make_cli_parser()
    opts, args = cli_parser.parse_args(argv)
    if len(args) != 2:
        cli_parser.error("Please provide an input/output file")

    if not os.path.isfile(args[1] + '.lda'):
        if os.path.isfile(args[1] + '.bow2mm') and os.path.isfile(args[1] +
                                                                  '.id2word'):
            id2word = corpora.Dictionary.load(args[1] + '.id2word')
        else:
            id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
            # ignore words that appear in less than 5 documents or more than 20% documents
            # when we do filtering, some vector becomes empty! it generates a huge problem!!
            # id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)
            # save dictionary
            id2word.save(args[1] + '.id2word')
            # save doc2bow vector
            corpora.MmCorpus.serialize(
                args[1] + '.bow2mm',
                iter_doc2bow(args[0], opts.numlines, id2word))
        mm_corpus = corpora.MmCorpus(args[1] + '.bow2mm')
        model = LdaMulticore(mm_corpus,
                             id2word=id2word,
                             num_topics=opts.numtopics,
                             workers=opts.numprocs,
                             passes=opts.numepochs)
        model.save(args[1] + '.lda')

    infile = open(args[0])
    outfile = open(args[1] + '.csv', "w")
    out_csvfile = csv.writer(outfile, delimiter=',')
    in_csvfile = csv.reader(infile, delimiter=',')
    for row in in_csvfile:
        if row[0] == '0':  # csv fields are strings, so compare against '0'
            break
        processed_post = preprocess(row[3]).split()
        if len(processed_post) == 0:  # skip empty documents (quite useless)
            continue
        result_list = row[1:3]
        result_list.extend(query_tag(id2word, model, processed_post))
        out_csvfile.writerow(result_list)
    infile.close()
    outfile.close()
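query_tag is defined elsewhere in that project; a plausible sketch that returns the most likely topics for a tokenized post, given the dictionary and the trained model (the name, signature, and return format are assumptions):

def query_tag(id2word, model, tokens, top_n=3):
    # convert the tokenized post to bag-of-words and keep the
    # top_n most probable topic ids from the model's distribution
    bow = id2word.doc2bow(tokens)
    topics = model.get_document_topics(bow, minimum_probability=0.0)
    topics.sort(key=lambda t: t[1], reverse=True)
    return [topic_id for topic_id, _ in topics[:top_n]]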
Example 22
 def create_LDA_model(self):
     trigram_articles = LineSentence(self.trigram_articles_filepath)
     trigram_dictionary = Dictionary(trigram_articles)
     trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
     trigram_dictionary.compactify()
     trigram_dictionary.save_as_text(self.trigram_dictionary_filepath)
     # trigram_dictionary = Dictionary.load(self.trigram_dictionary_filepath)
     MmCorpus.serialize(self.trigram_bow_filepath,
                        self.trigram_bow_generator(self.trigram_articles_filepath,
                                                   trigram_dictionary))
     trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
     print(trigram_bow_corpus)
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         lda = LdaMulticore(trigram_bow_corpus,
                            num_topics=20,
                            id2word=trigram_dictionary,
                            workers=3)
     lda.save(self.lda_model_filepath)
Example 23
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    args = parse_args()

    dictionary = corpora.Dictionary.load(os.path.join(args.prefix, 'review.dict'))
    logging.info('Pruning dictionary')
    dictionary.filter_extremes(no_below=args.no_below,
                               no_above=args.no_above)

    corpus = ReviewCorpus(os.path.join(args.prefix, 'review.json'),
                          dictionary)

    logging.info('Computing LDA model')
    lda = LdaMulticore(corpus, num_topics=args.num_topics, id2word=dictionary,
                       workers=args.workers)

    logging.info('Persisting LDA model')
    lda.save(os.path.join(args.prefix, 'review.ldamodel'))
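ReviewCorpus streams the reviews; a minimal sketch assuming one JSON object per line with a 'text' field (the field name and tokenization are assumptions):

import json

class ReviewCorpus:
    """Stream bag-of-words vectors from a file with one JSON review per line."""

    def __init__(self, path, dictionary):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        with open(self.path, encoding='utf-8') as f:
            for line in f:
                review = json.loads(line)
                # naive whitespace tokenization; the real project may differ
                yield self.dictionary.doc2bow(review['text'].lower().split())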
Example 24
class LDAembedding(InputEmbedding):
    def __init__(self, workdir="./embedding-models", name="lda-embedding"):
        """
        Erstellt durch Aufruf von Pretrain ein Vokabular
        :param workdir:
        :param name:
        """
        super(LDAembedding, self).__init__(workdir=workdir, name=name)
        self._normalizer = TweetNormalisation()

    def _load(self):
        modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
        if not modeldir.exists():
            return False
        self._lda = LdaMulticore.load(str(modeldir))
        self._dictionary = Dictionary.load(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        return True  # model and dictionary loaded successfully

    def pretrain(self, texts: typing.Iterable[typing.Text]):
        texts = [self._normalizer(text).split() for text in tqdm(texts)]
        self._dictionary = Dictionary(texts, prune_at=200000)
        corpus = [self._dictionary.doc2bow(text) for text in tqdm(texts)]
        self._lda = LdaMulticore(corpus=corpus,
                                 id2word=self._dictionary,
                                 workers=15,
                                 num_topics=50)

        self._dictionary.save(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        self._lda.save(
            str(self._workdir.joinpath("ldamodel_{}".format(self._name))))

    def get_train_data(self, texts: typing.Iterable[typing.Text]) -> np.array:
        to_array = lambda x: np.array([
            v
            for _, v in self._lda.get_document_topics(x, minimum_probability=0)
        ])
        return np.stack([
            to_array(self._dictionary.doc2bow(self._normalizer(text).split()))
            for text in texts
        ])
Example 25
 def build_model(self, fname=None, save_to=None):
     id2word = self.id2word or self.build_id2word()
     corpus = self.corpus or self.build_corpus()
     # read model.lda file
     if not fname:
         fname = click.prompt('model file name', type=str, default='model.lda')
     fname = self.__dest(fname)
     # if there is no model file or the user wants to rebuild, build .model
     if not os.path.isfile(fname) or click.confirm('There already is %s. Do you want to re-run LDA?' % fname):
         num_procs = click.prompt('Number of processes to launch',
                                  type=int,
                                  default=multiprocessing.cpu_count())
         num_epochs = click.prompt('Number of epochs to run', type=int, default=20)
         num_topics = click.prompt('Number of topics', type=int, default=100)
         print('start building model')
         start = time()
         model = LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, workers=num_procs, passes=num_epochs)
         model.save(fname)  # save
         print('building model takes: %s' % LdaUtils.human_readable_time(time() - start))
     self.model = LdaMulticore.load(fname)
     return self.model
Example 26
def create_LDA_model(coursesList):
    warnings.filterwarnings('ignore')
    text_clean = [doc.split(' ') for doc in coursesList['description']]
    bigrams, trigrams = create_n_grams(text_clean)
    text_clean = add_n_grams(text_clean, bigrams, trigrams)

    id2word = Dictionary(text_clean)
    id2word.filter_extremes(no_below=5, no_above=0.45)
    corpus = [id2word.doc2bow(text) for text in text_clean]

    num_topics = config.num_lda_topic
    lda_model = LDA(corpus=corpus,
                    id2word=id2word,
                    num_topics=num_topics,
                    random_state=42,
                    alpha='asymmetric',
                    passes=25)
    lda_model.save("./best_model.lda")
    coherence_model_c_v = CoherenceModel(model=lda_model,
                                         texts=text_clean,
                                         dictionary=id2word,
                                         coherence='c_v')
    c_v = coherence_model_c_v.get_coherence()
    term_topic_mat = lda_model.get_topics()
    aver_cosine_similarities = 0
    for i in range(0, (num_topics - 1)):
        cosine_similarities = linear_kernel(term_topic_mat[i].reshape(1, -1),
                                            term_topic_mat[i + 1:]).flatten()
        aver_cosine_similarities += sum(cosine_similarities)
    if num_topics != 1:
        aver_cosine_similarities = aver_cosine_similarities / (
            num_topics * (num_topics - 1) / 2)
    print(c_v)
    print(aver_cosine_similarities)

    create_vector_topics(lda_model, corpus, id2word, coursesList)

    visual_data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(visual_data, 'topics.html')
    return lda_model, id2word, bigrams, trigrams
Example 27
def fit_numtopics(train_corpus, test_corpus, id2word, num_topics_list, iters, workers, chunksize, logfilename, save=True):

	"""
	Args: 
	num_topics_list = list of number of topics, a model will be fitted for each
	save: indicates whether model should be saved
	Returns: topics_dict = a dictionary of topics lists, where the key is the number of topics
	"""
	topics_dict = {}
	logfile = open(logfilename, 'w')
	for num_topics in num_topics_list:
		
		print('training', num_topics)
		np.random.seed(NUM)

		start_time = time.time()
		model = LdaMulticore(corpus=train_corpus, id2word=id2word,
							 num_topics=num_topics, iterations=iters,
							 eval_every=None, workers=workers,
							 chunksize=chunksize)
		end_time = time.time()

		if save:
			fname = 'data\\orig_' + str(num_topics) + 'topics.lda'
			model.save(fname)

		per_word_bound = model.log_perplexity(test_corpus)
		perplexity = np.exp2(-1.0 * per_word_bound)

		logfile.write('\n' + 'num_topics: ' + str(num_topics) + '\n')
		logfile.write('perplexity: ' + str(perplexity) + '\n')
		logfile.write('train_time: ' + str(end_time - start_time) + '\n' + 'Topics: \n')

		topics = model.show_topics(num_topics=num_topics, num_words=20)
		topics_dict[str(num_topics)] = topics
		for topic in topics:
			logfile.write('\n\t' + str(topic) + '\n')

	logfile.close()		
	return topics_dict
Example 28
    def build_lda_model(self, topics: int=20):
        ignore_words = [
            'like', 'know', 'f**k', 'f*****g', 'want', 'shit', 'know', 'sure',
            'isn', 'CHANBOARD', 'think', 'people', 'good', 'time', 'going',
            'WEBLINK', 'got', 'way', ''
        ]
        filename = op.join(self.input_dir, f'{self.board}.dictionary')
        dictionary: Dictionary = Dictionary.load(filename)
        documents = ReadThreads(
            self.board, input_dir=self.input_dir, file_type='phrases',
            return_func=lambda x, y: dictionary.doc2bow(
                [w for w in y.split() if w not in ignore_words]
            )
        )

        lda = LdaMulticore(
            documents, id2word=dictionary, num_topics=topics, iterations=2)

        filename = op.join(self.input_dir, f'{self.board}.lda')
        lda.save(filename)

        return lda
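ReadThreads is project-specific; a rough sketch consistent with how Example 28 uses it, assuming one thread per line in a '<board>.<file_type>' file and that return_func receives a thread index and its text (all of these details are assumptions):

import os.path as op

class ReadThreads:
    """Iterate over board threads, applying return_func(index, text) to each."""

    def __init__(self, board, input_dir, file_type, return_func):
        self.path = op.join(input_dir, f'{board}.{file_type}')
        self.return_func = return_func

    def __iter__(self):
        with open(self.path, encoding='utf-8') as f:
            for i, line in enumerate(f):
                yield self.return_func(i, line.strip())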
Example 29
def _lda_(gensim_dictionary,
          corpus_path=fpathroot + fpathappend + '_serialized.mm',
          lda_model_filepath=fpathroot + fpathappend + '_lda_' +
          str(numtopics),
          returnlda=True,
          numtopics=numtopics,
          passes=1,
          iterations=50,
          args=None):
    """
    Run Gensim LDA, optional return of model
    """
    if isinstance(corpus_path, str):
        corpus = MmCorpus(corpus_path)
    else:
        corpus = corpus_path
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        if args is None:
            lda = LdaMulticore(corpus,
                               num_topics=numtopics,
                               id2word=gensim_dictionary,
                               workers=n_threads,
                               passes=passes,
                               iterations=iterations)
        else:
            lda = LdaMulticore(corpus,
                               num_topics=numtopics,
                               id2word=gensim_dictionary,
                               workers=n_threads,
                               passes=passes,
                               iterations=iterations,
                               **args)
        lda.save(lda_model_filepath)
        if returnlda:
            return lda
Example 30
    def generate_lda_topics(self):
        from gensim.corpora import Dictionary, MmCorpus
        from gensim.models.ldamulticore import LdaMulticore
        import pyLDAvis
        import pyLDAvis.gensim
        import warnings
        import _pickle as pickle

        trigram_sentences = LineSentence(self.trigram_sentences_filepath)
        trigram_dictionary = Dictionary(trigram_sentences)
        # trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
        trigram_dictionary.compactify()
        trigram_dictionary.save(self.trigram_dictionary_filepath)

        def trigram_bow_generator(filepath):
            for sentence in LineSentence(filepath):
                yield trigram_dictionary.doc2bow(sentence)

        MmCorpus.serialize(
            self.trigram_bow_filepath,
            trigram_bow_generator(self.trigram_sentences_filepath))
        trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=3,
                               id2word=trigram_dictionary,
                               workers=3)
            lda.save(self.lda_model_filepath)
        lda = LdaMulticore.load(self.lda_model_filepath)
        lda.show_topic(0)
        lda.show_topic(1)
        lda.show_topic(2)
        LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                                  trigram_dictionary)
        pyLDAvis.save_html(LDAvis_prepared, self.LDAvis_html_filepath)
Example 31
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim


def bow(filepath, d):  # output bag of words representation
    for review in LineSentence(filepath):
        yield d.doc2bow(review)


real_sent = LineSentence('real.txt')
real_dict = Dictionary(real_sent)
real_dict.filter_extremes(no_below=5, no_above=0.2)
real_dict.compactify()
real_dict.save('real.dict')
real_dict = Dictionary.load('real.dict')

MmCorpus.serialize('real.mm', bow('real.txt', real_dict))
real_corpus = MmCorpus('real.mm')

real_lda = LdaMulticore(real_corpus,
                        num_topics=10,
                        id2word=real_dict,
                        workers=2)
real_lda.save('./real_lda_model')
Example 32
#==============================================================================
# No need to run LDA every time; the model has been stored
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(res)
vocab = vectorizer.get_feature_names()
start_time = time.time()
model = LdaMulticore(
                    matutils.Sparse2Corpus(X,documents_columns=False), 
                    num_topics=9,passes=10,
                    chunksize=5000,
                    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
                    workers=7,
                    )
print("--- %s seconds ---" % (time.time() - start_time))
fname = '/Users/royyang/Desktop/trending_project/re_categorization_ls/LDA_9topics'
model.save(fname)

#Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#==============================================================================
# # Get all topics from training 
# topic_number, number_of_aritcles, top_words
#==============================================================================
def get_topic(n):
    # track the most probable topic for document n
    doc_lda = model[doc_list[n]]
    current_prob = 0
    current_topic = None
    for var in doc_lda:
        if var[1] > current_prob:
            current_prob = var[1]
            current_topic = var[0]
    return current_topic
Example 33
# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 0 == 1:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=5,
                           id2word=trigram_dictionary,
                           workers=3)

    lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

explore_topic(topic_number=0)

topic_names = {
    0: 'looking_at_websites_for_info',
    1: 'doesnt_have_the_negative_exercise_effect',
    2: 'spend_time_looking_on_websites',
    3: 'games_and_information',
    4: 'bad_if_kids_spend_too_much_time'
}

topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')
Example 34
vocab = vectorizer.get_feature_names()

# single LDA
topic_number = 15
start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=topic_number,
    passes=10,
    chunksize=5000,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))
fname = folder_name + 'LDA' + str(topic_number) + 'topics'
model.save(fname)

#Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#perplexity
perplexity = model.log_perplexity(matutils.Sparse2Corpus(
    X, documents_columns=False),
                                  total_docs=None)

# batch LDA
model_eval = []
for k in range(2, 21):
    topic_number = k
    start_time = time.time()
Example 35
def LDA_Machine(lst_dict, handle_lst):
    assert isinstance(lst_dict, list), "Please enter a list of dictionaries"
    assert isinstance(handle_lst, list), "Please enter a list of handles"

    file_path_corpus = "/home/igabr/new-project-4/mm_corpus/"

    cnt_1 = -1
    cnt_2 = -1

    for handle in handle_lst:
        cnt_1 += 1

        clean_tweet_list = []

        handle_tweets = lst_dict[cnt_1][handle]['content']

        if handle_tweets == []:
            continue
        else:
            for raw_tweet in handle_tweets:

                clean_tweet = ""

                tokenized_tweet = nlp(raw_tweet)

                for token in tokenized_tweet:
                    if token.is_space:
                        continue
                    elif token.is_punct:
                        continue
                    elif token.is_stop:
                        continue
                    elif token.is_digit:
                        continue
                    elif len(token) == 1:
                        continue
                    elif len(token) == 2:
                        continue
                    else:
                        clean_tweet += str(token.lemma_) + " "

                clean_tweet_list.append(clean_tweet)
            clean_tweet_list = list(map(str.strip, clean_tweet_list))
            clean_tweet_list = [x for x in clean_tweet_list if x != ""]
            lst_dict[cnt_1][handle]['tokenized_tweets'] = clean_tweet_list
            print("{} tokenized_tweets inserted!".format(handle))
            print()

    master_df = make_df(lst_dict)

    to_remove = list(master_df[master_df['tokenized_tweets'].isnull()].index)

    index_to_remove = []
    for i in to_remove:
        index_to_remove.append(handle_lst.index(i))

    new_handle_list = [
        v for i, v in enumerate(handle_lst)
        if i not in frozenset(index_to_remove)
    ]

    master_df.dropna(subset=['tokenized_tweets'], inplace=True)

    master_df = filtration(master_df, "tokenized_tweets")

    clean_lst_dict = dataframe_to_dict(master_df)
    print()
    print("Cleaning of master dataframe complete!")

    for handle in new_handle_list:
        cnt_2 += 1

        try:
            list_of_tweets = clean_lst_dict[cnt_2][handle]['tokenized_tweets']
        except KeyError:
            continue

        gensim_format_tweets = []

        for tweet in list_of_tweets:
            list_form = tweet.split()
            gensim_format_tweets.append(list_form)

        gensim_dictionary = Dictionary(gensim_format_tweets)
        gensim_dictionary.filter_extremes(no_below=10, no_above=0.4)
        # reassign ids to fill gaps left by removed words
        gensim_dictionary.compactify()

        MmCorpus.serialize(
            file_path_corpus + "{}.mm".format(handle),
            bag_of_words_generator(gensim_format_tweets, gensim_dictionary))

        corpus = MmCorpus(
            file_path_corpus +
            "{}.mm".format(handle))  #loading the corpus from disk

        if corpus.num_terms == 0:
            continue
        else:
            lda = LdaMulticore(corpus,
                               num_topics=10,
                               id2word=gensim_dictionary,
                               passes=100,
                               workers=100)
            lda.save(file_path_corpus + "lda_model_{}".format(handle))
            print("LDA model for {} saved!".format(handle))

            word_list = []

            for i in range(10):
                for term, frequency in lda.show_topic(i, topn=100):
                    if frequency != 0:
                        word_list.append(term)

            LDA_Counter = Counter(word_list)

            clean_lst_dict[cnt_2][handle]['LDA'] = LDA_Counter
            print("Inserted LDA Counter into {} dictionary".format(handle))

    pickle_object(clean_lst_dict, "2nd_degree_connections_LDA_complete")
    print("Script Complete")
Example 36
            starttime = datetime.datetime.now()
            print('dataset:', data, 'num_topics:', n_topics, 'random_state:', random_state)
            data_dir = './%s_data'%data
            dictionary = Dictionary.load(os.path.join(data_dir, 'ne_nedf_weighting.dict'))
            bow_news = load_model(os.path.join(data_dir, 'ne8_nedf_%s_weighting.bow')%(topn_concepts))
            dict_id2token = dict(dictionary.items())

            lda = LdaMulticore(bow_news, id2word=dict_id2token, num_topics=n_topics, passes=passes, iterations=iterations,\
                               eval_every=eval_every, workers=workers, random_state=random_state)

            name = 'ne8_nedf_%s_topic%s_passes%s_iteration%s_random%s' % (topn_concepts, n_topics, passes, iterations, random_state)
            result_dir = os.path.join(data_dir, name)
            if not os.path.exists(result_dir):
                os.mkdir(result_dir)

            lda.save(os.path.join(result_dir, 'lda_model'))

            topics = lda.show_topics(num_topics=n_topics, num_words=20, log=False, formatted=False)
            with open(os.path.join(result_dir, 'topics.txt'), 'w', encoding='utf-8') as f:
                for topic in topics:
                    f.write('topic ' + str(topic[0]) + ':\n')
                    for t in topic[1]:
                        f.write(t[0] + ': ' + str(t[1]) + '\n')
                    f.write('\n')

            endtime = datetime.datetime.now()
            duration = (endtime - starttime).seconds
            duration_list.append(duration)
            print('Total running time: ', (endtime - starttime).seconds, ' seconds.')
        print(sum(duration_list)/len(duration_list))
Example 37
trigram_users_bow_corpus = MmCorpus(trigram_users_bow_file)

lda_threads_model_file = "lda_threads_model"
lda_users_model_file = "lda_users_model"

for i in range(5, 50, 5):
    print("Starting to process with " + str(i) + " topics")
    t0 = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda_threads = LdaMulticore(trigram_threads_bow_corpus,
                                   num_topics=i,
                                   id2word=trigram_dictionary,
                                   workers=4)
    t1 = time.time()
    lda_threads.save("models/" + lda_threads_model_file + str(i))
    print("Time to generate lda_threads " + str(i) + " : " + str(t1 - t0))
'''
Starting to process with 5 topics
Time to generate lda_threads 5 : 53.75977849960327
Starting to process with 10 topics
Time to generate lda_threads 10 : 75.05263686180115
Starting to process with 15 topics
Time to generate lda_threads 15 : 99.37945866584778
Starting to process with 20 topics
Time to generate lda_threads 20 : 118.13127422332764
Starting to process with 25 topics
Time to generate lda_threads 25 : 138.435448884964
Starting to process with 30 topics
Time to generate lda_threads 30 : 166.0134561061859
Starting to process with 35 topics
'''
Example 38
def learn(corpus):
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, chunksize=10000, passes=5)
    for line in lda.print_topics(NUM_TOPICS):
        print(line)
    lda.save('lda.gensim')
Example 39
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel, LdaModel
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

wiki_corpus = MmCorpus('Wiki_Corpus.mm')   # Loading the corpus 
print (".... successfully loaded the corpus")

wiki_dict = Dictionary.load('WikiDictionary200k.dict') # Loading the dictionary
print (".... successfully loaded the dictionary")

lda = LdaMulticore(corpus=wiki_corpus, id2word=wiki_dict, num_topics=300, chunksize=10000, passes=2)

print ".... successfully extracted the topics; saving the model"
lda.save('WikiLDA_300.lda')

print "finished ...."