Example #1
def loadModelfromFile(modelPath, readOnly=False):

    if readOnly:
        lda_model = LdaModel.load(fname=modelPath, mmap='r')
        dictionary = Dictionary.load(fname=modelPath.replace(
            '.topic', '.dict'),
                                     mmap='r')
    else:
        lda_model = LdaModel.load(fname=modelPath)
        dictionary = Dictionary.load(
            fname=modelPath.replace('.topic', '.dict'))
    print('load lda_model model from {0} ok!'.format(modelPath))

    return lda_model, dictionary
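A minimal usage sketch for loadModelfromFile (hypothetical paths; assumes the function's module imports LdaModel and Dictionary from gensim and that the .topic/.dict pair was written by a matching save step):

lda_model, dictionary = loadModelfromFile('models/news.topic', readOnly=True)
bow = dictionary.doc2bow(['word1', 'word2'])
print(lda_model.get_document_topics(bow))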
Example #2
    def __init__(self,
                 topics=10,
                 worker=3,
                 pretrained_model=None,
                 dictionary=None):
        """
        LDA model training initialization.
        Args:
            topics -- number of topics
            worker -- degree of parallelism, usually the number of cores minus one
            pretrained_model -- a pretrained model; online updating is supported, so a previously trained model can be loaded
            dictionary -- words are mapped to IDs during training, so the model is paired with an ID-mapping dictionary
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)
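The docstring above mentions update, save, and inference methods that are not part of this snippet; a minimal sketch of what they might look like (not from the original source), assuming gensim's Dictionary and LdaMulticore APIs are imported:

    def update(self, corpus):
        # Build the id mapping on first use, then train or incrementally update the model.
        if self._common_dictionary is None:
            self._common_dictionary = Dictionary(corpus)
        bow = [self._common_dictionary.doc2bow(doc) for doc in corpus]
        if self._model is None:
            self._model = LdaMulticore(bow, num_topics=self._topics,
                                       id2word=self._common_dictionary,
                                       workers=self._workers)
        else:
            self._model.update(bow)

    def save(self, model_file, dictionary_file):
        self._model.save(model_file)
        self._common_dictionary.save(dictionary_file)

    def inference(self, doc):
        # doc is a list of tokens, e.g. ['word5', 'word6']
        bow = self._common_dictionary.doc2bow(doc)
        return self._model.get_document_topics(bow)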
Example #4
def make_clouds(files, n_words=20):
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<-> bug
    bug_to_id = json.loads(open(files.replacements).read())
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR
        t_rar = np.sqrt(pTW[i] * pWT[i])
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words]
        # fit_words expects a word -> weight mapping, so wrap the pairs in a dict
        wc.fit_words(dict(zip(top_words, t_rar[top_word_ids])))
        wc.to_file(output_d + str(i) + '.png')
Example #5
def trainModel():
    """ Train a model
    """
    if args.mode == 'Random':
        return args.topics, 0
    # need to train on dump
    files = [
        f"{args.input}/{f}" for f in os.listdir(args.input)
        if os.path.isfile(os.path.join(args.input, f))
    ]
    if args.mode == 'LDA':
        # create dictionary
        with open(files[0], "r", encoding='utf-8') as f:
            dct = Dictionary([' '.join(f.readlines()).split()])
        for filename in files[1:]:
            with open(filename, "r", encoding='utf-8') as f:
                dct.add_documents([' '.join(f.readlines()).split()])
        # create corpus
        corpus = []
        for filename in files:
            with open(filename, "r", encoding='utf-8') as f:
                corpus.append(dct.doc2bow(' '.join(f.readlines()).split()))
        lda = LdaModel(corpus, num_topics=args.topics)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct
    if args.mode == 'loadLDA':
        return LdaModel.load("./models/LDAdump.model"), Dictionary.load(
            "./models/LDAdump.dct")
Example #6
    def __init__(self, fnames, model=None, corpus=None, dictionary=None):
        """`fnames` is an array of files for [lda_model, distribution]"""
        self.reviews = open('data/electronics_topics_in.txt').readlines()

        print "Loding topic model..."
        if model is not None:
            print "Using argument model"
            self.lda = model
        else:
            self.lda = LdaModel.load(fnames[0])

        if corpus is not None:
            print "Using argument corpus and dictionary"
            self.corpus = corpus
            self.dictionary = dictionary
        else:
            print "Loading corpus and dictionary from file"
            self.corpus = load("data/models/electronics_tfidf_corpus.pkl")
            self.dictionary = load("data/models/electronics_dict.pkl")

        print "Loading review-topic distribution..."
        self.review_dist = [l for l in self.lda[self.corpus]]
        tmp = lambda dist: sorted(dist, key=lambda arr: arr[1], reverse=True)
        self.review_dist = map(lambda dist: tmp(dist), self.review_dist)

        print "processing topics"
        tmp = map(lambda t: re.sub("(\d*\.\d*\*)", "", t),
                  self.lda.show_topics(-1))
        self.topics = map(lambda ts: re.sub("\\s\+", ",", ts), tmp)
    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded completely.' % model_type)
        except IOError:
            logger.error(
                'The %s model doesn\'t exist. Please train the model before load it.'
                % model_type)
        finally:
            return model
def display_perplexity_on_topic_num(start, step, limit):
    model_list = []
    pplxty_list = []
    names = {}  # maps "model<num_topics>" -> its trained LdaModel
    for num_topics in range(start, limit, step):
        print("############### current num:", num_topics, "###############")
        model_path = os.getcwd() + "\\Model\\topic_num_" + str(
            num_topics) + ".model"
        if not os.path.exists(model_path):
            # Modeling!!!!!
            print("Modeling in progress...")
            names['model' + str(num_topics)] = LdaModel(
                pubs_corpus,
                num_topics=num_topics,
                id2word=pubs_dictionary,
                passes=10,
                eval_every=1)
            names['model' + str(num_topics)].save(model_path)
        else:
            print("Model already exists.")
            names['model' + str(num_topics)] = LdaModel.load(model_path)
        model_list.append(names['model' + str(num_topics)])
        pplxty_value = perplexity(names['model' + str(num_topics)],
                                  pubs_corpus, pubs_dictionary,
                                  len(pubs_dictionary.keys()), num_topics)
        pplxty_list.append(pplxty_value)
    return model_list, pplxty_list
Example #10
    def load(self, path='default'):
        """
        :param path: the path of trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        #self.iterations = self.model.iterations

        f = open(str(path + '/original_data.pickle'), 'rb')
        self.original_data = pickle.load(f)
        f.close()
        f = open(str(path + '/text.pickle'), 'rb')
        self.text = pickle.load(f)
        f.close()
        f = open(str(path + '/token.pickle'), 'rb')
        self.token = pickle.load(f)
        f.close()
        f = open(str(path + '/corpus.pickle'), 'rb')
        self.corpus = pickle.load(f)
        f.close()

        path = path + '/result'
        f = open(str(path + '/topic_key.pickle'), 'rb')
        self.topic_key = pickle.load(f)
        f.close()

        f = open(str(path + '/doc_topic.pickle'), 'rb')
        self.doc_topic = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_doc.pickle'), 'rb')
        self.topic_doc = pickle.load(f)
        f.close()

        f = open(str(path + '/topic_sent.pickle'), 'rb')
        self.topic_sent = pickle.load(f)
        f.close()

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics
Example #11
 def load(self, subfolder=None):
     if subfolder:
         sf = subfolder + '/'
     else:
         sf = ''
     self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
     self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)
Example #12
def get_matrices():
    start = time.time()
    with open("matrix.json") as df:
        doc_term_matrix = json.load(df)
    ldamodel = Lda.load(MODEL_PATH)
    loaded = time.time()
    print "Doc-Term Matrix loaded in", loaded - start, "seconds"

    doc_topic_mtx = ldamodel[doc_term_matrix]
    topic_word_mtx = ldamodel.print_topics()
    array = []
    for i in range(len(doc_topic_mtx)):
        mp = {}
        for topic_id, topic_score in doc_topic_mtx[i]:
            mp[topic_id] = topic_score
        array.append(mp)

    topicwordarray = []
    for _, words in (topic_word_mtx):
        topicwordarray.append(words)

    with open("doc_topic_mtx.json", "w") as df:
        json.dump(array, df)
    with open("topic_word_mtx.json", "w") as df:
        json.dump(topicwordarray, df)

    for i in topic_word_mtx:
        print i
    print "Doc-Topic and Topic-Word Matrices loaded in", time.time(
    ) - loaded, "seconds"

    return array, ldamodel
def prepare_for_analysis():
    import configparser

    config_parser = configparser.ConfigParser()
    config_parser.read("config.ini")
    config = config_parser['default']

    from corpus_compiler.tbmmcorpus import TbmmCorpus

    corpus = TbmmCorpus(metadata=True, config=config)

    corpus.load_tbmm_corpus("corpus-v0.1/tbmm_corpus.mm")

    corpus.prepare_metadata_to_description_dictionary()

    corpus.generate_word_counts()

    from gensim.models.ldamodel import LdaModel
    lda = LdaModel.load("tbmm_lda.model.passes_100")

    import matplotlib
    matplotlib.use(
        'Agg')  # Must be before importing matplotlib.pyplot or pylab!

    topic_dist_matrix, label_vector = corpus.calculate_topic_distributions_of_all_documents(
        lda)

    for topic_no in range(1, 20):
        corpus.plot_topic_across_time(topic_no, topic_dist_matrix,
                                      label_vector)

    corpus.plot_word_freqs_given_a_regexp(r"^lokavt", keyword="lokavt")

    corpus.plot_word_freqs_given_a_regexp(r"^mebus", keyword="mebus")
def LDAmodel(X, passes=2, num_topics=10, workers=2, re_train=False):
    tokens = []
    for c in X:
        tokens.append(c.split())
    dictionary = gensim.corpora.Dictionary(tokens)
    bow_corpus = [dictionary.doc2bow(caption) for caption in tokens]
    if re_train:
        ldamodel = gensim.models.LdaMulticore(bow_corpus,
                                              num_topics=num_topics,
                                              id2word=dictionary,
                                              passes=passes,
                                              workers=workers)
        ldamodel.save("data/3_topic_modeling_weights/ldamodel_weights")
    else:
        ldamodel = LdaModel.load(
            "data/3_topic_modeling_weights/ldamodel_weights")

    sent_topics_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[bow_corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                sent_topics_df = sent_topics_df.append(pd.Series(
                    [str(int(topic_num)),
                     round(prop_topic, 4)]),
                                                       ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['dominant_topic', 'perc_contribution']

    return sent_topics_df
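A hypothetical call to the LDAmodel function above; the caption strings are illustrative and the data/3_topic_modeling_weights directory is assumed to exist:

captions = ["a cat sits on a mat", "a dog runs in the fog"]
df = LDAmodel(captions, passes=2, num_topics=3, workers=2, re_train=True)
print(df.head())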
	def run(self):
		if self.clean_level in ('raw','clean','stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		if not os.path.exists(self.res_dir):
			print 'Creating results folder...'
			os.mkdir(self.res_dir)

		# Apply each model
		for idioma, modelos in self.input()['lda']['langs'].iteritems():
			corp_path = self.input()['corp']['langs'][idioma].path
			corpus = corpora.MmCorpus(corp_path)
			for n_topics, modelo in modelos.iteritems():
				model_path = modelo.path
				model = LdaModel.load(model_path)
				classification = []
				for doc in corpus:
					topic = model.get_document_topics(doc)
					classification.append(topic)
				print '--------------------------------------'
				print 'USER INFO: Classifying texts in %s at cleaning level "%s" with %d topics' % (idioma, kind, n_topics)
				model.print_topics(len(corpus),5)
				with self.output()['langs'][idioma][n_topics]['doc_topics'].open('w') as f:
					pickle.dump(classification, f)
				with self.output()['langs'][idioma][n_topics]['topics'].open('w') as f:
					pickle.dump(model.print_topics(n_topics,5), f) # the 5 is an adjustable parameter (number of words per topic to show)
Example #16
def calculate_topics(application_id: str) -> None:
    """Uses the latest topic model to assign a topic for each completely fetched account in the database."""
    with engine.begin() as connection:
        topic_model = models.topic_model.select_latest(application_id,
                                                       SOURCES['TWITTER'],
                                                       connection)

        if not topic_model:
            return

        accounts = list(
            models.account.select_multiple_complete(application_id,
                                                    SOURCES['TWITTER'],
                                                    connection))
        topic_model_path = get_topic_model_path(application_id)
        lda_model = LdaModel.load(os.path.join(topic_model_path, 'ldamodel'))
        dictionary = Dictionary.load(
            os.path.join(topic_model_path, 'dictionary'))
        documents = load_documents(accounts, connection)
        topic_iteration_id = models.topic_iteration.insert_one(
            topic_model['id'], connection)
        for account, document in zip(accounts, documents):
            bow = dictionary.doc2bow(document)
            weights = get_document_topic_weights(lda_model, bow)
            models.topic.insert_one(account['id'], weights, topic_iteration_id,
                                    connection)
        cluster_accounts(topic_iteration_id, connection)
 def load_model(self, model_path='lda.model'):
     """
     Loads a pretrained LDA model
     :param model_path:
     :return LDA model:
     """
     return LdaModel.load(model_path)
Example #18
def newsList_topicInfer(news_list,
                        model_basepath='./model',
                        reserved_word_path=''):
    dictionary_path = model_basepath + '/dictionary.pickle'
    fr = open(dictionary_path, 'rb')
    dictionary = pickle.load(fr)
    model_path = model_basepath + '/lda_100.model'
    lda = LdaModel.load(model_path, mmap='r')

    for news_json in news_list:
        news_title = news_json['title']
        news_content = news_json['content']
        if not news_content:
            news_content = news_title * 3
        all_content = news_title + news_content

        word_list = utils.wordcut4lda(
            all_content, reserved_word_path=reserved_word_path).split()
        topic_vec = [
            str(key[1])
            for key in lda.get_document_topics(dictionary.doc2bow(word_list),
                                               minimum_probability=0)
        ]

        news_json['topic'] = topic_vec
Example #19
def main():
    logger.info(f'Loading data from {args.dataset_dir}')
    corpus = load_corpus(args.dataset_dir)
    model_path = os.path.join(args.dump_dir, 'lda.model')
    logger.info(f'Loading model from {model_path}')
    model = LdaModel.load(model_path)
    corpus_bow = (model.id2word.doc2bow(text['candidates']) for text in corpus)

    predictions_path = os.path.join(args.dump_dir, 'lda.prediction.jsonl')
    topic_ids = set()
    with open(predictions_path, 'w') as f:
        for tweet, tweet_bow in tqdm(zip(corpus, corpus_bow)):
            topics = model.get_document_topics(tweet_bow)
            topics = [(topic_id, topic_prob.item())
                      for topic_id, topic_prob in topics]
            tweet['topics'] = topics
            f.write(json.dumps(tweet) + '\n')
    logger.info(f'Predictions have been written to {predictions_path}')

    topics_path = os.path.join(args.dump_dir, 'lda.topics.txt')
    topics = model.show_topics(num_topics=model.num_topics,
                               num_words=10,
                               log=False,
                               formatted=True)

    with open(topics_path, 'w') as f:
        for topic_no, topic in topics:
            f.write(f'Topic {topic_no}: {topic}\n')
    logger.info(f'Topics have been written to {topics_path}')
def convert_to_pyLDAvis(data_folder, basename, **opts):

    opts = extend(
        dict(R=50,
             mds='tsne',
             sort_topics=False,
             plot_opts={
                 'xlab': 'PC1',
                 'ylab': 'PC2'
             }), opts or {})

    target_folder = os.path.join(data_folder, basename)

    corpus_filename = os.path.join(target_folder, 'corpus.mm')
    model_filename = os.path.join(target_folder,
                                  'gensim_model_{}.gensim.gz'.format(basename))

    lda = LdaModel.load(model_filename)
    corpus = MmCorpus(corpus_filename)

    data = pyLDAvis.gensim.prepare(lda, corpus, lda.id2word, **opts)

    pyLDAvis.save_html(data, os.path.join(target_folder, 'pyldavis.html'))

    return data
Example #21
	def getAllTopicTerms(self):
		""" return all topic / word distribution """
		# load dictionary and model
		self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
		self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))
		idto = self.dictionary.id2token
		allTiDistr = self.ldaModel.get_topics()
		return allTiDistr
Example #22
def add_lda(x, corpus):
    train_lda = []
    lda = LdaModel.load('lda.model')
    for i in range(len(x)):
        # minimum_probability=0.0 returns every topic, so the vector has a fixed length
        top_topics = lda.get_document_topics(corpus[i], minimum_probability=0.0)
        topic_vec = [top_topics[j][1] for j in range(2)]  # assumes a 2-topic model
        train_lda.append(topic_vec)
    return train_lda
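A hypothetical call to add_lda above; x is the training set and corpus its bag-of-words representation, both assumed to come from the surrounding pipeline:

train_features = add_lda(x, corpus)
print(train_features[0])  # e.g. [0.73, 0.27] for a 2-topic model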
Example #23
def model_play(fname):
    """Extract insights from trained model."""
    start = time.time()
    ldamodel = Lda.load(fname)
    print "[INFO] Model loaded in", time.time() - start, "seconds"

    for i in range(ldamodel.num_topics):
        print "[INFO]", ldamodel.print_topic(i)
Example #24
def plot_lda(application_id):
    """Saves a html file that visualizes the topic model"""
    topic_model_path = get_topic_model_path(application_id)
    lda_model = LdaModel.load(os.path.join(topic_model_path, 'ldamodel'))
    SerializedCorpus = MmCorpus(os.path.join(topic_model_path, 'corpus.mm'))
    dictionary = Dictionary.load(os.path.join(topic_model_path, 'dictionary'))
    vis_data = gensim.prepare(lda_model, SerializedCorpus, dictionary)
    pyLDAvis.save_html(
        vis_data, os.path.join(topic_model_path, 'lda_visualization.html'))
Example #25
	def getTopicTerms(self, topicId, topN):
		""" return word distribution for a topic """
		if self.ldaModel is None:
			self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
			self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))
		idto = self.dictionary.id2token
		tiDistr =  self.ldaModel.get_topic_terms(topicId, topN)
		toDistr = [(idto[ti[0]], ti[1])  for ti in tiDistr]
		return toDistr
Example #26
def class_model(m):

    lda = LdaModel.load("model/lda_model/lda_model_user_ml")

    dic = Dictionary.load('model/lda_model/lda_model_user_ml.id2word')

    topics = [tokenizer.find_topic(message, lda, dic) for message in m]

    return pd.Series(topics)
Example #27
 def __init__(self, connect_file, database, model_file):
   self._alphabet = 'abcdefghijklmnopqrstuvwxyz'
   self._dbConnect(connect_file, database)
   with open(model_file, 'rb') as mdlf:
     contents = json.load(mdlf)
     model = contents['model-path']
     dictionary = contents['dictionary-path']
   self.model = LdaModel.load(model)
   self.dictionary = Dictionary.load(dictionary)
 def __init__(self, jobdesc_fname, jobtitle_fname):
     self.es = Elasticsearch([{'host': app.config['ES_HOST'], 'port': 9200, 'timeout': 120}])
     self.model = LdaModel.load(app.config['RCMDR_LDA_MODEL'])
     self.job_labels = {
         int(k):v
         for k, v in (line.split("=") for line in open(app.config['RCMDR_JOB_LABELS'])
                 .read().strip().split('\n'))
         }
     self.jobdesc_fname = jobdesc_fname
     self.jobtitle_fname = jobtitle_fname
Example #29
 def load_model(self,
                dir_name="text_mining_models",
                file_name="gensim_model"):
     path = os.path.join(os.getcwd(), "..", dir_name, file_name)
     if os.path.isfile(path):
         return LdaModel.load(path)
     else:
         model = self.create_model()
         self.save_model()
         return model
	def __init__(self, ac):
		with open('../TextMining/Topic/data.loc','rb') as f:
			load(f)
			self.data = load(f)
		with open('../TextMining/Topic/translator.loc','rb') as f:
			self.translator = load(f)
		self.index = similarities.MatrixSimilarity.load('../TextMining/Topic/index.loc')
		self.lda = LdaModel.load('../TextMining/Topic/lda.loc')
		self.dictionary = Dictionary().load("../TextMining/Topic/dic.loc")
		self.ac_terms = ac
Example #31
	def analyze(self, docs):
		# load dictionary and model
		self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
		self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

		# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
		docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]

		docTopicDistr = self.getDocumentTopics(docTermMatrix)
		return docTopicDistr
Example #33
 def __init__(self, model_dir=os.path.join('models', 'gensim', 'lda'), stopwords_dir=os.path.join('resources', 'stopwords')):
     Service.__init__(self, 'topic-modeling', 'lda-gensim', ['parse'])
     self.models = {}
     self.stopwords = {}
     for name in os.listdir(model_dir):
         self.models[name] = LdaModel.load(os.path.join(model_dir, name, 'model'))
     for name in os.listdir(stopwords_dir):
         lang = name[:2]
         with open(os.path.join(stopwords_dir, name)) as f:
             self.stopwords[lang] = set([line.strip() for line in f.readlines()])
Example #34
def inference(doc, model_loc):

    lda = LdaModel.load(model_loc + '/ldamodel.model')
    unseen_doc = px.prepareDoc(doc)
    dictionary = Dictionary.load(model_loc + '/dictionary.dic')

    unseen_corpus = dictionary.doc2bow(unseen_doc)
    topics = lda[unseen_corpus]

    return topics, lda
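A hypothetical call to the inference function above, assuming model_loc points at a directory containing ldamodel.model and dictionary.dic produced by a matching training step:

topics, lda = inference("Some unseen document text.", "models/run1")
for topic_id, prob in topics:
    print(topic_id, prob)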
Example #35
def get_lda_model(regenmod, lmod):
    list_models = []
    if os.path.isfile(lmod) and not regenmod:
        model = LdaModel.load(lmod)
    else:
        model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         iterations=50,
                         num_topics=2)
    list_models.append(model)
    return list_models
def save_top_words(model_file, output_file):
    lda = LdaModel.load(model_file)
    topics = lda.show_topics(-1, topn=20, formatted=False)

    topics = [[word for (_, word) in topic] for topic in topics]
    with open(output_file, 'w') as fp:
        for i, topic in enumerate(topics):
            #print topic
            line = 'Topic %d: %s\n' % (i + 1, ', '.join(topic))
            fp.write(line)
    return topics
def AuthorTopicStd():
    import nltk

    from gensim import corpora
    from gensim import matutils
    from gensim.models.ldamodel import LdaModel
    from nltk.corpus import stopwords
    from unidecode import unidecode

    TOPIC_FILE = './lda_topic.dump'
    LDA_FILE = './result.lda'
    DICTIONARY_FILE = './keywords.dict'

    with open(TOPIC_FILE, 'rb') as f:
        num_topics, topic_result = serializer.load(f)

    lda = LdaModel.load(LDA_FILE)

    dictionary = corpora.Dictionary.load(DICTIONARY_FILE)

    tokenizer = nltk.tokenize.RegexpTokenizer(r'[\w]{2,}')
    stopwords_set = set(stopwords.words())

    my_topic_cache_by_aid = [None, None]

    def calculator(aid, pid):
        if my_topic_cache_by_aid[0] == aid:
            my_topic = my_topic_cache_by_aid[1]
        else:
            my_keywords = []

            for ipid, iaid in paper_authors.get_by_aid(aid):
                paper = papers.get(ipid)
                if paper is None:
                    continue
                keywords = tokenizer.tokenize(unidecode(paper[Papers.IDX_TITLE]).lower())
                if not keywords:
                    continue
                my_keywords.extend(keywords)

            my_keywords = list(filter(lambda s: s not in stopwords_set, my_keywords))
            if not my_keywords:
                return np.nan

            my_topic = lda[dictionary.doc2bow(my_keywords)]

            my_topic_cache_by_aid[0] = aid
            my_topic_cache_by_aid[1] = my_topic

        my_topic_array = matutils.sparse2full(my_topic, num_topics)
        return np.std(my_topic_array)

    return calculator
Example #38
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        num_topics = int(math.log(len(bow_corpus)) + 1)  # assumption: topic count grows with the log of the corpus size
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
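A hypothetical call to getLdaModel above, assuming USESAVED and file_lda_model are configuration constants defined elsewhere in the module:

lda_model = getLdaModel(bow_corpus, dictionary, useSavedTill=USESAVED.lda_model)
print(lda_model.show_topics(num_topics=5, num_words=8))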
 def __init__(self):
     self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
     self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
     self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
     self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
     self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
     self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
     self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
     self.job_labels = {
         int(k): v
         for k, v in (line.split("=") for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
     }
	def fetch_model(dictionary):
		print "Fetching LDA Model... ",
		try:
			lda = LdaModel.load('Topic/lda.tm')
			print "LDA Model loaded!"
		except IOError:
			print "Model not found, building LDA..."
			corpus=MyCorpus()
			#lda = LdaModel(corpus,num_topics=50,update_every=1,chunksize=1000,passes=15)
			lda = LdaModel(corpus,num_topics=50,id2word=dictionary,update_every=1,chunksize=1000,passes=50)
			print "LDA Built!"
			lda.save('Topic/lda.tm')
		return lda
Example #41
	def update(self, docs):
		# load dictionary and model
		self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
		self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

		# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
		docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]

		numPass = self.config.getIntConfig("train.num.pass")[0]
		self.ldaModel.update(docTermMatrix, passes=numPass)

		docTopicDistr = self.getDocumentTopics(docTermMatrix)
		return docTopicDistr
Example #42
  def SNAP_ldaTopicsForTopic(self, topic, numTopics = 10):
    if numTopics not in [5, 10, 20, 30]:
      print("[ERROR] Invalid numTopics")
      return
    inPath = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      'snap_data',
      "gensim_snap_lda_%s_%d" % (topic, numTopics)
    )
    lda = LdaModel.load(inPath)
    return lda.print_topics(numTopics)

  ##################
  #
  ##################
def main():
    logformat = '%(asctime)s %(name)-12s: %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=logformat)
    kera = NOB_kera()
    es = Elasticsearch(port=9201)
    mod = LdaModel.load(modelfile)
    vocab = Dictionary.load(vocabulary)
    tfidf = TfidfModel(dictionary=vocab)
    results = []
    for (topics, topicid) in get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf):
        res = es.search(index='wiki4', body={"query": {"match": {"_all": topics}}}, size=num_results_from_es)
        results.append({'topics': topics, 'result': res, 'topicid': topicid})
    results = add_keywords(results, kera)
    df = pd.DataFrame(results)
    df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
Example #44
def get_lda_model(num_topics):
    file_name = None

    if num_topics == 10:
        file_name = LDA_FILE_10
    elif num_topics == 30:
        file_name = LDA_FILE_30
    elif num_topics == 60:
        file_name = LDA_FILE_60
    elif num_topics == 120:
        file_name = LDA_FILE_120
    else:
        raise ValueError("bad number of topics")

    return LdaModel.load(file_name)
def main():
    file = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/nowiki_v2_3pass_lda_250'
    mod = LdaModel.load(file)
    vocab_path = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/voc_vocabulary_0.vocab'
    vocab = Dictionary.load(vocab_path)
    corpfile = 'f:/projects/comperio-text-analytics/models/topicmodel/mojo_lda_100.corp'
    corpus = gensim.corpora.MmCorpus(corpfile)

    print mod.show_topic(0)
    print mod.id2word
    mod.id2word = vocab

    print mod.show_topic(0)

    pydavis = pyLDAvis.gensim.prepare(mod, corpus, vocab)
    pyLDAvis.save_html(pydavis, 'pydavis_250_v2_3passes.html')
    pyLDAvis.show(pydavis)
Example #46
def setup(files):
    # setup the output directory
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_dir = '../browser/json/' + base_model_name + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # load the topic model
    model = LdaModel.load(files.model)
    # load replacements used
    bug_to_id = json.loads(open(files.replacements).read())
    # invert to id<->bug map, ditching s. genus terms
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}
    # load the docsXwords and docsXtopics matrices (in sparse format)
    corpus = mmcorpus.MmCorpus(files.corpus)
    docsXwords_sparse = corpus2csc(corpus, num_terms=len(model.id2word.token2id)).T
    docsXtopics = mmcorpus.MmCorpus(files.docsXtopics)
    docsXtopics_sparse = corpus2csc(docsXtopics).T
    return docsXtopics_sparse, docsXwords_sparse, id_to_bug, model, output_dir
    def read_model(self):
        self.dictionary = corpora.Dictionary.load(DICT)
        self.bow_corpus = corpora.MmCorpus(BOW_CORPUS)
        self.lda_model = LdaModel.load(MODEL)
        self.logit_classifier = joblib.load(CLASSIFIER)

        corpus = []
        corpus += load_expo_cdc()
        corpus += load_lago()
        corpus += load_news()
        corpus += load_news_ic()
        corpus += load_palestras()
        corpus = preprocessing(corpus)

        test_bow = [self.dictionary.doc2bow(text) for text in corpus]
        lda_corpus = [self.lda_model[bow] for bow in test_bow]
        lda_dense = gensim.matutils.corpus2dense(lda_corpus, num_terms=TOPICS).transpose()
        probs = self.logit_classifier.predict_proba(lda_dense)
Example #48
def main(argv):
    if len(argv) < 4:
        print 'python train_lda.py group_id num_topics passes'
        sys.exit(1)
        
    group_id = argv[1]
    num_topics = int(argv[2])
    passes = int(argv[3])
    log.info('Prepare corpus for group: %s' % group_id)

    base_path = 'tables/' + group_id + '/'
    model_base_path = 'ldamodels/' + group_id + '/'
    
    # build dict and corpus
    #now = datetime.now()
    indicator = 'title-comment'
    source_path = base_path + 'corpus-topic-comment'
    
    corpus_path = model_base_path + 'corpus-'+ indicator + '-' + group_id + '.mm'
    dict_path = model_base_path + 'dict-' + indicator + '-' + group_id + '.dict'
    
    log.info('Building the dict...')
    build_dict_corpus(source_path, corpus_path, dict_path)
    
    log.info('Loading dict from pre-saved file...')
    dictionary = corpora.Dictionary.load(dict_path)
    log.info('Done')
    
    #dictionary.save_as_text(base_path + 'text-dict.txt')
    
    log.info('Build a lda model...')
    log.info('Loading corpus from pre-saved .mm file...')
    mmcorpus = corpora.MmCorpus(corpus_path)
    log.info('Done')
    
    log.info('Training lda model...')
    model = LdaModel(mmcorpus, num_topics=num_topics, id2word = dictionary, passes = passes)
    model_path = model_base_path + indicator + '-' + group_id + '.ldamodel'
    model.save(model_path)
    log.info('Done.')
    
    model = LdaModel.load(model_path)
    model.show_topics(topics=num_topics, topn=10, log=True)
    def __init__(self, fnames):
        """`fnames` is an array of files for [lda_model, distribution]"""
        print "Accumulating tweets..."
        files = glob.glob("data/relevant/*")
        self.tweets = map(lambda f: open(f).read(), files)

        print "Loding topic model..."
        self.lda = LdaModel.load(fnames[0])

        self.corpus, self.features, self.dictionary = get_params(files)

        print "Loading tweet distribution..."
        self.tweet_dist = [l for l in self.lda[self.corpus]]
        tmp = lambda dist: sorted(dist, key=lambda arr: arr[1], reverse=True)
        self.tweet_dist = map(lambda dist: tmp(dist), self.tweet_dist)
        # self.tweet_dist = json.load(open(fnames[1]))

        tmp = map(lambda t: re.sub("(\d*\.\d*\*)", "", t), self.lda.show_topics(-1))
        self.topics = map(lambda ts: re.sub("\\s\+", ",", ts), tmp)
        self.topics.reverse()
Example #50
def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]
        train_folders += [str(i) + str(j) for i in range(2, 3) for j in range(5)]

        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()

        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')

        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)

        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')

        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')

        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)

    return lda, dictionary
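A hypothetical usage of the train function above, assuming DICTIONARY_FILE, LDA_MODEL_FILE, N_TOPICS and DOC_LEN_THRESHOLD are defined elsewhere in the module:

lda, dictionary = train(refresh=False)  # load the previously saved model and dictionary
bow = dictionary.doc2bow(["an", "unseen", "document"])
print(lda.get_document_topics(bow))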
 def update(self, name, n=500, method='FastICA'):
     settings = self._setstorage.load(encode_name(name))
     clusterer = Clusterer(settings)
     
     # load the models
     dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
     ngram_size = len(dictionary[0])
     transformer = NgramTransformer(ngram_size)
     ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
     
     # get the input
     segments = self._segstorage.load(name=settings[SEGMENT_NAME], limit=int(n))
     documents = [s.value for s in segments]
     
     # prepare args
     kwargs = {'dictionary': dictionary,
               'ngramtransformer': transformer,
               'ldamodel': ldamodel,
               'method': method}
     Xt = clusterer.fit_transform(documents, **kwargs)
     labels = clusterer.assign_labels(documents)
     data = self._make_data(Xt, labels, documents)
     return json.dumps({'result': 'OK',
                        'data': data})
Example #52
 def load(self):
     self._lda = LdaModel.load(self._model_file)
     self._dictionary = Dictionary.load(self._dict_file)
Example #53
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Cluster segments')
    parser.add_argument('clustermodel', type=unicode, help='The clusterer model to use.')
    
    args = parser.parse_args()

    setstorage = MongoSettingsStorage()
    docstorage = MongoDocumentStorage()
    segstorage = MongoSegmentStorage()
    
    logger.info('Loading clusterer model')
    settings = setstorage.load(encode_name(args.clustermodel))
    dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
    ngram_size = len(dictionary[0])
    transformer = NgramTransformer(ngram_size)
    ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
    logger.info('Clusterer model loaded!')
    
    kwargs = {'dictionary': dictionary,
                  'ngramtransformer': transformer,
                  'ldamodel': ldamodel,
                  'method': 'LDA'}
    
    
    logger.info('Fitting clusterer')
    clusterer = Clusterer(settings)
    texts, labels = clusterer.get_training_data()
    clusterer.fit(texts, labels, **kwargs)
    logger.info('Fitting completed!')
    
    # TODO: implement get_params and set_params for clusterer tool to allow cross-validation for better score estimation
# Make / Load LDA result

def make_lda_result():
    lda = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=2, iterations=1000)

    # save LDA result
    lda.save(LDA_FILE)


if os.path.isfile(LDA_FILE):
    if input('Do you want to reload LDA result? (yes|otherwise)') == 'yes':
        make_lda_result()
else:
    make_lda_result()

lda = LdaModel.load(LDA_FILE)


# 4. Make and Save topic belief for each publication


with open(PUBLICATION_KEYWORDS_FILE, 'rb') as f:
    publication_keywords = serializer.load(f)

topic_result = dict()

for i, (pub_id, keywords) in enumerate(publication_keywords.items()):
    pub_topic = dict(lda[corpus[i]])
    if len(pub_topic) == 0:
        continue
    topic_belief = np.array([pub_topic.get(j, 0.0) for j in range(NUM_TOPICS)])
Example #55
 def __init__(self):
     self.model = LdaModel.load(settings.lda_model_name)
     self.dictionary = Dictionary.load_from_text(settings.wordids_txt)
def AuthorPaperTopicSim():
    import nltk

    from gensim import corpora
    from gensim import matutils
    from gensim.models.ldamodel import LdaModel
    from nltk.corpus import stopwords
    from unidecode import unidecode

    TOPIC_FILE = './lda_topic.dump'
    LDA_FILE = './result.lda'
    DICTIONARY_FILE = './keywords.dict'

    with open(TOPIC_FILE, 'rb') as f:
        num_topics, topic_result = serializer.load(f)

    lda = LdaModel.load(LDA_FILE)

    dictionary = corpora.Dictionary.load(DICTIONARY_FILE)

    tokenizer = nltk.tokenize.RegexpTokenizer(r'[\w]{2,}')
    stopwords_set = set(stopwords.words())

    my_topic_cache_by_aid = [None, None]

    def calculator(aid, pid):
        paper = papers.get(pid)
        if paper is None or paper[Papers.IDX_PUB_ID] is None:
            return np.nan

        publication = publications.get(paper[Papers.IDX_PUB_ID])

        pub_ori_id = publication[Publications.IDX_ORIGINAL_ID]
        if pub_ori_id not in topic_result:
            return np.nan

        publication_topic = topic_result[pub_ori_id]

        if my_topic_cache_by_aid[0] == aid:
            my_topic = my_topic_cache_by_aid[1]
        else:
            my_keywords = []

            for ipid, iaid in paper_authors.get_by_aid(aid):
                paper = papers.get(ipid)
                if paper is None:
                    continue
                keywords = tokenizer.tokenize(unidecode(paper[Papers.IDX_TITLE]).lower())
                if not keywords:
                    continue
                my_keywords.extend(keywords)

            my_keywords = list(filter(lambda s: s not in stopwords_set, my_keywords))
            if not my_keywords:
                return np.nan

            my_topic = lda[dictionary.doc2bow(my_keywords)]

            my_topic_cache_by_aid[0] = aid
            my_topic_cache_by_aid[1] = my_topic

        # Use Hellinger distance
        my_topic_array = matutils.sparse2full(my_topic, num_topics)
        sim = np.sqrt(0.5 * ((np.sqrt(my_topic_array) - np.sqrt(publication_topic)) ** 2).sum())

        return sim

    return calculator
import time

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from scipy import sparse
from gensim import corpora
from gensim.models.ldamodel import LdaModel

corpusType = "all_"
subDirectory = 'run_sraa'
t1 = time.time()

corpus = corpora.MmCorpus(subDirectory+'/'+ corpusType+'corpus.mm')
# dictionary = corpora.dictionary.Dictionary.load(subDirectory+''+ corpusType+'/dictionary.dict')
classes = np.loadtxt(subDirectory+'/'+ corpusType+'classes.dat',dtype=int)
model = LdaModel.load(subDirectory+'/'+corpusType+'sraa.lda_model')

numFeatures = model.num_topics
numData = len(corpus)
numNodes = numData + numFeatures + 2

sparseData = []
for data in corpus:
    sparseData.append(model[data])

A = sparse.lil_matrix((numNodes,numNodes))

# features: 0-numFeatures
# data: numFeature-(numFeature+numData)
# label: (numFeature+numData), (numFeature+numData+1)
# connect datas to features
Example #58
import json

import numpy as np
import pandas as pd
import graphlab
from gensim import corpora
from gensim.models.ldamodel import LdaModel

groups_users = json.load(open('data/app_gensim/groups_users_filt.txt'))
users_groups = json.load(open('data/app_gensim/users_groups_filt.txt'))
users_topics = json.load(open('data/app_gensim/users_topics_filt.txt'))
groups_topics = json.load(open('data/app_gensim/group_topics.txt'))

# member_data = pd.read_pickle('data/memfiltcleanfinal.pkl')
member_data = pd.read_pickle('data/app_final_filt_members_data.pkl')
# member_data['id'] = member_data['id'].apply(lambda x: str(x))
# member_data.set_index('id', inplace = True)

group_data = pd.read_pickle('data/app_final_groups_data.pkl')
loaded_model = graphlab.load_model('data/groups_model')
dictionary = corpora.Dictionary.load('data/app_gensim/dictionary.dict')
corpus = corpora.MmCorpus('data/app_gensim/corpus_tfidf.mm')
# loaded_model.get_similar_users(users = ['68157442'], k=10)
lda = LdaModel.load('data/app_gensim/model.lda')
#group index --> {group_ind: [groupid, grouptext]}
lda_dict = json.load(open('data/app_gensim/lda_dict.txt'))

users_wanted = list(member_data.index.values)
member_data['score']= member_data['n_connected']+ member_data['n_topics']
groups_topics = json.load(open('data/app_gensim/group_topics.txt'))
users_groups = json.load(open('data/app_gensim/users_groups_filt.txt'))


dfref = member_data.to_dict('dict')

users_sims = np.load("users_sims.npy")


def get_sim_score(user_id):
Example #59
import sys
from operator import itemgetter

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

from news.document import Tokenizer

if len(sys.argv) != 2:
    print 'Usage: {0} rcv1_data_dir'.format(sys.argv[0])
    raise SystemExit(1)

data_dir = sys.argv[1]
dictionary_file = data_dir+'/id_token_df'
model_file = data_dir+'/lda_model'

print 'creating tokenizer...'
dictionary = Dictionary.load_from_text(dictionary_file)
tok = Tokenizer(dictionary)

print 'loading model...'
lda = LdaModel.load(model_file)

while True:
    text = raw_input('enter text (q to quit): ')
    if text == 'q':
        print 'bye!'
        break
    doc = tok.text2bow(text)
    topics = lda[doc]
    for topic,weight in sorted(topics,key=itemgetter(1),reverse=True):
        print weight,lda.show_topic(topic,topn=4)