def load(self, path='default'):
    """
    Restore a previously saved topic model together with its pickled data.

    The model type is the stem of the ``*.model`` file found in ``path``
    (``lda``, ``lsi`` or ``hdp``); if several such files exist, the last one
    returned by ``os.listdir`` wins. Pickled inputs are read from ``path``
    and pickled results from ``path/result``. Every pickle is stored on
    ``self`` under the attribute that matches its file stem.

    :param path: directory of the trained model; ``'default'`` is an alias
        for ``'model'``.
    :return: None, the loaded state is stored on ``self``.
    """
    if path == 'default':
        path = 'model'

    for entry in os.listdir(path):
        if entry.endswith('.model'):
            self.model_name = entry.split('.')[0]

    # Only the branch that matches touches its model class.
    if self.model_name == 'lda':
        self.model = LdaModel.load(os.path.join(path, 'lda.model'))
    elif self.model_name == 'lsi':
        self.model = LsiModel.load(os.path.join(path, 'lsi.model'))
    elif self.model_name == 'hdp':
        self.model = HdpModel.load(os.path.join(path, 'hdp.model'))
    self.id2word = self.model.id2word

    # NOTE: pickle.load can execute arbitrary code; only point this at
    # directories produced by a trusted save step.
    pickled = (
        (path, ('original_data', 'text', 'token', 'corpus')),
        (os.path.join(path, 'result'),
         ('topic_key', 'doc_topic', 'topic_doc', 'topic_sent')),
    )
    for folder, names in pickled:
        for name in names:
            # The context manager closes the file even if unpickling fails.
            with open(os.path.join(folder, name + '.pickle'), 'rb') as handle:
                setattr(self, name, pickle.load(handle))

    # For HDP the topic count is taken from the stored topic/document matrix.
    if self.model_name == 'hdp':
        self.num_topics = self.topic_doc.shape[0]
    else:
        self.num_topics = self.model.num_topics
def add_topics(args):
    """
    Store the topics of a saved HDP model and link corpus sentences to them.

    Reads ``topic_model``, ``num_topics``, ``num_terms``, ``database`` and
    ``batch_size`` from ``args``. The topics are written to the
    ``corpus_topics`` table; rows of the ``sentence`` table are then handed
    to ``insert_corpus_sentence_links`` in batches of ``batch_size``.

    :param args: mapping with the keys listed above.
    """
    print(args)
    nlp = spacy.load("en", disable=["parser", "ner"])

    def tozenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        """Return, per text, the lemmas of non-stop, non-punctuation tokens
        whose POS tag is in ``allowed_postags``."""
        wanted = set(allowed_postags)
        return [
            [
                tok.lemma_
                for tok in parsed
                if tok.pos_ in wanted and not tok.is_punct and not tok.is_stop
            ]
            for parsed in nlp.pipe(texts)
        ]

    model = HdpModel.load(args["topic_model"])
    corpus_dict = model.id2word
    shown_topics = model.show_topics(
        num_topics=args["num_topics"],
        num_words=args["num_terms"],
        log=False,
        formatted=False,
    )

    # One row per topic: its id plus a comma-separated list of its terms.
    topics_to_save = []
    for shown in shown_topics:
        joined_terms = ", ".join([pair[0] for pair in shown[1]])
        topics_to_save.append({"topic_id": int(shown[0]), "terms": joined_terms})

    database = args["database"]
    dataset_db = f"sqlite:///{database}"

    with dataset.connect(dataset_db, engine_kwargs=engine_kwargs) as db:
        db.create_table("corpus_topics")
        topic_ids = db["corpus_topics"].insert_many(topics_to_save)
        print(topic_ids)
        print(topics_to_save)

        def flush(rows):
            insert_corpus_sentence_links(rows, corpus_dict, db, model, tozenize)

        pending = []
        for sentence in db['sentence']:
            pending.append(sentence)
            if len(pending) == args["batch_size"]:
                flush(pending)
                pending = []
        # Whatever is left after the last full batch.
        if pending:
            flush(pending)

        # presumably this table is filled by insert_corpus_sentence_links;
        # verify against that helper
        db["corpus_topics_sentences"].create_index(['sentence_id'])
        db["corpus_topics_sentences"].create_index(['topic_id'])
def _load_model(model_type, fname):
    """
    Load a saved topic model of the given type.

    :param model_type: one of ``'lsi'``, ``'lda'``, ``'mallet'`` or ``'hdp'``.
    :param fname: file name inside the directory used for that model type.
    :return: the loaded model, or ``None`` when the type is not recognised
        or loading raised (the failure is logged as a warning).
    """
    logger.info(f'{model_type} type of {fname} is loading..')

    # Loaders are deferred so a model class is only looked up for the type
    # that was actually requested.
    loaders = (
        ('lsi', lambda: LsiModel.load(f'../model/lsi_model/{fname}')),
        ('lda', lambda: LdaModel.load(f'../model/lda_model/{fname}')),
        ('mallet', lambda: LdaMallet.load(f'../model/mallet_model/{fname}')),
        # NOTE(review): 'hdp' reads from the mallet_model directory - confirm
        # this is intended and not a copy/paste slip.
        ('hdp', lambda: HdpModel.load(f'../model/mallet_model/{fname}')),
    )

    try:
        for kind, loader in loaders:
            if model_type == kind:
                return loader()
    except Exception as ex:
        logger.warning(f'{model_type} type of {fname} could not be loaded.', exc_info=ex)
    return None
def get_topic_model_subset(tweets, args):
    """
    Return the tweets assigned to topics relevant to ``args.keywords``.

    Loads the HDP model and the topic-distribution TSV for ``args.date``,
    keeps distribution rows whose ``probability`` is at least
    ``args.prob_threshold``, and selects the tweets whose ``tweet_id``
    appears in a row for one of the topics returned by
    ``get_relevant_topics``.

    :param tweets: table with a ``tweet_id`` column (presumably a pandas
        DataFrame; verify against caller).
    :param args: object exposing ``date``, ``prob_threshold``, ``keywords``
        and ``topn``.
    :return: the matching rows of ``tweets``.
    """
    model_path = 'models/daily_topics/' + args.date + '_topics.model'
    hdp = HdpModel.load(model_path)

    dist_path = 'data/daily_topic_distributions/' + args.date + '.tsv'
    topic_dists = pd.read_csv(dist_path, sep='\t', lineterminator='\n')
    confident = topic_dists['probability'] >= args.prob_threshold
    topic_dists = topic_dists[confident].reset_index(drop=True)

    relevant_topics = get_relevant_topics(hdp, args.keywords, topn=args.topn)
    on_topic = topic_dists['topic'].isin(relevant_topics)
    # De-duplicate ids before the membership test.
    relevant_ids = list(set(topic_dists[on_topic]['tweet_id'].values))

    return tweets[tweets['tweet_id'].isin(relevant_ids)]
def get_model(self, lang: str, data_version: int, dictionary_version: float, model_version: str, param_name: str, param_version: int, language_processed_data: list, model_view: bool):
    """
    Return the HDP model, loading or building it on first use.

    When ``self.model`` is ``None`` the model file path is obtained from
    ``Advisor.get_model_type_folders_file_path``; an existing file is loaded
    with ``HdpModel.load``, otherwise ``self.set_model`` is called with the
    same arguments plus that path. When ``model_view`` is true the
    visualisations are requested before returning.

    :return: ``self.model``.
    """
    if self.model is None:
        logging.info("--- Getting HDP model")
        hdp_path = Advisor.get_model_type_folders_file_path(
            lang, data_version, dictionary_version, model_version,
            param_name, param_version, self.model_type, "HDP-model")

        if not path.exists(hdp_path):
            # NOTE(review): this message is logged when the file does NOT
            # exist, and "crated" looks like a typo - confirm the wording.
            logging.info("---- HDP model was crated before")
            self.set_model(
                lang, data_version, dictionary_version, model_version,
                param_name, param_version, hdp_path,
                language_processed_data)
        else:
            self.model = HdpModel.load(hdp_path)

        logging.info("--- HDP model captured")

    if model_view:
        self.visualization.get_model_visualizations(
            self.model_type, self.model, self.essentials.corpus,
            language_processed_data)

    return self.model
print("Demo 1:\n%s" % demo1) print(get_lda_best_topic_words(demo1, dictionary, lda)) print("Demo 2:\n%s" % s1) print(get_lda_best_topic_words(s1, dictionary, lda)) print("Demo 3:\n%s" % s2) print(get_lda_best_topic_words(s2, dictionary, lda)) print("Demo 4:\n%s" % s3) print(get_lda_best_topic_words(s3, dictionary, lda)) print("Demo 5:\n%s" % s4) print(get_lda_best_topic_words(s4, dictionary, lda)) elif "lsa" in model_path: lsi = LsiModel.load(model_path) print("Demo 1:\n%s" % demo1) print(print_lsa_topic(demo1, dictionary, lsi)) print("Demo 2:\n%s" % s1) print(print_lsa_topic(s1, dictionary, lsi)) print("Demo 3:\n%s" % s2) print(print_lsa_topic(s2, dictionary, lsi)) print("Demo 4:\n%s" % s3) print(print_lsa_topic(s3, dictionary, lsi)) print("Demo 5:\n%s" % s4) print(print_lsa_topic(s4, dictionary, lsi)) print(get_lsa_topic_embeding(s4, dictionary, lsi, w2Id, embeddings)) elif "hdp" in model_path: hdp = HdpModel.load(model_path) print("Demo 1:\n%s" % demo1) print(print_hdp(demo1, dictionary, hdp)) end = time() print("Total processing time: %d seconds" % (end - begin))
def load_hdp_model(filepath):
    """Return the model that ``HdpModel.load`` reads from ``filepath``."""
    model = HdpModel.load(filepath)
    return model
# Build an HDP model from doc_term_matrix and report how long construction took.
start = time()
Hdp = gensim.models.hdpmodel.HdpModel
hdpmodel = Hdp(doc_term_matrix, id2word=dictionary)
print('used: {:.2f}s'.format(time() - start))

# Print two topics with four words each, then every element of every entry
# returned by print_topics() with default arguments.
print(hdpmodel.print_topics(num_topics=2, num_words=4))
for i in hdpmodel.print_topics():
    for j in i:
        print(j)

# Save the model, load it back and print the same two-topic summary.
hdpmodel.save(MODEL_FILE)
from gensim.models import HdpModel
loading = HdpModel.load(MODEL_FILE)
print(loading.print_topics(num_topics=2, num_words=4))

# Load dictionary, corpus and model from disk and pass them to pyLDAvis.
import pyLDAvis.gensim
import gensim
d = gensim.corpora.Dictionary.load(DICT_FILE)
c = gensim.corpora.MmCorpus(CORPUS_FILE)
hdp = gensim.models.HdpModel.load(MODEL_FILE)
# The following variant produced an error, so the HDP model is passed directly:
# lda = hdp.hdp_to_lda()
# data = pyLDAvis.gensim.prepare(lda, c, d)
data = pyLDAvis.gensim.prepare(hdp, c, d)
# print(data)
# Script: load a saved HDP model with its dictionary, corpus and pickled
# texts, print the topics, and compute a coherence score for the first ten
# topics of the suggested LDA model.
__author__ = "Emanuel Juliano Morais Silva"
__email__ = "*****@*****.**"

import pickle
from gensim.models import CoherenceModel, HdpModel
from gensim.corpora import Dictionary, MmCorpus
import pyLDAvis.gensim
import warnings

warnings.filterwarnings('ignore')  # Silence all warnings to keep the output clean

# Artefacts are read from fixed file names relative to the working directory.
hdpmodel = HdpModel.load('hdp_model_spacy.gensim')
dictionary = Dictionary.load('hdp_dictionary.dict')
corpus = MmCorpus('hdp_corpus.mm')

# texts.txt is a pickle despite its extension, hence binary mode.
# NOTE: pickle.load can execute arbitrary code; only use trusted files.
with open("texts.txt", "rb") as fp:  # Unpickling
    texts = pickle.load(fp)

print("Files loaded")

# Print each entry returned by print_topics() on its own line.
topic_info = hdpmodel.print_topics()
for topic in topic_info:
    print(topic)

# Keep only the words (dropping the probabilities) of each topic shown by the
# suggested LDA model.
vis_hdpmodel = hdpmodel.suggested_lda_model()
vis_hdp = [[word for word, prob in topic] for topicid, topic in vis_hdpmodel.show_topics(formatted=False)]

# Coherence is computed over the first ten of those word lists only.
coherence = CoherenceModel(topics=vis_hdp[:10], texts=texts, dictionary=dictionary, window_size=10).get_coherence()