Example #1
    def load(self, path='default'):
        """
        :param path: directory containing the trained model and its pickled data.
        :return: None
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(path + '/lda.model')
        elif self.model_name == 'lsi':
            self.model = LsiModel.load(path + '/lsi.model')
        elif self.model_name == 'hdp':
            self.model = HdpModel.load(path + '/hdp.model')

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        #self.iterations = self.model.iterations

        with open(path + '/original_data.pickle', 'rb') as f:
            self.original_data = pickle.load(f)
        with open(path + '/text.pickle', 'rb') as f:
            self.text = pickle.load(f)
        with open(path + '/token.pickle', 'rb') as f:
            self.token = pickle.load(f)
        with open(path + '/corpus.pickle', 'rb') as f:
            self.corpus = pickle.load(f)

        path = path + '/result'
        with open(path + '/topic_key.pickle', 'rb') as f:
            self.topic_key = pickle.load(f)
        with open(path + '/doc_topic.pickle', 'rb') as f:
            self.doc_topic = pickle.load(f)
        with open(path + '/topic_doc.pickle', 'rb') as f:
            self.topic_doc = pickle.load(f)
        with open(path + '/topic_sent.pickle', 'rb') as f:
            self.topic_sent = pickle.load(f)

        # Recompute the topic count now that the results are loaded; for HDP
        # the effective number of topics comes from the saved topic_doc matrix.
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics
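A minimal usage sketch, assuming this method belongs to a topic-modelling wrapper class (called TopicModeler here purely for illustration) whose save step produced the same directory layout:

# Hypothetical driver; `TopicModeler` stands in for whatever class owns load().
tm = TopicModeler()
tm.load('model')  # directory holding <name>.model plus the pickled artefacts
print(tm.model_name, tm.num_topics)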
Example #2

import spacy
import dataset
from gensim.models import HdpModel


# `engine_kwargs` and `insert_corpus_sentence_links` are defined elsewhere in
# the source module.
def add_topics(args):
    print(args)

    # spaCy 2.x shorthand; spaCy 3+ requires a full model name such as
    # "en_core_web_sm".
    nlp = spacy.load("en", disable=["parser", "ner"])

    def tozenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        allowed_postags = set(allowed_postags)
        docs = nlp.pipe(texts)
        text_tokens = []
        for doc in docs:
            tokens = [token.lemma_ for token in doc if
                      token.pos_ in allowed_postags and not token.is_punct and not token.is_stop]
            text_tokens.append(tokens)
        return text_tokens

    model = HdpModel.load(args["topic_model"])
    corpus_dict = model.id2word

    topics = model.show_topics(num_topics=args["num_topics"], num_words=args["num_terms"], log=False, formatted=False)

    topics_to_save = []
    for topic in topics:
        topic_dict = {}
        topic_terms = ", ".join([t[0] for t in topic[1]])
        topic_dict["topic_id"] = int(topic[0])
        topic_dict["terms"] = topic_terms

        topics_to_save.append(topic_dict)

    database = args["database"]
    dataset_db = f"sqlite:///{database}"
    with dataset.connect(dataset_db, engine_kwargs=engine_kwargs) as db:
        db.create_table("corpus_topics")

        topic_ids = db["corpus_topics"].insert_many(topics_to_save)
        print(topic_ids)

        print(topics_to_save)

        batch = []
        for sentence in db['sentence']:
            batch.append(sentence)

            if len(batch) == args["batch_size"]:
                insert_corpus_sentence_links(batch, corpus_dict, db, model, tozenize)
                batch = []

        if len(batch) > 0:
            insert_corpus_sentence_links(batch, corpus_dict, db, model, tozenize)

        db["corpus_topics_sentences"].create_index(['sentence_id'])
        db["corpus_topics_sentences"].create_index(['topic_id'])
Example #3
import logging

from gensim.models import LdaModel, LsiModel, HdpModel
# LdaMallet lives in gensim 3.x; it was dropped in gensim 4.
from gensim.models.wrappers import LdaMallet

logger = logging.getLogger(__name__)


def _load_model(model_type, fname):
    logger.info(f'{model_type} model {fname} is loading...')
    try:
        if model_type == 'lsi':
            return LsiModel.load(f'../model/lsi_model/{fname}')
        elif model_type == 'lda':
            return LdaModel.load(f'../model/lda_model/{fname}')
        elif model_type == 'mallet':
            return LdaMallet.load(f'../model/mallet_model/{fname}')
        elif model_type == 'hdp':
            return HdpModel.load(f'../model/hdp_model/{fname}')
    except Exception as ex:
        logger.warning(f'{model_type} type of {fname} could not be loaded.',
                       exc_info=ex)
        return None
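A short call sketch; the file name is hypothetical and would have to exist under ../model/lda_model/.

model = _load_model('lda', 'news.model')
if model is None:
    raise SystemExit('model could not be loaded')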
Example #4

import pandas as pd
from gensim.models import HdpModel


# `get_relevant_topics` is a helper defined elsewhere in the source module.
def get_topic_model_subset(tweets, args):
    hdp = HdpModel.load('models/daily_topics/' + args.date + '_topics.model')
    topic_dists = pd.read_csv('data/daily_topic_distributions/' + args.date +
                              '.tsv',
                              sep='\t',
                              lineterminator='\n')
    topic_dists = topic_dists[
        topic_dists['probability'] >= args.prob_threshold].reset_index(
            drop=True)

    relevant_topics = get_relevant_topics(hdp, args.keywords, topn=args.topn)
    relevant_ids = list(
        set(topic_dists[topic_dists['topic'].isin(relevant_topics)]
            ['tweet_id'].values))
    subset = tweets[tweets['tweet_id'].isin(relevant_ids)]

    return subset
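A hedged sketch of the inputs, assuming tweets is a pandas DataFrame with a tweet_id column; the date, threshold, and keywords below are made up for illustration.

from argparse import Namespace

args = Namespace(date='2020-04-01', prob_threshold=0.5,
                 keywords=['lockdown'], topn=20)
subset = get_topic_model_subset(tweets, args)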
Example #5
    def get_model(self, lang: str, data_version: int,
                  dictionary_version: float, model_version: str,
                  param_name: str, param_version: int,
                  language_processed_data: list, model_view: bool):
        if self.model is None:
            logging.info("--- Getting HDP model")
            model_file_path = Advisor.get_model_type_folders_file_path(
                lang, data_version, dictionary_version, model_version,
                param_name, param_version, self.model_type, "HDP-model")
            if path.exists(model_file_path):
                self.model = HdpModel.load(model_file_path)
            else:
                logging.info("---- HDP model was crated before")
                self.set_model(lang, data_version, dictionary_version,
                               model_version, param_name, param_version,
                               model_file_path, language_processed_data)
        logging.info("--- HDP model captured")
        if model_view:
            self.visualization.get_model_visualizations(
                self.model_type, self.model, self.essentials.corpus,
                language_processed_data)
        return self.model
        print("Demo 1:\n%s" % demo1)
        print(get_lda_best_topic_words(demo1, dictionary, lda))
        print("Demo 2:\n%s" % s1)
        print(get_lda_best_topic_words(s1, dictionary, lda))
        print("Demo 3:\n%s" % s2)
        print(get_lda_best_topic_words(s2, dictionary, lda))
        print("Demo 4:\n%s" % s3)
        print(get_lda_best_topic_words(s3, dictionary, lda))
        print("Demo 5:\n%s" % s4)
        print(get_lda_best_topic_words(s4, dictionary, lda))
    elif "lsa" in model_path:
        lsi = LsiModel.load(model_path)
        print("Demo 1:\n%s" % demo1)
        print(print_lsa_topic(demo1, dictionary, lsi))
        print("Demo 2:\n%s" % s1)
        print(print_lsa_topic(s1, dictionary, lsi))
        print("Demo 3:\n%s" % s2)
        print(print_lsa_topic(s2, dictionary, lsi))
        print("Demo 4:\n%s" % s3)
        print(print_lsa_topic(s3, dictionary, lsi))
        print("Demo 5:\n%s" % s4)
        print(print_lsa_topic(s4, dictionary, lsi))
        print(get_lsa_topic_embeding(s4, dictionary, lsi, w2Id, embeddings))
    elif "hdp" in model_path:
        hdp = HdpModel.load(model_path)
        print("Demo 1:\n%s" % demo1)
        print(print_hdp(demo1, dictionary, hdp))

    end = time()
    print("Total processing time: %d seconds" % (end - begin))
Example #7
def load_hdp_model(filepath):
    return HdpModel.load(filepath)
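Usage is a single call; the path below is hypothetical.

hdp = load_hdp_model('models/my_corpus.hdp')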
Example #8

from time import time

import gensim

# `doc_term_matrix`, `dictionary`, MODEL_FILE, DICT_FILE and CORPUS_FILE are
# prepared earlier in the source script.
start = time()
Hdp = gensim.models.hdpmodel.HdpModel
hdpmodel = Hdp(doc_term_matrix, id2word=dictionary)

print('used: {:.2f}s'.format(time() - start))
print(hdpmodel.print_topics(num_topics=2, num_words=4))

for i in hdpmodel.print_topics():
    for j in i:
        print(j)

hdpmodel.save(MODEL_FILE)

from gensim.models import HdpModel
loading = HdpModel.load(MODEL_FILE)
print(loading.print_topics(num_topics=2, num_words=4))

import pyLDAvis.gensim
import gensim

d = gensim.corpora.Dictionary.load(DICT_FILE)
c = gensim.corpora.MmCorpus(CORPUS_FILE)
hdp = gensim.models.HdpModel.load(MODEL_FILE)

# The following fails: hdp_to_lda() returns (alpha, beta) arrays rather than
# an LdaModel, so pyLDAvis cannot consume its output.
#lda = hdp.hdp_to_lda()
#data = pyLDAvis.gensim.prepare(lda, c, d)

data = pyLDAvis.gensim.prepare(hdp, c, d)
#print (data)
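If pyLDAvis should receive a genuine LdaModel instead, gensim's suggested_lda_model() converts the fitted HDP into one; a minimal sketch reusing the objects loaded above:

# suggested_lda_model() returns an LdaModel approximating the HDP posterior.
lda = hdp.suggested_lda_model()
data = pyLDAvis.gensim.prepare(lda, c, d)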
Example #9
__author__ = "Emanuel Juliano Morais Silva"
__email__ = "*****@*****.**"

import pickle
from gensim.models import CoherenceModel, HdpModel
from gensim.corpora import Dictionary, MmCorpus
import pyLDAvis.gensim
import warnings
warnings.filterwarnings('ignore')  # Let's clean the output

hdpmodel = HdpModel.load('hdp_model_spacy.gensim')
dictionary = Dictionary.load('hdp_dictionary.dict')
corpus = MmCorpus('hdp_corpus.mm')
with open("texts.txt", "rb") as fp:  # Unpickling
    texts = pickle.load(fp)

print("Files loaded")

topic_info = hdpmodel.print_topics()
for topic in topic_info:
    print(topic)

# Convert the fitted HDP model to its closest LDA equivalent.
vis_hdpmodel = hdpmodel.suggested_lda_model()

# Keep only the words (dropping the probabilities) for each topic.
vis_hdp = [[word for word, prob in topic]
           for topicid, topic in vis_hdpmodel.show_topics(formatted=False)]

coherence = CoherenceModel(topics=vis_hdp[:10],
                           texts=texts,
                           dictionary=dictionary,
                           window_size=10).get_coherence()
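The score is computed but never reported; assuming the script is meant to print it, a one-line follow-up would be:

print("c_v coherence over the first 10 topics: %.4f" % coherence)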