Example #1
def generate_topic_words():  # Output each model's top topic terms to CSV files.
    pbar = tqdm.tqdm(total=len(models_path))
    i = 0
    for model_path in models_path:
        df = pd.DataFrame()
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)
        topics_dictionary = lda_model.show_topics(num_topics=num_topics[i],
                                                  num_words=30,
                                                  formatted=False)

        for topic in topics_dictionary:
            topic_num = topic[0] + 1
            terms_string = ''
            for term in topic[1]:
                terms_string += term[0] + ', '
            df = df.append(pd.Series([topic_num, terms_string[:-2]]),
                           ignore_index=True)

        output_path = f'./turn-in/{bigram_threshold}/topic_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df.columns = ['Topic', 'Terms']
        df.sort_values(by=['Topic'], ascending=True,
                       inplace=True)  # Sort rows by topic number
        df.to_csv(output_path, index=False)
        pbar.update(1)
        i = i + 1
    pbar.close()
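This function (like several later examples from the same project) reads module-level globals — models_path, num_topics, iteration, bigram_threshold — and a create_file helper defined elsewhere. A minimal setup sketch, with illustrative values only:

import os
import tqdm
import pandas as pd
from gensim import models
from gensim.models.wrappers import LdaMallet  # gensim 3.x; the wrapper was removed in gensim 4

num_topics = [10, 20, 30]                                  # one entry per saved model (illustrative)
models_path = [f'./models/mallet_{k}.model' for k in num_topics]
iteration = 1000                                           # iterations passed to malletmodel2ldamodel
bigram_threshold = 100                                     # only used to build the output path

def create_file(path):
    # Ensure the parent directory exists so to_csv() can write the file.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)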
Example #2
def explore_temporal_trends(topics, gender):
    model = LdaMallet.load('ldamodel.' + gender + '.' + str(topics))
    topic_words = model.show_topics(num_topics=topics, num_words=100, formatted=False)
    words_only = [(tp[0], [wd[0] for wd in tp[1]]) for tp in topic_words]

    words = list()
    for tp in words_only:
        words.extend(tp[1])

    weekly_stats = dict()
    data = Serialization.load_obj('week2comments.' + gender)
    for week in sorted(data):  # iterate weeks in order; the dict may have gaps
        print('processing data for week', week)
        week_data = ' '.join(data[week]).lower()
        tokens = week_data.split()
        total = len(tokens)

        for word in set(words):
            current_word_stats = weekly_stats.get(word, list())
            current_word_stats.append(float(tokens.count(word)) / total)
            weekly_stats[word] = current_word_stats
        # end for
    # end for

    for word in weekly_stats:
        y = np.array(weekly_stats[word])
        x = np.arange(len(y)).reshape(-1, 1)
        model = LinearRegression().fit(x, y)
        print(word, model.coef_[0])
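A hypothetical invocation, assuming a MALLET model saved as 'ldamodel.female.20' and a serialized dict 'week2comments.female' mapping week numbers to lists of comment strings; the coefficient printed per word is its weekly frequency trend:

explore_temporal_trends(topics=20, gender='female')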
Example #3
def chatbot_thread(bot_name, lda_path, dict_path, dom_path):

    # Get the stop words
    stop_words = stopwords.words('english')

    # Load pre-trained modules
    lda = LdaMallet.load(lda_path)
    dictionary = Dictionary.load(dict_path)
    dominant_topics = pd.read_csv(dom_path)

    # Announce the bot's entrance
    post_message(bot_name, "Hello there!", True)

    # Loop forever until program termination
    while True:
        user_message = message_queue.get()  # blocks until a message arrives

        prob_still_chat = min(random(), INITIAL_CHATTINESS)

        # Possibly post several responses to a single query, with decreasing probability
        responding = True
        while responding:
            response = get_response(user_message, lda, dictionary,
                                    dominant_topics, stop_words)
            post_message(bot_name, response, True)
            if prob_still_chat < CHATTINESS:
                responding = False
            else:
                prob_still_chat *= prob_still_chat
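Since the function blocks forever, it is presumably run on its own thread; a hypothetical launch (message_queue, post_message, and the chattiness constants come from the surrounding module, and the file paths are illustrative):

import threading

bot = threading.Thread(target=chatbot_thread,
                       args=('TopicBot', 'lda.model', 'lda.dict', 'dominant_topics.csv'),
                       daemon=True)
bot.start()
message_queue.put('tell me about graph minors')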
Example #4
def generate_topic_proportion_terms():
    # The proportion is based on the dominant topic assigned to each document.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    pbar = tqdm.tqdm(total=len(models_path))
    i = 0
    for model_path in models_path:
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)

        df_dominant_topic_document = dominant_topics(lda_model=lda_model,
                                                     corpus=corpus)
        # Number of Documents for Each Topic
        topic_counts = df_dominant_topic_document[
            'Dominant_Topic'].value_counts()

        # Percentage of Documents for Each Topic
        topic_contribution = round(topic_counts / topic_counts.sum(), 4)

        # Topic Nums
        topic_nums = pd.Series(topic_contribution.index,
                               topic_contribution.index)

        # Topic Terms
        topic_terms = pd.Series(dtype=object)
        topics_dictionary = lda_model.show_topics(num_topics=num_topics[i],
                                                  num_words=30,
                                                  formatted=False)
        for topic in topics_dictionary:
            topic_num = topic[0] + 1
            terms_string = ''
            for term in topic[1]:
                terms_string += term[0] + ', '
            topic_terms = topic_terms.append(
                pd.Series(terms_string[:-2], index=[topic_num * 1.0]))

        # Concatenate Column wise
        df_dominant_topics = pd.concat(
            [topic_nums, topic_counts, topic_contribution, topic_terms],
            axis=1)

        # Change Column names
        df_dominant_topics.columns = [
            'Topic', 'Count_Documents', 'Proportion_Over_Documents', 'Terms'
        ]
        df_dominant_topics.sort_values(
            by=['Topic'], ascending=True,
            inplace=True)  # Sort rows by topic number

        output_path = f'./turn-in/{bigram_threshold}/topic_proportion_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df_dominant_topics.to_csv(output_path, index=False)
        pbar.update(1)
        i = i + 1
    pbar.close()
Example #5
def _load_model(model_type, fname='../../model/'):
    try:
        if model_type == 'lsi':
            return LsiModel.load(fname)
        elif model_type == 'lda':
            return LdaModel.load(fname)
        elif model_type == 'mallet':
            return LdaMallet.load(fname)
    except Exception:
        return None
Example #6
def train(self, train_filename):
    print("train QuoteRec")
    train_name = os.path.basename(train_filename)
    model_filename = train_name + ".quoterec_model"
    if os.path.isfile(model_filename):
        self.model = LdaMallet.load(model_filename)
    else:
        self.corpus = preprocessing.GensimCorpus(train_filename)
        self.model = LdaMallet(mallet_path,
                               self.corpus,
                               num_topics=100,
                               id2word=self.corpus.dictionary)
        self.model.save(model_filename)
        topics_str = self.model.show_topics(num_topics=-1)
        with open(train_name + ".lda_model.topics", 'w') as f:
            f.write(str(topics_str))
Example #7
def load_lda_model(lda_fname):
    """
    Loads an LDA model that could either be a Gensim trained LdaModel or a
    MALLET wrapper (an instance of LdaMallet).
    """
    
    if lda_fname.endswith('.lda.pickle'):
        return LdaModel.load(lda_fname)
    elif lda_fname.endswith('.ldamallet.pickle'):
        return LdaMallet.load(lda_fname)
    else:
        raise ValueError('filename {} does not end with either .lda.pickle '
                         'or .ldamallet.pickle'.format(repr(lda_fname)))
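Hypothetical calls showing how the filename-suffix convention dispatches to the right loader:

model = load_lda_model('acl.ldamallet.pickle')  # routed to LdaMallet.load
model = load_lda_model('acl.lda.pickle')        # routed to LdaModel.load
load_lda_model('acl.model')                     # raises ValueError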
Example #8
def train(self, train_filename):
    print("train LDA")
    train_name = os.path.basename(train_filename)
    model_filename = train_name + ".lda_model"
    if os.path.isfile(model_filename):
        self.model = LdaMallet.load(model_filename)
    else:
        self.corpus = preprocessing.GensimCorpus(train_filename)
        self.model = LdaMallet(mallet_path,
                               self.corpus,
                               num_topics=100,
                               id2word=self.corpus.dictionary)
        self.model.save(model_filename)
        topics_str = self.model.show_topics(num_topics=-1)
        with open(train_name + ".lda_model.topics", 'w') as f:
            f.write(str(topics_str))
Example #9
def _load_model(model_type, fname):
    logger.info(f'Loading {model_type} model {fname}..')
    try:
        if model_type == 'lsi':
            return LsiModel.load(f'../model/lsi_model/{fname}')
        elif model_type == 'lda':
            return LdaModel.load(f'../model/lda_model/{fname}')
        elif model_type == 'mallet':
            return LdaMallet.load(f'../model/mallet_model/{fname}')
        elif model_type == 'hdp':
            return HdpModel.load(f'../model/hdp_model/{fname}')
    except Exception as ex:
        logger.warning(f'{model_type} model {fname} could not be loaded.',
                       exc_info=ex)
        return None
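A hypothetical call; the file is looked up under ../model/mallet_model/ and None is returned (with a warning logged) if loading fails:

model = _load_model('mallet', 'acl_50.model')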
Example #10
def get_model(self, lang: str, data_version: int, dictionary_version: float,
              model_version: str, param_name: str, param_version: int,
              language_processed_data: list, model_view: bool):
    logging.info("--- Getting LDA Mallet model")
    if self.model is None:
        model_file_path = Advisor.get_model_type_folders_file_path(
            lang, data_version, dictionary_version, model_version,
            param_name, param_version, self.model_type, "MLDA-model")
        if os.path.exists(model_file_path):
            self.model = LdaMallet.load(model_file_path)
        else:
            logging.info("---- LDA Mallet model was not created before; creating it")
            self.set_model(lang, data_version, dictionary_version, model_version,
                           param_name, param_version, model_file_path,
                           language_processed_data)
    logging.info("--- LDA Mallet model captured")
    if model_view:
        self.visualization.get_model_visualizations(self.model_type, self.model,
                                                    self.essentials.corpus,
                                                    language_processed_data)
    return self.model
Example #11
def topic_model_coherence_generator(texts, start_topic_count, end_topic_count,
                                    step):
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count + 1,
                                 step)):
        dictionary_path = 'dictionary_mallet122_' + str(
            topic_nums) + '.dictionary'
        dictionary = corpora.Dictionary.load(dictionary_path)
        corpus = [dictionary.doc2bow(text) for text in texts]
        model_path = 'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\122文章\\mallet模型\\dictionary_mallet122_' + str(
            topic_nums) + '.model'
        mallet_lda_model = LdaMallet.load(model_path)
        cv_coherence_model_mallet_lda = CoherenceModel(model=mallet_lda_model,
                                                       corpus=corpus,
                                                       texts=texts,
                                                       dictionary=dictionary,
                                                       coherence='c_v')
        coherence_score = cv_coherence_model_mallet_lda.get_coherence()
        coherence_scores.append(coherence_score)
    return coherence_scores
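A hypothetical sweep over models pre-trained for 5, 10, ..., 30 topics; texts must be the same tokenized documents used at training time:

scores = topic_model_coherence_generator(texts, start_topic_count=5, end_topic_count=30, step=5)
for k, score in zip(range(5, 31, 5), scores):
    print(k, 'topics -> c_v =', round(score, 4))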
Example #12
def generate_topic_weight_terms():
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    pbar = tqdm.tqdm(total=len(models_path))
    i = 0
    for model_path in models_path:
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)
        df = topics_proportion(lda_model=lda_model,
                               corpus=corpus,
                               num_topics=num_topics[i])
        df.sort_values(by=['Topic'], ascending=True,
                       inplace=True)  # Sort rows by topic number

        output_path = f'./turn-in/{bigram_threshold}/topic_proportion_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df.to_csv(output_path, index=False)
        pbar.update(1)
        i = i + 1
    pbar.close()
Example #13
def calculate_entropy_mallet_models():  # Output per-document topic entropy to CSV files.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    index = 0
    dataset = pd.read_csv(dataset_csv_path)
    for model_path in models_path:
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)

        df = pd.DataFrame()
        pbar = tqdm.tqdm(total=len(lda_model[corpus]))

        for i, row in enumerate(lda_model[corpus]):
            topic_dist = sorted(row, key=lambda x: x[1], reverse=True)
            rs_string = ''
            topic_entropy = 0
            for topic in topic_dist:
                rs_string += f'Topic {topic[0] + 1}: {topic[1]}; '
                # Shannon entropy term: -p * log2(p)
                topic_entropy += -topic[1] * math.log2(topic[1])
            df = df.append(pd.Series([
                str(i), dataset['Submission_Num'][i], rs_string,
                str(topic_entropy), dataset['Submission_Text'][i]
            ]),
                           ignore_index=True)
            pbar.update(1)
        df.columns = [
            'Document_No', 'Submission_Num', 'Probabilities', 'Entropy',
            'Submission_Text'
        ]

        csv_file_result_path = f'./turn-in/{bigram_threshold}/model_entropy/{num_topics[index]}.csv'
        index = index + 1
        create_file(csv_file_result_path)
        df.to_csv(csv_file_result_path, index=False)
        pbar.close()
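As a sanity check on the entropy column: the Shannon entropy of a topic distribution is H = -Σ p·log2(p), so a document spread uniformly over four topics should score 2.0:

import math
dist = [0.25, 0.25, 0.25, 0.25]
print(sum(-p * math.log2(p) for p in dist))  # 2.0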
Example #14
import gensim
from gensim.models.wrappers import LdaMallet
# If mallet doesn't work, use normal LDA.
from gensim.models.ldamodel import LdaModel
ldamallet = LdaMallet.load(
    '/home/ashwath/Programs/MAGCS/LDA/ldamallet_mag50.model')
lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    ldamallet, gamma_threshold=0.001, iterations=50)
lda.save('lda_mag50.model')
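After the conversion the model behaves like a native gensim LdaModel, so new documents can be scored without shelling out to the MALLET binary; a sketch with a hypothetical dictionary path:

from gensim import corpora

dictionary = corpora.Dictionary.load('/home/ashwath/Programs/MAGCS/LDA/mag50.dict')  # hypothetical path
bow = dictionary.doc2bow(['graph', 'minors', 'survey'])
print(lda[bow])  # [(topic_id, probability), ...]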
Example #15
    vocab_index, phis, thetas = [None, None, None]
    print('Building phi and theta matrices')
    if tool == 'gensim':
        lda_model = LdaModel.load(args.ldamodel_fn)
        vocab_index = {term: i for i, term in lda_model.id2word.items()}
        thetas = get_gensim_thetas(args.in_tsv, vocab_index, lda_model)
        phis = lda_model.get_topics()
        phis = phis / phis.sum(axis=1, keepdims=True)
    elif tool == 'mallet':
        vocab, vocab_index = get_vocab(args.vocab_fn, True)
        thetas = get_mallet_thetas(args.doc_topics_fn)
        n_topics = thetas.shape[1]
        phis = get_mallet_phis(args.word_weights_fn, vocab_index, n_topics)
    elif tool == 'mallet-gensim':
        lda_model = LdaMallet.load(args.ldamodel_fn)
        vocab_index = {term: i for i, term in lda_model.id2word.items()}
        thetas = get_gensim_mallet_thetas(args.in_tsv, vocab_index, lda_model)
        phis = lda_model.get_topics()
        phis = phis / phis.sum(axis=1, keepdims=True)
    print('Building topic-author-term counts')
    (topic_term_counts,
     topic_author_term_counts) = estimate_topic_counts(args.in_tsv,
                                                       vocab_index,
                                                       author_index,
                                                       thetas,
                                                       phis,
                                                       verbose=args.verbose)
    print('Evaluating topic-author correlation')
    evaluate_correlation(topic_term_counts, topic_author_term_counts,
                         args.out_tsv)
Example #16
def load_model(self, path_to_model):
    """Load the LDA model."""
    lda_model = LdaMallet.load(path_to_model)
    return lda_model
Example #17
hybrid_weights.extend(hybrid_weights)
hybrid_weights = np.array(hybrid_weights)
# Convert to probabilities
hybrid_weights = hybrid_weights / hybrid_weights.sum()

# GLOBAL num_items_to_pick (with replacement) -- high number: one million
num_picks = 1000000

# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag.dict')
corpus = corpora.MmCorpus(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load(
        '/home/ashwath/Programs/ACLAAn/LDA/lda_model.model')
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    ldamallet = LdaModel.load(
        '/home/ashwath/Programs/ACLAAn/LDA/ldanormal_acl.model')

#index = similarities.MatrixSimilarity(ldamallet[corpus])
#index.save("simIndex.index")
malletindex = similarities.MatrixSimilarity.load(
    '/home/ashwath/Programs/ACLAAn/LDA/simIndexAcl.index')
with open(
        '/home/ashwath/Programs/ACLAAn/LDA/docid_to_magid_training_acl.pickle',
        'rb') as pick:
    docid_to_magid = pickle.load(pick)
Example #18
DICT_PATH  = 'docs.dict'
MODEL_PATH = 'docs.model'

raw_corpus = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
docs = [doc.split() for doc in raw_corpus]

if exists(MODEL_PATH):
    print('Testing...\n')
    dictionary = corpora.Dictionary.load(DICT_PATH)
    lda = LdaMallet.load(MODEL_PATH)
    for doc in docs:
        topics = lda[dictionary.doc2bow(doc)]
        print(topics, doc)
else:
    print('Training...\n')
    dictionary = corpora.Dictionary(docs)
    dictionary.save(DICT_PATH)
    corpus = [dictionary.doc2bow(text) for text in docs]

    lda = LdaMallet(MALLET_PATH, corpus=corpus,
                    num_topics=3, workers=60, id2word=dictionary, iterations=50, prefix=PREFIX)
    lda.save(MODEL_PATH)
Example #19
def load_model(self, model_name):
    self.model_name = model_name
    self.dictionary = corpora.Dictionary.load("ldamodels/" + self.model_name + "/dict.dict")
    self.model = LdaMallet.load("ldamodels/" + self.model_name + "/model.model")
    self.model.mallet_path = self.mallet_path
Example #20
def load_lda_mallet_model(filepath):
    return LdaMallet.load(filepath)
Example #21
def load_mallet_model(artefacts_path='./artefacts', suffix=''):
    model_path = str(Path(artefacts_path) / 'model')
    if suffix:
        model_path = model_path + f'_{suffix}'
    model = LdaMallet.load(model_path)
    return model
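Hypothetical calls illustrating the suffix convention:

model = load_mallet_model()                # loads ./artefacts/model
model_v2 = load_mallet_model(suffix='v2')  # loads ./artefacts/model_v2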
Example #22
def load_gensim_file(file_name):
    return LdaMallet.load('data/gensim_models/' + file_name)
Example #23
from HyperDoc2Vec import *

snowball = SnowballStemmer(language='english')
nlp = spacy.load('en', disable=['parser', 'ner'])
nlp.Defaults.stop_words |= {'table', 'ref', 'formula', 'citation', 'cit', 'references',
                            'fig', 'figure', 'abstract', 'introduction',
                            'description', 'conclusion', 'results', 'discussion'}
mallet_path = '/home/ashwath/mallet-2.0.8/bin/mallet'


# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load('/home/ashwath/Programs/ArxivCS/LDA/arxivmag.dict')
corpus = corpora.MmCorpus('/home/ashwath/Programs/ArxivCS/LDA/arxivmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load('/home/ashwath/Programs/ArxivCS/LDA/ldamallet_arxiv.model')
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    ldamallet = LdaModel.load('/home/ashwath/Programs/ArxivCS/LDA/lda_arxiv.model')
    
malletindex = similarities.MatrixSimilarity.load('/home/ashwath/Programs/ArxivCS/LDA/simIndexArxiv.index')
with open('/home/ashwath/Programs/ArxivCS/LDA/docid_to_magid_training_arxiv.pickle', 'rb') as pick:
    docid_to_magid = pickle.load(pick)

hd2vmodel = HyperDoc2Vec.load('/home/ashwath/Programs/ArxivCS/hyperdoc2vec_arxivmag/models/hd2v_arxivmag.model')
print("MODELS took {} seconds to load".format(time()-loadmodstart))

def remove_stopwords(context):
    #print("Removing stop words.")
                                     id2word=dictionary,
                                     prefix=path_to_mallet_output,
                                     workers=multiprocessing.cpu_count())
        mallet_lda_model.save('{}lda_model.pkl'.format(path_to_mallet_output))
        # mallet_lda_model.save('{}lda_model_{}.pkl'.format(path_to_mallet_output, uuid))
        print('calculate model coherence C_v score')
        coherence_model_lda = CoherenceModel(model=mallet_lda_model,
                                             texts=docs,
                                             dictionary=dictionary,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('model coherence score: {}'.format(coherence_lda))
    else:
        print('load mallet LDA model')
        # mallet_lda_model = LdaMallet.load('{}lda_model.pkl'.format(path_to_mallet_output))
        mallet_lda_model = LdaMallet.load('{}lda_model_{}.pkl'.format(
            path_to_mallet_output, uuid))

    # # #convert the model to gensim format
    # # lda_model = malletmodel2ldamodel(mallet_lda_model)
    # # lda_model.save('{}gensim_lda_model.pkl'.format(path_to_mallet_output))

    # # topics = np.array(list(zip(*lda_model.show_topics(lda_model.num_topics, num_words=20, formatted=False)))[1])
    # # topics = topics[:, :, 0]
    # # topics = pd.DataFrame(topics)
    # # topics.to_csv('{}topics_with_20_words.csv'.format(path_to_mallet_output))

    # #determine which words will be included in each topic -> keyword string mapping
    # print('Determining word weight per topic')
    # topic_ids = mallet_lda_model.word_topics.argsort()[:, ::-1]
    # topic_weights = mallet_lda_model.word_topics
    # topic_weights.sort()