# Beispiel #1 (snippet separator left over from the source aggregation; "0" was a vote count)
def main(args):
    """Print the top terms of every topic in a saved topic model.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``model_type`` (one of "lda", "dtm", "ldaseq") and
        ``model`` (path of the saved model file).

    Exits with status 1 when ``args.model_type`` is not recognized.
    """
    # NOTE(review): removed unused `f = sys.argv[1]` — it ignored the parsed
    # `args` and crashed with IndexError when no raw positional was given.
    if args.model_type == "lda":
        loaded_model = LdaModel.load(args.model)

        # show_topics() yields (topic_id, formatted_terms) pairs where
        # formatted_terms looks like "0.12*word + 0.08*other"; the previous
        # enumerate() index was immediately shadowed by this unpacking.
        for topic_num, topic_str in loaded_model.show_topics(num_topics=-1):
            print(str(topic_num) + ':', end=' ')
            for term in topic_str.split(' + '):
                weight, word = term.split('*')
                # (removed dead code: a `model_type == "dtm"` quoting branch
                # could never run inside this "lda"-only branch)
                print(word, end=' ')
            print()

    elif args.model_type == "dtm":
        loaded_model = DtmModel.load(args.model)
        # One line of top words per (topic, time slice) combination.
        for topic_id in range(loaded_model.num_topics):
            for time in range(len(loaded_model.time_slices)):
                top_words = loaded_model.show_topic(topic_id, time, topn=10)

                print("Topic",
                      str(topic_id) + ", time slice",
                      str(time) + ':',
                      end=' ')
                for weight, word in top_words:
                    print(word, end=', ')
                print()
            print()
    elif args.model_type == "ldaseq":
        loaded_model = LdaSeqModel.load(args.model)
        # maybe use dtm_coherence?
        print(loaded_model.num_topics)
        print(loaded_model.time_slice)
        for topic_id in range(loaded_model.num_topics):
            for time in range(len(loaded_model.time_slice)):
                top_words = loaded_model.print_topic(topic=topic_id,
                                                     time=time,
                                                     top_terms=20)
                print("Topic",
                      str(topic_id) + ", time slice",
                      str(time) + ':',
                      end=' ')
                # print_topic() returns (word, probability) pairs here —
                # note the opposite order from DtmModel.show_topic above.
                for word, weight in top_words:
                    print(word, end=' ')
                print()
            print()
    else:
        print("Unknown model type provided: " + args.model_type)
        sys.exit(1)
def dtm(dtm_path, corpus, dictionary, time_slices, num_topics=40, load=False):
    """Train a Blei DTM model, or load a previously saved one.

    Parameters
    ----------
    dtm_path : str
        Path to the local Blei-DTM binary.
    corpus, dictionary, time_slices
        Training inputs passed straight through to ``DtmModel``.
    num_topics : int, optional
        Number of topics to fit (default 40).
    load : bool, optional
        When true, load the model previously saved under "DTM" instead
        of retraining.

    Returns
    -------
    DtmModel
        The trained (and saved) or loaded model.
    """
    print("Running DTM")
    # The original `if load is False ... elif load is True` identity checks
    # silently returned None for any non-bool truthy `load`; a plain
    # truthiness test covers every input and always returns a model.
    if load:
        return DtmModel.load('DTM')

    model = DtmModel(dtm_path,
                     corpus,
                     time_slices,
                     num_topics=num_topics,
                     id2word=dictionary,
                     initialize_lda=True)
    # Persist under the same fixed name the load path reads back.
    model.save("DTM")
    return model
# Beispiel #3 (snippet separator left over from the source aggregation; "0" was a vote count)
# NOTE: the lines below are the tail of a tokenize/lemmatize helper whose `def` line was lost.
    tagged_words = pos_tag(words)
    lemmatized = [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) for word, tag in tagged_words]
    lemmatized = list(filter(lambda w: not any(p in w for p in punctuation) and w not in stopword_list and w not in punctuation and len(w) >= 3, lemmatized))
    return lemmatized

def timestamp():
    """Current local date and time, formatted with the locale's '%x %X'."""
    now = datetime.now()
    return now.strftime('%x %X')

# Rebuild the dictionary/corpus used at DTM training time so that new text
# can be mapped into the same vocabulary, then reload the trained model.
print('({}) DTM training data preprocessing started'.format(timestamp()))
start = datetime.now()
# Pickled DataFrame of sampled tweets; must be the same sample the model
# was trained on for the dictionary ids to line up.
orig_df = pd.read_pickle(r'dfs\2020-03-22-to-2020-11-18-1000-daily')
orig_texts = [preprocess(text) for text in orig_df['full_text']]
orig_dictionary = corpora.Dictionary(orig_texts)
orig_corpus = [orig_dictionary.doc2bow(text) for text in orig_texts]

dtm_model = DtmModel.load(r'dtm\2020-03-22-to-2020-11-18-1000-daily')
print('Time to preprocess training texts:', str(datetime.now() - start))

######################################
##### DAY-BY-DAY TOPIC LABELLING #####
######################################

conn = sqlite3.connect('database/tweets.db')

# df = pd.read_pickle(r'dfs\2020-03-22-to-2020-08-19-4000-daily')
# df = pd.read_sql_query('select * from tweets where "user.screen_name" in (select screen_name from labels) and created_at between \'2020-03-22\' and \'2020-11-18\'', conn)

# comment out n_tweets_per_day lines and cumulative_tweets lines when using full dataset
# n_tweets_per_day = df['created_at'].apply(lambda x: x[:10]).value_counts().sort_index().values.tolist()

# cumulative_tweets[i] is the index of the first Tweet from i days after START_DATE

# Minimal corpus wrapper for feeding pre-tokenized documents into DtmModel.
class DTMcorpus(corpora.textcorpus.TextCorpus):
    # `self.input` is set by the TextCorpus base constructor; documents are
    # assumed to be already tokenized, so they are returned verbatim.
    def get_texts(self):
        return self.input

    def __len__(self):
        # Number of documents (required by gensim's corpus protocol).
        return len(self.input)


# Build the DTM corpus from the pre-tokenized documents.
corpus = DTMcorpus(documents)

# Path where the Blei-DTM binary is installed.
dtm_path = "/home/ankit/NLP_Project/dtm/dtm/dtm"
model = DtmModel.load("DTMMOdel.txt")

# model.save("DTModel.txt")
# Dump the (weight, word) pairs of every topic at the first time slice.
tp = model.show_topics(num_topics=-1,
                       times=1,
                       num_words=100,
                       log=False,
                       formatted=False)
# NOTE(review): converted the Python 2 print statements below to the
# print() function for consistency with the rest of the file.
print(tp)
print(type(tp))
for topic in tp:
    for pair in topic:
        # pair is (weight, word); the word may arrive as bytes depending on
        # the gensim version, so only decode when it actually is bytes.
        word = pair[1]
        if isinstance(word, bytes):
            word = word.decode("utf-8")
        print(type(pair), word)

    # print(topic)