import csv

import pytm

# Fit SVI-LDA with 100 topics on the daily training-tweet files.
# Preprocessing helpers (removeUsernames, removeURLs, removeSpecialChar,
# word_tokenization) and `tagger` are defined elsewhere in the project.
train_ls = [str(i) for i in range(6, 28, 1)]
corpus = []
count = 0
for d in train_ls:
    fn = "training_tweets/{}.txt".format(d)
    with open(fn, 'r') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        for row in csv_reader:
            # Clean the raw tweet text (column 3) and tokenize it.
            t = removeSpecialChar(removeURLs(removeUsernames(row[3])))
            corpus.append(word_tokenization(t, tagger))
            count += 1
    print("File {} is done".format(d))
print("Total tweets are", count)

# Build the vocabulary, dropping rare (< 5 docs) and overly common (> 50% of docs) terms.
docs = pytm.DocumentSet(corpus, min_df=5, max_df=0.5)
print("Corpus Created")

# Applying LDA on our dataset
n_topics = 100
lda = pytm.SVILDA(n_topics, docs.get_n_vocab())
lda.fit(docs, n_iteration=1000, B=1000, n_inner_iteration=5, n_hyper_iteration=20, J=5)
print("LDA fitted")

topic_list = []
alphas = [lda.get_alpha(k) for k in range(n_topics)]
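# Hedged sketches (not part of the original code) of two cleaning helpers used
# throughout these scripts; the project defines its own versions elsewhere, so
# the regexes below are illustrative assumptions only.
import re

def removeUsernames(tweet):
    # Drop @-mentions.
    return re.sub(r"@\w+", ' ', tweet)

def removeURLs(tweet):
    # Drop http(s) links.
    return re.sub(r"https?://\S+", ' ', tweet)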
import csv
import re

import numpy as np
import pytm

# Fit SVI-LDA with 1000 topics on the cleaned English tweets.
# removeURLs, removeUsernames, and removeStopWords are defined elsewhere in the project.

def removeSpecialChar(original):
    # Keep only alphanumeric characters; everything else becomes a space.
    return ' '.join(re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in original.split("\n"))


def remove2Char(tweet):
    # Drop tokens shorter than three characters.
    new_tweet = ""
    for w in tweet.split():
        if len(w) >= 3:
            new_tweet = new_tweet + w + " "
    return new_tweet


only_text = []
with open('all_english_tweets.txt', mode='r') as f:
    csv_reader = csv.reader((x.replace('\0', '') for x in f), delimiter=',')
    for row in csv_reader:
        # Clean the tweet text (column 1) through the full preprocessing chain.
        only_text.append(remove2Char(removeSpecialChar(removeStopWords(removeUsernames(removeURLs(row[1]))))))

docs = pytm.DocumentSet(only_text, min_df=5, max_df=0.5)

# Applying LDA on our dataset
n_topics = 1000
lda = pytm.SVILDA(n_topics, docs.get_n_vocab())
lda.fit(docs, n_iteration=1000, B=1000, n_inner_iteration=5, n_hyper_iteration=20, J=5)

# Getting topics and alpha values
topic_list = []
alphas = [lda.get_alpha(k) for k in range(n_topics)]
for k, alpha in enumerate(alphas):
    vocab = docs.get_vocab()
    phi = lda.get_phi(k)
    new_phi = np.around(list(phi), decimals=3)
    print('topic {0} (alpha = {1})'.format(k, np.around(alpha, decimals=2)))
    # Top-10 words for topic k, sorted by descending word probability.
    a = sorted(zip(vocab, new_phi), key=lambda x: -x[1])[:10]
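    # Hedged continuation (not in the original listing): record and display the
    # top-10 (word, probability) pairs; `topic_list` is declared above but never
    # filled in the shown code.
    topic_list.append([word for word, _ in a])
    print(', '.join('{0} ({1})'.format(word, p) for word, p in a))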
import csv

import numpy as np
import pytm

# Fit SVI-LDA with 60 topics on the NIPS papers corpus.
# removeURLs, removeStopWords, and removeSpecialChar are defined as in the previous script.

def remove2Char(tweet):
    # Drop tokens shorter than three characters.
    new_tweet = ""
    for w in tweet.split():
        if len(w) >= 3:
            new_tweet = new_tweet + w + " "
    return new_tweet


contents = []
years = []
with open('nips-papers/papers.csv', mode='r') as f:
    csv_reader = csv.reader((x.replace('\0', '') for x in f), delimiter=',')
    for row in csv_reader:
        # Column 6 holds the paper text, column 1 the publication year.
        txt = remove2Char(removeSpecialChar(removeStopWords(removeURLs(row[6]))))
        if len(txt) > 500:
            contents.append(txt)
            years.append(row[1])
print("Corpus Created")

docs = pytm.DocumentSet(contents, min_df=5, max_df=0.5)

# Applying LDA on our dataset
n_topics = 60
lda = pytm.SVILDA(n_topics, docs.get_n_vocab())
lda.fit(docs, n_iteration=1000, B=1000, n_inner_iteration=5, n_hyper_iteration=20, J=5)

# Getting topics and alpha values
topic_list = []
alphas = [lda.get_alpha(k) for k in range(n_topics)]
for k, alpha in enumerate(alphas):
    vocab = docs.get_vocab()
    phi = lda.get_phi(k)
    new_phi = np.around(list(phi), decimals=3)
    print('topic {0} (alpha = {1})'.format(k, np.around(alpha, decimals=2)))
    # Top-50 words for topic k, sorted by descending word probability.
    a = sorted(zip(vocab, new_phi), key=lambda x: -x[1])[:50]
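    # Hedged continuation (not in the original listing): keep (topic id, alpha,
    # top words) so the summaries can be written out after the loop.
    topic_list.append((k, np.around(alpha, decimals=2), [word for word, _ in a]))

# Hedged sketch, assuming an output filename of 'nips_topics_top50.csv' (not in
# the original code): dump the per-topic summaries for later inspection, e.g.
# alongside the collected `years`.
with open('nips_topics_top50.csv', mode='w', newline='') as out:
    writer = csv.writer(out)
    for topic_id, alpha_value, words in topic_list:
        writer.writerow([topic_id, alpha_value] + words)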
import csv

import numpy as np
import pytm

# Fit SVI-LDA with 20 topics on the first 1000 day/hashtag documents.
# Cleaning helpers (removeURLs, removeStopWords, removeSpecialChar, remove2Char)
# are defined as in the previous scripts.
only_text = []
max_doc_size = 1000
count = 0
with open('ordered_day_hashtag_dtm.txt', mode='r') as f:
    csv_reader = csv.reader((x.replace('\0', '') for x in f), delimiter=',')
    for row in csv_reader:
        # Clean the document text (column 2) and stop after max_doc_size documents.
        only_text.append(remove2Char(removeSpecialChar(removeStopWords(removeURLs(row[2])))))
        count += 1
        if count == max_doc_size:
            break

docs = pytm.DocumentSet(only_text, min_df=5, max_df=0.5)

# Applying LDA on our dataset
n_topics = 20
lda = pytm.SVILDA(n_topics, docs.get_n_vocab())
lda.fit(docs, n_iteration=1000, B=1000, n_inner_iteration=5, n_hyper_iteration=20, J=5)

# Getting topics and alpha values
topic_list = []
alphas = [lda.get_alpha(k) for k in range(n_topics)]
for k, alpha in enumerate(alphas):
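    # Hedged completion: the loop body is cut off in the original listing; the
    # lines below mirror the per-topic summary used in the scripts above, with
    # the top-10 cutoff being an assumption.
    vocab = docs.get_vocab()
    phi = lda.get_phi(k)
    new_phi = np.around(list(phi), decimals=3)
    print('topic {0} (alpha = {1})'.format(k, np.around(alpha, decimals=2)))
    a = sorted(zip(vocab, new_phi), key=lambda x: -x[1])[:10]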