def setUp(self): """ read a test txt file into a pandas dataframe, and create a RawDocs object from it. """ data = pd.read_table( "topicmodel_tests/testfiles/speech_data_extend.txt", encoding="utf-8") self.data = data[data.year >= 1947] self.docsobj = topicmodels.RawDocs(self.data.speech, "long")
def __init__(self, dataframe, stopword_remove=0, ngram=1):
    # clean and stem the raw text column
    docsobj = topicmodels.RawDocs(dataframe.text, "long")
    docsobj.token_clean(1)
    docsobj.stopword_remove("tokens")
    docsobj.stem()
    docsobj.stopword_remove("stems")
    docsobj.term_rank("stems")
    if stopword_remove > 0:
        docsobj.stopword_remove("stems", stopword_remove)
    # replace the raw text with the cleaned, stemmed text
    dataframe = dataframe.drop('text', axis=1)
    dataframe['text'] = [' '.join(s) for s in docsobj.stems]
    self.dataframe = dataframe
    # collect the vocabulary of n-grams over all documents
    all_stems = [s for d in docsobj.stems for s in d]
    self.stems = set(find_ngrams(all_stems, ngram))
    self.ngram = ngram
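# A hedged usage sketch for the constructor above. The class name
# (ProcessedCorpus) and the toy dataframe are hypothetical; the only
# assumptions are that the class wrapping this __init__ takes a dataframe
# with a 'text' column and that a find_ngrams helper is defined elsewhere
# in the module.
#
#   import pandas as pd
#   df = pd.DataFrame({'text': ["the economy is growing strongly",
#                               "inflation remains subdued this year"]})
#   corpus = ProcessedCorpus(df, stopword_remove=0, ngram=1)
#   print(corpus.dataframe.text)   # cleaned, stemmed text
#   print(len(corpus.stems))       # size of the n-gram vocabulary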
def setUp(self):
    """
    Do all the preprocessing steps above, but with a smaller subset of the data.
    """
    data = pd.read_table(
        "topicmodel_tests/testfiles/speech_data_extend.txt",
        encoding="utf-8")
    self.data = data[data.year >= 1997]
    self.docsobj = topicmodels.RawDocs(self.data.speech, "long")
    self.docsobj.token_clean(1)
    self.docsobj.stopword_remove("tokens")
    self.docsobj.stem()
    self.docsobj.stopword_remove("stems")
    self.docsobj.term_rank("stems")
    self.docsobj.rank_remove("tfidf", "stems",
                             self.docsobj.tfidf_ranking[1000][1])
    self.all_stems = [s for d in self.docsobj.stems for s in d]
    # now create the LDA Gibbs sampler object
    self.ldaobj = topicmodels.LDA.LDAGibbs(self.docsobj.stems, 30)
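# A minimal example test against the fixture above. It relies only on
# attributes already used in setUp (docsobj.stems holds one list of stems per
# speech) and on the sample() call shown in the tutorial script in this
# repository; the very short chain lengths are illustrative, chosen so the
# test runs quickly, and the test name is not part of the original suite.
def test_preprocessing_and_sampler(self):
    # one cleaned document per input speech
    self.assertEqual(len(self.docsobj.stems), len(self.data.speech))
    # a very short chain should run without raising
    self.ldaobj.sample(0, 1, 2)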
def index_count(self, wordList, colName):
    """
    Count the occurrences of the (stemmed) words in wordList in each
    document of self.dataframe.text, and store the normalised counts
    in self.dataframe[colName].

    wordList: list of words to generate index counts for
    colName:  name of the column in which to store the frequency counts
    """
    wordobj = topicmodels.RawDocs(wordList, "stopwords.txt")
    wordobj.token_clean(1)
    wordobj.stopword_remove("tokens")
    wordobj.stem()
    word_stems = set([s for d in wordobj.stems for s in d])

    def count_frequency(doc_text):
        freqs = pd.Series(collections.Counter(doc_text.split()))
        return freqs.loc[list(set(freqs.index.values) & word_stems)].sum()

    # Vector with the number of words from word_stems in each paragraph
    word_freqs = self.dataframe.text.apply(count_frequency)
    # Vector with the total number of words in each paragraph
    total_words = self.dataframe.text.apply(lambda x: len(x.split()))
    # Compute the sentiment weights as relative frequencies
    freqs = word_freqs / total_words
    self.dataframe[colName] = freqs
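# A hedged usage sketch for index_count, continuing the hypothetical
# ProcessedCorpus example above. The word list and column name are
# illustrative; the only assumption is that the object was constructed from
# a dataframe whose text column holds the cleaned, stemmed documents.
#
#   corpus.index_count(['uncertain', 'uncertainty', 'risk'], 'uncertainty')
#   print(corpus.dataframe['uncertainty'].head())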
"""
Python script for tutorial illustrating collapsed Gibbs sampling for
Latent Dirichlet Allocation. See explanation for commands on
http://nbviewer.ipython.org/url/www.econ.upf.edu/~shansen/tutorial_notebook.ipynb.
"""

import pandas as pd
import topicmodels

########## select data on which to run topic model #########

data = pd.read_table("speech_data_extend.txt", encoding="utf-8")
data = data[data.year >= 1947]

########## clean documents #########

docsobj = topicmodels.RawDocs(data.speech, "stopwords.txt")
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")
docsobj.stem()
docsobj.tf_idf("stems")
docsobj.stopword_remove("stems", 5000)

all_stems = [s for d in docsobj.stems for s in d]
print("number of unique stems = %d" % len(set(all_stems)))
print("number of total stems = %d" % len(all_stems))

########## estimate topic model #########

ldaobj = topicmodels.LDA(docsobj.stems, 30)
ldaobj.sample(0, 50, 10)
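# A hedged continuation of the script above. It relies only on the sample()
# method already used in the previous line; the interpretation of its
# arguments as (burn-in, thinning interval, number of samples kept), the
# longer chain length, and the assumption that a repeated call extends the
# chain from its current state are all made for illustration.
ldaobj.sample(0, 50, 40)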
import pandas as pd
import topicmodels

###############
# select data on which to run topic model
###############

data = pd.read_table("speech_data_extend.txt", encoding="utf-8")
data = data[data.year >= 1947]

###############
# clean documents
###############

docsobj = topicmodels.RawDocs(data.speech, "long")
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")
docsobj.stem()
docsobj.stopword_remove("stems")
docsobj.term_rank("stems")
docsobj.rank_remove("tfidf", "stems", docsobj.tfidf_ranking[5000][1])

all_stems = [s for d in docsobj.stems for s in d]
print("number of unique stems = %d" % len(set(all_stems)))
print("number of total stems = %d" % len(all_stems))

###############
# estimate topic model
###############
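# The script above stops at the "estimate topic model" header. A minimal
# sketch of that step, mirroring the calls used elsewhere in this repository
# (the test file uses topicmodels.LDA.LDAGibbs, and the older tutorial script
# calls sample(0, 50, 10)); the number of topics and chain settings are
# illustrative rather than prescribed.
ldaobj = topicmodels.LDA.LDAGibbs(docsobj.stems, 30)
ldaobj.sample(0, 50, 10)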
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# change directory to where topicmodels is
os.chdir('/users/timkreienkamp/documents/text-mining-tutorial/')
import topicmodels
# change it back to the folder for the homework
os.chdir('/users/timkreienkamp/documents/textminingbgse/HW4/')

# load data
data = pd.read_table("../HW2/data_puntuation.csv")

docsobj = topicmodels.RawDocs(data.Text, "../HW2/stopwords.txt")
docsobj.token_clean(1)
print(docsobj.tokens[3])
docsobj.stopword_remove("tokens")
print(docsobj.tokens[3])
docsobj.stem()
print(docsobj.stems[3])
docsobj.tf_idf("stems")
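# A hedged continuation of the homework script. After ranking stems by
# tf-idf, the older API used here removes the most common stems by passing a
# rank cut-off to stopword_remove, as the tutorial script elsewhere in this
# repository does; the cut-off of 5000 is illustrative.
docsobj.stopword_remove("stems", 5000)

all_stems = [s for d in docsobj.stems for s in d]
print("number of unique stems = %d" % len(set(all_stems)))
print("number of total stems = %d" % len(all_stems))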
import numpy as np
import pandas as pd
import topicmodels

###############
# select data on which to run topic model
###############

data = pd.read_table("mpc_minutes.txt", encoding="utf-8")

###############
# bag of words
###############

data_agg = data.groupby('year').agg(lambda x: ' '.join(x))

docsobj = topicmodels.RawDocs(data_agg.minutes, "long")
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")
docsobj.stem()
docsobj.stopword_remove("stems")
docsobj.term_rank("stems")

all_stems = [s for d in docsobj.stems for s in d]
print("number of unique stems = %d" % len(set(all_stems)))
print("number of total stems = %d" % len(all_stems))

bowobj = topicmodels.BOW(docsobj.stems)

data_agg['pos'] = bowobj.pos_count('stems')
data_agg['neg'] = bowobj.neg_count('stems')
data_agg['index'] = (data_agg.pos - data_agg.neg) /\