def setUp(self):
        """
        Read a test txt file into a pandas DataFrame and create a RawDocs object from it.
        """

        data = pd.read_table(
            "topicmodel_tests/testfiles/speech_data_extend.txt",
            encoding="utf-8")
        self.data = data[data.year >= 1947]
        self.docsobj = topicmodels.RawDocs(self.data.speech, "long")
Example n. 2
	def __init__(self, dataframe, stopword_remove=0, ngram=1):
		# tokenise, clean and stem the raw text column
		docsobj = topicmodels.RawDocs(dataframe.text, "long")
		docsobj.token_clean(1)
		docsobj.stopword_remove("tokens")
		docsobj.stem()
		docsobj.stopword_remove("stems")
		docsobj.term_rank("stems")
		# optionally drop the top-ranked stems as extra stopwords
		if stopword_remove > 0:
			docsobj.stopword_remove("stems", stopword_remove)
		# replace the raw text with the cleaned, stemmed text
		dataframe = dataframe.drop(columns='text')
		dataframe['text'] = [' '.join(s) for s in docsobj.stems]
		self.dataframe = dataframe
		# store the set of n-grams built from all stems in the corpus
		all_stems = [s for d in docsobj.stems for s in d]
		self.stems = set(find_ngrams(all_stems, ngram))
		self.ngram = ngram
    def setUp(self):
        """
        Do all the preprocessing steps above, but with a smaller subset of the data
        """

        data = pd.read_table(
            "topicmodel_tests/testfiles/speech_data_extend.txt",
            encoding="utf-8")
        self.data = data[data.year >= 1997]
        self.docsobj = topicmodels.RawDocs(self.data.speech, "long")
        self.docsobj.token_clean(1)
        self.docsobj.stopword_remove("tokens")
        self.docsobj.stem()
        self.docsobj.stopword_remove("stems")
        self.docsobj.term_rank("stems")
        self.docsobj.rank_remove("tfidf", "stems",
                                 self.docsobj.tfidf_ranking[1000][1])
        self.all_stems = [s for d in self.docsobj.stems for s in d]
        # now create the LDA Gibbs sampling object with 30 topics
        self.ldaobj = topicmodels.LDA.LDAGibbs(self.docsobj.stems, 30)
Example n. 4
	def index_count(self, wordList, colName):
		"""
		Count occurrences of the stems of wordList in each document of
		self.dataframe['text'] and store the resulting index in a new column.

		wordList: list of words to generate index counts for
		colName: name of the new column holding the frequency index
		"""
		# clean and stem the word list the same way the documents were processed
		wordobj = topicmodels.RawDocs(wordList, "stopwords.txt")
		wordobj.token_clean(1)
		wordobj.stopword_remove("tokens")
		wordobj.stem()
		word_stems = set(s for d in wordobj.stems for s in d)

		def count_frequency(doc_text):
			freqs = pd.Series(collections.Counter(doc_text.split()))
			return freqs.loc[list(set(freqs.index) & word_stems)].sum()

		# frequency of word_stems terms in each paragraph
		word_freqs = self.dataframe.text.apply(count_frequency)
		# total number of words in each paragraph
		total_words = self.dataframe.text.apply(lambda x: len(x.split()))
		# the index is the share of each paragraph's words that belong to word_stems
		self.dataframe[colName] = word_freqs / total_words
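The class that defines __init__ and index_count above is never named in these snippets; a minimal usage sketch, assuming a hypothetical class name CorpusIndex and a dataframe with a 'text' column, might look like this:

import pandas as pd

# hypothetical usage; "CorpusIndex" stands in for the unnamed class above
df = pd.DataFrame({"text": ["the economy grew strongly last quarter",
                            "unemployment rose sharply this year"]})
corpus = CorpusIndex(df, stopword_remove=0, ngram=1)
# share of each (stemmed) document made up of stems of the given words
corpus.index_count(["growth", "strong", "grow"], "growth_index")
print(corpus.dataframe[["text", "growth_index"]])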
"""
Python script for a tutorial illustrating collapsed Gibbs sampling for Latent Dirichlet Allocation.

See the explanation of the commands at http://nbviewer.ipython.org/url/www.econ.upf.edu/~shansen/tutorial_notebook.ipynb.
"""

import pandas as pd
import topicmodels

########## select data on which to run topic model #########

data = pd.read_table("speech_data_extend.txt",encoding="utf-8")
data = data[data.year >= 1947]

########## clean documents #########

docsobj = topicmodels.RawDocs(data.speech, "stopwords.txt")
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")
docsobj.stem()
docsobj.tf_idf("stems")
docsobj.stopword_remove("stems",5000)

all_stems = [s for d in docsobj.stems for s in d]
print("number of unique stems = %d" % len(set(all_stems)))
print("number of total stems = %d" % len(all_stems))

########## estimate topic model #########

ldaobj = topicmodels.LDA(docsobj.stems, 30)

ldaobj.sample(0, 50, 10)
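The library object hides the sampling details. For intuition, here is a self-contained sketch of the collapsed Gibbs update for LDA, independent of topicmodels and using only numpy; the function name gibbs_lda, its parameters, and the hyperparameter values are illustrative, not part of the library:

import numpy as np

def gibbs_lda(docs, K, V, alpha=0.1, beta=0.01, n_iter=50, seed=0):
    """Minimal collapsed Gibbs sampler for LDA (illustration only).

    docs : list of documents, each a list of integer word ids in [0, V).
    Returns the final document-topic and topic-word count matrices.
    """
    rng = np.random.default_rng(seed)
    n_dk = np.zeros((len(docs), K))   # document-topic counts
    n_kw = np.zeros((K, V))           # topic-word counts
    n_k = np.zeros(K)                 # tokens assigned to each topic
    z = []                            # topic assignment of every token
    # random initialisation of topic assignments
    for d, doc in enumerate(docs):
        zd = rng.integers(K, size=len(doc))
        z.append(zd)
        for w, k in zip(doc, zd):
            n_dk[d, k] += 1
            n_kw[k, w] += 1
            n_k[k] += 1
    # Gibbs sweeps: resample every token's topic from its full conditional
    for _ in range(n_iter):
        for d, doc in enumerate(docs):
            for i, w in enumerate(doc):
                k = z[d][i]
                # remove the current assignment from the counts
                n_dk[d, k] -= 1
                n_kw[k, w] -= 1
                n_k[k] -= 1
                # p(z = k | rest) is proportional to
                # (n_dk + alpha) * (n_kw + beta) / (n_k + V*beta)
                p = (n_dk[d] + alpha) * (n_kw[:, w] + beta) / (n_k + V * beta)
                k = rng.choice(K, p=p / p.sum())
                # record the new assignment and restore the counts
                z[d][i] = k
                n_dk[d, k] += 1
                n_kw[k, w] += 1
                n_k[k] += 1
    return n_dk, n_kw

With integer word ids built from docsobj.stems via a vocabulary lookup, this is the kind of sampler the LDA object above runs internally.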
import pandas as pd
import topicmodels

###############
# select data on which to run topic model
###############

data = pd.read_table("speech_data_extend.txt", encoding="utf-8")
data = data[data.year >= 1947]

###############
# clean documents
###############

docsobj = topicmodels.RawDocs(data.speech, "long")
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")
docsobj.stem()
docsobj.stopword_remove("stems")
docsobj.term_rank("stems")
docsobj.rank_remove("tfidf", "stems", docsobj.tfidf_ranking[5000][1])

all_stems = [s for d in docsobj.stems for s in d]
print("number of unique stems = %d" % len(set(all_stems)))
print("number of total stems = %d" % len(all_stems))

###############
# estimate topic model
###############
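The estimation code is cut off here; following the earlier script and the LDAGibbs call in the test setUp above, a plausible continuation, reusing the 30-topic setting and the sample() arguments shown there, would be:

ldaobj = topicmodels.LDA.LDAGibbs(docsobj.stems, 30)
ldaobj.sample(0, 50, 10)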
Example n. 7
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#change directory to where topicmodels is
os.chdir('/users/timkreienkamp/documents/text-mining-tutorial/')

import topicmodels

#change it back to the folder for the homework
os.chdir('/users/timkreienkamp/documents/textminingbgse/HW4/')

#load data
data = pd.read_table("../HW2/data_puntuation.csv")

docsobj = topicmodels.RawDocs(data.Text, "../HW2/stopwords.txt")

docsobj.token_clean(1)

print(docsobj.tokens[3])

docsobj.stopword_remove("tokens")

print(docsobj.tokens[3])

docsobj.stem()

print(docsobj.stems[3])

docsobj.tf_idf("stems")
Example n. 8
import numpy as np
import pandas as pd
import topicmodels

###############
# select data on which to run topic model
###############

data = pd.read_table("mpc_minutes.txt", encoding="utf-8")

###############
# bag of words
###############

data_agg = data.groupby('year').agg(lambda x: ' '.join(x))

docsobj = topicmodels.RawDocs(data_agg.minutes, "long")
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")
docsobj.stem()
docsobj.stopword_remove("stems")
docsobj.term_rank("stems")

all_stems = [s for d in docsobj.stems for s in d]
print("number of unique stems = %d" % len(set(all_stems)))
print("number of total stems = %d" % len(all_stems))

bowobj = topicmodels.BOW(docsobj.stems)

data_agg['pos'] = bowobj.pos_count('stems')
data_agg['neg'] = bowobj.neg_count('stems')
data_agg['index'] = (data_agg.pos - data_agg.neg) /\