def stemWord(self, fileName, preprocessedFileName=''):
    """Stem each word in fileName and write the result to the temp file."""
    p = PorterStemmer()
    print('Preprocessing...')
    print('Stemming words...')
    if len(preprocessedFileName) != 0:
        self.tempFileName = preprocessedFileName
    with open(self.tempFileName, 'w') as outputfile:
        with open(fileName, 'r') as file:
            while 1:
                word = ''
                line = file.readline()
                if line == '':
                    break
                # copy the first word (the category label) through unchanged
                category = ''
                for ch in line:
                    if ch == ' ':
                        if len(category) != 0:
                            outputfile.write(category + ' ')
                        break
                    else:
                        category += ch
                # stem the remainder of the line, skipping the category label
                for i in range(len(category) + 1, len(line)):
                    if line[i].isalpha():
                        word += line[i].lower()
                    else:
                        if word:
                            outputfile.write(p.stem(word, 0, len(word) - 1))
                            word = ''
                        outputfile.write(line[i].lower())
def convert_keyboard_query():
    qry = raw_input("Type in your query:")
    words = qry.strip().split(' ')
    p = PorterStemmer()
    QUERY_WEIGHT = 2
    new_doc_vec = defaultdict(int)
    for word in words:
        word = word.strip()
        if re.search('[a-zA-Z]', word):
            word = word.lower()
            word = p.stem(word, 0, len(word) - 1)
            if word in new_doc_vec:
                new_doc_vec[word] += QUERY_WEIGHT
            elif word not in stoplist_hash and word in corp_freq_hash:
                new_doc_vec[word] = QUERY_WEIGHT
    new_vect = defaultdict(int)
    for key in new_doc_vec:
        new_vect[key] = new_doc_vec[key]
        if key in synonyms:
            sim_words_list = synonyms_list[synonyms[key]]
            for sim_word in sim_words_list:
                if sim_word not in stoplist_hash and re.search("[a-zA-Z]", sim_word):
                    if corp_freq_hash[sim_word] > 1:
                        new_vect[sim_word] = new_doc_vec[key]
    return new_vect
def Word_appear_count(text, type, Word_count_pubmed, Word_count_twitter, Word_count_all):
    text = remove_tag(text)
    word = text.split()
    p = PorterStemmer()
    for i in word:
        i = p.stem(i, 0, len(i) - 1)  # porter
        # pubmed
        Word_count_pubmed.setdefault(i, 0)
        if type == 'pubmed':
            Word_count_pubmed[i] += 1
        # twitter
        Word_count_twitter.setdefault(i, 0)
        if type == 'twitter':
            Word_count_twitter[i] += 1
        # all
        if i not in Word_count_all:
            Word_count_all[i] = 1
        else:
            Word_count_all[i] += 1
    return Word_count_pubmed, Word_count_twitter, Word_count_all
def __init__(self, stop_words_file=""):
    self.word_doc_frequency = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
    self.doc_class_frequency = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
    self.total_words = 0
    self.class_word_frequency = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
    self.class_doc_frequency = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
    self.stop_words = collections.defaultdict(lambda: False)
    self.stemmer = PorterStemmer()
    self.test_set = collections.defaultdict(lambda: [])
    self.total_doc_test_set = 0
    self.train_set = collections.defaultdict(lambda: [])
    self.total_doc_train_set = 0
    self.data_set_directory = ""
    if stop_words_file != "":
        with open(stop_words_file, 'r') as f:
            for line in f:
                for w in line.split():
                    w = self.normalize(w)
                    self.stop_words[w] = True
def __init__(self, parent, docno, doc, terms): QtGui.QDialog.__init__(self, parent) self.setupUi(self) # Set fields self.labelDocumentNo.setText(docno) textDocument = self.textEdit.document() textCursor = QtGui.QTextCursor(textDocument) normalFormat = QtGui.QTextCharFormat() termFormat = QtGui.QTextCharFormat() termFormat.setForeground(QtGui.QBrush(QtGui.QColor("red"))) termFormat.setFontWeight(QtGui.QFont.Bold) textCursor.beginEditBlock() stemmer = PorterStemmer() terms = terms.split(",") stemmed_terms = [stemmer.stem(term, 0, len(term)-1) for term in terms] for line in unicode(doc).split("\n"): for word in line.split(" "): nword = word.lower().strip(punctuation) sword = stemmer.stem(nword, 0, len(nword)-1) if nword in terms or sword in stemmed_terms: textCursor.insertText(word, termFormat) else: textCursor.insertText(word, normalFormat) textCursor.insertText(" ", normalFormat) textCursor.insertText("\n", normalFormat) self.textEdit.moveCursor(QtGui.QTextCursor.Start)
def __init__(self, is_turbo=False): self.name = 'Leroy' self.userName = '' self.is_turbo = is_turbo self.stemmer = PorterStemmer() self.read_data() self.parsed_sentiment = dict() self.negationWords = ["didn't","not","no","don't"] self.punctuation = {"but",",",".","!",":",";"} self.strongPosVerbs = {"love","loved","adored","adore","enjoy","enjoyed"} self.strongPosAdjectives = {"amazing","cool","awesome","favorite"} self.strongNegVerbs = {"hate","hated","abhored","abhor","loathed","loathe","dispised","dispise"} self.strongNegAdjectives = {"apalling"} self.intensifiersSubject = {"really","reeally","extremely","absolutely"} self.intensifiersObject = {"really","reeally","very","extremely","remarkably","unusually","utterly","absolutely","exceptionally"} self.corrected_movie_trigger = False #For Two movie input self.similarity_words = {"either", "neither", "both", "and"} self.disimilarity_words = {"but"} #TODO: any more? self.userMovies = collections.defaultdict() self.userEmotions = [0,0,0,0,0] # anger, disgust, fear, joy, sadness self.movieDict = collections.defaultdict(lambda:0) self.genreDict = collections.defaultdict(lambda:0) self.movieIDToName = collections.defaultdict(lambda:0) self.movie_name_to_id() self.movie_history = [] self.movie_recommendations = []
def __init__(self, creative=False): # The chatbot's default name is `moviebot`. Give your chatbot a new name. self.name = 'Lit!' self.creative = creative # This matrix has the following shape: num_movies x num_users # The values stored in each row i and column j is the rating for # movie i by user j self.titles, ratings = movielens.ratings() self.sentiment = {} self.porter_stemmer = PorterStemmer() sentimentCopy = movielens.sentiment() for k, v in sentimentCopy.items(): key = self.porter_stemmer.stem(k) self.sentiment[key] = v self.user_ratings = [] ############################################################################# # TODO: Binarize the movie ratings matrix. # ############################################################################# ratings = self.binarize(ratings) # Binarize the movie ratings before storing the binarized matrix. self.ratings = ratings
def __init__(self, is_turbo=False): self.name = 'moviebot' self.is_turbo = is_turbo self.p = PorterStemmer() self.read_data() # self.titles, self.ratings = ratings() self.binarize() self.RecommendationStrings = [ "I think you should check out %s! ", "This movie will blow your mind: %s. ", "Watch %s. It will ruin all other movies for you. " ] self.ratedMovieList = {} self.userRatingVector = np.zeros(len(self.titles)) self.recommendedMovies = [] self.inTheMiddleOfSentimentAnalysis = False self.currentMovieForMoreInformation = "" self.TwoMoviesBoolean = False self.currentConjunction = "" self.sentimentOfPreviousMovie = 0 self.check = {} self.distanceThreshold = 10 self.confirm = False self.previousInput = ""
def process_word(token):
    token = token.lower()
    if constants.STEM is True:
        p = PorterStemmer()
        token = p.stem(token, 0, len(token) - 1)
    return token
def __init__(self, is_turbo=False): self.name = 'IAN' self.is_turbo = is_turbo self.read_data() self.stemmer = PorterStemmer() self.counter = 0 self.already_seen = [] self.recommendations = [] self.delimiters = [".", ",", ";", "!", "?", ":"] self.usersentiment = 0 self.usermovie = "" self.clarify = 0 self.check_spelling_flag = 0 # build stemmed dictionary self.stemmed_sentiment = {} for word in self.sentiment.keys(): self.stemmed_sentiment[self.stemmer.stem(word, 0, len(word) - 1)] = self.sentiment[word] # build editCounts dictionary for spell checking self.editCounts = collections.defaultdict(list) with open("deps/count_1edit.txt") as f: for line in f: rule, countString = line.split("\t") originalText, editedText = rule.split("|") if self.editCounts[originalText] is None: self.editCounts[originalText] = [(editedText, int(countString))] else: self.editCounts[originalText].append( (editedText, int(countString))) sorted(self.editCounts[originalText], key=lambda x: x[1]) self.check_spelling = "" self.already_mentioned = [] self.fromList = []
def stem_words(list_of_tokens):
    stemmer = PorterStemmer()  # Declares the stemmer object
    for token_index, token in enumerate(list_of_tokens):
        # Stems the word using the stemmer's stem function
        list_of_tokens[token_index] = stemmer.stem(token, 0, len(token) - 1)
    return list_of_tokens  # Returns the "post-stem" list of tokens
def stem(tokens):
    p = PorterStemmer()
    stems = []
    for token in tokens:
        stem = p.stem(token, 0, len(token) - 1)
        stems.append(stem)
    return list(filter(None, stems))
def __init__(self):
    # For holding the data - initialized in read_data()
    self.titles = []
    self.docs = []
    self.vocab = []
    # For the text pre-processing.
    self.alphanum = re.compile('[^a-zA-Z0-9]')
    self.p = PorterStemmer()
def stemWords(inList):
    # Function that stems the words.
    # Name: stemWords; input: list (of tokens); output: list (of stemmed tokens)
    outlist = []
    p = PorterStemmer()
    for word in inList:
        outlist.append(p.stem(word, 0, len(word) - 1))
    return outlist
def __init__(self, creative=False): # The chatbot's default name is `moviebot`. Give your chatbot a new name. self.name = 'moviebot' self.creative = creative # This matrix has the following shape: num_movies x num_users # The values stored in each row i and column j is the rating for # movie i by user j self.titles, ratings = movielens.ratings() self.sentiment = movielens.sentiment() self.new_sentiment = {} self.p = PorterStemmer() # create a new sentiment dict with stemmed keys for key in self.sentiment: new_key = self.p.stem(key) self.new_sentiment[new_key] = self.sentiment[key] self.bin_ratings = self.binarize(ratings) # a tuple with the sentiment of the movie being discussed self.current_sentiment = None # the movie title entered by the user self.current_title = None # a list of current movie candidates self.current_idxs = [] self.prev_movie = None self.prev_sentiment = None # a dict where dict[i] = j is the user's sentiment j for movie index i # for movies that the user has described and the chatbot has processed self.user_movies = {} # a set of movie indexes that the user has already described self.user_movie_set = set() self.prefix_match_found = False self.disambig = False # if chatbot is in recommend mode, only respond to yes or no self.recommend_mode = False # a list of recommendations for the user self.recommendations = [] self.recommend_idx = 0 # preprocess movie list by extracting possible titles and year self.movies = [] for entry in self.titles: self.movies.append(extract_titles_and_year(entry[0])) ############################################################################# # TODO: Binarize the movie ratings matrix. # ############################################################################# # Binarize the movie ratings before storing the binarized matrix. self.ratings = ratings
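The stemmed-key lexicon trick above (stemming every sentiment-lexicon key so that stemmed user words can be looked up directly) can be illustrated without the movielens data. A minimal sketch, assuming the local PorterStemmer module with the single-argument stem() used in this snippet; the toy lexicon is an invented stand-in:

# Hedged sketch of stemming lexicon keys; the lexicon below is a toy stand-in,
# not movielens.sentiment(), and PorterStemmer is the local module these
# snippets import (single-argument stem variant).
from PorterStemmer import PorterStemmer

toy_lexicon = {"enjoyed": "pos", "terrible": "neg"}
p = PorterStemmer()
stemmed_lexicon = {p.stem(word): label for word, label in toy_lexicon.items()}
# A stemmed user word can now hit the lexicon even if its surface form differs.
print(stemmed_lexicon.get(p.stem("enjoying"), "unknown"))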
def stemming(self, tokens):
    stemmed_tokens = []
    stem_func = PorterStemmer()
    for c in tokens:
        if c.isalpha():
            stemmed_tokens.append(stem_func.stem(c, 0, len(c) - 1))
        else:
            stemmed_tokens.append(c)
    return stemmed_tokens
def stemInputAndCheckMatch(self, uType, word):
    ps = PorterStemmer()
    stemmedWord = ps.stem(word)
    matchingWords = self.checkMatches(uType, stemmedWord)
    data = self.getMostFrequentWords(matchingWords)
    if data[1] != 1:
        return data[0]
    else:
        return []
def __init__(self, path, num_records):
    self.porter = PorterStemmer()
    self.stop = set()
    with open('stop.words.dat', 'r') as sw:
        for line in sw:
            self.stop.add(line[:-1])
    if path != '' and num_records != 0:
        self.process(path, num_records)
def load_dictionary(filename, stem=True):
    """Loads a line-separated dictionary file into a list"""
    out = []
    p = PorterStemmer()
    for word in open("dictionaries/%s" % filename, "r"):
        word = word.strip().lower()
        if stem is True:
            word = p.stem(word, 0, len(word) - 1)
        out.append(word)
    return out
def __init__(self):
    self.pStemmer = PorterStemmer()
    self.num2Word = NumberToWord()
    self.stopWords = [
        "i", "me", "my", "we", "the", "on", "and", "in", "to", "s", "t",
        "a", "an", "at", "of", "is", "or", "by", "it", "as", "be"
    ]
    self.sOutput = []
    self.artistNames = []
    self.albumNames = []
def __init__(self):
    self.vocabulary = []
    self.invertedIndex = {}
    self.documents = []
    self.documentsUnstemmed = []
    self.tfidf = {}  # {word: {docId: tfidf}}
    self.docIdToFilename = {}
    self.stemmer = PorterStemmer()
    self.stopWords = []
def preprocess(self, query):
    p = PorterStemmer()
    result = []
    # replace any character outside [a-zA-Z0-9_] with a space
    query = re.sub(r"[^\w]", " ", query)
    query = query.lower().split(' ')
    for word in query:
        if word not in self.stopwords:
            result.append(p.stem(word, 0, len(word) - 1))
    return result
def __init__(self):
    '''
    self.saver : Pickles the object on to the disk.
    self.ps    : Porter stemmer class object. It is required to get stem of a word.
    self.st    : Class object to check for stop words.
    '''
    self.saver = save_object()
    self.ps = PorterStemmer()
    self.st = stopwords()
    self.m_ds = {}
    self.models = {}
    self.m_mod = model()
def format_description(text, stop_words):
    words = text.split()
    stemmer = PorterStemmer()
    non_stop_words = []
    for word in words:
        if word not in stop_words:
            # Not a stop word, so lower, remove punctuation, and stem
            lowered_token = remove_punctuation(word).lower()
            non_stop_words.append(stemmer.stem(lowered_token))
    return ' '.join(non_stop_words)
def finalize(tInput, swInput):
    p = PorterStemmer()
    output = open("output.txt", 'w')
    for i in range(len(tInput)):
        token = tInput[i]
        if token == "a" or token == "an" or token == "the":
            output.write("%s\t- article\n" % token)
        elif any(token in x for x in swInput):
            output.write("%s\t- stop word\n" % token)
        else:
            stemword = p.stem(token, 0, len(token) - 1)
            output.write("%s\t- %s\n" % (token, stemword))
    output.close()
def stem_text(text):
    p = PorterStemmer()
    stemmed_text = ''
    word = ''
    for i, c in enumerate(text):
        if c.isalpha():
            word += c.lower()
        if not c.isalpha() or i == (len(text) - 1):
            if word:
                stemmed_text += p.stem(word, 0, len(word) - 1)
                word = ''
            if c.lower() == ' ':
                stemmed_text += c.lower()
    return stemmed_text
def __init__(self, stop_words_file=""):
    self.stop_words = collections.defaultdict(lambda: False)
    self.stemmer = PorterStemmer()
    self.map_trans_terms = collections.defaultdict(lambda: 0)
    self.map_trans_docs = collections.defaultdict(lambda: 0)
    self.total_words = 0
    if stop_words_file != "":
        with open(stop_words_file, 'r') as f:
            for line in f:
                for w in line.split():
                    w = self.normalize(w)
                    self.stop_words[w] = True
    self.BLOCK_SIZE = 100  # Number of entries; each record is a (term, docID) pair
def remove_porterstemmer(input_file, noise_words_set):
    questions = list()
    word_weight = []
    p = PorterStemmer()
    for line in input_file:
        line = line.lower()
        words = filter(None, re.split(r"\W*\d*", line))
        question = []
        for word in words:
            new_word = p.stem(word, 0, len(word) - 1)
            if new_word not in noise_words_set and len(new_word) > 2:
                question.append(new_word)
        questions.append(question)
        word_weight.append(Counter(question))
    return word_weight, questions
def en_preprocess(file_path: str, stop_words: list, step: int = 4) -> str: ''' Step1: Extract pure-text content from the original html file Step2: To lower case, remove special characters Step3: Remove stop words Step4: Porter stemming (Final result) ''' with open(file_path, "r", encoding="UTF-8") as f: html_content = f.read() parsed_content = BeautifulSoup(html_content, 'html.parser') text_content = "" # Extract pure-text content from the original html file for child in parsed_content.find(id="mw-content-text").div.children: if child.name in ("p", "h2", "h3", "h4", "h5"): text_content += child.get_text() if step == 1: return text_content # To lower case text_content = text_content.lower() # Remove special characters text_content = text_content.replace("'", "") text_content = text_content.replace("-", "") for i in range(len(text_content)): curr_char = text_content[i] if not ((curr_char >= 'a' and curr_char <= 'z')): text_content = text_content.replace(curr_char, " ") # Remove duplicated spaces text_content = re.sub("[ ]+", " ", text_content) if step == 2: return text_content # Tokenize token_list = text_content.split(" ") # Remove stop words new_list = [] for token in token_list: if token not in stop_words and token != "": new_list.append(token) token_list = new_list if step == 3: return " ".join(token_list) # Porter stemming p = PorterStemmer() new_list = [] for i in range(len(token_list)): new_list.append(p.stem(token_list[i], 0, len(token_list[i]) - 1)) token_list = new_list final_result = " ".join(token_list) return final_result
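A minimal driver for the four-step pipeline above, assuming en_preprocess() is defined as shown (with its BeautifulSoup and re imports in scope); the file names page.html and stopwords.txt are hypothetical.

# Hypothetical driver for en_preprocess(); "page.html" and "stopwords.txt" are assumptions.
if __name__ == "__main__":
    with open("stopwords.txt", "r", encoding="UTF-8") as f:
        stop_words = f.read().split()
    for step in (1, 2, 3, 4):
        result = en_preprocess("page.html", stop_words, step=step)
        print("--- step", step, "---")
        print(result[:200])  # preview the first 200 characters of each stage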
def __init__(self, is_turbo=False): self.name = 'Moviebot' self.is_turbo = is_turbo self.is_binarized = True self.p = PorterStemmer() self.punctuation = set([",", ".","?","!",":",'"',"'","(",")"]) self.endPunctuation = set([".","?","!",":",'"',"'","(",")"]) self.negateWords = set(["not", "no", "never", "neither", "nor"]) self.prevNegateWords = set(["but", "although", "however", "yet"]) self.extremeWords = set(["very", "really", "extremely"]) self.movies = {} self.movie_to_index_dict = {} self.alternate_titles_dict = {} self.movie_scores = [] self.non_binarized_matrix = {} self.read_data()
def __init__(self, pathData, pathID, pathQuery, uniqueT, isStemming):
    """
    :param pathData: string of pathData
    :param pathID: string of pathID
    :param pathQuery: string of pathQuery
    :param uniqueT: integer of unique term
    :param isStemming: True if using stemming, False if not using stemming
    """
    self.__buildID(pathID)
    if isStemming:
        self.__stemmer = PorterStemmer()
    self.__buildQuery(pathQuery, isStemming)
    self.__uniqueTerm = uniqueT
    self.__buildData(pathData)
def getStemWords(query_line, stopwords):
    raw_data = query_line.replace(".", "").replace(",", "").replace('"', "").replace("\n", "") \
        .replace("-", " ").replace("(", "").replace(")", "").split(" ")
    for i in stopwords:
        while i in raw_data:
            raw_data.remove(i)
    # copy the token list so that stemming does not also overwrite raw_data
    stemmedArray = list(raw_data)
    p = PorterStemmer()
    for i in range(1, len(stemmedArray)):
        while stemmedArray[i] != p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1):
            stemmedArray[i] = p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1)
    return raw_data[0], raw_data[1:], stemmedArray[1:]
class Parser:
    # A processor for removing the commoner morphological and inflexional endings from words in English
    stemmer = None
    stopwords = []

    def __init__(self):
        self.stemmer = PorterStemmer()
        # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
        # self.stopwords = open('data/english.stop', 'r').read().split()

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = string.replace(".", "")
        # collapse runs of whitespace into single spaces
        string = " ".join(string.split())
        string = string.lower()
        return string

    def removeStopWords(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def tokenise(self, string):
        """ break string up into tokens and stem words """
        string = self.clean(string)
        words = string.split(" ")
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
def tokenize(documents): # Read the stopwords stop_word_set = set(open('./stopwords.txt', 'r').read().split()) # Initialize the Porter stemmer p = PorterStemmer() # Create a dictionary where each element is also a dictionary. The outer dictionary will map stemmed words to # document ids and the inner dictionaries will map the document ids to their indices in the document. word_to_doc = defaultdict(lambda: defaultdict(list)) # Positional inverted index for document_index, document in enumerate(documents, start=1): for word_index, word in enumerate(document.split()): if word not in stop_word_set: # Store each word as stemmed and put them to the inverted index stemmed_word = p.stem(word, 0, len(word) - 1) # stemmed_word = word word_to_doc[stemmed_word][document_index].append(word_index) return word_to_doc
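A small way to inspect the positional inverted index built above, assuming tokenize() is defined as shown and ./stopwords.txt exists; document ids start at 1 and token positions at 0, as in the enumerate calls.

# Toy inspection of the positional index; relies on ./stopwords.txt being present.
docs = ["the cat sat on the mat", "the mat was red"]
index = tokenize(docs)
for stemmed_word, postings in index.items():
    # postings maps document id -> positions of the stemmed word in that document
    print(stemmed_word, dict(postings))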
def __init__(self, is_turbo=False): self.name = 'moviebot' self.is_turbo = is_turbo #Initialize relevant classes self.stemmer = PorterStemmer() self.sentiment = {} self.read_data() #User data self.response_indexes = {} #Read in data self.responses = self.readInFile('deps/responses.txt', False) self.articles = ['the', 'a', 'an'] self.negations = self.readInFile('deps/negations.txt', True) self.punctuation = '.,?!-;' self.no_words = self.readInFile('deps/no_words.txt', True) self.yes_words = self.readInFile('deps/yes_words.txt', True) self.findpatterns = [ #patterns for finding movies without quotes '\"(.*?)\"', 'movie.*?(?:was|is|start(?:ed|s)|end(?:ed|s)) (.*)', '(?:I (?:think|thought|feel|felt) | watching )?(.*?) (?:was|is|start(?:ed|s)|end(?:ed|s)) .*?', 'I .*? watching (.*)', 'I .*?(?:watch|enjoy|hat|(?:dis)?lik|lov)ed (.*)' ] #Read in fine-sentiment data self.intensifiers = self.readInFile('deps/intensifiers.txt', True) self.strong_negative = self.readInFile('deps/strong_negative.txt', True) self.strong_negative = [self.stemmer.stem(word) for word in self.strong_negative] self.strong_positive = self.readInFile('deps/strong_positive.txt', True) self.strong_positive = [self.stemmer.stem(word) for word in self.strong_positive] #Binarize ratings matrix self.binarize() self.justGaveRec = False #Initialize relevant vars self.recommendations = [] self.INFO_THRESHOLD = 5 #Pre-process titles, ratings to make later work more efficient. self.titles_map = self.processTitles(self.titles) ## Remember which movies were mentioned without an explicit sentiment self.mentioned_movies = [] self.justFollowedUp = False self.checkingDisamb = False self.prevEmotion = 0 self.emotionWords = self.readInEmotions()
def search_dic(text, SearDic, original_word, index):
    text = remove_tag(text)
    word = text.split()
    p = PorterStemmer()
    for i in word:
        poter_i = p.stem(i, 0, len(i) - 1)  # porter
        if poter_i not in SearDic:
            SearDic[poter_i] = [index]
            original_word[poter_i] = [i]
        else:
            if index not in SearDic[poter_i]:
                SearDic[poter_i].append(index)
            if i not in original_word[poter_i]:
                original_word[poter_i].append(i)
    return SearDic, original_word
def dict_qryid_terms(is_stopping):
    global STOPWORDS_FILE
    stopwords_list = stopwords(STOPWORDS_FILE)  # create the stopwords list
    p = PorterStemmer()  # create a Porter Stemmer instance
    dictquery = defaultdict(lambda: [])  # create the target dictionary
    with open(QUERY_TEXT_FILE, 'r') as f:
        for line in f:
            data_list = re.findall(r"[\w]+", line)
            query_id = data_list[0]
            for term in data_list[1:]:
                term = term.lower()
                if is_stopping:
                    if term not in stopwords_list:
                        dictquery[query_id].append(p.stem(term, 0, len(term) - 1))
                else:
                    dictquery[query_id].append(p.stem(term, 0, len(term) - 1))
    return dictquery
def stem_string(line):
    if line == "":
        return ""
    p = PorterStemmer()
    word = ""
    output = ""
    for c in line:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += p.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    if word:
        output += p.stem(word, 0, len(word) - 1)
    return output
def getQuestionKeywords(question): """Return the keywords from a question. The logic is: remove the stop words and punctuations from question, stem the keywords and remove duplicates Currently there are still issues with 1. stop words list is not complete: eg "recommend" etc is not a stop word. 2. stemmer issue: The current stemmer utility has an issue eg "restaurant" is stemmed to "restau" >>> getQuestionKeywords('what is the best preschool in Potomac?') ['potomac', 'preschool'] >>> getQuestionKeywords('Can someone help with a preschool around potomac?') ['potomac', 'preschool'] >>> getQuestionKeywords('What is the best cafeteria around potomac?') ['potomac', 'restaurant'] """ # split the question into a list keywordList = question.split() # strip the punctuations etc keywordList = [keyword.strip(PUNCTUATION) for keyword in keywordList] # convert into lower case keywordList = [keyword.lower() for keyword in keywordList] #remove stop words from keywords keywordList = [keyword for keyword in keywordList if keyword not in stopWords] #stem the keywords stemmer = PorterStemmer() keywordList = [stemmer.stem(keyword,0,len(keyword)-1) for keyword in keywordList] #take care of synonyms keywordList = [synonyms[keyword] if keyword in synonyms else keyword for keyword in keywordList ] #remove duplicates keywordList = list(set(keywordList)) #sort the keywords keywordList.sort() return keywordList
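The docstring examples above are doctest-style, so they can be executed with the standard doctest module. A sketch, assuming the function lives in a module named keywords_util (hypothetical) together with stopWords, synonyms, and PUNCTUATION:

# Hedged doctest runner; the module name "keywords_util" is an assumption.
import doctest
import keywords_util
doctest.testmod(keywords_util, verbose=True)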
def parse(self):
    # remove stop words
    self.dataList = [w for w in self.dataList if not w in self.stopWords]
    # get the stem of the words
    st = PorterStemmer()
    self.dataList = [st.stem(w, 0, len(w) - 1) for w in self.dataList]
    # rebuild the list ordered by frequency of occurrence (most frequent first)
    wordFreq = {}
    for word in self.dataList:
        if word in wordFreq:
            wordFreq[word] = wordFreq[word] + 1
        else:
            wordFreq[word] = 1
    wordList = sorted(wordFreq.items(), key=operator.itemgetter(1), reverse=True)
    self.dataList = [w[0] for w in wordList]
def __init__(self):
    self.stoplist = open('stopword_list.txt', 'r').read().split()
    self.porter = PorterStemmer()
    doc2id = pickle.load(open('doc2id.pkl', 'rb'))
    self.id2doc = {v: k for k, v in doc2id.items()}
    self.index = pickle.load(open('index.pkl', 'rb'))
    self.pos_index = pickle.load(open('pos_index.pkl', 'rb'))
    self.idf_new_term = log(len(doc2id) / 0.5, 2)
def getTopTerms(currentQuery, weightsMap, topX):
    p = PorterStemmer()
    current_terms = []
    for term in currentQuery.split():
        term = p.stem(term.lower(), 0, len(term) - 1)
        current_terms.append(term)
    i = 0
    new_terms = []
    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
        if term in constants.QUERY_SKIP_TERMS or p.stem(term.lower(), 0, len(term) - 1) in current_terms:
            continue
        new_terms.append(term)
        current_terms.append(p.stem(term.lower(), 0, len(term) - 1))
        i = i + 1
        if topX != 'ALL' and i >= topX:
            break
    return new_terms
def classify(self, query): if self.isSuicide(query): return [('suicidal ideation', 1), ('depression', .5), ('emotional disturbance', .5)] query = "".join(c for c in query if c not in ('!','.',':',',',';','?')).lower() query_words = query.split() p = PorterStemmer() query_words = [p.stem(query_words[i]) for i in range(len(query_words))] q = np.zeros(len(self.word_to_index)) for word in query_words: if word in self.word_to_index: q[self.word_to_index[word]] += self.idf[self.word_to_index[word]] membership_scores = [] for i in range(len(self.tfidf_matrix)): #compute cosine similarity docvec = self.tfidf_matrix[i] cossim = (np.inner(docvec, q)/(np.linalg.norm(docvec)*np.linalg.norm(q))).item(0,0) membership_scores.append(cossim) return sorted(zip(self.categories, membership_scores), key=lambda x: x[1], reverse=True)
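The score in classify() is plain cosine similarity: the inner product of the document vector and the query vector divided by the product of their norms. A self-contained numeric sketch with made-up vectors (not the real tf-idf rows):

# Toy cosine-similarity computation mirroring the formula used in classify().
import numpy as np

docvec = np.array([0.0, 1.2, 3.4, 0.0])
q = np.array([0.0, 1.0, 1.0, 1.0])
cossim = np.inner(docvec, q) / (np.linalg.norm(docvec) * np.linalg.norm(q))
print(round(float(cossim), 3))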
def __init__(self):
    self.dname2id = pickle.load(open('doc2id.pkl', 'rb'))
    try:
        f = open('stopword_list.txt', 'r')
    except IOError:
        raise IOError('Failed to open stopword_list.txt.')
    self.stoplist = f.read().split()
    self.porter = PorterStemmer()
    ## term to its posting list.
    self.index = {}
    self.pos_index = defaultdict(list)
    self.doc_num = len(self.dname2id)
def dicts_docid_words_docid_doclen(): global STOPWORDS_FILE p = PorterStemmer() stopwords_list = stopwords(STOPWORDS_FILE) docid_words_dict = defaultdict(lambda: []) docid_doclen_dict = {} path = CACM_PATH """extract all the file names in the path and put them into a list""" dirs_list = os.listdir(path) for docname in dirs_list: docno = ''.join([s for s in docname if s.isdigit()]) f = urllib.urlopen(path+docname).read() data = re.compile(r'.*?<pre>(.*?)([0-9]+\t[0-9]+\t[0-9]+)', re.DOTALL).match(f).group(1) data = re.findall(r"[\w]+", data) for word in data: word = word.lower() if word not in stopwords_list: word_stemmed = p.stem(word, 0,len(word)-1) docid_words_dict[docno].append(word_stemmed) """doclen is the length of doc after stopping and stemming""" docid_doclen_dict[docno]=len(data) return docid_words_dict,docid_doclen_dict
class Tokenizer: """ Helper class for tokenizing document space and removing stop words """ corpus = None terms = [] stop_words = [] stemmer = None def __init__(self): # read stop words from file self.stop_words = open('stop_words.txt', 'r').read().split() self.stemmer = PorterStemmer() def tokenize(self, docs_string): """ Tokenizer's most important method. It separates the whole corpus string in tokens and removes stop words. """ self.corpus = docs_string self.clean() self.terms = self.corpus.split(" ") self.remove_stop_words() self.remove_duplicates() return self.terms def clean(self): """ get rid of punctuation signs, convert to lower case, standardize spacing """ self.corpus = self.corpus.replace(".", " ") self.corpus = self.corpus.replace(",", " ") self.corpus = self.corpus.lower() self.corpus = self.corpus.replace("\s+", " ") def remove_stop_words(self): self.terms = [self.stemmer.stem(term,0,len(term)-1) for term in self.terms if term not in self.stop_words] def remove_duplicates(self): """ remove duplicated terms in the list """ from sets import Set self.terms = Set((term for term in self.terms))
class Processor: def __init__(self, path, num_records): self.porter = PorterStemmer() self.stop = set() with open("stop.words.dat", "r") as sw: for line in sw: self.stop.add(line[:-1]) if path != "" and num_records != 0: self.process(path, num_records) def process(self, path, num_records): with open(path, "r", encoding="utf-8") as src: with open("sample.txt", "w") as dst: num_total = 0 for line in src: AnonID, Query, QueryTime = line.split("\t")[:3] if AnonID == "AnonID": continue if num_total < num_records: tidy = self.trim(Query) if tidy != "": Query = self.remove_stop_words(tidy) Query = self.porter_stemming(Query) if Query != "": dst.write("{}\t{}\t{}\n".format(AnonID, Query, QueryTime)) num_total += 1 def trim(self, string): return re.sub(r"\W", " ", string) def remove_stop_words(self, string): words = string.split() return " ".join([w for w in words if w not in self.stop]) def porter_stemming(self, string): result = [self.porter.stem(word, 0, len(word) - 1) for word in string.split()] return " ".join(result)
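A hedged usage sketch for the Processor pipeline above (trim, stop-word removal, Porter stemming); it assumes stop.words.dat is present and passes an empty path so that only the three text-cleaning steps are exercised.

# Hypothetical driver; only needs stop.words.dat in the working directory.
if __name__ == "__main__":
    proc = Processor("", 0)  # empty path/count: load stop words, skip log processing
    tidy = proc.trim("Running; the quick tests!")
    no_stops = proc.remove_stop_words(tidy)
    print(proc.porter_stemming(no_stops))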
class Indexer(object): def __init__(self): self.dname2id = pickle.load(open('doc2id.pkl', 'rb')) try: f = open('stopword_list.txt', 'r') except IOError: raise 'Failed to open stopword_list.txt.' self.stoplist = f.read().split() self.porter = PorterStemmer() ## term to its posting list. self.index = {} self.pos_index = defaultdict(list) self.doc_num = len(self.dname2id) def terms_for_keywords_query(self, terms): ## Filter out stop words. return [t for t in terms if t not in self.stoplist] def get_terms(self, contents): terms = contents.split() terms = map(del_punc, terms) terms = map(lambda s : s.lower(), terms) ## Terms for keywords based query(aka: free text query). terms_for_kq = [self.porter.stem(term, 0, len(term)-1) for term in self.terms_for_keywords_query(terms)] ## Terms for phrase query. terms_for_pq = [self.porter.stem(term, 0, len(term)-1) for term in terms] return terms_for_kq, terms_for_pq def get_doc_id(self, dname): return self.dname2id[dname] def build_posting_list_for_pq(self, terms, doc_id): """ Build posting list(term : [doc, [positions]]) for phrase query. """ term2doc_pos = {} for pos, term in enumerate(terms): try: term2doc_pos[term][1].append(pos) except: term2doc_pos[term] = [doc_id, [pos]] for term, posting in term2doc_pos.iteritems(): self.pos_index[term].append(posting) def build_posting_list_for_kq(self, terms, doc_id): """ Build posting list(term : [idf, [(doc1, tf), (doc2, tf), ...]]) for keywords based query. """ tf_counter = Counter(terms) max_elem = tf_counter.most_common(1) most_common_term = max_elem[0][0] max_tf = max_elem[0][1] # print 'Most common term is:', most_common_term, '\tMax tf is:', max_tf for term, tf in tf_counter.iteritems(): if not self.index.has_key(term): df = 1 self.index[term] = [df, [(doc_id, float(tf)/max_tf)]] else: df = self.index[term][0] df += 1 self.index[term][0] = df self.index[term][1].append((doc_id, float(tf)/max_tf)) def write_index_to_file(self): pickle.dump(self.index, open('index.pkl', 'wb')) pickle.dump(self.pos_index, open('pos_index.pkl', 'wb')) def compute_idf(self): for term, postings in self.index.iteritems(): postings[0] = log(float(self.doc_num)/postings[0], 2) def parse_collection(self): stdout_old = sys.stdout sys.stdout = open('indexer_log', 'w') print 'Total %d documents need to be processed.' % self.doc_num for index, (doc_name, doc_id) in enumerate(sorted(self.dname2id.iteritems(), key=itemgetter(1))): try: print 'Building index for:', os.path.basename(doc_name), print '\tDocument ID:', doc_id f = open(doc_name, 'r') except IOError: raise 'Unable to open document [%s]' % doc_name ## Get terms for keywords based query and phrase based query. terms_for_kq, terms_for_pq = self.get_terms(f.read()) self.build_posting_list_for_kq(terms_for_kq, doc_id) self.build_posting_list_for_pq(terms_for_pq, doc_id) self.compute_idf() self.write_index_to_file() sys.stdout = stdout_old
def __init__(self):
    # read stop words from file
    self.stop_words = open('stop_words.txt', 'r').read().split()
    self.stemmer = PorterStemmer()
class IRSystem: def __init__(self): # For holding the data - initialized in read_data() self.titles = [] self.docs = [] self.vocab = [] # For the text pre-processing. self.alphanum = re.compile('[^a-zA-Z0-9]') self.p = PorterStemmer() def get_uniq_words(self): uniq = set() for doc in self.docs: for word in doc: uniq.add(word) return uniq def __read_raw_data(self, dirname): print "Stemming Documents..." titles = [] docs = [] os.mkdir('%s/stemmed' % dirname) title_pattern = re.compile('(.*) \d+\.txt') # make sure we're only getting the files we actually want filenames = [] for filename in os.listdir('%s/raw' % dirname): if filename.endswith(".txt") and not filename.startswith("."): filenames.append(filename) for i, filename in enumerate(filenames): title = title_pattern.search(filename).group(1) print " Doc %d of %d: %s" % (i+1, len(filenames), title) titles.append(title) contents = [] f = open('%s/raw/%s' % (dirname, filename), 'r') of = open('%s/stemmed/%s.txt' % (dirname, title), 'w') for line in f: # make sure everything is lower case line = line.lower() # split on whitespace line = [xx.strip() for xx in line.split()] # remove non alphanumeric characters line = [self.alphanum.sub('', xx) for xx in line] # remove any words that are now empty line = [xx for xx in line if xx != ''] # stem words line = [self.p.stem(xx) for xx in line] # add to the document's conents contents.extend(line) if len(line) > 0: of.write(" ".join(line)) of.write('\n') f.close() of.close() docs.append(contents) return titles, docs def __read_stemmed_data(self, dirname): print "Already stemmed!" titles = [] docs = [] # make sure we're only getting the files we actually want filenames = [] for filename in os.listdir('%s/stemmed' % dirname): if filename.endswith(".txt") and not filename.startswith("."): filenames.append(filename) if len(filenames) != 60: msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n" msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run." raise Exception(msg) for i, filename in enumerate(filenames): title = filename.split('.')[0] titles.append(title) contents = [] f = open('%s/stemmed/%s' % (dirname, filename), 'r') for line in f: # split on whitespace line = [xx.strip() for xx in line.split()] # add to the document's conents contents.extend(line) f.close() docs.append(contents) return titles, docs def read_data(self, dirname): """ Given the location of the 'data' directory, reads in the documents to be indexed. """ # NOTE: We cache stemmed documents for speed # (i.e. write to files in new 'stemmed/' dir). print "Reading in documents..." # dict mapping file names to list of "words" (tokens) filenames = os.listdir(dirname) subdirs = os.listdir(dirname) if 'stemmed' in subdirs: titles, docs = self.__read_stemmed_data(dirname) else: titles, docs = self.__read_raw_data(dirname) # Sort document alphabetically by title to ensure we have the proper # document indices when referring to them. ordering = [idx for idx, title in sorted(enumerate(titles), key = lambda xx : xx[1])] self.titles = [] self.docs = [] numdocs = len(docs) for d in range(numdocs): self.titles.append(titles[ordering[d]]) self.docs.append(docs[ordering[d]]) # Get the vocabulary. self.vocab = [xx for xx in self.get_uniq_words()] def compute_tfidf(self): # ------------------------------------------------------------------- # TODO: Compute and store TF-IDF values for words and documents. 
# Recall that you can make use of: # * self.vocab: a list of all distinct (stemmed) words # * self.docs: a list of lists, where the i-th document is # self.docs[i] => ['word1', 'word2', ..., 'wordN'] # NOTE that you probably do *not* want to store a value for every # word-document pair, but rather just for those pairs where a # word actually occurs in the document. print "Calculating tf-idf..." self.tfidf = {} # initialized for word in self.vocab: for d in range(len(self.docs)): if word not in self.tfidf: self.tfidf[word] = {} self.tfidf[word][d] = 0.0 N = len(self.docs) for word in self.vocab: indices = self.inv_index[word] for i in indices: tf = 1 + math.log10(indices[i]) idf = math.log10(N*1.0 / len(self.get_posting(word))) self.tfidf[word][i] = tf * idf #print self.tfidf # ------------------------------------------------------------------ def get_tfidf(self, word, document): # ------------------------------------------------------------------ # TODO: Return the tf-idf weigthing for the given word (string) and # document index. tfidf = 0.0 if word in self.tfidf: tfidf = self.tfidf[word][document] # ------------------------------------------------------------------ return tfidf def get_tfidf_unstemmed(self, word, document): """ This function gets the TF-IDF of an *unstemmed* word in a document. Stems the word and then calls get_tfidf. You should *not* need to change this interface, but it is necessary for submission. """ word = self.p.stem(word) return self.get_tfidf(word, document) def index(self): """ Build an index of the documents. """ print "Indexing..." # ------------------------------------------------------------------ # TODO: Create an inverted index. # Granted this may not be a linked list as in a proper # implementation. # Some helpful instance variables: # * self.docs = List of documents # * self.titles = List of titles # Example: inv_index['separ'] = {54: 3} in doc id 54, occurs 3 times! inv_index = {} for word in self.vocab: inv_index[word] = {} numdocs = len(self.docs) for d in xrange(0, numdocs): doc = self.docs[d] for word in doc: #if word == "zulu": # print "zulu", inv_index[word] if d in inv_index[word]: inv_index[word][d] = inv_index[word][d]+1 else: inv_index[word][d] = 1 #print inv_index['separ'] #print "zulu inverted index", inv_index['zulu'] #print inv_index self.inv_index = inv_index # ------------------------------------------------------------------ def get_posting(self, word): """ Given a word, this returns the list of document indices (sorted) in which the word occurs. """ # ------------------------------------------------------------------ # TODO: return the list of postings for a word. posting = [] for i in self.inv_index[word]: posting.append(i) posting.sort() #if word == "zulu": # print "posting for word", word , posting return posting # ------------------------------------------------------------------ def get_posting_unstemmed(self, word): """ Given a word, this *stems* the word and then calls get_posting on the stemmed word to get its postings list. You should *not* need to change this function. It is needed for submission. """ word = self.p.stem(word) return self.get_posting(word) def boolean_retrieve(self, query): """ Given a query in the form of a list of *stemmed* words, this returns the list of documents in which *all* of those words occur (ie an AND query). Return an empty list if the query does not return any documents. """ # ------------------------------------------------------------------ # TODO: Implement Boolean retrieval. 
You will want to use your # inverted index that you created in index(). # Right now this just returns all the possible documents! qsets = {} for qword in query: qsets[qword] = set() if qword in self.inv_index: for i in self.inv_index[qword]: qsets[qword].add(i) #for qword in qsets: # print "word", qword, "set", qsets[qword] # initial set final = qsets[query[0]] for x in range(1, len(query)): final = final.intersection(qsets[query[x]]) #print "final set ", final docs = list(final) # ------------------------------------------------------------------ return sorted(docs) # sorted doesn't actually matter def rank_retrieve(self, query): """ Given a query (a list of words), return a rank-ordered list of documents (by ID) and score for the query. """ scores = [0.0 for xx in range(len(self.docs))] # ------------------------------------------------------------------ # TODO: Implement cosine similarity between a document and a list of # query words. # Right now, this code simply gets the score by taking the Jaccard # similarity between the query and every document. tf = {} words_in_query = set() for word in query: words_in_query.add(word) if word not in tf: tf[word] = 1 else: tf[word] = tf[word]+1 #print query, tf for d, doc in enumerate(self.docs): words_in_doc = set(doc) #scores[d] = len(words_in_query.intersection(words_in_doc)) \ # / float(len(words_in_query.union(words_in_doc))) union = words_in_query.union(words_in_doc) #inter = words_in_query.intersection(words_in_doc) # ltclnn = {} # # for w in union: # ltclnn[w] = {} # ltclnn[w]["dn"] = 0 # ltclnn[w]["qn"] = 0 # if w in tf: # ltclnn[w]["qwt"] = 1+ math.log10(tf[w]) # ltclnn[w]["qn"] = ltclnn[w]["qn"] + ltclnn[w]["qwt"]**2 # else: # ltclnn[w]["qwt"] = 0 # ltclnn[w]["qn"] = 0 # # ltclnn[w]["dwt"] = self.get_tfidf(w, d) # ltclnn[w]["dn"] = ltclnn[w]["dn"] + ltclnn[w]["dwt"]**2 # # for w in ltclnn: # ltclnn[w]["qwtn"] = ltclnn[w]["qwt"] / math.sqrt(ltclnn[w]["qn"]) # ltclnn[w]["dwtn"] = ltclnn[w]["dwt"] / math.sqrt(ltclnn[w]["dn"]) # # prod = 0 # for w in ltclnn: # prod = prod + ltclnn[w]["qwtn"] * ltclnn[w]["dwtn"] # # scores[d] = prod ltc_sum = 0 #lnn_sum = 0 ltc_lnn = 0 for term in union: ltc = self.get_tfidf(term, d) ltc_sum = ltc_sum + ltc*ltc if term in tf: lnn = 1 + math.log10(tf[term]) else: lnn = 0 #lnn_sum = lnn_sum + lnn*lnn ltc_lnn = ltc_lnn + ltc*lnn scores[d] = ltc_lnn / math.sqrt(ltc_sum) #print scores # ------------------------------------------------------------------ ranking = [idx for idx, sim in sorted(enumerate(scores), key = lambda xx : xx[1], reverse = True)] results = [] for i in range(10): results.append((ranking[i], scores[ranking[i]])) return results def process_query(self, query_str): """ Given a query string, process it and return the list of lowercase, alphanumeric, stemmed words in the string. """ # make sure everything is lower case query = query_str.lower() # split on whitespace query = query.split() # remove non alphanumeric characters query = [self.alphanum.sub('', xx) for xx in query] # stem words query = [self.p.stem(xx) for xx in query] return query def query_retrieve(self, query_str): """ Given a string, process and then return the list of matching documents found by boolean_retrieve(). """ query = self.process_query(query_str) return self.boolean_retrieve(query) def query_rank(self, query_str): """ Given a string, process and then return the list of the top matching documents, rank-ordered. """ query = self.process_query(query_str) return self.rank_retrieve(query)
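The weighting used in compute_tfidf() and rank_retrieve() above is an ltc/lnn-style scheme: document term weight (1 + log10(tf)) * log10(N/df), query term weight 1 + log10(tf), and a score equal to their dot product normalized by the document vector's length. A standalone sketch of that arithmetic on an invented two-term index:

# Toy tf-idf scoring mirroring the formulas in compute_tfidf()/rank_retrieve();
# the index counts below are invented for illustration.
import math

N = 3                               # number of documents
inv_index = {"hunt": {0: 2, 2: 1},  # term -> {doc id: raw term frequency}
             "king": {0: 1, 1: 4}}
tfidf = {}
for word, postings in inv_index.items():
    idf = math.log10(N / float(len(postings)))
    tfidf[word] = {d: (1 + math.log10(tf)) * idf for d, tf in postings.items()}

query = ["hunt", "king", "king"]
qtf = {w: 1 + math.log10(query.count(w)) for w in set(query)}
for d in range(N):
    dot = sum(tfidf[w].get(d, 0.0) * qtf[w] for w in qtf)
    norm = math.sqrt(sum(tfidf[w].get(d, 0.0) ** 2 for w in tfidf))
    print(d, round(dot / norm, 3) if norm > 0 else 0.0)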
def __init__(self):
    self.stemmer = PorterStemmer()
class TextIndex: def __init__(self): self.index = defaultdict(list) self.p = PorterStemmer() '''get stop words from stopwords file''' def getStopWords(self, stopwordsFile): f = open(stopwordsFile, 'r') stopwords = [line.rstrip() for line in f] self.sw = dict.fromkeys(stopwords) f.close() '''Create an inverted index to store word-document pairs''' def create(self, docList, dirPath, stopwordsFile): self.getStopWords(dirPath + stopwordsFile) for d in docList: file = open(dirPath + d) pos = 1 docIndex={} for word in file.read().split(): '''Remove the punctuation marks''' key = word.lower().strip(".") if key not in self.sw: '''Use the Porter Stemmer algorithm to stem words.''' key = self.p.stem(key, 0, len(key) - 1) try: docIndex[key][1].append(pos) except: docIndex[key]=[d, array('I',[pos])] pos += 1 '''Merge the document index with global index''' for docName, positions in docIndex.items(): self.index[docName].append(positions) print(self.index) '''Get the query type''' def getQueryType(self, query): if '"' in query: return 'PQ' elif (len(query.split()) > 1): return 'FTQ' else: return 'OWQ' '''Query the Index created above''' def queryIndex(self): while True: q = sys.stdin.readline() q = q.rstrip() if q == '': break queryType = self.getQueryType(q) if queryType == 'OWQ': self.oneWordQuery(q) elif queryType == 'FTQ': self.freeTextQuery(q) '''One Word Query''' def oneWordQuery(self, q): originalQuery = q q = self.p.stem(q, 0, len(q) - 1) if len(q) == 0: print('Length of q is zero') return q = "'{}'".format(q) print(q) '''Query contains only one word''' if q not in self.index.keys(): print('q is not in index') return else: pos = self.index[q] pos = [x[0] for x in pos] pos = ' '.join(pos) print(pos) '''Extract words from the free text query ''' def getTerms(self, line): line = line.lower() '''replace non alphanumeric characters with space''' line = re.sub(r'[^a-z0-9 ]',' ',line) line = line.split() line = [x for x in line if x not in self.sw] line = [self.p.stem(word, 0, len(word) -1) for word in line] return line '''This function returns the intersection of lists''' def intersectsLists(self, lists): if len(lists) == 0: return [] '''Sort the list on the basis of length such that smallest item appears first''' lists.sort(key=len) return list(reduce(lambda x, y: set(x) & set(y), lists)) def getPostings(self, terms): '''all terms in the list are guaranteed to be in the index''' return [self.index[term] for term in terms] def getDocsFromPostings(self, postings): '''no empty list in postings''' return [[x[0] for x in p] for p in postings] '''Free Text Query''' def freeTextQuery(self, q): q = self.getTerms(q) if len(q)==0: print('') return li = set() for term in q: try: p=self.index[term] p=[x[0] for x in p] li=li|set(p) except: #term not in index pass li = list(li) li.sort() print(' '.join(li)) '''Phrase Query''' def phraseQuery(self, q): originalQuery=q q = self.getTerms(q) if len(q) == 0: print('') return elif len(q) == 1: self.owq(originalQuery) return phraseDocs = self.phraseQueryDocs(q) print(' '.join(map(str, phraseDocs))) def phraseQueryDocs(self, termList): phraseDocs = [] length = len(termList) '''first find matching docs''' for term in termList: if term not in self.index: '''if a term doesn't appear in the index there can't be any document matching it''' return [] postings = self.getPostings(termList) docs = self.getDocsFromPostings(postings) '''docs are the documents that contain every term in the query''' docs = self.intersectLists(docs) '''postings are the postings list of the terms in 
the documents docs only'''
def __init__(self):
    self.index = defaultdict(list)
    self.p = PorterStemmer()
def stem_words(l):
    ps = PorterStemmer()
    return [ps.stem(x, 0, len(x) - 1) for x in l]
from PorterStemmer import PorterStemmer from pprint import pprint import math corpus = ["At work", "New job", "Enjoying", "Beer", "Days off", "wedding", "Office", "Drinks", "Wine", "Drinks", "Blessed", "A drink", "Hubby", "Much needed", "New place", "Thankful", "apartment", "Excited about", "Vacation", "Celebrate", "Let me know", "Had a blast", "laundry", "care of", "company", "Grocery", "Wishes", "Drinking for eveveryone", "After work", "To work tommorow", "Bills", "taxes", "Husband", "shift", "The bar", "Potty", "ready to", "Celebrating", "To enjoy", "My babies", "Errands", "Relaxing", "apt", "Fingers crossed", "Poor baby", "Day to all", "women", "Work", "Yard", "Doesn't", "Uni", "Days", "Volunteer", "Schedule", "repeat", "House", "Apartment", "Moving", "place", "Rent", "Move", "Month", "Bedroom", "Lease", "Signed", "Roommate", "Interested", "Complex", "Area", "Interest", "apt", "Drinking", "Beer", "Drink", "Cold", "Root", "Beers", "Pong", "Ale", "Ginger", "Cans", "Drinkin", "ginger", "Pint", "Cans", "Bbq", "Pub", "bottles", "Home", "Work", "Ready", "Hubby", "Bed", "Dinner", "relax", "Shower", "Heading", "Relaxing", "Chill", "Nap", "Early", "Supper", "Snuggle", "Money", "Pay", "Bills", "Paid", "Paying", "Bill", "Job", "Month", "Rent", "Check", "Taxes", "Bucks", "Debt", "paycheck", "job", "Position", "Company", "Interview", "Experience", "Manager", "Assistant", "Interested", "Career", "Business", "Resume", "Sales", "Hiring", "Hire"] stoplist = set('for a of the and to in'.split()) stemmer = PorterStemmer() texts = [[word for word in string.lower().split() if word not in stoplist] for string in corpus] words = reduce(list.__add__, texts) stems = [] for word in words: stem = stemmer.stem(word) stems.append(stem) stemCounts = {} numStems = len(stems) for word in stems: if word not in stemCounts: stemCounts[word] = 1.0 else: stemCounts[word] = stemCounts[word] + 1.0 for word in stemCounts: stemCounts[word] = stemCounts[word]/numStems;