def __init__(self, parent, docno, doc, terms):
    """Dialog showing a document with query terms highlighted in bold red.

    parent -- parent Qt widget
    docno  -- document identifier shown in the header label
    doc    -- full document text (split on newlines/spaces for display)
    terms  -- comma-separated query terms; a word is highlighted when it
              matches a term directly or via its Porter stem
    """
    QtGui.QDialog.__init__(self, parent)
    self.setupUi(self)
    # Set fields
    self.labelDocumentNo.setText(docno)
    textDocument = self.textEdit.document()
    textCursor = QtGui.QTextCursor(textDocument)
    normalFormat = QtGui.QTextCharFormat()
    termFormat = QtGui.QTextCharFormat()
    termFormat.setForeground(QtGui.QBrush(QtGui.QColor("red")))
    termFormat.setFontWeight(QtGui.QFont.Bold)
    textCursor.beginEditBlock()
    stemmer = PorterStemmer()
    terms = terms.split(",")
    stemmed_terms = [stemmer.stem(term, 0, len(term) - 1) for term in terms]
    for line in unicode(doc).split("\n"):
        for word in line.split(" "):
            # compare on the lower-cased, punctuation-stripped form
            nword = word.lower().strip(punctuation)
            sword = stemmer.stem(nword, 0, len(nword) - 1)
            if nword in terms or sword in stemmed_terms:
                textCursor.insertText(word, termFormat)
            else:
                textCursor.insertText(word, normalFormat)
            textCursor.insertText(" ", normalFormat)
        textCursor.insertText("\n", normalFormat)
    # BUG FIX: beginEditBlock() was never paired with endEditBlock(),
    # leaving the document's undo block open (Qt requires pairing).
    textCursor.endEditBlock()
    self.textEdit.moveCursor(QtGui.QTextCursor.Start)
def stemWord(self, fileName, preprocessedFileName=''):
    """Stem every word of fileName line by line and write the result to a temp file.

    Each input line starts with a category label followed by a space; the label
    is copied through unstemmed, then the remainder is scanned character by
    character: alphabetic runs are lower-cased and Porter-stemmed, every other
    character is copied through lower-cased.  When preprocessedFileName is
    non-empty it overrides self.tempFileName as the output path.
    """
    p = PorterStemmer()
    print('Preprocessing...')
    print('Stemming words...')
    if len(preprocessedFileName) != 0:
        self.tempFileName = preprocessedFileName
    with open(self.tempFileName, 'w') as outputfile:
        with open(fileName, 'r') as file:
            while 1:
                word = ''
                line = file.readline()
                if line == '':
                    break  # EOF
                # skip first word(category)
                category = ''
                for ch in line:
                    if ch == ' ':
                        if len(category) != 0:
                            outputfile.write(category + ' ')
                        break
                    else:
                        category += ch
                # skip first word (category label)
                # NOTE(review): a line with no space never enters this loop
                # (range start is past the end of `line`) -- confirm intended.
                for i in range(len(category) + 1, len(line)):
                    if line[i].isalpha():
                        word += line[i].lower()
                    else:
                        if word:
                            outputfile.write(p.stem(word, 0, len(word) - 1))
                            word = ''
                        outputfile.write(line[i].lower())
def convert_keyboard_query():
    """Read a query from the keyboard and convert it to a weighted term vector.

    Terms are lower-cased and Porter-stemmed; each occurrence adds
    QUERY_WEIGHT, but unseen terms are only admitted when they are not stop
    words and occur in the corpus.  The vector is then expanded with thesaurus
    synonyms (same weight as the original key) when the synonym is not a stop
    word, looks alphabetic, and occurs more than once in the corpus.

    Relies on module globals: stoplist_hash, corp_freq_hash, synonyms,
    synonyms_list.  (Python 2: uses raw_input.)
    """
    qry = raw_input("Type in your query:")
    words = qry.strip().split(' ')
    p = PorterStemmer()
    QUERY_WEIGHT = 2
    new_doc_vec = defaultdict(int)
    for word in words:
        word = word.strip()
        if re.search('[a-zA-Z]', word):
            word = word.lower()
            word = p.stem(word, 0, len(word) - 1)
            if word in new_doc_vec:
                new_doc_vec[word] += QUERY_WEIGHT
            elif word not in stoplist_hash and word in corp_freq_hash:
                new_doc_vec[word] = QUERY_WEIGHT
            else:
                continue
    new_vect = defaultdict(int)
    for key in new_doc_vec:
        new_vect[key] = new_doc_vec[key]
        if key in synonyms:
            sim_words_list = synonyms_list[synonyms[key]]
            for sim_word in sim_words_list:
                # BUG FIX: was '[a-zA-z]' -- the A-z range also matches
                # the punctuation characters [ \ ] ^ _ ` between 'Z' and 'a'.
                if sim_word not in stoplist_hash and re.search(
                        '[a-zA-Z]', sim_word):
                    if corp_freq_hash[sim_word] > 1:
                        new_vect[sim_word] = new_doc_vec[key]
    return new_vect
def getStemWords(query_line, stopwords):
    """Split a query line into (query_id, raw_terms, stemmed_terms).

    Punctuation is stripped, stop words removed, and every term except the
    leading query id is Porter-stemmed to a fixed point (stemming is repeated
    until the term stops changing).
    """
    raw_data = query_line.replace(".", "").replace(",", "").replace('"', "").replace("\n", "").replace("-", " ") \
        .replace("(", "").replace(")", "").split(" ")
    for stop in stopwords:
        while stop in raw_data:
            raw_data.remove(stop)
    # BUG FIX: was `stemmedArray = raw_data`, an alias -- stemming then
    # mutated raw_data too, so the "raw" terms returned were already stemmed.
    stemmedArray = raw_data[:]
    p = PorterStemmer()
    # index 0 is the query id -- leave it unstemmed
    for i in range(1, len(stemmedArray)):
        while stemmedArray[i] != p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1):
            stemmedArray[i] = p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1)
    return raw_data[0], raw_data[1:], stemmedArray[1:]
def process_word(token):
    """Lower-case a token and, when stemming is enabled in constants, Porter-stem it."""
    lowered = token.lower()
    if constants.STEM is True:
        stemmer = PorterStemmer()
        lowered = stemmer.stem(lowered, 0, len(lowered) - 1)
    return lowered
class Parser:
    """Text processor that strips grammar tokens, filters stop words, and
    Porter-stems tokens (removes common morphological/inflexional endings)."""

    stemmer = None
    stopwords = []

    def __init__(self,):
        self.stemmer = PorterStemmer()
        # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
        #self.stopwords = open('data/english.stop', 'r').read().split()

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        cleaned = string.replace(".", "")
        cleaned = cleaned.replace("\s+", " ")
        return cleaned.lower()

    def removeStopWords(self, list):
        """ Remove common words which have no search value """
        kept = []
        for word in list:
            if word not in self.stopwords:
                kept.append(word)
        return kept

    def tokenise(self, string):
        """ break string up into tokens and stem words """
        tokens = self.clean(string).split(" ")
        return [self.stemmer.stem(tok, 0, len(tok) - 1) for tok in tokens]
def Word_appear_count(text, type, Word_count_pubmed, Word_count_twitter, Word_count_all):
    """Update per-source stem counts for the given text.

    Every stem is ensured to exist in the pubmed and twitter dicts (created
    with count 0), but only the dict matching *type* is incremented; the
    overall dict counts every occurrence regardless of source.  Returns the
    three (mutated) dictionaries.
    """
    tokens = remove_tag(text).split()
    stemmer = PorterStemmer()
    for token in tokens:
        stemmed = stemmer.stem(token, 0, len(token) - 1)  # porter
        # pubmed: key always created, incremented only for pubmed text
        if stemmed not in Word_count_pubmed:
            Word_count_pubmed[stemmed] = 0
        if type == 'pubmed':
            Word_count_pubmed[stemmed] += 1
        # twitter: same scheme
        if stemmed not in Word_count_twitter:
            Word_count_twitter[stemmed] = 0
        if type == 'twitter':
            Word_count_twitter[stemmed] += 1
        # all: plain occurrence count
        if stemmed not in Word_count_all:
            Word_count_all[stemmed] = 1
        else:
            Word_count_all[stemmed] += 1
    return Word_count_pubmed, Word_count_twitter, Word_count_all
def stem(tokens):
    """Porter-stem every token and return the stems, dropping empty results."""
    porter = PorterStemmer()
    stemmed = [porter.stem(tok, 0, len(tok) - 1) for tok in tokens]
    return [s for s in stemmed if s]
def stemWords(inList):
    """Stem the words.

    Input: list (of tokens); output: list (of stemmed tokens).
    """
    porter = PorterStemmer()
    return [porter.stem(tok, 0, len(tok) - 1) for tok in inList]
def dict_qryid_terms(is_stopping):
    """Parse QUERY_TEXT_FILE into a dict of query-id -> stemmed term list.

    When is_stopping is true, terms found in the stop-word list are skipped
    before stemming.
    """
    global STOPWORDS_FILE
    stoplist = stopwords(STOPWORDS_FILE)  # stop-word list
    stemmer = PorterStemmer()  # Porter stemmer instance
    query_dict = defaultdict(lambda: [])  # target dictionary
    with open(QUERY_TEXT_FILE, 'r') as fh:
        for line in fh:
            tokens = re.findall(r"[\w]+", line)
            qid = tokens[0]
            for raw in tokens[1:]:
                lowered = raw.lower()
                if is_stopping and lowered in stoplist:
                    continue
                query_dict[qid].append(stemmer.stem(lowered, 0, len(lowered) - 1))
    return query_dict
def dict_qryid_terms(is_stopping):
    """Parse QUERY_TEXT_FILE into a dict of query-id -> stemmed term list.

    When is_stopping is true, terms found in the stop-word list are skipped
    before stemming.
    """
    global STOPWORDS_FILE
    stoplist = stopwords(STOPWORDS_FILE)  # stop-word list
    stemmer = PorterStemmer()  # Porter stemmer instance
    query_dict = defaultdict(lambda: [])  # target dictionary
    with open(QUERY_TEXT_FILE, 'r') as fh:
        for line in fh:
            tokens = re.findall(r"[\w]+", line)
            qid = tokens[0]
            for raw in tokens[1:]:
                lowered = raw.lower()
                if is_stopping and lowered in stoplist:
                    continue
                query_dict[qid].append(stemmer.stem(lowered, 0, len(lowered) - 1))
    return query_dict
def stem_string(line):
    """Return *line* lower-cased with every alphabetic run Porter-stemmed;
    non-alphabetic characters are passed through lower-cased."""
    if line == "":
        return ""
    stemmer = PorterStemmer()
    pieces = []
    buffered = ""
    for ch in line:
        if ch.isalpha():
            buffered += ch.lower()
            continue
        if buffered:
            pieces.append(stemmer.stem(buffered, 0, len(buffered) - 1))
            buffered = ""
        pieces.append(ch.lower())
    if buffered:
        pieces.append(stemmer.stem(buffered, 0, len(buffered) - 1))
    return "".join(pieces)
def stem_words(list_of_tokens):
    """Porter-stem every token in place and return the same (mutated) list."""
    porter = PorterStemmer()
    for idx in range(len(list_of_tokens)):
        tok = list_of_tokens[idx]
        list_of_tokens[idx] = porter.stem(tok, 0, len(tok) - 1)
    return list_of_tokens
def stemInputAndCheckMatch(self, uType, word):
    """Stem the user's word and return the most frequent matching words.

    Returns data[0] (the match payload) unless the frequency lookup reports
    the sentinel 1 in data[1], in which case an empty list is returned.
    """
    # NOTE(review): stem() is called with a single argument here (NLTK-style
    # signature), while other call sites in this codebase use
    # stem(word, 0, len(word) - 1) -- confirm which PorterStemmer is in scope.
    ps = PorterStemmer()
    stemmedWord = ps.stem(word)
    matchingWords = self.checkMatches(uType, stemmedWord)
    data = self.getMostFrequentWords(matchingWords)
    # presumably data is (words, flag) with flag == 1 meaning "no result" -- verify
    if (data[1] != 1):
        return data[0]
    else:
        return []
def stemming(self, tokens):
    """Porter-stem purely alphabetic tokens; pass all other tokens through unchanged."""
    result = []
    stemmer = PorterStemmer()
    for tok in tokens:
        if tok.isalpha():
            result.append(stemmer.stem(tok, 0, len(tok) - 1))
        else:
            result.append(tok)
    return result
def getTopTerms(currentQuery, weightsMap, topX):
    """Pick up to topX expansion terms by descending weight, skipping terms
    already in the query (compared by Porter stem) and configured skip terms.
    topX may be the string 'ALL' to impose no limit."""
    stemmer = PorterStemmer()
    seen_stems = [stemmer.stem(t.lower(), 0, len(t) - 1) for t in currentQuery.split()]
    picked = []
    count = 0
    for candidate in sorted(weightsMap, key=weightsMap.get, reverse=True):
        stem_form = stemmer.stem(candidate.lower(), 0, len(candidate) - 1)
        if candidate in constants.QUERY_SKIP_TERMS or stem_form in seen_stems:
            continue
        picked.append(candidate)
        seen_stems.append(stem_form)
        count += 1
        if topX != 'ALL' and count >= topX:
            break
    return picked
def load_dictionary(filename, stem=True):
    """Loads line separated dictionary into a list.

    Each line is lower-cased and, when stem is True, Porter-stemmed.
    NOTE: lines keep their trailing newline, matching the original behavior.
    """
    out = []
    # PERF FIX: the stemmer was constructed once per line; hoist it.
    p = PorterStemmer()
    # FIX: use a context manager so the file handle is always closed.
    with open("dictionaries/%s" % filename, "r") as fh:
        for word in fh:
            word = word.lower()
            if stem is True:
                word = p.stem(word, 0, len(word) - 1)
            out.append(word)
    return out
def preprocess(self, query):
    """Normalize a query: replace non-word characters with spaces, lower-case,
    split, drop stop words, and Porter-stem the remaining tokens."""
    stemmer = PorterStemmer()
    # remove any non-alphanumeric characters [a-zA-Z0-9_]
    tokens = re.sub("[^\w]", " ", query).lower().split(' ')
    return [
        stemmer.stem(tok, 0, len(tok) - 1)
        for tok in tokens
        if tok not in self.stopwords
    ]
def format_description(text, stop_words):
    """Return *text* with stop words removed and the remaining words
    punctuation-stripped, lower-cased, stemmed, and re-joined with spaces."""
    words = text.split()
    stemmer = PorterStemmer()
    non_stop_words = []
    for word in words:
        if word not in stop_words:
            # Not a stop word, so lower, remove punctuation, and stem
            lowered_token = remove_punctuation(word).lower()
            #non_stop_words.append(lowered_token)
            # NOTE(review): one-argument stem() call (NLTK-style signature);
            # other snippets use stem(word, 0, len-1) -- confirm the stemmer in scope.
            non_stop_words.append(stemmer.stem(lowered_token))
    return ' '.join(non_stop_words)
def getTopTerms(currentQuery, weightsMap, topX):
    """Pick up to topX expansion terms by descending weight, skipping terms
    already in the query (compared by Porter stem) and configured skip terms.
    topX may be the string 'ALL' to impose no limit."""
    stemmer = PorterStemmer()
    seen_stems = [stemmer.stem(t.lower(), 0, len(t) - 1) for t in currentQuery.split()]
    picked = []
    count = 0
    for candidate in sorted(weightsMap, key=weightsMap.get, reverse=True):
        stem_form = stemmer.stem(candidate.lower(), 0, len(candidate) - 1)
        if candidate in constants.QUERY_SKIP_TERMS or stem_form in seen_stems:
            continue
        picked.append(candidate)
        seen_stems.append(stem_form)
        count += 1
        if topX != 'ALL' and count >= topX:
            break
    return picked
def finalize(tInput, swInput):
    """Write each token of tInput to output.txt, annotated as an article,
    a stop word, or its Porter stem.

    NOTE(review): the stop-word test uses substring membership
    (`token in x`), not equality -- preserved, but confirm it is intended.
    """
    p = PorterStemmer()
    # FIX: context manager guarantees the file is closed even on error.
    with open("output.txt", 'w') as output:
        for token in tInput:
            if token == "a" or token == "an" or token == "the":
                output.write("%s\t- article\n" % token)
            elif any(token in x for x in swInput):
                output.write("%s\t- stop word\n" % token)
            else:
                stemword = p.stem(token, 0, len(token) - 1)
                output.write("%s\t- %s\n" % (token, stemword))
def __init__(self, parent, docno, doc, terms):
    """Dialog showing a document with query terms highlighted in bold red.

    parent -- parent Qt widget
    docno  -- document identifier shown in the header label
    doc    -- full document text (split on newlines/spaces for display)
    terms  -- comma-separated query terms; a word is highlighted when it
              matches a term directly or via its Porter stem
    """
    QtGui.QDialog.__init__(self, parent)
    self.setupUi(self)
    # Set fields
    self.labelDocumentNo.setText(docno)
    textDocument = self.textEdit.document()
    textCursor = QtGui.QTextCursor(textDocument)
    normalFormat = QtGui.QTextCharFormat()
    termFormat = QtGui.QTextCharFormat()
    termFormat.setForeground(QtGui.QBrush(QtGui.QColor("red")))
    termFormat.setFontWeight(QtGui.QFont.Bold)
    textCursor.beginEditBlock()
    stemmer = PorterStemmer()
    terms = terms.split(",")
    stemmed_terms = [
        stemmer.stem(term, 0, len(term) - 1) for term in terms
    ]
    for line in unicode(doc).split("\n"):
        for word in line.split(" "):
            # compare on the lower-cased, punctuation-stripped form
            nword = word.lower().strip(punctuation)
            sword = stemmer.stem(nword, 0, len(nword) - 1)
            if nword in terms or sword in stemmed_terms:
                textCursor.insertText(word, termFormat)
            else:
                textCursor.insertText(word, normalFormat)
            textCursor.insertText(" ", normalFormat)
        textCursor.insertText("\n", normalFormat)
    # BUG FIX: beginEditBlock() was never paired with endEditBlock(),
    # leaving the document's undo block open (Qt requires pairing).
    textCursor.endEditBlock()
    self.textEdit.moveCursor(QtGui.QTextCursor.Start)
def stem_text(text):
    """Porter-stem alphabetic runs in *text*; spaces are kept as separators,
    every other non-alphabetic character is dropped."""
    stemmer = PorterStemmer()
    pieces = []
    current = ''
    last_index = len(text) - 1
    for pos, ch in enumerate(text):
        if ch.isalpha():
            current += ch.lower()
        if not ch.isalpha() or pos == last_index:
            if current:
                pieces.append(stemmer.stem(current, 0, len(current) - 1))
                current = ''
            if ch == ' ':
                pieces.append(ch)
    return ''.join(pieces)
def remove_porterstemmer(input_file, noise_words_set):
    """Tokenize each line, Porter-stem the tokens, and keep stems that are
    not noise words and longer than 2 characters.

    Returns (word_weight, questions): per-line stem Counters and per-line
    kept-stem lists.
    """
    questions = list()
    word_weight = []
    stemmer = PorterStemmer()
    for raw_line in input_file:
        lowered = raw_line.lower()
        tokens = filter(None, re.split("\W*\d*", lowered))
        kept = []
        for tok in tokens:
            stemmed = stemmer.stem(tok, 0, len(tok) - 1)
            if stemmed not in noise_words_set and len(stemmed) > 2:
                kept.append(stemmed)
        questions.append(kept)
        word_weight.append(Counter(kept))
    return word_weight, questions
def remove_porterstemmer(input_file, noise_words_set):
    """Tokenize each line, Porter-stem the tokens, and keep stems that are
    not noise words and longer than 2 characters.

    Returns (word_weight, questions): per-line stem Counters and per-line
    kept-stem lists.
    """
    questions = list()
    word_weight = []
    stemmer = PorterStemmer()
    for raw_line in input_file:
        lowered = raw_line.lower()
        tokens = filter(None, re.split("\W*\d*", lowered))
        kept = []
        for tok in tokens:
            stemmed = stemmer.stem(tok, 0, len(tok) - 1)
            if stemmed not in noise_words_set and len(stemmed) > 2:
                kept.append(stemmed)
        questions.append(kept)
        word_weight.append(Counter(kept))
    return word_weight, questions
def en_preprocess(file_path: str, stop_words: list, step: int = 4) -> str:
    '''
    Step1: Extract pure-text content from the original html file
    Step2: To lower case, remove special characters
    Step3: Remove stop words
    Step4: Porter stemming (Final result)
    '''
    with open(file_path, "r", encoding="UTF-8") as f:
        html_content = f.read()
    parsed_content = BeautifulSoup(html_content, 'html.parser')
    text_content = ""
    # Extract pure-text content from the original html file
    for child in parsed_content.find(id="mw-content-text").div.children:
        if child.name in ("p", "h2", "h3", "h4", "h5"):
            text_content += child.get_text()
    if step == 1:
        return text_content
    # To lower case
    text_content = text_content.lower()
    # Remove special characters.  Apostrophes/hyphens are deleted first so
    # "don't" -> "dont" rather than "don t".
    text_content = text_content.replace("'", "")
    text_content = text_content.replace("-", "")
    # PERF FIX: was a per-character str.replace loop (O(n^2) rescans of the
    # whole string); a single regex pass maps every char outside a-z to a
    # space, which is exactly what the loop did after lower-casing.
    text_content = re.sub("[^a-z]", " ", text_content)
    # Remove duplicated spaces
    text_content = re.sub("[ ]+", " ", text_content)
    if step == 2:
        return text_content
    # Tokenize and remove stop words (also drops empty tokens)
    token_list = [
        token for token in text_content.split(" ")
        if token not in stop_words and token != ""
    ]
    if step == 3:
        return " ".join(token_list)
    # Porter stemming
    p = PorterStemmer()
    token_list = [p.stem(token, 0, len(token) - 1) for token in token_list]
    return " ".join(token_list)
def tokenize(documents):
    """Build a positional inverted index over *documents*.

    Returns a dict mapping each stemmed word to an inner dict that maps
    1-based document ids to the list of word positions in that document.
    Stop words (from ./stopwords.txt) are excluded.
    """
    # Read the stopwords
    stop_word_set = set(open('./stopwords.txt', 'r').read().split())
    # Initialize the Porter stemmer
    stemmer = PorterStemmer()
    # stem -> {doc_id: [positions]}
    word_to_doc = defaultdict(lambda: defaultdict(list))
    for doc_id, doc in enumerate(documents, start=1):
        for position, token in enumerate(doc.split()):
            if token in stop_word_set:
                continue
            stem_key = stemmer.stem(token, 0, len(token) - 1)
            word_to_doc[stem_key][doc_id].append(position)
    return word_to_doc
def search_dic(text, SearDic, original_word, index):
    """Index the words of *text* under their Porter stems.

    SearDic maps stem -> list of document indices; original_word maps
    stem -> list of surface forms seen.  Both are mutated and returned.
    """
    tokens = remove_tag(text).split()
    stemmer = PorterStemmer()
    for token in tokens:
        stem_key = stemmer.stem(token, 0, len(token) - 1)  # porter
        if stem_key not in SearDic:
            SearDic[stem_key] = [index]
            original_word[stem_key] = [token]
        else:
            if index not in SearDic[stem_key]:
                SearDic[stem_key].append(index)
            if token not in original_word[stem_key]:
                original_word[stem_key].append(token)
    return SearDic, original_word
def getQuestionKeywords(question):
    """Return the keywords from a question.

    The logic is: remove the stop words and punctuations from question, stem
    the keywords, map synonyms, and remove duplicates.

    Currently there are still issues with
    1. stop words list is not complete: eg "recommend" etc is not a stop word.
    2. stemmer issue: The current stemmer utility has an issue eg "restaurant" is stemmed to "restau"

    >>> getQuestionKeywords('what is the best preschool in Potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('Can someone help with a preschool around potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('What is the best cafeteria around potomac?')
    ['potomac', 'restaurant']
    """
    # tokenize, strip punctuation, lower-case
    keywordList = [w.strip(PUNCTUATION).lower() for w in question.split()]
    # drop stop words
    keywordList = [w for w in keywordList if w not in stopWords]
    # stem
    stemmer = PorterStemmer()
    keywordList = [stemmer.stem(w, 0, len(w) - 1) for w in keywordList]
    # map synonyms onto their canonical keyword
    keywordList = [synonyms.get(w, w) for w in keywordList]
    # dedupe and sort
    return sorted(set(keywordList))
class Tokenizer:
    """ Helper class for tokenizing document space and removing stop words """
    corpus = None
    terms = []
    stop_words = []
    stemmer = None

    def __init__(self):
        # read stop words from file
        self.stop_words = open('stop_words.txt', 'r').read().split()
        self.stemmer = PorterStemmer()

    def tokenize(self, docs_string):
        """
        Tokenizer's most important method.
        It separates the whole corpus string in tokens and removes stop words.
        """
        self.corpus = docs_string
        self.clean()
        self.terms = self.corpus.split(" ")
        self.remove_stop_words()
        self.remove_duplicates()
        return self.terms

    def clean(self):
        """ get rid of punctuation signs, convert to lower case, standardize spacing """
        self.corpus = self.corpus.replace(".", " ")
        self.corpus = self.corpus.replace(",", " ")
        self.corpus = self.corpus.lower()
        self.corpus = self.corpus.replace("\s+", " ")

    def remove_stop_words(self):
        # stem each surviving term while filtering stop words
        self.terms = [self.stemmer.stem(term, 0, len(term) - 1)
                      for term in self.terms if term not in self.stop_words]

    def remove_duplicates(self):
        """ remove duplicated terms in the list """
        # FIX: `from sets import Set` was removed in Python 3; the builtin
        # set type (available since Python 2.4) is a drop-in replacement.
        self.terms = set(self.terms)
def parse(self):
    """Filter stop words from self.dataList, Porter-stem the remainder, and
    replace self.dataList with the distinct stems ordered by descending
    frequency."""
    # remove stop words
    self.dataList = [w for w in self.dataList if not w in self.stopWords]
    # get the stem of the words
    st = PorterStemmer()
    self.dataList = [st.stem(w, 0, len(w) - 1) for w in self.dataList]
    # add to list based on frequency of occurrence
    wordFreq = {}
    for word in self.dataList:
        # BUG FIX: the first occurrence was initialized to 0, so every count
        # was off by one.  Relative ordering is unchanged (uniform shift).
        wordFreq[word] = wordFreq.get(word, 0) + 1
    wordList = sorted(wordFreq.iteritems(), key=operator.itemgetter(1))
    # ascending sort + insert-at-front yields descending frequency order
    newList = []
    for w in wordList:
        newList.insert(0, w[0])
    self.dataList = newList
class Processor:
    """Cleans a tab-separated query log: trims non-word characters, removes
    stop words (loaded from stop.words.dat), Porter-stems, and writes up to
    num_records cleaned records to sample.txt."""

    def __init__(self, path, num_records):
        self.porter = PorterStemmer()
        self.stop = set()
        with open('stop.words.dat', 'r') as handle:
            for entry in handle:
                self.stop.add(entry[:-1])  # strip trailing newline
        if path != '' and num_records != 0:
            self.process(path, num_records)

    def process(self, path, num_records):
        with open(path, 'r', encoding='utf-8') as src:
            with open('sample.txt', 'w') as dst:
                written = 0
                for record in src:
                    anon_id, query, query_time = record.split('\t')[:3]
                    if anon_id == 'AnonID':  # header row
                        continue
                    if written >= num_records:
                        continue
                    cleaned = self.trim(query)
                    if cleaned == '':
                        continue
                    stemmed = self.porter_stemming(self.remove_stop_words(cleaned))
                    if stemmed != '':
                        dst.write('{}\t{}\t{}\n'.format(
                            anon_id, stemmed, query_time))
                        written += 1

    def trim(self, string):
        # collapse every non-word character to a space
        return re.sub(r'\W', ' ', string)

    def remove_stop_words(self, string):
        kept = [tok for tok in string.split() if tok not in self.stop]
        return ' '.join(kept)

    def porter_stemming(self, string):
        stems = [self.porter.stem(tok, 0, len(tok) - 1)
                 for tok in string.split()]
        return ' '.join(stems)
def classify(self, query):
    """Score *query* against every category by TF-IDF cosine similarity.

    Suicide-related queries short-circuit to a fixed category ranking.
    Returns (category, score) pairs sorted by descending score.
    """
    if self.isSuicide(query):
        return [('suicidal ideation', 1), ('depression', .5), ('emotional disturbance', .5)]
    # strip common punctuation and lower-case
    query = "".join(c for c in query if c not in ('!','.',':',',',';','?')).lower()
    query_words = query.split()
    # NOTE(review): one-argument stem() call (NLTK-style signature); other
    # snippets use stem(word, 0, len-1) -- confirm the stemmer in scope.
    p = PorterStemmer()
    query_words = [p.stem(query_words[i]) for i in range(len(query_words))]
    # build the query vector weighted by idf
    q = np.zeros(len(self.word_to_index))
    for word in query_words:
        if word in self.word_to_index:
            q[self.word_to_index[word]] += self.idf[self.word_to_index[word]]
    membership_scores = []
    for i in range(len(self.tfidf_matrix)):
        #compute cosine similarity
        docvec = self.tfidf_matrix[i]
        # .item(0, 0) implies docvec rows are np.matrix -- TODO confirm;
        # also note a zero query vector would divide by zero here.
        cossim = (np.inner(docvec, q)/(np.linalg.norm(docvec)*np.linalg.norm(q))).item(0,0)
        membership_scores.append(cossim)
    return sorted(zip(self.categories, membership_scores), key=lambda x: x[1], reverse=True)
class Processor:
    """Cleans a tab-separated query log: trims non-word characters, removes
    stop words (loaded from stop.words.dat), Porter-stems, and writes up to
    num_records cleaned records to sample.txt."""

    def __init__(self, path, num_records):
        self.porter = PorterStemmer()
        self.stop = set()
        with open("stop.words.dat", "r") as handle:
            for entry in handle:
                self.stop.add(entry[:-1])  # strip trailing newline
        if path != "" and num_records != 0:
            self.process(path, num_records)

    def process(self, path, num_records):
        with open(path, "r", encoding="utf-8") as src:
            with open("sample.txt", "w") as dst:
                written = 0
                for record in src:
                    anon_id, query, query_time = record.split("\t")[:3]
                    if anon_id == "AnonID":  # header row
                        continue
                    if written >= num_records:
                        continue
                    cleaned = self.trim(query)
                    if cleaned == "":
                        continue
                    stemmed = self.porter_stemming(self.remove_stop_words(cleaned))
                    if stemmed != "":
                        dst.write("{}\t{}\t{}\n".format(anon_id, stemmed, query_time))
                        written += 1

    def trim(self, string):
        # collapse every non-word character to a space
        return re.sub(r"\W", " ", string)

    def remove_stop_words(self, string):
        kept = [tok for tok in string.split() if tok not in self.stop]
        return " ".join(kept)

    def porter_stemming(self, string):
        stems = [self.porter.stem(tok, 0, len(tok) - 1) for tok in string.split()]
        return " ".join(stems)
def dicts_docid_words_docid_doclen():
    """Build two dicts over the CACM corpus files found in CACM_PATH:
    docno -> list of stemmed words, and docno -> token count.
    (Python 2: uses urllib.urlopen on local paths.)"""
    global STOPWORDS_FILE
    p = PorterStemmer()
    stopwords_list = stopwords(STOPWORDS_FILE)  # built but unused: stopping is disabled below
    docid_words_dict = defaultdict(lambda: [])
    docid_doclen_dict = {}
    path = CACM_PATH
    """extract all the file names in the path and put them into a list"""
    dirs_list = os.listdir(path)
    for docname in dirs_list:
        # numeric part of the file name is the document id
        docno = ''.join([s for s in docname if s.isdigit()])
        f = urllib.urlopen(path+docname).read()
        # capture the text between <pre> and the trailing tab-separated numbers
        data = re.compile(r'.*?<pre>(.*?)([0-9]+\t[0-9]+\t[0-9]+)', re.DOTALL).match(f).group(1)
        data = re.findall(r"[\w]+", data)
        for word in data:
            word = word.lower()
            # if word not in stopwords_list:
            word_stemmed = p.stem(word, 0,len(word)-1)
            docid_words_dict[docno].append(word_stemmed)
        """doclen is the length of doc after stopping and stemming"""
        # NOTE(review): len(data) is the raw token count; with stopping
        # disabled that matches the kept words, but the string above may
        # overstate what happens -- confirm intended length definition.
        docid_doclen_dict[docno]=len(data)
    return docid_words_dict,docid_doclen_dict
def dicts_docid_words_docid_doclen():
    """Build two dicts over the CACM corpus files found in CACM_PATH:
    docno -> list of stopped+stemmed words, and docno -> token count.
    (Python 2: uses urllib.urlopen on local paths.)"""
    global STOPWORDS_FILE
    p = PorterStemmer()
    stopwords_list = stopwords(STOPWORDS_FILE)
    docid_words_dict = defaultdict(lambda: [])
    docid_doclen_dict = {}
    path = CACM_PATH
    """extract all the file names in the path and put them into a list"""
    dirs_list = os.listdir(path)
    for docname in dirs_list:
        # numeric part of the file name is the document id
        docno = ''.join([s for s in docname if s.isdigit()])
        f = urllib.urlopen(path+docname).read()
        # capture the text between <pre> and the trailing tab-separated numbers
        data = re.compile(r'.*?<pre>(.*?)([0-9]+\t[0-9]+\t[0-9]+)', re.DOTALL).match(f).group(1)
        data = re.findall(r"[\w]+", data)
        for word in data:
            word = word.lower()
            if word not in stopwords_list:
                word_stemmed = p.stem(word, 0,len(word)-1)
                docid_words_dict[docno].append(word_stemmed)
        """doclen is the length of doc after stopping and stemming"""
        # NOTE(review): len(data) counts tokens BEFORE stop-word removal,
        # contradicting the string above -- confirm which length is wanted.
        docid_doclen_dict[docno]=len(data)
    return docid_words_dict,docid_doclen_dict
def useSynonyms():
    """Load the thesaurus file into the module globals `synonyms`
    (stemmed word -> group index) and `synonyms_list` (group index ->
    list of stemmed members).  Groups are '$'-separated, members
    ','-separated."""
    global synonyms
    global synonyms_list
    synonyms = {}
    synonyms_list = []
    useThesures = True
    if useThesures:
        stemmer = PorterStemmer()
        with open("../txt_files/synonyms_short.txt", 'r') as fh:
            groups = fh.read().split('$')
            group_index = 0
            for group in groups:
                members = []
                for raw in group.split(','):
                    candidate = raw.strip()
                    if re.search('[a-zA-Z]', candidate):
                        candidate = stemmer.stem(candidate.lower(), 0, len(candidate.lower()) - 1)
                        members.append(candidate)
                        synonyms[candidate] = group_index
                synonyms_list.append(members)
                group_index += 1
class Parser:
    """Text processor that strips grammar tokens, filters stop words
    (loaded from english.stop), and Porter-stems tokens."""

    stemmer = None
    stopwords = []

    def __init__(self, ):
        self.stemmer = PorterStemmer()
        self.stopwords = open('english.stop', 'r').read().split()

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        cleaned = string.replace(".", "")
        cleaned = cleaned.replace("\s+", " ")
        return cleaned.lower()

    def remove_stop_words(self, list):
        """ Remove common words which have no search value """
        kept = []
        for word in list:
            if word not in self.stopwords:
                kept.append(word)
        return kept

    def tokenise(self, string):
        """ break string up into tokens and stem words """
        tokens = self.clean(string).split(" ")
        return [self.stemmer.stem(tok, 0, len(tok) - 1) for tok in tokens]
from pprint import pprint
import math

# Candidate status-update phrases, loosely grouped by topic (work, housing,
# drinking, money, jobs, ...).  Duplicates are present in the source data.
corpus = ["At work", "New job", "Enjoying", "Beer", "Days off", "wedding",
          "Office", "Drinks", "Wine", "Drinks", "Blessed", "A drink", "Hubby",
          "Much needed", "New place", "Thankful", "apartment", "Excited about",
          "Vacation", "Celebrate", "Let me know", "Had a blast", "laundry",
          "care of", "company", "Grocery", "Wishes", "Drinking for eveveryone",
          "After work", "To work tommorow", "Bills", "taxes", "Husband",
          "shift", "The bar", "Potty", "ready to", "Celebrating", "To enjoy",
          "My babies", "Errands", "Relaxing", "apt", "Fingers crossed",
          "Poor baby", "Day to all", "women", "Work", "Yard", "Doesn't", "Uni",
          "Days", "Volunteer", "Schedule", "repeat", "House", "Apartment",
          "Moving", "place", "Rent", "Move", "Month", "Bedroom", "Lease",
          "Signed", "Roommate", "Interested", "Complex", "Area", "Interest",
          "apt", "Drinking", "Beer", "Drink", "Cold", "Root", "Beers", "Pong",
          "Ale", "Ginger", "Cans", "Drinkin", "ginger", "Pint", "Cans", "Bbq",
          "Pub", "bottles", "Home", "Work", "Ready", "Hubby", "Bed", "Dinner",
          "relax", "Shower", "Heading", "Relaxing", "Chill", "Nap", "Early",
          "Supper", "Snuggle", "Money", "Pay", "Bills", "Paid", "Paying",
          "Bill", "Job", "Month", "Rent", "Check", "Taxes", "Bucks", "Debt",
          "paycheck", "job", "Position", "Company", "Interview", "Experience",
          "Manager", "Assistant", "Interested", "Career", "Business",
          "Resume", "Sales", "Hiring", "Hire"]

# Words too common to be informative.
stoplist = set('for a of the and to in'.split())
stemmer = PorterStemmer()
# Lower-case and tokenize each phrase, dropping stop words.
texts = [[word for word in string.lower().split() if word not in stoplist]
         for string in corpus]
# Flatten the per-phrase token lists (Python 2: `reduce` is a builtin).
words = reduce(list.__add__, texts)
stems = []
for word in words:
    # NOTE(review): one-argument stem() call (NLTK-style signature); other
    # snippets in this codebase use stem(word, 0, len-1) -- confirm stemmer.
    stem = stemmer.stem(word)
    stems.append(stem)
# Relative frequency of each stem, rounded to 3 decimal places.
stemCounts = {}
numStems = len(stems)
for word in stems:
    if word not in stemCounts:
        stemCounts[word] = 1.0
    else:
        stemCounts[word] = stemCounts[word] + 1.0
for word in stemCounts:
    stemCounts[word] = stemCounts[word]/numStems;
    stemCounts[word] = float("{0:.3f}".format(stemCounts[word]))
class IRSystem:
    """Small TF-IDF information-retrieval system over a directory of text
    documents (Python 2: uses `print` statements and the builtin `reduce`).

    Documents are stemmed once and cached under <dirname>/stemmed/; boolean
    and cosine-ranked retrieval operate on the in-memory inverted index."""

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()

    def get_uniq_words(self):
        """Return the set of distinct words across all documents."""
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq

    def __read_raw_data(self, dirname):
        """Read <dirname>/raw/*.txt, stem each document, cache the stemmed
        text under <dirname>/stemmed/, and return (titles, docs)."""
        print "Stemming Documents..."
        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')
        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)
        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print " Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's conents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs

    def __read_stemmed_data(self, dirname):
        """Read the cached stemmed documents; raises if the cache looks
        incomplete (expects exactly 60 documents)."""
        print "Already stemmed!"
        titles = []
        docs = []
        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)
        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)
        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's conents
                contents.extend(line)
            f.close()
            docs.append(contents)
        return titles, docs

    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        # (i.e. write to files in new 'stemmed/' dir).
        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)
        # Sort document alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles), key = lambda xx : xx[1])]
        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])
        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]

    def compute_tfidf(self):
        # -------------------------------------------------------------------
        # Compute and store TF-IDF values for words and documents.
        # Recall that you can make use of:
        # * self.vocab: a list of all distinct (stemmed) words
        # * self.docs: a list of lists, where the i-th document is
        # self.docs[i] => ['word1', 'word2', ..., 'wordN']
        # -------------------------------------------------------------------
        print "Calculating tf-idf..."
        self.tfidf = {}
        idf = {}
        doc_cont = {}
        # per-document term frequency counters
        for i, doc in enumerate(self.docs):
            doc_cont[i] = collections.Counter(doc)
        for word in self.vocab:
            # document frequency (0.0 + ... forces float division below)
            word_set = 0.0 + len(IRSystem.get_posting(self, word))
            idf[word] = math.log10(len(self.docs) / word_set)
            if word not in self.tfidf:
                self.tfidf[word] = {}
            for d in range(len(self.docs)):
                tf = doc_cont[d][word]
                if tf == 0.0:
                    self.tfidf[word][d] = 0.0
                else:
                    # log-scaled tf times idf
                    self.tfidf[word][d] = (1.0 + math.log10(tf)) * idf[word]

    def get_tfidf(self, word, document):
        """Return the tf-idf weighting for the given word (string) and
        document index."""
        tfidf = self.tfidf[word][document]
        return tfidf

    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)

    def index(self):
        """ Build an index of the documents. """
        print "Indexing..."
        inv_index = {}
        # Create a list for each word
        for word in self.vocab:
            inv_index[word] = []
        # Copy the index of document where the word is
        for i, doc in enumerate(self.docs):
            for word in set(doc):
                inv_index[word].append(i)
        self.inv_index = inv_index

    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        posting = []
        # Return the list of the given word
        posting = self.inv_index[word]
        # NOTE(review): the next two calls discard their results -- the
        # returned posting is NOT actually deduplicated or sorted here
        # (postings happen to be built in sorted unique order by index()).
        set(posting)
        sorted(posting)
        return posting

    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to
        change this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)

    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (ie an AND
        query). Return an empty list if the query does not return any
        documents.
        """
        docs = []
        words_list = []
        # Store in words_list the inv_index of each word of the query
        for word in query:
            words_list.append(set(self.inv_index[word]))
        # Intersect the words_list in a list with the common documents
        # NOTE(review): reduce over an empty query raises TypeError -- verify
        # callers never pass an empty list.
        docs = reduce (lambda x,y: x & y, words_list)
        return sorted(docs)   # sorted doesn't actually matter

    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for xx in range(len(self.docs))]
        q_count = {}
        # Calculate a counter of term-frecuency in query
        q_count = collections.Counter(query)
        # cosine-style score: sum of (query tf) * (doc tf-idf), normalized by
        # the document vector norm only
        for d, doc in enumerate(self.docs):
            intersec = set(query).intersection(set(doc))
            numerator = 0.0
            denominator = 0.0
            for word in intersec:
                qt = (1.0 + math.log10(q_count[word]))
                dt = self.get_tfidf(word,d)
                numerator = numerator + qt*dt
            for word in set(doc):
                dd = self.get_tfidf(word,d)
                denominator = denominator + dd*dd
            # NOTE(review): a document whose every term has tf-idf 0 would
            # divide by zero here -- confirm this cannot occur in the corpus.
            scores[d] = numerator/math.sqrt(denominator)
        ranking = [idx for idx, sim in sorted(enumerate(scores), key = lambda xx : xx[1], reverse = True)]
        results = []
        # top-10 (assumes at least 10 documents)
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results

    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query

    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching
        documents found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)

    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
# Build the stop-word set from 'english.stop' (one word per line).
# NOTE(review): the file handle is never closed.
file = open('english.stop')
StopWords = set()
for word in file:
    word = word.strip()
    if word != '':
        StopWords.add(word)
# Streaming mapper (Python 2): reads "title<TAB>text" records from stdin and
# emits "stem@-@title<TAB>1" for each non-stop-word alphabetic run in text.
for line in sys.stdin:
    line = line.strip().split('\t',2)
    if len(line) != 2:
        continue
    else:
        title = line[0]
        text = line[1]
        #print text
        p = PorterStemmer()
        word = ''
        if text == '':
            continue
        for c in text:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    if word not in StopWords:
                        output = p.stem(word, 0,len(word)-1)
                        print "%s@-@%s\t1" %(output, title)
                    word = ''
        # NOTE(review): a word ending exactly at the end of `text` is never
        # flushed/emitted -- confirm whether trailing words should count.
def main(filename, crashed=False): #"crashed" is an option to continue from the current state if the requests time out #print filename #print crashed h = html2text.HTML2Text() h.ignore_links = True stemmer = PorterStemmer() tf_folder_path = os.path.join(os.getcwd(), 'tf') if not os.path.exists(tf_folder_path): os.mkdir(tf_folder_path) corpus = set() pause_time = 0.5 if check_validity: valid_words = set(str(stemmer.stem(line.rstrip().lower())) for line in open(valid_words_file, 'r')) stop_words = set(str(stemmer.stem(line.rstrip().lower())) for line in open(stop_words_file, 'r')) keywords = set(str(stemmer.stem(word.lower()) for word in extra_search_keywords.split())) stop_words = stop_words.union(keywords) if logging: log = open('tf-log', 'w') # Step 1: Find all distinct specialty classes. connection = sqlite3.connect(db_name) c = connection.cursor() db.select(c, ['specialty'], 'th_specialties', distinct=True) issues = set(str(re.sub(r'[^a-zA-Z]+', ' ', i[0])).lower() for i in c.fetchall()) connection.close() if logging: log.write("Issues: \n") log.write(', '.join(issues)) log.write('\n\n') print "Step 1 complete." # Step 2: For each category, find the top num_articles google results and generate tf counts of the stemmed plaintext. 
if crashed: completed = set(f for f in os.listdir(tf_folder_path) if os.path.isfile(os.path.join(tf_folder_path, f))) issues = issues - completed #print issues for issue in issues: results = search(issue + ' ' + extra_search_keywords, stop = num_articles, pause = pause_time) urls = [str(url) for url in results][: num_articles] if logging: print issue log.write('Issue: ' + issue + '\n') log.write('\n'.join(urls)) log.write('\n\n') cumulative = [] for url in urls: if not url.endswith('.pdf'): try: html = urllib2.urlopen(url) #gets the raw html of the url plaintext = h.handle(unicode(html.read(), 'ISO-8859-1')) #converts the html into plaintext processed = re.sub(r'[^a-zA-Z]+', ' ', plaintext) if check_validity: for word in processed.split(): processed = str(stemmer.stem(word.lower())) if processed not in stop_words and processed in valid_words: cumulative.append(processed) else: stemmed = [str(stemmer.stem(word.lower())) for word in processed.split()] cumulative += stemmed except: #mostly to ignore urllib2 errors... pass counts = Counter(cumulative) tf = open(os.path.join(tf_folder_path, issue), 'w') for word in sorted(counts.keys()): #sort words in alphabetical order corpus.add(word) tf.write(str((word, counts[word]))) #write tuples of words with the word count tf.write('\n') tf.close() print "Step 2 complete." 
# Step 3: Combine files files = sorted(issues) num_files = len(files) count_vectors = {} for word in corpus: count_vectors[word] = [0]*num_files # Flesh out count_vectors for i in range(len(files)): curr = open(os.path.join(tf_folder_path, files[i]), 'r') for line in curr.readlines(): pair = ast.literal_eval(line) count_vectors[pair[0]][i] = pair[1] curr.close() # Write to tf_matrix tf_matrix = open(filename, 'w') tf_matrix.write(','.join(files)) tf_matrix.write('\n') for word in sorted(count_vectors.keys()): line = word + ',' + ','.join([str(num) for num in count_vectors[word]]) tf_matrix.write(line) tf_matrix.write('\n') tf_matrix.close() shutil.rmtree(tf_folder_path) #removes intermediates! print "Step 3 complete." if logging: log.close()
class Indexer(object): def __init__(self): self.dname2id = pickle.load(open('doc2id.pkl', 'rb')) try: f = open('stopword_list.txt', 'r') except IOError: raise 'Failed to open stopword_list.txt.' self.stoplist = f.read().split() self.porter = PorterStemmer() ## term to its posting list. self.index = {} self.pos_index = defaultdict(list) self.doc_num = len(self.dname2id) def terms_for_keywords_query(self, terms): ## Filter out stop words. return [t for t in terms if t not in self.stoplist] def get_terms(self, contents): terms = contents.split() terms = map(del_punc, terms) terms = map(lambda s : s.lower(), terms) ## Terms for keywords based query(aka: free text query). terms_for_kq = [self.porter.stem(term, 0, len(term)-1) for term in self.terms_for_keywords_query(terms)] ## Terms for phrase query. terms_for_pq = [self.porter.stem(term, 0, len(term)-1) for term in terms] return terms_for_kq, terms_for_pq def get_doc_id(self, dname): return self.dname2id[dname] def build_posting_list_for_pq(self, terms, doc_id): """ Build posting list(term : [doc, [positions]]) for phrase query. """ term2doc_pos = {} for pos, term in enumerate(terms): try: term2doc_pos[term][1].append(pos) except: term2doc_pos[term] = [doc_id, [pos]] for term, posting in term2doc_pos.iteritems(): self.pos_index[term].append(posting) def build_posting_list_for_kq(self, terms, doc_id): """ Build posting list(term : [idf, [(doc1, tf), (doc2, tf), ...]]) for keywords based query. 
""" tf_counter = Counter(terms) max_elem = tf_counter.most_common(1) most_common_term = max_elem[0][0] max_tf = max_elem[0][1] # print 'Most common term is:', most_common_term, '\tMax tf is:', max_tf for term, tf in tf_counter.iteritems(): if not self.index.has_key(term): df = 1 self.index[term] = [df, [(doc_id, float(tf)/max_tf)]] else: df = self.index[term][0] df += 1 self.index[term][0] = df self.index[term][1].append((doc_id, float(tf)/max_tf)) def write_index_to_file(self): pickle.dump(self.index, open('index.pkl', 'wb')) pickle.dump(self.pos_index, open('pos_index.pkl', 'wb')) def compute_idf(self): for term, postings in self.index.iteritems(): postings[0] = log(float(self.doc_num)/postings[0], 2) def parse_collection(self): stdout_old = sys.stdout sys.stdout = open('indexer_log', 'w') print 'Total %d documents need to be processed.' % self.doc_num for index, (doc_name, doc_id) in enumerate(sorted(self.dname2id.iteritems(), key=itemgetter(1))): try: print 'Building index for:', os.path.basename(doc_name), print '\tDocument ID:', doc_id f = open(doc_name, 'r') except IOError: raise 'Unable to open document [%s]' % doc_name ## Get terms for keywords based query and phrase based query. terms_for_kq, terms_for_pq = self.get_terms(f.read()) self.build_posting_list_for_kq(terms_for_kq, doc_id) self.build_posting_list_for_pq(terms_for_pq, doc_id) self.compute_idf() self.write_index_to_file() sys.stdout = stdout_old
class IRSystem:
    """Small IR system: reads/stems a 60-document corpus, builds a
    positional inverted index, and supports boolean and tf-idf-ranked
    (cosine) retrieval."""

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()

    def get_uniq_words(self):
        """Return the set of distinct (stemmed) words across all docs."""
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq

    def __read_raw_data(self, dirname):
        """Stem raw .txt files under dirname/raw and cache the result
        under dirname/stemmed; return (titles, docs)."""
        print "Stemming Documents..."
        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')
        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)
        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print " Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's conents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs

    def __read_stemmed_data(self, dirname):
        """Load the previously stemmed cache; return (titles, docs)."""
        print "Already stemmed!"
        titles = []
        docs = []
        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)
        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)
        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's conents
                contents.extend(line)
            f.close()
            docs.append(contents)
        return titles, docs

    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).
        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)
        # Sort document alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles),
                                                 key = lambda xx : xx[1])]
        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])
        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]

    def compute_tfidf(self):
        """Store tf-idf (1+log10 tf, log10 idf) per word/doc pair and
        per-document l2 norms for cosine similarity."""
        print "Calculating tf-idf..."
        self.tfidf = {}
        N = len(self.docs)
        for word in self.vocab:
            if word not in self.tfidf:
                self.tfidf[word] = {}
            idf = math.log10(N*1./len(self.inv_index[word]))
            for index,d in enumerate(self.inv_index[word]):
                # tf = number of positions of `word` in doc d (inv_index
                # stores position lists).
                tf = math.log10(1.*len(self.inv_index[word][d]))
                self.tfidf[word][d] = (1+tf)*idf
        # Calculate per-document l2 norms for use in cosine similarity
        # self.tfidf_l2norm[d] = sqrt(sum[tdidf**2])) for tdidf of all words in
        # document number d
        tfidf_l2norm2 = {}
        for word, d_dict in self.tfidf.items():
            for d,val in d_dict.items():
                tfidf_l2norm2[d] = tfidf_l2norm2.get(d, 0.0) + val ** 2
        self.tfidf_l2norm = dict((k,math.sqrt(v)) for k,v in tfidf_l2norm2.items())

    def get_tfidf(self, word, document):
        """Return the tf-idf weighting for the given word (string) and
        document index; 0 if not stored."""
        # NOTE(review): raises KeyError when `document` is not in
        # self.tfidf[word]; the `is not None` guard never fires — confirm
        # callers only pass docs that contain the word.
        if self.tfidf[word][document] is not None:
            return self.tfidf[word][document]
        else:
            return 0

    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)

    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        # Positional inverted index: inv_index[word][doc_id] = [positions].
        inv_index = {}
        for i,title in enumerate(self.titles):
            for j,word in enumerate(self.docs[i]):
                if not word in inv_index:
                    inv_index[word] = {}
                if not i in inv_index[word]:
                    inv_index[word][i] = []
                inv_index[word][i].append(j)
        self.inv_index = inv_index

    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        # NOTE(review): dict key order — the returned posting list is not
        # explicitly sorted despite the docstring; confirm callers don't rely
        # on ordering.
        posting = self.inv_index[word].keys()
        return posting

    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)

    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (ie an AND
        query). Return an empty list if the query does not return any
        documents.
        """
        # Start from the set of all docs and intersect with each term's
        # posting set.
        docs = []
        for d in range(len(self.docs)):
            docs.append(d)
        docsets = set(docs)
        for q in query:
            docsets &= set(self.inv_index[q].keys())
        docs = list(docsets)
        return docs # sorted doesn't actually matter

    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for xx in range(len(self.docs))]
        # Disabled reference implementation (Jaccard similarity) kept below
        # as a string literal:
        """
        words_in_query = set()
        for word in query:
            words_in_query.add(word)
        for d, doc in enumerate(self.docs):
            words_in_doc = set(doc)
            scores[d] = len(words_in_query.intersection(words_in_doc)) \
                / float(len(words_in_query.union(words_in_doc)))
        ranking = [idx for idx, sim in sorted(enumerate(scores),
                                              key = lambda xx : xx[1],
                                              reverse = True)]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results
        """
        # Query term weights: 1 + log10(tf).
        wordvec = {}
        for word in query:
            wordvec[word] = wordvec.get(word,0) + 1
        wordvec = dict((word, math.log10(wordvec[word])+1.) for word in wordvec)
        def get_score(d):
            """Return score for document d
            This is cos(query_vec * d_vec/norm) where
            d_vec[word] = tfidf of word in doc number d
            norm = sqrt(d_vec[w]**2) for all words w in doc number d
            """
            d_vec = dict((word, self.tfidf[word].get(d,0.0)) for word in wordvec)
            return sum(wordvec[word] * d_vec[word] for word in d_vec)/self.tfidf_l2norm[d]
        # Compute scores and add to a priority queue
        scores = []
        for d in range(len(self.docs)):
            heapq.heappush(scores, (get_score(d), d))
        # Return top 10 scores
        return [(k,v) for v,k in heapq.nlargest(10,scores)]

    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query

    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching
        documents found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)

    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
class IRSystem:
    """IR system variant that keeps per-document term-frequency Counters
    (self.tf) plus a set-valued inverted index, and ranks with ltc.lnn
    cosine similarity."""

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()

    def get_uniq_words(self):
        """Return the set of distinct (stemmed) words across all docs."""
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq

    def __read_raw_data(self, dirname):
        """Stem raw .txt files under dirname/raw and cache them under
        dirname/stemmed; return (titles, docs)."""
        print "Stemming Documents..."
        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')
        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)
        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print " Doc %d of %d: %s" % (i + 1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's conents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs

    def __read_stemmed_data(self, dirname):
        """Load the previously stemmed cache; return (titles, docs)."""
        print "Already stemmed!"
        titles = []
        docs = []
        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)
        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)
        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's conents
                contents.extend(line)
            f.close()
            docs.append(contents)
        return titles, docs

    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).
        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)
        # Sort document alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [
            idx for idx, title in sorted(enumerate(titles), key=lambda xx: xx[1])
        ]
        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])
        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]

    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query

    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        self.tf = defaultdict(Counter) #'term-frequency'
        #tf[1]['winter']=2.0 which means that the word 'winter' has been
        #mentioned twice in the second docoment
        #takes around '5' seconds
        inv_index = defaultdict(set)
        for i in range(len(self.docs)):
            for word in self.docs[i]:
                self.tf[i][word] += 1.
                inv_index[word].add(i)
        self.inv_index = inv_index

    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        # NOTE(review): actually returns the raw set (unsorted), not a
        # sorted list as the docstring says — callers here only need
        # membership/len, but confirm before changing.
        return self.inv_index[word]

    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)

    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (ie an AND
        query). Return an empty list if the query does not return any
        documents.
        """
        out = self.get_posting(query[0])
        if len(query) > 1:
            for word in query[1:]:
                out = self.get_posting(word).intersection(out)
        return sorted(out)

    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching
        documents found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)

    def compute_tfidf(self):
        """Store ltc weights per doc/word and the squared l2 norm of each
        document vector (self.doc_tfidf) for cosine normalization."""
        print "Calculating tf-idf..."
        self.tfidf = defaultdict(Counter)
        self.doc_tfidf = defaultdict(float) #used in 'cosine similarity'
        N = len(self.docs) #number of whole documents
        for word in self.vocab:
            idf = math.log10(float(N) / len(self.get_posting(word)))
            for i in range(N):
                try:
                    self.tfidf[i][word] = (1. + math.log10(self.tf[i][word])) * idf
                    self.doc_tfidf[i] += self.tfidf[i][word]**2
                except ValueError:
                    # math.log10(0) raises ValueError when the word does not
                    # occur in doc i; weight is 0 in that case.
                    self.tfidf[i][word] = 0.

    def get_tfidf(self, word, document):
        """Return the stored tf-idf weight (0.0 if absent, via Counter)."""
        return self.tfidf[document][word]

    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)

    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for _ in range(len(self.docs))]
        query_tf = Counter(query)
        # Accumulate dot products of query (lnn) and document (ltc) vectors.
        for word in query:
            query_weight = 1. + math.log10(query_tf[word])
            posting_set = self.get_posting(word)
            for d in posting_set:
                scores[d] += self.tfidf[d][word] * query_weight
        # Normalize by the document vector length.
        for d in range(len(self.docs)):
            scores[d] /= math.sqrt(self.doc_tfidf[d])
        #Sort the 'scores'
        ranking = [
            idx for idx, sim in sorted(
                enumerate(scores), key=lambda xx: xx[1], reverse=True)
        ]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results

    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
class TextIndex:
    """Positional inverted index over a document list, supporting one-word,
    free-text, and phrase queries read from stdin."""

    def __init__(self):
        self.index = defaultdict(list)
        self.p = PorterStemmer()

    '''get stop words from stopwords file'''
    def getStopWords(self, stopwordsFile):
        f = open(stopwordsFile, 'r')
        stopwords = [line.rstrip() for line in f]
        self.sw = dict.fromkeys(stopwords)
        f.close()

    '''Create an inverted index to store word-document pairs'''
    def create(self, docList, dirPath, stopwordsFile):
        self.getStopWords(dirPath + stopwordsFile)
        for d in docList:
            file = open(dirPath + d)
            pos = 1
            docIndex={}
            for word in file.read().split():
                '''Remove the punctuation marks'''
                key = word.lower().strip(".")
                if key not in self.sw:
                    '''Use the Porter Stemmer algorithm to stem words.'''
                    key = self.p.stem(key, 0, len(key) - 1)
                    try:
                        docIndex[key][1].append(pos)
                    except:
                        docIndex[key]=[d, array('I',[pos])]
                pos += 1
            '''Merge the document index with global index'''
            for docName, positions in docIndex.items():
                self.index[docName].append(positions)
        print(self.index)

    '''Get the query type'''
    def getQueryType(self, query):
        if '"' in query:
            return 'PQ'
        elif (len(query.split()) > 1):
            return 'FTQ'
        else:
            return 'OWQ'

    '''Query the Index created above'''
    def queryIndex(self):
        # Read queries from stdin until a blank line.
        # NOTE(review): PQ (phrase) queries are classified but never
        # dispatched here — confirm phraseQuery should be called.
        while True:
            q = sys.stdin.readline()
            q = q.rstrip()
            if q == '':
                break
            queryType = self.getQueryType(q)
            if queryType == 'OWQ':
                self.oneWordQuery(q)
            elif queryType == 'FTQ':
                self.freeTextQuery(q)

    '''One Word Query'''
    def oneWordQuery(self, q):
        originalQuery = q
        q = self.p.stem(q, 0, len(q) - 1)
        if len(q) == 0:
            print('Length of q is zero')
            return
        # NOTE(review): wrapping q in quotes means the lookup key never
        # matches the unquoted keys stored by create() — confirm intended.
        q = "'{}'".format(q)
        print(q)
        '''Query contains only one word'''
        if q not in self.index.keys():
            print('q is not in index')
            return
        else:
            pos = self.index[q]
            pos = [x[0] for x in pos]
            pos = ' '.join(pos)
            print(pos)

    '''Extract words from the free text query '''
    def getTerms(self, line):
        line = line.lower()
        '''replace non alphanumeric characters with space'''
        line = re.sub(r'[^a-z0-9 ]',' ',line)
        line = line.split()
        line = [x for x in line if x not in self.sw]
        line = [self.p.stem(word, 0,
                            len(word) -1) for word in line]
        return line

    '''This function returns the intersection of lists'''
    def intersectsLists(self, lists):
        if len(lists) == 0:
            return []
        '''Sort the list on the basis of length such that smallest item
        appears first'''
        lists.sort(key=len)
        return list(reduce(lambda x, y: set(x) & set(y), lists))

    def getPostings(self, terms):
        '''all terms in the list are guaranteed to be in the index'''
        return [self.index[term] for term in terms]

    def getDocsFromPostings(self, postings):
        '''no empty list in postings'''
        return [[x[0] for x in p] for p in postings]

    '''Free Text Query'''
    def freeTextQuery(self, q):
        q = self.getTerms(q)
        if len(q)==0:
            print('')
            return
        li = set()
        for term in q:
            try:
                p=self.index[term]
                p=[x[0] for x in p]
                li=li|set(p)
            except:
                #term not in index
                pass
        li = list(li)
        li.sort()
        print(' '.join(li))

    '''Phrase Query'''
    def phraseQuery(self, q):
        originalQuery=q
        q = self.getTerms(q)
        if len(q) == 0:
            print('')
            return
        elif len(q) == 1:
            # NOTE(review): self.owq is not defined on this class — likely
            # should be self.oneWordQuery; confirm and fix.
            self.owq(originalQuery)
            return
        phraseDocs = self.phraseQueryDocs(q)
        print(' '.join(map(str, phraseDocs)))

    def phraseQueryDocs(self, termList):
        phraseDocs = []
        length = len(termList)
        '''first find matching docs'''
        for term in termList:
            if term not in self.index:
                '''if a term doesn't appear in the index there can't be any
                document matching it'''
                return []
        postings = self.getPostings(termList)
        docs = self.getDocsFromPostings(postings)
        '''docs are the documents that contain every term in the query'''
        # NOTE(review): self.intersectLists is not defined — the method above
        # is named intersectsLists; confirm and fix.
        docs = self.intersectLists(docs)
        '''postings are the postings list of the terms in the documents docs only'''
class IRSystem:
    """IR system variant: inverted index stores per-document occurrence
    counts (inv_index[word][doc] = count); ranking uses ltc.lnn cosine
    similarity."""

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()

    def get_uniq_words(self):
        """Return the set of distinct (stemmed) words across all docs."""
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq

    def __read_raw_data(self, dirname):
        """Stem raw .txt files under dirname/raw and cache them under
        dirname/stemmed; return (titles, docs)."""
        print "Stemming Documents..."
        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')
        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)
        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print " Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's conents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs

    def __read_stemmed_data(self, dirname):
        """Load the previously stemmed cache; return (titles, docs)."""
        print "Already stemmed!"
        titles = []
        docs = []
        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)
        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)
        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's conents
                contents.extend(line)
            f.close()
            docs.append(contents)
        return titles, docs

    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).
        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)
        # Sort document alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles),
                                                 key = lambda xx : xx[1])]
        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])
        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]

    def compute_tfidf(self):
        """Compute tf-idf ((1+log10 tf) * log10 idf) for every word/doc
        pair; pairs where the word is absent stay 0.0."""
        print "Calculating tf-idf..."
        self.tfidf = {}
        # initialized
        for word in self.vocab:
            for d in range(len(self.docs)):
                if word not in self.tfidf:
                    self.tfidf[word] = {}
                self.tfidf[word][d] = 0.0
        N = len(self.docs)
        for word in self.vocab:
            indices = self.inv_index[word]
            for i in indices:
                # indices[i] is the occurrence count of `word` in doc i.
                tf = 1 + math.log10(indices[i])
                idf = math.log10(N*1.0 / len(self.get_posting(word)))
                self.tfidf[word][i] = tf * idf
        #print self.tfidf

    def get_tfidf(self, word, document):
        """Return the tf-idf weighting for the given word (string) and
        document index; 0.0 for unknown words."""
        tfidf = 0.0
        if word in self.tfidf:
            tfidf = self.tfidf[word][document]
        return tfidf

    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)

    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        # Example: inv_index['separ'] = {54: 3} in doc id 54, occurs 3 times!
        inv_index = {}
        for word in self.vocab:
            inv_index[word] = {}
        numdocs = len(self.docs)
        for d in xrange(0, numdocs):
            doc = self.docs[d]
            for word in doc:
                if d in inv_index[word]:
                    inv_index[word][d] = inv_index[word][d]+1
                else:
                    inv_index[word][d] = 1
        self.inv_index = inv_index

    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        posting = []
        for i in self.inv_index[word]:
            posting.append(i)
        posting.sort()
        return posting

    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)

    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (ie an AND
        query). Return an empty list if the query does not return any
        documents.
        """
        # Build one doc-id set per query word, then intersect them all.
        qsets = {}
        for qword in query:
            qsets[qword] = set()
            if qword in self.inv_index:
                for i in self.inv_index[qword]:
                    qsets[qword].add(i)
        # initial set
        final = qsets[query[0]]
        for x in range(1, len(query)):
            final = final.intersection(qsets[query[x]])
        docs = list(final)
        return sorted(docs) # sorted doesn't actually matter

    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for xx in range(len(self.docs))]
        # Build the query term-frequency table.
        tf = {}
        words_in_query = set()
        for word in query:
            words_in_query.add(word)
            if word not in tf:
                tf[word] = 1
            else:
                tf[word] = tf[word]+1
        # ltc.lnn cosine scoring over the union of query and doc vocabularies;
        # only document-side normalization is applied.
        # (An earlier fully-normalized ltc/lnn experiment was removed here.)
        for d, doc in enumerate(self.docs):
            words_in_doc = set(doc)
            union = words_in_query.union(words_in_doc)
            ltc_sum = 0
            #lnn_sum = 0
            ltc_lnn = 0
            for term in union:
                ltc = self.get_tfidf(term, d)
                ltc_sum = ltc_sum + ltc*ltc
                if term in tf:
                    lnn = 1 + math.log10(tf[term])
                else:
                    lnn = 0
                #lnn_sum = lnn_sum + lnn*lnn
                ltc_lnn = ltc_lnn + ltc*lnn
            scores[d] = ltc_lnn / math.sqrt(ltc_sum)
        ranking = [idx for idx, sim in sorted(enumerate(scores),
                                              key = lambda xx : xx[1],
                                              reverse = True)]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results

    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query

    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching
        documents found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)

    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
class IRSystem: def __init__(self): # For holding the data - initialized in read_data() self.titles = [] self.docs = [] self.vocab = [] # For the text pre-processing. self.alphanum = re.compile('[^a-zA-Z0-9]') self.p = PorterStemmer() def get_uniq_words(self): uniq = set() for doc in self.docs: for word in doc: uniq.add(word) return uniq def __read_raw_data(self, dirname): print "Stemming Documents..." titles = [] docs = [] os.mkdir('%s/stemmed' % dirname) title_pattern = re.compile('(.*) \d+\.txt') # make sure we're only getting the files we actually want filenames = [] for filename in os.listdir('%s/raw' % dirname): if filename.endswith(".txt") and not filename.startswith("."): filenames.append(filename) for i, filename in enumerate(filenames): title = title_pattern.search(filename).group(1) print " Doc %d of %d: %s" % (i+1, len(filenames), title) titles.append(title) contents = [] f = open('%s/raw/%s' % (dirname, filename), 'r') of = open('%s/stemmed/%s.txt' % (dirname, title), 'w') for line in f: # make sure everything is lower case line = line.lower() # split on whitespace line = [xx.strip() for xx in line.split()] # remove non alphanumeric characters line = [self.alphanum.sub('', xx) for xx in line] # remove any words that are now empty line = [xx for xx in line if xx != ''] # stem words line = [self.p.stem(xx) for xx in line] # add to the document's conents contents.extend(line) if len(line) > 0: of.write(" ".join(line)) of.write('\n') f.close() of.close() docs.append(contents) return titles, docs def __read_stemmed_data(self, dirname): print "Already stemmed!" titles = [] docs = [] # make sure we're only getting the files we actually want filenames = [] for filename in os.listdir('%s/stemmed' % dirname): if filename.endswith(".txt") and not filename.startswith("."): filenames.append(filename) if len(filenames) != 60: msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n" msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run." 
raise Exception(msg) for i, filename in enumerate(filenames): title = filename.split('.')[0] titles.append(title) contents = [] f = open('%s/stemmed/%s' % (dirname, filename), 'r') for line in f: # split on whitespace line = [xx.strip() for xx in line.split()] # add to the document's conents contents.extend(line) f.close() docs.append(contents) return titles, docs def read_data(self, dirname): """ Given the location of the 'data' directory, reads in the documents to be indexed. """ # NOTE: We cache stemmed documents for speed # (i.e. write to files in new 'stemmed/' dir). print "Reading in documents..." # dict mapping file names to list of "words" (tokens) filenames = os.listdir(dirname) subdirs = os.listdir(dirname) if 'stemmed' in subdirs: titles, docs = self.__read_stemmed_data(dirname) else: titles, docs = self.__read_raw_data(dirname) # Sort document alphabetically by title to ensure we have the proper # document indices when referring to them. ordering = [idx for idx, title in sorted(enumerate(titles), key = lambda xx : xx[1])] self.titles = [] self.docs = [] numdocs = len(docs) for d in range(numdocs): self.titles.append(titles[ordering[d]]) self.docs.append(docs[ordering[d]]) # Get the vocabulary. self.vocab = [xx for xx in self.get_uniq_words()] def compute_tfidf(self): print "Calculating tf-idf..." self.tfidf = {} n = float(len(self.docs)) for word in self.vocab: df = len(self.inv_index[word]) idf = math.log(n/df, 10) for d in range(len(self.docs)): if d not in self.tfidf: self.tfidf[d] = {} tf = len(self.inv_index[word].get(d, [])) if tf == 0: self.tfidf[d][word] = 0.0 else: self.tfidf[d][word] = (1 + math.log(tf, 10)) * idf def get_tfidf(self, word, document): return self.tfidf[document][word] def get_tfidf_unstemmed(self, word, document): """ This function gets the TF-IDF of an *unstemmed* word in a document. Stems the word and then calls get_tfidf. You should *not* need to change this interface, but it is necessary for submission. 
""" word = self.p.stem(word) return self.get_tfidf(word, document) def index(self): """ Build an index of the documents. """ print "Indexing..." inv_index = {} for i in range(len(self.docs)): for j in range(len(self.docs[i])): word = self.docs[i][j] if word not in inv_index: inv_index[word] = {} if i not in inv_index[word]: inv_index[word][i] = [] inv_index[word][i].append(j) self.inv_index = inv_index def get_posting(self, word): """ Given a word, this returns the list of document indices (sorted) in which the word occurs. """ posting = self.inv_index.get(word).keys() return sorted(posting) # ------------------------------------------------------------------ def get_posting_unstemmed(self, word): """ Given a word, this *stems* the word and then calls get_posting on the stemmed word to get its postings list. You should *not* need to change this function. It is needed for submission. """ word = self.p.stem(word) return self.get_posting(word) def boolean_retrieve(self, query): """ Given a query in the form of a list of *stemmed* words, this returns the list of documents in which *all* of those words occur (ie an AND query). Return an empty list if the query does not return any documents. """ docs = range(len(self.docs)) for word in query: docs = list(set(self.get_posting(word)) & set(docs)) return sorted(docs) # sorted doesn't actually matter def rank_retrieve(self, query): """ Given a query (a list of words), return a rank-ordered list of documents (by ID) and score for the query. 
""" scores = [0.0 for xx in range(len(self.docs))] query_vector = {} for word in self.vocab: tf = query.count(word) if tf == 0: query_vector[word] = 0.0 else: query_vector[word] = 1 + math.log(tf, 10) for d in range(len(self.docs)): doc_vector = self.tfidf[d] m1 = 0.0 m2 = 0.0 dp = 0.0 for word in query_vector: m1 += math.pow(query_vector[word], 2) m2 += math.pow(doc_vector[word], 2) dp += query_vector[word] * doc_vector[word] scores[d] = dp / math.sqrt(m2) ranking = [idx for idx, sim in sorted(enumerate(scores), key = lambda xx : xx[1], reverse = True)] results = [] for i in range(10): results.append((ranking[i], scores[ranking[i]])) return results def process_query(self, query_str): """ Given a query string, process it and return the list of lowercase, alphanumeric, stemmed words in the string. """ # make sure everything is lower case query = query_str.lower() # split on whitespace query = query.split() # remove non alphanumeric characters query = [self.alphanum.sub('', xx) for xx in query] # stem words query = [self.p.stem(xx) for xx in query] return query def query_retrieve(self, query_str): """ Given a string, process and then return the list of matching documents found by boolean_retrieve(). """ query = self.process_query(query_str) return self.boolean_retrieve(query) def query_rank(self, query_str): """ Given a string, process and then return the list of the top matching documents, rank-ordered. """ query = self.process_query(query_str) return self.rank_retrieve(query)
def stemAndRemoveFrequence(text): p = PorterStemmer() words = [] for word in text: words.append( p.stem(word, 0, len(word)-1) ) return set(words)
if line['kind'] == 'meeting': course['instructors'] = for course in courses: doc_vect = {} terms = set() doc_vector.append(doc_vect) titles_vector.append(course['code'] + " " + course['title']) title_word_vect = course['title'].split(" ") descp_word_vect = str(course['description']).split(' ') prev = "" for title_word in title_word_vect: title_word = title_word.lower() title_word = p.stem(title_word, 0, len(title_word)-1) if title_word not in doc_vect: doc_vect[title_word] = TITLE else: doc_vect[title_word] += TITLE if title_word not in corp_freq_hash: corp_freq_hash[title_word] = 1 else: corp_freq_hash[title_word] += 1 terms.add(title_word) if prev: bigram = prev+" "+title_word if bigram not in doc_vect: doc_vect[bigram] = TITLE
def stemWords(l): ps = PorterStemmer() return [ps.stem(x, 0, len(x) - 1) for x in l]
def stem_words(l): ps = PorterStemmer() return [ps.stem(x, 0, len(x) - 1) for x in l]
def stemming(self, listOfWords): p = PorterStemmer() stemList = [] for word in listOfWords: stemList.append(p.stem(word, 0, len(word) - 1)) return stemList
def stem(word): p = PorterStemmer() return p.stem(word,0,len(word)-1).encode('utf8')
class IRSystem: def __init__(self): # For holding the data - initialized in read_data() self.titles = [] self.docs = [] self.vocab = [] # For the text pre-processing. self.alphanum = re.compile('[^a-zA-Z0-9]') self.p = PorterStemmer() def get_uniq_words(self): uniq = set() for doc in self.docs: for word in doc: uniq.add(word) return uniq def __read_raw_data(self, dirname): print "Stemming Documents..." titles = [] docs = [] os.mkdir('%s/stemmed' % dirname) title_pattern = re.compile('(.*) \d+\.txt') # make sure we're only getting the files we actually want filenames = [] for filename in os.listdir('%s/raw' % dirname): if filename.endswith(".txt") and not filename.startswith("."): filenames.append(filename) for i, filename in enumerate(filenames): title = title_pattern.search(filename).group(1) print " Doc %d of %d: %s" % (i + 1, len(filenames), title) titles.append(title) contents = [] f = open('%s/raw/%s' % (dirname, filename), 'r') of = open('%s/stemmed/%s.txt' % (dirname, title), 'w') for line in f: # make sure everything is lower case line = line.lower() # split on whitespace line = [xx.strip() for xx in line.split()] # remove non alphanumeric characters line = [self.alphanum.sub('', xx) for xx in line] # remove any words that are now empty line = [xx for xx in line if xx != ''] # stem words line = [self.p.stem(xx) for xx in line] # add to the document's conents contents.extend(line) if len(line) > 0: of.write(" ".join(line)) of.write('\n') f.close() of.close() docs.append(contents) return titles, docs def __read_stemmed_data(self, dirname): print "Already stemmed!" titles = [] docs = [] # make sure we're only getting the files we actually want filenames = [] for filename in os.listdir('%s/stemmed' % dirname): if filename.endswith(".txt") and not filename.startswith("."): filenames.append(filename) if len(filenames) != 60: msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n" msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run." 
raise Exception(msg) for i, filename in enumerate(filenames): title = filename.split('.')[0] titles.append(title) contents = [] f = open('%s/stemmed/%s' % (dirname, filename), 'r') for line in f: # split on whitespace line = [xx.strip() for xx in line.split()] # add to the document's conents contents.extend(line) f.close() docs.append(contents) return titles, docs def read_data(self, dirname): """ Given the location of the 'data' directory, reads in the documents to be indexed. """ # NOTE: We cache stemmed documents for speed # (i.e. write to files in new 'stemmed/' dir). print "Reading in documents..." # dict mapping file names to list of "words" (tokens) filenames = os.listdir(dirname) subdirs = os.listdir(dirname) if 'stemmed' in subdirs: titles, docs = self.__read_stemmed_data(dirname) else: titles, docs = self.__read_raw_data(dirname) # Sort document alphabetically by title to ensure we have the proper # document indices when referring to them. ordering = [ idx for idx, title in sorted(enumerate(titles), key=lambda xx: xx[1]) ] self.titles = [] self.docs = [] numdocs = len(docs) for d in range(numdocs): self.titles.append(titles[ordering[d]]) self.docs.append(docs[ordering[d]]) # Get the vocabulary. self.vocab = [xx for xx in self.get_uniq_words()] def compute_tfidf(self): # ------------------------------------------------------------------- # TODO: Compute and store TF-IDF values for words and documents. # Recall that you can make use of: # * self.vocab: a list of all distinct (stemmed) words # * self.docs: a list of lists, where the i-th document is # self.docs[i] => ['word1', 'word2', ..., 'wordN'] # NOTE that you probably do *not* want to store a value for every # word-document pair, but rather just for those pairs where a # word actually occurs in the document. print "Calculating tf-idf..." 
self.tfidf = {} self.tf = collections.defaultdict( lambda: collections.defaultdict(lambda: 0)) for docid, doc in enumerate(self.docs): for word in doc: self.tf[docid][word] += 1 for word in self.vocab: if word not in self.tfidf: self.tfidf[word] = {} for d in range(len(self.docs)): if self.tf[d][word] > 0: tf = 1 + math.log10(self.tf[d][word]) df = len(self.inv_index[word]) idf = math.log10(len(self.docs) * 1.0 / df) self.tfidf[word][d] = tf * idf # ------------------------------------------------------------------ def get_tfidf(self, word, document): # ------------------------------------------------------------------ # TODO: Return the tf-idf weigthing for the given word (string) and # document index. #tfidf = 0.0 # ------------------------------------------------------------------ if self.tfidf[word].get(document, 0) == 0: tfidf = 0.0 else: tfidf = self.tfidf[word][document] return tfidf def get_tfidf_unstemmed(self, word, document): """ This function gets the TF-IDF of an *unstemmed* word in a document. Stems the word and then calls get_tfidf. You should *not* need to change this interface, but it is necessary for submission. """ word = self.p.stem(word) return self.get_tfidf(word, document) def index(self): """ Build an index of the documents. """ print "Indexing..." # ------------------------------------------------------------------ # TODO: Create an inverted index. # Granted this may not be a linked list as in a proper # implementation. # Some helpful instance variables: # * self.docs = List of documents # * self.titles = List of titles self.inv_index = collections.defaultdict(set) for docid, doc in enumerate(self.docs): for word in doc: self.inv_index[word].add(docid) # ------------------------------------------------------------------ def get_posting(self, word): """ Given a word, this returns the list of document indices (sorted) in which the word occurs. 
""" # ------------------------------------------------------------------ # TODO: return the list of postings for a word. posting = self.inv_index[word] return posting # ------------------------------------------------------------------ def get_posting_unstemmed(self, word): """ Given a word, this *stems* the word and then calls get_posting on the stemmed word to get its postings list. You should *not* need to change this function. It is needed for submission. """ word = self.p.stem(word) return self.get_posting(word) def boolean_retrieve(self, query): """ Given a query in the form of a list of *stemmed* words, this returns the list of documents in which *all* of those words occur (ie an AND query). Return an empty list if the query does not return any documents. """ # ------------------------------------------------------------------ # TODO: Implement Boolean retrieval. You will want to use your # inverted index that you created in index(). # Right now this just returns all the possible documents! docSet = set() for term in query: if len(docSet) == 0: docSet = self.get_posting(term) else: docSet &= self.get_posting(term) # ------------------------------------------------------------------ return sorted(list(docSet)) # sorted doesn't actually matter def rank_retrieve(self, query): """ Given a query (a list of words), return a rank-ordered list of documents (by ID) and score for the query. """ # ------------------------------------------------------------------ # TODO: Implement cosine similarity between a document and a list of # query words. # Right now, this code simply gets the score by taking the Jaccard # similarity between the query and every document. 
""" words_in_query = set() for word in query: words_in_query.add(word) for d, doc in enumerate(self.docs): words_in_doc = set(doc) scores[d] = len(words_in_query.intersection(words_in_doc)) \ / float(len(words_in_query.union(words_in_doc))) """ # ------------------------------------------------------------------ scores = collections.defaultdict(lambda: 0.0) for d in range(len(self.docs)): numerator = 0.0 denominator = 0.0 for term in query: count = 0.0 for q in query: if term == q: count += 1 numerator += (1 + math.log10(count)) * (self.get_tfidf( term, d)) for tt in set(self.docs[d]): denominator += self.get_tfidf(tt, d)**2 denominator = denominator**(1.0 / 2) scores[d] = numerator / denominator temp = [] for idx in range(len(self.docs)): temp.append(scores[idx]) ranking = [ idx for idx, sim in sorted( enumerate(temp), key=lambda xx: xx[1], reverse=True) ] results = [] for i in range(10): results.append((ranking[i], scores[ranking[i]])) return results def process_query(self, query_str): """ Given a query string, process it and return the list of lowercase, alphanumeric, stemmed words in the string. """ # make sure everything is lower case query = query_str.lower() # split on whitespace query = query.split() # remove non alphanumeric characters query = [self.alphanum.sub('', xx) for xx in query] # stem words query = [self.p.stem(xx) for xx in query] return query def query_retrieve(self, query_str): """ Given a string, process and then return the list of matching documents found by boolean_retrieve(). """ query = self.process_query(query_str) return self.boolean_retrieve(query) def query_rank(self, query_str): """ Given a string, process and then return the list of the top matching documents, rank-ordered. """ query = self.process_query(query_str) return self.rank_retrieve(query)
class IRSystem: def __init__(self): # For holding the data - initialized in read_data() self.titles = [] self.docs = [] self.vocab = [] # For the text pre-processing. self.alphanum = re.compile('[^a-zA-Z0-9]') self.p = PorterStemmer() def get_uniq_words(self): uniq = set() for doc in self.docs: for word in doc: uniq.add(word) return uniq def __read_raw_data(self, dirname): print "Stemming Documents..." titles = [] docs = [] os.mkdir('%s/stemmed' % dirname) title_pattern = re.compile('(.*) \d+\.txt') # make sure we're only getting the files we actually want filenames = [] for filename in os.listdir('%s/raw' % dirname): if filename.endswith(".txt") and not filename.startswith("."): filenames.append(filename) for i, filename in enumerate(filenames): title = title_pattern.search(filename).group(1) print " Doc %d of %d: %s" % (i+1, len(filenames), title) titles.append(title) contents = [] f = open('%s/raw/%s' % (dirname, filename), 'r') of = open('%s/stemmed/%s.txt' % (dirname, title), 'w') for line in f: # make sure everything is lower case line = line.lower() # split on whitespace line = [xx.strip() for xx in line.split()] # remove non alphanumeric characters line = [self.alphanum.sub('', xx) for xx in line] # remove any words that are now empty line = [xx for xx in line if xx != ''] # stem words line = [self.p.stem(xx) for xx in line] # add to the document's conents contents.extend(line) if len(line) > 0: of.write(" ".join(line)) of.write('\n') f.close() of.close() docs.append(contents) return titles, docs def __read_stemmed_data(self, dirname): print "Already stemmed!" titles = [] docs = [] # make sure we're only getting the files we actually want filenames = [] for filename in os.listdir('%s/stemmed' % dirname): if filename.endswith(".txt") and not filename.startswith("."): filenames.append(filename) if len(filenames) != 60: msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n" msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run." 
raise Exception(msg) for i, filename in enumerate(filenames): title = filename.split('.')[0] titles.append(title) contents = [] f = open('%s/stemmed/%s' % (dirname, filename), 'r') for line in f: # split on whitespace line = [xx.strip() for xx in line.split()] # add to the document's conents contents.extend(line) f.close() docs.append(contents) return titles, docs def read_data(self, dirname): """ Given the location of the 'data' directory, reads in the documents to be indexed. """ # NOTE: We cache stemmed documents for speed # (i.e. write to files in new 'stemmed/' dir). print "Reading in documents..." # dict mapping file names to list of "words" (tokens) filenames = os.listdir(dirname) subdirs = os.listdir(dirname) if 'stemmed' in subdirs: titles, docs = self.__read_stemmed_data(dirname) else: titles, docs = self.__read_raw_data(dirname) # Sort document alphabetically by title to ensure we have the proper # document indices when referring to them. ordering = [idx for idx, title in sorted(enumerate(titles), key = lambda xx : xx[1])] self.titles = [] self.docs = [] numdocs = len(docs) for d in range(numdocs): self.titles.append(titles[ordering[d]]) self.docs.append(docs[ordering[d]]) # Get the vocabulary. self.vocab = [xx for xx in self.get_uniq_words()] # My additions - sorted documents self.sorted_docs = [sorted(doc) for doc in self.docs] self.reverse_sorted_docs = [doc[:] for doc in self.sorted_docs] map(list.reverse, self.reverse_sorted_docs) def compute_idf(self, word): """ Computes the idf of the given word. Needs the inv_index set up - do this by calling self.index """ N = len(self.docs) df = len(self.inv_index[word]) return math.log(N*1.0/df,10) def compute_tfidf(self): # ------------------------------------------------------------------- # TODO: Compute and store TF-IDF values for words and documents. 
# Recall that you can make use of: # * self.vocab: a list of all distinct (stemmed) words # * self.docs: a list of lists, where the i-th document is # self.docs[i] => ['word1', 'word2', ..., 'wordN'] # NOTE that you probably do *not* want to store a value for every # word-document pair, but rather just for those pairs where a # word actually occurs in the document. print "Calculating tf-idf..." self.tfidf = {} for doc_id, doc in enumerate(self.docs): word_counter = Counter(self.docs[doc_id]) for word in word_counter.keys(): if word not in self.tfidf: self.tfidf[word] = {} self.tfidf[word][doc_id] = 1 + math.log(word_counter[word], 10) for word in self.tfidf: idf = self.compute_idf(word) for doc_id in self.tfidf[word]: self.tfidf[word][doc_id] *= idf def get_tfidf(self, word, document): return self.tfidf[word][document] def get_tfidf_unstemmed(self, word, document): """ This function gets the TF-IDF of an *unstemmed* word in a document. Stems the word and then calls get_tfidf. You should *not* need to change this interface, but it is necessary for submission. """ word = self.p.stem(word) return self.get_tfidf(word, document) def index(self): """ Build an index of the documents. """ print "Indexing..." # ------------------------------------------------------------------ # TODO: Create an inverted index. # Granted this may not be a linked list as in a proper # implementation. # Some helpful instance variables: # * self.docs = List of documents # * self.titles = List of titles inv_index = {} for word in self.vocab: inv_index[word] = [] for doc_id, doc in enumerate(self.docs): for word in sorted(list(set(doc))): inv_index[word].append(doc_id) self.inv_index = inv_index # ------------------------------------------------------------------ def get_posting(self, word): """ Given a word, this returns the list of document indices (sorted) in which the word occurs. 
""" return self.inv_index[word] def get_posting_unstemmed(self, word): """ Given a word, this *stems* the word and then calls get_posting on the stemmed word to get its postings list. You should *not* need to change this function. It is needed for submission. """ word = self.p.stem(word) return self.get_posting(word) def boolean_retrieve(self, query): """ Given a query in the form of a list of *stemmed* words, this returns the list of documents in which *all* of those words occur (ie an AND query). Return an empty list if the query does not return any documents. """ postings = map(set,map(self.get_posting, query)) matching_doc_ids = set(range(len(self.docs))).intersection(*postings) return sorted(matching_doc_ids) def calculate_score_by_jaccard(self, query): scores = [0.0 for xx in range(len(self.docs))] words_in_query = set(query) for d, doc in enumerate(self.docs): words_in_doc = set(doc) scores[d] = len(words_in_query.intersection(words_in_doc)) / float(len(words_in_query.union(words_in_doc))) return scores def normalized_length(self, doc_id): """ Normalizes length of the given document: sums the squared TFIDF score of the SET (meaning unique) of all words in the document and returns the square root of the sum. """ word_wtf_in_doc = [self.tfidf[word][doc_id] for word in set(self.docs[doc_id])] return sum(map(lambda x: x**2, word_wtf_in_doc)) ** 0.5 def calculate_cosine_scores(self, query): scores = [0.0 for xx in range(len(self.docs))] lengths = [self.normalized_length(doc_id) for doc_id in range(len(self.docs))] for term in set(query): query_weighted_term_frequency = 1 + math.log(query.count(term), 10) for doc_id in self.get_posting(term): scores[doc_id] += self.tfidf[term][doc_id] * query_weighted_term_frequency scores = [score/length for score,length in zip(scores,lengths)] return scores def rank_retrieve(self, query): """ Given a query (a list of words), return a rank-ordered list of documents (by ID) and score for the query. 
""" #scores = self.calculate_score_by_jaccard(query) scores = self.calculate_cosine_scores(query) ranking = [idx for idx, sim in sorted(enumerate(scores), key = lambda xx : xx[1], reverse = True)] results = [] for i in range(10): results.append((ranking[i], scores[ranking[i]])) return results def process_query(self, query_str): """ Given a query string, process it and return the list of lowercase, alphanumeric, stemmed words in the string. """ # make sure everything is lower case query = query_str.lower() # split on whitespace query = query.split() # remove non alphanumeric characters query = [self.alphanum.sub('', xx) for xx in query] # stem words query = [self.p.stem(xx) for xx in query] return query def query_retrieve(self, query_str): """ Given a string, process and then return the list of matching documents found by boolean_retrieve(). """ query = self.process_query(query_str) return self.boolean_retrieve(query) def query_rank(self, query_str): """ Given a string, process and then return the list of the top matching documents, rank-ordered. """ query = self.process_query(query_str) return self.rank_retrieve(query)
class Chatbot:
    """Simple class to implement the chatbot for PA 6."""

    def __init__(self, creative=False):
        # The chatbot's default name is `moviebot`. Give your chatbot a new name.
        self.name = 'moviebot'

        self.creative = creative

        # This matrix has the following shape: num_movies x num_users
        # The values stored in each row i and column j is the rating for
        # movie i by user j
        self.titles, ratings = movielens.ratings()
        self.sentiment = movielens.sentiment()
        self.new_sentiment = {}
        self.p = PorterStemmer()

        # create a new sentiment dict with stemmed keys
        for key in self.sentiment:
            new_key = self.p.stem(key)
            self.new_sentiment[new_key] = self.sentiment[key]

        self.bin_ratings = self.binarize(ratings)

        # a tuple with the sentiment of the movie being discussed
        self.current_sentiment = None
        # the movie title entered by the user
        self.current_title = None
        # a list of current movie candidates
        self.current_idxs = []

        self.prev_movie = None
        self.prev_sentiment = None

        # a dict where dict[i] = j is the user's sentiment j for movie index i
        # for movies that the user has described and the chatbot has processed
        self.user_movies = {}
        # a set of movie indexes that the user has already described
        self.user_movie_set = set()

        self.prefix_match_found = False
        self.disambig = False

        # if chatbot is in recommend mode, only respond to yes or no
        self.recommend_mode = False
        # a list of recommendations for the user
        self.recommendations = []
        self.recommend_idx = 0

        # preprocess movie list by extracting possible titles and year
        self.movies = []
        for entry in self.titles:
            self.movies.append(extract_titles_and_year(entry[0]))

        #############################################################################
        # TODO: Binarize the movie ratings matrix.                                  #
        #############################################################################

        # Binarize the movie ratings before storing the binarized matrix.
        # NOTE(review): the *raw* ratings are stored here; the binarized copy
        # lives in self.bin_ratings (set above) — confirm this is intended.
        self.ratings = ratings
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################

    #############################################################################
    # 1. WARM UP REPL                                                           #
    #############################################################################

    def greeting(self):
        """Return a message that the chatbot uses to greet the user."""
        #############################################################################
        # TODO: Write a short greeting message                                      #
        #############################################################################

        greeting_message = "Hi there! I'm Movie Chatbot. How can I help you?"

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return greeting_message

    def goodbye(self):
        """Return a message that the chatbot uses to bid farewell to the user."""
        #############################################################################
        # TODO: Write a short farewell message                                      #
        #############################################################################

        goodbye_message = "Have a nice day! It was fun talking to you!"

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return goodbye_message

    ###############################################################################
    # 2. Modules 2 and 3: extraction and transformation                           #
    ###############################################################################

    def process(self, line):
        """Process a line of input from the REPL and generate a response.

        This is the method that is called by the REPL loop directly with user
        input.

        You should delegate most of the work of processing the user's input to
        the helper functions you write later in this class.

        Takes the input string from the REPL and call delegated functions that
          1) extract the relevant information, and
          2) transform the information into a response to the user.

        Example:
          resp = chatbot.process('I loved "The Notebok" so much!!')
          print(resp) // prints 'So you loved "The Notebook", huh?'

        :param line: a user-supplied line of text
        :returns: a string containing the chatbot's response to the user input
        """
        #############################################################################
        # TODO: Implement the extraction and transformation in this method,        #
        # possibly calling other functions. Although modular code is not graded,   #
        # it is highly recommended.                                                #
        #############################################################################
        response = ''
        # Short-circuit responses: profanity and all-caps "anger" checks.
        swear_response = self.checkSwearWords(line)
        if swear_response:
            return swear_response
        caps_lock_response = self.checkAnger(line)
        if caps_lock_response:
            return caps_lock_response
        # While recommending, only yes/no answers advance the conversation.
        if self.recommend_mode:
            if re.match('yes', line.strip(), re.I):
                return self.give_recommendation()
            elif re.match('no', line.strip(), re.I):
                return "Okay, I guess I've given you enough recommendations!"
            else:
                return "Let's talk about that later. Do you want another recommendation?"
        clarification = False
        if self.creative:
            # deal with "Can you...?", "What is...?", etc. questions
            response_to_question = self.matches_question(line)
            if response_to_question:
                return response_to_question
            elif self.disambig:
                # A previous turn produced multiple candidates; treat this
                # input as a clarification to narrow them down.
                self.current_idxs = self.disambiguate(line, self.current_idxs)
                #print(self.current_idxs)
                if len(self.current_idxs) == 1:
                    self.current_title = self.titles[self.current_idxs[0]][0]
                    self.disambig = False
                    clarification = True
                else:
                    response = "Sorry, can you be a little more specific? I still found the following movies:\n"
                    for i in self.current_idxs:
                        response += "{}\n".format(self.titles[i][0])
                    return response
        # extract titles and matches
        extracted_title_from_current_line = False
        if not self.current_title:
            matches = self.get_possible_matching_titles(line)
            extracted_title_from_current_line = True
            #print('Extracted title')
        else:
            matches = [(self.current_title, self.current_idxs)]
            #print('Current title:{}'.format(self.current_title))
        # extract sentiment
        extracted_sentiment_from_current_line = False
        if not self.current_sentiment:
            # remove title from line for sentiment extraction
            if matches:
                line = line.replace(matches[0][0], '')
            sentiment = self.extract_sentiment(line)
            extracted_sentiment_from_current_line = True
            self.current_sentiment = sentiment
            #print('Extracted sentiment')
        else:
            sentiment = self.current_sentiment
            #print('Current sentiment:{}'.format(self.current_sentiment))
        if self.creative:
            # Sentiment without a (new) title and without an anaphoric
            # reference is treated as off-topic input.
            if not extracted_title_from_current_line and \
                extracted_sentiment_from_current_line:
                if not clarification and not contains_anaphoric_expression(
                        line):
                    #print('no anaphoric expression')
                    return self.generate_response_to_irrelevant_input()
        if self.creative:
            if len(matches) == 0 and not self.current_title:
                return self.generate_response_to_irrelevant_input()
            elif len(matches) > 1:
                return 'Please tell me about one movie at a time.'
            elif len(matches) == 1:
                title, idxs = matches[0]
                self.current_idxs = idxs
                self.current_title = title
                if len(idxs) == 0:
                    self.clear_current_movie()
                    return "Hmm, I couldn't find a match for \"{}\". Please tell me about some other movies you have watched!".format(
                        title)
                elif len(idxs) == 1:
                    if idxs[0] in self.user_movie_set:
                        response = "(I think you already told me about that movie, but I'll update what you tell me!)\n"
                    if sentiment == 0:
                        return response + "I'm a little confused. What did you think about \"{}\"?".format(
                            self.titles[idxs[0]][0])
                    if sentiment == 1:
                        response += "Great, so you liked \"{}\".".format(
                            self.titles[idxs[0]][0])
                    elif sentiment == 2:
                        response += "Wow, you really loved \"{}\"!".format(
                            self.titles[idxs[0]][0])
                    elif sentiment == -1:
                        response += "Okay, you didn't like \"{}\".".format(
                            self.titles[idxs[0]][0])
                    elif sentiment == -2:
                        response += "It seems like you hated \"{}\" with a passion! That's too bad.".format(
                            self.titles[idxs[0]][0])
                    self.process_movie(idxs[0], sentiment)
                else:
                    response = "I found multiple movies. Which one are you talking about?\n"
                    for i in idxs:
                        response += '{}\n'.format(self.titles[i][0])
                    self.disambig = True
                    return response
        else:
            # Starter (non-creative) mode: simpler matching, no disambiguation.
            if len(matches) == 0:
                return self.generate_response_to_irrelevant_input()
            elif len(matches) > 1:
                return 'Please tell me about one movie at a time.'
            else:
                title, idxs = matches[0]
                sentiment = self.extract_sentiment(line.replace(title, ''))
                if sentiment == 0:
                    return "So did you like \"{}\" or hate it? Please tell me.".format(
                        self.titles[idxs[0]][0])
                else:
                    if len(idxs) > 1:
                        return "I found multiple matches for \"{}\". Can you be more specific? Maybe try telling me the year as well.".format(
                            title)
                    elif len(idxs) == 0:
                        return "Hmm, I couldn't find a match for \"{}\". Please tell me about some other movies you have watched!".format(
                            title)
                    else:
                        if sentiment > 0:
                            if idxs[0] in self.user_movie_set:
                                response = "(I think you already told me about that movie, but I'll update what you tell me!)\n"
                            else:
                                response = "Great! So you liked \"{}\". ".format(
                                    self.titles[idxs[0]][0])
                            self.process_movie(idxs[0], sentiment)
                        elif sentiment < 0:
                            if idxs[0] in self.user_movie_set:
                                response = "I think you already told me about that movie."
                            else:
                                response = "Okay, so you didn't like \"{}\". ".format(
                                    self.titles[idxs[0]][0])
                            self.process_movie(idxs[0], sentiment)
                        else:
                            return "I'm not sure if you liked or didn't the movie. Can you tell me a movie and what you thought about it?"
        # recommend once we have 5 movies
        if len(self.user_movies) >= 5:
            self.recommend_mode = True
            user_ratings = np.zeros(len(self.titles))
            for m in self.user_movies:
                user_ratings[m] = self.user_movies[m]
            self.recommendations = self.recommend(user_ratings,
                                                  self.bin_ratings,
                                                  k=10,
                                                  creative=self.creative)
            self.recommend_idx = 0
            return self.give_recommendation()
        else:
            response += " " + self.generate_request_for_more_movies()
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return response

    def give_recommendation(self):
        # Returns the next recommendation message, or a fallback once the
        # k pre-computed recommendations are exhausted.
        recommend_sentences = [
            "Why don't you check out \"{}\"? ",
            "I think you might enjoy \"{}\"! ",
            "\"{}\" might suit your tastes! "
        ]
        if self.recommend_idx < len(self.recommendations):
            response = ''
            if self.recommend_idx == 0:
                response += "Okay, based on what you told me, I think you would like \"{}\"! ".format(
                    self.titles[self.recommendations[self.recommend_idx]][0])
            else:
                response += random.choice(recommend_sentences).format(
                    self.titles[self.recommendations[self.recommend_idx]][0])
            response += 'Would you like another recommendation?'
            self.recommend_idx += 1
        else:
            response = "Sorry, I don't have any more recommendations!"
        return response

    def matches_question(self, text):
        '''
        Returns response to question
        '''
        question_responses = [
            "I don't know. Ask Google.", "I'd like to know as well.",
            "Let me think about that. I'll get back to you in a billion years."
        ]
        match = re.findall('(.*)\?', text, re.I)
        if match:
            return self.flip_question(text) + ' ' + random.choice(
                question_responses)
        else:
            return None

    def flip_question(self, text):
        '''
        Flips the perspective of the question
        '''
        # first/second-person pronoun swaps
        table = {
            'I': 'you',
            'me': 'you',
            'my': 'your',
            'your': 'my',
            'myself': 'yourself',
            'yourself': 'myself'
        }
        # some common prepositions
        prep_set = {
            'of', 'with', 'at', 'from', 'including', 'until', 'against',
            'among', 'towards', 'upon', 'to'
        }
        words = re.split('\s|\?', text)
        words.pop()  # remove empty string at end
        last_word = None
        for i in range(len(words)):
            if words[i] in table:
                words[i] = table[words[i]]
            elif words[i] == 'you' or words[i] == 'You':
                # 'you' after a preposition becomes object-case 'me'
                if last_word in prep_set:
                    words[i] = 'me'
                else:
                    words[i] = 'I'
            last_word = words[i]
        return ' '.join(words) + '?'

    def generate_response_to_irrelevant_input(self):
        # Random canned redirect back to movie talk.
        responses = [
            "I'm sorry, but I want to hear about a movie you liked.",
            "That's really cool and all, but can we go back to talking about movies? I want to know more about movies you enjoyed!",
            "Maybe we can talk about that later. Let's get back to talking about movies. Why don't you tell me what you thought about a movie you watched recently?"
        ]
        return random.choice(responses)

    def generate_request_for_more_movies(self):
        # Random canned prompt for another movie description.
        responses = [
            "Please tell me about more movies you've watched!",
            "Tell me another one of your favorite movies. This is so much fun!",
            "What is another movie you liked?"
        ]
        return random.choice(responses)

    def get_possible_matching_titles(self, line):
        # Returns a list of (title, movie_indices) pairs extracted from line.
        possible_titles = self.extract_titles(line)
        matches = []
        if self.creative:
            self.prefix_match_found = False
            for title in possible_titles:
                movie_idxs = self.find_movies_by_title(title)
                #print(movie_idxs)
                # fall back to edit-distance matching only when no exact or
                # prefix match was found
                if not self.prefix_match_found:
                    movie_idxs.extend(
                        self.find_movies_closest_to_title(title,
                                                          max_distance=3))
                    #print(movie_idxs)
                movie_idxs = sorted(list(set(movie_idxs)))
                matches.append((title, movie_idxs))
        else:
            for title in possible_titles:
                matches.append((title, self.find_movies_by_title(title)))
        return matches

    def process_movie(self, movie_index, sentiment):
        # Record the user's sentiment for a resolved movie and reset the
        # "movie currently under discussion" state.
        self.user_movies[movie_index] = sentiment
        self.user_movie_set.add(movie_index)
        # NOTE(review): __init__ initializes self.prev_movie, but this sets
        # self.prev_idx — presumably one of the two names is stale; confirm.
        self.prev_idx = movie_index
        self.prev_sentiment = self.current_sentiment
        self.clear_current_movie()

    def clear_current_movie(self):
        # NOTE(review): current_idxs is reset to None here but initialized to
        # [] in __init__ — callers appear to test it only for truthiness.
        self.current_sentiment = None
        self.current_title = None
        self.current_idxs = None

    def extract_titles(self, text):
        """Extract potential movie titles from a line of text.

        Given an input text, this method should return a list of movie titles
        that are potentially in the text.

        - If there are no movie titles in the text, return an empty list.
        - If there is exactly one movie title in the text, return a list
        containing just that one movie title.
        - If there are multiple movie titles in the text, return a list
        of all movie titles you've extracted from the text.

        Example:
          potential_titles = chatbot.extract_titles('I liked "The Notebook" a lot.')
          print(potential_titles) // prints ["The Notebook"]

        :param text: a user-supplied line of text that may contain movie titles
        :returns: list of movie titles that are potentially in the text
        """
        potential_titles = []
        if self.creative:
            # pat1: quoted titles; pat2: heuristic for capitalized title-case
            # runs (allowing common lowercase stop words) with optional year.
            pat1 = '"(.*?)"'
            stop_words = 'at|as|of|on|to|with|and|the|in|from|&|\+|by|or|de|vs\.'
            pat2 = '((?:[A-HJ-Z0-9]\S*(?:\s+(?:[A-Z0-9\.\-\(]\S*|' + stop_words + ')?|$)|I [A-Z0-9])(?:.*[A-HJ-Z0-9]\S*|.*[A-Z]\S+)?\s*(?:\(\d{4}\))?)'
            potential_titles = re.findall(pat1, text)
            potential_titles.extend(re.findall(pat2, text))
            potential_titles = list(set(potential_titles))
        else:
            potential_titles = re.findall('"(.*?)"', text)
        return potential_titles

    def find_movies_by_title(self, title):
        """ Given a movie title, return a list of indices of matching movies.

        - If no movies are found that match the given title, return an empty
        list.
        - If multiple movies are found that match the given title, return a list
        containing all of the indices of these matching movies.
        - If exactly one movie is found that matches the given title, return a
        list
        that contains the index of that matching movie.

        Example:
          ids = chatbot.find_movies_by_title('Titanic')
          print(ids) // prints [1359, 1953]

        :param title: a string containing a movie title
        :returns: a list of indices of matching movies
        """
        candidates = []
        if self.creative:
            movie = extract_titles_and_year(title)
            for i in range(len(self.movies)):
                match_found = False
                for dbt in self.movies[i].titles:
                    for qt in movie.titles:
                        # if database title starts with query title
                        if bool(re.match(qt + '($|\W)', dbt, re.I)):
                            match_found = True
                            break
                    if match_found:
                        break
                if match_found:
                    # if no year included in query, add all movies that match
                    if not movie.year:
                        candidates.append(i)
                        self.prefix_match_found = True
                    # if year included in query, add only movies that match both
                    # title AND year
                    if movie.year and movie.year == self.movies[i].year:
                        candidates.append(i)
                        self.prefix_match_found = True
        else:
            movie = extract_titles_and_year(title)
            for i in range(len(self.movies)):
                if set(movie.titles).intersection(set(self.movies[i].titles)):
                    if not movie.year:
                        candidates.append(i)
                    elif movie.year and movie.year == self.movies[i].year:
                        candidates.append(i)
            return candidates
        return candidates

    # NOTE(review): this nested Example class appears mid-class in the source;
    # it is likely meant to live at module level — confirm original placement.
    class Example:
        """Represents a document with a label. klass is 'pos' or 'neg' by
        convention.
        words is a list of strings.
        """

        def __init__(self):
            self.klass = ''
            self.words = []

    def extract_sentiment(self, text):
        """Extract a sentiment rating from a line of text.

        You should return -1 if the sentiment of the text is negative, 0 if the
        sentiment of the text is neutral (no sentiment detected), or +1 if the
        sentiment of the text is positive.

        As an optional creative extension, return -2 if the sentiment of the
        text is super negative and +2 if the sentiment of the text is super
        positive.

        Example:
          sentiment = chatbot.extract_sentiment('I liked "The Titanic"')
          print(sentiment) // prints 1

        :param text: a user-supplied line of text
        :returns: a numerical value for the sentiment of the text
        """
        #process train data
        negationSet = {"n't", "never", "not", "no"}
        strongerSet = {
            "really", "very", "love", "hate", "terrible", "truly", "despise",
            "great", "fantastic", "amazing", "extremely", "horrible",
            "disgusting", "stunning", "adore"
        }
        punct = "\W+"
        # stem both word lists so they align with the stemmed sentiment keys
        newSet = set()
        for word in negationSet:
            newSet.add(self.p.stem(word))
        negationSet = newSet
        newSet = set()
        for word in strongerSet:
            newSet.add(self.p.stem(word))
        strongerSet = newSet
        textWords = nltk.word_tokenize(text)
        opp = False  # True while inside a negation scope
        pos_num = 0
        neg_num = 0
        strength_val = 1  # 2 after an intensifier word, until punctuation
        num_sentiment_words = 0
        for word in textWords:
            word = self.p.stem(word)
            if word in negationSet:
                opp = True
                continue
            # punctuation ends both negation and intensifier scope
            if re.match(punct, word):
                opp = False
                strength_val = 1
                continue
            if word in strongerSet:
                strength_val = 2
            if word in self.new_sentiment:
                if self.new_sentiment[word] == 'pos' and not opp:
                    pos_num += strength_val
                elif self.new_sentiment[word] == 'pos' and opp:
                    neg_num += strength_val
                elif self.new_sentiment[word] == 'neg' and not opp:
                    neg_num += strength_val
                else:
                    pos_num += strength_val
                num_sentiment_words += 1
        # bucket the average signed sentiment into {-2, -1, 0, 1, 2}
        thresh = 0.25
        if num_sentiment_words == 0:
            sentiment = 0
        else:
            avg = (pos_num - neg_num) / float(num_sentiment_words)
            if avg > 1:
                sentiment = 2
            elif thresh < avg <= 1:
                sentiment = 1
            elif -thresh <= avg <= thresh:
                sentiment = 0
            elif -1 <= avg < -thresh:
                sentiment = -1
            else:
                sentiment = -2
        # starter mode only reports -1/0/+1
        if not self.creative:
            if sentiment > 1:
                sentiment = 1
            elif sentiment < -1:
                sentiment = -1
        return sentiment

    def extract_sentiment_for_movies(self, text):
        """Creative Feature: Extracts the sentiments from a line of text
        that may contain multiple movies. Note that the sentiments toward
        the movies may be different.

        You should use the same sentiment values as extract_sentiment, described

        above.
        Hint: feel free to call previously defined functions to implement this.

        Example:
          sentiments = chatbot.extract_sentiment_for_text(
                           'I liked both "Titanic (1997)" and "Ex Machina".')
          print(sentiments) // prints [("Titanic (1997)", 1), ("Ex Machina", 1)]

        :param text: a user-supplied line of text
        :returns: a list of tuples, where the first item in the tuple is a movie
        title, and the second is the sentiment in the text toward that movie
        """
        pass

    def find_movies_closest_to_title(self, title, max_distance=3):
        """Creative Feature: Given a potentially misspelled movie title,
        return a list of the movies in the dataset whose titles have the least
        edit distance from the provided title, and with edit distance at most
        max_distance.

        - If no movies have titles within max_distance of the provided title,
        return an empty list.
        - Otherwise, if there's a movie closer in edit distance to the given
        title than all other movies, return a 1-element list containing its
        index.
        - If there is a tie for closest movie, return a list with the indices
        of all movies tying for minimum edit distance to the given movie.

        Example:
          chatbot.find_movies_closest_to_title("Sleeping Beaty") # should return [1656]

        :param title: a potentially misspelled title
        :param max_distance: the maximum edit distance to search for
        :returns: a list of movie indices with titles closest to the given title
        and within edit distance max_distance
        """
        candidates = []
        movie = extract_titles_and_year(title)
        for i in range(len(self.movies)):
            match_found = False
            for dbt in self.movies[i].titles:
                for qt in movie.titles:
                    dist = edit_distance(qt, dbt)
                    if dist <= max_distance:
                        match_found = True
                        # if distance is smaller than all previous, discard previous
                        if dist < max_distance:
                            candidates = []
                            # tighten the search bound to the new minimum
                            max_distance = dist
                        break
                if match_found:
                    break
            if match_found:
                if not movie.year:
                    candidates.append(i)
                if movie.year and movie.year == self.movies[i].year:
                    candidates.append(i)
        return candidates
        # NOTE(review): unreachable duplicate return preserved from source.
        return candidates

    def disambiguate(self, clarification, candidates):
        """Creative Feature: Given a list of movies that the user could be
        talking about (represented as indices), and a string given by the user
        as clarification (eg. in response to your bot saying "Which movie did
        you mean: Titanic (1953) or Titanic (1997)?"), use the clarification to
        narrow down the list and return a smaller list of candidates (hopefully
        just 1!)

        - If the clarification uniquely identifies one of the movies, this
        should return a 1-element list with the index of that movie.
        - If it's unclear which movie the user means by the clarification, it
        should return a list with the indices it could be referring to (to
        continue the disambiguation dialogue).

        Example:
          chatbot.disambiguate("1997", [1359, 2716]) should return [1359]

        :param clarification: user input intended to disambiguate between the
        given movies
        :param candidates: a list of movie indices
        :returns: a list of indices corresponding to the movies identified by
        the clarification
        """
        # First pass: treat the clarification as a literal word/year to find
        # inside each candidate title.
        filtered_idxs = []
        for idx in candidates:
            if bool(
                    re.search('(\W|^)' + clarification + '(\W|$)',
                              self.titles[idx][0], re.I)):
                filtered_idxs.append(idx)
        # try looking for phrases like 'first one' or '2nd movie'
        if not filtered_idxs:
            if bool(re.search('(\W|^)(first|1st)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 1:
                filtered_idxs = [candidates[0]]
            elif bool(re.search('(\W|^)(second|2nd)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 2:
                filtered_idxs = [candidates[1]]
            elif bool(re.search('(\W|^)(third|3rd)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 3:
                filtered_idxs = [candidates[2]]
            elif bool(re.search('(\W|^)(fourth|4th)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 4:
                filtered_idxs = [candidates[3]]
            elif bool(re.search('(\W|^)(fifth|5th)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 5:
                filtered_idxs = [candidates[4]]
            elif bool(re.search('(\W|^)(sixth|6th)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 6:
                filtered_idxs = [candidates[5]]
            elif bool(re.search('(\W|^)(seventh|7th)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 7:
                filtered_idxs = [candidates[6]]
            elif bool(re.search('(\W|^)(eighth|8th)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 8:
                filtered_idxs = [candidates[7]]
            elif bool(re.search('(\W|^)(ninth|9th)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 9:
                filtered_idxs = [candidates[8]]
            elif bool(re.search('(\W|^)(tenth|10th)(\W|$)', clarification, re.I)) and \
                len(candidates) >= 10:
                filtered_idxs = [candidates[9]]
        # No narrowing achieved: keep the original candidate list.
        if not filtered_idxs:
            return candidates
        else:
            return filtered_idxs

    #############################################################################
    # 3. Movie Recommendation helper functions                                  #
    #############################################################################

    def binarize(self, ratings, threshold=2.5, creative=False):
        """Return a binarized version of the given matrix.

        To binarize a matrix, replace all entries above the threshold with 1.
        and replace all entries at or below the threshold with a -1.

        Entries whose values are 0 represent null values and should remain at 0.

        :param x: a (num_movies x num_users) matrix of user ratings, from 0.5 to
        5.0
        :param threshold: Numerical rating above which ratings are considered
        positive
        :returns: a binarized version of the movie-rating matrix
        """
        #############################################################################
        # TODO: Binarize the supplied ratings matrix.                              #
        #############################################################################

        # The starter code returns a new matrix shaped like ratings but full of
        # zeros.
        if creative:
            # creative mode uses a 5-level scale: {-2, -1, 0, 1, 2}
            high_thresh = 4
            low_thresh = 5 - high_thresh
            binarized_ratings = np.where(
                ratings >= high_thresh, 2.0, 0.0) + np.where(
                    (ratings > threshold) & (ratings < high_thresh), 1.0,
                    0.0) + np.where(
                        (ratings <= threshold) & (ratings > low_thresh), -1.0,
                        0.0) + np.where(
                            (ratings != 0.0) & (ratings <= low_thresh), -2.0,
                            0.0)
        else:
            binarized_ratings = np.where(
                ratings > threshold, 1.0, 0.0) + np.where(
                    (ratings != 0.0) & (ratings <= threshold), -1.0, 0.0)

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return binarized_ratings

    def similarity(self, u, v):
        """Calculate the cosine similarity between two vectors.

        You may assume that the two arguments have the same shape.

        :param u: one vector, as a 1D numpy array
        :param v: another vector, as a 1D numpy array

        :returns: the cosine similarity between the two vectors
        """
        #############################################################################
        # TODO: Compute cosine similarity between the two vectors.
        #############################################################################
        u_norm = np.linalg.norm(u)
        v_norm = np.linalg.norm(v)
        dot_prod = np.dot(u, v)
        similarity = dot_prod
        # zero-length vectors have no direction: define similarity as 0
        if u_norm == 0.0 or v_norm == 0.0:
            return 0.0
        else:
            similarity = float(dot_prod) / (u_norm * v_norm)
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return similarity

    def recommend(self, user_ratings, ratings_matrix, k=10, creative=False):
        """Generate a list of indices of movies to recommend using collaborative
        filtering.

        You should return a collection of `k` indices of movies recommendations.

        As a precondition, user_ratings and ratings_matrix are both binarized.

        Remember to exclude movies the user has already rated!

        :param user_ratings: a binarized 1D numpy array of the user's movie
        ratings
        :param ratings_matrix: a binarized 2D numpy matrix of all ratings, where
          `ratings_matrix[i, j]` is the rating for movie i by user j
        :param k: the number of recommendations to generate
        :param creative: whether the chatbot is in creative mode

        :returns: a list of k movie indices corresponding to movies in
        ratings_matrix, in descending order of recommendation
        """
        #######################################################################################
        # TODO: Implement a recommendation function that takes a vector user_ratings          #
        # and matrix ratings_matrix and outputs a list of movies recommended by the chatbot.  #
        #                                                                                     #
        # For starter mode, you should use item-item collaborative filtering                  #
        # with cosine similarity, no mean-centering, and no normalization of scores.          #
        #######################################################################################

        # Populate this list with k movie indices to recommend to the user.
        unseen_movies = np.where(user_ratings == 0)[0]
        seen_movies = np.where(user_ratings != 0)[0]
        ratings_unseen = []
        # Item-item CF: estimate each unseen movie's rating as the
        # similarity-weighted sum of the user's ratings for seen movies.
        for i in unseen_movies:
            unseen_ratings = ratings_matrix[i, :]
            weights = []
            ratings = []
            for j in seen_movies:
                seen_ratings = ratings_matrix[j, :]
                weight = self.similarity(unseen_ratings, seen_ratings)
                weights.append(weight)
                ratings.append(user_ratings[j])
            estimated_rating = float(np.dot(weights, ratings))
            ratings_unseen.append([i, estimated_rating])
        ratings_unseen.sort(key=lambda x: x[1], reverse=True)
        recommendations = []
        for i in range(k):
            recommendations.append(ratings_unseen[i][0])

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return recommendations

    def checkAnger(self, string):
        # Returns a canned response when every word is upper-case, else ''.
        response = ''
        words = string.split()
        upperCase = all([(word.isupper()) for word in words])
        upperCaseResponses = [
            "Any reason you are yelling at me?!",
            "Is your caps lock key stuck or something?",
            "It looks like you were busy capslocking >_>."
        ]
        if upperCase:
            return random.choice(upperCaseResponses)
        else:
            return ''

    def checkSwearWords(self, string):
        # Returns a scolding when the input contains profanity, else ''.
        swearSet = {
            "f**k", "f*****g", "shit", "damn", "bitch", "crap", "piss", "dick",
            "c**k", "pussy", "asshole", "f*g", "bastard", "s**t", "douche",
            "bollocks", "arsehole", "bloody"
        }
        words = set(string.lower().split())
        if words & swearSet:
            return 'Wash your mouth with soap!'
        else:
            return ''

    #############################################################################
    # 4. Debug info                                                             #
    #############################################################################

    def debug(self, line):
        """Return debug information as a string for the line string from the REPL"""
        # Pass the debug information that you may think is important for your
        # evaluators
        debug_info = 'debug info'
        return debug_info

    #############################################################################
    # 5. Write a description for your chatbot here!                             #
    #############################################################################

    def intro(self):
        """Return a string to use as your chatbot's description for the user.

        Consider adding to this description any information about what your
        chatbot can do and how the user can interact with it.
        """
        # NOTE(review): the returned triple-quoted string is truncated at the
        # edge of this chunk; its body and closing quotes lie outside view.
        return """
class Chatbot:
    """Movie-recommendation chatbot for PA 6.

    Extracts movie titles and sentiment from user input, accumulates a
    user rating vector, and once five movies have been rated produces
    collaborative-filtering recommendations.
    """

    def __init__(self, is_turbo=False):
        self.name = 'moviebot'
        self.is_turbo = is_turbo
        # Project-provided stemmer; assumed to stem one word per call with a
        # single argument — TODO confirm against the PorterStemmer wrapper.
        self.p = PorterStemmer()
        self.read_data()
        self.binarize()
        # Each template's %s is replaced with a recommended title.
        self.RecommendationStrings = [
            "I think you should check out %s! ",
            "This movie will blow your mind: %s. ",
            "Watch %s. It will ruin all other movies for you. "
        ]
        self.ratedMovieList = {}  # lower-cased title -> -1/0/1 sentiment
        self.userRatingVector = np.zeros(len(self.titles))
        self.recommendedMovies = []  # titles already suggested to the user
        # Multi-turn conversation state.
        self.inTheMiddleOfSentimentAnalysis = False
        self.currentMovieForMoreInformation = ""
        self.TwoMoviesBoolean = False
        self.currentConjunction = ""
        self.sentimentOfPreviousMovie = 0
        self.check = {}  # memo table for minimumEditDistance
        self.distanceThreshold = 10  # max edit distance for spell-checking
        self.confirm = False  # True while waiting for a yes/no confirmation
        self.previousInput = ""

    def greeting(self):
        """Return a random chatbot greeting message."""
        HelloStrings = [
            "How can I help you?",
            "Hey there! It's so nice to meet you. I'd love to hear what you thought of a few movies!",
            "What's up? Tell me about some movies you've seen!"
        ]
        return random.choice(HelloStrings)

    def goodbye(self):
        """Return a random chatbot goodbye message."""
        GoodbyeStrings = [
            "Have a nice day!",
            "I'm going to miss you.",
            "Am gonna be in my room crying until I see you again."
        ]
        return random.choice(GoodbyeStrings)

    def process(self, input):
        """Take the raw input string from the REPL, update conversation
        state, and return the bot's reply.

        Handled in order: empty input, recommendation mode, pending
        follow-up sentiment questions, pending yes/no spelling
        confirmations, two-movie sentences, quoted single titles,
        unquoted capitalized titles, and (turbo mode) arbitrary input.
        """
        self.TwoMoviesBoolean = False
        self.sentimentOfPreviousMovie = 0
        WrongFormatStrings = [
            "I'm sorry, is that the right format? Please make sure to include the name of the movie in quotation marks.",
            "Whoaaa, can you please make sure you use quotation marks?",
            "Quotation marks around the movie, buddy. Please and thank you."
        ]
        UnknownMovieStrings = [
            "I'm sorry, I've never heard about that movie! Please tell me about another one.",
            "Is that some random indie film? Never heard of it!",
            "Man, I really need to get back to the cinema. Never heard of that movie..."
        ]
        SameMovieStrings = [
            "Hey! You already told me about that movie. Tell me about a different one now.",
            "Come on man, pick a NEW movie!",
            "Have you only watched 1 movie in your entire life? Pick a new one, please"
        ]
        ConfirmationStrings = [
            "I think you were talking about %s. Am I right?",
            "It probably wouldn't hurt for you to brush up on your spelling a bit. Did you mean %s?",
            "C'mon now, you can spell better than that! Were you talking about %s?"
        ]
        ConfusedStrings = [
            "Hmmmm. Didn't quite get that one. Let's try again. Tell me about another movie!",
            "Well this is going nowhere fast. From the top, lets try a new one!",
            "Trying to keep me on my toes I see. How about we get back to some recommendations. Tell me about a movie you've seen!"
        ]
        WhatIsStrings = [
            "To be honest, I'm not sure I'd like to talk about %s. How about we get back to movies?",
            "As much as I'd love to talk about %s, I'm really here for the movies. Give me another one!",
            "Would you rather chat about %s or get some movie recommendations? That what I thought. Hit me with a movie!"
        ]
        CanYouStrings = [
            "I'm not big on talking about me. Lets focus on the movies.",
            "Can you?!? Back to the movies please.",
            "I really appreciate how you've taken an interest in learning about me but all I really want to talk about is what you think about movies. How about one more?"
        ]
        ArbitraryStrings = [
            "Ok, got it.",
            "Interesting. But not as interesting as movies. Let's get back to movie recommendations!",
            "Wow you have such a broad range of interesting topics for discussion. I'd really like to stick to movies though.",
            "Hmmmm very interesting. How about you let me know what you thought of another movie?"
        ]
        if not input:
            return "It seems you meant to say something but forgot"
        # Recommendation mode: once a first recommendation has been made,
        # every further input just produces another one.
        if len(self.recommendedMovies) > 0:
            movieRec = self.recommend(self.userRatingVector).title()
            return random.choice(
                self.RecommendationStrings
            ) % movieRec + "Enter any key to hear another recommendation. (Or enter :quit if you're done.)"
        # We previously asked "tell me more" about a movie: this input is
        # the elaboration.
        if self.inTheMiddleOfSentimentAnalysis:
            self.inTheMiddleOfSentimentAnalysis = False
            return self.addRating(self.currentMovieForMoreInformation, input)
        # We previously asked "did you mean X?": expect a yes-like answer.
        if self.confirm:
            self.confirm = False
            # NOTE(review): re.match accepts prefixes, so e.g. "yesterday"
            # also counts as a yes — confirm this is intended.
            match = re.match("yep|yea|yes|y *$|Yep|Yea|Yes|Y *$", input)
            if match is None:
                return random.choice(ConfusedStrings)
            if self.currentMovieForMoreInformation in self.ratedMovieList:
                return random.choice(SameMovieStrings)
            return self.addRating(self.currentMovieForMoreInformation,
                                  self.previousInput)
        # Two quoted titles joined by a conjunction. Groups: 1 pre-text,
        # 2 article, 3 title, 4 year, 5 mid-text, 6 conjunction, 7 text,
        # 8 article, 9 title, 10 year, 11 post-text.
        matchDouble = re.match(
            '(.*)\"(The|A|An|El|La)? *([\w ]*)( \(.*\)*)*\"(.*) (and|or|but|yet|neither|either|so)\,* (.*)\"(The|A|An|El|La)? *([\w ]*)( \(.*\)*)*\"(.*)',
            input)
        if matchDouble is not None:
            # Move a leading article to the end ("The Matrix" -> "matrix, the")
            # to match the data file's title format.
            if matchDouble.group(2):
                movie1Name = matchDouble.group(3) + ", " + matchDouble.group(2)
            else:
                movie1Name = matchDouble.group(3)
            if matchDouble.group(8):
                movie2Name = matchDouble.group(9) + ", " + matchDouble.group(8)
            else:
                movie2Name = matchDouble.group(9)
            movie1Name = movie1Name.lower()
            movie2Name = movie2Name.lower()
            if (movie1Name not in self.ratedMovieList) and (
                    movie2Name not in self.ratedMovieList):
                if (movie1Name in self.titlesOnly) and (movie2Name
                                                        in self.titlesOnly):
                    self.currentConjunction = matchDouble.group(6)
                    self.TwoMoviesBoolean = True
                    # Rate each movie on the text surrounding its own quotes.
                    input1 = matchDouble.group(1) + " " + matchDouble.group(5)
                    input2 = matchDouble.group(7) + " " + matchDouble.group(11)
                    response1 = self.addRating(movie1Name, input1)
                    response2 = self.addRating(movie2Name, input2)
                    return (response1 + "\n" + response2)
                return random.choice(UnknownMovieStrings)
            return random.choice(SameMovieStrings)
        # Single quoted title. Groups: 1 article, 2 title, 3 year.
        match = re.match('.*\"(The|A|An|El|La)? *([\w ]*)( \(.*\)*)*\".*',
                         input)
        if match is None:
            # No quotes: try to spot a capitalized title in the sentence.
            match = re.match('(?:I )?[^A-Z]*([A-Z].*)', input)
            if match is not None:
                movieName = " ".join(match.group(1).lower().split())
                if movieName in self.titlesOnly:
                    input = self.removeTitle(movieName, input)
                    return self.addRating(movieName, input)
            if self.is_turbo:
                can_you = re.match("[Cc]an you (.*)", input)
                what_is = re.match("[Ww]hat is (.*)[\?.!]?", input)
                if can_you is not None:
                    return random.choice(CanYouStrings)
                if what_is is not None:
                    return random.choice(WhatIsStrings) % what_is.group(1)
                return random.choice(ArbitraryStrings)
            return random.choice(WrongFormatStrings)
        if match is not None:
            if match.group(1):
                movieName = match.group(2) + ", " + match.group(1)
            else:
                movieName = match.group(2)
            movieName = movieName.lower()
            if movieName not in self.ratedMovieList:
                if movieName in self.titlesOnly:
                    input = self.removeTitle(movieName, input)
                    return self.addRating(movieName, input)
                # Unknown title: spell-check against the catalog and ask
                # the user to confirm the closest match.
                movieName = self.findPotentialMovie(movieName)
                if movieName is None:
                    return random.choice(UnknownMovieStrings)
                self.currentMovieForMoreInformation = movieName
                self.confirm = True
                self.previousInput = input
                return random.choice(ConfirmationStrings) % movieName.title()
            response = random.choice(SameMovieStrings)
        else:
            response = random.choice(WrongFormatStrings)
        return response

    def addRating(self, movieName, string):
        """Score the sentiment of ``string`` for ``movieName``, record it,
        and return the next prompt (or a recommendation after 5 ratings).
        """
        rating = 0
        MoreMoviesStrings = [
            "Thank you! Please tell me about another movie.",
            "Whooo making progress. Give me another one.",
            "Just a few more movies and I will blow your mind with a recommendation. Give me one more."
        ]
        NegationWords = [
            "didn't", "never", "not", "don't", "none", "not", "nobody"
        ]
        strongPositive = [
            "love", "adore", "favorite", "amazing", "incredible", "fantastic"
        ]
        strongNegative = ["awful", "terrible", "hate"]
        strongIntensifiers = ["really", "very", "extremely"]
        confirmingConjunctionList = ["and", "or", "neither", "either", "so"]
        opposingConjunctionList = ["but", "yet"]
        strongPositiveBoolean = False
        strongNegativeBoolean = False
        strongIntensifierBoolean = False
        ReverseBoolean = 1  # flips to -1 after a negation word
        for word in string.split():
            if word in NegationWords:
                ReverseBoolean = -1
            if word in strongPositive:
                strongPositiveBoolean = True
            if word in strongNegative:
                strongNegativeBoolean = True
            if word in strongIntensifiers:
                strongIntensifierBoolean = True
            stemmed = self.p.stem(word)  # hoisted: stem once per word
            if stemmed in self.sentiment:
                if self.sentiment[stemmed] == "pos":
                    rating += (1 * ReverseBoolean)
                    if strongIntensifierBoolean:
                        strongPositiveBoolean = True
                        strongIntensifierBoolean = False
                else:
                    rating -= (1 * ReverseBoolean)
                    # FIX: was `if strongIntensifiers:` — a non-empty list,
                    # so every negative word counted as intensified.
                    if strongIntensifierBoolean:
                        strongNegativeBoolean = True
                        strongIntensifierBoolean = False
                # A negation only applies up to the next sentiment word.
                ReverseBoolean = 1
        # Clamp the net score to {-1, 0, 1}.
        if rating >= 1:
            rating = 1
            strongNegativeBoolean = False
        elif rating < 0:
            rating = -1
            strongPositiveBoolean = False
        if self.TwoMoviesBoolean and self.sentimentOfPreviousMovie == 0:
            self.sentimentOfPreviousMovie = rating
        if rating == 0:
            if self.TwoMoviesBoolean:
                # No sentiment of its own: inherit (or invert) the first
                # movie's sentiment based on the joining conjunction.
                if self.currentConjunction in confirmingConjunctionList:
                    rating = self.sentimentOfPreviousMovie
                elif self.currentConjunction in opposingConjunctionList:
                    rating = -1 * self.sentimentOfPreviousMovie
            else:
                # Ask the user to elaborate on this movie next turn.
                self.inTheMiddleOfSentimentAnalysis = True
                self.currentMovieForMoreInformation = movieName
                return movieName.title(
                ) + "! I didn't understand if you liked it or not. Tell me more."
        self.ratedMovieList[movieName] = rating
        self.userRatingVector[self.titlesOnly.index(movieName)] = rating
        if len(self.ratedMovieList) >= 5:
            movieRec = self.recommend(self.userRatingVector).title()
            response = random.choice(
                self.RecommendationStrings
            ) % movieRec + " Tap any key to hear another recommendation. (Or enter :quit if you're done.)"
        elif strongPositiveBoolean and not strongNegativeBoolean:
            response = "Whoa, you really liked that one, huh? Give me another one. "
        elif strongNegativeBoolean and not strongPositiveBoolean:
            response = "Wow, that bad, huh? Give me another one. "
        else:
            response = random.choice(MoreMoviesStrings)
        return response

    def removeTitle(self, movieName, input):
        """Return ``input`` lower-cased with the words of the movie title
        removed (first occurrence of each title word), so the title's own
        words don't pollute sentiment scoring.
        """
        inputSplit = input.lower().split()
        for word in movieName.split():
            if word in inputSplit:
                inputSplit.remove(word)
        return " ".join(inputSplit)

    def minimumEditDistance(self, string, userInput, knownMovie):
        """Recursive, memoized edit distance (substitution cost 2) between
        the user's spelling and a known title.

        ``string`` is only a call-site label; the memo key is the pair of
        strings. NOTE(review): the second recursive call swaps argument
        order — harmless because the recurrence is symmetric, but confirm
        before changing the memo key.
        """
        userLen = len(userInput)
        movieLen = len(knownMovie)
        concat = "%s %s" % (userInput, knownMovie)
        if userInput == knownMovie:
            return 0
        if userLen == 0:
            return movieLen
        elif movieLen == 0:
            return userLen
        if concat in self.check:
            return self.check[concat]
        else:
            x = min(
                self.minimumEditDistance("first", userInput[:-1], knownMovie)
                + 1,
                self.minimumEditDistance("second", knownMovie[:movieLen - 1],
                                         userInput) + 1,
                self.minimumEditDistance("third", userInput[:userLen - 1],
                                         knownMovie[:movieLen - 1]) +
                self.substitution(knownMovie[movieLen - 1],
                                  userInput[userLen - 1]))
            self.check[concat] = x
            return x

    def substitution(self, letterOne, letterTwo):
        """Substitution cost: 0 for a match, 2 otherwise."""
        return 0 if letterOne == letterTwo else 2

    def findPotentialMovie(self, string):
        """Return the catalog title closest to ``string`` within the edit
        distance threshold, or None when nothing is close enough.
        """
        minDist = None
        potentialMovie = None
        for title in self.titlesOnly:
            # Cheap pre-filters: skip titles whose length or character set
            # differs too much before paying for the edit distance.
            if math.fabs(len(string) - len(title)) < 4:
                strSet = set(string)
                titleSet = set(title)
                if len(strSet - titleSet) > 3 or len(titleSet - strSet) > 2:
                    continue
                self.check = {}  # fresh memo table per candidate
                dist = self.minimumEditDistance("zero", string, title)
                if (minDist is None
                        or dist < minDist) and dist < self.distanceThreshold:
                    minDist = dist
                    potentialMovie = title
        return potentialMovie

    #############################################################################
    # 3. Movie Recommendation helper functions                                  #
    #############################################################################

    def read_data(self):
        """Read the ratings matrix and sentiment lexicon from file.

        self.ratings has shape num_movies x num_users; row i, column j is
        the rating for movie i by user j.
        """
        self.titles, self.ratings = ratings()
        # FIX: use a context manager so the file handle is closed.
        # NOTE(review): 'rb' mode implies Python 2; csv.reader needs a text
        # stream on Python 3 — confirm the target interpreter.
        with open('data/sentiment.txt', 'rb') as sentiment_file:
            self.sentiment = dict(csv.reader(sentiment_file))
        # Catalog of lower-cased titles with the "(year)" suffix stripped.
        self.titlesOnly = []
        for entry in self.titles:
            titleOnly = entry[0].split(' (')[0]
            self.titlesOnly.append(titleOnly.lower())
        # Also index the lexicon by stemmed word so stemmed lookups work.
        self.sentiment.update(
            {self.p.stem(k): v for k, v in self.sentiment.items()})

    def binarize(self):
        """Make the ratings matrix binary in place: every nonzero rating
        becomes +1 (above 2.5) or -1 (2.5 or below).
        """
        for movie_id, movie in enumerate(self.ratings):
            for user_id, rating in enumerate(movie):
                if rating != 0:
                    self.ratings[movie_id,
                                 user_id] = 1 if rating > 2.5 else -1

    def distance(self, u, v):
        """Return the cosine similarity between vectors u and v (the small
        epsilon guards against division by zero)."""
        numerator = np.dot(u, v)
        denominator = np.linalg.norm(u) * np.linalg.norm(v)
        return numerator / (denominator + 1e-7)

    def recommend(self, u):
        """Return the best not-yet-rated, not-yet-recommended title for the
        user rating vector ``u`` via item-item collaborative filtering.
        """
        sims = {}  # rated movie id -> {candidate id -> similarity}
        recommendation = ""
        topScore = None
        for movie_id, rating in enumerate(u):
            if rating != 0:
                sims[movie_id] = {}
                for r_id, movie in enumerate(self.ratings):
                    sims[movie_id][r_id] = self.distance(
                        movie, self.ratings[movie_id])
        # Score every movie as the similarity-weighted sum of the user's
        # known ratings; keep the best eligible candidate.
        for i in range(len(self.ratings)):
            iPrediction = 0
            for movieName in self.ratedMovieList:
                j = self.titlesOnly.index(movieName)
                iPrediction += sims[j][i] * 1.0 * self.userRatingVector[j]
            if topScore is None or iPrediction > topScore:
                movie = self.titlesOnly[i]
                if movie not in self.ratedMovieList and movie not in self.recommendedMovies:
                    topScore = iPrediction
                    recommendation = movie
        self.recommendedMovies.append(recommendation)
        # "matrix, the" -> "the matrix" for display.
        articlePattern = re.match('(.*), (the|a|an|el|la)', recommendation)
        if articlePattern is not None:
            recommendation = articlePattern.group(
                2) + " " + articlePattern.group(1)
        return recommendation

    def debug(self, input):
        """Return debug information as a string for the input string from
        the REPL."""
        # Pass the debug information that you may think is important for
        # your evaluators
        debug_info = 'debug info'
        return debug_info

    #############################################################################
    # 5. Write a description for your chatbot here!                             #
    #############################################################################

    def intro(self):
        """Return the chatbot's user-facing description."""
        return """
      Welcome to our MovieBot! A couple of things to help you out in the processs of using it:
      - The only difference between Starter and Creative Modes is Creative mode supports Arbitrary Input responses.
        All other features are supported in Start mode!
      - We implemented the following Creative Mode features:
        - Identifying movies without quotation marks or perfect capitalization
        - Fine-grained sentiment extraction
        - Spell-checking movie titles
        - Extracting sentiment with multiple-movie input
      - Responding to arbitrary input
      Enjoy!
    """

    #############################################################################
    # Auxiliary methods for the chatbot.                                        #
    #                                                                           #
    # DO NOT CHANGE THE CODE BELOW!                                             #
    #                                                                           #
    #############################################################################

    def bot_name(self):
        return self.name