def get_scores(self, terms): """ Creates a list of scores for each file in corpus The score = weighted frequency / the total word count in file. Args: terms (list): a list of str Returns: list: a list of tuples, each containing the file_path_name and its relevancy score """ scores = HashTableLinear() for term in terms: word_hash_table = self.term_freqs.get(term) for file in self.file_list: if word_hash_table.contains(file): if scores.contains(file): scores[file] += self.get_wf(word_hash_table[file]) else: scores[file] = self.get_wf(word_hash_table[file]) score_list = [] for file in self.file_list: if scores.contains(file) and scores[file] > 0: norm_score = scores[file] / self.doc_length[file] score_list.append((file, norm_score)) return score_list
def test_linear2(self):
    ht = HashTableLinear()
    for i in range(22):
        ht.put(chr(i), i)
    self.assertEqual(ht.size(), 22)
    self.assertTrue(ht.load_factor() <= 0.75)
    self.assertTrue(ht.contains(chr(0)))
    self.assertTrue(ht.contains(chr(1)))
    self.assertTrue(ht.contains(chr(19)))
    self.assertFalse(ht.contains(chr(22)))
def test_linear1(self):
    ht = HashTableLinear()
    for i in range(11):
        ht.put(str(i), i)
    self.assertEqual(ht.size(), 11)
    self.assertTrue(ht.load_factor() <= 0.75)
    self.assertTrue(ht.contains('0'))
    self.assertTrue(ht.contains('1'))
    self.assertTrue(ht.contains('10'))
    self.assertFalse(ht.contains('11'))
def get_scores(self, terms): """creates a list of scores for each file in corpus The score = weighted frequency / the total word count in the file. Compute this score for each term in a query and sum all the scores. Args: terms (list) : a list of str Returns: list : a list of tuples, each containing the filename and its relevancy score """ scores = HashTableLinear() for term in terms: if self.term_freqs.contains(term): hashtable = self.term_freqs[term] for file1 in hashtable.slots: if file1 != None and hashtable.contains(file1[0]): if scores.contains(file1[0]): key, val = scores.remove(file1[0]) scores.put(file1[0], val + self.get_wf(file1[1])) else: scores.put(file1[0], self.get_wf(file1[1])) for file1 in scores.slots: if file1 is not None: key, val = scores.remove(file1[0]) val /= self.doc_length.get(file1[0]) scores.put(file1[0], val) return scores
def test_whole_functionality(self):
    """Tests the linear-probing hash table end to end"""
    filename = 'stop_words.txt'
    hash_table = HashTableLinear()
    hash_table = import_stopwords(filename, hash_table)
    self.assertRaises(KeyError, hash_table.get, 'BubbleGum')
    self.assertTrue('to' in hash_table)
    second_hash = HashTableLinear()
    second_hash.put('three', 'three')
    third_hash = HashTableLinear()
    third_hash.put('three', 'three')
    self.assertEqual(second_hash, third_hash)
    self.assertNotEqual(hash_table, second_hash)
    self.assertNotEqual(hash_table, 5)
    expected = "Hash_val = 0: None\n" \
               "Hash_val = 1: None\n" \
               "Hash_val = 2: None\n" \
               "Hash_val = 3: None\n" \
               "Hash_val = 4: ('three', 'three')\n" \
               "Hash_val = 5: None\n" \
               "Hash_val = 6: None\n" \
               "Hash_val = 7: None\n" \
               "Hash_val = 8: None\n" \
               "Hash_val = 9: None\n" \
               "Hash_val = 10: None\n"
    self.assertEqual(expected, repr(second_hash))
    second_hash['four'] = 'four'
    self.assertEqual(second_hash['four'], 'four')
    second_hash['five'] = 'five'
    self.assertEqual(0, hash_table.get('from'))
    self.assertFalse(second_hash.contains('p'))
    self.assertTrue(second_hash.contains('five'))
    second_hash.remove('five')
    self.assertFalse(second_hash.contains('five'))
    self.assertRaises(KeyError, second_hash.remove, 'p')
    self.assertEqual(1, third_hash.size())
    self.assertEqual(0, third_hash.collisions())
def test_linear4(self):
    ht = HashTableLinear()
    for i in range(22):
        ht.put(chr(i), i)
    self.assertEqual(ht.size(), 22)
    self.assertTrue(ht.load_factor() <= 0.75)
    self.assertEqual(ht[chr(0)], 0)
    self.assertEqual(ht[chr(1)], 1)
    self.assertEqual(ht[chr(19)], 19)
    self.assertRaises(KeyError, ht.get, 'a')
    for i in range(22):
        ht.remove(chr(i))
    self.assertFalse(ht.contains(chr(0)))
    self.assertFalse(ht.contains(chr(1)))
    self.assertFalse(ht.contains(chr(19)))
    self.assertRaises(KeyError, ht.remove, 'a')
def test_basic(self):
    """Tests basic functionality"""
    hash_table = HashTableLinear()
    hash_table.put('unless', 'unless')
    self.assertTrue(hash_table.contains('unless'))
    words = ['every', 'being', 'elsewhere', 'nothing', 'hereby', 'latter',
             'and', 'afterwards', 'say', 'very', 'few', 'well', 'various',
             'make', 'regarding', 'take', 'give', 'whole', 'i', 'against',
             'can']
    for word in words:
        hash_table.put(word, word)
    for word in words:
        hash_table.get(word)
def test_HashTableLinear(self):
    t = HashTableLinear()
    self.assertEqual(t.size(), 0)
    self.assertFalse(t.contains('us'))
    self.assertRaises(KeyError, t.get, 'us')
    t.put('us', 'us')
    self.assertEqual(t.get('us'), 'us')
    self.assertEqual(t['us'], 'us')
    self.assertTrue(t.contains('us'))
    self.assertFalse(t.contains('say'))
    self.assertEqual(t.size(), 1)
    self.assertEqual(t.collisions(), 0)
    t.put('say', 'say')
    self.assertEqual(t.get('say'), 'say')
    self.assertTrue(t.contains('say'))
    self.assertEqual(t.size(), 2)
    self.assertEqual(t.collisions(), 1)
    t.remove('say')
    self.assertFalse(t.contains('say'))
    self.assertTrue(t.contains('us'))
    t.remove('us')
    self.assertEqual(t.size(), 0)
    # print(hash_string('the', 11))  # 'the' = 5
    t.put('us', 'us')
    t.put('say', 'say')
    # self.assertEqual(t.load_factor(), 0.18181818181818182)
    t.put('the', 'the')
    # t.put(chr(0), chr(0))
    # t.put('0', '0')
    # print('chr 0', chr(0))
    # print('just 0', '0')
    # print(type(chr(0)))
    # print(type('0'))
    # print(hash_string('us', 23))  # 'the' = 5
    # print(hash_string('say', 23))  # 'the' = 5
    # print(hash_string('the', 23))  # 'the' = 5
    # print('from tests', t)
    self.assertTrue(t.contains('us'))
    self.assertTrue(t.contains('the'))
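# A minimal linear-probing sketch of the interface the tests above exercise
# (put/get/contains/remove/size/load_factor/collisions plus [] access). This is an
# illustration only, not the course-provided HashTableLinear: the hash function
# below is a stand-in, so collision counts, slot positions, and the repr-based
# assertion in test_whole_functionality would not match this class.
class LinearProbingSketch:
    def __init__(self, capacity=11):
        self.slots = [None] * capacity        # each slot is None or a (key, value) pair
        self.num_items = 0
        self.num_collisions = 0

    def _index(self, key):
        return sum(ord(ch) for ch in key) % len(self.slots)  # assumed string hash

    def put(self, key, value):
        idx = self._index(key)
        if self.slots[idx] is not None and self.slots[idx][0] != key:
            self.num_collisions += 1          # landed on an occupied slot (illustrative count)
        while self.slots[idx] is not None and self.slots[idx][0] != key:
            idx = (idx + 1) % len(self.slots)  # probe linearly for a free or matching slot
        if self.slots[idx] is None:
            self.num_items += 1
        self.slots[idx] = (key, value)
        if self.load_factor() > 0.75:         # grow the table so probes stay short
            pairs = [p for p in self.slots if p is not None]
            self.slots = [None] * (2 * len(self.slots) + 1)
            self.num_items = 0
            for k, v in pairs:
                self.put(k, v)

    def get(self, key):
        idx = self._index(key)
        while self.slots[idx] is not None:
            if self.slots[idx][0] == key:
                return self.slots[idx][1]
            idx = (idx + 1) % len(self.slots)
        raise KeyError(key)

    def remove(self, key):
        idx = self._index(key)
        while self.slots[idx] is not None:
            if self.slots[idx][0] == key:
                pair = self.slots[idx]
                self.slots[idx] = None
                self.num_items -= 1
                # re-insert the rest of the probe cluster so later lookups still work
                idx = (idx + 1) % len(self.slots)
                while self.slots[idx] is not None:
                    k, v = self.slots[idx]
                    self.slots[idx] = None
                    self.num_items -= 1
                    self.put(k, v)
                    idx = (idx + 1) % len(self.slots)
                return pair
            idx = (idx + 1) % len(self.slots)
        raise KeyError(key)

    def contains(self, key):
        try:
            self.get(key)
            return True
        except KeyError:
            return False

    def size(self):
        return self.num_items

    def collisions(self):
        return self.num_collisions

    def load_factor(self):
        return self.num_items / len(self.slots)

    __getitem__ = get
    __setitem__ = put
    __contains__ = contains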
def get_scores(self, terms): """ Creates list of scores for each file in corpus. The score = (weighted frequency / total word count in file) Compute the score for each term in a query and sum all the scores. Args: terms (list): a list of strings, raw input string from user query Returns: list: a list of tuples, each containing the filename and its relevancy score """ # scores = HashMap() score_table = HashTable( ) # contains tuples of (filename, weighted_frequency) for query_term in terms: # fetch a hash table of "term" from self.term_freqs query_term_table = self.term_freqs[query_term][1] # for each file in the hash table, add weighted frequency to scores[file] qt_table_keys = query_term_table.keys() for key in qt_table_keys: # key is a file name weighted_frequency = self.get_wf(query_term_table[key][1]) if weighted_frequency != 0: # if this is the second query_term if score_table.contains(key): # new frequency + old frequency old_freq = score_table[key][1] updated_freq = weighted_frequency + old_freq score_table.put(key, updated_freq) # if score_table[key] is empty, use put (if first query_term) else: score_table.put(key, weighted_frequency) # for each file in scores, do scores[file] /= self.doc_length[file] score_table_keys = score_table.keys() score_list = [] for key in score_table_keys: # key is a filename normalized_score = score_table[key][1] / self.doc_length[key][1] score_table[key] = normalized_score score_list.append(score_table[key]) # return scores, which is a list of tuples neglecting terms with frequencies of 0 return score_list
def search(self, query):
    """Search for the query terms in files

    Args:
        query (str): query input, e.g. "computer science"

    Returns:
        list: a list of tuples (file_path_name, score) sorted in descending
            order of relevancy, excluding files whose relevancy score is 0
    """
    terms = self.parse_words([query])
    # remove duplicate query terms using a hash table
    cleaned_terms = []
    hash_terms = HashTableLinear()
    for term in terms:
        if not hash_terms.contains(term):
            cleaned_terms.append(term)
            hash_terms.put(term, term)
    scores = self.get_scores(cleaned_terms)
    scores = self.rank(scores)
    return scores
import math
import os

# HashTable is assumed to be imported from the accompanying hash table module.


class SearchEngine:
    """Builds and maintains an inverted index of documents stored in a specified
    directory and provides the functionality to search documents with query terms.

    Attributes:
        directory (str): a directory name
        stopwords (HashTable): contains stopwords
        doc_length (HashTable): contains the number of words in each document
        doc_freqs (HashTable): contains the number of documents containing the
            term, for each term
        term_freqs (HashTable): hash table of hash tables for each term; each
            hash table contains the frequency of the term in documents
            (document names are the keys and the frequencies are the values)
    """

    def __init__(self, directory, stopwords):
        self.doc_length = HashTable()
        self.doc_freqs = HashTable()  # this will not be used in this assignment
        self.term_freqs = HashTable()
        self.stopwords = stopwords
        self.index_files(directory)

    # PREPROCESSING ============================================================

    def read_file(self, infile):
        """A helper function to read a file

        Args:
            infile (str): the path to a file

        Returns:
            list: a list of strings read from a file
        """
        with open(infile, "r") as filepointer:
            lines = filepointer.readlines()  # looks like ["line 1 here", "line 2 here"]
        return lines

    def parse_words(self, lines):
        """Split strings into words, convert words to lower case, remove newline
        characters, and exclude stopwords.

        Args:
            lines (list): a list of strings

        Returns:
            list: a list of words
        """
        raw_words = []
        for line in lines:
            split_line = line.split(" ")  # split_line looks like ["line", "1", "here"]
            raw_words.extend(split_line)
        # create a new list with all words that aren't stop words
        filtered_words = [
            word.rstrip().lower() for word in raw_words
            if word not in self.stopwords
        ]
        return filtered_words

    def count_words(self, filename, words):
        """Count words in a file and store the frequency of each word in the
        term_freqs hash table. Words should not contain stopwords. Also store
        the total count of words contained in the file in the doc_length hash
        table.

        Args:
            filename (str): the file name
            words (list): a list of words
        """
        # store the total count of words in the doc_length hash table
        self.doc_length.put(filename, len(words))
        for word in words:
            # frequency of this word in this document
            word_frequency = words.count(word)  # number of occurrences of this word
            if self.term_freqs.contains(word):
                # add the (filename, freq) pair to term_freqs[word] (the inner hash table)
                self.term_freqs[word][1].put(filename, word_frequency)
            else:
                # create a new frequency hash table for this term: (filename, frequency)
                freq_hashtable = HashTable()
                freq_hashtable.put(filename, word_frequency)
                # put the newly created hash table into the term_freqs hash table
                self.term_freqs.put(word, freq_hashtable)

    def index_files(self, directory):
        """Index all text files in a given directory

        Args:
            directory (str): the path of a directory
        """
        file_list = os.listdir(directory)
        for item in file_list:  # item is a filename
            path = os.path.join(directory, item)
            # if item is not a file, skip it
            if not os.path.isfile(path) or item == "stop_words.txt":
                continue
            # split the name into the file extension and the rest
            parts = os.path.splitext(item)
            # only process text files
            if parts[1] == ".txt":
                item_lines = self.read_file(path)
                item_words = self.parse_words(item_lines)
                self.count_words(path, item_words)

    # SEARCHING ================================================================

    def get_wf(self, term_frequency):
        """Computes the weighted frequency

        Args:
            term_frequency (float): term frequency

        Returns:
            float: the weighted frequency
        """
        if term_frequency > 0:
            weighted_freq = 1 + math.log(term_frequency)
        else:
            weighted_freq = 0
        return weighted_freq

    def get_scores(self, terms):
        """Creates a list of scores for each file in the corpus.
        The score = (weighted frequency / total word count in file).
        Compute the score for each term in a query and sum all the scores.

        Args:
            terms (list): a list of strings parsed from the user query

        Returns:
            list: a list of tuples, each containing the filename and its
                relevancy score
        """
        score_table = HashTable()  # maps filename -> summed weighted frequency
        for query_term in terms:
            # skip query terms that never appear in the corpus
            if not self.term_freqs.contains(query_term):
                continue
            # fetch the hash table for this term from self.term_freqs
            query_term_table = self.term_freqs[query_term][1]
            # for each file in the hash table, add its weighted frequency to scores[file]
            qt_table_keys = query_term_table.keys()
            for key in qt_table_keys:  # key is a file name
                weighted_frequency = self.get_wf(query_term_table[key][1])
                if weighted_frequency != 0:
                    if score_table.contains(key):
                        # second or later query_term: new frequency + old frequency
                        old_freq = score_table[key][1]
                        updated_freq = weighted_frequency + old_freq
                        score_table.put(key, updated_freq)
                    else:
                        # first query_term for this file
                        score_table.put(key, weighted_frequency)
        # for each file in scores, do scores[file] /= self.doc_length[file]
        score_table_keys = score_table.keys()
        score_list = []
        for key in score_table_keys:  # key is a filename
            normalized_score = score_table[key][1] / self.doc_length[key][1]
            score_table[key] = normalized_score
            score_list.append(score_table[key])
        # return a list of tuples, neglecting terms with frequencies of 0
        return score_list

    def rank(self, scores):
        """Ranks files in descending order of relevancy

        Args:
            scores (list): a list of tuples of (filename, score)

        Returns:
            list: a list of (filename, score) tuples sorted in descending order
                of relevancy
        """
        return sorted(scores, key=lambda x: x[1], reverse=True)

    def search(self, query):
        """Search for the query terms in files

        Args:
            query (str): query input, "user input goes here"

        Returns:
            None: prints the matching files in descending order of relevancy
        """
        # parse words
        filtered_query = self.parse_words([query])
        # remove duplicate words using a hash table
        word_table = HashTable()
        for word in filtered_query:
            word_table.put(word, word)
        word_table_keys = word_table.keys()
        parsed_query_terms = []
        # add all words from the hash table to the list using keys()
        for key in word_table_keys:
            parsed_query_terms.append(word_table[key][0])
        # pass query terms to get_scores()
        tuples = self.get_scores(parsed_query_terms)
        # pass the resulting list of tuples to rank()
        results = self.rank(tuples)
        # rank's result is displayed on screen in descending order
        for a_tuple in results:
            print(a_tuple[0])
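# Hypothetical driver for the SearchEngine above. The "documents" directory and
# the stop_words.txt path are placeholders, and import_stopwords is the loader
# used in the tests above (assumed here to fill whatever hash table it is given).
if __name__ == "__main__":
    stopwords = import_stopwords("stop_words.txt", HashTable())
    engine = SearchEngine("documents", stopwords)
    engine.search("computer science")  # prints matching file paths, most relevant first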
import math
import os

# HashTableLinear is assumed to be imported from the accompanying hash table module.


class SearchEngine:
    """doc_length: a hash table containing the number of words in each document
    term_freqs: a hash table mapping each word in the text files to an inner
        hash table of per-file frequency counts
    stopwords: a hash table containing stop words
    """

    def __init__(self, directory, stopwords):
        self.doc_length = HashTableLinear()
        self.term_freqs = HashTableLinear()
        self.stopwords = stopwords
        self.index_files(directory)

    def read_file(self, infile):
        """A helper function to read a file

        Args:
            infile (str): the path to a file

        Returns:
            list: a list of str read from a file
        """
        with open(infile, 'r') as fi:
            strings = fi.read()
        strings = strings.split()
        return strings

    def parse_words(self, lines):
        """Split strings into words.
        Convert words to lower case and remove newline chars. Exclude stopwords.

        Args:
            lines (list): a list of strings

        Returns:
            list: a list of words
        """
        list1 = []
        for line in lines:
            words = line.lower()
            words = words.split()
            for word in words:
                list1.append(word)
        list1 = self.exclude_stopwords(list1)
        return list1

    def exclude_stopwords(self, terms):
        """Exclude stopwords from the list of terms

        Args:
            terms (list): a list of str

        Returns:
            list: a list of str with stopwords removed
        """
        list1 = []
        for i in terms:
            if not self.stopwords.contains(i):
                list1.append(i)
        return list1

    def count_words(self, filename, words):
        """Store the frequency of each word in the term_freqs hash table and
        the total word count of the file in the doc_length hash table.

        Args:
            filename (str): the file name
            words (list): a list of words
        """
        for word in words:
            if word not in self.term_freqs:
                self.term_freqs[word] = HashTableLinear()
                self.term_freqs[word][filename] = 1
            else:
                if filename not in self.term_freqs[word]:
                    self.term_freqs[word][filename] = 1
                else:
                    self.term_freqs[word][filename] += 1
        self.doc_length.put(filename, len(words))

    def index_files(self, directory):
        """Index all text files in a given directory

        Args:
            directory (str): the path of a directory
        """
        file_list = os.listdir(directory)
        for item in file_list:
            val = os.path.join(directory, item)
            if os.path.isfile(val):
                parts = os.path.splitext(val)
                if parts[1] == '.txt':
                    words1 = self.read_file(val)
                    words2 = self.parse_words(words1)
                    self.count_words(val, words2)

    def get_wf(self, tf):
        """Computes the weighted frequency

        Args:
            tf (float): term frequency

        Returns:
            float: the weighted frequency
        """
        if tf > 0:
            wf = 1 + math.log(tf)
        else:
            wf = 0
        return wf

    def get_scores(self, terms):
        """Creates a score for each file in the corpus.
        The score = weighted frequency / the total word count in the file.
        Compute this score for each term in a query and sum all the scores.

        Args:
            terms (list): a list of str

        Returns:
            HashTableLinear: a hash table mapping each matching file to its
                relevancy score
        """
        scores = HashTableLinear()
        for term in terms:
            if self.term_freqs.contains(term):
                hashtable = self.term_freqs[term]
                for file1 in hashtable.slots:
                    if file1 is not None and hashtable.contains(file1[0]):
                        if scores.contains(file1[0]):
                            key, val = scores.remove(file1[0])
                            scores.put(file1[0], val + self.get_wf(file1[1]))
                        else:
                            scores.put(file1[0], self.get_wf(file1[1]))
        # normalize using a snapshot of the slots so removing and re-putting
        # entries does not disturb the iteration
        entries = [slot for slot in scores.slots if slot is not None]
        for key, val in entries:
            scores.remove(key)
            scores.put(key, val / self.doc_length.get(key))
        return scores

    def rank(self, scores):
        """Ranks files in descending order of relevancy

        Args:
            scores (HashTableLinear): a hash table of (filename, score) entries

        Returns:
            str: the matching filenames, one per line, sorted in descending
                order of relevancy
        """
        no_none = []
        for val in scores.slots:
            if val is not None:
                no_none.append(val)
        no_none = sorted(no_none, key=lambda x: x[1], reverse=True)
        new_line_out = ''
        for i in no_none:
            new_line_out += i[0] + '\n'
        return new_line_out

    def search(self, query):
        """Search for the query terms in files

        Args:
            query (str): query input

        Returns:
            str: the matching files in descending order of relevancy
        """
        list1 = self.parse_words([query])
        scores = self.get_scores(list1)
        print(scores)
        return self.rank(scores)
import math
import os

# HashTableLinear is assumed to be imported from the accompanying hash table module.


class SearchEngine:
    """class for SearchEngine

    Attributes:
        directory (str): a directory name
        stopwords (HashTableLinear): a hash table containing stopwords
        doc_length (HashTableLinear): a hash table containing the total number
            of words in each document
        term_freqs (HashTableLinear): a hash table of hash tables for each
            term. Each hash table contains the frequency of the term in
            documents (document names are the keys and the frequencies are the
            values)
    """

    def __init__(self, directory, stopwords):
        self.doc_length = HashTableLinear()
        self.term_freqs = HashTableLinear()
        self.stopwords = stopwords
        self.index_files(directory)

    def __repr__(self):
        return "doc_length: %s, term_freqs: %s, stopwords: %s" \
            % (self.doc_length, self.term_freqs, self.stopwords)

    def __eq__(self, other):
        return isinstance(other, SearchEngine) and self.doc_length == other.doc_length \
            and self.term_freqs == other.term_freqs

    def read_file(self, infile):
        """A helper function to read a file

        Args:
            infile (str): the path to a file

        Returns:
            list: a list of str read from a file
        """
        file_list = []
        with open(infile, 'r') as i:
            for line in i.readlines():
                file_list.append(line)
        return file_list

    def parse_words(self, lines):
        """Split strings into words.
        Convert words to lower case and remove newline chars. Exclude stopwords.

        Args:
            lines (list): a list of strings

        Returns:
            list: a list of words
        """
        word_list = []
        for item in lines:
            split_arr = item.split(' ')
            for word in split_arr:
                if word != '\n':
                    word_list.append(word.lower())
        final_arr = []
        for item in word_list:
            item = item.replace("(", '')
            item = item.replace(")", '')
            item = item.replace(".", '')
            item = item.replace(",", '')
            final_arr.append(item.rstrip())
        final_arr = self.exclude_stopwords(final_arr)
        return final_arr

    def exclude_stopwords(self, terms):
        """Exclude stopwords from the list of terms

        Args:
            terms (list): a list of str

        Returns:
            list: a list of str with stopwords removed
        """
        new_terms = []
        for item in terms:
            if item in self.stopwords:
                continue
            new_terms.append(item)
        return new_terms

    def count_words(self, filename, words):
        """Count words in a file and store the frequency of each word in the
        term_freqs hash table. The keys of the term_freqs hash table shall be
        words. The values of the term_freqs hash table shall be hash tables
        (term_freqs is a hash table of hash tables). The keys of the inner hash
        tables stored in term_freqs shall be file names. The values of the
        inner hash tables shall be the frequencies of words. For example,
        self.term_freqs[word][filename] += 1. Words should not contain
        stopwords. Also store the total count of words contained in the file in
        the doc_length hash table.

        Args:
            filename (str): the file name
            words (list): a list of words
        """
        for item in words:
            if self.term_freqs.contains(item):
                if self.term_freqs[item].contains(filename):
                    self.term_freqs[item][filename] += 1
                else:
                    self.term_freqs[item].put(filename, 1)
            else:
                ht = HashTableLinear()
                ht.put(filename, 1)
                self.term_freqs.put(item, ht)
            if self.doc_length.contains(filename):
                self.doc_length[filename] += 1
            else:
                self.doc_length.put(filename, 1)

    def index_files(self, directory):
        """Index all text files in a given directory

        Args:
            directory (str): the path of a directory
        """
        dir_list = os.listdir(directory)
        for item in dir_list:
            if os.path.isfile(os.path.join(directory, item)) is True:
                parts = os.path.splitext(item)
                if parts[1] == '.txt':
                    line_arr = self.read_file(os.path.join(directory, item))
                    words = self.parse_words(line_arr)
                    self.count_words(os.path.join(directory, item), words)

    def get_wf(self, tf):
        """Computes the weighted frequency

        Args:
            tf (float): term frequency

        Returns:
            float: the weighted frequency
        """
        if tf > 0:
            wf = 1 + math.log(tf)
        else:
            wf = 0
        return wf

    def get_scores(self, terms):
        """Creates a score for each file in the corpus.
        The score = weighted frequency / the total word count in the file.
        Compute this score for each term in a query and sum all the scores.

        Args:
            terms (list): a list of str

        Returns:
            HashTableLinear: a hash table mapping each matching file to its
                relevancy score
        """
        scores = HashTableLinear()
        for item in terms:
            if item in self.term_freqs:
                temp = self.term_freqs[item]
                for thing in temp.hash_table:
                    if thing is not None:
                        if thing[0] in scores:
                            scores[thing[0]] += self.get_wf(thing[1])
                        else:
                            scores.put(thing[0], self.get_wf(thing[1]))
        for item in scores.hash_table:
            if item is None:
                continue
            scores[item[0]] /= self.doc_length[item[0]]
        return scores

    def rank(self, scores):
        """Ranks files in descending order of relevancy

        Args:
            scores (list): a list of slots, each either None or a tuple of
                (filename, score)

        Returns:
            list: a list of (filename, score) tuples sorted in descending order
                of relevancy
        """
        new_scores = list(filter(None, scores))
        sorted_scores = sorted(new_scores, key=lambda x: x[1], reverse=True)
        return sorted_scores

    def search(self, query):
        """Search for the query terms in files

        Args:
            query (str): query input

        Returns:
            list: a list of files in descending order of relevancy
        """
        # split the raw query into terms before scoring; get_scores expects a list of words
        terms = self.parse_words([query])
        scores = self.get_scores(terms).hash_table
        new_scores = self.rank(scores)
        score_arr = []
        for item in new_scores:
            if item is not None:
                score_arr.append(item)
                print(item)
        return score_arr