def test_HashTableLinear(self):
    t = HashTableLinear()
    self.assertEqual(t.size(), 0)
    self.assertFalse(t.contains('us'))
    self.assertRaises(KeyError, t.get, 'us')
    t.put('us', 'us')
    self.assertEqual(t.get('us'), 'us')
    self.assertEqual(t['us'], 'us')
    self.assertTrue(t.contains('us'))
    self.assertFalse(t.contains('say'))
    self.assertEqual(t.size(), 1)
    self.assertEqual(t.collisions(), 0)
    t.put('say', 'say')
    self.assertEqual(t.get('say'), 'say')
    self.assertTrue(t.contains('say'))
    self.assertEqual(t.size(), 2)
    self.assertEqual(t.collisions(), 1)
    t.remove('say')
    self.assertFalse(t.contains('say'))
    self.assertTrue(t.contains('us'))
    t.remove('us')
    self.assertEqual(t.size(), 0)
    # Re-populate the table after emptying it
    t.put('us', 'us')
    t.put('say', 'say')
    t.put('the', 'the')
    self.assertTrue(t.contains('us'))
    self.assertTrue(t.contains('the'))
def test_whole_functionality(self):
    """Tests the linear-probing hash table functionality."""
    filename = 'stop_words.txt'
    hash_table = HashTableLinear()
    hash_table = import_stopwords(filename, hash_table)
    self.assertRaises(KeyError, hash_table.get, 'BubbleGum')
    self.assertTrue('to' in hash_table)
    second_hash = HashTableLinear()
    second_hash.put('three', 'three')
    third_hash = HashTableLinear()
    third_hash.put('three', 'three')
    self.assertEqual(second_hash, third_hash)
    self.assertNotEqual(hash_table, second_hash)
    self.assertNotEqual(hash_table, 5)
    expected = "Hash_val = 0: None\n" \
               "Hash_val = 1: None\n" \
               "Hash_val = 2: None\n" \
               "Hash_val = 3: None\n" \
               "Hash_val = 4: ('three', 'three')\n" \
               "Hash_val = 5: None\n" \
               "Hash_val = 6: None\n" \
               "Hash_val = 7: None\n" \
               "Hash_val = 8: None\n" \
               "Hash_val = 9: None\n" \
               "Hash_val = 10: None\n"
    self.assertEqual(expected, repr(second_hash))
    second_hash['four'] = 'four'
    self.assertEqual(second_hash['four'], 'four')
    second_hash['five'] = 'five'
    self.assertEqual(0, hash_table.get('from'))
    self.assertFalse(second_hash.contains('p'))
    self.assertTrue(second_hash.contains('five'))
    second_hash.remove('five')
    self.assertFalse(second_hash.contains('five'))
    self.assertRaises(KeyError, second_hash.remove, 'p')
    self.assertEqual(1, third_hash.size())
    self.assertEqual(0, third_hash.collisions())
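# The tests above assume a HashTableLinear class with put/get/contains/remove,
# size(), collisions(), and the [] operators, backed by a `slots` list of
# (key, value) pairs.  The real implementation is not shown here; the class
# below is only a minimal sketch of that assumed interface (no resizing, no
# tombstones, and Python's built-in hash() instead of the course's
# hash_string), so exact collision counts and the repr format will differ.
class HashTableLinearSketch:
    def __init__(self, table_size=11):
        self.slots = [None] * table_size      # each occupied slot holds (key, value)
        self.table_size = table_size
        self.num_items = 0
        self.num_collisions = 0

    def _start(self, key):
        return hash(key) % self.table_size    # stand-in for hash_string(key, size)

    def put(self, key, value):
        idx = self._start(key)
        while self.slots[idx] is not None and self.slots[idx][0] != key:
            self.num_collisions += 1          # probed past an occupied slot
            idx = (idx + 1) % self.table_size
        if self.slots[idx] is None:
            self.num_items += 1
        self.slots[idx] = (key, value)

    def get(self, key):
        idx = self._start(key)
        for _ in range(self.table_size):
            if self.slots[idx] is None:
                break
            if self.slots[idx][0] == key:
                return self.slots[idx][1]
            idx = (idx + 1) % self.table_size
        raise KeyError(key)                   # matches the KeyError the tests expect

    def contains(self, key):
        try:
            self.get(key)
            return True
        except KeyError:
            return False

    def remove(self, key):
        idx = self._start(key)
        for _ in range(self.table_size):
            if self.slots[idx] is not None and self.slots[idx][0] == key:
                pair = self.slots[idx]
                self.slots[idx] = None
                self.num_items -= 1
                return pair                   # (key, value) pair, as callers assume
            idx = (idx + 1) % self.table_size
        raise KeyError(key)

    def size(self):
        return self.num_items

    def collisions(self):
        return self.num_collisions

    __getitem__, __setitem__, __contains__ = get, put, contains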
class SearchEngine:
    """
    doc_length: a hash table mapping each document to the number of words
        it contains
    term_freqs: a hash table mapping each word in the text files to an inner
        hash table of per-document frequency counts
    stopwords: a hash table containing stop words
    """

    def __init__(self, directory, stopwords):
        self.doc_length = HashTableLinear()
        self.term_freqs = HashTableLinear()
        self.stopwords = stopwords
        self.index_files(directory)

    def read_file(self, infile):
        """A helper function to read a file

        Args:
            infile (str): the path to a file

        Returns:
            list: a list of str read from a file
        """
        with open(infile, 'r') as fi:
            strings = fi.read()
        strings = strings.split()
        return strings

    def parse_words(self, lines):
        """Split strings into words.

        Convert words to lower case and remove newline chars.
        Exclude stopwords.

        Args:
            lines (list): a list of strings

        Returns:
            list: a list of words
        """
        list1 = []
        for line in lines:
            words = line.lower()
            words = words.split()
            for word in words:
                list1.append(word)
        list1 = self.exclude_stopwords(list1)
        return list1

    def exclude_stopwords(self, terms):
        """Exclude stopwords from the list of terms

        Args:
            terms (list): a list of words

        Returns:
            list: a list of str with stopwords removed
        """
        list1 = []
        for i in terms:
            if not self.stopwords.contains(i):
                list1.append(i)
        return list1

    def count_words(self, filename, words):
        """Count the occurrences of each word in a file.

        Args:
            filename (str): the file name
            words (list): a list of words
        """
        for word in words:
            if word not in self.term_freqs:
                self.term_freqs[word] = HashTableLinear()
                self.term_freqs[word][filename] = 1
            elif filename not in self.term_freqs[word]:
                self.term_freqs[word][filename] = 1
            else:
                self.term_freqs[word][filename] += 1
        self.doc_length.put(filename, len(words))

    def index_files(self, directory):
        """Index all text files in a given directory

        Args:
            directory (str): the path of a directory
        """
        file_list = os.listdir(directory)
        for item in file_list:
            val = os.path.join(directory, item)
            if os.path.isfile(val):
                parts = os.path.splitext(val)
                if parts[1] == '.txt':
                    words1 = self.read_file(val)
                    words2 = self.parse_words(words1)
                    self.count_words(val, words2)

    def get_wf(self, tf):
        """Computes the weighted frequency

        Args:
            tf (float): term frequency

        Returns:
            float: the weighted frequency
        """
        if tf > 0:
            wf = 1 + math.log(tf)
        else:
            wf = 0
        return wf

    def get_scores(self, terms):
        """Creates a score for each file in the corpus.

        The score = weighted frequency / the total word count in the file.
        Compute this score for each term in a query and sum all the scores.

        Args:
            terms (list): a list of str

        Returns:
            HashTableLinear: a table mapping each filename to its relevancy score
        """
        scores = HashTableLinear()
        for term in terms:
            if self.term_freqs.contains(term):
                hashtable = self.term_freqs[term]
                for file1 in hashtable.slots:
                    if file1 is not None and hashtable.contains(file1[0]):
                        if scores.contains(file1[0]):
                            key, val = scores.remove(file1[0])
                            scores.put(file1[0], val + self.get_wf(file1[1]))
                        else:
                            scores.put(file1[0], self.get_wf(file1[1]))
        # Normalize each accumulated score by the document's total word count
        for file1 in scores.slots:
            if file1 is not None:
                key, val = scores.remove(file1[0])
                val /= self.doc_length.get(file1[0])
                scores.put(file1[0], val)
        return scores

    def rank(self, scores):
        """Ranks files in the descending order of relevancy

        Args:
            scores (HashTableLinear): a table of (filename, score) entries

        Returns:
            str: the filenames, one per line, in descending order of relevancy
        """
        no_none = []
        for val in scores.slots:
            if val is not None:
                no_none.append(val)
        no_none = sorted(no_none, key=lambda x: x[1], reverse=True)
        new_line_out = ''
        for i in no_none:
            new_line_out += i[0] + '\n'
        return new_line_out

    def search(self, query):
        """Search for the query terms in files

        Args:
            query (str): query input

        Returns:
            str: the matching files, one per line, in descending order of relevancy
        """
        list1 = self.parse_words([query])
        scores = self.get_scores(list1)
        return self.rank(scores)
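# A small worked example of the scoring used by get_wf/get_scores above
# (the numbers are made up): a query term that occurs tf times in a
# doc_len-word file contributes a weighted frequency of 1 + ln(tf), and the
# file's summed score is then divided by its total word count.
import math

tf, doc_len = 4, 200
wf = (1 + math.log(tf)) if tf > 0 else 0   # get_wf: 1 + ln(4) ≈ 2.386
score = wf / doc_len                       # length-normalised: ≈ 0.0119
print(round(wf, 3), round(score, 4))       # 2.386 0.0119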
def test_basic(self):
    """Tests basic functionality"""
    hash_table = HashTableLinear()
    hash_table.put('unless', 'unless')
    self.assertTrue(hash_table.contains('unless'))
    words = ['every', 'being', 'elsewhere', 'nothing', 'hereby', 'latter',
             'and', 'afterwards', 'say', 'very', 'few', 'well', 'various',
             'make', 'regarding', 'take', 'give', 'whole', 'i', 'against',
             'can']
    # Insert every word, then exercise get() on each one
    for word in words:
        hash_table.put(word, word)
    for word in words:
        hash_table.get(word)
class SearchEngine:
    """Search engine class that builds an inverted index of documents stored
    in a specified directory and provides functionality to search documents
    with query terms.

    Attributes:
        directory (str): a directory name
        stopwords (HashMap): a hash table containing stop words
        doc_length (HashMap): a hash table containing the total number of
            words in each document
        term_freqs (HashMap): a hash table of hash tables for each term.
            Each inner hash table contains the frequency of the term in the
            documents (document names are the keys and the frequencies are
            the values)
    """

    def __init__(self, directory, stopwords):
        """Initialize the data structure by taking a directory name and a
        hash table containing stopwords.

        Args:
            directory (str): a directory name
            stopwords (HashMap): a hash table containing stopwords
        """
        self.doc_length = HashTableLinear()
        self.term_freqs = HashTableLinear()
        self.stopwords = stopwords
        self.file_list = []
        self.index_files(directory)

    def __eq__(self, other):
        """Compares the data structure to other"""
        return isinstance(other, SearchEngine) and \
            self.doc_length == other.doc_length and \
            self.term_freqs == other.term_freqs and \
            self.stopwords == other.stopwords

    def __repr__(self):
        """How the data structure is represented"""
        return "SearchEngine Instance:\n" + str(self.term_freqs)

    def read_file(self, infile):
        """Reads the lines of a file, stripping the trailing newline from each

        Args:
            infile (str): the path to a file

        Returns:
            list: a list of str read from a file
        """
        with open(infile, 'r') as file:
            str_list = file.readlines()
        for index, line in enumerate(str_list):
            str_list[index] = line.rstrip('\n')
        return str_list

    def parse_words(self, lines):
        """Splits strings into words by spaces, converts words to lower case,
        and removes newline chars, parentheses, brackets, and punctuation

        Args:
            lines (list): a list of strings

        Returns:
            list: a list of words
        """
        words = []
        for each in lines:
            line_list = each.split()
            for index, word in enumerate(line_list):
                line_list[index] = re.sub(r'[\W_]+', '', word)
            words += line_list
        for index, word in enumerate(words):
            words[index] = word.lower()
        words = self.exclude_stopwords(words)
        return words

    def exclude_stopwords(self, terms):
        """Exclude stopwords from the list of terms

        Args:
            terms (list): list of terms to be cleaned of stop words

        Returns:
            list: a list of str with stopwords removed
        """
        out_list = []
        for each in terms:
            if each not in self.stopwords:
                out_list.append(each)
        return out_list

    def count_words(self, file_path_name, words):
        """Count words in a file and store the frequency of each word in the
        term_freqs hash table.  The keys of the term_freqs hash table shall be
        words.  The values of the term_freqs hash table shall be hash tables
        (term_freqs is a hash table of hash tables).  The keys of the inner
        hash tables stored in term_freqs shall be file names.  Values of the
        inner hash tables shall be the frequencies of words.

        Args:
            file_path_name (str): the file name
            words (list): a list of words
        """
        self.doc_length.put(file_path_name, len(words))
        while len(words) > 0:
            current_word = words[0]
            word_freq = words.count(current_word)
            # Drop every occurrence of the counted word before moving on
            try:
                while True:
                    words.remove(current_word)
            except ValueError:
                pass
            # If the word is already in term_freqs, retrieve its document
            # frequency table; otherwise, create a new hash table
            if current_word in self.term_freqs:
                freq_hash = self.term_freqs.get(current_word)
            else:
                freq_hash = HashTableLinear()
            freq_hash.put(file_path_name, word_freq)
            self.term_freqs.put(current_word, freq_hash)

    def index_files(self, directory):
        """Processes a directory and makes an index of all the files

        Args:
            directory (str): the directory being indexed
        """
        dir_list = os.listdir(directory)
        full_dir_list = []
        for item in dir_list:
            full_dir_list.append(os.path.join(directory, item))
        file_list = []
        for index, item in enumerate(full_dir_list):
            if os.path.isfile(item):
                parts = os.path.splitext(item)
                if parts[1] == '.txt':
                    file_list.append(full_dir_list[index])
        # The list of .txt files in the directory is now in file_list
        self.file_list = file_list
        for file in file_list:
            str_list = self.read_file(file)
            words = self.parse_words(str_list)
            self.count_words(file, words)

    def get_wf(self, tf):
        """Computes the weighted frequency

        Args:
            tf (float): term frequency

        Returns:
            float: the weighted frequency
        """
        if tf > 0:
            wf = 1 + math.log(tf)
        else:
            wf = 0
        return wf

    def get_scores(self, terms):
        """Creates a list of scores for each file in the corpus

        The score = weighted frequency / the total word count in the file.

        Args:
            terms (list): a list of str

        Returns:
            list: a list of tuples, each containing the file_path_name and
            its relevancy score
        """
        scores = HashTableLinear()
        for term in terms:
            if not self.term_freqs.contains(term):
                continue  # a term absent from the corpus contributes no score
            word_hash_table = self.term_freqs.get(term)
            for file in self.file_list:
                if word_hash_table.contains(file):
                    if scores.contains(file):
                        scores[file] += self.get_wf(word_hash_table[file])
                    else:
                        scores[file] = self.get_wf(word_hash_table[file])
        score_list = []
        for file in self.file_list:
            if scores.contains(file) and scores[file] > 0:
                norm_score = scores[file] / self.doc_length[file]
                score_list.append((file, norm_score))
        return score_list

    def rank(self, scores):
        """Ranks files in the descending order of relevancy

        Args:
            scores (list): a list of tuples: (file_path_name, score)

        Returns:
            list: a list of tuples (file_path_name, score) sorted in
            descending order of relevancy
        """
        # Insertion sort with a descending comparison, so no reverse is needed
        for pos in range(1, len(scores)):
            cur_pos = pos
            while cur_pos > 0 and scores[cur_pos - 1][1] < scores[cur_pos][1]:
                temp = scores[cur_pos - 1]
                scores[cur_pos - 1] = scores[cur_pos]
                scores[cur_pos] = temp
                cur_pos -= 1
        return scores

    def search(self, query):
        """Search for the query terms in files

        Args:
            query (str): query input, e.g. "computer science"

        Returns:
            list: a list of tuples: (file_path_name, score) sorted in
            descending order of relevancy, excluding files whose relevancy
            score is 0.
        """
        terms = self.parse_words([query])
        cleaned_terms = []
        hash_terms = HashTableLinear()
        for term in terms:
            if not hash_terms.contains(term):
                cleaned_terms.append(term)
                hash_terms.put(term, term)
        scores = self.get_scores(cleaned_terms)
        scores = self.rank(scores)
        return scores

    def print_nice_results(self, scores):
        """Takes the output of the search method and prints the results in a
        more presentable form.

        Args:
            scores (list): list of (file_path_name, score) tuples returned by
                self.search
        """
        for each in scores:
            print(each[0] + ' ' + str(each[1]))
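# Rough usage sketch for the SearchEngine above.  The 'stop_words.txt' file,
# the 'docs' corpus directory, and the import_stopwords helper (exercised by
# the hash table tests earlier) are assumptions for illustration only.
if __name__ == '__main__':
    stopwords = import_stopwords('stop_words.txt', HashTableLinear())
    engine = SearchEngine('docs', stopwords)
    # search() lower-cases and de-duplicates the query terms, scores every
    # indexed .txt file, and returns (file_path_name, score) tuples in
    # descending order of relevancy.
    results = engine.search('computer science')
    engine.print_nice_results(results)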