def get_scores(self, terms):
    """Compute a length-normalised relevancy score per file.

    For every term present in ``self.term_freqs``, the frequency stored
    for each file under that term is converted with ``self.get_wf`` and
    accumulated per file.  Each accumulated total is then divided by the
    file's entry in ``self.doc_length``.

    Arguments:
        terms (list): A list of str

    Returns:
        list: a list of ``(file_path_name, score)`` tuples, one for each
        file key found under any of the given terms
    """
    totals = HashTable()
    for word in terms:
        # Terms missing from the index contribute nothing.
        if word not in self.term_freqs:
            continue
        postings = self.term_freqs[word]
        for path in postings.keys():
            weight = self.get_wf(postings[path])
            if path in totals:
                totals[path] += weight
            else:
                totals[path] = weight

    ranked = []
    for path in totals.keys():
        # Normalise the summed weight by the file's recorded length.
        totals[path] /= self.doc_length[path]
        ranked.append((path, totals[path]))
    return ranked
def test_sepchain2(self):
    """Insert 20 single-character keys and check size, load factor and lookups.

    Keys are ``chr(0)`` .. ``chr(19)``; ``chr(20)`` is never inserted and
    must be reported as absent.
    """
    ht = HashTableSepchain()
    for i in range(20):
        ht.put(chr(i), i)
    self.assertEqual(ht.size(), 20)
    # assertLessEqual reports both operands on failure, unlike assertTrue.
    self.assertLessEqual(ht.load_factor(), 1.0)
    self.assertTrue(ht.contains(chr(0)))
    self.assertTrue(ht.contains(chr(1)))
    self.assertTrue(ht.contains(chr(19)))
    self.assertFalse(ht.contains(chr(20)))
def test_sepchain1(self):
    """Insert 11 string keys and expect a load factor of exactly 1.0.

    Keys are ``'0'`` .. ``'10'``; ``'11'`` is never inserted and must be
    reported as absent.
    """
    ht = HashTableSepchain()
    for i in range(11):
        ht.put(str(i), i)
    self.assertEqual(ht.size(), 11)
    self.assertEqual(ht.load_factor(), 1.0)
    self.assertTrue(ht.contains('0'))
    self.assertTrue(ht.contains('1'))
    self.assertTrue(ht.contains('10'))
    self.assertFalse(ht.contains('11'))
def search(self, query):
    """Search for the query items in files.

    The query is lower-cased and split on runs of whitespace, so leading,
    trailing or repeated whitespace never produces an empty search term.
    Repeated terms are collapsed so each distinct term is scored once.

    Arguments:
        query (str): query input: e.g. "Computer Science"

    Returns:
        The value of ``self.rank`` applied to ``self.get_scores`` for the
        distinct query terms.
        NOTE(review): the original docstring describes this as a list of
        ``(file_path_name, score)`` tuples sorted in descending order of
        relevancy, excluding files whose score is 0 -- confirm against
        ``rank``.
    """
    unique_terms = HashTable()
    # split() with no argument splits on any whitespace run and drops empty
    # strings; split(' ') would yield '' for "a  b" and [''] for "".
    for term in query.lower().split():
        if term not in unique_terms:
            unique_terms[term] = 1
    return self.rank(self.get_scores(unique_terms.keys()))
def test_sepchain6(self):
    """Insert 22 keys, read a few back by index, then remove them all.

    NOTE(review): a second method named ``test_sepchain6`` appears later in
    this file.  If both are defined in the same TestCase class, the later
    definition replaces this one and this test never runs -- consider
    renaming one of them.
    """
    ht = HashTableSepchain()
    for i in range(22):
        ht.put(chr(i), i)
    self.assertEqual(ht.size(), 22)
    self.assertEqual(ht[chr(0)], 0)
    self.assertEqual(ht[chr(1)], 1)
    self.assertEqual(ht[chr(19)], 19)
    for i in range(22):
        ht.remove(chr(i))
    # Spot-check that removed keys are no longer reported as present.
    self.assertFalse(ht.contains(chr(0)))
    self.assertFalse(ht.contains(chr(1)))
    self.assertFalse(ht.contains(chr(19)))
def test_sepchain5(self):
    """Load the stop-word file into a separate-chaining table and spot-check it.

    Expects 305 entries, a load factor of at most 1.5, and the listed
    membership results for a handful of words.
    """
    table = HashTableSepchain()
    stop_words = import_stopwords(FILE, table)
    self.assertEqual(stop_words.size(), 305)
    self.assertLessEqual(stop_words.load_factor(), 1.5)
    self.assertNotIn("collision", stop_words)
    self.assertIn("very", stop_words)
    self.assertNotIn("linear", stop_words)
    self.assertIn("a", stop_words)
def test_import_stopwords(self):
    """Check ``import_stopwords`` against each hash-table implementation.

    For every table type, the key ``"unless"`` must map to itself and
    ``get`` must raise KeyError for a key expected to be absent.
    """
    for make_table in (HashTableSepchain, HashTableLinear, HashTableQuadratic):
        stop_words = import_stopwords("stop_words.txt", make_table())
        self.assertEqual(stop_words["unless"], "unless")
        with self.assertRaises(KeyError):
            stop_words.get("Parth")
def test_hash_sepchain(self):
    """Exercise item assignment, membership, overwrite and removal on one table."""
    table = HashTableSepchain()
    self.assertEqual(table.table_size, 11)

    for key in ("3", "2", "4", "5"):
        table[key] = key
    self.assertIn("5", table)
    self.assertNotIn("6", table)
    with self.assertRaises(KeyError):
        table.get("6")

    # Assigning to an existing key replaces its value.
    table["3"] = "6"
    self.assertEqual(table["3"], "6")

    # Adding chr(40) must leave "3" intact; one collision is expected so far.
    table[chr(40)] = "20"
    self.assertEqual(table["3"], "6")
    self.assertEqual(table.num_collisions, 1)

    for key in ("3", "4"):
        table.remove(key)
    # A removed key must raise for both lookup and a second removal.
    with self.assertRaises(KeyError):
        table.get("4")
    with self.assertRaises(KeyError):
        table.remove("4")
def test_sepchain4(self):
    """Insert ``int(47 * 1.5) - 1`` keys and expect a load factor in (1.0, 1.5]."""
    num_items = int(47 * 1.5) - 1
    ht = HashTableSepchain()
    for code in range(num_items):
        ht.put(chr(code), code)
    self.assertEqual(ht.size(), num_items)
    # Query the load factor once and check both bounds against that value.
    load = ht.load_factor()
    self.assertGreater(load, 1.0)
    self.assertLessEqual(load, 1.5)
def test_sepchain3(self):
    """Insert 34 keys and expect a load factor in (1.0, 1.5]."""
    expected_count = 34
    ht = HashTableSepchain()
    for code in range(expected_count):
        ht.put(chr(code), code)
    self.assertEqual(ht.size(), expected_count)
    # Query the load factor once and check both bounds against that value.
    load = ht.load_factor()
    self.assertGreater(load, 1.0)
    self.assertLessEqual(load, 1.5)
def test_sepchain6(self):
    """Insert 22 keys, read some back, then remove them all.

    Also checks that ``get`` and ``remove`` raise KeyError for ``'a'``,
    which is never inserted.

    NOTE(review): an earlier method in this file is also named
    ``test_sepchain6``; if both share a class, only this one runs.
    """
    sampled = (0, 1, 19)
    ht = HashTableSepchain()
    for code in range(22):
        ht.put(chr(code), code)
    self.assertEqual(ht.size(), 22)
    for code in sampled:
        self.assertEqual(ht[chr(code)], code)
    with self.assertRaises(KeyError):
        ht.get('a')

    for code in range(22):
        ht.remove(chr(code))
    for code in sampled:
        self.assertFalse(ht.contains(chr(code)))
    with self.assertRaises(KeyError):
        ht.remove('a')
def test_HashTableSepChain(self):
    """Walk a separate-chaining table through put/get/contains/remove.

    Covers the empty table, one and two entries (including the collision
    counter), removal back to empty, and re-population.
    """
    table = HashTableSepchain()

    # Empty table.
    self.assertEqual(table.size(), 0)
    self.assertFalse(table.contains('us'))
    with self.assertRaises(KeyError):
        table.get('us')

    # One entry.
    table.put('us', 'us')
    self.assertEqual(table.get('us'), 'us')
    self.assertEqual(table['us'], 'us')
    self.assertTrue(table.contains('us'))
    self.assertFalse(table.contains('say'))
    self.assertEqual(table.size(), 1)
    self.assertEqual(table.collisions(), 0)

    # Second entry; the test expects it to register exactly one collision.
    table.put('say', 'say')
    self.assertEqual(table.get('say'), 'say')
    self.assertTrue(table.contains('say'))
    self.assertEqual(table.size(), 2)
    self.assertEqual(table.collisions(), 1)

    # Remove both entries, checking the survivor in between.
    table.remove('say')
    self.assertFalse(table.contains('say'))
    self.assertTrue(table.contains('us'))
    table.remove('us')
    self.assertEqual(table.size(), 0)

    # Re-populate, including the NUL character as a key.
    for word in ('us', 'say', 'the', chr(0)):
        table.put(word, word)
    for word in ('us', 'the', chr(0)):
        self.assertTrue(table.contains(word))