Ejemplo n.º 1
0
 def get_scores(self, terms):
     """Compute a relevancy score for every file matching any of the terms.

     For each term found in ``self.term_freqs``, the weighted frequency
     returned by ``self.get_wf`` is added to the running total of every
     file listed for that term. Each total is then divided by the file's
     entry in ``self.doc_length``.

     Arguments:
         terms (list): A list of str
     Returns:
         list: a list of tuples, each containing the file_path_name and
               its relevancy score
     """
     totals = HashTable()
     for term in terms:
         # Terms missing from self.term_freqs contribute nothing.
         if term not in self.term_freqs:
             continue
         for path in self.term_freqs[term].keys():
             weight = self.get_wf(self.term_freqs[term][path])
             if path in totals:
                 totals[path] += weight
             else:
                 totals[path] = weight

     ranked = []
     for path in totals.keys():
         # Normalise the summed weights by the file's recorded length.
         totals[path] /= self.doc_length[path]
         ranked.append((path, totals[path]))
     return ranked
Ejemplo n.º 2
0
 def test_sepchain2(self):
     """Insert 20 single-character keys and check size, load and membership."""
     ht = HashTableSepchain()
     for i in range(20):
         ht.put(chr(i), i)
     self.assertEqual(ht.size(), 20)
     # assertLessEqual reports the actual load factor if the check fails.
     self.assertLessEqual(ht.load_factor(), 1.0)
     self.assertTrue(ht.contains(chr(0)))
     self.assertTrue(ht.contains(chr(1)))
     self.assertTrue(ht.contains(chr(19)))
     # chr(20) was never inserted: range(20) stops at 19.
     self.assertFalse(ht.contains(chr(20)))
Ejemplo n.º 3
0
 def test_sepchain1(self):
     """Insert 11 keys and check size, load factor and membership."""
     ht = HashTableSepchain()
     for i in range(11):
         ht.put(str(i), i)
     self.assertEqual(ht.size(), 11)
     self.assertEqual(ht.load_factor(), 1.0)
     self.assertTrue(ht.contains('0'))
     self.assertTrue(ht.contains('1'))
     self.assertTrue(ht.contains('10'))
     # '11' was never inserted: range(11) stops at 10.
     self.assertFalse(ht.contains('11'))
Ejemplo n.º 4
0
 def search(self, query):
     """Search for the query items in files.

     The query is lower-cased and split on any run of whitespace, so
     repeated spaces, tabs, or an empty query never produce empty terms.
     Duplicate terms are collapsed before scoring.

     Arguments:
         query (str): query input: e.g. "Computer Science"
     Returns:
          list: a list of tuples: (file_path_name, score) sorted
                in descending order of relevancy, excluding files whose
                relevancy score is 0
     """
     unique_terms = HashTable()
     # split() with no separator drops leading/trailing whitespace and
     # never yields '' entries, unlike split(' ').
     for term in query.lower().split():
         if term not in unique_terms:
             # Only the keys matter; the value is a placeholder.
             unique_terms[term] = 1
     return self.rank(self.get_scores(unique_terms.keys()))
Ejemplo n.º 5
0
 def test_sepchain6(self):
     """Insert 22 keys, read some back, then remove them all."""
     ht = HashTableSepchain()
     for i in range(22):
         ht.put(chr(i), i)
     self.assertEqual(ht.size(), 22)
     self.assertEqual(ht[chr(0)], 0)
     self.assertEqual(ht[chr(1)], 1)
     self.assertEqual(ht[chr(19)], 19)
     for i in range(22):
         ht.remove(chr(i))
     # After removing every key, none of the sampled keys may remain.
     self.assertFalse(ht.contains(chr(0)))
     self.assertFalse(ht.contains(chr(1)))
     self.assertFalse(ht.contains(chr(19)))
Ejemplo n.º 6
0
 def test_sepchain5(self):
     """Load the stop-word file into a separate-chaining table and probe it."""
     ht = HashTableSepchain()
     stop_words = import_stopwords(FILE, ht)
     self.assertEqual(stop_words.size(), 305)
     # Dedicated assertions show the offending values when a check fails.
     self.assertLessEqual(stop_words.load_factor(), 1.5)
     self.assertNotIn("collision", stop_words)
     self.assertIn("very", stop_words)
     self.assertNotIn("linear", stop_words)
     self.assertIn("a", stop_words)
Ejemplo n.º 7
0
 def test_import_stopwords(self):
     """Load the stop-word file into each hash table variant and probe it."""
     table_types = (HashTableSepchain, HashTableLinear, HashTableQuadratic)
     for make_table in table_types:
         loaded = import_stopwords("stop_words.txt", make_table())
         # The test expects a stop word to be stored as its own value.
         self.assertEqual(loaded["unless"], "unless")
         # A word that is not in the table must raise KeyError on get().
         self.assertRaises(KeyError, loaded.get, "Parth")
Ejemplo n.º 8
0
 def test_hash_sepchain(self):
     """Exercise item access, overwrite, collisions and removal."""
     table = HashTableSepchain()
     self.assertEqual(table.table_size, 11)
     table["3"] = "3"
     table["2"] = "2"
     table["4"] = "4"
     table["5"] = "5"
     # assertIn/assertNotIn name the key and container on failure.
     self.assertIn("5", table)
     self.assertNotIn("6", table)
     self.assertRaises(KeyError, table.get, "6")
     # Assigning to an existing key overwrites its value.
     table["3"] = "6"
     self.assertEqual(table["3"], "6")
     # chr(40) is "("; adding it must not disturb the value stored at "3".
     table[chr(40)] = "20"
     self.assertEqual(table["3"], "6")
     # Exactly one collision is expected to have been recorded by now.
     self.assertEqual(table.num_collisions, 1)
     table.remove("3")
     table.remove("4")
     # A removed key can be neither fetched nor removed a second time.
     self.assertRaises(KeyError, table.get, "4")
     self.assertRaises(KeyError, table.remove, "4")
Ejemplo n.º 9
0
 def test_sepchain4(self):
     """Fill the table with int(47 * 1.5) - 1 keys and check size and load."""
     table = HashTableSepchain()
     expected_count = int(47 * 1.5) - 1
     for code in range(expected_count):
         table.put(chr(code), code)
     self.assertEqual(table.size(), expected_count)
     # The load factor must end up strictly above 1.0 and at most 1.5.
     load = table.load_factor()
     self.assertTrue(1.0 < load <= 1.5)
Ejemplo n.º 10
0
    def test_sepchain3(self):
        """Insert 34 keys and check the size and the resulting load factor."""
        table = HashTableSepchain()
        for code in range(34):
            table.put(chr(code), code)

        self.assertEqual(table.size(), 34)
        # The load factor must end up strictly above 1.0 and at most 1.5.
        load = table.load_factor()
        self.assertTrue(1.0 < load <= 1.5)
Ejemplo n.º 11
0
    def test_sepchain6(self):
        """Insert 22 keys, read some back, remove all, and check KeyErrors."""
        table = HashTableSepchain()
        codes = range(22)
        sampled = (0, 1, 19)

        for code in codes:
            table.put(chr(code), code)
        self.assertEqual(table.size(), 22)
        for code in sampled:
            self.assertEqual(table[chr(code)], code)

        # 'a' is chr(97), which is outside the inserted range.
        self.assertRaises(KeyError, table.get, 'a')

        for code in codes:
            table.remove(chr(code))
        for code in sampled:
            self.assertFalse(table.contains(chr(code)))

        # Removing a key that was never stored must also raise KeyError.
        self.assertRaises(KeyError, table.remove, 'a')
Ejemplo n.º 12
0
    def test_HashTableSepChain(self):
        """Walk a separate-chaining table through put, get, remove, re-insert."""
        table = HashTableSepchain()

        # A fresh table is empty and raises KeyError for a missing key.
        self.assertEqual(table.size(), 0)
        self.assertFalse(table.contains('us'))
        self.assertRaises(KeyError, table.get, 'us')

        # First insertion: readable via get() and indexing, no collision.
        table.put('us', 'us')
        self.assertEqual(table.get('us'), 'us')
        self.assertEqual(table['us'], 'us')
        self.assertTrue(table.contains('us'))
        self.assertFalse(table.contains('say'))
        self.assertEqual(table.size(), 1)
        self.assertEqual(table.collisions(), 0)

        # Second insertion: the test expects one collision to be recorded.
        table.put('say', 'say')
        self.assertEqual(table.get('say'), 'say')
        self.assertTrue(table.contains('say'))
        self.assertEqual(table.size(), 2)
        self.assertEqual(table.collisions(), 1)

        # Removing one key must leave the other one in place.
        table.remove('say')
        self.assertFalse(table.contains('say'))
        self.assertTrue(table.contains('us'))
        table.remove('us')
        self.assertEqual(table.size(), 0)

        # Re-insert after emptying the table, including the NUL character key.
        table.put('us', 'us')
        table.put('say', 'say')
        table.put('the', 'the')
        table.put(chr(0), chr(0))

        self.assertTrue(table.contains('us'))
        self.assertTrue(table.contains('the'))
        self.assertTrue(table.contains(chr(0)))