def main():
    """Index dictionary words in an M-Tree, then answer nearest-word queries.

    The maximum number of words to index is read from ``sys.argv[1]``.
    Lines of ``DICT_FILE`` that start with ``%`` and blank lines are skipped;
    every other line is stripped, decoded as UTF-8 and added to an ``MTree``
    that uses ``word_distance`` as its distance function.

    After indexing, the user is prompted for words in a loop and the 10
    nearest indexed entries are printed for each one.  The loop ends cleanly
    on end-of-input or Ctrl-C at the prompt.

    Raises:
        IndexError: if no command-line argument was given.
        ValueError: if the argument is not an integer.
    """
    words_limit = int(sys.argv[1])
    mtree = MTree(distance_function=word_distance)
    loaded_words = 0

    t = Timer()
    sys.stdout.write('Indexing...')
    sys.stdout.flush()
    with open(DICT_FILE) as f:
        for line in f:
            # Check the limit before adding so that a limit of 0 (or less)
            # indexes nothing at all.
            if loaded_words >= words_limit:
                break
            stripped = line.strip()
            # Skip '%' lines and blank lines; a blank line would otherwise
            # be indexed as the empty word.
            if not stripped or line.startswith('%'):
                continue
            mtree.add(unicode(stripped, 'utf-8'))
            loaded_words += 1
            if loaded_words % 100 == 0:
                # '\r' rewrites the progress counter in place.
                sys.stdout.write('\r%d words indexed' % loaded_words)
                sys.stdout.flush()
    print('\r%d words indexed' % loaded_words)
    print('TIMES: %0.2fuser %0.2ftotal' % t.getTimes())
    print('')

    while True:
        try:
            typed = raw_input("Type a word: ")
        except (EOFError, KeyboardInterrupt):
            # End of input / Ctrl-C at the prompt: leave without a traceback.
            print('')
            break
        word = unicode(typed, 'utf-8')

        t = Timer()
        for near in mtree.get_nearest(word, limit=10):
            print('\t%d %s' % (near.distance, near.data))
        print('TIMES: %0.2fuser %0.2ftotal' % t.getTimes())
        print('')
def createMTree(dataVectorsStandarized):
    """Build an M-Tree by adding the given vectors one at a time.

    Each vector is converted with ``str()`` before being added to an
    ``MTree`` created with ``distanceMtree`` as distance function and
    ``min_node_capacity=50``.  The index of every vector is printed as it is
    processed.

    Insertion is best-effort: a vector whose ``add`` raises is skipped and
    the failure is logged as a warning instead of being silently discarded.
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.

    Args:
        dataVectorsStandarized: iterable of vectors to index.

    Returns:
        The populated ``MTree``.
    """
    import logging

    logger = logging.getLogger(__name__)
    myTree = MTree(distance_function=distanceMtree, min_node_capacity=50)
    for index, vector in enumerate(dataVectorsStandarized):
        print(index)
        try:
            myTree.add(str(vector))
        except Exception as exc:
            # Keep going, but leave a trace of what was dropped and why.
            logger.warning("Skipping vector %d: %s", index, exc)
    return myTree
def create_mtree(words, min_node_capacity):
    """Build an M-Tree from ``words`` and report how long it took.

    Progress messages go to ``sys.stderr``.  One tab-separated
    ``CREATE-MTREE`` record with the capacity, the total time and the
    average time per word goes to standard output.  Times are measured with
    ``timing()``.

    Args:
        words: iterable of items to add to the tree; may be empty.
        min_node_capacity: forwarded to ``MTree(min_node_capacity=...)``.

    Returns:
        The populated ``MTree``.
    """
    sys.stderr.write("Creating M-Tree with min_node_capacity=%r\n" % min_node_capacity)
    mtree = MTree(min_node_capacity=min_node_capacity,
                  distance_function=word_distance.word_distance)

    sys.stderr.write("Adding words...")
    # Pre-set the counter: with an empty ``words`` the loop never binds it.
    n = 0
    b = timing()
    for n, word in enumerate(words, 1):
        mtree.add(word)
        if n % 100 == 0:
            # '\r' rewrites the progress counter in place.
            sys.stderr.write("\r%r words added..." % n)
    e = timing()
    total_time = e - b
    sys.stderr.write("\n")

    # Avoid dividing by zero when nothing was added.
    avg_time = total_time / n if n else 0.0
    print("\t".join([
        "CREATE-MTREE",
        "min_node_capacity=%r" % min_node_capacity,
        "total_time=%r" % total_time,
        "avg_time=%r" % avg_time,
    ]))

    sys.stderr.write("M-Tree created\n")
    return mtree
def create_mtree(words, min_node_capacity):
    """Create an M-Tree holding ``words`` and emit timing statistics.

    Human-readable progress is written to ``sys.stderr``; a single
    tab-separated ``CREATE-MTREE`` line (capacity, total time, average time
    per word) is printed to standard output.  Both timestamps come from
    ``timing()``.

    Args:
        words: iterable of items to insert; an empty iterable is accepted.
        min_node_capacity: passed through to the ``MTree`` constructor.

    Returns:
        The ``MTree`` containing every item of ``words``.
    """
    sys.stderr.write("Creating M-Tree with min_node_capacity=%r\n" % min_node_capacity)
    mtree = MTree(min_node_capacity=min_node_capacity,
                  distance_function=word_distance.word_distance)

    sys.stderr.write("Adding words...")
    # ``n`` must exist even if the loop body never runs (empty ``words``).
    n = 0
    b = timing()
    for n, word in enumerate(words, 1):
        mtree.add(word)
        if n % 100 == 0:
            # Carriage return keeps the counter on a single terminal line.
            sys.stderr.write("\r%r words added..." % n)
    e = timing()
    total_time = e - b
    sys.stderr.write("\n")

    # No words means no meaningful average; report 0.0 instead of crashing.
    avg_time = total_time / n if n else 0.0
    print("\t".join([
        "CREATE-MTREE",
        "min_node_capacity=%r" % min_node_capacity,
        "total_time=%r" % total_time,
        "avg_time=%r" % avg_time,
    ]))

    sys.stderr.write("M-Tree created\n")
    return mtree
class Test(unittest.TestCase):
    """Fixture-driven tests for ``MTree`` add/remove and ``get_nearest``.

    Each test replays a sequence of add/remove actions against an ``MTree``
    while mirroring them in ``self.all_data``.  After every action, the
    range-limited and count-limited ``get_nearest`` results are compared
    against a brute-force scan of ``self.all_data``.
    """

    def setUp(self):
        """Create a deterministic tree whose add/remove are followed by ``_check()``."""

        # Removing randomness: always promote the smallest and largest object.
        def not_random_promotion(data_objects, distance_function):
            data_objects = sorted(data_objects)
            return data_objects[0], data_objects[-1]

        self.mtree = MTree(
            min_node_capacity=2,
            max_node_capacity=3,
            split_function=f.make_split_function(not_random_promotion,
                                                 f.balanced_partition)
        )

        def checked(unchecked_method):
            # Wrap a tree method so the tree's own ``_check()`` runs after
            # every call, whether the call succeeded or raised.
            def checked_method(*args, **kwargs):
                try:
                    return unchecked_method(*args, **kwargs)
                finally:
                    self.mtree._check()
            return checked_method

        self.mtree.add = checked(self.mtree.add)
        self.mtree.remove = checked(self.mtree.remove)

        # Mirror of the tree content, used as the brute-force reference.
        self.all_data = set()

    def testEmpty(self):
        self._check_nearest_by_range((1, 2, 3), 4)
        self._check_nearest_by_limit((1, 2, 3), 4)

    def test01(self):
        self._test('f01')

    def test02(self):
        self._test('f02')

    def test03(self):
        self._test('f03')

    def test04(self):
        self._test('f04')

    def test05(self):
        self._test('f05')

    def test06(self):
        self._test('f06')

    def test07(self):
        self._test('f07')

    def test08(self):
        self._test('f08')

    def test09(self):
        self._test('f09')

    def test10(self):
        self._test('f10')

    def test11(self):
        self._test('f11')

    def test12(self):
        self._test('f12')

    def test13(self):
        self._test('f13')

    def test14(self):
        self._test('f14')

    def test15(self):
        self._test('f15')

    def test16(self):
        self._test('f16')

    def test17(self):
        self._test('f17')

    def test18(self):
        self._test('f18')

    def test19(self):
        self._test('f19')

    def test20(self):
        self._test('f20')

    def testLots(self):
        self._test('fLots')

    def testRemoveNonExisting(self):
        """Removing an absent item raises ``KeyError`` whatever the tree size."""
        missing = (99, 77)

        # Empty
        self.assertRaises(KeyError, self.mtree.remove, missing)

        # With some items
        for present in ((4, 44), (95, 43), (76, 21), (64, 53), (47, 3), (26, 11)):
            self.mtree.add(present)
            self.assertRaises(KeyError, self.mtree.remove, missing)

    def testGeneratedCase01(self):
        self._test('fG01')

    def testGeneratedCase02(self):
        self._test('fG02')

    def testRandom(self):
        """Run the ``fRandom`` fixture, generating it first if it does not exist.

        A freshly generated fixture is deleted again when the test passes
        and left on disk when it fails, so the failure can be investigated.
        A fixture that already existed is never deleted here.
        """
        fixtures_path, _ = os.path.split(fixtures.__file__)
        random_test_path = os.path.join(fixtures_path, 'fRandom.py')

        if os.path.isfile(random_test_path):
            sys.stderr.write("WARNING: Using previously generated random test (fRandom).\n")
            generated = False
        else:
            # Random test doesn't exist. Generate it
            options = generator.Options(actions=500, dimensions=3, remove_chance=0.2)
            fixture = generator.generate_test_data(options)
            with open(random_test_path, 'w') as fixture_file:
                # The generator prints to sys.stdout, so redirect it to the
                # fixture file for the duration of the dump.
                stdout_bkp = sys.stdout
                sys.stdout = fixture_file
                try:
                    fixture_file.write("# Test case generated by testRandom().\n")
                    generator.print_test_data(fixture, options)
                finally:
                    sys.stdout = stdout_bkp
            generated = True

        try:
            self._test('fRandom')
        except Exception:
            sys.stderr.write("WARNING: The random test (fRandom) failed.\n")
            sys.stderr.write("Investigate it, fix MTree and then convert\n")
            sys.stderr.write("the random test to a permanent test case.\n")
            raise
        else:
            if generated:
                os.remove(random_test_path)
                # Also drop byte-code files left next to the fixture.
                for compiled_file in (random_test_path + 'c', random_test_path + 'o'):
                    if os.path.isfile(compiled_file):
                        os.remove(compiled_file)
            else:
                sys.stderr.write("ATTENTION: The previously existing random test\n")
                sys.stderr.write("has passed. Do want to delete it or convert to\n")
                sys.stderr.write("a permanent test case?\n")

    def _test(self, fixture_name):
        """Import ``fixtures.<fixture_name>`` and replay it."""
        # __import__ with a dotted name returns the top-level package, so the
        # fixture module is fetched from it as an attribute.
        package = __import__('fixtures.' + fixture_name)
        fixture = getattr(package, fixture_name)
        self._test_fixture(fixture)

    def _test_fixture(self, fixture):
        """Replay every action of ``fixture`` and verify queries after each one."""

        def callback(action):
            if isinstance(action, generator.ADD):
                self.assertNotIn(action.data, self.all_data)
                self.all_data.add(action.data)
                self.mtree.add(action.data)
            elif isinstance(action, generator.REMOVE):
                self.assertIn(action.data, self.all_data)
                self.all_data.remove(action.data)
                self.mtree.remove(action.data)
            else:
                self.fail("Unexpected action: %r" % (action.__class__,))

            self._check_nearest_by_range(action.query.data, action.query.radius)
            self._check_nearest_by_limit(action.query.data, action.query.limit)

        fixture.PERFORM(callback)

    def _check_nearest_by_range(self, query_data, radius):
        """Check ``get_nearest(range=radius)`` against a brute-force scan."""
        result = list(self.mtree.get_nearest(query_data, range=radius))

        previous_distance = None
        for item in result:
            data, distance = item

            # Check if increasing distance
            if previous_distance is not None:
                self.assertIsNotNone(distance)
                self.assertLessEqual(previous_distance, distance)
            previous_distance = distance

            # Check if every item in the results came from the generated query_data
            self.assertIn(data, self.all_data)
            self.assertIsInstance(item, MTree.ResultItem)

            # Check if every item in the results is within the range
            self.assertLessEqual(distance, radius)
            self.assertEqual(self.mtree.distance_function(data, query_data), distance)

        # The result must hold exactly the known items within the radius.
        stripped_result = [item.data for item in result]
        for data in self.all_data:
            dist = self.mtree.distance_function(data, query_data)
            if dist <= radius:
                self.assertIn(data, stripped_result)
            else:
                self.assertNotIn(data, stripped_result)

    def _check_nearest_by_limit(self, query_data, limit):
        """Check ``get_nearest(limit=limit)`` against a brute-force scan."""
        nearest_result = list(self.mtree.get_nearest(query_data, limit=limit))

        # Exactly ``limit`` items, or everything when fewer are stored.
        self.assertEqual(min(limit, len(self.all_data)), len(nearest_result))

        farthest = 0.0
        previous_distance = None
        for item in nearest_result:
            data, distance = item

            # Check if increasing distance
            if previous_distance is not None:
                self.assertIsNotNone(distance)
                self.assertLessEqual(previous_distance, distance)
            previous_distance = distance

            # Check if every item in the results came from the generated query_data
            self.assertIn(data, self.all_data)
            self.assertIsInstance(item, MTree.ResultItem)

            # Check if items are not repeated
            self.assertEqual(1, nearest_result.count(item))

            d = self.mtree.distance_function(data, query_data)
            self.assertEqual(d, distance)
            farthest = max(farthest, d)

        stripped_nearest_result = [item.data for item in nearest_result]
        for data in self.all_data:
            d = self.mtree.distance_function(data, query_data)
            if d < farthest:
                self.assertIn(data, stripped_nearest_result)
            elif d > farthest:
                self.assertNotIn(data, stripped_nearest_result)
            # d == farthest: ties at the cut-off may or may not be returned.
class Test(unittest.TestCase):
    """Fixture-driven tests for ``MTree`` add/remove and ``get_nearest``.

    Each test replays a sequence of add/remove actions against an ``MTree``
    while mirroring them in ``self.all_data``.  After every action, the
    range-limited and count-limited ``get_nearest`` results are compared
    against a brute-force scan of ``self.all_data``.
    """

    def setUp(self):
        """Create a deterministic tree whose add/remove are followed by ``_check()``."""

        # Removing randomness: always promote the smallest and largest object.
        def not_random_promotion(data_objects, distance_function):
            data_objects = sorted(data_objects)
            return data_objects[0], data_objects[-1]

        self.mtree = MTree(
            min_node_capacity=2,
            max_node_capacity=3,
            split_function=f.make_split_function(not_random_promotion,
                                                 f.balanced_partition)
        )

        def checked(unchecked_method):
            # Wrap a tree method so the tree's own ``_check()`` runs after
            # every call, whether the call succeeded or raised.
            def checked_method(*args, **kwargs):
                try:
                    return unchecked_method(*args, **kwargs)
                finally:
                    self.mtree._check()
            return checked_method

        self.mtree.add = checked(self.mtree.add)
        self.mtree.remove = checked(self.mtree.remove)

        # Mirror of the tree content, used as the brute-force reference.
        self.all_data = set()

    def testEmpty(self):
        self._check_nearest_by_range((1, 2, 3), 4)
        self._check_nearest_by_limit((1, 2, 3), 4)

    def test01(self):
        self._test('f01')

    def test02(self):
        self._test('f02')

    def test03(self):
        self._test('f03')

    def test04(self):
        self._test('f04')

    def test05(self):
        self._test('f05')

    def test06(self):
        self._test('f06')

    def test07(self):
        self._test('f07')

    def test08(self):
        self._test('f08')

    def test09(self):
        self._test('f09')

    def test10(self):
        self._test('f10')

    def test11(self):
        self._test('f11')

    def test12(self):
        self._test('f12')

    def test13(self):
        self._test('f13')

    def test14(self):
        self._test('f14')

    def test15(self):
        self._test('f15')

    def test16(self):
        self._test('f16')

    def test17(self):
        self._test('f17')

    def test18(self):
        self._test('f18')

    def test19(self):
        self._test('f19')

    def test20(self):
        self._test('f20')

    def testLots(self):
        self._test('fLots')

    def testRemoveNonExisting(self):
        """Removing an absent item raises ``KeyError`` whatever the tree size."""
        missing = (99, 77)

        # Empty
        self.assertRaises(KeyError, self.mtree.remove, missing)

        # With some items
        for present in ((4, 44), (95, 43), (76, 21), (64, 53), (47, 3), (26, 11)):
            self.mtree.add(present)
            self.assertRaises(KeyError, self.mtree.remove, missing)

    def testGeneratedCase01(self):
        self._test('fG01')

    def testGeneratedCase02(self):
        self._test('fG02')

    def testRandom(self):
        """Run the ``fRandom`` fixture, generating it first if it does not exist.

        A freshly generated fixture is deleted again when the test passes
        and left on disk when it fails, so the failure can be investigated.
        A fixture that already existed is never deleted here.
        """
        fixtures_path, _ = os.path.split(fixtures.__file__)
        random_test_path = os.path.join(fixtures_path, 'fRandom.py')

        if os.path.isfile(random_test_path):
            sys.stderr.write("WARNING: Using previously generated random test (fRandom).\n")
            generated = False
        else:
            # Random test doesn't exist. Generate it
            options = generator.Options(actions=500, dimensions=3, remove_chance=0.2)
            fixture = generator.generate_test_data(options)
            with open(random_test_path, 'w') as fixture_file:
                # The generator prints to sys.stdout, so redirect it to the
                # fixture file for the duration of the dump.
                stdout_bkp = sys.stdout
                sys.stdout = fixture_file
                try:
                    fixture_file.write("# Test case generated by testRandom().\n")
                    generator.print_test_data(fixture, options)
                finally:
                    sys.stdout = stdout_bkp
            generated = True

        try:
            self._test('fRandom')
        except Exception:
            sys.stderr.write("WARNING: The random test (fRandom) failed.\n")
            sys.stderr.write("Investigate it, fix MTree and then convert\n")
            sys.stderr.write("the random test to a permanent test case.\n")
            raise
        else:
            if generated:
                os.remove(random_test_path)
                # Also drop byte-code files left next to the fixture.
                for compiled_file in (random_test_path + 'c', random_test_path + 'o'):
                    if os.path.isfile(compiled_file):
                        os.remove(compiled_file)
            else:
                sys.stderr.write("ATTENTION: The previously existing random test\n")
                sys.stderr.write("has passed. Do want to delete it or convert to\n")
                sys.stderr.write("a permanent test case?\n")

    def _test(self, fixture_name):
        """Import ``fixtures.<fixture_name>`` and replay it."""
        # __import__ with a dotted name returns the top-level package, so the
        # fixture module is fetched from it as an attribute.
        package = __import__('fixtures.' + fixture_name)
        fixture = getattr(package, fixture_name)
        self._test_fixture(fixture)

    def _test_fixture(self, fixture):
        """Replay every action of ``fixture`` and verify queries after each one."""

        def callback(action):
            if isinstance(action, generator.ADD):
                self.assertNotIn(action.data, self.all_data)
                self.all_data.add(action.data)
                self.mtree.add(action.data)
            elif isinstance(action, generator.REMOVE):
                self.assertIn(action.data, self.all_data)
                self.all_data.remove(action.data)
                self.mtree.remove(action.data)
            else:
                self.fail("Unexpected action: %r" % (action.__class__,))

            self._check_nearest_by_range(action.query.data, action.query.radius)
            self._check_nearest_by_limit(action.query.data, action.query.limit)

        fixture.PERFORM(callback)

    def _check_nearest_by_range(self, query_data, radius):
        """Check ``get_nearest(range=radius)`` against a brute-force scan."""
        result = list(self.mtree.get_nearest(query_data, range=radius))

        previous_distance = None
        for item in result:
            data, distance = item

            # Check if increasing distance
            if previous_distance is not None:
                self.assertIsNotNone(distance)
                self.assertLessEqual(previous_distance, distance)
            previous_distance = distance

            # Check if every item in the results came from the generated query_data
            self.assertIn(data, self.all_data)
            self.assertIsInstance(item, MTree.ResultItem)

            # Check if every item in the results is within the range
            self.assertLessEqual(distance, radius)
            self.assertEqual(self.mtree.distance_function(data, query_data), distance)

        # The result must hold exactly the known items within the radius.
        stripped_result = [item.data for item in result]
        for data in self.all_data:
            dist = self.mtree.distance_function(data, query_data)
            if dist <= radius:
                self.assertIn(data, stripped_result)
            else:
                self.assertNotIn(data, stripped_result)

    def _check_nearest_by_limit(self, query_data, limit):
        """Check ``get_nearest(limit=limit)`` against a brute-force scan."""
        nearest_result = list(self.mtree.get_nearest(query_data, limit=limit))

        # Exactly ``limit`` items, or everything when fewer are stored.
        self.assertEqual(min(limit, len(self.all_data)), len(nearest_result))

        farthest = 0.0
        previous_distance = None
        for item in nearest_result:
            data, distance = item

            # Check if increasing distance
            if previous_distance is not None:
                self.assertIsNotNone(distance)
                self.assertLessEqual(previous_distance, distance)
            previous_distance = distance

            # Check if every item in the results came from the generated query_data
            self.assertIn(data, self.all_data)
            self.assertIsInstance(item, MTree.ResultItem)

            # Check if items are not repeated
            self.assertEqual(1, nearest_result.count(item))

            d = self.mtree.distance_function(data, query_data)
            self.assertEqual(d, distance)
            farthest = max(farthest, d)

        stripped_nearest_result = [item.data for item in nearest_result]
        for data in self.all_data:
            d = self.mtree.distance_function(data, query_data)
            if d < farthest:
                self.assertIn(data, stripped_nearest_result)
            elif d > farthest:
                self.assertNotIn(data, stripped_nearest_result)
            # d == farthest: ties at the cut-off may or may not be returned.