コード例 #1
0
ファイル: word_distance.py プロジェクト: wjcskqygj2015/M-Tree
def main():
	#print word_distance('gol', 'bola')
	#print word_distance(*sys.argv[1:])
	
	words_limit = int(sys.argv[1])
	
	mtree = MTree(distance_function=word_distance)
	
	loaded_words = 0
	t = Timer()
	print 'Indexing...', ; sys.stdout.flush()
	with open(DICT_FILE) as f:
		for line in f:
			if line[0] != '%':
				word = unicode(line.strip(), 'utf-8')
				#print "Adding %r (%s)" % (word, word)
				mtree.add(word)
				loaded_words += 1
				if loaded_words >= words_limit:
					break
				if loaded_words % 100 == 0:
					print '\r%d words indexed' % loaded_words, ; sys.stdout.flush()
	print '\r%d words indexed' % loaded_words
	times = t.getTimes()
	print 'TIMES: %0.2fuser %0.2ftotal' % times
	print 
	
	while True:
		word = unicode(raw_input("Type a word: "), 'utf-8')
		t = Timer()
		for near in mtree.get_nearest(word, limit=10):
			print '\t%d %s' % (near.distance, near.data)
		times = t.getTimes()
		print 'TIMES: %0.2fuser %0.2ftotal' % times
		print
コード例 #2
0
ファイル: test_mtree.py プロジェクト: wjcskqygj2015/M-Tree
    def setUp(self):
        """Create a small deterministic MTree and an empty reference set.

        The tree's ``add`` and ``remove`` are wrapped so that
        ``self.mtree._check()`` runs after every call.
        """

        def not_random_promotion(data_objects, distance_function):
            # Deterministic replacement for random promotion: pick the first
            # and last objects in sort order.
            ordered = sorted(data_objects)
            return ordered[0], ordered[-1]

        split_function = f.make_split_function(not_random_promotion,
                                               f.balanced_partition)
        self.mtree = MTree(min_node_capacity=2,
                           max_node_capacity=3,
                           split_function=split_function)

        def with_check(unchecked_method):
            # The finally clause makes _check() run whether the wrapped call
            # returns or raises.
            def checked_method(*args, **kwargs):
                try:
                    return unchecked_method(*args, **kwargs)
                finally:
                    self.mtree._check()

            return checked_method

        for method_name in ('add', 'remove'):
            unchecked = getattr(self.mtree, method_name)
            setattr(self.mtree, method_name, with_check(unchecked))

        self.all_data = set()
コード例 #3
0
def createMTree(dataVectorsStandarized):
    """
        Add 1 by 1 all the vectors in the M-Tree.

        Each vector is inserted as ``str(vector)`` and its index is printed
        as a progress trace.  Insertion stays best-effort: a vector whose
        ``add`` raises is reported on stderr and skipped, so the remaining
        vectors are still indexed.

        Returns the populated tree.
    """
    import sys  # local import: only needed for the skip report below

    myTree = MTree(distance_function=distanceMtree, min_node_capacity=50)
    for index, vector in enumerate(dataVectorsStandarized):
        print(index)
        try:
            myTree.add(str(vector))
        except Exception as error:
            # Catch Exception rather than everything so that
            # KeyboardInterrupt/SystemExit still stop the build, and leave a
            # trace of which vector was dropped instead of hiding it.
            sys.stderr.write("skipped vector %d: %r\n" % (index, error))
    return myTree
コード例 #4
0
ファイル: test_mtree.py プロジェクト: erdavila/M-Tree
	def setUp(self):
		"""Create a small deterministic MTree and an empty reference set.

		The tree's ``add`` and ``remove`` are wrapped so that
		``self.mtree._check()`` runs after every call.
		"""
		
		def not_random_promotion(data_objects, distance_function):
			# Deterministic replacement for random promotion: pick the first
			# and last objects in sort order.
			ordered = sorted(data_objects)
			return ordered[0], ordered[-1]
		
		split_function = f.make_split_function(not_random_promotion, f.balanced_partition)
		self.mtree = MTree(
				min_node_capacity=2,
				max_node_capacity=3,
				split_function=split_function
			)
		
		def with_check(unchecked_method):
			# The finally clause makes _check() run whether the wrapped call
			# returns or raises.
			def checked_method(*args, **kwargs):
				try:
					return unchecked_method(*args, **kwargs)
				finally:
					self.mtree._check()
			return checked_method
		
		for method_name in ('add', 'remove'):
			unchecked = getattr(self.mtree, method_name)
			setattr(self.mtree, method_name, with_check(unchecked))
		
		self.all_data = set()
コード例 #5
0
def getNeirestNeighbors(mtree:MTree, dataVectors:list,K):
    """
        Query the M-Tree for the K nearest entries of every data vector.

        Each vector is looked up by its string form and its index is printed
        as a progress trace.  For every vector the result holds element [1]
        of each hit (presumably the distance -- confirm against the MTree
        result type), in the order the tree yields them.
    """
    queries=[]
    for position,dataVector in enumerate(dataVectors):
        print(position)
        hits=mtree.get_nearest(str(dataVector),limit=K)
        queries.append([hit[1] for hit in hits])
    return queries
コード例 #6
0
ファイル: stats.py プロジェクト: erdavila/M-Tree
def create_mtree(words, min_node_capacity):
	print >>sys.stderr, "Creating M-Tree with min_node_capacity=%r" % min_node_capacity
	mtree = MTree(min_node_capacity=min_node_capacity, distance_function = word_distance.word_distance)
	print >>sys.stderr, "Adding words...",
	b = timing()
	for n, word in enumerate(words, 1):
		mtree.add(word)
		if n % 100 == 0:
			print >>sys.stderr, "\r%r words added..." % n,
	e = timing()
	total_time = e - b
	print >>sys.stderr
	print "\t".join([
			"CREATE-MTREE",
			"min_node_capacity=%r" % min_node_capacity,
			"total_time=%r" % total_time,
			"avg_time=%r" % (total_time / n),
	])
	
	print >>sys.stderr, "M-Tree created"
	return mtree
コード例 #7
0
def calculateQueries(mtree:MTree, dataVectors:list,K,R):
    """
        Run a nearest-neighbour query in the M-Tree for every data vector,
        bounded by range R and limit K.

        Intended for the case where no previously calculated queries exist,
        or the stored ones are not compatible with the given k and r.
        Each vector is looked up by its string form and its index is printed
        as a progress trace; the result holds element [1] of every hit for
        each vector.
    """
    queries=[]
    for position,dataVector in enumerate(dataVectors):
        print(position)
        hits=mtree.get_nearest(str(dataVector),range=R,limit=K)
        queries.append([hit[1] for hit in hits])
    return queries
コード例 #8
0
def create_mtree(words, min_node_capacity):
    print >> sys.stderr, "Creating M-Tree with min_node_capacity=%r" % min_node_capacity
    mtree = MTree(min_node_capacity=min_node_capacity,
                  distance_function=word_distance.word_distance)
    print >> sys.stderr, "Adding words...",
    b = timing()
    for n, word in enumerate(words, 1):
        mtree.add(word)
        if n % 100 == 0:
            print >> sys.stderr, "\r%r words added..." % n,
    e = timing()
    total_time = e - b
    print >> sys.stderr
    print "\t".join([
        "CREATE-MTREE",
        "min_node_capacity=%r" % min_node_capacity,
        "total_time=%r" % total_time,
        "avg_time=%r" % (total_time / n),
    ])

    print >> sys.stderr, "M-Tree created"
    return mtree
コード例 #9
0
ファイル: test_mtree.py プロジェクト: erdavila/M-Tree
class Test(unittest.TestCase):
	
	def setUp(self):
		
		# Removing randomness
		def not_random_promotion(data_objects, distance_function):
			data_objects = sorted(data_objects)
			return data_objects[0], data_objects[-1]
		
		
		self.mtree = MTree(
				min_node_capacity=2,
				max_node_capacity=3,
				split_function=f.make_split_function(not_random_promotion, f.balanced_partition)
			)
		
		def checked(unchecked_method):
			def checked_method(*args, **kwargs):
				try:
					return unchecked_method(*args, **kwargs)
				finally:
					self.mtree._check()
			return checked_method
		
		self.mtree.add = checked(self.mtree.add)
		self.mtree.remove = checked(self.mtree.remove)
		
		self.all_data = set()
	
	
	
	def testEmpty(self):
		self._check_nearest_by_range((1, 2, 3), 4)
		self._check_nearest_by_limit((1, 2, 3), 4)
	
	def test01(self):  self._test('f01')
	def test02(self):  self._test('f02')
	def test03(self):  self._test('f03')
	def test04(self):  self._test('f04')
	def test05(self):  self._test('f05')
	def test06(self):  self._test('f06')
	def test07(self):  self._test('f07')
	def test08(self):  self._test('f08')
	def test09(self):  self._test('f09')
	def test10(self):  self._test('f10')
	def test11(self):  self._test('f11')
	def test12(self):  self._test('f12')
	def test13(self):  self._test('f13')
	def test14(self):  self._test('f14')
	def test15(self):  self._test('f15')
	def test16(self):  self._test('f16')
	def test17(self):  self._test('f17')
	def test18(self):  self._test('f18')
	def test19(self):  self._test('f19')
	def test20(self):  self._test('f20')
	
	def testLots(self):  self._test('fLots')
	
	
	def testRemoveNonExisting(self):
		# Empty
		self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))
		
		# With some items
		self.mtree.add((4, 44))
		self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))
		
		self.mtree.add((95, 43))
		self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))
		
		self.mtree.add((76, 21))
		self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))
		
		self.mtree.add((64, 53))
		self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))
		
		self.mtree.add((47, 3))
		self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))
		
		self.mtree.add((26, 11))
		self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))
	
	
	def testGeneratedCase01(self): self._test('fG01')
	def testGeneratedCase02(self): self._test('fG02')
	
	
	def testRandom(self):
		fixtures_path, _ = os.path.split(fixtures.__file__)
		random_test_path = os.path.join(fixtures_path, 'fRandom.py')
		
		if os.path.isfile(random_test_path):
			print >>sys.stderr, "WARNING: Using previously generated random test (fRandom)."
			generated = False
		else:
			# Random test doesn't exist. Generate it
			options = generator.Options(actions=500, dimensions=3, remove_chance=0.2)
			fixture = generator.generate_test_data(options)
			f = file(random_test_path, 'w')
			stdout_bkp = sys.stdout
			sys.stdout = f
			try:
				print "# Test case generated by testRandom()."
				generator.print_test_data(fixture, options)
			finally:
				sys.stdout = stdout_bkp
			f.close()
			generated = True
		
		
		try:
			self._test('fRandom')
		except:
			print >>sys.stderr, "WARNING: The random test (fRandom) failed."
			print >>sys.stderr, "Investigate it, fix MTree and then convert"
			print >>sys.stderr, "the random test to a permanent test case."
			raise
		else:
			if generated:
				os.remove(random_test_path)
				for compiled_file in (random_test_path + 'c', random_test_path + 'o'):
					if os.path.isfile(compiled_file):
						os.remove(compiled_file)
			else:
				print >>sys.stderr, "ATTENTION: The previously existing random test"
				print >>sys.stderr, "has passed. Do want to delete it or convert to"
				print >>sys.stderr, "a permanent test case?"
	
	
	
	def _test(self, fixture_name):
		fixtures = __import__('fixtures.' + fixture_name)
		fixture = getattr(fixtures, fixture_name)
		self._test_fixture(fixture)
	

	def _test_fixture(self, fixture):
		def callback(action):
			if isinstance(action, generator.ADD):
				assert action.data not in self.all_data
				self.all_data.add(action.data)
				self.mtree.add(action.data)
			elif isinstance(action, generator.REMOVE):
				assert action.data in self.all_data
				self.all_data.remove(action.data)
				self.mtree.remove(action.data)
			else:
				assert False, action.__class__
			
			self._check_nearest_by_range(action.query.data, action.query.radius)
			self._check_nearest_by_limit(action.query.data, action.query.limit)
		
		fixture.PERFORM(callback)
	
	
	def _check_nearest_by_range(self, query_data, radius):
		result = list(self.mtree.get_nearest(query_data, range=radius))
		
		previous_distance = None
		for item in result:
			data, distance = item
			
			# Check if increasing distance
			if previous_distance is not None:
				self.assertTrue(distance is not None)
				self.assertLessEqual(previous_distance, distance)
			previous_distance = distance
			
			# Check if every item in the results came from the generated query_data
			self.assertIn(data, self.all_data)
			self.assertTrue(isinstance(item, MTree.ResultItem), item)
			
			# Check if every item in the results is within the range
			self.assertLessEqual(distance, radius)
			self.assertEqual(self.mtree.distance_function(data, query_data), distance)
		
		stripped_result = [item.data for item in result]
		for data in self.all_data:
			dist = self.mtree.distance_function(data, query_data)
			if dist <= radius:
				self.assertIn(data, stripped_result)
			else:
				self.assertNotIn(data, stripped_result)
	
	
	def _check_nearest_by_limit(self, query_data, limit):
		nearest_result = list(self.mtree.get_nearest(query_data, limit=limit))
		
		if limit <= len(self.all_data):
			self.assertEquals(limit, len(nearest_result))
		else: # limit > len(self.all_data)
			self.assertEquals(len(self.all_data), len(nearest_result))
		
		farthest = 0.0
		previous_distance = None
		for item in nearest_result:
			data, distance = item
			
			# Check if increasing distance
			if previous_distance is not None:
				self.assertTrue(distance is not None)
				self.assertLessEqual(previous_distance, distance)
			previous_distance = distance

			# Check if every item in the results came from the generated query_data
			self.assertIn(data, self.all_data)
			self.assertTrue(isinstance(item, MTree.ResultItem))
			
			# Check if items are not repeated
			self.assertEqual(1, nearest_result.count(item))
			
			d = self.mtree.distance_function(data, query_data)
			self.assertEqual(d, distance)
			farthest = max(farthest, d)
		
		stripped_nearest_result = [item.data for item in nearest_result]
		for data in self.all_data:
			d = self.mtree.distance_function(data, query_data)
			if d < farthest:
				self.assertIn(data, stripped_nearest_result)
			elif d > farthest:
				self.assertNotIn(data, stripped_nearest_result)
			else: # d == farthest:
				pass
コード例 #10
0
ファイル: test_mtree.py プロジェクト: wjcskqygj2015/M-Tree
class Test(unittest.TestCase):
    def setUp(self):

        # Removing randomness
        def not_random_promotion(data_objects, distance_function):
            data_objects = sorted(data_objects)
            return data_objects[0], data_objects[-1]

        self.mtree = MTree(min_node_capacity=2,
                           max_node_capacity=3,
                           split_function=f.make_split_function(
                               not_random_promotion, f.balanced_partition))

        def checked(unchecked_method):
            def checked_method(*args, **kwargs):
                try:
                    return unchecked_method(*args, **kwargs)
                finally:
                    self.mtree._check()

            return checked_method

        self.mtree.add = checked(self.mtree.add)
        self.mtree.remove = checked(self.mtree.remove)

        self.all_data = set()

    def testEmpty(self):
        self._check_nearest_by_range((1, 2, 3), 4)
        self._check_nearest_by_limit((1, 2, 3), 4)

    def test01(self):
        self._test('f01')

    def test02(self):
        self._test('f02')

    def test03(self):
        self._test('f03')

    def test04(self):
        self._test('f04')

    def test05(self):
        self._test('f05')

    def test06(self):
        self._test('f06')

    def test07(self):
        self._test('f07')

    def test08(self):
        self._test('f08')

    def test09(self):
        self._test('f09')

    def test10(self):
        self._test('f10')

    def test11(self):
        self._test('f11')

    def test12(self):
        self._test('f12')

    def test13(self):
        self._test('f13')

    def test14(self):
        self._test('f14')

    def test15(self):
        self._test('f15')

    def test16(self):
        self._test('f16')

    def test17(self):
        self._test('f17')

    def test18(self):
        self._test('f18')

    def test19(self):
        self._test('f19')

    def test20(self):
        self._test('f20')

    def testLots(self):
        self._test('fLots')

    def testRemoveNonExisting(self):
        # Empty
        self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))

        # With some items
        self.mtree.add((4, 44))
        self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))

        self.mtree.add((95, 43))
        self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))

        self.mtree.add((76, 21))
        self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))

        self.mtree.add((64, 53))
        self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))

        self.mtree.add((47, 3))
        self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))

        self.mtree.add((26, 11))
        self.assertRaises(KeyError, lambda: self.mtree.remove((99, 77)))

    def testGeneratedCase01(self):
        self._test('fG01')

    def testGeneratedCase02(self):
        self._test('fG02')

    def testRandom(self):
        fixtures_path, _ = os.path.split(fixtures.__file__)
        random_test_path = os.path.join(fixtures_path, 'fRandom.py')

        if os.path.isfile(random_test_path):
            print >> sys.stderr, "WARNING: Using previously generated random test (fRandom)."
            generated = False
        else:
            # Random test doesn't exist. Generate it
            options = generator.Options(actions=500,
                                        dimensions=3,
                                        remove_chance=0.2)
            fixture = generator.generate_test_data(options)
            f = file(random_test_path, 'w')
            stdout_bkp = sys.stdout
            sys.stdout = f
            try:
                print "# Test case generated by testRandom()."
                generator.print_test_data(fixture, options)
            finally:
                sys.stdout = stdout_bkp
            f.close()
            generated = True

        try:
            self._test('fRandom')
        except:
            print >> sys.stderr, "WARNING: The random test (fRandom) failed."
            print >> sys.stderr, "Investigate it, fix MTree and then convert"
            print >> sys.stderr, "the random test to a permanent test case."
            raise
        else:
            if generated:
                os.remove(random_test_path)
                for compiled_file in (random_test_path + 'c',
                                      random_test_path + 'o'):
                    if os.path.isfile(compiled_file):
                        os.remove(compiled_file)
            else:
                print >> sys.stderr, "ATTENTION: The previously existing random test"
                print >> sys.stderr, "has passed. Do want to delete it or convert to"
                print >> sys.stderr, "a permanent test case?"

    def _test(self, fixture_name):
        fixtures = __import__('fixtures.' + fixture_name)
        fixture = getattr(fixtures, fixture_name)
        self._test_fixture(fixture)

    def _test_fixture(self, fixture):
        def callback(action):
            if isinstance(action, generator.ADD):
                assert action.data not in self.all_data
                self.all_data.add(action.data)
                self.mtree.add(action.data)
            elif isinstance(action, generator.REMOVE):
                assert action.data in self.all_data
                self.all_data.remove(action.data)
                self.mtree.remove(action.data)
            else:
                assert False, action.__class__

            self._check_nearest_by_range(action.query.data,
                                         action.query.radius)
            self._check_nearest_by_limit(action.query.data, action.query.limit)

        fixture.PERFORM(callback)

    def _check_nearest_by_range(self, query_data, radius):
        result = list(self.mtree.get_nearest(query_data, range=radius))

        previous_distance = None
        for item in result:
            data, distance = item

            # Check if increasing distance
            if previous_distance is not None:
                self.assertTrue(distance is not None)
                self.assertLessEqual(previous_distance, distance)
            previous_distance = distance

            # Check if every item in the results came from the generated query_data
            self.assertIn(data, self.all_data)
            self.assertTrue(isinstance(item, MTree.ResultItem), item)

            # Check if every item in the results is within the range
            self.assertLessEqual(distance, radius)
            self.assertEqual(self.mtree.distance_function(data, query_data),
                             distance)

        stripped_result = [item.data for item in result]
        for data in self.all_data:
            dist = self.mtree.distance_function(data, query_data)
            if dist <= radius:
                self.assertIn(data, stripped_result)
            else:
                self.assertNotIn(data, stripped_result)

    def _check_nearest_by_limit(self, query_data, limit):
        nearest_result = list(self.mtree.get_nearest(query_data, limit=limit))

        if limit <= len(self.all_data):
            self.assertEquals(limit, len(nearest_result))
        else:  # limit > len(self.all_data)
            self.assertEquals(len(self.all_data), len(nearest_result))

        farthest = 0.0
        previous_distance = None
        for item in nearest_result:
            data, distance = item

            # Check if increasing distance
            if previous_distance is not None:
                self.assertTrue(distance is not None)
                self.assertLessEqual(previous_distance, distance)
            previous_distance = distance

            # Check if every item in the results came from the generated query_data
            self.assertIn(data, self.all_data)
            self.assertTrue(isinstance(item, MTree.ResultItem))

            # Check if items are not repeated
            self.assertEqual(1, nearest_result.count(item))

            d = self.mtree.distance_function(data, query_data)
            self.assertEqual(d, distance)
            farthest = max(farthest, d)

        stripped_nearest_result = [item.data for item in nearest_result]
        for data in self.all_data:
            d = self.mtree.distance_function(data, query_data)
            if d < farthest:
                self.assertIn(data, stripped_nearest_result)
            elif d > farthest:
                self.assertNotIn(data, stripped_nearest_result)
            else:  # d == farthest:
                pass