Ejemplo n.º 1
0
 def test_levenshtein(self):
     # Comparing a string with itself costs no edits.
     self.assertEqual(metrics.levenshtein("gallahad", "gallahad"), 0)
     # "_g_llaha" is three edits away from "gallahad":
     # one insertion, one deletion and one replacement.
     self.assertEqual(metrics.levenshtein("gallahad", "_g_llaha"), 3)
Ejemplo n.º 2
0
 def test_levenshtein(self):
     # Each entry pairs a candidate string with the edit distance
     # expected between it and "gallahad".
     expectations = (
         ("gallahad", 0),  # identical strings
         ("_g_llaha", 3),  # 1 insert, 1 delete, 1 replace
     )
     for candidate, expected in expectations:
         distance = metrics.levenshtein("gallahad", candidate)
         self.assertEqual(distance, expected)
     print("pattern.metrics.levenshtein()")
Ejemplo n.º 3
0
 def test_levenshtein(self):
     reference = "gallahad"

     # No edits are needed to turn a string into itself.
     same_distance = metrics.levenshtein(reference, "gallahad")
     self.assertEqual(same_distance, 0)

     # One insertion, one deletion and one replacement: three edits.
     edited_distance = metrics.levenshtein(reference, "_g_llaha")
     self.assertEqual(edited_distance, 3)

     print("pattern.metrics.levenshtein()")
Ejemplo n.º 4
0
def duplicates(options, parser):
    """Report pairs of near-duplicate segments inside one corpus.

    Every unordered pair of segments attached to the corpus named by
    ``options.corpus`` is compared on its ``stemmed`` text:

    * identical stemmed texts are counted as duplicates but not printed;
    * pairs whose lengths differ by more than a tenth of the longer text
      are skipped without computing the edit distance;
    * remaining pairs are scored as ``1 - levenshtein / longest_length``
      and printed (and counted) when the score is at least 0.75.

    A final line with the total number of duplicates is printed.

    Args:
        options: options object; only ``options.corpus`` (the corpus name)
            is read.
        parser: forwarded to ``error()`` when the corpus does not exist.

    Returns:
        The result of ``error()`` when the corpus is missing, otherwise
        ``None``.
    """
    # Local import keeps this block self-contained.
    from itertools import combinations

    # Lengths may differ by at most 1/10 of the longer text.
    length_gap_divisor = 10.0
    # Minimum similarity score for a pair to be reported.
    min_ratio = .75

    # Only a missing corpus is reported as "not found"; any other failure
    # (database down, interrupt, ...) is allowed to propagate.
    try:
        corpus = Corpus.objects.get(name=options.corpus)
    except Corpus.DoesNotExist:
        return error(message="corpus was not found! use sync.py script to load corpora", parser=parser)

    # Evaluate the queryset a single time. Indexing an unevaluated queryset
    # inside the pairwise loop would run a database query on every access
    # and, without an explicit ordering, is not guaranteed to be stable.
    document_segments = Document_Segment.objects.filter(document__corpus=corpus)
    segments = [document_segment.segment for document_segment in document_segments]

    c = 0
    # combinations() yields (i, j) pairs with i < j in the same order as
    # the equivalent nested index loops.
    for first, second in combinations(segments, 2):
        a = first.stemmed
        b = second.stemmed

        if a == b:
            # Exact duplicates are counted silently.
            c += 1
            continue

        ml = max(len(a), len(b))

        # Cheap pre-filter: very different lengths cannot be similar enough.
        if abs(len(a) - len(b)) > ml / length_gap_divisor:
            continue

        # a != b here, so at least one text is non-empty and ml >= 1.
        ratio = 1 - levenshtein(a, b) / float(ml)

        if ratio < min_ratio:
            continue

        # Single-argument print calls behave the same as the former print
        # statements on Python 2 and also run on Python 3.
        print("")
        print(ratio)
        print("  %s %s" % (a, b))
        print("  %s %s" % (first.content, second.content))
        print("")

        c += 1

    print("found %s duplicates" % c)