def jaccard(stra, strb, tokenizer=None):
    """Return the Jaccard distance between *stra* and *strb*.

    Each string is turned into its set of tokens (when *tokenizer* is
    None, the default tokenizer of ``alignement.normalize.tokenize`` is
    used), then::

        J(A, B) = |A & B| / |A | B|
        d(A, B) = 1 - J(A, B)
    """
    tokens_a = set(tokenize(stra, tokenizer))
    tokens_b = set(tokenize(strb, tokenizer))
    return generic_jaccard(tokens_a, tokens_b)
def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
    """Compute a token-wise distance between two multi-word strings.

    Build the matrix of pairwise ``distance(token_a, token_b, **kwargs)``
    values for all tokens of *stra* and *strb*, then return the maximum
    over the minima of every row and every column, e.g.::

                 Victor | Jean | Hugo          Victor | Hugo
        Victor |   0    |  6   |  5   Victor |   0    |  5
        Hugo   |   5    |  4   |  0   Jean   |   6    |  4
                                      Hugo   |   5    |  0
                 --> 4                         --> 0

        Return 4

    Extra keyword arguments are forwarded to *distance*.
    """
    # Ensure both strings contain a space before tokenizing — presumably
    # so each side yields at least two tokens; TODO confirm against the
    # tokenizer's behavior.
    if ' ' not in stra:
        stra += ' '
    if ' ' not in strb:
        strb += ' '

    toka = tokenize(stra, tokenizer)
    tokb = tokenize(strb, tokenizer)
    # Pad the shorter token list with empty strings so the distance
    # matrix is square and every token is matched against something.
    if len(toka) != len(tokb):
        shorter = toka if len(toka) < len(tokb) else tokb
        longer = toka if len(toka) > len(tokb) else tokb
        shorter.extend([''] * (len(longer) - len(shorter)))

    # Pairwise distance matrix as a plain list of rows: avoids the
    # deprecated numpy.matrix type and, unlike the original
    # xrange-based version, runs on both Python 2 and Python 3.
    rows = [[distance(ta, tb, **kwargs) for tb in tokb] for ta in toka]
    minima = [min(row) for row in rows]
    minima.extend(min(col) for col in zip(*rows))
    return max(minima)
def test_tokenize(self):
    """The default tokenizer splits words, apostrophes and punctuation."""
    expected = [u"J'", u'aime', u'les', u'frites', u'!']
    self.assertEqual(tokenize(u"J'aime les frites !"), expected)