コード例 #1
0
def jaccard(stra, strb, tokenizer=None):
    """ Return the Jaccard distance between stra and strb, computed on the
        sets of tokens of both strings.

        Each string is split with ``tokenize``; when no tokenizer is given,
        the default one of alignement.normalize.tokenize is used. The two
        token sets are then handed to ``generic_jaccard``.

        J(A, B) = |A intersection B| / |A union B|
        d(A, B) = 1 - J(A, B)
    """
    # Duplicate tokens are irrelevant for Jaccard, hence the sets.
    token_sets = [set(tokenize(text, tokenizer)) for text in (stra, strb)]
    return generic_jaccard(*token_sets)
コード例 #2
0
def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
    """ Return a token-wise distance between stra and strb.

        Both strings are tokenized, then ``distance`` is called on every
        pair (token of stra, token of strb) to fill a distance matrix whose
        rows are the tokens of stra and whose columns are the tokens of strb.
        Extra keyword args are given to the ``distance`` function.

        If the two strings do not have the same number of tokens, the
        shortest token list is first completed with empty strings.

        The distance returned is the max of the minima of every row and of
        every column of that matrix. See the example below, where the right
        table is the left one transposed (padding tokens are left out to
        keep the tables short):

                 |  Victor |  Hugo                  Victor | Jean | Hugo
         Victor  |     0   |    5           Victor |  0    |  6   |  5
          Jean   |     6   |    4           Hugo   |  5    |  4   |  0
          Hugo   |     5   |    0

                 --> 4                                --> 0

        Return 4
    """
    # NOTE(review): a trailing space is forced onto inputs that contain no
    # space before tokenizing; presumably tokenize() relies on it - confirm.
    if ' ' not in stra:
        stra += ' '
    if ' ' not in strb:
        strb += ' '

    # Work on copies so the padding below never mutates a sequence owned by
    # tokenize() (and so that any sequence type it returns is accepted).
    toka = list(tokenize(stra, tokenizer))
    tokb = list(tokenize(strb, tokenizer))

    # If not same number of tokens, complete the smallest list with empty
    # strings so that both lists end up with the same length.
    missing = len(toka) - len(tokb)
    if missing < 0:
        toka.extend([''] * -missing)
    elif missing > 0:
        tokb.extend([''] * missing)

    m = matrix([[distance(worda, wordb, **kwargs) for wordb in tokb]
                for worda in toka])
    nrows, ncols = m.shape
    # Smallest distance of each row (token of stra) ...
    minlist = [m[i, :].min() for i in range(nrows)]
    # ... and of each column (token of strb).
    minlist.extend(m[:, j].min() for j in range(ncols))
    return max(minlist)
コード例 #3
0
 def test_tokenize(self):
     """ Check the tokens returned by tokenize() for a French sentence
         holding an apostrophe and a detached punctuation mark.
     """
     sentence = u"J'aime les frites !"
     expected = [u"J'", u'aime', u'les', u'frites', u'!']
     self.assertEqual(tokenize(sentence), expected)