Ejemplo n.º 1
0
    def test_iterable_input(self):
        """Lists and tuples of characters must be accepted just like strings."""
        hello = ["H", "e", "l", "l", "o"]
        saturday = ["S", "a", "t", "u", "r", "d", "a", "y"]
        sunday = ["S", "u", "n", "d", "a", "y"]

        # Run the same two checks for each container type: identical
        # sequences are at distance 0, "Saturday" -> "Sunday" is at distance 3.
        for container in (list, tuple):
            same = fuzzycomp.levenshtein_distance(container(hello), container(hello))
            self.assertEqual(same, 0)

            different = fuzzycomp.levenshtein_distance(container(saturday), container(sunday))
            self.assertEqual(different, 3)
Ejemplo n.º 2
0
def extractVenueFeatures(A, B):
    """Compute similarity features for two venue strings.

    Each input is stripped of punctuation (every character that is neither
    a word character nor whitespace), runs of spaces are collapsed to a
    single space, and an empty result is replaced by '-'.

    Returns a four-element list computed on the cleaned strings:
        [letter-histogram measure,
         fuzzycomp.levenshtein_distance(A, B),
         fuzzycomp.jaccard_distance(A, B),
         fuzzycomp.jaro_distance(A, B)]

    The histogram measure is the sum of absolute per-letter count
    differences divided by the larger of the two letter totals, or 0 when
    neither string contains a counted letter.

    NOTE(review): only lowercase 'a'-'z' are counted; upper-case letters
    are ignored -- presumably callers lowercase beforehand; confirm.
    """
    def _clean(text):
        # Remove punctuation first, then squeeze repeated spaces.
        squeezed = re.sub(' +', ' ', re.sub(r'[^\w\s]', '', text))
        return squeezed if squeezed != '' else '-'

    def _letter_histogram(text):
        # One slot per lowercase ASCII letter; 97 == ord('a').
        histogram = numpy.zeros(26)
        for ch in text:
            if 'a' <= ch <= 'z':
                histogram[ord(ch) - 97] += 1
        return histogram

    A = _clean(A)
    B = _clean(B)

    hist_a = _letter_histogram(A)
    hist_b = _letter_histogram(B)

    distance = numpy.sum(numpy.absolute(hist_a - hist_b))
    largest_total = max(numpy.sum(hist_a), numpy.sum(hist_b))

    # Avoid dividing by zero when neither string has any counted letter.
    measure = 0 if largest_total == 0 else distance / float(largest_total)

    return [
        measure,
        fuzzycomp.levenshtein_distance(A, B),
        fuzzycomp.jaccard_distance(A, B),
        fuzzycomp.jaro_distance(A, B),
    ]
Ejemplo n.º 3
0
    def test_iterable_input(self):
        """Distance must be computed correctly for list and tuple arguments."""
        # (first sequence, second sequence, expected distance)
        cases = (
            (["H", "e", "l", "l", "o"], ["H", "e", "l", "l", "o"], 0),
            (["S", "a", "t", "u", "r", "d", "a", "y"],
             ["S", "u", "n", "d", "a", "y"], 3),
            (("H", "e", "l", "l", "o"), ("H", "e", "l", "l", "o"), 0),
            (("S", "a", "t", "u", "r", "d", "a", "y"),
             ("S", "u", "n", "d", "a", "y"), 3),
        )

        for first, second, expected in cases:
            actual = fuzzycomp.levenshtein_distance(first, second)
            self.assertEqual(actual, expected)
Ejemplo n.º 4
0
def extractCathegoryFeatures(A, B):
    """Return the smallest Levenshtein distance between category words.

    For each input string, every substring enclosed in single quotes is
    extracted, the pieces are joined with spaces, the text "&amp" is
    removed, punctuation (anything that is neither a word character nor
    whitespace) is stripped, and the result is split into words.  When an
    input yields no words, the single placeholder word "-" is used.

    Args:
        A: first raw category string.
        B: second raw category string.

    Returns:
        The minimum of fuzzycomp.levenshtein_distance(w1, w2) over every
        pair (w1 from A's words, w2 from B's words).
    """
    def _category_words(raw):
        quoted = re.findall(r"\'(.+?)\'", raw)
        text = " ".join(quoted).replace("&amp", "")
        text = re.sub(r'[^\w\s]', '', text)
        # str.split() with no separator already collapses whitespace runs
        # and never yields empty strings, so no extra space squeezing or
        # per-word empty checks are needed.
        return text.split() or ["-"]

    words_a = _category_words(A)
    words_b = _category_words(B)

    # Both lists are guaranteed non-empty, so min() always has a value.
    # Using min() also removes the magic 99999 sentinel, which would have
    # produced 0 if every pairwise distance were >= 99999.
    return min(
        fuzzycomp.levenshtein_distance(word_a, word_b)
        for word_a in words_a
        for word_b in words_b
    )
Ejemplo n.º 5
0
def extractVenueFeatures(A, B):
    """Build a feature vector comparing two venue strings.

    Punctuation (characters that are neither word characters nor
    whitespace) is removed from both inputs, consecutive spaces are
    collapsed into one, and an input that ends up empty becomes '-'.

    Returns a list of four values computed on the cleaned strings: a
    letter-histogram measure followed by fuzzycomp's Levenshtein, Jaccard
    and Jaro distances.  The histogram measure is the summed absolute
    difference of per-letter counts divided by the larger letter total
    (0 if neither string contains a counted letter).

    NOTE(review): only lowercase 'a'-'z' contribute to the histogram;
    upper-case letters are skipped -- confirm callers lowercase first.
    """
    cleaned = []
    histograms = []

    for raw in (A, B):
        # Strip punctuation, then squeeze multiple spaces.
        text = re.sub(r'[^\w\s]', '', raw)
        text = re.sub(' +', ' ', text)
        if not text:
            text = '-'

        # Count occurrences of each lowercase ASCII letter.
        counts = numpy.zeros(26)
        for letter in text:
            if not ('a' <= letter <= 'z'):
                continue
            counts[ord(letter) - ord('a')] += 1

        cleaned.append(text)
        histograms.append(counts)

    A, B = cleaned
    counts_a, counts_b = histograms

    distance = numpy.sum(numpy.absolute(counts_a - counts_b))
    max_chars = max(numpy.sum(counts_a), numpy.sum(counts_b))

    if max_chars == 0:
        # No counted letters in either string: nothing to normalise by.
        measure = 0
    else:
        measure = distance / float(max_chars)

    levenshtein = fuzzycomp.levenshtein_distance(A, B)
    jaccard = fuzzycomp.jaccard_distance(A, B)
    jaro = fuzzycomp.jaro_distance(A, B)
    return [measure, levenshtein, jaccard, jaro]
Ejemplo n.º 6
0
def extractCathegoryFeatures(A, B):
    """Return the minimum Levenshtein distance over all category word pairs.

    Each input is reduced to a list of words: the substrings found between
    single quotes are joined with spaces, the text "&amp" is dropped,
    punctuation (anything that is neither a word character nor whitespace)
    is removed, and the result is split on whitespace.  An input that
    produces no words is represented by the placeholder word "-".

    Args:
        A: first raw category string.
        B: second raw category string.

    Returns:
        The smallest fuzzycomp.levenshtein_distance(w1, w2) taken over
        every word w1 of A paired with every word w2 of B.
    """
    def _to_words(raw):
        joined = " ".join(re.findall(r"\'(.+?)\'", raw))
        stripped = re.sub(r'[^\w\s]', '', joined.replace("&amp", ""))
        # A bare split() handles repeated whitespace and cannot return
        # empty strings, which makes explicit space collapsing and
        # empty-word checks unnecessary.
        words = stripped.split()
        return words if words else ["-"]

    words_a = _to_words(A)
    words_b = _to_words(B)

    # min() over the non-empty cross product replaces the manual running
    # minimum and its 99999 sentinel (which returned 0 whenever no distance
    # was below the sentinel).
    return min(
        fuzzycomp.levenshtein_distance(first, second)
        for first in words_a
        for second in words_b
    )
Ejemplo n.º 7
0
 def test_case_difference(self):
     """Strings differing only in letter case must not be at distance 0."""
     distance = fuzzycomp.levenshtein_distance("HELLO", "hello")
     self.assertNotEqual(distance, 0)
Ejemplo n.º 8
0
 def test_valid_input(self):
     """Known string pairs must produce their expected distances."""
     # (first string, second string, expected distance)
     expectations = (
         ("Hello", "Hello", 0),
         ("Saturday", "Sunday", 3),
     )
     for first, second, expected in expectations:
         self.assertEqual(fuzzycomp.levenshtein_distance(first, second), expected)
Ejemplo n.º 9
0
 def test_case_difference(self):
     """The distance between upper- and lower-case variants is non-zero."""
     upper, lower = "HELLO", "hello"
     self.assertNotEqual(fuzzycomp.levenshtein_distance(upper, lower), 0)
Ejemplo n.º 10
0
 def test_valid_input(self):
     """Valid string input yields the correct Levenshtein distance."""
     identical = fuzzycomp.levenshtein_distance("Hello", "Hello")
     self.assertEqual(identical, 0)

     weekdays = fuzzycomp.levenshtein_distance("Saturday", "Sunday")
     self.assertEqual(weekdays, 3)