Esempio n. 1
0
    def search(self, phrase, subtitle=None, tolerance=None, func=None):
        """
        obj.search(phrase, subtitle=None, tolerance=None, func=None)
                                                    -> result generator

            Yields the results returned by self.command() for 'phrase'
                whose score does not exceed 'tolerance'.  When 'subtitle'
                is given it is sent along as a second quoted phrase (using
                the '-N' flag instead of '-M') and results are scored on
                their subtitle.

            tolerance -- highest score still yielded; when None it is read
                         from the 'MetadataLookupTolerance' setting
                         (requested with a default of 5)
            func      -- optional callable(phrase, subtitle, result) that
                         returns the score, replacing the default
                         levenshtein comparison

            The computed score is stored on each result as
                'result.levenshtein' before filtering.
        """
        by_subtitle = subtitle is not None

        if not func:
            if by_subtitle:
                # compare only the result's subtitle with the requested one
                def func(p, s, r):
                    return levenshtein(r.subtitle, s)
            else:
                # compare 'title : subtitle' with the phrase, or just the
                # title when the result carries no subtitle
                def func(p, s, r):
                    if r.subtitle is None:
                        return levenshtein(r.title, p)
                    return levenshtein('%s : %s' % (r.title, r.subtitle), p)

        if tolerance is None:
            tolerance = int(
                self.db.settings.NULL.get('MetadataLookupTolerance', 5))

        if by_subtitle:
            matches = self.command('-N', '"%s" "%s"' % (phrase, subtitle))
        else:
            matches = self.command('-M', '"%s"' % phrase)

        for match in matches:
            match.levenshtein = func(phrase, subtitle, match)
            # drop anything scoring above the tolerance
            if not (match.levenshtein > tolerance):
                yield match
Esempio n. 2
0
    def search(self, phrase, subtitle=None, tolerance=None, func=None):
        """
        obj.search(phrase, subtitle=None, tolerance=None, func=None)
                                                    -> result generator

            Generator over the results of self.command() for 'phrase',
                keeping only those whose score is within 'tolerance'.
                Supplying 'subtitle' adds it to the query as a second
                quoted phrase ('-N' flag rather than '-M') and switches
                the default scoring to the result's subtitle.

            tolerance -- maximum accepted score; if None, taken from the
                         'MetadataLookupTolerance' setting (requested with
                         a default of 5)
            func      -- optional scoring callable taking
                         (phrase, subtitle, result)

            Each result gets its score assigned to 'result.levenshtein'
                before the tolerance check.
        """
        use_subtitle = subtitle is not None

        if not func:
            if use_subtitle:
                def func(p, s, r):
                    # distance between the result subtitle and the query's
                    return levenshtein(r.subtitle, s)
            else:
                def func(p, s, r):
                    # fall back to the bare title when there is no subtitle
                    if r.subtitle is None:
                        return levenshtein(r.title, p)
                    return levenshtein('%s : %s' % (r.title, r.subtitle), p)

        if tolerance is None:
            tolerance = int(
                self.db.settings.NULL.get('MetadataLookupTolerance', 5))

        if use_subtitle:
            found = self.command('-N', '"%s" "%s"' % (phrase, subtitle))
        else:
            found = self.command('-M', '"%s"' % phrase)

        for item in found:
            item.levenshtein = func(phrase, subtitle, item)
            if not (item.levenshtein > tolerance):
                yield item
def calculateZipConfidence(zip1, zip2):
    """Score how closely two zip codes match.

    Returns None when either value is the empty string.  Otherwise
    returns 1 / (d + 1) ** d, where d is the value returned by
    utility.levenshtein(zip1, zip2).
    """
    if "" in (zip1, zip2):
        return None

    edits = utility.levenshtein(zip1, zip2)
    return 1 / ((edits + 1) ** edits)
Esempio n. 4
0
def calculateMiddleIConfidence(middle1, middle1AB, middle1DM, middle2, middle2AB, middle2DM):
    """Score how closely two middle names match.

    Parameters:
        middle1, middle2     -- the two values being compared
        middle1AB, middle2AB -- compared against the other side's full
                                value (presumably abbreviated forms;
                                confirm with the caller)
        middle1DM, middle2DM -- passed to utility.compareDoubleMetaphones
                                (presumably precomputed double-metaphone
                                codes; confirm with the caller)

    Returns None when either value is the empty string and 1 when both
    values are equal.  Otherwise returns the sum of:
        0.15  if one side's AB value equals the other side's value
        0.15  if utility.compareByContains succeeds
        0.35  if utility.compareDoubleMetaphones succeeds
        0.35 * 1 / (d + 1) ** (0.2 * d), d = utility.levenshtein(...)
    """
    # NOTE: special-character, nickname, visually-similar-character and
    # keyboard-distance comparisons are disabled and not applied here.
    if middle1 == "" or middle2 == "":
        return None
    if middle1 == middle2:
        return 1

    score = 0

    if middle1AB == middle2 or middle2AB == middle1:
        score += 0.15

    if utility.compareByContains(middle1, middle2):
        score += 0.15

    if utility.compareDoubleMetaphones(middle1DM, middle2DM):
        score += 0.35

    edits = utility.levenshtein(middle1, middle2)
    decay = 1 / (edits + 1) ** (0.2 * edits)
    score += decay * .35
    return score
def calculateCityConfidence(city1, city2):
    """Score how closely two city names match.

    Returns None when either name is the empty string.  Otherwise the
    following terms are added and the result is capped at 1:
        1 / (d + 1) ** (d + 1), d = utility.levenshtein(city1, city2)
        0.5 if utility.compareByDoubleMetaphone succeeds
        min(len) / 5 if utility.compareByAbbrevSentence succeeds,
        else min(len) / max(len) if utility.compareByContains succeeds
    """
    if "" in (city1, city2):
        return None

    # Edit distance between the two fully spelled-out names.
    edits = utility.levenshtein(city1, city2)

    # Double-metaphone agreement.
    phonetic = 0.5 if utility.compareByDoubleMetaphone(city1, city2) else 0

    # Abbreviation match takes priority; the "shortened version" check
    # is only attempted when the abbreviation check fails.
    abbreviated = 0
    shortened = 0
    if utility.compareByAbbrevSentence(city1, city2):
        abbreviated = min(len(city1), len(city2)) / 5
    elif utility.compareByContains(city1, city2):
        shortened = min(len(city1), len(city2)) / max(len(city1), len(city2))

    raw = 1 / ((edits + 1) ** (edits + 1)) + phonetic + abbreviated + shortened
    return min(raw, 1)
Esempio n. 6
0
def get_query_prediction_value(query_prediction, title):
    """Return a weighted distance between predicted queries and a title.

    query_prediction is a JSON object string; each key is compared to
    `title` with utility.levenshtein and the result is multiplied by
    float(value).  The products are summed, starting from 0.0, and the
    sum is returned.
    """
    predictions = json.loads(query_prediction)
    weighted_distance = 0.0
    for candidate, weight in predictions.items():
        weighted_distance += float(weight) * utility.levenshtein(candidate, title)
    return weighted_distance
Esempio n. 7
0
def calculateCityConfidence(city1, city1AB, city1DM, city2, city2AB, city2DM):
    """Score how closely two city names match.

    Parameters:
        city1, city2     -- the two names being compared
        city1AB, city2AB -- compared against the other side's full name
                            (presumably abbreviated forms; confirm with
                            the caller)
        city1DM, city2DM -- passed to utility.compareSentDoubleMetaphones
                            (presumably precomputed double-metaphone
                            codes; confirm with the caller)

    Returns None when either name is the empty string and 1 when both
    names are equal.  Otherwise returns the sum of:
        0.3 * 1 / (d + 1) ** (d + 1), d = utility.levenshtein(...)
        0.5  if utility.compareSentDoubleMetaphones succeeds
        0.1  if one side's AB value equals the other side's name
        0.1  if utility.compareByContains succeeds
    """
    if city1 == "" or city2 == "":
        return None
    if city1 == city2:
        return 1

    total = 0

    # Levenshtein term.  Previously this value was computed but never
    # added to the total, so the score could not exceed 0.7 and the edit
    # distance had no influence on the result.
    distance = utility.levenshtein(city1, city2)
    total += 1 / pow(distance + 1, distance + 1) * 0.3

    # Double metaphone.
    if utility.compareSentDoubleMetaphones(city1DM, city2DM):
        total += 0.5

    # One name equals the other's AB form.
    if city1 == city2AB or city1AB == city2:
        total += 0.1

    # One name is a shortened version of the other.
    if utility.compareByContains(city1, city2):
        total += 0.1

    return total
Esempio n. 8
0
def calculateNameConfidence(name1, name1DM, name2, name2DM):
    """Score how closely two names match.

    Parameters:
        name1, name2     -- the two names being compared
        name1DM, name2DM -- passed to utility.compareDoubleMetaphones
                            (presumably precomputed double-metaphone
                            codes; confirm with the caller)

    Returns 1 when the names are equal.  Otherwise returns the sum of:
        0.2  if utility.compareByContains succeeds
        0.4  if utility.compareDoubleMetaphones succeeds
        0.4 * 1 / (d + 1) ** (0.9 * d), d = utility.levenshtein(...)
    Empty strings are not treated specially.
    """
    # NOTE: abbreviation, special-character, nickname,
    # visually-similar-character and keyboard-distance comparisons are
    # disabled and not applied here.
    if name1 == name2:
        return 1

    score = 0

    if utility.compareByContains(name1, name2):
        score += 0.2

    if utility.compareDoubleMetaphones(name1DM, name2DM):
        score += 0.4

    edits = utility.levenshtein(name1, name2)
    decay = 1 / (edits + 1) ** (0.9 * edits)
    score += decay * 0.4
    return score
Esempio n. 9
0
def calculateStreetConfidence(street1, street1DM, street2, street2DM):
    """Score how closely two street values match.

    Parameters:
        street1, street2     -- the two streets being compared
        street1DM, street2DM -- passed to
                                utility.compareSentDoubleMetaphones
                                (presumably precomputed double-metaphone
                                codes; confirm with the caller)

    Returns None when either street is the empty string and 1 when both
    are equal.  Otherwise returns the sum of:
        0.5  if utility.compareSentDoubleMetaphones succeeds
        0.5 * 1 / (d + 1) ** (0.2 * d), d = utility.levenshtein(...)
    """
    # NOTE: expanding the trailing street abbreviation through
    # dictionaries.streets is disabled; the values are compared as given.
    if "" in (street1, street2):
        return None
    if street1 == street2:
        return 1

    score = 0

    # Double-metaphone agreement.
    if utility.compareSentDoubleMetaphones(street1DM, street2DM):
        score += 0.5

    # Levenshtein term.
    edits = utility.levenshtein(street1, street2)
    decay = 1 / (edits + 1) ** (0.2 * edits)
    score += decay * 0.5

    return score
def calculateStreetConfidence(street1, street2):
    """Score how closely two street values match.

    Both streets are split on single spaces and the last word of each is
    replaced by its dictionaries.streets entry when one exists (to spell
    out street abbreviations).

    Returns None when either street is the empty string and 1 when the
    resulting word lists are equal.  Otherwise returns
        0.5 * metaphone + 0.5 * levenshtein
    where
        metaphone   -- fraction of word positions (out of the longer
                       street's word count) at which
                       utility.compareByDoubleMetaphone succeeds
        levenshtein -- 1 / (d + 1) ** (0.2 * d), with d the value of
                       utility.levenshtein on the re-joined streets
    """
    if street1 == "" or street2 == "":
        return None
    words1 = street1.split(' ')
    words2 = street2.split(' ')

    # Convert a trailing street abbreviation to its spelled-out form;
    # words without an entry are kept unchanged.
    for words in (words1, words2):
        try:
            words[-1] = dictionaries.streets[words[-1]]
        except KeyError:
            pass

    if words1 == words2:
        return 1

    # Double metaphone for each word pair.  The score is accumulated
    # across the pairs; previously it was reset to 0 on every iteration,
    # so only the last compared pair contributed.
    longest = max(len(words1), len(words2))
    metaphoneConfidence = 0
    for elem1, elem2 in zip(words1, words2):
        if elem1 is None or elem2 is None:
            break
        if utility.compareByDoubleMetaphone(elem1, elem2):
            metaphoneConfidence += 1 / longest

    joined1 = ' '.join(str(elem) for elem in words1)
    joined2 = ' '.join(str(elem) for elem in words2)

    # Levenshtein over the re-joined streets.
    distance = utility.levenshtein(joined1, joined2)
    levenshteinConfidence = 1 / pow(distance + 1, 0.2 * distance)

    return metaphoneConfidence * 0.5 + levenshteinConfidence * 0.5
Esempio n. 11
0
def calculateDOBConfidence(dob1, dob2):
    """Score how closely two dates of birth match.

    Returns None when either value is the empty string and 1 when both
    are equal.  Otherwise returns 1 / (d + 1) ** (0.5 * d), where d is
    the value returned by utility.levenshtein(dob1, dob2).
    """
    if "" in (dob1, dob2):
        return None
    if dob1 == dob2:
        return 1

    edits = utility.levenshtein(dob1, dob2)
    return 1 / (edits + 1) ** (0.5 * edits)
Esempio n. 12
0
def calculateZipConfidence(zip1, zip2):
    """Score how closely two zip codes match.

    Returns None when either value is the empty string and 1 when both
    are equal.  Otherwise returns 1 / (d + 1) ** d, where d is the value
    returned by utility.levenshtein(zip1, zip2):

        distance -> confidence
            0    ->  1
            1    ->  0.5
            2    ->  0.11
            3    ->  0.0156
    """
    if "" in (zip1, zip2):
        return None
    if zip1 == zip2:
        return 1

    edits = utility.levenshtein(zip1, zip2)
    return 1 / ((edits + 1) ** edits)
def calculateSexConfidence(sex1, sex2):
    """Score how closely two sex values match.

    Returns None when either value is the empty string.  Otherwise each
    value is replaced by its dictionaries.sex entry when one exists, and
    the result is 1 / (d + 1) ** d, where d is the value returned by
    utility.levenshtein on the two (possibly replaced) values.
    """
    if "" in (sex1, sex2):
        return None

    def normalise(value):
        # Values without a dictionaries.sex entry are kept unchanged.
        try:
            return dictionaries.sex[value]
        except KeyError:
            return value

    sex1 = normalise(sex1)
    sex2 = normalise(sex2)

    edits = utility.levenshtein(sex1, sex2)
    return 1 / ((edits + 1) ** edits)
def calculateNameConfidence(name1, name2):
    """Score how closely two names match.

    Returns 1 as soon as utility.compareWordsWithoutSpecialChars
    succeeds.  Otherwise returns the sum of:
        0.1  if utility.compareByAbbrevWord succeeds
        0.1  if utility.compareByContains succeeds
        0.4  if utility.compareByDoubleMetaphone succeeds
        0.4 * 1 / (d + 1) ** (0.9 * d), d = utility.levenshtein(...)
    """
    score = 0

    # Checked first, as in the original ordering, even though a
    # special-character match below overrides the running score.
    if utility.compareByAbbrevWord(name1, name2):
        score += 0.1

    if utility.compareWordsWithoutSpecialChars(name1, name2):
        return 1

    if utility.compareByContains(name1, name2):
        score += 0.1

    if utility.compareByDoubleMetaphone(name1, name2):
        score += 0.4

    edits = utility.levenshtein(name1, name2)
    decay = 1 / ((edits + 1) ** (0.9 * edits))
    score += decay * 0.4
    return score
def calculateStateConfidence(state1, state2):
    """Score how closely two state values match.

    Returns None when either value is the empty string.  Otherwise each
    value is replaced by its dictionaries.states entry when one exists
    (to convert abbreviations to full state names), and the result is
    1 / (d + 1), where d is the value returned by utility.levenshtein on
    the two (possibly replaced) values.
    """
    if "" in (state1, state2):
        return None

    def expand(value):
        # Values without a dictionaries.states entry are kept unchanged.
        try:
            return dictionaries.states[value]
        except KeyError:
            return value

    state1 = expand(state1)
    state2 = expand(state2)

    edits = utility.levenshtein(state1, state2)
    return 1 / (edits + 1)
Esempio n. 16
0
def calculateSexConfidence(sex1, sex2):
    """Score how closely two sex values match.

    Returns:
        None -- when either value is the empty string
        1    -- when both values are equal
        0    -- when the values differ in length
        1 / (d + 1) ** d otherwise, where d is the value returned by
        utility.levenshtein(sex1, sex2)
    """
    # NOTE: mapping the values through dictionaries.sex is disabled; the
    # values are compared as given.
    if "" in (sex1, sex2):
        return None
    if sex1 == sex2:
        return 1
    if len(sex1) != len(sex2):
        return 0

    edits = utility.levenshtein(sex1, sex2)
    return 1 / ((edits + 1) ** edits)
def calculateMiddleIConfidence(middle1, middle2):
    """Score how closely two middle names match.

    Returns None when either value is the empty string and 1 as soon as
    utility.compareWordsWithoutSpecialChars succeeds.  Otherwise returns
    the sum of:
        0.1  if utility.compareByAbbrevWord succeeds
        0.1  if utility.compareByContains succeeds
        0.4  if utility.compareByDoubleMetaphone succeeds
        0.4 * 1 / (d + 1) ** (0.2 * d), d = utility.levenshtein(...)
    """
    if "" in (middle1, middle2):
        return None

    score = 0

    # Checked first, as in the original ordering, even though a
    # special-character match below overrides the running score.
    if utility.compareByAbbrevWord(middle1, middle2):
        score += 0.1

    if utility.compareWordsWithoutSpecialChars(middle1, middle2):
        return 1

    if utility.compareByContains(middle1, middle2):
        score += 0.1

    if utility.compareByDoubleMetaphone(middle1, middle2):
        score += 0.4

    edits = utility.levenshtein(middle1, middle2)
    decay = 1 / ((edits + 1) ** (0.2 * edits))
    score += decay * .4
    return score
Esempio n. 18
0
def calculatePatientAcctNumConfidence(patientAcctNum1, patientAcctNum2):
    """Score how closely two patient account numbers match.

    Returns 0 when either value is the empty string.  Otherwise returns
    1 / (d + 1) ** (0.15 * d), where d is the value returned by
    utility.levenshtein(patientAcctNum1, patientAcctNum2).
    """
    if "" in (patientAcctNum1, patientAcctNum2):
        return 0

    edits = utility.levenshtein(patientAcctNum1, patientAcctNum2)
    return 1 / (edits + 1) ** (0.15 * edits)