Example #1
    def compare_two_phrases(self, f_phrase, s_phrase, compare_from_end=False):

        f_concat_phrase = "".join(f_phrase)
        s_concat_phrase = "".join(s_phrase)

        # Compare at most `small_phrase_border` characters, taken from the
        # start (default) or the end of each concatenated phrase.
        length = min(len(f_concat_phrase), self.small_phrase_border)
        if not compare_from_end:
            f_compare = f_concat_phrase[:length]
            s_compare = s_concat_phrase[:length]
        else:
            f_compare = f_concat_phrase[-length:]
            s_compare = s_concat_phrase[-length:]

        self.last_result = jellyfish.jaro_winkler_similarity(
            f_compare, s_compare)

        # debug line
        # print("f_compare: " + f_compare + " s_compare: " + s_compare + " res " + str(self.last_result))
        return self.last_result > self.mistakes_border
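
A minimal standalone sketch of the windowed comparison above, with hypothetical values standing in for self.small_phrase_border and self.mistakes_border:

import jellyfish

phrase_a = ["hello", "world"]
phrase_b = ["hello", "wurld"]
window = 10      # assumed value of self.small_phrase_border
threshold = 0.8  # assumed value of self.mistakes_border

a = "".join(phrase_a)[:window]
b = "".join(phrase_b)[:window]
score = jellyfish.jaro_winkler_similarity(a, b)
print(score > threshold)  # True: the phrases differ by a single character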
Example #2
def name_matcher(original_matriz, matriz_to_merge, column_with_nan_spaces, n):
    # Merge the dataframes on municipality name
    final_with_errors = pd.merge(original_matriz,
                                 matriz_to_merge,
                                 on='Municipality',
                                 how='outer')
    # Take the municipalities that did not match by name
    matriz_with_wrong_names = final_with_errors.iloc[n:, :]
    print("==================================================")
    print("Matriz con nombres equivocados")
    print("==================================================")
    print(matriz_with_wrong_names)
    matriz_with_blanks = final_with_errors[np.isnan(
        final_with_errors[column_with_nan_spaces])]
    print("==================================================")
    print("Matriz con espacios vacíos")
    print("==================================================")
    print(matriz_with_blanks)
    for i in matriz_with_wrong_names['Municipality']:
        score = 0
        winner = ''
        for j in matriz_with_blanks['Municipality']:
            similarity = jf.jaro_winkler_similarity(i, j)
            if similarity >= score:
                score = similarity
                winner = j
        print(f'{i} was replaced with {winner}')
        matriz_to_merge.loc[matriz_to_merge['Municipality'] == i,
                            'Municipality'] = winner
Example #3
    def __query(self, index: Index, text: str,
                domains: Set[str]) -> pd.DataFrame:
        q = self.name_parser.parse(text)

        with index.searcher() as s:
            results = []
            for hit in s.search(q, limit=6):
                ds = set((hit.get('domains') or '').split(','))
                results.append({
                    'raw_score': hit.score,
                    'id': hit['id'],
                    'name': hit['name'],
                    'domains_boost': (self.matching_domains_boost
                                      if len(ds & domains) > 0 else 1),
                })

        if len(results) == 0:
            return pd.DataFrame()

        df = pd.DataFrame.from_records(results, index='id')

        # Compute accurate score based on string similarity (lowercased)
        df['score'] = df['name'].apply(
            # "Sharpen" the similarity to make it more intuitive
            lambda name: jellyfish.jaro_winkler_similarity(
                name.lower(), text.lower())**1.5)

        df['score'] = (df['score'] * df['domains_boost']
                       / self.matching_domains_boost)
        df = df.sort_values(by='score', ascending=False)
        return df.reset_index(drop=True)
Example #4
 def jaro_winkler_sim(self):
     self.cluster = []
     for i in range(len(self.group)):
         for j in range(i + 1, len(self.group)):
             if self.threshold <= jf.jaro_winkler_similarity(
                     str(self.group[i]), str(self.group[j])):
                 self.cluster.append([str(self.group[i]), str(self.group[j])])
     return self.cluster
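
A standalone sketch of the same pairwise clustering, with hypothetical data standing in for self.group and self.threshold:

import jellyfish as jf

group = ["color", "colour", "shade"]
threshold = 0.9
cluster = []
for i in range(len(group)):
    for j in range(i + 1, len(group)):
        if threshold <= jf.jaro_winkler_similarity(group[i], group[j]):
            cluster.append([group[i], group[j]])
print(cluster)  # [['color', 'colour']]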
Example #5
def simple_example():
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'

    print("jellyfish.levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.levenshtein_distance(str1, str2)))
    print("jellyfish.damerau_levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.damerau_levenshtein_distance(str1, str2)))
    print("jellyfish.hamming_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.hamming_distance(str1, str2)))
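    # Note: jaro_distance and jaro_winkler below are older aliases; newer
    # jellyfish releases provide jaro_similarity and jaro_winkler_similarity
    # instead (the old aliases were deprecated and later removed).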
    print("jellyfish.jaro_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_distance(str1, str2)))
    print("jellyfish.jaro_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_similarity(str1, str2)))
    print("jellyfish.jaro_winkler({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler(str1, str2)))
    print("jellyfish.jaro_winkler_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler_similarity(str1, str2)))
    print("jellyfish.match_rating_comparison({}, {}) = {}.".format(
        str1, str2, jellyfish.match_rating_comparison(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'

    print("jellyfish.metaphone({}) = {}.".format(ss, jellyfish.metaphone(ss)))
    print("jellyfish.soundex({}) = {}.".format(ss, jellyfish.soundex(ss)))
    print("jellyfish.nysiis({}) = {}.".format(ss, jellyfish.nysiis(ss)))
    print("jellyfish.match_rating_codex({}) = {}.".format(
        ss, jellyfish.match_rating_codex(ss)))
Example #6
	def fuzzy_contact_name_match(self, search_name, monica_contact_list, my_name, benchmark=0.85):
		# Do not try to match the user's own name
		if search_name == my_name:
			return None
		all_score = []
		for monica_contact in monica_contact_list:
			score = jellyfish.jaro_winkler_similarity(search_name, monica_contact)
			all_score.append(score)
		return self.find_max_score_name(monica_contact_list, all_score, benchmark)
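
A usage sketch; find_max_score_name belongs to the same class and is assumed to return the best-scoring contact at or above the benchmark:

# matcher.fuzzy_contact_name_match("Jon Smith", ["John Smith", "Jane Doe"], my_name="Me")
# -> "John Smith" (Jaro-Winkler ~0.97, above the 0.85 benchmark)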
Example #7
    def jaro_winkler_apply(x):

        try:
            return jaro_winkler_similarity(x[0], x[1])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err
Example #8
def alternative_search(element, script_list, best_match, best_match_script, i):
    '''Fallback search using Jaro-Winkler similarity, used when NLTK
    does not find a sufficiently good match. Scans up to 15 positions
    on either side of index i. Overall this improves accuracy.'''
    bm = best_match
    for a in range(15):
        if i - a >= 0:
            score = jellyfish.jaro_winkler_similarity(element, script_list[i - a])
            if score > bm:
                bm = score
                best_match_script = script_list[i - a]
        if i + a < len(script_list):
            score = jellyfish.jaro_winkler_similarity(element, script_list[i + a])
            if score > bm:
                bm = score
                best_match_script = script_list[i + a]
    return bm, best_match_script
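
A usage sketch: the scan widens one position per iteration on each side of index i, up to 15 steps:

script = ["to be", "or not to be", "that is the question"]
score, match = alternative_search("that is the questio", script, 0.0, "", 2)
print(match)  # "that is the question"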
Example #9
def lemmatize(tokens: List[str]):
    """
    Accepts a list of tokens and returns a list containg
    lemmas of those tokens.

    Input(s):
    1) tokens - A list containg tokens to be lemmatized.

    Output(s):
    1) lemma_list - A list containing lemmas of tokens for which
                    lemmas could be found and the tokens themselves
                    for which no lemmas were found.
    """

    lemma_list = []
    for token in tokens:
        bigrams = get_character_ngrams(generate_stem_words(token), 2)
        options = db.search(where('letter') == token[0])
        options = options[0] if options else options
        similarity_score = len(token)

        if options:
            if token in options["words"]:
                lemma_list.append(token)
            else:
                candidates = []
                for lemma in options["words"]:
                    temp = get_distance(token, lemma)
                    if (temp != -1) and (temp <= similarity_score):
                        similarity_score = temp
                        candidates.append(lemma)
                similarity_score = 0.0
                jw_similarity_score = 0.0
                add = ""
                for i in candidates:
                    cand_big = options["words"][i]
                    temp = similarity(bigrams, cand_big)
                    temp_jw = jaro_winkler_similarity(token, i)
                    if (temp > similarity_score) and (temp_jw >
                                                      jw_similarity_score):
                        similarity_score = temp
                        jw_similarity_score = temp_jw
                        add = i
                if round(similarity_score) == 1:
                    lemma_list.append(add)
                else:
                    lemma_list.append(token)
        else:
            lemma_list.append(token)

    return list(zip(tokens, lemma_list))
Example #10
def mra_1_to_all(word, all_words, threshold):
    similar_list = []
    for j, w2 in enumerate(all_words):
        if word == w2:  # skip -- same word
            continue

        # Must be similar according to the Match Rating Comparison (MRA codes)
        if jellyfish.match_rating_comparison(word, w2):
            # ...and the Jaro-Winkler score must also meet the threshold
            if jellyfish.jaro_winkler_similarity(word, w2) >= threshold:
                similar_list.append(w2)

    return similar_list
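
A usage sketch: a candidate must pass both the Match Rating comparison and the Jaro-Winkler threshold to be returned:

words = ["catherine", "katherine", "kathryn", "dog"]
print(mra_1_to_all("catherine", words, 0.85))  # ['katherine']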
Example #11
def measure_distance(word1, word2, distance_type):
    if distance_type == 'lv':
        distance = Levenshtein.eval(word1, word2)
    elif distance_type == 'dlv':
        distance = jellyfish.damerau_levenshtein_distance(word1, word2)
    elif distance_type == 'jw':
        # Jaro-Winkler measures similarity, so we take the negative
        distance = -jellyfish.jaro_winkler_similarity(word1, word2)
    elif distance_type == 'j':
        distance = -jellyfish.jaro_similarity(word1, word2)
    elif distance_type == 'hm':
        distance = jellyfish.hamming_distance(word1, word2)
    else:
        raise ValueError(f"unknown distance type: {distance_type}")
    return distance
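
A usage sketch of the jellyfish-backed branches (the 'lv' branch assumes a separate Levenshtein module exposing eval):

print(measure_distance("kitten", "sitting", "dlv"))  # 3
print(measure_distance("kitten", "sitting", "jw"))   # ~-0.75 (negated similarity)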
Example #12
def fuzzy_match(s1, s2, max_dist=.8):
    '''
    Fuzzy match the given two strings against a similarity threshold
    Args:
        s1:
            string: First string
        s2:
            string: Second string
        max_dist:
            float: Similarity threshold (despite the name) - default: 0.8
    Returns:
        bool:
            True if the jellyfish jaro_winkler_similarity (based on
            https://en.wikipedia.org/wiki/Jaro-Winkler_distance) is at
            least max_dist
    '''
    return jellyfish.jaro_winkler_similarity(s1, s2) >= max_dist
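
A usage sketch; note that max_dist effectively acts as a minimum similarity threshold:

print(fuzzy_match("jellyfish", "smellyfish"))  # True  (similarity ~0.90)
print(fuzzy_match("apple", "orange"))          # False (similarity ~0.58)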
Example #13
def get_nearest_string_from_list(string, string_list, threshold=0.75):
    matching_item = None
    closest_dist = threshold
    for list_item in string_list:
        dist = jellyfish.jaro_winkler_similarity(string.lower(),
                                                 list_item.lower())
        if dist > closest_dist:
            matching_item = list_item
            closest_dist = dist
        print(f"    {string} {list_item} {dist}")

    if matching_item is None:
        print("No Match Found.")
        raise ValueError(f"no match found for '{string}'")
    print(f"Match: {string} {matching_item} {closest_dist}")
    return matching_item, closest_dist
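
A usage sketch:

item, score = get_nearest_string_from_list("colr", ["color", "shape", "size"])
# prints per-candidate scores, then: Match: colr color 0.95...
print(item)  # color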
Example #14
    def comparacion_pares(self, texto1, texto2, tipo="levenshtein", norm=None):
        """
        Permite hacer comparaciones entre dos textos de entrada, de acuerdo a \
        un tipo de distancia o similitud determinado.

        :param texto1: Primer texto de interés a comparar.
        :type texto1: str
        :param texto2: Segundo texto de interés a comparar.
        :type texto2: str
        :param tipo: Criterio de comparación a utilizar entre los textos. \
            Valor por defecto `'levenshtein'`.
        :type tipo: {'damerau_levenshtein', 'levenshtein', 'hamming', \
            'jaro_winkler', 'jaro'}, opcional
        :param norm: Permite normalizar los resultados en función de la \
            longitud de los textos. Si `norm = 1` se normaliza en función al \
            texto más corto, si `norm = 2` se normaliza en función al texto \
            de mayor extensión.
        :type norm: {1,2}, opcional
        :return: (float) Valor resultado de la comparación entre `texto1` y \
            `texto2`.
        """
        tipo = tipo.lower()
        if "damerau" in tipo:
            salida = jellyfish.damerau_levenshtein_distance(texto1, texto2)
        elif "levenshtein" in tipo:
            salida = jellyfish.levenshtein_distance(texto1, texto2)
        elif "hamming" in tipo:
            salida = jellyfish.hamming_distance(texto1, texto2)
        elif "winkler" in tipo:
            salida = jellyfish.jaro_winkler_similarity(texto1, texto2)
        elif "jaro" in tipo:
            salida = jellyfish.jaro_similarity(texto1, texto2)
        else:
            print(
                (
                    "Please select a valid criterion "
                    "for comparing the strings."
                )
            )
            return None
        if norm in [1, 2] and "jaro" not in tipo:
            if norm == 1:
                salida /= min(len(texto1), len(texto2))
            else:
                salida /= max(len(texto1), len(texto2))
        return salida
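
A usage sketch, assuming an instance c of the enclosing class:

c.comparacion_pares("Levenshtein", "Levenstein")          # 1 (one deletion)
c.comparacion_pares("Levenshtein", "Levenstein", norm=2)  # 1/11, normalized by the longer text
c.comparacion_pares("casa", "caza", tipo="jaro_winkler")  # similarity ~0.87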
Example #15
    def get_noise_results(self, orig_word, list_of_similar):
        similar = []
        added_words = set({})
        for w2 in list_of_similar:
            if orig_word == w2:
                continue

            score = jellyfish.jaro_winkler_similarity(orig_word, w2)
            if score >= self.threshold and w2 not in added_words:
                similar.append((w2, score))  # (word, score)
                added_words.add(w2)

        # few or zero results ..
        if len(similar) < self.cnt_error_samples:
            return similar

        # otherwise return the first `cnt_error_samples` results
        # (uncomment below to return the top-scoring ones instead)
        # similar = sorted(similar, key=lambda entry: entry[1], reverse=True)
        return similar[:self.cnt_error_samples]
Example #16
def string_similarity(string1, string2):
    """

    Args:
        string1 (str): Primeira string que vai ser comparada
        string2 (str): Segunda string que vai ser comparada

    Returns:
        float: O quão similar são as strings, podendo ir de 0.0 a 1.0

    Examples:
        >>> string_similarity('string 1', 'string 1')
        1.0
        >>> string_similarity('string 1', 'string 2')
        0.95
        >>> string_similarity('abc', 'bcd')
        0.0
        >>> string_similarity('apple', 'appel')
        0.9533333333333333

    """
    return jaro_winkler_similarity(str(string1), str(string2))
Example #17
def _similarity_compare(words: list, compare_to):
    if not isinstance(compare_to, list):
        compare_to = [compare_to]

    # Split multi-word entries into individual words first;
    # mutating the list while iterating over it would skip items.
    split_compare_to = []
    for item in compare_to:
        split_compare_to.extend(item.split(" "))

    points = 0
    running_total = 0

    for c in split_compare_to:
        for w in words:
            score = jellyfish.jaro_winkler_similarity(w.lower(), c.lower())
            if score != 0.0:
                running_total += score
                points += 1

    if points != 0:
        running_total /= points

    return running_total
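
A usage sketch: the multi-word string is split into words, and the nonzero pairwise Jaro-Winkler scores are averaged:

print(_similarity_compare(["john", "smith"], "Jon Smith"))  # ~0.97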
Example #18
    def comparacion_pares(self, texto1, texto2, tipo='levenshtein', norm=None):
        """ Compares two input texts according to a given distance or \
            similarity criterion.

        :param texto1: (str) First text of interest to compare.
        :param texto2: (str) Second text of interest to compare.
        :param tipo: (str) {'damerau_levenshtein', 'levenshtein', 'hamming', 'jaro_winkler', \
            'jaro'} Default value: 'levenshtein'. Comparison criterion to use between the texts.
        :param norm: (int) {1, 2} Default value: None. Normalizes the result \
            by text length. If norm=1, the result is normalized by the \
            shorter text; if norm=2, by the longer one.
        :return: (float or int) Result of the comparison.
        """
        tipo = tipo.lower()
        if 'damerau' in tipo:
            salida = jellyfish.damerau_levenshtein_distance(texto1, texto2)
        elif 'levenshtein' in tipo:
            salida = jellyfish.levenshtein_distance(texto1, texto2)
        elif 'hamming' in tipo:
            salida = jellyfish.hamming_distance(texto1, texto2)
        elif 'winkler' in tipo:
            salida = jellyfish.jaro_winkler_similarity(texto1, texto2)
        elif 'jaro' in tipo:
            salida = jellyfish.jaro_similarity(texto1, texto2)
        else:
            print(
                'Please select a valid criterion for comparing the strings.'
            )
            return None
        if norm in [1, 2] and 'jaro' not in tipo:
            if norm == 1:
                salida /= min(len(texto1), len(texto2))
            else:
                salida /= max(len(texto1), len(texto2))
        return salida
Example #19
def jaro_winkler_distance(A, B):
    return 1 - jaro_winkler_similarity(A, B)
Example #20
def jaroDistance(x, y):
    # Note: despite the name, this computes a Jaro-Winkler distance.
    d = 1 - jellyfish.jaro_winkler_similarity(x, y)
    return d
Example #21
                print(f"Node {node1} is not found in network `{network}`.")
            try:
                node2_neighbors.extend(
                    [x for x in networks_dict[network].neighbors(node2)])
            except nx.exception.NetworkXError:
                print(f"Node {node2} is not found in network `{network}`.")
        return (sorted(list(set(node1_neighbors))),
                sorted(list(set(node2_neighbors))))

    similar_names = []
    for name in performer_names:
        for cmp in [
                x for x in performer_names
                if x != name and "unnamed" not in x.lower()
        ]:
            fsh = jellyfish.jaro_winkler_similarity(name, cmp)
            if fsh > THRESHOLD:
                if ((name, cmp, fsh) not in similar_names
                        and (cmp, name, fsh) not in similar_names):
                    neighbors1, neighbors2 = compare_neighbors(
                        name, cmp, networks)
                    similar_names.append(
                        (name, cmp, fsh, neighbors1, neighbors2))

    file_name = f"{PREFIX}-report-similar-names.json"

    if similar_names:
        with open("network-app/data/" + file_name, "w+") as fp:
            json.dump(obj=similar_names, fp=fp)

        print(
Example #22
 def getTermScore(node):
     s1 = ' '.join(node.cleanSearchedTerm)
     s2 = ' '.join(node.cleanTermTokens)
     return jaro_winkler_similarity(s1, s2)
Example #23
 def jar(self):
     return jellyfish.jaro_winkler_similarity(
         self.translatable_word.english_word, self.readable_word)
Example #24
def jaro_winkler(a, b):
    return jf.jaro_winkler_similarity(a, b)
Example #25
        return float(numerator) / denominator


def textToVector(text):
    words = WORD.findall(text)
    return Counter(words)

second = open("reinterpreted_file_1.txt", "r")
first = open("reinterpreted_file_2.txt", "r")

text1 = second.read()
text2 = first.read()

second.close()
first.close()

vector1 = textToVector(text1)
vector2 = textToVector(text2)

cosine = calculateCosineSimilarity(vector1, vector2)

data = {
    'cosine': cosine,
    'jaro_similarity': jellyfish.jaro_similarity(text1, text2),
    'jaro_winkler_similarity': jellyfish.jaro_winkler_similarity(text1, text2),
    'levenshtein_distance': jellyfish.levenshtein_distance(text1, text2),
    'damerau_levenshtein_distance': jellyfish.damerau_levenshtein_distance(text1, text2),
    'hamming_distance': jellyfish.hamming_distance(text1, text2)

}

with open('results.txt', 'w') as outfile:
    json.dump(data, outfile)
Example #26
def submit_answers(request, langy_session_id):
    if request.method == 'POST':
        # Get, update and save LangySession
        langy_session = get_object_or_404(LangySession, pk=langy_session_id)
        langy_session.end_time = timezone.now()
        langy_session.save()

        # Get data from the request
        json_data = json.loads(request.body)
        answers = json_data['answers']
        if len(answers) == 0:
            return HttpResponseBadRequest('No answers received in request')
        
        # Prepare to create a response with results and create new LearningTraces
        response_results = []
        for answer in answers:
            translation = get_object_or_404(Translation, pk=answer['translation_id'])

            # Get user answer, ignore capitalisation
            user_english = answer['user_english'].lower()

            # Get correct answer(s)
            true_english = translation.translatable_word.english_word.lower()
            synonyms = [syn.english_word for syn in translation.translatable_word.synonyms.all()]
            if len(synonyms) != 0:
                # Find the closest word to the user's input
                max_sim = jellyfish.jaro_winkler_similarity(user_english, true_english)
                for syn in synonyms:
                    sim = jellyfish.jaro_winkler_similarity(user_english, syn)
                    if sim > max_sim:
                        max_sim = sim
                        true_english = syn

            # Evaluate user answer
            correct = user_english == true_english
            typo = False

            # Typos: plurals
            # Allow missing or additional 's'
            # Some foreign words e.g. Swedish "djur" (animal/animals) are the same for singular/plural
            if (user_english == true_english+'s' or user_english+'s' == true_english):
                correct = True
                typo = True
            
            # Typos: typing error tolerance
            # Allow one accidental character insertion, deletion, substitution or transposition
            if jellyfish.damerau_levenshtein_distance(user_english, true_english) == 1:
                correct = True
                typo = True

            # Add result to list for response
            response_results.append({
                'translation_id': answer['translation_id'],
                'true_english': true_english,
                'correct': correct,
                'typo': typo,
            })

            # Prepare to create a new LearningTrace
            # Find previous LearningTrace object for this Translation
            prev = (request.user.traces
                .filter(translation=translation)
                .filter(translation__foreign_language = request.user.active_language.foreign_language)
                .last())
            if prev is None:
                continue  # next answer

            LearningTrace.objects.create(
                session = langy_session,
                user = request.user,
                # Tracing
                translation = translation,
                prev = prev,
                # Statistics
                seen = prev.seen + 1,
                interacted = prev.interacted,
                tested = prev.tested + 1,
                correct = prev.correct + 1 if correct else prev.correct,
            )

        return JsonResponse({
            'results': response_results
        })

    else:
        return HttpResponseBadRequest('Invalid request method')
Example #27
inFile = open('dump-sorted-uniq.txt', mode = 'r')
lines = inFile.read()
print(lines)
inFile.close()
myAuthors = []
count = 0

# split on non-escaped '
currLineCommaSep = lines.split(", ")
print("Quote separated: " + str(currLineCommaSep))
# parse input -- everything between non-escaped quotes is a new author to add to the list
for newAuthor in currLineCommaSep:
    # comparing string literals with `is not` is unreliable; use != instead
    if (newAuthor != "" and newAuthor != "\n" and "," not in newAuthor):
        print("Found author #"+str(count)+" : "+str(newAuthor))
        # add each author to list
        myAuthors.append(newAuthor.strip())
        count = count + 1

numAuthors = len(myAuthors)
# now that we have all authors, run deduplication -- print out likely matches
for currAuthor in range(numAuthors):
    restOfAuthors = currAuthor + 1
    # compare author with all subsequent authors, searching for close matches
    while (restOfAuthors < numAuthors):
        #print("Comparing "+str(currAuthor)+" to "+str(restOfAuthors))
        similarity = jellyfish.jaro_winkler_similarity(myAuthors[currAuthor], myAuthors[restOfAuthors])
        # arbitrary threshold at the moment -- anecdotally, anything less than this leads to a large number of false positives
        if (similarity > 0.94):
            print("Similarity ("+str(similarity)+"): "+myAuthors[currAuthor]+", "+myAuthors[restOfAuthors])
        restOfAuthors += 1
Example #28
def jaro_winkler_similarity(s1, s2):
    # None-safe wrapper; compare with `is None` rather than `== None`
    return None if s1 is None or s2 is None else J.jaro_winkler_similarity(
        s1, s2)
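
A usage sketch of the None-safe wrapper:

print(jaro_winkler_similarity(None, "abc"))   # None
print(jaro_winkler_similarity("abc", "abd"))  # ~0.82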