Example #1
def find_searchedperson(first_name, last_name, national_id):
    '''
    Try super hard to match this person up. Better to return a bad
    match than no match - we can manually review bad matches.
    '''
    if national_id in persons_by_id:
        yield persons_by_id[national_id]

    last_name_norm = normalize_string(last_name)
    matches = []
    if last_name_norm in persons_by_last_name:
        matches = persons_by_last_name[last_name_norm]
    else:
        for key in persons_by_last_name.keys():
            if jaro(last_name_norm, key) > 0.9:
                matches.extend(persons_by_last_name[key])

    first_name_norm = normalize_string(first_name)
    for match in matches:
        key = normalize_string(match.first_name)
        # A match is valid if:
        # 1. We don't have a first name because only the last name
        #    was used in a failed search.
        # 2. The entire first name string is similar.
        # 3. One of the first names is similar.
        if (not key and isinstance(match, SearchedPersonNotFound)) \
                or jaro(first_name_norm, key) > 0.9 \
                or has_matching_word(first_name_norm, key):
            yield match
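
All of these snippets hinge on a jaro cutoff around 0.9. For a quick feel of how tolerant that threshold is, here is a minimal sketch; it assumes jaro comes from the python-Levenshtein package, which matches the ratio/jaro_winkler calls used elsewhere on this page (values are approximate):

from Levenshtein import jaro

print(jaro("johnson", "jonson"))    # ~0.95, passes the 0.9 cutoff
print(jaro("johnson", "johnston"))  # ~0.96, passes
print(jaro("johnson", "peterson"))  # ~0.60, rejected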
Example #2
    def compare_two_names(name1, name2, max_splits=7):
        def normalize_name(s):
            return re.sub(r"\s+", " ", s.lower().strip().replace("-", " "))

        def slugify_name(s):
            return (
                s.replace(" ", "")
                .replace(".", "")
                .replace('"', "")
                .replace("'", "")
                .replace("’", "")
            )

        name1 = normalize_name(name1)
        name2 = normalize_name(name2)

        if slugify_name(name1) == slugify_name(name2):
            return 1

        if jaro(name1, name2) > 0.95:
            return 1

        splits = name2.split(" ")
        limit = reduce(mul, range(1, max_splits + 1))

        if len(splits) > max_splits:
            print("Too much permutations for {}".format(name2))

        return max(
            jaro(name1, " ".join(opt)) for opt in islice(permutations(splits), limit)
        )
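
The limit computed with reduce(mul, ...) is just max_splits! (5040 for the default of 7), capping how many orderings are tried. The permutations matter because Jaro is order-sensitive; a minimal sketch (again assuming python-Levenshtein's jaro):

from itertools import permutations
from Levenshtein import jaro

name1 = "maria lopez"
name2 = "lopez maria"
print(jaro(name1, name2))  # mediocre: same words, different character order
print(max(jaro(name1, " ".join(p))
          for p in permutations(name2.split(" "))))  # 1.0 once reordered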
Example #3
def full_compare(name1, name2):
    name1 = _normalize_name(name1)
    name2 = _normalize_name(name2)
    slugified_name1 = _slugify_name(name1)
    slugified_name2 = _slugify_name(name2)

    if slugified_name1 == slugified_name2:
        return True

    if slugified_name1.startswith(slugified_name2) and len(slugified_name2) >= 10:
        return True

    if slugified_name2.startswith(slugified_name1) and len(slugified_name1) >= 10:
        return True

    if slugified_name1.endswith(slugified_name2) and len(slugified_name2) >= 10:
        return True

    if slugified_name2.endswith(slugified_name1) and len(slugified_name1) >= 10:
        return True

    if jaro(slugified_name1, slugified_name2) > 0.95:
        return True

    if _compare_two_names(name1, name2):
        return True

    if _compare_two_names(name2, name1):
        return True

    return _thorough_compare(name1, name2) or _thorough_compare(name2, name1)
Example #5
def get_jaro_to_list(first4jaro, list4jaro, factor=0.9):
    result = [[0 for x in range(len(list4jaro))]
              for y in range(len(first4jaro))]
    loc_data = 0.0
    # If loc_data stays 0, we take the first pair
    loc_i = 0
    loc_j = 0
    for i, item in enumerate(first4jaro):
        for j, data in enumerate(list4jaro):
            if (item[1] == "") or (data[1] == ""):
                result[i][j] = jaro(item[0], data[0])
            else:
                result[i][j] = jaro(item[0], data[0]) * jaro(item[1], data[1])
            if result[i][j] > loc_data:
                loc_data = result[i][j]
                loc_i = i
                loc_j = j
    first2return = first4jaro[:loc_i] + first4jaro[loc_i + 1:]
    list4return = list4jaro[:loc_j] + list4jaro[loc_j + 1:]
    if (len(first2return) == 0) or (len(list4return) == 0):
        dif = abs(len(first2return) - len(list4return))
        return loc_data * loc_data * math.pow(factor, dif)
    else:
        return loc_data * loc_data * get_jaro_to_list(first2return,
                                                      list4return,
                                                      factor=factor)
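
Each recursion level picks the best-scoring pair across the two lists, squares that score, removes both items and recurses, so the result is a product of squared pair scores; factor ** dif then penalizes whatever is left unmatched on the longer side. A hypothetical call (the (name, metaphone) tuples are invented for illustration, and the imports are what the snippet appears to rely on):

import math                   # used by get_jaro_to_list above
from Levenshtein import jaro  # assumed source of jaro

first = [("john", "JN"), ("smith", "SM0")]
other = [("jon", "JN"), ("smyth", "SM0")]
print(get_jaro_to_list(first, other))  # product of squared best-pair scores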
Example #6
def _get_name(name, first_name, gender):
    for prefix in ('eng', 'dr', 'hr', 'phd'):
        if first_name.startswith(prefix + ' '):
            first_name = first_name[len(prefix) + 1:]
    first_name = first_name.replace(' ', '')
    if first_name in ('abd', 'abdel', 'عبد') or first_name == '':
        ns = name.split(' ')
        segs = []
        for s in ns:
            segs.append(s)
            if s not in ('abd', 'el', 'عبد'):
                break
        first_name = ' '.join(segs).replace(' ', '')

    if is_arabic(first_name):
        return first_name

    if 'female' in gender:
        gender = 'female'
    elif 'male' in gender:
        gender = 'male'
    if gender == 'unknown':
        if first_name in males_en or first_name in males:
            gender = 'male'
        elif first_name in females_en or first_name in females:
            gender = 'female'

    if gender == 'unknown':
        nearest = -1
        nearest_gender = 'unknown'
        for en_name in males_en:
            similarity = jaro(first_name, en_name)
            if similarity > nearest:
                nearest_gender = 'male'
                nearest = similarity
            if similarity == nearest and nearest_gender == 'female':
                nearest_gender = 'unknown'
        for en_name in females_en:
            similarity = jaro(first_name, en_name)
            if similarity > nearest:
                nearest_gender = 'female'
                nearest = similarity
            if similarity == nearest and nearest_gender == 'male':
                nearest_gender = 'unknown'
        gender = nearest_gender

    d = males if gender == 'male' else females if gender == 'female' else unknowns
    if first_name in d:
        return d[first_name]

    res = translator.translate(first_name, dest='ar', src='en').text
    d[first_name] = res
    return res
Example #7
def get_compared_data_file(data, language="en", data_kind="surname"):
    '''
    This function will compare the given name with the current data input
    '''
    if language in LANGUAGES_FILES and data_kind in LANGUAGES_FILES[language]:
        data_in_met = adapted_doublemetaphone(data, language=language)
        total_data = []
        for word, met_value in LANGUAGES_DATA[language][data_kind].items():
            if met_value == data_in_met:
                total_data.append(word)
        # If the value is already available, we just return it
        if data in LANGUAGES_DATA[language][data_kind]:
            return data, 1.0
        data_temp = data.lower()
        norm = LANGUAGES_FILES[language]["normalize"]
        for notnorm, replacement in norm.items():
            data_temp = data_temp.replace(notnorm, replacement)
        results = {}
        for candidate in total_data:
            candidate_temp = candidate.lower()
            for notnorm, replacement in norm.items():
                candidate_temp = candidate_temp.replace(notnorm, replacement)
            results[candidate] = jaro(candidate_temp, data_temp)
        if results:
            return max(results, key=results.get), max(results.values())
        return data, -1.0
    return data, -1.0
Example #8
def score_reconciliation(txn, payment):
    words = list(filter(None, re.split(r"\W+", txn.payee)))

    bankref_parts = [payment.bankref[:4], payment.bankref[4:]]
    bankref_distances = [ratio(w, p) for w in words for p in bankref_parts]
    # Get the two best matches, for the two parts of the bankref
    # A match gives 1.0, a 2-char substring 0.666, and a 6-char superstring 0.857
    bankref_score = sum(sorted(bankref_distances)[-2:])
    name_score = jaro(txn.payee, payment.user.name)

    other_score = 0.0

    if txn.amount == payment.amount:
        other_score += 0.4

    if txn.account.currency == payment.currency:
        other_score += 0.6

    # check posted against expiry?

    app.logger.debug(
        "Scores for txn %s payment %s: %s %s %s",
        txn.id,
        payment.id,
        bankref_score,
        name_score,
        other_score,
    )
    return bankref_score + name_score + other_score
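
sorted(bankref_distances)[-2:] keeps the two highest ratios, one per bankref half when the reference survived in the statement text. A toy illustration (assuming python-Levenshtein's ratio):

from Levenshtein import ratio

words = ["PAYMENT", "REF", "ABCD", "EFGH"]  # tokens from txn.payee
parts = ["ABCD", "EFGH"]                    # the two bankref halves
dists = [ratio(w, p) for w in words for p in parts]
print(sum(sorted(dists)[-2:]))              # 2.0: both halves matched exactly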
Example #9
File: similarity.py Project: zkrhm/fdn-ner
    def update(self, idx, idy, get_word):
        # ox = self.store.get(idx)
        # oy = self.store.get(idy)

        #save distance:
        # self.vec[idx,idy] =

        stime = datetime.now()
        try:
            idx = int(idx)
            idy = int(idy)

            if idx == idy:
                return

            ox = self.store.get(str(idx))
            oy = self.store.get(str(idy))

            if ox is None or oy is None:
                return
            w1 = get_word(ox)
            w2 = get_word(oy)
            simi = jaro(w1, w2)

            logger.debug("({},{}) vs ({},{}) : {}".format(
                idx, w1, idy, w2, simi))

            self.store.set_entry(idx, idy, simi)

        except Exception as e:
            logger.debug("(idx : {} type : {})".format(idx, type(idx)))
            raise e
        ntime = datetime.now()

        logger.debug("processing time : {}".format(ntime - stime))
Example #10
File: names.py Project: dchaplinsky/edrdr
def full_compare(name1, name2):
    def normalize_name(s):
        return re.sub(r"\s+", " ", s.strip().replace("-", " "))

    def slugify_name(s):
        # Strip spaces, punctuation and apostrophe variants, fold the
        # lookalike letters, then drop any digits
        for ch in (" ", ".", '"', "'", "`", "’", "ʼ", "ь"):
            s = s.replace(ch, "")
        s = s.replace("є", "е").replace("i", "и")
        return re.sub(r"\d+", "", s)

    name1 = normalize_name(name1)
    name2 = normalize_name(name2)
    slugified_name1 = slugify_name(name1)
    slugified_name2 = slugify_name(name2)

    if slugified_name1 == slugified_name2:
        return True

    if slugified_name1.startswith(
            slugified_name2) and len(slugified_name2) >= 10:
        return True

    if slugified_name2.startswith(
            slugified_name1) and len(slugified_name1) >= 10:
        return True

    if jaro(slugified_name1, slugified_name2) < 0.6:
        return False

    if jaro(slugified_name1, slugified_name2) > 0.95:
        return True

    if _compare_two_names(name1, name2):
        return True

    if _compare_two_names(name2, name1):
        return True

    return False
Example #11
def fetch_answer(Q_input):
    score_list = list(map(lambda x: jaro(x, Q_input), Q_list))
    highest_score = max(score_list)
    highest_score_index = score_list.index(highest_score)
    selected_answer = A_list[highest_score_index]
    score_list.pop(highest_score_index)
    second_highest_score = max(score_list)
    return selected_answer, highest_score, second_highest_score
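
This is a nearest-neighbour FAQ lookup; returning the runner-up score lets the caller treat a small gap between the top two scores as low confidence. A minimal run (Q_list and A_list are module-level globals in this snippet, so defining toy values before the call is enough):

Q_list = ["how do i reset my password", "how do i delete my account"]
A_list = ["Use the reset link on the login page.", "Contact support."]

answer, best, second = fetch_answer("reset my password")
print(answer, best - second)  # a large gap suggests a confident match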
Example #12
def predict_ner():
    '''
    This will return the similarity between two strings
    '''

    str1 = str(request.form['string1'])
    str2 = str(request.form['string2'])
    result = jaro(str1, str2)
    return render_template('ner.html', prediction_text='{}'.format(result))
Example #13
    def cmp_auth(self, s1, s2):
        if len(s1) != len(s2):
            return False, False
        matching, in_order = True, True
        sim_mat = [[jaro(s1[y], s2[x]) for x in range(len(s2))]
                   for y in range(len(s1))]
        for i in range(len(s2)):
            if max(sim_mat[i]) != sim_mat[i][i]:
                in_order = False
            if max(sim_mat[i]) < self.threshold['author']:
                matching = False
        return matching, in_order
Example #14
def compare_fingerprints(left, right):
    result = 0
    left_list = ensure_list(left.get('fingerprints'))
    right_list = ensure_list(right.get('fingerprints'))
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
        score = similarity * dampen(3, 20, min(left, right, key=len))
        result = max(result, score)
    return result
Example #15
def compare_names(left, right):
    result = 0
    left_list = list(_normalize_names(left.names))
    right_list = list(_normalize_names(right.names))
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
        score = similarity * dampen(2, 20, shortest(left, right))
        result = max(result, score)
    return result
Example #16
def compare_names(left, right):
    result = 0
    left_list = [normalize(n, latinize=True) for n in left.names]
    right_list = [normalize(n, latinize=True) for n in right.names]
    for (left, right) in itertools.product(left_list, right_list):
        similarity = jaro(left, right)
        score = similarity * dampen(2, 20, shortest(left, right))
        result = max(result, score)
    return result
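
The dampen helper isn't shown in Examples #14-#16; judging by the call sites, it scales a similarity down when the shortest compared string is under some length, so near-identical short strings don't dominate. A rough stand-in with that behaviour (a guess for illustration, not the project's actual helper):

def dampen(short, long, text):
    # Full weight for strings of at least `long` characters, shrinking
    # linearly to zero at `short` characters or fewer (guessed behaviour).
    length = len(text)
    if length >= long:
        return 1.0
    return max(0.0, (length - short) / float(long - short))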
Example #17
def get_classify(tuple_cont):
    '''判断当前站点所属类别,该函数为进程池目标函数
    '''
    try:
        if jaro(Config.ipc_r_kw.value[0], tuple_cont[0]) > Config.edit_dist_benchmark:
            Config.ipc_list_url.append(tuple_cont[1] + '\n')
    except Exception:
        # print '#############################################'
        # print Config.ipc_r_kw.value[1]
        pass
Example #18
def get_name_from_fullname(full_name,
                           list_father_surnames,
                           list_mother_surnames,
                           language="en"):
    '''
    Given a full name, including surnames, this function returns the first
    name of the person by removing the person's surnames
    '''
    merged_list = list_father_surnames + list_mother_surnames
    for surname in merged_list:
        temp_surname = surname.split(" ")
        if len(temp_surname) > 1:
            for i, _ in enumerate(temp_surname):
                if temp_surname[i] in LANGUAGES_ADDS[language]:
                    temp_surname[i] = ""
            new_surname = " ".join(temp_surname).strip()
            if new_surname not in merged_list:
                merged_list.append(new_surname)
    merged_metaphones = []
    for data in merged_list:
        if adapted_doublemetaphone(data, language) not in merged_metaphones:
            merged_metaphones.append(adapted_doublemetaphone(data, language))
    full_name_list = get_splitted_name_from_complete_name(full_name, language)
    for i, value in enumerate(full_name_list[0]):
        # Remove the connector particles that each language uses inside
        # surnames
        check_surname = value.split(" ")
        if len(check_surname) > 1:
            for j, _ in enumerate(check_surname):
                if check_surname[j].lower() in LANGUAGES_ADDS[language]:
                    check_surname[j] = ""
        adapted_surname = "".join(check_surname).rstrip()
        if (adapted_doublemetaphone(value, language)
                in merged_metaphones) or (adapted_doublemetaphone(
                    adapted_surname, language) in merged_metaphones):
            # The metaphone algorithm is not perfect, so we cross-check for
            # data that is phonetically very close but written differently
            similar = 0
            for compared in merged_list:
                if jaro(adapted_surname, compared) > similar:
                    similar = jaro(adapted_surname, compared)
            if similar > THRESHOLD_JARO:
                full_name_list[0][i] = ""
    return " ".join(full_name_list[0]).rstrip()
Example #19
def fuzzyStr(perc, str_one, str_two):
    if all(i in str_two.split() for i in str_one.split()):
        return True
    try:
        perc = int(perc)
    except (TypeError, ValueError):
        return False
    return int(jaro(str_one, str_two) * 100) >= perc
Example #20
def score_of_given_name_and_meta(first4jaro,
                                 list4jaro,
                                 name1,
                                 name2,
                                 factor=0.9):
    '''
    This function will take the maximum score between the direct comparison of the name and the phonetic comparison
    '''
    score_compare = jaro(name1, name2)
    score_met = get_jaro_to_list(first4jaro, list4jaro, factor=factor)
    return max(score_met, score_compare * score_compare)
Example #21
def compare(x, y):
    if x is None:
        return [0, x, y]

    x = normalize(x)
    x_tokens = [word.lower() for word in tokenize(x) if word.isalnum()]

    y = normalize(y)
    y_tokens = [word.lower() for word in tokenize(y) if word.isalnum()]

    ppx = untokenize(x_tokens)
    ppy = untokenize(y_tokens)

    return [jaro(ppx, ppy), ppx, ppy]
Example #22
File: names.py Project: dchaplinsky/edrdr
def _compare_two_names(name1,
                       name2,
                       max_splits=7,
                       straight_limit=0.93,
                       smart_limit=0.95):
    splits = name2.split(" ")

    straight_similarity = jaro(name1, name2)
    if straight_similarity > smart_limit:
        return True

    if straight_similarity > straight_limit:
        min_pair_distance = 1
        for a, b in zip_longest(name1.split(" "), splits):
            if a is not None and b is not None:
                min_pair_distance = min(jaro(a, b), min_pair_distance)

        if min_pair_distance > 0.8:
            if len(splits) > 1 and DEBUG:
                tqdm.write("Hmmm, looks like a match {}\t{}".format(
                    name1, name2))
            return True
        else:
            if len(splits) > 1 and DEBUG:
                tqdm.write("Check if it's match: {}\t{}".format(name1, name2))

    limit = reduce(mul, range(1, max_splits + 1))

    if len(splits) > max_splits and DEBUG:
        tqdm.write("Too much permutations for {}".format(name2))

    max_similarity = max(
        jaro(name1, " ".join(opt))
        for opt in islice(permutations(splits), limit))

    return max_similarity > smart_limit
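
The zip_longest pass guards against a high whole-string score hiding one badly mismatched token: even when jaro(name1, name2) clears straight_limit, every aligned word pair must also stay above 0.8. For instance (assuming python-Levenshtein's jaro):

from itertools import zip_longest
from Levenshtein import jaro

name1 = "olena petrivna kovalenko"
name2 = "olena petrenko kovalenko"
print(jaro(name1, name2))  # high: the strings agree almost everywhere
print(min(jaro(a, b)
          for a, b in zip_longest(name1.split(" "), name2.split(" "))
          if a is not None and b is not None))  # much lower for the worst pair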
Example #23
def score_of_given_name_and_meta(first4jaro,
                                 list4jaro,
                                 name1,
                                 name2,
                                 factor=0.9):
    '''
    This function will take the maximum score between the direct comparison of the name and the phonetic comparison
    '''
    # Jaro behaves oddly for names of very different lengths; with this
    # modification we penalize length differences heavily
    len_factor = (abs((len(name1) - len(name2))) / max(len(name1), len(name2)))
    score_compare = jaro(name1, name2)
    score_met = get_jaro_to_list(first4jaro, list4jaro, factor=factor)
    if (len_factor < 0.33) or (1 - len_factor) * (1 - len_factor) > max(
            score_met, score_compare * score_compare):
        return max(score_met, score_compare * score_compare)
    # Only fall through to the penalized score when it is lower
    else:
        return (1 - len_factor) * (1 - len_factor)
Example #24
File: test_rst.py Project: AcrDijon/henet
    def test_parse(self):
        for file in os.listdir(SAMPLE_DIR):
            if not file.endswith(".rst"):
                continue
            filename = os.path.join(SAMPLE_DIR, file)
            article = parse_article(filename)
            rendered = article.render().strip()

            with open(filename, encoding="utf8") as f:
                source = f.read().strip()
                source = source.expandtabs(4)

            if source != rendered:
                lev_ = distance(source, rendered)
                jaro_ = jaro(source, rendered)

                if lev_ > 10 and jaro_ < 0.8 and file not in MUTATED_FILES:
                    print("%d %f %s" % (lev_, jaro_, filename))
                    raise AssertionError(filename)
Example #25
def _compare_two_names(
    name1, name2, max_splits=7, straight_limit=0.70, smart_limit=0.96
):

    straight_similarity = jaro(name1, name2)
    if straight_similarity > smart_limit:
        return True

    if straight_similarity > straight_limit:
        min_pair_distance = 1
        for a, b in zip_longest(name1.split(" "), name2.split(" ")):
            if a is not None and b is not None:
                chunk_distance = _smart_jaro(a, b, func=jaro_winkler)
                min_pair_distance = min(chunk_distance, min_pair_distance)

        if min_pair_distance > 0.88:
            return True

    return False
Example #26
    def match_location(self, location):
        """
        We will mutate the score a bit to add +0.1 to the jaro distance
        for starting with the same letter. Alexa's speech processing system
        really sucks at this.

        location -- the location to match against
        """
        if location:
            matches = []
            for switch_id, switch_func in self.server.switches.items():
                similarity = jaro(location, switch_id)
                if location[0].lower() == switch_id[0].lower():
                    similarity += 0.1
                matches += [(similarity, switch_id, switch_func)]
            matches.sort(key=lambda x: x[0], reverse=True)
            if matches[0][0] >= _MATCH_THRESHOLD:
                return matches[0][1:]
        raise ActionParseError("I didn't understand the location. "
                               "Could you please repeat?")
Example #27
    def annotate(self, training_set):

        #Levenshtein distance - minimum number of single character edits
        distance_udf = udf(lambda x, y: distance(x, y), IntegerType())
        #Levenshtein ratio - similarity of two strings
        ratio_udf = udf(lambda x, y: ratio(x, y), DoubleType())
        #Jaro - similarity score
        jaro_udf = udf(lambda x, y: jaro(x, y), DoubleType())
        #Jaro-winkler - similarity score, which favors strings that match prefix from the beginning
        jaro_winkler_udf = udf(lambda x, y: jaro_winkler(x, y), DoubleType())
        #fuzz partial ratio - gives a score based on how well parts of a string match another
        fuzz_partial_ratio_udf = udf(
            lambda x, y: fuzz.partial_ratio(x, y) / 100, DoubleType())

        training_set = training_set.withColumn("distance", distance_udf("concept_name_1", "concept_name_2")) \
            .withColumn("ratio", ratio_udf("concept_name_1", "concept_name_2")) \
            .withColumn("jaro", jaro_udf("concept_name_1", "concept_name_2")) \
            .withColumn("jaro_wrinkler", jaro_winkler_udf("concept_name_1", "concept_name_2")) \
            .withColumn("fuzz_partial_ratio", fuzz_partial_ratio_udf("concept_name_1", "concept_name_2"))

        return training_set
Example #28
def score_reconciliation(txn, payment):
    words = txn.payee.replace('-', ' ').split(' ')

    bankref_distances = [ratio(w, payment.bankref) for w in words]
    # Get the two best-matching words against the bankref
    bankref_score = sum(sorted(bankref_distances)[-2:])
    name_score = jaro(txn.payee, payment.user.name)

    other_score = 0.0

    if txn.amount == payment.amount:
        other_score += 0.4

    if txn.account.currency == payment.currency:
        other_score += 0.6

    # check posted against expiry?

    app.logger.debug('Scores for txn %s payment %s: %s %s %s',
                     txn.id, payment.id, bankref_score, name_score, other_score)
    return bankref_score + name_score + other_score
Example #29
def match_locality(string, localities):
    ''' Try to figure out which locality 'string' is by
    finding the known localities that have the maximum 
    (jaro) similarity score '''
    if ',' in string:
        parts = string.split(',')
        string = parts[1].strip() + ' ' + parts[0]
    best = 0
    matches = []
    jaro_hits = []
    substring_hits = []
    for loc in localities:
        ulower_string = strip_accents(string.lower())
        ulower_loc_name = strip_accents(loc['loc_name'].lower())
        
        smaller, bigger = ulower_string, ulower_loc_name
        if len(bigger) < len(smaller):
            smaller, bigger = bigger, smaller
        if len(smaller) < len(bigger): # they might actually be the same size
            if smaller in bigger:
                hit = dict(loc)
                hit['score'] = 'sub' 
                substring_hits.append(hit)

        similarity = jaro(ulower_string, ulower_loc_name)
        if similarity > best:
            matches = []
            best = similarity
        if similarity == best: 
            hit = dict(loc)
            hit['score'] = similarity
            matches.append(hit)
    
    jaro_hits = [(m['loc_name'],m['muni_name']) for m in matches]
    for s in substring_hits:
        if (s['loc_name'],s['muni_name']) not in jaro_hits:
            matches.append(s)
    return matches
Example #30
def score_reconciliation(txn, payment):
    words = list(filter(None, re.split(r'\W+', txn.payee)))

    bankref_parts = [payment.bankref[:4], payment.bankref[4:]]
    bankref_distances = [ratio(w, p) for w in words for p in bankref_parts]
    # Get the two best matches, for the two parts of the bankref
    # A match gives 1.0, a 2-char substring 0.666, and a 6-char superstring 0.857
    bankref_score = sum(sorted(bankref_distances)[-2:])
    name_score = jaro(txn.payee, payment.user.name)

    other_score = 0.0

    if txn.amount == payment.amount:
        other_score += 0.4

    if txn.account.currency == payment.currency:
        other_score += 0.6

    # check posted against expiry?

    app.logger.debug('Scores for txn %s payment %s: %s %s %s',
                     txn.id, payment.id, bankref_score, name_score, other_score)
    return bankref_score + name_score + other_score
Example #31
    async def special_matches(self, monster):
        max_score = 0
        for class_attrs in self.monster_class_attributes:
            val: MonsterModel = monster
            for class_attr in class_attrs:
                val: str = getattr(val, class_attr, None)
            if val is None:
                continue
            val: str = val.lower()
            if self.match == "=" and val == self.string:  # Exact match
                return True, MatchData(self)
            elif self.match == "r" and bool(re.search(self.string, val)):  # Regex match
                return True, MatchData(self)
            elif self.match == "g" and fnmatch(val, '*' + self.string + '*'):  # Glob match
                return True, MatchData(self)
            elif self.string in val:
                return True, MatchData(self)
            max_score = max(max_score, jaro(self.string, val))
        if max_score >= TOKEN_JW_DISTANCE:
            return max_score, MatchData(self)
        return False, MatchData(self)
Example #32
def get_common_audios(login: str, password: str, *ids):
    # so far this only handles 2 accounts
    # TODO: add lru_cache
    vk_session = vk_api.VkApi(login, password, app_id=app_id, scope=scope)
    vk_session.auth()
    #vk=vk_session.get_api()
    vkaudio = audio.VkAudio(vk_session)
    users_track_list = []
    for id in ids:
        user_track_list = set([
            track['artist'] + '-' + track['title']
            for track in vkaudio.get_iter(id)
        ])
        print("I'm not dead")
        users_track_list.append(user_track_list)
    shares = [len(i) for i in users_track_list]
    common_audios_lst = set()
    while (len(users_track_list) > 1):
        common_audios_lst = set()
        users_track_list.sort(key=lambda x: len(x))
        for i in users_track_list[0]:
            for j in users_track_list[1]:
                if jaro(i, j) > 0.75:
                    common_audios_lst.add(i)
        users_track_list.pop(0)
        print("I'm not dead")
        users_track_list[0] = common_audios_lst
    print("_______________________")
    print(common_audios_lst)
    shares = [len(common_audios_lst) / i for i in shares.copy()]
    returnable_text = ''
    for i in common_audios_lst:
        returnable_text += (i + '\n')
    returnable_text += "Процент общих песен\n"
    for i in shares:
        returnable_text += (str(round(100 * i, 2)) + r'% ')
    return returnable_text[:-1]
Example #33
ddir = '/home/ngaude/workspace/data/cdiscount/'

test = pd.read_csv(ddir + 'test.csv', sep=';').fillna('')
test['lib'] = list(map(normalize_guess, test.Libelle.values))
test = test.sort_values('lib').reset_index(drop=True)

resultat = pd.read_csv(ddir + 'test.csv', sep=';').fillna('')

a = test.lib.values
b = [0]
for i in range(0,len(a)-1):
    if len(a[i])<8 or len(a[i+1])<8:
        b.append(0)
    else:
        b.append(jaro(a[i],a[i+1]))

"""
plt.hist(b,bins=300,cumulative=True)
plt.show()
"""

cut_threshold = np.percentile(b,50)
same_categorie_than_previous_item = [i>cut_threshold for i in b]

group_categorie = [0]*len(same_categorie_than_previous_item)

for i in range(1, len(same_categorie_than_previous_item)):
    if same_categorie_than_previous_item[i]:
        group_categorie[i] = group_categorie[i-1]
    else:
        group_categorie[i] = group_categorie[i-1] + 1
Example #34
word_phonemes = dict()
matched = dict()
# Map words to their phonemes
for i in range(len(words)):
  word_phonemes[words[i]] = phonemes[i]

print ("word1,word2,semantic_similarity,phonetic_similarity,word_similarity,sem_x_phon_similarity")
for triade in semsim:
  a = triade[0]
  b = triade[1]
  # ignore duplicated (b,a)
  dup = matched.get(b + "_" + a, None)
  if dup:
    continue
  matched[a + "_" + b] = True
  # ignore pairs with the same stem
  if (stem(a) == stem(b)):
    continue
  # Get their phonemes
  ph1 = word_phonemes.get(a, None)
  ph2 = word_phonemes.get(b, None)
  if ph1 is None or ph2 is None:
    continue
  # Semantic similarity
  ss = float(triade[2]) 
  # Phonetic similarity
  ps = jaro(ph1, ph2)
  # Word similarity
  ld = jaro(a,b) 
  print ("%s,%s,%.4f,%.4f,%.4f,%.4f" % (a,b, ss, ps, ld, ss*ps))
Example #35
				if office_holder_dict not in columbus_file:
					columbus_file.append(office_holder_dict)
			#puts detroit at large ***office holder*** dicts in a separate dictList
			elif office_holder_dict['OCDID'] == 'ocd-division/country:us/state:mi/place:detroit' and office_holder_dict['Office Name'] != 'Mayor':
				if office_holder_dict not in detroit_file:
					detroit_file.append(office_holder_dict)
			#puts boston at large ***office holder*** dicts in a separate dictList
			elif office_holder_dict['OCDID'] == 'ocd-division/country:us/state:ma/place:boston' and office_holder_dict['Office Name'] != 'Mayor':
				if office_holder_dict not in boston_file:
					boston_file.append(office_holder_dict)
			#if the dicts have UIDs, and are not charlotte at large, detroit at large, boston at large, or columbus council members, then start string comparison
			else:
				if scraped_dict['UID'] == office_holder_dict['UID']:
					if scraped_dict['official.name'] == office_holder_dict['Official Name']:
						print(office_holder_dict['UID'], "scraped name: ", scraped_dict['official.name'], "file name: ", office_holder_dict['Official Name'], '\n\t>>>all good, exact match')
					elif jaro(scraped_dict['official.name'].lower().replace(' ', '').replace('"','').replace('.','').replace(',',''),office_holder_dict['Official Name'].lower().replace(' ', '').replace('"','').replace('.','').replace(',','')) > .65:
						print(office_holder_dict['UID'], scraped_dict['official.name'], office_holder_dict['Official Name'], '\n\t>>>not exact match, but high lev score')
					else:
						print "\n\t>>>found a difference!"
						print office_holder_dict['UID'],"scraped name: ", scraped_dict['official.name'], "file name: ",office_holder_dict['Official Name']
						print jaro(scraped_dict['official.name'],office_holder_dict['Official Name'])
						#answer = raw_input("\n\t>>>is this a meaningful difference? Y/N")
						#if answer == "Y" or answer == "y":
						checkList.append(office_holder_dict['UID'])

###output from initial scrape compare string comparisons
txt_file.append("\nCheck List: "+ ",".join(checkList))
txt_file.append("\nNo UID for:"+ ",".join(GPmissingList))


Example #36
    def handle(self, *args, **options):
        activate(settings.LANGUAGE_CODE)
        all_companies = []

        keys = ["pk", "code", "name", "name_en", "short_name", "short_name_en"]

        for p in Company.objects.all():
            all_companies.append(dict(zip(keys, [
                p.pk,
                p.edrpou,
                p.name_uk,
                p.name_en,
                p.short_name_uk,
                p.short_name_en,
            ])))

        grouped_by_code = defaultdict(list)
        grouped_by_name = defaultdict(list)

        # First pass: exact matches by code, full name or short name
        for l in all_companies:
            code = self.cleanup(l["code"])
            if len(code) > 2:
                grouped_by_code[code].append(l["pk"])

            for k in ["name", "name_en", "short_name", "short_name_en"]:
                name = self.cleanup(l[k])

                if len(name) > 3:
                    grouped_by_name[name].append(l["pk"])

        spoiled_ids = set()
        chunks_to_review = list()

        for k, v in grouped_by_code.items():
            if len(set(v)) > 1:
                spoiled_ids |= set(v)
                chunks_to_review.append(v)

        for k, v in grouped_by_name.items():
            if len(set(v)) > 1:
                spoiled_ids |= set(v)
                chunks_to_review.append(v)

        for chunk in chunks_to_review:
            try:
                CompanyDeduplication(
                    company1_id=chunk[0],
                    company2_id=chunk[1],
                    company1_json=Company.objects.get(pk=chunk[0]).to_dict(),
                    company2_json=Company.objects.get(pk=chunk[1]).to_dict(),
                ).save()
            except IntegrityError:
                pass

        candidates_for_fuzzy = [
            l for l in all_companies
            if l["pk"] not in spoiled_ids
        ]

        for a, b in combinations(candidates_for_fuzzy, 2):
            for field_a, field_b in product(["name", "short_name"], repeat=2):
                val_a = self.cleanup(a[field_a])
                val_b = self.cleanup(b[field_b])
                if len(val_a) < 4 or len(val_b) < 4:
                    continue

                if self.cleanup_digits(a[field_a]) == self.cleanup_digits(b[field_b]):
                    continue

                score = jaro(val_a, val_b)
                if score > 0.97:
                    try:
                        CompanyDeduplication(
                            company1_id=a["pk"],
                            company2_id=b["pk"],
                            company1_json=Company.objects.get(pk=a["pk"]).to_dict(),
                            company2_json=Company.objects.get(pk=b["pk"]).to_dict(),
                            fuzzy=True,
                        ).save()
                        break
                    except IntegrityError:
                        pass

            for field_a, field_b in product(["name_en", "short_name_en"], repeat=2):
                val_a = self.cleanup(a[field_a])
                val_b = self.cleanup(b[field_b])
                if len(val_a) < 4 or len(val_b) < 4:
                    continue

                if self.cleanup_digits(a[field_a]) == self.cleanup_digits(b[field_b]):
                    continue

                score = jaro(val_a, val_b)

                if score > 0.97:
                    try:
                        CompanyDeduplication(
                            company1_id=a["pk"],
                            company2_id=b["pk"],
                            company1_json=Company.objects.get(pk=a["pk"]).to_dict(),
                            company2_json=Company.objects.get(pk=b["pk"]).to_dict(),
                            fuzzy=True,
                        ).save()
                        break
                    except IntegrityError:
                        pass
Example #37
def has_matching_word(phrase1, phrase2):
    for word1 in phrase1.split():
        for word2 in phrase2.split():
            if jaro(word1, word2) > 0.9:
                return True
    return False
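
This is the word-level fallback used by find_searchedperson in Example #1: any single pair of sufficiently similar words is enough. For instance:

print(has_matching_word("jon m", "smith john"))  # True: jaro("jon", "john") > 0.9
print(has_matching_word("anna", "smith john"))   # False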
Example #38
    def _jaro(a, b):
        """Jaro

        The Jaro string similarity metric is intended for short strings
        like personal last names."""
        return jaro(a, b)
Example #40
    def run(self, entry):
        self.logger.create(entry['ID'])
        article = search(entry)
        crit, missing, support = self.check_tags(entry)
        if not support:
            self.logger.log(
                'No support for ' + entry['ENTRYTYPE'] +
                '. Currently, only the following entry types are supported: ' +
                ', '.join(set(self.tags.keys()).symmetric_difference({'all'})),
                2)
            self.logger.print()
            return entry
        if not crit:
            self.logger.log('missing critical tag(s): ' + ' '.join(missing), 3)
            self.logger.print()
            return entry
        if article is None:
            self.logger.log('article could not be found on PubMed', 2)
            self.logger.print()
            return entry
        if len(missing) > 0:
            self.logger.log(
                'the following tags are missing: ' + ' '.join(missing), 1)
            self.logger.unindent()

        # compare authors list
        matching, in_order = self.cmp_auth(self.parse_auth(entry['author']),
                                           article.authors)
        if not matching:
            self.logger.log('authors mismatch:', 3)
            self.logger.log('PM:  ' + ' & '.join(article.authors), 3)
            self.logger.unindent()
            self.logger.log(
                'bib: ' + ' & '.join(self.parse_auth(entry['author'])), 3)
            self.logger.unindent()
            self.logger.unindent()
            entry['author'] = self.pack_auth(article.authors)
        elif not in_order:
            self.logger.log('authors list misordered:', 3)
            self.logger.log('PM:  ' + ' & '.join(article.authors), 3)
            self.logger.unindent()
            self.logger.log(
                'bib: ' + ' & '.join(self.parse_auth(entry['author'])), 3)
            self.logger.unindent()
            self.logger.unindent()
            entry['author'] = self.pack_auth(article.authors)

        # critical comparisons
        for tag in set(self.tags[entry['ENTRYTYPE']]).symmetric_difference(
            {'author'}):
            idx = [entry[tag]]
            if tag in self.aliases.keys():
                for i in self.aliases[tag].keys():
                    if jaro(idx[0].strip(stripsym).lower(),
                            i.strip(stripsym).lower()) > self.threshold[tag]:
                        idx.append(self.aliases[tag][i])
            if not any([
                    self.threshold[tag] <= jaro(
                        article.__dict__[dictionary[tag]].strip(
                            stripsym).lower(),
                        i.strip(stripsym).lower()) for i in idx
            ]):
                self.logger.log(tag + ' mismatch:', 3)
                self.logger.log(
                    'PM:  ' +
                    article.__dict__[dictionary[tag]].strip(stripsym), 3)
                self.logger.unindent()
                self.logger.log('bib: ' + entry[tag].strip(stripsym), 3)
                self.logger.unindent()
                self.logger.unindent()
                entry[tag] = article.__dict__[dictionary[tag]]

        self.logger.print()
        return entry
Example #41
def discogs_ordered_search(query, item_type, limit=100):

    name_pattern = r' \([0-9]+\)'
    q_stripped = query.strip("'\"")

    # special case when searching directly by id
    if q_stripped.isdigit():

        url = 'http://{host}/{item_type}s/{query}'.format(
            host=DISCOGS_HOST,
            query=urllib.parse.quote_plus(query.lower()),
            item_type=item_type
        )

        log.debug('search by id: {0}'.format(url))
        r = requests.get(url)

        if not r.status_code == 200:
            return []


        data = json.loads(r.text.replace('api.discogs.com', DISCOGS_HOST))

        # TODO: not very nice - remap some fields
        if item_type == 'release':

            if 'title' in data:
                data['title'] = re.sub(name_pattern, '', data['title'])

            if 'formats' in data:
                formats = []
                for format in [f['name'] for f in data['formats'] if 'name' in f]:
                    formats.append(format)
                data['format'] = formats

            if 'labels' in data:
                try:
                    data['catno'] = data['labels'][0]['catno']
                except KeyError:
                    pass

        if item_type == 'artist':

            if 'name' in data:
                data['title'] = re.sub(name_pattern, '', data['name'])

            if 'aliases' in data:
                aliases = []
                for alias in [a['name'] for a in data['aliases'] if 'name' in a]:
                    aliases.append(re.sub(name_pattern, '', alias))
                data['aliases'] = aliases

            if 'members' in data:
                members = []
                for member in [m['name'] for m in data['members'] if 'name' in m]:
                    members.append(re.sub(name_pattern, '', member))
                data['members'] = members

            if 'images' in data:

                for image in [i['uri150'] for i in data['images'] if 'type' in i and i['type'] == 'primary']:
                    data['thumb'] = image
                    break

        return [data,]

    url = 'http://{host}/database/search?q={query}&type={item_type}&per_page=100'.format(
        host=DISCOGS_HOST,
        query=urllib.parse.quote_plus(query.encode('utf8').lower()),
        item_type=item_type
    )

    results = []
    results_unsorted = []
    results_exact = []
    results_start = []
    results_other = []

    x = 0
    while url and x < API_MAX_REQUESTS:

        log.debug(url)
        r = requests.get(url)

        if not r.status_code == 200:
            return []

        data = json.loads(r.text.replace('api.discogs.com', DISCOGS_HOST))

        url = reduce(dict.get, ['pagination', 'urls', 'next'], data)

        for r in data['results']:
            if 'title' in r:
                title = r['title']
                formatted_title = re.sub(name_pattern, '', title)
                r['index'] = get_index(title)
                r['formatted_title'] = formatted_title
                r['uri'] = 'https://www.discogs.com%s' % r['uri']

                r['dist'] = distance(formatted_title.lower(), q_stripped.lower())
                r['dist1'] = jaro(formatted_title.lower(), q_stripped.lower())
                r['dist2'] = jaro_winkler(formatted_title.lower(), q_stripped.lower())
                r['dist3'] = ratio(formatted_title.lower(), q_stripped.lower())

                # print r['dist'],
                # print r['dist1'],
                # print r['dist2'],
                # print r['dist3'],
                # print formatted_title.lower(),
                # print '::: {0} <> {1}'.format(formatted_title.lower(), q_stripped.lower())

                results_unsorted.append(r)

                if formatted_title.lower() == q_stripped.lower():
                    #print 'exact', formatted_title.lower()
                    results_exact.append(r)
                elif formatted_title.lower().startswith(q_stripped.lower()[0:10]):
                    #print 'start', formatted_title.lower()
                    results_start.append(r)
                else:
                    #print 'other', formatted_title.lower()
                    results_other.append(r)

        x += 1

    #results = sort_results(results_exact) + sort_results(results_start)+ sort_results(results_other)
    results = sort_results_by_distance(results_unsorted)

    if item_type == 'artist':
        results = populate_results(results)

    return results[0:limit]