Example #1
def suggest_identifier(id, names):
    sorted_names = sorted(names,
                          key=lambda other: jaro_winkler(id, other),
                          reverse=True)
    if len(sorted_names) > 0:
        if jaro_winkler(id, sorted_names[0]) > 0.0 and similarity(
                id, sorted_names[0]) > 0.5:
            return sorted_names[0]
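
These examples use jaro_winkler from the Levenshtein package: it returns a similarity score in [0, 1] (1.0 for identical strings), and an optional third argument sets the Winkler prefix weight (0.1 by default; 0.0 reduces it to the plain Jaro similarity). A minimal check:

from Levenshtein import jaro_winkler

print(jaro_winkler("dixon", "dicksonx"))       # ~0.813 with the default 0.1 prefix weight
print(jaro_winkler("dixon", "dicksonx", 0.0))  # ~0.767, the plain Jaro similarity
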
Example #2
    def _most_similar_ident(self, id):
        names = set()
        names.update(self.host_environment.keys())
        for typing_env in reversed(self.env_stack):
            names.update(typing_env.keys())

        sorted_names = sorted(names, key=lambda other: jaro_winkler(id, other), reverse=True)
        if len(sorted_names) > 0:
            if jaro_winkler(id, sorted_names[0]) > 0.0:
                return sorted_names[0]
Example #3
def entity_similarity(left, right):
    left_name = left.get('name')
    right_name = right.get('name')
    score = 0
    if left_name is not None and right_name is not None:
        name_sim = jaro_winkler(chomp(left_name), chomp(right_name))
        score += (name_sim * 0.6)

    left_fp = fingerprints.generate(left_name)
    right_fp = fingerprints.generate(right_name)
    if left_fp is not None and right_fp is not None:
        fp_sim = jaro_winkler(left_fp, right_fp)
        score += (fp_sim * 0.4)

    return min(1.0, score)
Example #4
    def start(self):

        threads = self.client.fetchThreadList()

        # collect the message history of threads with flagged contacts
        full_msgs = []
        for thread in threads:
            if thread.name in self.DANGEROUS_PEOPLE:
                full_msgs.append(self.client.fetchThreadMessages(thread.uid))

        # flag a message as soon as one of its words is close to a dangerous word
        msgs_to_delete = []
        for messages in full_msgs:
            for message in messages:
                if not message.text:
                    continue
                flagged = False
                for word in message.text.split(' '):
                    for dangerous in self.DANGEROUS_WORDS:
                        if jaro_winkler(word, dangerous) >= 0.75:
                            msgs_to_delete.append(message)
                            flagged = True
                            break
                    if flagged:
                        break

        for message in msgs_to_delete:
            self.client.deleteMessages(message.uid)
Example #5
    def detect(self, company_name, company_website=None):
        request = 'site:linkedin.com/company "%s"' % company_name
        result = self._fetch(request, company_name)

        if result is None and company_website is not None:
            company_domain = urlparse(company_website).netloc
            if company_domain != "":
                request = 'site:linkedin.com/company "%s"' % company_domain
                result = self._fetch(request, company_name)

        if result is None:
            return result

        if not LINKEDIN_URL.match(result.url):
            #sys.stderr.write("Not a linkedin url: " + result.url + "\n")
            return None

        company_identifier = LINKEDIN_URL.search(
            result.url).groupdict()["company"]

        #If the identifier is the universal name and not the id, we test for similarity
        try:
            int(company_identifier)
        except ValueError:
            score = jaro_winkler(normalize(company_name),
                                 normalize(company_identifier))
            if score < 0.7:
                #sys.stderr.write("%s too distant from %s (%.2f)\n" % (normalize(company_name),
                #                                                      normalize(company_identifier),
                #                                                      score))
                return None

        return result
Example #6
def jarowinkler_distance(words: Iterator[str], vocabulary: Dict[str, int]):
    """Corrects the words based on Jaro-Winkler distances

    Args:
        words (Iterator[str]): iterator over the misspelled words
        vocabulary (Dict[str, int]): dictionary mapping words to their frequency
    """

    vocab_list = list(vocabulary)
    for word in words:
        distances = [jaro_winkler(word, vocab) for vocab in vocab_list]
        # indices of the five most similar vocabulary words
        idx = np.array(distances).argsort()[::-1][:5]

        # break ties between equal scores in favor of the more frequent word
        for i in range(5):
            for j in range(i + 1, 5):
                if distances[idx[i]] == distances[idx[j]]:
                    if vocabulary.get(vocab_list[idx[i]]) < vocabulary.get(vocab_list[idx[j]]):
                        idx[i], idx[j] = idx[j], idx[i]

        suggestions = [vocab_list[i] for i in idx]

        output("{misspelled}\t{corrections}".format(
            misspelled=word,
            corrections="\t".join(suggestions)
        ))  # may cause an I/O bottleneck
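
A minimal sketch of how this function might be driven, assuming output and np are module-level names bound as below (in the original repository they are defined elsewhere):

import numpy as np
from Levenshtein import jaro_winkler

output = print  # assumed stand-in for the repository's output helper

vocabulary = {"apple": 120, "apply": 40, "ample": 15, "maple": 30, "angle": 25}
jarowinkler_distance(iter(["aple", "aplpy"]), vocabulary)
# prints each misspelled word with its five suggestions, most similar first,
# ties broken in favor of the more frequent vocabulary word
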
Example #7
def gen_text_similarity_feature(sa,
                                sb,
                                prefix='',
                                ngrams_word_jaccard=[],
                                use_char_ngram_jaccard=False,
                                ngrams_char_jaccard=[3, 4, 5]):
    if not isinstance(sa, str) or not isinstance(sb, str):
        return {}
    feats = {}

    wa0 = tokenize0(sa)
    wb0 = tokenize0(sb)
    wa1 = tokenize1(sa)
    wb1 = tokenize1(sb)

    feats[prefix + 'word0_jaccard'] = jaccard(wa0, wb0)
    feats[prefix + 'word1_jaccard'] = jaccard(wa1, wb1)

    for n in ngrams_word_jaccard:
        feats[prefix + 'word0_jaccard_{}gram'.format(n)] = word_jaccard_ngram(
            wa0, wb0, n)
        feats[prefix + 'word1_jaccard_{}gram'.format(n)] = word_jaccard_ngram(
            wa1, wb1, n)

    if use_char_ngram_jaccard:
        for n in ngrams_char_jaccard:
            feats[prefix +
                  'char_jaccard_{}gram'.format(n)] = char_jaccard_ngram(
                      sa, sb, n)

    feats[prefix + 'jw'] = jaro_winkler(sa, sb)
    feats[prefix +
          'edit_distance_ratio'] = edit_distance(sa, sb) / (len(sa) + len(sb))

    return feats
Example #8
def calc_ratio_name(token: Union[QueryToken, str], full_word: str,
                    prefix_weight: float, index: MonsterIndex) -> float:
    """Calculate the name distance between two tokens"""
    string = token.value if isinstance(token, QueryToken) else token

    mw = index.mwt_to_len[full_word] != 1
    jw = jaro_winkler(string, full_word, prefix_weight)

    if string != full_word:
        if isinstance(token, QueryToken) and token.exact:
            return 0.0
        if string.isdigit() and full_word.isdigit():
            return 0.0

    if full_word == string:
        score = 1.0
    elif len(string) >= 3 and full_word.startswith(string):
        score = .995
        if mw and jw < score:
            return score
    else:
        score = jw

    if mw:
        score = score**10 * index.mwt_to_len[full_word]

    return score
Example #9
    def find_assignee(self, bz_patchers, hg_patchers, bz_commenters, bz_info):
        """Find a potential assignee.
        If an email is common between patchers (people who made patches on bugzilla)
        and hg patchers then return this email.
        If "Foo Bar [:foobar]" made a patch and his hg name is "Bar Foo" return the
        corresponding Bugzilla email.
        """

        if not bz_patchers:
            # we've no patch in the bug
            # so try to find an assignee in the commenters
            bz_patchers = set(bz_commenters.keys())

        potential = set()
        hg_patchers_mail = set(mail for _, mail in hg_patchers)
        common = bz_patchers & hg_patchers_mail
        if len(common) == 1:
            # there is a common email between Bz patchers & Hg email
            return list(common)[0]

        # here we try to find at least 2 common elements
        # in the creator real name and in the hg author name
        hg_patchers_name = [self.clean_name(name) for name, _ in hg_patchers]
        for bz_patcher in bz_patchers:
            if bz_patcher not in bz_info:
                continue
            real_name = self.clean_name(bz_info[bz_patcher])
            for name in hg_patchers_name:
                if len(name & real_name) >= 2:
                    potential.add(bz_patcher)

        # try to find similarities between email and name
        for name in hg_patchers_name:
            possible_mail_parts = self.mk_possible_mails(name)
            for bz_patcher in bz_patchers:
                _bz_patcher = self.clean_mail(bz_patcher)
                for part in possible_mail_parts:
                    if len(part) >= 5 and part in _bz_patcher:
                        potential.add(bz_patcher)

        # try to find similarities between emails using the Jaro-Winkler metric
        for b in bz_patchers:
            _b = self.clean_mail(b)
            for h in hg_patchers_mail:
                _h = self.clean_mail(h)
                d = 1 - jaro_winkler(_b, _h)
                if d <= 0.2:
                    potential.add(b)

        if potential:
            potential = list(potential)
            if len(potential) == 1:
                return potential[0]
            return max(
                ((p, bz_commenters.get(p, 0)) for p in potential), key=lambda x: x[1]
            )[0]

        return None
Example #11
def leven_music(agg_musics):
    """
    Absorb spelling variations of song titles with jaro_winkler.
    The variations of 観覧車 could not be absorbed this way, so music_lists
    contains both of the following as separate entries:
    観覧車 ~あの日と、昨日と今日と明日と~
    観覧車
    """
    """
    music_lists = [
        'Ever Spiral','ラムネ','Blue Planet','Square of the moon','散歩日和',
        '雨のちキミと晴れ模様','アマオト','七色の空','星に想いを夜に願いを','ひとひら',
        'ありがとう','向日葵','Aozora','happiness','あした天気になあれ','アイの庭','夢遥か',
        '夜空','My song','12 Stories','Melody','蕾','青×春☆','甘い罠','Platonic syndrome',
        'Love letter','フタリ','No.51','カラフル','二人色','Revolution!','ISI','光の溢れるときには',
        '夢の通り道','コイノハナ','恋をしよーよ','手紙','アルビナ','Dear','たからもの','Snow wish',
        'With you','こころの種','Cafe','アリガト','Love Clover','クローバー','カラフルDiary',
        'Temptation (Duca Ver)','久遠の夢','たいせつなきみのために、ぼくにできるいちばんのこと',
        '僕らの日々','Only you','ADABANA -仇華-','終わりのはじまり','ニブルヘイム','祈りの虹',
        'いろんなカタチ','ツナグミライ','キミガスキ','キミの大きな手',"絶対Darli'n",'大好きだよ。',
        'シアワセ定義','愛しいキズナ','Brand-New World','Lie','風の唄','恋せよ!乙女',
        'Save the Tale',"Welcome☆Berry's",'また好きになる','赤い薔薇、銀色の月','COLD BUTTERFLY',
        'スターライン','ひとひら ゆらゆらり','幸せのオトシモノ','キミとなら','ボク恋','桜色の想い','Story',
        '君がいてくれたから','太陽とキミと',"Eden's healing",'タイムカプセル','ことば旅行','marry me?',
        'Wishing you','inertia world','My First Love','シアワセのハジマリ','Make a Wish',
        'しあわせの場所','ロケット☆ライド','シアワセsummer','Dreamer','叶えたい未来','Aria','キミとメロディ',
        'Moon Beams','Rainbow Color','恋の記憶','snow crystal','恋をしようよ Let it snow',
        'Jewel Days','メグルmerry-go-round','未来トラベリング','My Darling','想いのハーモニー','恋のAria',
        '0 ~zero~',"I'm in the side",'Nothing','恋想葬','約束','想い出のパズル','Passion',
        'アイオライト','恋するまでの時間','かさねた気持ち','eternal','雪の街 キミと','Confession Eve','雫',
        '記憶×ハジマリ','Chaser×Chaser','1/5 (ゴブンノイチ)','シアワセの理由','笑顔のレシピ','イロドリ',
        '灼熱 Heart Beat','be confidence','恋するletter','かさなるココロ','コイイロセカイ',
        'beloved story','優しい雨','キミと...','Hello,Future!','ナツコイ','あいのうた','Say to you',
        'Fate Line','キミのオト','eyes to eyes','キミへ贈るメロディー','End of the Line','Still',
        '君がいない明日','achromia','ネコイチ','Growing','内緒のホント','Blooming',
        '観覧車 ~あの日と、昨日と今日と明日と~','観覧車','candy♥girl','Dribing story',
        'ラムネ (12 Stories ver)','ラムネ -strings arrange-','星に想いを夜に願いを Ending arrange ver',
        '夢遥か Piano Arrange Ver','カラフル (ロックバージョン)','二人色 (Jump out mix)',
        'フルスロットルHeart (cobalt green Remix)','光の溢れるときには (arrange ver)',
        'いろんなカタチ Piano Arrange Ver','ロケット☆ライド (AUG Remix)',
        'カラフルDiary Piano Arrange Ver','snow crystal ~Acoustic Arrange~',
        'また好きになる Piano Arrange Ver'
    ]
    """

    music_lists = ['ロケット☆ライド', '恋をしよーよ', "Welcome☆Berry's", '観覧車']
    result = {}
    for music_name, count in agg_musics.items():
        for music_list_name in music_lists:
            if music_list_name not in result:
                result[music_list_name] = 0
            score = jaro_winkler(music_list_name, music_name)
            if score > 0.84:
                result[music_list_name] += int(count)

    return result
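
A small usage sketch with assumed input (agg_musics maps raw scrobbled titles to play counts):

agg_musics = {
    '観覧車': 3,
    '観覧車 ~あの日と、昨日と今日と明日と~': 2,
    'ロケット☆ライド (AUG Remix)': 5,
}
print(leven_music(agg_musics))
# remix/arrange variants scoring above 0.84 fold into the canonical title;
# the two 観覧車 titles only match themselves, as the docstring notes
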
Example #12
def match_names(name1_df, name2_df, reduce=True):
    name_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
    name_vectorizer.fit(list(name1_df['name']))

    name1_X = name_vectorizer.transform(name1_df['name'])
    name2_X = name_vectorizer.transform(name2_df['name'])

    similarity_matrix = cosine_similarity(name1_X, name2_X)

    matches = []
    for i, j in zip(*similarity_matrix.nonzero()):
        # Similarity between names
        name_sim_cs = similarity_matrix[i, j]

        if name_sim_cs < 0.5: continue

        match1 = name1_df.iloc[i]
        match2 = name2_df.iloc[j]

        name1 = match1['name']
        name2 = match2['name']

        name_sim_jw = jaro_winkler(name1, name2, 0.1)

        similarity = (name_sim_cs + name_sim_jw) / 2

        # Similarity between titles

        title1 = match1['title']
        title2 = match2['title']

        titles = title1.split()
        title_sim = jaccard(titles, title2.split())
        if title1 != '':
            title_weight = min(len(titles), 2) * 0.1

            similarity = (title_weight * title_sim +
                          (1 - title_weight) * similarity)

        matches.append([
            match1['id'], match1['seq'], match1['full_name'], name1, title1,
            match2['id'], match2['seq'], match2['full_name'], name2, title2,
            name_sim_cs, name_sim_jw, title_sim, similarity
        ])

    headers = [
        'id', 'seq', 'full_name', 'name', 'title', 'match_id', 'match_seq',
        'match_full_name', 'match_name', 'match_title', 'name_similarity_cs',
        'name_similarity_jw', 'title_similarity', 'similarity'
    ]

    match_df = pd.DataFrame(data=matches, columns=headers)

    if reduce:
        return match_df.groupby(['id']).apply(choose_match)
    else:
        return match_df
Example #13
    def _is_close(self, text, thresh=0.94):
        words = self.__preprocess(text)
        self.df["Scores"] = self.df["Profane"].apply(
            lambda x: max(jaro_winkler(w, x) for w in words))
        bword, score = self.df.sort_values("Scores",
                                           ascending=False).iloc[0].tolist()
        return score > thresh
Example #14
def lev_menu_for(word):
    with_ratio = [(jaro_winkler(word, key), key) for key in keys_without_accents.keys()]
    with_ratio.sort(reverse=True)

    options = [(ratio, key) for ratio, key in with_ratio[0:4] if ratio > 0.9]
    if options:
        print("Quizás quiso decir:")  # "Did you mean:"
        for ratio, key in options:
            print()
            search(key)
Example #15
def calc_ratio_modifier(s1: Union[QueryToken, str],
                        s2: str,
                        prefix_weight: float = .05) -> float:
    """Calculate the modifier distance between two tokens"""
    if isinstance(s1, QueryToken):
        if s1.exact:
            return 1.0 if s1.value == s2 else 0.0
        s1 = s1.value

    return jaro_winkler(s1, s2, prefix_weight)
Example #16
def text_score(match, candidates):
    if isinstance(candidates, str):
        candidates = [candidates]
    match_n = normalize(match)
    best_score = 0
    for candidate in candidates:
        cand_n = normalize(candidate)
        score = jaro_winkler(match_n, cand_n, 0.02) * 100
        best_score = max(int(score), best_score)
    return best_score
Example #17
def participant_autocomplete(request):
    ctx = request.context
    keyword = request.GET.get('q')
    if not keyword:
        raise HTTPBadRequest("please specify search terms (q)")
    limit = request.GET.get('limit', 20)
    try:
        limit = int(limit)
    except (TypeError, ValueError):
        raise HTTPBadRequest("limit must be an integer")
    if limit > 100:
        raise HTTPBadRequest("be reasonable")
    query = AgentProfile.default_db.query(
            AgentProfile.id, AgentProfile.name, User.username
        ).outerjoin(User).filter((User.verified == True) | (User.id == None))
    discussion = ctx.get_instance_of_class(Discussion)
    if discussion:
        query = query.filter(AgentProfile.id.in_(
            discussion.get_participants_query(True, True).subquery()))

    if len(keyword) < 6:
        query = query.add_column(literal(0))
        matchstr = '%'.join(keyword)
        matchstr = '%'.join(('', matchstr, ''))
        agents = query.filter(AgentProfile.name.ilike(matchstr) |
                             User.username.ilike(matchstr)
            ).limit(limit * 5).all()
        agents.sort(key=lambda u: max(
            jaro_winkler(u[1], keyword),
            jaro_winkler(u[2], keyword) if u[2] else 0
            ), reverse=True)
        num = min(len(agents), limit)
        agents = agents[:num]
    else:
        matchstr = keyword
        query, rank = add_simple_text_search(
            query, [AgentProfile.name], keyword.split())
        agents = query.order_by(rank.desc()).limit(limit).all()
    return {'results': [{
        'id': AgentProfile.uri_generic(id),
        'text': name} for (id, name, username, rank) in agents]}
Example #18
def find_similar_words(word_to_match: str):
    """Edit distance function."""
    topologic_response = requests.get(os.path.join(TOPOLOGIC["api"],
                                                   "get_all_field_values",
                                                   TOPOLOGIC["dbname"]),
                                      params={"field": "word"})
    words = topologic_response.json()["field_values"]
    similar_words: List[Tuple[str, float]] = []
    for word in words:
        similarity = jaro_winkler(word_to_match, word, 0.15)
        if similarity >= 0.85:
            similar_words.append((word, similarity))
    similar_words.sort(key=lambda x: x[1], reverse=True)
    return [word for word, _ in similar_words]
Example #19
    def _find_module_match(self, pattern, exact=False):

        logging.debug('matching on {}'.format(pattern))

        matches = []

        if isinstance(pattern, bytes):
            pattern = pattern.decode('ascii', 'ignore')

        logging.debug('_find_module_match: {}'.format(pattern))

        noext = pattern.replace('.py', '').replace('.ps1', '')

        # exact is looking for a very precise name such as "vmware_guest"
        if exact:
            candidates = [pattern]
        else:
            candidates = [pattern, '_' + pattern, noext, '_' + noext]

        for k, v in self.MODULES.items():
            if v['name'] in candidates:
                logging.debug('match {} on name: {}'.format(k, v['name']))
                matches = [v]
                break

        if not matches:
            # search by key ... aka the filepath
            for k, v in self.MODULES.items():
                if k == pattern:
                    logging.debug('match {} on key: {}'.format(k, k))
                    matches = [v]
                    break

        # spellcheck
        if not exact and not matches and '/' not in pattern:
            candidates = []
            for k, v in self.MODULES.items():
                jw = jaro_winkler(v['name'], pattern)
                if jw > .9:
                    candidates.append((jw, k))
            for candidate in candidates:
                matches.append(self.MODULES[candidate[1]])

        return matches
Example #20
    def checkForMatches(newAgents, agents):
        merged = defaultdict(dict)
        for agent in agents:
            merger = list(filter(lambda x: jaro_winkler(x['name'].lower(), agent['name'].lower()) > 0.8, newAgents))
            if(len(merger) > 0):
                mergedAgent = merger[0]
                merged[mergedAgent['name']] = Agent.mergeFromDict(agent, mergedAgent)
            else:
                merged[agent['name']] = agent

        for newAgent in newAgents:
            if newAgent['name'] not in merged:
                merged[newAgent['name']] = newAgent

        return merged.values()
Example #21
def calc_ratio_name(token, full_word, index2, factor=.05):
    mw = index2.mwt_to_len[full_word] != 1
    jw = jaro_winkler(token, full_word, factor)

    if full_word == token:
        score = 1
    elif len(token) >= 3 and full_word.startswith(token):
        score = .995
        if mw and jw < score:
            return score
    else:
        score = jw

    if mw:
        score = score ** 10 * index2.mwt_to_len[full_word]

    return score
Example #22
    def resolve_country(country_text):
        """
        We are going to look for the best match between the country_text
        informed and the ISO 3166-1 alpha-3 code
        :param country_text: the text about the country
        :return: the ISO 3166-1 alpha-3 code
        """

        country_text = str(country_text.lower())

        # Check if the country_text is an ISO 3166-1 alpha-3 code
        if country_text in COUNTRIES_3LETTER_CODE:
            return country_text.upper()

        # Check if the country_text is an ISO 3166-1 alpha-2 code
        if country_text in COUNTRIES_2LETTER_CODE:
            return COUNTRIES_3LETTER_CODE[COUNTRIES_2LETTER_CODE.index(
                country_text)].upper()

        # Check if the country_text is a recognized name
        if country_text in COUNTRIES_NAME:
            return COUNTRIES_3LETTER_CODE[COUNTRIES_NAME.index(
                country_text)].upper()

        # Look for the closest name to the one informed using
        # the Jaro-Winkler similarity
        similarity_ratios = []
        for index, valid_country_name in enumerate(COUNTRIES_NAME):
            similarity_ratios.append((COUNTRIES_3LETTER_CODE[index].upper(),
                                      jaro_winkler(valid_country_name,
                                                   country_text)))

        similarity_ratios.sort(key=lambda x: x[1], reverse=True)

        return similarity_ratios[0][0]
Example #23
    def is_new(self, user_req=False, test=False):
        with open(self.log) as log, open(self.other_log) as other_log, open(self.decisions, 'a') as decisions:
            decisions.write('Compare case: '+self.url+'\n'+self.img_url+'\n')
            for url in other_log.readlines()+log.readlines()[::-1]:

                comp_article = cache.get(url.strip())
                if not comp_article:
                    print('set cache')
                    comp_article = NewArticle(url=url.strip(), compare=True)
                    array = comp_article.img_array
                    title = comp_article.url_title
                    lang = comp_article.language
                    val = CacheArticle(array, title, lang)
                    cache.set(url, val, 86400)

                img_index = mse(self.img_array, comp_article.img_array)
                text_index = jaro_winkler(
                    self.url_title,
                    comp_article.url_title,
                    0.25
                    )
                line = url+ comp_article.img_url+'\n'+ \
                'mse='+str(img_index)+', '+'text'+str(text_index)+'\n'+'\n'
                decisions.write(line)

                if img_index < 25:  # depends on the order
                    if comp_article.language != self.language:
                        decisions.write('found version'+'\n'+'\n')
                        self.similar_url = comp_article.url
                        return False

                    if text_index == 1:
                        self.update_url = comp_article.url
                        decisions.write('update'+'\n'+'\n')
                        return False

            decisions.write('write to bd'+'\n'+'\n')
        if user_req:
            with open(self.other_log, 'a') as other_log:
                other_log.write(self.url+'\n')

        print('new')
        return True
Example #24
    def merge_positions(self, position1, position2):
        lposition1 = strip_punctuation(position1.lower())
        lposition2 = strip_punctuation(position2.lower())

        if lposition1 == lposition2:
            return position1

        if jaro_winkler(lposition1, lposition2) >= 0.7:
            return position1

        is_fr1 = self._is_french(lposition1)
        is_fr2 = self._is_french(lposition2)

        if is_fr1 and not is_fr2:
            return position1
        elif is_fr2 and not is_fr1:
            return position2

        return position1 if len(lposition1) > len(lposition2) else position2
Example #25
    def annotate(self, training_set):

        #Levenshtein distance - minimum number of single character edits
        distance_udf = udf(lambda x, y: distance(x, y), IntegerType())
        #Levenshtein ratio - similarity of two strings
        ratio_udf = udf(lambda x, y: ratio(x, y), DoubleType())
        #Jaro - similarity score
        jaro_udf = udf(lambda x, y: jaro(x, y), DoubleType())
        #Jaro-Winkler - similarity score that favors strings sharing a common prefix
        jaro_winkler_udf = udf(lambda x, y: jaro_winkler(x, y), DoubleType())
        #fuzz partial ratio - gives a score based on how well parts of a string match another
        fuzz_partial_ratio_udf = udf(
            lambda x, y: fuzz.partial_ratio(x, y) / 100, DoubleType())

        training_set = training_set.withColumn("distance", distance_udf("concept_name_1", "concept_name_2")) \
            .withColumn("ratio", ratio_udf("concept_name_1", "concept_name_2")) \
            .withColumn("jaro", jaro_udf("concept_name_1", "concept_name_2")) \
            .withColumn("jaro_wrinkler", jaro_winkler_udf("concept_name_1", "concept_name_2")) \
            .withColumn("fuzz_partial_ratio", fuzz_partial_ratio_udf("concept_name_1", "concept_name_2"))

        return training_set
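
For intuition about how these metrics differ, the Levenshtein-package functions used in the UDFs above can be compared directly on one pair of concept names (a standalone sketch, outside Spark):

from Levenshtein import distance, ratio, jaro, jaro_winkler

a, b = 'acetylsalicylic acid', 'acetilsalicilic acid'
print(distance(a, b))      # 2 single-character edits
print(ratio(a, b))         # normalized similarity: 0.9
print(jaro(a, b))          # Jaro similarity
print(jaro_winkler(a, b))  # slightly higher: the shared 'acet' prefix is rewarded
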
Example #26
    def agentParser(self, agents, fields):
        outAgents = {}

        for agent in agents:
            if agent is None: continue

            agentParts = agent.split('|')

            while len(agentParts) < len(fields):
                agentParts.insert(1, '')

            rec = dict(zip(fields, agentParts))
            recKey = re.sub(r'[.,:\(\)]+', '', rec['name'].lower())

            if rec['name'] == '' and rec['viaf'] == '' and rec['lcnaf'] == '':
                continue

            existingMatch = False
            for oaKey, oa in outAgents.items():
                for checkField in ['viaf', 'lcnaf']:
                    if rec[checkField] and rec[checkField] != '' and rec[
                            checkField] == oa[checkField]:
                        existingMatch = True
                        SFRRecordManager.mergeAgents(oa, rec)
                        break

                if existingMatch is False:
                    if jaro_winkler(oaKey, recKey) > 0.9:
                        SFRRecordManager.mergeAgents(oa, rec)
                        existingMatch = True

                if existingMatch:
                    break

            if existingMatch is False:
                if 'role' in rec.keys():
                    rec['roles'] = list(set([rec['role']]))
                    del rec['role']
                outAgents[recKey] = rec

        return [a for _, a in outAgents.items()]
Example #27
    def search_schedule_matches(self, sch_tweet_id, tweet_text_tokens,
                                entities_matched, entities_token_matched,
                                matched_sch_ids):
        """
        Search for matches between schedule tweets' annotated entities and a
        ugc tweet. It returns the matches found.
        """
        # Iterate over the entities of the schedule
        for sch_entity in self.DictSched[sch_tweet_id]['entities']:
            sch_entity_strip = sch_entity[0].lower().split()
            sch_entity_type = sch_entity[1]
            # Sanity check over schedule entities annotated
            if not sch_entity_strip:
                continue

            # Get token matched between ugc and schedule tweet
            token_matches = [
                (t, sch_entity_type) for t in sch_entity_strip if [
                    s for s in tweet_text_tokens
                    if jaro_winkler(t.lower(), s.lower()) >= 0.95
                ] and t not in self.stopwords and t not in string.punctuation
            ]

            # Compute score for string similarity
            score = len(token_matches) / float(len(sch_entity_strip))

            # Check if string similarity conditions are valid
            if (sch_entity_type.endswith('Contributor') and
                    score >= self.contr_tsl) or \
               (sch_entity_type.endswith('Work') and
                    score >= self.work_tsl):
                # Discard matches against already matched entities
                if sch_entity_strip not in entities_matched:
                    entities_matched.append(sch_entity_strip)
                    if token_matches not in entities_token_matched:
                        entities_token_matched += token_matches
                    matched_sch_ids.append(sch_tweet_id)

        return entities_matched, entities_token_matched, matched_sch_ids
Example #28
    def name_match(self, name_a, name_b):
        '''
        For example, we cannot totally assert that Professor 'J. Tang' and Professor 'Jie Tang' are the same person.
        We use this function to estimate how likely it is that two professors' names belong to the same person.
        '''
        name_a = name_a.lower().strip().replace('.', '').replace('-', '').replace(u'\xa0', '')
        name_b = name_b.lower().strip().replace('.', '').replace('-', '').replace(u'\xa0', '')
        if name_a == name_b:
            return 1
        elif name_a[0] != name_b[0]:
            return 0
        lastname_a = name_a.split(' ')[-1]
        lastname_b = name_b.split(' ')[-1]
        if lastname_a != lastname_b:
            return 0
        firstname_a = name_a.split(' ')[0]
        firstname_b = name_b.split(' ')[0]
        if len(firstname_a) != 1 and len(firstname_b) != 1:
            return 0
        return jaro_winkler(name_a, name_b)
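
A quick check against the docstring's own example (hypothetical calls, assuming m is an instance of this class):

print(m.name_match('J. Tang', 'Jie Tang'))   # same initial and last name, so the jaro_winkler score is returned
print(m.name_match('Jim Tang', 'Jie Tang'))  # 0: neither first name is a bare initial
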
Example #29
    def select_best_company(query_params, candidates):
        """
        We are going to calculate the avg distance between the expected value
         for each parameter (name, domain) and the values of the candidates
         for them

        :param query_params: the values to check
        :param candidates: all the companies that matched with the
                previous params
        :return: the selected candidate that better match
        """

        logging.debug("Looking for the best match...")

        # All the distances for each param for each candidate
        ratios = []

        for index, candidate in enumerate(candidates):
            logging.debug("Candidate #%d: %s" % (index, candidate))
            candidate_ratios = []
            for param_name, param_value in query_params.items():
                if param_name in candidate and candidate[param_name]:
                    candidate_ratios.append(
                        jaro_winkler(
                            str(param_value),
                            str(candidate[param_name])))

            # Calculate the distance ratio as the AVG between all the computed
            # ratios for the candidate
            ratios.append(
                (float(sum(candidate_ratios)) / len(candidate_ratios),
                 candidate))

        # The best match first
        ratios.sort(key=lambda x: x[0], reverse=True)

        # Return the selected candidate, being the best match
        return ratios[0][1]
Example #30
    def randomize_choices(self, filename):
        qs = []
        for idq, question in enumerate(self.questions):
            if question['filename'] == filename:
                qs.append([idq, question])

        #all_choices = [x[1]['answer'] for x in qs]
        all_choices = []
        for x in qs:
            answer = x[1]['answer']
            if isinstance(answer, list):
                for y in answer:
                    all_choices.append(y)
            else:
                all_choices.append(answer)

        for question in qs:
            answers = question[1]['answer']
            if isinstance(answers, list):
                continue
            answer = question[1]['answer']
            choices = [
                answer,
                random.choice(all_choices),
                random.choice(all_choices)
            ]

            matches = []
            for ac in all_choices:
                if ac.lower() == answer.lower():
                    continue
                jw = jaro_winkler(answer, ac)
                matches.append([jw, str(ac).lower()])
            matches = sorted(matches, key=lambda x: x[0])
            choices.append(matches[-1][1])
            choices.append(matches[-2][1])
            self.questions[question[0]]['choices'] = choices[:]
Example #31
def create_jw_blocks(list_of_lawyers):
    """
    Receives list of blocks, where a block is a list of lawyers
    that all begin with the same letter. Within each block, does
    a pairwise jaro winkler comparison to block lawyers together
    """
    global blocks
    consumed = defaultdict(int)
    print('Doing pairwise Jaro-Winkler...', len(list_of_lawyers))
    for i, primary in enumerate(list_of_lawyers):
        if consumed[primary]: continue
        consumed[primary] = 1
        blocks[primary].append(primary)
        for secondary in list_of_lawyers[i:]:
            if consumed[secondary]: continue
            if primary == secondary:
                blocks[primary].append(secondary)
                continue
            if jaro_winkler(primary, secondary, 0.0) >= THRESHOLD:
                consumed[secondary] = 1
                blocks[primary].append(secondary)
    pickle.dump(blocks, open('lawyer.pickle', 'wb'))
    print('lawyer blocks created!')
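
Note that 0.0 is passed as the third (prefix weight) argument, which disables the Winkler prefix bonus, so THRESHOLD is effectively applied to the plain Jaro similarity:

from Levenshtein import jaro, jaro_winkler

# with a prefix weight of 0.0 the Winkler boost term vanishes
assert jaro_winkler('hartman', 'hartmann', 0.0) == jaro('hartman', 'hartmann')
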
Example #33
    def is_new(self, user_req=False, test=False):
        '''Compare the title with the titles of already existing articles'''

        with open(self.log) as log, open(self.other_log) as other_log:
            for url in other_log.readlines()+log.readlines()[::-1]:
                comp_url_title = url.split('/')[-1]

                text_index = jaro_winkler(
                    self.url_title,
                    comp_url_title)

                if text_index == 1:
                    if user_req:
                        return url
                    print(url)
                    self.update_url = url.strip()
                    return False

        if user_req:
            with open(self.other_log, 'a') as other_log:
                other_log.write(self.url+'\n')
            return False

        return True
Example #34
    def update(self, idx, idy, get_word):
        '''
            idx, idy : x and y indices from the enumerated input
            get_word : a function to fetch the saved structure
        '''
        stime = datetime.now()
        try:
            idx = int(idx)
            idy = int(idy)

            if idx == idy:
                return

            ox = self.store.get(str(idx))
            oy = self.store.get(str(idy))

            if ox is None or oy is None:
                return
            w1 = get_word(ox)
            w2 = get_word(oy)
            simi = jaro_winkler(w1, w2)

            logger.debug("({},{}) vs ({},{}) : {}".format(
                idx, w1, idy, w2, simi))

            self.store.set_entry(idx, idy, simi)

        except Exception as e:
            logger.debug("(idx : {} type : {})".format(idx, type(idx)))
            raise e
        ntime = datetime.now()

        logger.debug("processing time : {}".format(ntime - stime))
Example #35
    async def roleinfo(self,
                       ctx: CustomContext,
                       *,
                       rolename: typing.Optional[str] = None):
        """
        Shows information about the given role.
        If used without any arguments, shows you a list of all roles supported in the bot.
        """
        if rolename is None:

            def accumulator(facroles, role):
                facroles[role().faction.category_name].append(role.name)
                return facroles

            fac_roles = reduce(accumulator, all_roles.values(),
                               defaultdict(list))
            embed = discord.Embed()
            embed.color = 0x000000
            embed.set_author(name='All supported roles',
                             icon_url=self.bot.user.avatar_url)
            embed.set_footer(
                text='For information on a specific role, use roleinfo command.'
            )
            embed.description = ''
            for faction, roles in fac_roles.items():
                roles.sort()
                for role in roles:
                    emote_name = role if faction == 'Neutral' else faction
                    emote = emotes.get(emote_name, '❓')
                    embed.description += '{} **{}**\n'.format(emote, role)
                embed.description += '\n'
            return await ctx.send(embed=embed)

        if rolename in role_categories:
            return await ctx.invoke(self.bot.get_command('categoryinfo'),
                                    category=rolename)

        for role in all_roles.values():
            role = role()  # initialize the class
            if role.name.lower() == rolename.lower():
                if role.__doc__ is None:
                    return await ctx.send(
                        'No documentation on {} available.'.format(rolename))

                annotations = []
                annotations.append(role.faction.category_name)
                if role.unique:
                    annotations.append('Unique')

                embed = discord.Embed()
                embed.color = 0x000000
                embed.set_author(
                    name=f'{role.name} ({"; ".join(annotations)})',
                    icon_url=self.bot.user.avatar_url)
                embed.description = '```diff\n'
                embed.description += inspect.getdoc(role)
                embed.description += '```'
                embed.set_footer(
                    text=f'Categories: {", ".join(sorted(role.categories))}')

                return await ctx.send(embed=embed)

        for role in all_roles.values():
            if jaro_winkler(role.name.lower(), rolename.lower()) > 0.85:
                await ctx.send(
                    'Couldn\'t find the role "{}". Did you mean {}?'.format(
                        rolename, role.name))

                def check(msg):
                    return msg.author == ctx.author and msg.content.lower(
                    ) in ['yes', 'y', 'yeah']

                try:
                    response = await self.bot.wait_for('message',
                                                       timeout=10.0,
                                                       check=check)
                    return await ctx.invoke(ctx.command, rolename=role.name)
                except asyncio.TimeoutError:
                    return
        await ctx.send("Couldn't find that role!")
Example #36
def get_similarity(uuid1, uuid2):
    clean1 = uuid_to_cleanid[uuid1]
    clean2 = uuid_to_cleanid[uuid2]
    if clean1 == clean2:
        return 1.0
    return jaro_winkler(clean1, clean2, 0.0)
Example #37
from sys import argv
from PIL import Image
from pytesseract import image_to_string
from Levenshtein import jaro_winkler

import glob

states = open("states.txt").read().split("\n")[:-1]

for infile in sorted(glob.iglob(argv[1])):
    im = Image.open(infile)

    # rect = (444, 63, 661, 116)
    rect = (383, 58, 686, 110)

    im = im.crop(rect).point(lambda p: p > 111)
    im = im.convert("1")

#    print(image_to_string(im, config="--user-words states.txt --user-patterns states.txt -c load_system_dawg=0 -c load_freq_dawg=0"))
    text = image_to_string(im, config="-l states -c tessdata_char_blacklist='0123456789' bazaar").split("\n")[0]
    if text:
        match = text if text in states else max(states, key=lambda s: jaro_winkler(text, s))
        print(infile, match, text, jaro_winkler(text, match))
Example #38
def discogs_ordered_search(query, item_type, limit=100):

    name_pattern = r' \([0-9]+\)'
    q_stripped = query.strip("'\"")

    # special case when searching directly by id
    if q_stripped.isdigit():

        url = 'http://{host}/{item_type}s/{query}'.format(
            host=DISCOGS_HOST,
        query=urllib.parse.quote_plus(query.lower()),
            item_type=item_type
        )

        log.debug('search by id: {0}'.format(url))
        r = requests.get(url)

        if not r.status_code == 200:
            return []


        data = json.loads(r.text.replace('api.discogs.com', DISCOGS_HOST))

        # TODO: not very nice - remap some fields
        if item_type == 'release':

            if 'title' in data:
                data['title'] = re.sub(name_pattern, '', data['title'])

            if 'formats' in data:
                formats = []
                for format in [f['name'] for f in data['formats'] if 'name' in f]:
                    formats.append(format)
                data['format'] = formats

            if 'labels' in data:
                try:
                    data['catno'] = data['labels'][0]['catno']
                except KeyError:
                    pass

        if item_type == 'artist':

            if 'name' in data:
                data['title'] = re.sub(name_pattern, '', data['name'])

            if 'aliases' in data:
                aliases = []
                for alias in [a['name'] for a in data['aliases'] if 'name' in a]:
                    aliases.append(re.sub(name_pattern, '', alias))
                data['aliases'] = aliases

            if 'members' in data:
                members = []
                for member in [m['name'] for m in data['members'] if 'name' in m]:
                    members.append(re.sub(name_pattern, '', member))
                data['members'] = members

            if 'images' in data:

                for image in [i['uri150'] for i in data['images'] if 'type' in i and i['type'] == 'primary']:
                    data['thumb'] = image
                    break

        return [data]

    url = 'http://{host}/database/search?q={query}&type={item_type}&per_page=100'.format(
        host=DISCOGS_HOST,
        query=urllib.parse.quote_plus(query.encode('utf8').lower()),
        item_type=item_type
    )

    results = []
    results_unsorted = []
    results_exact = []
    results_start = []
    results_other = []

    x = 0
    while url and x < API_MAX_REQUESTS:

        log.debug(url)
        r = requests.get(url)

        if not r.status_code == 200:
            return []

        data = json.loads(r.text.replace('api.discogs.com', DISCOGS_HOST))

        url = reduce(dict.get, ['pagination', 'urls', 'next'], data)

        for r in data['results']:
            if 'title' in r:
                title = r['title']
                formatted_title = re.sub(name_pattern, '', title)
                r['index'] = get_index(title)
                r['formatted_title'] = formatted_title
                r['uri'] = 'https://www.discogs.com%s' % r['uri']

                r['dist'] = distance(formatted_title.lower(), q_stripped.lower())
                r['dist1'] = jaro(formatted_title.lower(), q_stripped.lower())
                r['dist2'] = jaro_winkler(formatted_title.lower(), q_stripped.lower())
                r['dist3'] = ratio(formatted_title.lower(), q_stripped.lower())


                results_unsorted.append(r)

                if formatted_title.lower() == q_stripped.lower():
                    #print 'exact', formatted_title.lower()
                    results_exact.append(r)
                elif formatted_title.lower().startswith(q_stripped.lower()[0:10]):
                    #print 'start', formatted_title.lower()
                    results_start.append(r)
                else:
                    #print 'other', formatted_title.lower()
                    results_other.append(r)

        x += 1

    #results = sort_results(results_exact) + sort_results(results_start)+ sort_results(results_other)
    results = sort_results_by_distance(results_unsorted)

    if item_type == 'artist':
        results = populate_results(results)

    return results[0:limit]