Python is_confusable Examples, confusable_homoglyphs.confusables.is_confusable Python Examples

Example #1

0

Show file

File: test_confusables.py Project: v-a-kernel/confusable_homoglyphs

    def test_is_confusable(self):
        """Exercise ``confusables.is_confusable`` with and without alias filters."""
        check = confusables.is_confusable

        # ``looks_good`` / ``greek_a`` / ``is_good`` are module-level fixtures
        # (presumably a Greek look-alike string and a plain Latin one -- confirm).
        greek_hits = check(looks_good, preferred_aliases=['latin'])
        first_hit = greek_hits[0]
        self.assertEqual(first_hit['character'], greek_a)
        self.assertIn(
            {'c': 'A', 'n': 'LATIN CAPITAL LETTER A'},
            first_hit['homoglyphs'],
        )
        self.assertFalse(check(is_good, preferred_aliases=['latin']))

        self.assertFalse(check(u'AlloΓ', preferred_aliases=['latin']))

        # Non-greedy mode reports only the first confusable character.
        first_only = check(u'Αlloρ', greedy=False)
        self.assertEqual(len(first_only), 1)

        # Greedy mode reports every confusable character:
        # Α (greek), l, o, and ρ can be confused with other unicode characters.
        every_hit = check(u'Αlloρ', greedy=True)
        self.assertEqual(len(every_hit), 4)

        # Restricted to Latin, only Α (greek) and ρ (greek) remain confusable.
        latin_hits = check(u'Αlloρ', greedy=True, preferred_aliases=['latin'])
        self.assertEqual(len(latin_hits), 2)

        # A 'Latin' reader is fooled by the ρ ...
        for_latin = check(u'paρa', preferred_aliases=['latin'])
        self.assertEqual(for_latin[0]['character'], u'ρ')
        # ... while a 'Greek' reader is fooled by the p.
        for_greek = check(u'paρa', preferred_aliases=['greek'])
        self.assertEqual(for_greek[0]['character'], 'p')

        # Unicode alias names must be accepted without raising.
        try:
            check('', preferred_aliases=[u'latin'])
        except TypeError:
            self.fail('TypeError when preferred_aliases provided as unicode')

Example #2

0

Show file

def string_coerce(string):
    """Replace confusable characters in ``string`` with Latin/digit look-alikes.

    The string is returned untouched when ``confusables.is_confusable`` finds
    nothing confusable for the 'latin' and 'common' aliases.  Otherwise every
    character is inspected on its own:

    * a character with no known homoglyphs is kept as is;
    * a character whose alias is ``'LATIN'``, or that ``is_valid_int``
      accepts, is kept as is;
    * any other character is replaced by its first homoglyph whose name
      starts with ``LATIN`` or ``DIGIT``; if there is none, the character is
      dropped (behaviour inherited from the original implementation).

    :param string: text to coerce.
    :return: the coerced text.
    """
    # Initial safety test: nothing to do for strings that are already safe.
    if not confusables.is_confusable(string, preferred_aliases=['latin', 'common']):
        return string

    coerced = []
    for char in string:
        # NOTE(review): ``allow_digit`` is passed through unchanged; confirm the
        # installed confusable_homoglyphs version accepts this keyword.
        entries = confusables.is_confusable(
            char, greedy=False, preferred_aliases=[], allow_digit=False)

        if not entries:
            # is_confusable() returns False (not an empty list) when the
            # character has no homoglyphs; the previous ``list += False``
            # raised TypeError here.  Such a character is safe, so keep it.
            coerced.append(char)
            continue

        for entry in entries:
            # is_valid_int receives the whole result entry, as before.
            if entry['alias'] == 'LATIN' or is_valid_int(entry):
                coerced.append(entry['character'])
                continue

            for homoglyph in entry['homoglyphs']:
                if homoglyph['n'].startswith(('LATIN', 'DIGIT')):
                    coerced.append(homoglyph['c'])
                    break

    return ''.join(coerced)

Example #3

0

Show file

File: normalization.py Project: drawpile/website

def normalize_username(name):
    """Return a canonical form of ``name`` for spotting deceptively similar
    usernames.

    The name is lower-cased, then every character reported as confusable for
    Latin readers is swapped for the first homoglyph listed for it.
    """
    # Names are compared case-insensitively.
    lowered = name.lower()

    # Characters are probed one at a time instead of passing the whole string
    # with greedy=True: in version 3.0.0 that call can fail with a TypeError
    # if a character is not found.
    hits = []
    for ch in lowered:
        try:
            hits.extend(
                confusables.is_confusable(ch, preferred_aliases=['latin']))
        except TypeError:
            # Raised either by the library itself or by extend() when the
            # call hands back a non-iterable result; skip this character.
            continue

    if not hits:
        return lowered

    # Later entries win for a repeated character, as in a dict comprehension.
    replacements = {}
    for hit in hits:
        replacements[hit['character']] = hit['homoglyphs'][0]['c']

    return ''.join(replacements.get(ch, ch) for ch in lowered)

Example #4

0

Show file

File: homoglyphs2ascii.py Project: wikipathways/pathway-figure-ocr

def homoglyphs2ascii(input_str, acceptable_characters):
    """Expand ``input_str`` into candidate strings built from acceptable
    homoglyphs.

    For each input character a candidate set is collected (the character
    itself plus every homoglyph found in ``acceptable_characters``) and handed
    to ``get_tree_paths`` together with the paths accumulated so far.

    :param input_str: text to expand.
    :param acceptable_characters: container of characters allowed as
        replacements.
    :return: the value produced by the final ``get_tree_paths`` call (or
        ``[""]`` for empty input).
    """
    paths = [""]

    for ch in input_str:
        # confusables.is_confusable() lists only the homoglyphs of a character,
        # never the character itself, so seed the candidates with it.
        candidates = {ch}

        # TODO: if an input character is not acceptable, should we still pass
        # it along? What about if there's no acceptable homoglyph?  (The
        # alternative would be to start from an empty set and add ``ch`` only
        # when it is in acceptable_characters.)

        # Once more than 1000 paths have accumulated, no homoglyph lookup is
        # done and only the original character is carried forward.
        if len(paths) <= 1000:
            matches = confusables.is_confusable(
                ch, preferred_aliases=[], greedy=True)
            if matches:
                candidates.update(
                    glyph["c"]
                    for match in matches
                    for glyph in match["homoglyphs"]
                    if glyph["c"] in acceptable_characters
                )

        paths = get_tree_paths(paths, candidates)

    return paths

Example #5

0

Show file

def rationalize_characterset(text: str) -> str:
    """Replace confusable characters in ``text`` with Latin look-alikes.

    Every character reported by ``confusables.is_confusable`` (greedy, with
    'latin' and 'common' as preferred aliases) is replaced, everywhere in the
    text, by its first single-character homoglyph whose alias is ``'LATIN'``.
    A confusable character with no such homoglyph is left unchanged.

    :param text: text to clean up.
    :return: the text with Latin replacements applied.
    """
    issues = confusables.is_confusable(
        text, preferred_aliases=['latin', 'common'], greedy=True)
    if not issues:
        return text

    for issue in issues:
        # Take the first Latin homoglyph, if any.  Indexing ``[...][0]`` raised
        # IndexError when a character was only confusable with non-Latin
        # (e.g. 'common') characters.  Multi-character homoglyphs are skipped
        # because aliases_categories() is given one character at a time
        # (NOTE(review): it appears to call ord() on its argument -- confirm).
        replacement = next(
            (
                glyph for glyph in issue['homoglyphs']
                if len(glyph['c']) == 1
                and categories.aliases_categories(glyph['c'])[0] == 'LATIN'
            ),
            None,
        )
        if replacement is None:
            continue
        text = text.replace(issue['character'], replacement['c'])

    return text

Example #6

0

Show file

File: common_domain_analyser.py Project: huydhn/certstream-analytics

    def run(self, record):
        """
        Using the confusable-homoglyphs, we are going to generate all alternatives ASCII
        names of a domain.  It's a bit of a brute force though.

        ``record['all_domains']`` is read, and then replaced in place by the
        list of decoded alternatives; the same ``record`` object is returned.
        When ``self.greedy`` is false only the first alternative of each
        domain is kept.
        """
        decoded = []

        for domain in record['all_domains']:
            # Remove wild card to simplify the domain name a bit and we'll put
            # it back later.
            wildcard = domain.startswith('*.')
            if wildcard:
                domain = domain[2:]

            # is_confusable() returns False rather than an empty list when
            # nothing in the domain is confusable; iterating that value used
            # to raise TypeError.
            confusable_chars = confusables.is_confusable(domain, greedy=True) or []
            hg_map = {hg['character']: hg for hg in confusable_chars}
            decoded_domain_c = []

            for domain_c in domain:
                # Confusable homoglyphs could not find any homoglyphs for this
                # character so we decide to keep the original character as it is
                if domain_c not in hg_map:
                    decoded_domain_c.append([domain_c])
                    continue

                found = []
                hglyph = hg_map[domain_c]

                if hglyph['alias'] == 'LATIN':
                    # The character is Latin, we don't need to do anything here
                    found.append(hglyph['character'])

                # Collect every Latin alternative, lower-cased.
                found.extend(
                    alt['c'].lower()
                    for alt in hglyph['homoglyphs']
                    if HomoglyphsDecoder.is_latin(alt['c'])
                )

                # If nothing is found, we keep the original character
                if not found:
                    found.append(hglyph['character'])

                decoded_domain_c.append(found)

            # One list of candidates per position goes in; the helper is
            # expected to yield whole candidate names (defined elsewhere).
            for alt in self._generate_alternatives(decoded_domain_c):
                if wildcard:
                    alt = '*.{}'.format(alt)

                decoded.append(alt)

                if not self.greedy:
                    break

        record['all_domains'] = decoded
        return record

Example #7

0

Show file

File: ranchat.py Project: imnotkind/ranchat-macro

def _decode_captcha(s):
    """Rebuild the anti-spam code in ``s`` by swapping look-alikes for digits.

    Each character with a homoglyph whose name starts with ``DIGIT`` (or
    contains ``", DIGIT"``) is replaced by that homoglyph; every other
    character is kept.  Spaces and parentheses are stripped from the result.
    """
    pieces = []
    for c in s:
        a = confusables.is_confusable(c, greedy=True)
        print(a)
        if not a:
            # No homoglyphs known for this character (is_confusable gave False).
            pieces.append(c)
            continue
        for p in a:
            digit = next(
                (
                    q['c'] for q in p['homoglyphs']
                    if q['n'].startswith("DIGIT") or ", DIGIT" in q['n']
                ),
                None,
            )
            pieces.append(c if digit is None else digit)
    ss = "".join(pieces)
    return ss.replace(" ", "").replace(")", "").replace("(", "")


def gagachat(i):
    """Join the gagalive random chat and answer its anti-spam challenges.

    Fetches the session keys over HTTP, opens a websocket, sends the login
    frames and then loops forever on incoming messages, replying to every
    message that contains the anti-spam marker with the decoded code.

    :param i: label for this connection, used only in log output.

    The receive loop has no exit condition; it ends only when an exception
    (for example from ``ws.recv``) propagates.  The socket is closed and the
    "shut down" line is printed on the way out -- previously that print was
    unreachable and the socket was never closed.
    """
    t = round(time.time() * 1000)
    r = requests.get("http://www.gagalive.com/randomchat/js/?c=" + str(t)).text
    #print(r)

    # Both keys are cut out of the returned script between fixed markers;
    # str.index raises ValueError if a marker is missing.
    key = [
        "Y" + r[r.index("4e4|") + 4:r.index("|24e4")],
        "L손님_" + r[r.index("ub2d8_") + 6:r.index("|uc624")] + "|@@@randomchat"
    ]
    print(key)

    ws = create_connection("ws://rchat.gagalive.kr:8080/")
    print(str(i), "open")

    try:
        ws.send(key[0])
        ws.send(key[1])

        # Same total pause as the original back-to-back 0.5 s and 0.1 s sleeps.
        time.sleep(0.5)
        time.sleep(0.1)
        ws.send("#!)*")

        time.sleep(0.1)
        ws.send("#붸에에에에ㅔㄱ")

        spam = "스팸 방지 문자: "
        while True:
            r = ws.recv()
            print("RECV", r)
            pos = r.find(spam)
            if pos != -1:
                s = r[pos + len(spam):]
                print("CAPTCHA", s)
                ss = _decode_captcha(s)
                print("CAPTCHA DECODE", ss)
                ws.send("#" + ss)
    finally:
        # NOTE(review): assumes the object from create_connection exposes
        # close(), as websocket-client's WebSocket does -- confirm.
        ws.close()
        print(str(i), "shut down")

Example #8

0

Show file

File: filter.py Project: Scrub000/futaba

    def __init__(self, text):
        """Compile a case-insensitive regular expression for ``text``.

        When ``text`` contains confusable characters the pattern comes from
        ``Filter.build_regex``; otherwise the text is matched literally.
        """
        logger.info("Creating filter regular expression from %r", text)

        confusable_groups = confusables.is_confusable(text, greedy=True)
        pattern = (
            Filter.build_regex(text, confusable_groups)
            if confusable_groups
            else re.escape(text)
        )
        logger.debug("Generated pattern: %r", pattern)

        self.text = text
        self.regex = re.compile(pattern, re.IGNORECASE)

Example #9

0

Show file

def clean_confusables(s):
    """Return ``s`` with each Latin-confusable character replaced by the
    first homoglyph listed for it.

    A diagnostic line is printed for every character that has more than one
    homoglyph.  ``s`` is returned unchanged when nothing is confusable.
    """
    hits = is_confusable(s, greedy=True, preferred_aliases=["latin"])
    if not hits:
        return s

    cleaned = s
    for hit in hits:
        offending = hit['character']
        candidates = hit['homoglyphs']
        if len(candidates) > 1:
            # Several replacements are possible; report which one is picked from.
            print("In %s found %s" % (s, json.dumps(hit)))
        cleaned = cleaned.replace(offending, candidates[0]['c'])
    return cleaned

Example #10

0

Show file

    def run(self, record):
        '''
        Using the confusable-homoglyphs, we are going to generate all alternatives ASCII
        names of a domain.  It's a bit of a brute force though.

        ``record['all_domains']`` is read, and then replaced in place by the
        list of decoded alternatives; the same ``record`` object is returned.
        When ``self.greedy`` is false only the first alternative of each
        domain is kept.
        '''
        decoded = []

        # For our specific case, we will only care about latin character
        lower_s = range(ord('a'), ord('z') + 1)
        upper_s = range(ord('A'), ord('Z') + 1)

        for domain in record['all_domains']:
            # Remove wildcard to simplify the domain name a bit and we'll put
            # it back later
            wildcard = domain.startswith('*.')
            if wildcard:
                domain = domain[2:]

            # is_confusable() returns False rather than an empty list when
            # nothing in the domain is confusable; iterating that value used
            # to raise TypeError.
            confusable_chars = confusables.is_confusable(domain, greedy=True) or []
            hg_map = {hg['character']: hg for hg in confusable_chars}
            decoded_domain_c = []

            for domain_c in domain:
                # Confusable homoglyphs could not find any homoglyphs for this
                # character so we decide to keep the original character as it is
                if domain_c not in hg_map:
                    decoded_domain_c.append([domain_c])
                    continue

                found = []
                hglyph = hg_map[domain_c]

                if hglyph['alias'] == 'LATIN':
                    # The character is latin, we don't need to do anything here
                    found.append(hglyph['character'])

                for alt in hglyph['homoglyphs']:
                    # We need to check every character of the homoglyph here
                    # because confusable_homoglyphs also returns
                    # multi-character matches, for example, 'rn' has an
                    # alternative of 'm'
                    is_latin = all(
                        ord(alt_c) in lower_s or ord(alt_c) in upper_s
                        for alt_c in alt['c']
                    )
                    if is_latin:
                        found.append(alt['c'].lower())

                # If nothing is found, we keep the original character
                if not found:
                    found.append(hglyph['character'])

                decoded_domain_c.append(found)

            # One list of candidates per position goes in; the helper is
            # expected to yield whole candidate names (defined elsewhere).
            for alt in self._generate_alternatives(decoded_domain_c):
                if wildcard:
                    alt = '*.{}'.format(alt)

                decoded.append(alt)

                if not self.greedy:
                    break

        record['all_domains'] = decoded
        return record

Example #11

0

Show file

                    break

    return ''.join(coerced)

if __name__ == '__main__':
    # Demo: coerce a handful of sample strings and report, for each one,
    # whether the original and the coerced text are flagged as confusable.
    test_strings = (
        '𐌚chan',  # unsafe
        '8chan',  # safe
        'уolo',  # unsafe
        'Κiller Quеen',  # unsafe
        'This is a safe sentence.',  # safe
        "It'ѕ lit yo"  # unsafe
        )

    for string in test_strings:
        # Sentences are handled word by word, then stitched back together.
        coerced_words = [string_coerce(word) for word in string.split(' ')]
        result = ' '.join(coerced_words).strip()

        test_original = bool(
            confusables.is_confusable(string, preferred_aliases=['latin', 'common']))

        print('Original is unsafe: {}'.format(str(test_original)))
        print('{} -> {}'.format(string, result))

        test_new = bool(
            confusables.is_confusable(result, preferred_aliases=['latin', 'common']))
        print('Coersion is unsafe: {}\n'.format(str(test_new)))