def test_is_confusable(self):
    """Check ``confusables.is_confusable`` for greedy/non-greedy use and alias filters."""
    latin_only = ['latin']

    # ``looks_good`` / ``is_good`` / ``greek_a`` are fixtures defined outside this method.
    flagged = confusables.is_confusable(looks_good, preferred_aliases=latin_only)
    first_hit = flagged[0]
    self.assertEqual(first_hit['character'], greek_a)
    self.assertIn(
        {'c': 'A', 'n': 'LATIN CAPITAL LETTER A'},
        first_hit['homoglyphs'],
    )
    self.assertFalse(confusables.is_confusable(is_good, preferred_aliases=latin_only))

    # The trailing Γ must not make the string confusable for Latin readers.
    self.assertFalse(confusables.is_confusable(u'AlloΓ', preferred_aliases=latin_only))

    sample = u'Αlloρ'
    # Non-greedy: stop at the first confusable character.
    first_only = confusables.is_confusable(sample, greedy=False)
    self.assertEqual(len(first_only), 1)
    # Greedy: report every confusable character.
    # Α (greek), l, o, and ρ can be confused with other unicode characters.
    every_hit = confusables.is_confusable(sample, greedy=True)
    self.assertEqual(len(every_hit), 4)
    # Only Α (greek) and ρ (greek) can be confused with a latin character.
    latin_hits = confusables.is_confusable(
        sample, greedy=True, preferred_aliases=latin_only)
    self.assertEqual(len(latin_hits), 2)

    # For 'Latin' readers the Greek ρ is the confusable character ...
    for_latin = confusables.is_confusable(u'paρa', preferred_aliases=['latin'])
    self.assertEqual(for_latin[0]['character'], u'ρ')
    # ... while for 'Greek' readers the Latin p is.
    for_greek = confusables.is_confusable(u'paρa', preferred_aliases=['greek'])
    self.assertEqual(for_greek[0]['character'], 'p')

    # Aliases given as unicode strings must be accepted as well.
    try:
        confusables.is_confusable('', preferred_aliases=[u'latin'])
    except TypeError:
        self.fail('TypeError when preferred_aliases provided as unicode')
def string_coerce(string):
    """Coerce look-alike characters of ``string`` to Latin letters or digits.

    The string is returned untouched when ``confusables.is_confusable`` finds
    nothing confusable with the 'latin'/'common' aliases.  Otherwise every
    character is inspected on its own:

    * a character with no known confusables is kept as-is;
    * a character whose alias is ``'LATIN'``, or that ``is_valid_int``
      accepts, is kept as-is;
    * any other character is replaced by its first homoglyph whose Unicode
      name starts with ``LATIN`` or ``DIGIT``; when there is no such
      homoglyph the character is dropped.

    :param string: text to coerce.
    :return: the coerced text.
    """
    # Initial safety test: skip all the per-character work for safe input.
    if not confusables.is_confusable(string, preferred_aliases=['latin', 'common']):
        return string

    coerced = []
    for char in string:
        entries = confusables.is_confusable(
            char, greedy=False, preferred_aliases=[], allow_digit=False)
        if not entries:
            # is_confusable() answers with a falsy value (False) rather than
            # an empty list when the character has no confusables; extending
            # a list with it would raise TypeError, so keep the character.
            coerced.append(char)
            continue

        for entry in entries:
            if entry['alias'] == 'LATIN' or is_valid_int(entry):
                coerced.append(entry['character'])
                continue
            for homoglyph in entry['homoglyphs']:
                if homoglyph['n'].startswith(('LATIN', 'DIGIT')):
                    coerced.append(homoglyph['c'])
                    break
    return ''.join(coerced)
def normalize_username(name):
    """Generate a normalized version of a username for preventing deceptively similar names."""
    # Names are compared case-insensitively, so lower-case first.
    name = name.lower()

    # Replace confusable characters.
    # In version 3.0.0 a single call such as
    #   confusables.is_confusable(name, greedy=True, preferred_aliases=['latin'])
    # can fail with a TypeError if a character is not found, so each
    # character is probed on its own and failures are skipped.
    replacements = {}
    for char in name:
        try:
            hits = confusables.is_confusable(char, preferred_aliases=['latin'])
        except TypeError:
            continue
        if not hits:
            # Falsy result: nothing confusable for this character.
            continue
        for hit in hits:
            # The first listed homoglyph is used as the canonical form.
            replacements[hit['character']] = hit['homoglyphs'][0]['c']

    return ''.join(replacements.get(char, char) for char in name)
def homoglyphs2ascii(input_str, acceptable_characters):
    """Build candidate renderings of ``input_str`` from acceptable homoglyphs.

    For every input character a candidate set is collected (the character
    itself plus each of its homoglyphs found in ``acceptable_characters``) and
    handed to ``get_tree_paths`` together with the paths built so far.  Once
    more than 1000 paths exist, the homoglyph lookup is skipped and only the
    character itself is passed on.

    :param input_str: text to expand.
    :param acceptable_characters: container queried with ``in`` to decide
        whether a homoglyph may be used.
    :return: the value produced by the last ``get_tree_paths`` call
        (``[""]`` for empty input).
    """
    paths = [""]
    for char in input_str:
        # Note that 'confusables.is_confusable()' only includes homoglyphs of
        # the input character but doesn't include the input character itself,
        # so the candidate set is seeded with it.
        candidates = {char}

        if len(paths) <= 1000:
            # TODO: if an input character is not acceptable, should we still pass
            # it along? What about if there's no acceptable homoglyph?
            matches = confusables.is_confusable(char, preferred_aliases=[], greedy=True)
            for match in matches or ():
                candidates.update(
                    glyph["c"]
                    for glyph in match["homoglyphs"]
                    if glyph["c"] in acceptable_characters
                )

        paths = get_tree_paths(paths, candidates)
    return paths
def rationalize_characterset(text: str) -> str:
    """Replace characters that can be mistaken for Latin ones by a Latin homoglyph.

    Every character reported by ``confusables.is_confusable`` (with the
    'latin' and 'common' aliases preferred) is replaced, everywhere in
    ``text``, by its first homoglyph that consists solely of LATIN
    characters.  A reported character without such a homoglyph is left
    unchanged.

    :param text: input text.
    :return: ``text`` with the replacements applied.
    """
    issues = confusables.is_confusable(
        text, preferred_aliases=['latin', 'common'], greedy=True)
    if not issues:
        return text

    for issue in issues:
        # A homoglyph may span several characters, while aliases_categories()
        # is queried one character at a time; hence the per-character check.
        # next(..., None) avoids the IndexError an empty candidate list would
        # cause when a character is only confusable with non-LATIN glyphs.
        replacement = next(
            (
                glyph['c']
                for glyph in issue['homoglyphs']
                if glyph['c'] and all(
                    categories.aliases_categories(ch)[0] == 'LATIN'
                    for ch in glyph['c']
                )
            ),
            None,
        )
        if replacement is not None:
            text = text.replace(issue['character'], replacement)
    return text
def run(self, record):
    """
    Using the confusable-homoglyphs, we are going to generate all alternatives ASCII
    names of a domain.  It's a bit of a brute force though.

    ``record['all_domains']`` is read, replaced by the list of generated
    alternatives, and the same ``record`` object is returned.  When
    ``self.greedy`` is false only the first alternative of each domain is kept.
    """
    decoded = []

    for domain in record['all_domains']:
        # Remove the wildcard to simplify the domain name a bit; it is put
        # back on every generated alternative below.
        wildcard = domain.startswith('*.')
        if wildcard:
            domain = domain[2:]

        # is_confusable() returns False, not an empty list, when it finds
        # nothing; `or []` keeps the comprehension from iterating a bool.
        hg_map = {
            hg['character']: hg
            for hg in confusables.is_confusable(domain, greedy=True) or []
        }

        decoded_domain_c = []
        for domain_c in domain:
            hglyph = hg_map.get(domain_c)
            if hglyph is None:
                # No homoglyphs known for this character, so the original
                # character is kept as it is.
                decoded_domain_c.append([domain_c])
                continue

            found = []
            if hglyph['alias'] == 'LATIN':
                # The character is Latin already, keep it as a candidate.
                found.append(hglyph['character'])

            found.extend(
                alt['c'].lower()
                for alt in hglyph['homoglyphs']
                if HomoglyphsDecoder.is_latin(alt['c'])
            )

            # If nothing is found, we keep the original character.
            if not found:
                found.append(hglyph['character'])

            decoded_domain_c.append(found)

        for alt in self._generate_alternatives(decoded_domain_c):
            if wildcard:
                alt = '*.{}'.format(alt)

            decoded.append(alt)

            if not self.greedy:
                break

    record['all_domains'] = decoded
    return record
def gagachat(i):
    # Opens one chat session; `i` is only used to label the log lines printed
    # by this function.
    # NOTE(review): the original indentation was lost; the nesting below is
    # reconstructed from the statement order - confirm against the real file.

    # Fetch the script served by the site, passing the current epoch time in
    # milliseconds as the `c` query parameter.
    t = round(time.time() * 1000)
    r = requests.get("http://www.gagalive.com/randomchat/js/?c=" + str(t)).text
    #print(r)
    # Two handshake strings are cut out of the response between fixed marker
    # substrings; str.index raises ValueError if a marker is missing.
    key = [
        "Y" + r[r.index("4e4|") + 4:r.index("|24e4")],
        "L손님_" + r[r.index("ub2d8_") + 6:r.index("|uc624")] + "|@@@randomchat"
    ]
    print(key)
    ws = create_connection("ws://rchat.gagalive.kr:8080/")
    print(str(i), "open")
    # Send both handshake strings, then two fixed messages with short pauses.
    ws.send(key[0])
    ws.send(key[1])
    time.sleep(0.5)
    time.sleep(0.1)
    ws.send("#!)*")
    time.sleep(0.1)
    ws.send("#붸에에에에ㅔㄱ")
    while True:
        r = ws.recv()
        print("RECV", r)
        # Marker preceding the challenge text in a received message
        # (presumably Korean for "anti-spam characters: " - confirm).
        spam = "스팸 방지 문자: "
        if r.find(spam) != -1:
            # Everything after the marker is treated as the challenge.
            s = r[r.find(spam) + len(spam):]
            print("CAPTCHA", s)
            ss = ""
            for c in s:
                a = confusables.is_confusable(c, greedy=True)
                print(a)
                # is_confusable yields False when it has nothing for `c`;
                # the character is then copied through unchanged.
                if a == False:
                    ss += c
                    continue
                for p in a:
                    found = False
                    # Take the first homoglyph whose Unicode name starts with
                    # "DIGIT" or contains ", DIGIT".
                    for q in p['homoglyphs']:
                        if q['n'].find(
                                "DIGIT") == 0 or q['n'].find(", DIGIT") != -1:
                            ss += q['c']
                            found = True
                            break
                    # No digit look-alike: keep the original character.
                    if not found:
                        ss += c
            # Drop spaces and parentheses before answering.
            ss = ss.replace(" ", "").replace(")", "").replace("(", "")
            print("CAPTCHA DECODE", ss)
            ws.send("#" + ss)
    # NOTE(review): the loop above has no `break`, so as laid out here this
    # line is never reached; the socket is also never closed explicitly.
    print(str(i), "shut down")
def __init__(self, text):
    """Compile a case-insensitive regular expression for ``text``.

    When ``confusables.is_confusable`` reports confusable characters, the
    pattern comes from ``Filter.build_regex``; otherwise ``text`` is matched
    literally.  The source text is stored on ``self.text`` and the compiled
    pattern on ``self.regex``.
    """
    logger.info("Creating filter regular expression from %r", text)

    confusable_groups = confusables.is_confusable(text, greedy=True)
    pattern = (
        Filter.build_regex(text, confusable_groups)
        if confusable_groups
        else re.escape(text)
    )
    logger.debug("Generated pattern: %r", pattern)

    self.text = text
    self.regex = re.compile(pattern, re.IGNORECASE)
def clean_confusables(s):
    """Replace characters of ``s`` that are confusable with Latin ones.

    Each character reported by ``is_confusable`` is replaced, throughout the
    string, by the first homoglyph listed for it.  When more than one
    homoglyph is listed, the full report entry is printed as JSON.

    :param s: input string.
    :return: ``s`` itself when nothing is reported, otherwise the cleaned copy.
    """
    hits = is_confusable(s, greedy=True, preferred_aliases=["latin"])
    if not hits:
        return s

    cleaned = s
    for hit in hits:
        character = hit['character']
        alternatives = hit['homoglyphs']
        if len(alternatives) > 1:
            # Several candidates exist; report which entry was ambiguous.
            print("In %s found %s" % (s, json.dumps(hit)))
        cleaned = cleaned.replace(character, alternatives[0]['c'])
    return cleaned
def run(self, record):
    '''
    Using the confusable-homoglyphs, we are going to generate all alternatives ASCII
    names of a domain.  It's a bit of a brute force though.

    ``record['all_domains']`` is read, replaced by the list of generated
    alternatives, and the same ``record`` object is returned.  When
    ``self.greedy`` is false only the first alternative of each domain is kept.
    '''
    decoded = []

    for domain in record['all_domains']:
        # Remove the wildcard to simplify the domain name a bit; it is put
        # back on every generated alternative below.
        wildcard = domain.startswith('*.')
        if wildcard:
            domain = domain[2:]

        # is_confusable() returns False, not an empty list, when it finds
        # nothing; `or []` keeps the comprehension from iterating a bool.
        hg_map = {
            hg['character']: hg
            for hg in confusables.is_confusable(domain, greedy=True) or []
        }

        decoded_domain_c = []
        for domain_c in domain:
            hglyph = hg_map.get(domain_c)
            if hglyph is None:
                # No homoglyphs known for this character, so we decide to
                # keep the original character as it is.
                decoded_domain_c.append([domain_c])
                continue

            found = []
            if hglyph['alias'] == 'LATIN':
                # The character is latin, we don't need to do anything here.
                found.append(hglyph['character'])

            for alt in hglyph['homoglyphs']:
                # For our specific case only the ASCII letters a-z / A-Z count.
                # Every character of the homoglyph is checked because the
                # library also returns multi-character matches, for example
                # 'rn' has an alternative of 'm'.
                if all('a' <= alt_c <= 'z' or 'A' <= alt_c <= 'Z' for alt_c in alt['c']):
                    found.append(alt['c'].lower())

            # If nothing is found, we keep the original character.
            if not found:
                found.append(hglyph['character'])

            decoded_domain_c.append(found)

        for alt in self._generate_alternatives(decoded_domain_c):
            if wildcard:
                alt = '*.{}'.format(alt)

            decoded.append(alt)

            if not self.greedy:
                break

    record['all_domains'] = decoded
    return record
# NOTE(review): the next two statements are the tail of a definition whose
# beginning lies outside this excerpt; they are kept exactly as found and
# their indentation is a guess.
                    break
    return ''.join(coerced)


# Demo driver: coerces a handful of sample strings word by word and prints,
# for each one, whether is_confusable() flags it before and after coercion.
if __name__ == '__main__':
    test_strings = (
        '𐌚chan', #unsafe
        '8chan', #safe
        'уolo', #unsafe
        'Κiller Quеen', #unsafe
        'This is a safe sentence.', #safe
        "It'ѕ lit yo" #unsafe
    )
    for string in test_strings:
        #handle sentences by checking each word individually
        words = string.split(' ')
        result = ""
        for word in words:
            # Each coerced word is prefixed with a space ...
            result += ' {}'.format(string_coerce(word))
        # ... so the leading space has to be stripped afterwards.
        result = result.strip()
        # Truthy is_confusable() result means "unsafe" for Latin/common readers.
        test_original = bool(confusables.is_confusable(string, preferred_aliases=['latin', 'common']))
        print('Original is unsafe: {}'.format(str(test_original)))
        print('{} -> {}'.format(string, result))
        test_new = bool(confusables.is_confusable(result, preferred_aliases=['latin', 'common']))
        print('Coersion is unsafe: {}\n'.format(str(test_new)))