def test_complete_gibberish(self):
    """Fully garbled payloads must make the probe abandon analysis (gave_up)."""
    hopeless_samples = (
        """ØĢØŠØģاØĶŲ ŲŲ ØĢŲ اŲŲاØģ ŲŲŲ Ų ا ØģŲŲŲØŠØģاØĶŲŲŲØ ØŊØđŲا ŲØģŲ Øđ ØđŲ (ŲØąŲØŊŲ) ŲاŲØŪا؊Ų """,
        """ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""",
    )
    for sample in hopeless_samples:
        self.assertTrue(ProbeChaos(sample).gave_up)
def test_part_gibberish(self):
    """Partially garbled content should score a chaos ratio strictly above 0.4."""
    mixed_probe = ProbeChaos(
        """[email protected] ุชุฑุฌู ููุฉ ููุดูููุงู ุงููููููููุงูRadoZ ุชูููุนููููุฏูููู ุงููููุชูููููููููููููุช ู ูููู ูููุจููู""",
        giveup_threshold=0.5)
    self.assertGreater(mixed_probe.ratio, 0.4)

    second_sample = "锌褉械锌芯写邪胁邪褌械谢褟屑懈 锌芯褝褌芯 "
    self.assertGreater(ProbeChaos(second_sample).ratio, 0.4)
def test_not_gibberish(self):
    """Legitimate multi-script text must not register any chaos."""
    classical_chinese = '典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。'
    self.assertLessEqual(ProbeChaos(classical_chinese).ratio, 0.)

    # Well-formed Arabic content, plain and with tatweel elongation.
    for clean_sample in (
            'العقلية , التنويم المغناطيسي و / أو الاقتراح',
            "RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل"):
        self.assertEqual(ProbeChaos(clean_sample).ratio, 0.)
def chaos_secondary_pass(self):
    """
    Check chaos in the decoded text once more, this time over the full content.

    :return: Same as the chaos property except it covers all content
    :rtype: float
    """
    full_content = str(self)
    return ProbeChaos(full_content).ratio
def test_subtle_gibberish(self):
    """Lightly damaged text should score a small but bounded chaos ratio."""
    turkish_damaged = "Cehennemin Sava■þ²s²'da kim?"
    self.assertLessEqual(ProbeChaos(turkish_damaged).ratio, 0.5)
    self.assertGreaterEqual(ProbeChaos(turkish_damaged).ratio, 0.)

    big5_mojibake = '´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v'
    self.assertGreater(ProbeChaos(big5_mojibake).ratio, 0.)
    self.assertLessEqual(ProbeChaos(big5_mojibake).ratio, 0.5)

    turkish_sentence = "ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli"
    self.assertGreater(ProbeChaos(turkish_sentence).ratio, 0.)
    self.assertLessEqual(ProbeChaos(turkish_sentence).ratio, 0.5)

    french_subtitle = "<i>Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.</i>"
    self.assertLessEqual(ProbeChaos(french_subtitle).ratio, 0.5)
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
    """
    Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
    charset encoding.
    Will test input like this (with steps=4 & chunk_size=4) --> [#### #### #### ####]
    :param bytes sequences: Actual sequence of bytes to analyse
    :param float threshold: Maximum amount of chaos allowed on first pass
    :param int chunk_size: Size to extract and analyse in each step
    :param int steps: Number of steps
    :return: List of potential matches
    :rtype: CharsetNormalizerMatches
    """
    py_v = [int(el) for el in python_version_tuple()]
    # dict preserves insertion order only from CPython 3.6 onward; sort aliases
    # on older interpreters so results are deterministic.
    py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)
    supported = sorted(aliases.items()) if py_need_sort else aliases.items()

    tested = set()
    matches = list()

    maximum_length = len(sequences)

    # Guard: an empty payload would produce a range() step of 0 below and
    # crash statistics.mean() on an empty list — nothing to match anyway.
    if maximum_length == 0:
        return CharsetNormalizerMatches([])

    if maximum_length <= chunk_size:
        chunk_size = maximum_length
        steps = 1

    for support in supported:
        k, p = support

        # aliases maps many names to one codec; probe each codec only once.
        if p in tested:
            continue
        tested.add(p)

        bom_available = False
        bom_len = None

        try:
            # Detect a leading byte-order mark / signature declared for this codec.
            if p in BYTE_ORDER_MARK:
                marker = BYTE_ORDER_MARK[p]
                if isinstance(marker, bytes) and sequences.startswith(marker):
                    bom_available = True
                    bom_len = len(marker)
                elif isinstance(marker, list):
                    bom_c_list = [sequences.startswith(el) for el in marker]
                    if any(bom_c_list):
                        bom_available = True
                        bom_len = len(marker[bom_c_list.index(True)])

            # Hard requirement: the whole payload (minus BOM) decodes cleanly.
            str(
                sequences if not bom_available else sequences[bom_len:],
                encoding=p
            )
        except UnicodeDecodeError:
            continue
        except LookupError:
            # Alias without an actual codec available on this interpreter.
            continue

        # Measure chaos on evenly spaced chunks instead of the full content.
        r_ = range(
            0 if not bom_available else bom_len,
            maximum_length,
            int(maximum_length / steps)
        )

        measures = [
            ProbeChaos(
                str(sequences[i:i + chunk_size], encoding=p, errors='ignore'),
                giveup_threshold=threshold
            )
            for i in r_
        ]

        ratios = [el.ratio for el in measures]
        nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el in measures].count(True)

        chaos_means = statistics.mean(ratios)
        chaos_median = statistics.median(ratios)

        # Reject when too many chunks gave up, or median chaos crosses threshold.
        if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
            continue

        # Merge per-chunk unicode range counters into one aggregate dict.
        encountered_unicode_range_occurrences = dict()
        for el in measures:
            for u_name, u_occ in el.encountered_unicode_range_occurrences.items():
                if u_name not in encountered_unicode_range_occurrences:
                    encountered_unicode_range_occurrences[u_name] = 0
                encountered_unicode_range_occurrences[u_name] += u_occ

        cnm = CharsetNormalizerMatch(
            sequences if not bom_available else sequences[bom_len:],
            p,
            chaos_means,
            encountered_unicode_range_occurrences,
            bom_available
        )

        # An identical decoded result under another codec name is recorded as a
        # submatch of the existing match rather than a new top-level match.
        fingerprint_tests = [el.fingerprint == cnm.fingerprint for el in matches]
        if any(fingerprint_tests):
            matches[fingerprint_tests.index(True)].submatch.append(cnm)
        else:
            # Reuse the already-built match instead of constructing a duplicate.
            matches.append(cnm)

        # Shortcut: clean ascii or an explicit BOM is treated as decisive.
        # NOTE(review): if cnm was absorbed as a submatch above, matches[-1]
        # is a different match — kept identical to original behavior; confirm intent.
        if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
            return CharsetNormalizerMatches([matches[-1]])

    return CharsetNormalizerMatches(matches)
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
    """
    Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
    charset encoding.
    :param bytearray sequences: Actual sequence of bytes to analyse
    :param float threshold: Maximum amount of chaos allowed on first pass
    :param int chunk_size: Size to extract and analyse in each step
    :param int steps: Number of steps
    :return: List of potential matches
    :rtype: CharsetNormalizerMatches
    """
    py_v = [int(el) for el in python_version_tuple()]
    # dict iteration order is only insertion-ordered from CPython 3.6 on;
    # sort the alias table on older interpreters for deterministic output.
    py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)
    supported = sorted(
        aliases.items()) if py_need_sort else aliases.items()
    tested = set()
    working = dict()  # codec name -> {'ratio', 'ranges', 'chaos', 'len'}
    maximum_length = len(sequences)
    # NOTE(review): an empty `sequences` makes the range() step below 0 and
    # raises ValueError — confirm callers never pass empty input.
    for support in supported:
        k, p = support
        # Many aliases point at the same codec; probe each codec only once.
        if p in tested:
            continue
        tested.add(p)
        try:
            # Hard requirement: the full payload must decode without error.
            str(sequences, encoding=p)
        except UnicodeDecodeError:
            continue
        except LookupError:
            # Alias without an actual codec on this interpreter.
            continue
        chaos_measures = list()
        ranges_encountered_t = dict()
        # NOTE(review): decoded_len_t is never incremented, so working[p]['len']
        # is always 0 — looks like a latent bug; confirm against consumers.
        decoded_len_t = 0
        # Probe evenly spaced chunks rather than the whole content.
        for i in range(0, maximum_length, int(maximum_length / steps)):
            chunk = sequences[i:i + chunk_size]
            decoded = str(chunk, encoding=p, errors='ignore')
            probe_chaos = ProbeChaos(decoded)
            chaos_measure, ranges_encountered = probe_chaos.ratio, probe_chaos.encountered_unicode_range_occurrences
            # Aggregate per-chunk unicode range counters.
            # (This `k` shadows the alias key unpacked above; the outer `k`
            # is not used afterwards.)
            for k, e in ranges_encountered.items():
                if k not in ranges_encountered_t.keys():
                    ranges_encountered_t[k] = 0
                ranges_encountered_t[k] += e
            # One overly chaotic chunk disqualifies the codec entirely.
            if chaos_measure > threshold:
                if p in working.keys():
                    del working[p]
                break
            chaos_measures.append(chaos_measure)
            if p not in working.keys():
                working[p] = dict()
        # Skipped when the codec was disqualified (deleted) in the loop above.
        if p in working.keys():
            working[p]['ratio'] = statistics.mean(chaos_measures)
            working[p]['ranges'] = ranges_encountered_t
            working[p]['chaos'] = sum(chaos_measures)
            working[p]['len'] = decoded_len_t
            # Perfectly clean ascii is considered decisive: stop probing.
            if p == 'ascii' and working[p]['ratio'] == 0.:
                break
    # Keep only candidates whose mean chaos stays within the threshold.
    return CharsetNormalizerMatches([
        CharsetNormalizerMatch(sequences, enc, working[enc]['ratio'], working[enc]['ranges'])
        for enc in (
            sorted(working.keys()) if
            py_need_sort else working.keys()) if working[enc]['ratio'] <= threshold
    ])
def chaos_secondary_pass(self):
    """
    Check once again chaos in decoded text, except this time, with full content.
    :return: The ProbeChaos instance computed over the full decoded content
    """
    # NOTE(review): returns the probe object itself rather than its .ratio —
    # callers must read .ratio themselves; confirm this is intentional.
    return ProbeChaos(str(self))