def _probe(self):
    """
    Inspect every word of ``self._words`` and collect the suspicious ones.

    For each word the unicode range classification is fetched from
    ``UnicodeRangeIdentify.classification``. When more than one range is
    present, the occurrences belonging to ranges flagged by
    ``UnicodeRangeIdentify.is_range_secondary`` are summed up.

    A word is appended to ``self._suspicious`` as soon as one of these holds:

    - the secondary occurrences exceed half of the word length when every
      range name contains ``'Latin'``, or a quarter of it otherwise;
    - ``part_punc`` is above 0.4 and the word mixes several ranges;
    - not every range is Latin and ``part_accent`` is above 0.4;
    - not every range is Latin, the word is longer than 10 characters and
      ``part_lonely_range`` is above 0.3.
    """
    for word in self._words:
        word_length = len(word)
        classification = UnicodeRangeIdentify.classification(word)

        latin_only = all(
            'Latin' in range_name for range_name in list(classification.keys())
        )
        mixed_ranges = len(classification.keys()) > 1

        # Secondary ranges are only counted when the word mixes several ranges.
        secondary_count = 0
        if mixed_ranges:
            secondary_count = sum(
                occurrences
                for range_name, occurrences in classification.items()
                if UnicodeRangeIdentify.is_range_secondary(range_name) is True
            )

        # Latin-only words tolerate twice as many secondary characters.
        secondary_limit = int(word_length / 2) if latin_only else int(word_length / 4)

        # The checks are chained lazily, from the cheapest to the costliest.
        if (
            secondary_count > secondary_limit
            or (UnicodeRangeIdentify.part_punc(word) > 0.4 and mixed_ranges)
            or (
                not latin_only
                and (
                    UnicodeRangeIdentify.part_accent(word) > 0.4
                    or (
                        len(word) > 10
                        and UnicodeRangeIdentify.part_lonely_range(word) > 0.3
                    )
                )
            )
        ):
            self._suspicious.append(word)
def test_list_by_range(self):
    """
    ``list_by_range`` must group a flat list of characters per unicode range.

    The expectation keeps the characters in their input order inside each
    group.
    """
    characters = ['a', 'b', 'c', 'd', 'e', 'é', 'ù', '역', '사', 'π', 'ο', 'υ']
    expected_groups = {
        'Basic Latin': ['a', 'b', 'c', 'd', 'e', 'é', 'ù'],
        'Hangul Syllables': ['역', '사'],
        'Greek and Coptic': ['π', 'ο', 'υ'],
    }

    grouped = UnicodeRangeIdentify.list_by_range(characters)

    self.assertEqual(expected_groups, grouped)
def test_should_be_accented(self):
    """
    ``is_accentuated`` must answer ``True`` for single accented letters,
    lower case as well as upper case.
    """
    accented_letters = ('é', 'è', 'è', 'à', 'À', 'Ù', 'ç')

    # Checked one by one, stopping at the first letter that is not recognised.
    for letter in accented_letters:
        self.assertTrue(
            UnicodeRangeIdentify.is_accentuated(letter)
        )
def test_should_throw(self):
    """
    ``is_accentuated`` must raise ``IOError`` when it is given more than one
    character, whether or not those characters are accented.
    """
    multi_character_inputs = ('àé', 'aé', 'aa')

    for text in multi_character_inputs:
        with self.assertRaises(IOError):
            UnicodeRangeIdentify.is_accentuated(text)
def alphabet_coverage(self):
    """
    Tell, for each unicode range found in ``self.letters``, whether that
    range is considered covered.

    A range is covered when the share of its letters that are also present
    in ``self.covered_letters`` reaches ``COHERENCE_ALPHABET_COVERED_IF``.

    :return: Mapping of unicode range name to a covered / not covered flag
    :rtype: dict
    """
    coverages = {}
    letters_per_range = UnicodeRangeIdentify.list_by_range(self.letters)

    for u_range, range_letters in letters_per_range.items():
        covered_count = sum(
            1 for letter in range_letters if letter in self.covered_letters
        )
        covered_share = covered_count / len(range_letters)
        coverages[u_range] = covered_share >= COHERENCE_ALPHABET_COVERED_IF

    return coverages
def ratio(self):
    """
    Return a value nominally between 0. and 1.
    Closest to 1. means that the initial string is considered as chaotic,
    Closest to 0. means that the initial string SEEMS NOT chaotic.

    The value is the sum of the chaos counters divided by the length of the
    analysed string; the sum is not clamped. An empty string has nothing
    chaotic in it, so 0. is returned instead of dividing by zero.

    :return: Ratio as floating number
    :rtype: float
    """
    string_length = len(self._string)

    # Nothing was analysed: avoid ZeroDivisionError in the final division.
    if string_length == 0:
        return 0.

    # Upper case accented letters only weigh in when fewer than half of the
    # letters read so far are unaccented.
    if self.total_letter_encountered > 0 \
            and self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5:
        r_ = self.total_upper_accent_encountered
    else:
        r_ = 0

    z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(
        string_length,
        self.encountered_unicode_range_occurrences
    )

    chaos_sum = (
        r_
        + self.successive_upper_lower
        + self.successive_accent
        + self.successive_different_unicode_range
        + self.not_encountered_white_space
        + self.unprintable
        + z_
        + self._unravel_cjc_suspicious()
    )

    # NOTE: a former extra term, len(self.encountered_unicode_range) - 1,
    # is left disabled.
    return chaos_sum / string_length
def _probe(self):
    """
    Read ``self._string`` character by character and update the chaos
    counters held on the instance.

    The first time at least half of the string has been read, ``self.ratio``
    is compared with ``self._threshold``: when the threshold is already
    reached, ``self.gave_up`` is set and the scan stops there.

    Unprintable characters, white spaces and punctuation signs are handled
    as separators. Every other character feeds the accent, letter case and
    unicode range counters, compared with the previously kept character.

    Once the scan is over, the "no white space" counter is reset for strings
    shorter than 50 characters, and fewer than 3 upper/lower alternations
    are discarded.
    """
    def note_separator():
        # Bookkeeping shared by everything that is treated as a separator.
        self.encountered_white_space += 1
        self.not_encountered_white_space = 0
        self.not_encountered_white_space_reset += 1

    halfway_checked = False
    case_flip_pending = False

    for position, char in enumerate(self._string):

        # One-shot check, done when reaching the middle of the string:
        # give up if the measured chaos already reaches the threshold.
        if not halfway_checked and position / len(self._string) >= 0.5:
            if self.ratio >= self._threshold:
                self.gave_up = True
                break
            halfway_checked = True

        self.total_letter_encountered += 1

        if not char.isprintable():
            # Line breaks and tabs are fine, so are CJK and punctuation
            # characters; anything else counts as chaos.
            if char not in ('\n', '\t', '\r') and not (
                    UnicodeRangeIdentify.is_cjk(char)
                    or UnicodeRangeIdentify.is_punc(char)):
                self.unprintable += 2
            note_separator()
            continue

        if char.isspace():
            note_separator()
            self.previous_printable_letter = char
            continue

        if self.not_encountered_white_space_reset < 2:
            self.not_encountered_white_space += 1

        previous = self.previous_printable_letter

        # Very first kept character: nothing to compare it with yet.
        if previous is None:
            self.previous_printable_letter = char
            continue

        is_accent = UnicodeRangeIdentify.is_accentuated(char)
        u_name = UnicodeRangeIdentify.find_letter_type(char)
        is_upper = char.isupper()
        is_lower = not is_upper and char.islower()
        is_alpha = char.isalpha()
        is_latin = UnicodeRangeIdentify.is_latin(char)

        if u_name is not None and u_name not in self.encountered_unicode_range:
            self.encountered_unicode_range_occurrences[u_name] = 0
            self.encountered_unicode_range.add(u_name)

        if is_accent and UnicodeRangeIdentify.is_accentuated(previous):
            self.successive_accent += 2

        if is_lower:
            self.total_lower_letter_encountered += 1

        if is_upper and is_accent:
            self.total_upper_accent_encountered += 1
            if previous.isalpha():
                self.total_upper_accent_encountered_inner += 1
        elif is_alpha and not is_accent:
            self.total_unaccented_letter_encountered += 1

        if u_name is not None:
            self.encountered_unicode_range_occurrences[u_name] += 1

        if UnicodeRangeIdentify.is_punc(char) is True:
            self.encountered_punc_sign += 1
            note_separator()
            continue

        # Only every second consecutive case alternation is counted.
        case_flipped = (is_lower and previous.isupper()) or (
            is_upper and previous.islower())
        if case_flipped and case_flip_pending:
            self.successive_upper_lower += 1
            case_flip_pending = False
        else:
            case_flip_pending = case_flipped

        if is_latin:
            self.previous_encountered_unicode_range = u_name
            self.previous_printable_letter = char

        if self.previous_encountered_unicode_range is not None \
                and UnicodeRangeIdentify.is_suspiciously_successive_range(
                    u_name, self.previous_encountered_unicode_range) is True \
                and not UnicodeRangeIdentify.is_punc(self.previous_printable_letter):
            self.successive_different_unicode_range += 1

        self.previous_encountered_unicode_range = u_name
        self.previous_printable_letter = char

    if len(self._string) < 50:
        self.not_encountered_white_space = 0
    if self.successive_upper_lower < 3:
        self.successive_upper_lower = 0
def _probe(self):
    """
    Read ``self._string`` character by character and update the chaos
    counters held on the instance.

    The first time at least half of the string has been read, ``self.ratio``
    is checked: when it is already 0.3 or more, ``self.gave_up`` is set and
    the scan stops there.

    Unprintable characters and white spaces are handled as separators.
    Every other character feeds the accent, letter case and unicode range
    counters, compared with the previously kept character.

    An unprintable character whose unicode range is unknown
    (``find_letter_type`` giving ``None``) is counted as chaos instead of
    raising ``TypeError``.
    """
    half_way_checked = False

    for i_, c in enumerate(self._string):

        # One-shot check, done when reaching the middle of the string.
        if not half_way_checked and i_ / len(self._string) >= 0.5:
            if self.ratio >= 0.3:
                self.gave_up = True
                break
            half_way_checked = True

        self.total_letter_encountered += 1

        if not c.isprintable():
            if c not in ('\n', '\t', '\r'):
                # The range lookup is treated as optional further down, so
                # fall back on an empty name rather than testing `in None`.
                u_name = UnicodeRangeIdentify.find_letter_type(c) or ''

                # CJK ranges have their own white spaces; 160 is the
                # no-break space (U+00A0).
                if 'CJK' not in u_name and 'General Punctuation' not in u_name \
                        and ord(c) != 160:
                    self.unprintable += 2

            self.encountered_white_space += 1
            self.not_encountered_white_space = 0
            self.not_encountered_white_space_reset += 1
            continue

        if c.isspace():
            self.encountered_white_space += 1
            self.not_encountered_white_space = 0
            self.not_encountered_white_space_reset += 1
            self.previous_printable_letter = c
            continue

        if self.not_encountered_white_space_reset < 2:
            self.not_encountered_white_space += 1

        # Very first kept character: nothing to compare it with yet.
        if self.previous_printable_letter is None:
            self.previous_printable_letter = c
            continue

        is_accent = UnicodeRangeIdentify.is_accentuated(c)
        u_name = UnicodeRangeIdentify.find_letter_type(c)

        is_upper = c.isupper()
        is_lower = c.islower() if not is_upper else False
        is_alpha = c.isalpha()

        if u_name is not None and u_name not in self.encountered_unicode_range:
            self.encountered_unicode_range_occurrences[u_name] = 0
            self.encountered_unicode_range.add(u_name)

        if is_accent and UnicodeRangeIdentify.is_accentuated(
                self.previous_printable_letter):
            self.successive_accent += 2

        if is_lower:
            self.total_lower_letter_encountered += 1

        if is_upper and is_accent:
            self.total_upper_accent_encountered += 1
            if self.previous_printable_letter.isalpha():
                self.total_upper_accent_encountered_inner += 1
        elif not is_accent and is_alpha:
            self.total_unaccented_letter_encountered += 1

        if u_name is not None:
            self.encountered_unicode_range_occurrences[u_name] += 1

            u_name_lower = u_name.lower()
            is_punctuation_like = (
                'symbols and punctuation' in u_name_lower
                or 'general punctuation' in u_name_lower
                or 'halfwidth and fullwidth forms' in u_name_lower
            )

            # Punctuation-like ranges act as separators.
            if is_punctuation_like:
                self.encountered_white_space += 1
                self.not_encountered_white_space = 0
                self.not_encountered_white_space_reset += 1

            if 'latin' in u_name_lower or is_punctuation_like:
                # Only the previous letter is refreshed here; the previous
                # range is left as it was.
                self.previous_printable_letter = c
                continue
            elif (self.previous_printable_letter.isupper() and c.islower()) or (
                    self.previous_printable_letter.islower() and c.isupper()):
                self.successive_upper_lower += 1

            previous_range = self.previous_encountered_unicode_range
            if previous_range is not None and u_name != previous_range:
                # NOTE(review): unlike u_name_lower, the previous range name
                # is compared without lower-casing it -- confirm intent.
                if 'latin' not in previous_range and \
                        'halfwidth and fullwidth forms' not in previous_range and \
                        'symbols and punctuation' not in previous_range and \
                        'general punctuation' not in previous_range:
                    self.successive_different_unicode_range += 1

        self.previous_encountered_unicode_range = u_name
        self.previous_printable_letter = c