def test_obvious_coherence_gap(self):
    """
    Check that the detected match for the Arabic sample has a strictly lower
    coherence value than the ratios measured when the same file is decoded
    with the 'mac_cyrillic' and 'cp1251' encodings.
    """
    sample_path = './data/sample.1.ar.srt'

    detected_coherence = CharsetNormalizerMatches.from_path(sample_path).best().first().coherence

    # Gather every ratio first so that both files are read before any
    # assertion is evaluated.
    wrong_encoding_ratios = []
    for wrong_encoding in ('mac_cyrillic', 'cp1251'):
        with open(sample_path, 'r', encoding=wrong_encoding) as handle:
            decoded_text = handle.read()
        wrong_encoding_ratios.append(ProbeCoherence(HashableCounter(decoded_text)).ratio)

    for wrong_ratio in wrong_encoding_ratios:
        self.assertLess(detected_coherence, wrong_ratio)
def languages(self):
    """
    Give every language the coherence probe considers probable for the text
    :return: Probable languages, as reported by ProbeCoherence.most_likely
    :rtype: list[str]
    """
    probe = ProbeCoherence(self.char_counter)
    return probe.most_likely
def language(self):
    """
    Give the single most probable language of the text
    :return: First entry of the probable languages, or 'Unknown' when there is none
    :rtype: str
    """
    candidates = ProbeCoherence(self.char_counter).most_likely

    # No candidate at all: fall back to the placeholder name.
    if not len(candidates) > 0:
        return 'Unknown'

    return candidates[0]
def coherence(self):
    """
    Coherence ratio of the text, expressed between 0. and 1.
    The closer to 0., the more the initial string is considered coherent;
    the closer to 1., the more the initial string SEEMS NOT coherent.
    :return: Ratio as floating number
    :rtype: float
    """
    probe = ProbeCoherence(self.char_counter)
    return probe.ratio
def language(self):
    """
    Give the single most probable language of the text
    :return: First probable language when there is one; otherwise 'English'
        if the only alphabet in use is 'Basic Latin', else 'Unknown'
    :rtype: str
    """
    candidates = ProbeCoherence(self.char_counter).most_likely

    if len(candidates) != 0:
        return candidates[0]

    # Nothing was recognised by the probe: the alphabets decide the fallback.
    if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin':
        return 'English'

    return 'Unknown'
def coherence_non_latin(self):
    """
    Expose the `non_latin_covered_any` value computed by ProbeCoherence for
    this object's character counter
    :return: Whatever ProbeCoherence.non_latin_covered_any yields
    """
    probe = ProbeCoherence(self.char_counter)
    return probe.non_latin_covered_any