def test_obvious_coherence_gap(self):
    """
    Check that the detected best match for the sample file has a lower
    coherence ratio than the same file decoded as mac_cyrillic or cp1251.
    """
    sample_path = './data/sample.1.ar.srt'

    def ratio_decoded_as(codec):
        # Re-read the sample as text under the given codec and score it.
        with open(sample_path, 'r', encoding=codec) as handle:
            return ProbeCoherence(HashableCounter(handle.read())).ratio

    detected_ratio = CharsetNormalizerMatches.from_path(
        sample_path).best().first().coherence

    # Both alternative ratios are computed before any assertion runs.
    mac_cyrillic_ratio = ratio_decoded_as('mac_cyrillic')
    cp1251_ratio = ratio_decoded_as('cp1251')

    self.assertLess(detected_ratio, mac_cyrillic_ratio)
    self.assertLess(detected_ratio, cp1251_ratio)
예제 #2
0
 def languages(self):
     """
     List the languages that the text probably contains.
     :return: Probable languages, taken from ProbeCoherence.most_likely
     :rtype: list[str]
     """
     probe = ProbeCoherence(self.char_counter)
     return probe.most_likely
예제 #3
0
 def language(self):
     """
     Give the single most probable language of the text.
     :return: First entry of the probable languages, or 'Unknown' if none
     :rtype: str
     """
     candidates = ProbeCoherence(self.char_counter).most_likely
     if len(candidates) > 0:
         return candidates[0]
     # No candidate language was reported by the probe.
     return 'Unknown'
예제 #4
0
 def coherence(self):
     """
     Give the coherence ratio of the text, a value between 0. and 1.
     The closer to 0., the more the initial string is considered coherent;
     the closer to 1., the more it SEEMS NOT coherent.
     :return: Ratio as floating number
     :rtype: float
     """
     probe = ProbeCoherence(self.char_counter)
     return probe.ratio
예제 #5
0
    def language(self):
        """
        Give the single most probable language of the text.

        When the probe reports no candidate, fall back to 'English' if the
        only alphabet is 'Basic Latin', otherwise to 'Unknown'.
        :return: Most used/probable language in text
        :rtype: str
        """
        candidates = ProbeCoherence(self.char_counter).most_likely

        if len(candidates) != 0:
            return candidates[0]

        # No candidate: decide the fallback from the detected alphabets.
        if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin':
            return 'English'

        return 'Unknown'
예제 #6
0
 def coherence_non_latin(self):
     """
     Give the `non_latin_covered_any` value of a ProbeCoherence built
     from this object's character counter.
     :return: Whatever ProbeCoherence exposes as non_latin_covered_any
     """
     probe = ProbeCoherence(self.char_counter)
     return probe.non_latin_covered_any