Ejemplo n.º 1
0
def force_unicode(s):
    """
    Ensure a string is unicode, not encoded; used for enforcing file paths to be unicode upon saving a subtitle,
    to prevent encoding issues when saving a subtitle to a non-ascii path.

    Decoding is attempted in this order: UTF-8, the encoding guessed by ``Detector``, and finally
    ``UnicodeDammit`` as a last resort.

    :param s: string (``str`` is returned unchanged; anything else is expected to offer ``.decode``)
    :return: unicode string
    """
    if isinstance(s, str):
        return s

    try:
        return s.decode("utf-8")
    except UnicodeDecodeError:
        pass

    # Not valid UTF-8: ask the detector for its best guess.
    guessed_encoding = Detector().detect(s)
    if guessed_encoding is not None:
        try:
            return s.decode(guessed_encoding)
        except (UnicodeDecodeError, LookupError, TypeError):
            # The guess was wrong, names a codec Python does not know, or is not
            # a usable codec name at all; fall through to the last resort.
            pass

    return UnicodeDammit(s).unicode_markup
Ejemplo n.º 2
0
    def _clean_error_msg(self, msg):
        """converts a Powershell CLIXML message to a more human readable string
        """
        # TODO prepare unit test, beautify code
        # if the msg does not start with this, return it as is
        if msg.startswith(b"#< CLIXML\r\n"):
            # for proper xml, we need to remove the CLIXML part
            # (the first line)
            msg_xml = msg[11:]
            try:
                # remove the namespaces from the xml for easier processing
                msg_xml = self._strip_namespace(msg_xml)

                detector = Detector()
                found_encoding = detector.detect(msg_xml)
                msg_xml = msg_xml.decode(found_encoding)

                root = ET.fromstring(msg_xml)
                # the S node is the error message, find all S nodes
                nodes = root.findall("./S")
                new_msg = ""
                for s in nodes:
                    # append error msg string to result, also
                    # the hex chars represent CRLF so we replace with newline
                    new_msg += s.text.replace("_x000D__x000A_", "\n")
            except Exception as e:
                # if any of the above fails, the msg was not true xml
                # print a warning and return the orignal string
                # TODO do not print, raise user defined error instead
                print("Warning: there was a problem converting the Powershell"
                      " error message: %s" % e)

            return new_msg.strip().encode()

        # either failed to decode CLIXML or there was nothing to decode
        # just return the original message
        return msg.encode()
Ejemplo n.º 3
0
def guess_external_subtitles(dest_folder, subtitles):
    """
    Fill in missing language and hearing-impaired (HI) information for external subtitles from file content.

    For every entry without a language, the file content is decoded and passed to ``guess_language``; a
    successful guess replaces the entry with a rebuilt ``Language``. Afterwards, every entry that has a
    language not yet flagged as HI is matched against ``hi_regex`` and flagged when it matches.

    Entries whose language stays undetected, and files that are missing, too large or undecodable, are
    left untouched instead of raising.

    :param dest_folder: folder the subtitle file names are relative to
    :param subtitles: dict mapping subtitle file name to its language (or None); updated in place
    :return: the same ``subtitles`` dict
    """
    for subtitle, language in subtitles.items():
        subtitle_path = os.path.join(dest_folder, subtitle)
        text = None  # decoded file content, read at most once per subtitle

        if not language and os.path.exists(subtitle_path) and os.path.splitext(
                subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
            logging.debug(
                "BAZARR falling back to file content analysis to detect language."
            )
            text = _read_subtitle_text(subtitle_path)
            if text is None:
                continue

            detected_language = None
            try:
                detected_language = guess_language(text)
            except Exception:
                logging.debug(
                    'BAZARR was unable to detect encoding for this subtitles file: %r',
                    subtitle_path)

            if detected_language:
                logging.debug(
                    "BAZARR external subtitles detected and guessed this language: "
                    + str(detected_language))
                try:
                    subtitles[subtitle] = Language.rebuild(
                        Language.fromietf(detected_language),
                        forced=False,
                        hi=False)
                except Exception:
                    # Best effort: an unusable guess leaves the entry as it was.
                    logging.debug(
                        'BAZARR could not build a language from %r for this subtitles file: %r',
                        detected_language, subtitle_path)

        # Detect hearing-impaired external subtitles not identified in filename
        current_language = subtitles[subtitle]
        if current_language is None or current_language.hi:
            # Nothing to flag: language still unknown, or already marked HI.
            continue

        if text is None:
            if not os.path.exists(subtitle_path):
                continue
            text = _read_subtitle_text(subtitle_path)
            if text is None:
                continue

        if re.search(hi_regex, text):
            subtitles[subtitle] = Language.rebuild(current_language,
                                                   forced=False,
                                                   hi=True)
    return subtitles


def _read_subtitle_text(subtitle_path, max_size=1 * 1024 * 1024):
    """Return the decoded content of ``subtitle_path``, or None when the file is too large or undecodable."""
    # to improve performance, skip detection of files larger than max_size (1M by default)
    if os.path.getsize(subtitle_path) > max_size:
        logging.debug(
            "BAZARR subtitles file is too large to be text based. Skipping this file: "
            + subtitle_path)
        return None

    with open(subtitle_path, 'rb') as f:
        raw = f.read()

    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        pass

    # Not UTF-8: fall back to the encoding guessed from the raw bytes.
    try:
        guess = Detector().detect(raw)
    except Exception:
        logging.debug(
            "BAZARR skipping this subtitles because we can't guess the encoding. "
            "It's probably a binary file: " + subtitle_path)
        return None

    logging.debug('BAZARR detected encoding %r', guess)
    try:
        return raw.decode(guess)
    except Exception:
        logging.debug(
            "BAZARR skipping this subtitles because we can't decode the file using the "
            "guessed encoding. It's probably a binary file: " + subtitle_path)
        return None
Ejemplo n.º 4
0
def guess_external_subtitles(dest_folder, subtitles):
    """
    Fill in missing language and hearing-impaired (HI) information for external subtitles from file content.

    Only existing files whose extension is in ``core.SUBTITLE_EXTENSIONS`` are inspected. For every entry
    without a language, the decoded content is passed to ``guess_language``. A ``'zh'`` guess is upgraded to
    ``'zt'`` when the file name suggests traditional Chinese, falling back to the plain guess if no language
    can be built from ``'zt'``. Afterwards, every entry that has a non-forced language not yet flagged as HI
    is matched against ``hi_regex`` and flagged when it matches.

    :param dest_folder: folder the subtitle file names are relative to
    :param subtitles: dict mapping subtitle file name to its language (or None); updated in place
    :return: the same ``subtitles`` dict
    """
    for subtitle, language in subtitles.items():
        subtitle_path = os.path.join(dest_folder, subtitle)

        # Both the language and the HI detection need a readable subtitle file.
        if not (os.path.exists(subtitle_path) and os.path.splitext(
                subtitle_path)[1] in core.SUBTITLE_EXTENSIONS):
            continue

        text = None  # decoded file content, read at most once per subtitle

        if not language:
            logging.debug(
                "BAZARR falling back to file content analysis to detect language."
            )
            text = _read_subtitle_text(subtitle_path)
            if text is None:
                continue

            detected_language = None
            try:
                detected_language = guess_language(text)
            except Exception:
                logging.debug(
                    'BAZARR was unable to detect encoding for this subtitles file: %r',
                    subtitle_path)

            if detected_language:
                # add simplified and traditional chinese detection; 'zt' is tried
                # first and the plain guess is kept as a fallback
                candidates = [detected_language]
                if detected_language == 'zh' and _looks_traditional_chinese(subtitle):
                    candidates.insert(0, 'zt')

                logging.debug(
                    "BAZARR external subtitles detected and guessed this language: "
                    + str(candidates[0]))
                for code in candidates:
                    try:
                        subtitles[subtitle] = Language.rebuild(
                            Language.fromietf(code),
                            forced=False,
                            hi=False)
                    except Exception:
                        # Best effort: try the next candidate, if any.
                        logging.debug(
                            'BAZARR could not build a language from %r for this subtitles file: %r',
                            code, subtitle_path)
                    else:
                        break

        # Use the up-to-date entry, not the loop variable, so that a language
        # detected just above also gets HI detection.
        current_language = subtitles[subtitle]

        # If language is still None (undetected), skip it.
        # Skip HI detection if forced or if already flagged as HI.
        if not current_language or current_language.forced or current_language.hi:
            continue

        # Detect hearing-impaired external subtitles not identified in filename
        if text is None:
            text = _read_subtitle_text(subtitle_path)
            if text is None:
                continue

        if re.search(hi_regex, text):
            subtitles[subtitle] = Language.rebuild(current_language,
                                                   forced=False,
                                                   hi=True)
    return subtitles


def _looks_traditional_chinese(subtitle):
    """Return True when the subtitle file name carries a traditional-Chinese marker."""
    traditional_chinese_fuzzy = (u"繁", u"雙語")
    traditional_chinese = (
        ".cht", ".tc", ".zh-tw", ".zht", ".zh-hant",
        ".zhhant", ".zh_hant", ".hant", ".big5",
        ".traditional",
    )
    name = str(subtitle)
    if str(os.path.splitext(name)[0]).lower().endswith(traditional_chinese):
        return True
    return any(marker in name.lower() for marker in traditional_chinese_fuzzy)


def _read_subtitle_text(subtitle_path, max_size=1 * 1024 * 1024):
    """Return the decoded content of ``subtitle_path``, or None when the file is too large or undecodable."""
    # to improve performance, skip detection of files larger than max_size (1M by default)
    if os.path.getsize(subtitle_path) > max_size:
        logging.debug(
            "BAZARR subtitles file is too large to be text based. Skipping this file: "
            + subtitle_path)
        return None

    with open(subtitle_path, 'rb') as f:
        raw = f.read()

    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        pass

    # Not UTF-8: fall back to the encoding guessed from the raw bytes.
    try:
        guess = Detector().detect(raw)
    except Exception:
        logging.debug(
            "BAZARR skipping this subtitles because we can't guess the encoding. "
            "It's probably a binary file: " + subtitle_path)
        return None

    logging.debug('BAZARR detected encoding %r', guess)
    try:
        return raw.decode(guess)
    except Exception:
        logging.debug(
            "BAZARR skipping this subtitles because we can't decode the file using the "
            "guessed encoding. It's probably a binary file: " + subtitle_path)
        return None
Ejemplo n.º 5
0
def test_incorrect_min_confidence(min_confidence):
    """Building a Detector with the supplied ``min_confidence`` must raise ValueError."""
    expect_range_error = pytest.raises(
        ValueError,
        match='min_confidence must be in range',
    )
    with expect_range_error:
        Detector(min_confidence=min_confidence)
Ejemplo n.º 6
0
def test_no_encodings():
    """Building a Detector with an empty encoding list must raise ValueError."""
    expect_no_encodings_error = pytest.raises(
        ValueError,
        match='No encodings specified',
    )
    with expect_no_encodings_error:
        Detector(encodings=[])
Ejemplo n.º 7
0
def _get_detector():
    """Create and return a ``Detector`` built with no arguments."""
    detector = Detector()
    return detector