def force_unicode(s):
    """
    Ensure a string is unicode, not encoded; used for enforcing file paths to be unicode
    upon saving a subtitle, to prevent encoding issues when saving a subtitle to a
    non-ascii path.

    Decoding is attempted as UTF-8 first, then with the encoding guessed by ``Detector``,
    and finally through ``UnicodeDammit`` when neither of those works.

    :param s: string; a ``str`` is returned unchanged, anything else is decoded
    :return: unicode string
    """
    if isinstance(s, str):
        return s

    try:
        return s.decode("utf-8")
    except UnicodeDecodeError:
        pass

    guessed_encoding = Detector().detect(s)
    # The detector is not guaranteed to produce a usable answer: decoding with ``None``
    # (TypeError) or with a codec name Python does not know (LookupError) must not
    # abort the fallback chain before UnicodeDammit gets a chance.
    if guessed_encoding is not None:
        try:
            return s.decode(guessed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass

    return UnicodeDammit(s).unicode_markup
def _clean_error_msg(self, msg): """converts a Powershell CLIXML message to a more human readable string """ # TODO prepare unit test, beautify code # if the msg does not start with this, return it as is if msg.startswith(b"#< CLIXML\r\n"): # for proper xml, we need to remove the CLIXML part # (the first line) msg_xml = msg[11:] try: # remove the namespaces from the xml for easier processing msg_xml = self._strip_namespace(msg_xml) detector = Detector() found_encoding = detector.detect(msg_xml) msg_xml = msg_xml.decode(found_encoding) root = ET.fromstring(msg_xml) # the S node is the error message, find all S nodes nodes = root.findall("./S") new_msg = "" for s in nodes: # append error msg string to result, also # the hex chars represent CRLF so we replace with newline new_msg += s.text.replace("_x000D__x000A_", "\n") except Exception as e: # if any of the above fails, the msg was not true xml # print a warning and return the orignal string # TODO do not print, raise user defined error instead print("Warning: there was a problem converting the Powershell" " error message: %s" % e) return new_msg.strip().encode() # either failed to decode CLIXML or there was nothing to decode # just return the original message return msg.encode()
def guess_external_subtitles(dest_folder, subtitles):
    """Fill in missing languages and hearing-impaired flags of external subtitles.

    Entries without a language have their file content analysed with
    ``guess_language``. Entries that have a language (and are neither forced nor
    already hearing-impaired) are searched for ``hi_regex`` and, on a match, rebuilt
    with ``hi=True``.

    :param dest_folder: folder the subtitle file names are relative to
    :param subtitles: dict mapping a subtitle file name to its language, or to a falsy
        value when the language is unknown; it is updated in place
    :return: the same ``subtitles`` dict
    """
    for subtitle, language in subtitles.items():
        subtitle_path = os.path.join(dest_folder, subtitle)
        # Both passes read the file, so entries that are missing on disk or that are
        # not subtitle files are skipped up front instead of failing in getsize().
        if not (os.path.exists(subtitle_path)
                and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS):
            continue

        text = None
        if not language:
            logging.debug(
                "BAZARR falling back to file content analysis to detect language.")
            text = _read_subtitle_text(subtitle_path)
            if text is None:
                continue

            detected_language = None
            try:
                detected_language = guess_language(text)
            except Exception:
                logging.debug(
                    'BAZARR was unable to detect encoding for this subtitles file: %r',
                    subtitle_path)

            if detected_language:
                logging.debug(
                    "BAZARR external subtitles detected and guessed this language: "
                    + str(detected_language))
                try:
                    subtitles[subtitle] = Language.rebuild(
                        Language.fromietf(detected_language), forced=False, hi=False)
                except Exception:
                    # best effort: the entry simply stays without a language
                    logging.debug(
                        "BAZARR could not build a language from %r for: %s",
                        detected_language, subtitle_path)

        current_language = subtitles[subtitle]
        # Still undetected: there is no language object to flag.
        if not current_language:
            continue
        # Rebuilding with forced=False below would drop the forced flag, and an entry
        # already marked hearing-impaired needs no further work.
        if getattr(current_language, "forced", False) or current_language.hi:
            continue

        # Detect hearing-impaired external subtitles not identified in filename
        if text is None:
            text = _read_subtitle_text(subtitle_path)
            if text is None:
                continue
        if re.search(hi_regex, text):
            subtitles[subtitle] = Language.rebuild(
                current_language, forced=False, hi=True)
    return subtitles


def _read_subtitle_text(subtitle_path):
    """Return the decoded text of ``subtitle_path``, or ``None`` when it is skipped."""
    # to improve performance, skip detection of files larger than 1M
    if os.path.getsize(subtitle_path) > 1 * 1024 * 1024:
        logging.debug(
            "BAZARR subtitles file is too large to be text based. Skipping this file: "
            + subtitle_path)
        return None

    with open(subtitle_path, 'rb') as f:
        raw = f.read()

    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        pass

    try:
        guess = Detector().detect(raw)
    except Exception:
        logging.debug(
            "BAZARR skipping this subtitles because we can't guess the encoding. "
            "It's probably a binary file: " + subtitle_path)
        return None
    logging.debug('BAZARR detected encoding %r', guess)

    try:
        return raw.decode(guess)
    except Exception:
        # also covers a detector that produced no usable encoding
        logging.debug(
            "BAZARR skipping this subtitles because we can't decode the file using the "
            "guessed encoding. It's probably a binary file: " + subtitle_path)
        return None
def guess_external_subtitles(dest_folder, subtitles):
    """Fill in missing languages and hearing-impaired flags of external subtitles.

    Entries without a language have their file content analysed with
    ``guess_language``; a ``'zh'`` result is tried as ``'zt'`` first when the file name
    carries a traditional-Chinese marker. Entries that have a language and are
    neither forced nor already hearing-impaired are searched for ``hi_regex`` and, on
    a match, rebuilt with ``hi=True``.

    :param dest_folder: folder the subtitle file names are relative to
    :param subtitles: dict mapping a subtitle file name to its language, or to a falsy
        value when the language is unknown; it is updated in place
    :return: the same ``subtitles`` dict
    """
    for subtitle, language in subtitles.items():
        subtitle_path = os.path.join(dest_folder, subtitle)
        # Both passes only ever look at existing subtitle files.
        if not (os.path.exists(subtitle_path)
                and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS):
            continue

        text = None
        if not language:
            logging.debug(
                "BAZARR falling back to file content analysis to detect language.")
            text = _read_subtitle_text(subtitle_path)
            if text is None:
                continue

            detected_language = None
            try:
                detected_language = guess_language(text)
            except Exception:
                logging.debug(
                    'BAZARR was unable to detect encoding for this subtitles file: %r',
                    subtitle_path)

            if detected_language:
                codes = [detected_language]
                # add simplified and traditional chinese detection
                if detected_language == 'zh' and _has_traditional_chinese_marker(
                        subtitle, subtitle_path):
                    # 'zt' is tried first; 'zh' stays as a fallback in case
                    # Language.fromietf() does not accept 'zt'.
                    codes.insert(0, 'zt')
                for code in codes:
                    logging.debug(
                        "BAZARR external subtitles detected and guessed this language: "
                        + str(code))
                    try:
                        subtitles[subtitle] = Language.rebuild(
                            Language.fromietf(code), forced=False, hi=False)
                    except Exception:
                        # best effort: try the next candidate, if any
                        continue
                    break

        # Look at the entry as it is now, not at the loop variable, so a language
        # detected just above also gets the hearing-impaired check.
        current_language = subtitles[subtitle]

        # If language is still None (undetected), skip it
        if not current_language:
            continue
        # Skip HI detection if forced
        if current_language.forced:
            continue
        if current_language.hi:
            continue

        # Detect hearing-impaired external subtitles not identified in filename
        if text is None:
            text = _read_subtitle_text(subtitle_path)
            if text is None:
                continue
        if re.search(hi_regex, text):
            subtitles[subtitle] = Language.rebuild(
                current_language, forced=False, hi=True)
    return subtitles


def _has_traditional_chinese_marker(subtitle, subtitle_path):
    """Tell whether the subtitle name carries a traditional-Chinese marker."""
    traditional_chinese_fuzzy = [u"繁", u"雙語"]
    traditional_chinese = (
        ".cht", ".tc", ".zh-tw", ".zht", ".zh-hant", ".zhhant", ".zh_hant", ".hant",
        ".big5", ".traditional",
    )
    if str(os.path.splitext(subtitle)[0]).lower().endswith(traditional_chinese):
        return True
    # NOTE(review): kept exactly as written. This compares the whole path minus its
    # last five characters against the markers; it was presumably meant to be a
    # substring test on the file name. Confirm the intent before changing it.
    return str(subtitle_path).lower()[:-5] in traditional_chinese_fuzzy


def _read_subtitle_text(subtitle_path):
    """Return the decoded text of ``subtitle_path``, or ``None`` when it is skipped."""
    # to improve performance, skip detection of files larger than 1M
    if os.path.getsize(subtitle_path) > 1 * 1024 * 1024:
        logging.debug(
            "BAZARR subtitles file is too large to be text based. Skipping this file: "
            + subtitle_path)
        return None

    with open(subtitle_path, 'rb') as f:
        raw = f.read()

    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        pass

    try:
        guess = Detector().detect(raw)
    except Exception:
        logging.debug(
            "BAZARR skipping this subtitles because we can't guess the encoding. "
            "It's probably a binary file: " + subtitle_path)
        return None
    logging.debug('BAZARR detected encoding %r', guess)

    try:
        return raw.decode(guess)
    except Exception:
        # also covers a detector that produced no usable encoding
        logging.debug(
            "BAZARR skipping this subtitles because we can't decode the file using the "
            "guessed encoding. It's probably a binary file: " + subtitle_path)
        return None
def test_incorrect_min_confidence(min_confidence):
    """Constructing a ``Detector`` with the given ``min_confidence`` must raise."""
    expected_failure = pytest.raises(
        ValueError,
        match='min_confidence must be in range',
    )
    with expected_failure:
        Detector(min_confidence=min_confidence)
def test_no_encodings():
    """Constructing a ``Detector`` with an empty encoding list must raise."""
    expected_failure = pytest.raises(ValueError, match='No encodings specified')
    no_encodings = []
    with expected_failure:
        Detector(encodings=no_encodings)
def _get_detector():
    """Build and return a ``Detector`` created with its default arguments."""
    detector = Detector()
    return detector