def force_utf8_and_filter_duplicates(subtitles):
    """Convert every subtitle's content and drop near-duplicates in place.

    ``subtitles`` must be a mutable list of dicts providing at least the
    keys 'extension' and 'content'.  For each item, 'content' is passed to
    ``guess_locale_and_convert``; the item's 'lang' and 'content' are then
    overwritten with the second and third values it returns (presumably the
    detected language and the UTF-8 text, going by this function's name --
    confirm against the helper).

    Two subtitles are compared only when both 'extension' and 'lang' match.
    The later one is dropped when the similarity estimate exceeds 0.9.

    The list is updated in place so the caller actually sees the filtered
    result; nothing is returned.
    """
    import difflib

    log_debug('Trying to filter duplicated subtitles...')

    for s in subtitles:
        _, s['lang'], s['content'] = guess_locale_and_convert(s['content'])

    dup_tag = [False] * len(subtitles)
    for i, sa in enumerate(subtitles):
        if dup_tag[i]:
            # already marked as a duplicate of an earlier subtitle
            continue
        for j in range(i + 1, len(subtitles)):
            sb = subtitles[j]
            if sa['extension'] != sb['extension'] or sa['lang'] != sb['lang']:
                continue
            # real_quick_ratio() is only a cheap upper bound on the true
            # ratio(), so this check may flag subtitles whose texts differ.
            similarity = difflib.SequenceMatcher(
                None, sa['content'], sb['content']).real_quick_ratio()
            log_debug('Similarity is {0}.'.format(similarity))
            if similarity > 0.9:
                dup_tag[j] = True

    # TODO: reserve longer subtitles
    # Slice assignment mutates the caller's list; a plain
    # ``subtitles = [...]`` would only rebind the local name and the
    # filtering would be lost when the function returns.
    subtitles[:] = [s for s, dup in zip(subtitles, dup_tag) if not dup]
    log_debug('{0} subtitle(s) reserved after duplicates filtering.'.format(
        len(subtitles)))
def parse_local_subtitles(self):
    """Record the local subtitle sources and add the matching player args.

    Reads ``self.__raw_info['mplayer']`` and stores the result under
    ``self.__info['subtitle']`` (a ``defaultdict(bool)``, so absent kinds
    read as False):

    * 'embed'    -- values of the ``ID_SID_<id>_LANG`` entries, one lookup
                    per id listed in ``ID_SUBTITLE_ID``.
    * 'external' -- the ``ID_FILE_SUB_FILENAME`` value.  Each listed file
                    is rewritten on disk with the converted content unless
                    its detected encoding is already 'utf_8' or 'ascii';
                    ``-subcp utf8`` is then added to the arguments.
    * 'vobsub'   -- True when ``ID_VOBSUB_ID`` is set; ``-unrarexec`` is
                    added when an ``unrar`` executable is found.
    """
    info = self.__info
    raw = self.__raw_info['mplayer']

    info['subtitle'] = defaultdict(bool)

    if raw['ID_SUBTITLE_ID']:
        # TODO: extract subtitles and combine to a bi-lingual sub
        # ffmpeg -i Seinfeld.2x01.The_Ex-Girlfriend.xvid-TLF.mkv -vn -an -scodec srt sub.srt
        info['subtitle']['embed'] = []
        for i in raw['ID_SUBTITLE_ID']:
            info['subtitle']['embed'] += raw['ID_SID_{0}_LANG'.format(i)]

    if raw['ID_FILE_SUB_ID']:
        info['subtitle']['external'] = raw['ID_FILE_SUB_FILENAME']
        log_debug('Converting the external subtitles to UTF-8...')
        from charset import guess_locale_and_convert
        for subfile in raw['ID_FILE_SUB_FILENAME']:
            # open in binary mode because we don't know the encoding
            with open(subfile, 'r+b') as f:
                s = f.read()
                enc, _, s = guess_locale_and_convert(s)
                if enc not in ['utf_8', 'ascii']:
                    f.seek(0)
                    f.write(s)
                    # The converted data may be shorter than the original;
                    # cut the file here so no stale bytes are left behind.
                    f.truncate()
        self.add_arg('-subcp utf8')

    if raw['ID_VOBSUB_ID']:
        info['subtitle']['vobsub'] = True
        unrar = which('unrar')
        if unrar:
            self.add_arg('-unrarexec {0}'.format(unrar))
def parse_local_subtitles(self):
    """Record the local subtitle sources and add the matching player args.

    Reads ``self.__raw_info['mplayer']`` and stores the result under
    ``self.__info['subtitle']`` (a ``defaultdict(bool)``, so absent kinds
    read as False):

    * 'embed'    -- values of the ``ID_SID_<id>_LANG`` entries, one lookup
                    per id listed in ``ID_SUBTITLE_ID``.
    * 'external' -- the ``ID_FILE_SUB_FILENAME`` value.  Each listed file
                    is rewritten on disk with the converted content unless
                    its detected encoding is already 'utf_8' or 'ascii';
                    ``-subcp utf8`` is then added to the arguments.
    * 'vobsub'   -- True when ``ID_VOBSUB_ID`` is set; ``-unrarexec`` is
                    added when an ``unrar`` executable is found.
    """
    info = self.__info
    raw = self.__raw_info['mplayer']

    info['subtitle'] = defaultdict(bool)

    if raw['ID_SUBTITLE_ID']:
        # TODO: extract subtitles and combine to a bi-lingual sub
        # ffmpeg -i Seinfeld.2x01.The_Ex-Girlfriend.xvid-TLF.mkv -vn -an -scodec srt sub.srt
        info['subtitle']['embed'] = []
        for i in raw['ID_SUBTITLE_ID']:
            info['subtitle']['embed'] += raw['ID_SID_{0}_LANG'.format(i)]

    if raw['ID_FILE_SUB_ID']:
        info['subtitle']['external'] = raw['ID_FILE_SUB_FILENAME']
        log_debug('Converting the external subtitles to UTF-8...')
        from charset import guess_locale_and_convert
        for subfile in raw['ID_FILE_SUB_FILENAME']:
            # open in binary mode because we don't know the encoding
            with open(subfile, 'r+b') as f:
                s = f.read()
                enc, _, s = guess_locale_and_convert(s)
                if enc not in ['utf_8', 'ascii']:
                    f.seek(0)
                    f.write(s)
                    # The converted data may be shorter than the original;
                    # cut the file here so no stale bytes are left behind.
                    f.truncate()
        self.add_arg('-subcp utf8')

    if raw['ID_VOBSUB_ID']:
        info['subtitle']['vobsub'] = True
        unrar = which('unrar')
        if unrar:
            self.add_arg('-unrarexec {0}'.format(unrar))
def force_utf8_and_filter_duplicates(subtitles):
    """Convert every subtitle's content and drop near-duplicates in place.

    ``subtitles`` must be a mutable list of dicts providing at least the
    keys 'extension' and 'content'.  For each item, 'content' is passed to
    ``guess_locale_and_convert``; the item's 'lang' and 'content' are then
    overwritten with the second and third values it returns (presumably the
    detected language and the UTF-8 text, going by this function's name --
    confirm against the helper).

    Two subtitles are compared only when both 'extension' and 'lang' match.
    The later one is dropped when the similarity estimate exceeds 0.9.

    The list is updated in place so the caller actually sees the filtered
    result; nothing is returned.
    """
    import difflib

    log_debug('Trying to filter duplicated subtitles...')

    for s in subtitles:
        _, s['lang'], s['content'] = guess_locale_and_convert(s['content'])

    dup_tag = [False] * len(subtitles)
    for i, sa in enumerate(subtitles):
        if dup_tag[i]:
            # already marked as a duplicate of an earlier subtitle
            continue
        for j in range(i + 1, len(subtitles)):
            sb = subtitles[j]
            if sa['extension'] != sb['extension'] or sa['lang'] != sb['lang']:
                continue
            # real_quick_ratio() is only a cheap upper bound on the true
            # ratio(), so this check may flag subtitles whose texts differ.
            similarity = difflib.SequenceMatcher(
                None, sa['content'], sb['content']).real_quick_ratio()
            log_debug('Similarity is {0}.'.format(similarity))
            if similarity > 0.9:
                dup_tag[j] = True

    # TODO: reserve longer subtitles
    # Slice assignment mutates the caller's list; a plain
    # ``subtitles = [...]`` would only rebind the local name and the
    # filtering would be lost when the function returns.
    subtitles[:] = [s for s, dup in zip(subtitles, dup_tag) if not dup]
    log_debug('{0} subtitle(s) reserved after duplicates filtering.'.format(
        len(subtitles)))