def guess_episodes_rexps(string):
    """Return the first episode guess found in *string*.

    The entries of ``episode_rexps`` are tried in order; each one provides
    a regular expression, a confidence and a pair of offsets that are added
    to the start and end of the match to obtain the reported span.

    Returns a ``(guess, span)`` pair for the first expression that matches,
    or ``(None, None)`` when none of them does.
    """
    for pattern, confidence, span_adjust in episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        begin = found.start() + span_adjust[0]
        end = found.end() + span_adjust[1]
        # the same slice of the input is attached to every value set below
        raw = string[begin:end]
        guess = Guess(found.groupdict(), confidence=confidence, raw=raw)

        # a matched episode group may describe one episode or several:
        # the first one is always stored, the full list only when there
        # is more than one
        if guess.get('episodeNumber'):
            episodes = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', episodes[0],
                      confidence=confidence, raw=raw)
            if len(episodes) > 1:
                guess.set('episodeList', episodes,
                          confidence=confidence, raw=raw)

        if guess.get('bonusNumber'):
            bonuses = number_list(guess['bonusNumber'])
            guess.set('bonusNumber', bonuses[0],
                      confidence=confidence, raw=raw)

        return guess, (begin, end)

    return None, None
def guess_weak_episodes_rexps(string, node):
    """Look for a weak (low confidence) episode number in *string*.

    Nothing is searched when the root of *node* already carries an
    ``episodeNumber``. Otherwise the entries of ``weak_episode_rexps`` are
    tried in order.

    Returns a ``(guess, span)`` pair, or ``(None, None)`` when nothing
    acceptable was found.
    """
    if 'episodeNumber' in node.root.info:
        return None, None

    for pattern, span_adjust in weak_episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        fields = found.groupdict()
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])
        number = int(fields['episodeNumber'])
        raw = string[span[0]:span[1]]

        if number <= 100:
            # small numbers are reported as matched, with low confidence
            return Guess(fields, confidence=0.3, raw=raw), span

        # numbers above 100 are read as <season><2-digit episode>
        season, episode = divmod(number, 100)

        # episodes which have a season > 25 are most likely errors
        # (Simpsons is at 23!)
        if season > 25:
            continue

        return Guess({ 'season': season, 'episodeNumber': episode },
                     confidence=0.6, raw=raw), span

    return None, None
def guess_episodes_rexps(string):
    """Return the first plausible episode guess found in *string*.

    The entries of ``episode_rexps`` are tried in order. A match whose
    ``season`` is greater than 25 is discarded and the next expression is
    tried. Episode and bonus numbers are stored as integers.

    Returns a ``(guess, span)`` pair, or ``(None, None)`` when nothing
    acceptable was found.
    """
    for pattern, confidence, span_adjust in episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        guess = Guess(found.groupdict(), confidence=confidence)
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])

        # episodes which have a season > 25 are most likely errors
        # (Simpsons is at 24!)
        if int(guess.get('season', 0)) > 25:
            continue

        # a matched episode group may describe one episode or several:
        # the first one is always stored, the full list only when there
        # is more than one
        if guess.get('episodeNumber'):
            episodes = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', int(episodes[0]),
                      confidence=confidence)
            if len(episodes) > 1:
                guess.set('episodeList', [int(ep) for ep in episodes],
                          confidence=confidence)

        if guess.get('bonusNumber'):
            bonuses = number_list(guess['bonusNumber'])
            guess.set('bonusNumber', int(bonuses[0]), confidence=confidence)

        return guess, span

    return None, None
def guess_language(string):
    """Guess a language, or a subtitle language, from *string*.

    The language is reported under ``subtitleLanguage`` when the cleaned,
    lower-cased text located before the match contains the word ``sub``,
    and under ``language`` otherwise.

    Returns a ``(guess, span)`` pair, or ``(None, None)`` when
    ``search_language`` finds no language.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    # is it a subtitle language?
    words_before = clean_string(string[:span[0]]).lower().split(' ')
    key = 'subtitleLanguage' if 'sub' in words_before else 'language'
    return (Guess({key: language}, confidence=confidence), span)
def guess_episodes_rexps(string):
    """Return the first plausible episode guess found in *string*.

    The entries of ``episode_rexps`` are tried in order; each one provides
    a regular expression, a confidence and a pair of offsets that are added
    to the start and end of the match to obtain the reported span. A match
    whose ``season`` is greater than 25 is discarded and the next
    expression is tried.

    Returns a ``(guess, span)`` pair, or ``(None, None)`` when nothing
    acceptable was found.
    """
    for rexp, confidence, span_adjust in episode_rexps:
        match = re.search(rexp, string, re.IGNORECASE)
        if not match:
            continue

        guess = Guess(match.groupdict(), confidence=confidence)
        span = (match.start() + span_adjust[0],
                match.end() + span_adjust[1])

        # episodes which have a season > 25 are most likely errors
        # (Simpsons is at 23!)
        if int(guess.get('season', 0)) > 25:
            continue

        # decide whether we have only a single episode number or an
        # episode list
        if guess.get('episodeNumber'):
            eplist = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', int(eplist[0]), confidence=confidence)

            if len(eplist) > 1:
                # store a real list: on Python 3 a bare map() is a lazy,
                # single-use iterator that cannot be indexed or re-read
                guess.set('episodeList', [int(ep) for ep in eplist],
                          confidence=confidence)

        return guess, span

    return None, None
def find_and_split_node(node, strategy, logger):
    """Apply the first successful matcher of *strategy* to *node*.

    *strategy* is an iterable of ``(matcher, confidence)`` pairs. Matchers
    having a truthy ``use_node`` attribute are called with the padded node
    value and the node, the others with the padded value only. On the
    first truthy result the node is partitioned around the matched span,
    the child covering exactly that span receives the guess, and the other
    children are processed recursively. Messages go to *logger*, or to the
    module-level ``log`` when *logger* is falsy.
    """
    # one sentinel character is added on each side of the value
    padded = ' %s ' % node.value

    for matcher, confidence in strategy:
        if getattr(matcher, 'use_node', False):
            result, span = matcher(padded, node)
        else:
            result, span = matcher(padded)

        if not result:
            continue

        # readjust span to compensate for sentinels
        span = (span[0] - 1, span[1] - 1)

        # without an explicit confidence, reuse the one of the first key
        # of a Guess result, and fall back to 1.0 for anything else
        if confidence is None:
            if isinstance(result, Guess):
                confidence = result.confidence(list(result.keys())[0])
            else:
                confidence = 1.0

        guess = format_guess(Guess(result, confidence=confidence))
        (logger or log).debug(
            'Found with confidence %.2f: %s' % (confidence, guess))

        node.partition(span)
        absolute_span = (span[0] + node.offset, span[1] + node.offset)
        for child in node.children:
            if child.span == absolute_span:
                child.guess = guess
            else:
                find_and_split_node(child, strategy, logger)

        # only the first successful matcher is applied to this node
        return
def process(mtree, filetype='autodetect'):
    """Record the file type on *mtree* and file details on its last node.

    The type returned by ``guess_filetype`` is stored on the tree guess
    with confidence 1.0. The extra information it returns, completed with
    the mimetype when one can be determined, becomes the guess of the node
    at index ``(-1,)``.
    """
    detected_type, extra_info = guess_filetype(mtree, filetype)
    mtree.guess.set('type', detected_type, confidence=1.0)
    log.debug('Found with confidence %.2f: %s' % (1.0, mtree.guess))

    details = Guess(extra_info, confidence=1.0)

    # guess the mimetype of the filename
    # TODO: handle other mimetypes not found on the default type_maps
    # mimetypes.types_map['.srt']='text/subtitle'
    mimetype = mimetypes.guess_type(mtree.string, strict=False)[0]
    if mimetype is not None:
        details.update({'mimetype': mimetype}, confidence=1.0)

    extension_node = mtree.node_at((-1, ))
    extension_node.guess = details
    log.debug('Found with confidence %.2f: %s' % (1.0, extension_node.guess))
def process(mtree, filetype='autodetect'):
    """Store the guessed file type on *mtree* and annotate its last node.

    ``guess_filetype`` provides the type, set on the tree guess with
    confidence 1.0, together with extra information. That information,
    plus the mimetype when ``mimetypes`` can determine one, is assigned as
    the guess of the node at index ``(-1,)``.
    """
    kind, extras = guess_filetype(mtree, filetype)
    mtree.guess.set('type', kind, confidence=1.0)
    log.debug('Found with confidence %.2f: %s' % (1.0, mtree.guess))

    info = Guess(extras, confidence=1.0)

    # guess the mimetype of the filename
    # TODO: handle other mimetypes not found on the default type_maps
    # mimetypes.types_map['.srt']='text/subtitle'
    mime_type, _encoding = mimetypes.guess_type(mtree.string, strict=False)
    if mime_type is not None:
        info.update({'mimetype': mime_type}, confidence=1.0)

    last_node = mtree.node_at((-1,))
    last_node.guess = info
    log.debug('Found with confidence %.2f: %s' % (1.0, last_node.guess))
def guess_country(self, string, node=None, options=None):
    """Try to recognise a country in *string*.

    The stripped, lower-cased string is scanned with ``_scan_country``
    unless it is one of ``LNG_COMMON_WORDS``. A country accepted by
    ``is_valid_country`` is returned as a single Guess whose span is the
    scanned span shifted by one on both ends.

    Returns the Guess on success and ``(None, None)`` otherwise.
    """
    # NOTE(review): success returns a bare Guess while failure returns a
    # pair; presumably the caller copes with both shapes - confirm.
    # NOTE(review): ``node`` defaults to None but ``node.value`` is read
    # on success - confirm callers always provide a node.
    candidate = string.strip().lower()
    if candidate in LNG_COMMON_WORDS:
        return None, None

    try:
        country, country_span = self._scan_country(candidate, True)
        if self.is_valid_country(country, options):
            shifted_span = (country_span[0] + 1, country_span[1] + 1)
            return Guess(country=country, confidence=1.0,
                         input=node.value, span=shifted_span)
    except babelfish.Error:
        # the scan or the validation failed: treated as "no country"
        pass

    return None, None
def process(mtree):
    """Tag unidentified leaves whose content is a country.

    Only leaves with a two-element ``node_idx`` are examined. The value
    stripped of its first and last characters must be accepted by
    ``Country(..., strict=True)``, and those two characters must form one
    of ``()``, ``[]`` or ``{}``.
    """
    delimiters = ['()', '[]', '{}']

    for leaf in mtree.unidentified_leaves():
        if len(leaf.node_idx) != 2:
            continue

        try:
            country = Country(leaf.value[1:-1], strict=True)
            # only keep explicit groups (enclosed in parentheses/brackets)
            enclosing = leaf.value[0] + leaf.value[-1]
            if enclosing in delimiters:
                leaf.guess = Guess(country=country, confidence=1.0)
        except ValueError:
            # ValueError is ignored and the leaf is left untouched
            pass
def process(mtree, filetype='autodetect'):
    """Guess the file type now (will be useful later).

    The type returned by ``guess_filetype`` is stored on the tree guess
    with confidence 1.0. The extra information it returns, completed with
    the mimetype when one can be determined, becomes the guess of the node
    at index ``(-1,)``.

    Raises ``TransfoException`` when the resulting type is ``None`` or
    ``'unknown'``.
    """
    detected_type, extra_info = guess_filetype(mtree, filetype)
    mtree.guess.set('type', detected_type, confidence=1.0)
    log.debug('Found with confidence %.2f: %s' % (1.0, mtree.guess))

    details = Guess(extra_info, confidence=1.0)

    # guess the mimetype of the filename
    # TODO: handle other mimetypes not found on the default type_maps
    # mimetypes.types_map['.srt']='text/subtitle'
    mimetype = mimetypes.guess_type(mtree.string, strict=False)[0]
    if mimetype is not None:
        details.update({'mimetype': mimetype}, confidence=1.0)

    extension_node = mtree.node_at((-1,))
    extension_node.guess = details
    log.debug('Found with confidence %.2f: %s' % (1.0, extension_node.guess))

    # the check comes last, so the guesses above are stored even when
    # the exception is raised
    final_type = mtree.guess.get('type')
    if final_type in [None, 'unknown']:
        raise TransfoException(__name__, 'Unknown file type')
def process(self, mtree, options=None):
    """Detect countries in *mtree*.

    ``guess_country`` is first run on the unidentified leaves through a
    ``GuessFinder``. Then, for every leaf containing a ``language`` whose
    cleaned lower-cased value is in ``self.replace_language``, the language
    is reset to None and a country guess with confidence 0.9 is recorded
    when ``Country.fromguessit`` yields a valid country.
    """
    finder = GuessFinder(self.guess_country, None, self.log, options)
    finder.process_nodes(mtree.unidentified_leaves())

    for leaf in mtree.leaves_containing('language'):
        candidate = leaf.clean_value.lower()
        if candidate not in self.replace_language:
            continue

        leaf.guess.set('language', None)
        try:
            country = Country.fromguessit(candidate)
            if self.is_valid_country(country, options):
                country_guess = Guess(country=country, confidence=0.9,
                                      input=leaf.value, span=leaf.span)
                # NOTE(review): GuessFinder is given self.log while this
                # call uses the module-level log - confirm it is intended
                found_guess(leaf, country_guess, logger=log)
        except babelfish.Error:
            # no country could be built: the language stays reset
            pass
def guess_weak_episodes_rexps(string, node):
    """Look for a weak (low confidence) episode number in *string*.

    Nothing is searched when the root of *node* already carries an
    ``episodeNumber``. Otherwise the first matching entry of
    ``weak_episode_rexps`` decides the result.

    Returns a ``(guess, span)`` pair, or ``(None, None)`` when nothing
    matched.
    """
    if 'episodeNumber' in node.root.info:
        return None, None

    for pattern, span_adjust in weak_episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        fields = found.groupdict()
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])
        number = int(fields['episodeNumber'])

        if number <= 100:
            # small numbers are reported as matched, with low confidence
            return Guess(fields, confidence=0.3), span

        # numbers above 100 are read as <season><2-digit episode>
        season, episode = divmod(number, 100)
        return Guess({ 'season': season, 'episodeNumber': episode },
                     confidence=0.6), span

    return None, None
def guess_video_rexps(string):
    """Return the first video property guess found in *string*.

    The string is wrapped in ``-`` characters and every entry of
    ``video_rexps`` is searched with ``sep`` added on both sides of its
    expression.

    Returns a ``(guess, span)`` pair for the first match, or
    ``(None, None)`` when nothing matched.
    """
    padded = '-' + string + '-'

    for pattern, confidence, span_adjust in video_rexps:
        found = re.search(sep + pattern + sep, padded, re.IGNORECASE)
        if not found:
            continue

        fields = found.groupdict()

        # is this the better place to put it? (maybe, as it is at least
        # the soonest that we can catch it)
        # a 'cdNumberTotal' group that is present but None is removed
        if 'cdNumberTotal' in fields and fields['cdNumberTotal'] is None:
            del fields['cdNumberTotal']

        begin = found.start() + span_adjust[0]
        end = found.end() + span_adjust[1] - 2
        return (Guess(fields, confidence=confidence), (begin, end))

    return None, None
def process(self, mtree, options=None):
    """Tag explicit unidentified leaves whose content is a country.

    Only leaves with a two-element ``node_idx`` are examined. The value
    stripped of its first and last characters, lower-cased, must not be in
    ``self.country_common_words``, the leaf must be explicit, and
    ``Country(..., strict=True)`` must accept the value.
    """
    for leaf in mtree.unidentified_leaves():
        if len(leaf.node_idx) != 2:
            continue

        candidate = leaf.value[1:-1].lower()

        # only keep explicit groups (enclosed in parentheses/brackets)
        # which are not common words
        if candidate in self.country_common_words or not leaf.is_explicit():
            continue

        try:
            country = Country(candidate, strict=True)
        except ValueError:
            continue

        leaf.guess = Guess(country=country, confidence=1.0,
                           input=leaf.value, span=leaf.span)
def process(mtree):
    """Tag bracketed unidentified leaves whose content is a country.

    Only leaves with a two-element ``node_idx`` are examined. The value
    stripped of its first and last characters, lower-cased, must not be in
    ``country_common_words``, those two characters must form one of ``()``,
    ``[]`` or ``{}``, and ``Country(..., strict=True)`` must accept the
    value.
    """
    delimiters = ['()', '[]', '{}']

    for leaf in mtree.unidentified_leaves():
        if len(leaf.node_idx) != 2:
            continue

        candidate = leaf.value[1:-1].lower()

        # only keep explicit groups (enclosed in parentheses/brackets)
        # which are not common words
        if (candidate in country_common_words
                or leaf.value[0] + leaf.value[-1] not in delimiters):
            continue

        try:
            country = Country(candidate, strict=True)
        except ValueError:
            continue

        leaf.guess = Guess(country=country, confidence=1.0, raw=candidate)
def guess_episodes_rexps(string):
    """Return the first episode guess found in *string*.

    The entries of ``episode_rexps`` are tried in order; each one provides
    a regular expression, a confidence and a pair of offsets that are added
    to the start and end of the match to obtain the reported span.

    Returns a ``(guess, span)`` pair for the first expression that matches,
    or ``(None, None)`` when none of them does.
    """
    for pattern, confidence, span_adjust in episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        guess = Guess(found.groupdict(), confidence=confidence)
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])

        # a matched episode group may describe one episode or several:
        # the first one is always stored, the full list only when there
        # is more than one
        if guess.get('episodeNumber'):
            episodes = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', episodes[0], confidence=confidence)
            if len(episodes) > 1:
                guess.set('episodeList', episodes, confidence=confidence)

        if guess.get('bonusNumber'):
            bonuses = number_list(guess['bonusNumber'])
            guess.set('bonusNumber', bonuses[0], confidence=confidence)

        return guess, span

    return None, None
def guess_language(string, node, skip=None):
    """Guess a language in *string*, ignoring the spans listed in *skip*.

    Each *skip* entry is a mapping with ``node_idx`` and ``span`` keys.
    Entries whose ``node_idx`` is a prefix of ``node.node_idx`` are
    converted to spans relative to *node* (shifted by one) before being
    handed to ``search_language``; the other entries are dropped.

    Returns a ``(guess, span)`` pair, or ``(None, None)`` when no language
    was found.
    """
    if skip:
        local_skip = []
        for entry in skip:
            entry_idx, entry_span = entry['node_idx'], entry['span']
            if entry_idx != node.node_idx[:len(entry_idx)]:
                continue
            local_skip.append((entry_span[0] - node.offset + 1,
                               entry_span[1] - node.offset + 1))
        skip = local_skip

    language, span, confidence = search_language(string, skip=skip)
    if not language:
        return None, None

    raw = string[span[0]:span[1]]
    return (Guess({'language': language}, confidence=confidence, raw=raw),
            span)
def found_property(node, name, value, confidence):
    """Assign a one-property Guess to *node* and log it at debug level."""
    single_property = {name: value}
    node.guess = Guess(single_property, confidence=confidence)

    message = 'Found with confidence %.2f: %s' % (confidence, node.guess)
    log.debug(message)
def guess_language(string):
    """Guess a language from *string* using ``search_language``.

    Returns a ``(guess, span)`` pair, or ``(None, None)`` when no language
    was found.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    language_guess = Guess({'language': language}, confidence=confidence)
    return (language_guess, span)
def guess_episodes_rexps(string):
    """Return the first plausible episode guess found in *string*.

    The entries of ``episode_rexps`` are tried in order. A match whose
    ``season`` is greater than 30 is discarded and the next expression is
    tried.

    Returns a ``(guess, span)`` pair, or ``(None, None)`` when nothing
    acceptable was found.
    """
    for pattern, confidence, span_adjust in episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        guess = Guess(found.groupdict(), confidence=confidence)
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])

        # episodes which have a season > 30 are most likely errors
        # (Simpsons is at 24!)
        if int(guess.get('season', 0)) > 30:
            continue

        # a matched episode group may describe one episode or several:
        # the first one is always stored, the full list only when there
        # is more than one
        if guess.get('episodeNumber'):
            episodes = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', episodes[0], confidence=confidence)
            if len(episodes) > 1:
                guess.set('episodeList', episodes, confidence=confidence)

        if guess.get('bonusNumber'):
            bonuses = number_list(guess['bonusNumber'])
            guess.set('bonusNumber', bonuses[0], confidence=confidence)

        return guess, span

    return None, None
def __init__(self, string='', span=None, parent=None):
    """Create a node over *string*.

    When *span* is falsy it defaults to ``(0, len(string))``. The node
    starts without children and with an empty Guess.
    """
    self.string = string
    self.span = span if span else (0, len(string))
    self.parent = parent
    self.children = []
    self.guess = Guess()