def process(self, mtree, options=None): """guess the file type now (will be useful later) """ filetype, other = self.guess_filetype(mtree, options) mtree.guess.set('type', filetype, confidence=1.0) log_found_guess(mtree.guess) filetype_info = Guess(other, confidence=1.0) # guess the mimetype of the filename # TODO: handle other mimetypes not found on the default type_maps # mimetypes.types_map['.srt']='text/subtitle' mime, _ = mimetypes.guess_type(mtree.string, strict=False) if mime is not None: filetype_info.update({'mimetype': mime}, confidence=1.0) # Retrieve the last node of category path (extension node) node_ext = list(filter(lambda x: x.category == 'path', mtree.nodes()))[-1] found_guess(node_ext, filetype_info) if mtree.guess.get('type') in [None, 'unknown']: if options.get('name_only'): mtree.guess.set('type', 'movie', confidence=0.6) else: raise TransformerException(__name__, 'Unknown file type')
def build_guess(node, name, value=None, confidence=1.0):
    guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence)
    if value is None:
        guess.metadata().span = node.span
    guess.metadata().input = node.value if value is None else value
    return guess
def __init__(self, string='', span=None, parent=None, clean_function=None):
    self.string = string
    self.span = span or (0, len(string))
    self.parent = parent
    self.children = []
    self.guess = Guess()
    self._clean_value = None
    self._clean_function = clean_function or clean_default
def guess_release_group(self, string, node=None, options=None):
    found = self.container.find_properties(string, node, 'releaseGroup')
    guess = self.container.as_guess(found, string, self.validate_group_name, sep_replacement='-')
    validated_guess = None
    if guess:
        explicit_group_node = node.group_node()
        if explicit_group_node:
            for leaf in explicit_group_node.leaves_containing(self.previous_safe_properties):
                if self.is_leaf_previous(leaf, node):
                    if leaf.root.value[leaf.span[1]] == '-':
                        guess.metadata().confidence = 1
                    else:
                        guess.metadata().confidence = 0.7
                    validated_guess = guess

    if not validated_guess:
        # If previous group last leaf is identified as a safe property,
        # consider the raw value as a releaseGroup
        previous_group_node = node.previous_group_node()
        if previous_group_node:
            for leaf in previous_group_node.leaves_containing(self.previous_safe_properties):
                if self.is_leaf_previous(leaf, node):
                    guess = Guess({'releaseGroup': node.value},
                                  confidence=1,
                                  input=node.value,
                                  span=(0, len(node.value)))
                    if self.validate_group_name(guess):
                        node.guess = guess
                        validated_guess = guess

    if validated_guess:
        # If following group nodes have only one unidentified leaf, it belongs to the release group
        next_group_node = node
        while True:
            next_group_node = next_group_node.next_group_node()
            if next_group_node:
                leaves = list(next_group_node.leaves())
                if len(leaves) == 1 and not leaves[0].guess:
                    validated_guess['releaseGroup'] = validated_guess['releaseGroup'] + leaves[0].value
                    leaves[0].guess = validated_guess
                else:
                    break
            else:
                break

    if validated_guess:
        # Strip brackets
        validated_guess['releaseGroup'] = strip_brackets(validated_guess['releaseGroup'])

    return validated_guess
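# The post-processing above relies on a strip_brackets() helper that is not shown in
# this file. A minimal sketch of its assumed behaviour (removing one pair of
# surrounding brackets, if present) -- hypothetical, not the actual guessit code:
def strip_brackets_sketch(text):
    """Remove a single pair of enclosing (), [] or {} around text, if any."""
    pairs = {'(': ')', '[': ']', '{': '}'}
    if text and text[0] in pairs and text.endswith(pairs[text[0]]):
        return text[1:-1]
    return text

# e.g. strip_brackets_sketch('[LOL]') -> 'LOL', strip_brackets_sketch('DIMENSION') -> 'DIMENSION'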
def build_guess(node, name, value=None, confidence=1.0):
    guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence)
    guess.metadata().input = node.value if value is None else value
    if value is None:
        left_offset = 0
        right_offset = 0

        clean_value = node.clean_value

        for i in range(0, len(node.value)):
            if clean_value[0] == node.value[i]:
                break
            left_offset += 1

        for i in reversed(range(0, len(node.value))):
            if clean_value[-1] == node.value[i]:
                break
            right_offset += 1

        guess.metadata().span = (node.span[0] - node.offset + left_offset,
                                 node.span[1] - node.offset - right_offset)
    return guess
def found_property(node, name, value=None, confidence=1.0, update_guess=True, logger=None):
    # automatically retrieve the log object from the caller frame
    if not logger:
        caller_frame = inspect.stack()[1][0]
        logger = caller_frame.f_locals['self'].log
    guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence)
    return found_guess(node, guess, update_guess=update_guess, logger=logger)
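# found_property() grabs the calling transformer's logger by inspecting the caller's
# stack frame. A small self-contained demonstration of that stdlib trick (the names
# below are made up for the example):
import inspect

def _demo_get_caller_local(name):
    # frame of whoever called this function; index 0 of the stack entry is the frame object
    caller_frame = inspect.stack()[1][0]
    return caller_frame.f_locals.get(name)

def _demo_caller():
    secret = 42  # a local variable in the caller
    return _demo_get_caller_local('secret')

# _demo_caller() returns 42: the callee read the caller's local through its frame.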
def build_guess(node, name, value=None, confidence=1.0):
    guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence)
    guess.metadata().input = node.value if value is None else value
    if value is None:
        left_offset = 0
        right_offset = 0

        clean_value = node.clean_value

        if clean_value:
            for i in range(0, len(node.value)):
                if clean_value[0] == node.value[i]:
                    break
                left_offset += 1

            for i in reversed(range(0, len(node.value))):
                if clean_value[-1] == node.value[i]:
                    break
                right_offset += 1

        guess.metadata().span = (node.span[0] - node.offset + left_offset,
                                 node.span[1] - node.offset - right_offset)
    return guess
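# The span arithmetic in build_guess() trims the leading/trailing characters that the
# node's clean_value dropped (separators, dots, ...). The same idea on plain strings,
# as a standalone sketch (the raw/clean values are made-up examples):
def _demo_trim_offsets(raw_value, clean_value):
    left_offset = 0
    for ch in raw_value:
        if ch == clean_value[0]:
            break
        left_offset += 1
    right_offset = 0
    for ch in reversed(raw_value):
        if ch == clean_value[-1]:
            break
        right_offset += 1
    return left_offset, right_offset

# _demo_trim_offsets('.720p.', '720p') -> (1, 1): the span shrinks by one on each side.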
def search_language(string, lang_filter=None):
    """Looks for language patterns, and if found return the language object,
    its group span and an associated confidence.

    you can specify a list of allowed languages using the lang_filter argument,
    as in lang_filter = [ 'fr', 'eng', 'spanish' ]

    >>> search_language('movie [en].avi')['language']
    Language(English)

    >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])

    """
    if lang_filter:
        lang_filter = set(babelfish.Language.fromguessit(lang) for lang in lang_filter)

    confidence = 1.0  # for all of them

    for prop, language, lang, word in find_possible_languages(string):
        pos = string.find(word)
        end = pos + len(word)

        if lang_filter and language not in lang_filter:
            continue

        # only allow those languages that have a 2-letter code, those that
        # don't are too esoteric and probably false matches
        #if language.lang not in lng3_to_lng2:
        #    continue

        # confidence depends on alpha2, alpha3, english name, ...
        if len(lang) == 2:
            confidence = 0.8
        elif len(lang) == 3:
            confidence = 0.9
        elif prop == 'subtitleLanguage':
            confidence = 0.6  # Subtitle prefix found with language
        else:
            # Note: we could either be really confident that we found a
            # language or assume that full language names are too
            # common words and lower their confidence accordingly
            confidence = 0.3  # going with the low-confidence route here

        return Guess({prop: language}, confidence=confidence, input=string, span=(pos, end))

    return None
def process(self, mtree, options=None): """guess the file type now (will be useful later) """ filetype, other = self.guess_filetype(mtree, options) mtree.guess.set("type", filetype, confidence=1.0) log_found_guess(mtree.guess) filetype_info = Guess(other, confidence=1.0) # guess the mimetype of the filename # TODO: handle other mimetypes not found on the default type_maps # mimetypes.types_map['.srt']='text/subtitle' mime, _ = mimetypes.guess_type(mtree.string, strict=False) if mime is not None: filetype_info.update({"mimetype": mime}, confidence=1.0) node_ext = mtree.node_at((-1,)) found_guess(node_ext, filetype_info) if mtree.guess.get("type") in [None, "unknown"]: if options.get("name_only"): mtree.guess.set("type", "movie", confidence=0.6) else: raise TransformerException(__name__, "Unknown file type")
def guess_file_info(filename, filetype, info=None): """info can contain the names of the various plugins, such as 'filename' to detect filename info, or 'hash_md5' to get the md5 hash of the file. >>> guess_file_info('test/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1']) {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'} """ result = [] hashers = [] if info is None: info = ['filename'] if isinstance(info, basestring): info = [info] for infotype in info: if infotype == 'filename': m = IterativeMatcher(filename, filetype=filetype) result.append(m.matched()) elif infotype == 'hash_mpc': from guessit.hash_mpc import hash_file try: result.append( Guess({'hash_mpc': hash_file(filename)}, confidence=1.0)) except Exception, e: log.warning('Could not compute MPC-style hash because: %s' % e) elif infotype == 'hash_ed2k': from guessit.hash_ed2k import hash_file try: result.append( Guess({'hash_ed2k': hash_file(filename)}, confidence=1.0)) except Exception, e: log.warning('Could not compute ed2k hash because: %s' % e)
def process_node(self, node, iterative=True, partial_span=None):
    if partial_span:
        value = node.value[partial_span[0]:partial_span[1]]
    else:
        value = node.value
    string = ' %s ' % value  # add sentinels

    matcher_result = self.guess_func(string, node, self.options)

    if not matcher_result:
        return

    if not isinstance(matcher_result, Guess):
        result, span = matcher_result
    else:
        result, span = matcher_result, matcher_result.metadata().span
        #log.error('span2 %s' % (span,))

    if not result:
        return

    if span[1] == len(string):
        # somehow, the sentinel got included in the span. Remove it
        span = (span[0], span[1] - 1)

    # readjust span to compensate for sentinels
    span = (span[0] - 1, span[1] - 1)

    # readjust span to compensate for partial_span
    if partial_span:
        span = (span[0] + partial_span[0], span[1] + partial_span[0])

    skip_nodes = self.options.get('skip_nodes')
    if skip_nodes:
        # if we guessed a node that we need to skip, recurse down the tree and ignore that node
        for skip_node in skip_nodes:
            skip_node_relative_span = (skip_node.span[0] - node.offset,
                                       skip_node.span[1] - node.offset)
            if skip_node_relative_span == span:
                partition_spans = [s for s in node.get_partition_spans(span)
                                   if s != skip_node.span]
                for partition_span in partition_spans:
                    relative_span = (partition_span[0] - node.offset,
                                     partition_span[1] - node.offset)
                    self.process_node(node, partial_span=relative_span)
                return

    # restore sentinels compensation
    if isinstance(result, Guess):
        guess = result
    else:
        guess = Guess(result, confidence=self.confidence, input=string, span=span)

    if not iterative:
        found_guess(node, guess, logger=self.logger)
    else:
        absolute_span = (span[0] + node.offset, span[1] + node.offset)
        node.partition(span)
        if node.is_leaf():
            # FIXME: this seems like it is dead code...
            found_guess(node, guess, logger=self.logger)
        else:
            found_child = None
            for child in node.children:
                if child.span == absolute_span:
                    # if we have a match on one of our children, mark it as such...
                    found_guess(child, guess, logger=self.logger)
                    found_child = child
                    break

            # ...and only then recurse on the other children
            for child in node.children:
                if child is not found_child:
                    self.process_node(child)
def guess_release_group(self, string, node=None, options=None):
    if options and options.get('expected_group'):
        expected_container = PropertiesContainer(enhance=True, canonical_from_pattern=False)
        for expected_group in options.get('expected_group'):
            if expected_group.startswith('re:'):
                expected_group = expected_group[3:]
                expected_group = expected_group.replace(' ', '-')
                expected_container.register_property('releaseGroup', expected_group, enhance=True)
            else:
                expected_group = re.escape(expected_group)
                expected_container.register_property('releaseGroup', expected_group, enhance=False)

        found = expected_container.find_properties(string, node, options, 'releaseGroup')
        guess = expected_container.as_guess(found, string, self.validate_group_name)
        if guess:
            return guess

    found = self.container.find_properties(string, node, options, 'releaseGroup')
    guess = self.container.as_guess(found, string, self.validate_group_name)
    validated_guess = None
    if guess:
        group_node = node.group_node()
        if group_node:
            for leaf in group_node.leaves_containing(self.previous_safe_properties):
                if self.validate_node(leaf, node, True):
                    if leaf.root.value[leaf.span[1]] == '-':
                        guess.metadata().confidence = 1
                    else:
                        guess.metadata().confidence = 0.7
                    validated_guess = guess

    if not validated_guess:
        # If previous group last leaf is identified as a safe property,
        # consider the raw value as a releaseGroup
        previous_group_node = node.previous_group_node()
        if previous_group_node:
            for leaf in previous_group_node.leaves_containing(self.previous_safe_properties):
                if self.validate_node(leaf, node, False):
                    guess = Guess({'releaseGroup': node.value},
                                  confidence=1,
                                  input=node.value,
                                  span=(0, len(node.value)))
                    if self.validate_group_name(guess):
                        node.guess = guess
                        validated_guess = guess

    if validated_guess:
        # If following group nodes have only one unidentified leaf, it belongs to the release group
        next_group_node = node
        while True:
            next_group_node = next_group_node.next_group_node()
            if next_group_node:
                leaves = list(next_group_node.leaves())
                if len(leaves) == 1 and not leaves[0].guess:
                    validated_guess['releaseGroup'] = validated_guess['releaseGroup'] + leaves[0].value
                    leaves[0].guess = validated_guess
                else:
                    break
            else:
                break

    if not validated_guess and node.is_explicit() and node.node_last_idx == 0:
        # first node from group
        validated_guess = build_guess(node, 'releaseGroup', value=node.value[1:len(node.value) - 1])
        validated_guess.metadata().confidence = 0.4
        validated_guess.metadata().span = 1, len(node.value)
        node.guess = validated_guess

    if validated_guess:
        # Strip brackets
        validated_guess['releaseGroup'] = strip_brackets(validated_guess['releaseGroup'])

    return validated_guess
def __init__(self, string='', span=None, parent=None):
    self.string = string
    self.span = span or (0, len(string))
    self.parent = parent
    self.children = []
    self.guess = Guess()
def process_node(self, node, iterative=True, partial_span=None):
    if partial_span:
        value = node.value[partial_span[0]:partial_span[1]]
    else:
        value = node.value
    string = ' %s ' % value  # add sentinels

    if not self.options:
        matcher_result = self.guess_func(string, node)
    else:
        matcher_result = self.guess_func(string, node, self.options)

    if matcher_result:
        if not isinstance(matcher_result, Guess):
            result, span = matcher_result
        else:
            result, span = matcher_result, matcher_result.metadata().span

        if result:
            # readjust span to compensate for sentinels
            span = (span[0] - 1, span[1] - 1)

            # readjust span to compensate for partial_span
            if partial_span:
                span = (span[0] + partial_span[0], span[1] + partial_span[0])

            partition_spans = None
            if self.options and 'skip_nodes' in self.options:
                skip_nodes = self.options.get('skip_nodes')
                for skip_node in skip_nodes:
                    if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\
                            skip_node.span == span or\
                            skip_node.span == (span[0] + skip_node.offset, span[1] + skip_node.offset):
                        if partition_spans is None:
                            partition_spans = _get_split_spans(node, skip_node.span)
                        else:
                            new_partition_spans = []
                            for partition_span in partition_spans:
                                tmp_node = MatchTree(value, span=partition_span, parent=node)
                                tmp_partitions_spans = _get_split_spans(tmp_node, skip_node.span)
                                new_partition_spans.extend(tmp_partitions_spans)
                            partition_spans.extend(new_partition_spans)

            if not partition_spans:
                # restore sentinels compensation
                if isinstance(result, Guess):
                    guess = result
                else:
                    guess = Guess(result, confidence=self.confidence, input=string, span=span)

                if not iterative:
                    found_guess(node, guess, logger=self.logger)
                else:
                    absolute_span = (span[0] + node.offset, span[1] + node.offset)
                    node.partition(span)
                    if node.is_leaf():
                        found_guess(node, guess, logger=self.logger)
                    else:
                        found_child = None
                        for child in node.children:
                            if child.span == absolute_span:
                                found_guess(child, guess, logger=self.logger)
                                found_child = child
                                break
                        for child in node.children:
                            if child is not found_child:
                                self.process_node(child)
            else:
                for partition_span in partition_spans:
                    self.process_node(node, partial_span=partition_span)
def process_node(self, node, iterative=True, partial_span=None, skip_nodes=True):
    if skip_nodes and not isinstance(skip_nodes, list):
        skip_nodes = self.options.get('skip_nodes')
    elif not isinstance(skip_nodes, list):
        skip_nodes = []

    if partial_span:
        value = node.value[partial_span[0]:partial_span[1]]
    else:
        value = node.value
    string = ' %s ' % value  # add sentinels

    matcher_result = self.guess_func(string, node, self.options)

    if not matcher_result:
        return

    if not isinstance(matcher_result, Guess):
        result, span = matcher_result
    else:
        result, span = matcher_result, matcher_result.metadata().span
        #log.error('span2 %s' % (span,))

    if not result:
        return

    if span[1] == len(string):
        # somehow, the sentinel got included in the span. Remove it
        span = (span[0], span[1] - 1)

    # readjust span to compensate for sentinels
    span = (span[0] - 1, span[1] - 1)

    # readjust span to compensate for partial_span
    if partial_span:
        span = (span[0] + partial_span[0], span[1] + partial_span[0])

    if skip_nodes:
        skip_nodes = [skip_node for skip_node in self.options.get('skip_nodes')
                      if skip_node.parent.span[0] == node.span[0] or
                      skip_node.parent.span[1] == node.span[1]]

        # if we guessed a node that we need to skip, recurse down the tree and ignore that node
        indices = set()
        skip_nodes_spans = []
        next_skip_nodes = []
        for skip_node in skip_nodes:
            skip_for_next = False
            skip_nodes_spans.append(skip_node.span)
            if node.offset <= skip_node.span[0] <= node.span[1]:
                indices.add(skip_node.span[0] - node.offset)
                skip_for_next = True
            if node.offset <= skip_node.span[1] <= node.span[1]:
                indices.add(skip_node.span[1] - node.offset)
                skip_for_next = True
            if not skip_for_next:
                next_skip_nodes.append(skip_node)

        if indices:
            partition_spans = [s for s in node.get_partition_spans(indices)
                               if s not in skip_nodes_spans]
            for partition_span in partition_spans:
                relative_span = (partition_span[0] - node.offset,
                                 partition_span[1] - node.offset)
                self.process_node(node, partial_span=relative_span, skip_nodes=next_skip_nodes)
            return

    # restore sentinels compensation
    if isinstance(result, Guess):
        guess = result
    else:
        no_sentinel_string = string[1:-1]
        guess = Guess(result, confidence=self.confidence, input=no_sentinel_string, span=span)

    if not iterative:
        found_guess(node, guess, logger=self.logger)
    else:
        absolute_span = (span[0] + node.offset, span[1] + node.offset)
        node.partition(span)
        found_child = None
        for child in node.children:
            if child.span == absolute_span:
                # if we have a match on one of our children, mark it as such...
                found_guess(child, guess, logger=self.logger)
                found_child = child
                break

        # ...and only then recurse on the other children
        for child in node.children:
            if child is not found_child:
                self.process_node(child)
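# process_node() wraps the value in spaces (' %s ') so that patterns anchored on
# separators can also match at the very beginning/end of the string, then shifts the
# matched span back by one to undo the padding. A standalone illustration of that
# sentinel trick (the pattern and value below are made-up examples):
import re

def _demo_match_with_sentinels(value, pattern=r'[ .-](720p)[ .-]'):
    string = ' %s ' % value            # add sentinels
    m = re.search(pattern, string)
    if not m:
        return None
    span = m.span(1)
    return (span[0] - 1, span[1] - 1)  # compensate for the leading sentinel

# _demo_match_with_sentinels('720p.x264') -> (0, 4), even though '720p' sits at the
# start of the original value where no real separator exists.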
def guess_video_metadata(filename):
    """Gets the video metadata properties out of a given file. The file needs to
    exist on the filesystem to be able to be analyzed. An empty guess is
    returned otherwise.

    You need to have the Enzyme python package installed for this to work."""
    result = Guess()

    def found(prop, value):
        result[prop] = value
        log.debug('Found with enzyme %s: %s' % (prop, value))

    # first get the size of the file, in bytes
    try:
        size = os.stat(filename).st_size
        found('fileSize', size)
    except Exception as e:
        log.error('Cannot get video file size: %s' % e)
        # file probably does not exist, we might as well return now
        return result

    # then get additional metadata from the file using enzyme, if available
    try:
        import enzyme

        with open(filename) as f:
            mkv = enzyme.MKV(f)

            found('duration', mkv.info.duration.total_seconds())

            if mkv.video_tracks:
                video_track = mkv.video_tracks[0]

                # resolution
                if video_track.height in (480, 720, 1080):
                    if video_track.interlaced:
                        found('screenSize', '%di' % video_track.height)
                    else:
                        found('screenSize', '%dp' % video_track.height)
                else:
                    # TODO: do we want this?
                    #found('screenSize', '%dx%d' % (video_track.width, video_track.height))
                    pass

                # video codec
                if video_track.codec_id == 'V_MPEG4/ISO/AVC':
                    found('videoCodec', 'h264')
                elif video_track.codec_id == 'V_MPEG4/ISO/SP':
                    found('videoCodec', 'DivX')
                elif video_track.codec_id == 'V_MPEG4/ISO/ASP':
                    found('videoCodec', 'XviD')
            else:
                log.warning('MKV has no video track')

            if mkv.audio_tracks:
                audio_track = mkv.audio_tracks[0]
                # audio codec
                if audio_track.codec_id == 'A_AC3':
                    found('audioCodec', 'AC3')
                elif audio_track.codec_id == 'A_DTS':
                    found('audioCodec', 'DTS')
                elif audio_track.codec_id == 'A_AAC':
                    found('audioCodec', 'AAC')
            else:
                log.warning('MKV has no audio track')

            if mkv.subtitle_tracks:
                embedded_subtitle_languages = set()
                for st in mkv.subtitle_tracks:
                    try:
                        if st.language:
                            lang = babelfish.Language.fromalpha3b(st.language)
                        elif st.name:
                            lang = babelfish.Language.fromname(st.name)
                        else:
                            lang = babelfish.Language('und')
                    except babelfish.Error:
                        lang = babelfish.Language('und')
                    embedded_subtitle_languages.add(lang)

                found('subtitleLanguage', embedded_subtitle_languages)
            else:
                log.debug('MKV has no subtitle track')

        return result

    except ImportError:
        log.error('Cannot get video file metadata, missing dependency: enzyme')
        log.error('Please install it from PyPI, by doing eg: pip install enzyme')
        return result

    except IOError as e:
        log.error('Could not open file: %s' % filename)
        log.error('Make sure it exists and is available for reading on the filesystem')
        log.error('Error: %s' % e)
        return result

    except enzyme.Error as e:
        log.error('Cannot guess video file metadata')
        log.error('enzyme.Error while reading file: %s' % filename)
        log.error('Error: %s' % e)
        return result
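# Possible usage of guess_video_metadata() -- the path below is a placeholder, and the
# enzyme/babelfish packages must be installed for anything beyond 'fileSize' to be
# guessed; this is a sketch, not part of the library:
def _demo_print_video_metadata(path='/path/to/movie.mkv'):
    info = guess_video_metadata(path)   # returns a (possibly empty) dict-like Guess
    for prop, value in info.items():
        print('%s: %s' % (prop, value))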
def process_node(self, node, iterative=True, partial_span=None):
    value = None
    if partial_span:
        value = node.value[partial_span[0]:partial_span[1]]
    else:
        value = node.value
    string = ' %s ' % value  # add sentinels

    if not self.options:
        matcher_result = self.guess_func(string, node)
    else:
        matcher_result = self.guess_func(string, node, self.options)

    if matcher_result:
        if not isinstance(matcher_result, Guess):
            result, span = matcher_result
        else:
            result, span = matcher_result, matcher_result.metadata().span

        if result:
            # readjust span to compensate for sentinels
            span = (span[0] - 1, span[1] - 1)

            # readjust span to compensate for partial_span
            if partial_span:
                span = (span[0] + partial_span[0], span[1] + partial_span[0])

            partition_spans = None
            if self.options and 'skip_nodes' in self.options:
                skip_nodes = self.options.get('skip_nodes')
                for skip_node in skip_nodes:
                    if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\
                            skip_node.span == span or\
                            skip_node.span == (span[0] + skip_node.offset, span[1] + skip_node.offset):
                        partition_spans = node.get_partition_spans(skip_node.span)
                        for to_remove_span in partition_spans:
                            if to_remove_span[0] == skip_node.span[0] and \
                                    to_remove_span[1] in [skip_node.span[1], skip_node.span[1] + 1]:
                                partition_spans.remove(to_remove_span)
                                break
                        #break

            if not partition_spans:
                # restore sentinels compensation
                guess = None
                if isinstance(result, Guess):
                    guess = result
                else:
                    guess = Guess(result, confidence=self.confidence, input=string, span=span)

                if not iterative:
                    node.guess.update(guess)
                else:
                    absolute_span = (span[0] + node.offset, span[1] + node.offset)
                    node.partition(span)
                    found_child = None
                    for child in node.children:
                        if child.span == absolute_span:
                            found_guess(child, guess, self.logger)
                            found_child = child
                            break
                    for child in node.children:
                        if not child is found_child:
                            self.process_node(child)
            else:
                for partition_span in partition_spans:
                    self.process_node(node, partial_span=partition_span)
def guess_file_info(filename, filetype, info=None): """info can contain the names of the various plugins, such as 'filename' to detect filename info, or 'hash_md5' to get the md5 hash of the file. >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1']) {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'} """ result = [] hashers = [] # Force unicode as soon as possible filename = u(filename) if info is None: info = ['filename'] if isinstance(info, base_text_type): info = [info] for infotype in info: if infotype == 'filename': result.append(_guess_filename(filename, filetype)) elif infotype == 'hash_mpc': from guessit.hash_mpc import hash_file try: result.append(Guess({'hash_mpc': hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute MPC-style hash because: %s' % e) elif infotype == 'hash_ed2k': from guessit.hash_ed2k import hash_file try: result.append(Guess({'hash_ed2k': hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute ed2k hash because: %s' % e) elif infotype.startswith('hash_'): import hashlib hashname = infotype[5:] try: hasher = getattr(hashlib, hashname)() hashers.append((infotype, hasher)) except AttributeError: log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname) else: log.warning('Invalid infotype: %s' % infotype) # do all the hashes now, but on a single pass if hashers: try: blocksize = 8192 hasherobjs = dict(hashers).values() with open(filename, 'rb') as f: chunk = f.read(blocksize) while chunk: for hasher in hasherobjs: hasher.update(chunk) chunk = f.read(blocksize) for infotype, hasher in hashers: result.append(Guess({infotype: hasher.hexdigest()}, confidence=1.0)) except Exception as e: log.warning('Could not compute hash because: %s' % e) result = merge_all(result) # last minute adjustments # if country is in the guessed properties, make it part of the filename if 'series' in result and 'country' in result: result['series'] += ' (%s)' % result['country'].alpha2.upper() return result
def guess_filetype(self, mtree, options=None):
    options = options or {}

    # put the filetype inside a dummy container to be able to have the
    # following functions work correctly as closures
    # this is a workaround for python 2 which doesn't have the
    # 'nonlocal' keyword which we could use here in the upgrade_* functions
    # (python 3 does have it)
    filetype_container = [mtree.guess.get('type')]
    other = {}
    filename = mtree.string

    def upgrade_episode():
        if filetype_container[0] == 'subtitle':
            filetype_container[0] = 'episodesubtitle'
        elif filetype_container[0] == 'info':
            filetype_container[0] = 'episodeinfo'
        elif (not filetype_container[0] or
              filetype_container[0] == 'video'):
            filetype_container[0] = 'episode'

    def upgrade_movie():
        if filetype_container[0] == 'subtitle':
            filetype_container[0] = 'moviesubtitle'
        elif filetype_container[0] == 'info':
            filetype_container[0] = 'movieinfo'
        elif (not filetype_container[0] or
              filetype_container[0] == 'video'):
            filetype_container[0] = 'movie'

    def upgrade_subtitle():
        if filetype_container[0] == 'movie':
            filetype_container[0] = 'moviesubtitle'
        elif filetype_container[0] == 'episode':
            filetype_container[0] = 'episodesubtitle'
        elif not filetype_container[0]:
            filetype_container[0] = 'subtitle'

    def upgrade_info():
        if filetype_container[0] == 'movie':
            filetype_container[0] = 'movieinfo'
        elif filetype_container[0] == 'episode':
            filetype_container[0] = 'episodeinfo'
        elif not filetype_container[0]:
            filetype_container[0] = 'info'

    # look at the extension first
    fileext = os.path.splitext(filename)[1][1:].lower()
    if fileext in subtitle_exts:
        upgrade_subtitle()
        other = {'container': fileext}
    elif fileext in info_exts:
        upgrade_info()
        other = {'container': fileext}
    elif fileext in video_exts:
        other = {'container': fileext}
    else:
        if fileext and not options.get('name_only'):
            other = {'extension': fileext}
            list(mtree.unidentified_leaves())[-1].guess = Guess(other)

    # check whether we are in a 'Movies', 'Tv Shows', ... folder
    folder_rexps = [(r'Movies?', upgrade_movie),
                    (r'Films?', upgrade_movie),
                    (r'Tv[ _-]?Shows?', upgrade_episode),
                    (r'Series?', upgrade_episode),
                    (r'Episodes?', upgrade_episode),
                    ]
    for frexp, upgrade_func in folder_rexps:
        frexp = re.compile(frexp, re.IGNORECASE)
        for pathgroup in mtree.children:
            if frexp.match(pathgroup.value):
                upgrade_func()
                return filetype_container[0], other

    # check for a few specific cases which will unintentionally make the
    # following heuristics confused (eg: OSS 117 will look like an episode,
    # season 1, epnum 17, when it is in fact a movie)
    fname = mtree.clean_string(filename).lower()
    for m in self.MOVIES:
        if m in fname:
            self.log.debug('Found in exception list of movies -> type = movie')
            upgrade_movie()
            return filetype_container[0], other
    for s in self.SERIES:
        if s in fname:
            self.log.debug('Found in exception list of series -> type = episode')
            upgrade_episode()
            return filetype_container[0], other

    # if we have an episode_rexp (eg: s02e13), it is an episode
    episode_transformer = get_transformer('guess_episodes_rexps')
    if episode_transformer:
        filename_parts = list(x.value for x in mtree.unidentified_leaves())
        filename_parts.append(filename)
        for filename_part in filename_parts:
            guess = episode_transformer.guess_episodes_rexps(filename_part)
            if guess:
                self.log.debug('Found guess_episodes_rexps: %s -> type = episode', guess)
                upgrade_episode()
                return filetype_container[0], other

    properties_transformer = get_transformer('guess_properties')
    if properties_transformer:
        # if we have certain properties characteristic of episodes, it is an ep
        found = properties_transformer.container.find_properties(filename, mtree, options, 'episodeFormat')
        guess = properties_transformer.container.as_guess(found, filename)
        if guess:
            self.log.debug('Found characteristic property of episodes: %s"', guess)
            upgrade_episode()
            return filetype_container[0], other

        weak_episode_transformer = get_transformer('guess_weak_episodes_rexps')
        if weak_episode_transformer:
            found = properties_transformer.container.find_properties(filename, mtree, options, 'crc32')
            guess = properties_transformer.container.as_guess(found, filename)
            if guess:
                found = weak_episode_transformer.container.find_properties(filename, mtree, options)
                guess = weak_episode_transformer.container.as_guess(found, filename)
                if guess:
                    self.log.debug('Found characteristic property of episodes: %s"', guess)
                    upgrade_episode()
                    return filetype_container[0], other

        found = properties_transformer.container.find_properties(filename, mtree, options, 'format')
        guess = properties_transformer.container.as_guess(found, filename)
        if guess and guess['format'] in ('HDTV', 'WEBRip', 'WEB-DL', 'DVB'):
            # Use weak episodes only if TV or WEB source
            weak_episode_transformer = get_transformer('guess_weak_episodes_rexps')
            if weak_episode_transformer:
                guess = weak_episode_transformer.guess_weak_episodes_rexps(filename)
                if guess:
                    self.log.debug('Found guess_weak_episodes_rexps: %s -> type = episode', guess)
                    upgrade_episode()
                    return filetype_container[0], other

    website_transformer = get_transformer('guess_website')
    if website_transformer:
        found = website_transformer.container.find_properties(filename, mtree, options, 'website')
        guess = website_transformer.container.as_guess(found, filename)
        if guess:
            for namepart in ('tv', 'serie', 'episode'):
                if namepart in guess['website']:
                    # origin-specific type
                    self.log.debug('Found characteristic property of episodes: %s', guess)
                    upgrade_episode()
                    return filetype_container[0], other

    if filetype_container[0] in ('subtitle', 'info') or (not filetype_container[0] and fileext in video_exts):
        # if no episode info found, assume it's a movie
        self.log.debug('Nothing characteristic found, assuming type = movie')
        upgrade_movie()

    if not filetype_container[0]:
        self.log.debug('Nothing characteristic found, assuming type = unknown')
        filetype_container[0] = 'unknown'

    return filetype_container[0], other
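# guess_filetype() keeps the filetype in a one-element list so the nested upgrade_*
# closures can rebind it: Python 2 has no 'nonlocal' statement, but mutating a
# container captured by a closure works on both Python 2 and 3. A minimal standalone
# demonstration of that workaround:
def _demo_counter():
    count = [0]            # dummy container, same role as filetype_container above

    def increment():
        count[0] += 1      # mutate the list; rebinding 'count' itself would fail on py2

    increment()
    increment()
    return count[0]        # -> 2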
def guess_file_info(filename, info=None, options=None, **kwargs):
    """info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.

    >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt')
    >>> g = guess_file_info(testfile, info = ['hash_md5', 'hash_sha1'])
    >>> g['hash_md5'], g['hash_sha1']
    ('64de6b5893cac24456c46a935ef9c359', 'a703fc0fa4518080505809bf562c6fc6f7b3c98c')
    """
    info = info or 'filename'
    options = options or {}

    if isinstance(options, base_text_type):
        args = shlex.split(options)
        options = vars(get_opts().parse_args(args))

    if default_options:
        if isinstance(default_options, base_text_type):
            default_args = shlex.split(default_options)
            merged_options = vars(get_opts().parse_args(default_args))
        else:
            merged_options = deepcopy(default_options)
        merged_options.update(options)
        options = merged_options

    result = []
    hashers = []

    # Force unicode as soon as possible
    filename = u(filename)

    if isinstance(info, base_text_type):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            result.append(_guess_filename(filename, options, **kwargs))

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file
            try:
                result.append(Guess({infotype: hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute MPC-style hash because: %s' % e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file
            try:
                result.append(Guess({infotype: hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute ed2k hash because: %s' % e)

        elif infotype.startswith('hash_'):
            import hashlib
            hashname = infotype[5:]
            try:
                hasher = getattr(hashlib, hashname)()
                hashers.append((infotype, hasher))
            except AttributeError:
                log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname)

        elif infotype == 'video':
            g = guess_video_metadata(filename)
            if g:
                result.append(g)

        else:
            log.warning('Invalid infotype: %s' % infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192
            hasherobjs = dict(hashers).values()

            with open(filename, 'rb') as f:
                chunk = f.read(blocksize)
                while chunk:
                    for hasher in hasherobjs:
                        hasher.update(chunk)
                    chunk = f.read(blocksize)

            for infotype, hasher in hashers:
                result.append(Guess({infotype: hasher.hexdigest()},
                                    confidence=1.0))
        except Exception as e:
            log.warning('Could not compute hash because: %s' % e)

    result = smart_merge(result)

    return result
            log.warning('Invalid infotype: %s' % infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192
            hasherobjs = dict(hashers).values()

            with open(filename, 'rb') as f:
                for chunk in iter(lambda: f.read(blocksize), ''):
                    for hasher in hasherobjs:
                        hasher.update(chunk)

            for infotype, hasher in hashers:
                result.append(Guess({infotype: hasher.hexdigest()},
                                    confidence=1.0))
        except Exception, e:
            log.warning('Could not compute hash because: %s' % e)

    return merge_all(result)


def guess_video_info(filename, info=None):
    return guess_file_info(filename, 'autodetect', info)


def guess_movie_info(filename, info=None):
    return guess_file_info(filename, 'movie', info)


def guess_episode_info(filename, info=None):
    return guess_file_info(filename, 'episode', info)
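# The hashing loop above uses iter() with a sentinel to read the file in fixed-size
# chunks on a single pass. A self-contained version of that idiom (note that with a
# file opened in 'rb' mode, Python 3 requires b'' as the sentinel, whereas the ''
# used above only terminates correctly on Python 2):
import hashlib

def _demo_hash_file(path, blocksize=8192):
    hasher = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(blocksize), b''):
            hasher.update(chunk)
    return hasher.hexdigest()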