Beispiel #1
0
    def process(self, mtree, options=None):
        """Guess the file type now (will be useful later).

        Stores the guessed ``type`` on the root guess of ``mtree``, then
        attaches the remaining file type information (plus the mimetype, when
        ``mimetypes.guess_type`` finds one for ``mtree.string``) to the last
        node of category ``path``.

        :param mtree: match tree to annotate.
        :param options: optional mapping of options; only ``name_only`` is
            read in this method. ``None`` is handled like "no option set".
        :raises TransformerException: if the type ends up ``None`` or
            ``'unknown'`` and ``name_only`` is not set.
        :raises IndexError: if the tree has no node of category ``path``.
        """
        filetype, other = self.guess_filetype(mtree, options)

        mtree.guess.set('type', filetype, confidence=1.0)
        log_found_guess(mtree.guess)

        filetype_info = Guess(other, confidence=1.0)
        # guess the mimetype of the filename
        # TODO: handle other mimetypes not found on the default type_maps
        # mimetypes.types_map['.srt']='text/subtitle'
        mime, _ = mimetypes.guess_type(mtree.string, strict=False)
        if mime is not None:
            filetype_info.update({'mimetype': mime}, confidence=1.0)

        # Retrieve the last node of category path (extension node)
        path_nodes = [n for n in mtree.nodes() if n.category == 'path']
        node_ext = path_nodes[-1]
        found_guess(node_ext, filetype_info)

        if mtree.guess.get('type') in [None, 'unknown']:
            # `options` defaults to None: guard before calling .get() so an
            # unknown type reports TransformerException, not AttributeError.
            if options and options.get('name_only'):
                mtree.guess.set('type', 'movie', confidence=0.6)
            else:
                raise TransformerException(__name__, 'Unknown file type')
Beispiel #2
0
    def process(self, mtree, options=None):
        """Guess the file type now (will be useful later).

        The guessed ``type`` is written to ``mtree.guess``; the other file
        type properties, together with the mimetype reported by
        ``mimetypes.guess_type`` (if any), are recorded on the last node of
        category ``path`` through ``found_guess``.

        :param mtree: match tree being processed.
        :param options: optional options mapping (``name_only`` is the only
            key read here); may be ``None``.
        :raises TransformerException: when no usable type was guessed and
            ``name_only`` is not enabled.
        :raises IndexError: when there is no node of category ``path``.
        """
        filetype, other = self.guess_filetype(mtree, options)

        mtree.guess.set('type', filetype, confidence=1.0)
        log_found_guess(mtree.guess)

        filetype_info = Guess(other, confidence=1.0)
        # guess the mimetype of the filename
        # TODO: handle other mimetypes not found on the default type_maps
        # mimetypes.types_map['.srt']='text/subtitle'
        mime, _ = mimetypes.guess_type(mtree.string, strict=False)
        if mime is not None:
            filetype_info.update({'mimetype': mime}, confidence=1.0)

        # Retrieve the last node of category path (extension node)
        node_ext = [node for node in mtree.nodes()
                    if node.category == 'path'][-1]
        found_guess(node_ext, filetype_info)

        if mtree.guess.get('type') not in [None, 'unknown']:
            return

        # The default for `options` is None, so it must not be dereferenced
        # blindly; a missing mapping simply means `name_only` is off.
        name_only = options.get('name_only') if options else None
        if name_only:
            mtree.guess.set('type', 'movie', confidence=0.6)
        else:
            raise TransformerException(__name__, 'Unknown file type')
Beispiel #3
0
def build_guess(node, name, value=None, confidence=1.0):
    """Build a single-property Guess for ``node``.

    When ``value`` is None the property value is ``node.clean_value``, the
    metadata span is ``node.span`` and the metadata input is ``node.value``.
    Otherwise ``value`` is used both as property value and as metadata
    input, and no span is set here.
    """
    from_node = value is None
    if from_node:
        prop_value = node.clean_value
    else:
        prop_value = value

    result = Guess({name: prop_value}, confidence=confidence)

    if from_node:
        result.metadata().span = node.span
        result.metadata().input = node.value
    else:
        result.metadata().input = value
    return result
Beispiel #4
0
 def __init__(self, string='', span=None, parent=None, clean_function=None):
     """Set up a node over ``string``.

     A falsy ``span`` is replaced by the span of the whole string, and a
     falsy ``clean_function`` by ``clean_default``. The node starts with no
     children, an empty Guess and no cached clean value.
     """
     self.string = string
     if span:
         self.span = span
     else:
         self.span = (0, len(string))
     self.parent = parent
     self.children = []
     self.guess = Guess()
     self._clean_value = None
     if clean_function:
         self._clean_function = clean_function
     else:
         self._clean_function = clean_default
Beispiel #5
0
    def guess_release_group(self, string, node=None, options=None):
        """Try to find a release group in ``string`` for the given ``node``.

        A candidate found by the properties container is only kept when it is
        validated by a leaf holding one of ``self.previous_safe_properties``:
        either in the node's own group, or, failing that, in the previous
        group (in which case the raw node value becomes the release group).
        A validated guess is then extended with following groups that consist
        of a single unidentified leaf, and brackets are stripped.

        ``options`` is not used by this implementation.

        Returns the validated Guess, or None.
        """
        candidates = self.container.find_properties(string, node,
                                                    'releaseGroup')
        guess = self.container.as_guess(candidates,
                                        string,
                                        self.validate_group_name,
                                        sep_replacement='-')
        if not guess:
            return None

        validated_guess = None

        # First chance: a safe property earlier in the same group.
        own_group = node.group_node()
        if own_group:
            safe_leaves = own_group.leaves_containing(
                self.previous_safe_properties)
            for leaf in safe_leaves:
                if not self.is_leaf_previous(leaf, node):
                    continue
                # a dash right after the safe property gives full confidence
                dash_follows = leaf.root.value[leaf.span[1]] == '-'
                guess.metadata().confidence = 1 if dash_follows else 0.7
                validated_guess = guess

        if not validated_guess:
            # If previous group last leaf is identified as a safe property,
            # consider the raw value as a releaseGroup
            earlier_group = node.previous_group_node()
            if earlier_group:
                safe_leaves = earlier_group.leaves_containing(
                    self.previous_safe_properties)
                for leaf in safe_leaves:
                    if not self.is_leaf_previous(leaf, node):
                        continue
                    guess = Guess({'releaseGroup': node.value},
                                  confidence=1,
                                  input=node.value,
                                  span=(0, len(node.value)))
                    if self.validate_group_name(guess):
                        node.guess = guess
                        validated_guess = guess

        if validated_guess:
            # If following group nodes have only one unidentified leaf, it
            # belongs to the release group
            following = node.next_group_node()
            while following:
                leaves = list(following.leaves())
                if len(leaves) != 1 or leaves[0].guess:
                    break
                only_leaf = leaves[0]
                extended = validated_guess['releaseGroup'] + only_leaf.value
                validated_guess['releaseGroup'] = extended
                only_leaf.guess = validated_guess
                following = following.next_group_node()

        if validated_guess:
            # Strip brackets
            stripped = strip_brackets(validated_guess['releaseGroup'])
            validated_guess['releaseGroup'] = stripped

        return validated_guess
Beispiel #6
0
def build_guess(node, name, value=None, confidence=1.0):
    """Build a single-property Guess for ``node``.

    :param node: node the guess is built for.
    :param name: name of the property to set.
    :param value: explicit property value; when None, ``node.clean_value``
        is used and a span is computed.
    :param confidence: confidence passed to the Guess.
    :returns: the new Guess. Its metadata input is ``value`` (or
        ``node.value`` when ``value`` is None). When ``value`` is None its
        metadata span is the node span made relative to ``node.offset`` and
        shrunk on each side by the number of characters of ``node.value``
        that precede the first occurrence of the clean value's first
        character / follow the last occurrence of its last character.
    """
    guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence)
    guess.metadata().input = node.value if value is None else value
    if value is None:
        left_offset = 0
        right_offset = 0

        clean_value = node.clean_value
        raw_value = node.value

        # An empty clean value has no first/last character to look for;
        # indexing it would raise IndexError, so keep both offsets at 0.
        if clean_value:
            first_char = clean_value[0]
            last_char = clean_value[-1]

            for char in raw_value:
                if char == first_char:
                    break
                left_offset += 1

            for char in reversed(raw_value):
                if char == last_char:
                    break
                right_offset += 1

        guess.metadata().span = (node.span[0] - node.offset + left_offset, node.span[1] - node.offset - right_offset)
    return guess
Beispiel #7
0
def found_property(node,
                   name,
                   value=None,
                   confidence=1.0,
                   update_guess=True,
                   logger=None):
    """Wrap a single property in a Guess and hand it to ``found_guess``.

    The property value is ``value``, or ``node.clean_value`` when ``value``
    is None. When no ``logger`` is given, the ``log`` attribute of the
    caller's local ``self`` is used instead (so the caller must be running
    with a ``self`` in its local scope).

    Returns whatever ``found_guess`` returns.
    """
    if not logger:
        # automatically retrieve the log object from the caller frame
        caller_record = inspect.stack()[1]
        caller_self = caller_record[0].f_locals['self']
        logger = caller_self.log

    if value is None:
        prop_value = node.clean_value
    else:
        prop_value = value

    prop_guess = Guess({name: prop_value}, confidence=confidence)
    return found_guess(node, prop_guess, update_guess=update_guess,
                       logger=logger)
Beispiel #8
0
def build_guess(node, name, value=None, confidence=1.0):
    """Build a single-property Guess for ``node``.

    With an explicit ``value`` the guess simply carries that value as both
    property value and metadata input. Without one, the property value is
    ``node.clean_value``, the metadata input is ``node.value`` and the
    metadata span is the node span, relative to ``node.offset``, trimmed by
    the characters of ``node.value`` lying outside the first occurrence of
    the clean value's first character and the last occurrence of its last
    character (no trimming when the clean value is empty).
    """
    if value is not None:
        explicit = Guess({name: value}, confidence=confidence)
        explicit.metadata().input = value
        return explicit

    guess = Guess({name: node.clean_value}, confidence=confidence)
    guess.metadata().input = node.value

    cleaned = node.clean_value
    leading = 0
    trailing = 0
    if cleaned:
        raw = node.value
        # number of characters before the first match (all of them if none)
        leading = next(
            (idx for idx, char in enumerate(raw) if char == cleaned[0]),
            len(raw))
        raw = node.value
        # same thing, counted from the end of the raw value
        trailing = next(
            (idx for idx, char in enumerate(reversed(raw))
             if char == cleaned[-1]),
            len(raw))

    span_start = node.span[0] - node.offset + leading
    span_end = node.span[1] - node.offset - trailing
    guess.metadata().span = (span_start, span_end)
    return guess
def search_language(string, lang_filter=None):
    """Search ``string`` for a language pattern.

    The first candidate yielded by ``find_possible_languages`` that passes
    the optional filter is returned as a Guess holding the language, the
    span of the matching word in ``string`` and an associated confidence;
    None is returned when there is no such candidate.

    ``lang_filter`` is an optional list of allowed languages,
    as in lang_filter = [ 'fr', 'eng', 'spanish' ]

    >>> search_language('movie [en].avi')['language']
    Language(English)

    >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])

    """
    allowed = None
    if lang_filter:
        allowed = set(
            babelfish.Language.fromguessit(code) for code in lang_filter)

    # confidence depends on alpha2, alpha3, english name, ...
    confidence_by_code_length = {2: 0.8, 3: 0.9}

    for prop, language, lang, word in find_possible_languages(string):
        if allowed and language not in allowed:
            continue

        # only allow those languages that have a 2-letter code, those that
        # don't are too esoteric and probably false matches
        #if language.lang not in lng3_to_lng2:
        #    continue

        score = confidence_by_code_length.get(len(lang))
        if score is None:
            if prop == 'subtitleLanguage':
                # Subtitle prefix found with language
                score = 0.6
            else:
                # Note: we could either be really confident that we found a
                #       language or assume that full language names are too
                #       common words and lower their confidence accordingly
                score = 0.3  # going with the low-confidence route here

        start = string.find(word)
        stop = start + len(word)
        return Guess({prop: language},
                     confidence=score,
                     input=string,
                     span=(start, stop))

    return None
Beispiel #10
0
    def process(self, mtree, options=None):
        """Guess the file type now (will be useful later).

        Writes the guessed ``type`` to ``mtree.guess`` and records the other
        file type properties, along with the mimetype returned by
        ``mimetypes.guess_type`` for ``mtree.string`` (when there is one), on
        the node obtained from ``mtree.node_at((-1,))``.

        :param mtree: match tree to annotate.
        :param options: optional mapping of options; ``name_only`` is the
            only key read in this method. ``None`` means no option is set.
        :raises TransformerException: if the type is ``None`` or
            ``"unknown"`` in the end and ``name_only`` is not set.
        """
        filetype, other = self.guess_filetype(mtree, options)

        mtree.guess.set("type", filetype, confidence=1.0)
        log_found_guess(mtree.guess)

        filetype_info = Guess(other, confidence=1.0)
        # guess the mimetype of the filename
        # TODO: handle other mimetypes not found on the default type_maps
        # mimetypes.types_map['.srt']='text/subtitle'
        mime, _ = mimetypes.guess_type(mtree.string, strict=False)
        if mime is not None:
            filetype_info.update({"mimetype": mime}, confidence=1.0)

        node_ext = mtree.node_at((-1,))
        found_guess(node_ext, filetype_info)

        if mtree.guess.get("type") in [None, "unknown"]:
            # `options` may be None (its default): check it before .get() so
            # that the failure below is the documented TransformerException.
            if options and options.get("name_only"):
                mtree.guess.set("type", "movie", confidence=0.6)
            else:
                raise TransformerException(__name__, "Unknown file type")
Beispiel #11
0
def guess_file_info(filename, filetype, info=None):
    """info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.

    ``info`` may be a single plugin name or a list of names and defaults to
    ``['filename']``. A failure while computing the MPC or ed2k hash is
    logged as a warning and that plugin is skipped.

    >>> guess_file_info('test/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1'])
    {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'}
    """
    result = []
    # NOTE(review): `hashers` is not referenced by the branches visible
    # here - confirm where it is consumed before removing it.
    hashers = []

    if info is None:
        info = ['filename']

    # accept a bare plugin name as well as a list of names
    if isinstance(info, basestring):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            m = IterativeMatcher(filename, filetype=filetype)
            result.append(m.matched())

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file
            try:
                result.append(
                    Guess({'hash_mpc': hash_file(filename)}, confidence=1.0))
            # `except X as e` is accepted by Python 2.6+ and Python 3 alike,
            # unlike the Python 2-only `except X, e` spelling.
            except Exception as e:
                log.warning('Could not compute MPC-style hash because: %s' % e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file
            try:
                result.append(
                    Guess({'hash_ed2k': hash_file(filename)}, confidence=1.0))
            except Exception as e:
                log.warning('Could not compute ed2k hash because: %s' % e)
Beispiel #12
0
    def process_node(self, node, iterative=True, partial_span=None):
        """Run ``self.guess_func`` on ``node`` and record what it finds.

        The node value (or only its ``partial_span`` slice) is padded with
        one sentinel space on each side before being given to the guess
        function, which may return either a Guess or a ``(result, span)``
        pair. The resulting span is converted back to node-value coordinates.

        If the span is exactly that of one of the ``skip_nodes`` from
        ``self.options``, nothing is recorded: the other partitions of the
        node are processed instead. Otherwise the guess is attached to the
        node itself (``iterative`` false) or the node is partitioned around
        the span, the guess attached to the matching child, and the
        remaining children processed recursively.
        """
        if partial_span:
            value = node.value[partial_span[0]:partial_span[1]]
        else:
            value = node.value
        string = ' %s ' % value  # add sentinels

        matcher_result = self.guess_func(string, node, self.options)
        if not matcher_result:
            return

        if isinstance(matcher_result, Guess):
            result = matcher_result
            span = matcher_result.metadata().span
            #log.error('span2 %s' % (span,))
        else:
            result, span = matcher_result

        if not result:
            return

        start, end = span[0], span[1]
        if end == len(string):
            # somehow, the sentinel got included in the span. Remove it
            end -= 1

        # readjust span to compensate for sentinels
        start -= 1
        end -= 1

        # readjust span to compensate for partial_span
        if partial_span:
            start += partial_span[0]
            end += partial_span[0]
        span = (start, end)

        # if we guessed a node that we need to skip, recurse down the tree
        # and ignore that node
        for skip_node in self.options.get('skip_nodes') or ():
            skipped_relative = (skip_node.span[0] - node.offset,
                                skip_node.span[1] - node.offset)
            if skipped_relative != span:
                continue
            # collect everything first: the recursive calls may alter node
            remaining_spans = [
                candidate for candidate in node.get_partition_spans(span)
                if candidate != skip_node.span
            ]
            for remaining in remaining_spans:
                self.process_node(node,
                                  partial_span=(remaining[0] - node.offset,
                                                remaining[1] - node.offset))
            return

        if isinstance(result, Guess):
            guess = result
        else:
            guess = Guess(result,
                          confidence=self.confidence,
                          input=string,
                          span=span)

        if not iterative:
            found_guess(node, guess, logger=self.logger)
            return

        absolute_span = (span[0] + node.offset, span[1] + node.offset)
        node.partition(span)

        if node.is_leaf():
            # FIXME: this seems like it is dead code...
            found_guess(node, guess, logger=self.logger)
            return

        # if we have a match on one of our children, mark it as such...
        found_child = next(
            (child for child in node.children
             if child.span == absolute_span),
            None)
        if found_child is not None:
            found_guess(found_child, guess, logger=self.logger)

        # ...and only then recurse on the other children
        for child in node.children:
            if child is not found_child:
                self.process_node(child)
    def guess_release_group(self, string, node=None, options=None):
        """Try to find a release group in ``string`` for the given ``node``.

        Groups listed under ``expected_group`` in ``options`` are tried
        first (entries prefixed with ``re:`` are taken as patterns, others
        are escaped); a match there is returned right away.

        Otherwise a candidate found by ``self.container`` is kept when a
        leaf holding one of ``self.previous_safe_properties`` validates it in
        the node's own group or, failing that, in the previous group (the
        raw node value then becomes the release group). A validated guess is
        extended with following groups made of a single unidentified leaf.
        As a last resort, an explicit node that is first in its group yields
        a low-confidence guess from its value without the first and last
        characters. Brackets are stripped from the final value.

        Returns the Guess, or None.
        """
        expected_groups = options.get('expected_group') if options else None
        if expected_groups:
            expected_container = PropertiesContainer(
                enhance=True, canonical_from_pattern=False)
            for expected in expected_groups:
                is_pattern = expected.startswith('re:')
                if is_pattern:
                    pattern = expected[3:].replace(' ', '-')
                else:
                    pattern = re.escape(expected)
                expected_container.register_property('releaseGroup',
                                                     pattern,
                                                     enhance=is_pattern)

            expected_found = expected_container.find_properties(
                string, node, options, 'releaseGroup')
            expected_guess = expected_container.as_guess(
                expected_found, string, self.validate_group_name)
            if expected_guess:
                return expected_guess

        candidates = self.container.find_properties(string, node, options,
                                                    'releaseGroup')
        guess = self.container.as_guess(candidates, string,
                                        self.validate_group_name)
        if not guess:
            return None

        validated_guess = None

        # First chance: a safe property in the node's own group.
        own_group = node.group_node()
        if own_group:
            safe_leaves = own_group.leaves_containing(
                self.previous_safe_properties)
            for leaf in safe_leaves:
                if not self.validate_node(leaf, node, True):
                    continue
                # a dash right after the safe property gives full confidence
                dash_follows = leaf.root.value[leaf.span[1]] == '-'
                guess.metadata().confidence = 1 if dash_follows else 0.7
                validated_guess = guess

        if not validated_guess:
            # If previous group last leaf is identified as a safe property,
            # consider the raw value as a releaseGroup
            earlier_group = node.previous_group_node()
            if earlier_group:
                safe_leaves = earlier_group.leaves_containing(
                    self.previous_safe_properties)
                for leaf in safe_leaves:
                    if not self.validate_node(leaf, node, False):
                        continue
                    guess = Guess({'releaseGroup': node.value},
                                  confidence=1,
                                  input=node.value,
                                  span=(0, len(node.value)))
                    if self.validate_group_name(guess):
                        node.guess = guess
                        validated_guess = guess

        if validated_guess:
            # If following group nodes have only one unidentified leaf, it
            # belongs to the release group
            following = node.next_group_node()
            while following:
                leaves = list(following.leaves())
                if len(leaves) != 1 or leaves[0].guess:
                    break
                only_leaf = leaves[0]
                extended = validated_guess['releaseGroup'] + only_leaf.value
                validated_guess['releaseGroup'] = extended
                only_leaf.guess = validated_guess
                following = following.next_group_node()

        first_in_explicit_group = (not validated_guess and
                                   node.is_explicit() and
                                   node.node_last_idx == 0)
        if first_in_explicit_group:  # first node from group
            validated_guess = build_guess(
                node,
                'releaseGroup',
                value=node.value[1:len(node.value) - 1])
            validated_guess.metadata().confidence = 0.4
            validated_guess.metadata().span = (1, len(node.value))
            node.guess = validated_guess

        if validated_guess:
            # Strip brackets
            stripped = strip_brackets(validated_guess['releaseGroup'])
            validated_guess['releaseGroup'] = stripped

        return validated_guess
Beispiel #14
0
 def __init__(self, string='', span=None, parent=None):
     """Create a node for ``string``.

     ``span`` falls back to ``(0, len(string))`` when it is falsy. The new
     node has the given ``parent``, no children and an empty Guess.
     """
     whole_string = (0, len(string))
     self.string = string
     self.span = span if span else whole_string
     self.parent = parent
     self.children = []
     self.guess = Guess()
Beispiel #15
0
    def process_node(self, node, iterative=True, partial_span=None):
        """Run ``self.guess_func`` on ``node`` and record what it finds.

        The node value (or only its ``partial_span`` slice) is wrapped in one
        sentinel space on each side before being passed to the guess
        function, which is called without options when ``self.options`` is
        falsy. It may return a Guess or a ``(result, span)`` pair.

        When the matched span corresponds to one of the ``skip_nodes`` in
        ``self.options``, the node is re-processed on the spans computed by
        ``_get_split_spans`` instead of recording the match. Otherwise the
        guess is attached to the node (``iterative`` false), or the node is
        partitioned around the span, the guess attached to the child with
        that span, and the other children processed recursively.

        Returns None.
        """
        if partial_span:
            value = node.value[partial_span[0]:partial_span[1]]
        else:
            value = node.value
        string = ' %s ' % value  # add sentinels

        if not self.options:
            matcher_result = self.guess_func(string, node)
        else:
            matcher_result = self.guess_func(string, node, self.options)

        if matcher_result:
            # a Guess carries its own span in its metadata; anything else is
            # unpacked as a (result, span) pair
            if not isinstance(matcher_result, Guess):
                result, span = matcher_result
            else:
                result, span = matcher_result, matcher_result.metadata().span

            if result:
                # readjust span to compensate for sentinels
                span = (span[0] - 1, span[1] - 1)

                # readjust span to compensate for partial_span
                if partial_span:
                    span = (span[0] + partial_span[0], span[1] + partial_span[0])

                partition_spans = None
                if self.options and 'skip_nodes' in self.options:
                    skip_nodes = self.options.get('skip_nodes')
                    for skip_node in skip_nodes:
                        # NOTE(review): `and` binds tighter than `or`, so this
                        # reads (parent-prefix match and same span) or (span
                        # shifted by skip_node.offset matches) - confirm the
                        # parent check is not meant to apply to both.
                        if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\
                            skip_node.span == span or\
                                skip_node.span == (span[0] + skip_node.offset, span[1] + skip_node.offset):
                            if partition_spans is None:
                                partition_spans = _get_split_spans(node, skip_node.span)
                            else:
                                # split each span collected so far again,
                                # using a temporary MatchTree over that span
                                new_partition_spans = []
                                for partition_span in partition_spans:
                                    tmp_node = MatchTree(value, span=partition_span, parent=node)
                                    tmp_partitions_spans = _get_split_spans(tmp_node, skip_node.span)
                                    new_partition_spans.extend(tmp_partitions_spans)
                                # NOTE(review): the re-split spans are added
                                # to the existing ones rather than replacing
                                # them - confirm this is intended.
                                partition_spans.extend(new_partition_spans)

                if not partition_spans:
                    # no skip node involved: record the guess

                    if isinstance(result, Guess):
                        guess = result
                    else:
                        # note: `input` is the sentinel-padded string
                        guess = Guess(result, confidence=self.confidence, input=string, span=span)

                    if not iterative:
                        found_guess(node, guess, logger=self.logger)
                    else:
                        # computed before partitioning the node
                        absolute_span = (span[0] + node.offset, span[1] + node.offset)
                        node.partition(span)
                        if node.is_leaf():
                            found_guess(node, guess, logger=self.logger)
                        else:
                            # mark the child covering the matched span...
                            found_child = None
                            for child in node.children:
                                if child.span == absolute_span:
                                    found_guess(child, guess, logger=self.logger)
                                    found_child = child
                                    break
                            # ...and recurse on all the other children
                            for child in node.children:
                                if child is not found_child:
                                    self.process_node(child)
                else:
                    # a skip node was hit: process the split spans instead
                    for partition_span in partition_spans:
                        self.process_node(node, partial_span=partition_span)
Beispiel #16
0
    def process_node(self, node, iterative=True, partial_span=None, skip_nodes=True):
        """Run ``self.guess_func`` on ``node`` and record what it finds.

        The node value (or only its ``partial_span`` slice) is wrapped in one
        sentinel space on each side before being passed to the guess
        function, which may return a Guess or a ``(result, span)`` pair.

        ``skip_nodes`` may be a list, or a flag: a truthy non-list value
        means "take the list from ``self.options``", a falsy non-list value
        means "no skip nodes". When skip node boundaries fall inside the
        node, the node is re-processed on the partitions delimited by those
        boundaries (skipped spans excluded) and nothing else is recorded.

        Otherwise the guess is attached to the node (``iterative`` false),
        or the node is partitioned around the span, the guess attached to
        the child with that span, and the other children processed
        recursively.

        Returns None.
        """
        # normalise `skip_nodes` to a list (or whatever the options hold)
        if skip_nodes and not isinstance(skip_nodes, list):
            skip_nodes = self.options.get('skip_nodes')
        elif not isinstance(skip_nodes, list):
            skip_nodes = []

        if partial_span:
            value = node.value[partial_span[0]:partial_span[1]]
        else:
            value = node.value
        string = ' %s ' % value  # add sentinels

        matcher_result = self.guess_func(string, node, self.options)
        if not matcher_result:
            return

        # a Guess carries its own span in its metadata; anything else is
        # unpacked as a (result, span) pair
        if not isinstance(matcher_result, Guess):
            result, span = matcher_result
        else:
            result, span = matcher_result, matcher_result.metadata().span
            #log.error('span2 %s' % (span,))

        if not result:
            return

        if span[1] == len(string):
            # somehow, the sentinel got included in the span. Remove it
            span = (span[0], span[1] - 1)

        # readjust span to compensate for sentinels
        span = (span[0] - 1, span[1] - 1)

        # readjust span to compensate for partial_span
        if partial_span:
            span = (span[0] + partial_span[0], span[1] + partial_span[0])

        if skip_nodes:
            # NOTE(review): the list is rebuilt from self.options here, so a
            # list passed in by the recursive call below only matters through
            # its truthiness - confirm this is intended.
            # Only skip nodes whose parent span shares its start or its end
            # with this node's span are kept.
            skip_nodes = [skip_node for skip_node in self.options.get('skip_nodes') if skip_node.parent.span[0] == node.span[0] or skip_node.parent.span[1] == node.span[1]]
            # if we guessed a node that we need to skip, recurse down the tree and ignore that node
            indices = set()  # skip node boundaries, relative to node.offset
            skip_nodes_spans = []
            next_skip_nodes = []  # skip nodes with no boundary in this node
            for skip_node in skip_nodes:
                skip_for_next = False
                skip_nodes_spans.append(skip_node.span)
                if node.offset <= skip_node.span[0] <= node.span[1]:
                    indices.add(skip_node.span[0] - node.offset)
                    skip_for_next = True
                if node.offset <= skip_node.span[1] <= node.span[1]:
                    indices.add(skip_node.span[1] - node.offset)
                    skip_for_next = True
                if not skip_for_next:
                    next_skip_nodes.append(skip_node)
            if indices:
                # process every partition except the skipped spans, then stop
                partition_spans = [s for s in node.get_partition_spans(indices) if s not in skip_nodes_spans]
                for partition_span in partition_spans:
                    relative_span = (partition_span[0] - node.offset, partition_span[1] - node.offset)
                    self.process_node(node, partial_span=relative_span, skip_nodes=next_skip_nodes)
                return

        # build the guess; its input is the string without the sentinels
        if isinstance(result, Guess):
            guess = result
        else:
            no_sentinel_string =string[1:-1]
            guess = Guess(result, confidence=self.confidence, input=no_sentinel_string, span=span)

        if not iterative:
            found_guess(node, guess, logger=self.logger)
        else:
            # computed before partitioning the node
            absolute_span = (span[0] + node.offset, span[1] + node.offset)
            node.partition(span)
            found_child = None

            for child in node.children:
                if child.span == absolute_span:
                    # if we have a match on one of our children, mark it as such...
                    found_guess(child, guess, logger=self.logger)
                    found_child = child
                    break

            # ...and only then recurse on the other children
            for child in node.children:
                if child is not found_child:
                    self.process_node(child)
Beispiel #17
0
def guess_video_metadata(filename):
    """Gets the video metadata properties out of a given file. The file needs to
    exist on the filesystem to be able to be analyzed. An empty guess is
    returned otherwise.

    You need to have the Enzyme python package installed for this to work.

    :param filename: path of the video file to analyze.
    :returns: a Guess holding ``fileSize`` and, when the file could be
        parsed as MKV, ``duration`` plus whichever of ``screenSize``,
        ``videoCodec``, ``audioCodec`` and ``subtitleLanguage`` could be
        determined. Errors are logged and whatever was collected so far is
        returned; nothing is raised for a missing file, a missing enzyme
        package or an enzyme parsing error.
    """
    result = Guess()

    def found(prop, value):
        result[prop] = value
        log.debug('Found with enzyme %s: %s' % (prop, value))

    # first get the size of the file, in bytes
    try:
        size = os.stat(filename).st_size
        found('fileSize', size)

    except Exception as e:
        log.error('Cannot get video file size: %s' % e)
        # file probably does not exist, we might as well return now
        return result

    # then get additional metadata from the file using enzyme, if available
    try:
        import enzyme

        # MKV is a binary container: open in binary mode so that the parser
        # receives raw bytes (text mode decodes/translates the content).
        with open(filename, 'rb') as f:
            mkv = enzyme.MKV(f)

            found('duration', mkv.info.duration.total_seconds())

            if mkv.video_tracks:
                # only the first video track is looked at
                video_track = mkv.video_tracks[0]

                # resolution
                if video_track.height in (480, 720, 1080):
                    if video_track.interlaced:
                        found('screenSize', '%di' % video_track.height)
                    else:
                        found('screenSize', '%dp' % video_track.height)
                else:
                    # TODO: do we want this?
                    #found('screenSize', '%dx%d' % (video_track.width, video_track.height))
                    pass

                # video codec
                if video_track.codec_id == 'V_MPEG4/ISO/AVC':
                    found('videoCodec', 'h264')
                elif video_track.codec_id == 'V_MPEG4/ISO/SP':
                    found('videoCodec', 'DivX')
                elif video_track.codec_id == 'V_MPEG4/ISO/ASP':
                    found('videoCodec', 'XviD')

            else:
                log.warning('MKV has no video track')

            if mkv.audio_tracks:
                # only the first audio track is looked at
                audio_track = mkv.audio_tracks[0]
                # audio codec
                if audio_track.codec_id == 'A_AC3':
                    found('audioCodec', 'AC3')
                elif audio_track.codec_id == 'A_DTS':
                    found('audioCodec', 'DTS')
                elif audio_track.codec_id == 'A_AAC':
                    found('audioCodec', 'AAC')
            else:
                log.warning('MKV has no audio track')

            if mkv.subtitle_tracks:
                embedded_subtitle_languages = set()
                for st in mkv.subtitle_tracks:
                    # prefer the language code, then the track name; fall
                    # back to 'und' when neither is usable
                    try:
                        if st.language:
                            lang = babelfish.Language.fromalpha3b(st.language)
                        elif st.name:
                            lang = babelfish.Language.fromname(st.name)
                        else:
                            lang = babelfish.Language('und')

                    except babelfish.Error:
                        lang = babelfish.Language('und')

                    embedded_subtitle_languages.add(lang)

                found('subtitleLanguage', embedded_subtitle_languages)
            else:
                log.debug('MKV has no subtitle track')

        return result

    except ImportError:
        log.error('Cannot get video file metadata, missing dependency: enzyme')
        log.error(
            'Please install it from PyPI, by doing eg: pip install enzyme')
        return result

    except IOError as e:
        log.error('Could not open file: %s' % filename)
        log.error(
            'Make sure it exists and is available for reading on the filesystem'
        )
        log.error('Error: %s' % e)
        return result

    except enzyme.Error as e:
        log.error('Cannot guess video file metadata')
        log.error('enzyme.Error while reading file: %s' % filename)
        log.error('Error: %s' % e)
        return result
Beispiel #18
0
    def process_node(self, node, iterative=True, partial_span=None):
        """Run ``self.guess_func`` on ``node`` and record what it finds.

        The node value (or only its ``partial_span`` slice) is wrapped in one
        sentinel space on each side before being passed to the guess
        function, which is called without options when ``self.options`` is
        falsy. It may return a Guess or a ``(result, span)`` pair.

        When the matched span corresponds to one of the ``skip_nodes`` in
        ``self.options``, the node is re-processed on its partition spans
        (minus the skipped one) instead of recording the match. Otherwise the
        node's guess is updated (``iterative`` false), or the node is
        partitioned around the span, the guess attached to the child with
        that span, and the other children processed recursively.

        :param node: node to process.
        :param iterative: whether to partition the node and recurse.
        :param partial_span: optional span, relative to the node value,
            restricting what is given to the guess function.
        :returns: None.
        """
        if partial_span:
            value = node.value[partial_span[0]:partial_span[1]]
        else:
            value = node.value
        string = ' %s ' % value  # add sentinels

        if not self.options:
            matcher_result = self.guess_func(string, node)
        else:
            matcher_result = self.guess_func(string, node, self.options)

        if not matcher_result:
            return

        if not isinstance(matcher_result, Guess):
            result, span = matcher_result
        else:
            result, span = matcher_result, matcher_result.metadata().span

        if not result:
            return

        # readjust span to compensate for sentinels
        span = (span[0] - 1, span[1] - 1)

        # readjust span to compensate for partial_span
        if partial_span:
            span = (span[0] + partial_span[0],
                    span[1] + partial_span[0])

        partition_spans = None
        if self.options and 'skip_nodes' in self.options:
            skip_nodes = self.options.get('skip_nodes')
            for skip_node in skip_nodes:
                parent_idx = skip_node.parent.node_idx
                # Parentheses spell out the precedence the expression always
                # had (`and` binds tighter than `or`).
                # NOTE(review): confirm the parent check is not meant to
                # apply to the offset-shifted comparison as well.
                same_span_under_parent = (
                    parent_idx == node.node_idx[:len(parent_idx)] and
                    skip_node.span == span)
                if same_span_under_parent or skip_node.span == (
                        span[0] + skip_node.offset,
                        span[1] + skip_node.offset):
                    partition_spans = node.get_partition_spans(
                        skip_node.span)
                    # drop the partition covering the skipped node (its end
                    # may be one past the skip node's end); removing while
                    # iterating is safe because we break right away
                    for to_remove_span in partition_spans:
                        if to_remove_span[0] == skip_node.span[
                                0] and to_remove_span[1] in [
                                    skip_node.span[1],
                                    skip_node.span[1] + 1
                                ]:
                            partition_spans.remove(to_remove_span)
                            break
                    #break

        if partition_spans:
            # a skip node was hit: process the remaining partitions instead
            for partition_span in partition_spans:
                self.process_node(node, partial_span=partition_span)
            return

        if isinstance(result, Guess):
            guess = result
        else:
            # note: `input` is the sentinel-padded string
            guess = Guess(result,
                          confidence=self.confidence,
                          input=string,
                          span=span)

        if not iterative:
            node.guess.update(guess)
            return

        # computed before partitioning the node
        absolute_span = (span[0] + node.offset,
                         span[1] + node.offset)
        node.partition(span)

        # mark the child covering the matched span...
        found_child = None
        for child in node.children:
            if child.span == absolute_span:
                found_guess(child, guess, self.logger)
                found_child = child
                break

        # ...and recurse on all the other children
        for child in node.children:
            if child is not found_child:
                self.process_node(child)
Beispiel #19
0
def build_guess(node, name, value=None, confidence=1.0):
    """Build a single-property ``Guess`` from a match tree node.

    When ``value`` is None the property takes the node's ``clean_value``,
    the guess metadata records the node's ``span`` and the metadata input is
    the node's raw ``value``. When ``value`` is given it is used both as the
    property value and as the metadata input, and no span is recorded.

    Returns the newly created guess.
    """
    if value is None:
        guess = Guess({name: node.clean_value}, confidence=confidence)
        guess.metadata().span = node.span
        guess.metadata().input = node.value
    else:
        guess = Guess({name: value}, confidence=confidence)
        guess.metadata().input = value
    return guess
Beispiel #20
0
def guess_file_info(filename, filetype, info=None):
    """Gather the requested kinds of information about a file.

    info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.
    'hash_mpc' and 'hash_ed2k' use the dedicated guessit hashers; any other
    'hash_<name>' is computed with the ``hashlib`` constructor called
    ``<name>``. A single plugin name may be passed as a plain string, and
    when info is None only 'filename' is run.

    Hashes that cannot be computed and unknown info types are logged as
    warnings and skipped rather than raised.

    Returns the individual guesses merged with ``merge_all``. If the merged
    result contains both 'series' and 'country', the upper-cased alpha2
    country code is appended to the series name in parentheses.

    >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1'])
    {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'}
    """
    result = []
    hashers = []

    # Force unicode as soon as possible
    filename = u(filename)

    if info is None:
        info = ['filename']

    if isinstance(info, base_text_type):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            result.append(_guess_filename(filename, filetype))

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file

            try:
                result.append(Guess({'hash_mpc': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                # best effort: a failed hash must not abort the other plugins
                log.warning('Could not compute MPC-style hash because: %s', e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file

            try:
                result.append(Guess({'hash_ed2k': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                # best effort: a failed hash must not abort the other plugins
                log.warning('Could not compute ed2k hash because: %s', e)

        elif infotype.startswith('hash_'):
            import hashlib

            # strip the 'hash_' prefix to get the hashlib constructor name
            hashname = infotype[5:]
            try:
                hasher = getattr(hashlib, hashname)()
                hashers.append((infotype, hasher))
            except (AttributeError, TypeError):
                # AttributeError: hashlib has no such name.
                # TypeError: the name exists but is not a zero-argument hash
                # constructor (e.g. 'hash_new' resolves to hashlib.new).
                log.warning('Could not compute %s hash because it is not available from python\'s hashlib module', hashname)

        else:
            log.warning('Invalid infotype: %s', infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192

            with open(filename, 'rb') as f:
                chunk = f.read(blocksize)
                while chunk:
                    # Feed every registered hasher. Going through a dict
                    # keyed on infotype would drop all but one hasher when
                    # the same hash is requested twice, leaving the others
                    # to report the digest of empty input.
                    for _, hasher in hashers:
                        hasher.update(chunk)
                    chunk = f.read(blocksize)

            for infotype, hasher in hashers:
                result.append(Guess({infotype: hasher.hexdigest()},
                                    confidence=1.0))
        except Exception as e:
            # best effort: unreadable files only cost us the hash guesses
            log.warning('Could not compute hash because: %s', e)

    result = merge_all(result)

    # last minute adjustments

    # if country is in the guessed properties, make it part of the filename
    if 'series' in result and 'country' in result:
        result['series'] += ' (%s)' % result['country'].alpha2.upper()

    return result
Beispiel #21
0
    def guess_filetype(self, mtree, options=None):
        """Guess the type of the file described by ``mtree``.

        The guess starts from the type already stored in ``mtree.guess`` (if
        any) and from the file extension, then runs a series of heuristics in
        order and returns as soon as one of them is conclusive:

        1. a path group whose value starts with a folder name such as
           'Movies' or 'Tv Shows';
        2. the ``self.MOVIES`` / ``self.SERIES`` exception lists;
        3. the 'guess_episodes_rexps' transformer (eg: s02e13);
        4. the 'episodeFormat' property, then a 'crc32' property combined
           with a weak episode match, then a TV/web 'format' combined with a
           weak episode match;
        5. a website name containing 'tv', 'serie' or 'episode'.

        If nothing matches, subtitle/info files and video files without a
        type are assumed to be movies; anything still untyped is 'unknown'.

        As a side effect, when the extension is not a known one and the
        'name_only' option is not set, the last unidentified leaf of
        ``mtree`` receives a guess holding the extension.

        :param mtree: match tree of the filename being analysed
        :param options: optional dict of options; only 'name_only' is read
            here, the dict is otherwise passed on to the property finders
        :return: ``(filetype, other)`` where ``other`` is a dict that is
            empty or holds either 'container' or 'extension'
        """
        options = options or {}

        # put the filetype inside a dummy container to be able to have the
        # following functions work correctly as closures
        # this is a workaround for python 2 which doesn't have the
        # 'nonlocal' keyword which we could use here in the upgrade_* functions
        # (python 3 does have it)
        filetype_container = [mtree.guess.get('type')]
        other = {}
        filename = mtree.string

        # Each upgrade_* helper refines the current type without discarding
        # what is already known (eg: 'subtitle' + episode evidence becomes
        # 'episodesubtitle'); types it does not recognise are left untouched.
        def upgrade_episode():
            if filetype_container[0] == 'subtitle':
                filetype_container[0] = 'episodesubtitle'
            elif filetype_container[0] == 'info':
                filetype_container[0] = 'episodeinfo'
            elif (not filetype_container[0]
                  or filetype_container[0] == 'video'):
                filetype_container[0] = 'episode'

        def upgrade_movie():
            if filetype_container[0] == 'subtitle':
                filetype_container[0] = 'moviesubtitle'
            elif filetype_container[0] == 'info':
                filetype_container[0] = 'movieinfo'
            elif (not filetype_container[0]
                  or filetype_container[0] == 'video'):
                filetype_container[0] = 'movie'

        def upgrade_subtitle():
            if filetype_container[0] == 'movie':
                filetype_container[0] = 'moviesubtitle'
            elif filetype_container[0] == 'episode':
                filetype_container[0] = 'episodesubtitle'
            elif not filetype_container[0]:
                filetype_container[0] = 'subtitle'

        def upgrade_info():
            if filetype_container[0] == 'movie':
                filetype_container[0] = 'movieinfo'
            elif filetype_container[0] == 'episode':
                filetype_container[0] = 'episodeinfo'
            elif not filetype_container[0]:
                filetype_container[0] = 'info'

        # look at the extension first
        fileext = os.path.splitext(filename)[1][1:].lower()
        if fileext in subtitle_exts:
            upgrade_subtitle()
            other = {'container': fileext}
        elif fileext in info_exts:
            upgrade_info()
            other = {'container': fileext}
        elif fileext in video_exts:
            other = {'container': fileext}
        else:
            if fileext and not options.get('name_only'):
                # unknown extension: record it on the last unidentified leaf
                other = {'extension': fileext}
                list(mtree.unidentified_leaves())[-1].guess = Guess(other)

        # check whether we are in a 'Movies', 'Tv Shows', ... folder
        folder_rexps = [
            (r'Movies?', upgrade_movie),
            (r'Films?', upgrade_movie),
            (r'Tv[ _-]?Shows?', upgrade_episode),
            (r'Series?', upgrade_episode),
            (r'Episodes?', upgrade_episode),
        ]
        for frexp, upgrade_func in folder_rexps:
            frexp = re.compile(frexp, re.IGNORECASE)
            for pathgroup in mtree.children:
                # match() anchors at the start of the path group value
                if frexp.match(pathgroup.value):
                    upgrade_func()
                    return filetype_container[0], other

        # check for a few specific cases which will unintentionally make the
        # following heuristics confused (eg: OSS 117 will look like an episode,
        # season 1, epnum 17, when it is in fact a movie)
        fname = mtree.clean_string(filename).lower()
        for m in self.MOVIES:
            if m in fname:
                self.log.debug(
                    'Found in exception list of movies -> type = movie')
                upgrade_movie()
                return filetype_container[0], other
        for s in self.SERIES:
            if s in fname:
                self.log.debug(
                    'Found in exception list of series -> type = episode')
                upgrade_episode()
                return filetype_container[0], other

        # if we have an episode_rexp (eg: s02e13), it is an episode
        episode_transformer = get_transformer('guess_episodes_rexps')
        if episode_transformer:
            # try each unidentified leaf first, then the whole filename
            filename_parts = list(x.value for x in mtree.unidentified_leaves())
            filename_parts.append(filename)
            for filename_part in filename_parts:
                guess = episode_transformer.guess_episodes_rexps(filename_part)
                if guess:
                    self.log.debug(
                        'Found guess_episodes_rexps: %s -> type = episode',
                        guess)
                    upgrade_episode()
                    return filetype_container[0], other

        properties_transformer = get_transformer('guess_properties')
        if properties_transformer:
            # if we have certain properties characteristic of episodes, it is an ep
            found = properties_transformer.container.find_properties(
                filename, mtree, options, 'episodeFormat')
            guess = properties_transformer.container.as_guess(found, filename)
            if guess:
                self.log.debug(
                    'Found characteristic property of episodes: %s', guess)
                upgrade_episode()
                return filetype_container[0], other

            weak_episode_transformer = get_transformer(
                'guess_weak_episodes_rexps')
            if weak_episode_transformer:
                # a crc32 property together with a weak episode match is
                # treated as an episode
                found = properties_transformer.container.find_properties(
                    filename, mtree, options, 'crc32')
                guess = properties_transformer.container.as_guess(
                    found, filename)
                if guess:
                    found = weak_episode_transformer.container.find_properties(
                        filename, mtree, options)
                    guess = weak_episode_transformer.container.as_guess(
                        found, filename)
                    if guess:
                        self.log.debug(
                            'Found characteristic property of episodes: %s',
                            guess)
                        upgrade_episode()
                        return filetype_container[0], other

            found = properties_transformer.container.find_properties(
                filename, mtree, options, 'format')
            guess = properties_transformer.container.as_guess(found, filename)
            if guess and guess['format'] in ('HDTV', 'WEBRip', 'WEB-DL',
                                             'DVB'):
                # Use weak episodes only if TV or WEB source
                weak_episode_transformer = get_transformer(
                    'guess_weak_episodes_rexps')
                if weak_episode_transformer:
                    guess = weak_episode_transformer.guess_weak_episodes_rexps(
                        filename)
                    if guess:
                        self.log.debug(
                            'Found guess_weak_episodes_rexps: %s -> type = episode',
                            guess)
                        upgrade_episode()
                        return filetype_container[0], other

        website_transformer = get_transformer('guess_website')
        if website_transformer:
            found = website_transformer.container.find_properties(
                filename, mtree, options, 'website')
            guess = website_transformer.container.as_guess(found, filename)
            if guess:
                for namepart in ('tv', 'serie', 'episode'):
                    if namepart in guess['website']:
                        # origin-specific type
                        self.log.debug(
                            'Found characteristic property of episodes: %s',
                            guess)
                        upgrade_episode()
                        return filetype_container[0], other

        if (filetype_container[0] in ('subtitle', 'info')
                or (not filetype_container[0] and fileext in video_exts)):
            # if no episode info found, assume it's a movie
            self.log.debug(
                'Nothing characteristic found, assuming type = movie')
            upgrade_movie()

        if not filetype_container[0]:
            self.log.debug(
                'Nothing characteristic found, assuming type = unknown')
            filetype_container[0] = 'unknown'

        return filetype_container[0], other
Beispiel #22
0
def guess_file_info(filename, info=None, options=None, **kwargs):
    """Gather the requested kinds of information about a file.

    info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.
    'hash_mpc' and 'hash_ed2k' use the dedicated guessit hashers, any other
    'hash_<name>' is computed with the ``hashlib`` constructor called
    ``<name>``, and 'video' adds the result of ``guess_video_metadata``. A
    single plugin name may be passed as a plain string; when info is empty
    or None only 'filename' is run.

    options may be a dict or a command-line style string, which is split
    with ``shlex`` and parsed by ``get_opts()``. When the module-level
    ``default_options`` is set, it is used as the base and the explicit
    options override it. Extra keyword arguments are forwarded to the
    filename guesser.

    Hashes that cannot be computed and unknown info types are logged as
    warnings and skipped rather than raised.

    Returns the individual guesses merged with ``smart_merge``.

    >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt')
    >>> g = guess_file_info(testfile, info = ['hash_md5', 'hash_sha1'])
    >>> g['hash_md5'], g['hash_sha1']
    ('64de6b5893cac24456c46a935ef9c359', 'a703fc0fa4518080505809bf562c6fc6f7b3c98c')
    """
    info = info or 'filename'
    options = options or {}

    if isinstance(options, base_text_type):
        args = shlex.split(options)
        options = vars(get_opts().parse_args(args))
    if default_options:
        if isinstance(default_options, base_text_type):
            default_args = shlex.split(default_options)
            merged_options = vars(get_opts().parse_args(default_args))
        else:
            # copy so the module-level defaults are never mutated
            merged_options = deepcopy(default_options)
        # explicit options take precedence over the defaults
        merged_options.update(options)
        options = merged_options

    result = []
    hashers = []

    # Force unicode as soon as possible
    filename = u(filename)

    if isinstance(info, base_text_type):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            result.append(_guess_filename(filename, options, **kwargs))

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file
            try:
                result.append(
                    Guess({infotype: hash_file(filename)}, confidence=1.0))
            except Exception as e:
                # best effort: a failed hash must not abort the other plugins
                log.warning('Could not compute MPC-style hash because: %s', e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file
            try:
                result.append(
                    Guess({infotype: hash_file(filename)}, confidence=1.0))
            except Exception as e:
                # best effort: a failed hash must not abort the other plugins
                log.warning('Could not compute ed2k hash because: %s', e)

        elif infotype.startswith('hash_'):
            import hashlib
            # strip the 'hash_' prefix to get the hashlib constructor name
            hashname = infotype[5:]
            try:
                hasher = getattr(hashlib, hashname)()
                hashers.append((infotype, hasher))
            except (AttributeError, TypeError):
                # AttributeError: hashlib has no such name.
                # TypeError: the name exists but is not a zero-argument hash
                # constructor (e.g. 'hash_new' resolves to hashlib.new).
                log.warning(
                    'Could not compute %s hash because it is not available from python\'s hashlib module',
                    hashname)

        elif infotype == 'video':
            g = guess_video_metadata(filename)
            if g:
                result.append(g)

        else:
            log.warning('Invalid infotype: %s', infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192

            with open(filename, 'rb') as f:
                chunk = f.read(blocksize)
                while chunk:
                    # Feed every registered hasher. Going through a dict
                    # keyed on infotype would drop all but one hasher when
                    # the same hash is requested twice, leaving the others
                    # to report the digest of empty input.
                    for _, hasher in hashers:
                        hasher.update(chunk)
                    chunk = f.read(blocksize)

            for infotype, hasher in hashers:
                result.append(
                    Guess({infotype: hasher.hexdigest()}, confidence=1.0))
        except Exception as e:
            # best effort: unreadable files only cost us the hash guesses
            log.warning('Could not compute hash because: %s', e)

    result = smart_merge(result)

    return result
Beispiel #23
0
            log.warning('Invalid infotype: %s' % infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192
            hasherobjs = dict(hashers).values()

            with open(filename, 'rb') as f:
                for chunk in iter(lambda: f.read(blocksize), ''):
                    for hasher in hasherobjs:
                        hasher.update(chunk)

            for infotype, hasher in hashers:
                result.append(
                    Guess({infotype: hasher.hexdigest()}, confidence=1.0))
        except Exception, e:
            log.warning('Could not compute hash because: %s' % e)

    return merge_all(result)


def guess_video_info(filename, info=None):
    """Guess information about ``filename``, letting the type be autodetected.

    Thin wrapper that forwards to ``guess_file_info`` with the file type
    fixed to ``'autodetect'``; ``info`` is passed through unchanged.
    """
    filetype = 'autodetect'
    return guess_file_info(filename, filetype, info)


def guess_movie_info(filename, info=None):
    """Guess information about ``filename``, treating it as a movie.

    Thin wrapper that forwards to ``guess_file_info`` with the file type
    fixed to ``'movie'``; ``info`` is passed through unchanged.
    """
    filetype = 'movie'
    return guess_file_info(filename, filetype, info)


def guess_episode_info(filename, info=None):