Example #1
0
    def post_process(self, mtree, options=None):
        """Promote language guesses to subtitle languages where it makes sense.

        The tree is walked twice. The first pass applies the subtitle
        extension, subtitle prefix and "st" heuristics and remembers which
        prefix words triggered a promotion. The second pass applies the
        subtitle suffix heuristic, ignoring a suffix word that sits at the
        same position as a prefix word already used in the same explicit
        group. ``options`` is accepted but not used by this method.
        """
        # (explicit group, prefix word) pairs that triggered a promotion
        used_prefixes = []

        for lang_node in mtree.nodes():
            if 'language' not in lang_node.guess:
                continue

            # A language matched in a file with a subtitle extension, when it
            # is the last group of the filename, is probably the language of
            # the subtitle (eg: 'xxx.english.srt')
            path_nodes = [n for n in mtree.nodes() if n.category == 'path']
            last_path = path_nodes[-1]
            if last_path.value.lower() in subtitle_exts:
                if lang_node == list(mtree.leaves())[-2]:
                    self.promote_subtitle(lang_node)

            # A subtitle prefix found before the language, inside the same
            # explicit group, upgrades the language
            group = mtree.node_at(lang_node.node_idx[:2])
            group_text = group.value.lower()

            for marker in subtitle_prefixes:
                if marker not in find_words(group_text):
                    continue
                marker_at = group_text.find(marker)
                if marker_at < 0:
                    continue
                if marker_at < lang_node.span[0] - group.span[0]:
                    used_prefixes.append((group, marker))
                    self.promote_subtitle(lang_node)

            # A language in an explicit group just preceded by "st" is a
            # subtitle language (eg: '...st[fr-eng]...')
            try:
                position = lang_node.node_idx
                sibling = mtree.node_at((position[0], position[1] - 1))
                before = list(sibling.leaves())[-1]
                if before.value.lower().endswith('st'):
                    self.promote_subtitle(lang_node)
            except IndexError:
                pass

        for lang_node in mtree.nodes():
            if 'language' not in lang_node.guess:
                continue

            group = mtree.node_at(lang_node.node_idx[:2])
            group_text = group.value.lower()

            # A subtitle suffix found after the language, inside the same
            # explicit group, upgrades the language as well
            for marker in subtitle_suffixes:
                if marker not in find_words(group_text):
                    continue
                if (lang_node.span[0] - group.span[0]) >= group_text.find(marker):
                    continue
                # Skip a suffix located where an already-used prefix is
                already_prefix = any(
                    owner == group and
                    group_text.find(word) == group_text.find(marker)
                    for owner, word in used_prefixes)
                if not already_prefix:
                    self.promote_subtitle(lang_node)
Example #2
0
    def process(self, mtree):
        """Run the post-processing steps on a match tree.

        Step 1 promotes ``language`` guesses to subtitle languages when one
        of the heuristics below applies. Step 2 passes every ``series``
        guess through ``reorder_title``.
        """

        # 1- try to promote language to subtitle language where it makes sense
        for lang_node in mtree.nodes():
            if "language" not in lang_node.guess:
                continue

            # A language matched in a file with a subtitle extension, when it
            # is the last group of the filename, is probably the language of
            # the subtitle (eg: 'xxx.english.srt')
            extension = mtree.node_at((-1,)).value.lower()
            if extension in subtitle_exts and lang_node == mtree.leaves()[-2]:
                self.promote_subtitle(lang_node)

            # Within the same explicit group, a subtitle prefix before the
            # language or a subtitle suffix after it upgrades the language
            group = mtree.node_at(lang_node.node_idx[:2])
            group_text = group.value.lower()

            for marker in subtitle_prefixes:
                if marker not in find_words(group_text):
                    continue
                marker_at = group_text.find(marker)
                if marker_at < 0:
                    continue
                if marker_at < lang_node.span[0] - group.span[0]:
                    self.promote_subtitle(lang_node)

            for marker in subtitle_suffixes:
                if marker not in find_words(group_text):
                    continue
                if (lang_node.span[0] - group.span[0]) < group_text.find(marker):
                    self.promote_subtitle(lang_node)

            # A language in an explicit group just preceded by "st" is a
            # subtitle language (eg: '...st[fr-eng]...')
            try:
                position = lang_node.node_idx
                sibling = mtree.node_at((position[0], position[1] - 1))
                before = sibling.leaves()[-1]
                if before.value.lower().endswith("st"):
                    self.promote_subtitle(lang_node)
            except IndexError:
                pass

        # 2- ", the" at the end of a series title should be prepended to it
        for series_node in mtree.nodes():
            if "series" in series_node.guess:
                series_node.guess["series"] = reorder_title(
                    series_node.guess["series"])
Example #3
0
def find_possible_languages(string):
    """Find possible languages in the string

    Every word returned by ``find_words(string)`` is lower-cased, stripped
    of any known subtitle prefix/suffix and language prefix, and the
    remainder is tried as a language.

    :param string: text to scan for language words
    :return: list of tuple (property, Language, lang_word, word), where
        property is 'subtitleLanguage' when a subtitle prefix or suffix was
        stripped from the word and 'language' otherwise
    """
    words = find_words(string)

    valid_words = []
    for word in words:
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Drop the trailing suffix; slicing with [:len(suffix)] would
                # instead keep the first len(suffix) characters of the word.
                lang_word = lang_word[:-len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in LNG_COMMON_WORDS:
            try:
                lang = Language(lang_word)
                # Keep language with alpha2 equivalent. Others are probably
                # an uncommon language.
                if lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                # Not a recognised language: ignore this word
                pass
    return valid_words
def find_possible_languages(string):
    """Find possible languages in the string

    Every word returned by ``find_words(string)`` is lower-cased, stripped
    of any known subtitle prefix/suffix and language prefix, and the
    remainder is tried as a language.

    :param string: text to scan for language words
    :return: list of tuple (property, Language, lang_word, word), where
        property is 'subtitleLanguage' when a subtitle prefix or suffix was
        stripped from the word and 'language' otherwise
    """
    words = find_words(string)

    valid_words = []
    for word in words:
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Remove the suffix from the end of the word. The previous
                # [:len(suffix)] slice only worked by coincidence when the
                # language part and the suffix had the same length.
                lang_word = lang_word[:-len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in LNG_COMMON_WORDS:
            try:
                lang = Language(lang_word)
                # Keep language with alpha2 equivalent. Others are probably
                # an uncommon language.
                if lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                # Not a recognised language: ignore this word
                pass
    return valid_words
    def post_process(self, mtree, options=None):
        """Promote language guesses to subtitle languages where it makes sense.

        Each node of ``mtree`` carrying a ``language`` guess is checked
        against the subtitle extension, subtitle prefix, subtitle suffix and
        "st" heuristics; every heuristic that applies calls
        ``self.promote_subtitle`` on the node. ``options`` is accepted but
        not used by this method.
        """
        for lang_node in mtree.nodes():
            if 'language' not in lang_node.guess:
                continue

            # A language matched in a file with a subtitle extension, when it
            # is the last group of the filename, is probably the language of
            # the subtitle (eg: 'xxx.english.srt')
            extension = mtree.node_at((-1, )).value.lower()
            if extension in subtitle_exts:
                if lang_node == list(mtree.leaves())[-2]:
                    self.promote_subtitle(lang_node)

            # Within the same explicit group, a subtitle prefix before the
            # language or a subtitle suffix after it upgrades the language
            group = mtree.node_at(lang_node.node_idx[:2])
            group_text = group.value.lower()

            for marker in subtitle_prefixes:
                if marker not in find_words(group_text):
                    continue
                marker_at = group_text.find(marker)
                if marker_at < 0:
                    continue
                if marker_at < lang_node.span[0] - group.span[0]:
                    self.promote_subtitle(lang_node)

            for marker in subtitle_suffixes:
                if marker not in find_words(group_text):
                    continue
                if (lang_node.span[0] - group.span[0]) < group_text.find(marker):
                    self.promote_subtitle(lang_node)

            # A language in an explicit group just preceded by "st" is a
            # subtitle language (eg: '...st[fr-eng]...')
            try:
                position = lang_node.node_idx
                sibling = mtree.node_at((position[0], position[1] - 1))
                before = list(sibling.leaves())[-1]
                if before.value.lower().endswith('st'):
                    self.promote_subtitle(lang_node)
            except IndexError:
                pass
Example #6
0
def find_possible_languages(string):
    """Find possible languages in the string

    The distinct words of ``string`` are lower-cased and looked up in the
    precomputed ``_possible_languages_hashed`` table.

    :return: list of tuple (property, language, word), one per distinct
        word that has a truthy entry in the table
    """
    matches = []
    for candidate in set(find_words(string)):
        entry = _possible_languages_hashed.get(candidate.lower())
        if not entry:
            continue
        matches.append((entry[0], entry[1], candidate))

    return matches
Example #7
0
def process(mtree):
    """Post-process the guesses stored in a match tree, in place.

    Step 1 moves ``language`` guesses to ``subtitleLanguage`` when one of
    the heuristics below applies. Step 2 passes every ``series`` guess
    through ``reorder_title``.

    :param mtree: match tree whose nodes carry the guesses to adjust
    """
    def promote_subtitle(node):
        """Move the node's 'language' guess to 'subtitleLanguage'."""
        # Several heuristics can fire for the same node; once the language
        # has been moved there is nothing left to promote, and reading the
        # deleted 'language' key again would fail.
        if 'language' not in node.guess:
            return
        node.guess.set('subtitleLanguage',
                       node.guess['language'],
                       confidence=node.guess.confidence('language'))
        del node.guess['language']

    # 1- try to promote language to subtitle language where it makes sense
    for node in mtree.nodes():
        if 'language' not in node.guess:
            continue

        # - if we matched a language in a file with a sub extension and that
        #   the group is the last group of the filename, it is probably the
        #   language of the subtitle
        #   (eg: 'xxx.english.srt')
        if (mtree.node_at((-1, )).value.lower() in subtitle_exts
                and node == mtree.leaves()[-2]):
            promote_subtitle(node)

        # - if we find the word 'sub' before the language, and in the same explicit
        #   group, then upgrade the language
        explicit_group = mtree.node_at(node.node_idx[:2])
        group_str = explicit_group.value.lower()

        if ('sub' in find_words(group_str) and 0 <= group_str.find('sub') <
                (node.span[0] - explicit_group.span[0])):
            promote_subtitle(node)

        # - if a language is in an explicit group just preceded by "st",
        #   it is a subtitle language (eg: '...st[fr-eng]...')
        try:
            idx = node.node_idx
            previous = mtree.node_at((idx[0], idx[1] - 1)).leaves()[-1]
            if previous.value.lower()[-2:] == 'st':
                promote_subtitle(node)
        except IndexError:
            # No preceding group/leaf to inspect
            pass

    # 2- ", the" at the end of a series title should be prepended to it
    for node in mtree.nodes():
        if 'series' not in node.guess:
            continue

        node.guess['series'] = reorder_title(node.guess['series'])
def process(mtree):
    """Post-process the guesses stored in a match tree, in place.

    Step 1 moves ``language`` guesses to ``subtitleLanguage`` when one of
    the heuristics below applies. Step 2 passes every ``series`` guess
    through ``reorder_title``.

    :param mtree: match tree whose nodes carry the guesses to adjust
    """
    def promote_subtitle(node):
        """Move the node's 'language' guess to 'subtitleLanguage'."""
        # More than one heuristic may match the same node. After the first
        # promotion the 'language' key is gone, so later calls must be
        # no-ops instead of reading the deleted key.
        if 'language' not in node.guess:
            return
        node.guess.set('subtitleLanguage', node.guess['language'],
                       confidence=node.guess.confidence('language'))
        del node.guess['language']

    # 1- try to promote language to subtitle language where it makes sense
    for node in mtree.nodes():
        if 'language' not in node.guess:
            continue

        # - if we matched a language in a file with a sub extension and that
        #   the group is the last group of the filename, it is probably the
        #   language of the subtitle
        #   (eg: 'xxx.english.srt')
        if (mtree.node_at((-1,)).value.lower() in subtitle_exts and
                node == mtree.leaves()[-2]):
            promote_subtitle(node)

        # - if we find the word 'sub' before the language, and in the same explicit
        #   group, then upgrade the language
        explicit_group = mtree.node_at(node.node_idx[:2])
        group_str = explicit_group.value.lower()

        if ('sub' in find_words(group_str) and
                0 <= group_str.find('sub') < (node.span[0] - explicit_group.span[0])):
            promote_subtitle(node)

        # - if a language is in an explicit group just preceded by "st",
        #   it is a subtitle language (eg: '...st[fr-eng]...')
        try:
            idx = node.node_idx
            previous = mtree.node_at((idx[0], idx[1] - 1)).leaves()[-1]
            if previous.value.lower()[-2:] == 'st':
                promote_subtitle(node)
        except IndexError:
            # No preceding group/leaf to inspect
            pass

    # 2- ", the" at the end of a series title should be prepended to it
    for node in mtree.nodes():
        if 'series' not in node.guess:
            continue

        node.guess['series'] = reorder_title(node.guess['series'])
Example #9
0
def find_possible_languages(string, allowed_languages=None):
    """Find possible languages in the string

    Every word returned by ``find_words(string)`` is lower-cased, stripped
    of any known subtitle prefix/suffix and language prefix, and the
    remainder is tried as a language. Words found in the common-words list
    (before or after stripping) are skipped.

    :param string: text to scan for language words
    :param allowed_languages: optional collection of lower-case language
        names / alpha2 / alpha3 codes; when given, only languages matching
        one of them are kept and the strict common-words list is used
    :return: list of tuple (property, Language, lang_word, word), where
        property is 'subtitleLanguage' when a subtitle prefix or suffix was
        stripped from the word and 'language' otherwise
    """
    if allowed_languages:
        common_words = LNG_COMMON_WORDS_STRICT
    else:
        common_words = LNG_COMMON_WORDS

    words = find_words(string)

    valid_words = []
    for word in words:
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Drop the trailing suffix; slicing with [:len(suffix)] would
                # instead keep the first len(suffix) characters of the word.
                lang_word = lang_word[:-len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in common_words and word.lower() not in common_words:
            try:
                lang = Language.fromguessit(lang_word)
                if allowed_languages:
                    if (lang.name.lower() in allowed_languages or
                            lang.alpha2.lower() in allowed_languages or
                            lang.alpha3.lower() in allowed_languages):
                        valid_words.append((key, lang, lang_word, word))
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                elif lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                # Not a usable language: ignore this word
                pass
    return valid_words
Example #10
0
def find_possible_languages(string, allowed_languages=None):
    """Find possible languages in the string

    Every word returned by ``find_words(string)`` is lower-cased, stripped
    of any known subtitle prefix/suffix and language prefix, and the
    remainder is tried as a language unless it is in the common-words list.

    :param string: text to scan for language words
    :param allowed_languages: optional collection of lower-case language
        names / alpha2 / alpha3 codes; when given, only languages matching
        one of them are kept and the strict common-words list is used
    :return: list of tuple (property, Language, lang_word, word), where
        property is 'subtitleLanguage' when a subtitle prefix or suffix was
        stripped from the word and 'language' otherwise
    """
    if allowed_languages:
        common_words = LNG_COMMON_WORDS_STRICT
    else:
        common_words = LNG_COMMON_WORDS

    words = find_words(string)

    valid_words = []
    for word in words:
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Remove the suffix from the end of the word. The previous
                # [:len(suffix)] slice only worked by coincidence when the
                # language part and the suffix had the same length.
                lang_word = lang_word[:-len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in common_words:
            try:
                lang = Language.fromguessit(lang_word)
                if allowed_languages:
                    if (lang.name.lower() in allowed_languages or
                            lang.alpha2.lower() in allowed_languages or
                            lang.alpha3.lower() in allowed_languages):
                        valid_words.append((key, lang, lang_word, word))
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                elif lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                # Not a usable language: ignore this word
                pass
    return valid_words
def search_language(string, lang_filter=None):
    """Looks for language patterns, and if found return the language object,
    its group span and an associated confidence.

    you can specify a list of allowed languages using the lang_filter argument,
    as in lang_filter = [ 'fr', 'eng', 'spanish' ]

    A candidate word is only accepted where it appears surrounded by
    separator characters; every occurrence of the word in the string is
    examined, not just the first one. The returned span is expressed in
    indices of the original ``string``. When nothing matches,
    ``(None, None, None)`` is returned.

    >>> search_language('movie [en].avi')
    (Language(English), (7, 9), 0.8)

    >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])
    (None, None, None)
    """

    # list of common words which could be interpreted as languages, but which
    # are far too common to be able to say they represent a language in the
    # middle of a string (where they most likely carry their common meaning)
    lng_common_words = frozenset([
        # english words
        'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to',
        'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan',
        'fry', 'cop', 'zen', 'gay', 'fat', 'cherokee', 'got', 'an', 'as',
        'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi',
        # french words
        'bas', 'de', 'le', 'son', 'vo', 'vf', 'ne', 'ca', 'ce', 'et', 'que',
        'mal', 'est', 'vol', 'or', 'mon', 'se',
        # spanish words
        'la', 'el', 'del', 'por', 'mar',
        # other
        'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii',
        'vi', 'ben', 'da', 'lt'
        ])
    # used as a plain set of separator characters (membership test), not as
    # a regular expression
    sep = r'[](){} \._-+'

    if lang_filter:
        lang_filter = lang_set(lang_filter)

    # pad with a space on each side so that a word at either end of the
    # string still has a separator character next to it
    slow = ' %s ' % string.lower()

    for lang in set(find_words(slow)) & lng_all_names:

        if lang in lng_common_words:
            continue

        # look for an occurrence that is surrounded by separators; the first
        # substring hit may lie inside a longer word (eg: 'fr' in 'french')
        # while a later one stands on its own
        pos = slow.find(lang)
        while pos != -1:
            end = pos + len(lang)
            if slow[pos - 1] in sep and slow[end] in sep:
                break
            pos = slow.find(lang, pos + 1)

        if pos == -1:
            continue

        language = Language(slow[pos:end])
        if lang_filter and language not in lang_filter:
            continue

        # only allow those languages that have a 2-letter code, those that
        # don't are too esoteric and probably false matches
        if language.lang not in lng3_to_lng2:
            continue

        # confidence depends on lng2, lng3, english name, ...
        if len(lang) == 2:
            confidence = 0.8
        elif len(lang) == 3:
            confidence = 0.9
        else:
            # Note: we could either be really confident that we found a
            #       language or assume that full language names are too
            #       common words and lower their confidence accordingly
            confidence = 0.3 # going with the low-confidence route here

        # shift by one to undo the leading padding space added above
        return language, (pos - 1, end - 1), confidence

    return None, None, None