Python Tokens.get_list Examples, anitopy.token.Tokens.get_list Python Examples

Example #1

0

Show file

File: parser_number.py Project: kayabe/anitopy

def search_for_last_number(tokens):
    """Try the given tokens, in order, as a last-resort episode number.

    A token is skipped when it is the very first token, when it is enclosed,
    when only enclosed or delimiter tokens precede it, or when the previous
    non-delimiter token is the word "Movie" or "Part".

    Returns True as soon as ``set_episode_number`` accepts a token, and False
    if no token is accepted.
    """
    excluded_words = ('movie', 'part')

    for candidate in tokens:
        position = Tokens.get_index(candidate)

        # The episode number is assumed to come after the title, so the very
        # first token is ruled out; an enclosed token is an unlikely episode
        # number at this stage as well
        if position == 0 or candidate.enclosed:
            continue

        # Skip the first token that is neither enclosed nor a delimiter
        preceding = Tokens.get_list()[:position]
        if all(t.enclosed or t.category == TokenCategory.DELIMITER
               for t in preceding):
            continue

        # Numbers that follow "Movie" or "Part" are not episode numbers
        neighbour = Tokens.find_previous(candidate, TokenFlags.NOT_DELIMITER)
        if neighbour.category == TokenCategory.UNKNOWN and \
                neighbour.content.lower() in excluded_words:
            continue

        # Nothing speaks against this number, so try to use it
        if set_episode_number(candidate.content, candidate, validate=True):
            return True

    return False

Example #2

0

Show file

File: parser.py Project: kayabe/anitopy

    def search_for_isolated_numbers(self):
        """Tag isolated numeric tokens as anime year or video resolution.

        Only unknown tokens made up solely of decimal digits, and reported as
        isolated by ``parser_helper.is_token_isolated``, are considered. A
        number within ``parser_number.ANIME_YEAR_MIN`` and
        ``parser_number.ANIME_YEAR_MAX`` becomes the anime year, and 480, 720
        or 1080 becomes the video resolution, in each case only if no such
        element has been stored yet. A token used this way is re-categorized
        as an identifier.
        """
        resolutions = (480, 720, 1080)

        for token in Tokens.get_list(TokenFlags.UNKNOWN):
            # str.isdecimal() is used instead of str.isdigit() because
            # isdigit() also accepts characters such as superscript digits
            # (e.g. "²"), for which the int() conversion below would raise
            # ValueError
            if not token.content.isdecimal() or \
                    not parser_helper.is_token_isolated(token):
                continue

            number = int(token.content)

            # Anime year
            if parser_number.ANIME_YEAR_MIN <= number <= \
                    parser_number.ANIME_YEAR_MAX:
                if not Elements.contains(ElementCategory.ANIME_YEAR):
                    Elements.insert(ElementCategory.ANIME_YEAR, token.content)
                    token.category = TokenCategory.IDENTIFIER
                    continue

            # Video resolution
            if number in resolutions:
                # If these numbers are isolated, it's more likely for them to
                # be the video resolution rather than the episode number. Some
                # fansub groups use these without the "p" suffix.
                if not Elements.contains(ElementCategory.VIDEO_RESOLUTION):
                    Elements.insert(
                        ElementCategory.VIDEO_RESOLUTION, token.content)
                    token.category = TokenCategory.IDENTIFIER

Example #3

0

Show file

File: parser_helper.py Project: tylergibbs2/anitopy

def build_element(category, token_begin=None, token_end=None,
                  keep_delimiters=False):
    """Concatenate a range of tokens into one element and store it.

    The tokens returned by ``Tokens.get_list(begin=token_begin,
    end=token_end)`` are processed in order:

    * unknown tokens contribute their content and are re-categorized as
      identifiers;
    * bracket tokens contribute their content;
    * delimiter tokens contribute their content when ``keep_delimiters`` is
      true. Otherwise a delimiter that is neither ``token_begin`` nor
      ``token_end`` contributes itself if it is "," or "&" and a single
      space if it is anything else.

    Unless ``keep_delimiters`` is true, spaces and ``DASHES`` characters are
    stripped from both ends of the result. A non-empty result is inserted
    into ``Elements`` under ``category``.
    """
    pieces = []

    for token in Tokens.get_list(begin=token_begin, end=token_end):
        kind = token.category

        if kind == TokenCategory.UNKNOWN:
            pieces.append(token.content)
            token.category = TokenCategory.IDENTIFIER
            continue

        if kind == TokenCategory.BRACKET:
            pieces.append(token.content)
            continue

        if kind == TokenCategory.DELIMITER:
            if keep_delimiters:
                pieces.append(token.content)
            elif token != token_begin and token != token_end:
                # Only "," and "&" survive as they are; every other
                # delimiter is turned into a space
                keep_as_is = token.content in (',', '&')
                pieces.append(token.content if keep_as_is else ' ')

    element = ''.join(pieces)

    if not keep_delimiters:
        element = element.strip(' ' + DASHES)

    if element:
        Elements.insert(category, element)

Example #4

0

Show file

File: parser.py Project: kayabe/anitopy

    def search_for_keywords(self):
        """Turn unknown tokens that are recognisable words into elements.

        Each unknown token is stripped of surrounding spaces and dashes and
        looked up in the keyword manager. Season, episode and volume prefix
        keywords are handed over to their dedicated helpers; any other
        accepted keyword is inserted under the keyword's category. A word
        that matches no keyword is still checked for being a CRC32 checksum
        or a video resolution.
        """
        for token in Tokens.get_list(TokenFlags.UNKNOWN):
            word = token.content.strip(' -')

            if not word:
                continue
            # A pure number is only worth a look if it could be a CRC, i.e.
            # if it is exactly 8 characters long
            if len(word) != 8 and word.isdigit():
                continue

            keyword = keyword_manager.find(keyword_manager.normalize(word))
            category = ElementCategory.UNKNOWN

            if keyword:
                category = keyword.category

                # Keywords that must not be turned into an element
                if (not self.options['parse_release_group']
                        and category == ElementCategory.RELEASE_GROUP):
                    continue
                if not (ElementCategory.is_searchable(category)
                        and keyword.options.searchable):
                    continue
                if (ElementCategory.is_singular(category)
                        and Elements.contains(category)):
                    continue

                # Prefix keywords are dealt with by dedicated helpers
                if category == ElementCategory.ANIME_SEASON_PREFIX:
                    parser_helper.check_anime_season_keyword(token)
                    continue
                if category == ElementCategory.EPISODE_PREFIX:
                    if keyword.options.valid:
                        parser_number.check_extent_keyword(
                            ElementCategory.EPISODE_NUMBER, token)
                    continue
                if category == ElementCategory.RELEASE_VERSION:
                    word = word[1:]  # number without "v"
                elif category == ElementCategory.VOLUME_PREFIX:
                    parser_number.check_extent_keyword(
                        ElementCategory.VOLUME_NUMBER, token)
                    continue
            elif (not Elements.contains(ElementCategory.FILE_CHECKSUM)
                    and parser_helper.is_crc32(word)):
                category = ElementCategory.FILE_CHECKSUM
            elif (not Elements.contains(ElementCategory.VIDEO_RESOLUTION)
                    and parser_helper.is_resolution(word)):
                category = ElementCategory.VIDEO_RESOLUTION

            if category == ElementCategory.UNKNOWN:
                continue

            Elements.insert(category, word)
            if keyword is None or keyword.options.identifiable:
                token.category = TokenCategory.IDENTIFIER

Example #5

0

Show file

File: parser.py Project: kayabe/anitopy

    def search_for_episode_number(self):
        """Run the episode number searches, stopping at the first success.

        Unknown tokens containing a number are first matched against known
        episode patterns. If that fails and no episode number element exists
        yet, the purely numeric tokens are passed to a series of
        ``parser_number`` searches, each one tried only if the previous one
        found nothing.
        """
        # Every unknown token that contains a number is a candidate
        candidates = [
            token for token in Tokens.get_list(TokenFlags.UNKNOWN)
            if parser_helper.find_number_in_string(token.content) is not None
        ]
        if not candidates:
            return

        already_found = Elements.contains(ElementCategory.EPISODE_NUMBER)
        Elements.set_check_alt_number(already_found)

        # A token matching a known episode pattern has to be the episode
        # number
        if parser_number.search_for_episode_patterns(candidates):
            return

        # An episode number has previously been found via keywords
        if Elements.contains(ElementCategory.EPISODE_NUMBER):
            return

        # From here on only numeric tokens are of interest
        candidates = [token for token in candidates
                      if token.content.isdigit()]
        if not candidates:
            return

        fallback_searches = (
            # e.g. "01 (176)", "29 (04)"
            parser_number.search_for_equivalent_numbers,
            # e.g. " - 08"
            parser_number.search_for_separated_numbers,
            # e.g. "[12]", "(2006)"
            parser_number.search_for_isolated_numbers,
            # Using the last number is the last resort
            parser_number.search_for_last_number,
        )
        for search in fallback_searches:
            if search(candidates):
                return

Example #6

0

Show file

File: parser.py Project: kayabe/anitopy

    def search_for_anime_title(self):
        """Find the token range holding the anime title and build the element.

        The title normally begins at the first unknown token that is not
        enclosed. If there is no such token, the title is looked for inside
        the enclosed groups instead: groups are skipped one at a time until
        an unknown token with mostly Latin content is found after at least
        one group has been skipped. The range runs up to the next identifier
        (or bracket, for an enclosed title), is adjusted for brackets, and is
        finally passed to ``parser_helper.build_element`` under
        ``ElementCategory.ANIME_TITLE``. Nothing happens if no starting token
        is found.
        """
        enclosed_title = False

        # Find the first non-enclosed unknown token
        token_begin = Tokens.find(TokenFlags.NOT_ENCLOSED | TokenFlags.UNKNOWN)

        # If that doesn't work, find the first unknown token in the second
        # enclosed group, assuming that the first one is the release group
        if token_begin is None:
            enclosed_title = True
            token_begin = Tokens.get(0)
            skipped_previous_group = False
            while token_begin is not None:
                token_begin = Tokens.find_next(token_begin, TokenFlags.UNKNOWN)
                if token_begin is None:
                    break
                # Ignore groups that are composed of non-Latin characters
                if parser_helper.is_mostly_latin_string(token_begin.content):
                    if skipped_previous_group:
                        break  # Found it
                # Get the first unknown token of the next group
                token_begin = Tokens.find_next(token_begin, TokenFlags.BRACKET)
                skipped_previous_group = True

        if token_begin is None:
            return

        # Continue until an identifier (or a bracket, if the title is enclosed)
        # is found
        token_end = Tokens.find_next(
            token_begin, TokenFlags.IDENTIFIER | (
                TokenFlags.BRACKET if enclosed_title else TokenFlags.NONE
            ))

        # If within the interval there's an open bracket without its matching
        # pair, move the upper endpoint back to the bracket
        if not enclosed_title:
            last_bracket = token_end
            bracket_open = False
            for token in Tokens.get_list(TokenFlags.BRACKET, begin=token_begin,
                                         end=token_end):
                last_bracket = token
                bracket_open = not bracket_open
            if bracket_open:
                token_end = last_bracket

        # If the interval ends with an enclosed group (e.g. "Anime Title
        # [Fansub]"), move the upper endpoint back to the beginning of the
        # group. We ignore parentheses in order to keep certain groups (e.g.
        # "(TV)") intact.
        if not enclosed_title:
            token = Tokens.find_previous(token_end, TokenFlags.NOT_DELIMITER)
            # The search for a previous token may come back empty-handed, both
            # before the loop and inside it; the explicit None check stops the
            # loop in that case instead of failing on ``token.category``
            while token is not None and \
                    token.category == TokenCategory.BRACKET and \
                    token.content != ')':
                token = Tokens.find_previous(token, TokenFlags.BRACKET)
                if token is not None:
                    token_end = token
                    token = Tokens.find_previous(
                        token_end, TokenFlags.NOT_DELIMITER)

        # Token end is a bracket, so we get the previous token to be included
        # in the element
        token_end = Tokens.find_previous(token_end, TokenFlags.VALID)
        parser_helper.build_element(ElementCategory.ANIME_TITLE, token_begin,
                                    token_end, keep_delimiters=False)

Example #7

0

Show file

File: tokenizer.py Project: tylergibbs2/anitopy

    def _validate_delimiter_tokens(self):
        """Re-examine every delimiter token and repair wrongly split tokens.

        For each token categorized as a delimiter, its previous and next valid
        tokens are inspected. Depending on what surrounds it, the delimiter
        (and possibly its neighbours) is merged into the previous token, or
        the delimiter is re-categorized as an unknown token. Merged tokens are
        marked as invalid, and at the end the token list is replaced by the
        tokens that are not marked invalid.
        """
        def find_previous_valid_token(token):
            return Tokens.find_previous(token, TokenFlags.VALID)

        def find_next_valid_token(token):
            return Tokens.find_next(token, TokenFlags.VALID)

        def is_delimiter_token(token):
            return token is not None and \
                   token.category == TokenCategory.DELIMITER

        def is_unknown_token(token):
            return token is not None and \
                   token.category == TokenCategory.UNKNOWN

        def is_single_character_token(token):
            # A lone "-" does not count as a single-character token
            return is_unknown_token(token) and len(token.content) == 1 and \
                   token.content != '-'

        def append_token_to(token, append_to):
            # Merge: the content moves into ``append_to`` and the source token
            # is marked invalid so that it is dropped by the final update
            append_to.content += token.content
            token.category = TokenCategory.INVALID

        for token in Tokens.get_list():
            if token.category != TokenCategory.DELIMITER:
                continue

            delimiter = token.content
            prev_token = find_previous_valid_token(token)
            next_token = find_next_valid_token(token)

            # Check for single-character tokens to prevent splitting group
            # names, keywords, episode number, etc.
            if delimiter != ' ' and delimiter != '_':
                if is_single_character_token(prev_token):
                    append_token_to(token, prev_token)
                    # Keep absorbing the unknown tokens that follow, together
                    # with any occurrence of the same delimiter between them
                    while is_unknown_token(next_token):
                        append_token_to(next_token, prev_token)
                        next_token = find_next_valid_token(next_token)
                        if is_delimiter_token(next_token) and \
                                next_token.content == delimiter:
                            append_token_to(next_token, prev_token)
                            next_token = find_next_valid_token(next_token)
                    continue
                if is_single_character_token(next_token):
                    # NOTE(review): prev_token is not checked on this path; if
                    # the lookup above can return None here, append_token_to
                    # would fail on ``append_to.content`` - confirm
                    append_token_to(token, prev_token)
                    append_token_to(next_token, prev_token)
                    continue

            # Check for adjacent delimiters
            if is_unknown_token(prev_token) and is_delimiter_token(next_token):
                next_delimiter = next_token.content
                if delimiter != next_delimiter and delimiter != ',':
                    if next_delimiter == ' ' or next_delimiter == '_':
                        append_token_to(token, prev_token)

            elif is_delimiter_token(prev_token) and \
                    is_delimiter_token(next_token):
                prev_delimiter = prev_token.content
                next_delimiter = next_token.content
                if prev_delimiter == next_delimiter and \
                        prev_delimiter != delimiter:
                    token.category = TokenCategory.UNKNOWN  # e.g. "&" in "_&_"

            # Check for other special cases
            if delimiter == '&' or delimiter == '+':
                if is_unknown_token(prev_token) and \
                        is_unknown_token(next_token):
                    if prev_token.content.isdigit() and \
                            next_token.content.isdigit():
                        append_token_to(token, prev_token)
                        append_token_to(next_token, prev_token)  # e.g. "01+02"

        # Drop every token that has been merged into another one
        Tokens.update([
            token for token in Tokens.get_list()
            if token.category != TokenCategory.INVALID
        ])