def search_for_equivalent_numbers(tokens):
    """Search for two equivalent episode numbers, e.g. "01 (176)" or
    "29 (04)", where one number is an alternative numbering of the other.

    The smaller number becomes the episode number and the larger one the
    alternative episode number. Returns True if such a pair was found.
    """
    for token in tokens:
        if parser_helper.is_token_isolated(token) or \
                not is_valid_episode_number(token.content):
            continue

        # Find the first enclosed, non-delimiter token
        next_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)
        if next_token is None or \
                next_token.category != TokenCategory.BRACKET:
            continue
        next_token = Tokens.find_next(
            next_token, TokenFlags.ENCLOSED | TokenFlags.NOT_DELIMITER)
        # Bug fix: find_next may return None when the bracket is the last
        # non-delimiter token; the original dereferenced it unconditionally.
        if next_token is None or \
                next_token.category != TokenCategory.UNKNOWN:
            continue

        # Check if it's an isolated number
        if not parser_helper.is_token_isolated(next_token) or \
                not next_token.content.isdigit() or \
                not is_valid_episode_number(next_token.content):
            continue

        episode = min(token, next_token, key=lambda t: int(t.content))
        alt_episode = max(token, next_token, key=lambda t: int(t.content))

        set_episode_number(episode.content, episode, validate=False)
        set_alternative_episode_number(alt_episode.content, alt_episode)
        return True

    return False
def parse(filename, options=default_options):
    """Parse an anime filename into its elements.

    :param filename: the filename to parse
    :param options: optional parser options; missing keys are filled in
        from ``default_options``
    :return: a dictionary of the parsed elements, or None on failure
    """
    Elements.clear()
    Tokens.clear()

    # Bug fix: the original used options.setdefault(...), which mutated the
    # caller-supplied dict (and, when called with one argument, the shared
    # default_options object). Merge into a fresh dict instead; reads below
    # see exactly the same values.
    options = {**default_options, **options}

    Elements.insert(ElementCategory.FILE_NAME, filename)
    if options['parse_file_extension']:
        filename, extension = remove_extension_from_filename(filename)
        if extension:
            Elements.insert(ElementCategory.FILE_EXTENSION, extension)

    if options['ignored_strings']:
        filename = remove_ignored_strings_from_filename(
            filename, options['ignored_strings'])

    if not filename:
        return None

    tokenizer = Tokenizer(filename, options)
    if not tokenizer.tokenize():
        return None

    parser = Parser(options)
    if not parser.parse():
        return None

    return Elements.get_dictionary()
def search_for_release_group(self):
    """Search for the release group among the enclosed unknown tokens."""
    token_end = None
    while True:
        # Locate the next enclosed unknown token
        if token_end is None:
            token_begin = Tokens.find(
                TokenFlags.ENCLOSED | TokenFlags.UNKNOWN)
        else:
            token_begin = Tokens.find_next(
                token_end, TokenFlags.ENCLOSED | TokenFlags.UNKNOWN)
        if token_begin is None:
            return

        # Scan forward until a bracket or identifier shows up
        token_end = Tokens.find_next(
            token_begin, TokenFlags.BRACKET | TokenFlags.IDENTIFIER)
        if token_end is None:
            return
        if token_end.category != TokenCategory.BRACKET:
            continue

        # The candidate must be the first non-delimiter token of its group
        preceding = Tokens.find_previous(token_begin,
                                         TokenFlags.NOT_DELIMITER)
        if preceding is not None and \
                preceding.category != TokenCategory.BRACKET:
            continue

        # token_end is the closing bracket, so step back to the previous
        # valid token so the bracket itself is excluded from the element
        token_end = Tokens.find_previous(token_end, TokenFlags.VALID)
        parser_helper.build_element(
            ElementCategory.RELEASE_GROUP, token_begin, token_end,
            keep_delimiters=True)
        return
def search_for_episode_title(self):
    """Search for the episode title among non-enclosed unknown tokens and
    build the EPISODE_TITLE element from the first suitable span."""
    token_end = None
    while True:
        # Find the first non-enclosed unknown token
        if token_end:
            token_begin = Tokens.find_next(
                token_end, TokenFlags.NOT_ENCLOSED | TokenFlags.UNKNOWN)
        else:
            token_begin = Tokens.find(
                TokenFlags.NOT_ENCLOSED | TokenFlags.UNKNOWN)
        if token_begin is None:
            return

        # Continue until a bracket or identifier is found
        # NOTE(review): token_end may be None here (no such token ahead);
        # Tokens.distance and build_element are presumably None-tolerant —
        # confirm against their implementations.
        token_end = Tokens.find_next(
            token_begin, TokenFlags.BRACKET | TokenFlags.IDENTIFIER)

        # Ignore if it's only a dash
        if Tokens.distance(token_begin, token_end) <= 2 and \
                parser_helper.is_dash_character(token_begin.content):
            continue

        # If token end is a bracket, then we get the previous token to be
        # included in the element
        if token_end and token_end.category == TokenCategory.BRACKET:
            token_end = Tokens.find_previous(token_end, TokenFlags.VALID)

        # Build episode title
        parser_helper.build_element(
            ElementCategory.EPISODE_TITLE, token_begin, token_end,
            keep_delimiters=False)

        return
def search_for_last_number(tokens):
    """Use a remaining numeric token as the episode number, as a last
    resort. Returns True if an episode number was set."""
    for token in tokens:
        index = Tokens.get_index(token)

        # The episode number is assumed to come after the title, so the
        # very first token is never a candidate
        if index == 0:
            continue

        # Enclosed tokens are unlikely to be the episode number here
        if token.enclosed:
            continue

        # Skip the first non-enclosed, non-delimiter token: everything
        # before it must be enclosed or a delimiter
        preceding = Tokens.get_list()[:index]
        if all(t.enclosed or t.category == TokenCategory.DELIMITER
               for t in preceding):
            continue

        # Skip numbers preceded by "Movie" or "Part"
        prev = Tokens.find_previous(token, TokenFlags.NOT_DELIMITER)
        if prev.category == TokenCategory.UNKNOWN and \
                prev.content.lower() in ('movie', 'part'):
            continue

        # This number will do after all
        if set_episode_number(token.content, token, validate=True):
            return True

    return False
def is_token_isolated(token):
    """Return True if `token` is immediately surrounded by brackets
    (ignoring delimiters), i.e. it sits alone inside an enclosed group.

    Bug fix: Tokens.find_previous/find_next return None when there is no
    matching token (e.g. `token` is first or last in the list); the
    original dereferenced the result unconditionally and raised
    AttributeError. A missing neighbor is not a bracket, so we return
    False in that case.
    """
    previous_token = Tokens.find_previous(token, TokenFlags.NOT_DELIMITER)
    if previous_token is None or \
            previous_token.category != TokenCategory.BRACKET:
        return False

    next_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)
    if next_token is None or \
            next_token.category != TokenCategory.BRACKET:
        return False

    return True
def search_for_isolated_numbers(self):
    """Classify isolated numeric tokens as anime year or video resolution."""
    for token in Tokens.get_list(TokenFlags.UNKNOWN):
        if not token.content.isdigit() or \
                not parser_helper.is_token_isolated(token):
            continue

        value = int(token.content)

        # Anime year
        if parser_number.ANIME_YEAR_MIN <= value \
                <= parser_number.ANIME_YEAR_MAX:
            if not Elements.contains(ElementCategory.ANIME_YEAR):
                Elements.insert(ElementCategory.ANIME_YEAR, token.content)
                token.category = TokenCategory.IDENTIFIER
                continue

        # Video resolution: isolated 480/720/1080 are more likely the
        # resolution than an episode number — some fansub groups omit the
        # "p" suffix
        if value in (480, 720, 1080):
            if not Elements.contains(ElementCategory.VIDEO_RESOLUTION):
                Elements.insert(
                    ElementCategory.VIDEO_RESOLUTION, token.content)
                token.category = TokenCategory.IDENTIFIER
                continue
def build_element(category, token_begin=None, token_end=None,
                  keep_delimiters=False):
    """Concatenate the tokens in [token_begin, token_end] into a single
    string and insert it as an element of the given category.

    Unknown tokens are consumed (marked IDENTIFIER). When
    keep_delimiters is False, delimiters are normalized to spaces (except
    ',' and '&', which are kept) and the result is stripped of surrounding
    spaces and dashes.
    """
    pieces = []
    for token in Tokens.get_list(begin=token_begin, end=token_end):
        if token.category == TokenCategory.UNKNOWN:
            pieces.append(token.content)
            token.category = TokenCategory.IDENTIFIER
        elif token.category == TokenCategory.BRACKET:
            pieces.append(token.content)
        elif token.category == TokenCategory.DELIMITER:
            delimiter = token.content
            if keep_delimiters:
                pieces.append(delimiter)
            elif token != token_begin and token != token_end:
                # ',' and '&' carry meaning; other delimiters become spaces
                pieces.append(delimiter if delimiter in (',', '&') else ' ')

    element = ''.join(pieces)
    if not keep_delimiters:
        element = element.strip(' ' + DASHES)

    if element:
        Elements.insert(category, element)
def check_anime_season_keyword(token):
    """Check the neighbors of a season keyword token for the season number
    (e.g. "2nd Season" or "Season 2"). Returns True if a season was set."""
    def assign_season(first, second, content):
        # Record the season and consume both tokens
        Elements.insert(ElementCategory.ANIME_SEASON, content)
        first.category = TokenCategory.IDENTIFIER
        second.category = TokenCategory.IDENTIFIER

    # Ordinal before the keyword, e.g. "2nd Season"
    prev = Tokens.find_previous(token, TokenFlags.NOT_DELIMITER)
    if prev:
        season = get_number_from_ordinal(prev.content)
        if season:
            assign_season(prev, token, season)
            return True

    # Digits after the keyword, e.g. "Season 2"
    following = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)
    if following and following.content.isdigit():
        assign_season(token, following, following.content)
        return True

    return False
def number_comes_before_another_number(token):
    """Handle number pairs joined by "&" or "of" (e.g. "01&02", "1 of 2").

    Returns True if an episode number was set from the pair.
    """
    separator_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)
    if separator_token is None:
        return False

    separator = separator_token.content
    if separator not in ('&', 'of'):
        return False

    other_token = Tokens.find_next(separator_token, TokenFlags.NOT_DELIMITER)
    if other_token is None or not other_token.content.isdigit():
        return False

    set_episode_number(token.content, token, validate=False)
    # For "&" both numbers are episode numbers (e.g. "01&02")
    if separator == '&':
        set_episode_number(other_token.content, token, validate=False)
    separator_token.category = TokenCategory.IDENTIFIER
    other_token.category = TokenCategory.IDENTIFIER
    return True
def search_for_separated_numbers(tokens):
    """Search for a number preceded by a dash separator, e.g. " - 08".

    Returns True if an episode number was set.
    """
    for token in tokens:
        previous_token = Tokens.find_previous(token,
                                              TokenFlags.NOT_DELIMITER)
        # Bug fix: find_previous returns None when the number is the first
        # non-delimiter token; the original dereferenced it unconditionally.
        if previous_token is None:
            continue

        # See if the number has a preceding "-" separator
        if previous_token.category == TokenCategory.UNKNOWN and \
                parser_helper.is_dash_character(previous_token.content):
            if set_episode_number(token.content, token, validate=True):
                previous_token.category = TokenCategory.IDENTIFIER
                return True

    return False
def search_for_keywords(self):
    """Scan unknown tokens for known keywords and classify them into
    elements (video term, audio term, release group, etc.)."""
    for token in Tokens.get_list(TokenFlags.UNKNOWN):
        word = token.content.strip(' -')
        if not word:
            continue
        # A pure number can only be interesting if it could be a CRC
        # (exactly 8 characters)
        if word.isdigit() and len(word) != 8:
            continue

        category = ElementCategory.UNKNOWN
        keyword = keyword_manager.find(keyword_manager.normalize(word))
        if keyword:
            category = keyword.category
            if category == ElementCategory.RELEASE_GROUP and \
                    not self.options['parse_release_group']:
                continue
            if not ElementCategory.is_searchable(category) or \
                    not keyword.options.searchable:
                continue
            if ElementCategory.is_singular(category) and \
                    Elements.contains(category):
                continue
            # Prefix keywords delegate to the specialized checkers
            if category == ElementCategory.ANIME_SEASON_PREFIX:
                parser_helper.check_anime_season_keyword(token)
                continue
            if category == ElementCategory.EPISODE_PREFIX:
                if keyword.options.valid:
                    parser_number.check_extent_keyword(
                        ElementCategory.EPISODE_NUMBER, token)
                continue
            if category == ElementCategory.VOLUME_PREFIX:
                parser_number.check_extent_keyword(
                    ElementCategory.VOLUME_NUMBER, token)
                continue
            if category == ElementCategory.RELEASE_VERSION:
                word = word[1:]  # number without "v"
        else:
            # Not a known keyword — maybe a checksum or a resolution
            if not Elements.contains(ElementCategory.FILE_CHECKSUM) and \
                    parser_helper.is_crc32(word):
                category = ElementCategory.FILE_CHECKSUM
            elif not Elements.contains(ElementCategory.VIDEO_RESOLUTION) \
                    and parser_helper.is_resolution(word):
                category = ElementCategory.VIDEO_RESOLUTION

        if category != ElementCategory.UNKNOWN:
            Elements.insert(category, word)
            if keyword is None or keyword.options.identifiable:
                token.category = TokenCategory.IDENTIFIER
def match_type_and_episode_pattern(word, token):
    """Match an anime-type prefix fused to an episode number (e.g. "OVA2").

    If the prefix is a known ANIME_TYPE keyword and the trailing part
    parses as an episode number, the token is split in two. Returns True
    on success.
    """
    number_begin = parser_helper.find_number_in_string(word)
    prefix = word[:number_begin]

    keyword = keyword_manager.find(
        keyword_manager.normalize(prefix), ElementCategory.ANIME_TYPE)
    if keyword:
        Elements.insert(ElementCategory.ANIME_TYPE, prefix)
        number = word[number_begin:]
        if match_episode_patterns(number, token) or \
                set_episode_number(number, token, validate=True):
            # Split the token last, so our token reference stays valid
            # throughout the matching above
            token_index = Tokens.get_index(token)
            token.content = number
            if keyword.options.identifiable:
                prefix_category = TokenCategory.IDENTIFIER
            else:
                prefix_category = TokenCategory.UNKNOWN
            Tokens.insert(token_index,
                          Token(prefix_category, prefix, token.enclosed))
            return True

    return False
def search_for_episode_number(self):
    """Search for the episode number among unknown tokens, trying the
    most reliable strategies first."""
    # Collect the unknown tokens that contain a number
    tokens = [t for t in Tokens.get_list(TokenFlags.UNKNOWN)
              if parser_helper.find_number_in_string(t.content) is not None]
    if not tokens:
        return

    Elements.set_check_alt_number(
        Elements.contains(ElementCategory.EPISODE_NUMBER))

    # A token matching a known episode pattern must be the episode number
    if parser_number.search_for_episode_patterns(tokens):
        return

    # An episode number was already found via keywords
    if Elements.contains(ElementCategory.EPISODE_NUMBER):
        return

    # From here on only purely numeric tokens are of interest
    tokens = [t for t in tokens if t.content.isdigit()]
    if not tokens:
        return

    # Try the remaining heuristics in decreasing order of confidence:
    # equivalent pairs, e.g. "01 (176)"; separated, e.g. " - 08";
    # isolated, e.g. "[12]"
    for strategy in (parser_number.search_for_equivalent_numbers,
                     parser_number.search_for_separated_numbers,
                     parser_number.search_for_isolated_numbers):
        if strategy(tokens):
            return

    # Last resort: consider using the last number
    parser_number.search_for_last_number(tokens)
def check_extent_keyword(category, token):
    """Check the token following an extent keyword (e.g. "Ep", "Vol") for
    the corresponding number and set it.

    :param category: ElementCategory.EPISODE_NUMBER or
        ElementCategory.VOLUME_NUMBER
    :param token: the keyword token
    :return: True if a number was set (the keyword token is then consumed)
    """
    next_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)
    # Bug fix: the original accessed next_token.category before its
    # (redundant, misplaced) None check, crashing with AttributeError when
    # the keyword is the last non-delimiter token.
    if next_token is not None and \
            next_token.category == TokenCategory.UNKNOWN:
        if parser_helper.find_number_in_string(next_token.content) \
                is not None:
            if category == ElementCategory.EPISODE_NUMBER:
                if not match_episode_patterns(next_token.content,
                                              next_token):
                    set_episode_number(next_token.content, next_token,
                                       validate=False)
            elif category == ElementCategory.VOLUME_NUMBER:
                if not match_volume_patterns(next_token.content,
                                             next_token):
                    set_volume_number(next_token.content, next_token,
                                      validate=False)
            else:
                # Unsupported category
                return False
            token.category = TokenCategory.IDENTIFIER
            return True

    return False
def _validate_delimiter_tokens(self):
    """Post-process delimiter tokens: merge delimiters into neighboring
    single-character tokens, resolve adjacent mixed delimiters, and join
    number pairs like "01+02". Tokens marked INVALID along the way are
    dropped from the token list at the end.
    """
    def find_previous_valid_token(token):
        return Tokens.find_previous(token, TokenFlags.VALID)

    def find_next_valid_token(token):
        return Tokens.find_next(token, TokenFlags.VALID)

    def is_delimiter_token(token):
        return token is not None and \
            token.category == TokenCategory.DELIMITER

    def is_unknown_token(token):
        return token is not None and \
            token.category == TokenCategory.UNKNOWN

    def is_single_character_token(token):
        # '-' is excluded: dashes act as separators, not word characters
        return is_unknown_token(token) and len(token.content) == 1 and \
            token.content != '-'

    def append_token_to(token, append_to):
        # Fold token's text into append_to and mark it for removal
        append_to.content += token.content
        token.category = TokenCategory.INVALID

    for token in Tokens.get_list():
        if token.category != TokenCategory.DELIMITER:
            continue
        delimiter = token.content
        prev_token = find_previous_valid_token(token)
        next_token = find_next_valid_token(token)

        # Check for single-character tokens to prevent splitting group
        # names, keywords, episode number, etc.
        if delimiter != ' ' and delimiter != '_':
            if is_single_character_token(prev_token):
                append_token_to(token, prev_token)
                # Keep absorbing "<unknown><delimiter>" runs that use the
                # same delimiter (e.g. "A.B.C" collapses into "ABC")
                while is_unknown_token(next_token):
                    append_token_to(next_token, prev_token)
                    next_token = find_next_valid_token(next_token)
                    if is_delimiter_token(next_token) and \
                            next_token.content == delimiter:
                        append_token_to(next_token, prev_token)
                        next_token = find_next_valid_token(next_token)
                continue
            if is_single_character_token(next_token):
                append_token_to(token, prev_token)
                append_token_to(next_token, prev_token)
                continue

        # Check for adjacent delimiters
        if is_unknown_token(prev_token) and is_delimiter_token(next_token):
            next_delimiter = next_token.content
            if delimiter != next_delimiter and delimiter != ',':
                if next_delimiter == ' ' or next_delimiter == '_':
                    append_token_to(token, prev_token)
        elif is_delimiter_token(prev_token) and \
                is_delimiter_token(next_token):
            prev_delimiter = prev_token.content
            next_delimiter = next_token.content
            if prev_delimiter == next_delimiter and \
                    prev_delimiter != delimiter:
                token.category = TokenCategory.UNKNOWN  # e.g. "&" in "_&_"

        # Check for other special cases
        if delimiter == '&' or delimiter == '+':
            if is_unknown_token(prev_token) and \
                    is_unknown_token(next_token):
                if prev_token.content.isdigit() and \
                        next_token.content.isdigit():
                    append_token_to(token, prev_token)
                    append_token_to(next_token, prev_token)  # e.g. "01+02"

    # Drop everything marked INVALID above
    Tokens.update([
        token for token in Tokens.get_list()
        if token.category != TokenCategory.INVALID
    ])
def search_for_anime_title(self):
    """Search for the anime title and build the ANIME_TITLE element.

    Prefers the first non-enclosed unknown span; if everything is
    enclosed, falls back to the second enclosed group (the first is
    assumed to be the release group).
    """
    enclosed_title = False

    # Find the first non-enclosed unknown token
    token_begin = Tokens.find(TokenFlags.NOT_ENCLOSED | TokenFlags.UNKNOWN)

    # If that doesn't work, find the first unknown token in the second
    # enclosed group, assuming that the first one is the release group
    if token_begin is None:
        enclosed_title = True
        token_begin = Tokens.get(0)
        skipped_previous_group = False
        while token_begin is not None:
            token_begin = Tokens.find_next(token_begin, TokenFlags.UNKNOWN)
            if token_begin is None:
                break
            # Ignore groups that are composed of non-Latin characters
            if parser_helper.is_mostly_latin_string(token_begin.content):
                if skipped_previous_group:
                    break  # Found it
            # Get the first unknown token of the next group
            token_begin = Tokens.find_next(token_begin, TokenFlags.BRACKET)
            skipped_previous_group = True

    if token_begin is None:
        return

    # Continue until an identifier (or a bracket, if the title is enclosed)
    # is found
    token_end = Tokens.find_next(
        token_begin, TokenFlags.IDENTIFIER | (
            TokenFlags.BRACKET if enclosed_title else TokenFlags.NONE))

    # If within the interval there's an open bracket without its matching
    # pair, move the upper endpoint back to the bracket
    if not enclosed_title:
        last_bracket = token_end
        bracket_open = False
        # Brackets strictly alternate open/close, so an odd count means
        # the last one seen is an unmatched opener
        for token in Tokens.get_list(TokenFlags.BRACKET, begin=token_begin,
                                     end=token_end):
            last_bracket = token
            bracket_open = not bracket_open
        if bracket_open:
            token_end = last_bracket

    # If the interval ends with an enclosed group (e.g. "Anime Title
    # [Fansub]"), move the upper endpoint back to the beginning of the
    # group. We ignore parentheses in order to keep certain groups (e.g.
    # "(TV)") intact.
    if not enclosed_title:
        token = Tokens.find_previous(token_end, TokenFlags.NOT_DELIMITER)
        while token.category == TokenCategory.BRACKET and \
                token.content != ')':
            token = Tokens.find_previous(token, TokenFlags.BRACKET)
            if token is not None:
                token_end = token
                token = Tokens.find_previous(
                    token_end, TokenFlags.NOT_DELIMITER)

    # Token end is a bracket, so we get the previous token to be included
    # in the element
    token_end = Tokens.find_previous(token_end, TokenFlags.VALID)

    parser_helper.build_element(ElementCategory.ANIME_TITLE, token_begin,
                                token_end, keep_delimiters=False)
def _add_token(self, category, content, enclosed):
    """Create a token from the given attributes and append it to the
    shared token list."""
    new_token = Token(category, content, enclosed)
    Tokens.append(new_token)
def tokenize(self):
    """Split the filename into tokens; return True if any were produced."""
    self._tokenize_by_brackets()
    if Tokens.empty():
        return False
    return True
def find_next_valid_token(token):
    """Return the first valid token after ``token`` (None if none exists)."""
    result = Tokens.find_next(token, TokenFlags.VALID)
    return result
def find_previous_valid_token(token):
    """Return the first valid token before ``token`` (None if none exists)."""
    result = Tokens.find_previous(token, TokenFlags.VALID)
    return result