def search_for_isolated_numbers(self):
    """Classify isolated all-digit unknown tokens as year or resolution.

    Iterates over ``Tokens.get_list(TokenFlags.UNKNOWN)``. A token is
    considered only if its content is all digits and
    ``parser_helper.is_token_isolated`` accepts it. A matching token is
    stored as ``ANIME_YEAR`` or ``VIDEO_RESOLUTION`` (only when that
    element is not present yet) and re-categorised as an identifier.
    """
    for token in Tokens.get_list(TokenFlags.UNKNOWN):
        content = token.content
        if not content.isdigit() or \
                not parser_helper.is_token_isolated(token):
            continue

        # str.isdigit() also accepts characters (e.g. superscript digits)
        # that int() cannot convert; skip those instead of raising.
        try:
            number = int(content)
        except ValueError:
            continue

        # Anime year
        if parser_number.ANIME_YEAR_MIN <= number <= \
                parser_number.ANIME_YEAR_MAX:
            if not Elements.contains(ElementCategory.ANIME_YEAR):
                Elements.insert(ElementCategory.ANIME_YEAR, content)
                token.category = TokenCategory.IDENTIFIER
                continue

        # Video resolution
        if number in (480, 720, 1080):
            # If these numbers are isolated, it's more likely for them to
            # be the video resolution rather than the episode number. Some
            # fansub groups use these without the "p" suffix.
            if not Elements.contains(ElementCategory.VIDEO_RESOLUTION):
                Elements.insert(ElementCategory.VIDEO_RESOLUTION, content)
                token.category = TokenCategory.IDENTIFIER
                continue
def set_season_number(number, token):
    """Record *number* as the anime season if it consists only of digits.

    On success the number is inserted under ``ANIME_SEASON``, *token* is
    marked as an identifier and ``True`` is returned; otherwise nothing is
    changed and ``False`` is returned.
    """
    if number.isdigit():
        Elements.insert(ElementCategory.ANIME_SEASON, number)
        token.category = TokenCategory.IDENTIFIER
        return True

    return False
def parse(filename, options=default_options):
    """Parse *filename* and return the collected elements as a dictionary.

    ``options`` is merged over ``default_options`` into a new dict: keys
    missing from ``options`` fall back to their defaults, and the dict
    passed in by the caller is left untouched. ``None`` or an empty
    mapping selects the defaults.

    Returns ``None`` when the filename is empty after the optional
    extension / ignored-string removal, or when ``Tokenizer.tokenize()`` or
    ``Parser.parse()`` returns a false value. Otherwise returns
    ``Elements.get_dictionary()``.
    """
    Elements.clear()
    Tokens.clear()

    # Add missing options. Work on a copy so that neither the caller's
    # dict nor the shared default_options object is modified.
    merged_options = dict(default_options)
    if options:
        merged_options.update(options)
    options = merged_options

    Elements.insert(ElementCategory.FILE_NAME, filename)
    if options['parse_file_extension']:
        filename, extension = remove_extension_from_filename(filename)
        if extension:
            Elements.insert(ElementCategory.FILE_EXTENSION, extension)

    if options['ignored_strings']:
        filename = remove_ignored_strings_from_filename(
            filename, options['ignored_strings'])

    if not filename:
        return None

    tokenizer = Tokenizer(filename, options)
    if not tokenizer.tokenize():
        return None

    parser = Parser(options)
    if not parser.parse():
        return None

    return Elements.get_dictionary()
def build_element(category, token_begin=None, token_end=None,
                  keep_delimiters=False):
    """Join the tokens between *token_begin* and *token_end* into one element.

    Unknown tokens contribute their content and are marked as identifiers;
    bracket tokens contribute their content unchanged. Delimiters are kept
    verbatim when *keep_delimiters* is true. Otherwise a delimiter equal to
    one of the boundary tokens is dropped, ',' and '&' are kept, and any
    other delimiter becomes a single space. Without *keep_delimiters* the
    result is stripped of spaces and ``DASHES``. A non-empty result is
    inserted into ``Elements`` under *category*.
    """
    pieces = []

    for token in Tokens.get_list(begin=token_begin, end=token_end):
        if token.category == TokenCategory.UNKNOWN:
            pieces.append(token.content)
            token.category = TokenCategory.IDENTIFIER
        elif token.category == TokenCategory.BRACKET:
            pieces.append(token.content)
        elif token.category == TokenCategory.DELIMITER:
            separator = token.content
            if keep_delimiters:
                pieces.append(separator)
            elif token != token_begin and token != token_end:
                pieces.append(separator if separator in (',', '&') else ' ')

    text = ''.join(pieces)
    if not keep_delimiters:
        text = text.strip(' ' + DASHES)

    if text:
        Elements.insert(category, text)
def set_volume_number(number, token, validate):
    """Store *number* as a volume number and mark *token* as an identifier.

    When *validate* is true the number must first pass
    ``is_valid_volume_number``; if it does not, nothing is stored and
    ``False`` is returned. Returns ``True`` once the number is stored.
    """
    if validate and not is_valid_volume_number(number):
        return False

    Elements.insert(ElementCategory.VOLUME_NUMBER, number)
    token.category = TokenCategory.IDENTIFIER
    return True
def match_single_episode_pattern(word, token):
    """Match an episode number followed by a version, e.g. "01v2".

    On a match the episode number is set without validation, the version
    digit is stored as ``RELEASE_VERSION`` and ``True`` is returned.
    Returns ``False`` when *word* does not match.
    """
    found = re.match('(\\d{1,3})[vV](\\d)$', word)
    if found is None:
        return False

    episode, version = found.groups()
    set_episode_number(episode, token, validate=False)
    Elements.insert(ElementCategory.RELEASE_VERSION, version)
    return True
def search_for_keywords(self):
    """Look up every unknown token in the keyword manager.

    Each token from ``Tokens.get_list(TokenFlags.UNKNOWN)`` is stripped of
    spaces and dashes and looked up through ``keyword_manager``. Words with
    no keyword are tested as a CRC32 checksum and then as a video
    resolution. Any resulting category other than ``UNKNOWN`` is inserted
    into ``Elements``. Reads ``self.options['parse_release_group']``.
    """
    for token in Tokens.get_list(TokenFlags.UNKNOWN):
        word = token.content
        word = word.strip(' -')

        if not word:
            continue
        # Don't bother if the word is a number that cannot be CRC
        if len(word) != 8 and word.isdigit():
            continue

        category = ElementCategory.UNKNOWN
        keyword = keyword_manager.find(keyword_manager.normalize(word))
        if keyword:
            category = keyword.category
            # Release groups are skipped when the option is disabled
            if not self.options['parse_release_group'] and \
                    category == ElementCategory.RELEASE_GROUP:
                continue
            if not ElementCategory.is_searchable(category) or \
                    not keyword.options.searchable:
                continue
            # A singular category that is already present is not overwritten
            if ElementCategory.is_singular(category) and \
                    Elements.contains(category):
                continue

            # Prefix keywords are delegated to helpers and never inserted
            # as elements here.
            if category == ElementCategory.ANIME_SEASON_PREFIX:
                parser_helper.check_anime_season_keyword(token)
                continue
            elif category == ElementCategory.EPISODE_PREFIX:
                if keyword.options.valid:
                    parser_number.check_extent_keyword(
                        ElementCategory.EPISODE_NUMBER, token)
                continue
            elif category == ElementCategory.RELEASE_VERSION:
                word = word[1:]  # number without "v"
            elif category == ElementCategory.VOLUME_PREFIX:
                parser_number.check_extent_keyword(
                    ElementCategory.VOLUME_NUMBER, token)
                continue
        else:
            # No keyword: try checksum first, then resolution, each only
            # when that element has not been found yet.
            if not Elements.contains(ElementCategory.FILE_CHECKSUM) and \
                    parser_helper.is_crc32(word):
                category = ElementCategory.FILE_CHECKSUM
            elif not Elements.contains(ElementCategory.VIDEO_RESOLUTION) \
                    and parser_helper.is_resolution(word):
                category = ElementCategory.VIDEO_RESOLUTION

        if category != ElementCategory.UNKNOWN:
            Elements.insert(category, word)
            # Non-keyword matches (checksum/resolution) always identify the
            # token; keyword matches only when flagged identifiable.
            if keyword is None or keyword.options.identifiable:
                token.category = TokenCategory.IDENTIFIER
def match_season_and_episode_pattern(word, token):
    """Match combined season/episode words such as "S01E02" or "2x05".

    The pattern accepts an optional second season ("S01-S02E03") and an
    optional second episode ("S01E02-E03"), case-insensitively. Between
    the season and the "E" one separator out of space, ".", "_", "x" or
    "-" is allowed.

    On a match every captured season is inserted as ``ANIME_SEASON``, every
    captured episode is set without validation, and ``True`` is returned.
    Returns ``False`` when *word* does not match.
    """
    # The hyphen is written last inside the character class so that it is
    # a literal "-". Written as "[ ._-x]" it formed the range "_".."x",
    # which accepted letters as separators and rejected "-" itself.
    pattern = 'S?(\\d{1,2})(?:-S?(\\d{1,2}))?' +\
              '(?:x|[ ._x-]?E)(\\d{1,3})(?:-E?(\\d{1,3}))?$'
    match = re.match(pattern, word, flags=re.IGNORECASE)
    if not match:
        return False

    Elements.insert(ElementCategory.ANIME_SEASON, match.group(1))
    if match.group(2):
        Elements.insert(ElementCategory.ANIME_SEASON, match.group(2))
    set_episode_number(match.group(3), token, validate=False)
    if match.group(4):
        set_episode_number(match.group(4), token, validate=False)
    return True
def match_multi_volume_pattern(word, token):
    """Match a volume range such as "01-02" or "1~3v2".

    The range is accepted only when the first number is strictly smaller
    than the second and the first number passes validation. In that case
    both numbers are stored as volume numbers, an optional trailing version
    digit is stored as ``RELEASE_VERSION``, and ``True`` is returned.
    Returns ``False`` in every other case.
    """
    found = re.match('(\\d{1,2})[-~&+](\\d{1,2})(?:[vV](\\d))?$', word)
    if not found:
        return False

    first, last, version = found.groups()
    if int(first) >= int(last):
        return False
    if not set_volume_number(first, token, validate=True):
        return False

    set_volume_number(last, token, validate=False)
    if version:
        Elements.insert(ElementCategory.RELEASE_VERSION, version)
    return True
def match_number_sign_pattern(word, token):
    """Match an episode written with a number sign, e.g. "#01" or "#02-03v2".

    Returns ``False`` immediately for words that do not start with "#",
    including the empty string. On a match the first number is set as the
    episode number with validation. If that succeeds, the optional second
    number is set as well, an optional version digit is stored as
    ``RELEASE_VERSION``, and ``True`` is returned. Returns ``False``
    otherwise.
    """
    # startswith() is safe on an empty word, unlike indexing word[0].
    if not word.startswith('#'):
        return False

    pattern = '#(\\d{1,3})(?:[-~&+](\\d{1,3}))?(?:[vV](\\d))?$'
    match = re.match(pattern, word)
    if match:
        if set_episode_number(match.group(1), token, validate=True):
            if match.group(2):
                set_episode_number(match.group(2), token, validate=True)
            if match.group(3):
                Elements.insert(
                    ElementCategory.RELEASE_VERSION, match.group(3))
            return True

    return False
def match_multi_episode_pattern(word, token):
    """Match an episode range such as "01-02" or "03v2-04v3".

    The range is accepted only when the first number is strictly smaller
    than the second and the first number passes validation. In that case
    both numbers are set as episode numbers, any version digits are stored
    as ``RELEASE_VERSION``, and ``True`` is returned. Returns ``False`` in
    every other case.
    """
    found = re.match(
        '(\\d{1,3})(?:[vV](\\d))?[-~&+](\\d{1,3})(?:[vV](\\d))?$', word)
    if not found:
        return False

    first, first_version, last, last_version = found.groups()

    # Avoid matching expressions such as "009-1" or "5-2"
    if int(first) >= int(last):
        return False
    if not set_episode_number(first, token, validate=True):
        return False

    set_episode_number(last, token, validate=False)
    for version in (first_version, last_version):
        if version:
            Elements.insert(ElementCategory.RELEASE_VERSION, version)
    return True
def peek(string):
    """Find a fixed set of well-known keywords directly in *string*.

    For every listed keyword, the first occurrence found by ``str.find``
    is inserted into ``Elements`` under its category. Returns the
    ``(begin, end)`` positions of all keywords found, sorted ascending.
    """
    entries = [
        (ElementCategory.AUDIO_TERM, ['Dual Audio']),
        (ElementCategory.VIDEO_TERM, ['H264', 'H.264', 'h264', 'h.264']),
        (ElementCategory.VIDEO_RESOLUTION, ['480p', '720p', '1080p']),
        (ElementCategory.SOURCE, ['Blu-Ray'])
    ]

    spans = []
    for category, keywords in entries:
        for keyword in keywords:
            start = string.find(keyword)
            if start == -1:
                continue
            # Found the keyword in the string
            Elements.insert(category, keyword)
            spans.append((start, start + len(keyword)))

    spans.sort()
    return spans
def set_episode_number(number, token, validate):
    """Store *number* as an episode number and mark *token* as an identifier.

    When *validate* is true, a number rejected by
    ``is_valid_episode_number`` returns ``False`` without touching the
    token. When ``Elements.get_check_alt_number()`` is true, *number* is
    compared with the first stored episode number. The larger of the two
    ends up as ``EPISODE_NUMBER_ALT`` and the smaller as
    ``EPISODE_NUMBER``. Equal numbers return ``False``. Returns ``True``
    once the number is stored.
    """
    if validate:
        if not is_valid_episode_number(number):
            return False

    token.category = TokenCategory.IDENTIFIER

    if not Elements.get_check_alt_number():
        Elements.insert(ElementCategory.EPISODE_NUMBER, number)
        return True

    # Handle equivalent numbers
    # TODO: check if getting only the first episode number is enough
    current = Elements.get(ElementCategory.EPISODE_NUMBER)[0]
    if str2int(number) > str2int(current):
        target = ElementCategory.EPISODE_NUMBER_ALT
    elif str2int(number) < str2int(current):
        # The stored number becomes the alternative one
        Elements.remove(ElementCategory.EPISODE_NUMBER, current)
        Elements.insert(ElementCategory.EPISODE_NUMBER_ALT, current)
        target = ElementCategory.EPISODE_NUMBER
    else:
        return False

    Elements.insert(target, number)
    return True
def match_type_and_episode_pattern(word, token):
    """Match an anime-type keyword directly followed by an episode number.

    *word* is split at the index returned by
    ``parser_helper.find_number_in_string``. If the prefix is an
    ``ANIME_TYPE`` keyword it is inserted as such. If the rest is then
    accepted as an episode number, *token* is split into a prefix token
    and a number token and ``True`` is returned. Returns ``False``
    otherwise.
    """
    number_begin = parser_helper.find_number_in_string(word)
    prefix = word[:number_begin]

    keyword = keyword_manager.find(
        keyword_manager.normalize(prefix), ElementCategory.ANIME_TYPE)
    if not keyword:
        return False

    Elements.insert(ElementCategory.ANIME_TYPE, prefix)
    number = word[number_begin:]

    matched = match_episode_patterns(number, token) or \
        set_episode_number(number, token, validate=True)
    if not matched:
        return False

    # Split token (we do this last in order to avoid invalidating our
    # token reference earlier)
    token_index = Tokens.get_index(token)
    token.content = number
    if keyword.options.identifiable:
        prefix_category = TokenCategory.IDENTIFIER
    else:
        prefix_category = TokenCategory.UNKNOWN
    Tokens.insert(
        token_index, Token(prefix_category, prefix, token.enclosed))
    return True
def set_alternative_episode_number(number, token):
    """Store *number* as ``EPISODE_NUMBER_ALT`` and mark *token* identified.

    Always returns ``True``.
    """
    Elements.insert(
        ElementCategory.EPISODE_NUMBER_ALT,
        number,
    )
    token.category = TokenCategory.IDENTIFIER

    return True
def set_anime_season(first, second, content):
    """Store *content* as the anime season and mark both tokens identified.

    *first* and *second* are the two tokens whose category is set to
    ``TokenCategory.IDENTIFIER``.
    """
    Elements.insert(ElementCategory.ANIME_SEASON, content)

    for token in (first, second):
        token.category = TokenCategory.IDENTIFIER