def parse(self, data=None): """Parse movie name. Populates name, year, quality and proper_count attributes""" # Reset before parsing, so the parser can be reused. self.reset() if not data: data = self.data for char in '[]()_,.': data = data.replace(char, ' ') # if there are no spaces if data.find(' ') == -1: data = data.replace('-', ' ') # remove unwanted words (imax, ..) self.remove_words(data, self.remove) data = self.strip_spaces(data) # split to parts parts = data.split(' ') year = None cut_part = 256 self.quality = 'unknown' for part in parts: cut = False # check for year if part.isdigit(): num = int(part) if num > 1930 and num < 2050: year = part cut = True # if length > 3 and whole word in uppers, consider as cut word (most likely a group name) if len(part) > 3 and part.isupper() and part.isalpha(): cut = True # check for cutoff words if part.lower() in self.cutoffs: cut = True # check for propers if part.lower() in self.propers: self.proper_count += 1 cut = True # update cut position if cut and parts.index(part) < cut_part: cut_part = parts.index(part) if cut_part != 256: log.debug('parts: %s, cut is: %s' % (parts, parts[cut_part])) # calculate cut positon from cut_part abs_cut = len(' '.join(parts[:cut_part])) log.debug('after parts check, cut data would be: `%s` abs_cut: %i' % (data[:abs_cut], abs_cut)) # parse quality quality, remaining = qualities.quality_match(data) if quality: self.quality = quality # remaining string is same as data but quality information removed # find out position where there is first difference, this is earlies # quality bit, anything after that has no relevance to the movie name #dp = diff_pos(data, remaining) dp = None if dp is not None: log.debug('quality start: %s' % dp) if dp < abs_cut: log.debug('quality cut is even shorter') abs_cut = dp # make cut data = data[:abs_cut].strip() log.debug('data cut to `%s` - this will be the name' % data) # save results self.name = data if year: if year.isdigit(): self.year = int(year)
def parse(self, data=None, field=None, quality=qualities.UNKNOWN):
    # Clear the output variables before parsing
    self._reset()
    self.field = field
    self.quality = quality
    if data:
        self.data = data
    if not self.name or not self.data:
        raise Exception('SeriesParser initialization error, name: %s data: %s' %
                        (repr(self.name), repr(self.data)))

    if self.expect_ep and self.expect_id:
        raise Exception('Flags expect_ep and expect_id are mutually exclusive')

    name = self.remove_dirt(self.name)

    # check if data appears to be unwanted (abort)
    if self.parse_unwanted(self.remove_dirt(self.data)):
        return

    def name_to_re(name):
        """Convert 'foo bar' to '^[^...]*foo[^...]*bar[^...]+'"""
        # TODO: Still doesn't handle the case where the user wants
        # "Schmost" and the feed contains "Schmost at Sea".
        blank = r'[\W_]'
        ignore = '(?:' + '|'.join(self.ignore_prefixes) + ')?'
        # accept either '&' or 'and'
        name = name.replace('&', '(?:and|&)')
        res = re.sub(blank + '+', ' ', name)
        res = res.strip()
        # check for 'and' surrounded by spaces so it is not replaced within a word or from the replacement above
        res = res.replace(' and ', ' (?:and|&) ')
        res = re.sub(' +', blank + '*', res)
        res = '^' + ignore + blank + '*' + '(' + res + ')' + blank + '+'
        return res

    log.debug('name: %s data: %s' % (name, self.data))

    # name end position
    name_start = 0
    name_end = 0

    # regexp name matching
    if not self.name_regexps:
        # if we don't have name_regexps, generate one from the name
        self.name_regexps = [name_to_re(name)]
        self.re_from_name = True
    # try all specified regexps on this data
    for name_re in self.name_regexps:
        match = re.search(name_re, self.data)
        if match:
            if self.re_from_name:
                name_start, name_end = match.span(1)
            else:
                name_start, name_end = match.span()
            log.debug('NAME SUCCESS: %s matched to %s' % (name_re.pattern, self.data))
            break
    else:
        # leave this invalid
        log.debug('FAIL: name regexps %s do not match %s' %
                  ([regexp.pattern for regexp in self.name_regexps], self.data))
        return

    # remove series name from raw data, move any prefix to end of string
    data_stripped = self.data[name_end:] + ' ' + self.data[:name_start]
    data_stripped = data_stripped.lower()
    log.debug('data stripped: %s' % data_stripped)

    # allow group(s)
    if self.allow_groups:
        for group in self.allow_groups:
            group = group.lower()
            for fmt in ['[%s]', '-%s']:
                if fmt % group in data_stripped:
                    log.debug('%s is from group %s' % (self.data, group))
                    self.group = group
                    data_stripped = data_stripped.replace(fmt % group, '')
                    break
            if self.group:
                break
        else:
            log.debug('%s is not from groups %s' % (self.data, self.allow_groups))
            return  # leave invalid

    # search tags and quality if one was not provided to the parse method
    if not quality or quality == qualities.UNKNOWN:
        log.debug('parsing quality ->')
        quality, remaining = qualities.quality_match(data_stripped)
        self.quality = quality
        if remaining:
            # Remove quality string from data
            log.debug('quality detected, using remaining data `%s`' % remaining)
            data_stripped = remaining

    # Remove unwanted words (qualities and such) from data for ep / id parsing
    data_stripped = self.remove_words(data_stripped,
                                      self.remove + qualities.registry.keys() + self.codecs + self.sounds,
                                      not_in_word=True)

    data_parts = re.split(r'[\W_]+', data_stripped)

    for part in data_parts[:]:
        if part in self.propers:
            self.proper_count += 1
            data_parts.remove(part)
        elif part in self.specials:
            self.special = True
            data_parts.remove(part)

    data_stripped = ' '.join(data_parts).strip()

    log.debug("data for id/ep parsing '%s'" % data_stripped)

    ep_match = self.parse_episode(data_stripped)
    if ep_match:
        # strict_name
        if self.strict_name:
            if ep_match['match'].start() > 1:
                return

        if self.expect_id:
            log.debug('found episode number, but expecting id, aborting!')
            return

        if ep_match['end_episode'] > ep_match['episode'] + 2:
            # This is a pack of too many episodes, ignore it.
            log.debug('Series pack contains too many episodes (%d). Rejecting' %
                      (ep_match['end_episode'] - ep_match['episode']))
            return

        self.season = ep_match['season']
        self.episode = ep_match['episode']
        self.end_episode = ep_match['end_episode']
        self.valid = True
        return

    log.debug('-> no luck with ep_regexps')

    # search for ids last since their regexps contain somewhat broad matches
    if self.expect_ep:
        # we should be getting season, ep !
        # try to look up the idiotic numbering scheme 101, 102, 103, 201, 202
        # ressu: Added matching for 0101, 0102... It will fail on
        #        season 11 though
        log.debug('expect_ep enabled')
        match = re.search(self.re_not_in_word(r'(0?\d)(\d\d)'), data_stripped,
                          re.IGNORECASE | re.UNICODE)
        if match:
            # strict_name
            if self.strict_name:
                if match.start() > 1:
                    return

            self.season = int(match.group(1))
            self.episode = int(match.group(2))
            log.debug(self)
            self.valid = True
            return
        log.debug('-> no luck with the expect_ep')
    else:
        if self.parse_unwanted_id(data_stripped):
            return
        for id_re in self.id_regexps:
            match = re.search(id_re, data_stripped)
            if match:
                # strict_name
                if self.strict_name:
                    if match.start() - name_end >= 2:
                        return
                if 'version' in match.groupdict():
                    if match.group('version'):
                        self.proper_count = int(match.group('version')) - 1
                    self.id = match.group(1)
                else:
                    self.id = '-'.join(match.groups())
                self.id_groups = match.groups()
                if self.special:
                    self.id += '-SPECIAL'
                self.valid = True
                log.debug('found id \'%s\' with regexp \'%s\'' % (self.id, id_re.pattern))
                return
        log.debug('-> no luck with id_regexps')

    # No id found, check if this is a special
    if self.special:
        # Attempt to set the id to the title of the special
        self.id = data_stripped
        self.valid = True
        log.debug('found special, setting id to \'%s\'' % self.id)
        return

    raise ParseWarning('Title \'%s\' looks like series \'%s\' but I cannot find any episode or id numbering' %
                       (self.data, self.name))
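# Hypothetical usage sketch (not from this module): roughly how the series parse()
# above might be exercised, assuming a SeriesParser-style class that carries the
# expected series name and provides the helpers referenced above (remove_dirt,
# parse_unwanted, parse_episode, re_not_in_word, ...). The class name, constructor
# signature and sample title are illustrative assumptions.
#
#     parser = SeriesParser(name='Some Show')
#     parser.parse('Some.Show.S02E05.720p.HDTV.x264-GRP')
#     if parser.valid:
#         # Roughly expected: season 2, episode 5, with quality picked up
#         # by qualities.quality_match()
#         print(parser.season, parser.episode, parser.quality)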