Example #1
0
    def parse(self, data=None):
        """Parse movie name. Populates name, year, quality and proper_count attributes.

        The title is normalised (separators turned into spaces, unwanted words
        removed), split into words, and cut at the first word that looks like
        a year, a release group, a cutoff word or a proper marker. Everything
        before the cut becomes ``self.name``.

        :param data: Raw title to parse. ``self.data`` is used when this is
            empty or omitted.
        """

        # Reset before parsing, so the parser can be reused.
        self.reset()

        if not data:
            data = self.data

        # treat the usual separator characters as plain spaces
        for char in '[]()_,.':
            data = data.replace(char, ' ')

        # if there are no spaces, dashes are the only word separator left
        if ' ' not in data:
            data = data.replace('-', ' ')

        # remove unwanted words (imax, ..)
        # Strings are immutable, so the cleaned text must be taken from the
        # return value; previously it was discarded and nothing was removed.
        # NOTE(review): assumes remove_words returns the cleaned text and
        # accepts not_in_word, as the series parser's call does - confirm.
        data = self.remove_words(data, self.remove, not_in_word=True)

        data = self.strip_spaces(data)

        # split to parts
        parts = data.split(' ')
        year = None
        # index of the first part that ends the movie name, None = no cut
        cut_part = None
        self.quality = 'unknown'
        for index, part in enumerate(parts):
            lowered = part.lower()
            cut = False
            # check for year
            if part.isdigit() and 1930 < int(part) < 2050:
                # the last plausible year found in the title wins
                year = part
                cut = True
            # if length > 3 and whole word in uppers, consider as cut word (most likely a group name)
            if len(part) > 3 and part.isupper() and part.isalpha():
                cut = True
            # check for cutoff words
            if lowered in self.cutoffs:
                cut = True
            # check for propers
            if lowered in self.propers:
                self.proper_count += 1
                cut = True
            # only the earliest cut position matters
            if cut and cut_part is None:
                cut_part = index

        if cut_part is not None:
            log.debug('parts: %s, cut is: %s' % (parts, parts[cut_part]))

        # calculate cut position from cut_part; parts[:None] keeps every part
        abs_cut = len(' '.join(parts[:cut_part]))

        log.debug('after parts check, cut data would be: `%s` abs_cut: %i' % (data[:abs_cut], abs_cut))

        # parse quality
        # Cutting the name at the position of the quality string is disabled
        # (the diff_pos based calculation was commented out), so only the
        # detected quality itself is used here.
        quality, _remaining = qualities.quality_match(data)
        if quality:
            self.quality = quality

        # make cut
        data = data[:abs_cut].strip()
        log.debug('data cut to `%s` - this will be the name' % data)

        # save results
        self.name = data

        if year is not None:
            self.year = int(year)
Example #2
0
    def parse(self, data=None, field=None, quality=qualities.UNKNOWN):
        """Parse a title as an episode of the series ``self.name``.

        On success ``self.valid`` is set to True together with either
        ``season`` / ``episode`` / ``end_episode`` or ``id`` / ``id_groups``.
        ``quality``, ``group``, ``proper_count`` and ``special`` are updated
        along the way. A plain ``return`` without ``valid`` being set means
        the title was rejected.

        :param data: Raw title; replaces ``self.data`` when given.
        :param field: Stored as ``self.field``.
        :param quality: Already known quality. When falsy or
            ``qualities.UNKNOWN`` the quality is detected from the title.
        :raises Exception: if name or data is missing, or if both
            ``expect_ep`` and ``expect_id`` are set.
        :raises ParseWarning: if the series name matched but no episode or id
            numbering could be found.
        """
        # Clear the output variables before parsing
        self._reset()
        self.field = field
        self.quality = quality
        if data:
            self.data = data
        if not self.name or not self.data:
            raise Exception('SeriesParser initialization error, name: %s data: %s' %
                            (repr(self.name), repr(self.data)))

        if self.expect_ep and self.expect_id:
            raise Exception('Flags expect_ep and expect_id are mutually exclusive')

        name = self.remove_dirt(self.name)

        # check if data appears to be unwanted (abort)
        if self.parse_unwanted(self.remove_dirt(self.data)):
            return

        def name_to_re(name):
            """Convert 'foo bar' to '^[^...]*foo[^...]*bar[^...]+"""
            # TODO: Still doesn't handle the case where the user wants
            # "Schmost" and the feed contains "Schmost at Sea".
            blank = r'[\W_]'
            ignore = '(?:' + '|'.join(self.ignore_prefixes) + ')?'
            # accept either '&' or 'and'
            name = name.replace('&', '(?:and|&)')
            res = re.sub(blank + '+', ' ', name)
            res = res.strip()
            # check for 'and' surrounded by spaces so it is not replaced within a word or from above replacement
            res = res.replace(' and ', ' (?:and|&) ')
            res = re.sub(' +', blank + '*', res)
            res = '^' + ignore + blank + '*' + '(' + res + ')' + blank + '+'
            return res

        log.debug('name: %s data: %s' % (name, self.data))

        # position of the series name inside self.data
        name_start = 0
        name_end = 0

        # regexp name matching
        if not self.name_regexps:
            # if we don't have name_regexps, generate one from the name
            self.name_regexps = [name_to_re(name)]
            self.re_from_name = True
        # try all specified regexps on this data
        for name_re in self.name_regexps:
            match = re.search(name_re, self.data)
            if match:
                if self.re_from_name:
                    # generated regexps capture the bare name in group 1
                    name_start, name_end = match.span(1)
                else:
                    name_start, name_end = match.span()

                # NOTE(review): .pattern assumes name_regexps holds compiled
                # patterns even when assigned a list of strings above - confirm.
                log.debug('NAME SUCCESS: %s matched to %s' % (name_re.pattern, self.data))
                break
        else:
            # leave this invalid
            log.debug('FAIL: name regexps %s do not match %s' % ([regexp.pattern for regexp in self.name_regexps],
                                                                 self.data))
            return

        # remove series name from raw data, move any prefix to end of string
        data_stripped = self.data[name_end:] + ' ' + self.data[:name_start]
        data_stripped = data_stripped.lower()
        log.debug('data stripped: %s' % data_stripped)

        # allow group(s)
        if self.allow_groups:
            for group in self.allow_groups:
                group = group.lower()
                for fmt in ['[%s]', '-%s']:
                    if fmt % group in data_stripped:
                        log.debug('%s is from group %s' % (self.data, group))
                        self.group = group
                        data_stripped = data_stripped.replace(fmt % group, '')
                        break
                if self.group:
                    break
            else:
                log.debug('%s is not from groups %s' % (self.data, self.allow_groups))
                return  # leave invalid

        # search tags and quality if one was not provided to parse method
        if not quality or quality == qualities.UNKNOWN:
            log.debug('parsing quality ->')
            quality, remaining = qualities.quality_match(data_stripped)
            self.quality = quality
            if remaining:
                # Remove quality string from data
                log.debug('quality detected, using remaining data `%s`' % remaining)
                data_stripped = remaining

        # Remove unwanted words (qualities and such) from data for ep / id parsing
        # list() keeps the concatenation valid where keys() returns a view
        # instead of a list (Python 3).
        unwanted = self.remove + list(qualities.registry.keys()) + self.codecs + self.sounds
        data_stripped = self.remove_words(data_stripped, unwanted, not_in_word=True)

        data_parts = re.split(r'[\W_]+', data_stripped)

        # count propers and flag specials, keeping every other part in order
        kept_parts = []
        for part in data_parts:
            if part in self.propers:
                self.proper_count += 1
            elif part in self.specials:
                self.special = True
            else:
                kept_parts.append(part)

        data_stripped = ' '.join(kept_parts).strip()

        log.debug("data for id/ep parsing '%s'" % data_stripped)

        ep_match = self.parse_episode(data_stripped)
        if ep_match:
            # strict_name: numbering must directly follow the series name
            if self.strict_name and ep_match['match'].start() > 1:
                return

            if self.expect_id:
                log.debug('found episode number, but expecting id, aborting!')
                return

            end_episode = ep_match['end_episode']
            # The None guard keeps the comparison valid on Python 3.
            # NOTE(review): presumably end_episode is None for single
            # episodes - confirm against parse_episode.
            if end_episode is not None and end_episode > ep_match['episode'] + 2:
                # This is a pack of too many episodes, ignore it.
                log.debug('Series pack contains too many episodes (%d). Rejecting' %
                          (end_episode - ep_match['episode']))
                return

            self.season = ep_match['season']
            self.episode = ep_match['episode']
            self.end_episode = end_episode
            self.valid = True
            return

        log.debug('-> no luck with ep_regexps')

        # search for ids later as last since they contain somewhat broad matches

        if self.expect_ep:
            # we should be getting season, ep !
            # try to look up idiotic numbering scheme 101,102,103,201,202
            # ressu: Added matching for 0101, 0102... It will fail on
            #        season 11 though
            log.debug('expect_ep enabled')
            match = re.search(self.re_not_in_word(r'(0?\d)(\d\d)'), data_stripped, re.IGNORECASE | re.UNICODE)
            if match:
                # strict_name
                if self.strict_name and match.start() > 1:
                    return

                self.season = int(match.group(1))
                self.episode = int(match.group(2))
                log.debug(self)
                self.valid = True
                return
            log.debug('-> no luck with the expect_ep')
        else:
            if self.parse_unwanted_id(data_stripped):
                return
            for id_re in self.id_regexps:
                match = re.search(id_re, data_stripped)
                if match:
                    # strict_name
                    # data_stripped no longer contains the series name, so the
                    # match offset is already relative to the end of the name.
                    # Subtracting name_end (an offset into self.data) made this
                    # check almost never trigger; use the same rule as the
                    # episode branches above.
                    if self.strict_name and match.start() > 1:
                        return
                    if 'version' in match.groupdict():
                        if match.group('version'):
                            # version N counts as N - 1 propers
                            self.proper_count = int(match.group('version')) - 1
                        self.id = match.group(1)
                    else:
                        self.id = '-'.join(match.groups())
                    self.id_groups = match.groups()
                    if self.special:
                        self.id += '-SPECIAL'
                    self.valid = True
                    log.debug('found id \'%s\' with regexp \'%s\'' % (self.id, id_re.pattern))
                    return
            log.debug('-> no luck with id_regexps')

        # No id found, check if this is a special
        if self.special:
            # Attempt to set id as the title of the special
            self.id = data_stripped
            self.valid = True
            log.debug('found special, setting id to \'%s\'' % self.id)
            return

        raise ParseWarning('Title \'%s\' looks like series \'%s\' but I cannot find any episode or id numbering' % (self.data, self.name))
Example #3
0
    def parse(self, data=None):
        """Parse movie name. Populates name, year, quality and proper_count attributes.

        The title is normalised (separators turned into spaces, unwanted words
        removed), split into words, and cut at the first word that looks like
        a year, a release group, a cutoff word or a proper marker. Everything
        before the cut becomes ``self.name``.

        :param data: Raw title to parse. ``self.data`` is used when this is
            empty or omitted.
        """

        # Reset before parsing, so the parser can be reused.
        self.reset()

        if not data:
            data = self.data

        # treat the usual separator characters as plain spaces
        for char in '[]()_,.':
            data = data.replace(char, ' ')

        # if there are no spaces, dashes are the only word separator left
        if ' ' not in data:
            data = data.replace('-', ' ')

        # remove unwanted words (imax, ..)
        # Strings are immutable, so the cleaned text must be taken from the
        # return value; previously it was discarded and nothing was removed.
        # NOTE(review): assumes remove_words returns the cleaned text and
        # accepts not_in_word, as the series parser's call does - confirm.
        data = self.remove_words(data, self.remove, not_in_word=True)

        data = self.strip_spaces(data)

        # split to parts
        parts = data.split(' ')
        year = None
        # index of the first part that ends the movie name, None = no cut
        cut_part = None
        self.quality = 'unknown'
        for index, part in enumerate(parts):
            lowered = part.lower()
            cut = False
            # check for year
            if part.isdigit() and 1930 < int(part) < 2050:
                # the last plausible year found in the title wins
                year = part
                cut = True
            # if length > 3 and whole word in uppers, consider as cut word (most likely a group name)
            if len(part) > 3 and part.isupper() and part.isalpha():
                cut = True
            # check for cutoff words
            if lowered in self.cutoffs:
                cut = True
            # check for propers
            if lowered in self.propers:
                self.proper_count += 1
                cut = True
            # only the earliest cut position matters
            if cut and cut_part is None:
                cut_part = index

        if cut_part is not None:
            log.debug('parts: %s, cut is: %s' % (parts, parts[cut_part]))

        # calculate cut position from cut_part; parts[:None] keeps every part
        abs_cut = len(' '.join(parts[:cut_part]))

        log.debug('after parts check, cut data would be: `%s` abs_cut: %i' % (data[:abs_cut], abs_cut))

        # parse quality
        # Cutting the name at the position of the quality string is disabled
        # (the diff_pos based calculation was commented out), so only the
        # detected quality itself is used here.
        quality, _remaining = qualities.quality_match(data)
        if quality:
            self.quality = quality

        # make cut
        data = data[:abs_cut].strip()
        log.debug('data cut to `%s` - this will be the name' % data)

        # save results
        self.name = data

        if year is not None:
            self.year = int(year)
    def parse(self, data=None, field=None, quality=qualities.UNKNOWN):
        """Parse a title as an episode of the series ``self.name``.

        On success ``self.valid`` is set to True together with either
        ``season`` / ``episode`` / ``end_episode`` or ``id`` / ``id_groups``.
        ``quality``, ``group``, ``proper_count`` and ``special`` are updated
        along the way. A plain ``return`` without ``valid`` being set means
        the title was rejected.

        :param data: Raw title; replaces ``self.data`` when given.
        :param field: Stored as ``self.field``.
        :param quality: Already known quality. When falsy or
            ``qualities.UNKNOWN`` the quality is detected from the title.
        :raises Exception: if name or data is missing, or if both
            ``expect_ep`` and ``expect_id`` are set.
        :raises ParseWarning: if the series name matched but no episode or id
            numbering could be found.
        """
        # Clear the output variables before parsing
        self._reset()
        self.field = field
        self.quality = quality
        if data:
            self.data = data
        if not self.name or not self.data:
            raise Exception('SeriesParser initialization error, name: %s data: %s' %
                            (repr(self.name), repr(self.data)))

        if self.expect_ep and self.expect_id:
            raise Exception(
                'Flags expect_ep and expect_id are mutually exclusive')

        name = self.remove_dirt(self.name)

        # check if data appears to be unwanted (abort)
        if self.parse_unwanted(self.remove_dirt(self.data)):
            return

        def name_to_re(name):
            """Convert 'foo bar' to '^[^...]*foo[^...]*bar[^...]+"""
            # TODO: Still doesn't handle the case where the user wants
            # "Schmost" and the feed contains "Schmost at Sea".
            blank = r'[\W_]'
            ignore = '(?:' + '|'.join(self.ignore_prefixes) + ')?'
            # accept either '&' or 'and'
            name = name.replace('&', '(?:and|&)')
            res = re.sub(blank + '+', ' ', name)
            res = res.strip()
            # check for 'and' surrounded by spaces so it is not replaced within a word or from above replacement
            res = res.replace(' and ', ' (?:and|&) ')
            res = re.sub(' +', blank + '*', res)
            res = '^' + ignore + blank + '*' + '(' + res + ')' + blank + '+'
            return res

        log.debug('name: %s data: %s' % (name, self.data))

        # position of the series name inside self.data
        name_start = 0
        name_end = 0

        # regexp name matching
        if not self.name_regexps:
            # if we don't have name_regexps, generate one from the name
            self.name_regexps = [name_to_re(name)]
            self.re_from_name = True
        # try all specified regexps on this data
        for name_re in self.name_regexps:
            match = re.search(name_re, self.data)
            if match:
                if self.re_from_name:
                    # generated regexps capture the bare name in group 1
                    name_start, name_end = match.span(1)
                else:
                    name_start, name_end = match.span()

                # NOTE(review): .pattern assumes name_regexps holds compiled
                # patterns even when assigned a list of strings above - confirm.
                log.debug('NAME SUCCESS: %s matched to %s' %
                          (name_re.pattern, self.data))
                break
        else:
            # leave this invalid
            log.debug('FAIL: name regexps %s do not match %s' %
                      ([regexp.pattern
                        for regexp in self.name_regexps], self.data))
            return

        # remove series name from raw data, move any prefix to end of string
        data_stripped = self.data[name_end:] + ' ' + self.data[:name_start]
        data_stripped = data_stripped.lower()
        log.debug('data stripped: %s' % data_stripped)

        # allow group(s)
        if self.allow_groups:
            for group in self.allow_groups:
                group = group.lower()
                for fmt in ['[%s]', '-%s']:
                    if fmt % group in data_stripped:
                        log.debug('%s is from group %s' % (self.data, group))
                        self.group = group
                        data_stripped = data_stripped.replace(fmt % group, '')
                        break
                if self.group:
                    break
            else:
                log.debug('%s is not from groups %s' %
                          (self.data, self.allow_groups))
                return  # leave invalid

        # search tags and quality if one was not provided to parse method
        if not quality or quality == qualities.UNKNOWN:
            log.debug('parsing quality ->')
            quality, remaining = qualities.quality_match(data_stripped)
            self.quality = quality
            if remaining:
                # Remove quality string from data
                log.debug('quality detected, using remaining data `%s`' %
                          remaining)
                data_stripped = remaining

        # Remove unwanted words (qualities and such) from data for ep / id parsing
        # list() keeps the concatenation valid where keys() returns a view
        # instead of a list (Python 3).
        unwanted = (self.remove + list(qualities.registry.keys()) +
                    self.codecs + self.sounds)
        data_stripped = self.remove_words(
            data_stripped, unwanted, not_in_word=True)

        data_parts = re.split(r'[\W_]+', data_stripped)

        # count propers and flag specials, keeping every other part in order
        kept_parts = []
        for part in data_parts:
            if part in self.propers:
                self.proper_count += 1
            elif part in self.specials:
                self.special = True
            else:
                kept_parts.append(part)

        data_stripped = ' '.join(kept_parts).strip()

        log.debug("data for id/ep parsing '%s'" % data_stripped)

        ep_match = self.parse_episode(data_stripped)
        if ep_match:
            # strict_name: numbering must directly follow the series name
            if self.strict_name and ep_match['match'].start() > 1:
                return

            if self.expect_id:
                log.debug('found episode number, but expecting id, aborting!')
                return

            end_episode = ep_match['end_episode']
            # The None guard keeps the comparison valid on Python 3.
            # NOTE(review): presumably end_episode is None for single
            # episodes - confirm against parse_episode.
            if (end_episode is not None and
                    end_episode > ep_match['episode'] + 2):
                # This is a pack of too many episodes, ignore it.
                log.debug(
                    'Series pack contains too many episodes (%d). Rejecting' %
                    (end_episode - ep_match['episode']))
                return

            self.season = ep_match['season']
            self.episode = ep_match['episode']
            self.end_episode = end_episode
            self.valid = True
            return

        log.debug('-> no luck with ep_regexps')

        # search for ids later as last since they contain somewhat broad matches

        if self.expect_ep:
            # we should be getting season, ep !
            # try to look up idiotic numbering scheme 101,102,103,201,202
            # ressu: Added matching for 0101, 0102... It will fail on
            #        season 11 though
            log.debug('expect_ep enabled')
            match = re.search(self.re_not_in_word(r'(0?\d)(\d\d)'),
                              data_stripped, re.IGNORECASE | re.UNICODE)
            if match:
                # strict_name
                if self.strict_name and match.start() > 1:
                    return

                self.season = int(match.group(1))
                self.episode = int(match.group(2))
                log.debug(self)
                self.valid = True
                return
            log.debug('-> no luck with the expect_ep')
        else:
            if self.parse_unwanted_id(data_stripped):
                return
            for id_re in self.id_regexps:
                match = re.search(id_re, data_stripped)
                if match:
                    # strict_name
                    # data_stripped no longer contains the series name, so the
                    # match offset is already relative to the end of the name.
                    # Subtracting name_end (an offset into self.data) made this
                    # check almost never trigger; use the same rule as the
                    # episode branches above.
                    if self.strict_name and match.start() > 1:
                        return
                    if 'version' in match.groupdict():
                        if match.group('version'):
                            # version N counts as N - 1 propers
                            self.proper_count = int(match.group('version')) - 1
                        self.id = match.group(1)
                    else:
                        self.id = '-'.join(match.groups())
                    self.id_groups = match.groups()
                    if self.special:
                        self.id += '-SPECIAL'
                    self.valid = True
                    log.debug('found id \'%s\' with regexp \'%s\'' %
                              (self.id, id_re.pattern))
                    return
            log.debug('-> no luck with id_regexps')

        # No id found, check if this is a special
        if self.special:
            # Attempt to set id as the title of the special
            self.id = data_stripped
            self.valid = True
            log.debug('found special, setting id to \'%s\'' % self.id)
            return

        raise ParseWarning(
            'Title \'%s\' looks like series \'%s\' but I cannot find any episode or id numbering'
            % (self.data, self.name))