Example 1
    def parse(self, data=None, field=None, quality=qualities.UNKNOWN):
        # Clear the output variables before parsing
        self._reset()
        self.field = field
        self.quality = quality
        if data:
            self.data = data
        if not self.name or not self.data:
            raise Exception('SeriesParser initialization error, name: %s data: %s' % \
               (repr(self.name), repr(self.data)))

        name = self.remove_dirt(self.name)

        # check if data appears to be unwanted (abort)
        if self.parse_unwanted(self.remove_dirt(self.data)):
            return

        log.debug('name: %s data: %s' % (name, self.data))

        # name end position
        name_start = 0
        name_end = 0

        # regexp name matching
        if not self.name_regexps:
            # if we don't have name_regexps, generate one from the name
            self.name_regexps = ReList([self.name_to_re(self.name)])
            self.re_from_name = True
        # try all specified regexps on this data
        for name_re in self.name_regexps:
            match = re.search(name_re, self.data)
            if match:
                if self.re_from_name:
                    name_start, name_end = match.span(1)
                else:
                    name_start, name_end = match.span()

                log.debug('NAME SUCCESS: %s matched to %s' % (name_re.pattern, self.data))
                break
        else:
            # leave this invalid
            log.debug('FAIL: name regexps %s do not match %s' % ([regexp.pattern for regexp in self.name_regexps],
                                                                 self.data))
            return

        # remove series name from raw data, move any prefix to end of string
        data_stripped = self.data[name_end:] + ' ' + self.data[:name_start]
        data_stripped = data_stripped.lower()
        log.debug('data stripped: %s' % data_stripped)

        # allow group(s)
        if self.allow_groups:
            for group in self.allow_groups:
                group = group.lower()
                for fmt in ['[%s]', '-%s']:
                    if fmt % group in data_stripped:
                        log.debug('%s is from group %s' % (self.data, group))
                        self.group = group
                        data_stripped = data_stripped.replace(fmt % group, '')
                        break
                if self.group:
                    break
            else:
                log.debug('%s is not from groups %s' % (self.data, self.allow_groups))
                return # leave invalid

        # search tags and quality if one was not provided to parse method
        if not quality or quality == qualities.UNKNOWN:
            log.debug('parsing quality ->')
            quality, remaining = qualities.quality_match(data_stripped)
            self.quality = quality
            if remaining:
                # Remove quality string from data
                log.debug('quality detected, using remaining data `%s`' % remaining)
                data_stripped = remaining

        # Remove unwanted words (qualities and such) from data for ep / id parsing
        data_stripped = self.remove_words(data_stripped, self.remove + qualities.registry.keys() +
                                                         self.codecs + self.sounds, not_in_word=True)

        data_parts = re.split(r'[\W_]+', data_stripped)

        for part in data_parts[:]:
            if part in self.propers:
                self.proper_count += 1
                data_parts.remove(part)
            elif part in self.specials:
                self.special = True
                data_parts.remove(part)

        data_stripped = ' '.join(data_parts).strip()

        log.debug("data for id/ep parsing '%s'" % data_stripped)

        if self.identified_by in ['ep', 'auto']:
            ep_match = self.parse_episode(data_stripped)
            if ep_match:
                # strict_name
                if self.strict_name:
                    if ep_match['match'].start() > 1:
                        return

                if ep_match['end_episode'] > ep_match['episode'] + 2:
                    # This is a pack of too many episodes, ignore it.
                    log.debug('Series pack contains too many episodes (%d). Rejecting' %
                              (ep_match['end_episode'] - ep_match['episode']))
                    return

                self.season = ep_match['season']
                self.episode = ep_match['episode']
                self.end_episode = ep_match['end_episode']
                self.id_type = 'ep'
                self.valid = True
                return

            log.debug('-> no luck with ep_regexps')

            if self.identified_by == 'ep':
                # we should be getting season, ep !
                # try to look up idiotic numbering scheme 101,102,103,201,202
                # ressu: Added matching for 0101, 0102... It will fail on
                #        season 11 though
                log.debug('expect_ep enabled')
                match = re.search(self.re_not_in_word(r'(0?\d)(\d\d)'), data_stripped, re.IGNORECASE | re.UNICODE)
                if match:
                    # strict_name
                    if self.strict_name:
                        if match.start() > 1:
                            return

                    self.season = int(match.group(1))
                    self.episode = int(match.group(2))
                    log.debug(self)
                    self.id_type = 'ep'
                    self.valid = True
                    return
                log.debug('-> no luck with the expect_ep')

        # Ep mode is done, check for unwanted ids
        if self.parse_unwanted_id(data_stripped):
            return

        # Try date mode after ep mode
        if self.identified_by in ['date', 'auto']:
            for date_re in self.date_regexps:
                match = re.search(date_re, data_stripped)
                if match:
                    # Check if this is a valid date
                    possdates = []

                    try:
                        # By default dayfirst and yearfirst will be tried as both True and False
                        # if either have been defined manually, restrict that option
                        dayfirst_opts = [True, False]
                        if self.date_dayfirst is not None:
                            dayfirst_opts = [self.date_dayfirst]
                        yearfirst_opts = [True, False]
                        if self.date_yearfirst is not None:
                            yearfirst_opts = [self.date_yearfirst]
                        kwargs_list = ({'dayfirst': d, 'yearfirst': y} for d in dayfirst_opts for y in yearfirst_opts)
                        for kwargs in kwargs_list:
                            possdate = parsedate(match.group(0), **kwargs)
                            # Don't accept dates farther than a day in the future
                            if possdate > datetime.now() + timedelta(days=1):
                                continue
                            if possdate not in possdates:
                                possdates.append(possdate)
                    except ValueError:
                        log.debug('%s is not a valid date, skipping' % match.group(0))
                        continue
                    if not possdates:
                        log.debug('All possible dates for %s were in the future' % match.group(0))
                        continue
                    possdates.sort()
                    # Pick the most recent date if there are ambiguities
                    bestdate = possdates[-1]

                    # strict_name
                    if self.strict_name:
                        if match.start() - name_end >= 2:
                            return
                    self.id = bestdate
                    self.id_groups = match.groups()
                    self.id_type = 'date'
                    self.valid = True
                    log.debug('found id \'%s\' with regexp \'%s\'' % (self.id, date_re.pattern))
                    return
            log.debug('-> no luck with date_regexps')

        # Check id regexps
        if self.identified_by in ['id', 'auto']:
            for id_re in self.id_regexps:
                match = re.search(id_re, data_stripped)
                if match:
                    # strict_name
                    if self.strict_name:
                        if match.start() - name_end >= 2:
                            return
                    self.id = '-'.join(match.groups())
                    self.id_type = 'id'
                    self.valid = True
                    log.debug('found id \'%s\' with regexp \'%s\'' % (self.id, id_re.pattern))
                    return
            log.debug('-> no luck with id_regexps')

        # Check sequences last as they contain the broadest matches
        if self.identified_by in ['sequence', 'auto']:
            for sequence_re in self.sequence_regexps:
                match = re.search(sequence_re, data_stripped)
                if match:
                    # strict_name
                    if self.strict_name:
                        if match.start() - name_end >= 2:
                            return
                    # First matching group is the sequence number
                    try:
                        self.id = int(match.group(1))
                    except ValueError:
                        self.id = self.roman_to_int(match.group(1))
                    self.season = 0
                    self.episode = self.id
                    # If anime style version was found, overwrite the proper count with it
                    if 'version' in match.groupdict():
                        if match.group('version'):
                            self.proper_count = int(match.group('version')) - 1
                    self.id_type = 'sequence'
                    self.valid = True
                    log.debug('found id \'%s\' with regexp \'%s\'' % (self.id, sequence_re.pattern))
                    return
            log.debug('-> no luck with sequence_regexps')

        # No id found, check if this is a special
        if self.special:
            # Attempt to set id as the title of the special
            self.id = data_stripped
            self.id_type = 'special'
            self.valid = True
            log.debug('found special, setting id to \'%s\'' % self.id)
            return

        raise ParseWarning('Title \'%s\' looks like series \'%s\' but I cannot find any episode or id numbering' % (self.data, self.name))
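
The name-matching step above leans on Python's for/else: the else branch runs only when the loop finishes without a break, i.e. when none of the name regexps matched and the parser is left invalid. Below is a minimal stand-alone sketch of that idiom; the pattern and the title are made up for illustration, whereas the real parser builds its patterns with name_to_re() or takes them from configuration.

import re

# Hypothetical single-pattern list standing in for self.name_regexps.
name_regexps = [re.compile(r'\bsome\W+show\b', re.IGNORECASE)]
data = 'Some.Show.S01E02.720p-GROUP'

for name_re in name_regexps:
    match = name_re.search(data)
    if match:
        name_start, name_end = match.span()
        print('NAME SUCCESS: %r at %d-%d' % (match.group(0), name_start, name_end))
        break
else:
    # No break happened: every regexp failed, so the parse is abandoned.
    print('FAIL: no name regexp matched')
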
Example 2
    def parse(self, data=None):
        """Parse movie name. Populates name, year, quality and proper_count attributes"""

        # Reset before parsing, so the parser can be reused.
        self.reset()

        if data is None:
            data = self.data

        for char in '[]()_,.':
            data = data.replace(char, ' ')

        # if there are no spaces
        if data.find(' ') == -1:
            data = data.replace('-', ' ')

        # remove unwanted words (imax, ..)
        data = self.remove_words(data, self.remove)

        data = self.strip_spaces(data)

        # split to parts
        parts = data.split(' ')
        year = None
        cut_part = 256  # sentinel meaning "no cut word found yet"
        for part in parts:
            cut = False
            # check for year
            if part.isdigit():
                num = int(part)
                if num > 1930 and num < 2050:
                    year = part
                    cut = True
            # if length > 3 and the whole word is uppercase, consider it a cut word (most likely a group name)
            if len(part) > 3 and part.isupper() and part.isalpha():
                cut = True
            # check for cutoff words
            if part.lower() in self.cutoffs:
                cut = True
            # check for propers
            if part.lower() in self.propers:
                self.proper_count += 1
                cut = True
            # update cut position
            if cut and parts.index(part) < cut_part:
                cut_part = parts.index(part)

        if cut_part != 256:
            log.debug('parts: %s, cut is: %s' % (parts, parts[cut_part]))

        # calculate cut position from cut_part
        abs_cut = len(' '.join(parts[:cut_part]))

        log.debug('after parts check, cut data would be: `%s` abs_cut: %i' % (data[:abs_cut], abs_cut))

        # parse quality
        quality, remaining = qualities.quality_match(data)
        if quality:
            self.quality = quality
            # remaining string is same as data but quality information removed
            # find out position where there is first difference, this is earliest
            # quality bit, anything after that has no relevance to the movie name
            dp = diff_pos(data, remaining)
            if dp is not None:
                log.debug('quality start: %s' % dp)
                if dp < abs_cut:
                    log.debug('quality cut is even shorter')
                    abs_cut = dp

        # make cut
        data = data[:abs_cut].strip()
        log.debug('data cut to `%s` - this will be the name' % data)

        # save results
        self.name = data

        if year:
            if year.isdigit():
                self.year = int(year)
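
The movie parser above tightens the cut position with diff_pos(data, remaining): the index of the first character where the quality-stripped string differs from the original, on the assumption that everything from that point on belongs to quality or release tags rather than the movie name. A rough stand-alone sketch of such a helper follows; the actual FlexGet utility may be implemented differently.

def diff_pos(first, second):
    """Return the index of the first position where the strings differ,
    or None if they are identical (a sketch; the real helper may differ)."""
    for i, (a, b) in enumerate(zip(first, second)):
        if a != b:
            return i
    if len(first) != len(second):
        return min(len(first), len(second))
    return None

# The cut moves to index 11, just before '720p', trimming the name there.
print(diff_pos('Movie Name 720p x264 2008', 'Movie Name x264 2008'))  # 11
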
Example 3
    def parse(self, data=None, field=None, quality=qualities.UNKNOWN):
        # Clear the output variables before parsing
        self._reset()
        self.field = field
        self.quality = quality
        if data:
            self.data = data
        if not self.name or not self.data:
            raise Exception('SeriesParser initialization error, name: %s data: %s' % \
               (repr(self.name), repr(self.data)))

        if self.expect_ep and self.expect_id:
            raise Exception('Flags expect_ep and expect_id are mutually exclusive')

        name = self.remove_dirt(self.name)

        # check if data appears to be unwanted (abort)
        if self.parse_unwanted(self.remove_dirt(self.data)):
            return

        log.debug('name: %s data: %s' % (name, self.data))

        # name end position
        name_start = 0
        name_end = 0

        # regexp name matching
        if not self.name_regexps:
            # if we don't have name_regexps, generate one from the name
            self.name_regexps = ReList([self.name_to_re(name)])
            self.re_from_name = True
        # try all specified regexps on this data
        for name_re in self.name_regexps:
            match = re.search(name_re, self.data)
            if match:
                if self.re_from_name:
                    name_start, name_end = match.span(1)
                else:
                    name_start, name_end = match.span()

                log.debug('NAME SUCCESS: %s matched to %s' % (name_re.pattern, self.data))
                break
        else:
            # leave this invalid
            log.debug('FAIL: name regexps %s do not match %s' % ([regexp.pattern for regexp in self.name_regexps],
                                                                 self.data))
            return


        # remove series name from raw data, move any prefix to end of string
        data_stripped = self.data[name_end:] + ' ' + self.data[:name_start]
        data_stripped = data_stripped.lower()
        log.debug('data stripped: %s' % data_stripped)

        # allow group(s)
        if self.allow_groups:
            for group in self.allow_groups:
                group = group.lower()
                for fmt in ['[%s]', '-%s']:
                    if fmt % group in data_stripped:
                        log.debug('%s is from group %s' % (self.data, group))
                        self.group = group
                        data_stripped = data_stripped.replace(fmt % group, '')
                        break
                if self.group:
                    break
            else:
                log.debug('%s is not from groups %s' % (self.data, self.allow_groups))
                return # leave invalid

        # search tags and quality if one was not provided to parse method
        if not quality or quality == qualities.UNKNOWN:
            log.debug('parsing quality ->')
            quality, remaining = qualities.quality_match(data_stripped)
            self.quality = quality
            if remaining:
                # Remove quality string from data
                log.debug('quality detected, using remaining data `%s`' % remaining)
                data_stripped = remaining

        # Remove unwanted words (qualities and such) from data for ep / id parsing
        data_stripped = self.remove_words(data_stripped, self.remove + qualities.registry.keys() +
                                                         self.codecs + self.sounds, not_in_word=True)


        data_parts = re.split(r'[\W_]+', data_stripped)

        for part in data_parts[:]:
            if part in self.propers:
                self.proper_count += 1
                data_parts.remove(part)
            elif part in self.specials:
                self.special = True
                data_parts.remove(part)

        data_stripped = ' '.join(data_parts).strip()

        log.debug("data for id/ep parsing '%s'" % data_stripped)

        ep_match = self.parse_episode(data_stripped)
        if ep_match:
            # strict_name
            if self.strict_name:
                if ep_match['match'].start() > 1:
                    return

            if self.expect_id:
                log.debug('found episode number, but expecting id, aborting!')
                return

            if ep_match['end_episode'] > ep_match['episode'] + 2:
                # This is a pack of too many episodes, ignore it.
                log.debug('Series pack contains too many episodes (%d). Rejecting' %
                          (ep_match['end_episode'] - ep_match['episode']))
                return

            self.season = ep_match['season']
            self.episode = ep_match['episode']
            self.end_episode = ep_match['end_episode']
            self.valid = True
            return

        log.debug('-> no luck with ep_regexps')

        # search for ids later as last since they contain somewhat broad matches

        if self.expect_ep:
            # we should be getting season, ep !
            # try to look up idiotic numbering scheme 101,102,103,201,202
            # ressu: Added matching for 0101, 0102... It will fail on
            #        season 11 though
            log.debug('expect_ep enabled')
            match = re.search(self.re_not_in_word(r'(0?\d)(\d\d)'), data_stripped, re.IGNORECASE | re.UNICODE)
            if match:
                # strict_name
                if self.strict_name:
                    if match.start() > 1:
                        return

                self.season = int(match.group(1))
                self.episode = int(match.group(2))
                log.debug(self)
                self.valid = True
                return
            log.debug('-> no luck with the expect_ep')
        else:
            if self.parse_unwanted_id(data_stripped):
                return
            for id_re in self.id_regexps:
                match = re.search(id_re, data_stripped)
                if match:
                    # strict_name
                    if self.strict_name:
                        if match.start() - name_end >= 2:
                            return
                    if 'version' in match.groupdict():
                        if match.group('version'):
                            self.proper_count = int(match.group('version')) - 1
                        self.id = match.group(1)
                    else:
                        self.id = '-'.join(match.groups())
                    self.id_groups = match.groups()
                    if self.special:
                        self.id += '-SPECIAL'
                    self.valid = True
                    log.debug('found id \'%s\' with regexp \'%s\'' % (self.id, id_re.pattern))
                    return
            log.debug('-> no luck with id_regexps')

        # No id found, check if this is a special
        if self.special:
            # Attempt to set id as the title of the special
            self.id = data_stripped
            self.valid = True
            log.debug('found special, setting id to \'%s\'' % self.id)
            return

        raise ParseWarning('Title \'%s\' looks like series \'%s\' but I cannot find any episode or id numbering' % (self.data, self.name))
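
Both series-parser versions fall back, when an episode number is expected but no regular pattern matches, on the compact "idiotic numbering scheme" in which 101 or 0102 encodes season and episode in a single number. The sketch below illustrates that fallback with a simplified not-in-word guard (the real code wraps the pattern with re_not_in_word()); as the original comment warns, this scheme cannot represent season 11 and later.

import re

# Simplified guard: the digit run must not touch other alphanumerics.
COMPACT_EP = re.compile(r'(?<![a-zA-Z0-9])(0?\d)(\d\d)(?![a-zA-Z0-9])')

def split_compact_episode(text):
    match = COMPACT_EP.search(text)
    if not match:
        return None
    return int(match.group(1)), int(match.group(2))

print(split_compact_episode('some show 101'))   # (1, 1)
print(split_compact_episode('some show 0102'))  # (1, 2)
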