Ejemplo n.º 1
0
def guess_episodes_rexps(string):
    """Try each entry of ``episode_rexps`` against ``string``; first match wins.

    ``episode_rexps`` is iterated as ``(regexp, confidence, span_adjust)``
    triples and every regexp is searched case-insensitively.

    Returns:
        ``(guess, span)`` for the first matching regexp, where ``guess`` is a
        ``Guess`` built from the match's named groups and ``span`` is the
        match start/end shifted by ``span_adjust``; ``(None, None)`` when
        nothing matches.
    """
    for pattern, confidence, span_adjust in episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])
        # every property set below carries the same confidence and raw text
        props = dict(confidence=confidence, raw=string[span[0]:span[1]])
        guess = Guess(found.groupdict(), **props)

        # a matched episode number may really be a list of episodes: keep
        # the first as 'episodeNumber' and, if there are several, the whole
        # list as 'episodeList'
        if guess.get('episodeNumber'):
            numbers = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', numbers[0], **props)
            if len(numbers) > 1:
                guess.set('episodeList', numbers, **props)

        # for bonuses only the first number is kept
        if guess.get('bonusNumber'):
            numbers = number_list(guess['bonusNumber'])
            guess.set('bonusNumber', numbers[0], **props)

        return guess, span

    return None, None
def guess_weak_episodes_rexps(string, node):
    """Look for an episode number using the ``weak_episode_rexps`` patterns.

    Nothing is attempted when the root of ``node`` already has an
    ``'episodeNumber'`` entry in its ``info``.

    A matched number above 100 is split into season (hundreds) and episode
    (remainder) and reported with confidence 0.6; any other number is
    reported as matched, with confidence 0.3.

    Returns:
        ``(guess, span)`` on success, ``(None, None)`` otherwise.
    """
    if 'episodeNumber' in node.root.info:
        return None, None

    for pattern, span_adjust in weak_episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        groups = found.groupdict()
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])
        raw = string[span[0]:span[1]]

        number = int(groups['episodeNumber'])
        if number <= 100:
            return Guess(groups, confidence=0.3, raw=raw), span

        season, episode = divmod(number, 100)
        # a season above 25 is most likely a false positive: skip this
        # pattern and try the next one
        if season > 25:
            continue
        return Guess({'season': season, 'episodeNumber': episode},
                     confidence=0.6, raw=raw), span

    return None, None
def guess_episodes_rexps(string):
    """Try each entry of ``episode_rexps`` against ``string``; first valid match wins.

    ``episode_rexps`` is iterated as ``(regexp, confidence, span_adjust)``
    triples, searched case-insensitively. A match whose season is above 25
    is discarded and the next regexp is tried.

    Returns:
        ``(guess, span)`` with episode and bonus numbers converted to ``int``,
        or ``(None, None)`` when no regexp yields an acceptable match.
    """
    for pattern, confidence, span_adjust in episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        guess = Guess(found.groupdict(), confidence=confidence)
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])

        # seasons above 25 are treated as probable misdetections
        if int(guess.get('season', 0)) > 25:
            continue

        # one episode number, or a list of them?
        if guess.get('episodeNumber'):
            numbers = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', int(numbers[0]), confidence=confidence)
            if len(numbers) > 1:
                guess.set('episodeList', [int(n) for n in numbers],
                          confidence=confidence)

        if guess.get('bonusNumber'):
            numbers = number_list(guess['bonusNumber'])
            guess.set('bonusNumber', int(numbers[0]), confidence=confidence)

        return guess, span

    return None, None
Ejemplo n.º 4
0
def guess_language(string):
    """Find a language in ``string`` and classify it as audio or subtitle.

    The language is reported under ``'subtitleLanguage'`` when the word
    ``sub`` appears in the cleaned, lower-cased text before the match, and
    under ``'language'`` otherwise.

    Returns:
        ``(guess, span)`` when ``search_language`` finds a language,
        ``(None, None)`` otherwise.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    # words that precede the detected language
    preceding = clean_string(string[:span[0]]).lower().split(' ')
    key = 'subtitleLanguage' if 'sub' in preceding else 'language'
    return Guess({key: language}, confidence=confidence), span
Ejemplo n.º 5
0
def guess_episodes_rexps(string):
    """Try each entry of ``episode_rexps`` against ``string``; first valid match wins.

    ``episode_rexps`` is iterated as ``(regexp, confidence, span_adjust)``
    triples, searched case-insensitively. A match whose season is above 25
    is discarded and the next regexp is tried.

    Returns:
        ``(guess, span)`` where ``guess`` is a ``Guess`` built from the named
        groups (``'episodeNumber'`` as an ``int`` and, for multi-episode
        matches, ``'episodeList'`` as a list of ``int``) and ``span`` is the
        match start/end shifted by ``span_adjust``; ``(None, None)`` when no
        regexp yields an acceptable match.
    """
    for rexp, confidence, span_adjust in episode_rexps:
        match = re.search(rexp, string, re.IGNORECASE)
        if not match:
            continue

        guess = Guess(match.groupdict(), confidence=confidence)
        span = (match.start() + span_adjust[0],
                match.end() + span_adjust[1])

        # episodes which have a season > 25 are most likely errors
        if int(guess.get('season', 0)) > 25:
            continue

        # decide whether we have only a single episode number or an
        # episode list
        if guess.get('episodeNumber'):
            eplist = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', int(eplist[0]), confidence=confidence)

            if len(eplist) > 1:
                # build a real list: on Python 3 a bare map() is a lazy,
                # one-shot iterator that is empty after its first traversal
                guess.set('episodeList',
                          list(map(int, eplist)),
                          confidence=confidence)

        return guess, span

    return None, None
Ejemplo n.º 6
0
def find_and_split_node(node, strategy, logger):
    """Apply the first matcher of ``strategy`` that hits and split ``node`` around it.

    ``strategy`` is iterated as ``(matcher, confidence)`` pairs. A matcher
    whose ``use_node`` attribute is true is also passed the node. The value
    of ``node`` is padded with one space on each side before matching.

    On the first truthy result the node is partitioned at the matched span;
    the child covering exactly that span receives the guess and every other
    child is processed recursively with the same strategy. Messages go to
    ``logger``, or to the module ``log`` when ``logger`` is falsy.
    """
    padded = ' %s ' % node.value  # add sentinels
    for matcher, confidence in strategy:
        wants_node = getattr(matcher, 'use_node', False)
        result, span = matcher(padded, node) if wants_node else matcher(padded)
        if not result:
            continue

        # the leading sentinel shifted every index by one
        span = (span[0] - 1, span[1] - 1)

        # no confidence imposed by the strategy: take it from the first key
        # of a Guess result, or assume full confidence for anything else
        if confidence is None:
            if isinstance(result, Guess):
                confidence = result.confidence(list(result.keys())[0])
            else:
                confidence = 1.0

        guess = format_guess(Guess(result, confidence=confidence))
        msg = 'Found with confidence %.2f: %s' % (confidence, guess)
        (logger or log).debug(msg)

        node.partition(span)
        target_span = (span[0] + node.offset, span[1] + node.offset)
        for child in node.children:
            if child.span == target_span:
                child.guess = guess
            else:
                find_and_split_node(child, strategy, logger)
        return
Ejemplo n.º 7
0
def process(mtree, filetype='autodetect'):
    """Store the guessed file type on ``mtree`` and its details on the last node.

    ``guess_filetype`` supplies the type plus extra properties. The type is
    set on ``mtree.guess``; the extra properties, together with the mimetype
    when ``mimetypes`` can guess one from ``mtree.string``, become the guess
    of the node at index ``(-1,)``.
    """
    filetype, extra = guess_filetype(mtree, filetype)

    mtree.guess.set('type', filetype, confidence=1.0)
    log.debug('Found with confidence %.2f: %s' % (1.0, mtree.guess))

    details = Guess(extra, confidence=1.0)
    # non-strict lookup of the mimetype from the filename
    # TODO: handle other mimetypes not found on the default type_maps
    # mimetypes.types_map['.srt']='text/subtitle'
    mime = mimetypes.guess_type(mtree.string, strict=False)[0]
    if mime is not None:
        details.update({'mimetype': mime}, confidence=1.0)

    last_node = mtree.node_at((-1, ))
    last_node.guess = details
    log.debug('Found with confidence %.2f: %s' % (1.0, last_node.guess))
def process(mtree, filetype='autodetect'):
    """Guess the file type of ``mtree`` and annotate the tree with it.

    The type returned by ``guess_filetype`` is set on ``mtree.guess``. The
    accompanying properties, plus a ``'mimetype'`` entry when one can be
    guessed from ``mtree.string``, are assigned as the guess of the node at
    index ``(-1,)``. Both guesses are logged at debug level.
    """
    filetype, other_props = guess_filetype(mtree, filetype)

    mtree.guess.set('type', filetype, confidence=1.0)
    log.debug('Found with confidence %.2f: %s' % (1.0, mtree.guess))

    info = Guess(other_props, confidence=1.0)

    # TODO: handle other mimetypes not found on the default type_maps
    # mimetypes.types_map['.srt']='text/subtitle'
    mimetype = mimetypes.guess_type(mtree.string, strict=False)[0]
    if mimetype is not None:
        info.update({'mimetype': mimetype}, confidence=1.0)

    ext_node = mtree.node_at((-1,))
    ext_node.guess = info
    log.debug('Found with confidence %.2f: %s' % (1.0, ext_node.guess))
Ejemplo n.º 9
0
 def guess_country(self, string, node=None, options=None):
     """Try to recognise a country in ``string``.

     The string is stripped and lower-cased first; nothing is attempted when
     the result is one of ``LNG_COMMON_WORDS``. ``babelfish.Error`` raised
     during the lookup is swallowed.

     Returns:
         A ``Guess`` (not a tuple) when a valid country is found, with its
         span shifted by one on both ends; the pair ``(None, None)``
         otherwise. NOTE(review): the two return shapes differ - confirm
         that callers accept both.
     """
     c = string.strip().lower()
     if c in LNG_COMMON_WORDS:
         return None, None

     try:
         country, country_span = self._scan_country(c, True)
         if self.is_valid_country(country, options):
             shifted_span = (country_span[0] + 1, country_span[1] + 1)
             return Guess(country=country, confidence=1.0, input=node.value, span=shifted_span)
     except babelfish.Error:
         pass
     return None, None
Ejemplo n.º 10
0
def process(mtree):
    """Tag unidentified leaves that hold a bracketed country.

    Only leaves whose ``node_idx`` has length 2 are considered. The text
    between the first and last character must be accepted by
    ``Country(..., strict=True)``, and those two characters must form one of
    ``()``, ``[]`` or ``{}``. A ``ValueError`` leaves the node untouched.
    """
    explicit_pairs = ['()', '[]', '{}']
    for leaf in mtree.unidentified_leaves():
        if len(leaf.node_idx) != 2:
            continue

        try:
            found = Country(leaf.value[1:-1], strict=True)
            # only keep explicit groups (enclosed in parentheses/brackets)
            if leaf.value[0] + leaf.value[-1] in explicit_pairs:
                leaf.guess = Guess(country=found, confidence=1.0)
        except ValueError:
            pass
Ejemplo n.º 11
0
def process(mtree, filetype='autodetect'):
    """Guess the file type up front and record it on the tree.

    The type returned by ``guess_filetype`` is set on ``mtree.guess``. The
    accompanying properties, plus a ``'mimetype'`` entry when one can be
    guessed from ``mtree.string``, become the guess of the node at index
    ``(-1,)``.

    Raises:
        TransfoException: if the recorded type is ``None`` or ``'unknown'``.
    """
    filetype, extra = guess_filetype(mtree, filetype)

    mtree.guess.set('type', filetype, confidence=1.0)
    log.debug('Found with confidence %.2f: %s' % (1.0, mtree.guess))

    details = Guess(extra, confidence=1.0)
    # TODO: handle other mimetypes not found on the default type_maps
    # mimetypes.types_map['.srt']='text/subtitle'
    mimetype = mimetypes.guess_type(mtree.string, strict=False)[0]
    if mimetype is not None:
        details.update({'mimetype': mimetype}, confidence=1.0)

    ext_node = mtree.node_at((-1,))
    ext_node.guess = details
    log.debug('Found with confidence %.2f: %s' % (1.0, ext_node.guess))

    # the recorded type is checked only after both guesses have been stored
    if mtree.guess.get('type') in (None, 'unknown'):
        raise TransfoException(__name__, 'Unknown file type')
Ejemplo n.º 12
0
 def process(self, mtree, options=None):
     """Detect countries in ``mtree``.

     First runs ``guess_country`` over the unidentified leaves through a
     ``GuessFinder``. Then, for every leaf containing a ``'language'`` whose
     cleaned, lower-cased value is in ``self.replace_language``, the language
     is reset to ``None`` and, if the value resolves to a valid country, a
     country guess with confidence 0.9 is reported via ``found_guess``.
     ``babelfish.Error`` raised during that lookup is swallowed.
     """
     finder = GuessFinder(self.guess_country, None, self.log, options)
     finder.process_nodes(mtree.unidentified_leaves())

     for node in mtree.leaves_containing('language'):
         c = node.clean_value.lower()
         if c not in self.replace_language:
             continue

         # the language is cleared even if no country is found afterwards
         node.guess.set('language', None)
         try:
             country = Country.fromguessit(c)
             if self.is_valid_country(country, options):
                 country_guess = Guess(country=country, confidence=0.9, input=node.value, span=node.span)
                 found_guess(node, country_guess, logger=log)
         except babelfish.Error:
             pass
Ejemplo n.º 13
0
def guess_weak_episodes_rexps(string, node):
    """Look for an episode number using the ``weak_episode_rexps`` patterns.

    Nothing is attempted when the root of ``node`` already has an
    ``'episodeNumber'`` entry in its ``info``.

    A matched number above 100 is split into season (hundreds) and episode
    (remainder) and reported with confidence 0.6; any other number is
    reported as matched, with confidence 0.3.

    Returns:
        ``(guess, span)`` for the first matching pattern, ``(None, None)``
        otherwise.
    """
    if 'episodeNumber' in node.root.info:
        return None, None

    for pattern, span_adjust in weak_episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        groups = found.groupdict()
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])

        number = int(groups['episodeNumber'])
        if number <= 100:
            return Guess(groups, confidence=0.3), span

        season, episode = divmod(number, 100)
        return Guess({'season': season, 'episodeNumber': episode},
                     confidence=0.6), span

    return None, None
Ejemplo n.º 14
0
def guess_video_rexps(string):
    """Search ``string`` for the first matching entry of ``video_rexps``.

    The input is wrapped in ``-`` on both sides and every regexp is wrapped
    in ``sep`` on both sides before a case-insensitive search.

    Returns:
        ``(guess, span)`` for the first match, ``(None, None)`` otherwise.
        The span end is reduced by 2 - presumably to discount the trailing
        separator and the added padding; verify against ``sep``.
    """
    padded = '-' + string + '-'
    for pattern, confidence, span_adjust in video_rexps:
        found = re.search(sep + pattern + sep, padded, re.IGNORECASE)
        if not found:
            continue

        props = found.groupdict()
        # drop the total CD count when its group exists but did not match
        if 'cdNumberTotal' in props and props['cdNumberTotal'] is None:
            del props['cdNumberTotal']

        guess = Guess(props, confidence=confidence)
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1] - 2)
        return guess, span

    return None, None
Ejemplo n.º 15
0
    def process(self, mtree, options=None):
        """Tag unidentified leaves that hold an explicitly grouped country.

        Only leaves whose ``node_idx`` has length 2 are considered. The
        candidate is the node value without its first and last character,
        lower-cased. It is skipped when it is in
        ``self.country_common_words``, when the node is not explicit, or
        when ``Country(..., strict=True)`` rejects it with ``ValueError``.
        """
        for node in mtree.unidentified_leaves():
            if len(node.node_idx) != 2:
                continue

            c = node.value[1:-1].lower()
            # common words are rejected first; then only explicit groups
            # (enclosed in parentheses/brackets) are kept
            if c in self.country_common_words or not node.is_explicit():
                continue

            try:
                country = Country(c, strict=True)
            except ValueError:
                pass
            else:
                node.guess = Guess(country=country, confidence=1.0, input=node.value, span=node.span)
Ejemplo n.º 16
0
def process(mtree):
    """Tag unidentified leaves that hold a bracketed country.

    Only leaves whose ``node_idx`` has length 2 are considered. The
    candidate is the node value without its first and last character,
    lower-cased. It is skipped when it is in ``country_common_words``, when
    the first and last characters are not one of ``()``, ``[]`` or ``{}``,
    or when ``Country(..., strict=True)`` rejects it with ``ValueError``.
    """
    explicit_pairs = ['()', '[]', '{}']
    for node in mtree.unidentified_leaves():
        if len(node.node_idx) != 2:
            continue

        c = node.value[1:-1].lower()
        # common words are rejected first; then only explicit groups
        # (enclosed in parentheses/brackets) are kept
        if c in country_common_words or node.value[0] + node.value[-1] not in explicit_pairs:
            continue

        try:
            country = Country(c, strict=True)
        except ValueError:
            pass
        else:
            node.guess = Guess(country=country, confidence=1.0, raw=c)
Ejemplo n.º 17
0
def guess_episodes_rexps(string):
    """Try each entry of ``episode_rexps`` against ``string``; first match wins.

    ``episode_rexps`` is iterated as ``(regexp, confidence, span_adjust)``
    triples and every regexp is searched case-insensitively.

    Returns:
        ``(guess, span)`` for the first matching regexp, where ``span`` is
        the match start/end shifted by ``span_adjust``; ``(None, None)`` when
        nothing matches.
    """
    for pattern, confidence, span_adjust in episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        guess = Guess(found.groupdict(), confidence=confidence)
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])

        # one episode number, or a list of them? keep the first as
        # 'episodeNumber' and, if there are several, all as 'episodeList'
        if guess.get('episodeNumber'):
            numbers = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', numbers[0], confidence=confidence)
            if len(numbers) > 1:
                guess.set('episodeList', numbers, confidence=confidence)

        # for bonuses only the first number is kept
        if guess.get('bonusNumber'):
            numbers = number_list(guess['bonusNumber'])
            guess.set('bonusNumber', numbers[0], confidence=confidence)

        return guess, span

    return None, None
Ejemplo n.º 18
0
def guess_language(string, node, skip=None):
    """Find a language in ``string``, ignoring the spans listed in ``skip``.

    Each ``skip`` entry is a mapping with ``'node_idx'`` and ``'span'`` keys.
    Entries whose ``node_idx`` is a prefix of ``node.node_idx`` are converted
    to spans relative to ``node`` (``node.offset`` subtracted, one added) and
    handed to ``search_language``; the other entries are dropped.

    Returns:
        ``(guess, span)`` when a language is found, ``(None, None)``
        otherwise.
    """
    if skip:
        local_skip = []
        for entry in skip:
            prefix = entry['node_idx']
            absolute = entry['span']
            if prefix == node.node_idx[:len(prefix)]:
                local_skip.append((absolute[0] - node.offset + 1,
                                   absolute[1] - node.offset + 1))
        skip = local_skip

    language, span, confidence = search_language(string, skip=skip)
    if not language:
        return None, None

    guess = Guess({'language': language},
                  confidence=confidence,
                  raw=string[span[0]:span[1]])
    return guess, span
Ejemplo n.º 19
0
 def found_property(node, name, value, confidence):
     """Give ``node`` a single-property guess and log it at debug level."""
     single = {name: value}
     node.guess = Guess(single, confidence=confidence)
     log.debug('Found with confidence %.2f: %s' % (confidence, node.guess))
Ejemplo n.º 20
0
def guess_language(string):
    """Find a language in ``string`` via ``search_language``.

    Returns:
        ``(guess, span)`` with the language under the ``'language'`` key, or
        ``(None, None)`` when no language is found.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    guess = Guess({'language': language}, confidence=confidence)
    return guess, span
Ejemplo n.º 21
0
def guess_episodes_rexps(string):
    """Try each entry of ``episode_rexps`` against ``string``; first valid match wins.

    ``episode_rexps`` is iterated as ``(regexp, confidence, span_adjust)``
    triples, searched case-insensitively. A match whose season is above 30
    is discarded and the next regexp is tried.

    Returns:
        ``(guess, span)`` for the first acceptable match, ``(None, None)``
        otherwise.
    """
    for pattern, confidence, span_adjust in episode_rexps:
        found = re.search(pattern, string, re.IGNORECASE)
        if not found:
            continue

        guess = Guess(found.groupdict(), confidence=confidence)
        span = (found.start() + span_adjust[0],
                found.end() + span_adjust[1])

        # seasons above 30 are treated as probable misdetections
        if int(guess.get('season', 0)) > 30:
            continue

        # one episode number, or a list of them? keep the first as
        # 'episodeNumber' and, if there are several, all as 'episodeList'
        if guess.get('episodeNumber'):
            numbers = number_list(guess['episodeNumber'])
            guess.set('episodeNumber', numbers[0], confidence=confidence)
            if len(numbers) > 1:
                guess.set('episodeList', numbers, confidence=confidence)

        # for bonuses only the first number is kept
        if guess.get('bonusNumber'):
            numbers = number_list(guess['bonusNumber'])
            guess.set('bonusNumber', numbers[0], confidence=confidence)

        return guess, span

    return None, None
Ejemplo n.º 22
0
 def __init__(self, string='', span=None, parent=None):
     """Create a node for ``string`` with no children and an empty guess.

     When ``span`` is falsy it defaults to the whole string,
     ``(0, len(string))``.
     """
     whole_string = (0, len(string))
     self.string = string
     self.span = span if span else whole_string
     self.parent = parent
     self.children = []
     self.guess = Guess()