Example #1
0
    def guess_language(string, node=None, options=None):
        """Search *string* for a language, unless its directory looks like a language code.

        :param string: text handed to ``search_language``.
        :param node: optional tree node; its ``ancestors`` are scanned for the
            first one whose ``category`` is ``'path'``.
        :param options: optional mapping; ``options['allowed_languages']``,
            when present, is forwarded to ``search_language``.
        :return: ``None`` when the first 'path' ancestor has a ``clean_value``
            of 3 characters or fewer, otherwise whatever ``search_language``
            returns.
        """
        allowed_languages = None
        if options and 'allowed_languages' in options:
            allowed_languages = options.get('allowed_languages')

        # ``node`` defaults to None and a node may have no 'path' ancestor;
        # in both cases there is no directory to inspect, so just search.
        ancestors = node.ancestors if node is not None else ()
        directory = next((a for a in ancestors if a.category == 'path'), None)
        if directory is not None and len(directory.clean_value) <= 3:
            # skip if we have a language code as directory
            return None

        return search_language(string, allowed_languages)
Example #2
0
    def guess_language(string, node=None, options=None):
        """Search *string* for a language, unless its directory looks like a language code.

        :param string: text handed to ``search_language``.
        :param node: optional tree node; its ``ancestors`` are scanned for the
            first one whose ``category`` is ``'path'``.
        :param options: optional mapping; ``options['allowed_languages']``,
            when present, is forwarded to ``search_language``.
        :return: ``None`` when the first 'path' ancestor has a ``clean_value``
            of 3 characters or fewer, otherwise whatever ``search_language``
            returns.
        """
        allowed_languages = None
        if options and 'allowed_languages' in options:
            allowed_languages = options.get('allowed_languages')

        # ``node`` defaults to None and a node may have no 'path' ancestor;
        # without a directory there is nothing to skip, so fall through to
        # the search instead of raising AttributeError / IndexError.
        ancestors = node.ancestors if node is not None else ()
        directory = next((a for a in ancestors if a.category == 'path'),
                         None)
        if directory is not None and len(directory.clean_value) <= 3:
            # skip if we have a language code as directory
            return None

        return search_language(string, allowed_languages)
Example #3
0
def guess_language(string):
    """Look for a language in *string* and wrap it in a ``Guess``.

    Returns ``(guess, span)`` when ``search_language`` reports a language and
    ``(None, None)`` otherwise. The guess is keyed ``'subtitleLanguage'`` when
    the word ``sub`` appears in the cleaned text preceding the match, and
    ``'language'`` in every other case.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    # is it a subtitle language?
    words_before = clean_string(string[:span[0]]).lower().split(' ')
    key = 'subtitleLanguage' if 'sub' in words_before else 'language'
    return Guess({key: language}, confidence=confidence), span
def guess_language(string, node, skip=None):
    """Search *string* for a language, ignoring the given skip regions.

    Each entry of *skip* is a mapping with ``'node_idx'`` and ``'span'``.
    Entries whose ``node_idx`` is a prefix of ``node.node_idx`` have their
    span shifted by ``1 - node.offset`` and are passed to ``search_language``;
    all other entries are dropped.

    Returns ``(guess, span)`` on a match, ``(None, None)`` otherwise.
    """
    if skip:
        translated = []
        for entry in skip:
            owner_idx, absolute = entry['node_idx'], entry['span']
            if owner_idx == node.node_idx[:len(owner_idx)]:
                start = absolute[0] - node.offset + 1
                end = absolute[1] - node.offset + 1
                translated.append((start, end))
        skip = translated

    language, span, confidence = search_language(string, skip=skip)
    if not language:
        return None, None

    matched_text = string[span[0]:span[1]]
    guess = Guess({'language': language},
                  confidence=confidence,
                  raw=matched_text)
    return guess, span
Example #5
0
def guess_language(string, node, skip=None):
    """Search *string* for a language while honouring *skip* regions.

    *skip* is an optional list of mappings carrying ``'node_idx'`` and
    ``'span'``. Only the entries whose ``node_idx`` matches the leading part
    of ``node.node_idx`` are kept; their spans are shifted by
    ``1 - node.offset`` before being given to ``search_language``.

    Returns a ``(Guess, span)`` pair, or ``(None, None)`` when no language
    was found.
    """
    if skip:
        local_spans = []
        for item in skip:
            idx_prefix = item['node_idx']
            global_span = item['span']
            if idx_prefix == node.node_idx[:len(idx_prefix)]:
                local_spans.append((global_span[0] - node.offset + 1,
                                    global_span[1] - node.offset + 1))
        skip = local_spans

    language, span, confidence = search_language(string, skip=skip)
    if not language:
        return None, None

    found = Guess({'language': language},
                  confidence=confidence,
                  raw=string[span[0]:span[1]])
    return found, span
Example #6
0
def guess_language(string):
    """Return whatever ``search_language`` finds in *string*."""
    return search_language(string)
 def guess_language(self, string, node=None, options=None):
     """Search *string* for a language, optionally restricted by *options*.

     ``options['allowed_languages']``, when present in a non-empty *options*,
     is forwarded to ``search_language``; otherwise ``None`` is forwarded.
     *node* is accepted but not used here.
     """
     if not options or 'allowed_languages' not in options:
         return search_language(string, None)
     return search_language(string, options.get('allowed_languages'))
Example #8
0
def guess_language(string):
    """Wrap the language found in *string* in a ``Guess``.

    Returns ``(guess, span)`` when ``search_language`` reports a language,
    ``(None, None)`` otherwise.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    found = Guess({'language': language}, confidence=confidence)
    return found, span
Example #9
0
 def guess_language(self, string, node=None, options=None):
     """Return the result of ``search_language`` for *string*.

     *node* and *options* are accepted but not used here.
     """
     return search_language(string)
 def guess_language(self, string, node=None, options=None):
     """Delegate to ``search_language``; *node* and *options* are ignored."""
     result = search_language(string)
     return result
Example #11
0
 def guess_language(string, node=None, options=None):
     """Search *string* for a language, optionally restricted by *options*.

     When a non-empty *options* contains ``'allowed_languages'``, that value
     is passed on to ``search_language``; otherwise ``None`` is passed.
     *node* is accepted but not used here.
     """
     if not options or 'allowed_languages' not in options:
         return search_language(string, None)
     return search_language(string, options.get('allowed_languages'))
Example #12
0
def guess_language(string):
    """Build a language ``Guess`` from *string*.

    Returns ``(guess, span)`` if ``search_language`` finds a language and
    ``(None, None)`` if it does not.
    """
    language, span, confidence = search_language(string)
    if language:
        match_info = {"language": language}
        return Guess(match_info, confidence=confidence), span

    return None, None
Example #13
0
def guess_groups(string, result, filetype):
    """Extract known properties from *string* and split it around them.

    The string is scanned in successive passes (date, year, specific regexps,
    websites, release groups, known properties, weak episode numbers,
    languages). Every match is turned into a guess, appended to *result*, and
    its region is blanked in a working copy so later passes do not match it
    again.

    :param string: the text to analyse.
    :param result: list that receives every guess found (mutated in place).
    :param filetype: one of ``movie``, ``moviesubtitle``, ``episode``,
        ``episodesubtitle``; episode types skip the year search and enable
        the episode-specific regexps.
    :return: ``zip(string_groups, remaining_groups, guesses)`` where
        ``string_groups`` are the pieces of the original string,
        ``remaining_groups`` the same pieces of the blanked working copy, and
        ``guesses`` the guess whose region starts at each piece (``None`` for
        leftover pieces).
    """
    # add sentinels so we can match a separator char at either end of
    # our groups, even when they are at the beginning or end of the string
    # we will adjust the span accordingly later
    #
    # filetype can either be movie, moviesubtitle, episode, episodesubtitle
    current = " " + string + " "

    regions = []  # list of ((start, end), guess) for matched regions

    def guessed(match_dict, confidence):
        # Format the guess, record it in the caller-supplied result list.
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(string, guess, span, span_adjust=(0, 0)):
        # Remember the (adjusted) region and return the string with that
        # region blanked out.
        span = (span[0] + span_adjust[0], span[1] + span_adjust[1])
        regions.append((span, guess))
        return blank_region(string, span)

    # try to find dates first, as they are very specific
    date, span = search_date(current)
    if date:
        guess = guessed({"date": date}, confidence=1.0)
        current = update_found(current, guess, span)

    # for non episodes only, look for year information
    if filetype not in ("episode", "episodesubtitle"):
        year, span = search_year(current)
        if year:
            guess = guessed({"year": year}, confidence=1.0)
            current = update_found(current, guess, span)

    # specific regexps (ie: cd number, season X episode, ...)
    for rexp, confidence, span_adjust in video_rexps:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # is this the better place to put it? (maybe, as it is at least the soonest that we can catch it)
            if "cdNumberTotal" in metadata and metadata["cdNumberTotal"] is None:
                del metadata["cdNumberTotal"]

            guess = guessed(metadata, confidence=confidence)
            current = update_found(current, guess, match.span(), span_adjust)

    if filetype in ("episode", "episodesubtitle"):
        for rexp, confidence, span_adjust in episode_rexps:
            match = re.search(rexp, current, re.IGNORECASE)
            if match:
                metadata = match.groupdict()
                guess = guessed(metadata, confidence=confidence)
                current = update_found(current, guess, match.span(), span_adjust)

    # Now websites, but as exact string instead of regexps
    clow = current.lower()
    for site in websites:
        pos = clow.find(site.lower())
        if pos != -1:
            # Use an explicit confidence: the previous code reused the
            # ``confidence`` loop variable leaked from the regexp loops above
            # (arbitrary value, or NameError if those loops never ran).
            guess = guessed({"website": site}, confidence=1.0)
            current = update_found(current, guess, (pos, pos + len(site)))
            clow = current.lower()

    # release groups have certain constraints, cannot be included in the previous general regexps
    group_names = [
        r"\.(Xvid)-(?P<releaseGroup>.*?)[ \.]",
        r"\.(DivX)-(?P<releaseGroup>.*?)[\. ]",
        r"\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]",
    ]
    for rexp in group_names:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            metadata.update({"videoCodec": match.group(1)})
            guess = guessed(metadata, confidence=0.8)
            # shrink the span by one char on each side so the surrounding
            # separators matched by the regexp are not blanked
            current = update_found(current, guess, match.span(), span_adjust=(1, -1))

    # common well-defined words and regexps
    confidence = 1.0  # for all of them
    for prop, value, pos, end in find_properties(current):
        guess = guessed({prop: value}, confidence=confidence)
        current = update_found(current, guess, (pos, end))

    # weak guesses for episode number, only run it if we don't have an estimate already
    if filetype in ("episode", "episodesubtitle"):
        if not any("episodeNumber" in match for match in result):
            for rexp, _, span_adjust in weak_episode_rexps:
                match = re.search(rexp, current, re.IGNORECASE)
                if match:
                    metadata = match.groupdict()
                    epnum = int(metadata["episodeNumber"])
                    if epnum > 100:
                        # numbers above 100 are split as <season><2-digit episode>
                        guess = guessed({"season": epnum // 100, "episodeNumber": epnum % 100}, confidence=0.6)
                    else:
                        guess = guessed(metadata, confidence=0.3)
                    current = update_found(current, guess, match.span(), span_adjust)

    # try to find languages now
    language, span, confidence = search_language(current)
    while language:
        # is it a subtitle language?
        if "sub" in clean_string(current[: span[0]]).lower().split(" "):
            guess = guessed({"subtitleLanguage": language}, confidence=confidence)
        else:
            guess = guessed({"language": language}, confidence=confidence)
        current = update_found(current, guess, span)

        language, span, confidence = search_language(current)

    # remove our sentinels now and adjust spans accordingly
    assert current[0] == " " and current[-1] == " "
    current = current[1:-1]
    regions = [((start - 1, end - 1), guess) for (start, end), guess in regions]

    # split into '-' separated subgroups (with required separator chars
    # around the dash)
    # NOTE(review): the loop ends as soon as find() returns 0 or -1, so a
    # dash at index 0 stops the scan -- confirm this is intended.
    didx = current.find("-")
    while didx > 0:
        regions.append(((didx, didx), None))
        didx = current.find("-", didx + 1)

    # cut our final groups, and rematch the guesses to the group that created
    # it, None if it is a leftover group
    region_spans = [span for span, guess in regions]
    string_groups = split_on_groups(string, region_spans)
    remaining_groups = split_on_groups(current, region_spans)
    guesses = []

    pos = 0
    for group in string_groups:
        found = False
        for span, guess in regions:
            if span[0] == pos:
                guesses.append(guess)
                found = True
        if not found:
            guesses.append(None)

        pos += len(group)

    return zip(string_groups, remaining_groups, guesses)