def guess_filetype(filename, filetype = 'autodetect'):
    """Guess what kind of file `filename` is, refining the given `filetype`.

    Returns a ``(filetype, other)`` tuple. ``other`` holds the lowercased
    file extension, under the key 'container' when it is a known subtitle
    or video extension and under 'extension' otherwise.
    """
    extension = os.path.splitext(filename)[1][1:].lower()

    # the extension decides the broad category
    if extension in subtitle_exts:
        # a subtitle keeps whatever movie/episode hint was passed in
        if 'movie' in filetype:
            filetype = 'moviesubtitle'
        elif 'episode' in filetype:
            filetype = 'episodesubtitle'
        else:
            filetype = 'subtitle'
        other = {'container': extension}
    elif extension in video_exts:
        if filetype == 'autodetect':
            filetype = 'video'
        other = {'container': extension}
    else:
        if filetype == 'autodetect':
            filetype = 'unknown'
        other = {'extension': extension}

    # only a generic video/subtitle still needs the episode vs movie decision
    if filetype not in ('video', 'subtitle'):
        return filetype, other

    episode_kind = {'video': 'episode', 'subtitle': 'episodesubtitle'}
    movie_kind = {'video': 'movie', 'subtitle': 'moviesubtitle'}

    # an episode regexp matching the filename marks it as an episode
    if any(re.search(rexp, filename, re.IGNORECASE)
           for rexp, _, _ in episode_rexps):
        filetype = episode_kind.get(filetype, filetype)

    # so does a property whose canonical form is 'DVB'
    if any(canonical_form(value) == 'DVB'
           for _, value, _, _ in find_properties(filename)):
        filetype = episode_kind.get(filetype, filetype)

    # anything still generic at this point is assumed to be a movie
    return movie_kind.get(filetype, filetype), other
Example #2
0
def guess_filetype(mtree, filetype):
    """Guess the type of the file described by `mtree`, refining `filetype`.

    The filename is read from ``mtree.string`` and the path groups from
    ``mtree.children``. Returns a ``(filetype, other)`` tuple where
    ``other`` holds the lowercased file extension, under the key
    'container' for known subtitle/video extensions and under 'extension'
    otherwise.
    """
    def as_episode(kind):
        # only the generic kinds get specialised, anything else is kept
        if kind == 'video':
            return 'episode'
        if kind == 'subtitle':
            return 'episodesubtitle'
        return kind

    def as_movie(kind):
        # only the generic kinds get specialised, anything else is kept
        if kind == 'video':
            return 'movie'
        if kind == 'subtitle':
            return 'moviesubtitle'
        return kind

    filename = mtree.string
    kind = filetype

    # the extension decides the broad category
    extension = os.path.splitext(filename)[1][1:].lower()
    if extension in subtitle_exts:
        # a subtitle keeps whatever movie/episode hint was passed in
        if 'movie' in kind:
            kind = 'moviesubtitle'
        elif 'episode' in kind:
            kind = 'episodesubtitle'
        else:
            kind = 'subtitle'
        other = {'container': extension}
    elif extension in video_exts:
        if kind == 'autodetect':
            kind = 'video'
        other = {'container': extension}
    else:
        if kind == 'autodetect':
            kind = 'unknown'
        other = {'extension': extension}

    # a 'Movies', 'Tv Shows', ... folder in the path is a strong hint
    folder_hints = ((r'Movies?', as_movie),
                    (r'Tv[ _-]?Shows?', as_episode),
                    (r'Series', as_episode))
    for pattern, promote in folder_hints:
        folder_re = re.compile(pattern, re.IGNORECASE)
        for pathgroup in mtree.children:
            if folder_re.match(pathgroup.value):
                kind = promote(kind)

    # known titles that would otherwise confuse the heuristics below
    # (eg: OSS 117 looks like season 1, episode 17 but is in fact a movie)
    cleaned = clean_string(filename).lower()
    for title in MOVIES:
        if title in cleaned:
            kind = as_movie(kind)
    for title in SERIES:
        if title in cleaned:
            kind = as_episode(kind)

    # everything below only applies while the kind is still generic
    if kind not in ('video', 'subtitle'):
        return kind, other

    # an episode regexp (eg: s02e13) means it is an episode
    if any(re.search(rexp, filename, re.IGNORECASE)
           for rexp, _, _ in episode_rexps):
        kind = as_episode(kind)

    # a 3-4 digit number that is not a year may be season + episode number
    number = re.search(r'[^0-9]([0-9]{3,4})[^0-9]', filename)
    if number:
        fullnumber = int(number.group(1))
        is_year = valid_year(fullnumber)
        # the last two digits are the episode number, more than 40 is rejected
        if fullnumber % 100 <= 40 and not is_year:
            kind = as_episode(kind)

    # some properties are characteristic of episodes
    for prop, value, _, _ in find_properties(filename):
        log.debug('prop: %s = %s' % (prop, value))
        if (prop == 'episodeFormat'
                or compute_canonical_form('format', value) == 'DVB'):
            kind = as_episode(kind)
            break

    # origin-specific type
    if 'tvu.org.ru' in filename:
        kind = as_episode(kind)

    # without any episode hint, assume it is a movie
    return as_movie(kind), other
Example #3
0
def guess_properties(string):
    """Return the first property found in `string` together with its span.

    The result is ``({prop: value}, (pos, end))`` built from the first item
    of ``find_properties(string)``, or ``(None, None)`` when that lookup
    raises IndexError (eg: nothing was found).
    """
    try:
        first = find_properties(string)[0]
    except IndexError:
        return None, None

    prop, value, pos, end = first
    return {prop: value}, (pos, end)
def guess_filetype(mtree, filetype):
    """Guess the type of the file described by `mtree`, refining `filetype`.

    The filename is read from ``mtree.string`` and the path groups from
    ``mtree.children``. Returns a ``(filetype, other)`` tuple where
    ``other`` holds the lowercased file extension, under the key
    'container' for known subtitle/video extensions and under 'extension'
    otherwise.
    """
    def as_episode(kind):
        # only the generic kinds get specialised, anything else is kept
        if kind == 'video':
            return 'episode'
        if kind == 'subtitle':
            return 'episodesubtitle'
        return kind

    def as_movie(kind):
        # only the generic kinds get specialised, anything else is kept
        if kind == 'video':
            return 'movie'
        if kind == 'subtitle':
            return 'moviesubtitle'
        return kind

    filename = mtree.string
    kind = filetype

    # the extension decides the broad category
    extension = os.path.splitext(filename)[1][1:].lower()
    if extension in subtitle_exts:
        # a subtitle keeps whatever movie/episode hint was passed in
        if 'movie' in kind:
            kind = 'moviesubtitle'
        elif 'episode' in kind:
            kind = 'episodesubtitle'
        else:
            kind = 'subtitle'
        other = {'container': extension}
    elif extension in video_exts:
        if kind == 'autodetect':
            kind = 'video'
        other = {'container': extension}
    else:
        if kind == 'autodetect':
            kind = 'unknown'
        other = {'extension': extension}

    # a 'Movies', 'Tv Shows', ... folder in the path is a strong hint
    folder_hints = ((r'Movies?', as_movie),
                    (r'Tv ?Shows?', as_episode),
                    (r'Series', as_episode))
    for pattern, promote in folder_hints:
        folder_re = re.compile(pattern, re.IGNORECASE)
        for pathgroup in mtree.children:
            if folder_re.match(pathgroup.value):
                kind = promote(kind)

    # known titles that would otherwise confuse the heuristics below
    # (eg: OSS 117 looks like season 1, episode 17 but is in fact a movie)
    cleaned = clean_string(filename).lower()
    for title in MOVIES:
        if title in cleaned:
            kind = as_movie(kind)
    for title in SERIES:
        if title in cleaned:
            kind = as_episode(kind)

    # everything below only applies while the kind is still generic
    if kind not in ('video', 'subtitle'):
        return kind, other

    # an episode regexp (eg: s02e13) means it is an episode
    if any(re.search(rexp, filename, re.IGNORECASE)
           for rexp, _, _ in episode_rexps):
        kind = as_episode(kind)

    # a 3-4 digit number that is not a year may be season + episode number
    number = re.search(r'[^0-9]([0-9]{3,4})[^0-9]', filename)
    if number:
        fullnumber = int(number.group(1))
        is_year = valid_year(fullnumber)
        # the last two digits are the episode number, more than 40 is rejected
        if fullnumber % 100 <= 40 and not is_year:
            kind = as_episode(kind)

    # some properties are characteristic of episodes
    for prop, value, _, _ in find_properties(filename):
        log.debug('prop: %s = %s' % (prop, value))
        if prop == 'episodeFormat' or canonical_form(value) == 'DVB':
            kind = as_episode(kind)
            break

    # origin-specific type
    if 'tvu.org.ru' in filename:
        kind = as_episode(kind)

    # without any episode hint, assume it is a movie
    return as_movie(kind), other
def guess_filetype(filename, filetype):
    """Guess what kind of file `filename` is, refining the given `filetype`.

    Returns a ``(filetype, other)`` tuple. ``other`` holds the lowercased
    file extension, under the key 'container' when it is a known subtitle
    or video extension and under 'extension' otherwise.
    """
    extension = os.path.splitext(filename)[1][1:].lower()

    # the extension decides the broad category
    if extension in subtitle_exts:
        # a subtitle keeps whatever movie/episode hint was passed in
        if 'movie' in filetype:
            filetype = 'moviesubtitle'
        elif 'episode' in filetype:
            filetype = 'episodesubtitle'
        else:
            filetype = 'subtitle'
        other = {'container': extension}
    elif extension in video_exts:
        if filetype == 'autodetect':
            filetype = 'video'
        other = {'container': extension}
    else:
        if filetype == 'autodetect':
            filetype = 'unknown'
        other = {'extension': extension}

    # only a generic video/subtitle still needs the episode vs movie decision
    if filetype not in ('video', 'subtitle'):
        return filetype, other

    episode_kind = {'video': 'episode', 'subtitle': 'episodesubtitle'}
    movie_kind = {'video': 'movie', 'subtitle': 'moviesubtitle'}

    # an episode regexp matching the filename marks it as an episode
    if any(re.search(rexp, filename, re.IGNORECASE)
           for rexp, _, _ in episode_rexps):
        filetype = episode_kind.get(filetype, filetype)

    # some properties are characteristic of episodes
    for prop, value, _, _ in find_properties(filename):
        log.debug('prop: %s = %s' % (prop, value))
        if prop == 'episodeFormat' or canonical_form(value) == 'DVB':
            filetype = episode_kind.get(filetype, filetype)
            break

    # anything still generic at this point is assumed to be a movie
    return movie_kind.get(filetype, filetype), other
Example #6
0
def guess_filetype(filename, filetype):
    """Guess what kind of file `filename` is, refining the given `filetype`.

    Returns a ``(filetype, other)`` tuple. ``other`` holds the lowercased
    file extension, under the key 'container' when it is a known subtitle
    or video extension and under 'extension' otherwise.
    """
    extension = os.path.splitext(filename)[1][1:].lower()

    # the extension decides the broad category
    if extension in subtitle_exts:
        # a subtitle keeps whatever movie/episode hint was passed in
        if 'movie' in filetype:
            filetype = 'moviesubtitle'
        elif 'episode' in filetype:
            filetype = 'episodesubtitle'
        else:
            filetype = 'subtitle'
        other = {'container': extension}
    elif extension in video_exts:
        if filetype == 'autodetect':
            filetype = 'video'
        other = {'container': extension}
    else:
        if filetype == 'autodetect':
            filetype = 'unknown'
        other = {'extension': extension}

    # only a generic video/subtitle still needs the episode vs movie decision
    if filetype not in ('video', 'subtitle'):
        return filetype, other

    episode_kind = {'video': 'episode', 'subtitle': 'episodesubtitle'}
    movie_kind = {'video': 'movie', 'subtitle': 'moviesubtitle'}

    # an episode regexp matching the filename marks it as an episode
    if any(re.search(rexp, filename, re.IGNORECASE)
           for rexp, _, _ in episode_rexps):
        filetype = episode_kind.get(filetype, filetype)

    # some properties are characteristic of episodes
    for prop, value, _, _ in find_properties(filename):
        log.debug('prop: %s = %s' % (prop, value))
        if prop == 'episodeFormat' or canonical_form(value) == 'DVB':
            filetype = episode_kind.get(filetype, filetype)
            break

    # origin-specific type
    if 'tvu.org.ru' in filename:
        filetype = episode_kind.get(filetype, filetype)

    # anything still generic at this point is assumed to be a movie
    return movie_kind.get(filetype, filetype), other
Example #7
0
def guess_groups(string, result, filetype):
    """Find recognisable regions in `string` and split it around them.

    Each guess that is found is appended to `result` (the list is mutated
    in place) and its span is recorded. The string is then cut on those
    spans and on every '-' that is not the first character.

    `filetype` selects which searches run: the year search is skipped for
    'episode' / 'episodesubtitle', while the episode regexps and the weak
    episode-number regexps only run for those two types.

    Returns ``zip(string_groups, remaining_groups, guesses)``: the groups
    cut from the original string, the groups cut from the working copy
    (presumably with matched regions blanked by ``blank_region``; confirm
    against that helper), and the guess recorded for a region starting at
    each group's offset, or None when there is none.
    """
    # add sentinels so we can match a separator char at either end of
    # our groups, even when they are at the beginning or end of the string
    # we will adjust the span accordingly later
    #
    # filetype can either be movie, moviesubtitle, episode, episodesubtitle
    current = " " + string + " "

    regions = []  # list of ((start, end), guess) for the matched regions

    def guessed(match_dict, confidence):
        # wrap the match into a Guess, record it in `result` and return it
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(string, guess, span, span_adjust=(0, 0)):
        # remember the (adjusted) span of this guess and return the string
        # as processed by blank_region for that span
        span = (span[0] + span_adjust[0], span[1] + span_adjust[1])
        regions.append((span, guess))
        return blank_region(string, span)

    # try to find dates first, as they are very specific
    date, span = search_date(current)
    if date:
        guess = guessed({"date": date}, confidence=1.0)
        current = update_found(current, guess, span)

    # for non episodes only, look for year information
    if filetype not in ("episode", "episodesubtitle"):
        year, span = search_year(current)
        if year:
            guess = guessed({"year": year}, confidence=1.0)
            current = update_found(current, guess, span)

    # specific regexps (ie: cd number, season X episode, ...)
    for rexp, confidence, span_adjust in video_rexps:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # is this the better place to put it? (maybe, as it is at least the soonest that we can catch it)
            if "cdNumberTotal" in metadata and metadata["cdNumberTotal"] is None:
                del metadata["cdNumberTotal"]

            guess = guessed(metadata, confidence=confidence)
            current = update_found(current, guess, match.span(), span_adjust)

    if filetype in ("episode", "episodesubtitle"):
        for rexp, confidence, span_adjust in episode_rexps:
            match = re.search(rexp, current, re.IGNORECASE)
            if match:
                metadata = match.groupdict()
                guess = guessed(metadata, confidence=confidence)
                current = update_found(current, guess, match.span(), span_adjust)

    # Now websites, but as exact string instead of regexps
    # They get an explicit confidence: this used to reuse whatever value the
    # regexp loops above left in `confidence`, which was unrelated to the
    # website match and raised NameError when no regexp loop had iterated.
    website_confidence = 1.0
    clow = current.lower()
    for site in websites:
        pos = clow.find(site.lower())
        if pos != -1:
            guess = guessed({"website": site}, confidence=website_confidence)
            current = update_found(current, guess, (pos, pos + len(site)))
            clow = current.lower()

    # release groups have certain constraints, cannot be included in the previous general regexps
    group_names = [
        r"\.(Xvid)-(?P<releaseGroup>.*?)[ \.]",
        r"\.(DivX)-(?P<releaseGroup>.*?)[\. ]",
        r"\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]",
    ]
    for rexp in group_names:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            metadata.update({"videoCodec": match.group(1)})
            guess = guessed(metadata, confidence=0.8)
            # (1, -1) leaves the leading '.' and the trailing separator
            # char of the match out of the recorded span
            current = update_found(current, guess, match.span(), span_adjust=(1, -1))

    # common well-defined words and regexps
    confidence = 1.0  # for all of them
    for prop, value, pos, end in find_properties(current):
        guess = guessed({prop: value}, confidence=confidence)
        current = update_found(current, guess, (pos, end))

    # weak guesses for episode number, only run it if we don't have an estimate already
    if filetype in ("episode", "episodesubtitle"):
        if not any("episodeNumber" in match for match in result):
            for rexp, _, span_adjust in weak_episode_rexps:
                match = re.search(rexp, current, re.IGNORECASE)
                if match:
                    metadata = match.groupdict()
                    epnum = int(metadata["episodeNumber"])
                    if epnum > 100:
                        # eg: 213 is read as season 2, episode 13
                        guess = guessed({"season": epnum // 100, "episodeNumber": epnum % 100}, confidence=0.6)
                    else:
                        guess = guessed(metadata, confidence=0.3)
                    current = update_found(current, guess, match.span(), span_adjust)

    # try to find languages now
    language, span, confidence = search_language(current)
    while language:
        # is it a subtitle language?
        if "sub" in clean_string(current[: span[0]]).lower().split(" "):
            guess = guessed({"subtitleLanguage": language}, confidence=confidence)
        else:
            guess = guessed({"language": language}, confidence=confidence)
        current = update_found(current, guess, span)

        language, span, confidence = search_language(current)

    # remove our sentinels now and adjust spans accordingly
    assert current[0] == " " and current[-1] == " "
    current = current[1:-1]
    regions = [((start - 1, end - 1), guess) for (start, end), guess in regions]

    # add an empty region at every '-' so the string is also cut there
    # (a dash at index 0 is ignored)
    didx = current.find("-")
    while didx > 0:
        regions.append(((didx, didx), None))
        didx = current.find("-", didx + 1)

    # cut our final groups, and rematch the guesses to the group that created
    # it, None if it is a leftover group
    region_spans = [span for span, guess in regions]
    string_groups = split_on_groups(string, region_spans)
    remaining_groups = split_on_groups(current, region_spans)
    guesses = []

    pos = 0
    for group in string_groups:
        found = False
        for span, guess in regions:
            # a region starting exactly where this group starts owns it
            if span[0] == pos:
                guesses.append(guess)
                found = True
        if not found:
            guesses.append(None)

        pos += len(group)

    return zip(string_groups, remaining_groups, guesses)