def guess_file_info(filename, filetype, info = [ 'filename' ]): """info can contain the names of the various plugins, such as 'filename' to detect filename info, or 'hash_md5' to get the md5 hash of the file. >>> guess_file_info('test/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1']) {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'} """ result = [] hashers = [] if isinstance(info, basestring): info = [ info ] for infotype in info: if infotype == 'filename': m = IterativeMatcher(filename, filetype = filetype) result.append(m.matched()) elif infotype == 'hash_mpc': import hash_mpc try: result.append(Guess({ 'hash_mpc': hash_mpc.hash_file(filename) }, confidence = 1.0)) except Exception, e: log.warning('Could not compute MPC-style hash because: %s' % e) elif infotype == 'hash_ed2k': import hash_ed2k try: result.append(Guess({ 'hash_ed2k': hash_ed2k.hash_file(filename) }, confidence = 1.0)) except Exception, e: log.warning('Could not compute ed2k hash because: %s' % e)
def _guess_filename(filename, filetype): mtree = IterativeMatcher(filename, filetype=filetype) opts, transfo_opts = mtree.second_pass_options if opts or transfo_opts: log.info("Running 2nd pass") mtree = IterativeMatcher(filename, filetype=filetype, opts=opts, transfo_opts=transfo_opts) return mtree.matched()
def _build_filename_mtree(filename, options=None, **kwargs): mtree = IterativeMatcher(filename, options=options, **kwargs) second_pass_options = mtree.second_pass_options if second_pass_options: log.info("Running 2nd pass") merged_options = dict(options) merged_options.update(second_pass_options) mtree = IterativeMatcher(filename, options=merged_options, **kwargs) return mtree
def guess_file_info(filename, filetype, info=None): """info can contain the names of the various plugins, such as 'filename' to detect filename info, or 'hash_md5' to get the md5 hash of the file. >>> guess_file_info('test/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1']) {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'} """ result = [] hashers = [] if info is None: info = ['filename'] if isinstance(info, basestring): info = [info] for infotype in info: if infotype == 'filename': m = IterativeMatcher(filename, filetype=filetype) result.append(m.matched()) elif infotype == 'hash_mpc': from guessit.hash_mpc import hash_file try: result.append( Guess({'hash_mpc': hash_file(filename)}, confidence=1.0)) except Exception, e: log.warning('Could not compute MPC-style hash because: %s' % e) elif infotype == 'hash_ed2k': from guessit.hash_ed2k import hash_file try: result.append( Guess({'hash_ed2k': hash_file(filename)}, confidence=1.0)) except Exception, e: log.warning('Could not compute ed2k hash because: %s' % e)
def _guess_filename(filename, filetype): mtree = IterativeMatcher(filename, filetype=filetype) m = mtree.matched() if 'language' not in m and 'subtitleLanguage' not in m: return m # if we found some language, make sure we didn't cut a title or sth... mtree2 = IterativeMatcher(filename, filetype=filetype, opts=['nolanguage', 'nocountry']) m2 = mtree2.matched() def find_nodes(tree, props): """Yields all nodes containing any of the given props.""" if isinstance(props, base_text_type): props = [props] for node in tree.nodes(): if any(prop in node.guess for prop in props): yield node def warning(title): log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string())) return m if m.get('title') != m2.get('title'): title = next(find_nodes(mtree.match_tree, 'title')) title2 = next(find_nodes(mtree2.match_tree, 'title')) langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage'])) if not langs: return warning('A weird error happened with language detection') # find the language that is likely more relevant for lng in langs: if lng.value in title2.value: # if the language was detected as part of a potential title, # look at this one in particular lang = lng break else: # pick the first one if we don't have a better choice lang = langs[0] # language code are rarely part of a title, and those # should be handled by the Language exceptions anyway if len(lang.value) <= 3: return m # if filetype is subtitle and the language appears last, just before # the extension, then it is likely a subtitle language parts = clean_string(title.root.value).split() if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and parts.index(lang.value) == len(parts) - 2): return m # if the language was in the middle of the other potential title, # keep the other title (eg: The Italian Job), except if it is at the # very beginning, in which case we consider it an error if m2['title'].startswith(lang.value): return m elif lang.value in title2.value: return m2 # if a node is in an explicit group, then the correct title is probably # the other one if title.root.node_at(title.node_idx[:2]).is_explicit(): return m2 elif title2.root.node_at(title2.node_idx[:2]).is_explicit(): return m return warning('Not sure of the title because of the language position') return m
def _guess_filename(filename, filetype): def find_nodes(tree, props): """Yields all nodes containing any of the given props.""" if isinstance(props, base_text_type): props = [props] for node in tree.nodes(): if any(prop in node.guess for prop in props): yield node def warning(title): log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string())) return m mtree = IterativeMatcher(filename, filetype=filetype) # if there are multiple possible years found, we assume the first one is # part of the title, reparse the tree taking this into account years = set(n.value for n in find_nodes(mtree.match_tree, 'year')) if len(years) >= 2: mtree = IterativeMatcher(filename, filetype=filetype, opts=['skip_first_year']) m = mtree.matched() if 'language' not in m and 'subtitleLanguage' not in m: return m # if we found some language, make sure we didn't cut a title or sth... mtree2 = IterativeMatcher(filename, filetype=filetype, opts=['nolanguage', 'nocountry']) m2 = mtree2.matched() if m.get('title') is None: return m if m.get('title') != m2.get('title'): title = next(find_nodes(mtree.match_tree, 'title')) title2 = next(find_nodes(mtree2.match_tree, 'title')) langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage'])) if not langs: return warning('A weird error happened with language detection') # find the language that is likely more relevant for lng in langs: if lng.value in title2.value: # if the language was detected as part of a potential title, # look at this one in particular lang = lng break else: # pick the first one if we don't have a better choice lang = langs[0] # language code are rarely part of a title, and those # should be handled by the Language exceptions anyway if len(lang.value) <= 3: return m # if filetype is subtitle and the language appears last, just before # the extension, then it is likely a subtitle language parts = clean_string(title.root.value).split() if (m['type'] in ['moviesubtitle', 'episodesubtitle']): if lang.value in parts and (parts.index(lang.value) == len(parts) - 2): return m # if the language was in the middle of the other potential title, # keep the other title (eg: The Italian Job), except if it is at the # very beginning, in which case we consider it an error if m2['title'].startswith(lang.value): return m elif lang.value in title2.value: return m2 # if a node is in an explicit group, then the correct title is probably # the other one if title.root.node_at(title.node_idx[:2]).is_explicit(): return m2 elif title2.root.node_at(title2.node_idx[:2]).is_explicit(): return m return warning('Not sure of the title because of the language position') return m
def _guess_filename(filename, filetype): def find_nodes(tree, props): """Yields all nodes containing any of the given props.""" if isinstance(props, base_text_type): props = [props] for node in tree.nodes(): if any(prop in node.guess for prop in props): yield node def warning(title): log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string())) return m mtree = IterativeMatcher(filename, filetype=filetype) m = mtree.matched() second_pass_opts = [] second_pass_transfo_opts = {} # if there are multiple possible years found, we assume the first one is # part of the title, reparse the tree taking this into account years = set(n.value for n in find_nodes(mtree.match_tree, 'year')) if len(years) >= 2: second_pass_opts.append('skip_first_year') to_skip_language_nodes = [] title_nodes = set( n for n in find_nodes(mtree.match_tree, ['title', 'series'])) title_spans = {} for title_node in title_nodes: title_spans[title_node.span[0]] = title_node title_spans[title_node.span[1]] = title_node for lang_key in ('language', 'subtitleLanguage'): langs = {} lang_nodes = set(n for n in find_nodes(mtree.match_tree, lang_key)) for lang_node in lang_nodes: lang = lang_node.guess.get(lang_key, None) if len(lang_node.value) > 3 and ( lang_node.span[0] in list(title_spans.keys()) or lang_node.span[1] in list(title_spans.keys())): # Language is next or before title, and is not a language code. Add to skip for 2nd pass. # if filetype is subtitle and the language appears last, just before # the extension, then it is likely a subtitle language parts = clean_string(lang_node.root.value).split() if m['type'] in [ 'moviesubtitle', 'episodesubtitle' ] and (parts.index(lang_node.value) == len(parts) - 2): continue to_skip_language_nodes.append(lang_node) elif not lang in langs: langs[lang] = lang_node else: # The same language was found. Keep the more confident one, and add others to skip for 2nd pass. existing_lang_node = langs[lang] to_skip = None if existing_lang_node.guess.confidence( 'language') >= lang_node.guess.confidence('language'): # lang_node is to remove to_skip = lang_node else: # existing_lang_node is to remove langs[lang] = lang_node to_skip = existing_lang_node to_skip_language_nodes.append(to_skip) if to_skip_language_nodes: second_pass_transfo_opts['guess_language'] = (((), { 'skip': [{ 'node_idx': node.parent.node_idx, 'span': node.span } for node in to_skip_language_nodes] })) if second_pass_opts or second_pass_transfo_opts: # 2nd pass is needed log.info("Running 2nd pass with options: %s" % second_pass_opts) log.info("Transfo options: %s" % second_pass_transfo_opts) mtree = IterativeMatcher(filename, filetype=filetype, opts=second_pass_opts, transfo_opts=second_pass_transfo_opts) m = mtree.matched() if 'language' not in m and 'subtitleLanguage' not in m or 'title' not in m: return m # if we found some language, make sure we didn't cut a title or sth... mtree2 = IterativeMatcher(filename, filetype=filetype, opts=['nolanguage', 'nocountry']) m2 = mtree2.matched() if m.get('title') != m2.get('title'): title = next(find_nodes(mtree.match_tree, 'title')) title2 = next(find_nodes(mtree2.match_tree, 'title')) # if a node is in an explicit group, then the correct title is probably # the other one if title.root.node_at(title.node_idx[:2]).is_explicit(): return m2 elif title2.root.node_at(title2.node_idx[:2]).is_explicit(): return m return m
def _guess_filename(filename, filetype): def find_nodes(tree, props): """Yields all nodes containing any of the given props.""" if isinstance(props, base_text_type): props = [props] for node in tree.nodes(): if any(prop in node.guess for prop in props): yield node def warning(title): log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string())) return m mtree = IterativeMatcher(filename, filetype=filetype) m = mtree.matched() second_pass_opts = [] second_pass_transfo_opts = {} # if there are multiple possible years found, we assume the first one is # part of the title, reparse the tree taking this into account years = set(n.value for n in find_nodes(mtree.match_tree, 'year')) if len(years) >= 2: second_pass_opts.append('skip_first_year') to_skip_language_nodes = [] title_nodes = set(n for n in find_nodes(mtree.match_tree, ['title', 'series'])) title_spans = {} for title_node in title_nodes: title_spans[title_node.span[0]] = title_node title_spans[title_node.span[1]] = title_node for lang_key in ('language', 'subtitleLanguage'): langs = {} lang_nodes = set(n for n in find_nodes(mtree.match_tree, lang_key)) for lang_node in lang_nodes: lang = lang_node.guess.get(lang_key, None) if len(lang_node.value) > 3 and (lang_node.span[0] in title_spans.keys() or lang_node.span[1] in title_spans.keys()): # Language is next or before title, and is not a language code. Add to skip for 2nd pass. # if filetype is subtitle and the language appears last, just before # the extension, then it is likely a subtitle language parts = clean_string(lang_node.root.value).split() if m['type'] in ['moviesubtitle', 'episodesubtitle'] and (parts.index(lang_node.value) == len(parts) - 2): continue to_skip_language_nodes.append(lang_node) elif not lang in langs: langs[lang] = lang_node else: # The same language was found. Keep the more confident one, and add others to skip for 2nd pass. existing_lang_node = langs[lang] to_skip = None if existing_lang_node.guess.confidence('language') >= lang_node.guess.confidence('language'): # lang_node is to remove to_skip = lang_node else: # existing_lang_node is to remove langs[lang] = lang_node to_skip = existing_lang_node to_skip_language_nodes.append(to_skip) if to_skip_language_nodes: second_pass_transfo_opts['guess_language'] = ( ((), { 'skip': [ { 'node_idx': node.parent.node_idx, 'span': node.span } for node in to_skip_language_nodes ] })) if second_pass_opts or second_pass_transfo_opts: # 2nd pass is needed log.info("Running 2nd pass with options: %s" % second_pass_opts) log.info("Transfo options: %s" % second_pass_transfo_opts) mtree = IterativeMatcher(filename, filetype=filetype, opts=second_pass_opts, transfo_opts=second_pass_transfo_opts) m = mtree.matched() if 'language' not in m and 'subtitleLanguage' not in m or 'title' not in m: return m # if we found some language, make sure we didn't cut a title or sth... mtree2 = IterativeMatcher(filename, filetype=filetype, opts=['nolanguage', 'nocountry']) m2 = mtree2.matched() if m.get('title') != m2.get('title'): title = next(find_nodes(mtree.match_tree, 'title')) title2 = next(find_nodes(mtree2.match_tree, 'title')) # if a node is in an explicit group, then the correct title is probably # the other one if title.root.node_at(title.node_idx[:2]).is_explicit(): return m2 elif title2.root.node_at(title2.node_idx[:2]).is_explicit(): return m return m
def guess_file_info(filename, filetype, info=None): """info can contain the names of the various plugins, such as 'filename' to detect filename info, or 'hash_md5' to get the md5 hash of the file. >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1']) {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'} """ result = [] hashers = [] # Force unicode as soon as possible filename = u(filename) if info is None: info = ['filename'] if isinstance(info, base_text_type): info = [info] for infotype in info: if infotype == 'filename': m = IterativeMatcher(filename, filetype=filetype) result.append(m.matched()) elif infotype == 'hash_mpc': from guessit.hash_mpc import hash_file try: result.append(Guess({'hash_mpc': hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute MPC-style hash because: %s' % e) elif infotype == 'hash_ed2k': from guessit.hash_ed2k import hash_file try: result.append(Guess({'hash_ed2k': hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute ed2k hash because: %s' % e) elif infotype.startswith('hash_'): import hashlib hashname = infotype[5:] try: hasher = getattr(hashlib, hashname)() hashers.append((infotype, hasher)) except AttributeError: log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname) else: log.warning('Invalid infotype: %s' % infotype) # do all the hashes now, but on a single pass if hashers: try: blocksize = 8192 hasherobjs = dict(hashers).values() with open(filename, 'rb') as f: chunk = f.read(blocksize) while chunk: for hasher in hasherobjs: hasher.update(chunk) chunk = f.read(blocksize) for infotype, hasher in hashers: result.append(Guess({infotype: hasher.hexdigest()}, confidence=1.0)) except Exception as e: log.warning('Could not compute hash because: %s' % e) result = merge_all(result) # last minute adjustments # if country is in the guessed properties, make it part of the filename if 'country' in result: result['series'] += ' (%s)' % result['country'].alpha2.upper() return result
def guess_file_info(filename, filetype, info=None): """info can contain the names of the various plugins, such as 'filename' to detect filename info, or 'hash_md5' to get the md5 hash of the file. >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1']) {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'} """ result = [] hashers = [] if info is None: info = ['filename'] if isinstance(info, base_text_type): info = [info] for infotype in info: if infotype == 'filename': m = IterativeMatcher(filename, filetype=filetype) result.append(m.matched()) elif infotype == 'hash_mpc': from guessit.hash_mpc import hash_file try: result.append( Guess({'hash_mpc': hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute MPC-style hash because: %s' % e) elif infotype == 'hash_ed2k': from guessit.hash_ed2k import hash_file try: result.append( Guess({'hash_ed2k': hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute ed2k hash because: %s' % e) elif infotype.startswith('hash_'): import hashlib hashname = infotype[5:] try: hasher = getattr(hashlib, hashname)() hashers.append((infotype, hasher)) except AttributeError: log.warning( 'Could not compute %s hash because it is not available from python\'s hashlib module' % hashname) else: log.warning('Invalid infotype: %s' % infotype) # do all the hashes now, but on a single pass if hashers: try: blocksize = 8192 hasherobjs = dict(hashers).values() with open(filename, 'rb') as f: chunk = f.read(blocksize) while chunk: for hasher in hasherobjs: hasher.update(chunk) chunk = f.read(blocksize) for infotype, hasher in hashers: result.append( Guess({infotype: hasher.hexdigest()}, confidence=1.0)) except Exception as e: log.warning('Could not compute hash because: %s' % e) result = merge_all(result) # last minute adjustments # if country is in the guessed properties, make it part of the filename if 'country' in result: result['series'] += ' (%s)' % result['country'].alpha2.upper() return result