class TestDownloadQueue(unittest.TestCase): def setUp(self): # download queue with default setting self._dq_default = DownloadQueue(user_agent='katzenbaum/4.2') # cache for custom downlodqueue self._cache = Cache() self._cache.open() self._dq_custom = DownloadQueue( local_cache=self._cache ) self._url = 'http://httpbin.org/status/{code}' self._job = types.SimpleNamespace(**{ 'url': ['http://httpbin.org/get'], 'response': None, 'return_code': None }) def test_user_agent(self): self._dq_default.push(self._job) job = self._dq_default.pop() for url, content in job.response: response = json.loads(content) self.assertTrue( response['headers']['User-Agent'] == 'katzenbaum/4.2' ) def test_bad_status_codes(self): test_codes = ['404', '408', '503'] self._job.url = [ self._url.format(code=_code) for _code in test_codes ] self._dq_default.push(self._job) job = self._dq_default.pop() for code in job.return_code: self.assertTrue(code['status'] in test_codes) test_codes.remove(code['status']) for response in job.response: url, content = response self.assertTrue(content is '') def test_good_status_codes(self): test_codes = ['200', '300', '304'] self._job.url = [ self._url.format(code=_code) for _code in test_codes ] self._job.response = None self._dq_default.push(self._job) job = self._dq_default.pop() for code in job.return_code: self.assertTrue(code['status'] in test_codes) test_codes.remove(code['status']) for response in job.response: url, content = response self.assertTrue(content is '') def test_without_local_cache(self): self._dq_default.push(self._job) job = self._dq_default.pop() self._dq_default.push(self._job) job = self._dq_default.pop() for code in job.return_code: self.assertTrue(code != 'local') self.assertTrue(code['status'] == '200') def test_with_local_cache(self): self._dq_default.push(self._job) job = self._dq_default.pop() for code in job.return_code: self.assertTrue(code['status'] == '200') for url, content in job.response: self._cache.write(url, content) # self._cache.write(job.url, job.response) self._dq_custom.push(job) job = self._dq_custom.pop() for code in job.return_code: self.assertTrue(code == 'local') def test_pop_empty(self): with self.assertRaises(Empty): self._dq_default.pop() def test_downloadqueue_shutdown(self): self.assertTrue(self._dq_default._shutdown_downloadqueue is False) self._dq_default.push(None) self.assertTrue(self._dq_default._shutdown_downloadqueue is True) self.assertTrue(self._dq_custom._shutdown_downloadqueue is False) self._dq_custom.push(self._job) self.assertTrue(self._dq_custom._shutdown_downloadqueue is False) self._dq_default.push(None) self._dq_default.push(None) self._dq_default.push(None) self.assertTrue(self._dq_default._shutdown_downloadqueue is True) def tearDown(self): self._cache.close()
class Session: def __init__( self, cache_path=None, parallel_jobs=1, parallel_downloads_per_job=8, timeout_sec=5, user_agent='libhugin/1.0' ): """ Init a session object with user specified parameters. Creating a Session: .. code-block:: python # importing the session from hugin.harvest import Session session = Session() There are some Session parameters like the 'user-agent' that may be changed by the user. The following example will create a Session that uses the user agent *'ravenlib/1.0'*, the cache will be stored at */tmp/hugincache/*, two job threads will be used, each job will have four simultanous *download threds* and the timeout for each http response will be ten seconds. Example: .. code-block:: python session = Session( user_agent='ravenlib/1.0', cache_path='/tmp/hugincache', parallel_jobs=2, parallel_downloads_per_job=4, timeout_sec=10 ) The following parameters are customizable by the user: :param str cache_path: Path of cache to be written to. This is the path where the *cache container* should be saved. Currently the cache is a python shelve storing valid http responses. :param parallel_jobs: Number of simultaneous jobs to be used. This parameter is used to set the number of simultaneous jobs. The default value is 1, as there is not much performance gain because of the GIL. The main purpose if of threads in this case is to make asynchronous submit execution possible. :param int parallel_downloads_per_job: Number of parallel downloads. This parameter sets the number of parallel download jobs. Each job will use this number of parallel jobs. :param int timeout_sec: Timeout for http requests to be used. This timeout will be use for *every* http response. :param str user_agent: User-agent to be used for metadata downloading. """ signal.signal(signal.SIGINT, self._signal_handler) if cache_path is None: cache_path = '/tmp/{uuid}'.format(uuid=str(uuid.uuid4())) self._config = { 'cache_path': cache_path, # limit parallel jobs to 4, there is no reason for a huge number of # parallel jobs because of the GIL 'parallel_jobs': min(4, parallel_jobs), 'download_threads': parallel_downloads_per_job, 'timeout_sec': timeout_sec, 'user_agent': user_agent, } self._plugin_handler = PluginHandler() self._plugin_handler.activate_plugins_by_category('Provider') self._plugin_handler.activate_plugins_by_category('Converter') self._plugin_handler.activate_plugins_by_category('Postprocessor') self._provider = self._plugin_handler.get_plugins_from_category( 'Provider' ) self._postprocessor = self._plugin_handler.get_plugins_from_category( 'Postprocessor' ) self._converter = self._plugin_handler.get_plugins_from_category( 'Converter' ) self._cache = Cache() self._cache.open(path=self._config['cache_path']) self._async_executor = ThreadPoolExecutor( max_workers=self._config['parallel_jobs'] ) self._cleanup_triggered = False self._provider_types = { 'movie': [], 'movie_picture': [], 'person': [], 'person_picture': [] } self._downloadqueues = [] self._submit_futures = [] self._shutdown_session = False # categorize provider for convinience reasons for provider in self._provider: self._categorize(provider) def create_query(self, **kwargs): """ Validate params and return a Query. This function returns a Query object build of the given kwargs. Keys are validated and default or missing values are set by the :class:`hugin.harvest.query.Query`. .. note:: All invalid key parameters will be filtered. If there are missing or inconsistent values a KeyError exception will be raised by the Query. The following query will search for the movie *Sin City*, the result amount is limited to a max number of five items. Download retries are set to two. Example code snippet: .. code-block:: python query = session.create_query(title='Sin City', amount=5, retries=2) # just to illustrate how a query looks like print(query) {'year': None, 'type': 'movie', 'providers': None, 'remove_invalid': True, 'fuzzysearch': False, 'amount': 5, 'language': '', 'imdbid': None, 'retries': 2, 'cache': True, 'strategy': 'flat', 'search': 'both', 'title': 'Sin City'} You will just receive a dictionary representing the search values. Depending on the metadata type there are different parameters to be used in a query. The following parameters are possible (default value inside brackets): Movie specific: :param str title: Movie title. The movie title, this key will set the type key to 'movie'. The title has to be set in single quotes. Example: .. code-block:: python # get a query for the movie watchmen, everything is initialized # with default values query = session.create_query(title='Watchmen') [...] # building another query, movie title with whitespace query = session.create_query(title='Only god forgives') [...] :param int year: Movie year. In most cases the movie release date as 4-digit int. Example: .. code-block:: python # appending a release date to the query query = session.create_query(title='Sin City', year=2005) [...] :param str imdbid: The imdbid. You can also search by imdbid, like movie titles the value has to be quoted. Example: .. code-block:: python # building a query from imdbid (Drive (2011)) query = session.create_query(imdbid='tt0780504') Person specific: :param str name: Person name. The name key will set the type key to 'person'. Like movie titles, person names has to be set into quotes. Example: .. code-block:: python query = session.create_query(name='Evangeline Lilly') General: This parameters may be set on movie an person queries as they are not specific to a single type. :param str search: Search textual, picture or both [text]. This parameter will influence the search by choosing provider that are only able to search for textual metadata, pictures or both. Example: .. code-block:: python # will trigger textual and picture only provider query = session.create_query(title='Sin City', search='both') :param str strategy: Search strategy deep or flat [flat]. When limiting the search results to three, every provider is looking for three results. After all providers are finished, the max amount of results according to the amount limit is returned. The way results are composed is defined by the strategy flag. Example: .. code-block:: python # invoking flat search query = session.create_query( title='Sin City', strategy='flat', amount=5 ) [...] # invoking deep search query = session.create_query( title='Sin City', strategy='deep', amount=5 ) [...] The table below illustrates a provider search with a amount limited to five. The first provider has found four results, the second provider only three and the third provider only found two results. Exemplar result table: +------------------+---------------+---------------+-------------+ | results/provider | #1 tmdb | #2 ofdb | #3 imdb | | priority ---> | 90 | 80 | 70 | +------------------+---------------+---------------+-------------+ | highest quality | result1 (f,d) | result1 (f,d) | result1 (f) | +------------------+---------------+---------------+-------------+ | ... | result2 (f,d) | result2 (f) | result2 | +------------------+---------------+---------------+-------------+ | ... | result3 (d) | result3 | | +------------------+---------------+---------------+-------------+ | ... | result4 (d) | result4 | | +------------------+---------------+---------------+-------------+ | lowest quality | | result5 | | +------------------+---------------+---------------+-------------+ After the provider results are *collected*, only five results are returned to the user as amount is five. How the results are picked depends on the strategy. Every provider has a priority. Priority of 90 is the *highest priority* in this example. Providers with a higher priority are preferred. Using the 'deep' strategy, the results are sorted by provider priority and the first five results (marked with 'd') are taken. Choosing the 'flat' strategy the results are chosen by its quality. This means that result1 of all three providers an result2 (marked with 'f') of the first and second provider are returned. :param bool cache: Use local cache [True]. If set the local cache will be used on each query. Http responses are cached. If a specific url response has already been cached previously it will be returned from the cache. If url is not found in the cache, a http request will be triggered. Only *valid* responses are cached. :param int retries: Number of retries per request [5]. If a http response timeout happens or a provider response is marked as invalid but not finished a retry will be triggered. This parameters limits the max possible retries. :param int amount: Number of Items you want to get [3]. This parameter limits the amount of results to be returned by a submit. .. code-block:: python # this query will return a max of 2 results query = session.create_query(title='Sin City', amount=2) [...] :param list providers: A list with provider name strings [all]. With the providers parameter you can limit the search to specific providers by giving libhugin a list with providers you want to query. Example: .. code-block:: python # this query will only trigger the omdbmovie and tmdbmovie # provider q = session.create_query( title='Sin', providers=['omdbmovie', 'tmdbmovie'] ) [...] To get the names of all available providers use the :meth:`Session.provider_plugins` method. .. code-block:: python providers = session.provider_plugins() for provider in providers: print(provider.name) Output: :: OFDBMovie OFDBPerson OMDBMovie TMDBPerson TMDBMovie :param str language: Language \ `ISO 639-1 <http://en.wikipedia.org/wiki/ISO_639>`_ Format [''] The language you want to use for your query. Currently there is only the tmdb provider that is multilingual. All other providers are limited to a specific language e.g. English or German. The genre normalization is currently also limited to German/English normalization only. Example: .. code-block:: python # this query will return German language attributes if the movie # provider is multilingual, otherwise the providers default # language will be returned query = session.create_query(title='Sin City', language='de') [...] :param bool fuzzysearch: Enable 'fuzzy search' mode. Content providers are pretty fussy about the title or name you search for. Therefor there is a fuzzy search mode implemented. This mode will try to get the right results even if the title/person is pretty much misspelled. Looking for the movie 'Only god forgives' works pretty well if the title is spelled correct, but if only a single word is misspelled like 'Only good forgives' no results will be found by the currently implemented providers. The libhugin fuzzy search is a simple workaround that is provider independent but requires a provider to be able to do a imdbid lookup. Enabling this mode libhugin will guess a imdbid for your misspelled title and query the available providers with this id. The downside is that currently only exact results for the guessed imdbid are returned. The fuzzy search will even work if you misspell the title like 'unly gut forgivs'. Example: .. code-block:: python # searching for misspelled Sin City title, will return nothing query = session.create_query(title='Sun Sity') results = session.submit(query) print(result) [] [...] # searching for misspelled Sin City title, with enabled # fuzzy search will return movies found by using the Sin City # imdbid tt0401792 query = session.create_query(title='Sun Sity', fuzzysearch=True) results = session.submit(query) print(result) [<OFDBMovie <movie> : Sin City (2005)>, <OMDBMovie <movie> : Sin City (2005)>, <TMDBMovie <movie, picture> : Sin City (2005)>] [...] :param str type: Type of metadata. person, movie. This parameter defines the type of metadata you want to search for, it is currently set automatically and should be may be ignored. """ return Query(kwargs) def _init_download_queue(self, query): """ Return a downloadqueue configured with user specified parameters. :return: A configured downloadqueue. """ if query.cache: query.cache = self._cache else: query.cache = None if query['fuzzysearch']: self._fuzzy_search(query) if query['type'] == 'movie' and query['id_title_lookup']: self._imdbid_title_lookup(query) downloadqueue = DownloadQueue( num_threads=self._config['download_threads'], timeout_sec=self._config['timeout_sec'], user_agent=self._config['user_agent'], local_cache=query.cache ) self._downloadqueues.append(downloadqueue) return downloadqueue def _add_to_cache(self, response): """ Write a response tuple (url, data) to local cache. """ for url, data in response: self._cache.write(url, data) def _submit(self, query): """ Here be dragons. """ results = [] downloadqueue = self._init_download_queue(query) for job in self._create_jobs_according_to_search_params(query): if job.done: results.append(self._job_to_result(job, query)) else: downloadqueue.push(job) while True: try: job = downloadqueue.pop() except queue.Empty: break if not job.response: results.append(self._job_to_result(job, query)) continue response = copy.deepcopy(job.response) # trigger provider to parse its request and process the result job.result, job.done = job.provider.parse_response( job.response, query ) if job.result: self._add_to_cache(response) if job.done: results.append(self._job_to_result(job, query)) else: self._process_flagged_as_not_done( job, downloadqueue, query, results ) downloadqueue.push(None) if query.remove_invalid: results = [result for result in results if result._result_dict.get('title')] return self._select_results_by_strategy(results, query) def submit(self, query): """ Submit a synchronous search query that blocks until finished. The following code block illustrates the query usage: .. code-block:: python results = s.submit(query) # blocks print(result) [<TMDBMovie <movie, picture> : Sin City (2005)>, <OFDBMovie <movie> : Sin City (2005)>, <OMDBMovie <movie> : Sin City (2005)>] The :meth:`Session.submit` method blocks. You can also submit the query asynchronously by using the :meth:`Session.submit_async` method. :param query: Query object with search parameters. :returns: A list with result objects. """ if self._shutdown_session: self.clean_up() else: return self._submit(query) def submit_async(self, query): """ Invoke :meth:`submit` asynchronously. The following code block illustrates the query usage: .. code-block:: python results_q1 = s.submit_async(query_one) # dosen't block results_q2 = s.submit_async(query_two) # dosen't block [...] :param query: Query object with search parameters. :returns: A future object objects. """ future = self._async_executor.submit( self.submit, query ) self._submit_futures.append(future) return future def _process_flagged_as_not_done(self, job, downloadqueue, query, results): """ Process jobs which are marked as not done by provider. """ if job.result: new_jobs = self._create_new_jobs_from_urls( job.result, job.provider, query ) for job in new_jobs: downloadqueue.push(job) else: job = self._decrement_retries(job) if job.done: results.append(self._job_to_result(job, query)) else: downloadqueue.push(job) def _fuzzy_search(self, query): if query['title'] and query['imdbid'] is None: year=query.get('year', '') if year: fmt = 'http://www.google.com/search?hl=de&q={title}+{year} imdb+movie&btnI=745' url = requests.get(fmt.format(title=query['title'], year=query.get('year', ''))).url else: fmt = "http://www.google.com/search?hl=de&q={title} imdb+movie&btnI=745" url = requests.get(fmt.format(title=query['title'])).url imdbids = re.findall('\/tt\d*/', url) if imdbids: query['imdbid'] = imdbids.pop().strip('/') def _imdbid_title_lookup(self, query): if query['imdbid']: fmt = 'http://www.imdb.com/title/{imdb_id}' url = requests.get(fmt.format(imdb_id=query['imdbid'])) if not 'google' in url.url: title, year = re.search( '\>(.+?)\s*\((\d{4})', url.text ).groups() if year.isnumeric(): query['title'], query['year'] = title, int(year) else: query['title'], query['year'] = title, None def _select_results_by_strategy(self, results, query): """ Filter result objects according to user specified search strategy. :param results: A list with finished results. :param query: The query that belongs to the results given. """ if len(results) == 0: return results if query.strategy == 'deep': return self._results_deep_strategy(results, query) else: return self._results_flat_strategy(results, query) def _results_deep_strategy(self, results, query): """ Return results proccessed with deep strategy. """ results.sort(key=lambda x: x.provider._priority, reverse=True) results = self._sort_by_ratio(results, query) return results[:query.amount] def _results_flat_strategy(self, results, query): """ Return results proccessed with flat strategy. """ result_map = defaultdict(list) # group by provider for result in results: result_map[result.provider].append(result) # sort by ratio for provider, results in result_map.items(): result_map[provider] = self._sort_by_ratio(results, query) result_map = OrderedDict( sorted( result_map.items(), key=lambda t: t[0]._priority, reverse=True ) ) results = list( filter(None, reduce(add, zip_longest(*result_map.values()))) ) return results[:query.amount] def _sort_by_ratio(self, results, query): """ Sort results by ratio between result and search params. """ ratio_table = [] qry_imdb = query.get('imdbid') for result in filter(lambda res: res._result_dict, results): ratio = 0.0 if qry_imdb and qry_imdb == result._result_dict.get('imdbid'): ratio = 1.0 elif query.get('title'): ratio_a = string_similarity_ratio( query.title, result._result_dict.get('original_title') ) ratio_b = string_similarity_ratio( query.title, result._result_dict['title'] ) ratio = max(ratio_a or 0.0, ratio_b or 0.0) # TODO: Fix first time wrong results. Maybe sort by year? if query.get('year') and result._result_dict['year']: a, b = query.get('year'), result._result_dict['year'] penalty = math.sqrt(1 - (abs(a - b) / max(a, b))) ratio *= penalty ratio_entry = {'result': result, 'ratio': ratio} ratio_table.append(ratio_entry) ratio_table.sort(key=lambda x: x['ratio'], reverse=True) return [res['result'] for res in ratio_table] def _job_to_result(self, job, query): """ Return a result generated from finished job and query. """ retries = query.retries - job.retries_left if query['type'] == 'movie': clean_result = movie_result_mask(job.result) if query['type'] == 'person': clean_result = person_result_mask(job.result) result = Result( provider=job.provider, query=query, result=clean_result, retries=retries ) return result def _decrement_retries(self, job): """ Decrement retries inside job, set to done if no retries left. """ if job.retries_left > 0: job.retries_left -= 1 else: job.done = True return job def _get_job_struct(self, provider, query): """ Return a job structure. """ params = [ 'url', 'future', 'response', 'done', 'result', 'return_code', 'retries_left', 'provider' ] job = types.SimpleNamespace(**{param: None for param in params}) job.provider, job.retries_left = provider, query.retries return job def _get_matching_provider(self, query): """ Return provider list with according to params in query. """ providers = [] for key, value in self._provider_types.items(): if query.type in key: providers += self._provider_types[key] if query.providers: allowed_provider = [x.upper() for x in query.providers] prov_filter = lambda x: x['name'].name.upper() in allowed_provider providers = [x for x in filter(prov_filter, providers)] return providers def _create_jobs_according_to_search_params(self, query): """ Create new jobs, according to given search params in query. """ job_list = [] for provider in self._get_matching_provider(query): provider = provider['name'] job = self._get_job_struct(provider=provider, query=query) url_list = job.provider.build_url(query) if url_list is not None: job.url = url_list else: job.done = True job_list.append(job) return job_list def _create_new_jobs_from_urls(self, urls, provider, query): """ Create new jobs from urls and a specific provider. """ jobs = [] for url_list in urls: job = self._get_job_struct(provider=provider, query=query) job.url = url_list jobs.append(job) return jobs def _categorize(self, provider): """ Cagegorizes providers according to its type. """ self._provider_types[provider.identify_type()].append( {'name': provider, 'supported_attrs': provider.supported_attrs} ) def provider_plugins(self, pluginname=None): """ Return provider plugins. :param pluginname: Name of a specific provider. Passing a provider plugin name will only return a single provider. :returns: Provider plugin list or specific provider. """ return self._get_plugin(self._provider, pluginname) def postprocessor_plugins(self, pluginname=None): """ Return postprocessor plugins. See analogue: :meth:`provider_plugins` """ return self._get_plugin(self._postprocessor, pluginname) def converter_plugins(self, pluginname=None): """ Return converter plugins. See analogue: :meth:`provider_plugins` """ return self._get_plugin(self._converter, pluginname) def _get_plugin(self, plugins, pluginname=None): if pluginname is None: return plugins else: for plugin in plugins: if pluginname.upper() in plugin.name.upper(): return plugin def clean_up(self): """ Do a clean up on keyboard interrupt or submit cancel. This method needs to be triggered after a cancel. It will block until ready. """ if self._cleanup_triggered is False: self._cleanup_triggered = True print('You pressed Ctrl+C!') print('cleaning up.') # kill all pending futures for future in self._submit_futures: if future.running() is False and future.done() is False: future.cancel() print('waiting for remaining futures to complete.') self._async_executor.shutdown(wait=True) # print('closing cache.') self._cache.close() # print('cache closed.') def cancel(self): """ Cancel the currently running session. The cancel method will set a shutdown flag inside the :meth:`Session`. All running jobs will be finished, pending jobs will be canceled. """ self._shutdown_session = True def _signal_handler(self, signal, frame): """ Invoke cancel on signal interrupt. """ self.cancel()