def test_derive_key_name_from_video(self):
    """After a _task_handler run, every fetched Video should have a
    VideoSubtitles entity stored under the key derived from ('en',
    youtube_id), so subtitles are retrievable by key name alone.
    """
    # NOTE(review): this method was previously defined twice, byte-for-byte
    # identical; the second definition silently shadowed the first, so only
    # one copy ever ran. Collapsed to a single definition.
    self._set_responses_xrange(BATCH_SIZE)
    _task_handler('UUID')
    videos = Video.all().fetch(BATCH_SIZE)
    for v in videos:
        key = VideoSubtitles.get_key_name('en', v.youtube_id)
        subs = VideoSubtitles.get_by_key_name(key)
        self.assertIsNotNone(subs)
def download_subtitles(videos, report=None):
    """Fetch English subtitles for each video from the Universal Subtitles
    API and store them as VideoSubtitles entities.

    Fetches are issued asynchronously via urlfetch RPCs, then collected.
    Only stale records are written: if an up-to-date VideoSubtitles entity
    with identical JSON already exists, the datastore put is skipped.

    Arguments:
        videos: iterable of Video models; each video's youtube_id is used
            to build the API request URL and the entity key name.
        report: optional dict of counters (shape of REPORT_TEMPLATE),
            mutated in place. Updated keys: 'fetches', 'redirects',
            'writes', 'errors'. A fresh dict is created when None.
    """
    if report is None:
        report = dict(REPORT_TEMPLATE)

    # Asynchronously fetch. We'll rate-limit by fetching BATCH_SIZE subtitles
    # at each DEFER_SECONDS interval.
    rpcs = []
    for video in videos:
        url = UNISUBS_URL % urllib.quote(YOUTUBE_URL % video.youtube_id)
        rpc = urlfetch.create_rpc(deadline=TIMEOUT_SECONDS)
        urlfetch.make_fetch_call(rpc, url)
        rpcs.append((video.youtube_id, rpc))
        report['fetches'] += 1

    # Collect the asynchronous fetches.
    for youtube_id, rpc in rpcs:
        lang = 'en'
        key_name = VideoSubtitles.get_key_name(lang, youtube_id)
        try:
            resp = rpc.get_result()
            if resp.status_code != 200:
                raise RuntimeError('status code: %s' % resp.status_code)

            # final_url is only set when the fetch was redirected; a
            # redirect signals the API endpoint moved and this code needs
            # updating, so it is counted separately from errors.
            if resp.final_url:
                logging.warning('%s redirect to %s'
                                % (key_name, resp.final_url))
                report['redirects'] += 1

            # Renamed from `json` to avoid shadowing the stdlib module name.
            subtitles_json = resp.content.decode('utf-8')

            # Only update stale records.
            current = VideoSubtitles.get_by_key_name(key_name)
            if not current or current.json != subtitles_json:
                new = VideoSubtitles(key_name=key_name,
                                     youtube_id=youtube_id,
                                     language=lang,
                                     json=subtitles_json)
                new.put()
                report['writes'] += 1
            else:
                logging.info('%s content already up-to-date' % key_name)
        except Exception as e:
            # Broad catch is deliberate: a failed fetch/decode/put for one
            # video must not abort the rest of the batch; the failure is
            # logged and counted instead.
            logging.error('%s subtitles fetch failed: %s' % (key_name, e))
            report['errors'] += 1
def _task_handler(uid, task_id=0, cursor=None, report=None):
    """Task chain for fetching subtitles from the Universal Subtitles API

    It processes Video models in batches of BATCH_SIZE by fetching the
    English subtitles via an HTTP API call.

    This job runs regularly so fetch failures are fixed from run-to-run.

    Fetch failures are logged and suppressed as the task marches on.
    Errors include URL fetch timeouts, subtitles put failures, and
    response decoding failures.

    HTTP redirects indicate that the code needs updating to a new API
    endpoint. They are detected and reported separately.
    """
    # NOTE(review): this function was previously defined twice, byte-for-byte
    # identical; the second definition silently shadowed the first.
    # Collapsed to a single definition.
    query = Video.all()
    query.with_cursor(cursor)
    videos = query.fetch(BATCH_SIZE)

    if report is None:
        # First link in the task chain: start fresh counters and write an
        # initial (all-zero) fetch report keyed on this run's uid.
        # NOTE(review): the original (format-mangled) source is ambiguous
        # about whether this put() sits inside this branch — confirm against
        # revision history before relying on it.
        report = dict(REPORT_TEMPLATE)
        VideoSubtitlesFetchReport(key_name=uid, **report).put()

    # Asynchronously fetch. We'll rate-limit by fetching BATCH_SIZE subtitles
    # at each DEFER_SECONDS interval.
    rpcs = []
    for video in videos:
        url = UNISUBS_URL % urllib.quote(YOUTUBE_URL % video.youtube_id)
        rpc = urlfetch.create_rpc(deadline=TIMEOUT_SECONDS)
        urlfetch.make_fetch_call(rpc, url)
        rpcs.append((video.youtube_id, rpc))
        report['fetches'] += 1

    # Collect the asynchronous fetches.
    for youtube_id, rpc in rpcs:
        lang = 'en'
        key_name = VideoSubtitles.get_key_name(lang, youtube_id)
        try:
            resp = rpc.get_result()
            if resp.status_code != 200:
                raise RuntimeError('status code: %s' % resp.status_code)

            # final_url is only set on redirect; counted separately since a
            # redirect means the API endpoint moved (see docstring).
            if resp.final_url:
                logging.warning('%s redirect to %s'
                                % (key_name, resp.final_url))
                report['redirects'] += 1

            # Renamed from `json` to avoid shadowing the stdlib module name.
            subtitles_json = resp.content.decode('utf-8')

            # Only update stale records.
            current = VideoSubtitles.get_by_key_name(key_name)
            if not current or current.json != subtitles_json:
                new = VideoSubtitles(key_name=key_name,
                                     youtube_id=youtube_id,
                                     language=lang,
                                     json=subtitles_json)
                new.put()
                report['writes'] += 1
            else:
                logging.info('%s content already up-to-date' % key_name)
        except Exception as e:
            # Broad catch is deliberate: one video's failure must not abort
            # the batch; it is logged and counted, and the next regular run
            # retries it.
            logging.error('%s subtitles fetch failed: %s' % (key_name, e))
            report['errors'] += 1