Example #1
0
 def test_derive_key_name_from_video(self):
     """Each fetched video should have subtitles stored under its derived key."""
     self._set_responses_xrange(BATCH_SIZE)
     _task_handler('UUID')
     for video in Video.all().fetch(BATCH_SIZE):
         name = VideoSubtitles.get_key_name('en', video.youtube_id)
         self.assertIsNotNone(VideoSubtitles.get_by_key_name(name))
 def test_derive_key_name_from_video(self):
     """Subtitles must be retrievable via the key name derived from each video."""
     self._set_responses_xrange(BATCH_SIZE)
     _task_handler('UUID')
     fetched = Video.all().fetch(BATCH_SIZE)
     for video in fetched:
         key = VideoSubtitles.get_key_name('en', video.youtube_id)
         subtitles = VideoSubtitles.get_by_key_name(key)
         self.assertIsNotNone(subtitles)
    def test_assume_utf8_encoded_content(self):
        """Fetched response bytes are decoded as UTF-8 before being stored."""
        # Universal Subtitles API returns utf-8
        # u'\xc4\xd0' is unicode for the utf-8 byte string '\xc3\x84\xc3\x90'
        raw_bytes = '\xc3\x84\xc3\x90'
        decoded = u'\xc4\xd0'

        self._set_responses_xrange(1, content=raw_bytes)
        _task_handler('UUID')

        self.assertEqual(VideoSubtitles.all().count(), 1)
        self.assertEqual(VideoSubtitles.all().get().json, decoded)
Example #4
0
    def test_assume_utf8_encoded_content(self):
        """The stored json field holds the UTF-8-decoded response content."""
        # Universal Subtitles API returns utf-8
        # u'\xc4\xd0' is unicode for the utf-8 byte string '\xc3\x84\xc3\x90'
        encoded, expected = '\xc3\x84\xc3\x90', u'\xc4\xd0'

        self._set_responses_xrange(1, content=encoded)
        _task_handler('UUID')

        self.assertEqual(VideoSubtitles.all().count(), 1)
        stored = VideoSubtitles.all().get()
        self.assertEqual(stored.json, expected)
Example #5
0
def download_subtitles(videos, report=None):
    """Fetch English Universal Subtitles for videos and store fresh copies.

    Issues one asynchronous urlfetch per video, then collects the results.
    Per-video failures (timeouts, non-200 status, decode errors, datastore
    put errors) are logged and counted but never raised, so one bad fetch
    cannot abort the batch.

    Arguments:
        videos: iterable of Video models exposing a youtube_id attribute
        report: optional counters dict shaped like REPORT_TEMPLATE; mutated
            in place ('fetches', 'redirects', 'writes', 'errors')
    """
    if report is None:
        report = dict(REPORT_TEMPLATE)

    # Asynchronously fetch. We'll rate-limit by fetching BATCH_SIZE subtitles
    # at each DEFER_SECONDS interval

    rpcs = []
    for video in videos:
        url = UNISUBS_URL % urllib.quote(YOUTUBE_URL % video.youtube_id)
        rpc = urlfetch.create_rpc(deadline=TIMEOUT_SECONDS)
        urlfetch.make_fetch_call(rpc, url)
        rpcs.append((video.youtube_id, rpc))
        report['fetches'] += 1

    # Process asynchronous fetches

    for youtube_id, rpc in rpcs:
        lang = 'en'
        key_name = VideoSubtitles.get_key_name(lang, youtube_id)
        try:
            resp = rpc.get_result()
            if resp.status_code != 200:
                raise RuntimeError('status code: %s' % resp.status_code)

            # A redirect suggests the API endpoint moved; surface it so the
            # URL constants can be updated.
            if resp.final_url:
                # logging.warn is a deprecated alias for logging.warning
                logging.warning('%s redirect to %s' % (key_name, resp.final_url))
                report['redirects'] += 1

            json = resp.content.decode('utf-8')

            # Only update stale records

            current = VideoSubtitles.get_by_key_name(key_name)
            if not current or current.json != json:
                new = VideoSubtitles(key_name=key_name, youtube_id=youtube_id,
                                     language=lang, json=json)
                new.put()
                report['writes'] += 1
            else:
                logging.info('%s content already up-to-date' % key_name)
        except Exception as e:
            # Suppress and count the failure; this job runs regularly so the
            # next run retries naturally.
            logging.error('%s subtitles fetch failed: %s' % (key_name, e))
            report['errors'] += 1
    def test_process_next_batch_on_nonempty_cursor(self):
        """A task handed a cursor only processes videos beyond that cursor."""
        skip_count = 3

        # these should be skipped, they'll DownloadError
        for i in xrange(0, skip_count):
            Video(youtube_id=str(i)).put()

        # these should be downloaded
        self._set_responses_xrange(skip_count, BATCH_SIZE + skip_count)

        # Advance a query past the skipped videos and capture its cursor.
        query = Video.all()
        query.fetch(skip_count)

        _task_handler('UUID', cursor=query.cursor())
        self.assertEqual(VideoSubtitles.all().count(), BATCH_SIZE)
Example #7
0
    def test_process_next_batch_on_nonempty_cursor(self):
        """Resuming from a cursor skips already-seen videos."""
        offset = 3

        # these should be skipped, they'll DownloadError
        for n in xrange(0, offset):
            Video(youtube_id=str(n)).put()

        # these should be downloaded
        self._set_responses_xrange(offset, BATCH_SIZE + offset)

        query = Video.all()
        query.fetch(offset)
        resume_cursor = query.cursor()

        _task_handler('UUID', cursor=resume_cursor)
        stored = VideoSubtitles.all().count()
        self.assertEqual(stored, BATCH_SIZE)
    def test_should_not_put_duplicate_subtitles(self, info):
        """Re-fetching identical content skips put() and logs the skip."""
        self._set_responses_xrange(BATCH_SIZE, content="some json")

        # first fetch
        _task_handler('UUID', 0)
        self.assertEqual(VideoSubtitles.all().count(), BATCH_SIZE)
        self.assertEqual(info.call_count, 0)

        with patch('unisubs.VideoSubtitles') as subtitles_mock:
            # Keep key lookups real so the handler finds the existing rows.
            subtitles_mock.get_key_name = VideoSubtitles.get_key_name
            subtitles_mock.get_by_key_name = VideoSubtitles.get_by_key_name
            # second fetch, same content
            _task_handler('UUID', 1)
            self.assertEqual(subtitles_mock.return_value.put.call_count, 0,
                             'duplicate subtitles should not be put()')
            self.assertEqual(info.call_count, BATCH_SIZE,
                             'skipped put should be logged')
Example #9
0
    def test_should_not_put_duplicate_subtitles(self, info):
        """A second fetch of unchanged content must not write new entities."""
        self._set_responses_xrange(BATCH_SIZE, content="some json")

        # first fetch
        _task_handler('UUID', 0)
        self.assertEqual(VideoSubtitles.all().count(), BATCH_SIZE)
        self.assertEqual(info.call_count, 0)

        with patch('unisubs.VideoSubtitles') as mock_model:
            mock_model.get_key_name = VideoSubtitles.get_key_name
            mock_model.get_by_key_name = VideoSubtitles.get_by_key_name
            # second fetch, same content
            _task_handler('UUID', 1)
            put_calls = mock_model.return_value.put.call_count
            self.assertEqual(put_calls, 0,
                             'duplicate subtitles should not be put()')
            self.assertEqual(info.call_count, BATCH_SIZE,
                             'skipped put should be logged')
Example #10
0
 def test_process_first_batch_on_empty_cursor(self):
     """Without a cursor the handler processes the first BATCH_SIZE videos."""
     self._set_responses_xrange(BATCH_SIZE)
     _task_handler('UUID')
     stored = VideoSubtitles.all().count()
     self.assertEqual(stored, BATCH_SIZE)
Example #11
0
def _task_handler(uid, task_id=0, cursor=None, report=None):
    """Task chain for fetching subtitles from the Universal Subtitles API

    It processes Video models in batches of BATCH_SIZE by fetching the English
    subtitles via an HTTP API call.

    This job runs regularly so fetch failures are fixed from run-to-run.  Fetch
    failures are logged and suppressed as the task marches on.

    Errors include URL fetch timeouts, subtitles put failures, and response
    decoding failures.

    HTTP redirects indicate that the code needs updating to a new API endpoint.
    They are detected and reported separately.

    Arguments:
        uid: unique key name for this run's VideoSubtitlesFetchReport
        task_id: position of this task in the chain (first task is 0)
        cursor: datastore cursor marking where the previous batch stopped
        report: counters dict carried through the chain; created (and an
            empty report entity put) when None
    """

    query = Video.all()
    query.with_cursor(cursor)
    videos = query.fetch(BATCH_SIZE)

    # First task in the chain: seed an empty fetch report keyed by run id.
    if report is None:
        report = dict(REPORT_TEMPLATE)
        VideoSubtitlesFetchReport(key_name=uid, **report).put()

    # Asynchronously fetch. We'll rate-limit by fetching BATCH_SIZE subtitles
    # at each DEFER_SECONDS interval

    rpcs = []
    for video in videos:
        url = UNISUBS_URL % urllib.quote(YOUTUBE_URL % video.youtube_id)
        rpc = urlfetch.create_rpc(deadline=TIMEOUT_SECONDS)
        urlfetch.make_fetch_call(rpc, url)
        rpcs.append((video.youtube_id, rpc))
        report['fetches'] += 1

    # Process asynchronous fetches

    for youtube_id, rpc in rpcs:
        lang = 'en'
        key_name = VideoSubtitles.get_key_name(lang, youtube_id)
        try:
            resp = rpc.get_result()
            if resp.status_code != 200:
                raise RuntimeError('status code: %s' % resp.status_code)

            # A redirect suggests the API endpoint moved; surface it so the
            # URL constants can be updated.
            if resp.final_url:
                # logging.warn is a deprecated alias for logging.warning
                logging.warning('%s redirect to %s' % (key_name, resp.final_url))
                report['redirects'] += 1

            json = resp.content.decode('utf-8')

            # Only update stale records

            current = VideoSubtitles.get_by_key_name(key_name)
            if not current or current.json != json:
                new = VideoSubtitles(key_name=key_name,
                                     youtube_id=youtube_id,
                                     language=lang,
                                     json=json)
                new.put()
                report['writes'] += 1
            else:
                logging.info('%s content already up-to-date' % key_name)
        except Exception as e:
            # Suppress and count the failure; the next scheduled run retries.
            logging.error('%s subtitles fetch failed: %s' % (key_name, e))
            report['errors'] += 1
Example #12
0
 def test_process_first_batch_on_empty_cursor(self):
     """An empty cursor means the handler starts from the first video."""
     self._set_responses_xrange(BATCH_SIZE)
     _task_handler('UUID')
     count = VideoSubtitles.all().count()
     self.assertEqual(count, BATCH_SIZE)
Example #13
0
def _task_handler(uid, task_id=0, cursor=None, report=None):
    """Task chain for fetching subtitles from the Universal Subtitles API

    It processes Video models in batches of BATCH_SIZE by fetching the English
    subtitles via an HTTP API call.

    This job runs regularly so fetch failures are fixed from run-to-run.  Fetch
    failures are logged and suppressed as the task marches on.

    Errors include URL fetch timeouts, subtitles put failures, and response
    decoding failures.

    HTTP redirects indicate that the code needs updating to a new API endpoint.
    They are detected and reported separately.

    Arguments:
        uid: unique key name for this run's VideoSubtitlesFetchReport
        task_id: position of this task in the chain (first task is 0)
        cursor: datastore cursor marking where the previous batch stopped
        report: counters dict carried through the chain; created (and an
            empty report entity put) when None
    """

    query = Video.all()
    query.with_cursor(cursor)
    videos = query.fetch(BATCH_SIZE)

    # First task in the chain: seed an empty fetch report keyed by run id.
    if report is None:
        report = dict(REPORT_TEMPLATE)
        VideoSubtitlesFetchReport(key_name=uid, **report).put()

    # Asynchronously fetch. We'll rate-limit by fetching BATCH_SIZE subtitles
    # at each DEFER_SECONDS interval

    rpcs = []
    for video in videos:
        url = UNISUBS_URL % urllib.quote(YOUTUBE_URL % video.youtube_id)
        rpc = urlfetch.create_rpc(deadline=TIMEOUT_SECONDS)
        urlfetch.make_fetch_call(rpc, url)
        rpcs.append((video.youtube_id, rpc))
        report['fetches'] += 1

    # Process asynchronous fetches

    for youtube_id, rpc in rpcs:
        lang = 'en'
        key_name = VideoSubtitles.get_key_name(lang, youtube_id)
        try:
            resp = rpc.get_result()
            if resp.status_code != 200:
                raise RuntimeError('status code: %s' % resp.status_code)

            # A redirect suggests the API endpoint moved; surface it so the
            # URL constants can be updated.
            if resp.final_url:
                # logging.warn is a deprecated alias for logging.warning
                logging.warning('%s redirect to %s' % (key_name, resp.final_url))
                report['redirects'] += 1

            json = resp.content.decode('utf-8')

            # Only update stale records

            current = VideoSubtitles.get_by_key_name(key_name)
            if not current or current.json != json:
                new = VideoSubtitles(key_name=key_name, youtube_id=youtube_id,
                                     language=lang, json=json)
                new.put()
                report['writes'] += 1
            else:
                logging.info('%s content already up-to-date' % key_name)
        except Exception as e:
            # Suppress and count the failure; the next scheduled run retries.
            logging.error('%s subtitles fetch failed: %s' % (key_name, e))
            report['errors'] += 1