Example #1
0
    def test_match_filter(self):
        # Exercises the `match_filter` option: both a hand-written callable and
        # filters compiled by `match_filter_func` from expression strings.

        class FilterYDL(YDL):
            # Simulating downloader that records every entry accepted by
            # `_match_entry` in `downloaded_info_dicts`.

            def __init__(self, *args, **kwargs):
                super(FilterYDL, self).__init__(*args, **kwargs)
                self.params['simulate'] = True

            def process_info(self, info_dict):
                # Starts the lookup *after* YDL in the MRO, so YDL's own
                # process_info override is skipped.
                super(YDL, self).process_info(info_dict)

            def _match_entry(self, info_dict, incomplete=False):
                verdict = super(FilterYDL, self)._match_entry(
                    info_dict, incomplete)
                # A verdict of None means the entry was not rejected.
                if verdict is None:
                    self.downloaded_info_dicts.append(info_dict)
                return verdict

        first = {
            'id': '1',
            'url': TEST_URL,
            'title': 'one',
            'extractor': 'TEST',
            'duration': 30,
            'filesize': 10 * 1024,
            'playlist_id': '42',
            'uploader': "變態妍字幕版 太妍 тест",
            'creator': "тест ' 123 ' тест--",
            'webpage_url': 'http://example.com/watch?v=shenanigans',
        }
        second = {
            'id': '2',
            'url': TEST_URL,
            'title': 'two',
            'extractor': 'TEST',
            'duration': 10,
            'description': 'foo',
            'filesize': 5 * 1024,
            'playlist_id': '43',
            'uploader': "тест 123",
            'webpage_url': 'http://example.com/watch?v=SHENANIGANS',
        }
        videos = [first, second]

        def get_videos(filter_=None):
            # Run both videos through a fresh downloader and report the ids
            # of the ones that passed the filter, in processing order.
            ydl = FilterYDL({'match_filter': filter_})
            for video in videos:
                ydl.process_ie_result(video, download=True)
            return [info['id'] for info in ydl.downloaded_info_dicts]

        # Without any filter both videos are accepted.
        self.assertEqual(get_videos(), ['1', '2'])

        # A plain callable: None accepts, a message string rejects.
        def only_first(v):
            return None if v['id'] == '1' else 'Video id is not 1'

        self.assertEqual(get_videos(only_first), ['1'])

        # Filter expressions paired with the ids expected to survive them.
        expression_cases = (
            ('duration < 30', ['2']),
            ('description = foo', ['2']),
            ('description =? foo', ['1', '2']),
            ('filesize > 5KiB', ['1']),
            ('playlist_id = 42', ['1']),
            ('uploader = "變態妍字幕版 太妍 тест"', ['1']),
            ('uploader != "變態妍字幕版 太妍 тест"', ['2']),
            ('creator = "тест \' 123 \' тест--"', ['1']),
            ("creator = 'тест \\' 123 \\' тест--'", ['1']),
            (r"creator = 'тест \' 123 \' тест--' & duration > 30", []),
        )
        for expression, expected_ids in expression_cases:
            compiled = match_filter_func(expression)
            self.assertEqual(get_videos(compiled), expected_ids)
Example #2
0
def _build_youtube_dl(worker, destdir, site, page):
    '''
    Builds a yt-dlp `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.

    The `YoutubeDL` instance does a few special brozzler-specific things:

    - keeps track of urls fetched using a `YoutubeDLSpy`
    - periodically updates `site.last_claimed` in rethinkdb
    - if brozzling through warcprox and downloading segmented videos (e.g.
      HLS), pushes the stitched-up video created by yt-dlp/ffmpeg to warcprox
      using a WARCPROX_WRITE_RECORD request
    - some logging

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        destdir (str): where to save downloaded videos
        site (brozzler.Site): the site we are brozzling
        page: handed to `site.extra_headers(page)` when the extra-header
            handler is installed; presumably the page being brozzled
            (TODO confirm against the caller)

    Returns:
        a yt-dlp `youtube_dl.YoutubeDL` instance, with `fetch_spy` and
        `stitch_ups` attributes attached
    '''

    class _YoutubeDL(youtube_dl.YoutubeDL):
        '''`YoutubeDL` subclass with extra logging and warcprox push support.'''

        logger = logging.getLogger(__module__ + "." + __qualname__)

        def urlopen(self, req):
            '''Logs the url being fetched, then defers to the parent class.'''
            # `req` may be a request object exposing `full_url`, or anything
            # else (e.g. a plain url), in which case it is logged as-is
            try:
                url = req.full_url
            except AttributeError:
                url = req
            self.logger.debug('fetching %r', url)
            return super().urlopen(req)

        def add_default_extra_info(self, ie_result, ie, url):
            '''
            Hooks logging into extraction and, for selected playlist
            extractors, moves the entries to `ie_result['entries_no_dl']`
            and empties `ie_result['entries']`.
            '''
            # hook in some logging
            super().add_default_extra_info(ie_result, ie, url)
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                        'extractor %r found playlist in %s', ie.IE_NAME, url)
                if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}:
                    # At this point ie_result['entries'] is an iterator that
                    # will fetch more metadata from youtube to list all the
                    # videos. We unroll that iterator here partly because
                    # otherwise `process_ie_result()` will clobber it, and we
                    # use it later to extract the watch pages as outlinks.
                    try:
                        ie_result['entries_no_dl'] = list(ie_result['entries'])
                    except Exception as e:
                        self.logger.warning(
                                "failed to unroll ie_result['entries']? for %s, %s; exception %s",
                                ie.IE_NAME, url, e)
                        ie_result['entries_no_dl'] = []
                    ie_result['entries'] = []
                    self.logger.info(
                            'not downloading %s media files from this '
                            'playlist because we expect to capture them from '
                            'individual watch/track/detail pages',
                            len(ie_result['entries_no_dl']))
            else:
                self.logger.info(
                        'extractor %r found a download in %s', ie.IE_NAME, url)

        def _push_stitched_up_vid_to_warcprox(self, site, info_dict):
            '''
            Writes the file at `info_dict['filepath']` to warcprox as a
            'resource' record and appends a summary of the result to
            `ydl.stitch_ups` (the instance created by the enclosing function).
            '''
            # 220211 update: does yt-dlp supply content-type?
            # XXX Don't know how to get the right content-type. Youtube-dl
            # doesn't supply it. Sometimes (with --hls-prefer-native)
            # youtube-dl produces a stitched-up video that /usr/bin/file fails
            # to identify (says "application/octet-stream"). `ffprobe` doesn't
            # give us a mimetype.
            if info_dict.get('ext') == 'mp4':
                mimetype = 'video/mp4'
            else:
                try:
                    # optional dependency; fall back to a guess without it
                    import magic
                    mimetype = magic.from_file(info_dict['filepath'], mime=True)
                except ImportError as e:
                    mimetype = 'video/%s' % info_dict['ext']
                    self.logger.warning(
                            'guessing mimetype %s because %r', mimetype, e)

            # synthetic url identifying the stitched-up video in the archive
            url = 'youtube-dl:%05d:%s' % (
                    info_dict.get('playlist_index') or 1,
                    info_dict['webpage_url'])
            size = os.path.getsize(info_dict['filepath'])
            self.logger.info(
                    'pushing %r video stitched-up as %s (%s bytes) to '
                    'warcprox at %s with url %s', info_dict['format'],
                    mimetype, size, worker._proxy_for(site), url)
            with open(info_dict['filepath'], 'rb') as f:
                # include content-length header to avoid chunked
                # transfer, which warcprox currently rejects
                extra_headers = dict(site.extra_headers())
                extra_headers['content-length'] = size
                request, response = worker._warcprox_write_record(
                        warcprox_address=worker._proxy_for(site), url=url,
                        warc_type='resource', content_type=mimetype, payload=f,
                        extra_headers=extra_headers)
                # consulted by _remember_videos()
                ydl.stitch_ups.append({
                    'url': url,
                    'response_code': response.code,
                    'content-type': mimetype,
                    'content-length': size,
                })

    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
        '''
        Progress hook: refreshes `site.last_claimed` once it is older than
        `worker.SITE_SESSION_MINUTES`. Best-effort; failures are only logged.
        '''
        # in case yt-dlp takes a long time, heartbeat site.last_claimed
        # to prevent another brozzler-worker from claiming the site
        try:
            if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
                worker.logger.debug(
                        'heartbeating site.last_claimed to prevent another '
                        'brozzler-worker claiming this site id=%r', site.id)
                site.last_claimed = doublethink.utcnow()
                site.save()
        except Exception:
            # Deliberately not a bare `except:`: KeyboardInterrupt and
            # SystemExit must propagate so that a shutdown request arriving
            # during a download is not swallowed by this hook.
            worker.logger.debug(
                    'problem heartbeating site.last_claimed site id=%r',
                    site.id, exc_info=True)

    def ydl_postprocess_hook(d):
        '''
        Postprocessor hook: once the 'FixupM3u8' postprocessor has finished
        and warcprox is in use, pushes the resulting file to warcprox.
        '''
        if d['status'] == 'finished':
            worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
            worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
            if d['postprocessor'] == 'FixupM3u8' and worker._using_warcprox(site):
                # Invoked through the class with the class itself standing in
                # for `self`: the method only reads the class-level `logger`
                # via `self` and records its result on the closed-over `ydl`.
                _YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'])

    ydl_opts = {
        "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
        "retries": 1,
        "nocheckcertificate": True,
        "noplaylist": True,
        "noprogress": True,
        "nopart": True,
        "no_color": True,
        "progress_hooks": [maybe_heartbeat_site_last_claimed],
        "postprocessor_hooks": [ydl_postprocess_hook],

        # https://github.com/yt-dlp/yt-dlp#format-selection
        # "By default, yt-dlp tries to download the best available quality..."
        # https://github.com/yt-dlp/yt-dlp#sorting-formats
        # "You can change the criteria for being considered the best by using -S (--format-sort)...."
        # "vext: Video Extension (mp4 > webm > flv > other). If --prefer-free-formats is used, webm is preferred."
        # "aext: Audio Extension (m4a > aac > mp3 > ogg > opus > webm > other)."
        # "If --prefer-free-formats is used, the order changes to opus > ogg > webm > m4a > mp3 > aac."
        # "ext: Equivalent to vext,aext"
        "format_sort": ["ext"],
        "format": "b/bv+ba",
        # skip live streams
        "match_filter": match_filter_func("!is_live"),

        # --cache-dir local or...
        "cache_dir": False,

        "logger": logging.getLogger("youtube_dl"),
        "verbose": True,
        "quiet": False,
    }
    if worker._proxy_for(site):
        ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
    ydl = _YoutubeDL(ydl_opts)
    # NOTE(review): the check uses `site.extra_headers()` while the handler is
    # built from `site.extra_headers(page)` -- confirm this is intended.
    if site.extra_headers():
        ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
    ydl.fetch_spy = YoutubeDLSpy()
    # filled in by _push_stitched_up_vid_to_warcprox()
    ydl.stitch_ups = []
    ydl._opener.add_handler(ydl.fetch_spy)
    return ydl