def test_match_filter(self):
    """Check which videos get through for a range of `match_filter` values.

    Two fake info dicts are pushed through a simulated download once per
    filter; the ids of the entries that were not rejected are compared with
    the expected list.
    """

    class FilterYDL(YDL):
        # Simulating downloader that records every entry the filter lets in.

        def __init__(self, *args, **kwargs):
            super(FilterYDL, self).__init__(*args, **kwargs)
            self.params['simulate'] = True

        def process_info(self, info_dict):
            # The lookup deliberately starts *after* YDL in the MRO, so any
            # process_info defined on YDL itself is bypassed.
            super(YDL, self).process_info(info_dict)

        def _match_entry(self, info_dict, incomplete=False):
            verdict = super(FilterYDL, self)._match_entry(info_dict, incomplete)
            # A None verdict is the "not rejected" case; remember the entry.
            if verdict is None:
                self.downloaded_info_dicts.append(info_dict)
            return verdict

    video_one = {
        'id': '1',
        'url': TEST_URL,
        'title': 'one',
        'extractor': 'TEST',
        'duration': 30,
        'filesize': 10 * 1024,
        'playlist_id': '42',
        'uploader': "變態妍字幕版 太妍 тест",
        'creator': "тест ' 123 ' тест--",
        'webpage_url': 'http://example.com/watch?v=shenanigans',
    }
    video_two = {
        'id': '2',
        'url': TEST_URL,
        'title': 'two',
        'extractor': 'TEST',
        'duration': 10,
        'description': 'foo',
        'filesize': 5 * 1024,
        'playlist_id': '43',
        'uploader': "тест 123",
        'webpage_url': 'http://example.com/watch?v=SHENANIGANS',
    }
    all_videos = [video_one, video_two]

    def accepted_ids(filter_=None):
        # Run both videos through a fresh downloader using `filter_` and
        # report the ids that were recorded as accepted.
        downloader = FilterYDL({'match_filter': filter_})
        for video in all_videos:
            downloader.process_ie_result(video, download=True)
        return [entry['id'] for entry in downloader.downloaded_info_dicts]

    # No filter at all: everything passes.
    self.assertEqual(accepted_ids(), ['1', '2'])

    # A plain callable: None accepts, a message string rejects.
    def only_first(v):
        if v['id'] == '1':
            return None
        else:
            return 'Video id is not 1'

    self.assertEqual(accepted_ids(only_first), ['1'])

    # Filter expressions paired with the ids expected to pass, in the same
    # order the checks were originally written.
    expression_cases = (
        ('duration < 30', ['2']),
        ('description = foo', ['2']),
        ('description =? foo', ['1', '2']),
        ('filesize > 5KiB', ['1']),
        ('playlist_id = 42', ['1']),
        ('uploader = "變態妍字幕版 太妍 тест"', ['1']),
        ('uploader != "變態妍字幕版 太妍 тест"', ['2']),
        ('creator = "тест \' 123 \' тест--"', ['1']),
        ("creator = 'тест \\' 123 \\' тест--'", ['1']),
        (r"creator = 'тест \' 123 \' тест--' & duration > 30", []),
    )
    for expression, expected in expression_cases:
        found = accepted_ids(match_filter_func(expression))
        self.assertEqual(found, expected)
def _build_youtube_dl(worker, destdir, site, page):
    '''
    Builds a yt-dlp `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.

    The `YoutubeDL` instance does a few special brozzler-specific things:

    - keeps track of urls fetched using a `YoutubeDLSpy`
    - periodically updates `site.last_claimed` in rethinkdb
    - if brozzling through warcprox and downloading segmented videos (e.g.
      HLS), pushes the stitched-up video created by yt-dlp/ffmpeg to warcprox
      using a WARCPROX_WRITE_RECORD request
    - some logging

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
        destdir (str): where to save downloaded videos
        site (brozzler.Site): the site we are brozzling
        page: the page being brozzled (presumably a `brozzler.Page` --
            confirm against callers); only used here as the argument to
            `site.extra_headers(page)` when installing the extra-header
            handler

    Returns:
        a yt-dlp `youtube_dl.YoutubeDL` instance
    '''

    class _YoutubeDL(youtube_dl.YoutubeDL):
        logger = logging.getLogger(__module__ + "." + __qualname__)

        def urlopen(self, req):
            # `req` may be a request object or a plain url string
            try:
                url = req.full_url
            except AttributeError:
                url = req
            self.logger.debug('fetching %r', url)
            return super().urlopen(req)

        def add_default_extra_info(self, ie_result, ie, url):
            # hook in some logging
            super().add_default_extra_info(ie_result, ie, url)
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                    'extractor %r found playlist in %s', ie.IE_NAME, url)
                if ie.IE_NAME in {'youtube:playlist', 'youtube:tab',
                                  'soundcloud:user', 'instagram:user'}:
                    # At this point ie_result['entries'] is an iterator that
                    # will fetch more metadata from youtube to list all the
                    # videos. We unroll that iterator here partly because
                    # otherwise `process_ie_result()` will clobber it, and we
                    # use it later to extract the watch pages as outlinks.
                    try:
                        ie_result['entries_no_dl'] = list(ie_result['entries'])
                    except Exception as e:
                        self.logger.warning(
                            "failed to unroll ie_result['entries']? for %s, %s; exception %s",
                            ie.IE_NAME, url, e)
                        ie_result['entries_no_dl'] = []
                    # empty the entries so none of them get downloaded here
                    ie_result['entries'] = []
                    self.logger.info(
                        'not downloading %s media files from this '
                        'playlist because we expect to capture them from '
                        'individual watch/track/detail pages',
                        len(ie_result['entries_no_dl']))
            else:
                self.logger.info(
                    'extractor %r found a download in %s', ie.IE_NAME, url)

        def _push_stitched_up_vid_to_warcprox(self, site, info_dict):
            # Writes the file at info_dict['filepath'] to warcprox as a
            # 'resource' record and appends a summary to `self.stitch_ups`.
            #
            # 220211 update: does yt-dlp supply content-type?
            # XXX Don't know how to get the right content-type. Youtube-dl
            # doesn't supply it. Sometimes (with --hls-prefer-native)
            # youtube-dl produces a stitched-up video that /usr/bin/file fails
            # to identify (says "application/octet-stream"). `ffprobe` doesn't
            # give us a mimetype.
            if info_dict.get('ext') == 'mp4':
                mimetype = 'video/mp4'
            else:
                try:
                    import magic
                    mimetype = magic.from_file(info_dict['filepath'], mime=True)
                except ImportError as e:
                    mimetype = 'video/%s' % info_dict['ext']
                    self.logger.warning(
                        'guessing mimetype %s because %r', mimetype, e)

            # synthetic url for the record; playlist index defaults to 1
            url = 'youtube-dl:%05d:%s' % (
                info_dict.get('playlist_index') or 1,
                info_dict['webpage_url'])
            size = os.path.getsize(info_dict['filepath'])
            self.logger.info(
                'pushing %r video stitched-up as %s (%s bytes) to '
                'warcprox at %s with url %s', info_dict['format'],
                mimetype, size, worker._proxy_for(site), url)
            with open(info_dict['filepath'], 'rb') as f:
                # include content-length header to avoid chunked
                # transfer, which warcprox currently rejects
                extra_headers = dict(site.extra_headers())
                extra_headers['content-length'] = size
                _request, response = worker._warcprox_write_record(
                    warcprox_address=worker._proxy_for(site), url=url,
                    warc_type='resource', content_type=mimetype, payload=f,
                    extra_headers=extra_headers)

            # consulted by _remember_videos()
            self.stitch_ups.append({
                'url': url,
                'response_code': response.code,
                'content-type': mimetype,
                'content-length': size,
            })

    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
        # in case yt-dlp takes a long time, heartbeat site.last_claimed
        # to prevent another brozzler-worker from claiming the site
        try:
            if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
                worker.logger.debug(
                    'heartbeating site.last_claimed to prevent another '
                    'brozzler-worker claiming this site id=%r', site.id)
                site.last_claimed = doublethink.utcnow()
                site.save()
        except Exception:
            # best effort: a failed heartbeat must not abort the download,
            # but KeyboardInterrupt/SystemExit are allowed to propagate
            worker.logger.debug(
                'problem heartbeating site.last_claimed site id=%r',
                site.id, exc_info=True)

    def ydl_postprocess_hook(d):
        if d['status'] != 'finished':
            return
        worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
        worker.logger.info(
            '[ydl_postprocess_hook] postprocessor: %s', d['postprocessor'])
        if d['postprocessor'] == 'FixupM3u8' and worker._using_warcprox(site):
            # `ydl` is bound further down, before this hook can ever fire;
            # call the method on the instance rather than passing the class
            # object in as `self`
            ydl._push_stitched_up_vid_to_warcprox(site, d['info_dict'])

    ydl_opts = {
        "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
        "retries": 1,
        "nocheckcertificate": True,
        "noplaylist": True,
        "noprogress": True,
        "nopart": True,
        "no_color": True,
        "progress_hooks": [maybe_heartbeat_site_last_claimed],
        "postprocessor_hooks": [ydl_postprocess_hook],

        # https://github.com/yt-dlp/yt-dlp#format-selection
        # "By default, yt-dlp tries to download the best available quality..."
        # https://github.com/yt-dlp/yt-dlp#sorting-formats
        # "You can change the criteria for being considered the best by using -S (--format-sort)...."
        # "vext: Video Extension (mp4 > webm > flv > other). If --prefer-free-formats is used, webm is preferred."
        # "aext: Audio Extension (m4a > aac > mp3 > ogg > opus > webm > other)."
        # "If --prefer-free-formats is used, the order changes to opus > ogg > webm > m4a > mp3 > aac."
        # "ext: Equivalent to vext,aext"
        "format_sort": ["ext"],
        "format": "b/bv+ba",
        # skip live streams
        "match_filter": match_filter_func("!is_live"),

        # --cache-dir local or...
        # NOTE(review): yt-dlp documents this option as `cachedir`; confirm
        # that the `cache_dir` spelling is actually honored.
        "cache_dir": False,

        "logger": logging.getLogger("youtube_dl"),
        "verbose": True,
        "quiet": False,
    }
    if worker._proxy_for(site):
        ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
    ydl = _YoutubeDL(ydl_opts)
    # NOTE(review): the check ignores `page` while the handler gets the
    # page-specific headers -- presumably both are empty or non-empty
    # together; verify against `Site.extra_headers`.
    if site.extra_headers():
        ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
    ydl.fetch_spy = YoutubeDLSpy()
    ydl.stitch_ups = []
    ydl._opener.add_handler(ydl.fetch_spy)
    return ydl