def file_size(self, fdict):
    """Return a human-readable size string for a format dictionary.

    An exact ``filesize`` takes precedence. Otherwise ``filesize_approx`` is
    rendered with a leading ``~``. When neither value is set (missing or
    ``None``) an empty string is returned.
    """
    exact = fdict.get('filesize')
    if exact is not None:
        return format_bytes(exact)

    approximate = fdict.get('filesize_approx')
    if approximate is not None:
        return '~' + format_bytes(approximate)

    return ''
def report_progress_live_stream(self, downloaded_data_len, speed, elapsed):
    """Report progress for a stream whose total size is unknown.

    Does nothing when the ``noprogress`` parameter is set. Otherwise the
    downloaded size, the formatted speed and the formatted elapsed time are
    combined into one status message and passed to
    ``self._report_progress_status``.
    """
    if self.params.get('noprogress', False):
        return

    fields = (
        format_bytes(downloaded_data_len),
        self.format_speed(speed),
        FileDownloader.format_seconds(elapsed),
    )
    self._report_progress_status(u'%s at %s (%s)' % fields)
def generate_info(self):
    """Extract metadata for the configured URL and record its formats.

    The title and URL are stored on the wrapped info object, and one entry
    per available format is appended to its ``format_info`` list. Each
    entry's size uses ``filesize`` when it is truthy and ``filesize_approx``
    otherwise.

    Returns:
        True when extraction and bookkeeping succeeded. Any exception is
        printed and reported as False.
    """
    try:
        info_dict = self.__ydl.extract_info(self.__url, download=False)
        self.__ydl_obj.title = info_dict['title']
        self.__ydl_obj.url = self.__url

        for fmt in info_dict['formats']:
            size = fmt.get('filesize') or fmt.get('filesize_approx')
            entry = {
                'format_id': fmt['format'],
                'extension': fmt['ext'],
                'resolution': self.__ydl.format_resolution(fmt),
                'filesize': format_bytes(size),
            }
            self.__ydl_obj.format_info.append(entry)
    except Exception as e:
        # Best effort: callers only get a success flag, the error is printed.
        print(e)
        return False
    return True
def urlretrieve(self, url: str, filename: str, context: ssl.SSLContext, reporthook=None, cookies_path=None):
    """
    Download ``url`` into ``filename`` and return ``(filename, headers)``.

    Adapted from CPython's ``urllib.request.urlretrieve``
    (https://github.com/python/cpython/blob/
    21bee0bd71e1ad270274499f9f58194ebb52e236/Lib/urllib/request.py#L229)
    so that an SSL ``context`` can be passed through to ``urlopen``.

    If ``cookies_path`` is given and the file exists, its cookies are loaded
    and attached to the request. ``reporthook``, when given, is called as
    ``reporthook(block_number, block_size, total_size)`` once before the
    first read and once after every block written. ``total_size`` is ``-1``
    when the server sent no ``Content-Length``.

    Raises:
        RuntimeError: no ``filename`` was given for a non ``file://`` URL.
        ContentTooShortError: fewer bytes arrived than ``Content-Length``
            announced.
    """
    started = time.time()
    parsed_url = urlparse.urlparse(url)

    request = urllib.request.Request(url=url, headers=RequestHelper.stdHeader)
    if cookies_path is not None:
        jar = MozillaCookieJar(cookies_path)
        if os.path.isfile(cookies_path):
            jar.load(ignore_discard=True, ignore_expires=True)
            jar.add_cookie_header(request)

    with contextlib.closing(urllib.request.urlopen(request, context=context)) as response:
        headers = response.info()

        if not filename:
            # For file:// URLs just hand back the local path and headers;
            # copying is pointless unless a target was requested.
            if parsed_url.scheme == 'file':
                return os.path.normpath(parsed_url.path), headers
            raise RuntimeError('No filename specified!')

        with open(filename, 'wb') as target:
            result = filename, headers

            # Total number of bytes received so far.
            received = 0

            # Read 8 KiB at a time.
            chunk_size = 1024 * 8
            chunk_index = 0

            # Announced size, or -1 when the header is missing.
            expected = int(headers.get('Content-Length', -1))

            if reporthook:
                reporthook(chunk_index, chunk_size, expected)

            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                received += len(chunk)
                target.write(chunk)
                chunk_index += 1
                if reporthook:
                    reporthook(chunk_index, chunk_size, expected)

    if expected >= 0 and received < expected:
        raise ContentTooShortError(
            'retrieval incomplete: got only %i out of %i bytes' % (received, expected), result)

    finished = time.time()
    logging.debug('T%s - Download of %s finished in %s', self.thread_id,
                  format_bytes(received), self.format_seconds(finished - started))

    return result
def format_size(bytes):
    """Render a byte count as ``'<human readable> (<n> bytes)'``."""
    human_readable = format_bytes(bytes)
    return '%s (%d bytes)' % (human_readable, bytes)
def test_template(self):
    """Run one download test described by the free variable ``test_case``.

    ``test_case`` is not defined in this function; it is presumably bound by
    an enclosing generator function (not visible here) -- TODO confirm.

    The test is skipped (by returning early) when the extractor or one of the
    additional extractors reports itself as not working, when the test case
    carries a ``skip`` entry, or when extraction keeps failing with
    network-related errors ``RETRIES`` times. Otherwise the extraction
    result, the downloaded files and the written ``.info.json`` files are
    checked. All files belonging to the test cases are removed before and
    after the run.
    """
    ie = youtube_dl.extractor.get_info_extractor(test_case['name'])
    other_ies = [get_info_extractor(ie_key) for ie_key in test_case.get('add_ie', [])]
    # Any key starting with 'playlist' marks the definition as a playlist test.
    is_playlist = any(k.startswith('playlist') for k in test_case)
    # Explicit 'playlist' entries are the sub-cases; a playlist test without
    # them has no per-file cases, a plain test is its own single case.
    test_cases = test_case.get(
        'playlist', [] if is_playlist else [test_case])

    def print_skipping(reason):
        """Print why the test is skipped (the caller returns afterwards)."""
        print('Skipping %s: %s' % (test_case['name'], reason))
    if not ie.working():
        print_skipping('IE marked as not _WORKING')
        return

    # Every case must allow the output filename to be determined up front.
    for tc in test_cases:
        info_dict = tc.get('info_dict', {})
        if not tc.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
            raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?')

    if 'skip' in test_case:
        print_skipping(test_case['skip'])
        return
    for other_ie in other_ies:
        if not other_ie.working():
            print_skipping(u'test depends on %sIE, marked as not WORKING' % other_ie.ie_key())
            return

    params = get_params(test_case.get('params', {}))
    if is_playlist and 'playlist' not in test_case:
        # Playlist-level checks only: default to flat extraction, no download.
        params.setdefault('extract_flat', True)
        params.setdefault('skip_download', True)

    ydl = YoutubeDL(params, auto_init=False)
    ydl.add_default_info_extractors()
    # Filenames for which a 'finished' progress event was received.
    finished_hook_called = set()

    def _hook(status):
        """Progress hook that records the filename of finished downloads."""
        if status['status'] == 'finished':
            finished_hook_called.add(status['filename'])
    ydl.add_progress_hook(_hook)
    expect_warnings(ydl, test_case.get('expected_warnings', []))

    def get_tc_filename(tc):
        """Return the explicit 'file' of a case or the name ydl would use."""
        return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {}))

    # Stays None if extraction never succeeds; checked in the finally block.
    res_dict = None

    def try_rm_tcs_files(tcs=None):
        """Remove media, '.part' and '.info.json' files of the given cases."""
        if tcs is None:
            tcs = test_cases
        for tc in tcs:
            tc_filename = get_tc_filename(tc)
            try_rm(tc_filename)
            try_rm(tc_filename + '.part')
            try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
    try_rm_tcs_files()
    try:
        try_num = 1
        while True:
            try:
                # We're not using .download here since that is just a shim
                # for outside error handling, and returns the exit code
                # instead of the result dict.
                res_dict = ydl.extract_info(test_case['url'])
            except (DownloadError, ExtractorError) as err:
                # Check if the exception is not a network related one
                # NOTE(review): membership in the tuple is by equality, so an
                # HTTPError cause already satisfies the first clause; the 503
                # clause looks redundant -- confirm intent.
                if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
                    raise

                if try_num == RETRIES:
                    report_warning(u'Failed due to network errors, skipping...')
                    return

                print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num))

                try_num += 1
            else:
                break

        if is_playlist:
            self.assertEqual(res_dict['_type'], 'playlist')
            self.assertTrue('entries' in res_dict)
            expect_info_dict(self, test_case.get('info_dict', {}), res_dict)

        if 'playlist_mincount' in test_case:
            assertGreaterEqual(
                self,
                len(res_dict['entries']),
                test_case['playlist_mincount'],
                'Expected at least %d in playlist %s, but got only %d' % (
                    test_case['playlist_mincount'], test_case['url'],
                    len(res_dict['entries'])))
        if 'playlist_count' in test_case:
            self.assertEqual(
                len(res_dict['entries']),
                test_case['playlist_count'],
                'Expected %d entries in playlist %s, but got %d.' % (
                    test_case['playlist_count'],
                    test_case['url'],
                    len(res_dict['entries']),
                ))
        if 'playlist_duration_sum' in test_case:
            got_duration = sum(e['duration'] for e in res_dict['entries'])
            self.assertEqual(
                test_case['playlist_duration_sum'], got_duration)

        for tc in test_cases:
            tc_filename = get_tc_filename(tc)
            if not test_case.get('params', {}).get('skip_download', False):
                self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
                self.assertTrue(tc_filename in finished_hook_called)
                expected_minsize = tc.get('file_minsize', 10000)
                if expected_minsize is not None:
                    if params.get('test'):
                        # With the 'test' param set, at least 10000 bytes
                        # are required regardless of the declared minimum.
                        expected_minsize = max(expected_minsize, 10000)
                    got_fsize = os.path.getsize(tc_filename)
                    assertGreaterEqual(
                        self, got_fsize, expected_minsize,
                        'Expected %s to be at least %s, but it\'s only %s ' %
                        (tc_filename, format_bytes(expected_minsize),
                         format_bytes(got_fsize)))
                if 'md5' in tc:
                    md5_for_file = _file_md5(tc_filename)
                    self.assertEqual(md5_for_file, tc['md5'])
            # The info JSON is checked even when the download was skipped.
            info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
            self.assertTrue(
                os.path.exists(info_json_fn),
                'Missing info file %s' % info_json_fn)
            with io.open(info_json_fn, encoding='utf-8') as infof:
                info_dict = json.load(infof)

            expect_info_dict(self, tc.get('info_dict', {}), info_dict)
    finally:
        try_rm_tcs_files()
        if is_playlist and res_dict is not None and res_dict.get('entries'):
            # Remove all other files that may have been extracted if the
            # extractor returns full results even with extract_flat
            res_tcs = [{'info_dict': e} for e in res_dict['entries']]
            try_rm_tcs_files(res_tcs)
def _do_download(self, filename, info_dict):
    """Download ``info_dict['url']`` to ``filename``.

    URLs starting with ``rtmp``, ``mms`` or ``rtsp`` and URLs whose extension
    is ``m3u8`` are delegated to the matching helper method. Everything else
    is fetched over HTTP with optional resume (``continuedl``), retry on 5xx
    responses (``retries``), size limits (``min_filesize``/``max_filesize``)
    and progress reporting through ``report_progress``/``_hook_progress``.

    Returns:
        True on success (including "already downloaded"), False when the
        download was aborted or failed after all retries.

    Raises:
        ContentTooShortError: fewer/more bytes were received than the
            announced length.
        compat_urllib_error.HTTPError: for HTTP errors outside 5xx/416.
    """
    url = info_dict['url']

    # Check file already present
    if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False):
        self.report_file_already_downloaded(filename)
        self._hook_progress({
            'filename': filename,
            'status': 'finished',
            'total_bytes': os.path.getsize(encodeFilename(filename)),
        })
        return True

    # Attempt to download using rtmpdump
    if url.startswith('rtmp'):
        return self._download_with_rtmpdump(filename, url,
                                            info_dict.get('player_url', None),
                                            info_dict.get('page_url', None),
                                            info_dict.get('play_path', None),
                                            info_dict.get('tc_url', None),
                                            info_dict.get('rtmp_live', False),
                                            info_dict.get('rtmp_conn', None))

    # Attempt to download using mplayer
    if url.startswith('mms') or url.startswith('rtsp'):
        return self._download_with_mplayer(filename, url)

    # m3u8 manifest are downloaded with ffmpeg
    if determine_ext(url) == u'm3u8':
        return self._download_m3u8_with_ffmpeg(filename, url)

    tmpfilename = self.temp_name(filename)
    # Opened lazily once the first data block has arrived.
    stream = None

    # Do not include the Accept-Encoding header
    headers = {'Youtubedl-no-compression': 'True'}
    if 'user_agent' in info_dict:
        headers['Youtubedl-user-agent'] = info_dict['user_agent']
    # basic_request never gets a Range header; it is used to re-probe the
    # resource after a 416 response.
    basic_request = compat_urllib_request.Request(url, None, headers)
    request = compat_urllib_request.Request(url, None, headers)

    if self.params.get('test', False):
        request.add_header('Range','bytes=0-10240')

    # Establish possible resume length
    if os.path.isfile(encodeFilename(tmpfilename)):
        resume_len = os.path.getsize(encodeFilename(tmpfilename))
    else:
        resume_len = 0

    open_mode = 'wb'
    if resume_len != 0:
        if self.params.get('continuedl', False):
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
            open_mode = 'ab'
        else:
            # Resuming not requested: ignore the partial file and start over.
            resume_len = 0

    count = 0
    retries = self.params.get('retries', 0)
    while count <= retries:
        # Establish connection
        try:
            # NOTE(review): the handle taken from info_dict['urlhandle'] is
            # overwritten by the urlopen() call on the next line, so it is
            # never used -- confirm whether that is intended.
            if count == 0 and 'urlhandle' in info_dict:
                data = info_dict['urlhandle']
            data = compat_urllib_request.urlopen(request)
            break
        except (compat_urllib_error.HTTPError, ) as err:
            if (err.code < 500 or err.code >= 600) and err.code != 416:
                # Unexpected HTTP error
                raise
            elif err.code == 416:
                # Unable to resume (requested range not satisfiable)
                try:
                    # Open the connection again without the range header
                    data = compat_urllib_request.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                except (compat_urllib_error.HTTPError, ) as err:
                    if err.code < 500 or err.code >= 600:
                        raise
                else:
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < int(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        self._hook_progress({
                            'filename': filename,
                            'status': 'finished',
                        })
                        return True
                    else:
                        # The length does not match, we start the download over
                        self.report_unable_to_resume()
                        open_mode = 'wb'
                        break
        # Retry
        # Reached for 5xx responses (first request or the 416 re-probe).
        count += 1
        if count <= retries:
            self.report_retry(count, retries)

    if count > retries:
        self.report_error(u'giving up after %s retries' % retries)
        return False

    data_len = data.info().get('Content-length', None)
    if data_len is not None:
        # Total expected size includes the part already on disk.
        data_len = int(data_len) + resume_len
        min_data_len = self.params.get("min_filesize", None)
        max_data_len = self.params.get("max_filesize", None)
        if min_data_len is not None and data_len < min_data_len:
            self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
            return False
        if max_data_len is not None and data_len > max_data_len:
            self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
            return False

    data_len_str = format_bytes(data_len)
    byte_counter = 0 + resume_len
    block_size = self.params.get('buffersize', 1024)
    start = time.time()
    # The loop also ends when self._go_on becomes falsy.
    while self._go_on:
        # Download and write
        before = time.time()
        data_block = data.read(block_size)
        after = time.time()
        if len(data_block) == 0:
            break
        byte_counter += len(data_block)

        # Open file just in time
        if stream is None:
            try:
                (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                assert stream is not None
                # sanitize_open returns the name it actually opened, so the
                # final filename is re-derived from it.
                filename = self.undo_temp_name(tmpfilename)
                self.report_destination(filename)
            except (OSError, IOError) as err:
                self.report_error(u'unable to open for writing: %s' % str(err))
                return False
        try:
            stream.write(data_block)
        except (IOError, OSError) as err:
            # NOTE(review): stream is not closed on this early return.
            self.to_stderr(u"\n")
            self.report_error(u'unable to write data: %s' % str(err))
            return False
        if not self.params.get('noresizebuffer', False):
            # Adapt the read size to how long the last read took.
            block_size = self.best_block_size(after - before, len(data_block))

        # Progress message
        # Speed and ETA only count bytes fetched in this session.
        speed = self.calc_speed(start, time.time(), byte_counter - resume_len)
        if data_len is None:
            eta = percent = None
        else:
            percent = self.calc_percent(byte_counter, data_len)
            eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        self.report_progress(percent, data_len_str, speed, eta)

        self._hook_progress({
            'downloaded_bytes': byte_counter,
            'total_bytes': data_len,
            'tmpfilename': tmpfilename,
            'filename': filename,
            'status': 'downloading',
            'eta': eta,
            'speed': speed,
        })

        # Apply rate limit
        self.slow_down(start, byte_counter - resume_len)

    if stream is None:
        self.to_stderr(u"\n")
        self.report_error(u'Did not get any data blocks')
        return False
    stream.close()
    self.report_finish(data_len_str, (time.time() - start))
    if data_len is not None and byte_counter != data_len:
        raise ContentTooShortError(byte_counter, int(data_len))
    self.try_rename(tmpfilename, filename)

    # Update file modification time
    if self.params.get('updatetime', True):
        info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

    self._hook_progress({
        'downloaded_bytes': byte_counter,
        'total_bytes': byte_counter,
        'filename': filename,
        'status': 'finished',
    })

    return True
def run_rtmpdump(args):
    """Run rtmpdump with ``args`` and translate its stderr into progress.

    stderr is read one byte at a time and split on ``\\r`` as well as
    ``\\n``. Lines carrying a percentage are reported through
    ``self.report_progress``; lines without one (live streams) through
    ``self.report_progress_live_stream``. Every recognised line also
    triggers ``self._hook_progress``. Unrecognised lines are echoed only
    when the ``verbose`` parameter is set.

    ``self``, ``tmpfilename`` and ``filename`` are free variables taken from
    the enclosing scope.

    Returns:
        The exit code of the rtmpdump process.
    """
    started = time.time()
    first_percent = None
    first_downloaded_len = None
    process = subprocess.Popen(args, stderr=subprocess.PIPE)
    at_line_start = True
    stderr_exhausted = False

    while not stderr_exhausted:
        # Assemble the next stderr line, byte by byte.
        pieces = []
        while True:
            byte = process.stderr.read(1)
            if not byte:
                stderr_exhausted = True
                break
            if byte in [b'\r', b'\n']:
                break
            pieces.append(byte.decode('ascii', 'replace'))
        line = u''.join(pieces)
        if not line:
            # Empty line, or stderr was closed.
            continue

        with_percent = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
        if with_percent:
            downloaded_data_len = int(float(with_percent.group(1))*1024)
            percent = float(with_percent.group(2))
            if not first_percent:
                # Baseline for ETA and speed of this session.
                first_percent = percent
                first_downloaded_len = downloaded_data_len
            eta = self.calc_eta(started, time.time(), 100-first_percent, percent-first_percent)
            speed = self.calc_speed(started, time.time(), downloaded_data_len-first_downloaded_len)
            # The total size is extrapolated from the percentage.
            data_len = int(downloaded_data_len * 100 / percent) if percent > 0 else None
            data_len_str = u'~' + format_bytes(data_len)
            self.report_progress(percent, data_len_str, speed, eta)
            at_line_start = False
            self._hook_progress({
                'downloaded_bytes': downloaded_data_len,
                'total_bytes': data_len,
                'tmpfilename': tmpfilename,
                'filename': filename,
                'status': 'downloading',
                'eta': eta,
                'speed': speed,
            })
            continue

        # no percent for live streams
        live = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line)
        if live:
            downloaded_data_len = int(float(live.group(1))*1024)
            now = time.time()
            speed = self.calc_speed(started, now, downloaded_data_len)
            self.report_progress_live_stream(downloaded_data_len, speed, now - started)
            at_line_start = False
            self._hook_progress({
                'downloaded_bytes': downloaded_data_len,
                'tmpfilename': tmpfilename,
                'filename': filename,
                'status': 'downloading',
                'speed': speed,
            })
            continue

        if self.params.get('verbose', False):
            if not at_line_start:
                # Finish the progress line before echoing raw output.
                self.to_screen(u'')
            at_line_start = True
            self.to_screen(u'[rtmpdump] '+line)

    process.wait()
    if not at_line_start:
        self.to_screen(u'')
    return process.returncode
def format_speed(speed):
    """Return ``speed`` as a right-aligned, 10-character wide ``<size>/s``.

    ``None`` is rendered as the placeholder ``---b/s``.
    """
    label = '---b/s' if speed is None else '%s/s' % format_bytes(speed)
    return '%10s' % label
def test_template(self):
    """Run one download test described by the free variable ``test_case``.

    ``test_case`` and ``tname`` are not defined in this function; they are
    presumably bound by an enclosing generator function (not visible here)
    -- TODO confirm.

    The test is skipped via ``self.skipTest`` when the extractor or one of
    the additional extractors is not working or the test case carries a
    ``skip`` entry. It returns early when extraction keeps failing with
    network-related errors ``RETRIES`` times. Otherwise the extraction
    result, the downloaded files and the written ``.info.json`` files are
    checked. All files belonging to the test cases are removed before and
    after the run.
    """
    ie = youtube_dl.extractor.get_info_extractor(test_case['name'])()
    other_ies = [
        get_info_extractor(ie_key)()
        for ie_key in test_case.get('add_ie', [])
    ]
    # Any key starting with 'playlist' marks the definition as a playlist test.
    is_playlist = any(k.startswith('playlist') for k in test_case)
    # Explicit 'playlist' entries are the sub-cases; a playlist test without
    # them has no per-file cases, a plain test is its own single case.
    test_cases = test_case.get('playlist',
                               [] if is_playlist else [test_case])

    def print_skipping(reason):
        """Print the reason and skip the test via self.skipTest."""
        print('Skipping %s: %s' % (test_case['name'], reason))
        self.skipTest(reason)

    if not ie.working():
        print_skipping('IE marked as not _WORKING')

    # Every case must allow the output filename to be determined up front.
    for tc in test_cases:
        info_dict = tc.get('info_dict', {})
        if not (info_dict.get('id') and info_dict.get('ext')):
            raise Exception(
                'Test definition (%s) requires both \'id\' and \'ext\' keys present to define the output file'
                % (tname, ))

    if 'skip' in test_case:
        print_skipping(test_case['skip'])

    for other_ie in other_ies:
        if not other_ie.working():
            print_skipping('test depends on %sIE, marked as not WORKING' %
                           other_ie.ie_key())

    params = get_params(test_case.get('params', {}))
    # Prefix output names with the test name.
    params['outtmpl'] = tname + '_' + params['outtmpl']
    if is_playlist and 'playlist' not in test_case:
        # Playlist-level checks only: flat extraction, bounded length, no
        # download (each only if not already set in params).
        params.setdefault('extract_flat', 'in_playlist')
        params.setdefault('playlistend', test_case.get('playlist_mincount'))
        params.setdefault('skip_download', True)

    ydl = YoutubeDL(params, auto_init=False)
    ydl.add_default_info_extractors()
    # Filenames for which a 'finished' progress event was received.
    finished_hook_called = set()

    def _hook(status):
        """Progress hook that records the filename of finished downloads."""
        if status['status'] == 'finished':
            finished_hook_called.add(status['filename'])

    ydl.add_progress_hook(_hook)
    expect_warnings(ydl, test_case.get('expected_warnings', []))

    def get_tc_filename(tc):
        """Return the filename ydl would use for the case's info_dict."""
        return ydl.prepare_filename(tc.get('info_dict', {}))

    # Stays None if extraction never succeeds; checked in the finally block.
    res_dict = None

    def try_rm_tcs_files(tcs=None):
        """Remove media, '.part' and '.info.json' files of the given cases."""
        if tcs is None:
            tcs = test_cases
        for tc in tcs:
            tc_filename = get_tc_filename(tc)
            try_rm(tc_filename)
            try_rm(tc_filename + '.part')
            try_rm(os.path.splitext(tc_filename)[0] + '.info.json')

    try_rm_tcs_files()
    try:
        try_num = 1
        while True:
            try:
                # We're not using .download here since that is just a shim
                # for outside error handling, and returns the exit code
                # instead of the result dict.
                res_dict = ydl.extract_info(
                    test_case['url'],
                    force_generic_extractor=params.get(
                        'force_generic_extractor', False))
            except (DownloadError, ExtractorError) as err:
                # Check if the exception is not a network related one
                # NOTE(review): membership in the tuple is by equality, so an
                # HTTPError cause already satisfies the first clause; the 503
                # clause looks redundant -- confirm intent.
                if not err.exc_info[0] in (
                        compat_urllib_error.URLError, socket.timeout,
                        UnavailableVideoError,
                        compat_http_client.BadStatusLine) or (
                            err.exc_info[0] == compat_HTTPError
                            and err.exc_info[1].code == 503):
                    # Tag the error message with the test name and re-raise.
                    msg = getattr(err, 'msg', error_to_compat_str(err))
                    err.msg = '%s (%s)' % (
                        msg,
                        tname,
                    )
                    raise err

                if try_num == RETRIES:
                    report_warning(
                        '%s failed due to network errors, skipping...' %
                        tname)
                    return

                print(
                    'Retrying: {0} failed tries\n\n##########\n\n'.format(
                        try_num))

                try_num += 1
            else:
                break

        if is_playlist:
            self.assertTrue(
                res_dict['_type'] in ['playlist', 'multi_video'])
            self.assertTrue('entries' in res_dict)
            expect_info_dict(self, res_dict,
                             test_case.get('info_dict', {}))

        if 'playlist_mincount' in test_case:
            assertGreaterEqual(
                self, len(res_dict['entries']),
                test_case['playlist_mincount'],
                'Expected at least %d in playlist %s, but got only %d' %
                (test_case['playlist_mincount'], test_case['url'],
                 len(res_dict['entries'])))
        if 'playlist_count' in test_case:
            self.assertEqual(
                len(res_dict['entries']), test_case['playlist_count'],
                'Expected %d entries in playlist %s, but got %d.' % (
                    test_case['playlist_count'],
                    test_case['url'],
                    len(res_dict['entries']),
                ))
        if 'playlist_duration_sum' in test_case:
            got_duration = sum(e['duration'] for e in res_dict['entries'])
            self.assertEqual(test_case['playlist_duration_sum'],
                             got_duration)

        # Generalize both playlists and single videos to unified format for
        # simplicity
        if 'entries' not in res_dict:
            res_dict['entries'] = [res_dict]

        for tc_num, tc in enumerate(test_cases):
            # Cases are matched to extracted entries by position.
            tc_res_dict = res_dict['entries'][tc_num]
            # First, check test cases' data against extracted data alone
            expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
            # Now, check downloaded file consistency
            tc_filename = get_tc_filename(tc)
            if not test_case.get('params', {}).get('skip_download', False):
                self.assertTrue(os.path.exists(tc_filename),
                                msg='Missing file ' + tc_filename)
                self.assertTrue(tc_filename in finished_hook_called)
                expected_minsize = tc.get('file_minsize', 10000)
                if expected_minsize is not None:
                    if params.get('test'):
                        # With the 'test' param set, at least 10000 bytes
                        # are required regardless of the declared minimum.
                        expected_minsize = max(expected_minsize, 10000)
                    got_fsize = os.path.getsize(tc_filename)
                    assertGreaterEqual(
                        self, got_fsize, expected_minsize,
                        'Expected %s to be at least %s, but it\'s only %s '
                        % (tc_filename, format_bytes(expected_minsize),
                           format_bytes(got_fsize)))
                if 'md5' in tc:
                    md5_for_file = _file_md5(tc_filename)
                    self.assertEqual(tc['md5'], md5_for_file)
            # Finally, check test cases' data again but this time against
            # extracted data from info JSON file written during processing
            info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
            self.assertTrue(os.path.exists(info_json_fn),
                            'Missing info file %s' % info_json_fn)
            with io.open(info_json_fn, encoding='utf-8') as infof:
                info_dict = json.load(infof)
            expect_info_dict(self, info_dict, tc.get('info_dict', {}))
    finally:
        try_rm_tcs_files()
        if is_playlist and res_dict is not None and res_dict.get(
                'entries'):
            # Remove all other files that may have been extracted if the
            # extractor returns full results even with extract_flat
            res_tcs = [{'info_dict': e} for e in res_dict['entries']]
            try_rm_tcs_files(res_tcs)
def _get_status_message(self) -> str:
    """
    Build the multi-line status text for all download threads.

    One line per thread shows its id, the progress of its current file in
    percent and the current URL (threads that are no longer alive are shown
    as 100% / 'Finished!'). A final line shows the overall percentage, the
    downloaded and total sizes, the file counters and the speed since the
    previous call.

    Side effects: pending ``extra_totalsize`` values of the thread reports
    are added to ``self.total_to_download`` and then marked as consumed
    (``-1``); ``self.last_status_timestamp`` and
    ``self.last_threads_total_downloaded`` are updated.

    @return: A status message string
    """
    # Used to keep every line within the terminal width.
    terminal = shutil.get_terminal_size()

    # Move the cursor up by one line per thread and back to column 0, so the
    # previous status block gets overwritten.
    cursor_reset = f'\033[{len(self.threads)}A\r'

    thread_lines = []
    downloaded_total = 0
    for worker in self.threads:
        tid = worker.thread_id

        percent_done = self.thread_report[tid]['percentage']
        current = self.thread_report[tid]['current_url']
        if not worker.is_alive():
            percent_done = 100
            current = 'Finished!'

        if len(current) + 13 > terminal.columns:
            current = current[0 : terminal.columns - 15] + '..'

        thread_lines.append('\033[KT%2i: %3i%% - %s\n' % (tid, percent_done, current))

        downloaded_total += self.thread_report[tid]['total']

        # Sizes discovered while downloading enlarge the overall total once.
        pending_extra = self.thread_report[tid]['extra_totalsize']
        if pending_extra is not None and pending_extra != -1:
            self.total_to_download += pending_extra
            self.thread_report[tid]['extra_totalsize'] = -1

    if self.total_to_download != 0:
        percentage = int(downloaded_total * 100 / self.total_to_download)
    else:
        percentage = 100

    # Overall line: percentage, downloaded size and size still expected.
    summary = 'Total: %3s%% %12s/%12s' % (
        percentage,
        format_bytes(downloaded_total),
        format_bytes(self.total_to_download),
    )
    summary += ' | Files: %5s/%5s' % (len(self.report['success']), self.total_files)

    since_last_status = downloaded_total - self.last_threads_total_downloaded
    speed = self.calc_speed(self.last_status_timestamp, time.time(), since_last_status)
    summary += ' | ' + self.format_speed(speed)

    if len(summary) > terminal.columns:
        summary = summary[0 : terminal.columns]

    self.last_status_timestamp = time.time()
    self.last_threads_total_downloaded = downloaded_total

    return cursor_reset + ''.join(thread_lines) + '\033[K' + summary