import hashlib
import itertools
import logging
import os
import threading
import time
import urllib.parse

import lxml.etree
import psutil
import requests

# Project-local package: cc.Error, cc.http, and cc.statics live there.
import cc
import cc.http
import cc.statics


def _dl_caption(url, dir_path, fne, ext):
    try:
        _dl_url(url, dir_path, fne, ext)
    except requests.exceptions.HTTPError:
        if cc.statics.args.ignore_caption_error:
            # Captions are optional; when asked to, log the failure and go on.
            logging.warning('could not download caption %s to %s/%s%s',
                            url, dir_path, fne, ext, exc_info=True)
        else:
            raise
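

# A minimal sketch of the _dl_url helper that _dl_caption wraps, assuming it
# streams the response body to dir_path/<fne><ext> with requests; the real
# helper is defined elsewhere in this module and may differ (session reuse,
# retries, etc.).  The name _dl_url_sketch is ours, not the module's.
def _dl_url_sketch(url, dir_path, fne, ext):
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()  # raises HTTPError, which _dl_caption catches
    with open(os.path.join(dir_path, fne + ext), 'wb') as output:
        for chunk in response.iter_content(chunk_size=8192):
            output.write(chunk)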


def _get_dls(episode):
    """Yield (downloader, url, fne, ext) tuples for everything to fetch."""
    if episode.url is not None:
        yield _dl_url, episode.url, 'index', '.html'
    for video in episode.videos:
        if video.rtmps:
            # Pick the highest-resolution RTMP variant.
            rtmp = max(video.rtmps, key=lambda r: r.width)
            yield _dl_rtmp, rtmp.url, video.fne, rtmp.ext
        else:
            logging.warning('content is unavailable: %s', video.page_url)
            yield _unavailable, video.page_url, video.fne, '.mp4.unavailable'
        for caption in video.captions:
            yield _dl_caption, caption.url, video.fne, caption.ext
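

# A sketch of how the tuples yielded by _get_dls might be consumed: every
# downloader above shares _dl_caption's (url, dir_path, fne, ext) signature,
# so a driver can dispatch them uniformly.  This driver and its dir_path
# argument are assumptions, not this module's actual consumer.
def _run_dls_sketch(episode, dir_path):
    for downloader, url, fne, ext in _get_dls(episode):
        downloader(url, dir_path, fne, ext)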


def _get_mediagen_tree(feed, video_blob):
    # Build the MRSS feed URL from the show URL's origin and the video's URI.
    parts = urllib.parse.urlparse(feed.show_url)
    new_parts = urllib.parse.ParseResult(
        scheme=parts.scheme,
        netloc=parts.netloc,
        path='feeds/mrss',
        params='',
        query=urllib.parse.urlencode({'uri': video_blob.uri}),
        fragment='')
    mrss_url = urllib.parse.urlunparse(new_parts)
    try:
        mrss_tree = cc.http.get_url_dom_tree(mrss_url)
    except lxml.etree.XMLSyntaxError:
        logging.warning('fix mrss xml %s', mrss_url, exc_info=True)
        mrss_tree = _get_url_dom_tree_with_fixes(mrss_url)
    # The media:content element points at the mediagen document.
    content = mrss_tree.find('.//{http://search.yahoo.com/mrss/}content')
    mediagen_url = content.get('url')
    try:
        return cc.http.get_url_dom_tree(mediagen_url)
    except lxml.etree.XMLSyntaxError:
        logging.warning('fix mediagen xml %s', mediagen_url, exc_info=True)
        return _get_url_dom_tree_with_fixes(mediagen_url)
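

# A sketch of the _get_url_dom_tree_with_fixes fallback, assuming it simply
# re-parses the document with lxml's recovering parser when strict parsing
# raises XMLSyntaxError; the fixes this module actually applies may be more
# targeted.  The name with the _sketch suffix is ours.
def _get_url_dom_tree_with_fixes_sketch(url):
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    parser = lxml.etree.XMLParser(recover=True)  # tolerate malformed markup
    return lxml.etree.fromstring(response.content, parser=parser)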


def _download(url, file_name, cwd, prog, download_timeout, monitor_period,
              cpu_bound, memory_bound, partial_okay):
    cwd = cwd or os.getcwd()
    file_name_part = file_name + '.part'
    output_path = os.path.join(cwd, file_name)
    output_path_part = os.path.join(cwd, file_name_part)
    digest = None
    for retry_exp in itertools.count():
        # The timer's callback is a no-op; we only poll timer.finished as a
        # deadline flag for the attempt as a whole.
        timer = threading.Timer(download_timeout, lambda: None)
        timer.daemon = True
        proc = _make_subprocess(url, file_name_part, cwd, prog)
        timer.start()
        ret = -1
        while True:
            try:
                ret = proc.wait(timeout=monitor_period)
                break
            except psutil.TimeoutExpired:
                pass
            cpu_percent = proc.cpu_percent(interval=None)
            memory_percent = proc.memory_percent()
            # logging.trace assumes a project-registered TRACE level.
            logging.trace('rtmp: pid=%d cpu=%.1f memory=%.1f',
                          proc.pid, cpu_percent, memory_percent)
            if cpu_percent > cpu_bound:
                logging.error('rtmp: cpu limit exceeded')
                proc.kill()
                break
            if memory_percent > memory_bound:
                logging.error('rtmp: memory limit exceeded')
                proc.kill()
                break
            if timer.finished.is_set():
                logging.error('rtmp: timeout: %s -> %s',
                              url, output_path_part)
                proc.kill()
                break
        timer.cancel()
        if prog == 'rtmpdump' and ret == RTMPDUMP_INCOMPLETE:
            if partial_okay:
                logging.warning(
                    'rtmp: partial download %s to %s', url, file_name)
                ret = 0
                break
            with open(output_path_part, 'rb') as output_file:
                new_digest = hashlib.sha1(output_file.read()).digest()
            if digest is not None and digest == new_digest:
                # We made no progress; the download might be completed.
                # Let's not retry and assume it was.
                logging.warning(
                    'rtmp: no progress: url=%s file_name=%s', url, file_name)
                ret = 0
                break
            digest = new_digest
            # rtmpdump didn't complete the transfer; resume might get
            # further.  Back off exponentially between attempts.
            retry = 2 ** retry_exp
            if retry > download_timeout:
                logging.error('rtmp: retry timeout: %s -> %s',
                              url, output_path_part)
            else:
                logging.trace('rtmp: retry=%d url=%s', retry, url)
                time.sleep(retry)
                continue
        if ret is not None and ret != 0:
            raise cc.Error('Could not download (ret=%s): %s' % (ret, url))
        # Okay, we are done.
        break
    os.rename(output_path_part, output_path)
    logging.info('rtmp: success: %s -> %s', url, output_path)
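

# The pieces _download relies on are defined elsewhere in this module; below
# is a minimal sketch of them under stated assumptions.  RTMPDUMP_INCOMPLETE
# is assumed to be rtmpdump's RD_INCOMPLETE exit status (2), and
# _make_subprocess is assumed to start the downloader as a psutil.Popen so
# the monitor loop above can sample cpu_percent()/memory_percent().  The
# command-line flags shown are illustrative, not this module's exact choices.
RTMPDUMP_INCOMPLETE = 2  # assumed: rtmpdump exits 2 when a transfer stops early


def _make_subprocess_sketch(url, file_name_part, cwd, prog):
    if prog == 'rtmpdump':
        # --resume lets a retried rtmpdump continue the partial .part file
        # instead of starting over, which is what the digest check counts on.
        cmd = [prog, '--rtmp', url, '--flv', file_name_part, '--resume']
    else:
        cmd = [prog, url, file_name_part]
    return psutil.Popen(cmd, cwd=cwd)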