import logging
import os
from time import sleep
from typing import List

import requests

# Callback, Episode and BulkDownloaderException are classes defined elsewhere
# in this project.


def download_with_resume(url: str, path: str, cb: Callback = None) -> bool:
    """
    Download a file pointed to by url to a local path
    @param url: URL to download
    @param path: Local file to be saved
    @param cb: Callback object
    @return: True if the file was completely downloaded
    """
    logging.debug("Downloading {} to {}".format(url, path))
    # Remove any previous file so the byte ranges below start from a clean slate
    if os.path.exists(path):
        os.remove(path)
    if cb and cb.is_cancelled():
        return False
    try:
        r = requests.head(url, allow_redirects=True)
    except requests.RequestException as e:
        logging.error(e)
        return False
    if r.status_code < 200 or r.status_code > 302:
        logging.error("Failed to reach {}, status is {}".format(url, r.status_code))
        r.close()
        return False
    content_length = r.headers.get("content-length")
    r.close()
    if content_length is None:
        logging.error("No content-length header for {}, cannot download".format(url))
        return False
    expected_size = int(content_length)
    if cb and cb.is_cancelled():
        return False
    chunk_size = 2 ** 20  # 1 MiB
    last_byte = 0
    with open(path, 'wb') as f:
        while last_byte < expected_size:
            if cb and cb.is_cancelled():
                return False
            logging.debug("{} vs {}".format(last_byte, expected_size))
            logging.debug("Resuming download with {}% of the file already saved"
                          .format((100 * last_byte) / expected_size))
            # Ask the server for the bytes we do not have yet
            resume_header = {'Range': 'bytes=%d-' % last_byte}
            resume_request = requests.get(url, headers=resume_header, stream=True,
                                          verify=True, allow_redirects=True)
            for data in resume_request.iter_content(chunk_size):
                last_byte += len(data)
                if cb and cb.is_cancelled():
                    return False
                if cb:
                    cb.progress(100 * (last_byte / expected_size))
                f.write(data)
            resume_request.close()
    if cb and cb.is_cancelled():
        return False
    if cb:
        cb.progress(100)
    return True
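
# The Callback type used throughout this module is defined elsewhere in the
# project. Judging only from how it is called here (is_cancelled, progress
# and set_function), its contract is assumed to look roughly like the
# illustrative sketch below; this is not the project's actual implementation.
class _ExampleCallback:
    """Hypothetical stand-in showing the interface the downloader relies on."""

    def __init__(self):
        self._cancelled = False
        self._scale = lambda x: x  # remaps local progress; see download_mp3

    def cancel(self) -> None:
        self._cancelled = True

    def is_cancelled(self) -> bool:
        # Polled between chunks and between files so long operations can abort
        return self._cancelled

    def progress(self, value: float) -> None:
        logging.debug('Progress: {:.1f}%'.format(self._scale(value)))

    def set_function(self, func) -> None:
        # download_mp3 installs a lambda here to map a per-file 0-100 value
        # into that file's slice of the overall progress
        self._scale = func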

def download_mp3(self, cb: Callback = None, dry_run: bool = False):
    """
    Will get the list of MP3s and download them into the specified folder
    @param cb: Callback object
    @param dry_run: Will not actually download anything (for test purposes only)
    @return: None
    """
    if not self.folder():
        err_str = 'No folder is defined for the download'
        logging.error(err_str)
        raise BulkDownloaderException(err_str)
    to_download = self.list_mp3(cb)
    logging.info('{} files will be downloaded'.format(len(to_download)))
    if cb and cb.is_cancelled():
        return
    if cb:
        cb.progress(0)
    count = 0
    downloads_successful = 0
    downloads_skipped = 0
    nb_downloads = len(to_download)
    if nb_downloads == 0:
        # Avoid a division by zero below when the feed yields nothing
        if cb:
            cb.progress(100)
        return
    step = 100. / nb_downloads
    for episode in to_download:
        if cb:
            if cb.is_cancelled():
                # Once cancelled, skip every remaining episode
                continue
            cb.progress(count * step)
        # Getting the name and path
        path = os.path.join(self.folder(), episode.get_filename())
        # Check if we should skip the file
        if not self.overwrite() and os.path.isfile(path):
            logging.info('Skipping {} as the file already exists at {}'
                         .format(episode.get_filename(), path))
            downloads_skipped += 1
            count += 1
            continue
        # Download file
        logging.info('Saving {} to {} from {}'.format(episode.get_filename(), path,
                                                      episode.url()))
        if cb:
            # Rescale this file's 0-100 progress into its share of the total
            cb.set_function(lambda x: (count + x / 100) * step)
        if not dry_run and try_download(episode.url(), path, cb=cb):
            downloads_successful += 1
        if cb:
            cb.set_function(lambda x: x)
        count += 1
    if cb and cb.is_cancelled():
        return
    if cb:
        cb.progress(100)
    logging.info('{}/{} episodes were successfully downloaded'
                 .format(downloads_successful, nb_downloads))
    logging.info('{}/{} episodes were skipped because files already existed'
                 .format(downloads_skipped, nb_downloads))
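
# Worked example of the rescaling above, assuming 4 episodes (step = 25.0):
# while the file at index count = 2 is 40% downloaded, the installed lambda
# reports (2 + 40 / 100) * 25.0 = 60.0% overall, reaching 75.0% when that
# file completes.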

def list_mp3(self, cb: Callback = None, verbose: bool = False) -> List[Episode]:
    """
    Will fetch the RSS feed and return the list of available episodes
    @param cb: Callback object
    @param verbose: Outputs more logs
    @return: List of Episode objects
    """
    try:
        r = requests.get(self._url)
    except requests.RequestException as exc:
        err_str = 'Failed to connect to URL ({})'.format(exc)
        logging.error(err_str)
        raise BulkDownloaderException(err_str)
    if r.status_code != 200:
        err_str = 'Failed to access URL (code {})'.format(r.status_code)
        logging.error(err_str)
        raise BulkDownloaderException(err_str)
    page = r.content
    if cb and cb.is_cancelled():
        return []
    if self._page_is_rss(page):
        logging.info('Processing RSS document')
        to_download = self._get_episodes_to_download_from_rss(page)
        # Trim the list to the first _last_n entries if a limit is set
        if 0 < self._last_n < len(to_download):
            to_download = to_download[0:self._last_n]
    else:
        err_str = 'Content is not RSS'
        logging.error(err_str)
        raise BulkDownloaderException(err_str)
    if cb and cb.is_cancelled():
        return []
    if verbose:
        logging.info('{} episodes found in the feed:'.format(len(to_download)))
        for elem in to_download:
            logging.info(elem)
    return to_download
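
# _page_is_rss and _get_episodes_to_download_from_rss are the project's own
# helpers and are not shown in this section. Purely for illustration, pulling
# enclosure URLs out of a standard RSS 2.0 feed could look like the sketch
# below; the real parser also builds Episode objects.
def _example_enclosure_urls(page: bytes) -> List[str]:
    import xml.etree.ElementTree as ElementTree
    root = ElementTree.fromstring(page)
    urls = []
    # RSS 2.0 layout: <rss><channel><item><enclosure url="..."/></item>...
    for item in root.iter('item'):
        enclosure = item.find('enclosure')
        if enclosure is not None and enclosure.get('url'):
            urls.append(enclosure.get('url'))
    return urls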

def try_download(url: str, path: str, max_try: int = 3, sleep_time: int = 5,
                 cb: Callback = None) -> bool:
    """
    Try to download the file multiple times, in case of connection failures
    @param url: URL to download
    @param path: Local file to be saved
    @param max_try: Number of download tries
    @param sleep_time: Wait time between tries in seconds
    @param cb: Callback object
    @return: True if the file was completely downloaded
    """
    count = 0
    while count < max_try:
        try:
            if download_with_resume(url, path, cb):
                return True
        except requests.RequestException as exc:
            # A dropped connection mid-transfer surfaces here; log it and let
            # the loop take another try
            logging.warning('Download attempt failed ({})'.format(exc))
        if cb and cb.is_cancelled():
            return False
        count += 1
        if count < max_try:
            # No point in waiting after the last attempt
            sleep(sleep_time)
    logging.error('Download of {} failed after {} tries'.format(url, max_try))
    return False
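
# Placeholder usage of the retry wrapper; the URL and destination path below
# are illustrative, not part of the project.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    ok = try_download('https://example.com/episode.mp3', '/tmp/episode.mp3',
                      max_try=3, sleep_time=5)
    print('Download succeeded' if ok else 'Download failed')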