def download_from_google_drive(gd_id, destination):
    """
    Use the requests package to download a file from Google Drive.

    :param gd_id: the Google Drive document id of the file to fetch.
    :param destination: local path the downloaded bytes are written to.
    """
    URL = 'https://docs.google.com/uc?export=download'
    with get_http_session() as session:
        response = session.get(URL, params={'id': gd_id}, stream=True)
        # Large files trigger a "can't virus-scan this" interstitial; the
        # confirm token lets us bypass it with a second request.
        token = _get_confirm_token(response)
        if token:
            response.close()
            params = {'id': gd_id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
        CHUNK_SIZE = 32768
        try:
            with PathManager.open(destination, 'wb') as f:
                for chunk in response.iter_content(CHUNK_SIZE):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            # Ensure the streamed response is released even if writing fails
            # (the original leaked the connection on a write error).
            response.close()
def check_header(self):
    """
    Issue a HEAD request to verify that the URL / Google Drive ID is live.
    """
    with get_http_session() as session:
        if self.from_google:
            # Google Drive hosts are probed through the export endpoint,
            # with self.url holding the document id.
            response = session.head(
                'https://docs.google.com/uc?export=download',
                params={'id': self.url},
                stream=True,
            )
        else:
            # Some servers reject requests without a browser-like UA.
            ua = (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/77.0.3865.90 Safari/537.36'
            )
            response = session.head(
                self.url, allow_redirects=True, headers={'User-Agent': ua}
            )
        assert response.status_code == 200
def download(url, path, fname, redownload=False, num_retries=5):
    """
    Download file using `requests`.

    If ``redownload`` is set to false, then will not download tar file again if it is
    present (default ``False``).

    :param url: source URL to fetch.
    :param path: directory the file is written into.
    :param fname: filename to save under inside ``path``.
    :param redownload: force a fresh download even if the file exists.
    :param num_retries: how many connection-error retries before giving up.
    :raises RuntimeError: when all retries are exhausted, or when fewer bytes
        than the server's Content-Length were received.
    """
    outfile = os.path.join(path, fname)
    # Skip the transfer entirely when the file is already present.
    download = not PathManager.exists(outfile) or redownload
    logging.info(f"Downloading {url} to {outfile}")
    retry = num_retries
    # Exponential backoff schedule, largest wait first; indexed by the
    # remaining retry count after each failure.
    exp_backoff = [2**r for r in reversed(range(retry))]
    pbar = tqdm.tqdm(unit='B', unit_scale=True, desc='Downloading {}'.format(fname))
    while download and retry > 0:
        response = None
        with get_http_session() as session:
            try:
                response = session.get(url, stream=True, timeout=5)
                # negative reply could be 'none' or just missing
                CHUNK_SIZE = 32768
                total_size = int(response.headers.get('Content-Length', -1))
                # server returns remaining size if resuming, so adjust total
                pbar.total = total_size
                done = 0
                with PathManager.open(outfile, 'wb') as f:
                    for chunk in response.iter_content(CHUNK_SIZE):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                            if total_size > 0:
                                done += len(chunk)
                                if total_size < done:
                                    # don't freak out if content-length was too small
                                    total_size = done
                                    pbar.total = total_size
                                pbar.update(len(chunk))
                # Whole body streamed successfully: leave the retry loop.
                break
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ):
                retry -= 1
                pbar.clear()
                if retry > 0:
                    pl = 'y' if retry == 1 else 'ies'
                    logging.debug(
                        f'Connection error, retrying. ({retry} retr{pl} left)')
                    # Wait longer the fewer retries remain.
                    time.sleep(exp_backoff[retry])
                else:
                    logging.error('Retried too many times, stopped retrying.')
            finally:
                # Release the streamed connection whether or not the
                # transfer succeeded.
                if response:
                    response.close()
    if retry <= 0:
        raise RuntimeError(
            'Connection broken too many times. Stopped retrying.')
    if download and retry > 0:
        # Bring the bar up to the final byte count (updates above only
        # happen when Content-Length was known).
        pbar.update(done - pbar.n)
        if done < total_size:
            raise RuntimeError(
                f'Received less data than specified in Content-Length header for '
                f'{url}. There may be a download problem.')
    pbar.close()