def isolated_fs(fs):
    """Register the real directories the tests depend on with ``fs``.

    Two directories are passed to ``fs.add_real_directory``: the directory
    containing the path returned by
    ``extract_zipped_paths(DEFAULT_CA_BUNDLE_PATH)`` (needed for https
    validation) and the ``cassettes`` directory located next to this file.

    Args:
        fs: object exposing ``add_real_directory`` (presumably the pyfakefs
            fake-filesystem fixture -- confirm against the test setup).
    """
    # Keep the CA bundle reachable so https validation keeps working.
    ca_bundle_file = extract_zipped_paths(DEFAULT_CA_BUNDLE_PATH)
    fs.add_real_directory(os.path.dirname(ca_bundle_file))

    # Recorded cassettes live in a sibling directory of this module.
    module_dir = dirname(realpath(__file__))
    fs.add_real_directory(join(module_dir, "cassettes"))
def test_zipped_paths_extracted(self, tmpdir):
    """A path that points inside a zip archive is extracted to a real file."""
    archive_path = tmpdir.join('test.zip').strpath
    with zipfile.ZipFile(archive_path, 'w') as archive:
        archive.write(__file__)

    # Strip the drive and any leading separators so this file's path can be
    # appended to the archive path as the member name.
    member_name = os.path.splitdrive(__file__)[1].lstrip(r'\/')
    zipped_path = os.path.join(archive_path, member_name)

    extracted_path = extract_zipped_paths(zipped_path)

    assert extracted_path != zipped_path
    assert os.path.exists(extracted_path)
    assert filecmp.cmp(extracted_path, __file__)
def test_unzipped_paths_unchanged(self, path):
    """``extract_zipped_paths`` must hand back ``path`` unchanged."""
    result = extract_zipped_paths(path)
    assert path == result
def _sync_download(self, url, destination_path):
    """Synchronous version of `download` method.

    Args:
        url: url to download
        destination_path: path where to write it

    Returns:
        For Kaggle urls, the result of `_sync_kaggle_download`; for urls that
        do not start with 'http' and can be copied, the result of
        `_sync_file_copy`; otherwise a tuple
        `(checksum hexdigest, downloaded size in bytes)`.

    Raises:
        DownloadError: when download fails.

    Environment Variables:
        TFDS_HTTP_PROXY : Configure Proxy Servers for HTTP Requests
        TFDS_HTTPS_PROXY : Configure Proxy Servers for HTTPS Requests
        TFDS_FTP_PROXY : Configure Proxy Servers for FTP Requests
        TFDS_CA_BUNDLE : Configure Custom Client Side SSL Certificates
            (If unset, REQUESTS_CA_BUNDLE is used when present; if neither
            is set, the system default trust store is used and certificates
            are still verified.)

    Note:
        FTPS Custom Certificate verification doesn't work for python
        version <= 2.7.8
    """
    proxies = {
        'http': os.environ.get('TFDS_HTTP_PROXY', None),
        'https': os.environ.get('TFDS_HTTPS_PROXY', None),
        'ftp': os.environ.get('TFDS_FTP_PROXY', None),
    }

    ca_bundle = (os.environ.get('TFDS_CA_BUNDLE', None)
                 or os.environ.get('REQUESTS_CA_BUNDLE', None))
    if ca_bundle:
        ca_bundle = extract_zipped_paths(ca_bundle)

    if kaggle.KaggleFile.is_kaggle_url(url):
        if proxies['http']:
            os.environ['KAGGLE_PROXY'] = proxies['http']
        return self._sync_kaggle_download(url, destination_path)

    try:
        # If url is on a filesystem that gfile understands, use copy.
        # Otherwise, use requests (http) or urllib (ftp).
        if not url.startswith('http'):
            return self._sync_file_copy(url, destination_path)
    except tf.errors.UnimplementedError:
        pass

    session = requests.Session()
    session.proxies = proxies
    # Never disable certificate verification implicitly: without a custom
    # bundle, `True` makes requests verify against its default trust store.
    session.verify = ca_bundle if ca_bundle else True
    if _DRIVE_URL.match(url):
        url = self._get_drive_url(url, session)

    use_urllib = url.startswith('ftp')
    if use_urllib:
        # Build the SSL context only when urllib is actually used, and
        # without patching the global `ssl` module.
        urllib_context = None
        if not hasattr(ssl, 'create_default_context'):
            # python <= 2.7.8 has no SSLContext helpers; fall back to the
            # interpreter defaults.
            logging.info(
                'Custom SSL contexts not available for python '
                'version <= 2.7.8 current version: %s '
                'Protocols affected: FTPS', sys.version.split(' ')[0])
        elif not ca_bundle:
            urllib_context = ssl.create_default_context()
        elif os.path.isdir(ca_bundle):
            # `capath` is only valid for a directory of hashed certificates.
            urllib_context = ssl.create_default_context(capath=ca_bundle)
        else:
            # A bundle file (the usual REQUESTS_CA_BUNDLE form) needs `cafile`.
            urllib_context = ssl.create_default_context(cafile=ca_bundle)

        request = urllib.request.Request(url)
        if proxies['ftp']:
            # `urlopen(..., context=...)` builds a fresh opener and ignores
            # the installed one, so the proxy and the SSL context have to
            # live in the same installed opener.
            handlers = [urllib.request.ProxyHandler({'ftp': proxies['ftp']})]
            if urllib_context is not None:
                handlers.append(
                    urllib.request.HTTPSHandler(context=urllib_context))
            opener = urllib.request.build_opener(*handlers)
            urllib.request.install_opener(opener)  # pylint: disable=too-many-function-args
            response = urllib.request.urlopen(request)
        elif urllib_context is not None:
            response = urllib.request.urlopen(request, context=urllib_context)
        else:
            response = urllib.request.urlopen(request)
    else:
        response = session.get(url, stream=True)
        # Only requests responses expose `status_code`.
        if response.status_code != 200:
            raise DownloadError('Failed to get url %s. HTTP code: %d.' %
                                (url, response.status_code))

    fname = _get_filename(response)
    path = os.path.join(destination_path, fname)
    size = 0
    pending_bytes = 0  # bytes downloaded but not yet reported to the bar
    unit_mb = units.MiB
    self._pbar_dl_size.update_total(
        int(response.headers.get('Content-length', 0)) // unit_mb)
    with tf.io.gfile.GFile(path, 'wb') as file_:
        checksum = self._checksumer()
        if use_urllib:
            iterator = iter(
                lambda: response.read(io.DEFAULT_BUFFER_SIZE), b'')
        else:
            iterator = response.iter_content(
                chunk_size=io.DEFAULT_BUFFER_SIZE)

        for block in iterator:
            block_len = len(block)
            size += block_len

            # Update the progress bar in whole-MiB steps, carrying the
            # remainder over to the next block.
            pending_bytes += block_len
            if pending_bytes >= unit_mb:
                self._pbar_dl_size.update(pending_bytes // unit_mb)
                pending_bytes %= unit_mb

            checksum.update(block)
            file_.write(block)
    self._pbar_url.update(1)
    return checksum.hexdigest(), size
def test_invalid_unc_path(self):
    """An unreachable UNC path is returned untouched."""
    unc_path = r"\\localhost\invalid\location"
    result = extract_zipped_paths(unc_path)
    assert result == unc_path
def loadUserAgent(self, *args, **kwargs):
    """Select a User-Agent profile and set ``self.headers`` / ``self.cipherSuite``.

    Options are read either from a ``browser`` dict or from individual
    keyword arguments:

    * ``browser``: browser name, or a dict with the keys ``browser``,
      ``custom``, ``platform``, ``desktop`` and ``mobile``.
    * ``custom``: custom User-Agent string; when set, ``tryMatchCustom`` is
      consulted and a generic header set is used if it reports no match.
    * ``platform``: one of ``self.platforms``; chosen at random when omitted.
    * ``desktop`` / ``mobile``: booleans (default ``True``); at least one
      must be enabled.
    * ``allow_brotli``: when falsy (the default), ``br`` is removed from the
      ``Accept-Encoding`` header.

    Profiles are loaded from ``browsers.json`` next to this module.

    Raises:
        RuntimeError: both desktop and mobile are disabled, the browser or
            platform is not valid, or no user agents exist for the requested
            browser/platform combination.
    """
    self.browser = kwargs.pop('browser', None)

    self.platforms = ['linux', 'windows', 'darwin', 'android', 'ios']
    self.browsers = ['chrome', 'firefox']

    if isinstance(self.browser, dict):
        options = self.browser
        self.custom = options.get('custom', None)
        self.platform = options.get('platform', None)
        self.desktop = options.get('desktop', True)
        self.mobile = options.get('mobile', True)
        self.browser = options.get('browser', None)
    else:
        self.custom = kwargs.pop('custom', None)
        self.platform = kwargs.pop('platform', None)
        self.desktop = kwargs.pop('desktop', True)
        self.mobile = kwargs.pop('mobile', True)

    if not self.desktop and not self.mobile:
        sys.tracebacklimit = 0
        raise RuntimeError(
            "Sorry you can't have mobile and desktop disabled at the same time."
        )

    browsers_json = extract_zipped_paths(
        os.path.join(os.path.dirname(__file__), 'browsers.json'))
    with open(browsers_json, 'r') as fp:
        user_agents = json.load(fp, object_pairs_hook=OrderedDict)

    if self.custom:
        if not self.tryMatchCustom(user_agents):
            # No known profile matched the custom string: use generic values.
            self.cipherSuite = [
                ssl._DEFAULT_CIPHERS,
                '!AES128-SHA',
                '!ECDHE-RSA-AES256-SHA',
            ]
            self.headers = OrderedDict([
                ('User-Agent', self.custom),
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
                ('Accept-Language', 'en-US,en;q=0.9'),
                ('Accept-Encoding', 'gzip, deflate, br'),
            ])
    else:
        if self.browser and self.browser not in self.browsers:
            sys.tracebacklimit = 0
            raise RuntimeError(
                'Sorry "{}" browser is not valid, valid browsers are [{}].'
                .format(self.browser, ', '.join(self.browsers)))

        if not self.platform:
            self.platform = random.SystemRandom().choice(self.platforms)

        if self.platform not in self.platforms:
            sys.tracebacklimit = 0
            raise RuntimeError(
                'Sorry the platform "{}" is not valid, valid platforms are [{}]'
                .format(self.platform, ', '.join(self.platforms)))

        filtered_agents = self.filterAgents(user_agents['user_agents'])

        if not self.browser:
            # Choose uniformly among browsers that actually have agents.
            # Retrying random picks until one is non-empty would never
            # terminate when every list is empty, and would raise IndexError
            # when the mapping itself is empty.
            candidates = [
                name for name, agents in filtered_agents.items() if agents
            ]
            if not candidates:
                sys.tracebacklimit = 0
                raise RuntimeError(
                    'Sorry no browser was found with a platform of "{}".'
                    .format(self.platform))
            self.browser = random.SystemRandom().choice(candidates)

        # `.get` so a browser missing for this platform produces the intended
        # RuntimeError instead of a KeyError.
        if not filtered_agents.get(self.browser):
            sys.tracebacklimit = 0
            raise RuntimeError(
                'Sorry "{}" browser was not found with a platform of "{}".'
                .format(self.browser, self.platform))

        self.cipherSuite = user_agents['cipherSuite'][self.browser]
        self.headers = user_agents['headers'][self.browser]
        self.headers['User-Agent'] = random.SystemRandom().choice(
            filtered_agents[self.browser])

    if (not kwargs.get('allow_brotli', False)
            and 'br' in self.headers['Accept-Encoding']):
        # Drop only the brotli token, keeping the remaining encodings.
        self.headers['Accept-Encoding'] = ','.join([
            encoding
            for encoding in self.headers['Accept-Encoding'].split(',')
            if encoding.strip() != 'br'
        ]).strip()