Ejemplo n.º 1
0
def isolated_fs(fs):
    """Expose the real CA bundle and cassettes directory inside the fake fs."""
    # The HTTPS CA bundle must stay reachable so certificate validation works.
    bundle_dir = os.path.dirname(extract_zipped_paths(DEFAULT_CA_BUNDLE_PATH))
    fs.add_real_directory(bundle_dir)
    # Recorded cassettes live alongside this test module.
    fs.add_real_directory(join(dirname(realpath(__file__)), "cassettes"))
Ejemplo n.º 2
0
    def test_zipped_paths_extracted(self, tmpdir):
        """A path pointing inside a zip archive is unpacked to a real file."""
        archive = tmpdir.join('test.zip')
        with zipfile.ZipFile(archive.strpath, 'w') as zf:
            zf.write(__file__)

        _, inner = os.path.splitdrive(__file__)
        path_inside_zip = os.path.join(archive.strpath, inner.lstrip(r'\/'))
        extracted = extract_zipped_paths(path_inside_zip)

        assert extracted != path_inside_zip
        assert os.path.exists(extracted)
        assert filecmp.cmp(extracted, __file__)
Ejemplo n.º 3
0
    def test_zipped_paths_extracted(self, tmpdir):
        # Bundle this very module into a fresh zip archive.
        zip_target = tmpdir.join('test.zip')
        with zipfile.ZipFile(zip_target.strpath, 'w') as zip_file:
            zip_file.write(__file__)

        # Build a "<zip>/<member>" style path and request its extraction.
        member = os.path.splitdrive(__file__)[1].lstrip(r'\/')
        inner_path = os.path.join(zip_target.strpath, member)
        result = extract_zipped_paths(inner_path)

        # The extracted copy is a distinct, existing file with identical bytes.
        assert result != inner_path
        assert os.path.exists(result)
        assert filecmp.cmp(result, __file__)
Ejemplo n.º 4
0
 def test_unzipped_paths_unchanged(self, path):
     """Paths that do not reference a zip archive pass through untouched."""
     assert extract_zipped_paths(path) == path
Ejemplo n.º 5
0
 def test_unzipped_paths_unchanged(self, path):
     # Non-zip paths must come back exactly as given.
     result = extract_zipped_paths(path)
     assert result == path
Ejemplo n.º 6
0
    def _sync_download(self, url, destination_path):
        """Synchronous version of `download` method.

        Args:
          url: url to download.
          destination_path: directory where the downloaded file is written.

        Returns:
          `(checksum_hexdigest, size_in_bytes)` for HTTP(S)/FTP downloads.
          For Kaggle URLs and gfile copies, the corresponding helper's return
          value is passed through instead.
          # NOTE(review): the original docstring said "Returns: None", but the
          # final `return checksum.hexdigest(), size` clearly yields a tuple.

        Raises:
          DownloadError: when an HTTP download returns a non-200 status code.

        Environment Variables:
          TFDS_HTTP_PROXY  : proxy server for HTTP requests.
          TFDS_HTTPS_PROXY : proxy server for HTTPS requests.
          TFDS_FTP_PROXY   : proxy server for FTP requests.
          TFDS_CA_BUNDLE   : custom client-side SSL certificates
                             (falls back to REQUESTS_CA_BUNDLE when unset).

        Note: FTPS custom certificate verification doesn't work for python
        version <= 2.7.8.
        """
        # Per-protocol proxies, all optional; None entries are ignored by
        # requests/urllib.
        proxies = {
            'http': os.environ.get('TFDS_HTTP_PROXY', None),
            'https': os.environ.get('TFDS_HTTPS_PROXY', None),
            'ftp': os.environ.get('TFDS_FTP_PROXY', None)
        }
        # TFDS-specific CA bundle wins; REQUESTS_CA_BUNDLE is the fallback.
        ca_bundle = os.environ.get('TFDS_CA_BUNDLE', None)
        if not ca_bundle:
            ca_bundle = os.environ.get('REQUESTS_CA_BUNDLE', None)
        if ca_bundle:
            # Allow a bundle packaged inside a zip (e.g. frozen distributions).
            ca_bundle = extract_zipped_paths(ca_bundle)
        if not hasattr(ssl, '_create_unverified_context'):
            # disable ftp ssl bypassing for python version <= 2.7.8
            def disabled_py2_log_fn(*args, **kwargs):
                del args, kwargs
                return logging.info(
                    'SSL bypassing not available for python '
                    'version <= 2.7.8 current version: %s '
                    'Protocols affected: FTPS',
                    sys.version.split(' ')[0])

            # Patching ssl's module dict mutates global interpreter state;
            # subsequent ssl users in this process see the stub functions.
            ssl.__dict__['_create_unverified_context'] = disabled_py2_log_fn
            ssl.__dict__['create_default_context'] = disabled_py2_log_fn

        # Verification settings per client library: urllib wants an SSLContext,
        # requests wants False or a bundle path.
        # NOTE(review): create_default_context(capath=...) expects a *directory*
        # of certificates; if TFDS_CA_BUNDLE points at a single PEM file,
        # cafile= may be intended — confirm against upstream usage.
        ca_verify = {
            'urllib':
            ssl._create_unverified_context()  # pylint: disable=W0212
            if not ca_bundle else ssl.create_default_context(capath=ca_bundle),
            'requests':
            False if not ca_bundle else ca_bundle
        }

        # Kaggle downloads go through the kaggle CLI helper, not requests.
        if kaggle.KaggleFile.is_kaggle_url(url):
            if proxies['http']:
                os.environ['KAGGLE_PROXY'] = proxies['http']
            return self._sync_kaggle_download(url, destination_path)

        try:
            # If url is on a filesystem that gfile understands, use copy. Otherwise,
            # use requests (http) or urllib (ftp).
            if not url.startswith('http'):
                return self._sync_file_copy(url, destination_path)
        except tf.errors.UnimplementedError:
            pass

        session = requests.Session()
        session.proxies = proxies
        session.verify = ca_verify['requests']
        # Google Drive links need a confirmation-token round trip first.
        if _DRIVE_URL.match(url):
            url = self._get_drive_url(url, session)
        use_urllib = url.startswith('ftp')
        if use_urllib:
            if proxies['ftp']:
                proxy = urllib.request.ProxyHandler({'ftp': proxies['ftp']})
                opener = urllib.request.build_opener(proxy)
                urllib.request.install_opener(opener)  # pylint: disable=too-many-function-args
            request = urllib.request.Request(url)

            # disable ssl context check for FTPS for python version <= 2.7.8
            if ca_verify['urllib'] is None:
                response = urllib.request.urlopen(request)
            else:
                response = urllib.request.urlopen(request,
                                                  context=ca_verify['urllib'])
        else:
            response = session.get(url, stream=True)
            if response.status_code != 200:
                raise DownloadError('Failed to get url %s. HTTP code: %d.' %
                                    (url, response.status_code))
        fname = _get_filename(response)
        path = os.path.join(destination_path, fname)
        size = 0

        # Track megabytes separately so the progress bar is only advanced in
        # whole-MiB increments.
        size_mb = 0
        unit_mb = units.MiB
        self._pbar_dl_size.update_total(
            int(response.headers.get('Content-length', 0)) // unit_mb)
        with tf.io.gfile.GFile(path, 'wb') as file_:
            checksum = self._checksumer()
            # urllib responses are read manually in fixed-size chunks;
            # requests responses stream via iter_content.
            if use_urllib:
                iterator = iter(lambda: response.read(io.DEFAULT_BUFFER_SIZE),
                                b'')
            else:
                iterator = response.iter_content(
                    chunk_size=io.DEFAULT_BUFFER_SIZE)

            for block in iterator:
                size += len(block)

                # Update the progress bar
                size_mb += len(block)
                if size_mb > unit_mb:
                    self._pbar_dl_size.update(size_mb // unit_mb)
                    size_mb %= unit_mb

                # Checksum is computed incrementally while writing to disk.
                checksum.update(block)
                file_.write(block)
        self._pbar_url.update(1)
        return checksum.hexdigest(), size
Ejemplo n.º 7
0
 def test_invalid_unc_path(self):
     """An unreachable UNC path is returned unchanged."""
     unc = r"\\localhost\invalid\location"
     assert extract_zipped_paths(unc) == unc
Ejemplo n.º 8
0
    def loadUserAgent(self, *args, **kwargs):
        """Pick a browser fingerprint (headers + cipher suite) from browsers.json.

        Configuration comes either from a ``browser`` dict kwarg or from
        individual kwargs (``custom``, ``platform``, ``desktop``, ``mobile``).
        Sets ``self.headers`` and ``self.cipherSuite`` as a side effect.

        Raises:
            RuntimeError: if both desktop and mobile are disabled, or the
                requested browser/platform is unknown or has no agents.
        """
        self.browser = kwargs.pop('browser', None)

        self.platforms = ['linux', 'windows', 'darwin', 'android', 'ios']
        self.browsers = ['chrome', 'firefox']

        # A dict `browser` kwarg bundles all options; otherwise each option is
        # read from its own kwarg.
        if isinstance(self.browser, dict):
            self.custom = self.browser.get('custom', None)
            self.platform = self.browser.get('platform', None)
            self.desktop = self.browser.get('desktop', True)
            self.mobile = self.browser.get('mobile', True)
            self.browser = self.browser.get('browser', None)
        else:
            self.custom = kwargs.pop('custom', None)
            self.platform = kwargs.pop('platform', None)
            self.desktop = kwargs.pop('desktop', True)
            self.mobile = kwargs.pop('mobile', True)

        if not self.desktop and not self.mobile:
            # tracebacklimit = 0 hides the traceback for this user-facing error.
            sys.tracebacklimit = 0
            raise RuntimeError(
                "Sorry you can't have mobile and desktop disabled at the same time."
            )

        # browsers.json ships next to this module; extract_zipped_paths makes
        # this work when the package is imported from a zip.
        with open(
                extract_zipped_paths(
                    os.path.join(os.path.dirname(__file__), 'browsers.json')),
                'r') as fp:
            user_agents = json.load(fp, object_pairs_hook=OrderedDict)

        if self.custom:
            # A custom UA that matches no known browser gets generic headers
            # and a default-ish cipher suite.
            if not self.tryMatchCustom(user_agents):
                self.cipherSuite = [
                    ssl._DEFAULT_CIPHERS,
                    '!AES128-SHA',
                    '!ECDHE-RSA-AES256-SHA',
                ]
                self.headers = OrderedDict([
                    ('User-Agent', self.custom),
                    ('Accept',
                     'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
                     ), ('Accept-Language', 'en-US,en;q=0.9'),
                    ('Accept-Encoding', 'gzip, deflate, br')
                ])
        else:
            if self.browser and self.browser not in self.browsers:
                sys.tracebacklimit = 0
                raise RuntimeError(
                    'Sorry "{}" browser is not valid, valid browsers are [{}].'
                    .format(self.browser, ', '.join(self.browsers)))

            # No platform requested: choose one at random (SystemRandom for
            # non-deterministic selection).
            if not self.platform:
                self.platform = random.SystemRandom().choice(self.platforms)

            if self.platform not in self.platforms:
                sys.tracebacklimit = 0
                raise RuntimeError(
                    'Sorry the platform "{}" is not valid, valid platforms are [{}]'
                    .format(self.platform, ', '.join(self.platforms)))

            filteredAgents = self.filterAgents(user_agents['user_agents'])

            if not self.browser:
                # has to be at least one in there...
                # Keep drawing random browsers until one with agents for the
                # chosen platform turns up.
                while not filteredAgents.get(self.browser):
                    self.browser = random.SystemRandom().choice(
                        list(filteredAgents.keys()))

            if not filteredAgents[self.browser]:
                sys.tracebacklimit = 0
                raise RuntimeError(
                    'Sorry "{}" browser was not found with a platform of "{}".'
                    .format(self.browser, self.platform))

            self.cipherSuite = user_agents['cipherSuite'][self.browser]
            self.headers = user_agents['headers'][self.browser]

            self.headers['User-Agent'] = random.SystemRandom().choice(
                filteredAgents[self.browser])

        # Unless brotli is explicitly allowed, strip 'br' from Accept-Encoding
        # so servers don't send brotli-compressed responses.
        if not kwargs.get('allow_brotli',
                          False) and 'br' in self.headers['Accept-Encoding']:
            self.headers['Accept-Encoding'] = ','.join([
                encoding
                for encoding in self.headers['Accept-Encoding'].split(',')
                if encoding.strip() != 'br'
            ]).strip()