Example No. 1
 def open_aws_url(config, _, storage, url):
     reader_impl = SourceFile.extract_reader_impl(config)
     use_aws_account = "aws_access_key_id" in config["provider"] and "aws_secret_access_key" in config["provider"] and storage == "s3://"
     if reader_impl == "s3fs":
         if use_aws_account:
             aws_access_key_id = None
             if "aws_access_key_id" in config["provider"]:
                 aws_access_key_id = config["provider"]["aws_access_key_id"]
             aws_secret_access_key = None
             if "aws_secret_access_key" in config["provider"]:
                 aws_secret_access_key = config["provider"]["aws_secret_access_key"]
             s3 = S3FileSystem(anon=False, key=aws_access_key_id, secret=aws_secret_access_key)
             result = s3.open(f"s3://{url}", mode="r")
         else:
             s3 = S3FileSystem(anon=True)
             result = s3.open(f"s3://{url}", mode="r")
     else:
         if use_aws_account:
             aws_access_key_id = ""
             if "aws_access_key_id" in config["provider"]:
                 aws_access_key_id = config["provider"]["aws_access_key_id"]
             aws_secret_access_key = ""
             if "aws_secret_access_key" in config["provider"]:
                 aws_secret_access_key = config["provider"]["aws_secret_access_key"]
             result = open(f"s3://{aws_access_key_id}:{aws_secret_access_key}@{url}")
         else:
             config = Config(signature_version=UNSIGNED)
             params = {
                 "resource_kwargs": {"config": config},
             }
             result = open(f"{storage}{url}", transport_params=params)
     return result
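The example above switches between credentialed and anonymous s3fs access; here is a minimal sketch of those two modes, with placeholder bucket, key, and credential values (not part of the original code):

import s3fs

# Credentialed access: pass the key pair explicitly (placeholders, not real credentials).
fs = s3fs.S3FileSystem(anon=False, key="AKIA...", secret="...")
with fs.open("s3://example-bucket/data.csv", mode="r") as f:
    first_line = f.readline()

# Anonymous access, for publicly readable buckets.
public_fs = s3fs.S3FileSystem(anon=True)
with public_fs.open("s3://example-bucket/data.csv", mode="r") as f:
    first_line = f.readline()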
Example No. 2
    def __init__(self,
                 obs_file_system: s3fs.S3FileSystem,
                 dir_path: str,
                 zarr_kwargs: Dict[str, Any] = None,
                 ds_id: str = None,
                 exception_type: type = ValueError):

        level_paths = {}
        for entry in obs_file_system.walk(dir_path, directories=True):
            level_dir = entry.split("/")[-1]
            basename, ext = os.path.splitext(level_dir)
            if basename.isdigit():
                level = int(basename)
                if entry.endswith(".zarr") and obs_file_system.isdir(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)
                elif entry.endswith(".link") and obs_file_system.isfile(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)

        num_levels = len(level_paths)
        # Consistency check
        for level in range(num_levels):
            if level not in level_paths:
                raise exception_type(
                    f"Invalid multi-level dataset {ds_id!r}: missing level {level} in {dir_path}"
                )

        super().__init__(ds_id=ds_id, parameters=zarr_kwargs)
        self._obs_file_system = obs_file_system
        self._dir_path = dir_path
        self._level_paths = level_paths
        self._num_levels = num_levels
Example No. 3
 def __init__(self,
              key=None,
              username=None,
              secret=None,
              password=None,
              path=None,
              host=None,
              s3=None,
              **kwargs):
     if username is not None:
         if key is not None:
             raise KeyError("S3 storage options got secrets argument "
                            "collision. Please, use either `key` "
                            "storage option or password field in URLpath, "
                            "not both options together.")
         key = username
     if key is not None:
         kwargs['key'] = key
     if password is not None:
         if secret is not None:
             raise KeyError("S3 storage options got secrets argument "
                            "collision. Please, use either `secret` "
                            "storage option or password field in URLpath, "
                            "not both options together.")
         secret = password
     if secret is not None:
         kwargs['secret'] = secret
     # S3FileSystem.__init__(self, kwargs)  # not sure what to do here
     S3FileSystem.__init__(self, **kwargs)
Example No. 4
    def _write_test_data(cls, s3: s3fs.S3FileSystem):
        if not s3.isdir(cls.BUCKET_NAME):
            s3.mkdir(cls.BUCKET_NAME)

        data = helpers.make_test_store()
        s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_1.zarr',
                           s3=s3,
                           create=True)
        s3map.update(data)
        s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_2.zarr',
                           s3=s3,
                           create=True)
        s3map.update(data)
Example No. 5
def _s3_open_file_with_retries(fs: s3fs.S3FileSystem, path: str,
                               retries: int) -> Any:
    for _ in range(retries):
        try:
            logger.info(f"opening {path}")
            file = fs.open(path)
            return file
        except Exception as ex:
            logger.warning(f"could not open {path}: {ex}")
            # If the file has just been uploaded, it might not be visible immediately,
            # but the failed open has already been cached by s3fs,
            # so we invalidate the cache
            fs.invalidate_cache(path)
            # and we give some time to S3 to settle the file status
            sleep(1)
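A short usage sketch for the retry helper above; the filesystem setup and object path are placeholders, and note that the helper implicitly returns None once the retries are exhausted:

import s3fs

fs = s3fs.S3FileSystem(anon=False)
# Retry a few times in case the object was uploaded moments ago and is not yet visible.
f = _s3_open_file_with_retries(fs, "example-bucket/recently-uploaded.json", retries=3)
if f is not None:
    try:
        payload = f.read()
    finally:
        f.close()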
Example No. 6
def resolve_filesystem_and_path(uri: str,
                                **kwargs) -> Tuple[EnhancedFileSystem, str]:
    parsed_uri = urlparse(uri)
    fs_path = parsed_uri.path
    # from https://github.com/apache/arrow/blob/master/python/pyarrow/filesystem.py#L419
    # with viewfs support
    if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        else:
            host = parsed_uri.scheme + "://" + host
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])

        fs = EnhancedFileSystem(pyarrow.hdfs.connect(host=host, port=port))
    elif parsed_uri.scheme == 's3' or parsed_uri.scheme == 's3a':
        fs = EnhancedFileSystem(
            pyarrow.filesystem.S3FSWrapper(S3FileSystem(**kwargs)))
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = EnhancedFileSystem(
            pyarrow.filesystem.LocalFileSystem.get_instance())

    _logger.info(f"Resolved base filesystem: {type(fs.base_fs)}")
    return fs, fs_path
Example No. 7
def filesystem() -> AbstractFileSystem:
    fs = LocalFileSystem()

    endpoint_url = os.getenv("LIGHTNING_BUCKET_ENDPOINT_URL", "")
    bucket_name = os.getenv("LIGHTNING_BUCKET_NAME", "")
    if endpoint_url != "" and bucket_name != "":
        key = os.getenv("LIGHTNING_AWS_ACCESS_KEY_ID", "")
        secret = os.getenv("LIGHTNING_AWS_SECRET_ACCESS_KEY", "")
        # TODO: Remove when updated on the platform side.
        if key == "" or secret == "":
            key = os.getenv("AWS_ACCESS_KEY_ID", "")
            secret = os.getenv("AWS_SECRET_ACCESS_KEY", "")
        if key == "" or secret == "":
            raise RuntimeError("missing S3 bucket credentials")

        fs = S3FileSystem(key=key,
                          secret=secret,
                          use_ssl=False,
                          client_kwargs={"endpoint_url": endpoint_url})

        app_id = os.getenv("LIGHTNING_CLOUD_APP_ID", "")
        if app_id == "":
            raise RuntimeError("missing LIGHTNING_CLOUD_APP_ID")

        if not fs.exists(shared_storage_path()):
            raise RuntimeError(
                f"shared filesystem {shared_storage_path()} does not exist")

    return fs
Example No. 8
def retrieve_puf(aws_access_key_id=AWS_ACCESS_KEY_ID,
                 aws_secret_access_key=AWS_SECRET_ACCESS_KEY):
    """
    Function for retrieving the PUF from the OSPC S3 bucket
    """
    s3_reader_installed = S3FileSystem is not None
    has_credentials = (aws_access_key_id is not None
                       and aws_secret_access_key is not None)
    if has_credentials and s3_reader_installed:
        print("Reading puf from S3 bucket.")
        # Use the credentials passed to this function rather than the module-level
        # constants, so callers can supply their own keys.
        fs = S3FileSystem(
            key=aws_access_key_id,
            secret=aws_secret_access_key,
        )
        with fs.open("s3://ospc-data-files/puf.csv.gz") as f:
            # Skips over header from top of file.
            puf_df = pd.read_csv(f, compression="gzip")
        return puf_df
    elif Path("puf.csv.gz").exists():
        print("Reading puf from puf.csv.gz.")
        return pd.read_csv("puf.csv.gz", compression="gzip")
    elif Path("puf.csv").exists():
        print("Reading puf from puf.csv.")
        return pd.read_csv("puf.csv")
    else:
        warnings.warn(
            f"PUF file not available (has_credentials={has_credentials}, "
            f"s3_reader_installed={s3_reader_installed})")
        return None
Example No. 9
def write_df_to_parquet_to_s3(df: pd.DataFrame,
                              filename: str,
                              s3_bucketname: str,
                              s3_bucketkey=None):
    # TODO: Avoid writing the parquet file into the current working directory and
    # then uploading it to S3; ideally the table should be written directly to S3.

    assert 's3://' not in s3_bucketname, 'prefix "s3://" not required'
    assert filename[-8:] == '.parquet', 'filename must have suffix ".parquet"'

    # The assert above guarantees the bucket name carries no scheme, so prepend it.
    s3_bucketname = 's3://' + s3_bucketname

    table = pa.Table.from_pandas(df)
    pq.write_table(table, filename)

    if s3_bucketkey is not None:
        key_to_use = s3_bucketkey + '/' + filename
    else:
        key_to_use = filename

    outputfile = s3_bucketname + '/' + key_to_use

    s3 = S3FileSystem()
    pq.write_to_dataset(table=table, root_path=outputfile, filesystem=s3)
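One way to address the TODO above is to skip the local Parquet file entirely and stream the table into the bucket through an s3fs file object; a minimal sketch with placeholder bucket and key names (write_df_directly_to_s3 is a hypothetical helper, not part of the original code):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from s3fs import S3FileSystem

def write_df_directly_to_s3(df: pd.DataFrame, s3_bucketname: str, key: str):
    # Convert the DataFrame and write the Parquet bytes straight into S3,
    # without touching the local working directory.
    table = pa.Table.from_pandas(df)
    s3 = S3FileSystem()
    with s3.open(f'{s3_bucketname}/{key}', 'wb') as f:
        pq.write_table(table, f)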
Example No. 10
    def path_exists(self, path):
        if 's3://' in path:
            path_in_s3 = path.replace("s3://", "")

            return S3FileSystem(anon=False).exists(path_in_s3)
        else:
            return os.path.exists(path)
Example No. 11
 def s3_connect(self):
     """
     Create a boto3 session and S3 resource plus an s3fs filesystem, using the default AWS credentials.
     """
     session = boto3.Session()
     self.s3_conn = session.resource("s3")
     self.s3_fs = S3FileSystem()
Example No. 12
 def s3_service(self):
     try:
         return self._tls.s3_service
     except (AttributeError, KeyError):
         from s3fs import S3FileSystem
         self._tls.s3_service = S3FileSystem(**self.s3_args)
         return self._tls.s3_service
Example No. 13
def _get_s3(key=None, username=None, secret=None, password=None, **kwargs):
    """ Reuse ``s3`` instance or construct a new S3FileSystem from storage_options.

    >>> isinstance(_get_s3(), S3FileSystem)
    True
    >>> s3 = _get_s3(anon=False)
    >>> s3.anon
    False
    """
    if username is not None:
        if key is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `key` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        key = username
    if key is not None:
        kwargs['key'] = key
    if password is not None:
        if secret is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `secret` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        secret = password
    if secret is not None:
        kwargs['secret'] = secret
    return S3FileSystem(**kwargs)
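A brief usage sketch for _get_s3 above, with placeholder credentials: username/password are aliased onto key/secret, any extra keyword arguments are forwarded to S3FileSystem, and supplying both spellings of the same secret raises KeyError:

fs = _get_s3(username="AKIA...", password="...")  # equivalent to key=/secret=
anon_fs = _get_s3(anon=True)                      # extra kwargs go to S3FileSystem

try:
    _get_s3(key="AKIA...", username="AKIA...")
except KeyError as error:
    print(error)  # reports the secrets argument collision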
Example No. 14
    def __init__(self,
                 s3_file_system: s3fs.S3FileSystem,
                 dir_path: str,
                 zarr_kwargs: Dict[str, Any] = None,
                 ds_id: str = None,
                 chunk_cache_capacity: int = None,
                 exception_type: type = ValueError):

        level_paths = {}
        entries = s3_file_system.ls(dir_path, detail=False)
        for entry in entries:
            level_dir = entry.split("/")[-1]
            basename, ext = os.path.splitext(level_dir)
            if basename.isdigit():
                level = int(basename)
                if entry.endswith(".zarr") and s3_file_system.isdir(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)
                elif entry.endswith(".link") and s3_file_system.isfile(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)

        num_levels = len(level_paths)
        # Consistency check
        for level in range(num_levels):
            if level not in level_paths:
                raise exception_type(
                    f"Invalid multi-level dataset {ds_id!r}: missing level {level} in {dir_path}"
                )

        super().__init__(ds_id=ds_id, parameters=zarr_kwargs)
        self._s3_file_system = s3_file_system
        self._dir_path = dir_path
        self._level_paths = level_paths
        self._num_levels = num_levels

        self._chunk_cache_capacities = None
        if chunk_cache_capacity:
            # Allocate the cache so that each finer level gets four times the
            # capacity of the next coarser one.
            weights = []
            weight_sum = 0
            for level in range(num_levels):
                weight = 2**(num_levels - 1 - level)
                weight *= weight
                weight_sum += weight
                weights.append(weight)
            self._chunk_cache_capacities = [
                round(chunk_cache_capacity * weight / weight_sum)
                for weight in weights
            ]
Example No. 15
 def write_run_config_to_s3(self, config_string):
     s3_key = self._project_parameters.compile_path({}, 'run_config',
                                                    'toml')
     s3_path = Path(s3_key)
     backup_path = Path(*s3_path.parts[:-1],
                        f'run_config_until_{datetime.now()}.toml')
     with S3FileSystem().open(
             f'{self._aws_parameters.s3_config_bucket}/{backup_path}',
             'wb') as f:
         f.write(toml.dumps(self._run_config).encode('utf-8'))
     with S3FileSystem().open(
             f'{self._aws_parameters.s3_config_bucket}/{s3_key}',
             'wb') as f:
         f.write(config_string.encode('utf-8'))
     message = f'New config written to {self._aws_parameters.s3_config_bucket}/{s3_key}'
     print(message)
     return message
Example No. 16
 def run(self):
     table = pq.read_table(self.input().path)
     pq.write_to_dataset(
         table,
         root_path='s3://aws-meetup-almaty/yellow-taxi-ds',
         partition_cols=['pickup_date'],
         filesystem=S3FileSystem(),
     )
Example No. 17
File: s3.py Project: fortizc/dask
 def __init__(self, key=None, username=None, secret=None, password=None,
              path=None, host=None, s3=None, **kwargs):
     if username is not None:
         if key is not None:
             raise KeyError("S3 storage options got secrets argument "
                            "collision. Please, use either `key` "
                            "storage option or password field in URLpath, "
                            "not both options together.")
         key = username
     if key is not None:
         kwargs['key'] = key
     if password is not None:
         if secret is not None:
             raise KeyError("S3 storage options got secrets argument "
                            "collision. Please, use either `secret` "
                            "storage option or password field in URLpath, "
                            "not both options together.")
         secret = password
     if secret is not None:
         kwargs['secret'] = secret
      # S3FileSystem.__init__(self, kwargs)  # not sure what to do here
     S3FileSystem.__init__(self, **kwargs)
Example No. 18
File: s3.py Project: fortizc/dask
 def open(self, path, mode='rb'):
     s3_path = self._trim_filename(path)
     f = S3FileSystem.open(self, s3_path, mode=mode)
     return f
Example No. 19
File: s3.py Project: gameduell/dask
 def open(self, path, mode='rb', **kwargs):
     bucket = kwargs.pop('host', '')
     s3_path = bucket + path
     return S3FileSystem.open(self, s3_path, mode=mode)
Example No. 20
File: s3.py Project: gameduell/dask
 def glob(self, path, **kwargs):
     bucket = kwargs.pop('host', '')
     s3_path = bucket + path
     return S3FileSystem.glob(self, s3_path)
Example No. 21
File: s3.py Project: fortizc/dask
 def glob(self, path):
     s3_path = self._trim_filename(path)
     return ['s3://%s' % s for s in S3FileSystem.glob(self, s3_path)]