def test_default_pars(s3):
    s3 = S3FileSystem(default_block_size=20, default_fill_cache=False,
                      client_kwargs={'endpoint_url': endpoint_uri})
    fn = test_bucket_name + '/' + list(files)[0]
    with s3.open(fn) as f:
        assert f.blocksize == 20
        assert f.fill_cache is False
    with s3.open(fn, block_size=40, fill_cache=True) as f:
        assert f.blocksize == 40
        assert f.fill_cache is True
def test_write_large_secure(s3):
    # build our own s3fs with the relevant additional kwarg
    s3 = S3FileSystem(s3_additional_kwargs={'ServerSideEncryption': 'AES256'},
                      client_kwargs={'endpoint_url': endpoint_uri})
    s3.mkdir('mybucket')
    with s3.open('mybucket/myfile', 'wb') as f:
        f.write(b'hello hello' * 10 ** 6)
    assert s3.cat('mybucket/myfile') == b'hello hello' * 10 ** 6
def test_config_kwargs_class_attributes_override():
    s3 = S3FileSystem(
        config_kwargs={
            "connect_timeout": 60,
            "read_timeout": 120,
        },
        client_kwargs={'endpoint_url': endpoint_uri},
    )
    assert s3.connect().meta.config.connect_timeout == 60
    assert s3.connect().meta.config.read_timeout == 120
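# For comparison: config_kwargs is handed to botocore's client configuration.
# A minimal sketch constructing the same settings directly with
# botocore.config.Config; this is illustrative, not code from the original
# source.
from botocore.config import Config

cfg = Config(connect_timeout=60, read_timeout=120)
assert cfg.connect_timeout == 60
assert cfg.read_timeout == 120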
def check_if_file_exists(uri: str) -> bool:
    print("Now checking if file exists at {}".format(uri))
    s3_filesystem = S3FileSystem(anon=False)
    exists = s3_filesystem.exists(uri)
    print(exists)
    return exists
def test_list_versions_many(s3):
    # moto doesn't actually behave the same way s3 does here, so this
    # doesn't really test anything in moto 1.2
    s3 = S3FileSystem(anon=False, version_aware=True)
    versioned_file = versioned_bucket_name + '/versioned_file2'
    for i in range(1200):
        with s3.open(versioned_file, 'wb') as fo:
            fo.write(b'1')
    versions = s3.object_version_info(versioned_file)
    assert len(versions) == 1200
def test_mkdir_client_region_name():
    bucket = 'test1_bucket'
    m = moto.mock_s3()
    try:
        m.start()
        s3 = S3FileSystem(anon=False,
                          client_kwargs={"region_name": "eu-central-1"})
        s3.mkdir(bucket)
        assert bucket in s3.ls('/')
    finally:
        m.stop()
def __init__(
    self,
    filepath: str,
    bucket_name: str = None,
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
    s3fs_args: Dict[str, Any] = None,
) -> None:
    """Creates a new instance of ``ParquetS3DataSet`` pointing to a concrete
    parquet file on S3.

    Args:
        filepath: Path to a parquet file, parquet collection or the directory
            of a multipart parquet. May contain the full path in S3 including
            bucket and protocol, e.g. `s3://bucket-name/path/to/file.parquet`.
        bucket_name: S3 bucket name. Must be specified **only** if not
            present in ``filepath``.
        credentials: Credentials to access the S3 bucket, such as
            ``aws_access_key_id``, ``aws_secret_access_key``.
        load_args: Additional loading options for `pyarrow`:
            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
            or `fastparquet`:
            https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile.to_pandas
        save_args: Additional saving options for `pyarrow`:
            https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas
            or `fastparquet`:
            https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write
        version: If specified, should be an instance of
            ``kedro.io.core.Version``. If its ``load`` attribute is None, the
            latest version will be loaded. If its ``save`` attribute is None,
            the save version will be autogenerated.
        s3fs_args: S3FileSystem options. You can see all available arguments at:
            https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem
    """
    _credentials = copy.deepcopy(credentials) or {}
    _s3fs_args = copy.deepcopy(s3fs_args) or {}
    _s3 = S3FileSystem(client_kwargs=_credentials, **_s3fs_args)
    path = _s3._strip_protocol(filepath)  # pylint: disable=protected-access
    path = PurePosixPath(
        "{}/{}".format(bucket_name, path) if bucket_name else path)
    super().__init__(
        load_args=load_args,
        save_args=save_args,
        filepath=path,
        version=version,
        exists_function=_s3.exists,
        glob_function=_s3.glob,
    )
    self._s3 = _s3
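# A hypothetical construction sketch for the data set above; the bucket, key
# and credential values are placeholders, and load() behaviour follows the
# usual kedro data set convention rather than code shown here.
data_set = ParquetS3DataSet(
    filepath="s3://my-bucket/data/trips.parquet",
    credentials={
        "aws_access_key_id": "<key>",
        "aws_secret_access_key": "<secret>",
    },
)
df = data_set.load()  # reads the parquet file into a pandas DataFrame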
def __init__(self, verbose=False):
    self.s3 = S3FileSystem(anon=False)
    track_detail = os.path.join(data.__path__._path[0], 'track_detail.csv')
    dftrack = read_csv(track_detail)
    self.map_track_gbs_to_x8 = dftrack.set_index(
        'gbs_track_sym')['x8_track_sym'].to_dict()
    self.map_track_x8_to_gbs = dftrack.set_index(
        'x8_track_sym')['gbs_track_sym'].to_dict()
    self.verbose = verbose
def __init__(self, verbose=False):
    self.s3 = S3FileSystem(anon=False)
    self.br = BetResult()
    self.df = DataFrame()
    self.df_electronic = DataFrame()
    self.dfraw = DataFrame()
    # for normalizing track symbols
    track_detail = os.path.join(data.__path__._path[0], 'track_detail.csv')
    self.dftrack = read_csv(track_detail)
    self.verbose = verbose
def save_model(model, model_id, acc, model_type="RandomForestClassifier",
               name="no_show_model"):
    """Take an sklearn model, serialize it and store it on S3 for later use."""
    model_path = build_model_path(model_id, name)
    s3 = S3FileSystem(anon=False)
    with s3.open(model_path, mode="wb") as f:
        f.write(pickle.dumps(model))
    # Return a model record:
    return model_record(
        model_id=model_id,
        acc=acc,
        model_type=model_type,
        name=name,
    )
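# A hypothetical load-side counterpart to save_model, reusing the
# build_model_path helper referenced above; a sketch of the inverse
# operation, not code from the original module.
def load_model(model_id, name="no_show_model"):
    """Fetch a previously saved model from S3 and deserialize it."""
    model_path = build_model_path(model_id, name)
    s3 = S3FileSystem(anon=False)
    with s3.open(model_path, mode="rb") as f:
        return pickle.loads(f.read())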
def __init__(
    self,
    filepath: str,
    bucket_name: str,
    credentials: Optional[Dict[str, Any]] = None,
    load_args: Optional[Dict[str, Any]] = None,
    save_args: Optional[Dict[str, Any]] = None,
    version: Version = None,
) -> None:
    """Creates a new instance of ``PickleS3DataSet`` pointing to a concrete
    file on S3. ``PickleS3DataSet`` uses the pickle backend to serialise
    objects to disk:

    pickle.dumps: https://docs.python.org/3/library/pickle.html#pickle.dumps

    and to load serialised objects into memory:

    pickle.loads: https://docs.python.org/3/library/pickle.html#pickle.loads

    Args:
        filepath: Path to a pkl file.
        bucket_name: S3 bucket name.
        credentials: Credentials to access the S3 bucket, such as
            ``aws_access_key_id``, ``aws_secret_access_key``.
        load_args: Options for loading pickle files. Refer to the help
            file of ``pickle.loads`` for options.
        save_args: Options for saving pickle files. Refer to the help
            file of ``pickle.dumps`` for options.
        version: If specified, should be an instance of
            ``kedro.io.core.Version``. If its ``load`` attribute is None, the
            latest version will be loaded. If its ``save`` attribute is None,
            the save version will be autogenerated.
    """
    _credentials = copy.deepcopy(credentials) or {}
    _s3 = S3FileSystem(client_kwargs=_credentials)
    super().__init__(
        PurePosixPath("{}/{}".format(bucket_name, filepath)),
        version,
        exists_function=_s3.exists,
        glob_function=_s3.glob,
    )
    self._bucket_name = bucket_name
    self._credentials = _credentials

    # Handle default load and save arguments
    self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
    if load_args is not None:
        self._load_args.update(load_args)
    self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
    if save_args is not None:
        self._save_args.update(save_args)

    self._s3 = _s3
def test_default_cache_type(s3, default_cache_type):
    data = b'a' * (10 * 2 ** 20)
    s3 = S3FileSystem(anon=False, default_cache_type=default_cache_type)
    with s3.open(a, 'wb') as f:
        f.write(data)
    with s3.open(a, 'rb') as f:
        assert isinstance(f.cache, fsspec.core.caches[default_cache_type])
        out = f.read(len(data))
        assert len(data) == len(out)
        assert out == data
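# The registry indexed above is fsspec's mapping of cache names to cache
# classes. A small inspection sketch; the exact keys (and whether the dict
# is re-exported as fsspec.core.caches) depend on the installed fsspec
# version, but names like "bytes", "mmap" and "readahead" are typical.
import fsspec.core

print(sorted(fsspec.core.caches))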
def test_exists_versioned(s3, version_aware):
    """Test to ensure that a prefix exists when using a versioned bucket."""
    import uuid
    n = 3
    s3 = S3FileSystem(anon=False, version_aware=version_aware)
    segments = [versioned_bucket_name] + [str(uuid.uuid4()) for _ in range(n)]
    path = '/'.join(segments)
    for i in range(2, n + 1):
        assert not s3.exists('/'.join(segments[:i]))
    s3.touch(path)
    for i in range(2, n + 1):
        assert s3.exists('/'.join(segments[:i]))
def s3():
    # writable local S3 system
    m = moto.mock_s3()
    m.start()
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # initialize secure bucket
    client.create_bucket(Bucket=secure_bucket_name, ACL='public-read')
    policy = json.dumps({
        "Version": "2012-10-17",
        "Id": "PutObjPolicy",
        "Statement": [
            {
                "Sid": "DenyUnEncryptedObjectUploads",
                "Effect": "Deny",
                "Principal": "*",
                "Action": "s3:PutObject",
                "Resource": "arn:aws:s3:::{bucket_name}/*".format(
                    bucket_name=secure_bucket_name),
                "Condition": {
                    "StringNotEquals": {
                        "s3:x-amz-server-side-encryption": "aws:kms"
                    }
                }
            }
        ]
    })
    client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy)

    for k in [a, b, c, d]:
        try:
            client.delete_object(Bucket=test_bucket_name, Key=k)
        except Exception:
            pass
    for flist in [files, csv_files, text_files]:
        for f, data in flist.items():
            client.put_object(Bucket=test_bucket_name, Key=f, Body=data)

    yield S3FileSystem(anon=False)

    for flist in [files, csv_files, text_files]:
        for f in flist:
            try:
                client.delete_object(Bucket=test_bucket_name, Key=f)
                client.delete_object(Bucket=secure_bucket_name, Key=f)
            except Exception:
                pass
    for k in [a, b, c, d]:
        try:
            client.delete_object(Bucket=test_bucket_name, Key=k)
            client.delete_object(Bucket=secure_bucket_name, Key=k)
        except Exception:
            pass
    m.stop()
def get_s3_data(bucket, key):
    df = None
    # Try reading csv from S3 file system
    try:
        s3 = S3FileSystem(anon=False)
        df = pd.read_csv(s3.open('{}/{}'.format(bucket, key), mode='rb'))
        print(df)
    except Exception as e:
        print(e)
    return df
def _determine_file_system(self, filename):
    """Determines the file system for a given filename.

    Args:
        filename: filename to determine the file system for

    Returns:
        an S3 file system if the filename is an s3:// URI, else None
    """
    if filename.startswith("s3://"):
        s3 = S3FileSystem(anon=False, profile_name=self.profile_name)
        return s3
    return None
def test_write_large_secure(s3):
    mock = moto.mock_s3()
    mock.start()

    # build our own s3fs with the relevant additional kwarg
    s3 = S3FileSystem(s3_additional_kwargs={'ServerSideEncryption': 'AES256'})
    s3.mkdir('mybucket')

    with s3.open('mybucket/myfile', 'wb') as f:
        f.write(b'hello hello' * 10 ** 6)

    assert s3.cat('mybucket/myfile') == b'hello hello' * 10 ** 6
def getScaData(item_name, bucket_name="script.control.tool"):
    """Gets and cleans the SCA data when there is no user data input."""
    s3 = S3FileSystem()
    df = np.load(s3.open("{}/{}".format(bucket_name, item_name)))
    # sample the first profile for examples: quarter-hour timestamps
    # (0.25 * 0..95) paired with the first row of the loaded array
    sca_load = (0.25 * np.arange(0, 96), df[0, :])
    data_final = []
    for x in range(len(sca_load[0])):
        row = {"load": str(sca_load[1][x]), "time": str(sca_load[0][x])}
        data_final.append(row)
    return data_final
def get_data(csvCols, whichData):
    s3 = S3FileSystem(anon=True)
    if whichData == "training":
        path = f's3://{S3_BUCKET}/{TRAINING_FILE_NAME}'
    elif whichData == "scoring":
        path = f's3://{S3_BUCKET}/{SCORING_FILE_NAME}'
    # PUT ERROR CATCHING HERE FOR ERRORS IN INPUT FILES

    # Read in .csv file, but only for specified columns.
    df = pd.read_csv(s3.open(path, mode='rb'), usecols=csvCols)
    for c in csvCols:
        if df[c].dtype == 'object':
            df = df[df[c].str.match(" ") == False]
    return df
def __init__(
    self,
    filepath: str,
    bucket_name: str,
    credentials: Optional[Dict[str, Any]] = None,
    load_args: Optional[Dict[str, Any]] = None,
    save_args: Optional[Dict[str, Any]] = None,
    version: Version = None,
) -> None:
    """Creates a new instance of ``CSVS3DataSet`` pointing to a concrete
    csv file on S3.

    Args:
        filepath: Path to a csv file.
        bucket_name: S3 bucket name.
        credentials: Credentials to access the S3 bucket, such as
            ``aws_access_key_id``, ``aws_secret_access_key``.
        load_args: Pandas options for loading csv files. Here you can find
            all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
            All defaults are preserved.
        save_args: Pandas options for saving csv files. Here you can find
            all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
            All defaults are preserved, except "index", which is set to False.
        version: If specified, should be an instance of
            ``kedro.io.core.Version``. If its ``load`` attribute is None, the
            latest version will be loaded. If its ``save`` attribute is None,
            the save version will be autogenerated.
    """
    _credentials = copy.deepcopy(credentials) or {}
    _s3 = S3FileSystem(client_kwargs=_credentials)
    super().__init__(
        PurePosixPath("{}/{}".format(bucket_name, filepath)),
        version,
        exists_function=_s3.exists,
        glob_function=_s3.glob,
    )
    self._bucket_name = bucket_name
    self._credentials = _credentials

    # Handle default load and save arguments
    self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
    if load_args is not None:
        self._load_args.update(load_args)
    self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
    if save_args is not None:
        self._save_args.update(save_args)

    self._s3 = _s3
def test_change_defaults_only_subsequent(monkeypatch):
    """Test for Issue #135

    Ensure that changing the default block size doesn't affect existing file
    systems that were created using that default. It should only affect file
    systems created after the change.
    """
    fs_default = S3FileSystem()
    assert fs_default.default_block_size == 5 * (1024 ** 2)

    fs_overridden = S3FileSystem(default_block_size=64 * (1024 ** 2))
    assert fs_overridden.default_block_size == 64 * (1024 ** 2)

    # Suppose I want all subsequent file systems to have a block size of
    # 1 GiB instead of 5 MiB:
    monkeypatch.setattr(S3FileSystem, 'default_block_size', 1024 ** 3)

    fs_big = S3FileSystem()
    assert fs_big.default_block_size == 1024 ** 3

    # Test the other file systems created to see if their block sizes changed
    assert fs_overridden.default_block_size == 64 * (1024 ** 2)
    assert fs_default.default_block_size == 5 * (1024 ** 2)
def s3fs_open(self, path, mode):
    from s3fs.core import S3FileSystem
    endpoint_url = os.environ.get('S3_ENDPOINT_URL')
    client_kwargs = {}
    if endpoint_url:
        client_kwargs = {'endpoint_url': endpoint_url}
    if 'r' in mode:
        self.wait_for_path(path)
    s3 = S3FileSystem(anon=False, default_fill_cache=False,
                      client_kwargs=client_kwargs)
    return s3.open(path, mode=mode)
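# Hypothetical call site for the helper above: `obj` stands in for whatever
# object carries s3fs_open, and the endpoint/bucket values are placeholders.
# Setting S3_ENDPOINT_URL routes traffic to a local server (e.g. moto or
# MinIO); leaving it unset targets AWS.
os.environ['S3_ENDPOINT_URL'] = 'http://localhost:5555'
with obj.s3fs_open('mybucket/data.bin', 'rb') as f:
    payload = f.read()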
def save_predictions(engine, location='Cassandra', predictions=None, log=None):
    if location == 'S3':
        predictions = predictions[
            ['appointment_id', 'patient_id', 'appointment_day',
             'no_show_likelihood']
        ].sort_values(by='no_show_likelihood', ascending=False)
        bytes_to_write = predictions.to_csv(None, index=False).encode()
        fs = S3FileSystem(key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)
        with fs.open(f"s3://{S3_BUCKET}/scores/latest_scores.csv", 'wb') as f:
            f.write(bytes_to_write)
    else:
        predictions = predictions.to_dict(orient='records')
        print(predictions[:3], flush=True)
        for batch in batches(predictions, 200):
            res = engine.save(PREDICTION_SCHEMA, batch).result()
            log.debug(res)
def __init__(
    self,
    filepath: str,
    bucket_name: str,
    credentials: Optional[Dict[str, Any]] = None,
    load_args: Optional[Dict[str, Any]] = None,
    save_args: Optional[Dict[str, Any]] = None,
    version: Version = None,
) -> None:
    """Creates a new instance of ``PickleS3DataSet`` pointing to a concrete
    file on S3. ``PickleS3DataSet`` uses the pickle backend to serialise
    objects to disk:

    pickle.dumps: https://docs.python.org/3/library/pickle.html#pickle.dumps

    and to load serialised objects into memory:

    pickle.loads: https://docs.python.org/3/library/pickle.html#pickle.loads

    Args:
        filepath: Path to a pkl file.
        bucket_name: S3 bucket name.
        credentials: Credentials to access the S3 bucket, such as
            ``aws_access_key_id``, ``aws_secret_access_key``.
        load_args: Options for loading pickle files. Refer to the help
            file of ``pickle.loads`` for options.
        save_args: Options for saving pickle files. Refer to the help
            file of ``pickle.dumps`` for options.
        version: If specified, should be an instance of
            ``kedro.io.core.Version``. If its ``load`` attribute is None, the
            latest version will be loaded. If its ``save`` attribute is None,
            the save version will be autogenerated.
    """
    default_load_args = {}
    default_save_args = {}
    self._filepath = filepath
    self._bucket_name = bucket_name
    self._credentials = credentials if credentials else {}
    self._version = version
    self._load_args = (
        {**default_load_args, **load_args}
        if load_args is not None else default_load_args
    )
    self._save_args = (
        {**default_save_args, **save_args}
        if save_args is not None else default_save_args
    )
    self._s3 = S3FileSystem(client_kwargs=self._credentials)
def __init__(self, verbose=False):
    self.s3 = S3FileSystem(anon=False)
    self.df = DataFrame()
    self.dfraw = DataFrame()
    self.df_scratch = DataFrame()
    self.df_result_matrix = DataFrame()
    self.df_payout = DataFrame()
    # track detail now comes exclusively from the file in git
    # (horse/betsim/data/track_detail.csv) instead of a path relative to
    # where the data is being loaded
    track_detail = os.path.join(data.__path__._path[0], 'track_detail.csv')
    self.dftrack = read_csv(track_detail)
    self.map_track_chart_to_x8 = self.dftrack.set_index(
        'chart_file_sym')['x8_track_sym'].to_dict()
    self.map_track_x8_to_chart = self.dftrack.set_index(
        'x8_track_sym')['chart_file_sym'].to_dict()
    self.verbose = verbose
def test_versions_unaware(s3):
    versioned_file = versioned_bucket_name + '/versioned_file3'
    s3 = S3FileSystem(anon=False, version_aware=False)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')
    with s3.open(versioned_file) as fo:
        assert fo.version_id is None
        assert fo.read() == b'2'
    with pytest.raises(ValueError):
        with s3.open(versioned_file, version_id='0'):
            fo.read()
async def _():
    s3 = S3FileSystem(anon=False, asynchronous=True,
                      client_kwargs={"region_name": "eu-central-1",
                                     "endpoint_url": endpoint_uri})
    fn = test_bucket_name + "/nested/file1"
    data = b"hello\n"

    # Fails because client creation has not yet been awaited
    with pytest.raises(RuntimeError):
        await s3._cat_file(fn)

    await s3.connect()  # creates client

    assert await s3._cat_file(fn) == data
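# The coroutine above is only defined, never awaited; a minimal driver
# sketch, assuming no event loop is already running in the caller.
import asyncio

asyncio.run(_())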
def test_read_uncached(create_main_file):
    fs = S3PrefetchFileSystem()
    s3_path = str(create_main_file)

    with fs.open(s3_path, "rb", block_size=BLOCK_SIZE,
                 prefetch_storage=list(CACHES.items())) as f:
        data = f.read()

    fs = S3FileSystem()
    with fs.open(s3_path, "rb") as f:
        actual_data = f.read()

    assert data == actual_data
    cleanup(os.path.basename(s3_path))
def lambda_handler(event, context):
    if event['data-source'] == 'json-payload':
        success_put_count = 0
        for row in event['data']:
            row['UnitPrice'] = str(row['UnitPrice'])
            try:
                table.put_item(Item=cast_to_decimal(row))
                success_put_count += 1
            except ClientError:
                pass
        total_records = len(event['data'])
        if success_put_count > 0:
            message = (f'Success: Inserted {success_put_count} of '
                       f'{total_records} records of json payload to DynamoDB')
        else:
            message = 'Fail: No records were inserted'
        return {'statusCode': 200, 'body': message}
    elif event['data-source'] == 's3' and 's3-path' in event:
        o = urlparse(event['s3-path'])
        bucket = o.netloc
        filepath = o.path.lstrip('/')
        s3 = S3FileSystem(anon=False)
        df = pd.read_csv(
            s3.open(event['s3-path'], mode='rb'),
            dtype={'InvoiceNo': str, 'UnitPrice': str},
            converters={'CustomerID': lambda id: str(int(float(id)))},
        )
        success_put_count = df.apply(insert_to_table, axis=1).sum()
        s3.cp(event['s3-path'],
              's3://' + bucket + '/processed' + filepath[filepath.rfind('/'):])
        s3.rm(event['s3-path'])
        s3_path = event['s3-path']
        if success_put_count > 0:
            message = (f'Success: Inserted {success_put_count} of '
                       f'{df.shape[0]} records of data from S3 path '
                       f'{s3_path} to DynamoDB')
        else:
            message = 'Fail: No records were inserted'
        return {'statusCode': 200, 'body': message}
    else:
        return {'statusCode': 200, 'body': 'Error: data not valid'}
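# Hypothetical invocation payloads for the handler above. Field names are
# taken from the branches in the code; the values are placeholders.
json_event = {
    'data-source': 'json-payload',
    'data': [
        {'InvoiceNo': '536365', 'UnitPrice': 2.55, 'CustomerID': '17850'},
    ],
}
s3_event = {
    'data-source': 's3',
    's3-path': 's3://my-bucket/incoming/invoices.csv',
}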
def save_scores(ska, scoring, location):
    # Save to Cassandra
    if location == "both" or location == "cassandra":
        # Convert scoring data to a list of objects
        scores = scoring.to_dict(orient='records')
        # Save to Cassandra
        ska.log("Saving to Cassandra", level=logging.INFO)
        ska.engine.save(SCORING_SCHEMA, scores).result()
        ska.log("Saving to Cassandra", labels=["S3saving"], level=logging.INFO)

    # Save to S3
    if location == "both" or location == "S3":
        bytes_to_write = scoring.to_csv(None, index=False).encode()
        fs = S3FileSystem(key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)
        with fs.open(f"s3://{S3_PRIVATE_BUCKET}/{CHURN_MODEL_SCORES}", 'wb') as f:
            f.write(bytes_to_write)
        ska.log("Saving to S3", labels=["S3saving"], level=logging.INFO)
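# A hypothetical read-back sketch for the scores written above, reusing the
# same module-level credentials and bucket constants; pandas is assumed to
# be available as pd, as in the other snippets.
fs = S3FileSystem(key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)
with fs.open(f"s3://{S3_PRIVATE_BUCKET}/{CHURN_MODEL_SCORES}", 'rb') as f:
    latest_scores = pd.read_csv(f)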
def test_current(s3):
    assert S3FileSystem.current() is s3