Example #1
def test_default_pars(s3):
    s3 = S3FileSystem(default_block_size=20, default_fill_cache=False,
                      client_kwargs={'endpoint_url': endpoint_uri})
    fn = test_bucket_name + '/' + list(files)[0]
    with s3.open(fn) as f:
        assert f.blocksize == 20
        assert f.fill_cache is False
    with s3.open(fn, block_size=40, fill_cache=True) as f:
        assert f.blocksize == 40
        assert f.fill_cache is True
Example #2
def test_write_large_secure(s3):
    # build our own s3fs with the relevant additional kwarg
    s3 = S3FileSystem(s3_additional_kwargs={'ServerSideEncryption': 'AES256'},
                      client_kwargs={'endpoint_url': endpoint_uri})
    s3.mkdir('mybucket')

    with s3.open('mybucket/myfile', 'wb') as f:
        f.write(b'hello hello' * 10 ** 6)

    assert s3.cat('mybucket/myfile') == b'hello hello' * 10 ** 6
Example #3
def test_config_kwargs_class_attributes_override():
    s3 = S3FileSystem(
        config_kwargs={
            "connect_timeout": 60,
            "read_timeout": 120,
        },
        client_kwargs={'endpoint_url': endpoint_uri}
    )
    assert s3.connect().meta.config.connect_timeout == 60
    assert s3.connect().meta.config.read_timeout == 120
Example #4
def check_if_file_exists(uri: str) -> bool:

    print("Now checking if file exists at {}".format(uri))

    s3_filesystem = S3FileSystem(anon=False)
    exists = s3_filesystem.exists(uri)

    print(exists)

    return exists
Example #5
def test_list_versions_many(s3):
    # moto 1.2 does not reproduce real S3 versioning behaviour here, so this
    # test exercises the code path without verifying much against moto itself
    s3 = S3FileSystem(anon=False, version_aware=True)
    versioned_file = versioned_bucket_name + '/versioned_file2'
    for i in range(1200):
        with s3.open(versioned_file, 'wb') as fo:
            fo.write(b'1')
    versions = s3.object_version_info(versioned_file)
    assert len(versions) == 1200
Example #6
def test_mkdir_client_region_name():
    bucket = 'test1_bucket'
    try:
        m = moto.mock_s3()
        m.start()
        s3 = S3FileSystem(anon=False,
                          client_kwargs={"region_name": "eu-central-1"})
        s3.mkdir(bucket)
        assert bucket in s3.ls('/')
    finally:
        m.stop()
Example #7
    def __init__(
        self,
        filepath: str,
        bucket_name: str = None,
        credentials: Dict[str, Any] = None,
        load_args: Dict[str, Any] = None,
        save_args: Dict[str, Any] = None,
        version: Version = None,
        s3fs_args: Dict[str, Any] = None,
    ) -> None:
        """Creates a new instance of ``ParquetS3DataSet`` pointing to a concrete
        parquet file on S3.

        Args:
            filepath: Path to a parquet file, parquet collection or the directory
                of a multipart parquet. May contain the full path in S3 including
                bucket and protocol, e.g. `s3://bucket-name/path/to/file.parquet`.
            bucket_name: S3 bucket name. Must be specified **only** if not
                present in ``filepath``.
            credentials: Credentials to access the S3 bucket, such as
                ``aws_access_key_id``, ``aws_secret_access_key``.
            load_args: Additional loading options for `pyarrow`:
                https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
                or `fastparquet`:
                https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile.to_pandas
            save_args: Additional saving options for `pyarrow`:
                https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas
                or `fastparquet`:
                https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.
            s3fs_args: S3FileSystem options. You can see all available arguments at:
                https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem
        """

        _credentials = copy.deepcopy(credentials) or {}
        _s3fs_args = copy.deepcopy(s3fs_args) or {}
        _s3 = S3FileSystem(client_kwargs=_credentials, **_s3fs_args)
        path = _s3._strip_protocol(filepath)  # pylint: disable=protected-access
        path = PurePosixPath(
            "{}/{}".format(bucket_name, path) if bucket_name else path)

        super().__init__(
            load_args=load_args,
            save_args=save_args,
            filepath=path,
            version=version,
            exists_function=_s3.exists,
            glob_function=_s3.glob,
        )

        self._s3 = _s3
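A minimal instantiation sketch for the constructor above. The bucket, credential and argument values are placeholders, and the load_args/s3fs_args shown are only assumptions about what the underlying parquet reader and S3FileSystem accept:

# Hypothetical usage of ParquetS3DataSet; all values below are placeholders.
data_set = ParquetS3DataSet(
    filepath="s3://my-bucket/path/to/file.parquet",
    credentials={"aws_access_key_id": "KEY", "aws_secret_access_key": "SECRET"},
    load_args={"columns": ["feature_a", "feature_b"]},  # forwarded to pyarrow/fastparquet
    s3fs_args={"use_ssl": True},  # extra S3FileSystem options
)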
Example #8
    def __init__(self, verbose=False):
        self.s3 = S3FileSystem(anon=False)

        track_detail = os.path.join(data.__path__._path[0], 'track_detail.csv')
        dftrack = read_csv(track_detail)
        self.map_track_gbs_to_x8 = dftrack.set_index(
            'gbs_track_sym')['x8_track_sym'].to_dict()
        self.map_track_x8_to_gbs = dftrack.set_index(
            'x8_track_sym')['gbs_track_sym'].to_dict()

        self.verbose = verbose
Example #9
    def __init__(self, verbose=False):
        self.s3 = S3FileSystem(anon=False)
        self.br = BetResult()

        self.df = DataFrame()
        self.df_electronic = DataFrame()
        self.dfraw = DataFrame()
        # for normalizing track symbols
        track_detail = os.path.join(data.__path__._path[0], 'track_detail.csv')
        self.dftrack = read_csv(track_detail)
        self.verbose = verbose
Example #10
def save_model(model, model_id, acc, model_type="RandomForestClassifier", name="no_show_model"):
  """Take an sklearn model, serialize it and store on s3 for later use"""
  model_path = build_model_path(model_id, name)
  s3 = S3FileSystem(anon=False)
  with s3.open(model_path, mode="wb") as f:
    f.write(pickle.dumps(model))

  # Return a model record:
  return model_record(
      model_id=model_id, acc=acc, model_type=model_type, name=name
  )
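A complementary sketch for reading a stored model back; it reuses the build_model_path helper from save_model above and is a hypothetical counterpart, not part of the original module:

import pickle
from s3fs import S3FileSystem

def load_model(model_id, name="no_show_model"):
  """Hypothetical counterpart to save_model: fetch and unpickle a stored model."""
  model_path = build_model_path(model_id, name)
  s3 = S3FileSystem(anon=False)
  with s3.open(model_path, mode="rb") as f:
    return pickle.loads(f.read())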
Example #11
    def __init__(
        self,
        filepath: str,
        bucket_name: str,
        credentials: Optional[Dict[str, Any]] = None,
        load_args: Optional[Dict[str, Any]] = None,
        save_args: Optional[Dict[str, Any]] = None,
        version: Version = None,
    ) -> None:
        """Creates a new instance of ``PickleS3DataSet`` pointing to a
        concrete file on S3. ``PickleS3DataSet`` uses pickle backend to
        serialise objects to disk:

        pickle.dumps: https://docs.python.org/3/library/pickle.html#pickle.dumps

        and to load serialised objects into memory:

        pickle.loads: https://docs.python.org/3/library/pickle.html#pickle.loads

        Args:
            filepath: path to a pkl file.
            bucket_name: S3 bucket name.
            credentials: Credentials to access the S3 bucket, such as
                ``aws_access_key_id``, ``aws_secret_access_key``.
            load_args: Options for loading pickle files. Refer to the help
                file of ``pickle.loads`` for options.
            save_args: Options for saving pickle files. Refer to the help
                file of ``pickle.dumps`` for options.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.
        """
        _credentials = deepcopy(credentials) or {}
        _s3 = S3FileSystem(client_kwargs=_credentials)
        super().__init__(
            PurePosixPath("{}/{}".format(bucket_name, filepath)),
            version,
            exists_function=_s3.exists,
            glob_function=_s3.glob,
        )
        self._bucket_name = bucket_name
        self._credentials = _credentials

        # Handle default load and save arguments
        self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
        if load_args is not None:
            self._load_args.update(load_args)
        self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
        if save_args is not None:
            self._save_args.update(save_args)

        self._s3 = _s3
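A hedged instantiation sketch for the PickleS3DataSet defined above; bucket, path and credential values are placeholders, and the save_args entry simply passes a standard option through to pickle.dumps:

import pickle

# Hypothetical usage; values are placeholders.
pickle_set = PickleS3DataSet(
    filepath="models/classifier.pkl",
    bucket_name="my-bucket",
    credentials={"aws_access_key_id": "KEY", "aws_secret_access_key": "SECRET"},
    save_args={"protocol": pickle.HIGHEST_PROTOCOL},  # passed through to pickle.dumps
)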
Example #12
def test_default_cache_type(s3, default_cache_type):
    data = b'a' * (10 * 2**20)
    s3 = S3FileSystem(anon=False, default_cache_type=default_cache_type)

    with s3.open(a, 'wb') as f:
        f.write(data)

    with s3.open(a, 'rb') as f:
        assert isinstance(f.cache, fsspec.core.caches[default_cache_type])
        out = f.read(len(data))
        assert len(data) == len(out)
        assert out == data
Example #13
def test_exists_versioned(s3, version_aware):
    """Test to ensure that a prefix exists when using a versioned bucket"""
    import uuid
    n = 3
    s3 = S3FileSystem(anon=False, version_aware=version_aware)
    segments = [versioned_bucket_name] + [str(uuid.uuid4()) for _ in range(n)]
    path = '/'.join(segments)
    for i in range(2, n + 1):
        assert not s3.exists('/'.join(segments[:i]))
    s3.touch(path)
    for i in range(2, n + 1):
        assert s3.exists('/'.join(segments[:i]))
Example #14
def s3():
    # writable local S3 system
    m = moto.mock_s3()
    m.start()
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # initialize secure bucket
    bucket = client.create_bucket(Bucket=secure_bucket_name, ACL='public-read')
    policy = json.dumps({
        "Version": "2012-10-17",
        "Id": "PutObjPolicy",
        "Statement": [
            {
                "Sid": "DenyUnEncryptedObjectUploads",
                "Effect": "Deny",
                "Principal": "*",
                "Action": "s3:PutObject",
                "Resource": "arn:aws:s3:::{bucket_name}/*".format(bucket_name=secure_bucket_name),
                "Condition": {
                    "StringNotEquals": {
                        "s3:x-amz-server-side-encryption": "aws:kms"
                    }
                }
            }
        ]
    })
    client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy)

    for k in [a, b, c, d]:
        try:
            client.delete_object(Bucket=test_bucket_name, Key=k)
        except:
            pass
    for flist in [files, csv_files, text_files]:
        for f, data in flist.items():
            client.put_object(Bucket=test_bucket_name, Key=f, Body=data)
    yield S3FileSystem(anon=False)
    for flist in [files, csv_files, text_files]:
        for f, data in flist.items():
            try:
                client.delete_object(Bucket=test_bucket_name, Key=f)
                client.delete_object(Bucket=secure_bucket_name, Key=f)
            except:
                pass
    for k in [a, b, c, d]:
        try:
            client.delete_object(Bucket=test_bucket_name, Key=k)
            client.delete_object(Bucket=secure_bucket_name, Key=k)
        except:
            pass
    m.stop()
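A hedged sketch of a test consuming the fixture above; the key and payload are placeholders, and only calls already used in this section (open, cat) are assumed:

def test_roundtrip(s3):
    # Hypothetical round-trip against the moto-backed fixture defined above.
    path = test_bucket_name + '/roundtrip.bin'
    with s3.open(path, 'wb') as f:
        f.write(b'payload')
    assert s3.cat(path) == b'payload'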
Example #15
def get_s3_data(bucket, key):
    df = None

    # Try reading csv from S3 file system
    try:
        s3 = S3FileSystem(anon=False)

        df = pd.read_csv(s3.open('{}/{}'.format(bucket, key), mode='rb'))
        print(df)
    except Exception as e:
        print(e)
    return df
Example #16
    def _determine_file_system(self, filename):
        """Determines the file system for a path.
        Args:
            filename: filename to determine the file system for
        Returns:
            an S3FileSystem instance if the path is on S3, else None
        """
        if filename.startswith("s3://"):
            s3 = S3FileSystem(anon=False, profile_name=self.profile_name)
            return s3
        else:
            return None
Example #17
def test_write_large_secure(s3):
    mock = moto.mock_s3()
    mock.start()

    # build our own s3fs with the relevant additional kwarg
    s3 = S3FileSystem(s3_additional_kwargs={'ServerSideEncryption': 'AES256'})
    s3.mkdir('mybucket')

    with s3.open('mybucket/myfile', 'wb') as f:
        f.write(b'hello hello' * 10**6)

    assert s3.cat('mybucket/myfile') == b'hello hello' * 10**6
Example #18
def getScaData(item_name, bucket_name="script.control.tool"):
    ''' gets and cleans the sca data when there is no user data input '''
    s3 = S3FileSystem()
    df = np.load(s3.open("{}/{}".format(bucket_name, item_name)))
    # sample first profile for examples
    sca_load = (0.25 * np.arange(0, 96), df[0, :])

    data_final = []
    for x in range(len(sca_load[0])):
        row = {"load": str(sca_load[1][x]), "time": str(sca_load[0][x])}
        data_final.append(row)

    return data_final
Example #19
def get_data(csvCols, whichData):
    s3 = S3FileSystem(anon=True)
    if whichData == "training":
        path = f's3://{S3_BUCKET}/{TRAINING_FILE_NAME}'
    elif whichData == "scoring":
        path = f's3://{S3_BUCKET}/{SCORING_FILE_NAME}'
    # PUT ERROR CATCHING HERE FOR ERRORS IN INPUT FILES
    # Read in .csv file, but only for specified columns.
    df = pd.read_csv(s3.open(f'{path}', mode='rb'), usecols=csvCols)
    for c in csvCols:
        if (df[c].dtype == 'object'):
            df = df[df[c].str.match(" ") == False]
    return df
Example #20
    def __init__(
        self,
        filepath: str,
        bucket_name: str,
        credentials: Optional[Dict[str, Any]] = None,
        load_args: Optional[Dict[str, Any]] = None,
        save_args: Optional[Dict[str, Any]] = None,
        version: Version = None,
    ) -> None:
        """Creates a new instance of ``CSVS3DataSet`` pointing to a concrete
        csv file on S3.

        Args:
            filepath: Path to a csv file.
            bucket_name: S3 bucket name.
            credentials: Credentials to access the S3 bucket, such as
                ``aws_access_key_id``, ``aws_secret_access_key``.
            load_args: Pandas options for loading csv files.
                Here you can find all available arguments:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
                All defaults are preserved.
            save_args: Pandas options for saving csv files.
                Here you can find all available arguments:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
                All defaults are preserved, except "index", which is set to False.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.

        """
        _credentials = copy.deepcopy(credentials) or {}
        _s3 = S3FileSystem(client_kwargs=_credentials)
        super().__init__(
            PurePosixPath("{}/{}".format(bucket_name, filepath)),
            version,
            exists_function=_s3.exists,
            glob_function=_s3.glob,
        )
        self._bucket_name = bucket_name
        self._credentials = _credentials

        # Handle default load and save arguments
        self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
        if load_args is not None:
            self._load_args.update(load_args)
        self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
        if save_args is not None:
            self._save_args.update(save_args)

        self._s3 = _s3
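A similar hedged sketch for CSVS3DataSet, passing pandas options through load_args and save_args; all values are placeholders:

# Hypothetical usage; values are placeholders.
csv_set = CSVS3DataSet(
    filepath="raw/appointments.csv",
    bucket_name="my-bucket",
    credentials={"aws_access_key_id": "KEY", "aws_secret_access_key": "SECRET"},
    load_args={"sep": ",", "parse_dates": ["appointment_day"]},  # pandas.read_csv options
    save_args={"index": False},  # pandas.DataFrame.to_csv options
)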
Example #21
def test_change_defaults_only_subsequent(monkeypatch):
    """Test for Issue #135

    Ensure that changing the default block size doesn't affect existing file
    systems that were created using that default. It should only affect file
    systems created after the change.
    """
    fs_default = S3FileSystem()
    assert fs_default.default_block_size == 5 * (1024 ** 2)

    fs_overridden = S3FileSystem(default_block_size=64 * (1024 ** 2))
    assert fs_overridden.default_block_size == 64 * (1024 ** 2)

    # Suppose I want all subsequent file systems to have a block size of 1 GiB
    # instead of 5 MiB:
    monkeypatch.setattr(S3FileSystem, 'default_block_size', 1024 ** 3)

    fs_big = S3FileSystem()
    assert fs_big.default_block_size == 1024 ** 3

    # Test the other file systems created to see if their block sizes changed
    assert fs_overridden.default_block_size == 64 * (1024 ** 2)
    assert fs_default.default_block_size == 5 * (1024 ** 2)
Example #22
    def s3fs_open(self, path, mode):
        from s3fs.core import S3FileSystem

        endpoint_url = os.environ.get('S3_ENDPOINT_URL')
        client_kwargs = {}
        if endpoint_url:
            client_kwargs = {'endpoint_url': endpoint_url}

        if 'r' in mode:
            self.wait_for_path(path)

        s3 = S3FileSystem(anon=False, default_fill_cache=False,
                          client_kwargs=client_kwargs)
        return s3.open(path, mode=mode)
Example #23
def save_predictions(engine, location='Cassandra', predictions=None, log=None):
    if location == 'S3':
        predictions = predictions[['appointment_id', 'patient_id', 'appointment_day', \
                                   'no_show_likelihood']].sort_values(by='no_show_likelihood', ascending=False)
        bytes_to_write = predictions.to_csv(None, index=False).encode()
        fs = S3FileSystem(key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)
        with fs.open(f"s3://{S3_BUCKET}/scores/latest_scores.csv", 'wb') as f:
            f.write(bytes_to_write)
    else:
        predictions = predictions.to_dict(orient='records')
        print(predictions[:3], flush=True)
        for batch in batches(predictions, 200):
            res = engine.save(PREDICTION_SCHEMA, batch).result()
            log.debug(res)
Example #24
    def __init__(
        self,
        filepath: str,
        bucket_name: str,
        credentials: Optional[Dict[str, Any]] = None,
        load_args: Optional[Dict[str, Any]] = None,
        save_args: Optional[Dict[str, Any]] = None,
        version: Version = None,
    ) -> None:
        """Creates a new instance of ``PickleS3DataSet`` pointing to a
        concrete file on S3. ``PickleS3DataSet`` uses pickle backend to
        serialise objects to disk:

        pickle.dumps: https://docs.python.org/3/library/pickle.html#pickle.dumps

        and to load serialised objects into memory:

        pickle.loads: https://docs.python.org/3/library/pickle.html#pickle.loads

        Args:
            filepath: path to a pkl file.
            bucket_name: S3 bucket name.
            credentials: Credentials to access the S3 bucket, such as
                ``aws_access_key_id``, ``aws_secret_access_key``.
            load_args: Options for loading pickle files. Refer to the help
                file of ``pickle.loads`` for options.
            save_args: Options for saving pickle files. Refer to the help
                file of ``pickle.dumps`` for options.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.
        """
        default_load_args = {}
        default_save_args = {}

        self._filepath = filepath
        self._bucket_name = bucket_name
        self._credentials = credentials if credentials else {}
        self._version = version
        self._load_args = ({
            **default_load_args,
            **load_args
        } if load_args is not None else default_load_args)
        self._save_args = ({
            **default_save_args,
            **save_args
        } if save_args is not None else default_save_args)
        self._s3 = S3FileSystem(client_kwargs=self._credentials)
Example #25
    def __init__(self, verbose=False):
        self.s3 = S3FileSystem(anon=False)
        self.df = DataFrame()
        self.dfraw = DataFrame()
        self.df_scratch = DataFrame()
        self.df_result_matrix = DataFrame()
        self.df_payout = DataFrame()
        # track detail now comes exclusively from the packaged git file
        # (horse/betsim/data/track_detail.csv) instead of a path relative to
        # where the data is being loaded
        track_detail = os.path.join(data.__path__._path[0], 'track_detail.csv')
        self.dftrack = read_csv(track_detail)
        self.map_track_chart_to_x8 = self.dftrack.set_index(
            'chart_file_sym')['x8_track_sym'].to_dict()
        self.map_track_x8_to_chart = self.dftrack.set_index(
            'x8_track_sym')['chart_file_sym'].to_dict()
        self.verbose = verbose
Example #26
def test_versions_unaware(s3):
    versioned_file = versioned_bucket_name + '/versioned_file3'
    s3 = S3FileSystem(anon=False, version_aware=False)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')

    with s3.open(versioned_file) as fo:
        assert fo.version_id is None
        assert fo.read() == b'2'

    with pytest.raises(ValueError):
        with s3.open(versioned_file, version_id='0'):
            fo.read()
Example #27
    async def _():
        s3 = S3FileSystem(anon=False,
                          asynchronous=True,
                          client_kwargs={"region_name": "eu-central-1",
                                         "endpoint_url": endpoint_uri})

        fn = test_bucket_name + "/nested/file1"
        data = b"hello\n"

        # Fails because client creation has not yet been awaited
        with pytest.raises(RuntimeError):
            await s3._cat_file(fn)

        await s3.connect()  # creates client

        assert await s3._cat_file(fn) == data
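The same asynchronous pattern outside a test, driven by asyncio; the bucket and key are placeholders and only calls used above (connect, _cat_file) are assumed:

import asyncio
from s3fs import S3FileSystem

async def read_one(path):
    # Client creation must be awaited before any underscore-prefixed async call.
    s3 = S3FileSystem(anon=False, asynchronous=True)
    await s3.connect()
    return await s3._cat_file(path)

# asyncio.run(read_one("my-bucket/nested/file1"))  # hypothetical bucket/key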
Example #28
def test_read_uncached(create_main_file):
    fs = S3PrefetchFileSystem()
    s3_path = str(create_main_file)

    with fs.open(s3_path,
                 "rb",
                 block_size=BLOCK_SIZE,
                 prefetch_storage=list(CACHES.items())) as f:
        data = f.read()

    fs = S3FileSystem()
    with fs.open(s3_path, "rb") as f:
        actual_data = f.read()

    assert data == actual_data
    cleanup(os.path.basename(s3_path))
Example #29
def lambda_handler(event, context):

    if event['data-source'] == 'json-payload':
        success_put_count = 0
        for row in event['data']:
            row['UnitPrice'] = str(row['UnitPrice'])
            try:
                table.put_item(Item=cast_to_decimal(row))
                success_put_count += 1
            except ClientError as e:
                pass
        
        total_records = len(event['data'])
        if success_put_count > 0:
            message = f'Success: Inserted {success_put_count} of {total_records} records of json payload to DynamoDB'
        else:
            message = 'Fail: No records were inserted'
            
        return {
            'statusCode': 200,
            'body': message
        }

    elif event['data-source'] == 's3' and 's3-path' in event:    
        o = urlparse(event['s3-path'])
        bucket = o.netloc
        filepath = o.path.lstrip('/')

        s3 = S3FileSystem(anon=False)
        df = pd.read_csv(
            s3.open(event['s3-path'], mode='rb'),
            dtype={'InvoiceNo': str, 'UnitPrice': str},
            converters={'CustomerID': lambda id: str(int(float(id)))},
        )
        success_put_count = df.apply(insert_to_table, axis=1).sum()
        s3.cp(event['s3-path'], 's3://' + bucket + '/processed' + filepath[filepath.rfind('/'):])
        s3.rm(event['s3-path'])
        
        s3_path = event['s3-path']
        if success_put_count > 0:
            message = f'Success: Inserted {success_put_count} of {df.shape[0]} records of data from S3 path {s3_path} to DynamoDB'
        else:
            message = 'Fail: No records were inserted'
        
        return {
            'statusCode': 200,
            'body': message
        }

    else:
        return { 'statusCode': 200, 'body': 'Error: data not valid' }
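Two hypothetical invocation payloads for the handler above, one per branch; bucket, key and record values are placeholders:

# Hypothetical events; field values are placeholders matching the handler's expectations.
json_event = {
    "data-source": "json-payload",
    "data": [{"InvoiceNo": "536365", "UnitPrice": 2.55, "CustomerID": "17850"}],
}
s3_event = {"data-source": "s3", "s3-path": "s3://my-bucket/incoming/orders.csv"}
# lambda_handler(json_event, None)
# lambda_handler(s3_event, None)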
Example #30
def save_scores(ska, scoring, location):
    # Save to Cassandra
    if location == "both" or location == "cassandra":
        #Convert scoring data to list of objects
        scores = scoring.to_dict(orient='records')
        #Save to Cassandra
        ska.log("Saving to Cassandra", level=logging.INFO)
        ska.engine.save(SCORING_SCHEMA, scores).result()
        ska.log("Saving to Cassandra", labels=["S3saving"], level=logging.INFO)
    #Save to S3
    if location == "both" or location == "S3":
        bytes_to_write = scoring.to_csv(None, index=False).encode()
        fs = S3FileSystem(key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)
        with fs.open(f"s3://{S3_PRIVATE_BUCKET}/{CHURN_MODEL_SCORES}",
                     'wb') as f:
            f.write(bytes_to_write)
        ska.log("Saving to S3", labels=["S3saving"], level=logging.INFO)
Example #31
def test_current(s3):
    assert S3FileSystem.current() is s3