Example #1
def test_versions_unaware(s3):
    versioned_file = versioned_bucket_name + '/versioned_file3'
    s3 = S3FileSystem(anon=False, version_aware=False)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')

    with s3.open(versioned_file) as fo:
        assert fo.version_id is None
        assert fo.read() == b'2'

    with pytest.raises(ValueError):
        # passing version_id is rejected when the filesystem is not version aware
        with s3.open(versioned_file, version_id='0') as fo:
            fo.read()
Example #2
def test_read_uncached(create_main_file):
    fs = S3PrefetchFileSystem()
    s3_path = str(create_main_file)

    with fs.open(s3_path,
                 "rb",
                 block_size=BLOCK_SIZE,
                 prefetch_storage=list(CACHES.items())) as f:
        data = f.read()

    fs = S3FileSystem()
    with fs.open(s3_path, "rb") as f:
        actual_data = f.read()

    assert data == actual_data
    cleanup(os.path.basename(s3_path))
Example #3
    async def _():
        s3 = S3FileSystem(anon=False,
                          asynchronous=True,
                          client_kwargs={"region_name": "eu-central-1",
                                         "endpoint_url": endpoint_uri})

        fn = test_bucket_name + "/nested/file1"
        data = b"hello\n"

        # Fails because client creation has not yet been awaited
        with pytest.raises(RuntimeError):
            await s3._cat_file(fn)

        await s3.connect()  # creates client

        assert await s3._cat_file(fn) == data
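The coroutine above is only defined, never run. A minimal driver, assuming test_bucket_name, endpoint_uri and data are in scope and that no event loop is already running, could use the standard library:

    import asyncio

    asyncio.run(_())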
Example #4
def lambda_handler(event, context):

    if event['data-source'] == 'json-payload':
        success_put_count = 0
        for row in event['data']:
            row['UnitPrice'] = str(row['UnitPrice'])
            try:
                table.put_item(Item=cast_to_decimal(row))
                success_put_count += 1
            except ClientError:
                # skip rows that fail to insert
                pass
        
        total_records = len(event['data'])
        if success_put_count > 0:
            message = f'Success: Inserted {success_put_count} of {total_records} records of json payload to DynamoDB'
        else:
            message = 'Fail: No records were inserted'
            
        return {
            'statusCode': 200,
            'body': message
        }

    elif event['data-source'] == 's3' and 's3-path' in event:    
        o = urlparse(event['s3-path'])
        bucket = o.netloc
        filepath = o.path.lstrip('/')

        s3 = S3FileSystem(anon=False)
        df = pd.read_csv(
            s3.open(event['s3-path'], mode='rb'),
            dtype={'InvoiceNo': str, 'UnitPrice': str},
            converters={'CustomerID': lambda id: str(int(float(id)))})
        success_put_count = df.apply(insert_to_table, axis=1).sum()
        s3.cp(event['s3-path'], 's3://' + bucket + '/processed' + filepath[filepath.rfind('/'):])
        s3.rm(event['s3-path'])
        
        s3_path = event['s3-path']
        if success_put_count > 0:
            message = f'Success: Inserted {success_put_count} of {df.shape[0]} records of data from S3 path {s3_path} to DynamoDB'
        else:
            message = 'Fail: No records were inserted'
        
        return {
            'statusCode': 200,
            'body': message
        }

    else:
        return { 'statusCode': 200, 'body': 'Error: data not valid' }
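The handler above calls a cast_to_decimal helper that is not shown. A minimal sketch, assuming its job is to convert float fields to Decimal (DynamoDB does not accept Python floats), might look like this; the implementation is an illustration, not the original:

from decimal import Decimal

def cast_to_decimal(row):
    # Hypothetical helper: replace float values with Decimal so put_item accepts them.
    return {
        key: Decimal(str(value)) if isinstance(value, float) else value
        for key, value in row.items()
    }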
Example #5
def s3(s3_base):
    from botocore.session import Session
    # NB: we use the sync botocore client for setup
    session = Session()
    client = session.create_client('s3', endpoint_url=endpoint_uri)
    client.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    client.create_bucket(
        Bucket=versioned_bucket_name, ACL='public-read')
    client.put_bucket_versioning(
        Bucket=versioned_bucket_name,
        VersioningConfiguration={
            'Status': 'Enabled'
        }
    )

    # initialize secure bucket
    client.create_bucket(
        Bucket=secure_bucket_name, ACL='public-read')
    policy = json.dumps({
        "Version": "2012-10-17",
        "Id": "PutObjPolicy",
        "Statement": [
            {
                "Sid": "DenyUnEncryptedObjectUploads",
                "Effect": "Deny",
                "Principal": "*",
                "Action": "s3:PutObject",
                "Resource": "arn:aws:s3:::{bucket_name}/*".format(
                    bucket_name=secure_bucket_name),
                "Condition": {
                    "StringNotEquals": {
                        "s3:x-amz-server-side-encryption": "aws:kms"
                    }
                }
            }
        ]
    })
    client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy)
    for flist in [files, csv_files, text_files, glob_files]:
        for f, data in flist.items():
            client.put_object(Bucket=test_bucket_name, Key=f, Body=data)

    S3FileSystem.clear_instance_cache()
    s3 = S3FileSystem(anon=False, client_kwargs={'endpoint_url': endpoint_uri})
    s3.invalidate_cache()
    yield s3
Example #6
def test_versions(s3):
    versioned_file = versioned_bucket_name + '/versioned_file'
    s3 = S3FileSystem(anon=False, version_aware=True)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')
    versions = s3.object_version_info(versioned_file)
    assert len(versions) == 2

    with s3.open(versioned_file) as fo:
        assert fo.version_id == '1'
        assert fo.read() == b'2'

    with s3.open(versioned_file, version_id='0') as fo:
        assert fo.version_id == '0'
        assert fo.read() == b'1'
Example #7
def save_scores(ska, scoring, location):
    # Save to Cassandra
    if location == "both" or location == "cassandra":
        #Convert scoring data to list of objects
        scores = scoring.to_dict(orient='records')
        #Save to Cassandra
        ska.log("Saving to Cassandra", level=logging.INFO)
        ska.engine.save(SCORING_SCHEMA, scores).result()
        ska.log("Saving to Cassandra", labels=["S3saving"], level=logging.INFO)
    #Save to S3
    if location == "both" or location == "S3":
        bytes_to_write = scoring.to_csv(None, index=False).encode()
        fs = S3FileSystem(key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)
        with fs.open(f"s3://{S3_PRIVATE_BUCKET}/{CHURN_MODEL_SCORES}",
                     'wb') as f:
            f.write(bytes_to_write)
        ska.log("Saving to S3", labels=["S3saving"], level=logging.INFO)
Example #8
def fetch_data(engine, location='Cassandra'):
    """Use the Skafos data engine to pull in historic appointment data."""

    if location == "S3":

        s3 = S3FileSystem(anon=False)
        key = f"s3://{S3_BUCKET}/data/past_appointments.csv"
        fetched_data = make_dataframe(
            data=pd.read_csv(s3.open(f'{key}', mode='rb')))
    else:
        res = engine.create_view('appt', {
            'keyspace': 'no_shows',
            'table': 'appointments'
        }, DataSourceType.Cassandra).result()
        query = 'SELECT * FROM appt'
        fetched_data = make_dataframe(engine.query(query).result().get('data'))

    return fetched_data
Example #9
 def __init__(self, verbose=False):
     self.s3 = S3FileSystem(anon=False)
     self.dfraw = DataFrame()
     self.df = DataFrame()
     self.dfpp = DataFrame()
     self.dfwk = DataFrame()
     # Track detail now comes exclusively from the repo file (horse/betsim/data/track_detail.csv) instead of a path relative to where the data is loaded
     track_detail = os.path.join(data.__path__._path[0], 'track_detail.csv')
     dftrack = read_csv(track_detail)
     self.map_track_jcp_to_x8 = dftrack.set_index(
         'jcp_track_sym')['x8_track_sym'].to_dict()
     self.map_track_x8_to_jcp = dftrack.set_index(
         'x8_track_sym')['jcp_track_sym'].to_dict()
     self.map_track_x8_to_itsp = dftrack.set_index(
         'x8_track_sym')['itsp_track_sym'].to_dict()
     self.map_track_chart_to_x8 = dftrack.set_index(
         'chart_file_sym')['x8_track_sym'].to_dict()
     self.verbose = verbose
Example #10
def test_versions(s3):
    versioned_file = versioned_bucket_name + '/versioned_file'
    s3 = S3FileSystem(anon=False, version_aware=True)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')
    versions = s3.object_version_info(versioned_file)
    version_ids = [version['VersionId'] for version in versions]
    assert len(version_ids) == 2

    with s3.open(versioned_file) as fo:
        assert fo.version_id == version_ids[1]
        assert fo.read() == b'2'

    with s3.open(versioned_file, version_id=version_ids[0]) as fo:
        assert fo.version_id == version_ids[0]
        assert fo.read() == b'1'
Example #11
def test_versioned_file_fullpath(s3):
    versioned_file = versioned_bucket_name + '/versioned_file_fullpath'
    s3 = S3FileSystem(anon=False, version_aware=True)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    # moto doesn't correctly return a versionId for a multipart upload. So we resort to this.
    # version_id = fo.version_id
    versions = s3.object_version_info(versioned_file)
    version_ids = [version['VersionId'] for version in versions]
    version_id = version_ids[0]

    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')

    file_with_version = "{}?versionId={}".format(versioned_file, version_id)

    with s3.open(file_with_version, 'rb') as fo:
        assert fo.version_id == version_id
        assert fo.read() == b'1'
Example #12
 def __init__(self):
     self.data_dir = "skafos.example.data/HaptDataSet/RawData/"
     self.data_url = "https://s3.amazonaws.com/" + self.data_dir
     self.label_trans = {
         'X1': 'exp_id',
         'X2': 'user_id',
         'X3': 'activity_id',
         'X4': 'start',
         'X5': 'end'
     }
     self.target_map = {
         1.: 'walking',
         2.: 'climbing_upstairs',
         3.: 'climbing_downstairs',
         4.: 'sitting',
         5.: 'standing',
         6.: 'laying'
     }
     self.s3 = S3FileSystem(anon=True)
Example #13
def fetch_upcoming(engine, location='Cassandra'):
    """Use the Skafos data engine to pull in historic appointment data."""

    if location == "S3":

        s3 = S3FileSystem(anon=False)
        key = f"s3://{S3_BUCKET}/data/upcoming_appointments.csv"
        upcoming = pd.read_csv(s3.open(f'{key}', mode='rb'))

    else:
        engine.create_view("upcoming_appointments", {
            "keyspace": "no_shows",
            "table": "upcoming"
        }, DataSourceType.Cassandra).result()
        upcoming = pd.DataFrame(
            engine.query(
                "SELECT * FROM upcoming_appointments").result().get('data'))

    return upcoming
Example #14
    def queue_bets_s3(self, filename):
        """
        write self.df_bets to s3://x8-bucket/bets/
        test mode is not supported for s3 bet queueing
        """

        # we want to see what automatically generated betfiles output regardless of being empty or not
        # if self.df_bets.empty:
        #     raise Exception('bets.df_bets is empty')

        s3 = S3FileSystem(anon=False)

        s3_path = 'x8-bucket/bets/%s' % filename
        print('writing bets.df_bets to %s' % s3_path)

        csv_bytes = self.df_bets.to_csv(None, index=False).encode()

        with s3.open(s3_path, 'wb') as f:
            f.write(csv_bytes)
Example #15
def load_data(s_bucket, s_key, params_dic):
    df = None

    # Try reading csv from S3 file system
    try:
        s3 = S3FileSystem(anon=False)

        df = pd.read_csv(s3.open('{}/{}'.format(s_bucket, s_key), mode='r'))
        print(df)
    except Exception as e:
        logging.info(e)
        raise e
    conn = connect(params_dic)
    copy_from_stringio(conn, df, 'badgedata')
    logger.info("copied file into database successfully")
    logger.info(execute_query(conn, "select count(*) from badgedata;"))
    #print(execute_query(conn, "delete from badgedata where true;"))
    conn.close()
    return df
Example #16
    def __init__(
        self,
        filepath: str,
        bucket_name: str,
        credentials: Optional[Dict[str, Any]] = None,
        load_args: Optional[Dict[str, Any]] = None,
        save_args: Optional[Dict[str, Any]] = None,
        version: Version = None,
    ) -> None:
        """Creates a new instance of ``CSVS3DataSet`` pointing to a concrete
        csv file on S3.

        Args:
            filepath: Path to a csv file.
            bucket_name: S3 bucket name.
            credentials: Credentials to access the S3 bucket, such as
                ``aws_access_key_id``, ``aws_secret_access_key``.
            load_args: Pandas options for loading csv files.
                Here you can find all available arguments:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
                All defaults are preserved.
            save_args: Pandas options for saving csv files.
                Here you can find all available arguments:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
                All defaults are preserved, except "index", which is set to False.
            version: If specified, should be an instance of
                ``kedro.io.core.Version``. If its ``load`` attribute is
                None, the latest version will be loaded. If its ``save``
                attribute is None, save version will be autogenerated.

        """
        default_save_args = {"index": False}
        self._save_args = ({
            **default_save_args,
            **save_args
        } if save_args else default_save_args)
        self._load_args = load_args if load_args else {}
        self._filepath = filepath
        self._bucket_name = bucket_name
        self._credentials = credentials if credentials else {}
        self._version = version
        self._s3 = S3FileSystem(client_kwargs=self._credentials)
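A possible instantiation of the constructor above; the bucket, path and credential values are placeholders, not taken from the original:

data_set = CSVS3DataSet(
    filepath="data/01_raw/example.csv",   # placeholder key inside the bucket
    bucket_name="my-example-bucket",      # placeholder bucket name
    credentials={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
    load_args={"sep": ","},
    save_args={"index": False},
)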
Example #17
def s3_fs():
    # writable local S3 system
    try:
        m = moto.mock_s3()
        m.start()
        client = boto3.client("s3")
        client.create_bucket(Bucket=test_bucket_name, ACL="public-read")

        for f, data in files.items():
            client.put_object(Bucket=test_bucket_name, Key=f, Body=data)
        
        yield S3FileSystem(anon=False)

        for f in files:
            try:
                client.delete_object(Bucket=test_bucket_name, Key=f)
            except Exception:
                pass
    finally:
        m.stop()
Example #18
def create_new_users(request):
    if request.method == "POST":
        form = CreateUsersForm(request.POST, request.FILES)
        if form.is_valid():
            form.save()
        excel_file = request.FILES.get("file")
        excel_file_name = excel_file.name

        if excel_file_name.endswith(('xls', 'xlsx')):

            try:
                # reading the excel file
                s3 = S3FileSystem(anon=False)
                key = f'media/public/excel_file/{excel_file}'
                bucket = 'corbon2'

                df = pd.read_excel(
                    s3.open('{}/{}'.format(bucket, key), mode='rb'))
                for f in Files.objects.all():
                    f.delete()
                # Drop rows with any missing values
                data2 = df.dropna(axis=0, how="any")

                data2.columns = data2.columns.map(
                    lambda x: x.replace('\n', ''))

                # here is final_data, the list of dictionaries that can be easily stored in the database
                final_data = data2.to_dict(orient="records")

                # code to store into the DB goes here, data is in variable final_data
                for row in final_data:
                    if not User.objects.filter(
                            username__iexact=row['email']).exists():
                        User.objects.create_user(username=row['email'])
            except KeyError:
                return HttpResponse('excel file could not be processed')

            return redirect('download')
    else:
        form = CreateUsersForm()
    return render(request, 'store_users.html', {'form': form})
Example #19
def test_fsspec_versions_multiple(s3):
    """Test that the standard fsspec.core.get_fs_token_paths behaves as expected for versionId urls"""
    s3 = S3FileSystem(anon=False, version_aware=True)
    versioned_file = versioned_bucket_name + '/versioned_file3'
    version_lookup = {}
    for i in range(20):
        contents = str(i).encode()
        with s3.open(versioned_file, 'wb') as fo:
            fo.write(contents)
        version_lookup[fo.version_id] = contents
    urls = [
        "s3://{}?versionId={}".format(versioned_file, version)
        for version in version_lookup.keys()
    ]
    fs, token, paths = fsspec.core.get_fs_token_paths(urls)
    assert isinstance(fs, S3FileSystem)
    assert fs.version_aware
    for path in paths:
        with fs.open(path, 'rb') as fo:
            contents = fo.read()
            assert contents == version_lookup[fo.version_id]
Example #20
def read_dataset():
    try:
        print("===============================================================================")
        print("Connecting to S3...")
        print("Please make sure you have set your S3 access/secret access key in your system.")
        global S3
        S3 = S3FileSystem(anon=False)
        print("Connected to S3!")
        print("===============================================================================")
    except Exception:
        import traceback
        traceback.print_exc()
        print("Failed to connect to S3!")
        print("If you have not set your S3 access/secret key, please follow these instructions:")
        print("Make sure you have Python and pip on your system, and install the AWS CLI:")
        print("Type the following command in your terminal:")
        print("pip install --upgrade awscli")
        print("After the AWS CLI is installed, type:")
        print("aws configure")
        print("Then you can set your access/secret key. Good luck!")
        print("===============================================================================")

    try:
        print("==============================================")
        print("Downloading the data set from Amazon S3...")
        BUCKET_NAME = 'info7390-2018spring-team2-final-dataset'
        DATASET = 'creditcard.csv'
        global df
        df = pd.read_csv(
            S3.open('{}/{}'.format(BUCKET_NAME, DATASET), mode='rb'))
        df.info()
        print("Read data set successfully!")
        print("==============================================")
    except Exception:
        import traceback
        traceback.print_exc()
        print("Failed to download or read the data set!")
        print("==============================================")
Example #21
def read_array_from_s3(array_uri):
    s3 = S3FileSystem()
    bucket, key = _get_bucket_and_key(array_uri)
    return np.load(s3.open('{}/{}'.format(bucket, key)))
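Both this reader and the matching write_array_to_s3 helper below rely on a _get_bucket_and_key function that is not shown. A minimal sketch, assuming URIs of the form s3://bucket/key, could be:

from urllib.parse import urlparse

def _get_bucket_and_key(array_uri):
    # Hypothetical implementation: split "s3://bucket/key" into bucket and key.
    parsed = urlparse(array_uri)
    return parsed.netloc, parsed.path.lstrip('/')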
Example #22
import json

from matplotlib import pyplot as plt
import pandas as pd
import pickle
from s3fs.core import S3FileSystem

MODEL_DIR = "mids-capstone-irrigation-detection/models"

s3_file = S3FileSystem()

# Models are model_name: description
models = {
    "supervised_baseline":
    "Balanced Dataset",
    "supervised_baseline_ex":
    "Balanced Extended Labels",
    "supervised_baseline_pretrained":
    "ImageNet Pretraining",
    "supervised_baseline_pretrained_ex":
    "ImageNet Pretraining with Extended Labels"
}


def f_scores(scores):
    precision = scores[6]
    recall = scores[7]
    if precision + recall == 0.0:
        return [0.0, 0.0]
    f1 = (2 * precision * recall) / (precision + recall)
    beta = 0.5
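The snippet is cut off after beta is set. A standard F-beta completion of the function body, shown here only as an illustration of where it appears to be heading, would be:

    # Weighted harmonic mean of precision and recall (beta < 1 favours precision).
    fbeta = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)
    return [f1, fbeta]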
Example #23

# elasticsearch settings
ES_CLUSTER_URL = "https://search-es-covid-research-n6etnstkyvx6k2oxrlavq66sia.us-east-2.es.amazonaws.com/"
INDEX = "cord19-docs"
TREATMENT_INDEX = "cord19-treatments"
es = Elasticsearch(ES_CLUSTER_URL)

# local cache of data
DATA_DIR = os.environ.get("COVID_WEBAPP_DATA_DIR", "/home/ubuntu/efs-mnt/latest")

# s3 client
ACCESS_KEY = "YOUR_AWS_ACCESS_KEY_ID"
SECRET_ACCESS_KEY = "YOUR_AWS_SECRET_ACCESS_KEY"
BUCKET = "covid-research-data"
s3_client = S3FileSystem(key=ACCESS_KEY, secret=SECRET_ACCESS_KEY)

STOPWORDS = {
    "ourselves",
    "hers",
    "between",
    "yourself",
    "but",
    "again",
    "there",
    "about",
    "once",
    "during",
    "out",
    "very",
    "having",
Example #24
 def __init__(self, aws_profile=None):
     #self.aws_profile = aws_profile
     self.s3 = S3FileSystem(anon=False, profile=aws_profile)
Example #25
def test_bucket_exists(s3):
    assert s3.exists(test_bucket_name)
    assert not s3.exists(test_bucket_name+'x')
    s3 = S3FileSystem(anon=True)
    assert s3.exists(test_bucket_name)
    assert not s3.exists(test_bucket_name+'x')
Example #26
def write_array_to_s3(array_uri, arr):
    s3 = S3FileSystem()
    bucket, key = _get_bucket_and_key(array_uri)
    return np.save(s3.open('{}/{}'.format(bucket, key), "wb"), arr)
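A hypothetical round trip with this writer and the reader from Example #21; the URI below is a placeholder bucket and key:

import numpy as np

arr = np.arange(10)
write_array_to_s3('s3://my-example-bucket/arrays/example.npy', arr)
assert (read_array_from_s3('s3://my-example-bucket/arrays/example.npy') == arr).all()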
Example #27
import pandas as pd
from s3fs.core import S3FileSystem
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from model_design import ClassificationModel

S3 = S3FileSystem()
df = pd.DataFrame()
df_fe = pd.DataFrame()
X_test_bm = pd.DataFrame()
y_test_bm = pd.DataFrame()
X_test_fe = pd.DataFrame()
y_test_fe = pd.DataFrame()
lr = LogisticRegression(random_state=0)
svm = SVC(random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
gbt = GradientBoostingClassifier(random_state=0)
bm_lr = ClassificationModel('Logistic Regression', lr, '', '', '', '')
bm_rf = ClassificationModel('Random Forest', rf, '', '', '', '')
bm_svm = ClassificationModel('Support Vector Machine', svm, '', '', '', '')
bm_gbt = ClassificationModel('Gradient Boosting Tree', gbt, '', '', '', '')
fe_lr = ClassificationModel('Logistic Regression', lr, '', '', '', '')
fe_rf = ClassificationModel('Random Forest', rf, '', '', '', '')
fe_svm = ClassificationModel('Support Vector Machine', svm, '', '', '', '')
fe_gbt = ClassificationModel('Gradient Boosting Tree', gbt, '', '', '', '')
optimal_lr = ClassificationModel('Logistic Regression', '', '', '', '', '')
optimal_rf = ClassificationModel('Random Forest', rf, '', '', '', '')
optimal_svm = ClassificationModel('Support Vector Machine', svm, '', '', '',
                                  '')
Example #28
def test_config_kwargs():
    s3 = S3FileSystem(config_kwargs={'signature_version': 's3v4'})
    assert s3.connect(refresh=True).meta.config.signature_version == 's3v4'
Example #29
import time

time.sleep(5)  # Hacky way of avoiding having to check if the buckets have been created by docker.

os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'
os.environ['AWS_S3_ENDPOINT'] = 'http://minio:9000'

tc.config.set_runtime_config('TURI_S3_REGION', 'us-east-1')
tc.config.set_runtime_config('TURI_FILEIO_INSECURE_SSL_CERTIFICATE_CHECKS', 1)
tc.config.set_runtime_config('TURI_S3_ENDPOINT', 'http://minio:9000')

s3 = S3FileSystem(anon=False,
                  client_kwargs={
                      'endpoint_url': 'http://minio:9000',
                      'aws_access_key_id': 'minio',
                      'aws_secret_access_key': 'minio123'
                  })
location = 'output/sample.csv'

# This doesn't work because pandas can't read a private bucket
#df = pd.read_csv('s3://output/sample.csv')

# This does work as S3FileSystem is correctly configured for a private bucket
print('First getting the csv using S3FileSystem, and pandas...')
try:
    df = pd.read_csv(s3.open(location))
    print(df)
    print('Success')
except Exception:
    print("ERROR: Couldn't get from S3 using pandas")
Example #30
def test_multiple_objects(s3):
    s3.connect()
    assert s3.ls('test')
    s32 = S3FileSystem(anon=False)
    assert s32.session
    assert s3.ls('test') == s32.ls('test')