def test_versions_unaware(s3):
    versioned_file = versioned_bucket_name + '/versioned_file3'
    s3 = S3FileSystem(anon=False, version_aware=False)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')
    with s3.open(versioned_file) as fo:
        assert fo.version_id is None
        assert fo.read() == b'2'
    with pytest.raises(ValueError):
        with s3.open(versioned_file, version_id='0'):
            fo.read()
def test_read_uncached(create_main_file):
    fs = S3PrefetchFileSystem()
    s3_path = str(create_main_file)
    with fs.open(
        s3_path,
        "rb",
        block_size=BLOCK_SIZE,
        prefetch_storage=list(CACHES.items()),
    ) as f:
        data = f.read()

    fs = S3FileSystem()
    with fs.open(s3_path, "rb") as f:
        actual_data = f.read()

    assert data == actual_data
    cleanup(os.path.basename(s3_path))
async def _():
    s3 = S3FileSystem(
        anon=False,
        asynchronous=True,
        client_kwargs={"region_name": "eu-central-1", "endpoint_url": endpoint_uri},
    )
    fn = test_bucket_name + "/nested/file1"
    data = b"hello\n"

    # Fails because client creation has not yet been awaited
    with pytest.raises(RuntimeError):
        await s3._cat_file(fn)

    await s3.connect()  # creates client

    assert await s3._cat_file(fn) == data
def lambda_handler(event, context):
    if event['data-source'] == 'json-payload':
        success_put_count = 0
        for row in event['data']:
            row['UnitPrice'] = str(row['UnitPrice'])
            try:
                table.put_item(Item=cast_to_decimal(row))
                success_put_count += 1
            except ClientError as e:
                pass
        total_records = len(event['data'])
        if success_put_count > 0:
            message = f'Success: Inserted {success_put_count} of {total_records} records of json payload to DynamoDB'
        else:
            message = 'Fail: No records were inserted'
        return {
            'statusCode': 200,
            'body': message
        }
    elif event['data-source'] == 's3' and 's3-path' in event:
        o = urlparse(event['s3-path'])
        bucket = o.netloc
        filepath = o.path.lstrip('/')
        s3 = S3FileSystem(anon=False)
        df = pd.read_csv(
            s3.open(event['s3-path'], mode='rb'),
            dtype={'InvoiceNo': str, 'UnitPrice': str},
            converters={'CustomerID': lambda id: str(int(float(id)))})
        success_put_count = df.apply(insert_to_table, axis=1).sum()
        s3.cp(event['s3-path'],
              's3://' + bucket + '/processed' + filepath[filepath.rfind('/'):])
        s3.rm(event['s3-path'])
        s3_path = event['s3-path']
        if success_put_count > 0:
            message = f'Success: Inserted {success_put_count} of {df.shape[0]} records of data from S3 path {s3_path} to DynamoDB'
        else:
            message = 'Fail: No records were inserted'
        return {
            'statusCode': 200,
            'body': message
        }
    else:
        return {
            'statusCode': 200,
            'body': 'Error: data not valid'
        }
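# A hypothetical invocation sketch for the handler above, showing the two event
# shapes it accepts. All field values and the S3 path are placeholders; only the
# keys ('data-source', 'data', 's3-path', 'UnitPrice', etc.) come from the handler.
json_event = {
    "data-source": "json-payload",
    "data": [
        {"InvoiceNo": "536365", "UnitPrice": 2.55, "CustomerID": "17850"},
    ],
}
s3_event = {
    "data-source": "s3",
    "s3-path": "s3://my-example-bucket/incoming/retail.csv",  # placeholder path
}
# lambda_handler(json_event, None)
# lambda_handler(s3_event, None)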
def s3(s3_base):
    from botocore.session import Session
    # NB: we use the sync botocore client for setup
    session = Session()
    client = session.create_client('s3', endpoint_url=endpoint_uri)
    client.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    client.create_bucket(Bucket=versioned_bucket_name, ACL='public-read')
    client.put_bucket_versioning(
        Bucket=versioned_bucket_name,
        VersioningConfiguration={'Status': 'Enabled'})

    # initialize secure bucket
    client.create_bucket(Bucket=secure_bucket_name, ACL='public-read')
    policy = json.dumps({
        "Version": "2012-10-17",
        "Id": "PutObjPolicy",
        "Statement": [
            {
                "Sid": "DenyUnEncryptedObjectUploads",
                "Effect": "Deny",
                "Principal": "*",
                "Action": "s3:PutObject",
                "Resource": "arn:aws:s3:::{bucket_name}/*".format(
                    bucket_name=secure_bucket_name),
                "Condition": {
                    "StringNotEquals": {
                        "s3:x-amz-server-side-encryption": "aws:kms"
                    }
                }
            }
        ]
    })
    client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy)

    for flist in [files, csv_files, text_files, glob_files]:
        for f, data in flist.items():
            client.put_object(Bucket=test_bucket_name, Key=f, Body=data)

    S3FileSystem.clear_instance_cache()
    s3 = S3FileSystem(anon=False, client_kwargs={'endpoint_url': endpoint_uri})
    s3.invalidate_cache()
    yield s3
def test_versions(s3):
    versioned_file = versioned_bucket_name + '/versioned_file'
    s3 = S3FileSystem(anon=False, version_aware=True)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')
    versions = s3.object_version_info(versioned_file)
    assert len(versions) == 2
    with s3.open(versioned_file) as fo:
        assert fo.version_id == '1'
        assert fo.read() == b'2'
    with s3.open(versioned_file, version_id='0') as fo:
        assert fo.version_id == '0'
        assert fo.read() == b'1'
def save_scores(ska, scoring, location):
    # Save to Cassandra
    if location == "both" or location == "cassandra":
        # Convert scoring data to list of objects
        scores = scoring.to_dict(orient='records')
        # Save to Cassandra
        ska.log("Saving to Cassandra", level=logging.INFO)
        ska.engine.save(SCORING_SCHEMA, scores).result()
        ska.log("Saving to Cassandra", labels=["S3saving"], level=logging.INFO)

    # Save to S3
    if location == "both" or location == "S3":
        bytes_to_write = scoring.to_csv(None, index=False).encode()
        fs = S3FileSystem(key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)
        with fs.open(f"s3://{S3_PRIVATE_BUCKET}/{CHURN_MODEL_SCORES}", 'wb') as f:
            f.write(bytes_to_write)
        ska.log("Saving to S3", labels=["S3saving"], level=logging.INFO)
def fetch_data(engine, location='Cassandra'):
    """Use the Skafos data engine to pull in historic appointment data."""
    if location == "S3":
        s3 = S3FileSystem(anon=False)
        key = f"s3://{S3_BUCKET}/data/past_appointments.csv"
        fetched_data = make_dataframe(
            data=pd.read_csv(s3.open(f'{key}', mode='rb')))
    else:
        res = engine.create_view('appt', {
            'keyspace': 'no_shows',
            'table': 'appointments'
        }, DataSourceType.Cassandra).result()
        query = 'SELECT * FROM appt'
        fetched_data = make_dataframe(engine.query(query).result().get('data'))
    return fetched_data
def __init__(self, verbose=False):
    self.s3 = S3FileSystem(anon=False)
    self.dfraw = DataFrame()
    self.df = DataFrame()
    self.dfpp = DataFrame()
    self.dfwk = DataFrame()

    # now getting track detail exclusively from git file (horse/betsim/data/track_detail.csv)
    # instead of relative path from where data is being loaded
    track_detail = os.path.join(data.__path__._path[0], 'track_detail.csv')
    dftrack = read_csv(track_detail)

    self.map_track_jcp_to_x8 = dftrack.set_index('jcp_track_sym')['x8_track_sym'].to_dict()
    self.map_track_x8_to_jcp = dftrack.set_index('x8_track_sym')['jcp_track_sym'].to_dict()
    self.map_track_x8_to_itsp = dftrack.set_index('x8_track_sym')['itsp_track_sym'].to_dict()
    self.map_track_chart_to_x8 = dftrack.set_index('chart_file_sym')['x8_track_sym'].to_dict()

    self.verbose = verbose
def test_versions(s3):
    versioned_file = versioned_bucket_name + '/versioned_file'
    s3 = S3FileSystem(anon=False, version_aware=True)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')
    versions = s3.object_version_info(versioned_file)
    version_ids = [version['VersionId'] for version in versions]
    assert len(version_ids) == 2
    with s3.open(versioned_file) as fo:
        assert fo.version_id == version_ids[1]
        assert fo.read() == b'2'
    with s3.open(versioned_file, version_id=version_ids[0]) as fo:
        assert fo.version_id == version_ids[0]
        assert fo.read() == b'1'
def test_versioned_file_fullpath(s3):
    versioned_file = versioned_bucket_name + '/versioned_file_fullpath'
    s3 = S3FileSystem(anon=False, version_aware=True)
    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'1')
    # moto doesn't correctly return a versionId for a multipart upload. So we resort to this.
    # version_id = fo.version_id
    versions = s3.object_version_info(versioned_file)
    version_ids = [version['VersionId'] for version in versions]
    version_id = version_ids[0]

    with s3.open(versioned_file, 'wb') as fo:
        fo.write(b'2')

    file_with_version = "{}?versionId={}".format(versioned_file, version_id)

    with s3.open(file_with_version, 'rb') as fo:
        assert fo.version_id == version_id
        assert fo.read() == b'1'
def __init__(self):
    self.data_dir = "skafos.example.data/HaptDataSet/RawData/"
    self.data_url = "https://s3.amazonaws.com/" + self.data_dir
    self.label_trans = {
        'X1': 'exp_id',
        'X2': 'user_id',
        'X3': 'activity_id',
        'X4': 'start',
        'X5': 'end'
    }
    self.target_map = {
        1.: 'walking',
        2.: 'climbing_upstairs',
        3.: 'climbing_downstairs',
        4.: 'sitting',
        5.: 'standing',
        6.: 'laying'
    }
    self.s3 = S3FileSystem(anon=True)
def fetch_upcoming(engine, location='Cassandra'):
    """Use the Skafos data engine to pull in upcoming appointment data."""
    if location == "S3":
        s3 = S3FileSystem(anon=False)
        key = f"s3://{S3_BUCKET}/data/upcoming_appointments.csv"
        upcoming = pd.read_csv(s3.open(f'{key}', mode='rb'))
    else:
        ska.engine.create_view("upcoming_appointments", {
            "keyspace": "no_shows",
            "table": "upcoming"
        }, DataSourceType.Cassandra).result()
        upcoming = pd.DataFrame(
            ska.engine.query(
                "SELECT * FROM upcoming_appointments").result().get('data'))
    return upcoming
def queue_bets_s3(self, filename):
    """
    write self.df_bets to s3://x8-bucket/bets/
    test mode is not supported for s3 bet queueing
    """
    # we want to see what automatically generated betfiles output regardless of being empty or not
    # if self.df_bets.empty:
    #     raise Exception('bets.df_bets is empty')

    s3 = S3FileSystem(anon=False)
    s3_path = 'x8-bucket/bets/%s' % filename
    print('writing bets.df_bets to %s' % s3_path)
    bytes = self.df_bets.to_csv(None, index=False).encode()
    with s3.open(s3_path, 'wb') as f:
        f.write(bytes)
def load_data(s_bucket, s_key, params_dic):
    df = None
    # Try reading csv from S3 file system
    try:
        s3 = S3FileSystem(anon=False)
        df = pd.read_csv(s3.open('{}/{}'.format(s_bucket, s_key), mode='r'))
        print(df)
    except Exception as e:
        logging.info(e)
        raise e
    conn = connect(params_dic)
    copy_from_stringio(conn, df, 'badgedata')
    logger.info("copied file into database successfully")
    logger.info(execute_query(conn, "select count(*) from badgedata;"))
    # print(execute_query(conn, "delete from badgedata where true;"))
    conn.close()
    return df
def __init__(
    self,
    filepath: str,
    bucket_name: str,
    credentials: Optional[Dict[str, Any]] = None,
    load_args: Optional[Dict[str, Any]] = None,
    save_args: Optional[Dict[str, Any]] = None,
    version: Version = None,
) -> None:
    """Creates a new instance of ``CSVS3DataSet`` pointing to a concrete
    csv file on S3.

    Args:
        filepath: Path to a csv file.
        bucket_name: S3 bucket name.
        credentials: Credentials to access the S3 bucket, such as
            ``aws_access_key_id``, ``aws_secret_access_key``.
        load_args: Pandas options for loading csv files.
            Here you can find all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
            All defaults are preserved.
        save_args: Pandas options for saving csv files.
            Here you can find all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
            All defaults are preserved, but "index", which is set to False.
        version: If specified, should be an instance of
            ``kedro.io.core.Version``. If its ``load`` attribute is None,
            the latest version will be loaded. If its ``save`` attribute
            is None, save version will be autogenerated.
    """
    default_save_args = {"index": False}
    self._save_args = (
        {**default_save_args, **save_args} if save_args else default_save_args
    )
    self._load_args = load_args if load_args else {}
    self._filepath = filepath
    self._bucket_name = bucket_name
    self._credentials = credentials if credentials else {}
    self._version = version
    self._s3 = S3FileSystem(client_kwargs=self._credentials)
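# A minimal usage sketch for the constructor above. It assumes the class is
# kedro's ``CSVS3DataSet`` (named in the docstring) and that it also exposes
# kedro's usual ``save()``/``load()`` methods, which are not shown here. The
# bucket, key, and credential values are placeholders.
import pandas as pd

data_set = CSVS3DataSet(
    filepath="data/01_raw/example.csv",      # key inside the bucket (placeholder)
    bucket_name="my-example-bucket",         # placeholder bucket name
    credentials={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
    save_args={"sep": ","},                  # merged with the default {"index": False}
)

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
data_set.save(df)        # assumed kedro-style save, writing the csv to S3
reloaded = data_set.load()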
def s3_fs():
    # writable local S3 system
    try:
        m = moto.mock_s3()
        m.start()
        client = boto3.client("s3")
        client.create_bucket(Bucket=test_bucket_name, ACL="public-read")
        for f, data in files.items():
            client.put_object(Bucket=test_bucket_name, Key=f, Body=data)

        yield S3FileSystem(anon=False)

        for f, data in files.items():
            try:
                client.delete_object(Bucket=test_bucket_name, Key=f)
            except Exception:
                pass
    finally:
        m.stop()
def create_new_users(request):
    if request.method == "POST":
        form = CreateUsersForm(request.POST, request.FILES)
        if form.is_valid():
            form.save()
            excel_file = request.FILES.get("file")
            excel_file_name = excel_file.name
            if excel_file_name[-3:] == 'xls' or excel_file_name[-4:] == 'xlsx':
                try:
                    # reading the excel file from S3
                    s3 = S3FileSystem(anon=False)
                    key = f'media/public/excel_file/{excel_file}'
                    bucket = 'corbon2'
                    df = pd.read_excel(
                        s3.open('{}/{}'.format(bucket, key), mode='rb'))
                    for f in Files.objects.all():
                        f.delete()
                    # Dropping rows with missing values
                    data2 = df.dropna(axis=0, how="any")
                    data2.columns = data2.columns.map(
                        lambda x: x.replace('\n', ''))
                    # here is final_data, the list of dictionaries that can be easily stored in the database
                    final_data = data2.to_dict(orient="records")
                    # code to store into the DB goes here, data is in variable final_data
                    for row in final_data:
                        if not User.objects.filter(
                                username__iexact=row['email']).exists():
                            User.objects.create_user(username=row['email'])
                except KeyError:
                    return HttpResponse('excel file could not be processed')
            return redirect('download')
    else:
        form = CreateUsersForm()
    return render(request, 'store_users.html', {'form': form})
def test_fsspec_versions_multiple(s3):
    """Test that the standard fsspec.core.get_fs_token_paths behaves as expected for versionId urls"""
    s3 = S3FileSystem(anon=False, version_aware=True)
    versioned_file = versioned_bucket_name + '/versioned_file3'
    version_lookup = {}
    for i in range(20):
        contents = str(i).encode()
        with s3.open(versioned_file, 'wb') as fo:
            fo.write(contents)
        version_lookup[fo.version_id] = contents

    urls = [
        "s3://{}?versionId={}".format(versioned_file, version)
        for version in version_lookup.keys()
    ]
    fs, token, paths = fsspec.core.get_fs_token_paths(urls)
    assert isinstance(fs, S3FileSystem)
    assert fs.version_aware
    for path in paths:
        with fs.open(path, 'rb') as fo:
            contents = fo.read()
            assert contents == version_lookup[fo.version_id]
def read_dataset():
    try:
        print("===============================================================================")
        print("Connecting to S3...")
        print("Please make sure you have set your S3 access/secret access key in your system.")
        global S3
        S3 = S3FileSystem(anon=False)
        print("Connected to S3!")
        print("===============================================================================")
    except:
        import traceback
        traceback.print_exc()
        print("Failed to connect to S3!")
        print("If you have not set your S3 access/secret key, please follow the instructions:")
        print("Make sure you have Python and pip in your system, and install the aws cli:")
        print("Type the following command in your terminal:")
        print("pip install --upgrade awscli")
        print("After the aws cli is installed, type:")
        print("aws configure")
        print("Then you can set your access/secret key. Good luck!")
        print("===============================================================================")
    try:
        print("==============================================")
        print("Downloading the data set from Amazon S3...")
        BUCKET_NAME = 'info7390-2018spring-team2-final-dataset'
        DATASET = 'creditcard.csv'
        global df
        df = pd.read_csv(
            S3.open('{}/{}'.format(BUCKET_NAME, DATASET), mode='rb'))
        print(df.info())
        print("Read data set successfully!")
        print("==============================================")
    except:
        import traceback
        traceback.print_exc()
        print("Failed to download or read the data set!")
        print("==============================================")
def read_array_from_s3(array_uri):
    s3 = S3FileSystem()
    bucket, key = _get_bucket_and_key(array_uri)
    return np.load(s3.open('{}/{}'.format(bucket, key)))
import json
from matplotlib import pyplot as plt
import pandas as pd
import pickle
from s3fs.core import S3FileSystem

MODEL_DIR = "mids-capstone-irrigation-detection/models"
s3_file = S3FileSystem()

# Models are model_name: description
models = {
    "supervised_baseline": "Balanced Dataset",
    "supervised_baseline_ex": "Balanced Extended Labels",
    "supervised_baseline_pretrained": "ImageNet Pretraining",
    "supervised_baseline_pretrained_ex": "ImageNet Pretraining with Extended Labels"
}


def f_scores(scores):
    precision = scores[6]
    recall = scores[7]
    if precision + recall == 0.0:
        return [0.0, 0.0]
    f1 = (2 * precision * recall) / (precision + recall)
    beta = 0.5
    # Assumed completion (the source snippet ends at beta = 0.5): return the
    # F1 and F-beta pair, mirroring the two-element zero-division case above.
    f_beta = ((1 + beta ** 2) * precision * recall) / (beta ** 2 * precision + recall)
    return [f1, f_beta]
]

# elasticsearch settings
ES_CLUSTER_URL = "https://search-es-covid-research-n6etnstkyvx6k2oxrlavq66sia.us-east-2.es.amazonaws.com/"
INDEX = "cord19-docs"
TREATMENT_INDEX = "cord19-treatments"
es = Elasticsearch(ES_CLUSTER_URL)

# local cache of data
DATA_DIR = os.environ.get("COVID_WEBAPP_DATA_DIR", "/home/ubuntu/efs-mnt/latest")

# s3 client
ACCESS_KEY = "AKIAT7JLV7IH5TNZAUTR"
SECRET_ACCESS_KEY = "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"
BUCKET = "covid-research-data"
s3_client = S3FileSystem(key=ACCESS_KEY, secret=SECRET_ACCESS_KEY)

STOPWORDS = {
    "ourselves", "hers", "between", "yourself", "but", "again", "there",
    "about", "once", "during", "out", "very", "having",
def __init__(self, aws_profile=None):
    # self.aws_profile = aws_profile
    self.s3 = S3FileSystem(anon=False, profile=aws_profile)
def test_bucket_exists(s3):
    assert s3.exists(test_bucket_name)
    assert not s3.exists(test_bucket_name + 'x')
    s3 = S3FileSystem(anon=True)
    assert s3.exists(test_bucket_name)
    assert not s3.exists(test_bucket_name + 'x')
def write_array_to_s3(array_uri, arr):
    s3 = S3FileSystem()
    bucket, key = _get_bucket_and_key(array_uri)
    return np.save(s3.open('{}/{}'.format(bucket, key), "wb"), arr)
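# A minimal round-trip sketch for the two array helpers above. It assumes numpy
# is imported as np, that _get_bucket_and_key() (defined elsewhere in this
# module) accepts the URI format used below, and that the bucket name is a
# placeholder.
import numpy as np

example_uri = "s3://my-example-bucket/arrays/example.npy"  # placeholder URI
original = np.arange(12).reshape(3, 4)

write_array_to_s3(example_uri, original)    # np.save through an s3fs file handle
restored = read_array_from_s3(example_uri)  # np.load through an s3fs file handle
assert np.array_equal(original, restored)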
import pandas as pd
from s3fs.core import S3FileSystem
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from model_design import ClassificationModel

S3 = S3FileSystem()

df = pd.DataFrame()
df_fe = pd.DataFrame()
X_test_bm = pd.DataFrame()
y_test_bm = pd.DataFrame()
X_test_fe = pd.DataFrame()
y_test_fe = pd.DataFrame()

lr = LogisticRegression(random_state=0)
svm = SVC(random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
gbt = GradientBoostingClassifier(random_state=0)

bm_lr = ClassificationModel('Logistic Regression', lr, '', '', '', '')
bm_rf = ClassificationModel('Random Forest', rf, '', '', '', '')
bm_svm = ClassificationModel('Support Vector Machine', svm, '', '', '', '')
bm_gbt = ClassificationModel('Gradient Boosting Tree', gbt, '', '', '', '')
fe_lr = ClassificationModel('Logistic Regression', lr, '', '', '', '')
fe_rf = ClassificationModel('Random Forest', rf, '', '', '', '')
fe_svm = ClassificationModel('Support Vector Machine', svm, '', '', '', '')
fe_gbt = ClassificationModel('Gradient Boosting Tree', gbt, '', '', '', '')
optimal_lr = ClassificationModel('Logistic Regression', '', '', '', '', '')
optimal_rf = ClassificationModel('Random Forest', rf, '', '', '', '')
optimal_svm = ClassificationModel('Support Vector Machine', svm, '', '', '', '')
def test_config_kwargs():
    s3 = S3FileSystem(config_kwargs={'signature_version': 's3v4'})
    assert s3.connect(refresh=True).meta.config.signature_version == 's3v4'
import time

time.sleep(5)  # Hacky way of avoiding having to check if the buckets have been created by docker.

os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'
os.environ['AWS_S3_ENDPOINT'] = 'http://minio:9000'

tc.config.set_runtime_config('TURI_S3_REGION', 'us-east-1')
tc.config.set_runtime_config('TURI_FILEIO_INSECURE_SSL_CERTIFICATE_CHECKS', 1)
tc.config.set_runtime_config('TURI_S3_ENDPOINT', 'http://minio:9000')

s3 = S3FileSystem(anon=False,
                  client_kwargs={
                      'endpoint_url': 'http://minio:9000',
                      'aws_access_key_id': 'minio',
                      'aws_secret_access_key': 'minio123'
                  })

location = 'output/sample.csv'

# This doesn't work because pandas can't read a private bucket
# df = pd.read_csv('s3://output/sample.csv')

# This does work as S3FileSystem is correctly configured for a private bucket
print('First getting the csv using S3FileSystem, and pandas...')
try:
    df = pd.read_csv(s3.open(location))
    print(df)
    print('Success')
except Exception:
    print("ERROR: Couldn't get from S3 using pandas")
def test_multiple_objects(s3):
    s3.connect()
    assert s3.ls('test')
    s32 = S3FileSystem(anon=False)
    assert s32.session
    assert s3.ls('test') == s32.ls('test')