def __s3_file_system(self):
    from pyarrow import fs

    connection = self.connection
    if "role_arn" in connection._kwargs and connection._kwargs["role_arn"]:
        external_id = connection._kwargs.get("external_id", None)
        s3_fs = fs.S3FileSystem(
            role_arn=connection._kwargs["role_arn"],
            session_name=connection._kwargs["role_session_name"],
            external_id="" if external_id is None else external_id,
            load_frequency=connection._kwargs["duration_seconds"],
            region=connection.region_name,
        )
    elif connection.profile_name:
        profile = connection.session._session.full_config["profiles"][
            connection.profile_name
        ]
        s3_fs = fs.S3FileSystem(
            access_key=profile.get("aws_access_key_id", None),
            secret_key=profile.get("aws_secret_access_key", None),
            session_token=profile.get("aws_session_token", None),
            region=connection.region_name,
        )
    else:
        s3_fs = fs.S3FileSystem(
            access_key=connection._kwargs.get("aws_access_key_id", None),
            secret_key=connection._kwargs.get("aws_secret_access_key", None),
            session_token=connection._kwargs.get("aws_session_token", None),
            region=connection.region_name,
        )
    return s3_fs
def main2():
    # By default, MinIO will listen for unencrypted HTTP traffic.
    minio = fs.S3FileSystem(scheme="http", endpoint_override="10.0.0.2:9000")

    # List all contents in a bucket, recursively
    file_selector = fs.FileSelector('customer-data-text', recursive=True)
    print_file_info(minio, file_selector)

    print(read_pafs_file(minio, 'customer-data-text/customer.csv'))
    print(read_pafs_stream(minio, 'customer-data-text/customer.csv'))

    endpoint_url = 'http://10.0.0.2:9000'
    print_boto3_buckets(endpoint_url)

    # TODO: read multiple files using dataset
    # https://stackoverflow.com/questions/45082832/how-to-read-partitioned-parquet-files-from-s3-using-pyarrow-in-python
    file_system = get_s3fs()
    print(file_system.ls('example-data'))

    bucket_uri = 's3://example-data/external-data'
    print_parquet_pandas_shape(bucket_uri, file_system)
    print_parquet_dataset_info(bucket_uri, file_system, verbose=False)

    bucket_uri = 's3://example-data/external-clustered'
    print_parquet_pandas_shape(bucket_uri, file_system)
    print_parquet_dataset_info(bucket_uri, file_system, verbose=False)
def scan_file(self, bucket, key, schema):
    logging.info(f"delim is {self.delimiter}")
    uri = f"{bucket}/{key}"
    s3fs = fs.S3FileSystem()

    # Run column order validation by opening and not reading anything.
    filestream = s3fs.open_input_stream(uri)
    parse_opts = csv.ParseOptions(delimiter=self.delimiter)
    reader = csv.open_csv(filestream, parse_options=parse_opts)
    for index, col in enumerate(reader.schema):
        if col.name != schema[index].name:
            msg = "column {} is out of order".format(col.name)
            raise ColumnOrderException(msg)

    # Run the rest of the validations.
    filestream = s3fs.open_input_stream(uri)
    opts = csv.ConvertOptions(column_types=schema)
    reader = csv.open_csv(filestream, convert_options=opts,
                          parse_options=parse_opts)

    # Kind of a hack, but it works: if the delimiter is wrong, everything is
    # read as one column.
    if len(schema) > 1 and len(reader.schema) == 1:
        raise WrongDelimiterException()

    # Parse through the file; pyarrow will throw exceptions
    # if there's invalid data.
    for batch in reader:
        # If the primary key is a string, need to check the column
        # for empty strings.
        if schema.field(self.primary_key).type == "string":
            table = pyarrow.Table.from_batches([batch])
            for val in table[self.primary_key]:
                if val.as_py() == "":
                    raise EmptyPrimaryKeyException()
def test_read_csv_arrow_nativefile(s3_base, s3so, pdf):
    # Write to buffer
    fname = "test_csv_reader_arrow_nativefile.csv"
    bname = "csv"
    buffer = pdf.to_csv(index=False)
    with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):
        fs = pa_fs.S3FileSystem(
            endpoint_override=s3so["client_kwargs"]["endpoint_url"],
        )
        with fs.open_input_file(f"{bname}/{fname}") as fil:
            got = cudf.read_csv(fil)

    assert_eq(pdf, got)
def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns):
    # Write to buffer
    fname = "test_parquet_reader_arrow_nativefile.parquet"
    bname = "parquet"
    buffer = BytesIO()
    pdf.to_parquet(path=buffer)
    buffer.seek(0)
    with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):
        fs = pa_fs.S3FileSystem(
            endpoint_override=s3so["client_kwargs"]["endpoint_url"],
        )
        with fs.open_input_file(f"{bname}/{fname}") as fil:
            got = cudf.read_parquet(fil, columns=columns)

    expect = pdf[columns] if columns else pdf
    assert_eq(expect, got)
def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns):
    source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc")
    fname = "test_orc_reader.orc"
    bname = "orc"
    expect = pa.orc.ORCFile(source_file).read().to_pandas()

    with open(source_file, "rb") as f:
        buffer = f.read()

    with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):
        fs = pa_fs.S3FileSystem(
            endpoint_override=s3so["client_kwargs"]["endpoint_url"],
        )
        with fs.open_input_file(f"{bname}/{fname}") as fil:
            got = cudf.read_orc(fil, columns=columns)

    if columns:
        expect = expect[columns]
    assert_eq(expect, got)
def test_read_parquet(s3_base, s3so, open_file_options):
    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]})
    buffer = BytesIO()
    pdf.to_parquet(path=buffer)
    buffer.seek(0)
    with s3_context(
        s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer}
    ):
        if "open_file_func" in open_file_options:
            fs = pa_fs.S3FileSystem(
                endpoint_override=s3so["client_kwargs"]["endpoint_url"],
            )
            open_file_options["open_file_func"] = fs.open_input_file
        df = dask_cudf.read_parquet(
            "s3://daskparquet/*.parq",
            storage_options=s3so,
            open_file_options=open_file_options,
        )
        assert df.a.sum().compute() == 10
        assert df.b.sum().compute() == 9
def __init__(self, connection: Connection, s3_bucket: str, access_key: str,
             secret_key: str, s3_region: str) -> None:
    """
    :param connection: Firebolt SDK connection class with an established
        connection to the database.
    :param s3_bucket: Intermediate bucket to store the data files before
        writing them to Firebolt. Has to be created and accessible.
    :param access_key: AWS Access Key ID that has read/write/delete
        permissions on the files in the bucket.
    :param secret_key: Corresponding AWS Secret Key.
    :param s3_region: S3 region. Best to keep this the same as the Firebolt
        database region. Defaults to us-east-1.
    """
    super().__init__(connection)
    self.key_id = access_key
    self.secret_key = secret_key
    self.s3_bucket = s3_bucket
    self._updated_tables = set()
    self.unique_dir = f"{int(time())}_{uuid4()}"
    self.fs = fs.S3FileSystem(access_key=access_key,
                              secret_key=secret_key,
                              region=s3_region)
def read_csv_write_to_parquet(local_data_path, s3_path, local_meta_path):
    if s3_path.startswith("s3://"):
        s3_path = s3_path.replace("s3://", "", 1)

    local = fs.LocalFileSystem()
    s3 = fs.S3FileSystem(region=REGION)

    with local.open_input_stream(local_data_path) as f:
        tab = csv.read_csv(f)

    metadata = read_table_json(local_meta_path)
    arrow_cols = []
    for col in metadata.columns:
        if col["name"] not in metadata.partitions:
            arrow_cols.append(convert_meta_col_to_arrow_tuple(col))

    s = pa.schema(arrow_cols)
    tab = tab.cast(s)

    with s3.open_output_stream(s3_path) as f:
        pq.write_table(tab, f)
def s3_file_stream(s3_bucket, s3_file_path, s3_region):
    s3_client = fs.S3FileSystem(region=s3_region, anonymous=True)
    return s3_client.open_input_stream("{}/{}".format(s3_bucket, s3_file_path))
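# A minimal usage sketch for s3_file_stream above, assuming a publicly readable
# bucket (anonymous=True requires one). The bucket name, key, and region here
# are hypothetical; parsing the returned stream with pyarrow.csv is only one
# way to consume the NativeFile.
from pyarrow import csv

stream = s3_file_stream("example-public-bucket", "data/input.csv", "us-east-1")
table = csv.read_csv(stream)
print(table.num_rows)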
def read_parquet_with_pyarrow():
    # https://issues.apache.org/jira/browse/ARROW-8832
    # It does not work :(
    s3 = fs.S3FileSystem(region='eu-west-3')
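# A hedged sketch of how the read attempted above could be expressed with the
# pyarrow.dataset API. The bucket and prefix are hypothetical and not part of
# the original snippet.
import pyarrow.dataset as ds
from pyarrow import fs

s3 = fs.S3FileSystem(region='eu-west-3')
dataset = ds.dataset('example-bucket/path/to/parquet',
                     format='parquet',
                     filesystem=s3)
table = dataset.to_table()
print(table.schema)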
from os import environ

import pandas as pd
import pyarrow as pa
from pyarrow import fs
import pyarrow.dataset as ds

s3 = fs.S3FileSystem(
    access_key=environ['B2_ACCESS_KEY_ID'],
    secret_key=environ['B2_SECRET_ACCESS_KEY'],
    endpoint_override=environ['B2_ENDPOINT_URL']
)

dataset = ds.dataset(
    source='polygon-equities/data/trades',
    format='feather',
    filesystem=s3,
    partitioning='hive',
    exclude_invalid_files=True
)

df = dataset.to_table(
    # columns=['symbol', 'sip_epoch', 'price', 'size'],
    filter=ds.field('date') == '2020-07-01'
).to_pandas()

# local
dataset = ds.dataset(
    source='/Users/bobcolner/QuantClarity/data/trades/feather/',
    format='feather',
def setUpClass(cls):
    # this prevents stupid requests to 169.254.169.254 which take a while
    os.environ["AWS_EC2_METADATA_DISABLED"] = "true"
    cls.s3 = fs.S3FileSystem(endpoint_override="127.0.0.1:3000",
                             scheme="http",
                             anonymous=True)
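# A possible follow-on test body for the class above (not from the original):
# list and read a bucket served by the local HTTP endpoint configured in
# setUpClass. The bucket and key names are hypothetical.
def test_read_from_local_endpoint(self):
    infos = self.s3.get_file_info(fs.FileSelector("test-bucket", recursive=True))
    self.assertIsInstance(infos, list)
    with self.s3.open_input_stream("test-bucket/sample.csv") as f:
        data = f.read()
    self.assertIsNotNone(data)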
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns a dataframe
    """

    meta_col_names = [
        c["name"]
        for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read date/time
    # columns in as strings. Validators will still treat these as dates but
    # will run validation against strings for cols expecting values to match
    # a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df