Example #1
def test_s3_real_aws():
    # Exercise connection code with an AWS-backed S3 bucket.
    # This is a minimal integration check for ARROW-9261 and similar issues.
    from pyarrow.fs import S3FileSystem, FileSelector
    fs = S3FileSystem(anonymous=True)
    entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data'))
    assert len(entries) > 0
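As a follow-up to Example #1, a minimal sketch (not part of the original test) that passes the same anonymous filesystem to pyarrow.parquet in order to inspect one of the listed files; the object key and the us-east-2 region are taken from Example #4 and assumed to still be valid.

import pyarrow.parquet as pq
from pyarrow.fs import S3FileSystem

fs = S3FileSystem(anonymous=True, region='us-east-2')

# Open the object lazily; ParquetFile only needs to fetch the footer metadata.
with fs.open_input_file('ursa-labs-taxi-data/2019/06/data.parquet') as f:
    parquet_file = pq.ParquetFile(f)
    print(parquet_file.metadata.num_rows, parquet_file.metadata.num_columns)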
Example #2
def test_s3_options(monkeypatch):
    import pickle
    import pytest
    from pyarrow.fs import S3FileSystem

    # Avoid wait for unavailable metadata server in ARN role example below
    monkeypatch.setenv("AWS_EC2_METADATA_DISABLED", "true")

    fs = S3FileSystem(access_key='access',
                      secret_key='secret',
                      session_token='token',
                      region='us-east-2',
                      scheme='https',
                      endpoint_override='localhost:8999')
    assert isinstance(fs, S3FileSystem)
    assert fs.region == 'us-east-2'
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(role_arn='role',
                      session_name='session',
                      external_id='id',
                      load_frequency=100)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    with pytest.raises(ValueError):
        S3FileSystem(access_key='access')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', secret_key='secret', role_arn='arn')
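When none of the explicit credential options exercised above are given, S3FileSystem falls back to the standard AWS credential chain (environment variables, config files, then the instance metadata service). Below is a small sketch of that default path; the region value is only an illustrative assumption.

import os

from pyarrow.fs import S3FileSystem

# As in Example #2, disabling the EC2 metadata lookup avoids a long wait
# when this runs outside AWS.
os.environ['AWS_EC2_METADATA_DISABLED'] = 'true'

# No access_key/secret_key and not anonymous: credentials come from the
# default AWS credential chain.
fs = S3FileSystem(region='us-east-1')
assert fs.region == 'us-east-1'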
Example #3
def test_s3_options():
    import pickle
    import pytest
    from pyarrow.fs import S3FileSystem

    fs = S3FileSystem(access_key='access',
                      secret_key='secret',
                      session_token='token',
                      region='us-east-2',
                      scheme='https',
                      endpoint_override='localhost:8999')
    assert isinstance(fs, S3FileSystem)
    assert fs.region == 'us-east-2'
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(role_arn='role',
                      session_name='session',
                      external_id='id',
                      load_frequency=100)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    with pytest.raises(ValueError):
        S3FileSystem(access_key='access')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', secret_key='secret', role_arn='arn')
Example #4
def test_s3_real_aws():
    # Exercise connection code with an AWS-backed S3 bucket.
    # This is a minimal integration check for ARROW-9261 and similar issues.
    import os
    from pyarrow.fs import S3FileSystem, FileSelector
    default_region = (os.environ.get('PYARROW_TEST_S3_REGION') or 'us-east-1')
    fs = S3FileSystem(anonymous=True)
    assert fs.region == default_region

    fs = S3FileSystem(anonymous=True, region='us-east-2')
    entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data'))
    assert len(entries) > 0
    with fs.open_input_stream('ursa-labs-taxi-data/2019/06/data.parquet') as f:
        md = f.metadata()
        assert 'Content-Type' in md
        assert md['Last-Modified'] == b'2020-01-17T16:26:28Z'
        # For some reason, the header value is quoted
        # (both with AWS and Minio)
        assert md['ETag'] == b'"f1efd5d76cb82861e1542117bfa52b90-8"'
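Building on Example #4, a sketch of a recursive listing over the same public bucket that filters the returned FileInfo entries down to regular files; the 2019 prefix is an assumption based on the object key shown above.

from pyarrow.fs import S3FileSystem, FileSelector, FileType

fs = S3FileSystem(anonymous=True, region='us-east-2')

# Recursively list one year of the dataset and keep only regular files.
selector = FileSelector('ursa-labs-taxi-data/2019', recursive=True)
files = [info for info in fs.get_file_info(selector)
         if info.type == FileType.File]
for info in files[:3]:
    print(info.path, info.size)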
Example #5
from pyarrow.dataset import FileSystemDataset, dataset


def get_s3_dataset(symbol: str, tick_type: str) -> FileSystemDataset:
    from pyarrow.fs import S3FileSystem
    # B2_ACCESS_KEY_ID, B2_SECRET_ACCESS_KEY, B2_ENDPOINT_URL and S3_PATH are
    # configuration constants defined elsewhere in the original source.
    s3 = S3FileSystem(
        access_key=B2_ACCESS_KEY_ID,
        secret_key=B2_SECRET_ACCESS_KEY,
        endpoint_override=B2_ENDPOINT_URL
    )
    ds = dataset(
        source=S3_PATH + f"/{tick_type}/symbol={symbol}/",
        format='feather',
        filesystem=s3,
        partitioning='hive',
        exclude_invalid_files=True
    )
    return ds
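A hypothetical caller for get_s3_dataset() above; the 'QQQ'/'trades' arguments and the 'price' column used in the filter are illustrative assumptions about the data behind the B2_* configuration, not values from the original source.

import pyarrow.dataset as pads

dataset_obj = get_s3_dataset(symbol='QQQ', tick_type='trades')

# Push a simple filter down into the scan instead of loading everything.
table = dataset_obj.to_table(filter=pads.field('price') > 0)
print(table.num_rows)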
Example #6
import argparse
import os
from urllib.parse import urlparse

import pyarrow.parquet as pq
from pyarrow.fs import LocalFileSystem, S3FileSystem


def main():
    parser = argparse.ArgumentParser(
        description="Generate sample parquet data")
    parser.add_argument('path',
                        type=str,
                        nargs='?',
                        help='path to save data to',
                        default="./data/data.parquet")
    parser.add_argument(
        '--source',
        type=str,
        help=
        'local path to import data from (optional; can be csv, json or parquet)'
    )
    parser.add_argument(
        '--endpoint',
        type=str,
        help=
        'S3 endpoint (e.g.: https://s3.eu-de.cloud-object-storage.appdomain.cloud)'
    )
    parser.add_argument('--access_key', type=str, help='S3 access key')
    parser.add_argument('--secret_key', type=str, help='S3 secret key')
    args = parser.parse_args()

    if args.endpoint:
        print("Using S3 file system")
        parsed_endpoint = urlparse(args.endpoint)
        fs = S3FileSystem(endpoint_override=parsed_endpoint.netloc,
                          scheme=parsed_endpoint.scheme,
                          access_key=args.access_key,
                          secret_key=args.secret_key,
                          background_writes=False)
    else:
        print("Using local file system")
        os.makedirs(os.path.dirname(args.path), exist_ok=True)
        fs = LocalFileSystem()

    table = import_table(args.source)

    with fs.open_output_stream(args.path) as f:
        pq.write_table(table, f)
    print("Table written to", args.path)
    print(table.to_pandas())
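The script above calls import_table(), which is not shown in this example. Below is a plausible sketch, assuming it dispatches on the file extension (csv, json or parquet, as the --source help text suggests) and falls back to a tiny generated table when no source is given; the sample column names are made up.

import pyarrow as pa
import pyarrow.csv
import pyarrow.json
import pyarrow.parquet as pq


def import_table(source):
    if source is None:
        # No --source given: generate a small sample table instead.
        return pa.table({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})
    if source.endswith('.csv'):
        return pa.csv.read_csv(source)
    if source.endswith('.json'):
        return pa.json.read_json(source)
    if source.endswith('.parquet'):
        return pq.read_table(source)
    raise ValueError(f"unsupported source format: {source}")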
Example #7
def s3fs(request, s3_connection, s3_server):
    request.config.pyarrow.requires('s3')
    from pyarrow.fs import S3FileSystem

    host, port, access_key, secret_key = s3_connection
    bucket = 'pyarrow-filesystem/'

    fs = S3FileSystem(access_key=access_key,
                      secret_key=secret_key,
                      endpoint_override='{}:{}'.format(host, port),
                      scheme='http')
    fs.create_dir(bucket)

    return dict(
        fs=fs,
        pathfn=bucket.__add__,
        allow_copy_file=True,
        allow_move_dir=False,
        allow_append_to_file=False,
    )
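A sketch of a test that consumes the fixture dictionary returned above, writing an object under the bucket prefix and reading it back; the key name 'round-trip-file' is arbitrary.

def test_s3fs_roundtrip(s3fs):
    fs = s3fs['fs']
    path = s3fs['pathfn']('round-trip-file')

    with fs.open_output_stream(path) as stream:
        stream.write(b'hello s3')

    with fs.open_input_stream(path) as stream:
        assert stream.read() == b'hello s3'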
Example #8
def s3fs(request, minio_server):
    request.config.pyarrow.requires('s3')
    from pyarrow.fs import S3Options, S3FileSystem

    address, access_key, secret_key = minio_server
    bucket = 'pyarrow-filesystem/'
    options = S3Options(endpoint_override=address,
                        access_key=access_key,
                        secret_key=secret_key,
                        scheme='http')
    fs = S3FileSystem(options)
    fs.create_dir(bucket)

    return dict(
        fs=fs,
        pathfn=bucket.__add__,
        allow_copy_file=True,
        allow_move_dir=False,
        allow_append_to_file=False,
    )
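Example #8 targets an older pyarrow release in which connection settings lived on a separate S3Options class. On current releases those fields are plain keyword arguments of the S3FileSystem constructor, so inside the same fixture the construction would look roughly like this sketch:

    fs = S3FileSystem(endpoint_override=address,
                      access_key=access_key,
                      secret_key=secret_key,
                      scheme='http')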
Example #9
def test_s3_proxy_options(monkeypatch):
    import pickle
    import pytest
    import pyarrow as pa
    from pyarrow.fs import S3FileSystem

    # The following two are equivalent:
    proxy_opts_1_dict = {'scheme': 'http', 'host': 'localhost', 'port': 8999}
    proxy_opts_1_str = 'http://localhost:8999'
    # The following two are equivalent:
    proxy_opts_2_dict = {'scheme': 'https', 'host': 'localhost', 'port': 8080}
    proxy_opts_2_str = 'https://localhost:8080'

    # Check dict case for 'proxy_options'
    fs = S3FileSystem(proxy_options=proxy_opts_1_dict)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(proxy_options=proxy_opts_2_dict)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    # Check str case for 'proxy_options'
    fs = S3FileSystem(proxy_options=proxy_opts_1_str)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    # Check that two FSs using the same proxy_options dict are equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    # Check that two FSs using the same proxy_options str are equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
    fs2 = S3FileSystem(proxy_options=proxy_opts_1_str)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_str)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    # Check that two FSs using equivalent proxy_options
    # (one dict, one str) are equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_1_str)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert fs1 == fs2
    assert pickle.loads(pickle.dumps(fs1)) == fs2
    assert pickle.loads(pickle.dumps(fs2)) == fs1

    # Check that two FSs using nonequivalent proxy_options are not equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
    fs2 = S3FileSystem(proxy_options=proxy_opts_2_str)
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    # Check that two FSs (one using proxy_options and the other not)
    # are not equal
    fs1 = S3FileSystem(proxy_options=proxy_opts_1_dict)
    fs2 = S3FileSystem()
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_1_str)
    fs2 = S3FileSystem()
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_dict)
    fs2 = S3FileSystem()
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    fs1 = S3FileSystem(proxy_options=proxy_opts_2_str)
    fs2 = S3FileSystem()
    assert fs1 != fs2
    assert pickle.loads(pickle.dumps(fs1)) != fs2
    assert pickle.loads(pickle.dumps(fs2)) != fs1

    # Only dict and str are supported
    with pytest.raises(TypeError):
        S3FileSystem(proxy_options=('http', 'localhost', 9090))
    # Missing scheme
    with pytest.raises(KeyError):
        S3FileSystem(proxy_options={'host': 'localhost', 'port': 9090})
    # Missing host
    with pytest.raises(KeyError):
        S3FileSystem(proxy_options={'scheme': 'https', 'port': 9090})
    # Missing port
    with pytest.raises(KeyError):
        S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost'})
    # Invalid proxy URI (invalid scheme htttps)
    with pytest.raises(pa.ArrowInvalid):
        S3FileSystem(proxy_options='htttps://localhost:9000')
    # Invalid proxy_options dict (invalid scheme htttp)
    with pytest.raises(pa.ArrowInvalid):
        S3FileSystem(proxy_options={
            'scheme': 'htttp',
            'host': 'localhost',
            'port': 8999
        })
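The string and dict forms of proxy_options asserted equal above carry the same three pieces of information. The small helper sketched below (plain urllib, not pyarrow API) derives the dict form from the URI form.

from urllib.parse import urlparse


def proxy_uri_to_dict(uri):
    # Split e.g. 'http://localhost:8999' into the dict form accepted by
    # S3FileSystem(proxy_options=...).
    parsed = urlparse(uri)
    return {'scheme': parsed.scheme,
            'host': parsed.hostname,
            'port': parsed.port}


assert proxy_uri_to_dict('http://localhost:8999') == {
    'scheme': 'http', 'host': 'localhost', 'port': 8999}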
Example #10
def main():
    args = parse_platform_metrics_calculator_pipeline_arguments(sys.argv[1:])
    time_range = _get_time_range(args.year, args.month)

    organisation_data = read_json_file(args.organisation_list_file)
    organisation_metadata = construct_organisation_list_from_dict(data=organisation_data)

    spine_messages = _read_spine_csv_gz_files(args.input_files)
    transfers = list(parse_transfers_from_messages(spine_messages, time_range))
    practice_metrics_data = calculate_practice_metrics_data(
        transfers, organisation_metadata.practices, time_range
    )
    national_metrics_data = calculate_national_metrics_data(
        transfers=transfers, time_range=time_range
    )
    organisation_metadata = construct_organisation_metadata(organisation_metadata)
    transfer_table = convert_transfers_to_table(transfers)

    practice_metrics_file_name = "practiceMetrics.json"
    organisation_metadata_file_name = "organisationMetadata.json"
    national_metrics_file_name = "nationalMetrics.json"
    transfers_file_name = "transfers.parquet"

    if _is_outputting_to_file(args):
        _write_data_platform_json_file(
            practice_metrics_data,
            f"{args.output_directory}/{args.month}-{args.year}-{practice_metrics_file_name}",
        )
        _write_data_platform_json_file(
            organisation_metadata,
            f"{args.output_directory}/{args.month}-{args.year}-{organisation_metadata_file_name}",
        )
        _write_data_platform_json_file(
            national_metrics_data,
            f"{args.output_directory}/{args.month}-{args.year}-{national_metrics_file_name}",
        )
        write_table(
            transfer_table,
            f"{args.output_directory}/{args.month}-{args.year}-{transfers_file_name}",
        )
    elif _is_outputting_to_s3(args):
        s3 = boto3.resource("s3", endpoint_url=args.s3_endpoint_url)

        bucket_name = args.output_bucket
        version = "v2"
        s3_path = f"{version}/{args.year}/{args.month}"

        _upload_data_platform_json_object(
            practice_metrics_data,
            s3.Object(bucket_name, f"{s3_path}/{practice_metrics_file_name}"),
        )
        _upload_data_platform_json_object(
            organisation_metadata,
            s3.Object(bucket_name, f"{s3_path}/{organisation_metadata_file_name}"),
        )
        _upload_data_platform_json_object(
            national_metrics_data,
            s3.Object(bucket_name, f"{s3_path}/{national_metrics_file_name}"),
        )
        write_table(
            table=transfer_table,
            where=bucket_name + "/" + f"{s3_path}/{transfers_file_name}",
            filesystem=S3FileSystem(endpoint_override=args.s3_endpoint_url),
        )
Example #11
def test_s3_options():
    import pickle
    import pytest
    from pyarrow.fs import S3FileSystem

    fs = S3FileSystem(access_key='access',
                      secret_key='secret',
                      session_token='token',
                      region='us-east-2',
                      scheme='https',
                      endpoint_override='localhost:8999')
    assert isinstance(fs, S3FileSystem)
    assert fs.region == 'us-east-2'
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(role_arn='role',
                      session_name='session',
                      external_id='id',
                      load_frequency=100)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(anonymous=True)
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    fs = S3FileSystem(background_writes=True,
                      default_metadata={
                          "ACL": "authenticated-read",
                          "Content-Type": "text/plain"
                      })
    assert isinstance(fs, S3FileSystem)
    assert pickle.loads(pickle.dumps(fs)) == fs

    with pytest.raises(ValueError):
        S3FileSystem(access_key='access')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(secret_key='secret', session_token='token')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', secret_key='secret', role_arn='arn')
    with pytest.raises(ValueError):
        S3FileSystem(access_key='access', secret_key='secret', anonymous=True)
    with pytest.raises(ValueError):
        S3FileSystem(role_arn="arn", anonymous=True)
    with pytest.raises(ValueError):
        S3FileSystem(default_metadata=["foo", "bar"])
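Beyond the explicit constructor used throughout these examples, an S3 filesystem can also be obtained from a URI via FileSystem.from_uri. A minimal sketch, assuming credentials and region can be resolved through the default AWS chain when the call is made:

from pyarrow.fs import FileSystem, S3FileSystem

# 's3://' URIs map to an S3FileSystem; the returned path includes the bucket.
fs, path = FileSystem.from_uri('s3://ursa-labs-taxi-data/2019/06/data.parquet')
assert isinstance(fs, S3FileSystem)
assert path == 'ursa-labs-taxi-data/2019/06/data.parquet'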