Example #1
def test_write_csv_to_s3_no_header(generate_data):
    bucket = "test-bucket"
    key = "test.csv"

    conn = boto3.resource("s3", region_name="us-west-2")
    conn.create_bucket(Bucket=bucket)

    utils.write_csv_to_s3(generate_data(), bucket, key, header=False)

    body = conn.Object(bucket, key).get()["Body"].read().decode("utf-8")

    assert len(body.rstrip().split("\n")) == 1
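These tests exercise boto3 against a bucket they create themselves, so they only make sense with a mocked S3 backend, and they depend on a generate_data fixture that builds a small Spark DataFrame; neither is shown in the snippets. A minimal sketch of a conftest.py that would support them, assuming moto (pre-5.0, where mock_s3 is available) for the S3 mock and pyspark for the data (the fixture name generate_data comes from the tests; everything else is an assumption):

import boto3
import pytest
from moto import mock_s3
from pyspark.sql import SparkSession

@pytest.fixture(autouse=True)
def s3_mock():
    # moto intercepts every boto3 S3 call made while the mock is active
    with mock_s3():
        yield

@pytest.fixture(scope="session")
def spark():
    return SparkSession.builder.master("local[1]").getOrCreate()

@pytest.fixture
def generate_data(spark):
    # factory fixture: tests pass the row values they want, or take the default
    def _generate(values=None):
        return spark.createDataFrame([(v,) for v in (values or ["foo"])], ["value"])
    return _generate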
Example #2
def save(dataframe, bucket, prefix, mode, version, start_ds):
    """Write dataframe to an s3 location and generate a manifest

    :dataframe DataFrame: rollup data
    :bucket str: s3 bucket
    :prefix str: s3 prefix
    :mode str: either `daily` or `monthly`
    :version int: version of the rollup
    :start_ds str: yyyymmdd
    """

    # parse the start date that names the save location
    start_date = arrow.get(start_ds, "YYYYMMDD")

    # select the relevant fields
    select_expr = [
        F.lit(start_date.format("YYYY-MM-DD")),
        "search_provider",
        "search_count",
        "country",
        "locale",
        "distribution_id",
        "default_provider",
        "profile_count",
        "profile_share",  # only for daily
        F.lit(start_date.shift(days=+1).format("YYYY-MM-DD")),
    ]

    # replace mode specific items, like rollup_date
    if mode == "monthly":
        select_expr[0] = F.lit(start_date.format("YYYY-MM"))

        # NOTE: beware of calling remove when there are Column elements in the
        # array because boolean operations are overloaded for dataframes.
        shares_index = list(map(str, select_expr)).index("profile_share")
        del select_expr[shares_index]

    key = ("{}/{}/processed-{}.csv".format(prefix, mode,
                                           start_date.format("YYYY-MM-DD")))

    # persist the dataframe to disk
    logging.info("Writing dataframe to {}/{}".format(bucket, key))
    utils.write_csv_to_s3(dataframe.select(select_expr),
                          bucket,
                          key,
                          header=False)

    csv_paths = get_csv_locations(bucket, key)

    # write the manifest to disk
    write_manifest(bucket, prefix, mode, version, start_ds, csv_paths)
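Neither get_csv_locations nor write_manifest is shown in this listing. A minimal sketch of both, assuming the data may be split across several part files and that the manifest is a plain-text object listing one s3:// path per line (the function names and signatures come from the code above; the bodies are assumptions):

import boto3

def get_csv_locations(bucket, prefix):
    # collect the full s3:// path of every CSV object under the prefix
    s3 = boto3.resource("s3")
    return [
        "s3://{}/{}".format(bucket, obj.key)
        for obj in s3.Bucket(bucket).objects.filter(Prefix=prefix)
        if obj.key.endswith(".csv")
    ]

def write_manifest(bucket, prefix, mode, version, start_ds, csv_paths):
    # hypothetical manifest layout: one path per line, stored next to the data
    key = "{}/{}/manifest-v{}-{}.txt".format(prefix, mode, version, start_ds)
    body = "\n".join(csv_paths).encode("utf-8")
    boto3.resource("s3").Object(bucket, key).put(Body=body)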
Example #3
def test_write_csv_to_s3_existing(generate_data):
    bucket = "test-bucket"
    key = "test.csv"

    conn = boto3.resource("s3", region_name="us-west-2")
    conn.create_bucket(Bucket=bucket)

    utils.write_csv_to_s3(generate_data(["foo"]), bucket, key)
    utils.write_csv_to_s3(generate_data(["foo", "bar"]), bucket, key)

    body = conn.Object(bucket, key).get()["Body"].read().decode("utf-8")

    # the second write overwrites the existing object: header + 2 rows = 3 lines
    assert len(body.rstrip().split("\n")) == 3
Example #4
def test_write_csv_to_s3(generate_data):
    bucket = "test-bucket"
    key = "test.csv"

    conn = boto3.resource("s3", region_name="us-west-2")
    conn.create_bucket(
        Bucket=bucket,
        CreateBucketConfiguration={
            "LocationConstraint": "us-west-2",
        },
    )

    utils.write_csv_to_s3(generate_data(["foo"]), bucket, key)

    body = conn.Object(bucket, key).get()["Body"].read().decode("utf-8")

    # header + 1x row = 2
    assert len(body.rstrip().split("\n")) == 2
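Every example funnels through utils.write_csv_to_s3, which is never shown. A minimal sketch of an implementation consistent with the tests, assuming a Spark DataFrame that is coalesced to a single partition, written locally, and uploaded under key (the signature matches the calls above; the body is an assumption):

import os
import tempfile

import boto3

def write_csv_to_s3(dataframe, bucket, key, header=True):
    with tempfile.TemporaryDirectory() as path:
        # Spark writes a directory of part files; collapse to a single one first
        dataframe.coalesce(1).write.csv(path, mode="overwrite", header=header)
        part = next(
            f for f in os.listdir(path)
            if f.startswith("part-") and f.endswith(".csv")
        )
        # uploading to an existing key replaces it, which is what
        # test_write_csv_to_s3_existing relies on
        boto3.resource("s3").Bucket(bucket).upload_file(os.path.join(path, part), key)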
Example #5
def write_dashboard_data(df, bucket, prefix, mode):
    """Write the dashboard data to an s3 location."""
    # name of the output key
    key = "{}/topline-{}.csv".format(prefix, mode)
    utils.write_csv_to_s3(df, bucket, key)
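The helper only derives the output key from its arguments. A call for the daily topline might look like this (the bucket and prefix values are placeholders, not taken from the source):

write_dashboard_data(df, "telemetry-dashboard", "topline", "daily")
# -> s3://telemetry-dashboard/topline/topline-daily.csv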