def test_get_histograms(spark_co):
    """Fill histograms from a Spark dataframe and compare them to JSON fixtures.

    The expected histograms live as dicts on the ``pytest`` module (set up by
    conftest); their ``name`` fields are patched here to match the byte-string
    column names that histogrammar emits for Spark columns.
    """
    # (fixture dict, expected histogram name) pairs to patch before comparing.
    named_fixtures = [
        (pytest.age, "b'age'"),
        (pytest.company, "b'company'"),
        (pytest.eyesColor, "b'eyeColor'"),
        (pytest.gender, "b'gender'"),
        (pytest.isActive, "b'isActive'"),
        (pytest.latitude, "b'latitude'"),
        (pytest.longitude, "b'longitude'"),
        (pytest.transaction, "b'transaction'"),
    ]
    for fixture, name in named_fixtures:
        fixture["data"]["name"] = name

    pytest.latitude_longitude["data"]["name"] = "b'latitude:longitude'"
    pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

    spark_df = spark_co.createDataFrame(pytest.test_df)

    spark_filler = SparkHistogrammar(
        features=[
            "date",
            "isActive",
            "age",
            "eyeColor",
            "gender",
            "company",
            "latitude",
            "longitude",
            ["isActive", "age"],
            ["latitude", "longitude"],
            "transaction",
        ],
        bin_specs={
            "transaction": {"num": 100, "low": -2000, "high": 2000},
            "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
            "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
        },
        read_key="input",
        store_key="output",
    )

    # Exercise the direct get_histograms() entry point.
    current_hists = spark_filler.get_histograms(spark_df)

    expectations = [
        ("age", pytest.age),
        ("company", pytest.company),
        ("eyeColor", pytest.eyesColor),
        ("gender", pytest.gender),
        ("latitude", pytest.latitude),
        ("longitude", pytest.longitude),
        ("transaction", pytest.transaction),
    ]
    for feature, expected in expectations:
        assert current_hists[feature].toJson() == expected
# Beispiel #2 (score: 0) — separator left over from the example listing this
# file was scraped from; kept as a comment so the module parses.
def test_get_histograms_timestamp(spark_co):
    """Check sparse binning of a timestamp column against a reference JSON dict."""
    from pyspark.sql.functions import to_timestamp

    # 5x 2018-12-10, 4x 2018-12-17, 1x 2018-12-19 — ten rows total.
    data_date = (
        ["2018-12-10 00:00:00"] * 5
        + ["2018-12-17 00:00:00"] * 4
        + ["2018-12-19 00:00:00"]
    )

    sdf = spark_co.createDataFrame(
        pd.DataFrame(data_date, columns=['dt'])
    ).withColumn("dt", to_timestamp("dt", "yyyy-MM-dd HH:mm:ss"))

    # Expected SparselyBin histogram JSON (bin width = 30 days in nanoseconds).
    expected = {
        'data': {
            'binWidth': 2592000000000000.0,
            'bins': {
                '108': 9.0,
                '109': 1.0
            },
            'bins:type': 'Count',
            'entries': 10.0,
            'name': "b'dt'",
            'nanflow': 0.0,
            'nanflow:type': 'Count',
            'origin': 1.2625632e+18
        },
        'type': 'SparselyBin',
        'version': '1.0'
    }

    current_hists = SparkHistogrammar(features=['dt']).get_histograms(sdf)
    assert current_hists['dt'].toJson() == expected
# Beispiel #3 (score: 0) — separator left over from the example listing this
# file was scraped from; kept as a comment so the module parses.
def test_get_histograms_no_transaction(spark_co):
    """Fill histograms from a Spark dataframe (base features, no 'transaction').

    NOTE(review): this function was originally also named
    ``test_get_histograms``, which redefined the earlier test of the same name
    in this module — pytest then collected only this copy and silently never
    ran the first one (flake8 F811). Renamed so both tests are collected.
    """
    # Patch the expected-histogram fixtures (dicts stored on the ``pytest``
    # module by conftest) with histogrammar's byte-string column names.
    pytest.age["data"]["name"] = "b'age'"
    pytest.company["data"]["name"] = "b'company'"
    pytest.eyesColor["data"]["name"] = "b'eyeColor'"
    pytest.gender["data"]["name"] = "b'gender'"
    pytest.isActive["data"]["name"] = "b'isActive'"
    pytest.latitude["data"]["name"] = "b'latitude'"
    pytest.longitude["data"]["name"] = "b'longitude'"

    pytest.latitude_longitude["data"]["name"] = "b'latitude:longitude'"
    pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

    spark = spark_co
    spark_df = spark.createDataFrame(pytest.test_df)

    spark_filler = SparkHistogrammar(
        features=[
            "date",
            "isActive",
            "age",
            "eyeColor",
            "gender",
            "company",
            "latitude",
            "longitude",
            ["isActive", "age"],
            ["latitude", "longitude"],
        ],
        bin_specs={
            "longitude": {"bin_width": 5.0, "bin_offset": 0.0},
            "latitude": {"bin_width": 5.0, "bin_offset": 0.0},
        },
        read_key="input",
        store_key="output",
    )

    # test get_histograms() function call
    current_hists = spark_filler.get_histograms(spark_df)
    assert current_hists["age"].toJson() == pytest.age
    assert current_hists["company"].toJson() == pytest.company
    assert current_hists["eyeColor"].toJson() == pytest.eyesColor
    assert current_hists["gender"].toJson() == pytest.gender
    assert current_hists["latitude"].toJson() == pytest.latitude
    assert current_hists["longitude"].toJson() == pytest.longitude
def test_get_histograms_timestamp_variant(spark_co):
    """Check sparse binning of a timestamp column against a reference JSON dict.

    NOTE(review): this function was originally also named
    ``test_get_histograms_timestamp``, which redefined the earlier test of the
    same name in this module — pytest then collected only this copy and
    silently never ran the first one (flake8 F811). Renamed so both run.
    """
    from pyspark.sql.functions import to_timestamp

    # 5x 2018-12-10, 4x 2018-12-17, 1x 2018-12-19 — ten rows total.
    data_date = [
        "2018-12-10 00:00:00",
        "2018-12-10 00:00:00",
        "2018-12-10 00:00:00",
        "2018-12-10 00:00:00",
        "2018-12-10 00:00:00",
        "2018-12-17 00:00:00",
        "2018-12-17 00:00:00",
        "2018-12-17 00:00:00",
        "2018-12-17 00:00:00",
        "2018-12-19 00:00:00",
    ]

    df = pd.DataFrame(data_date, columns=["dt"])
    sdf = spark_co.createDataFrame(df).withColumn(
        "dt", to_timestamp("dt", "yyyy-MM-dd HH:mm:ss"))

    # Expected SparselyBin histogram JSON (bin width = 30 days in nanoseconds).
    expected = {
        "data": {
            "binWidth": 2592000000000000.0,
            "bins": {
                "108": 9.0,
                "109": 1.0
            },
            "bins:type": "Count",
            "entries": 10.0,
            "name": "b'dt'",
            "nanflow": 0.0,
            "nanflow:type": "Count",
            "origin": 1.2625632e18,
        },
        "type": "SparselyBin",
        "version": "1.0",
    }
    filler = SparkHistogrammar(features=["dt"])
    current_hists = filler.get_histograms(sdf)
    assert current_hists["dt"].toJson() == expected