Ejemplo n.º 1
0
def test_reduce_1():
    reductions = ['sum', 'mean', 'std', 'var', 'prod']
    for red in reductions:
        schema = Schema()
        schema.add_string_column('name')
        schema.add_double_column('amount')
        schema.add_integer_column('hours')

        tp = TransformProcess(schema)
        tp.reduce('name', red)

        tp.to_java()
Ejemplo n.º 2
0
def test_reduce_1():
    reductions = ['sum', 'mean', 'std', 'var', 'prod']
    for red in reductions:
        schema = Schema()
        schema.add_string_column('name')
        schema.add_double_column('amount')
        schema.add_integer_column('hours')

        tp = TransformProcess(schema)
        tp.reduce('name', red)

        tp.to_java()
Ejemplo n.º 3
0
def test_schema():
    schema = Schema()
    schema.add_string_column('str1')
    schema.add_string_column('str2')
    schema.add_integer_column('int1')
    schema.add_integer_column('int2')
    schema.add_double_column('dbl1')
    schema.add_double_column('dbl2')
    schema.add_float_column('flt1')
    schema.add_float_column('flt2')
    schema.add_categorical_column('cat1', ['A', 'B', 'C'])
    schema.add_categorical_column('cat2', ['A', 'B', 'C'])
    schema.to_java()
Ejemplo n.º 4
0
def test_schema():
    schema = Schema()
    schema.add_string_column('str1')
    schema.add_string_column('str2')
    schema.add_integer_column('int1')
    schema.add_integer_column('int2')
    schema.add_double_column('dbl1')
    schema.add_double_column('dbl2')
    schema.add_float_column('flt1')
    schema.add_float_column('flt2')
    schema.add_categorical_column('cat1', ['A', 'B', 'C'])
    schema.add_categorical_column('cat2', ['A', 'B', 'C'])
    schema.to_java()
Ejemplo n.º 5
0
# Basic example

from pydatavec import Schema, TransformProcess
from pydatavec import NotInSet, LessThan

# Let's define the schema of the data that we want to import
# The order in which columns are defined here should match the order in which they appear in the input data

input_schema = Schema()

input_schema.add_string_column("DateTimeString")
input_schema.add_string_column("CustomerID")
input_schema.add_string_column("MerchantID")

input_schema.add_integer_column("NumItemsInTransaction")

input_schema.add_categorical_column("MerchantCountryCode",
                                    ["USA", "CAN", "FR", "MX"])

# Some columns have restrictions on the allowable values, that we consider valid:

input_schema.add_double_column(
    "TransactionAmountUSD", 0.0, None, False,
    False)  # $0.0 or more, no maximum limit, no NaN and no Infinite values

input_schema.add_categorical_column("FraudLabel", ["Fraud", "Legit"])

# Lets define some operations to execute on the data...
# We do this by defining a TransformProcess
# At each step, we identify column by the name we gave them in the input data schema, above