def predict(data):
    """Apply the saved feature-engineering pipeline to *data* and score it.

    The dataframe is transformed step by step (NA imputation, elapsed-time
    feature, log transforms, rare-label grouping, categorical encoding),
    then scaled with the persisted scaler and passed to the persisted model.

    Returns the model predictions.
    """
    # Fill missing values: categoricals get a literal 'Missing' label,
    # the numerical variable gets the mode captured at training time.
    for feature in config.CATEGORICAL_TO_IMPUTE:
        data[feature] = pf.impute_na(data, feature, replacement='Missing')

    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data,
        config.NUMERICAL_TO_IMPUTE,
        replacement=config.LOTFRONTAGE_MODE,
    )

    # Replace the raw year with years elapsed relative to the sale year.
    data[config.YEAR_VARIABLE] = pf.elapsed_years(
        data, config.YEAR_VARIABLE, ref_var='YrSold')

    # Log-transform the skewed numerical features.
    for feature in config.NUMERICAL_LOG:
        data[feature] = pf.log_transform(data, feature)

    # Collapse infrequent categories into a single group, using the
    # frequent-label lists learned during training.
    for feature in config.CATEGORICAL_ENCODE:
        data[feature] = pf.remove_rare_labels(
            data, feature, config.FREQUENT_LABELS[feature])

    # Map categories to numbers with the training-time mappings.
    for feature in config.CATEGORICAL_ENCODE:
        data[feature] = pf.encode_categorical(
            data, feature, config.ENCODING_MAPPINGS[feature])

    # Scale the final feature set with the persisted scaler, then score.
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
# --- Example 2 ---
def predict(df):
    """Transform *df* with the feature pipeline and return model predictions.

    Steps mirror the training pipeline: time-part extraction, log
    transforms, string casts, cardinality reduction, one-hot encoding,
    scaling, and finally scoring with the persisted model.
    """
    # NOTE(review): removed two leftover debug prints of df.shape
    # (one was an exact duplicate of the other).
    df = pf.extract_time(df)
    df = pf.log_transform(df, config.LOG_VARS)
    df = pf.to_str(df, config.VAR_TO_STR)

    # NOTE(review): the reference frame for cardinality reduction is the
    # incoming df itself; at predict time this should probably reuse the
    # training data / stored groupings — verify pf.reduce_cardinality.
    df = pf.reduce_cardinality(df, df)
    df = pf.cat_to_str(df)

    # HACK: the one-hot encoder is re-fit on the prediction data instead
    # of being loaded from the training run, so its columns may not line
    # up with the model's training features. Behavior kept; flagged for
    # follow-up.
    encoder = ce.OneHotEncoder(use_cat_names=True)
    df = encoder.fit_transform(df)

    # Scale the selected features with the persisted scaler, then score.
    df = pf.scale_features(df[config.FEATURES], config.OUTPUT_SCALER_PATH)
    predictions = pf.predict(df, config.OUTPUT_MODEL_PATH)

    return predictions
# --- Example 3 ---
def predict(data):
    """Run the saved preprocessing + model pipeline on *data*.

    Mirrors the training-time feature engineering — missing-value
    imputation, elapsed-years feature, log transforms, rare-label
    grouping, categorical encoding and scaling — then returns the
    scores of the persisted model.
    """
    # Impute missing data.
    for column in config.CATEGORICAL_TO_IMPUTE:
        data[column] = pf.impute_na(data, column, replacement='Missing')

    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data, config.NUMERICAL_TO_IMPUTE,
        replacement=config.LOTFRONTAGE_MODE)

    # Elapsed time relative to the sale year.
    data[config.YEAR_VARIABLE] = pf.elapsed_years(
        data, config.YEAR_VARIABLE, ref_var='YrSold')

    # Logarithmic transformation of numerical variables.
    for column in config.NUMERICAL_LOG:
        data[column] = pf.log_transform(data, column)

    # Group labels that were infrequent in the training data.
    for column in config.CATEGORICAL_ENCODE:
        data[column] = pf.remove_rare_labels(
            data, column, config.FREQUENT_LABELS[column])

    # Encode categorical variables with the training mappings.
    for column in config.CATEGORICAL_ENCODE:
        data[column] = pf.encode_categorical(
            data, column, config.ENCODING_MAPPINGS[column])

    # Scale the model features with the persisted scaler.
    scaled = pf.scale_features(data[config.FEATURES],
                               config.OUTPUT_SCALER_PATH)

    # Obtain predictions from the persisted model.
    predictions = pf.predict(scaled, config.OUTPUT_MODEL_PATH)

    return predictions
# --- Example 4 ---
# Training-time feature engineering: mutates the module-level X_train
# dataframe in place, one preprocessing step at a time.

# Impute categorical variables: missing entries become the 'Missing' label.
for var in config.CATEGORICAL_TO_IMPUTE:
    X_train[var] = pf.impute_na(X_train, var, replacement='Missing')

# Impute numerical variables with the precomputed LotFrontage mode.
X_train[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    X_train, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

# Elapsed time: replace the raw year with years relative to 'YrSold'.
X_train[config.YEAR_VARIABLE] = pf.elapsed_years(X_train,
                                                 config.YEAR_VARIABLE,
                                                 ref_var='YrSold')

# Log-transform the configured numerical variables.
for var in config.NUMERICAL_LOG:
    X_train[var] = pf.log_transform(X_train, var)

# Group categories that were infrequent in the training data.
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])

# Encode categorical variables with the training mappings.
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.encode_categorical(X_train, var,
                                         config.ENCODING_MAPPINGS[var])

# Fit the scaler on the selected features and persist it to disk.
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)

# scale variables
# NOTE(review): the scaling step itself appears truncated here — the
# scaler is trained above but never applied in this chunk; confirm the
# pf.scale_features call exists downstream.
# --- Example 5 ---
# Compute trip duration from pickup/dropoff timestamps, then drop rows
# with zero or negative durations.
train = pf.trip_length(train, 'tpep_pickup_datetime', 'tpep_dropoff_datetime')

train = pf.remove_zero_or_neg_time(train, 'trip_seconds')

# Divide data into train, val, test splits by dropoff date.
# NOTE(review): 'train' is re-assigned here from 'df', which appears to
# discard the trip_length/duration filtering applied above unless 'df'
# already carries those columns — verify the intended statement order.
train = df[df.tpep_dropoff_datetime <= pd.to_datetime('2017-06-30')]
test = df[df.tpep_dropoff_datetime >= pd.to_datetime('2017-11-01')]

# June 2017 becomes validation; everything before June stays as train.
val = train[train.tpep_dropoff_datetime >= pd.to_datetime('2017-06-01')]
train = train[train.tpep_dropoff_datetime < pd.to_datetime('2017-06-01')]

# Continue preprocessing on the training split only.

# Extract time-part features from the datetime columns.
train = pf.extract_time(train)

# Log-transform the configured variables.
train = pf.log_transform(train, config.LOG_VARS)

# Cast the configured variables to strings.
train = pf.to_str(train, config.VAR_TO_STR)

# Reduce cardinality of categoricals, using train itself as reference.
train = pf.reduce_cardinality(train, train)

train = pf.cat_to_str(train)

# One-hot encode the categorical columns.
# NOTE(review): the fitted encoder is not persisted in this chunk; at
# predict time the same fitted encoder should be reused — confirm.
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(train)

# y_train: extract the target column.
# NOTE(review): the target is still present in 'train' when the encoder
# is fit above — confirm it is excluded from the model features.
y_train = train[config.TARGET]

# Fit the scaler on the model features and persist it to disk.
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)