Example #1
def get_feature_distance_matrix():
    getter = FeatureGetter(feature_names)
    # agg = FeatureAgglomeration(n_clusters=12)
    categorical_mapping = load_categorical_mapping()
    categorical_transform = CategoricalTransformer(
        feature_names=getter.feature_names, mapping=categorical_mapping)
    x, _ = load_training_data(as_numpy=True)
    x = categorical_transform.transform(x)
    imputer = SimpleImputer(strategy='most_frequent')
    x = imputer.fit_transform(x)
    distance_matrix = []
    for row_feature_name in getter.feature_names:
        row = []
        for col_feature_name in getter.feature_names:
            if row_feature_name == col_feature_name:
                row.append(1.0)
            else:
                row_data = getter.get_column(row_feature_name, x)
                col_data = getter.get_column(col_feature_name, x)
                row_data = row_data.astype(np.float64)
                col_data = col_data.astype(np.float64)
                if np.isnan(row_data).any() or np.isnan(col_data).any():
                    # skipping the append here would leave this row short and
                    # the matrix ragged, so record "no relation" instead
                    row.append(0.0)
                    continue
                corr: np.ndarray = np.corrcoef(row_data, col_data)
                # Pearson r lies in [-1, 1]: 0 means no linear relation,
                # -1 or 1 a perfect correlation
                pearson = corr[0, 1]
                if np.isnan(pearson):
                    pearson = 0.0
                row.append(pearson)
        distance_matrix.append(row)
    return np.array(distance_matrix)
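Despite its name, the function returns a Pearson correlation matrix (diagonal 1.0), not distances. To feed it into the commented-out FeatureAgglomeration idea, one hedged sketch converts correlations to distances first; the linkage choice is an assumption, and n_clusters=12 simply echoes the commented line:

# Sketch only: turn |r| into a distance (0 = perfectly correlated) and
# cluster the features on it. 'metric' is the scikit-learn >= 1.2 spelling;
# older releases call the same parameter 'affinity'.
import numpy as np
from sklearn.cluster import FeatureAgglomeration

corr = get_feature_distance_matrix()
distances = 1.0 - np.abs(corr)
agg = FeatureAgglomeration(n_clusters=12, metric='precomputed', linkage='average')
agg.fit(distances)
print(agg.labels_)  # one cluster id per feature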
Example #2
def build_train_and_submit(core, name):
    model = build_model_of(core)
    score = train_and_evaluate_model(model)
    print(score)
    x, y = load_training_data(as_numpy=True)
    model.fit(x, y)
    create_submission(model, name)
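A hedged usage sketch for the helper above; GradientBoostingRegressor as the core is an assumption, while build_model_of and the submission name follow the surrounding examples:

from sklearn.ensemble import GradientBoostingRegressor

# the core estimator is illustrative only; any regressor accepted by
# build_model_of would do
build_train_and_submit(GradientBoostingRegressor(), 'boosted-baseline')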
Example #3
def evaluate_boosted_estimator():
    scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    features, values = load_training_data()
    pipe = build_boosted_estimator()
    pipe.fit(features, values)
    # in-sample predictions, kept only as a smoke test that the pipeline runs;
    # cross_val_score below refits on each fold anyway
    _ = pipe.predict(features)
    results: np.ndarray = cross_val_score(pipe, features, values, scoring=scorer, verbose=4)
    print(-1.0 * np.average(results))
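Because the scorer is built with greater_is_better=False, make_scorer negates the loss so cross-validation can always maximize; the -1.0 * np.average(results) above just flips the sign back. A minimal self-contained illustration of that convention:

from sklearn.dummy import DummyRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

s = make_scorer(mean_absolute_error, greater_is_better=False)
d = DummyRegressor(strategy='mean').fit([[0], [1]], [0.0, 2.0])
print(s(d, [[0], [1]], [0.0, 2.0]))  # -1.0, i.e. the negated MAE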
Example #4
def try_it_out():
    # round-trip the mapping through save/load to check serialization
    mapping = create_categorical_mapping()
    save_categorical_mapping(mapping)
    recovered = load_categorical_mapping()

    features, values = load_training_data()
    tf = CategoricalTransformer(feature_names=features.keys().to_list(),
                                mapping=recovered)
    result = tf.transform(features.to_numpy())
    print(result)
Example #5
def train_and_evaluate_model():
    getter = FeatureGetter()
    model = MyModel(getter=getter,
                    core_type='elastic-net',
                    output_transform='log',
                    imputation_method='basic')
    x, y = load_training_data(as_numpy=True)
    scorer = make_scorer(mean_squared_log_error)
    results = cross_val_score(model, x, y, scoring=scorer, n_jobs=6, verbose=3)
    print(np.average(results))
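If the target metric is root mean squared log error (an assumption about this project, though it is the usual Kaggle house-prices metric), the printed mean MSLE converts with a square root:

# RMSLE from the per-fold MSLE scores above; averaging before the square
# root is one reasonable convention
print(np.sqrt(np.average(results)))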
Example #6
def view_transformed_data():
    features, values = load_training_data()
    pipeline_steps = [
        ('remove_unused_columns', remove_unused_columns_transform()),
        ('convert_to_ordinal', convert_ordinal_columns_transform()),
        ('derive_age', derive_age_transform())
    ]
    pipe = Pipeline(steps=pipeline_steps)
    out = pipe.fit_transform(features)
    print(out)
Example #7
def get_pca():
    features, values = load_training_data()
    imp = SimpleImputer(strategy='most_frequent')
    x = imp.fit_transform(features, values)
    enc = OrdinalEncoder()
    x = enc.fit_transform(x, values)
    pca = PCA(n_components=8)
    out = pca.fit_transform(x, values)
    print(pca.components_)
    print(out)
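PCA on ordinal-encoded, unscaled columns is dominated by whichever features happen to have the largest numeric range. A hedged variant that standardizes first (StandardScaler is my suggestion, not something the project is shown to use here):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# x is the imputed, ordinal-encoded matrix built above
scaled_pca = Pipeline([('scale', StandardScaler()), ('pca', PCA(n_components=8))])
out = scaled_pca.fit_transform(x)
print(scaled_pca.named_steps['pca'].explained_variance_ratio_)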
Example #8
def view_stack_input():
    features, values = load_training_data()
    pipeline_steps = [
        ('remove_unused_columns', remove_unused_columns_transform()),
        ('convert_to_ordinal', convert_ordinal_columns_transform()),
        ('derive_age', derive_age_transform()),
        ('extract_predictions', extract_nearest_neighbors_prediction_transform())
    ]
    pipe = Pipeline(steps=pipeline_steps)
    res = pipe.fit_transform(features, values)
    print(res)
Example #9
def grid_search_model():
    # greater_is_better=False is required: GridSearchCV maximizes the score,
    # so without it the search would pick the parameters with the *largest* MSLE
    scorer = make_scorer(mean_squared_log_error, greater_is_better=False)
    x, y = load_training_data(as_numpy=True)
    model = MyModel()
    grid_params = {
        "core_type": ["linear-regression", "elastic-net"],
        "output_transform": ["none", "log"]
    }
    gs = GridSearchCV(model, grid_params, scoring=scorer, n_jobs=6, verbose=4)
    gs.fit(x, y)
    print("best score: ", gs.best_score_)
    print("best parameters: ", gs.best_params_)
Example #10
def search_nearest_neighbors_parameters():
    features, values = load_training_data()
    scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

    param_grid = {
        "estimator__area_weight": [0.005, 0.01, 0.05],
    }
    pipe = build_nearest_neighbors_pipeline()
    searcher = GridSearchCV(estimator=pipe, scoring=scorer, param_grid=param_grid, verbose=4, n_jobs=6)
    searcher.fit(features, values)
    print("best parameters: ", searcher.best_params_)
    print("best score: ", absolute_value(searcher.best_score_))
    print("done")
Example #11
def create_categorical_mapping(threshold: int = 30) -> CategoricalMapping:
    features: pd.DataFrame
    values: pd.DataFrame
    features, values = load_training_data()
    tmp = {}
    for k in features.keys():
        feat: pd.Series = features[k]
        if feat.dtype == float:
            continue
        if feat.unique().size > threshold:
            continue
        ordinal = assign_ordinal(feat, values)  # avoid shadowing the ord builtin
        tmp[k] = CategoricalMapper(ordinal)
    return CategoricalMapping(tmp)
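assign_ordinal is a project helper not shown on this page; one plausible shape, offered purely as an assumption, ranks each category by its mean target value so the resulting ordinals are monotone in price:

import pandas as pd

def assign_ordinal_sketch(feat: pd.Series, values: pd.Series) -> dict:
    # hypothetical stand-in for the project's assign_ordinal; values is
    # assumed to be a 1-d series of sale prices aligned with feat
    means = values.groupby(feat).mean().sort_values()
    return {category: rank for rank, category in enumerate(means.index)}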
Example #12
def get_chi2_scores():
    features, values = load_training_data()
    names = features.keys()
    imp = SimpleImputer(strategy='most_frequent')
    x = imp.fit_transform(features, values)
    enc = OrdinalEncoder()
    x = enc.fit_transform(x, values)
    c2, pval = chi2(x, values)
    out = []
    for name, p in zip(names, pval):
        out.append(FeatureAndValue(name, p))
    sort_feature_and_value(out)
    return out
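chi2 expects non-negative features and a categorical target, so its p-values against a continuous SalePrice are questionable. A hedged variant that swaps in a regression score (the swap to f_regression is my suggestion, not the project's code):

import numpy as np
from sklearn.feature_selection import f_regression

# same x and values as above; f_regression returns F statistics and
# p-values suited to a continuous target
f_stat, pval = f_regression(x, np.ravel(values))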
Example #13
def identify_outlier_rows():
    getter = FeatureGetter()
    model = MyModel(getter=getter,
                    core_type='elastic-net',
                    output_transform='none',
                    imputation_method='basic',
                    scaling='none')
    x, y = load_training_data(as_numpy=True, no_shuffle=True)
    model.fit(x, y)
    yp = model.predict(x)
    dev = np.abs(yp - y)
    avg_dev = np.average(dev)
    # np.where returns a tuple of index arrays; take the first and shift from
    # 0-based numpy positions to the dataset's 1-based Id numbering
    outliers = np.where(dev > 4.0 * avg_dev)[0] + 1
    print(outliers)
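An alternative that avoids the manual +1: look the flagged rows up through the dataframe index (assuming, as in the other examples, that load_training_data without as_numpy returns a frame whose index carries the original Id numbering):

# map 0-based numpy positions back to dataset ids via the frame's index
frame, _ = load_training_data(no_shuffle=True)
print(frame.index[np.where(dev > 4.0 * avg_dev)[0]].tolist())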
Example #14
def produce_prepped_dataset():
    output_names = [
        "neighborhood", "zone", "style", "area", "beds", "baths", "build",
        'sold', 'age', 'cars', 'kitchen', 'overall', 'lotarea', 'saleprice'
    ]

    features, values = load_training_data(no_shuffle=True)
    indices = features.index
    input_names = features.keys().tolist()
    x = features.to_numpy()
    tf = HomeTransformer(input_names)
    out = tf.transform(x)
    t = np.concatenate([out.T, [values]]).T
    df = pd.DataFrame(t, columns=output_names, index=indices)
    df = df.round(decimals=3)  # round returns a new frame, so reassign
    df.to_csv('../data/prepped.csv', header=True)
Example #15
def grid_search_model():
    scorer = make_scorer(mean_squared_log_error)
    x, y = load_training_data(as_numpy=True)
    reg1 = LinearRegression()
    reg2 = KNeighborsRegressor()
    estimators = [("linear", reg1), ("neighbors", reg2)]
    reg3 = StackingRegressor(estimators=estimators,
                             passthrough=True,
                             final_estimator=RidgeCV())
    core = GradientBoostingRegressor(init=reg3)
    model = build_model_of(core)
    results = cross_val_score(model, x, y, scoring=scorer)
    t = np.average(results)
    print(t)
    model.fit(x, y)
    save_model(model, 'stacked-and-boosted')
    create_submission(model, 'stacked-and-boosted')
Example #16
def run_model():
    getter = FeatureGetter()
    model = MyModel(getter=getter,
                    core_type='linear-regression',
                    output_transform='none',
                    imputation_method='basic',
                    scaling='none')
    x, y = load_training_data(as_numpy=True,
                              remove_indices=[
                                  5, 14, 59, 67, 191, 219, 463, 504, 569, 589,
                                  609, 633, 689, 729, 775, 1325, 1424
                              ])
    scorer = make_scorer(mean_squared_log_error)
    results = cross_val_score(model, x, y, scoring=scorer, n_jobs=6, verbose=3)
    print(np.average(results))
    model.fit(x, y)
    y2 = model.predict(x)
    yd = np.abs(y2 - y)
    print(np.average(yd))
    save_model(model, 'none')
    create_submission(model, 'none')
Example #17
def experiment3():

    low_variance_features = load_low_variance_feature_names()
    high_missing_value_features = load_features_with_high_missing_values()
    combined = combine_as_set(low_variance_features,
                              high_missing_value_features)

    neighborhood_transform = NeighborhoodTransform()
    features, values = load_training_data()
    names = features.keys().tolist()
    retained_names = remove_as_set(names, combined)
    x = features.to_numpy()
    y = values.to_numpy()
    pipe = Pipeline(steps=[
        ('retain',
         ColumnTransformer(transformers=[('retain', 'passthrough',
                                          indexes_of(names, retained_names))],
                           remainder='drop')),
        ("transform_neighborhoods",
         ColumnMapper(column_ref=index_of(retained_names, "Neighborhood"),
                      transform_function=neighborhood_transform.
                      transform_neighborhood_to_ordinal)),
        ('transform_zones',
         ColumnMapper(column_ref=index_of(retained_names, "MSZoning"),
                      transform_function=zoning_to_ordinal)),
        ('transform_styles',
         ColumnMapper(column_ref=index_of(retained_names, "HouseStyle"),
                      transform_function=house_style_ordinal)),
        ('transform_kitchen_quality',
         ColumnMapper(column_ref=index_of(retained_names, "KitchenQual"),
                      transform_function=map_kitchen_quality)),
        ('map_functional',
         ColumnMapper(column_ref=index_of(retained_names, "Functional"),
                      transform_function=map_functional)),
        # ('remove_low_variance_features', ColumnTransformer(transformers=[
        #     ('', preprocessing.StandardScaler(), [index_of(names, "KitchenQual")])
        # ]))
    ])
    out = pipe.fit_transform(x, y)
    print(out)
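ColumnMapper is a project class not shown on this page; a minimal sketch of what it might look like, assuming it applies a function element-wise to a single column and passes everything else through:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnMapperSketch(BaseEstimator, TransformerMixin):
    """Hypothetical stand-in for the project's ColumnMapper."""

    def __init__(self, column_ref, transform_function):
        self.column_ref = column_ref
        self.transform_function = transform_function

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = np.array(X, dtype=object)  # copies, so the caller's array is untouched
        X[:, self.column_ref] = [self.transform_function(v)
                                 for v in X[:, self.column_ref]]
        return X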
Example #18
def train_and_evaluate_model(model):
    x, y = load_training_data(as_numpy=True)
    scorer = make_scorer(mean_squared_log_error)
    results = cross_val_score(model, x, y, scoring=scorer)
    return np.average(results)
Example #19
def view_nearest_neighbors_output():
    features, values = load_training_data()
    pipe = build_nearest_neighbors_pipeline()
    out = pipe.fit_transform(features, values)
    print(out)
Example #20
import os
import numpy as np
import pandas as pd
from home_prices.dataload import load_training_data
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
import math

features, values = load_training_data()
# drop a handful of extreme-size outliers before plotting
ok = features['GrLivArea'] + features['TotalBsmtSF'] < 6000
features = features[ok]
values = values[ok]


def get_neighborhood_colors(column: str = "Neighborhood"):
    # Could also use column='MSZoning'
    x: pd.Series = features[column]
    enc = OrdinalEncoder()
    colors = enc.fit_transform(x.to_numpy().reshape(-1, 1))
    return colors


def get_prices(take_log: bool = False, thousands: bool = False):
    y = values
    if thousands:
        y = y / 1000.0
    if take_log:
        y = np.log(y)
    return y
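A hedged usage sketch tying the two helpers together in a scatter plot; the colormap and styling are assumptions, and values is assumed to be a 1-d price series:

colors = get_neighborhood_colors()
plt.scatter(features['GrLivArea'], get_prices(thousands=True),
            c=colors.ravel(), cmap='tab20', s=8)
plt.xlabel('GrLivArea (sq ft)')
plt.ylabel('SalePrice (thousands)')
plt.show()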
Example #21
def evaluate_nearest_neighbors_pipeline():
    scorer = make_scorer(mean_squared_log_error, greater_is_better=False)
    features, values = load_training_data()
    pipe = build_nearest_neighbors_pipeline()
    results: np.ndarray = cross_val_score(pipe, features, values, scoring=scorer, verbose=4)
    print(-1.0 * np.average(results))
Example #22
def print_missing_values():
    features, values = load_training_data()
    mvs = count_missing_values(features)
    sort_feature_and_value(mvs)
    print_feature_values(mvs)
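count_missing_values is another project helper; one plausible implementation, purely an assumption, pairs each column name with its NaN count using the FeatureAndValue type seen in Example #12:

import pandas as pd

def count_missing_values_sketch(features: pd.DataFrame):
    # hypothetical stand-in for the project's count_missing_values
    return [FeatureAndValue(name, int(features[name].isna().sum()))
            for name in features.keys()]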