def main():
    """Build the neural-network model for price group '1' and report on it.

    Loads the train/test split, fills missing values in the training
    features, creates the model from the standardized features, evaluates
    predictions on the first 10000 normalized training rows, logs the model
    statistics and saves the model.
    """
    # Silence matplotlib log records below WARNING.
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    features, features_test, targets, targets_test = get_basic_data_splited_train_test(
        price_groups='1',
        buildings_present='0;1',
        columns_to_omit=values_to_omit_in_basic_data_version,
        random_state=50,
        test_size=0.01,
    )

    features = fill_nan_values(features)
    print(features.columns)
    feature_matrix = features.values
    target_vector = targets.values

    normalized_features = preprocessing.normalize(feature_matrix)

    # NOTE(review): the model is created from standardize(...) output but is
    # evaluated below on preprocessing.normalize(...) output - confirm that
    # this difference in preprocessing is intentional.
    neural_network_model = create_model(standardize(feature_matrix), target_vector)

    # The held-out split is converted to arrays but is not used further here.
    features_test = features_test.values
    targets_test = targets_test.values

    # Evaluate on (at most) the first 10000 training rows.
    evaluation_features = normalized_features[0:10000, :]
    evaluation_targets = target_vector[0:10000]
    predictions = make_prediction(neural_network_model.model, evaluation_features)
    get_result_statistics(predictions, evaluation_targets)

    log_statistics(neural_network_model)
    neural_network_model.save_model()
def main():
    """Fit a depth-10 decision tree on price group '0', score it and export it.

    Author's note: accuracy is around 80%; the model is trained only for
    cheap parcels.
    """
    features_train, features_test, target_train, target_test = get_basic_data_splited_train_test(
        price_groups='0',
        buildings_present='0;1',
        random_state=50,
        test_size=0.2,
    )

    tree = create_decision_tree(
        features_train, features_test, target_train, target_test, max_depth=10
    )

    # Report statistics for the predictions on the held-out split.
    tree_predictions = make_prediction(tree, features_test)
    get_result_statistics(tree_predictions, target_test)

    # The exporter receives the training column names as feature labels.
    feature_names = list(features_train.columns)
    export_decision_tree(tree, feature_names)
def run_sklearn_linear_regression(x_train_set, x_test_set, y_train_set,
                                  y_test_set):
    """Fit a scikit-learn ``LinearRegression`` and report its results.

    The estimator is fitted on the training split, scored on both splits
    (``LinearRegression.score`` is the R^2 coefficient of determination, even
    though the log message calls it "accuracy"), and its test-set predictions
    are handed to ``get_result_statistics``.

    Args:
        x_train_set: Training features.
        x_test_set: Test features.
        y_train_set: Training targets.
        y_test_set: Test targets.

    Returns:
        None. Results are only logged / reported.
    """
    model = LinearRegression()
    model.fit(x_train_set, y_train_set)

    train_score = model.score(x_train_set, y_train_set)
    test_score = model.score(x_test_set, y_test_set)

    logging.info("Train accuracy: %s, Test accuracy: %s",
                 train_score, test_score)
    logging.debug("Coefficient: %s", model.coef_)

    test_predictions = model.predict(x_test_set)
    get_result_statistics(predicted_values=test_predictions,
                          real_values=y_test_set)


if __name__ == '__main__':
    # Script entry point: load the split for price groups '0;1;2' with a 20%
    # test share and run the linear-regression experiment on it.
    data_splits = get_basic_data_splited_train_test(
        price_groups='0;1;2',
        buildings_present='0;1',
        columns_to_omit=values_to_omit_in_basic_data_version,
        test_size=0.2,
    )
    # The loader returns the four parts in this order.
    features_train, features_test, target_train, target_test = data_splits
    run_sklearn_linear_regression(
        features_train, features_test, target_train, target_test
    )
Beispiel #4
0
def run_KNN_regression(x_train, x_test_set, y_train, y_test_set):
    """Fit a 9-neighbour KNN regressor and report test-set statistics.

    An earlier grid search over ``n_neighbors`` in [8, 9, 10, 11] was
    commented out by the author; the value is fixed to 9 here.

    Args:
        x_train: Training features.
        x_test_set: Test features to predict on.
        y_train: Training targets.
        y_test_set: True targets for ``x_test_set``.

    Returns:
        None. Predictions are passed to ``get_result_statistics``.
    """
    knn_regressor = KNeighborsRegressor(n_neighbors=9)
    knn_regressor.fit(x_train, y_train)

    test_predictions = knn_regressor.predict(x_test_set)
    get_result_statistics(
        predicted_values=test_predictions,
        real_values=y_test_set,
    )


def fill_nan_values(df):
    """Impute missing values in ``df`` with each column's most frequent value.

    A new ``SimpleImputer`` is fitted on ``df`` itself on every call, and the
    result of its ``fit_transform`` is returned unchanged.
    """
    # Alternative strategy noted by the author: 'mean'.
    most_frequent_imputer = SimpleImputer(strategy="most_frequent")
    imputed = most_frequent_imputer.fit_transform(df)
    return imputed


if __name__ == '__main__':
    # Script entry point: load the split (with distance features, without the
    # price-group columns) and run the KNN regression experiment.
    x_train, x_test, y_train, y_test = \
        get_basic_data_splited_train_test(price_groups='0;1;2', buildings_present='0;1',
                                          random_state=50, test_size=0.2,
                                          use_distances=True,
                                          columns_to_omit=['Price_Group_int', 'Price_Group_int_second'])
    # Fit the imputer on the training split only and reuse it for the test
    # split. Fitting a second imputer on the test data would fill it with
    # test-set statistics and, because SimpleImputer drops columns that are
    # entirely NaN at fit time, could yield a different set of columns for
    # train and test.
    imputer = SimpleImputer(strategy="most_frequent")
    x_train_filled = imputer.fit_transform(x_train)
    x_test_filled = imputer.transform(x_test)
    run_KNN_regression(x_train_filled, x_test_filled, y_train, y_test)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)
        logging.info("Score: %s", score)
        get_classification_result_statistics(real_values=y_test,
                                             predicted_values=y_pred)


def fill_nan_values(df):
    """Return ``df`` with missing entries replaced by the most frequent value.

    The imputer is created and fitted on ``df`` inside this call, so the
    fill values come from ``df`` itself; the output is whatever
    ``SimpleImputer.fit_transform`` returns.
    """
    # The author also considered strategy='mean'.
    return SimpleImputer(strategy="most_frequent").fit_transform(df)


if __name__ == '__main__':
    # Script entry point: load the split with "Price_Group_int" as the target
    # (and 'Sale_Amount' omitted from the features), impute and normalize the
    # features, then compare the classifiers on them.
    x_train, x_test, y_train, y_test = \
        get_basic_data_splited_train_test(price_groups='0;1;2;', buildings_present='0;1',
                                          random_state=50, test_size=0.2, columns_to_omit=['Sale_Amount'],
                                          use_distances=True, target_column="Price_Group_int")
    # Fit the imputer on the training split only and apply the same fitted
    # imputer to the test split. A separately fitted test imputer would use
    # test-set statistics and, since SimpleImputer drops columns that are
    # entirely NaN at fit time, could produce mismatched feature columns.
    imputer = SimpleImputer(strategy="most_frequent")
    normalized_x_train = normalize(imputer.fit_transform(x_train))
    normalized_x_test = normalize(imputer.transform(x_test))
    compare_classifiers(X_train=normalized_x_train,
                        X_test=normalized_x_test,
                        y_train=y_train,
                        y_test=y_test)
    # An earlier variant called compare_classifiers on the raw (unimputed,
    # unnormalized) x_train / x_test instead.

# normalized_x = preprocessing.normalize(x)
# scalerX = MinMaxScaler()
# scalerX.fit(x)
# standarded_x = scalerX.transform(x)
Beispiel #6
0
import logging
import statsmodels.api as sm
from utils.get_basic_train_data import get_basic_data_splited_train_test
from utils.result_stats import get_result_statistics
from configuration.configuration_constants import values_to_omit_in_basic_data_version


def run_statsmodels_regression(x_train_set, x_test_set, y_train_set,
                               y_test_set):
    """Fit a statsmodels OLS model and report test-set statistics.

    Both feature sets are cast to ``int`` before use. The fitted model's
    summary is logged at INFO level and the test-set predictions are passed
    to ``get_result_statistics``.

    Args:
        x_train_set: Training features (must support ``.astype(int)``).
        x_test_set: Test features (must support ``.astype(int)``).
        y_train_set: Training targets.
        y_test_set: True targets for ``x_test_set``.

    Returns:
        None.
    """
    # NOTE(review): no constant column is added here, and statsmodels' OLS
    # does not add an intercept on its own - confirm this is intended.
    train_features = x_train_set.astype(int)
    fitted = sm.OLS(y_train_set, train_features).fit()
    logging.info(fitted.summary())

    test_features = x_test_set.astype(int)
    test_predictions = fitted.predict(test_features)
    get_result_statistics(
        predicted_values=test_predictions,
        real_values=y_test_set,
    )


if __name__ == '__main__':
    # Script entry point: load the split for price group '2;' without the
    # distance features and run the statsmodels OLS experiment on it.
    data_splits = get_basic_data_splited_train_test(
        price_groups='2;',
        buildings_present='0;1',
        random_state=50,
        test_size=0.2,
        use_distances=False,
    )
    # The loader returns the four parts in this order.
    x_train, x_test, y_train, y_test = data_splits
    run_statsmodels_regression(x_train, x_test, y_train, y_test)
Beispiel #7
0
def draw_heatmap(df):
    """Show a heatmap of the correlation matrix returned by ``df.corr()``.

    Opens a new 15x15 figure, draws the matrix with the "Blues" colour map
    (colour scale capped at 1, square cells) and displays it.
    """
    correlations = df.corr()
    plt.figure(figsize=(15, 15))
    sb.heatmap(correlations, vmax=1, square=True, cmap="Blues")
    plt.show()


# Silence matplotlib log records below WARNING.
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)

# Author's note on the filter values below: 0 = without buildings, 0 = cheap.
x, x_test, y, y_test = \
        get_basic_data_splited_train_test(price_groups='0', buildings_present='0',
                                          columns_to_omit=values_to_omit_in_basic_data_version,
                                          random_state=50, test_size=0.01)

# Load the data again through get_basic_data with the same filters; this
# result is used for the "Sale_Amount" quantiles below.
data_df = get_basic_data(price_groups='0',
                         buildings_present='0',
                         columns_to_omit=values_to_omit_in_basic_data_version)

# Disabled: missing-value filling of data_df.
#data_df = fill_nan_values(data_df)

# Fill missing values in the training features only.
x = fill_nan_values(x)
# NOTE(review): the describe() result is discarded here, so this line has no
# visible effect when run as a script - presumably left over from an
# interactive session.
data_df.describe()

# 1st and 99th percentiles of "Sale_Amount".
q_low = data_df["Sale_Amount"].quantile(0.01)
q_hi = data_df["Sale_Amount"].quantile(0.99)

# NOTE(review): same expression as q_low above (1st percentile) - confirm
# whether a different quantile was intended.
q_low2 = data_df["Sale_Amount"].quantile(0.01)
Beispiel #8
0
    x_train_set.fillna(0)
    y_train_set.fillna(0)
    x_test_set.fillna(0)
    y_test_set.fillna(0)
    # regressor = svm.SVR(kernel='precomputed', max_iter=10)
    # regressor = svm.SVR(kernel='poly', max_iter=1000)
    # INFO:root:Train accuracy: -9.416961286588589e+79, Test accuracy: -1.0148852463254762e+80
    # INFO:root:Train accuracy: -9.416961286588589e+79, Test accuracy: -1.0148852463254762e+80
    # INFO:root:-9.416961286588589e+79
    # INFO:root:-1.0148852463254762e+80
    logging.debug("Model created")

    regressor.fit(x_train_set, y_train_set)

    logging.debug("Fitted")
    train_accuracy = regressor.score(x_train_set, y_train_set)
    test_accuracy = regressor.score(x_test_set, y_test_set)
    logging.info("Train accuracy: %s, Test accuracy: %s", train_accuracy,
                 test_accuracy)
    results = regressor.predict(x_test_set)

    return train_accuracy, test_accuracy


if __name__ == '__main__':
    # Script entry point: load the basic-data-version split for price groups
    # '0;1;2' and run the SVM regression experiment. The accuracies returned
    # by run_svm_regression are not used here.
    data_splits = get_basic_data_splited_train_test(
        price_groups='0;1;2',
        buildings_present='0;1',
        basic_data_version=True,
        random_state=50,
        test_size=0.2,
    )
    # The loader returns the four parts in this order.
    x_train, x_test, y_train, y_test = data_splits
    run_svm_regression(x_train, x_test, y_train, y_test)