Example #1
def go():
    city_col_names, city_data = read_file("../pa5/data/city/data.csv")

    graffiti = city_data[:, 0]
    garbage = city_data[:, 3]

    print("Task 1")
    print("GRAFFITI:", var(graffiti))
    print("GARBAGE:", var(garbage))
    print()
    print()

    print("Task 2")
    b = (np.arange(24)**2).reshape(6, 4)
    task2(b)

    print("Task 3")
    # Regress crime (column 7) on rodents and garbage (columns 2-3).
    print("Rodents, Garbage => Crime",
          linear_regression(city_data[:, 2:4], city_data[:, 7]))
    print()
    print()

    print("Task 4")
    # Regress crime (column 7) on graffiti alone (column 0).
    print("Graffiti => Crime:",
          linear_regression(city_data[:, 0:1], city_data[:, 7]))
    print()
    print()
Example #2
def try_models(X, y, talk=True):
    """ Runs all the available models on the given data and prints errors. """

    model.bayesian_ridge(X, y, errors=talk)
    model.linear_regression(X, y, errors=talk)
    model.lars_lasso(X, y, errors=talk)
    model.lasso(X, y, errors=talk)
    model.ridge_regression(X, y, errors=talk)
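Note: these snippets assume a linear_regression(X, y) that returns fitted coefficients (Example #7 applies the returned beta via model.apply_beta). A minimal sketch of such a function via least squares, assuming X is an (n, k) feature array and y an (n,) target; this is illustrative, not the actual model module:

import numpy as np

def linear_regression(X, y):
    # Prepend an intercept column of ones, then solve X1 @ beta ~= y.
    # lstsq is numerically safer than inverting X1.T @ X1 directly.
    X1 = np.column_stack([np.ones(len(X)), X])
    beta, *_ = np.linalg.lstsq(X1, y, rcond=None)
    return beta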
Example #3
#!/usr/bin/python3

import numpy as np
import cv2
from model import linear_regression
from dataset import dataset
import canva

WINDOWS_NAME = 'regression'
d = dataset()
c = canva.canva(1400, 800)
lr = linear_regression(theta_lenght=2, learning_rate=1)


def rescale(value, mininit, maxinit, minfinal=0, maxfinal=1):
    scaleinit = maxinit - mininit
    scalefinal = maxfinal - minfinal
    return (value - mininit) / (scaleinit) * scalefinal + minfinal


def on_left_click(x, y):
    c.add_point(x, y)
    # Flip y when mapping to data space: screen coordinates grow downward.
    d.add_point(rescale(x, 0, c.width, 0, 1), rescale(y, 0, c.height, 1, 0))


def mouseControl(event, x, y, flags, param):
    if event == cv2.EVENT_LBUTTONDOWN:
        on_left_click(x, y)
    # Other mouse events are ignored (the original called an undefined nothing()).
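The snippet defines mouseControl but stops before wiring it to a window; a plausible setup uses the standard cv2 calls below. The render loop and the c.image attribute are assumptions, not part of the original:

cv2.namedWindow(WINDOWS_NAME)
cv2.setMouseCallback(WINDOWS_NAME, mouseControl)
while True:
    cv2.imshow(WINDOWS_NAME, c.image)  # assumed: canva exposes its pixel buffer
    if cv2.waitKey(20) & 0xFF == 27:   # Esc quits
        break
cv2.destroyAllWindows()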
Example #4
print("the singular values of x are : ")
print(LM_sing)
print()

LM_corr = fsm.correlation(LM)
print("the correlations between inputs and output are : ")
print(LM_corr)

Q = 7
LM_selected = np.concatenate(
    (LM_U[:, :Q].T @ LM[:-1, :], LM[-1, :].reshape(1, index_LM)), axis=0)
T_selected = np.concatenate(
    (LM_U[:, :Q].T @ T[:-1, :],
     T[-1, :].reshape(1, length_dataset - index_LM)), axis=0)

print("""\n# =============================================================================
# linear model
# =============================================================================\n""")
m_lin = model.linear_regression(Q)
m_lin.train(LM_selected)
y_lin = m_lin.evaluate(T_selected) * std_dataset[-1] + mean_dataset[-1]
e_lin = m_lin.error() * std_dataset[-1]

print('linear error =', e_lin)

print("""\n# =============================================================================
# knn model
# =============================================================================\n""")

my_knn = model.knn()
k_opt, error_array_knn = my_knn.meta_find(LM_selected, vm.default)
error_array_knn = error_array_knn * std_dataset[-1]
print("k_opt = {}".format(k_opt))
print("error_array = ")
Example #5
X = pca.fit_transform(X)

################################################################
#split data using train_test_split function
X_train_conv, X_test_conv, y_train_conv, y_test_conv = train_test_split(
    X, y_methyl_mercury, test_size=0.2, random_state=0)
X_train_conv_1, X_test_conv_1, y_train_conv_1, y_test_conv_1 = train_test_split(
    X, y_total_mercury, test_size=0.2, random_state=0)

#train_pct_index = int(0.8*len(X))
#X_train_conv, X_test_conv = X[:train_pct_index], X[train_pct_index:]
#y_train_conv, y_test_conv = y_methyl_mercury[:train_pct_index], y_methyl_mercury[train_pct_index:]

################################################################
# fit the custom regressor and store its test predictions
# (the two positional args appear to be learning rate and iteration count)
regressor = model.linear_regression(0.001, 10)
regressor.fit(X_train_conv, y_train_conv)
predicted = regressor.predict(X_test_conv)

regressor_1 = model.linear_regression(0.001, 10)
regressor_1.fit(X_train_conv_1, y_train_conv_1)
predicted_1 = regressor_1.predict(X_test_conv_1)
#################################################################

# baseline: the same fit using sklearn's LinearRegression
regressor = LinearRegression()
X_train_conv = X_train_conv.astype(np.float64)
y_train_conv = y_train_conv.astype(np.float64)
regressor.fit(X_train_conv, y_train_conv)
X_test_conv = X_test_conv.astype(np.float64)
y_pred_0 = regressor.predict(X_test_conv)
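The excerpt fits both the custom gradient-descent regressor and sklearn's LinearRegression on the same split but stops before scoring them; a minimal comparison via RMSE (the rmse_* names are illustrative):

from sklearn.metrics import mean_squared_error

rmse_custom = np.sqrt(mean_squared_error(y_test_conv, predicted))
rmse_sklearn = np.sqrt(mean_squared_error(y_test_conv, y_pred_0))
print("custom GD RMSE: %.5f" % rmse_custom)
print("sklearn RMSE:   %.5f" % rmse_sklearn)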
Example #6
def main():  # {{{

    path = './data/use'

    # value is None for daily data
    data_common = {
        'currency': None,
        'powder_feed': ['same', 'same'],
        'yellow_bean': ['same'],
        'weather': None
    }
    data_wu = {
        'wu_export': ['divide', 'same'],
        'wu_price_perDate': None,
        # 'wu_price_perMonth': ['same', 'divide'],
    }
    data_chi = {
        'chi_export': ['divide', 'same'],
        'chi_price_perDate': None,
        'chi_price_perMonth': ['same', 'divide'],
        'chi_small_fish': ['divide', 'same']
    }

    d = RowDataHandler()
    for filename, input_method in data_common.items():
        d.add(pd.read_csv(f'{path}/{filename}.csv'), input_method)
    for filename, input_method in data_wu.items():
        d.add(pd.read_csv(f'{path}/wu/{filename}.csv'), input_method)

    # columns: ['date', 'USD', 'CAD', 'SAR', 'AED', 'pf_weight', 'pf_price',
    #           'yb_price', 'temp', 'temp_high', 'temp_low', 'point_temp',
    #           'wu_ex_weight', 'wu_ex_price', 'wu_day_price', 'wu_day_amount']
    merged_data = d.get_merged_data(*d.get_start_end_tick())[[
        'date', 'wu_day_price'
    ]]
    data, time = preprocess(merged_data, 1, 1, 7, 1)
    linear_regression(data['test_x'], data['test_y'])

    # The four runs below differ only in the preprocess window arguments and
    # the model name, so drive them from one configuration list.
    weather_cols = [
        'date', 'temp', 'temp_high', 'temp_low', 'wu_ex_weight', 'wu_ex_price',
        'wu_day_price', 'wu_day_amount'
    ]
    runs = [
        ('weather_7dl', (1, 1, 7, 1)),
        ('weather_14dl', (1, 1, 14, 1)),
        ('weather_4wl', (7, 7, 4, 1)),
        ('weather_4ml', (30, 30, 4, 1)),
    ]
    for name, window_args in runs:
        merged_data = d.get_merged_data(*d.get_start_end_tick())[weather_cols]
        merged_data = merged_data.dropna().reset_index(drop=True)
        data, time = preprocess(merged_data, *window_args)
        train_and_eval_model(name, 'large', **data)
Example #7
def find_R2(x_tr, x_test, y_tr, y_test, y_mean):
    # Fit on the training split, then score R^2 on the held-out split.
    beta = model.linear_regression(x_tr, y_tr)
    yhats = model.apply_beta(beta, x_test)
    r2 = calculate_R2(yhats, y_test, y_mean)
    return r2
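calculate_R2 is not shown; the standard definition it presumably implements is R^2 = 1 - SS_res / SS_tot, with the mean taken from the training target. A sketch under that assumption:

import numpy as np

def calculate_R2(yhats, y_test, y_mean):
    # One minus residual sum of squares over total sum of squares.
    ss_res = np.sum((y_test - yhats) ** 2)
    ss_tot = np.sum((y_test - y_mean) ** 2)
    return 1 - ss_res / ss_tot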
Example #8
    print("Task 1")
    print("GRAFFITI:", var(graffiti))
    print("GARBAGE:", var(garbage))
    print()
    print()


    print("Task 2")
    b = (np.arange(24)**2).reshape(6, 4)
    task2(b)


    print("Task 3")
    # Regress crime (column 7) on rodents and garbage (columns 2-3);
    # merge conflict resolved in favor of the completed call.
    print("Rodents, Garbage => Crime",
          linear_regression(city_data[:, [2, 3]], city_data[:, 7]))
    print()
    print()


    print("Task 4")
    # Regress crime (column 7) on graffiti alone (column 0);
    # merge conflict resolved in favor of the completed call.
    print("Graffiti => Crime:",
          linear_regression(city_data[:, [0]], city_data[:, 7]))
    print()
Example #9
def main():
    data = utils.read_file('./data/new.csv')
    print("There are %d samples in raw data set" % len(data))
    print("Raw input data set information")
    utils.data_info(data)
    utils.missing_info(data, "raw_missing")

    # handle format and garbled text issues in raw data
    data = utils.time_format(data)
    data = utils.garbled_drawing(data)
    data = utils.garbled_floor(data)
    data = utils.garbled_living(data)
    data = utils.garbled_bath(data)
    data = utils.garbled_construct(data)
    data = utils.strange_building(data)

    # drop columns that provide no help
    data = utils.drop_columns(data, ['url', 'id', 'price', 'DOM'])
    print(
        "Raw data set information after fixing formats and dropping columns")
    utils.data_info(data)
    utils.missing_info(data, "raw_missing_2")

    # the raw data contains more than 300,000 points; use a 10% sample for this project
    x_raw, y_raw, data, y = utils.data_splitting(data, data['totalPrice'], 0.1)
    data.to_csv('small.csv')
    print("smaller data set", np.shape(data), np.shape(y))
    print("y_info")
    print(y.describe())
    plt.hist(y)
    plt.xlabel("totalPrice")
    plt.ylabel("counts")
    plt.savefig('y.png')
    plt.close()

    # split D into D'' and D_Test
    x_doubleprime, y_doubleprime, x_test, y_test = utils.data_splitting(
        data, data['totalPrice'], 0.2)
    print("D'' shape", np.shape(x_doubleprime), np.shape(y_doubleprime))
    print("D_test shape", np.shape(x_test), np.shape(y_test))

    # split D'' into D' and D_pt
    x_prime, y_prime, x_pt, y_pt = utils.data_splitting(
        x_doubleprime, x_doubleprime['totalPrice'], 0.1)
    print("D_pt shape", np.shape(x_pt), np.shape(y_pt))
    print("D_prime shape", np.shape(x_prime), np.shape(y_prime))

    # Use pre-training set to look at data and conduct initial test
    print("Pre-training data set preprocessing:")
    utils.pre_training(x_pt)

    # D' data set preprocessing
    print("D' preprocessing:")
    x_train, y_train, x_val, y_val, cols_keep, imputation = utils.preprocessing(
        x_prime)
    print("D' after preprocessing:")
    print("D_train shape after preprocessing", np.shape(x_train),
          np.shape(y_train))
    print("D_val shape after preprocessing", np.shape(x_val), np.shape(y_val))

    # Linear Regression
    lin_reg = model.linear_regression(x_train, y_train, x_val, y_val,
                                      cols_keep)

    # Ridge Regression
    rid_reg = model.ridge_regression(x_train, y_train, x_val, y_val, cols_keep)

    # Lasso Regression
    las_reg = model.lasso_regression(x_train, y_train, x_val, y_val, cols_keep)

    # Random Forest
    rf = model.random_forest(x_train, y_train, x_val, y_val, cols_keep)

    # model tuning
    x, y, rid_reg_cv = model.ridge_cv(x_train, y_train, x_val, y_val,
                                      cols_keep)
    las_reg_cv = model.lasso_cv(x_train, y_train, x_val, y_val, cols_keep)
    rf_cv = model.random_forest_cv(x_train, y_train, x_val, y_val, cols_keep)

    # Final_result
    x_cv = pd.concat([x, y], axis=1)
    with_missing_cv = utils.missing_info(x_cv, "cv_missing")
    with_missing_test = utils.missing_info(x_test, "test_missing")
    continuous = ["Lng", "Lat", "square", "ladderRatio", "communityAverage"]
    discrete = [
        "Cid", "tradeTime", "followers", "livingRoom", "drawingRoom",
        "kitchen", "bathRoom", "floor", "buildingType", "constructionTime",
        "renovationCondition", "buildingStructure", "elevator",
        "fiveYearsProperty", "subway", "district"
    ]
    x_cv, y_cv, x_test, y_test = utils.method_2_prime(x_cv, with_missing_cv,
                                                      x_test,
                                                      with_missing_test,
                                                      continuous, discrete)
    x_test = pd.DataFrame(x_test, index=x_test.index, columns=cols_keep)

    predict_lin_reg = lin_reg.predict(x_test)
    print("Linear Regression:")
    print("RMSE on test data = %.5f" %
          np.sqrt(mean_squared_error(y_test, predict_lin_reg)))
    predict_rid_reg = rid_reg.predict(x_test)
    predict_rid_reg_cv = rid_reg_cv.predict(x_test)
    print("Ridge Regression:")
    print("RMSE on test data = %.5f" %
          np.sqrt(mean_squared_error(y_test, predict_rid_reg)))
    print("After cross validation, RMSE on test data = %.5f" %
          np.sqrt(mean_squared_error(y_test, predict_rid_reg_cv)))
    predict_las_reg = las_reg.predict(x_test)
    predict_las_reg_cv = las_reg_cv.predict(x_test)
    print("Lasso Regression:")
    print("RMSE on test data = %.5f" %
          np.sqrt(mean_squared_error(y_test, predict_las_reg)))
    print("After cross validation, RMSE on test data = %.5f" %
          np.sqrt(mean_squared_error(y_test, predict_las_reg_cv)))
    predict_rf = rf.predict(x_test)
    predict_rf_cv = rf_cv.predict(x_test)
    print("Random Forest:")
    print("RMSE on test data = %.5f" %
          np.sqrt(mean_squared_error(y_test, predict_rf)))
    print("After cross validation, RMSE on test data = %.5f" %
          np.sqrt(mean_squared_error(y_test, predict_rf_cv)))

    # plot
    plt.scatter(x_test['square'],
                y_test,
                c='blue',
                marker='o',
                label='real test')
    plt.scatter(x_test['square'],
                predict_rf_cv,
                c='red',
                marker='x',
                label='predict test')
    plt.xlabel('square')
    plt.legend(loc='upper right')
    plt.savefig("feature_square.png")
    plt.close()

    plt.scatter(x_test['livingRoom'],
                y_test,
                c='blue',
                marker='o',
                label='real test')
    plt.scatter(x_test['livingRoom'],
                predict_rf_cv,
                c='red',
                marker='x',
                label='predict test')
    plt.xlabel('livingRoom')
    plt.legend(loc='upper right')
    plt.savefig("feature_living.png")
    plt.close()
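utils.data_splitting is not shown in any excerpt; every call site unpacks it as (major X, major y, minor X, minor y) for a given fraction. A hypothetical reconstruction on top of sklearn, purely to make the splitting contract explicit:

from sklearn.model_selection import train_test_split

def data_splitting(X, y, frac):
    # Split off a `frac`-sized minor portion; return the major part first
    # to match the unpacking at the call sites above.
    X_major, X_minor, y_major, y_minor = train_test_split(
        X, y, test_size=frac, random_state=0)
    return X_major, y_major, X_minor, y_minor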