def pre_load_fixed_data():
    """
    Read the bundled tweets file using a fixed list of keywords.

    The keyword list and the file path are hard-coded; both are handed to
    DataReader.read and whatever that call produces is returned unchanged.
    """
    fixed_keywords = [
        "music",
        "food",
        "sport",
        "show",
        "movie",
        "car",
        "commercial",
        "party",
        "war",
        "hello",
    ]
    return DataReader().read("static/data/tweets.txt", fixed_keywords)
def test_read_function(url):
    """
    Check DataReader.read against the expected contents for the given url.

    Builds a DataReader from url, reads it, and verifies that the result is
    non-empty, has 252 rows and 15 columns, that the first row's
    'Age (years)' is 23, and that the fourth column is 'Weight (lbs)'.
    Returns the (data, columns) pair that was read so callers can reuse it.
    """
    reader = DataReader(url)
    frame, headers = reader.read()

    # Sanity checks first: something must have been read at all.
    assert_equals(True, len(frame) > 0)
    assert_equals(True, len(headers) > 0)

    # Exact dimensions expected for this dataset.
    assert_equals(252, len(frame))
    assert_equals(15, len(headers))

    # Spot-check one cell and one column label.
    assert_equals(23, frame.loc[0, 'Age (years)'])
    assert_equals('Weight (lbs)', headers[3])

    return frame, headers
def main():
    """
    Run the body-fat analysis end to end and print model errors.

    Steps, in order:
      1. Set the plot font scale and read the dataset from the CMU URL.
      2. Draw the correlation chart and the follow-up graphs.
      3. Using every column from 'Age (years)' through
         'Wrist circumference (cm)', fit a DecisionTreeRegressor and the
         model returned by linear_regression_fit on a 60/40 train/test
         split (random_state=1), then print train and test MSE for each.
      4. Repeat step 3 using only the columns chosen by high_correlation.
    """

    def report_mse(label, fitted, features, target):
        # Print the label followed by the MSE of fitted's predictions.
        print(label, mean_squared_error(target, fitted.predict(features)))

    sns.set(font_scale=0.7)

    # Load the dataset
    source_url = 'http://lib.stat.cmu.edu/datasets/bodyfat'
    reader = DataReader(source_url)
    data, columns = reader.read()

    # Charts
    all_correlation = correlation_chart(data, columns)
    graphs(data, all_correlation)

    features = data.loc[:, 'Age (years)':'Wrist circumference (cm)']
    target = data["Percent body fat from Siri's (1956) equation"].to_numpy()

    # Models trained on the full feature range
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.4, random_state=1)
    # The tree is fitted before the linear model, as in the original flow.
    tree_model = DecisionTreeRegressor()
    tree_model.fit(x_train, y_train)
    linear_model = linear_regression_fit(x_train, y_train)

    report_mse('MSE for linear train:', linear_model, x_train, y_train)
    report_mse('MSE for linear test:', linear_model, x_test, y_test)
    report_mse('MSE for decisiontree train:', tree_model, x_train, y_train)
    report_mse('MSE for decisiontree test:', tree_model, x_test, y_test)

    # Models trained only on the columns picked by high_correlation
    high_features = data[high_correlation(all_correlation)].copy()
    x_high_train, x_high_test, y_high_train, y_high_test = train_test_split(
        high_features, target, test_size=0.4, random_state=1)
    high_tree_model = DecisionTreeRegressor()
    high_tree_model.fit(x_high_train, y_high_train)
    high_linear_model = linear_regression_fit(x_high_train, y_high_train)

    report_mse('MSE for high correlation train:',
               high_linear_model, x_high_train, y_high_train)
    report_mse('MSE for high correlation test:',
               high_linear_model, x_high_test, y_high_test)
    report_mse('MSE for high correlation decisiontree train:',
               high_tree_model, x_high_train, y_high_train)
    report_mse('MSE for high correlation decisiontree test:',
               high_tree_model, x_high_test, y_high_test)