# Prepare the training features/labels and build an AdaBoost classifier
# whose weak learner is an entropy-based decision tree.
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

try:
    # `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
    # removed in 0.23; the standalone package is the supported import.
    import joblib
except ImportError:
    # Older environments that only have the copy bundled with scikit-learn.
    from sklearn.externals import joblib

from common import load_csv
from common import process_data

# Turn off pandas' SettingWithCopyWarning.
pd.set_option('chained_assignment', None)

# NOTE(review): the meaning of the `True` flag is defined in
# common.load_csv; presumably it selects the training set -- confirm.
x, y = load_csv.load_data(True)
x_train = process_data.get_clean_data(x)

# The label column must not be part of the feature matrix.
x_train = x_train.drop(['Survived'], axis=1)

print('x_train.shape: ', x_train.shape)
print('x_train.columns => \n', x_train.columns.values)
print('y.shape: ', y.shape)

# max_depth=None leaves the tree depth unbounded.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None)

# Alternative configuration kept for reference:
# clf = AdaBoostClassifier(base_estimator=tree,
#                          n_estimators=1000,
#                          learning_rate=0.1,
#                          random_state=0)

# The weak learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and `estimator` in newer ones.
clf = AdaBoostClassifier(tree, random_state=7)
# Load ../train.csv, clean it, log-transform the target and produce the
# training split used to fit the regressors imported below.
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

try:
    # `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
    # removed in 0.23; the standalone package is the supported import.
    import joblib
except ImportError:
    # Older environments that only have the copy bundled with scikit-learn.
    from sklearn.externals import joblib

from common import process_data
from common import split_train_test_data

pd_csv = pd.read_csv('../train.csv')
df_train = process_data.get_clean_data(pd_csv)

# Apply a log transformation to SalePrice.
df_train['SalePrice'] = np.log(df_train['SalePrice'])
print('After log transformation, SalePrice skewness is ',
      df_train['SalePrice'].skew())

# Drop the samples whose Electrical value is missing (only one sample).
electrical_missing = df_train['Electrical'].isnull()
print('Sample(Id={}) is dropped due to Electrical is null'.format(
    df_train.loc[electrical_missing]['Id'].values))
# Fix: the filtered frame used to be stored in an otherwise unused `csv_df`
# while the split below still received the unfiltered `df_train`, so the row
# announced as dropped was never actually removed.
df_train = df_train.drop(df_train.loc[electrical_missing].index)
# Keep the historical name bound to the filtered frame for any later code.
csv_df = df_train

# NOTE(review): the meaning of the `True` flag is defined in
# common.split_train_test_data; presumably it selects the training part --
# confirm. Any companion script that splits the same CSV should apply the
# same row filter so the partitions stay aligned.
x_train, y_train = split_train_test_data.get_splitted_data(True, df_train)

print('x_train.shape: ', x_train.shape)
print('x_train.columns => \n', x_train.columns.values)
print('y_train.shape: ', y_train.shape)
# Load ../test.csv, report and clean its missing values, encode the
# categorical columns as integers and load the persisted regressor.
import numpy as np
import pandas as pd

try:
    # `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
    # removed in 0.23; the standalone package is the supported import.
    import joblib
except ImportError:
    # Older environments that only have the copy bundled with scikit-learn.
    from sklearn.externals import joblib

from common import process_data

df_csv = pd.read_csv('../test.csv')

print('Before processing missing value, sample count =>\n{}'.format(
    process_data.get_missing_value_sample_count(df_csv)))
print('Before processing missing value, sample proportion =>\n{}'.format(
    process_data.get_missing_value_sample_proportion(df_csv)))

x_test = process_data.get_clean_data(df_csv)

print('After processing missing value, sample count =>\n{}'.format(
    process_data.get_missing_value_sample_count(x_test)))
print('After processing missing value, sample proportion =>\n{}'.format(
    process_data.get_missing_value_sample_proportion(x_test)))
print('After clean, x_test.isnull().sum().max(): ',
      x_test.isnull().sum().max())
print('After clean, x_test.shape: ', x_test.shape)
print('After clean, x_test.columns => \n', x_test.columns.values)

# Convert categorical (object-dtype) columns to integer category codes.
# Despite the "one-hot encoding" wording in the message below, no dummy
# columns are created: each column is replaced in place by its codes, so the
# shape is unchanged.
# NOTE(review): the codes are derived from the values present in this file
# only; confirm they match the encoding the persisted model was trained on.
categorical = [var for var in x_test.columns if x_test[var].dtype == 'O']
for col in categorical:
    x_test[col] = x_test[col].astype('category').cat.codes
print('After clean, x_test(one-hot encoding).shape: ', x_test.shape)

# joblib.load unpickles the file, which can execute arbitrary code; only
# load dumps produced by a trusted training run.
clf = joblib.load('rf_regressor_dump.pkl')
import pandas as pd from common import process_data csv_df = pd.read_csv('../test.csv') csv_df = process_data.get_clean_data(csv_df) print('Before processing missing value, sample count =>\n{}'.format( process_data.get_missing_value_sample_count(csv_df))) print('Before processing missing value, sample proportion =>\n{}'.format( process_data.get_missing_value_sample_proportion(csv_df))) # print(pd_csv.loc[pd_csv['TotalBsmtSF'].isnull()]) # 將缺值的 TotalBsmtSF 以其平均數取代 csv_df['TotalBsmtSF'] = csv_df['TotalBsmtSF'].fillna( csv_df['TotalBsmtSF'].mean()) # print(pd_csv['TotalBsmtSF'].isnull().sum()) print('Sample whose KitchenQual is null, KitchenAbvGr = ', csv_df.loc[csv_df['KitchenQual'].isnull()]['KitchenAbvGr'].values) # 找出 KitchenAbvGr 值為1的樣本,並列出它們的 KitchenQual csv_df.loc[csv_df['KitchenAbvGr'] == 1]['KitchenQual'] # 將缺值的 KitchenQual 以 'TA' 取代 csv_df['KitchenQual'] = csv_df['KitchenQual'].fillna('TA') # print(pd_csv['KitchenQual'].isnull().sum()) # print('Sample whose GarageArea is null, GarageType = ', pd_csv.loc[pd_csv['GarageArea'].isnull()]['GarageType'].values) # 將缺值的 GarageArea 以 GarageType 為 'Detchd' 的樣本,其 GarageArea 的平均數取代 # pd_csv['GarageArea'] = pd_csv['GarageArea'].fillna(int(pd_csv[pd_csv['GarageType']=='Detchd']['GarageArea'].mean())) # 將缺值的 GarageArea 以其平均數取代 csv_df['GarageArea'] = csv_df['GarageArea'].fillna(
import time import numpy as np from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import GridSearchCV from sklearn.linear_model import ElasticNetCV csv_df = pd.read_csv('../train.csv') print( 'Before processing missing value, sample count =>\n{}'.format(process_data.get_missing_value_sample_count(csv_df))) print( 'Before processing missing value, sample proportion =>\n{}'.format( process_data.get_missing_value_sample_proportion(csv_df))) df_train = process_data.get_clean_data(csv_df) # 將 SalePrice 做對數變換 df_train['SalePrice'] = np.log(df_train['SalePrice']) print('After log transformation, SalePrice skewness is ', df_train['SalePrice'].skew()) # 刪除Electrical欄位缺值的樣本(僅1個樣本) print('Sample(Id={}) is dropped due to Electrical is null'.format( df_train.loc[df_train['Electrical'].isnull()]['Id'].values)) df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index) # 刪除離群的 GrLivArea 值很高的數據 ids = df_train.sort_values(by='GrLivArea', ascending=False)[:2]['Id'] df_train = df_train.drop(ids.index) print(
# Load ../train.csv, clean it, log-transform the target and produce the
# held-out split used to evaluate a persisted model.
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

try:
    # `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
    # removed in 0.23; the standalone package is the supported import.
    import joblib
except ImportError:
    # Older environments that only have the copy bundled with scikit-learn.
    from sklearn.externals import joblib

from common import process_data
from common import split_train_test_data

csv_df = pd.read_csv('../train.csv')
df_test = process_data.get_clean_data(csv_df)

# Apply a log transformation to SalePrice.
df_test['SalePrice'] = np.log(df_test['SalePrice'])
print('After log transformation, SalePrice skewness is ',
      df_test['SalePrice'].skew())

# Drop the samples whose Electrical value is missing (only one sample).
electrical_missing = df_test['Electrical'].isnull()
print('Sample(Id={}) is dropped due to Electrical is null'.format(
    df_test.loc[electrical_missing]['Id'].values))
# Fix: the filtered frame used to be stored only in `csv_df` while the split
# below still received the unfiltered `df_test`, so the row announced as
# dropped was never actually removed.
df_test = df_test.drop(df_test.loc[electrical_missing].index)
# `csv_df` keeps pointing at the filtered frame, as it did before.
csv_df = df_test

# NOTE(review): the meaning of the `False` flag is defined in
# common.split_train_test_data; presumably it selects the held-out part --
# confirm. Any companion script that splits the same CSV should apply the
# same row filter so the partitions stay aligned.
x_test, y_test = split_train_test_data.get_splitted_data(False, df_test)

print('x_test.shape: ', x_test.shape)
print('x_test.columns => \n', x_test.columns.values)
print('y_test.shape: ', y_test.shape)

# Convert categorical variables to dummy variables (one-hot encoding);
# currently disabled.
# x_test = pd.get_dummies(x_test)
# print('x_test(dummy).shape: ', x_test.shape)