def average_KFold(X, y, fold, start_deg, finish_deg, error_type):
    """Estimate train/test MSE of polynomial least-squares fits via K-fold CV.

    For each polynomial degree in [start_deg, finish_deg] (inclusive), an
    ordinary least-squares model is fitted on each of ``fold`` splits and the
    MSE is averaged over the folds.

    Parameters
    ----------
    X, y : array-likes indexable by integer index arrays (e.g. numpy arrays).
    fold : int, number of cross-validation splits.
    start_deg, finish_deg : int, inclusive range of polynomial degrees to try.
    error_type : unused; kept for backward compatibility with existing callers.

    Returns
    -------
    (error_test, error_train) : two lists of fold-averaged MSE, one entry per
    degree, in ascending degree order.
    """
    kf = KFold(n_splits=fold)
    error_test, error_train = [], []
    for degree in range(start_deg, finish_deg + 1):
        test_err_sum, train_err_sum = 0.0, 0.0
        # The feature expander is degree-dependent only, so build it once
        # per degree instead of once per fold.
        P = poly(degree)
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            P_train = P.fit_transform(X_train)
            # Solve the normal equations (P^T P) W = P^T y directly; this is
            # numerically more stable than forming the explicit inverse.
            W = np.linalg.solve(P_train.T.dot(P_train), P_train.T.dot(y_train))
            # The expander was fitted on the training fold; only transform
            # the held-out fold.
            P_test = P.transform(X_test)
            test_err_sum += MSE(y_test, P_test.dot(W))
            train_err_sum += MSE(y_train, P_train.dot(W))
        error_test.append(test_err_sum / fold)
        error_train.append(train_err_sum / fold)
    return error_test, error_train
def regr_with_reg(X_train, y_train, to_predict, deg, a):
    """Polynomial ridge regression: fit on the training set, predict new inputs.

    Parameters
    ----------
    X_train, y_train : training inputs and targets.
    to_predict : inputs to produce predictions for.
    deg : polynomial degree of the feature expansion.
    a : ridge regularization strength (``alpha``).

    Returns
    -------
    Predicted target values for ``to_predict``.
    """
    p = poly(degree=deg)
    X_train_poly = p.fit_transform(X_train)
    # Fit the expander on training data only; the prediction inputs are
    # transformed with the already-fitted expander (was fit_transform).
    to_predict_poly = p.transform(to_predict)
    rr = Ridge(alpha=a)
    rr.fit(X_train_poly, y_train)
    return rr.predict(to_predict_poly)
def regr_without_reg(X_train, y_train, X_test, deg):
    """Polynomial linear regression (no regularization): fit on train, predict test.

    Parameters
    ----------
    X_train, y_train : training inputs and targets.
    X_test : inputs to produce predictions for.
    deg : polynomial degree of the feature expansion.

    Returns
    -------
    Predicted target values for ``X_test``.
    """
    p = poly(degree=deg)
    X_train_poly = p.fit_transform(X_train)
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    # Transform the test inputs with the train-fitted expander (was
    # fit_transform, which needlessly re-fits on test data).
    X_test_poly = p.transform(X_test)
    return model.predict(X_test_poly)
def makeInput(data):
    """Build the Titanic design matrix: engineered flags plus a degree-2 expansion.

    Parameters
    ----------
    data : pandas.DataFrame with the Titanic columns (Age, Fare, Cabin, SibSp,
        Parch, Pclass, Sex, Name, Embarked).

    Returns
    -------
    Array from ``PolynomialFeatures(2).fit_transform`` over the engineered
    features, in the same column order as the original implementation.

    Notes
    -----
    Unlike the original version, the caller's DataFrame is no longer
    modified in place (the fillna calls used to mutate it).
    The hard-coded constants 29.7 / 13.0 and 32.2 / 49.7 are presumably the
    training-set mean/std of Age and Fare — TODO confirm against the data.
    """
    import re

    data = data.copy()  # do not mutate the caller's frame
    x = pd.DataFrame()
    # Missing-value indicator flags: must be computed BEFORE the fillna below.
    # pd.isna also handles None, which math.isnan would choke on.
    x["agena"] = data.Age.map(lambda v: 1 if pd.isna(v) else 0)
    x["farena"] = data.Fare.map(lambda v: 1 if pd.isna(v) else 0)
    data.Age = data.Age.fillna(29.7)
    data.Fare = data.Fare.fillna(32.2)
    data.Cabin = data.Cabin.fillna("NA")
    x["cabinna"] = data.Cabin.map(lambda v: 1 if v == "NA" else 0)
    x["sibsp"] = data.SibSp
    x["parch"] = data.Parch
    family_size = data.SibSp + data.Parch
    x["smallfamiliy"] = family_size.map(lambda v: 1 if v < 3 else 0)
    x["bigfamiliy"] = family_size.map(lambda v: 1 if v >= 3 else 0)
    # Standardized age: (age - mean) / std.
    x["age"] = data.Age.map(lambda v: (v - 29.7) / 13.0)
    x["age-10"] = data.Age.map(lambda v: 1 if v <= 10 else 0)
    x["age10-15"] = data.Age.map(lambda v: 1 if 10 < v <= 15 else 0)
    x["age15-20"] = data.Age.map(lambda v: 1 if 15 < v <= 20 else 0)
    x["age20-25"] = data.Age.map(lambda v: 1 if 20 < v <= 25 else 0)
    # Upper bound fixed from `v < 31`: ages in (30, 31) used to fall into
    # BOTH this bucket and "age30-".
    x["age25-30"] = data.Age.map(lambda v: 1 if 25 < v <= 30 else 0)
    x["age30-"] = data.Age.map(lambda v: 1 if v > 30 else 0)
    x["class1"] = data.Pclass.map(lambda v: 1 if v == 1 else 0)
    x["class2"] = data.Pclass.map(lambda v: 1 if v == 2 else 0)
    x["class3"] = data.Pclass.map(lambda v: 1 if v == 3 else 0)
    x["male"] = data.Sex.map(lambda v: 1 if v == "male" else 0)
    x["female"] = data.Sex.map(lambda v: 1 if v == "female" else 0)
    # Standardized fare: (fare - mean) / std.
    x["fare"] = data.Fare.map(lambda v: (v - 32.2) / 49.7)
    x["fare-"] = data.Fare.map(lambda v: 1 if v < 20 else 0)
    x["fare+"] = data.Fare.map(lambda v: 1 if v >= 20 else 0)
    # Title flags from the passenger name.
    x["mrs"] = data.Name.map(lambda v: 1 if v.lower().find("mrs") >= 0 else 0)
    # Word-boundary match so "Mrs." no longer also sets the "mr" flag
    # (the original substring test matched the "mr" inside "mrs").
    x["mr"] = data.Name.map(lambda v: 1 if re.search(r"\bmr\b", v.lower()) else 0)
    x["miss"] = data.Name.map(lambda v: 1 if v.lower().find("miss") >= 0 else 0)
    x["master"] = data.Name.map(lambda v: 1 if v.lower().find("master") >= 0 else 0)
    x["embark_C"] = data.Embarked.map(lambda v: 1 if v == "C" else 0)
    x["embark_Q"] = data.Embarked.map(lambda v: 1 if v == "Q" else 0)
    x["embark_S"] = data.Embarked.map(lambda v: 1 if v == "S" else 0)
    p = poly(2, interaction_only=False)
    return p.fit_transform(x)
stu_data.iloc[:,:-2] = f_dummies(stu_data.iloc[:,:-2]) # 2) scaling m_sc = standard() m_sc.fit(stu_data) stu_x_sc = m_sc.transform(stu_data) # 4. data 분리 train_x, test_x, train_y, test_y = train_test_split(stu_x_sc, stu_target, random_state=0) # 1. interaction 적용 data 추출 from sklearn.preprocessing import PolynomialFeatures as poly m_poly = poly(degree=2) m_poly.fit(train_x) # 각 설명변수에 대한 2차항 모델 생성 train_x_poly = m_poly.transform(train_x) # 각 설명변수에 대한 2차항 모델 생성 test_x_poly = m_poly.transform(test_x) m_poly.get_feature_names() # 변경된 설명변수들의 형태 확인 col_poly = m_poly.get_feature_names(stu_data.columns) # 실제 컬럼이름의 교호작용 출력 DataFrame(m_poly.transform(train_x) , columns = m_poly.get_feature_names(stu_data.columns)) # 2. 확장된 데이터셋을 RF에 학습, feature importance 확인 m_rf = rf(random_state=0) m_rf.fit(train_x_poly, train_y) m_rf.score(test_x_poly, test_y) # 0.71717
# --------------------------------------------------------------------------- #
# [ Analysis consideration 3: interaction terms ]
# - useful when variables carry meaning in combination with each other
# - squared, cubic, ... terms can also be added
# - from all possible higher-order interactions, extract the meaningful ones

# 3.1 extract data with interaction terms applied
from sklearn.preprocessing import PolynomialFeatures as poly

# Illustration — original => degree-2 expansion (what transform produces).
# (These table lines were previously uncommented, which is a syntax error;
# the duplicated "x1x2" column header was also corrected to "x1x3".)
#   x1 x2 x3 | x1^2 x2^2 x3^2 | x1x2 x1x3 x2x3
#   1  2  3  | 1    4    9    | 2    3    6
#   2  4  5  | 4    16   25   | 8    10   20

m_poly = poly(degree = 2)  # build terms up to degree 2
m_poly.fit(train_x)        # fit the degree-2 expansion on each explanatory variable
# ** the test set needs no separate fitting: the expansion is defined by the
#    training fit and is only applied (transform) to test data.
train_x_poly = m_poly.transform(train_x)  # better done on a scaled dataset
test_x_poly = m_poly.transform(test_x)

m_poly.get_feature_names()  # shape of the expanded features (degree-2 form)
DataFrame(m_poly.transform(train_x), columns = m_poly.get_feature_names())
# readable, but still hard to scan when there are very many variables

col_poly = m_poly.get_feature_names(df_iris.feature_names)  # names with real columns reflected
DataFrame(m_poly.transform(train_x), columns = m_poly.get_feature_names(df_iris.feature_names))
# much easier to read
# KNN with min-max scaling, then degree-2 interaction features.
from sklearn.preprocessing import MinMaxScaler as minmax

m_sc2 = minmax()
m_sc2.fit(train_x)  # fit the scaler on training data ONLY
train_x_sc2 = m_sc2.transform(train_x)
# BUG FIX: the scaler used to be re-fitted on test_x here, which scaled the
# test set by its own min/max (information leakage and units inconsistent
# with training). The test set must be transformed with the train-fitted
# scaler.
test_x_sc2 = m_sc2.transform(test_x)

# step 2) train & evaluate
m_knn.fit(train_x_sc2, train_y)
m_knn.score(test_x_sc2, test_y)
# 0.959 recorded with the original (leaky) scaling => min-max improved on
# the unscaled baseline; re-check the score after this fix.

# 3) full interaction set (on the min-max scaled data)
# step 1) build the model
from sklearn.preprocessing import PolynomialFeatures as poly
m_poly = poly(degree=2)  # terms up to degree 2
m_poly.fit(train_x_sc2)
train_x_poly = m_poly.transform(train_x_sc2)
test_x_poly = m_poly.transform(test_x_sc2)
col_poly = m_poly.get_feature_names(df_cancer.feature_names)

# step 2) train & evaluate
m_knn.fit(train_x_poly, train_y)
m_knn.score(test_x_poly, test_y)
# 0.964 => degree-2 interaction features improved on the previous score

# 4) train on selected interactions
# fit the expanded dataset with RF, then inspect feature importances
m_rf = rf_c(random_state=0)
m_rf.fit(train_x_poly, train_y)