コード例 #1
0
def average_KFold(X, y, fold, start_deg, finish_deg, error_type):
    """Average train/test MSE of polynomial OLS regression over K folds.

    For every polynomial degree in [start_deg, finish_deg] (inclusive),
    fit ordinary least squares on each of `fold` cross-validation splits
    and average the MSE over the folds.

    Parameters
    ----------
    X, y : array-likes indexable by integer index arrays (e.g. numpy arrays).
    fold : int, number of CV splits.
    start_deg, finish_deg : int, inclusive range of degrees to evaluate.
    error_type : unused; kept for backward compatibility with callers.

    Returns
    -------
    (error_test, error_train) : two lists of per-degree averaged MSE.
    """
    kf = KFold(n_splits=fold)
    error_test, error_train = [], []
    for deg in range(start_deg, finish_deg + 1):
        err_test, err_train = 0.0, 0.0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            P = poly(deg)
            P_train = P.fit_transform(X_train)
            P_test = P.fit_transform(X_test)
            # Solve the normal equations (P^T P) W = P^T y directly.
            # np.linalg.solve is cheaper and numerically safer than
            # explicitly inverting P^T P and multiplying.
            W = np.linalg.solve(P_train.T.dot(P_train), P_train.T.dot(y_train))
            err_test += MSE(y_test, P_test.dot(W))
            err_train += MSE(y_train, P_train.dot(W))
        error_test.append(err_test / fold)
        error_train.append(err_train / fold)
    return error_test, error_train
コード例 #2
0
def regr_with_reg(X_train, y_train, to_predict, deg, a):
    """Polynomial ridge regression.

    Expands both X_train and to_predict to degree-`deg` polynomial
    features, fits a Ridge model with alpha `a` on the training data,
    and returns the predictions for `to_predict`.
    """
    ridge_model = Ridge(alpha=a)
    features = poly(degree=deg)
    train_features = features.fit_transform(X_train)
    predict_features = features.fit_transform(to_predict)
    ridge_model.fit(train_features, y_train)
    return ridge_model.predict(predict_features)
コード例 #3
0
def regr_without_reg(X_train, y_train, X_test, deg):
    """Unregularized polynomial regression.

    Fits plain linear regression on degree-`deg` polynomial features of
    X_train and returns the predictions for X_test.
    """
    features = poly(degree=deg)
    lr = LinearRegression()
    lr.fit(features.fit_transform(X_train), y_train)
    return lr.predict(features.fit_transform(X_test))
コード例 #4
0
File: titanic.py  Project: takushi-m/kaggle-titanic
def makeInput(data):
    """Build the model feature matrix from a raw Titanic DataFrame.

    Creates missing-value indicators, imputes Age/Fare/Cabin, derives
    binned and scaled numeric features plus one-hot style indicators,
    then expands everything to degree-2 polynomial features.

    NOTE: mutates `data` in place (fillna on Age/Fare/Cabin).
    Returns the numpy array produced by PolynomialFeatures.fit_transform.
    """
    x = pd.DataFrame()
    # Missing-value indicators, captured BEFORE imputation below.
    x["agena"] = data.Age.map(lambda x:1 if math.isnan(x) else 0)
    x["farena"] = data.Fare.map(lambda x:1 if math.isnan(x) else 0)

    # Impute with (approximate) training-set means; Cabin gets a sentinel.
    data.Age = data.Age.fillna(29.7)
    data.Fare = data.Fare.fillna(32.2)
    data.Cabin = data.Cabin.fillna("NA")

    x["cabinna"] = data.Cabin.map(lambda x:1 if x=="NA" else 0)
    x["sibsp"] = data.SibSp
    x["parch"] = data.Parch
    # Column names intentionally keep the original "familiy" spelling --
    # downstream code may reference them by name.
    x["smallfamiliy"] = (data.SibSp+data.Parch).map(lambda x:1 if x<3 else 0)
    x["bigfamiliy"] = (data.SibSp+data.Parch).map(lambda x:1 if x>=3 else 0)
    # Age standardized around the imputation mean.
    x["age"] = data.Age.map(lambda x: (x-29.7)/13.0)
    # Age bins: each age falls into exactly one bin.
    x["age-10"] = data.Age.map(lambda x:1 if x<=10 else 0)
    x["age10-15"] = data.Age.map(lambda x:1 if x>10 and x<=15 else 0)
    x["age15-20"] = data.Age.map(lambda x:1 if x>15 and x<=20 else 0)
    x["age20-25"] = data.Age.map(lambda x:1 if x>20 and x<=25 else 0)
    # BUG FIX: was `x<31`, which overlapped with "age30-" (x>30) for
    # fractional ages in (30, 31), double-counting such passengers.
    x["age25-30"] = data.Age.map(lambda x:1 if x>25 and x<=30 else 0)
    x["age30-"] = data.Age.map(lambda x:1 if x>30 else 0)
    x["class1"] = data.Pclass.map(lambda x:1 if x==1 else 0)
    x["class2"] = data.Pclass.map(lambda x:1 if x==2 else 0)
    x["class3"] = data.Pclass.map(lambda x:1 if x==3 else 0)
    x["male"] = data.Sex.map(lambda x:1 if x=="male" else 0)
    x["female"] = data.Sex.map(lambda x:1 if x=="female" else 0)
    x["fare"] = data.Fare.map(lambda x: (x-32.2)/49.7)
    x["fare-"] = data.Fare.map(lambda x:1 if x<20 else 0)
    x["fare+"] = data.Fare.map(lambda x:1 if x>=20 else 0)

    # Title indicators from the name. NOTE(review): "mrs" contains "mr",
    # so "mr" is also 1 whenever "mrs" is 1 -- behavior preserved as-is;
    # confirm whether this overlap is intended.
    x["mrs"] = data.Name.map(lambda x:1 if x.lower().find("mrs")>=0 else 0)
    x["mr"] = data.Name.map(lambda x:1 if x.lower().find("mr")>=0 else 0)
    x["miss"] = data.Name.map(lambda x:1 if x.lower().find("miss")>=0 else 0)
    x["master"] = data.Name.map(lambda x:1 if x.lower().find("master")>=0 else 0)

    x["embark_C"] = data.Embarked.map(lambda x:1 if x=="C" else 0)
    x["embark_Q"] = data.Embarked.map(lambda x:1 if x=="Q" else 0)
    x["embark_S"] = data.Embarked.map(lambda x:1 if x=="S" else 0)

    # Degree-2 polynomial expansion (includes squared terms).
    p = poly(2, interaction_only=False)
    return p.fit_transform(x)
コード例 #5
0
# Dummy-encode every column except the last two (presumably the target
# columns -- TODO confirm against how stu_target is built).
stu_data.iloc[:,:-2] = f_dummies(stu_data.iloc[:,:-2])

# 2) scaling
# NOTE(review): the scaler is fitted on the FULL dataset before the
# train/test split below -- this leaks test statistics into scaling;
# confirm whether that is intended.
m_sc = standard()
m_sc.fit(stu_data)
stu_x_sc = m_sc.transform(stu_data)

# 4. split into train/test sets
train_x, test_x, train_y, test_y = train_test_split(stu_x_sc,
                                                    stu_target,
                                                    random_state=0)

# 1. extract data with interaction terms applied
from sklearn.preprocessing import PolynomialFeatures as poly

m_poly = poly(degree=2)
m_poly.fit(train_x)         # learn the degree-2 feature expansion from train only
train_x_poly = m_poly.transform(train_x)   # expand train set
test_x_poly  = m_poly.transform(test_x) 

# Inspect the expanded feature names (deprecated in newer scikit-learn:
# use get_feature_names_out there).
m_poly.get_feature_names()                       # generic names: 1, x0, x1, x0 x1, ...
col_poly = m_poly.get_feature_names(stu_data.columns)  # interaction names using real column names

# Readable view of the expanded training matrix with labeled columns.
DataFrame(m_poly.transform(train_x) , 
          columns = m_poly.get_feature_names(stu_data.columns))

# 2. fit a random forest on the expanded data to inspect feature importance
m_rf = rf(random_state=0)
m_rf.fit(train_x_poly, train_y)
m_rf.score(test_x_poly, test_y) # 0.71717
コード例 #6
0
# --------------------------------------------------------------------------- #

# [ Analysis consideration 3: interaction terms ]
# - useful when variables are meaningful in combination with one another
# - squared, cubed, ... terms can also be added
# - extract meaningful variables from all possible polynomial interactions

# 3.1 extract data with interaction terms applied
from sklearn.preprocessing import PolynomialFeatures as poly

# Illustration (was uncommented free text -- a SyntaxError; also fixed the
# duplicated "x1x2" header, the third product column is x1x3):
#   original       => degree-2 transform
#   x1 x2 x3          x1^2 x2^2 x3^2 x1x2 x1x3 x2x3
#   1  2  3           1    4    9    2    3    6
#   2  4  5           4    16   25   8    10   20

m_poly = poly(degree = 2)    # build degree-2 polynomial features
m_poly.fit(train_x)          # learn the feature expansion from the training set
# ** the test set must NOT be re-fitted -- reuse the expansion learned on train
train_x_poly = m_poly.transform(train_x)    # works best on a scaled dataset
test_x_poly = m_poly.transform(test_x)

m_poly.get_feature_names()   # names of the expanded degree-2 features

DataFrame(m_poly.transform(train_x),
          columns = m_poly.get_feature_names())  # readable, but unwieldy with many variables

col_poly = m_poly.get_feature_names(df_iris.feature_names)  # interactions with the real column names

DataFrame(m_poly.transform(train_x),
          columns = m_poly.get_feature_names(df_iris.feature_names)) # much more readable
from sklearn.preprocessing import MinMaxScaler as minmax
m_sc2 = minmax()
m_sc2.fit(train_x)
train_x_sc2 = m_sc2.transform(train_x)

# BUG FIX: the original re-fitted the scaler on test_x (m_sc2.fit(test_x)),
# which scales train and test inconsistently and leaks test statistics.
# Transform the test set with the scaler fitted on the training set.
test_x_sc2 = m_sc2.transform(test_x)

# step 2) fit & evaluate
m_knn.fit(train_x_sc2, train_y)
m_knn.score(test_x_sc2, test_y)  # 0.959 => improvement over the unscaled run with min-max scaling

# 3) learn on the full interaction set (min-max scaled data)
# step 1) build the model
from sklearn.preprocessing import PolynomialFeatures as poly
m_poly = poly(degree=2)  # up to degree 2
m_poly.fit(train_x_sc2)

train_x_poly = m_poly.transform(train_x_sc2)
test_x_poly = m_poly.transform(test_x_sc2)

col_poly = m_poly.get_feature_names(df_cancer.feature_names)

# step 2) fit & evaluate
m_knn.fit(train_x_poly, train_y)
m_knn.score(test_x_poly, test_y)  # 0.964 => better than before thanks to degree-2 interactions **

# 4) learn on selected interactions
# fit a random forest on the expanded dataset, then check feature importance
m_rf = rf_c(random_state=0)
m_rf.fit(train_x_poly, train_y)