def normalize(features, col_start=4, col_stop=45):
    """Min-max scale a positional slice of feature columns, then reattach
    the calendar columns on their original scale.

    Parameters
    ----------
    features : pd.DataFrame
        Input frame. Must contain 'month', 'weekofyear' and 'odd_year'
        columns, which are copied through unscaled.
    col_start, col_stop : int, optional
        Positional bounds (Python slice semantics) of the columns to
        scale. Defaults preserve the original hard-coded 4:45 window.

    Returns
    -------
    pd.DataFrame
        The scaled columns (with 'total_cases' dropped if it fell inside
        the slice) plus the three unscaled calendar columns, sharing
        `features`' index.
    """
    cols = features.columns[col_start:col_stop]
    scaled = MinMaxScaler().fit_transform(features[cols])
    features_n = pd.DataFrame(scaled, columns=cols, index=features.index)
    # The target must not leak into the normalized feature set.
    if 'total_cases' in features_n.columns:
        features_n = features_n.drop(columns=['total_cases'])
    # Calendar features are carried over untouched (not min-max scaled).
    for col in ('month', 'weekofyear', 'odd_year'):
        features_n[col] = features[col]
    return features_n
# Replace missing values in attribute 'Age' with the column mean.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22 -- migrate to sklearn.impute.SimpleImputer when upgrading.
imp = Imputer(strategy='mean')
data['Age'] = imp.fit_transform(data[['Age']])

# Remember the column labels: MinMaxScaler returns a bare ndarray.
index = data.columns

# Check whether missing values still exist.
# print(data.isnull().sum())

# Normalization: scale every column to [0, 1].
data = MinMaxScaler().fit_transform(data)
data = pd.DataFrame(data)
data.columns = index

# Split attributes and target class.
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# Find outliers: IsolationForest labels inliers as 1 and outliers as -1.
FS = IsolationForest()
FS.fit(X)
# Alternative detector kept for reference:
# FS = EllipticEnvelope()
# FS.fit(X)
outliers = FS.predict(X)

# Collect row positions flagged as outliers.
# BUG FIX: the original for-loop reused the name 'index' as its loop
# variable, clobbering the saved column labels above. A comprehension
# with its own variable keeps 'index' intact for any later use.
drop = [i for i, flag in enumerate(outliers.tolist()) if flag == -1]
# NOTE(review): this statement looks like the tail of a loop from the
# preceding (unseen) chunk that builds the Ticket prefix list -- confirm
# the surrounding context before applying.
Ticket.append("X")
df["Ticket"] = Ticket
df = pd.get_dummies(df, columns=["Ticket"], prefix="T")

# Cabin: bucket by the first character of the cabin code ('X' for
# missing), then one-hot encode.
# BUG FIX: the original built the Series without an index, so it carried
# a default RangeIndex; if df has a non-default index (typical after
# concatenating train and test) the assignment would misalign and yield
# NaNs. Align explicitly on df.index.
df["Cabin"] = pd.Series(
    [c[0] if not pd.isnull(c) else 'X' for c in df['Cabin']],
    index=df.index)
df = pd.get_dummies(df, columns=["Cabin"], prefix="Cabin")

# One-hot encode Embarked and Pclass.
df = pd.get_dummies(df, columns=["Embarked"], prefix="Em")
df["Pclass"] = df["Pclass"].astype("category")
df = pd.get_dummies(df, columns=["Pclass"], prefix="Pc")

# Drop the Name column.
df.drop(labels=["Name"], axis=1, inplace=True)

# Check for missing values and preview the current table.
na_check(df)
df.head()

# Min-max scale all features (returns an ndarray; column labels are lost).
df = MinMaxScaler().fit_transform(df)

# Split the transformed matrix df back into train_X / test_X.
train_num = train_Y.shape[0]
train_X = df[:train_num]
test_X = df[train_num:]

# Three models -- logistic regression / gradient boosting machine /
# random forest -- with hyperparameters tuned via random search.
from sklearn.linear_model import LogisticRegression