def main():
    """Train an XGBoost regressor on fingerprints and export its predictions.

    Pipeline, as visible in this function:

    1. Load records from ``solar.db`` (``connect_db`` / ``get_data``), build
       molecules and Morgan fingerprints, then call ``refine_compounds`` with
       the molecules that failed fingerprinting.
    2. Split compound ids and target values 80/20 with a fixed seed.
    3. Fit the model through ``modelfit`` and collect cross-validated,
       in-sample and held-out predictions.
    4. Write every prediction to ``dip_xgb_train_test.csv`` and print the
       test-set RMSE.
    """
    qresult = connect_db('solar.db', 'dip')
    smiles, compounds, gaps = get_data(qresult)
    mols = get_mols(smiles)
    fps_morgan, failed_mols = get_fingerprints(mols)
    # presumably drops the failed molecules from the lists in place, since the
    # return value is not used -- TODO confirm against refine_compounds
    refine_compounds(compounds, mols, gaps, failed_mols)

    compound_array = np.array(compounds)
    gaps_array = np.array(gaps)
    # The split is done on compound ids; fingerprints are looked up afterwards.
    train_id, test_id, y_train, y_test = train_test_split(
        compound_array, gaps_array, test_size=0.20, random_state=0)
    train_fps = get_fp_from_id(compounds, fps_morgan, train_id)
    test_fps = get_fp_from_id(compounds, fps_morgan, test_id)

    # NOTE(review): early_stopping_rounds is set on the estimator, but cvp()
    # below is called without an eval_set -- confirm this works with the
    # installed xgboost version.
    xgb1 = XGBRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=7,
        colsample_bytree=0.6,
        nthread=8,
        scale_pos_weight=1,
        gamma=0,
        random_state=0,
        subsample=0.6,
        min_child_weight=3,
        early_stopping_rounds=10,
        reg_alpha=1,
    )
    modelfit(xgb1, train_fps, y_train)
    #xgb1 = joblib.load('gbdt_dip_xgb.joblib')
    #joblib.dump(xgb1, 'gbdt_dip_xgb2.joblib')

    y_pred_cv = cvp(xgb1, train_fps, y_train, cv=4, n_jobs=8)
    y_train_pred = xgb1.predict(train_fps)
    y_pred_test = xgb1.predict(test_fps)

    train_df = pd.DataFrame({
        'id': pd.Series(train_id),
        'dip_exp': pd.Series(y_train),
        'dip_cv': pd.Series(y_pred_cv),
        'dip_gbdt': pd.Series(y_train_pred),
    })
    train_df['Group'] = 'Train'

    # There is no cross-validated prediction for held-out rows, so the test
    # prediction is stored in both prediction columns.
    test_df = pd.DataFrame({
        'id': pd.Series(test_id),
        'dip_exp': pd.Series(y_test),
        'dip_cv': pd.Series(y_pred_test),
        'dip_gbdt': pd.Series(y_pred_test),
    })
    test_df['Group'] = 'Test'

    result_df = pd.concat([train_df, test_df])
    result_df.to_csv('dip_xgb_train_test.csv')

    test_err = mean_squared_error(y_test, y_pred_test)
    # '.4f' prints four decimals; the previous '4f' only set a minimum field
    # width and therefore printed the default six decimals.
    print('Test error: {:.4f}'.format(np.sqrt(test_err)))
def scan_fit(self, X, y):
    """Fit every scanning estimator and return their stacked class vectors.

    The input is first expanded by ``self._sample_slicer``, which returns the
    sliced samples, their labels and the number of slices produced per
    original sample. Each estimator in ``self.estimators`` is fitted on the
    sliced data, and its per-slice class probabilities are folded back into
    one row per original sample.

    Parameters
    ----------
    X : array-like
        Original samples; only ``len(X)`` is used directly here.
    y : array-like
        Labels for ``X``; the number of distinct values is stored in
        ``self.n_classes``.

    Returns
    -------
    ndarray
        Horizontal stack of one ``(len(X), scan_round_total * n_classes)``
        block per estimator.
    """
    self.n_classes = len(np.unique(y))
    newX, newy, scan_round_total = self._sample_slicer(X, y)
    row_width = scan_round_total * self.n_classes

    sample_vector_list = []
    for estimator in self.estimators:
        estimator.fit(newX, newy)
        if self.k_fold > 1:
            # Cross-validated class probabilities. Without
            # method='predict_proba' the call yields one label per slice,
            # which cannot be reshaped into n_classes columns below.
            # (assumes cvp is sklearn's cross_val_predict -- TODO confirm)
            predict_ = cvp(estimator, newX, newy, cv=self.k_fold,
                           n_jobs=-1, method='predict_proba')
        else:
            # Out-of-bag class probabilities recorded during fit.
            predict_ = estimator.oob_decision_function_
        # Replace undefined probabilities with the uniform prior.
        predict_[np.isnan(predict_)] = 1. / self.n_classes
        sample_vector_list.append(predict_.reshape((len(X), row_width)))
    return np.hstack(sample_vector_list)
def predict(self, X, y):
    """
    Lazily produce cross-validated predictions, one per internal model.

    Each model in ``self.models`` is passed to ``cvp`` together with ``X``,
    ``y`` and ``cv=12``; nothing is computed until the generator is consumed.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    Yields
    ------
    The value returned by ``cvp`` for each model, in ``self.models`` order.
    """
    yield from (cvp(estimator, X, y, cv=12) for estimator in self.models)
def predict(self, X, y):
    """
    Lazily produce cross-validated predictions, one per internal model.

    Each model in ``self.models`` is passed to ``cvp`` together with ``X``,
    ``y`` and ``cv=12``; nothing is computed until the generator is consumed.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    Yields
    ------
    The value returned by ``cvp`` for each model, in ``self.models`` order.
    """
    yield from (cvp(estimator, X, y, cv=12) for estimator in self.models)
print(set_sizes[0])

# Row counts for the leading 70 % / trailing 30 % split.
# NOTE(review): the split is sized from set_sizes[nrows], not from len(X);
# confirm the two agree, otherwise head/tail may overlap or leave a gap.
_train_rows = set_sizes[nrows] * 0.7
_test_rows = set_sizes[nrows] * 0.3
print('here', _train_rows)

X_train = X.head(int(_train_rows))
Y_train = Y.head(int(_train_rows))
X_test = X.tail(int(_test_rows))
Y_test = Y.tail(int(_test_rows))

# First model: the LinearRegression variant that accepts `minibatches`,
# trained on a float-downcast copy of the training target.
ne_lr = LinearRegression(minibatches=None)
Y2 = pd.to_numeric(Y, downcast='float')
print("here", type(Y2))
print(type(Y_train))
_y_train_float = pd.to_numeric(Y_train, downcast='float')
ne_lr.fit(X_train, _y_train_float)
print(ne_lr)

y_pred = ne_lr.predict(X_test)
res = mean_squared_error(Y_test, y_pred)
#res = scoring(y_target=Y_test, y_predicted=y_pred, metric='rmse')
print("results: ", res)

# Second model: linear_model.LinearRegression, scored by 10-fold
# cross-validated predictions over the full data set.
lin = linear_model.LinearRegression()
lin.fit(X_train, Y_train)
predictedCV = cvp(lin, X, Y, cv=10)
# NOTE(review): the label says "rmse" but the value printed is the MSE.
_cv_error = mean_squared_error(Y, predictedCV)
print("rmse cross val", _cv_error)
return data def make_matrix(l): matrix = np.full((ROW, COL), 0) for d in l: matrix[d[0]][d[1]] = d[2] return matrix if __name__ == "__main__": train = read_csv(TRAIN) gender = read_csv(GENDER) year = read_csv(YEAR) X1 = make_matrix(train) X2 = X1.T Y1 = np.asarray(gender).T[0] Y2 = np.asarray(year).T[0] clf1 = logr() scores = cvs(clf1, X1, Y1, cv=10) print("Min CV error: {}".format(1 - max(scores))) clf2 = logr(solver="saga", multi_class="multinomial") pred = cvp(clf2, X2, Y2, cv=10) mse1 = mse(Y2, pred) mse2 = mse(Y2, np.full_like(Y2, np.mean(Y2))) print("Regression MSE: {}".format(mse1)) print("Naive MSE: {}".format(mse2))
# Scikit-Image, Pillow, OpenCV and similar libraries can be used to make
# certain patterns (closed loops, for example) stand out more.

# The output of everything from here on has not been verified yet
# (hardware limits, the run takes too long).

# Multilabel classification
# Normally each instance is assigned to exactly one class; here the
# classifier should output several labels per instance.
# Note: not every classifier supports multilabel classification.
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= 7)  # True for labels greater than or equal to 7
y_train_odd = (y_train % 2 == 1)  # True for odd labels
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()  # default: n_neighbors = 5
knn_clf.fit(X_train, y_multilabel)
# print(knn_clf.predict([some_digit]))  # [[False False]]: 2 is neither >= 7 nor odd

# Compute the F1 score.
# The classifier is multilabel, so both the cross-validated predictions and
# the score must use y_multilabel rather than the original digit labels.
y_train_knn_pred = cvp(knn_clf, X_train, y_multilabel, cv=3)
print(f1_score(y_multilabel, y_train_knn_pred, average="macro"))

# Multioutput classification
# A generalisation of multilabel classification in which each label can
# itself be multiclass (more than two possible values).
# Illustrated by the following example: a system that removes noise from
# images.
# Note: the classifier's output is multilabel (one label per pixel, with
# pixel intensities ranging from 0 to 255).

# First, add noise to the clean images to create the training and test sets.
rnd = np.random.RandomState(42)
# One noise value per pixel: the size argument must be the (n_samples, 784)
# tuple; passing 784 separately would be taken as randint's dtype.
noise_train = rnd.randint(0, 100, (len(X_train), 784))
noise_test = rnd.randint(0, 100, (len(X_test), 784))
X_train_mod = X_train + noise_train
X_test_mod = X_test + noise_test
# The clean images are the targets.
y_train_mod = X_train
y_test_mod = X_test
def fit(self, X_train, y_train):
    """Grow the cascade level by level until accuracy stops improving.

    Every level holds one fresh estimator per entry of ``self.params_list``
    (instances of the class of ``self.base_estimator``). The first level is
    trained on ``X_train``; each later level is trained on the previous
    level's class-probability vectors stacked in front of ``X_train``.
    A level whose accuracy does not exceed the best one so far is discarded
    and training stops.

    Parameters
    ----------
    X_train : ndarray
        Training features.
    y_train : ndarray
        Training labels.

    Sets ``self.classes``, ``self.n_classes``, ``self.estimators_levels``
    and ``self.max_accuracy``.
    """
    self.classes = np.unique(y_train)
    self.n_classes = len(self.classes)
    self.estimators_levels = []
    klass = self.base_estimator.__class__

    def _fit_level(features):
        # Train one new level on `features`; return its stacked probability
        # vectors and the accuracy of the averaged prediction.
        estimators = [klass(**params) for params in self.params_list]
        self.estimators_levels.append(estimators)
        predictions = []
        for estimator in estimators:
            estimator.fit(features, y_train)
            if self.k_fold > 1:
                # Cross-validated class probabilities. Without
                # method='predict_proba' the call yields labels, which
                # breaks the argmax over classes below.
                # (assumes cvp is sklearn's cross_val_predict -- TODO confirm)
                predict_ = cvp(estimator, features, y_train, cv=self.k_fold,
                               n_jobs=-1, method='predict_proba')
            else:
                # Out-of-bag class probabilities recorded during fit.
                predict_ = estimator.oob_decision_function_
            # Replace undefined probabilities with the uniform prior.
            predict_[np.isnan(predict_)] = 1. / self.n_classes
            predictions.append(predict_)
        mean_proba = np.array(predictions).mean(axis=0)
        y_pre = self.classes.take(np.argmax(mean_proba, axis=1), axis=0)
        return np.hstack(predictions), self.evaluate(y_pre, y_train)

    # first level
    attr_to_next_level, self.max_accuracy = _fit_level(X_train)

    # cascade step
    while True:
        print('level {}, CV accuracy: {}'.format(
            len(self.estimators_levels), self.max_accuracy))
        X_train_step = np.hstack((attr_to_next_level, X_train))
        attr_to_next_level, accuracy = _fit_level(X_train_step)
        if accuracy > self.max_accuracy:
            self.max_accuracy = accuracy
        else:
            # The new level did not help: drop it and stop growing.
            self.estimators_levels.pop()
            break