# NOTE: module-level objects such as train_data, test_data, target_num_map, RS,
# coreProcess and xgboostClassifier are assumed to be defined elsewhere in this repo.
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, precision_score, recall_score


def gen_sub():
    train = train_data.copy()
    train_idx = [i for i in range(train.shape[0])]
    test = test_data.copy()
    test_idx = [i + train.shape[0] for i in range(test.shape[0])]

    y = train["interest_level"].apply(lambda x: target_num_map[x])
    del train["interest_level"]

    data = pd.concat([train, test]).reset_index()
    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)

    xgb_clf = xgboostClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=3,
        nthread=12,
        eta=0.02,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=1.0,
        colsample_bylevel=0.8,
        min_child_weight=1,
        silent=1,
        num_rounds=1700,
        seed=RS,
    )

    print("Training...")
    xgb_clf.fit(X_train, y)
    preds = xgb_clf.predict_proba(X_test)

    sub = pd.DataFrame(preds)
    sub.columns = ["high", "medium", "low"]
    sub["listing_id"] = test.listing_id.values
    sub.to_csv("submission.csv", index=False)
def validation_score(early_stop=False):
    clf = xgboostClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=3,
        nthread=3,
        eta=0.04,
        max_depth=6,
        subsample=0.7,
        colsample_bytree=1.0,
        colsample_bylevel=0.7,
        min_child_weight=1,
        silent=1,
        num_rounds=700,
        seed=RS,
    )

    print("*** Validation start ***")
    data = train_data.copy()
    y = data["interest_level"].apply(lambda x: target_num_map[x])
    del data["interest_level"]

    # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True)
    skf = StratifiedKFold(n_splits=3, shuffle=False)

    cv_scores = []
    i = 0
    for train_idx, val_idx in skf.split(data, y):
        i += 1
        X = data.copy()
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)

        clf.fit(X_train, y_train)
        # clf.fit_CV(X_train, X_val, y_train, y_val)
        y_val_pred = clf.predict_proba(X_val)
        loss = log_loss(y_val, y_val_pred)
        print("Iteration {}'s loss: {}".format(i, loss))
        cv_scores.append(loss)

        if early_stop:
            break

    print("*** Validation finished ***\n")
    return cv_scores
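# Illustrative usage (a sketch, not part of the original script): summarise the
# per-fold losses returned by validation_score() before launching a full run.
def report_validation():
    scores = validation_score()
    print("CV log loss: {:.5f} +/- {:.5f}".format(np.mean(scores), np.std(scores)))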
def validation_avg_score(clfs):
    print("*** Validation start ***")
    data = train_data.copy()
    y = data["interest_level"].apply(lambda x: target_num_map[x])
    del data["interest_level"]

    # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True)
    skf = StratifiedKFold(n_splits=3)

    cv_scores = {i: [] for i in range(len(clfs))}
    cv_scores["Avg"] = []
    i = 0
    for train_idx, val_idx in skf.split(data, y):
        i += 1
        X = data.copy()
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)

        preds = []
        for j, clf in enumerate(clfs):
            clf.fit(X_train, y_train)
            y_val_pred = clf.predict_proba(X_val)
            loss = log_loss(y_val, y_val_pred)
            cv_scores[j].append(loss)
            preds.append(y_val_pred)
            print("clf_{}, Iteration {}'s loss: {}".format(j, i, loss))

        # Average the class probabilities of all classifiers for this fold.
        avg_pred = np.mean(np.array(preds), axis=0)
        loss = log_loss(y_val, avg_pred)
        cv_scores["Avg"].append(loss)
        print("Iteration {}'s Avg loss: {}".format(i, loss))

    for i in range(len(clfs)):
        print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i])))
    print("Average validation loss : {}".format(np.mean(cv_scores["Avg"])))
    print("*** Validation finished ***\n")
    return cv_scores["Avg"]
def genAvgSub(clfs):
    train = train_data.copy()
    train_idx = [i for i in range(train.shape[0])]
    test = test_data.copy()
    test_idx = [i + train.shape[0] for i in range(test.shape[0])]

    y = train["interest_level"].apply(lambda x: target_num_map[x])
    del train["interest_level"]

    data = pd.concat([train, test]).reset_index()
    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)

    print("Training...")
    preds = []
    for i in range(len(clfs)):
        print("Clf_{} fitting".format(i))
        clfs[i].fit(X_train, y)
        print("Clf_{} predicting".format(i))
        pred = clfs[i].predict_proba(X_test)
        preds.append(pred)

    # Average the class probabilities of all classifiers.
    sub = pd.DataFrame(np.mean(preds, axis=0))
    sub.columns = ["high", "medium", "low"]
    sub["listing_id"] = test.listing_id.values
    sub.to_csv("submission.csv", index=False)
    print("Done.")
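# Illustrative sketch (not part of the original file): one way to build the
# `clfs` list consumed by validation_avg_score / genAvgSub / stacking, assuming
# xgboostClassifier accepts the same keyword arguments used above. The seeds
# and parameter values here are placeholders.
def build_default_clfs(seeds=(RS, RS + 1, RS + 2)):
    return [
        xgboostClassifier(
            objective='multi:softprob',
            eval_metric='mlogloss',
            num_class=3,
            nthread=12,
            eta=0.02,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=1.0,
            colsample_bylevel=0.8,
            min_child_weight=1,
            silent=1,
            num_rounds=1700,
            seed=s,
        )
        for s in seeds
    ]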
def stacking(clfs):
    print("Stacking")
    train = train_data.copy()
    test = test_data.copy()
    y = train["interest_level"].apply(lambda x: target_num_map[x])
    del train["interest_level"]

    # Out-of-fold meta features, averaged over several CV seeds.
    train_stackers = []
    for rs in [0, 1, 2, 64, 128, 256, 512, 1024, 2048, 4096]:
        skf = StratifiedKFold(n_splits=10, random_state=rs, shuffle=True)

        # Create arrays for the meta features
        train_stacker = [[0.0 for s in range(3)] for k in range(train.shape[0])]
        cv_scores = {i: [] for i in range(len(clfs))}
        cv_scores["Avg"] = []

        print("Begin 10-fold cross validation")
        cnt = 0
        for train_idx, val_idx in skf.split(train, y):
            cnt += 1
            X = train.copy()
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)

            preds = []
            for k, clf in enumerate(clfs):
                clf.fit(X_train, y_train)
                y_val_pred = clf.predict_proba(X_val)
                loss = log_loss(y_val, y_val_pred)
                preds.append(y_val_pred)
                cv_scores[k].append(loss)
                print("Clf_{} iteration {}'s loss: {}".format(k, cnt, loss))

            avg_pred = np.mean(np.array(preds), axis=0)
            avg_loss = log_loss(y_val, avg_pred)
            cv_scores["Avg"].append(avg_loss)
            print("Iteration {}'s Avg loss: {}".format(cnt, avg_loss))

            # Store the out-of-fold predictions at their original row positions.
            for no, real_idx in enumerate(val_idx):
                for i in range(3):
                    train_stacker[real_idx][i] = avg_pred[no][i]

        for i in range(len(clfs)):
            print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i])))
        print("Average validation loss : {}".format(np.mean(cv_scores["Avg"])))
        train_stackers.append(train_stacker)

    train_stacker = np.mean(train_stackers, axis=0)
    print("*** Validation finished ***\n")

    # Refit on the full training set and predict the test set.
    test_stacker = [[0.0 for s in range(3)] for k in range(test.shape[0])]
    train_idx = [i for i in range(train.shape[0])]
    test_idx = [i + train.shape[0] for i in range(test.shape[0])]
    data = pd.concat([train, test]).reset_index()
    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)
    print(X_train.shape, len(train_stacker))

    print("Begin predicting")
    preds = []
    for i in range(len(clfs)):
        print("Clf_{} fitting".format(i))
        clfs[i].fit(X_train, y)
        print("Clf_{} predicting".format(i))
        pred = clfs[i].predict_proba(X_test)
        preds.append(pred)
    preds = np.mean(preds, axis=0)

    for pr in range(len(preds)):
        for d in range(3):
            test_stacker[pr][d] = preds[pr][d]

    print("merging columns")
    # stack the averaged out-of-fold / test predictions onto the original features
    X_train = np.column_stack((X_train.toarray(), train_stacker))
    X_test = np.column_stack((X_test.toarray(), test_stacker))
    # stack target to train and listing_id to test
    X = np.column_stack((y, X_train))
    ids = test.listing_id.values
    X_test = np.column_stack((ids, X_test))

    np.savetxt("./train_stacknet.csv", X, delimiter=",", fmt='%.5f')
    np.savetxt("./test_stacknet.csv", X_test, delimiter=",", fmt='%.5f')

    print("Write results...")
    # File name uses the average CV score from the last seed loop.
    output_file = "submission_{}.csv".format(np.mean(cv_scores["Avg"]))
    print("Writing submission to %s" % output_file)
    with open(output_file, "w") as f:
        f.write("listing_id,high,medium,low\n")  # the header
        for g in range(len(test_stacker)):
            f.write("%s" % ids[g])
            for prediction in test_stacker[g]:
                f.write(",%f" % prediction)
            f.write("\n")
    print("Done.")
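# Sketch of a possible second-level model (an assumption, not the original
# StackNet step): train_stacknet.csv stores the target in column 0 and
# test_stacknet.csv stores listing_id in column 0, with the averaged
# first-level probabilities appended as the last three feature columns of both.
def fit_meta_model():
    from sklearn.linear_model import LogisticRegression

    train_meta = np.loadtxt("./train_stacknet.csv", delimiter=",")
    test_meta = np.loadtxt("./test_stacknet.csv", delimiter=",")

    y_meta = train_meta[:, 0]
    X_meta = train_meta[:, 1:]
    ids = test_meta[:, 0].astype(int)
    X_test_meta = test_meta[:, 1:]

    # Simple multinomial logistic regression as the meta learner.
    meta_clf = LogisticRegression(max_iter=1000)
    meta_clf.fit(X_meta, y_meta)
    probs = meta_clf.predict_proba(X_test_meta)

    # Column order assumes target_num_map maps high/medium/low to 0/1/2,
    # consistent with the column assignment used in gen_sub above.
    sub = pd.DataFrame(probs, columns=["high", "medium", "low"])
    sub["listing_id"] = ids
    sub.to_csv("submission_meta.csv", index=False)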
# NOTE: `full`, `clf` and `preprcessUtil` are assumed to be defined earlier in this script.
attributes = full.columns[1:]
skf = StratifiedKFold(n_splits=3, shuffle=True)
data = full[attributes].copy()
utils = preprcessUtil.Utils()
y = full['timestamp'].apply(utils.processTime)

cv_scores = []
i = 0
for train_idx, val_idx in skf.split(data, y):
    i += 1
    X = data.copy()
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)

    clf.fit(X_train, y_train)
    y_val_prob = clf.predict_proba(X_val)  # probabilities, for log loss
    y_val_pred = clf.predict(X_val)        # hard labels, for precision/recall

    pred = y_val_pred
    label = y_val.values
    accuracy = 100.0 * np.mean(pred == label)

    # log_loss expects predicted probabilities, not hard labels.
    loss = log_loss(y_val, y_val_prob)
    precision = precision_score(label, pred, average='binary')
    recall = recall_score(label, pred, average='binary')
    cv_scores.append(loss)
    print("Iteration {}: loss={}, accuracy={:.2f}%, precision={}, recall={}".format(
        i, loss, accuracy, precision, recall))