        'colsample_bytree': 0.5,
        'target': 'target',              # leftover GraphLab-style keys; apparently unused by xgb.train
        'validation_set': xg_valid,
        'num_class': 9,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',       # xgboost expects 'eval_metric', not 'eval:metric'
        'silent': 1,
    }
    watchlist = [(xg_train, 'train'), (xg_valid, 'valid')]
    bst = xgb.train(params, xg_train, rounds, watchlist,
                    early_stopping_rounds=100, evals_result=evals)
    return bst, evals


if __name__ == '__main__':
    X, y = OttoCompetition.load_data(train=True)
    X_test, _ = OttoCompetition.load_data(train=False)
    le = LabelEncoder().fit(y)

    all_hold = []
    all_hold_predict = []
    all_test_predict = []
    all_weights = []

    # 20% holdout
    for i, (data_index, hold_index) in enumerate(StratifiedKFold(y, n_folds=5, random_state=0)):
        X_data, X_hold = X[data_index], X[hold_index]
        y_data, y_hold = y[data_index], y[hold_index]
        y_hold_predict = []
        y_test_predict = []

        # train on 50% of the fold data, validate on 5%
        for j, (train_index, valid_index) in enumerate(
                StratifiedShuffleSplit(y_data, 20, test_size=0.05, train_size=0.5, random_state=0)):
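            # Hedged sketch (not from the original source): one plausible body for this inner
            # loop. It assumes a hypothetical train_xgb() helper wrapping the params/xgb.train
            # call above, and uses the fitted LabelEncoder `le`. Each bagged model predicts the
            # 20% holdout and the test set at its best iteration; the per-model predictions
            # would be averaged outside the loop.
            X_train, X_valid = X_data[train_index], X_data[valid_index]
            y_train, y_valid = y_data[train_index], y_data[valid_index]

            xg_train = xgb.DMatrix(X_train, label=le.transform(y_train))
            xg_valid = xgb.DMatrix(X_valid, label=le.transform(y_valid))

            bst, evals = train_xgb(xg_train, xg_valid)  # hypothetical wrapper around the code above
            ntree = bst.best_iteration                  # set by early stopping
            y_hold_predict.append(bst.predict(xgb.DMatrix(X_hold), ntree_limit=ntree))
            y_test_predict.append(bst.predict(xgb.DMatrix(X_test), ntree_limit=ntree))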
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit  # pre-0.18 sklearn API

from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet

# OttoCompetition and OttoScaler are the project's local data-loading/scaling helpers.

if __name__ == '__main__':
    encoder = LabelEncoder()
    # Identical to a StandardScaler fit on all train and test data.
    scaler = OttoScaler()

    # Training data
    X, y = OttoCompetition.load_data(train=True)
    y = encoder.fit_transform(y).astype('int32')
    X = scaler.transform(X).astype('float32')
    n_classes = np.unique(y).shape[0]
    n_features = X.shape[1]

    # Split off a holdout set
    data_idx, hold_idx = next(iter(StratifiedShuffleSplit(y, 1, test_size=0.2, random_state=0)))
    X_data, X_hold = X[data_idx], X[hold_idx]
    y_data, y_hold = y[data_idx], y[hold_idx]

    # Test data
    X_test, _ = OttoCompetition.load_data(train=False)
    X_test = scaler.transform(X_test).astype('float32')
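    # Hedged sketch (not from the original source): one way the imported layers could be
    # wired into a nolearn NeuralNet for this data. Layer sizes, dropout rate, learning
    # rate, momentum and max_epochs are illustrative assumptions, not the author's values.
    net = NeuralNet(
        layers=[('input', InputLayer),
                ('dense0', DenseLayer),
                ('dropout0', DropoutLayer),
                ('dense1', DenseLayer),
                ('output', DenseLayer)],
        input_shape=(None, n_features),
        dense0_num_units=512,
        dropout0_p=0.5,
        dense1_num_units=512,
        output_num_units=n_classes,
        output_nonlinearity=softmax,
        update=nesterov_momentum,
        update_learning_rate=0.01,
        update_momentum=0.9,
        max_epochs=50,
        verbose=1,
    )
    net.fit(X_data, y_data)
    y_hold_proba = net.predict_proba(X_hold)   # evaluate on the 20% holdout
    y_test_proba = net.predict_proba(X_test)   # predictions for submission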
]

Xt, X_test, yt, y_test = train_test_split(train_ffm.values, y.values, test_size=0.2)

ll = []
for i, (train_index, valid_index) in enumerate(StratifiedKFold(yt, n_folds=10, random_state=0)):
    print('Fold {}'.format(i))
    X_train, X_valid = Xt[train_index], Xt[valid_index]
    y_train, y_valid = yt[train_index], yt[valid_index]
    valid_set = []
    lb = LabelBinarizer()
    ybin = lb.fit_transform(yt)
    # Train one binary libffm model per class (one-vs-rest), then renormalise the
    # stacked predictions into a multiclass probability distribution.
    for ylabel in lb.classes_:
        print(ylabel)
        # First column: binary target for this class; remaining columns: features.
        tdf = pd.DataFrame(np.vstack([(y_train == ylabel).T, X_train.T]).T)
        vdf = pd.DataFrame(np.vstack([(y_valid == ylabel).T, X_valid.T]).T)
        train_file = './ffm/ffm_train_fold_{}_{}.csv'.format(i, ylabel)
        valid_file = './ffm/ffm_valid_fold_{}_{}.csv'.format(i, ylabel)
        model_file = './ffm/ffm_model_fold_{}_{}.csv'.format(i, ylabel)
        predt_file = './ffm/ffm_predt_fold_{}_{}.csv'.format(i, ylabel)
        # pandas to_csv takes `quoting=`, not `quote=`
        tdf.to_csv(train_file, sep=" ", header=False, index=False,
                   quoting=csv.QUOTE_NONE, quotechar=" ")
        vdf.to_csv(valid_file, sep=" ", header=False, index=False,
                   quoting=csv.QUOTE_NONE, quotechar=" ")
        check_call(['ffm-train'] + ffm_params + ['-p', valid_file, train_file, model_file])
        check_call(['ffm-predict', valid_file, model_file, predt_file])
        valid_set.append(np.loadtxt(predt_file))
    yp = np.array(valid_set).T
    yp = yp / yp.sum(axis=1)[:, np.newaxis]
    ll.append(OttoCompetition.score(y_valid, yp, lb.classes_.tolist()))
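# Hedged sketch (not from the original source): summarise the per-fold scores collected in
# `ll` above, assuming OttoCompetition.score() returns a scalar multiclass log-loss.
print('mean logloss over {} folds: {:.5f} (+/- {:.5f})'.format(
    len(ll), np.mean(ll), np.std(ll)))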