d['target'] = gl.SArray(targets) return gl.SFrame(d) def _preds_to_array(self, preds): p = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') p['id'] = p['id'].astype(int) + 1 p = p.sort('id') del p['id'] preds_array = np.array(p.to_dataframe(), dtype=float) return preds_array if __name__ == '__main__': train, labels, test, _, _ = utils.load_data() clf = XGBoost(max_iterations=4800, max_depth=12, min_child_weight=4.9208250938262745, row_subsample=.9134478530382129, min_loss_reduction=.5132278416508804, column_subsample=.730128689911957, step_size=.009) if MODE == 'cv': scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) print ('CV:', scores, 'Mean log loss:', np.mean(scores)) utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions) elif MODE == 'submission': clf.fit(train, labels) predictions = clf.predict_proba(test) utils.save_submission(consts.DATA_SAMPLE_SUBMISSION_PATH, os.path.join(consts.ENSEMBLE_PATH, MODEL_NAME + '.csv'), predictions)
import numpy as np
import os

from sklearn import ensemble, feature_extraction, preprocessing

from otto_utils import consts, utils


MODEL_NAME = 'model_02_random_forest'
MODE = 'holdout'  # cv|submission|holdout

# Load the data; the last two return values are not needed in this script.
train, labels, test, _, _ = utils.load_data()

# Re-weight the count features with TF-IDF. The transformer is fitted on the
# training set only and then applied to the test set.
tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# Encode the class labels as integers.
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# Classifier configuration.
# NOTE(review): MODEL_NAME says "random_forest" but the estimator is an
# ExtraTreesClassifier -- the name is kept because it determines output files.
extra_trees_params = {
    'n_jobs': 4,
    'n_estimators': 2000,
    'max_features': 20,
    'min_samples_split': 3,
    'bootstrap': False,
    'verbose': 3,
    'random_state': 23,
}
clf = ensemble.ExtraTreesClassifier(**extra_trees_params)

if MODE == 'cv':
    # Cross-validate and store the predictions for blending.
    scores, predictions = utils.make_blender_cv(clf, train, labels,
                                                calibrate=False)
    print('CV:', scores, 'Mean log loss:', np.mean(scores))
    utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv',
                             predictions)
iter_funcs = self.create_test_function(dataset, self.model) num_batches_test = int(math.ceil(dataset['num_examples_test'] / float(self.batch_size))) test_preds, test_probas = np.array([]), None for b in range(num_batches_test): batch_test_pred, batch_test_proba = iter_funcs['test'](b) test_preds = np.append(test_preds, batch_test_pred) test_probas = np.append(test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba return test_preds, test_probas if __name__ == '__main__': train, labels, test, _, _ = utils.load_data(os.path.join(consts.DATA_PATH, 'fe_train.csv'), os.path.join(consts.DATA_PATH, 'fe_test.csv')) from sklearn import decomposition # PCA pp = decomposition.PCA() train = pp.fit_transform(train) test = pp.transform(test) clf = NeuralNetwork(1024, 110, 128, 0.00013934891814068934, 0.9724490021642429, 6.238206486137665e-05, 0.3081052487919688, .02, True, 10, random_state=21) if MODE == 'cv': scores, predictions = utils.make_blender_cv(clf, train, labels, calibrate=False) print( 'CV:', scores, 'Mean log loss:', np.mean(scores)) utils.write_blender_data(consts.BLEND_PATH, MODEL_NAME + '.csv', predictions)
test_preds, test_probas = np.array([]), None for b in range(num_batches_test): batch_test_pred, batch_test_proba = iter_funcs['test'](b) test_preds = np.append(test_preds, batch_test_pred) test_probas = np.append( test_probas, batch_test_proba, axis=0) if test_probas is not None else batch_test_proba return test_preds, test_probas if __name__ == '__main__': train, labels, test, _, _ = utils.load_data( os.path.join(consts.DATA_PATH, 'fe_train.csv'), os.path.join(consts.DATA_PATH, 'fe_test.csv')) from sklearn import decomposition # PCA pp = decomposition.PCA() train = pp.fit_transform(train) test = pp.transform(test) clf = NeuralNetwork(1024, 110, 128, 0.00013934891814068934, 0.9724490021642429, 6.238206486137665e-05, 0.3081052487919688,
It achieves around 0.52914588084 log loss on holdout set """ import numpy as np import os from sklearn import ensemble, feature_extraction, linear_model, preprocessing from sklearn.svm import LinearSVC from otto_utils import consts, utils MODEL_NAME = 'model_01_bagging_linear' MODE = 'holdout' # cv|submission|holdout # import data train, labels, test, _, _ = utils.load_data() # 这里的_可能是取消这个变量 # polynomial features poly_feat = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True) train = poly_feat.fit_transform(train, labels) test = poly_feat.transform(test) print train.shape # transform counts to TFIDF features tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False) train = tfidf.fit_transform(train).toarray() test = tfidf.transform(test).toarray()