def test_quadratic_weighted_kappa(self):
    kappa = metrics.quadratic_weighted_kappa([1, 2, 3], [1, 2, 3])
    self.assertAlmostEqual(kappa, 1.0)
    kappa = metrics.quadratic_weighted_kappa([1, 2, 1], [1, 2, 2], 1, 2)
    self.assertAlmostEqual(kappa, 0.4)
    kappa = metrics.quadratic_weighted_kappa([1, 2, 3, 1, 2, 2, 3],
                                             [1, 2, 3, 1, 2, 3, 2])
    self.assertAlmostEqual(kappa, 0.75)
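# A minimal reference sketch of what the metric computes (this is NOT the
# ml_metrics implementation, just an illustration of the formula the tests
# above exercise): observed vs. expected disagreement with quadratic weights.
import numpy as np

def qwk_sketch(rater_a, rater_b, min_rating=None, max_rating=None):
    rater_a = np.asarray(rater_a, dtype=int)
    rater_b = np.asarray(rater_b, dtype=int)
    if min_rating is None:
        min_rating = min(rater_a.min(), rater_b.min())
    if max_rating is None:
        max_rating = max(rater_a.max(), rater_b.max())
    n = max_rating - min_rating + 1
    # observed agreement matrix (confusion matrix of the two raters)
    observed = np.zeros((n, n))
    for a, b in zip(rater_a, rater_b):
        observed[a - min_rating, b - min_rating] += 1
    # expected matrix from the marginal rating histograms
    hist_a = observed.sum(axis=1)
    hist_b = observed.sum(axis=0)
    expected = np.outer(hist_a, hist_b) / len(rater_a)
    # quadratic disagreement weights
    weights = np.array([[(i - j) ** 2 for j in range(n)] for i in range(n)],
                       dtype=float) / (n - 1) ** 2
    return 1.0 - (weights * observed).sum() / (weights * expected).sum()

# e.g. qwk_sketch([1, 2, 1], [1, 2, 2], 1, 2) -> 0.4, matching the test above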
def linear_reg(selected, sorted_feature, feature_data):
    features = []
    scores = feature_data['score']
    lm = linear_model.LinearRegression()
    kf = KFold(n_splits=10)
    overall = []
    for title in sorted_feature.keys():
        if title in selected:
            features.append(title)
    for i in range(len(scores)):
        z = 0
        X = []
        for t in features:
            X.append(feature_data[t][i])
        X = np.array([list(x) for x in zip(*X)])
        count = 0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = np.array(scores[i])[train_index], np.array(scores[i])[test_index]
            model = lm.fit(X_train, y_train)
            pred_score = lm.predict(X_test)
            kappa = metrics.quadratic_weighted_kappa(y_test, pred_score)
            # accumulate Fisher z-transformed kappas so they can be averaged across folds
            z += 1 / 2 * math.log((1 + kappa) / (1 - kappa))
            count += 1
        # back-transform the mean z to get the mean kappa for this essay set
        weighted_kappa_mean = (math.e**(2 * z / count) - 1) / (math.e**(2 * z / count) + 1)
        overall.append(weighted_kappa_mean)
        print(weighted_kappa_mean)
    print(sum(overall) / len(overall))
    return
def feature_selection(sorted_feature, feature_data):
    features = []
    scores = feature_data['score']
    lm = linear_model.LinearRegression()
    prev = 0
    for title in sorted_feature.keys():
        features.append(title)
        kappa = 0
        z = 0
        for i in range(len(scores)):
            X = []
            for t in features:
                X.append(feature_data[t][i])
            if len(X) == 1:
                X = np.array(X[0]).reshape(-1, 1)
            else:
                X = [list(x) for x in zip(*X)]
            model = lm.fit(X, scores[i])
            pred_score = lm.predict(X)
            kappa = metrics.quadratic_weighted_kappa(scores[i], pred_score)
            z += 1 / 2 * math.log((1 + kappa) / (1 - kappa))
        weighted_kappa_mean = (math.e**(2 * z / (len(scores))) - 1) / (math.e**(2 * z / (len(scores))) + 1)
        if weighted_kappa_mean < prev:
            features = features[:-1]
        else:
            prev = weighted_kappa_mean
        print(features)
        print(weighted_kappa_mean)
    pickle.dump(features, open('selected_features.txt', 'wb'))
    return features
def predict_score(): file = open("model/predictions.txt") numarray = [] while 1: line = file.readline() if not line: break numarray.append(int(float(line))) file = open("model/answers.txt") answerarray = [] while 1: line = file.readline() if not line: break answerarray.append(int(float(line))) ##print len(numarray) ##print len(answerarray) solutionarray = [] for x in range(0, len(numarray)): if numarray[x] == answerarray[x]: solutionarray.append(1) else: solutionarray.append(0) onecounter = solutionarray.count(1) print "QWK_Score: " + str(metrics.quadratic_weighted_kappa(answerarray,numarray))
def predict_score(): file = open("model/predictions.txt") numarray = [] while 1: line = file.readline() if not line: break numarray.append(int(float(line))) file = open("model/answers.txt") answerarray = [] while 1: line = file.readline() if not line: break answerarray.append(int(float(line))) ##print len(numarray) ##print len(answerarray) solutionarray = [] for x in range(0, len(numarray)): if numarray[x] == answerarray[x]: solutionarray.append(1) else: solutionarray.append(0) onecounter = solutionarray.count(1) print "QWK_Score: " + str( metrics.quadratic_weighted_kappa(answerarray, numarray))
def eval_dag(dag, filename, dag_id=None):
    dag = normalize_dag(dag)
    if filename not in input_cache:
        input_cache[filename] = pd.read_csv('data/' + filename, sep=';')
    data = input_cache[filename]
    feats = data[data.columns[:-1]]
    targets = data[data.columns[-1]]
    le = preprocessing.LabelEncoder()
    ix = targets.index
    targets = pd.Series(le.fit_transform(targets), index=ix)
    errors = []
    start_time = time.time()
    for train_idx, test_idx in cross_validation.StratifiedKFold(targets, n_folds=5):
        train_data = (feats.iloc[train_idx], targets.iloc[train_idx])
        test_data = (feats.iloc[test_idx], targets.iloc[test_idx])
        ms = train_dag(dag, train_data)
        preds = test_dag(dag, ms, test_data)
        acc = mm.quadratic_weighted_kappa(test_data[1], preds)
        errors.append(acc)
    m_errors = float(np.mean(errors))
    s_errors = float(np.std(errors))
    return m_errors, s_errors, time.time() - start_time
def on_epoch_end(self, epoch, logs={}):
    self.counter += 1
    # score the validation data
    p = self.model.predict(self.X_val, verbose=0)
    # current kappa
    current = ml_metrics.quadratic_weighted_kappa(
        self.y_val.values.ravel(),
        np.clip(np.round(p.astype(int).ravel()), 1, 8))
    print('Epoch %d Kappa: %f | Best Kappa: %f \n' % (epoch, current, self.best))
    # if improvement over best....
    if current > self.best:
        self.best = current
        self.best_rounds = self.counter
        self.wait = 0
        self.model.save_weights(self.filepath, overwrite=True)
        print("model save weights")
    else:
        if self.wait >= self.patience:
            # no more patience, retrieve best model
            self.model.stop_training = True
            print('Best number of rounds: %d \nKappa: %f \n' % (self.best_rounds, self.best))
            self.model.load_weights(self.filepath)
        self.wait += 1  # increment the number of epochs without improvement
def predict(self, model, xg_train, xg_test, objective='reg:linear'):
    """
    Parameters
    ----------
    model : xgboost.Booster
        xgboost model ready for making predictions
    xg_train : xgboost.DMatrix
        training data
    xg_test : xgboost.DMatrix
        testing data

    Returns
    -------
    model_prediction : ModelPrediction (named tuple)
    """
    train_score = model.predict(xg_train, ntree_limit=model.best_iteration)
    test_score = model.predict(xg_test, ntree_limit=model.best_iteration)
    train_label = np.asarray(xg_train.get_label())
    test_label = np.asarray(xg_test.get_label())
    if objective == 'reg:linear':
        # Cutoffs are optimized here
        best_cuts = optimize_cutoffs(train_score, train_label, verbose=False)
        train_prediction = classify_with_cutoffs(train_score, best_cuts)
        test_prediction = classify_with_cutoffs(test_score, best_cuts)
    else:
        train_prediction = train_score
        test_prediction = test_score
    train_qwk = quadratic_weighted_kappa(train_label, train_prediction)
    test_qwk = quadratic_weighted_kappa(test_label, test_prediction)
    return ModelPrediction(
        train_label, test_label, train_score, test_score,
        train_prediction, test_prediction, train_qwk, test_qwk,
        precision_score(train_label, train_prediction, average=None),
        precision_score(test_label, test_prediction, average=None))
def evalerror_softmax_cdf(preds, dtrain, cdf):
    ## labels are in [0,1,2,3]
    labels = dtrain.get_label() + 1
    preds = getClfScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for using early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
def _score_offset(self, bin_offset, sv):
    flg = self._data[:, 0].astype(int) == sv
    self._data[flg, 1] = self._data[flg, 0] + bin_offset
    offset_pred = np.clip(np.round(self._data[:, 1]), 1, 8).astype(int)
    kappa = quadratic_weighted_kappa(self._data[:, 2], offset_pred)
    return -kappa
def minimize_quadratic_weighted_kappa(cutpoints, y_pred=None, y=None):
    cutpoints = np.sort(cutpoints)
    cutpoints = np.concatenate([[-99999999999999999], cutpoints, [999999999999999]])
    y_pred = pd.cut(y_pred, bins=cutpoints, labels=[1, 2, 3, 4, 5, 6, 7, 8])
    score = quadratic_weighted_kappa(y, y_pred)
    print score
    return -score
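# A minimal usage sketch (preds/labels are assumed to be regression outputs and
# true labels on the 1-8 scale; scipy is assumed available): the cutpoints that
# bin continuous predictions into the eight classes are tuned by minimizing the
# negated kappa returned above.
from scipy.optimize import minimize

initial_cutpoints = [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]  # hypothetical starting bins
res = minimize(minimize_quadratic_weighted_kappa, initial_cutpoints,
               args=(preds, labels), method='Nelder-Mead')
best_cutpoints = np.sort(res.x)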
def keras_model():
    import pandas as pd
    import numpy as np
    from keras.preprocessing import sequence
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation, Flatten
    from keras.layers.convolutional import Convolution1D, MaxPooling1D
    from keras.callbacks import EarlyStopping
    from keras.utils import np_utils
    from data_util import load_csvs, load_other
    import ml_metrics as metrics

    nb_words = 6500
    maxlen = 175
    filter_length = 10
    other_col_dim = 4

    X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/tpov4/train_1.csv',
                                                             'data/tpov4/test_1.csv',
                                                             nb_words, maxlen, 'self', w2v=None)
    # read _other.csv
    other_train = load_other('data/tpov4/train_1_other.csv', maxlen, other_col_dim)
    other_test = load_other('data/tpov4/test_1_other.csv', maxlen, other_col_dim)
    print('other tensor:', other_train.shape)

    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Convolution1D(nb_filter=50,
                            filter_length=filter_length,
                            border_mode="valid",
                            activation="relu",
                            input_shape=(maxlen, other_col_dim)))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.05))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer={{choice(['rmsprop', 'adam', 'adadelta', 'adagrad'])}})

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit(other_train, Y_train, batch_size=32, nb_epoch=25,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(other_test, batch_size=32)
    org_classes = np_utils.categorical_probas_to_classes(Y_test)

    acc = np_utils.accuracy(classes, org_classes)  # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, org_classes)
    print('Test Kappa:', kappa)
    return {'loss': -acc, 'status': STATUS_OK}
def eval_wrapper(yhat, y): """ Evaluation metric for the competition : quad weighted kappa """ y = np.array(y) y = y.astype(int) yhat = np.array(yhat) yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int) return quadratic_weighted_kappa(yhat, y)
def on_epoch_end(self, epoch, logs={}):
    if epoch % 5 == 0 or epoch == (epochs2 - 1):
        self.predhis.append(
            model2.predict([np.array(X_train[x]) for x in variables2]))
        self.predhisval.append(
            model2.predict([np.array(X_validation[x]) for x in variables2]))
        self.scores_train.append(
            metrics.quadratic_weighted_kappa(
                get_output(predictions2.predhis[-1]),
                np.array(X_train['revenue_class'])))
        self.scores_validation.append(
            metrics.quadratic_weighted_kappa(
                np.digitize(predictions2.predhisval[-1][:, 0],
                            get_offset(predictions2.predhis[-1])) + 1,
                np.array(X_validation['revenue_class'])))
        print 'training : ' + str(self.scores_train[-1])
        print 'validation : ' + str(self.scores_validation[-1])
def evalerror(preds, dtrain):
    ## labels are in [0,1,2,3] as required by XGBoost for multi-classification
    labels = dtrain.get_label() + 1
    ## class probability
    preds = softmax(preds)
    ## decoding (naive argmax decoding)
    pred_labels = np.argmax(preds, axis=1) + 1
    ## compute quadratic weighted kappa (using implementation from @Ben Hamner,
    ## https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/quadratic_weighted_kappa.py)
    kappa = quadratic_weighted_kappa(labels, pred_labels)
    return 'kappa', kappa
def evalerror_cocr_cdf(preds, dtrain, cdf):
    labels = dtrain.get_label() + 1
    #print preds.shape
    ## get prediction
    #preds = sigmoid(preds)
    preds = applyCOCRRule(preds)
    preds = getScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for using early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
def _offset_qwk_score(self, offset):
    """
    :param numpy.array offset:
    :rtype: float
    """
    offset_pred = self._apply_offset(self._data, offset)
    kappa = quadratic_weighted_kappa(self._data[:, 2], offset_pred)
    return -kappa
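# A hedged sketch of how a scorer like _offset_qwk_score is typically driven
# from inside the same class (scipy's fmin and the starting offsets here are
# assumptions, not taken from the source): the per-class offsets that maximize
# kappa are found by minimizing the negated score.
from scipy.optimize import fmin

initial_offsets = np.zeros(8)  # hypothetical: one offset per rating on a 1-8 scale
best_offsets = fmin(self._offset_qwk_score, initial_offsets, disp=False)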
def evaluate_prediction(self, key_word=r'', decimal_places=4, csv_dump=False,
                        df_ac_predict_target=pd.DataFrame(), predict_res=np.array([])):
    if len(df_ac_predict_target) == 0:
        df_ac_predict_target = self.df_ac_predict_target
    if len(predict_res) == 0:
        predict_res = self.predict_res
    recall = round(
        mtrx.recall_score(df_ac_predict_target.transpose().values[0],
                          predict_res, average='weighted'), decimal_places)
    precision = round(
        mtrx.precision_score(df_ac_predict_target.transpose().values[0],
                             predict_res, average='weighted'), decimal_places)
    f1 = round(
        mtrx.f1_score(df_ac_predict_target.transpose().values[0],
                      predict_res, average='weighted'), decimal_places)
    kappa = round(
        metrics.kappa(predict_res, df_ac_predict_target.transpose().values[0]),
        decimal_places)
    qwk = round(
        metrics.quadratic_weighted_kappa(predict_res,
                                         df_ac_predict_target.transpose().values[0]),
        decimal_places)
    self.se_indices = pd.Series(
        [recall, precision, f1, kappa, qwk],
        index=['Recall', 'Precision', 'F1', 'Kappa', 'Quadratic Weighted Kappa'])
    print(self.se_indices)
    self.conf_mtx = pd.DataFrame(
        mtrx.confusion_matrix(df_ac_predict_target.transpose().values[0], predict_res))
    print('Confusion Matrix:')
    print(self.conf_mtx)
    if csv_dump == True:
        self.se_indices.to_csv(self.data_dir + r'Classified-Prediction-Indices-' +
                               key_word + r'.csv', encoding='latin1')
        self.conf_mtx.to_csv(self.data_dir + r'Classified-Prediction-Confusion-Matrix-' +
                             key_word + r'.csv', encoding='latin1')
def cnn1d_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                   maxlen, vocab_size, embd_dim,
                   nb_filter, filter_length, batch_size, nb_epoch, optm):
    """
    - CNN-1d on text input (represented in int)
    - MOT
    - dropout + L2 softmax

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen max of n char in a sentence
    :param vocab_size
    :param embd_dim
    :param nb_filter
    :param filter_length
    :param batch_size
    :param nb_epoch
    :param optm optimizer options, e.g., adam, rmsprop, etc.
    :return:
    """
    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Embedding(vocab_size, embd_dim, input_length=maxlen))
    model.add(Dropout(0.25))
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode="valid",
                            activation="relu"))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test accuracy:', acc)
    # return(acc)
    kappa = metrics.quadratic_weighted_kappa(classes,
                                             np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
def default_errorfun(p, ysc, ytr):
    """
    Parameters
    ----------
    p : array of 8 cutoff values
    ysc : array of scores [array(double)]
    ytr : array of true labels [array(int)]
    """
    errors = quadratic_weighted_kappa(
        classify_with_cutoffs(ysc, p).astype(np.int64), ytr)
    return 1 - errors
def on_epoch_end(self, epoch, logs={}):
    p = self.model.predict(self.X_val.values, verbose=0)
    current = ml_metrics.quadratic_weighted_kappa(
        self.y_val.values.ravel(),
        np.clip(np.round(p.astype(int).ravel()), 1, 8))
    if current > self.best:
        self.best = current
        self.wait = 0
    else:
        if self.wait >= self.patience:
            self.model.stop_training = True
            print('Epoch %05d: early stopping' % (epoch))
        self.wait += 1  # increment the number of epochs without improvement
    print('Epoch %d Kappa: %f | Best Kappa: %f \n' % (epoch, current, self.best))
def calc_mqwp(output):
    """
    Calculate the mean quadratic_weighted_kappa across all the question sets

    :param output: dataframe containing target, output, question set
    :return: mean quadratic weighted kappa
    """
    groups = output.groupby('set')
    kappas = [
        quadratic_weighted_kappa(group[1]["output"], group[1]["target"])
        for group in groups
    ]
    print('Kappa of each set: ', kappas)
    mean = mean_quadratic_weighted_kappa(kappas)
    return mean
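# Hypothetical usage of calc_mqwp: a frame with one row per essay holding the
# true score ('target'), the predicted score ('output') and its question 'set'.
import pandas as pd

example = pd.DataFrame({
    'set':    [1, 1, 1, 2, 2, 2],
    'target': [1, 2, 3, 1, 2, 3],
    'output': [1, 2, 2, 1, 3, 3],
})
print(calc_mqwp(example))  # per-set kappas combined via mean_quadratic_weighted_kappa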
def ensembleSelectionObj(param, p1_list, weight1, p2_list, true_label_list, cdf_list, numValidMatrix):
    weight2 = param['weight2']
    kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    for run in range(config.n_runs):
        for fold in range(config.n_folds):
            numValid = numValidMatrix[run][fold]
            p1 = p1_list[run, fold, :numValid]
            p2 = p2_list[run, fold, :numValid]
            true_label = true_label_list[run, fold, :numValid]
            cdf = cdf_list[run, fold, :]
            p_ens = (weight1 * p1 + weight2 * p2) / (weight1 + weight2)
            p_ens_score = getScore(p_ens, cdf)
            kappa_cv[run][fold] = quadratic_weighted_kappa(p_ens_score, true_label)
    kappa_cv_mean = np.mean(kappa_cv)
    return {'loss': -kappa_cv_mean, 'status': STATUS_OK}
def _cut_qwk_score(cut_points, y_pred, y_true):
    """
    :param list cut_points:
    :param numpy.array y_true:
    :param numpy.array y_pred:
    :rtype: float
    """
    try:
        d_pred = np.digitize(y_pred[:, 0], cut_points) + 1
    except ValueError:
        return 1
    kappa = quadratic_weighted_kappa(y_true, d_pred)
    return -kappa
def xgb_regression_quadratic_weighted_kappa(preds, dtrain):
    labels = dtrain.get_label()
    cutpoints = [1.886638, 3.303624, 4.152756, 4.825063, 5.653934, 6.236325, 6.765184]
    res = minimize(minimize_quadratic_weighted_kappa, cutpoints, (preds, labels),
                   method='BFGS')
    cutpoints = np.sort(res.x)
    cutpoints = np.concatenate([[-99999999999999999], cutpoints, [999999999999999]])
    y_pred = pd.cut(preds, bins=cutpoints, labels=[1, 2, 3, 4, 5, 6, 7, 8])
    kappa = quadratic_weighted_kappa(labels, y_pred)
    ## we return -kappa for using early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
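# A minimal sketch (dataset names and parameter values are hypothetical) of
# wiring this metric into xgboost's legacy training API: with a plain
# regression objective, the raw predictions handed to feval are the continuous
# scores the cutpoint search above expects, and because the function returns
# -kappa, early stopping on the minimized metric maximizes kappa.
import xgboost as xgb

params = {'objective': 'reg:linear', 'eta': 0.05, 'max_depth': 6}
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist,
                feval=xgb_regression_quadratic_weighted_kappa,
                early_stopping_rounds=100)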
def test_classifier(parameters, clsClass, feats, targets, filename):
    errors = []
    for train_idx, test_idx in cross_validation.StratifiedKFold(targets, n_folds=5):
        cls = clsClass(**parameters)
        train_data = (feats.iloc[train_idx], targets.iloc[train_idx])
        test_data = (feats.iloc[test_idx], targets.iloc[test_idx])
        cls.fit(train_data[0], train_data[1])
        preds = cls.predict(test_data[0])
        acc = mm.quadratic_weighted_kappa(test_data[1], preds)
        if filename == 'ml-prove.csv':
            acc = metrics.accuracy_score(test_data[1], preds)
        errors.append(acc)
    return errors, parameters
def cnn1d_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                  maxlen, nb_filter, filter_length, batch_size, nb_epoch, optm):
    """
    - CNN-1d on a 3d tensor which uses word2vec embedding
    - MOT

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen max of n char in a sentence
    :param nb_filter
    :param filter_length
    :param batch_size
    :param nb_epoch
    :param optm
    :return:
    """
    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode="valid",
                            activation="relu",
                            input_shape=(maxlen, 300)))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))
    # accuracy only supports classes
    print('Test accuracy:', acc)
    # return(acc)
    kappa = metrics.quadratic_weighted_kappa(classes,
                                             np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
def lstm_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                  maxlen, vocab_size, embd_dim, batch_size, nb_epoch, optm):
    """
    - LSTM on text input (represented in int)
    - fully-connected model

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen max of n char in a sentence
    :param vocab_size
    :param embd_dim
    :param batch_size
    :param nb_epoch
    :param optm optimizer options, e.g., adam, rmsprop, etc.
    :return:
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embd_dim, input_length=maxlen))
    model.add(Dropout(0.25))
    # model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(50))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))
    # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes,
                                             np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
def objective(params):
    mlp_params = {
        "layers_architecture": [int(l) for l in params["layers"]["n_units_layer"]],
        "enable_batch_normalization": [True] * params["layers"]["n_layers"],
        "activations": [params["activation"]] * params["layers"]["n_layers"],
        "dropout_probas": [params["dropout"]] * params["layers"]["n_layers"],
        "optimizer": "adam",
        "learning_rate": params["learning_rate"],
        "l2_regularization": params["l2_regularization"],
        "metric": "rmse",
        "epochs": 500,
        "batch_size": 2048
    }

    mlp = MLPClassifier(
        mlp_params,
        weights_saving_directory_path_str="D:/Projets_Data_Science/Competitions/Kaggle/PetFinder.my_Adoption_Prediction/data/weights/")

    # Train the model
    mlp.fit(X_train, y_train)

    # Make predictions
    predictions_npa = mlp.predict(X_test)

    # Evaluate the model
    qwk = quadratic_weighted_kappa(y_test, predictions_npa)
    print(mlp_params)
    print("QWK = ", qwk)

    return -qwk  # Return negative value as we want to maximize it
def feature_testing(feature_data):
    scores = feature_data['score']
    lm = linear_model.LinearRegression()
    kappa = {}
    # for each feature (j) in each essay set (i)
    for i in range(len(scores)):
        for key, value in feature_data.items():
            if key != 'score':
                model = lm.fit(np.array(value[i]).reshape(-1, 1), scores[i])
                pred_score = lm.predict(np.array(value[i]).reshape(-1, 1))
                if key not in kappa:
                    kappa[key] = []
                kappa[key].append(
                    metrics.quadratic_weighted_kappa(scores[i], pred_score))
    for key, value in kappa.items():
        kappa[key] = sum(value) / len(value)
    sorted_kappa = dict(
        sorted(kappa.items(), key=operator.itemgetter(1), reverse=True))
    print(sorted_kappa)
    return sorted_kappa
def objective(params):
    lgb_params = {
        "application": "multiclass",
        "boosting": "gbdt",
        "metric": "qwk",
        "num_class": 5,
        "num_leaves": int(params["num_leaves"]),
        "max_depth": -1,
        "learning_rate": "{:.4f}".format(params["learning_rate"]),
        "bagging_fraction": "{:.4f}".format(params["bagging_fraction"]),
        "feature_fraction": "{:.4f}".format(params["feature_fraction"]),
        "min_split_gain": "{:.4f}".format(params["min_split_gain"]),
        "min_child_samples": int(params["min_child_samples"]),
        "min_child_weight": "{:.4f}".format(params["min_child_weight"]),
        "verbosity": -1,
        "seed": 17,
        "nthread": 16,
        "device": "cpu"
    }

    lgbm = BlendedLGBMClassifier(lgb_params, early_stopping_rounds=150,
                                 eval_size=0.2, eval_split_type="random",
                                 verbose_eval=100, nrounds=10000)

    # Train the model
    lgbm.fit(X_train, y_train)

    # Make predictions
    predictions_npa = lgbm.predict(X_test)

    # Evaluate the model
    qwk = quadratic_weighted_kappa(y_test, predictions_npa)
    print("QWK = ", qwk)

    return -qwk  # Return negative value as we want to maximize it
def cnn_var_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                     maxlen, vocab_size, embd_size,
                     nb_filter, batch_size, nb_epoches, optm):
    ngram_filters = [2, 5, 8]

    input = Input(shape=(maxlen,), name='input', dtype='int32')
    embedded = Embedding(input_dim=vocab_size, output_dim=embd_size,
                         input_length=maxlen)(input)

    convs = [None, None, None]  # three CNNs
    for i, n_gram in enumerate(ngram_filters):
        pool_length = maxlen - n_gram + 1
        convs[i] = Convolution1D(nb_filter=nb_filter, filter_length=n_gram,
                                 border_mode="valid", activation="relu")(embedded)
        convs[i] = MaxPooling1D(pool_length=pool_length)(convs[i])
        convs[i] = Flatten()(convs[i])

    merged = merge([convs[0], convs[1], convs[2]], mode='concat', concat_axis=1)
    merged = Dropout(0.5)(merged)
    output = Dense(nb_classes, activation='softmax', name='output')(merged)

    model = Model(input, output)
    model.compile(optm, loss={'output': 'categorical_crossentropy'})

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit(X_train, Y_train, nb_epoch=nb_epoches, batch_size=batch_size,
              validation_split=0.1, callbacks=[earlystop])

    probs = earlystop.model.predict(X_test, batch_size=batch_size)
    classes = np_utils.categorical_probas_to_classes(probs)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes,
                                             np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return acc
def eval_dag(dag, filename, dag_id=None):
    dag = normalize_dag(dag)
    # utils.draw_dag(dag)
    # pprint.pprint(dag)
    if filename not in input_cache:
        input_cache[filename] = pd.read_csv('data/' + filename, sep=';')
    data = input_cache[filename]
    feats = data[data.columns[:-1]]
    targets = data[data.columns[-1]]
    le = preprocessing.LabelEncoder()
    ix = targets.index
    targets = pd.Series(le.fit_transform(targets), index=ix)
    errors = []
    start_time = time.time()
    for train_idx, test_idx in cross_validation.StratifiedKFold(targets, n_folds=5):
        train_data = (feats.iloc[train_idx], targets.iloc[train_idx])
        test_data = (feats.iloc[test_idx], targets.iloc[test_idx])
        ms = train_dag(dag, train_data)
        preds = test_dag(dag, ms, test_data)
        acc = mm.quadratic_weighted_kappa(test_data[1], preds)
        if filename == 'ml-prove.csv':
            acc = metrics.accuracy_score(test_data[1], preds)
        errors.append(acc)
    m_errors = float(np.mean(errors))
    s_errors = float(np.std(errors))
    return m_errors, s_errors, time.time() - start_time
def objective(params):
    xgb_params = {
        "objective": "multiclass",
        "booster": "gbtree",
        "metric": "qwk",
        "num_class": 5,
        "max_depth": int(params["max_depth"]),
        "eta": params["eta"],
        "subsample": params["subsample"],
        "colsample_bytree": params["colsample_bytree"],
        "gamma": params["gamma"],
        "min_child_weight": params["min_child_weight"],
        "verbosity": 0,
        "silent": 1,
        "seed": 17,
        "nthread": 30
    }

    print("Params:", xgb_params)

    xgb = BlendedXGBClassifier(xgb_params, early_stopping_rounds=150,
                               eval_size=0.2, eval_split_type="random",
                               verbose_eval=100, nrounds=10000)

    # Train the model
    xgb.fit(X_train, y_train)

    # Make predictions
    predictions_npa = xgb.predict(X_test)

    # Evaluate the model
    qwk = quadratic_weighted_kappa(y_test, predictions_npa)
    print(xgb_params)
    print("QWK = ", qwk)

    return -qwk  # Return negative value as we want to maximize it
def evalerror_ebc_cdf(preds, dtrain, cdf, hard_threshold=False):
    labels = dtrain.get_label()
    ## extended samples within the feature construction part
    if np.min(labels) == -1 and np.max(labels) == 1:
        labels = applyEBCRule(labels)
    ## extended samples within the objective value computation part
    ## see the ebcObj function for detail
    else:
        ## labels are in [0,1,2,3]
        labels += 1
    #print preds.shape
    ## get prediction
    #hard = False
    if hard_threshold:
        preds = applyEBCRule(preds, hard_threshold=hard_threshold)
    else:
        preds = sigmoid(preds)
        preds = applyEBCRule(preds, hard_threshold=hard_threshold)
    preds = getScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for using early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
def quadratic_weighted_kappa_round(estimator, X, actual):
    """This function applies ml_metrics.quadratic_weighted_kappa without
    calling sklearn.metrics.make_scorer

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like
        The data to fit. Can be for example a list, or an array.
    actual : array-like
        the actual label

    Returns
    -------
    float
        ml_metrics.quadratic_weighted_kappa
    """
    predict = estimator.predict(X)
    unique_actual = list(set(actual))
    predict_round = [
        max(min(unique_actual), min(unique_actual, key=lambda x: abs(x - p)))
        for p in predict
    ]
    return quadratic_weighted_kappa(actual, predict_round)
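# A hedged usage sketch: because the function already has the
# (estimator, X, y) signature, it can be passed directly as a scikit-learn
# `scoring` callable (the Ridge model and the X/y data here are placeholders).
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

scores = cross_val_score(Ridge(), X, y, scoring=quadratic_weighted_kappa_round, cv=5)
print(scores.mean())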
def evalerror(preds, dtrain):
    labels = dtrain.get_label() + 1
    preds = softmax(preds)
    pred_labels = np.argmax(preds, axis=1) + 1
    kappa = quadratic_weighted_kappa(labels, pred_labels)
    return 'kappa', kappa
               name='pospool', input='poscnn')
model.add_node(Flatten(), name='posflat', input='pospool')
model.add_node(Dropout(0.5), name='posdropout', input='posflat')

# using three CNNs to predict with L1
model.add_node(Dense(nb_classes, activation='softmax'),
               name='softmax', inputs=['dropout', 'posdropout'],
               merge_mode='concat')
model.add_output(name='output', input='softmax')
model.compile('rmsprop', loss={'output': 'categorical_crossentropy'})
# model.compile('rmsprop', loss={'output': 'mean_squared_error'})

# early stopping
earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
model.fit({'input': X_train, 'posinput': pos_train, 'output': Y_train},
          nb_epoch=nb_epoch, batch_size=batch_size,
          validation_split=0.1, callbacks=[earlystop])

# Graph doesn't have several arg/func existing in Sequential()
# - fit has no show_accuracy
# - no predict_classes
classes = model.predict({'input': X_test, 'posinput': pos_test},
                        batch_size=batch_size)['output'].argmax(axis=1)
acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))
# accuracy only supports classes
print('Test accuracy:', acc)
kappa = metrics.quadratic_weighted_kappa(classes,
                                         np_utils.categorical_probas_to_classes(Y_test))
print('Test Kappa:', kappa)
X = dataset
Y = dataset.loc[:, ['Response']]
X = pd.get_dummies(X, 'Product_Info_2')
X.drop(['Response', 'Medical_History_30', 'Id'], axis=1, inplace=True)

#%%
# split data into train and test sets
seed = 6
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size,
                                                    random_state=seed)
y_test = pd.to_numeric(y_test.Response, errors='coerce')

# fit model on training data
model = XGBRegressor(max_depth=8, silent=True, learning_rate=0.1,
                     min_child_weight=35, subsample=0.6, n_estimators=150,
                     colsample_bytree=0.3, missing=-1)
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
kappa = quadratic_weighted_kappa(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("kappa: %.2f%%" % (kappa * 100.0))

#%%
cm = confusion_matrix(y_test, list(map(int, predictions)))
cm = np.delete(cm, [0, 1], 0)
cm = np.delete(cm, [0, 1], 1)
norm_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.imshow(norm_cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.show()
import ml_metrics as metrics

file = open("predictions.txt")
numarray = []
while 1:
    line = file.readline()
    if not line:
        break
    numarray.append(int(float(line)))

file = open("answers.txt")
answerarray = []
while 1:
    line = file.readline()
    if not line:
        break
    answerarray.append(int(float(line)))

##print len(numarray)
##print len(answerarray)

solutionarray = []
for x in range(0, len(numarray)):
    if numarray[x] == answerarray[x]:
        solutionarray.append(1)
    else:
        solutionarray.append(0)
onecounter = solutionarray.count(1)

print "QWK_Score: " + str(metrics.quadratic_weighted_kappa(answerarray, numarray))
def qwk_score(y_true, y_pred):
    kappa = quadratic_weighted_kappa(y_true, y_pred)
    return kappa
print X2.shape

# Splitting Corpus into train and test again
X1Test = X1[len(y) + 1:]
X1 = X1[:len(y)]
X2Test = X2[len(y) + 1:]
X2 = X2[:len(y)]

# Passing the vectorized matrices to SVD
svd = TruncatedSVD(n_components=800)
svd.fit(X1)
X1 = svd.transform(X1)
#X1Test = svd.transform(X1Test)

svd = TruncatedSVD(n_components=1200)
svd.fit(X2)
X2 = svd.transform(X2)
#X2Test = svd.transform(X2Test)

# Initialize Model Variables
# clf = pipeline.Pipeline([('scl', StandardScaler()), ('svm', SVC(C=10, gamma=0.0002))])

# Horizontally stacking the two matrices
X = hstack((X1, X2, trainFeatures))
#X_test = hstack((X1Test, X2Test, testFeatures))

stemPred = cross_val_predict(clf, X, y, cv=2, n_jobs=-1)
print "Kappa Score for Training Data\nStemming\nScore=%f" % (quadratic_weighted_kappa(y, stemPred))
def eval_wrapper(yhat, y):
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)
    return quadratic_weighted_kappa(yhat, y)
def getscore(y, ypred):
    return ml_metrics.quadratic_weighted_kappa(y, ypred)
best_params = searcher.best_estimator_.get_params()
for param in sorted(current_params.keys()):
    print("\t%s: %r" % (param, best_params[param]))

# save the trained model for later
best.append(searcher.best_estimator_)

# we then validate the ensemble on the set aside features and targets
predictions = []
for i in range(len(best)):
    model = best[i]
    predictions.append(model.predict(features[i][test_mask]))

# just averaging for now, play with this later
predictions = np.sum(predictions, axis=0) / len(predictions)
score = ml_metrics.quadratic_weighted_kappa(targets[test_mask], predictions)
ensemble_scores.append(score)
print("Ensemble score... %0.3f" % score)

print("CV ensemble score... %0.3f" % np.mean(ensemble_scores))

# once the ensemble has been validated, we can fit each model with all the
# training data and make predictions for the real test data, if we want to
# generate a submission csv (requires --submit flag)
if "--submit" in sys.argv:
    # run all models on all data
    predictions = []
    for i in range(len(best)):
        model = best[i]
        print("Thinking...")
def qwk_wrapper(y, y_pred, splits):
    return quadratic_weighted_kappa([digitize(yp, splits) for yp in y_pred], y)
def hyperopt_obj(param, feat_name, trial_counter):
    kappa_cv = []
    cols = list(test.columns)
    cols.remove('Id')
    train_features = cols
    for run in range(1, 2):
        print("run%d" % (run))
        #### all the paths
        # load index
        path = "../../data/info/run%d" % (run)
        train_index = loadCVIndex("../../data/cv/train.run%d.txt" % (run))
        test_index = loadCVIndex("../../data/cv/valid.run%d.txt" % (run))
        X_train = train.iloc[train_index][train_features]
        X_valid = train.iloc[test_index][train_features]
        labels_train = train.iloc[train_index]['Response']
        labels_valid = train.iloc[test_index]['Response']
        # cdf
        cdf_valid_path = "%s/valid.cdf" % path
        ## load cdf
        cdf_valid = np.loadtxt(cdf_valid_path, dtype=float)
        ## make evalerror func
        evalerror_regrank_valid = lambda preds, dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_valid)
        evalerror_softmax_valid = lambda preds, dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_valid)
        evalerror_ebc_valid = lambda preds, dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_valid, ebc_hard_threshold)
        evalerror_cocr_valid = lambda preds, dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_valid)

        ##############
        ## Training ##
        ##############
        ## you can use bagging to stabilize the predictions
        dvalid_base = xgb.DMatrix(X_valid, label=labels_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train)
        watchlist = [(dtrain_base, 'train'), (dvalid_base, 'eval')]
        w = np.loadtxt("../../data/info/All/weight.txt", dtype=float)
        if param["task"] in ["regression", "ranking"]:
            ## regression & pairwise ranking with xgboost
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist)
            pred = bst.predict(dvalid_base)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
        elif param["task"] in ["huber"]:
            ## regression & pairwise ranking with xgboost
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist)
            pred = bst.predict(dvalid_base)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
        elif param["task"] in ["regrank"]:
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                            feval=evalerror_regrank_valid)
            pred = bst.predict(dvalid_base)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
        elif param["task"] in ["softmax"]:
            ## softmax regression with xgboost
            dvalid_base = xgb.DMatrix(X_valid, label=labels_valid - 1, weight=w[test_index])
            dtrain_base = xgb.DMatrix(X_train, label=labels_train - 1, weight=w[train_index])
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                            feval=evalerror_softmax_valid)
            pred = bst.predict(dvalid_base)
            we = np.asarray(range(1, 9))
            pred = np.sum(pred, axis=1)
            pred = pred + 1
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
            y_train_preds = y_train_preds * we[np.newaxis, :]
            y_train_preds = np.sum(y_train_preds, axis=1)
            y_train_preds = y_train_preds + 1
        elif param["task"] in ["ebc"]:
            ## ebc with xgboost
            dvalid_base = xgb.DMatrix(X_valid, label=labels_valid, weight=w[test_index])
            dtrain_base = xgb.DMatrix(X_train, label=labels_train, weight=w[train_index])
            obj = lambda preds, dtrain: ebcObj(preds, dtrain)
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                            obj=obj, feval=evalerror_ebc_valid)
            pred = sigmoid(bst.predict(dvalid_base))
            pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = sigmoid(bst.predict(dtrain_base))
            y_train_preds = applyEBCRule(y_train_preds, hard_threshold=ebc_hard_threshold)
        elif param["task"] in ["cocr"]:
            ## cocr with xgboost
            dvalid_base = xgb.DMatrix(X_valid, label=labels_valid, weight=w[test_index])
            dtrain_base = xgb.DMatrix(X_train, label=labels_train, weight=w[train_index])
            obj = lambda preds, dtrain: cocrObj(preds, dtrain)
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                            obj=obj, feval=evalerror_cocr_valid)
            pred = bst.predict(dvalid_base)
            pred = applyCOCRRule(pred)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
            y_train_preds = applyCOCRRule(y_train_preds)
        elif param['task'] == "reg_skl_rf":
            ## regression with sklearn random forest regressor
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(X_train, labels_train)
            train_sort = pd.DataFrame({'cols': train_features,
                                       'value': list(rf.feature_importances_)}).sort(columns=['value'], ascending=False)
            train_sort.to_csv("sort.csv")
            pred = rf.predict(X_valid)
            y_train_preds = rf.predict(X_train)
        elif param['task'] == "reg_skl_etr":
            ## regression with sklearn extra trees regressor
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(X_train, labels_train)
            pred = etr.predict(X_valid)
            y_train_preds = etr.predict(X_train)
        elif param['task'] == "reg_skl_gbm":
            ## regression with sklearn gradient boosting regressor
            gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'],
                                            max_features=param['max_features'],
                                            learning_rate=param['learning_rate'],
                                            max_depth=param['max_depth'],
                                            subsample=param['subsample'],
                                            random_state=param['random_state'])
            gbm.fit(X_train, labels_train)
            pred = gbm.predict(X_valid)
            y_train_preds = gbm.predict(X_train)
        elif param['task'] == "reg_skl_svr":
            ## regression with sklearn support vector regression
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_valid = scaler.transform(X_valid)
            svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'],
                      degree=param['degree'], kernel=param['kernel'])
            svr.fit(X_train, labels_train)
            pred = svr.predict(X_valid)

        pred_raw = pred
        pred_rank = pred_raw.argsort().argsort()
        score = getScore(pred_rank, cdf=cdf_valid)
        print quadratic_weighted_kappa(score, labels_valid)
        id_test = train.iloc[test_index]['Id']
        output = pd.DataFrame({"Id": id_test, "Response_raw": pred_raw})
        output['Response_rank'] = pred_rank
        output['Response_cdf'] = score
        output['Response'] = labels_valid

        ## weighted averaging over different models
        cutpoints = [2.8, 3.8, 4.5, 4.9, 5.5, 6.2, 6.8]
        res = minimize(minimize_quadratic_weighted_kappa, cutpoints,
                       (y_train_preds, labels_train), method='Nelder-Mead')
        cutpoints = np.sort(res.x)
        for i in range(0, 3):
            res = minimize(minimize_quadratic_weighted_kappa, cutpoints,
                           (y_train_preds, labels_train), method='Nelder-Mead')
            kappa = minimize_quadratic_weighted_kappa(cutpoints, pred, labels_valid)
            cutpoints = np.sort(res.x)
        kappa_cv.append(kappa)
        print "kappa:%f" % (kappa)

    kappa_cv_mean = np.mean(kappa_cv)
    kappa_cv_std = np.std(kappa_cv)
    subm_path = "%s/train.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (
        "../../result/All", feat_name, trial_counter, kappa_cv_mean, kappa_cv_std)
    output.to_csv(subm_path, index=False)
    print(" Mean: %.6f" % kappa_cv_mean)
    print(" Std: %.6f" % kappa_cv_std)

    output_path = "../../result"
    #path = "%s/All" % (feat_folder)
    save_path = "%s/All" % output_path
    subm_path = "%s/Subm" % output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(subm_path):
        os.makedirs(subm_path)

    # cdf
    cdf_test_path = "../../data/info/All/test.cdf"
    # raw prediction path (rank)
    raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    # submission path (relevance as in [1,2,3,4])
    subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (
        subm_path, feat_name, trial_counter, kappa_cv_mean, kappa_cv_std)

    X_train = train[train_features]
    X_valid = test[train_features]
    labels_train = train['Response']
    ## load cdf
    cdf_test = np.loadtxt(cdf_test_path, dtype=float)
    ##
    evalerror_regrank_test = lambda preds, dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_test)
    evalerror_softmax_test = lambda preds, dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_test)
    evalerror_softkappa_test = lambda preds, dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_test)
    evalerror_ebc_test = lambda preds, dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_test, ebc_hard_threshold)
    evalerror_cocr_test = lambda preds, dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_test)

    dvalid_base = xgb.DMatrix(X_valid)
    dtrain_base = xgb.DMatrix(X_train, label=labels_train)
    watchlist = [(dtrain_base, 'train')]
    if param["task"] in ["regression", "ranking"]:
        ## regression & pairwise ranking with xgboost
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train)
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist)
        pred = bst.predict(dvalid_base)
        dtrain_base = xgb.DMatrix(X_train)
        y_train_preds = bst.predict(dtrain_base)
    elif param["task"] in ["regrank"]:
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train)
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                        feval=evalerror_regrank_test)
        pred = bst.predict(dvalid_base)
        dtrain_base = xgb.DMatrix(X_train)
        y_train_preds = bst.predict(dtrain_base)
    elif param["task"] in ["softmax"]:
        ## softmax regression with xgboost
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train - 1, weight=w)
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                        feval=evalerror_softmax_test)
        pred = bst.predict(dvalid_base)
        w = np.asarray(range(1, 9))
        pred = pred * w[np.newaxis, :]
        pred = np.sum(pred, axis=1)
    elif param["task"] in ["softkappa"]:
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train, weight=w)
        ## softkappa with xgboost
        obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                        obj=obj, feval=evalerror_softkappa_test)
        pred = softmax(bst.predict(dvalid_base))
        w = np.asarray(range(1, 9))
        pred = pred * w[np.newaxis, :]
        pred = np.sum(pred, axis=1)
    elif param["task"] in ["ebc"]:
        ## ebc with xgboost
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train, weight=w)
        obj = lambda preds, dtrain: ebcObj(preds, dtrain)
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                        obj=obj, feval=evalerror_ebc_test)
        pred = sigmoid(bst.predict(dvalid_base))
        pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)
    elif param["task"] in ["cocr"]:
        ## cocr with xgboost
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train, weight=w)
        obj = lambda preds, dtrain: cocrObj(preds, dtrain)
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,
                        obj=obj, feval=evalerror_cocr_test)
        pred = bst.predict(dvalid_base)
        pred = applyCOCRRule(pred)
    elif param['task'] == "reg_skl_rf":
        ## regression with sklearn random forest regressor
        rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                   max_features=param['max_features'],
                                   n_jobs=param['n_jobs'],
                                   random_state=param['random_state'])
        rf.fit(X_train, labels_train)
        pred = rf.predict(X_valid)
    elif param['task'] == "reg_skl_etr":
        ## regression with sklearn extra trees regressor
        etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                  max_features=param['max_features'],
                                  n_jobs=param['n_jobs'],
                                  random_state=param['random_state'])
        etr.fit(X_train, labels_train)
        pred = etr.predict(X_valid)
    elif param['task'] == "reg_skl_gbm":
        ## regression with sklearn gradient boosting regressor
        gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'],
                                        max_features=param['max_features'],
                                        learning_rate=param['learning_rate'],
                                        max_depth=param['max_depth'],
                                        subsample=param['subsample'],
                                        random_state=param['random_state'],
                                        max_leaf_nodes=param['max_leaf_nodes'])
        gbm.fit(X_train, labels_train)
        pred = gbm.predict(X_valid)
    elif param['task'] == "reg_skl_svr":
        ## regression with sklearn support vector regression
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)
        svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'],
                  degree=param['degree'], kernel=param['kernel'])
        svr.fit(X_train, labels_train)
        pred = svr.predict(X_valid)

    pred_raw = pred
    pred_rank = pred_raw.argsort().argsort()
    id_test = test['Id']
    output = pd.DataFrame({"Id": id_test, "Response_raw": pred_raw})
    output['Response_rank'] = pred_rank
    pred_score = getScore(pred, cdf_test)
    output['Response_cdf'] = pred_score
    cutpoints = np.concatenate([[-99999999999999999], cutpoints, [999999999999999]])
    print cutpoints
    y_pred = pd.cut(pred, bins=cutpoints, labels=[1, 2, 3, 4, 5, 6, 7, 8])
    output['Response_cut'] = y_pred
    output.to_csv(subm_path, index=False)
    return kappa_cv_mean, kappa_cv_std