Example #1
    def test_quadratic_weighted_kappa(self):
        kappa = metrics.quadratic_weighted_kappa([1,2,3],[1,2,3])
        self.assertAlmostEqual(kappa, 1.0)

        kappa = metrics.quadratic_weighted_kappa([1,2,1],[1,2,2],1,2)
        self.assertAlmostEqual(kappa, 0.4)

        kappa = metrics.quadratic_weighted_kappa([1,2,3,1,2,2,3],[1,2,3,1,2,3,2])
        self.assertAlmostEqual(kappa, 0.75)
Example #2
    def test_quadratic_weighted_kappa(self):
        kappa = metrics.quadratic_weighted_kappa([1, 2, 3], [1, 2, 3])
        self.assertAlmostEqual(kappa, 1.0)

        kappa = metrics.quadratic_weighted_kappa([1, 2, 1], [1, 2, 2], 1, 2)
        self.assertAlmostEqual(kappa, 0.4)

        kappa = metrics.quadratic_weighted_kappa([1, 2, 3, 1, 2, 2, 3],
                                                 [1, 2, 3, 1, 2, 3, 2])
        self.assertAlmostEqual(kappa, 0.75)
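Every example on this page calls quadratic_weighted_kappa from Ben Hamner's ml_metrics package. For reference, a minimal NumPy sketch of the metric itself (Cohen's kappa with quadratic disagreement weights) is shown below; it reproduces the values asserted in the tests above, but it is an illustration, not the ml_metrics source.

import numpy as np

def quadratic_weighted_kappa_sketch(rater_a, rater_b):
    rater_a = np.asarray(rater_a, dtype=int)
    rater_b = np.asarray(rater_b, dtype=int)
    min_rating = min(rater_a.min(), rater_b.min())
    n = max(rater_a.max(), rater_b.max()) - min_rating + 1
    # observed rating co-occurrence matrix
    O = np.zeros((n, n))
    for a, b in zip(rater_a - min_rating, rater_b - min_rating):
        O[a, b] += 1
    # expected matrix from the two marginal histograms
    E = np.outer(O.sum(axis=1), O.sum(axis=0)) / O.sum()
    # quadratic disagreement weights
    i, j = np.indices((n, n))
    W = (i - j) ** 2 / (n - 1) ** 2
    return 1.0 - (W * O).sum() / (W * E).sum()

For instance, quadratic_weighted_kappa_sketch([1, 2, 1], [1, 2, 2]) gives 0.4, matching the second assertion above.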
Example #3
    def predict(self, model, xg_train, xg_test, objective='reg:linear'):
        """
        Parameters
        ----------

        model : xgboost.Booster
            xgboost model ready for making predictions

        xg_train : xgboost.DMatrix
            training data

        xg_test : xgboost.DMatrix
            testing data


        Returns
        -------

        model_prediction : ModelPrediction (named tuple)

        """

        train_score = model.predict(
            xg_train, ntree_limit=model.best_iteration)
        test_score = model.predict(
            xg_test,  ntree_limit=model.best_iteration)

        train_label = np.asarray(xg_train.get_label())
        test_label = np.asarray(xg_test.get_label())

        if objective == 'reg:linear':
            # Cutoffs are optimized here
            best_cuts = optimize_cutoffs(train_score, train_label,
                                         verbose=False)
            train_prediction = classify_with_cutoffs(train_score, best_cuts)
            test_prediction = classify_with_cutoffs(test_score, best_cuts)
        else:
            train_prediction = train_score
            test_prediction = test_score

        train_qwk = quadratic_weighted_kappa(train_label, train_prediction)
        test_qwk = quadratic_weighted_kappa(test_label, test_prediction)

        return ModelPrediction(train_label, test_label,
                               train_score, test_score,
                               train_prediction, test_prediction,
                               train_qwk, test_qwk,
                               precision_score(train_label, train_prediction,
                                               average=None),
                               precision_score(test_label, test_prediction,
                                               average=None)
                               )
Example #4
def linear_reg(selected, sorted_feature, feature_data):
    features = []
    scores = feature_data['score']
    lm = linear_model.LinearRegression()
    kf = KFold(n_splits=10)
    overall = []
    for title in sorted_feature.keys():
        if title in selected:
            features.append(title)
    for i in range(len(scores)):
        z = 0
        X = []
        for t in features:
            X.append(feature_data[t][i])
        X = np.array([list(x) for x in zip(*X)])
        count = 0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = np.array(scores[i])[train_index], np.array(
                scores[i])[test_index]
            model = lm.fit(X_train, y_train)
            pred_score = lm.predict(X_test)
            kappa = metrics.quadratic_weighted_kappa(y_test, pred_score)
            z += 1 / 2 * math.log((1 + kappa) / (1 - kappa))
            count += 1
        weighted_kappa_mean = (math.e**(2 * z / count) -
                               1) / (math.e**(2 * z / count) + 1)
        overall.append(weighted_kappa_mean)
        print(weighted_kappa_mean)
    print(sum(overall) / len(overall))
    return
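The z accumulation above is Fisher's z-transformation of each fold's kappa (0.5 * log((1 + k) / (1 - k)) is arctanh(k)), and the closing expression is its inverse, tanh, applied to the mean z. A minimal standalone version of that averaging, under the assumption that no fold kappa is exactly +/-1:

import numpy as np

def mean_kappa_fisher_z(kappas):
    # z-transform each kappa, average, then back-transform the mean
    kappas = np.clip(np.asarray(kappas, dtype=float), -0.999, 0.999)
    return np.tanh(np.arctanh(kappas).mean())

ml_metrics exposes the same averaging as mean_quadratic_weighted_kappa, which calc_mqwp further down uses directly.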
Example #5
def feature_selection(sorted_feature, feature_data):
    features = []
    scores = feature_data['score']
    lm = linear_model.LinearRegression()
    prev = 0
    for title in sorted_feature.keys():
        features.append(title)
        kappa = 0
        z = 0
        for i in range(len(scores)):
            X = []
            for t in features:
                X.append(feature_data[t][i])
            if len(X) == 1:
                X = np.array(X[0]).reshape(-1, 1)
            else:
                X = [list(x) for x in zip(*X)]
            model = lm.fit(X, scores[i])
            pred_score = lm.predict(X)
            kappa = metrics.quadratic_weighted_kappa(scores[i], pred_score)
            z += 1 / 2 * math.log((1 + kappa) / (1 - kappa))
        weighted_kappa_mean = (math.e**(2 * z / (len(scores))) -
                               1) / (math.e**(2 * z / (len(scores))) + 1)
        if weighted_kappa_mean < prev:
            features = features[:-1]
        else:
            prev = weighted_kappa_mean
        print(features)
        print(weighted_kappa_mean)
    pickle.dump(features, open('selected_features.txt', 'wb'))
    return features
Example #6
def predict_score():
	file = open("model/predictions.txt")
	numarray = []
	while 1:
		line = file.readline()
		if not line:
			break
		numarray.append(int(float(line)))
	file = open("model/answers.txt")
	answerarray = []
	while 1:
		line = file.readline()
		if not line:
			break
		answerarray.append(int(float(line)))
	##print len(numarray)
	##print len(answerarray)
	solutionarray = []
	for x in range(0, len(numarray)):
		if numarray[x] == answerarray[x]:
			solutionarray.append(1)
		else:
			solutionarray.append(0)
	onecounter = solutionarray.count(1)
	print "QWK_Score: " + str(metrics.quadratic_weighted_kappa(answerarray,numarray))
Example #7
def predict_score():
    file = open("model/predictions.txt")
    numarray = []
    while 1:
        line = file.readline()
        if not line:
            break
        numarray.append(int(float(line)))
    file = open("model/answers.txt")
    answerarray = []
    while 1:
        line = file.readline()
        if not line:
            break
        answerarray.append(int(float(line)))
    ##print len(numarray)
    ##print len(answerarray)
    solutionarray = []
    for x in range(0, len(numarray)):
        if numarray[x] == answerarray[x]:
            solutionarray.append(1)
        else:
            solutionarray.append(0)
    onecounter = solutionarray.count(1)
    print "QWK_Score: " + str(
        metrics.quadratic_weighted_kappa(answerarray, numarray))
Example #8
def minimize_quadratic_weighted_kappa(cutpoints,y_pred=None,y=None):
    cutpoints = np.sort(cutpoints)
    cutpoints = np.concatenate([[-99999999999999999],cutpoints,[999999999999999]])
    y_pred = pd.cut(y_pred,bins=cutpoints,labels=[1,2,3,4,5,6,7,8])
    score = quadratic_weighted_kappa(y,y_pred)
    print(score)
    return -score
Example #9
def eval_dag(dag, filename, dag_id=None):

    dag = normalize_dag(dag)

    if filename not in input_cache:
        input_cache[filename] = pd.read_csv('data/'+filename, sep=';')

    data = input_cache[filename]

    feats = data[data.columns[:-1]]
    targets = data[data.columns[-1]]

    le = preprocessing.LabelEncoder()

    ix = targets.index
    targets = pd.Series(le.fit_transform(targets), index=ix)

    errors = []

    start_time = time.time()

    for train_idx, test_idx in cross_validation.StratifiedKFold(targets, n_folds=5):
        train_data = (feats.iloc[train_idx], targets.iloc[train_idx])
        test_data = (feats.iloc[test_idx], targets.iloc[test_idx])

        ms = train_dag(dag, train_data)
        preds = test_dag(dag, ms, test_data)

        acc = mm.quadratic_weighted_kappa(test_data[1], preds)
        errors.append(acc)

    m_errors = float(np.mean(errors))
    s_errors = float(np.std(errors))

    return m_errors, s_errors, time.time() - start_time
Example #10
    def on_epoch_end(self, epoch, logs={}):

        self.counter += 1
        p = self.model.predict(self.X_val,
                               verbose=0)  #score the validation data

        #current kappa
        current = ml_metrics.quadratic_weighted_kappa(
            self.y_val.values.ravel(),
            np.clip(np.round(p.ravel()).astype(int), 1, 8))

        print('Epoch %d Kappa: %f | Best Kappa: %f \n' %
              (epoch, current, self.best))

        #if improvement over best....
        if current > self.best:
            self.best = current
            self.best_rounds = self.counter
            self.wait = 0
            self.model.save_weights(self.filepath, overwrite=True)
            print("model save weights")
        else:
            if self.wait >= self.patience:  #no more patience, retrieve best model
                self.model.stop_training = True
                print('Best number of rounds: %d \nKappa: %f \n' %
                      (self.best_rounds, self.best))

                self.model.load_weights(self.filepath)

            self.wait += 1  # increment the number of times without improvement
Example #11
    def predict(self, model, xg_train, xg_test, objective='reg:linear'):
        """
        Parameters
        ----------

        model : xgboost.Booster
            xgboost model ready for making predictions

        xg_train : xgboost.DMatrix
            training data

        xg_test : xgboost.DMatrix
            testing data


        Returns
        -------

        model_prediction : ModelPrediction (named tuple)

        """

        train_score = model.predict(xg_train, ntree_limit=model.best_iteration)
        test_score = model.predict(xg_test, ntree_limit=model.best_iteration)

        train_label = np.asarray(xg_train.get_label())
        test_label = np.asarray(xg_test.get_label())

        if objective == 'reg:linear':
            # Cutoffs are optimized here
            best_cuts = optimize_cutoffs(train_score,
                                         train_label,
                                         verbose=False)
            train_prediction = classify_with_cutoffs(train_score, best_cuts)
            test_prediction = classify_with_cutoffs(test_score, best_cuts)
        else:
            train_prediction = train_score
            test_prediction = test_score

        train_qwk = quadratic_weighted_kappa(train_label, train_prediction)
        test_qwk = quadratic_weighted_kappa(test_label, test_prediction)

        return ModelPrediction(
            train_label, test_label, train_score, test_score, train_prediction,
            test_prediction, train_qwk, test_qwk,
            precision_score(train_label, train_prediction, average=None),
            precision_score(test_label, test_prediction, average=None))
Example #12
def evalerror_softmax_cdf(preds, dtrain, cdf):
    ## labels are in [0,1,2,3]
    labels = dtrain.get_label() + 1
    preds = getClfScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for use with early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
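evalerror_softmax_cdf returns -kappa because xgboost's early stopping minimizes the evaluation metric, so negating kappa makes early stopping maximize it. Below is a self-contained usage sketch on synthetic data, assuming the classic xgb.train API that accepts feval (all names and parameter values are illustrative):

import numpy as np
import xgboost as xgb
from ml_metrics import quadratic_weighted_kappa

def eval_neg_kappa(preds, dtrain):
    # round regression outputs into the label range, then score
    labels = dtrain.get_label().astype(int)
    rounded = np.clip(np.round(preds), labels.min(), labels.max()).astype(int)
    return 'kappa', -quadratic_weighted_kappa(labels, rounded)

X = np.random.rand(200, 5)
y = np.random.randint(1, 9, size=200)
dtrain = xgb.DMatrix(X[:150], label=y[:150])
dvalid = xgb.DMatrix(X[150:], label=y[150:])
bst = xgb.train({'objective': 'reg:linear'}, dtrain, num_boost_round=200,
                evals=[(dvalid, 'eval')], feval=eval_neg_kappa,
                early_stopping_rounds=20)  # stops when -kappa stops decreasing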
Example #13
    def _score_offset(self, bin_offset, sv):
        flg = self._data[:, 0].astype(int) == sv
        self._data[flg, 1] = self._data[flg, 0] + bin_offset
        offset_pred = np.clip(np.round(self._data[:, 1]), 1, 8)\
            .astype(int)
        kappa = quadratic_weighted_kappa(self._data[:, 2], offset_pred)

        return -kappa
Example #14
def evalerror_softmax_cdf(preds, dtrain, cdf):
    ## labels are in [0,1,2,3]
    labels = dtrain.get_label() + 1
    preds = getClfScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for use with early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
Example #15
def minimize_quadratic_weighted_kappa(cutpoints, y_pred=None, y=None):
    cutpoints = np.sort(cutpoints)
    cutpoints = np.concatenate([[-99999999999999999], cutpoints,
                                [999999999999999]])
    y_pred = pd.cut(y_pred, bins=cutpoints, labels=[1, 2, 3, 4, 5, 6, 7, 8])
    score = quadratic_weighted_kappa(y, y_pred)
    print(score)
    return -score
Example #16
def keras_model():

    import pandas as pd
    import numpy as np

    from keras.preprocessing import sequence
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation, Flatten
    from keras.layers.convolutional import Convolution1D, MaxPooling1D
    from keras.callbacks import EarlyStopping
    from keras.utils import np_utils

    from data_util import load_csvs, load_other
    import ml_metrics as metrics

    nb_words = 6500
    maxlen = 175
    filter_length = 10
    other_col_dim = 4

    X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/tpov4/train_1.csv',
                                                             'data/tpov4/test_1.csv',
                                                              nb_words, maxlen, 'self', w2v=None)

    # read _other.csv
    other_train = load_other('data/tpov4/train_1_other.csv', maxlen, other_col_dim)
    other_test = load_other('data/tpov4/test_1_other.csv', maxlen, other_col_dim)

    print('other tensor:', other_train.shape)

    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Convolution1D(nb_filter=50,
                            filter_length=filter_length,
                            border_mode="valid", activation="relu",
                            input_shape=(maxlen, other_col_dim)))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.05))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer={{choice(['rmsprop', 'adam', 'adadelta', 'adagrad'])}})

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

    model.fit(other_train, Y_train, batch_size=32, nb_epoch=25,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(other_test, batch_size=32)
    org_classes = np_utils.categorical_probas_to_classes(Y_test)

    acc = np_utils.accuracy(classes, org_classes)  # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, org_classes)
    print('Test Kappa:', kappa)
    return {'loss': -acc, 'status': STATUS_OK}
Example #17
def eval_wrapper(yhat, y):
    """
    Evaluation metric for the competition : quad weighted kappa
    """  
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)
Example #18
def eval_wrapper(yhat, y):
    """
    Evaluation metric for the competition : quad weighted kappa
    """
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)
    return quadratic_weighted_kappa(yhat, y)
Example #19
    def on_epoch_end(self, epoch, logs={}):
        if epoch % 5 == 0 or epoch == (epochs2 - 1):
            self.predhis.append(
                model2.predict([np.array(X_train[x]) for x in variables2]))
            self.predhisval.append(
                model2.predict([np.array(X_validation[x])
                                for x in variables2]))
            self.scores_train.append(
                metrics.quadratic_weighted_kappa(
                    get_output(predictions2.predhis[-1]),
                    np.array(X_train['revenue_class'])))
            self.scores_validation.append(
                metrics.quadratic_weighted_kappa(
                    np.digitize(predictions2.predhisval[-1][:, 0],
                                get_offset(predictions2.predhis[-1])) + 1,
                    np.array(X_validation['revenue_class'])))
            print('training : ' + str(self.scores_train[-1]))
            print('validation : ' + str(self.scores_validation[-1]))
Example #20
def evalerror(preds, dtrain):
    ## labels are in [0,1,2,3] as required by XGBoost for multi-class classification
    labels = dtrain.get_label() + 1
    ## class probability
    preds = softmax(preds)
    ## decoding (naive argmax decoding)
    pred_labels = np.argmax(preds, axis=1) + 1
    ## compute quadratic weighted kappa (using implementation from @Ben Hamner
    ## https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/quadratic_weighted_kappa.py
    kappa = quadratic_weighted_kappa(labels, pred_labels)
    return 'kappa', kappa
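The softmax helper these xgboost eval functions call is not shown anywhere on this page; a numerically stable row-wise version would look like this (an assumed reconstruction, not the original author's code):

import numpy as np

def softmax(scores):
    scores = np.asarray(scores, dtype=float)
    scores -= scores.max(axis=1, keepdims=True)  # shift rows for stability
    exp_scores = np.exp(scores)
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)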
Example #21
def evalerror(preds, dtrain):
    ## labels are in [0,1,2,3] as required by XGBoost for multi-class classification
    labels = dtrain.get_label() + 1
    ## class probability
    preds = softmax(preds)
    ## decoding (naive argmax decoding)
    pred_labels = np.argmax(preds, axis=1) + 1
    ## compute quadratic weighted kappa (using implementation from @Ben Hamner
    ## https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/quadratic_weighted_kappa.py
    kappa = quadratic_weighted_kappa(labels, pred_labels)
    return 'kappa', kappa
Example #22
def evalerror_cocr_cdf(preds, dtrain, cdf):
    labels = dtrain.get_label() + 1
    #print preds.shape
    ## get prediction
    #preds = sigmoid(preds)
    preds = applyCOCRRule(preds)
    preds = getScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for use with early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
Example #23
def xgb_regression_quadratic_weighted_kappa(preds,dtrain):
    labels = dtrain.get_label()
    cutpoints = [1.886638,3.303624,4.152756,4.825063,5.653934,6.236325,6.765184]  
    res = minimize(minimize_quadratic_weighted_kappa,cutpoints,(preds,labels),method='BFGS')
    cutpoints = np.sort(res.x)
    cutpoints = np.concatenate([[-99999999999999999],cutpoints,[999999999999999]])
    y_pred = pd.cut(preds,bins=cutpoints,labels=[1,2,3,4,5,6,7,8])
    kappa = quadratic_weighted_kappa(labels,y_pred)
    ## we return -kappa for use with early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
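The same cutpoint search can be run standalone; below is a self-contained round-trip on synthetic scores, assuming minimize_quadratic_weighted_kappa as defined in Example #15 (initial cutpoints and noise level are arbitrary):

import numpy as np
from scipy.optimize import minimize

preds = np.random.uniform(1, 8, size=500)
labels = np.clip(np.round(preds + np.random.normal(0, 0.7, size=500)),
                 1, 8).astype(int)
res = minimize(minimize_quadratic_weighted_kappa,
               [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5],
               (preds, labels), method='Nelder-Mead')
# negate to recover kappa at the optimized cutpoints
print(-minimize_quadratic_weighted_kappa(np.sort(res.x), preds, labels))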
Example #24
def evalerror_cocr_cdf(preds, dtrain, cdf):
    labels = dtrain.get_label() + 1
    #print preds.shape
    ## get prediction
    #preds = sigmoid(preds)
    preds = applyCOCRRule(preds)
    preds = getScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for use with early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
Example #25
    def _offset_qwk_score(self, offset):
        """

        :param numpy.array offset:
        :param numpy.array y_true:
        :param numpy.array y_pred:
        :rtype: float
        """
        offset_pred = self._apply_offset(self._data, offset)
        kappa = quadratic_weighted_kappa(self._data[:, 2], offset_pred)

        return -kappa
Example #26
    def evaluate_prediction(self,
                            key_word=r'',
                            decimal_places=4,
                            csv_dump=False,
                            df_ac_predict_target=pd.DataFrame(),
                            predict_res=np.array([])):
        if len(df_ac_predict_target) == 0:
            df_ac_predict_target = self.df_ac_predict_target
        if len(predict_res) == 0:
            predict_res = self.predict_res

        recall = round(
            mtrx.recall_score(df_ac_predict_target.transpose().values[0],
                              predict_res,
                              average='weighted'), decimal_places)
        precision = round(
            mtrx.precision_score(df_ac_predict_target.transpose().values[0],
                                 predict_res,
                                 average='weighted'), decimal_places)
        f1 = round(
            mtrx.f1_score(df_ac_predict_target.transpose().values[0],
                          predict_res,
                          average='weighted'), decimal_places)
        kappa = round(
            metrics.kappa(predict_res,
                          df_ac_predict_target.transpose().values[0]),
            decimal_places)
        qwk = round(
            metrics.quadratic_weighted_kappa(
                predict_res,
                df_ac_predict_target.transpose().values[0]), decimal_places)

        self.se_indices = pd.Series([recall, precision, f1, kappa, qwk],
                                    index=[
                                        'Recall', 'Precision', 'F1', 'Kappa',
                                        'Quadratic Weighted Kappa'
                                    ])
        print(self.se_indices)
        self.conf_mtx = pd.DataFrame(
            mtrx.confusion_matrix(df_ac_predict_target.transpose().values[0],
                                  predict_res))
        print('Confusion Matrix:')
        print(self.conf_mtx)

        if csv_dump == True:
            self.se_indices.to_csv(
                self.data_dir + r'Classified-Prediction-Indices-' + key_word +
                r'.csv',
                encoding='latin1')
            self.conf_mtx.to_csv(self.data_dir +
                                 r'Classified-Prediction-Confusion-Matrix-' +
                                 key_word + r'.csv',
                                 encoding='latin1')
Example #27
def cnn1d_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                   maxlen, vocab_size, embd_dim,
                   nb_filter, filter_length, batch_size, nb_epoch, optm):
    """
    - CNN-1d on text input (represented in int)
    - MOT
    - dropout + L2 softmax

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen max of n char in a sentence
    :param vocab_size
    :param embd_dim
    :param nb_filter
    :param filter_length
    :param batch_size
    :param nb_epoch
    :param optm optimizer options, e.g., adam, rmsprop, etc.
    :return:
    """
    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Embedding(vocab_size, embd_dim, input_length=maxlen))
    model.add(Dropout(0.25))

    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode="valid",
                            activation="relu"))
    model.add(MaxPooling1D(pool_length=pool_length))

    model.add(Flatten())
    model.add(Dropout(0.5))

    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test accuracy:', acc)
    # return(acc)
    kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
Example #28
    def default_errorfun(p, ysc, ytr):
        """
        Parameters
        ----------

        p : array of 8 cutoff values

        ysc : array of scores [array(double)]

        ytr : array of true labels [array(int)]
        """
        errors = quadratic_weighted_kappa(
            classify_with_cutoffs(ysc, p).astype(np.int64), ytr)
        return 1 - errors
Example #29
    def default_errorfun(p, ysc, ytr):
        """
        Parameters
        ----------

        p : array of 8 cutoff values

        ysc : array of scores [array(double)]

        ytr : array of true labels [array(int)]
        """
        errors = quadratic_weighted_kappa(
            classify_with_cutoffs(ysc, p).astype(np.int64), ytr)
        return 1 - errors
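classify_with_cutoffs itself is not included in these snippets; a plausible reconstruction, consistent with how the cutoff examples here use it, buckets continuous scores with np.digitize (hypothetical, with labels running from 1 to len(cutoffs) + 1):

import numpy as np

def classify_with_cutoffs(scores, cutoffs):
    # scores below the first cutoff map to 1, above the last to len(cutoffs) + 1
    return np.digitize(scores, np.sort(np.asarray(cutoffs))) + 1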
Example #30
    def on_epoch_end(self, epoch, logs={}):
        p = self.model.predict(self.X_val.values, verbose=0)
        current = ml_metrics.quadratic_weighted_kappa(
            self.y_val.values.ravel(),
            np.clip(np.round(p.ravel()).astype(int), 1, 8))

        if current > self.best:
            self.best = current
            self.wait = 0
        else:
            if self.wait >= self.patience:
                self.model.stop_training = True
                print('Epoch %05d: early stopping' % (epoch))

            self.wait += 1  # increment the number of times without improvement
        print('Epoch %d Kappa: %f | Best Kappa: %f \n' %
              (epoch, current, self.best))
Example #31
def calc_mqwp(output):
    """
    Calculate the mean quadratic_weighted_kappa across all the question sets
    :param outputs: dataframe containing target, output, question set
    :return: mean quadratic weighted kappa
    """
    groups = output.groupby('set')

    kappas = [
        quadratic_weighted_kappa(group[1]["output"], group[1]["target"])
        for group in groups
    ]
    print('Kappa of each set: ', kappas)
    mean = mean_quadratic_weighted_kappa(kappas)
    return mean
Example #32
def ensembleSelectionObj(param, p1_list, weight1, p2_list, true_label_list, cdf_list, numValidMatrix):

    weight2 = param['weight2']
    kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    for run in range(config.n_runs):
        for fold in range(config.n_folds):
            numValid = numValidMatrix[run][fold]
            p1 = p1_list[run,fold,:numValid]
            p2 = p2_list[run,fold,:numValid]
            true_label = true_label_list[run,fold,:numValid]
            cdf = cdf_list[run,fold,:]
            p_ens = (weight1 * p1 + weight2 * p2) / (weight1 + weight2)
            p_ens_score = getScore(p_ens, cdf)
            kappa_cv[run][fold] = quadratic_weighted_kappa(p_ens_score, true_label)
    kappa_cv_mean = np.mean(kappa_cv)
    return {'loss': -kappa_cv_mean, 'status': STATUS_OK}
Example #33
    def _cut_qwk_score(cut_points, y_pred, y_true):
        """

        :param list cut_points:
        :param numpy.array y_true:
        :param numpy.array y_pred:
        :rtype: float
        """
        try:
            d_pred = np.digitize(y_pred[:, 0], cut_points) + 1
        except ValueError:

            return 1
        kappa = quadratic_weighted_kappa(y_true, d_pred)

        return -kappa
Example #34
def ensembleSelectionObj(param, p1_list, weight1, p2_list, true_label_list, cdf_list, numValidMatrix):

    weight2 = param['weight2']
    kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    for run in range(config.n_runs):
        for fold in range(config.n_folds):
            numValid = numValidMatrix[run][fold]
            p1 = p1_list[run,fold,:numValid]
            p2 = p2_list[run,fold,:numValid]
            true_label = true_label_list[run,fold,:numValid]
            cdf = cdf_list[run,fold,:]
            p_ens = (weight1 * p1 + weight2 * p2) / (weight1 + weight2)
            p_ens_score = getScore(p_ens, cdf)
            kappa_cv[run][fold] = quadratic_weighted_kappa(p_ens_score, true_label)
    kappa_cv_mean = np.mean(kappa_cv)
    return {'loss': -kappa_cv_mean, 'status': STATUS_OK}
Example #35
def xgb_regression_quadratic_weighted_kappa(preds, dtrain):
    labels = dtrain.get_label()
    cutpoints = [
        1.886638, 3.303624, 4.152756, 4.825063, 5.653934, 6.236325, 6.765184
    ]
    res = minimize(minimize_quadratic_weighted_kappa,
                   cutpoints, (preds, labels),
                   method='BFGS')
    cutpoints = np.sort(res.x)
    cutpoints = np.concatenate([[-99999999999999999], cutpoints,
                                [999999999999999]])
    y_pred = pd.cut(preds, bins=cutpoints, labels=[1, 2, 3, 4, 5, 6, 7, 8])
    kappa = quadratic_weighted_kappa(labels, y_pred)
    ## we return -kappa for use with early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
Example #36
def test_classifier(parameters, clsClass, feats, targets, filename):
    errors = []
    for train_idx, test_idx in cross_validation.StratifiedKFold(targets,
                                                                n_folds=5):
        cls = clsClass(**parameters)
        train_data = (feats.iloc[train_idx], targets.iloc[train_idx])
        test_data = (feats.iloc[test_idx], targets.iloc[test_idx])

        cls.fit(train_data[0], train_data[1])
        preds = cls.predict(test_data[0])

        acc = mm.quadratic_weighted_kappa(test_data[1], preds)
        if filename == 'ml-prove.csv':
            acc = metrics.accuracy_score(test_data[1], preds)
        errors.append(acc)

    return errors, parameters
Example #37
def cnn1d_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                  maxlen,
                  nb_filter, filter_length, batch_size, nb_epoch, optm):
    """
    - CNN-1d on 3d sensor which uses word2vec embedding
    - MOT

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen max of n char in a sentence
    :param nb_filter
    :param filter_length
    :param batch_size
    :param nb_epoch
    :param optm
    :return:
    """
    pool_length = maxlen - filter_length + 1

    model = Sequential()

    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode="valid",
                            activation="relu", input_shape=(maxlen, 300)))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
    print('Test accuracy:', acc)
    # return(acc)
    kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
Example #38
def lstm_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                  maxlen, vocab_size, embd_dim,
                  batch_size, nb_epoch, optm):
    """
    - LSTM  on text input (represented in int)
    - fully-connected model

    :param <X, Y> train and test sets
    :param nb_classes # of classes
    :param maxlen max of n char in a sentence
    :param vocab_size
    :param embd_dim
    :param batch_size
    :param nb_epoch
    :param optm optimizer options, e.g., adam, rmsprop, etc.
    :return:
    """

    model = Sequential()
    model.add(Embedding(vocab_size, embd_dim, input_length=maxlen))
    model.add(Dropout(0.25))

    # model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(50))

    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optm)

    earlystop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
    acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return (kappa)
Example #39
        def objective(params):
            mlp_params = {
                "layers_architecture":
                [int(l) for l in params["layers"]["n_units_layer"]],
                "enable_batch_normalization":
                [True] * params["layers"]["n_layers"],
                "activations":
                [params["activation"]] * params["layers"]["n_layers"],
                "dropout_probas":
                [params["dropout"]] * params["layers"]["n_layers"],
                "optimizer":
                "adam",
                "learning_rate":
                params["learning_rate"],
                "l2_regularization":
                params["l2_regularization"],
                "metric":
                "rmse",
                "epochs":
                500,
                "batch_size":
                2048
            }

            mlp = MLPClassifier(
                mlp_params,
                weights_saving_directory_path_str=
                "D:/Projets_Data_Science/Competitions/Kaggle/PetFinder.my_Adoption_Prediction/data/weights/"
            )

            # Train the model
            mlp.fit(X_train, y_train)

            # Make predictions
            predictions_npa = mlp.predict(X_test)

            # Evaluate the model
            qwk = quadratic_weighted_kappa(y_test, predictions_npa)
            print(mlp_params)
            print("QWK = ", qwk)

            return -qwk  # Return negative value as we want to maximize it
Example #40
def feature_testing(feature_data):
    scores = feature_data['score']
    lm = linear_model.LinearRegression()
    kappa = {}
    #for each feature(j) in each essayset(i)
    for i in range(len(scores)):
        for key, value in feature_data.items():
            if key != 'score':
                model = lm.fit(np.array(value[i]).reshape(-1, 1), scores[i])
                pred_score = lm.predict(np.array(value[i]).reshape(-1, 1))
                if key not in kappa:
                    kappa[key] = []
                kappa[key].append(
                    metrics.quadratic_weighted_kappa(scores[i], pred_score))
    for key, value in kappa.items():
        kappa[key] = sum(value) / len(value)
    sorted_kappa = dict(
        sorted(kappa.items(), key=operator.itemgetter(1), reverse=True))
    print(sorted_kappa)
    return sorted_kappa
Example #41
        def objective(params):
            lgb_params = {
                "application": "multiclass",
                "boosting": "gbdt",
                "metric": "qwk",
                "num_class": 5,
                "num_leaves": int(params["num_leaves"]),
                "max_depth": -1,
                "learning_rate": "{:.4f}".format(params["learning_rate"]),
                "bagging_fraction":
                "{:.4f}".format(params["bagging_fraction"]),
                "feature_fraction":
                "{:.4f}".format(params["feature_fraction"]),
                "min_split_gain": "{:.4f}".format(params["min_split_gain"]),
                "min_child_samples": int(params["min_child_samples"]),
                "min_child_weight":
                "{:.4f}".format(params["min_child_weight"]),
                "verbosity": -1,
                "seed": 17,
                "nthread": 16,
                "device": "cpu"
            }

            lgbm = BlendedLGBMClassifier(lgb_params,
                                         early_stopping_rounds=150,
                                         eval_size=0.2,
                                         eval_split_type="random",
                                         verbose_eval=100,
                                         nrounds=10000)

            # Train the model
            lgbm.fit(X_train, y_train)

            # Make predictions
            predictions_npa = lgbm.predict(X_test)

            # Evaluate the model
            qwk = quadratic_weighted_kappa(y_test, predictions_npa)
            print("QWK = ", qwk)

            return -qwk  # Return negative value as we want to maximize it
Example #42
def cnn_var_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                     maxlen, vocab_size, embd_size,
                     nb_filter, batch_size, nb_epoches, optm):
    ngram_filters = [2, 5, 8]

    input = Input(shape=(maxlen,), name='input', dtype='int32')
    embedded = Embedding(input_dim=vocab_size, output_dim=embd_size, input_length=maxlen)(input)

    convs = [None, None, None]
    # three CNNs
    for i, n_gram in enumerate(ngram_filters):
        pool_length = maxlen - n_gram + 1
        convs[i] = Convolution1D(nb_filter=nb_filter,
                                 filter_length=n_gram,
                                 border_mode="valid",
                                 activation="relu")(embedded)
        convs[i] = MaxPooling1D(pool_length=pool_length)(convs[i])
        convs[i] = Flatten()(convs[i])

    merged = merge([convs[0], convs[1], convs[2]], mode='concat', concat_axis=1)
    merged = Dropout(0.5)(merged)
    output = Dense(nb_classes, activation='softmax', name='output')(merged)

    model = Model(input, output)
    model.compile(optm, loss={'output': 'categorical_crossentropy'})
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit(X_train, Y_train,
              nb_epoch=nb_epoches, batch_size=batch_size,
              validation_split=0.1, callbacks=[earlystop])

    probs = earlystop.model.predict(X_test, batch_size=batch_size)
    classes = np_utils.categorical_probas_to_classes(probs)

    acc = np_utils.accuracy(classes,
                            np_utils.categorical_probas_to_classes(Y_test))
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes,
                                             np_utils.categorical_probas_to_classes(Y_test))
    print('Test Kappa:', kappa)
    return acc
Example #43
def eval_dag(dag, filename, dag_id=None):

    dag = normalize_dag(dag)
    # utils.draw_dag(dag)
    # pprint.pprint(dag)

    if filename not in input_cache:
        input_cache[filename] = pd.read_csv('data/' + filename, sep=';')

    data = input_cache[filename]

    feats = data[data.columns[:-1]]
    targets = data[data.columns[-1]]

    le = preprocessing.LabelEncoder()

    ix = targets.index
    targets = pd.Series(le.fit_transform(targets), index=ix)

    errors = []

    start_time = time.time()

    for train_idx, test_idx in cross_validation.StratifiedKFold(targets,
                                                                n_folds=5):
        train_data = (feats.iloc[train_idx], targets.iloc[train_idx])
        test_data = (feats.iloc[test_idx], targets.iloc[test_idx])

        ms = train_dag(dag, train_data)
        preds = test_dag(dag, ms, test_data)

        acc = mm.quadratic_weighted_kappa(test_data[1], preds)
        if filename == 'ml-prove.csv':
            acc = metrics.accuracy_score(test_data[1], preds)
        errors.append(acc)

    m_errors = float(np.mean(errors))
    s_errors = float(np.std(errors))

    return m_errors, s_errors, time.time() - start_time
Example #44
        def objective(params):
            xgb_params = {
                "objective": "multiclass",
                "booster": "gbtree",
                "metric": "qwk",
                "num_class": 5,
                "max_depth": int(params["max_depth"]),
                "eta": params["eta"],
                "subsample": params["subsample"],
                "colsample_bytree": params["colsample_bytree"],
                "gamma": params["gamma"],
                "min_child_weight": params["min_child_weight"],
                "verbosity": 0,
                "silent": 1,
                "seed": 17,
                "nthread": 30
            }

            print("Params:", xgb_params)

            xgb = BlendedXGBClassifier(xgb_params,
                                       early_stopping_rounds=150,
                                       eval_size=0.2,
                                       eval_split_type="random",
                                       verbose_eval=100,
                                       nrounds=10000)

            # Train the model
            xgb.fit(X_train, y_train)

            # Make predictions
            predictions_npa = xgb.predict(X_test)

            # Evaluate the model
            qwk = quadratic_weighted_kappa(y_test, predictions_npa)
            print(xgb_params)
            print("QWK = ", qwk)

            return -qwk  # Return negative value as we want to maximize it
Example #45
def evalerror_ebc_cdf(preds, dtrain, cdf, hard_threshold=False):
    labels = dtrain.get_label()
    ## extended samples within the feature construction part
    if np.min(labels) == -1 and np.max(labels) == 1:
        labels = applyEBCRule(labels)
    ## extended samples within the objective value computation part
    ## See ebcobj function for detail
    else:
        ## labels are in [0,1,2,3]
        labels += 1
    #print preds.shape
    ## get prediction
    #hard = False
    if hard_threshold:
        preds = applyEBCRule(preds, hard_threshold=hard_threshold)
    else:
        preds = sigmoid(preds)
        preds = applyEBCRule(preds, hard_threshold=hard_threshold)
        preds = getScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for use with early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
Example #46
def evalerror_ebc_cdf(preds, dtrain, cdf, hard_threshold=False):
    labels = dtrain.get_label()
    ## extended samples within the feature construction part
    if np.min(labels) == -1 and np.max(labels) == 1:
        labels = applyEBCRule(labels)
    ## extended samples within the objective value computation part
    ## See ebcobj function for detail
    else:
        ## labels are in [0,1,2,3]
        labels += 1
    #print preds.shape
    ## get prediction
    #hard = False
    if hard_threshold:
        preds = applyEBCRule(preds, hard_threshold=hard_threshold)
    else:
        preds = sigmoid(preds)
        preds = applyEBCRule(preds, hard_threshold=hard_threshold)
        preds = getScore(preds, cdf)
    kappa = quadratic_weighted_kappa(labels, preds)
    ## we return -kappa for use with early stopping
    kappa *= -1.
    return 'kappa', float(kappa)
Example #47
def quadratic_weighted_kappa_round(estimator, X, actual):
    """This function applies the ml_metrics.quadratic_weighted_kappa without calling sklearn.metrics.make_scorer
    
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like
        The data to fit. Can be for example a list, or an array.
    actual : array-like
        the actual label
    
    Returns
    -------
    float
        ml_metrics.quadratic_weighted_kappa
    """
    predict = estimator.predict(X)
    unique_actual = list(set(actual))
    predict_round = [
        max(min(unique_actual), min(unique_actual, key=lambda x: abs(x - p)))
        for p in predict
    ]
    return quadratic_weighted_kappa(actual, predict_round)
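Because quadratic_weighted_kappa_round already has the (estimator, X, y) signature scikit-learn expects of a scoring callable, it can be passed to cross-validation helpers directly; an illustrative run on synthetic data:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

X = np.random.rand(100, 3)
y = np.random.randint(1, 9, size=100)
scores = cross_val_score(LinearRegression(), X, y,
                         scoring=quadratic_weighted_kappa_round, cv=5)
print(scores.mean())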
Example #48
def evalerror(preds, dtrain):
    labels = dtrain.get_label() + 1
    preds = softmax(preds)
    pred_labels = np.argmax(preds, axis=1) + 1
    kappa = quadratic_weighted_kappa(labels, pred_labels)
    return 'kappa', kappa
Example #49
               name='pospool', input='poscnn')
model.add_node(Flatten(), name='posflat', input='pospool')
model.add_node(Dropout(0.5), name='posdropout', input='posflat')


# using three CNNs to predict with L1
model.add_node(Dense(nb_classes, activation='softmax'), name='softmax',
               inputs=['dropout', 'posdropout'],
               merge_mode='concat')

model.add_output(name='output', input='softmax')
model.compile('rmsprop', loss={'output': 'categorical_crossentropy'})
# model.compile('rmsprop', loss={'output': 'mean_squared_error'})

# early stopping
earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
model.fit({'input': X_train, 'posinput': pos_train, 'output': Y_train},
          nb_epoch=nb_epoch, batch_size=batch_size,
          validation_split=0.1, callbacks=[earlystop])

# Graph doesn't have several arg/func existing in Sequential()
# - fit no show-accuracy
# - no predict_classes
classes = model.predict({'input': X_test, 'posinput': pos_test},
                        batch_size=batch_size)['output'].argmax(axis=1)
acc = np_utils.accuracy(classes, np_utils.categorical_probas_to_classes(Y_test))  # accuracy only supports classes
print('Test accuracy:', acc)
kappa = metrics.quadratic_weighted_kappa(classes, np_utils.categorical_probas_to_classes(Y_test))
print('Test Kappa:', kappa)

Example #50
X = dataset
Y = dataset.loc[:,['Response']]
X=pd.get_dummies(X,'Product_Info_2')
X.drop(['Response','Medical_History_30','Id'],axis=1,inplace=True)
#%%
# split data into train and test sets
seed = 6
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
y_test=pd.to_numeric(y_test.Response,errors='coerce')
# fit model no training data
model = XGBRegressor(max_depth=8, silent=True,learning_rate=0.1,
                     min_child_weight=35,subsample=0.6,n_estimators=150,
                     colsample_bytree=0.3,missing=-1 )
model.fit(X_train, y_train)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
kappa = quadratic_weighted_kappa(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("kappa: %.2f%%" % (kappa * 100.0))

#%%
cm=confusion_matrix(y_test,list(map(int,predictions)))
cm=np.delete(cm,[0,1],0)
cm=np.delete(cm,[0,1],1)
norm_cm=cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.imshow(norm_cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.show()
Example #51
import ml_metrics as metrics
file = open("predictions.txt")
numarray = []
while 1:
	line = file.readline()
	if not line:
		break
	numarray.append(int(float(line)))
file = open("answers.txt")
answerarray = []
while 1:
	line = file.readline()
	if not line:
		break
	answerarray.append(int(float(line)))
##print len(numarray)
##print len(answerarray)
solutionarray = []
for x in range(0, len(numarray)):
	if numarray[x] == answerarray[x]:
		solutionarray.append(1)
	else:
		solutionarray.append(0)
onecounter = solutionarray.count(1)
print "QWK_Score: " + str(metrics.quadratic_weighted_kappa(answerarray,numarray))
Example #52
def qwk_score(y_true, y_pred):
    kappa = quadratic_weighted_kappa(y_true, y_pred)

    return kappa
Example #53
print(X2.shape)
# Splitting Corpus into train and test again

X1Test  = X1[len(y)+1:]
X1      = X1[:len(y)]
X2Test  = X2[len(y)+1:]
X2      = X2[:len(y)]

# Passing the vectorized matrices to SVD
svd = TruncatedSVD(n_components = 800)
svd.fit(X1)
X1 = svd.transform(X1)
#X1Test  = svd.transform(X1Test)
svd = TruncatedSVD(n_components = 1200)
svd.fit(X2)
X2 = svd.transform(X2)
#X2Test = svd.transform(X2Test)

# Initialize Model Variables #

clf = pipeline.Pipeline([('scl', StandardScaler()),('svm', SVC(C=10,gamma=0.0002))])

#Horizontally stacking the two matrices
X = hstack((X1,X2,trainFeatures))
#X_test = hstack((X1Test,X2Test,testFeatures))

stemPred = cross_val_predict(clf,X,y,cv=2,n_jobs=-1)

print "Kappa Score for Training Data\nStemming\nScore=%f" %(quadratic_weighted_kappa(y, stemPred))
Example #54
def qwk_score(y_true, y_pred):
    kappa = quadratic_weighted_kappa(y_true, y_pred)

    return kappa
Example #55
def eval_wrapper(yhat, y):
    y = np.array(y)
    y = y.astype(int)
    yhat = np.array(yhat)
    yhat = np.clip(np.round(yhat), np.min(y), np.max(y)).astype(int)   
    return quadratic_weighted_kappa(yhat, y)
Example #56
def getscore(y,ypred):
    return  ml_metrics.quadratic_weighted_kappa(y,ypred)
Example #57
        best_params = searcher.best_estimator_.get_params()
        for param in sorted(current_params.keys()):
          print("\t%s: %r" % (param, best_params[param]))

        # save the trained model for later
        best.append(searcher.best_estimator_)

    # we then validate the ensemble on the set aside features and targets
    predictions = []
    for i in range(len(best)):
      model = best[i]
      predictions.append(model.predict(features[i][test_mask]))

    # just averaging for now, play with this later
    predictions = np.sum(predictions, axis=0) / len(predictions)
    score = ml_metrics.quadratic_weighted_kappa(targets[test_mask], predictions)
    ensemble_scores.append(score)
    print("Ensemble score... %0.3f" % score)

  print("CV ensemble score... %0.3f" % np.mean(ensemble_scores))

  # once the ensemble has been validated, we can fit each model with all the
  # training data and make predictions for the real test data, if we want to
  # generate a submission csv (requires --submit flag)
  if "--submit" in sys.argv:

    # run all models on all data
    predictions = []
    for i in range(len(best)):
      model = best[i]
      print("Thinking...")
def qwk_wrapper(y, y_pred, splits):
            
    return quadratic_weighted_kappa([digitize(yp, splits) for yp in y_pred], y)
def hyperopt_obj(param, feat_name, trial_counter):
    kappa_cv = []
    cols = list(test.columns)
    cols.remove('Id')
    train_features = cols
    for run in range(1,2):
        print("run%d"%(run))
        #### all the path
        #load index 
        path = "../../data/info/run%d"%(run)
       
        train_index = loadCVIndex("../../data/cv/train.run%d.txt"%(run))
        test_index = loadCVIndex("../../data/cv/valid.run%d.txt"%(run))
        X_train = train.iloc[train_index][train_features]
        X_valid = train.iloc[test_index][train_features]
        labels_train = train.iloc[train_index]['Response']
        labels_valid = train.iloc[test_index]['Response']
        
        # cdf
        cdf_valid_path = "%s/valid.cdf" % path
        ## load cdf
        cdf_valid = np.loadtxt(cdf_valid_path, dtype=float)
        
        ## make evalerror func
        evalerror_regrank_valid = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_valid)
        evalerror_softmax_valid = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_valid)
        evalerror_ebc_valid = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_valid, ebc_hard_threshold)
        evalerror_cocr_valid = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_valid)
        ##############
        ## Training ##
        ##############
        ## you can use bagging to stabilize the predictions    
        dvalid_base = xgb.DMatrix(X_valid, label=labels_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train) 
        watchlist  = [(dtrain_base, 'train'), (dvalid_base, 'eval')]   
        w = np.loadtxt("../../data/info/All/weight.txt",dtype=float)
        if param["task"] in ["regression", "ranking"]:
            ## regression & pairwise ranking with xgboost
            bst = xgb.train(param, dtrain_base, param['num_round'],watchlist)
            pred = bst.predict(dvalid_base)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
            
        elif param["task"] in ["huber"]:
            ## regression & pairwise ranking with xgboost
            bst = xgb.train(param, dtrain_base, param['num_round'],watchlist)
            pred = bst.predict(dvalid_base)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)   
        elif param["task"] in ["regrank"]:
            
            bst = xgb.train(param, dtrain_base, param['num_round'],watchlist,feval=evalerror_regrank_valid)
            pred = bst.predict(dvalid_base)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
        
        elif param["task"] in ["softmax"]:
            ## softmax regression with xgboost
            dvalid_base = xgb.DMatrix(X_valid, label=labels_valid-1,weight=w[test_index])
            dtrain_base = xgb.DMatrix(X_train, label=labels_train-1,weight=w[train_index])
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, feval=evalerror_softmax_valid)
            pred = bst.predict(dvalid_base)
            we = np.asarray(range(1, 9))
            pred = pred * we[np.newaxis, :]  # expected label, mirroring the train path below
            pred = np.sum(pred, axis=1)
            pred = pred + 1
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
            y_train_preds = y_train_preds * we[np.newaxis,:]
            y_train_preds = np.sum(y_train_preds, axis=1)
            y_train_preds = y_train_preds + 1
            
        elif param["task"]  in ["ebc"]:
            ## ebc with xgboost
            dvalid_base = xgb.DMatrix(X_valid, label=labels_valid,weight=w[test_index])
            dtrain_base = xgb.DMatrix(X_train, label=labels_train,weight=w[train_index])
            obj = lambda preds, dtrain: ebcObj(preds, dtrain)
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, obj=obj, feval=evalerror_ebc_valid)
            pred = sigmoid(bst.predict(dvalid_base))
            pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = sigmoid(bst.predict(dtrain_base))
            y_train_preds = applyEBCRule(y_train_preds, hard_threshold=ebc_hard_threshold)

        elif param["task"]  in ["cocr"]:
            ## cocr with xgboost
            dvalid_base = xgb.DMatrix(X_valid, label=labels_valid,weight=w[test_index])
            dtrain_base = xgb.DMatrix(X_train, label=labels_train,weight=w[train_index])
            obj = lambda preds, dtrain: cocrObj(preds, dtrain)
            bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, obj=obj, feval=evalerror_cocr_valid)
            pred = bst.predict(dvalid_base)
            pred = applyCOCRRule(pred)
            dtrain_base = xgb.DMatrix(X_train)
            y_train_preds = bst.predict(dtrain_base)
            y_train_preds = applyCOCRRule(y_train_preds)
        elif param['task'] == "reg_skl_rf":
            ## regression with sklearn random forest regressor
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(X_train, labels_train)
            train_sort = pd.DataFrame({'cols':train_features,'value':list(rf.feature_importances_)}).sort(columns=['value'],ascending=False)
            train_sort.to_csv("sort.csv")
            pred = rf.predict(X_valid)
            y_train_preds = rf.predict(X_train)
        elif param['task'] == "reg_skl_etr":
        ## regression with sklearn extra trees regressor
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(X_train, labels_train)
            pred = etr.predict(X_valid)
            y_train_preds = etr.predict(X_train)
            
        elif param['task'] == "reg_skl_gbm":
        ## regression with sklearn gradient boosting regressor
            gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'],
                                            max_features=param['max_features'],
                                            learning_rate=param['learning_rate'],
                                            max_depth=param['max_depth'],
                                            subsample=param['subsample'],
                                            random_state=param['random_state'])
            gbm.fit(X_train, labels_train)
            pred = gbm.predict(X_valid)
            y_train_preds = gbm.predict(X_train)
        elif param['task'] == "reg_skl_svr":
        ## regression with sklearn support vector regression
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_valid = scaler.transform(X_valid)
            svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'],
                                    degree=param['degree'], kernel=param['kernel'])
            svr.fit(X_train, labels_train)
            pred = svr.predict(X_valid)
        pred_raw = pred
        pred_rank = pred_raw.argsort().argsort()
        score =  getScore(pred_rank,cdf=cdf_valid)
        print quadratic_weighted_kappa(score,labels_valid)
        id_test = train.iloc[test_index]['Id']
        output = pd.DataFrame({"Id": id_test, "Response_raw": pred_raw})    
        output['Response_rank'] = pred_rank 
        output['Response_cdf'] = score
        output['Response'] = labels_valid
        ## weighted averaging over different models
        cutpoints = [2.8, 3.8, 4.5, 4.9, 5.5, 6.2, 6.8]
        res = minimize(minimize_quadratic_weighted_kappa, cutpoints,
                       (y_train_preds, labels_train), method='Nelder-Mead')
        cutpoints = np.sort(res.x)
        for i in range(3):
            res = minimize(minimize_quadratic_weighted_kappa, cutpoints,
                           (y_train_preds, labels_train), method='Nelder-Mead')
            cutpoints = np.sort(res.x)
            ## score the refreshed cutpoints on the validation fold
            kappa = minimize_quadratic_weighted_kappa(cutpoints, pred, labels_valid)
        kappa_cv.append(kappa)
        print("kappa:%f" % kappa)
    kappa_cv_mean = np.mean(kappa_cv)
    kappa_cv_std = np.std(kappa_cv)
    subm_path = "%s/train.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % ("../../result/All", feat_name, trial_counter, kappa_cv_mean, kappa_cv_std)
    output.to_csv(subm_path, index=False)
    print("              Mean: %.6f" % kappa_cv_mean)
    print("              Std: %.6f" % kappa_cv_std)
    
    output_path = "../../result"
    #path = "%s/All" % (feat_folder)
    save_path = "%s/All" % output_path
    subm_path = "%s/Subm" % output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(subm_path):
        os.makedirs(subm_path)
    
    # cdf
    cdf_test_path = "../../data/info/All/test.cdf"
    # raw prediction path (rank)
    raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    # submission path (relevance as in [1,2,3,4])
    subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (subm_path, feat_name, trial_counter, kappa_cv_mean, kappa_cv_std)

    X_train = train[train_features]
    X_valid = test[train_features]
    labels_train = train['Response']
    ## load cdf
    cdf_test = np.loadtxt(cdf_test_path, dtype=float)  
    ## per-task eval functions, bound to the test-set CDF
    evalerror_regrank_test = lambda preds, dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_test)
    evalerror_softmax_test = lambda preds, dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_test)
    evalerror_softkappa_test = lambda preds, dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_test)
    evalerror_ebc_test = lambda preds, dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_test, ebc_hard_threshold)
    evalerror_cocr_test = lambda preds, dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_test)
     
    dvalid_base = xgb.DMatrix(X_valid)
    dtrain_base = xgb.DMatrix(X_train, label=labels_train)
    watchlist = [(dtrain_base, 'train')]
    if param["task"] in ["regression", "ranking"]:
        ## regression & pairwise ranking with xgboost
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train) 
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist)
        pred = bst.predict(dvalid_base)
        dtrain_base = xgb.DMatrix(X_train) 
        y_train_preds = bst.predict(dtrain_base)
        
    elif param["task"] in ["regrank"]:
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train) 
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist,feval=evalerror_regrank_test)
        pred = bst.predict(dvalid_base)
        dtrain_base = xgb.DMatrix(X_train) 
        y_train_preds = bst.predict(dtrain_base)
        
    elif param["task"] in ["softmax"]:
            ## softmax regression with xgboost
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train-1,weight=w)
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, feval=evalerror_softmax_test)
        pred = bst.predict(dvalid_base)
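        ## decode the softmax class probabilities into an expected label:
        ## sum_k k * P(class k); note this rebinds w from sample weights to 1..8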
        w = np.asarray(range(1,9))
        pred = pred * w[np.newaxis,:]
        pred = np.sum(pred, axis=1)
                 
    elif param["task"] in ["softkappa"]:
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train,weight=w)
        ## softkappa with xgboost
        obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, obj=obj, feval=evalerror_softkappa_test)
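        ## softkappa outputs raw margins, so apply softmax first, then decode to
        ## an expected label exactly as in the softmax branch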
        pred = softmax(bst.predict(dvalid_base))
        w = np.asarray(range(1,9))
        pred = pred * w[np.newaxis,:]
        pred = np.sum(pred, axis=1)
      

    elif param["task"]  in ["ebc"]:
        ## ebc with xgboost
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train,weight=w)
        obj = lambda preds, dtrain: ebcObj(preds, dtrain)
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, obj=obj, feval=evalerror_ebc_test)
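        ## EBC: sigmoid yields per-threshold probabilities; applyEBCRule is assumed
        ## to decode them into a single ordinal label via ebc_hard_threshold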
        pred = sigmoid(bst.predict(dvalid_base))
        pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)
        
    elif param["task"]  in ["cocr"]:
        ## cocr with xgboost
        dvalid_base = xgb.DMatrix(X_valid)
        dtrain_base = xgb.DMatrix(X_train, label=labels_train,weight=w)
        obj = lambda preds, dtrain: cocrObj(preds, dtrain)
        bst = xgb.train(param, dtrain_base, param['num_round'], watchlist, obj=obj, feval=evalerror_cocr_test)
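        ## COCR: applyCOCRRule is assumed to decode the cumulative ordinal
        ## outputs into a single label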
        pred = bst.predict(dvalid_base)
        pred = applyCOCRRule(pred)
    elif param['task'] == "reg_skl_rf":
            ## regression with sklearn random forest regressor
        rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                   max_features=param['max_features'],
                                   n_jobs=param['n_jobs'],
                                   random_state=param['random_state'])
        rf.fit(X_train, labels_train)
        pred = rf.predict(X_valid)  
    elif param['task'] == "reg_skl_etr":
        ## regression with sklearn extra trees regressor
        etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                  max_features=param['max_features'],
                                  n_jobs=param['n_jobs'],
                                  random_state=param['random_state'])
        etr.fit(X_train, labels_train)
        pred = etr.predict(X_valid)
    elif param['task'] == "reg_skl_gbm":
        ## regression with sklearn gradient boosting regressor
        gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'],
                                        max_features=param['max_features'],
                                        learning_rate=param['learning_rate'],
                                        max_depth=param['max_depth'],
                                        subsample=param['subsample'],
                                        random_state=param['random_state'],
                                        max_leaf_nodes=param['max_leaf_nodes'])
        gbm.fit(X_train, labels_train)
        pred = gbm.predict(X_valid)
    
    elif param['task'] == "reg_skl_svr":
        ## regression with sklearn support vector regression
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)
        svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'],
                  degree=param['degree'], kernel=param['kernel'])
        svr.fit(X_train, labels_train)
        pred = svr.predict(X_valid)
    
    pred_raw = pred
    pred_rank = pred_raw.argsort().argsort()
    id_test = test['Id']
    output = pd.DataFrame({"Id": id_test, "Response_raw": pred_raw})
    output['Response_rank'] = pred_rank
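    ## rank-based decoding: getScore is assumed to rank the raw predictions
    ## internally and map them to labels according to the test-set CDF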
    pred_score = getScore(pred, cdf_test)
    output['Response_cdf'] = pred_score
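    ## alternative decoding: bin the raw predictions with the CV-optimized
    ## cutpoints; pad with -inf/+inf so every prediction falls inside a bin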
    cutpoints = np.concatenate([[-np.inf], cutpoints, [np.inf]])
    print(cutpoints)
    y_pred = pd.cut(pred, bins=cutpoints, labels=[1, 2, 3, 4, 5, 6, 7, 8])
    output['Response_cut'] = y_pred
    output.to_csv(subm_path, index=False)
    return kappa_cv_mean, kappa_cv_std