Example #1
0

# Load stacked per-model logits for the training csvs. Given the
# X[:,m,:] indexing below, X appears to be shaped
# (n_samples, n_models, n_classes) — TODO confirm against get_values.
X, Y = get_values(csvs_train,columns=LIST_LOGITS,hstack=False,with_labels=True)

# Inter-model correlation of the logits (one matrix entry per model pair).
print('Corr matrix')
print(corr_matrix(list(X.transpose([1, 0, 2]))))
print(' ')


# Optional hyperopt-weighted blend across the (hard-coded) 7 models.
if 'ho' in classifiers:
    ws = do_hyperopt(csvs_train)
    test_predicts = np.zeros(X[:,0,:].shape)
    for m in range(7):
        test_predicts += ws[m] * X[:,m,:]
    # NOTE(review): dividing a ws-weighted sum by 7 double-scales unless
    # the hyperopt weights are deliberately unnormalized — confirm
    # do_hyperopt's contract.
    test_predicts /= 7
    print('roc %s logloss %s'%(roc_auc_score(Y,test_predicts),logloss(Y,test_predicts)))


# Per-model ROC-AUC / logloss on the training logits.
for m in range(len(models)):
    print('%s roc %s logloss %s'%(models[m],roc_auc_score(Y,X[:,m,:]),logloss(Y,X[:,m,:])))



#X_valid, Y_valid = get_values(csvs_valid,columns=LIST_LOGITS,hstack=False,with_labels=True)
# Test csvs carry predictions only (no labels), hence with_labels=False.
X_test = get_values(csvs_test,columns=LIST_CLASSES,hstack=False,with_labels=False)


# Accumulators presumably filled further down in the full script.
test_results_list = []
valid_results_list = []
Example #2
0
# Out-of-fold (bagged) RNN logits; index_col=1 makes the csv's second
# column the index — presumably the comment id, TODO confirm.
valids = pd.read_csv(
    'models/RNN/pavel_all_outs_slim/birnn_all_outs_slim_baggin_logits_folded.csv',
    index_col=1)
#a = predictions.loc[predictions[LIST_CLASSES] > 0.99]
# Test texts, indexed by their first column.
test = pd.read_csv(TEST_FILENAME, index_col=0)


def find_good_predicts(valids, column, upper=0.995, lower=0.0005):
    """Return the rows of *valids* whose *column* score is a confident
    prediction, i.e. strictly above *upper* or strictly below *lower*.

    The shapes of the two selections (confident positives, confident
    negatives) are printed for inspection.

    Args:
        valids: DataFrame holding prediction scores.
        column: name of the score column to threshold.
        upper: scores above this count as confident positives
            (default 0.995, the previously hard-coded value).
        lower: scores below this count as confident negatives
            (default 0.0005, the previously hard-coded value).

    Returns:
        The filtered DataFrame (original rows, original index preserved).
    """
    b1 = valids[column] > upper
    b2 = valids[column] < lower
    c = valids[b1 | b2]
    print(valids[b1].shape)
    print(valids[b2].shape)
    return c


# Confident subset of the validation logits for the 'toxic' class.
c = find_good_predicts(valids, 'logits_toxic')
print(c.shape)

# Logloss on the confident subset vs. the full validation set.
print(logloss(c[LIST_CLASSES].values, c[LIST_LOGITS].values))
print(logloss(valids[LIST_CLASSES].values, valids[LIST_LOGITS].values))

# NOTE(review): `predictions` is not defined in this chunk (it only appears
# in a comment above) — this raises NameError unless it is defined
# elsewhere in the full file; confirm.
good_predictions = find_good_predicts(predictions, 'toxic')
# Pseudo-labelling: join confident test predictions with the test texts,
# append them to the slim training set, and write the enlarged csv whose
# name encodes the new row count.
g = good_predictions.join(test)
train = pd.read_csv(TRAIN_SLIM_FILENAME, index_col=1)
train = train.drop(columns=['Unnamed: 0'])
new_train = pd.concat([train, g])
fn = 'train_' + str(new_train.shape[0]) + '.csv'
new_train.to_csv(fn)
            # Tail of an alternate feature branch whose condition is above
            # this chunk: fall back to the r2-scaled word features.
            x_nb = x_nb2
            valid_features = valid_word_features.multiply(r2)
            test_features = test_word_features.multiply(r2)

        # NB-SVM style: logistic regression on naive-Bayes-scaled features.
        # NOTE(review): dual=True assumes the liblinear solver; recent
        # sklearn versions reject dual=True with the default solver —
        # confirm the pinned version.
        m = LogisticRegression(C=4, dual=True)
        m.fit(x_nb, y)

        # Positive-class probabilities for this label/fold.
        preds_valid = m.predict_proba(valid_features)[:, 1]
        preds_test = m.predict_proba(test_features)[:, 1]
        preds_test_fold_list.append(preds_test)
        preds_valid_fold_list.append(preds_valid)

    # Stack per-label columns into (n_samples, n_labels) arrays.
    preds_test_list.append(np.array(preds_test_fold_list).T)
    preds_valid_list.append(np.array(preds_valid_fold_list).T)

    # Fold-level validation metrics.
    print('logloss: %s' % logloss(Y_valid, np.array(preds_valid_fold_list).T))
    print('ROC: %s' % roc_auc_score(Y_valid,
                                    np.array(preds_valid_fold_list).T))

# Assemble the level-2 training frame: row ids, out-of-fold logits, and
# the true labels — one row per validation sample across all folds.
# NOTE(review): assumes preds_valid_list concatenates in the same fold
# order as list_of_y — confirm where both lists are filled.
l2_data = pd.DataFrame(columns=['id'] + LIST_LOGITS + LIST_CLASSES)
l2_data['id'] = pd.Series(
    np.concatenate(
        [list_of_y[fold].index.values for fold in range(cfg.fold_count)]))
l2_data[LIST_LOGITS] = pd.DataFrame(np.concatenate(preds_valid_list, axis=0))
l2_data[LIST_CLASSES] = pd.DataFrame(np.concatenate(list_of_y, axis=0))
l2_data.set_index('id', inplace=True)
#control if df is correct
print('logloss: %s' %
      logloss(l2_data[LIST_CLASSES].values, l2_data[LIST_LOGITS].values))
print('ROC: %s' %
      roc_auc_score(l2_data[LIST_CLASSES].values, l2_data[LIST_LOGITS].values))
Example #4
0
#df = pd.concat([train[COMMENT], valid[COMMENT], test[COMMENT]], axis=0)
#df = df.fillna(UNKNOWN_WORD)
#nrow = train.shape[0]

# Word/bigram TF-IDF fitted over train+valid+test so the vocabulary covers
# every split; the stacked transform result itself is discarded (`_`) —
# only the fitted vocabulary is reused below.
tfidf = TfidfVectorizer(stop_words='english',
                        ngram_range=(1, 2),
                        max_features=800000)

print('transforming train')
_ = tfidf.fit_transform(
    pd.concat([train[COMMENT], valid[COMMENT], test[COMMENT]], axis=0))

X_train = tfidf.transform(train[COMMENT])
model = ExtraTreesClassifier(n_jobs=-1, random_state=3, verbose=True)
print('fitting classifier')
model.fit(X_train, Y_train)

print('transforming valid')
X_valid = tfidf.transform(valid[COMMENT])

# The [:, :, 1].T indexing implies preds is (n_classes, n_samples, 2) —
# i.e. a per-class list of probability pairs, reduced here to an
# (n_samples, n_classes) positive-class matrix.
preds = model.predict_proba(X_valid)
Y_valid = valid[LIST_CLASSES].values
a = np.array(preds)[:, :, 1].T
print(logloss(Y_valid, a))

#preds = pd.DataFrame([[c[1] for c in preds[row]] for row in range(len(preds))]).T
#preds.columns = LIST_CLASSES
#preds['id'] = tid
#for c in LIST_CLASSES:
#    preds[c] = preds[c].clip(0 + 1e-12, 1 - 1e-12)
Example #5
0
            # Tail of an alternate feature branch whose condition is above
            # this chunk: r2-scaled word features.
            valid_features = valid_word_features.multiply(r2)
            test_features = test_word_features.multiply(r2)

        # NB-SVM style classifier per label/fold.
        # NOTE(review): dual=True assumes the liblinear solver; recent
        # sklearn versions reject this combination — confirm pinned version.
        m = LogisticRegression(C=4, dual=True)
        m.fit(x_nb, y)

        # Positive-class probabilities for this label.
        preds_valid = m.predict_proba(valid_features)[:, 1]
        preds_test = m.predict_proba(test_features)[:, 1]
        preds_test_fold_list.append(preds_test)
        preds_valid_fold_list.append(preds_valid)

    # Stack per-label columns into (n_samples, n_labels) and keep this
    # fold's labels for later level-2 training.
    preds_test_list.append(np.array(preds_test_fold_list).T)
    preds_valid_list.append(np.array(preds_valid_fold_list).T)
    list_of_y.append(Y_valid)

    # Fold-level validation metrics.
    print('logloss: %s' % logloss(Y_valid, np.array(preds_valid_fold_list).T))
    print('ROC: %s' % roc_auc_score(Y_valid,
                                    np.array(preds_valid_fold_list).T))

# Persist the level-2 training data (out-of-fold logits + labels).
l2_data = pd.DataFrame(columns=LIST_LOGITS + LIST_CLASSES)
l2_data[LIST_LOGITS] = pd.DataFrame(np.concatenate(preds_valid_list, axis=0))
l2_data[LIST_CLASSES] = pd.DataFrame(np.concatenate(list_of_y, axis=0))
l2_data.to_csv(cfg.fp_out_train)

# Geometric mean of the per-fold test predictions.
# NOTE(review): the slices [i:i + 6] for i in range(10) overlap and run
# past the list's tail for large i — this looks wrong for grouping fold
# predictions; confirm against how preds_test_list is filled.
preds_test_list2 = [np.array(preds_test_list[i:i + 6]).T for i in range(10)]
test_predicts = np.ones(preds_test_list2[0].shape)
for fold_predict in preds_test_list2:
    test_predicts *= fold_predict

# n-th root of the product = geometric mean.
test_predicts **= (1. / len(preds_test_list2))
new_submission = pd.read_csv(SAMPLE_SUBMISSION_FILENAME)
 def objective(space_elements):
     """Hyperopt objective: logloss of the weight-normalized blend.

     *space_elements* are raw (unnormalized) model weights. They are
     scaled to sum to 1 and used to blend the enclosing-scope prediction
     arrays ``xs`` against the labels ``ys[0]``.
     """
     # Hoist the normalizer: the original recomputed sum(space_elements)
     # once per element inside the comprehension (O(n^2)).
     total = sum(space_elements)
     ws = [s / total for s in space_elements]
     # Same accumulation order as before: start from the first model,
     # then add the rest in index order.
     preds = ws[0] * xs[0]
     for i in range(1, len(space_elements)):
         preds += ws[i] * xs[i]
     return logloss(ys[0], preds)
Example #7
0
        # Materialize the concrete estimator from its wrapper before use.
        algorithm = algorithm.instance()

        if is_classifier(algorithm):
            resultlist = []
            """Predict labels with n fold cross validation"""
            y_pred = cross_val_predict(algorithm, X_out, y_out, cv=5)
            """Calculate evaluation metrics"""
            accuracy = accuracy_score(y_out, y_pred)
            resultlist.append(accuracy)
            precision = precision_score(y_out, y_pred, average='weighted')
            resultlist.append(precision)
            recall = recall_score(y_out, y_pred, average='weighted')
            resultlist.append(recall)
            f1 = f1_score(y_out, y_pred, average='weighted')
            resultlist.append(f1)
            # NOTE(review): y_pred holds hard class labels from
            # cross_val_predict, not probabilities — feeding labels to a
            # logloss is unusual; confirm this is intended. The local name
            # also shadows the common `sklearn.metrics.log_loss` import.
            log_loss = logloss(y_out, y_pred)
            resultlist.append(log_loss)
            roc_auc = multiclass_roc_auc_score(y_out,
                                               y_pred,
                                               average='weighted')
            resultlist.append(roc_auc)

            # One metrics row per evaluated algorithm.
            pipeline_res.loc[len(pipeline_res.index)] = resultlist

        else:
            """
            If algorithm object has method fit_transform, call fit_transform on X, y. Else, first call fit on X, y,
            then transform on X. Safe the transformed dataset in X
            """
            if hasattr(algorithm, 'fit_transform'):
                X_out = algorithm.fit_transform(X_out, y_out)
# ROC-AUC of each level-2 model and of simple averages of their
# predictions on the validation split.
#print('using catboost %s' %roc_auc_score(Y_valid,preds_cat))
print('using xgb %s' % roc_auc_score(Y_valid, preds_xgb))
print('using lgb %s' % roc_auc_score(Y_valid, preds_lgb))
print('using lr %s' % roc_auc_score(Y_valid, preds_logistic))
print('using nn %s' % roc_auc_score(Y_valid, preds_nn))
print('using xgb + lr %s' %
      roc_auc_score(Y_valid, np.mean([preds_logistic, preds_xgb], axis=0)))
print('using xgb + nn %s' %
      roc_auc_score(Y_valid, np.mean([preds_xgb, preds_nn], axis=0)))
print('using xgb + lr + nn %s' % roc_auc_score(
    Y_valid, np.mean([preds_logistic, preds_xgb, preds_nn], axis=0)))
print('using xgb + lr + nn + lgb %s' % roc_auc_score(
    Y_valid, np.mean([preds_logistic, preds_xgb, preds_nn, preds_lgb],
                     axis=0)))
# The same model combinations scored with logloss.
print('----------logloss------------')
print('using mean %s' % logloss(Y_valid, np.mean(X_valid, axis=1)))
#print('using catboost %s' %logloss(Y_valid,preds_cat))
print('using xgb %s' % logloss(Y_valid, preds_xgb))
print('using lgb %s' % logloss(Y_valid, preds_lgb))
print('using lr %s' % logloss(Y_valid, preds_logistic))
print('using nn %s' % logloss(Y_valid, preds_nn))
print('using xgb + nn %s' %
      logloss(Y_valid, np.mean([preds_xgb, preds_nn], axis=0)))
print('using xgb + lr %s' %
      logloss(Y_valid, np.mean([preds_logistic, preds_xgb], axis=0)))
print('using xgb + lr + nn %s' %
      logloss(Y_valid, np.mean([preds_logistic, preds_xgb, preds_nn], axis=0)))
print(
    'using xgb + lr + nn + lgb %s' %
    logloss(Y_valid,
            np.mean([preds_logistic, preds_xgb, preds_nn, preds_lgb], axis=0)))
Example #9
0
        # Tail of a definition that starts above this chunk: hand back the
        # (possibly transformed) feature matrix.
        return X


# Load stacked per-model logits; the X[:, m, :] indexing below implies
# X is shaped (n_samples, n_models, n_classes) — TODO confirm get_values.
X, Y = get_values(csvs_train,
                  columns=LIST_LOGITS,
                  hstack=False,
                  with_labels=True)

# Inter-model correlation of the logits.
print('Corr matrix')
print(corr_matrix(list(X.transpose([1, 0, 2]))))
print(' ')

# Per-model ROC-AUC / logloss; the ROC is computed once per model instead
# of twice as before (it was evaluated both for the print and the append).
rocs = []
for m, model in enumerate(models):
    roc = roc_auc_score(Y, X[:, m, :])
    print('%s roc %s logloss %s' %
          (model, roc, logloss(Y, X[:, m, :])))
    rocs.append(roc)

# Take the first of 10 KFold splits (all frames share row order, so the
# same index split is applied to each) and score each frame on the
# training partition.
kf = KFold(n_splits=10)
folder = kf.split(dfs[0])
#blends_valid = []
#blends_test = []
# Use the builtin next() rather than calling __next__() directly.
train_index, valid_index = next(folder)
dfs_train = [df.iloc[train_index] for df in dfs]
dfs_valid = [df.iloc[valid_index] for df in dfs]

# Per-frame ROC-AUC / logloss on the training partition.
rocs = []
for d, df in enumerate(dfs_train):
    print('%s roc %s logloss %s' %
          (d, roc_auc_score(df[LIST_CLASSES], df[LIST_LOGITS]),
           logloss(df[LIST_CLASSES].values, df[LIST_LOGITS].values)))
Example #10
0
    def transform_dataset(self, algorithm: BaseEstimator, n_folds: int = 5) -> Tuple[pd.DataFrame, Dict[str, float]]:
        """
        Run *algorithm* against this task's dataset.

        If it is a classifier: cross-validate it, append the out-of-fold
        predictions as an extra column of X, and return the augmented
        frame plus a metrics dict. Otherwise treat it as a transformer
        and return the transformed frame plus an empty dict.
        """

        """Load input dataset and class_column"""
        df = self.dataset.load(self.s3_config, self.s3_bucket)
        class_column = self.dataset.class_column

        """Split input dataset in X and y"""
        X, y = df.drop(class_column, axis=1), df[class_column]

        """
        Checks if algorithm (BaseEstimator) is a classifier. 
        
        If True, predict y_pred with the method cross_val_predict. Then calculate the evaluation metrics for the
        algorithm model and return them as a dict. Convert y_pred to pd Series and concatenate X & y_pred.
        
        If False, call fit_transform or fit and then transform on X, y and return the transformed dataset as Dataframe.
        """

        if is_classifier(algorithm):

            """Predict labels with n fold cross validation"""
            y_pred = cross_val_predict(algorithm, X, y, cv=n_folds)

            """Calculate evaluation metrics"""
            accuracy = accuracy_score(y, y_pred)
            precision = precision_score(y, y_pred, average='weighted')
            recall = recall_score(y, y_pred, average='weighted')
            f1 = f1_score(y, y_pred, average='weighted')
            # TODO
            # NOTE(review): y_pred holds hard class labels from
            # cross_val_predict, not probabilities — logloss on labels is
            # questionable; confirm this is intended.
            log_loss = logloss(y, y_pred)
            roc_auc = multiclass_roc_auc_score(y, y_pred, average='weighted')

            """Convert np array y_pred to pd series and add it to X"""
            y_pred = pd.Series(y_pred)
            X = pd.concat([X, y_pred], axis=1)
            # Re-number columns so the appended prediction column fits the
            # positional scheme.
            X.columns = range(X.shape[1])

            # NOTE(review): the 'neg_log_loss' key stores the logloss value
            # unnegated — confirm the expected sign convention downstream.
            return X, {'accuracy': accuracy,
                       'precision': precision,
                       'recall': recall,
                       'f1': f1,
                       'neg_log_loss': log_loss,
                       'roc_auc': roc_auc
                       }
        else:
            """
            If algorithm object has method fit_transform, call fit_transform on X, y. Else, first call fit on X, y,
            then transform on X. Safe the transformed dataset in X
            """
            if hasattr(algorithm, 'fit_transform'):
                X = algorithm.fit_transform(X, y)
            else:
                # noinspection PyUnresolvedReferences
                X = algorithm.fit(X, y).transform(X)

            # Normalize the transformer output (often a bare ndarray) back
            # into a positionally-indexed DataFrame.
            X = pd.DataFrame(data=X, index=range(X.shape[0]), columns=range(X.shape[1]))

            return X, {}