# Evaluate level-1 model logits on the training CSVs and (optionally) a
# hyperopt-weighted blend.
# NOTE(review): get_values/corr_matrix/do_hyperopt/logloss are project helpers
# defined elsewhere; X appears to be (samples, models, classes) — TODO confirm.
X, Y = get_values(csvs_train, columns=LIST_LOGITS, hstack=False, with_labels=True)
print('Corr matrix')
# Per-model correlation: transpose to (models, samples, classes) first.
print(corr_matrix(list(X.transpose([1, 0, 2]))))
print(' ')
if 'ho' in classifiers:
    # Blend the models with hyperopt-derived weights; hard-coded for 7 models.
    ws = do_hyperopt(csvs_train)
    test_predicts = np.zeros(X[:, 0, :].shape)
    for m in range(7):
        test_predicts += ws[m] * X[:, m, :]
    # NOTE(review): dividing by 7 after applying ws assumes ws are not already
    # normalized — if do_hyperopt returns weights summing to 1 this
    # double-normalizes; confirm.
    test_predicts /= 7
    print('roc %s logloss %s' % (roc_auc_score(Y, test_predicts), logloss(Y, test_predicts)))
# Per-model scores for reference.
for m in range(len(models)):
    print('%s roc %s logloss %s' % (models[m], roc_auc_score(Y, X[:, m, :]), logloss(Y, X[:, m, :])))
#X_valid, Y_valid = get_values(csvs_valid,columns=LIST_LOGITS,hstack=False,with_labels=True)
# Test-set predictions (no labels available for the test split).
X_test = get_values(csvs_test, columns=LIST_CLASSES, hstack=False, with_labels=False)
test_results_list = []
valid_results_list = []
valids = pd.read_csv( 'models/RNN/pavel_all_outs_slim/birnn_all_outs_slim_baggin_logits_folded.csv', index_col=1) #a = predictions.loc[predictions[LIST_CLASSES] > 0.99] test = pd.read_csv(TEST_FILENAME, index_col=0) def find_good_predicts(valids, column): b1 = valids[column] > 0.995 b2 = valids[column] < 0.0005 c = valids[b1 | b2] print(valids[b1].shape) print(valids[b2].shape) return c c = find_good_predicts(valids, 'logits_toxic') print(c.shape) print(logloss(c[LIST_CLASSES].values, c[LIST_LOGITS].values)) print(logloss(valids[LIST_CLASSES].values, valids[LIST_LOGITS].values)) good_predictions = find_good_predicts(predictions, 'toxic') g = good_predictions.join(test) train = pd.read_csv(TRAIN_SLIM_FILENAME, index_col=1) train = train.drop(columns=['Unnamed: 0']) new_train = pd.concat([train, g]) fn = 'train_' + str(new_train.shape[0]) + '.csv' new_train.to_csv(fn)
# NB-SVM style level-1 model: logistic regression over naive-Bayes-weighted
# TF-IDF features, followed by level-2 (stacking) data assembly.
# NOTE(review): the first half looks like the tail of a per-class/per-fold loop
# whose header is outside this view — indentation reconstructed, confirm.
x_nb = x_nb2
# Weight the features by the NB log-count ratio r2.
valid_features = valid_word_features.multiply(r2)
test_features = test_word_features.multiply(r2)
m = LogisticRegression(C=4, dual=True)
m.fit(x_nb, y)
# Keep only P(class=1).
preds_valid = m.predict_proba(valid_features)[:, 1]
preds_test = m.predict_proba(test_features)[:, 1]
preds_test_fold_list.append(preds_test)
preds_valid_fold_list.append(preds_valid)
preds_test_list.append(np.array(preds_test_fold_list).T)
preds_valid_list.append(np.array(preds_valid_fold_list).T)
print('logloss: %s' % logloss(Y_valid, np.array(preds_valid_fold_list).T))
print('ROC: %s' % roc_auc_score(Y_valid, np.array(preds_valid_fold_list).T))
# Level-2 training frame: id, out-of-fold logits, true labels.
l2_data = pd.DataFrame(columns=['id'] + LIST_LOGITS + LIST_CLASSES)
l2_data['id'] = pd.Series(
    np.concatenate(
        [list_of_y[fold].index.values for fold in range(cfg.fold_count)]))
l2_data[LIST_LOGITS] = pd.DataFrame(np.concatenate(preds_valid_list, axis=0))
l2_data[LIST_CLASSES] = pd.DataFrame(np.concatenate(list_of_y, axis=0))
l2_data.set_index('id', inplace=True)
#control if df is correct
print('logloss: %s' % logloss(l2_data[LIST_CLASSES].values, l2_data[LIST_LOGITS].values))
print('ROC: %s' % roc_auc_score(l2_data[LIST_CLASSES].values, l2_data[LIST_LOGITS].values))
#df = pd.concat([train[COMMENT], valid[COMMENT], test[COMMENT]], axis=0) #df = df.fillna(UNKNOWN_WORD) #nrow = train.shape[0] tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=800000) print('transforming train') _ = tfidf.fit_transform( pd.concat([train[COMMENT], valid[COMMENT], test[COMMENT]], axis=0)) X_train = tfidf.transform(train[COMMENT]) model = ExtraTreesClassifier(n_jobs=-1, random_state=3, verbose=True) print('fitting classifier') model.fit(X_train, Y_train) print('transforming valid') X_valid = tfidf.transform(valid[COMMENT]) preds = model.predict_proba(X_valid) Y_valid = valid[LIST_CLASSES].values a = np.array(preds)[:, :, 1].T print(logloss(Y_valid, a)) #preds = pd.DataFrame([[c[1] for c in preds[row]] for row in range(len(preds))]).T #preds.columns = LIST_CLASSES #preds['id'] = tid #for c in LIST_CLASSES: # preds[c] = preds[c].clip(0 + 1e12, 1 - 1e12)
# NB-SVM fold tail + level-2 data dump + geometric-mean blend of the per-fold
# test predictions.
# NOTE(review): the first half looks like the tail of a per-class/per-fold loop
# whose header is outside this view — indentation reconstructed, confirm.
valid_features = valid_word_features.multiply(r2)
test_features = test_word_features.multiply(r2)
m = LogisticRegression(C=4, dual=True)
m.fit(x_nb, y)
# Keep only P(class=1).
preds_valid = m.predict_proba(valid_features)[:, 1]
preds_test = m.predict_proba(test_features)[:, 1]
preds_test_fold_list.append(preds_test)
preds_valid_fold_list.append(preds_valid)
preds_test_list.append(np.array(preds_test_fold_list).T)
preds_valid_list.append(np.array(preds_valid_fold_list).T)
list_of_y.append(Y_valid)
print('logloss: %s' % logloss(Y_valid, np.array(preds_valid_fold_list).T))
print('ROC: %s' % roc_auc_score(Y_valid, np.array(preds_valid_fold_list).T))
# Level-2 training frame: out-of-fold logits + true labels.
l2_data = pd.DataFrame(columns=LIST_LOGITS + LIST_CLASSES)
l2_data[LIST_LOGITS] = pd.DataFrame(np.concatenate(preds_valid_list, axis=0))
l2_data[LIST_CLASSES] = pd.DataFrame(np.concatenate(list_of_y, axis=0))
l2_data.to_csv(cfg.fp_out_train)
# NOTE(review): preds_test_list[i:i + 6] for i in range(10) takes OVERLAPPING
# windows; if the intent is 10 folds x 6 classes, [i * 6:(i + 1) * 6] would be
# expected — verify against how preds_test_list is filled.
preds_test_list2 = [np.array(preds_test_list[i:i + 6]).T for i in range(10)]
# Geometric mean across the per-fold test predictions.
test_predicts = np.ones(preds_test_list2[0].shape)
for fold_predict in preds_test_list2:
    test_predicts *= fold_predict
test_predicts **= (1. / len(preds_test_list2))
new_submission = pd.read_csv(SAMPLE_SUBMISSION_FILENAME)
def objective(space_elements):
    """Hyperopt objective: blend model predictions with the normalized
    weights derived from *space_elements* and return the resulting logloss
    against the first target set.

    Reads the module-level ``xs`` (per-model predictions), ``ys`` (targets)
    and ``logloss`` helper.
    """
    total = sum(space_elements)
    weights = [element / total for element in space_elements]
    # Start from the first model, then fold in the rest one by one.
    blended = weights[0] * xs[0]
    for idx in range(1, len(space_elements)):
        blended += weights[idx] * xs[idx]
    return logloss(ys[0], blended)
# Evaluate one pipeline step: classifiers get cross-validated metrics appended
# to pipeline_res; transformers rewrite X_out in place.
# NOTE(review): this chunk appears to be the body of a loop over pipeline
# algorithms; the trailing `else` branch is truncated at the end of this view.
algorithm = algorithm.instance()
if is_classifier(algorithm):
    resultlist = []
    """Predict labels with n fold cross validation"""
    y_pred = cross_val_predict(algorithm, X_out, y_out, cv=5)
    """Calculate evaluation metrics"""
    accuracy = accuracy_score(y_out, y_pred)
    resultlist.append(accuracy)
    precision = precision_score(y_out, y_pred, average='weighted')
    resultlist.append(precision)
    recall = recall_score(y_out, y_pred, average='weighted')
    resultlist.append(recall)
    f1 = f1_score(y_out, y_pred, average='weighted')
    resultlist.append(f1)
    # NOTE(review): y_pred holds hard labels, while logloss typically expects
    # probabilities — confirm. The local name also shadows sklearn's log_loss.
    log_loss = logloss(y_out, y_pred)
    resultlist.append(log_loss)
    roc_auc = multiclass_roc_auc_score(y_out, y_pred, average='weighted')
    resultlist.append(roc_auc)
    # One metrics row per evaluated classifier.
    pipeline_res.loc[len(pipeline_res.index)] = resultlist
else:
    """
    If algorithm object has method fit_transform, call fit_transform on X, y.
    Else, first call fit on X, y, then transform on X.
    Save the transformed dataset in X
    """
    if hasattr(algorithm, 'fit_transform'):
        X_out = algorithm.fit_transform(X_out, y_out)
# Report validation ROC-AUC and logloss for each single model and for simple
# mean-averaged ensembles of them.
#print('using catboost %s' %roc_auc_score(Y_valid,preds_cat))
single_models = [
    ('xgb', preds_xgb),
    ('lgb', preds_lgb),
    ('lr', preds_logistic),
    ('nn', preds_nn),
]
roc_blends = [
    ('xgb + lr', [preds_logistic, preds_xgb]),
    ('xgb + nn', [preds_xgb, preds_nn]),
    ('xgb + lr + nn', [preds_logistic, preds_xgb, preds_nn]),
    ('xgb + lr + nn + lgb', [preds_logistic, preds_xgb, preds_nn, preds_lgb]),
]
for label, single_pred in single_models:
    print('using %s %s' % (label, roc_auc_score(Y_valid, single_pred)))
for label, members in roc_blends:
    print('using %s %s' % (label, roc_auc_score(Y_valid, np.mean(members, axis=0))))
print('----------logloss------------')
print('using mean %s' % logloss(Y_valid, np.mean(X_valid, axis=1)))
#print('using catboost %s' %logloss(Y_valid,preds_cat))
# Note: the logloss section reports xgb+nn before xgb+lr (original order kept).
logloss_blends = [
    ('xgb + nn', [preds_xgb, preds_nn]),
    ('xgb + lr', [preds_logistic, preds_xgb]),
    ('xgb + lr + nn', [preds_logistic, preds_xgb, preds_nn]),
    ('xgb + lr + nn + lgb', [preds_logistic, preds_xgb, preds_nn, preds_lgb]),
]
for label, single_pred in single_models:
    print('using %s %s' % (label, logloss(Y_valid, single_pred)))
for label, members in logloss_blends:
    print('using %s %s' % (label, logloss(Y_valid, np.mean(members, axis=0))))
    # NOTE(review): this `return X` closes a function whose definition starts
    # before this chunk — indentation assumed.
    return X


# Load the training logits, report per-model scores, then take the first
# KFold split for blending experiments.
X, Y = get_values(csvs_train,
                  columns=LIST_LOGITS,
                  hstack=False,
                  with_labels=True)
print('Corr matrix')
# Per-model correlation: transpose to (models, samples, classes) first.
print(corr_matrix(list(X.transpose([1, 0, 2]))))
print(' ')
rocs = []
for m, model in enumerate(models):
    print('%s roc %s logloss %s' %
          (model, roc_auc_score(Y, X[:, m, :]), logloss(Y, X[:, m, :])))
    rocs.append(roc_auc_score(Y, X[:, m, :]))
kf = KFold(n_splits=10)
folder = kf.split(dfs[0])
#blends_valid = []
#blends_test = []
# Only the first fold split is consumed here.
train_index, valid_index = folder.__next__()
dfs_train = [df.iloc[train_index] for df in dfs]
dfs_valid = [df.iloc[valid_index] for df in dfs]
# NOTE(review): rebinding `rocs` discards the per-model scores gathered above.
rocs = []
for d, df in enumerate(dfs_train):
    print('%s roc %s logloss %s' %
          (d, roc_auc_score(df[LIST_CLASSES], df[LIST_LOGITS]),
           logloss(df[LIST_CLASSES].values, df[LIST_LOGITS].values)))
def transform_dataset(self, algorithm: BaseEstimator,
                      n_folds: int = 5) -> Tuple[pd.DataFrame, Dict[str, float]]:
    """
    Evaluate or apply *algorithm* on this task's dataset.

    If *algorithm* is a classifier, its labels are predicted with
    ``n_folds``-fold cross validation, the predictions are appended to the
    feature matrix as an extra column, and standard evaluation metrics are
    returned alongside the augmented frame.

    If it is a transformer, the dataset is transformed (``fit_transform``
    when available, otherwise ``fit`` followed by ``transform``) and the
    transformed frame is returned with an empty metrics dict.

    Args:
        algorithm: An sklearn-compatible estimator (classifier or transformer).
        n_folds: Number of cross-validation folds used for classifiers.

    Returns:
        Tuple of (resulting DataFrame, metrics dict — empty for transformers).
    """
    # Load the input dataset and split it into features X and target y.
    df = self.dataset.load(self.s3_config, self.s3_bucket)
    class_column = self.dataset.class_column
    X, y = df.drop(class_column, axis=1), df[class_column]

    if is_classifier(algorithm):
        # Out-of-fold label predictions.
        y_pred = cross_val_predict(algorithm, X, y, cv=n_folds)

        # Evaluation metrics on the cross-validated predictions.
        accuracy = accuracy_score(y, y_pred)
        precision = precision_score(y, y_pred, average='weighted')
        recall = recall_score(y, y_pred, average='weighted')
        f1 = f1_score(y, y_pred, average='weighted')
        # TODO: logloss expects probabilities; y_pred holds hard labels here.
        log_loss = logloss(y, y_pred)
        roc_auc = multiclass_roc_auc_score(y, y_pred, average='weighted')

        # Append the predictions as a new column. Align on X's index
        # explicitly: a bare pd.Series(y_pred) carries a fresh 0..n-1 index,
        # and pd.concat aligns on index, which would introduce NaNs whenever
        # the loaded DataFrame's index is not the default RangeIndex.
        y_pred = pd.Series(y_pred, index=X.index)
        X = pd.concat([X, y_pred], axis=1)
        X.columns = range(X.shape[1])

        return X, {'accuracy': accuracy,
                   'precision': precision,
                   'recall': recall,
                   'f1': f1,
                   'neg_log_loss': log_loss,
                   'roc_auc': roc_auc}
    else:
        # Transformers: prefer the single-pass fit_transform when offered.
        if hasattr(algorithm, 'fit_transform'):
            X = algorithm.fit_transform(X, y)
        else:
            # noinspection PyUnresolvedReferences
            X = algorithm.fit(X, y).transform(X)
        # Normalize to a DataFrame with positional row/column labels.
        X = pd.DataFrame(data=X, index=range(X.shape[0]),
                         columns=range(X.shape[1]))
        return X, {}