# 'models/NBSVM/slim/nbsvm_submission.csv',
# 'models/RNN/pavel_attention_slim2/l2_test_data.csv',
# 'models/RNN/pavel_all_outs_slim/l2_test_data.csv']

# Every file in models/PUBLIC/ whose name ends with '.csv'.
csv_files = [
    'models/PUBLIC/' + fn
    for fn in os.listdir('models/PUBLIC/')
    if fn.endswith('.csv')
]

# Read each file and keep only its LIST_CLASSES columns. After the loop,
# `orig_submission` still refers to the last file that was read.
test_predicts_list = []
for csv_file in csv_files:
    orig_submission = pd.read_csv(csv_file)
    predictions = orig_submission[LIST_CLASSES]
    test_predicts_list.append(predictions)

# NOTE(review): the value returned by corr_matrix is discarded here --
# confirm that the helper reports its result itself.
corr_matrix([p.values for p in test_predicts_list])


def bag_by_average(test_predicts_list):
    """Return the element-wise arithmetic mean of the given predictions.

    Args:
        test_predicts_list: Non-empty, indexable collection of prediction
            tables. The first item must expose ``.shape``; every item is
            added in place to an array of that shape.

    Returns:
        The array created with ``np.zeros`` after all items have been added
        to it and it has been divided by the number of items.

    Raises:
        IndexError: If ``test_predicts_list`` is an empty list (raised by
            ``test_predicts_list[0]``).
    """
    # The accumulator takes its shape from the first item only.
    bagged_predicts = np.zeros(test_predicts_list[0].shape)
    for predict in test_predicts_list:
        bagged_predicts += predict
    bagged_predicts /= len(test_predicts_list)
    return bagged_predicts


# NOTE(review): only the opening statements of this definition are visible in
# this chunk; they are reproduced unchanged.
def bag_by_geomean(test_predicts_list):
    bagged_predicts = np.ones(test_predicts_list[0].shape)
    for predict in test_predicts_list:
        bagged_predicts *= predict
ys = [df[LIST_CLASSES].values for df in dfs] for i, _ in enumerate(csv_files[1:]): assert np.array_equal(ys[0], ys[i]) Y = ys[0] return X, Y else: return X X, Y = get_values(csvs_train,columns=LIST_LOGITS,hstack=False,with_labels=True) print('Corr matrix') print(corr_matrix(list(X.transpose([1, 0, 2])))) print(' ') if 'ho' in classifiers: ws = do_hyperopt(csvs_train) test_predicts = np.zeros(X[:,0,:].shape) for m in range(7): test_predicts += ws[m] * X[:,m,:] test_predicts /= 7 print('roc %s logloss %s'%(roc_auc_score(Y,test_predicts),logloss(Y,test_predicts))) for m in range(len(models)): print('%s roc %s logloss %s'%(models[m],roc_auc_score(Y,X[:,m,:]),logloss(Y,X[:,m,:])))
import scipy from utilities import corr_matrix list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"] list_logits = ['logits_' + c for c in list_classes] csvs_train = ['models/CNN/inception5_slim/train_logits_folded.csv', 'models/RNN/pavel_baseline/train_logits_folded.csv', 'models/CAPS/caps_first_test/train_logits/caps_first_testk0_e3.csv'] dfs = [pd.read_csv(csv) for csv in csvs_train] xs = [df[list_logits].values for df in dfs] n_models = len(csvs_train) print('Corr matrix') print(corr_matrix(xs)) print(' ') df = dfs[1].copy() for logit in list_logits: df[logit] = df[logit].map(lambda x: 0 if x < 0.02 else 1) print(roc_auc_score(y_true=df[list_classes].values,y_score=df[list_logits].values)) """ graph = tf.Graph() with graph.as_default(): X = tf.Variable(df[list_logits].values,trainable=False, dtype=tf.float32) Y = tf.Variable(df[list_classes].values,trainable=False,dtype=tf.float32)