def getOptimalT_helper(model, ValDataGen, valid_nums):
    """Find, per class, the decision threshold with the best validation F1.

    ``valid_nums`` batches are drawn from ``ValDataGen`` with ``next`` and
    scored with ``model.predict``.  Every threshold in
    ``np.arange(0, 1, 0.001)`` is then applied to each of the 28 score
    columns (``score > threshold``) and rated with
    ``off1(labels, binarised, average='binary')``.

    :param model: object whose ``predict(batch)`` returns a 2-D array with
        28 columns.
    :param ValDataGen: iterator yielding ``(images, labels)`` pairs; the
        labels must also be 2-D with 28 columns.
    :param valid_nums: number of batches to pull from ``ValDataGen``.
    :return: ``(T, macro_f1)`` where ``T`` is a length-28 array holding, for
        each class, the lowest threshold that reaches that class's best F1,
        and ``macro_f1`` is the mean of the per-class best F1 values.
    """
    pred_batches = []
    label_batches = []
    for _ in tqdm(range(valid_nums)):
        img, lbl = next(ValDataGen)
        pred_batches.append(model.predict(img))
        label_batches.append(lbl)

    # Concatenate once instead of calling np.append per batch, which copies
    # everything accumulated so far on every iteration.  The empty (0, 28)
    # seed keeps the float64 promotion and the 28-column check of the
    # incremental version, and gives a (0, 28) result when no batch is drawn.
    lastFullValPred = np.concatenate(
        [np.empty((0, 28))] + pred_batches, axis=0)
    lastFullValLabels = np.concatenate(
        [np.empty((0, 28))] + label_batches, axis=0)
    print(lastFullValPred.shape, lastFullValLabels.shape)

    rng = np.arange(0, 1, 0.001)
    f1s = np.zeros((rng.shape[0], 28))
    for j, t in enumerate(tqdm(rng)):
        # Binarise all classes for this threshold in one pass.
        binarised = np.array(lastFullValPred > t, dtype=np.int8)
        for i in range(28):
            f1s[j, i] = off1(lastFullValLabels[:, i], binarised[:, i],
                             average='binary')

    best_f1 = np.max(f1s, axis=0)
    macro_f1 = np.mean(best_f1)
    print(best_f1)
    print(macro_f1)

    # argmax returns the first maximal row, i.e. the lowest threshold on ties.
    T = rng[np.argmax(f1s, axis=0)]
    print(T)
    return T, macro_f1
def getOptimalT(self, model, val_gen):
    """Find, per class, the decision threshold with the best validation F1.

    Every batch ``val_gen[i]`` for ``i`` in ``range(len(val_gen))`` is scored
    with ``model.predict``.  Each threshold in ``np.arange(0, 1, 0.001)`` is
    then applied to each of the 28 score columns (``score > threshold``) and
    rated with ``off1(labels, binarised, average='binary')``.

    :param model: object whose ``predict(batch)`` returns a 2-D array with
        28 columns.
    :param val_gen: indexable, sized collection of ``(images, labels)``
        batches; the labels must also be 2-D with 28 columns.
    :return: ``(T, macro_f1)`` where ``T`` is a length-28 array holding, for
        each class, the lowest threshold that reaches that class's best F1,
        and ``macro_f1`` is the mean of the per-class best F1 values.
    """
    pred_batches = []
    label_batches = []
    for i in tqdm(range(len(val_gen))):
        data_im, data_label = val_gen[i]
        pred_batches.append(model.predict(data_im))
        label_batches.append(data_label)

    # Concatenate once instead of calling np.append per batch, which copies
    # everything accumulated so far on every iteration.  The empty (0, 28)
    # seed keeps the float64 promotion and the 28-column check of the
    # incremental version, and gives a (0, 28) result for an empty generator.
    last_full_val_pred = np.concatenate(
        [np.empty((0, 28))] + pred_batches, axis=0)
    last_full_val_labels = np.concatenate(
        [np.empty((0, 28))] + label_batches, axis=0)
    print(last_full_val_pred.shape, last_full_val_labels.shape)

    rng = np.arange(0, 1, 0.001)
    f1s = np.zeros((rng.shape[0], 28))
    for j, t in enumerate(tqdm(rng)):
        # Binarise all classes for this threshold in one pass.
        binarised = np.array(last_full_val_pred > t, dtype=np.int8)
        for i in range(28):
            f1s[j, i] = off1(last_full_val_labels[:, i], binarised[:, i],
                             average='binary')

    best_f1 = np.max(f1s, axis=0)
    macro_f1 = np.mean(best_f1)
    print(best_f1)
    print(macro_f1)

    # argmax returns the first maximal row, i.e. the lowest threshold on ties.
    T = rng[np.argmax(f1s, axis=0)]
    print(T)
    return T, macro_f1
def f1_full_validation(self, y_true, y_pred, class_num, save_path=None):
    """Pick the per-class probability threshold that maximises F1.

    Both inputs are reshaped to ``(samples, -1, class_num)`` so that
    multi-label layouts with several label groups per sample are supported.
    Each threshold in ``np.arange(0, 1, 0.001)`` is applied to every class
    (``score > threshold``) and rated with
    ``off1(truth, binarised, average='binary')`` over all samples and groups.

    Results are stored on the instance rather than returned:
    ``self.final_t`` holds one threshold per class (the lowest one on ties)
    and ``self.threshold`` holds that vector repeated once per label group.

    :param y_true: ground-truth array; its size must be divisible by
        ``y_true.shape[0] * class_num``.
    :param y_pred: predicted scores, reshaped the same way as ``y_true``.
    :param class_num: number of classes along the last axis.
    :param save_path: currently unused by this method.
    """
    rng = np.arange(0, 1, 0.001)
    f1s = np.zeros((rng.shape[0], class_num))

    # Reshape in case this is a multi-label problem.
    y_true = np.reshape(y_true, (y_true.shape[0], -1, class_num))
    y_pred = np.reshape(y_pred, (y_pred.shape[0], -1, class_num))
    group_count = y_true.shape[1]

    # Flatten samples and groups once; column i then equals
    # y[..., i].reshape(-1), which was previously rebuilt per threshold.
    flat_true = y_true.reshape(-1, class_num)
    flat_pred = y_pred.reshape(-1, class_num)

    for j, t in enumerate(tqdm(rng, 'Updating the F1 threshold')):
        binarised = np.array(flat_pred > t, dtype=np.int8)
        for i in range(class_num):
            f1s[j, i] = off1(flat_true[:, i], binarised[:, i],
                             average='binary')

    best_f1 = np.max(f1s, axis=0)
    print('Individual F1-scores for each class:')
    print(best_f1)
    print('Macro F1-score CV =', np.mean(best_f1))

    # argmax returns the first maximal row, i.e. the lowest threshold on
    # ties.  Computing it once also keeps F_T defined when there are no
    # label groups to replicate over.
    F_T = rng[np.argmax(f1s, axis=0)]
    # Duplicate the per-class thresholds once per label group.
    T = np.tile(F_T, group_count)

    print('Probability threshold maximizing CV F1-score for each class:')
    print(F_T)
    self.threshold = T
    self.final_t = F_T
def evaluate():
    """Tune per-class thresholds on validation data and write a submission.

    Steps, in order:

    1. Load the ``*.h5`` file in the current directory with the largest
       ``os.path.getctime`` via ``load_model``.
    2. Predict every batch of the module-level ``valid_gen`` and, for each of
       the 28 classes, pick the lowest threshold in
       ``np.arange(0, 1, 0.001)`` that maximises
       ``f1_score(..., average='binary')``.
    3. Predict the test set from ``testDataset()`` and, per row of the sample
       submission, join the indices of all classes whose score is not below
       their threshold with single spaces.
    4. Write the result to ``'model<unix-timestamp>.csv'``.

    :raises ValueError: if no ``*.h5`` file exists in the current directory.
    """
    import glob
    import os
    from sklearn.metrics import f1_score as off1

    print("Loading weights ...")
    list_of_files = glob.glob('*.h5')
    if not list_of_files:
        # max() on an empty list raises the same exception type, but with a
        # message that does not say what is actually missing.
        raise ValueError("no '*.h5' weight file found in the current directory")
    latest_file = max(list_of_files, key=os.path.getctime)
    model = load_model(latest_file, custom_objects={'f1_measure': f1_measure})

    fullValGen = valid_gen
    pred_batches = []
    label_batches = []
    for i in tqdm(range(len(fullValGen))):
        im, lbl = fullValGen[i]
        pred_batches.append(model.predict(im))
        label_batches.append(lbl)

    # Concatenate once instead of calling np.append per batch, which copies
    # everything accumulated so far on every iteration.  The empty (0, 28)
    # seed keeps the float64 promotion and the 28-column check of the
    # incremental version.
    lastFullValPred = np.concatenate(
        [np.empty((0, 28))] + pred_batches, axis=0)
    lastFullValLabels = np.concatenate(
        [np.empty((0, 28))] + label_batches, axis=0)
    print(lastFullValPred.shape, lastFullValLabels.shape)

    rng = np.arange(0, 1, 0.001)
    f1s = np.zeros((rng.shape[0], 28))
    for j, t in enumerate(tqdm(rng)):
        binarised = np.array(lastFullValPred > t, dtype=np.int8)
        for i in range(28):
            f1s[j, i] = off1(lastFullValLabels[:, i], binarised[:, i],
                             average='binary')

    best_f1 = np.max(f1s, axis=0)
    print('Individual F1-scores for each class:')
    print(best_f1)
    print('Macro F1-score CV =', np.mean(best_f1))

    # argmax returns the first maximal row, i.e. the lowest threshold on ties.
    T = rng[np.argmax(f1s, axis=0)]
    print('Probability threshold maximizing CV F1-score for each class:')
    print(T)

    pathsTest, labelsTest = testDataset()
    testg = ProteinDataGenerator(pathsTest, labelsTest, BATCH_SIZE, SHAPE)
    submit = pd.read_csv(DATA_DIR + 'sample_submission.csv')

    P = np.zeros((pathsTest.shape[0], 28))
    for i in tqdm(range(len(testg))):
        images, _ = testg[i]
        score = model.predict(images)
        # The last batch may be short, so size the slice from the output.
        P[i * BATCH_SIZE:i * BATCH_SIZE + score.shape[0]] = score

    prediction = []
    for row in tqdm(range(submit.shape[0])):
        # A class is emitted unless its score is strictly below its threshold.
        prediction.append(' '.join(
            str(col) for col in range(P.shape[1])
            if not P[row, col] < T[col]))

    submit['Predicted'] = np.array(prediction)
    ts = str(int(time.time()))
    submit.to_csv('model' + ts + '.csv', index=False)
# NOTE(review): this fragment starts mid-routine. lastFullValPred, fullValGen,
# bestModel, off1, tqdm and plt are all used here without a visible
# definition, so they must be provided by code outside this view.

# Accumulate predictions and labels for every batch of fullValGen.
lastFullValLabels = np.empty((0, 28))
for i in tqdm(range(len(fullValGen))):
    im, lbl = fullValGen[i]
    scores = bestModel.predict(im)
    lastFullValPred = np.append(lastFullValPred, scores, axis=0)
    lastFullValLabels = np.append(lastFullValLabels, lbl, axis=0)
print(lastFullValPred.shape, lastFullValLabels.shape)

# Score every threshold in [0, 1) with step 0.001 for each of the 28 classes:
# a prediction counts as positive when its score is strictly above t.
rng = np.arange(0, 1, 0.001)
f1s = np.zeros((rng.shape[0], 28))
for j, t in enumerate(tqdm(rng)):
    for i in range(28):
        p = np.array(lastFullValPred[:, i] > t, dtype=np.int8)
        scoref1 = off1(lastFullValLabels[:, i], p, average='binary')
        f1s[j, i] = scoref1
print('Individual F1-scores for each class:')
print(np.max(f1s, axis=0))
print('Macro F1-score CV =', np.mean(np.max(f1s, axis=0)))
# One F1-versus-threshold curve per class (one line per column of f1s).
plt.plot(rng, f1s)

# For each class keep the first (lowest) threshold reaching the best F1,
# then clamp it so no threshold is below 0.01.
T = np.empty(28)
for i in range(28):
    T[i] = rng[np.where(f1s[:, i] == np.max(f1s[:, i]))[0][0]]
    if T[i] < 0.01:
        T[i] = 0.01
print('Probability threshold maximizing CV F1-score for each class:')
print(T)
def generateSubmitFile(model=None):
    """Predict the test set and write ``submit.csv`` using tuned thresholds.

    Steps, in order:

    1. Load the test set (``getTestDataset``) and, if no model is given, the
       model stored at ``'./base.model'``.
    2. Build a validation generator from the tail of the training data: the
       rows from index ``int((1 - VAL_RATIO) * n)`` onwards.
    3. Predict the test set.
    4. Predict the validation set and, for each of the 28 classes, pick the
       lowest threshold in ``np.arange(0, 1, 0.001)`` that maximises
       ``off1(..., average='binary')``.
    5. Per row of the sample submission read from ``sample_dir``, join the
       indices of all classes whose test score is not below their threshold
       with single spaces, and write the frame to ``'submit.csv'``.

    :param model: object exposing ``predict``; when ``None`` the model is
        loaded from ``'./base.model'``.
    """
    pathsTest, labelsTest = getTestDataset()
    # Test for None explicitly so a supplied model is never discarded
    # because of how its truthiness happens to evaluate.
    if model is None:
        model = load_model('./base.model', custom_objects={'f1': f1})

    paths, labels = getTrainDataset()
    testg = ProteinDataGenerator(pathsTest, labelsTest, BATCH_SIZE, SHAPE)

    lastTrainIndex = int((1 - VAL_RATIO) * paths.shape[0])
    pathsVal = paths[lastTrainIndex:]
    labelsVal = labels[lastTrainIndex:]
    vg = ProteinDataGenerator(pathsVal, labelsVal, BATCH_SIZE, SHAPE,
                              use_cache=True, shuffle=False)

    submit = pd.read_csv(sample_dir)

    P = np.zeros((pathsTest.shape[0], 28))
    for i in tqdm(range(len(testg))):
        images, _ = testg[i]
        score = model.predict(images)
        # The last batch may be short, so size the slice from the output.
        P[i * BATCH_SIZE:i * BATCH_SIZE + score.shape[0]] = score

    rng = np.arange(0, 1, 0.001)
    f1s = np.zeros((rng.shape[0], 28))

    # The shuffled keys are not used below; the seed and shuffle are kept
    # only so the global NumPy RNG ends in the same state as before.
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the same
    # dtype it used to alias.
    np.random.seed(SEED)
    keys = np.arange(paths.shape[0], dtype=int)
    np.random.shuffle(keys)

    fullValGen = vg
    pred_batches = []
    label_batches = []
    for i in tqdm(range(len(fullValGen))):
        im, lbl = fullValGen[i]
        pred_batches.append(model.predict(im))
        label_batches.append(lbl)

    # Concatenate once instead of calling np.append per batch, which copies
    # everything accumulated so far on every iteration.  The empty (0, 28)
    # seed keeps the float64 promotion and the 28-column check of the
    # incremental version.
    lastFullValPred = np.concatenate(
        [np.empty((0, 28))] + pred_batches, axis=0)
    lastFullValLabels = np.concatenate(
        [np.empty((0, 28))] + label_batches, axis=0)
    print(lastFullValPred.shape, lastFullValLabels.shape)

    for j, t in enumerate(tqdm(rng)):
        binarised = np.array(lastFullValPred > t, dtype=np.int8)
        for i in range(28):
            f1s[j, i] = off1(lastFullValLabels[:, i], binarised[:, i],
                             average='binary')

    # argmax returns the first maximal row, i.e. the lowest threshold on ties.
    T = rng[np.argmax(f1s, axis=0)]
    print('Probability threshold maximizing CV F1-score for each class:')
    print(T)

    prediction = []
    for row in tqdm(range(submit.shape[0])):
        # A class is emitted unless its score is strictly below its threshold.
        prediction.append(' '.join(
            str(col) for col in range(P.shape[1])
            if not P[row, col] < T[col]))

    submit['Predicted'] = np.array(prediction)
    submit.to_csv('submit.csv', index=False)