def save_as_npy(df):
    """Dump each image row of a parquet dataframe to an individual .npy file.

    Column 0 of ``df`` holds the image id (used as the file name stem); the
    remaining columns are the flattened pixels, reshaped to the module-level
    HEIGHT x WIDTH and written to ``img_dir/<id>.npy``.

    Fix vs. original: removed the dead ``resized = {}`` local that was never
    read or written.
    """
    char_data = {'id': list(df.iloc[:, 0]), 'data': np.array(df.iloc[:, 1:])}
    # T is the module's progress-bar wrapper (tqdm-style) — TODO confirm.
    for i in T(range(len(char_data['id']))):
        image = np.copy(char_data['data'][i]).reshape(HEIGHT, WIDTH)
        name = char_data['id'][i]
        np.save(os.path.join(img_dir, name + '.npy'), image)
def tfrec_extract(filename):
    """Unpack one .tfrec file into individual .jpg images plus metadata rows.

    Side effects: appends one row per record to the module-global ``train`` or
    ``test`` dataframe (chosen by whether 'train' appears in ``filename``),
    advancing the matching global row counter, and writes each decoded image
    to ``train_dir``/``test_dir`` as <image_name>.jpg.
    """
    global train_row
    global test_row
    global train
    global test
    tfrecord_path = os.path.join(tfrec_dir, filename)
    # The .index sidecar is assumed to exist already (built by the
    # tfrecord2idx loop elsewhere in this file).
    index_path = tfrecord_path.replace('.tfrec', '.index')
    if 'train' in filename:
        savedir = train_dir
    else:
        savedir = test_dir
    dataset = TFRecordDataset(tfrecord_path, index_path, transform=decode_image)
    # batch_size=1 / shuffle=False: records are processed one-by-one in file
    # order so the row counters stay aligned with the written files.
    loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)
    for data in T(loader):
        # print(len(loader))
        if 'train' in filename:
            train_row += 1
        else:
            test_row += 1
        # image_name arrives as an int tensor of character codes; rebuild the
        # string before using it as a path — presumably ASCII; verify against
        # the decode_image transform.
        img_name = data['image_name'].squeeze().data.cpu().numpy().copy()
        img_name = os.path.join(savedir, ''.join(map(chr, img_name)))
        img_name += '.jpg'
        image_file = data['image'].squeeze().data.cpu().numpy()
        cv2.imwrite(img_name, image_file)
        # Drop the bulky fields so the remaining items are scalar metadata.
        del data['image']
        del data['image_name']
        for k, v in data.items():
            if 'train' in filename:
                train.loc[train_row, 'image_name'] = img_name
                train.loc[train_row, k] = v.squeeze().data.cpu().numpy()
                train.loc[train_row, 'tfrec'] = filename.replace('.tfrec', '')
            else:
                test.loc[test_row, 'image_name'] = img_name
                test.loc[test_row, k] = v.squeeze().data.cpu().numpy()
                test.loc[test_row, 'tfrec'] = filename.replace('.tfrec', '')
def evaluate():
    """Run TTA inference on the test set, one submission csv per augmentation.

    For each transform in ``augs`` builds a SETIDataset over ``test_df``,
    scores it with the global ``model``, and writes the positive-class
    probabilities to ``submission_TTA<t>.csv``.

    Fix vs. original: removed the dead ``PREDS`` buffer, which was allocated
    but never accumulated into or read.
    """
    model.eval()
    with torch.no_grad():
        for t in range(len(augs)):
            print('TTA {}'.format(t + 1))
            test_ds = SETIDataset(image_ids=test_df.image_name.values,
                                  meta_features=test_meta,
                                  dim=sz,
                                  transforms=augs[t])
            # shuffle=False keeps predictions aligned with image ids.
            test_loader = DataLoader(test_ds,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=4)
            img_ids = []
            preds = []
            for idx, (img_id, inputs, meta) in T(enumerate(test_loader),
                                                 total=len(test_loader)):
                inputs = inputs.to(device)
                meta = meta.to(device)
                outputs = model(inputs.float(), meta)
                img_ids.extend(img_id)
                # Probability of the positive class (column 1).
                preds.extend(
                    torch.softmax(outputs, 1)[:, 1].detach().cpu().numpy())
            zippedList = list(zip(img_ids, preds))
            temp_df = pd.DataFrame(zippedList,
                                   columns=['image_name', f'target{t}'])
            temp_df.to_csv(f'submission_TTA{t}.csv', index=False)
def evaluate():
    """TTA evaluation of the global ``model`` on the validation split.

    For each augmentation in ``augs``: score the validation images, write a
    per-TTA csv (label, prediction, |diff|), then average the probabilities
    across TTA rounds and write ``val_report.csv`` sorted by largest error.

    Fix vs. original: the DataLoader was built over the global ``valid_ds``,
    so the freshly constructed ``test_ds`` (carrying this round's TTA
    transform) was never used and every round scored identical data. It now
    loads ``test_ds``.
    """
    model.eval()
    PREDS = np.zeros((len(valid_df), 1))
    IMG_IDS = []
    LAB = []
    with torch.no_grad():
        for t in range(len(augs)):
            print('TTA {}'.format(t + 1))
            test_ds = MelanomaDataset(image_ids=valid_df.image_name.values,
                                      meta_features=valid_meta,
                                      dim=sz,
                                      transforms=augs[t])
            # BUGFIX: was DataLoader(valid_ds, ...), discarding test_ds.
            test_loader = DataLoader(test_ds,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=4)
            img_ids = []
            preds = []
            lab = []
            for idx, (img_id, inputs, meta,
                      labels) in T(enumerate(test_loader),
                                   total=len(test_loader)):
                inputs = inputs.to(device)
                meta = meta.to(device)
                labels = labels.to(device)
                outputs = model(inputs.float(), meta)
                img_ids.extend(img_id)
                preds.extend(
                    torch.softmax(outputs, 1)[:, 1].detach().cpu().numpy())
                # labels are one-hot; recover the class index.
                lab.extend(torch.argmax(labels, 1).cpu().numpy())
            print(np.array(preds).shape, np.array(lab).shape)
            # NOTE(review): assumes len(valid_loader.dataset) == len(valid_df)
            # == number of scored rows — confirm against the globals.
            score_diff_tta = np.abs(np.array(preds) - np.array(lab)).reshape(
                len(valid_loader.dataset), 1)
            zippedList_tta = list(
                zip(img_ids, lab, np.squeeze(preds),
                    np.squeeze(score_diff_tta)))
            temp_df = pd.DataFrame(
                zippedList_tta,
                columns=['image_name', 'label', 'predictions', 'difference'])
            temp_df.to_csv(f'submission_TTA{t}.csv', index=False)
            # ids/labels are identical each round; keep the last copy.
            IMG_IDS = img_ids
            LAB = lab
            PREDS += np.array(preds).reshape(len(valid_loader.dataset), 1)
    # Average the accumulated probabilities over all TTA rounds.
    PREDS /= len(augs)
    score_diff = np.abs(
        np.array(PREDS) - np.array(LAB).reshape(len(valid_loader.dataset), 1))
    zippedList = list(
        zip(IMG_IDS, LAB, np.squeeze(PREDS), np.squeeze(score_diff)))
    submission = pd.DataFrame(
        zippedList,
        columns=['image_name', 'label', 'prediction', 'difference'])
    # Worst mistakes first, for manual review.
    submission = submission.sort_values(by=['difference'], ascending=False)
    submission.to_csv('val_report.csv', index=False)
def eval(val_list, transformer):
    """Validate the global ``model`` on the rows listed in ``val_list``.

    Builds a DRDataset over the training csv restricted to ``val_list``,
    accumulates the criterion loss, and scores quadratic kappa over all
    predictions.

    Returns:
        (qk, epoch_loss): quadratic-kappa score and mean per-sample loss.
    """
    val_dataset = DRDataset('../data/new_data/train.csv', val_list, dim,
                            transformer)
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=batch_size // 2,
                                                  shuffle=True,
                                                  num_workers=num_worker)
    model.eval()
    running_loss = 0.0
    predictions, actual_labels = [], []
    for batch, targets in T(val_data_loader):
        targets = targets.view(-1, 1).to(device, dtype=torch.float)
        batch = batch.to(device, dtype=torch.float)
        outputs = model(batch)
        # Loss is pulled to the CPU before accumulation.
        batch_loss = criterion(outputs, targets).data.cpu().numpy()
        running_loss += batch_loss.item() * batch.size(0)
        predictions.extend(outputs.data.cpu().numpy())
        actual_labels.extend(targets.data.cpu())
    epoch_loss = running_loss / len(val_data_loader.dataset)
    print('Validation Loss: {:.4f}'.format(epoch_loss))
    qk = quadratic_kappa(torch.Tensor(predictions),
                         torch.Tensor(actual_labels))
    print('Quadratic Kappa: {:.4f}'.format(qk))
    return qk, epoch_loss
def load_prediction_to_top_3(preds, fnames, order_df, n_classes=251):
    """Build a submission frame holding the top-3 predicted class ids per image.

    Args:
        preds: flat per-class scores, reshapeable to (len(preds), n_classes).
        fnames: image file names, aligned with ``preds`` rows.
        order_df: frame whose 'img_name' column fixes the output row order.
        n_classes: number of classes per row (default 251 preserves the
            original hard-coded value).

    Returns:
        DataFrame with 'img_name' and a space-separated 'label' of the three
        highest-scoring class ids, reordered to match ``order_df``.
    """
    test_df = DataFrame(fnames, columns=['img_name'])
    test_df['label'] = ''
    predictions = array(preds).reshape(len(preds), n_classes)
    for i, pred in T(enumerate(predictions), total=len(predictions)):
        # Highest score first; renamed the inner variable so it no longer
        # shadows the row index `i`.
        top3 = argsort(pred)[::-1][:3]
        test_df.loc[i, 'label'] = ' '.join(str(int(c)) for c in top3)
    # Align rows with the sample-submission ordering.
    test_df = merge(order_df['img_name'], test_df)
    return test_df
def Resize(df, size=128):
    """Invert, normalize, crop-resize each glyph row and save it as .npy.

    Column 0 of ``df`` is the image id; remaining columns are flattened
    pixels (HEIGHT x WIDTH, white background). Output goes to
    ``img_dir/<id>.npy``. ``size`` is kept for interface compatibility —
    the output dimensions come from ``crop_resize`` (TODO confirm).

    Fix vs. original: removed the dead ``resized = {}`` local and the stale
    commented-out resize variants.
    """
    char_data = {'id': list(df.iloc[:, 0]), 'data': np.array(df.iloc[:, 1:])}
    for i in T(range(len(char_data['id']))):
        # Invert: parquet stores dark glyphs on a white background.
        image0 = 255 - np.copy(char_data['data'][i]).reshape(HEIGHT, WIDTH)
        name = char_data['id'][i]
        # Normalize each image by its max value so intensities span 0-255.
        img = (image0 * (255.0 / image0.max())).astype(np.uint8)
        image = crop_resize(img)
        np.save(os.path.join(img_dir, name + '.npy'), image)
def Resize(df, size=128):
    """Invert, normalize and crop-resize every glyph in ``df``.

    Returns a new dataframe with an 'image_id' column followed by the
    flattened processed pixels (string column names).
    """
    df = df.set_index('image_id')
    processed = {}
    for row in T(range(df.shape[0])):
        key = df.index[row]
        # Invert to white-on-black, then stretch intensities to use the
        # full 0-255 range before cropping.
        inverted = 255 - df.loc[key].values.reshape(137, 236).astype(np.uint8)
        scaled = (inverted * (255.0 / inverted.max())).astype(np.uint8)
        processed[key] = crop_resize(scaled).reshape(-1)
    result = pd.DataFrame(processed).T.reset_index()
    result.columns = result.columns.astype(str)
    result.rename(columns={'index': 'image_id'}, inplace=True)
    return result
# NOTE(review): orphaned fragment — `char_data`, `i` and `image` are not
# defined at module level here; this looks like a stray tail of save_as_npy
# left behind by an edit and will raise NameError if executed. Confirm and
# remove.
name = char_data['id'][i]
np.save(os.path.join(img_dir, name + '.npy'), image)


def col_to_numpy(idx):
    """Save row ``idx`` of the module-global ``df`` as a 128x128 .npy file.

    Column 0 supplies the file name stem; the remaining 128*128 columns are
    the flattened pixels.
    """
    img = np.array(df.iloc[idx][1:]).reshape(128, 128)
    np.save(os.path.join(img_dir, df.iloc[idx][0] + '.npy'),
            img,
            allow_pickle=True)


# Earlier processing variants, kept disabled:
# prqt = pd.read_parquet('data/train_image_data_{}.parquet'.format(0))
# df0 = Resize(prqt)
# prqt = pd.read_parquet('data/train_image_data_{}.parquet'.format(1))
# df1 = Resize(prqt)
# prqt = pd.read_parquet('data/train_image_data_{}.parquet'.format(2))
# df2 = Resize(prqt)
# prqt = pd.read_parquet('data/train_image_data_{}.parquet'.format(3))
# df3 = Resize(prqt)
# df = pd.concat([df0, df1, df2, df3], ignore_index=True)
# df.to_parquet('train_images_data.parquet', index=False)
# p_map(col_to_numpy, list(range(l)))
# p_map(Resize, ['data/train_image_data_{}.parquet'.format(i) for i in range(4)])
# Convert all four training parquet shards to per-image .npy files.
for i in T(range(4)):
    df = pd.read_parquet('data/train_image_data_{}.parquet'.format(i))
    # Resize(df)
    save_as_npy(df)
# x = np.load('data/128_numpy/Train_10.npy')
# print(df.head()
def train(transformer, epoch, num_fold):
    """Train the global ``model`` with K-fold CV and a triangular LR schedule.

    Args:
        transformer: image transform passed through to DRDataset.
        epoch: number of epochs per fold.
        num_fold: number of KFold splits.

    Side effects: updates the global model/optimizer in place, logs each
    epoch via ``logger``, and checkpoints to models/<model_dir>/ whenever
    both validation kappa improves and validation loss drops.
    """
    try:
        os.mkdir('models/' + model_dir)
    except:  # NOTE(review): bare except also hides non-EEXIST errors.
        pass
    kf = KFold(n_splits=num_fold, shuffle=True, random_state=42)
    df = pd.read_csv('../data/new_data/train.csv')
    for cv_num, (train_list, val_list) in enumerate(kf.split(df)):
        best_qk = 0
        best_loss = np.inf
        for e in T(range(epoch)):
            train_dataset = DRDataset('../data/new_data/train.csv',
                                      train_list, dim, transformer)
            train_data_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=batch_size,
                shuffle=True,
                num_workers=num_worker)
            model.train()
            running_loss = 0.0
            with T(total=len(train_data_loader), file=sys.stdout) as pbar:
                for count, (data, labels) in T(enumerate(train_data_loader),
                                               total=len(train_data_loader)):
                    labels = labels.view(-1, 1)
                    data = data.to(device, dtype=torch.float)
                    labels = labels.to(device, dtype=torch.float)
                    optimizer.zero_grad()
                    # Global iteration counter across folds and epochs.
                    # NOTE(review): uses the global `epochs` while the
                    # parameter is named `epoch` — confirm they agree.
                    it_num = cv_num * epochs * len(
                        train_data_loader) + e * len(
                            train_data_loader) + count + 1
                    # Verify this formula
                    lr = triangular_lr(it_num,
                                       4 * len(train_data_loader) * num_fold,
                                       3e-5, 3e-4, 0.15)
                    set_lr(optimizer, lr)
                    with torch.set_grad_enabled(True):
                        outputs = model(data)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()
                    running_loss += loss.item() * data.size(0)
                    epoch_loss = running_loss / len(train_data_loader.dataset)
                    pbar.set_description('Loss: {:.4f}'.format(
                        running_loss / ((count + 1) * batch_size)))
                    pbar.update(1)
            print('\nTraining Loss: {:.4f}'.format(epoch_loss))
            val_qk, val_loss = eval(val_list, transformer)
            logger(cv_num + 1, e + 1, get_lr(optimizer), epoch_loss, val_loss,
                   val_qk.data.cpu().numpy(), 'resnet101_dim_256_logger.csv')
            # Checkpoint only when BOTH metrics improve.
            if val_qk > best_qk and val_loss < best_loss:
                print(' -----------------------------')
                # NOTE(review): banner string was split by whitespace
                # mangling; a character (emoji?) may have been lost here.
                print('| New best model! |')
                print(' -----------------------------')
                best_qk = val_qk
                best_loss = val_loss
                torch.save(
                    {
                        'epoch': e,
                        'cv_num': cv_num,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': running_loss,
                        'kappa': best_qk
                    }, 'models/' + model_dir + '/' + model_dir + '_fold_' +
                    str(cv_num) + '.pth')
def evaluate(results, pred_thr):
    """Validate the global ``model`` on ``valid_loader`` (3-head Bengali task).

    Args:
        results: DataFrame filled with every misclassified grapheme root
            (ID, actual, predicted); also written to results.csv.
        pred_thr: DataFrame filled with per-image max-softmax confidences and
            predictions for all three heads.

    Returns:
        (total_recall, pred_thr): weighted macro recall (root counts double)
        and the populated confidence frame.

    Fixes vs. original: recall_score was called as (pred, lab) but sklearn's
    signature is (y_true, y_pred) — arguments are now in the correct order;
    softmax gets an explicit dim=1 (the deprecated implicit form resolves to
    dim=1 for 2-D input, so values are unchanged); removed the duplicated
    dead ``running_acc`` locals.
    """
    model.eval()
    total = 0.0
    running_loss = 0.0
    grapheme_root_out = 0.0
    vowel_diacritic_out = 0.0
    consonant_diacritic_out = 0.0
    pred1, pred2, pred3 = [], [], []
    pred1_val, pred2_val, pred3_val = [], [], []
    lab1, lab2, lab3 = [], [], []
    names = []
    with torch.no_grad():
        for idx, (name, inputs, labels1, labels2,
                  labels3) in T(enumerate(valid_loader),
                                total=len(valid_loader)):
            inputs = inputs.to(device)
            labels1 = labels1.to(device)
            labels2 = labels2.to(device)
            labels3 = labels3.to(device)
            total += len(inputs)
            outputs1, outputs2, outputs3 = model(inputs.float())
            names.extend(name)
            pred1.extend(torch.argmax(outputs1, dim=1).cpu().numpy())
            pred2.extend(torch.argmax(outputs2, dim=1).cpu().numpy())
            pred3.extend(torch.argmax(outputs3, dim=1).cpu().numpy())
            # Max softmax probability per sample = head confidence.
            pred1_val.extend(
                np.max(nn.functional.softmax(outputs1, dim=1).cpu().numpy(),
                       axis=1))
            pred2_val.extend(
                np.max(nn.functional.softmax(outputs2, dim=1).cpu().numpy(),
                       axis=1))
            pred3_val.extend(
                np.max(nn.functional.softmax(outputs3, dim=1).cpu().numpy(),
                       axis=1))
            lab1.extend(labels1.cpu().numpy())
            lab2.extend(labels2.cpu().numpy())
            lab3.extend(labels3.cpu().numpy())
            # Weighted multi-task loss; the grapheme root dominates.
            loss1 = 0.7 * criterion(outputs1, labels1)
            loss2 = 0.20 * criterion(outputs2, labels2)
            loss3 = 0.10 * criterion(outputs3, labels3)
            running_loss += loss1.item() + loss2.item() + loss3.item()
            grapheme_root_out += (outputs1.argmax(1) == labels1).float().mean()
            vowel_diacritic_out += (
                outputs2.argmax(1) == labels2).float().mean()
            consonant_diacritic_out += (
                outputs3.argmax(1) == labels3).float().mean()
    # BUGFIX: sklearn expects (y_true, y_pred); the original passed
    # predictions first, which computes macro precision instead.
    recall_graph = sklearn.metrics.recall_score(lab1, pred1, average='macro')
    recall_vowel = sklearn.metrics.recall_score(lab2, pred2, average='macro')
    recall_consonant = sklearn.metrics.recall_score(lab3, pred3,
                                                    average='macro')
    scores = [recall_graph, recall_vowel, recall_consonant]
    # Competition metric: root recall is weighted double.
    total_recall = np.average(scores, weights=[2, 1, 1])
    msg = 'Loss: {:.4f} \n Acc: \t Root {:.4f} \t Vowel {:.4f} \t Consonant {:.4f} \nRecall: \t Root {:.4f} \t Vowel {:.4f} \t Consonant {:.4f} Total {:.4f}\n'.format(
        running_loss / (len(valid_loader)),
        grapheme_root_out / (len(valid_loader)),
        vowel_diacritic_out / (len(valid_loader)),
        consonant_diacritic_out / (len(valid_loader)), recall_graph,
        recall_vowel, recall_consonant, total_recall)
    print(msg)
    # Collect every misclassified grapheme root for error analysis.
    l = 0
    for idx, (i, j) in enumerate(zip(lab1, pred1)):
        if i != j:
            results.loc[l, 'ID'] = names[idx]
            results.loc[l, 'Graph_Actual'] = idx_to_class(i)
            results.loc[l, 'Graph_Pred'] = idx_to_class(j)
            l += 1
    pred_thr['ID'] = names
    pred_thr['Graph_confidence'] = pred1_val
    pred_thr['Vowel_confidence'] = pred2_val
    pred_thr['Consonant_confidence'] = pred3_val
    pred_thr['Graph_pred'] = pred1
    pred_thr['Vowel_pred'] = pred2
    pred_thr['Consonant_pred'] = pred3
    results.to_csv('results.csv', index=False)
    # Per-class error counts for the root head.
    df = results.groupby('Graph_Actual').size().reset_index(
        name='Count').rename(columns={'Graph_Actual': 'Graph_Actual_value'})
    df.to_csv('results_count.csv', index=False)
    return total_recall, pred_thr
# Script-level setup for tfrecord extraction: resolve directories, collect
# the .tfrec files, and initialize the global metadata frames/counters that
# tfrec_extract mutates.
tfrec_dir = f'{data_dir}/tfrecords'
train_dir = f"{data_dir}/malignant"
test_dir = f"{data_dir}/test_768"
filelist = os.listdir(tfrec_dir)
filelist = [f for f in filelist if '.tfrec' in f]
# Fields expected inside each tfrecord — presumably matches the SIIM-ISIC
# melanoma schema; verify against the dataset.
attributes = [
    'patient_id', 'target', 'sex', 'width', 'height', 'age_approx',
    'anatom_site_general_challenge', 'diagnosis', 'image_name', 'image'
]
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
train = pd.DataFrame()
test = pd.DataFrame()
train_row = 0
test_row = 0
# Creating index files
for i in T(filelist):
    tfrec_file = os.path.join(tfrec_dir, i)
    index_file = os.path.join(tfrec_dir, i).replace('.tfrec', '.index')
    if not os.path.exists(index_file):
        # Shells out to the tfrecord package's indexing tool.
        os.system(
            f"python3 -m tfrecord.tools.tfrecord2idx {tfrec_file} {index_file}"
        )


# NOTE(review): this definition is a truncated duplicate of the full
# tfrec_extract defined earlier in the file — it ends abruptly after the
# 'train' branch. Looks like a stray partial paste; confirm and delete.
def tfrec_extract(filename):
    global train_row
    global test_row
    global train
    global test
    tfrecord_path = os.path.join(tfrec_dir, filename)
    index_path = tfrecord_path.replace('.tfrec', '.index')
    if 'train' in filename:
        savedir = train_dir
def evaluate(epoch, history):
    """Validate the 3-head model, log metrics to TensorBoard and ``history``.

    Args:
        epoch: current epoch index, used as the TensorBoard/history key.
        history: DataFrame of per-epoch metrics, persisted to csv each call.

    Returns:
        (mean validation loss, weighted macro recall).

    Fixes vs. original: recall_score was called as (pred, lab) but sklearn's
    signature is (y_true, y_pred) — arguments are now in the correct order;
    removed the duplicated dead ``running_acc`` locals.
    """
    model.eval()
    total = 0.0
    running_loss = 0.0
    grapheme_root_out = 0.0
    vowel_diacritic_out = 0.0
    consonant_diacritic_out = 0.0
    pred1, pred2, pred3 = [], [], []
    lab1, lab2, lab3 = [], [], []
    with torch.no_grad():
        for idx, (inputs, labels1, labels2,
                  labels3) in T(enumerate(valid_loader),
                                total=len(valid_loader)):
            inputs = inputs.to(device)
            labels1 = labels1.to(device)
            labels2 = labels2.to(device)
            labels3 = labels3.to(device)
            total += len(inputs)
            outputs1, outputs2, outputs3 = model(inputs.float())
            pred1.extend(torch.argmax(outputs1, dim=1).cpu().numpy())
            pred2.extend(torch.argmax(outputs2, dim=1).cpu().numpy())
            pred3.extend(torch.argmax(outputs3, dim=1).cpu().numpy())
            lab1.extend(labels1.cpu().numpy())
            lab2.extend(labels2.cpu().numpy())
            lab3.extend(labels3.cpu().numpy())
            # Weighted multi-task loss; the grapheme root dominates.
            loss1 = 0.70 * criterion(outputs1, labels1)
            loss2 = 0.20 * criterion(outputs2, labels2)
            loss3 = 0.10 * criterion(outputs3, labels3)
            running_loss += loss1.item() + loss2.item() + loss3.item()
            grapheme_root_out += (outputs1.argmax(1) == labels1).float().mean()
            vowel_diacritic_out += (
                outputs2.argmax(1) == labels2).float().mean()
            consonant_diacritic_out += (
                outputs3.argmax(1) == labels3).float().mean()
    # BUGFIX: sklearn expects (y_true, y_pred); the original passed
    # predictions first, which computes macro precision instead.
    recall_graph = sklearn.metrics.recall_score(lab1, pred1, average='macro')
    recall_vowel = sklearn.metrics.recall_score(lab2, pred2, average='macro')
    recall_consonant = sklearn.metrics.recall_score(lab3, pred3,
                                                    average='macro')
    scores = [recall_graph, recall_vowel, recall_consonant]
    # Competition metric: root recall is weighted double.
    total_recall = np.average(scores, weights=[2, 1, 1])
    writer.add_scalar('Loss/val', running_loss / (len(valid_loader)), epoch)
    writer.add_scalar('Val Accuracy/Root',
                      grapheme_root_out / (len(valid_loader)), epoch)
    writer.add_scalar('Val Accuracy/Vowel',
                      vowel_diacritic_out / (len(valid_loader)), epoch)
    writer.add_scalar('Val Accuracy/Consonant',
                      consonant_diacritic_out / (len(valid_loader)), epoch)
    writer.add_scalar('Val Recall/Root', recall_graph, epoch)
    writer.add_scalar('Val Recall/Vowel', recall_vowel, epoch)
    writer.add_scalar('Val Recall/Consonant', recall_consonant, epoch)
    writer.add_scalar('Val Recall/Total', total_recall, epoch)
    msg = 'Loss: {:.4f} \n Acc: \t Root {:.4f} \t Vowel {:.4f} \t Consonant {:.4f} \nRecall: \t Root {:.4f} \t Vowel {:.4f} \t Consonant {:.4f} Total {:.4f}\n'.format(
        running_loss / (len(valid_loader)),
        grapheme_root_out / (len(valid_loader)),
        vowel_diacritic_out / (len(valid_loader)),
        consonant_diacritic_out / (len(valid_loader)), recall_graph,
        recall_vowel, recall_consonant, total_recall)
    print(msg)
    # NOTE(review): scheduler steps on the SUMMED loss, not the mean —
    # consistent only while the loader length is constant; confirm intended.
    lr_reduce_scheduler.step(running_loss)
    history.loc[epoch, 'valid_loss'] = running_loss / (len(valid_loader))
    history.loc[epoch, 'valid_grapheme_recall'] = recall_graph
    history.loc[epoch, 'valid_vowel_recall'] = recall_vowel
    history.loc[epoch, 'valid_conso_recall'] = recall_consonant
    history.loc[epoch, 'valid_recall'] = total_recall
    history.to_csv(os.path.join(history_dir,
                                'history_{}.csv'.format(model_name)),
                   index=False)
    return running_loss / (len(valid_loader)), total_recall
# Script-level setup: assign a stratified fold id to every training row,
# pick the train/val split for the configured `fold`, build the model and
# dataloaders.
for i, (_, test_index) in enumerate(mskf.split(X, y)):
    train_df.iloc[test_index, -1] = i
train_df['fold'] = train_df['fold'].astype('int')
idxs = [i for i in range(len(train_df))]
train_idx = []
val_idx = []
model = seresnext(nunique, pretrained_model).to(device)
# model = Dnet(nunique).to(device)
# model = EfficientNetWrapper(pretrained_model).to(device)
# print(summary(model, (3, 128, 128)))
# NOTE(review): `writer` is used here but only (re)assigned at the bottom of
# this section — a SummaryWriter must already exist from earlier in the file
# for this call to work; confirm.
writer.add_graph(model,
                 torch.FloatTensor(np.random.randn(1, 1, 137, 236)).cuda())
# writer.close()
# For stratified split
for i in T(range(len(train_df))):
    if train_df.iloc[i]['fold'] == fold:
        val_idx.append(i)
    else:
        train_idx.append(i)
# train_idx = idxs[:int((n_fold-1)*len(idxs)/(n_fold))]
# train_idx = np.load('train_pseudo_idxs.npy')
# val_idx = idxs[int((n_fold-1)*len(idxs)/(n_fold)):]
train_ds = BanglaDataset(train_df, 'data/numpy_format', train_idx,
                         aug=train_aug)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_ds = BanglaDataset(train_df, 'data/numpy_format', val_idx, aug=None)
# NOTE(review): shuffle=True on the validation loader is unusual — harmless
# for aggregate metrics but breaks row/ID alignment; confirm intended.
valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)
writer = SummaryWriter(tb_dir)
## This function for train is copied from @hanjoonchoe