Example #1
def save_as_npy(df):
    # First column holds the image id, the remaining columns the flattened pixels.
    char_data = {'id': list(df.iloc[:, 0]), 'data': np.array(df.iloc[:, 1:])}
    for i in T(range(len(char_data['id']))):
        image = np.copy(char_data['data'][i]).reshape(HEIGHT, WIDTH)
        name = char_data['id'][i]
        np.save(os.path.join(img_dir, name + '.npy'), image)
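The snippet above relies on a few module-level names that are not shown (T, HEIGHT, WIDTH, img_dir). A minimal setup sketch, assuming T is an alias for tqdm and the 137x236 raw image size used by the Resize helpers later in this section; the output directory name is hypothetical:

import os
import numpy as np
from tqdm import tqdm as T  # progress-bar wrapper used as T(...) throughout

HEIGHT, WIDTH = 137, 236           # assumed raw image size (see Resize below)
img_dir = 'data/numpy_format'      # hypothetical output directory
os.makedirs(img_dir, exist_ok=True)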
def tfrec_extract(filename):
    global train_row
    global test_row
    global train
    global test
    tfrecord_path = os.path.join(tfrec_dir, filename)
    index_path = tfrecord_path.replace('.tfrec', '.index')
    
    if 'train' in filename:
        savedir = train_dir
    else:
        savedir = test_dir
    dataset = TFRecordDataset(tfrecord_path, index_path, transform=decode_image)
    loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)
    for data in T(loader):
        # print(len(loader))
        if 'train' in filename:
            train_row += 1
        else:
            test_row += 1
        img_name = data['image_name'].squeeze().data.cpu().numpy().copy()
        img_name = os.path.join(savedir, ''.join(map(chr, img_name)))
        img_name += '.jpg'
        image_file = data['image'].squeeze().data.cpu().numpy()
        cv2.imwrite(img_name, image_file)
        del data['image']
        del data['image_name']
        for k, v in data.items():
            if 'train' in filename:
                train.loc[train_row, 'image_name'] = img_name
                train.loc[train_row, k] = v.squeeze().data.cpu().numpy()
                train.loc[train_row, 'tfrec'] = filename.replace('.tfrec', '')
            else:
                test.loc[test_row, 'image_name'] = img_name
                test.loc[test_row, k] = v.squeeze().data.cpu().numpy()
                test.loc[test_row, 'tfrec'] = filename.replace('.tfrec', '')
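tfrec_extract passes a decode_image transform to TFRecordDataset, but the transform itself is not part of this snippet. A minimal sketch, assuming the 'image' feature holds raw JPEG bytes and every other feature should pass through unchanged (the exact feature layout is an assumption):

import cv2
import numpy as np

def decode_image(features):
    # Sketch only: decode the JPEG bytes stored under 'image' into an array;
    # the remaining features (labels, metadata, image_name) are left untouched.
    features['image'] = cv2.imdecode(
        np.frombuffer(features['image'], dtype=np.uint8), cv2.IMREAD_COLOR)
    return features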
def evaluate():
    model.eval()
    PREDS = np.zeros((len(test_df), 1))
    with torch.no_grad():
        for t in range(len(augs)):
            print('TTA {}'.format(t + 1))
            test_ds = SETIDataset(image_ids=test_df.image_name.values,
                                  meta_features=test_meta,
                                  dim=sz,
                                  transforms=augs[t])
            test_loader = DataLoader(test_ds,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=4)

            img_ids = []
            preds = []
            for idx, (img_id, inputs, meta) in T(enumerate(test_loader),
                                                 total=len(test_loader)):
                inputs = inputs.to(device)
                meta = meta.to(device)
                outputs = model(inputs.float(), meta)
                img_ids.extend(img_id)
                preds.extend(
                    torch.softmax(outputs, 1)[:, 1].detach().cpu().numpy())
            zippedList = list(zip(img_ids, preds))
            temp_df = pd.DataFrame(zippedList,
                                   columns=['image_name', f'target{t}'])
            temp_df.to_csv(f'submission_TTA{t}.csv', index=False)
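Each TTA pass above writes its own submission_TTA{t}.csv. If a single averaged submission is wanted, the per-pass files can be merged afterwards; a small sketch (file and column names follow the code above, the final output name is hypothetical, and the augs list is assumed to be in scope):

import pandas as pd

dfs = [pd.read_csv(f'submission_TTA{t}.csv') for t in range(len(augs))]
merged = dfs[0][['image_name']].copy()
# Average the per-TTA target columns into a single prediction column.
merged['target'] = sum(df[f'target{t}'] for t, df in enumerate(dfs)) / len(dfs)
merged.to_csv('submission.csv', index=False)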
Example #4
def evaluate():
    model.eval()
    PREDS = np.zeros((len(valid_df), 1))
    IMG_IDS = []
    LAB = []
    with torch.no_grad():
        for t in range(len(augs)):
            print('TTA {}'.format(t + 1))
            test_ds = MelanomaDataset(image_ids=valid_df.image_name.values,
                                      meta_features=valid_meta,
                                      dim=sz,
                                      transforms=augs[t])
            test_loader = DataLoader(test_ds,  # use the TTA dataset built above
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=4)

            img_ids = []
            preds = []
            lab = []

            for idx, (img_id, inputs, meta,
                      labels) in T(enumerate(test_loader),
                                   total=len(test_loader)):
                inputs = inputs.to(device)
                meta = meta.to(device)
                labels = labels.to(device)
                outputs = model(inputs.float(), meta)
                img_ids.extend(img_id)
                preds.extend(
                    torch.softmax(outputs, 1)[:, 1].detach().cpu().numpy())
                lab.extend(torch.argmax(labels, 1).cpu().numpy())
            # zippedList =  list(zip(img_ids, preds))
            print(np.array(preds).shape, np.array(lab).shape)
            score_diff_tta = np.abs(np.array(preds) - np.array(lab)).reshape(
                len(valid_loader.dataset), 1)
            # print(score_diff_tta.shape)
            # print(np.array(PREDS).shape, np.array(LAB).shape)
            zippedList_tta = list(
                zip(img_ids, lab, np.squeeze(preds),
                    np.squeeze(score_diff_tta)))
            temp_df = pd.DataFrame(
                zippedList_tta,
                columns=['image_name', 'label', 'predictions', 'difference'])
            temp_df.to_csv(f'submission_TTA{t}.csv', index=False)
            IMG_IDS = img_ids
            LAB = lab
            PREDS += np.array(preds).reshape(len(valid_loader.dataset), 1)
        PREDS /= len(augs)
        score_diff = np.abs(
            np.array(PREDS) -
            np.array(LAB).reshape(len(valid_loader.dataset), 1))
        # print(np.array(PREDS).shape, np.array(LAB).shape)
        zippedList = list(
            zip(IMG_IDS, LAB, np.squeeze(PREDS), np.squeeze(score_diff)))
        submission = pd.DataFrame(
            zippedList,
            columns=['image_name', 'label', 'prediction', 'difference'])
        submission = submission.sort_values(by=['difference'], ascending=False)
        submission.to_csv('val_report.csv', index=False)
def eval(val_list, transformer):
    running_loss = 0.0
    predictions = []
    actual_labels = []
    val_dataset = DRDataset('../data/new_data/train.csv', val_list, dim,
                            transformer)
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=batch_size // 2,
                                                  shuffle=True,
                                                  num_workers=num_worker)
    model.eval()
    for data, labels in T(val_data_loader):
        labels = labels.view(-1, 1)
        labels = labels.to(device, dtype=torch.float)
        data = data.to(device, dtype=torch.float)
        preds = model(data)
        loss = criterion(preds, labels).data.cpu().numpy()
        running_loss += loss.item() * data.size(0)
        predictions.extend(preds.data.cpu().numpy())
        actual_labels.extend(labels.data.cpu())
    epoch_loss = running_loss / len(val_data_loader.dataset)
    print('Validation Loss: {:.4f}'.format(epoch_loss))
    qk = quadratic_kappa(torch.Tensor(predictions),
                         torch.Tensor(actual_labels))
    print('Quadratic Kappa: {:.4f}'.format(qk))
    return qk, epoch_loss
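quadratic_kappa is called on the collected predictions and labels but is not defined in this excerpt. One common implementation uses sklearn's cohen_kappa_score with quadratic weights; a sketch, assuming the continuous model outputs are rounded to the nearest integer grade:

import torch
from sklearn.metrics import cohen_kappa_score

def quadratic_kappa(preds, targets):
    # Sketch: round continuous predictions to integer grades and compute the
    # quadratically weighted Cohen's kappa, returned as a tensor so that
    # .data.cpu().numpy() still works in the calling code.
    preds = torch.round(preds).view(-1).numpy()
    targets = targets.view(-1).numpy()
    return torch.tensor(cohen_kappa_score(targets, preds, weights='quadratic'))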
def load_prediction_to_top_3(preds, fnames, order_df):
    col = ['img_name']
    test_df = DataFrame(fnames, columns=col)
    test_df['label'] = ''
    predictions = array(preds).reshape(len(preds), 251)
    for i, pred in T(enumerate(predictions), total=len(predictions)):
        # Keep the indices of the three highest-scoring classes, space-separated.
        test_df.loc[i, 'label'] = ' '.join(
            str(int(c)) for c in argsort(pred)[::-1][:3])

    test_df = merge(order_df['img_name'], test_df)
    return test_df
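A hypothetical usage sketch with dummy data (two images, 251 class scores each); order_df stands in for the sample submission that fixes the row order:

import numpy as np
import pandas as pd

preds = [np.random.rand(251), np.random.rand(251)]
fnames = ['test_0001.jpg', 'test_0002.jpg']
order_df = pd.DataFrame({'img_name': fnames})

top3 = load_prediction_to_top_3(preds, fnames, order_df)
print(top3.head())  # 'label' holds the three highest-scoring class indices per image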
Example #7
def Resize(df, size=128):
    char_data = {'id': list(df.iloc[:, 0]), 'data': np.array(df.iloc[:, 1:])}
    for i in T(range(len(char_data['id']))):
        # image = cv2.resize(df.loc[df.index[i]].values.reshape(137,236),(size,size))
        # image0 = 255 - df.loc[df.index[i]].values.reshape(137,236).astype(np.uint8)
        image0 = 255 - np.copy(char_data['data'][i]).reshape(HEIGHT, WIDTH)
        name = char_data['id'][i]
        #normalize each image by its max val
        # print(image0.max())
        img = (image0 * (255.0 / image0.max())).astype(np.uint8)
        # print('Before crop: ', np.max(img))
        image = crop_resize(img)
        np.save(os.path.join(img_dir, name + '.npy'), image)
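The Resize helpers call crop_resize, which is not included here. In typical preprocessing for this data it crops to the bounding box of the ink, pads to a square and resizes; a simplified sketch under that assumption (the threshold and padding values are illustrative):

import cv2
import numpy as np

def crop_resize(img, size=128, pad=16, thresh=80):
    # Simplified sketch: crop to the bounding box of pixels above `thresh`,
    # pad the crop to a square canvas, then resize to size x size.
    ys, xs = np.where(img > thresh)
    if len(ys) == 0:                     # blank image: just resize
        return cv2.resize(img, (size, size))
    cropped = img[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
    h, w = cropped.shape
    side = max(h, w) + pad
    square = np.zeros((side, side), dtype=img.dtype)
    y0, x0 = (side - h) // 2, (side - w) // 2
    square[y0:y0 + h, x0:x0 + w] = cropped
    return cv2.resize(square, (size, size))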
def Resize(df, size=128):
    resized = {}
    df = df.set_index('image_id')
    for i in T(range(df.shape[0])):
        # image = cv2.resize(df.loc[df.index[i]].values.reshape(137,236),(size,size))
        image0 = 255 - df.loc[df.index[i]].values.reshape(137, 236).astype(
            np.uint8)
        #normalize each image by its max val
        img = (image0 * (255.0 / image0.max())).astype(np.uint8)
        # print(img.max())
        image = crop_resize(img)
        # print(image.max())
        resized[df.index[i]] = image.reshape(-1)
    resized = pd.DataFrame(resized).T.reset_index()
    resized.columns = resized.columns.astype(str)
    resized.rename(columns={'index': 'image_id'}, inplace=True)
    return resized
Example #9


def col_to_numpy(idx):
    img = np.array(df.iloc[idx][1:]).reshape(128, 128)
    np.save(os.path.join(img_dir, df.iloc[idx][0] + '.npy'),
            img,
            allow_pickle=True)


# prqt = pd.read_parquet('data/train_image_data_{}.parquet'.format(0))
# df0 = Resize(prqt)
# prqt = pd.read_parquet('data/train_image_data_{}.parquet'.format(1))
# df1 = Resize(prqt)
# prqt = pd.read_parquet('data/train_image_data_{}.parquet'.format(2))
# df2 = Resize(prqt)
# prqt = pd.read_parquet('data/train_image_data_{}.parquet'.format(3))
# df3 = Resize(prqt)
# df = pd.concat([df0, df1, df2, df3], ignore_index=True)

# df.to_parquet('train_images_data.parquet', index=False)

# p_map(col_to_numpy, list(range(l)))
# p_map(Resize, ['data/train_image_data_{}.parquet'.format(i) for i in range(4)])
for i in T(range(4)):
    df = pd.read_parquet('data/train_image_data_{}.parquet'.format(i))
    # Resize(df)
    save_as_npy(df)
# x = np.load('data/128_numpy/Train_10.npy')
# print(df.head())
def train(transformer, epoch, num_fold):
    # Create the model output directory if it does not already exist.
    os.makedirs('models/' + model_dir, exist_ok=True)

    kf = KFold(n_splits=num_fold, shuffle=True, random_state=42)
    df = pd.read_csv('../data/new_data/train.csv')
    for cv_num, (train_list, val_list) in enumerate(kf.split(df)):
        best_qk = 0
        best_loss = np.inf
        for e in T(range(epoch)):
            train_dataset = DRDataset('../data/new_data/train.csv', train_list,
                                      dim, transformer)
            train_data_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=batch_size,
                shuffle=True,
                num_workers=num_worker)
            model.train()
            running_loss = 0.0
            with T(total=len(train_data_loader), file=sys.stdout) as pbar:
                # pbar (above) already tracks progress, so a second bar is unnecessary.
                for count, (data, labels) in enumerate(train_data_loader):
                    labels = labels.view(-1, 1)
                    data = data.to(device, dtype=torch.float)
                    labels = labels.to(device, dtype=torch.float)
                    optimizer.zero_grad()
                    # Global iteration index across folds and epochs.
                    it_num = (cv_num * epoch + e) * len(
                        train_data_loader) + count + 1

                    # Verify this formula
                    lr = triangular_lr(it_num,
                                       4 * len(train_data_loader) * num_fold,
                                       3e-5, 3e-4, 0.15)
                    set_lr(optimizer, lr)

                    with torch.set_grad_enabled(True):
                        outputs = model(data)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()
                    running_loss += loss.item() * data.size(0)
                    epoch_loss = running_loss / len(train_data_loader.dataset)
                    pbar.set_description('Loss: {:.4f}'.format(
                        running_loss / ((count + 1) * batch_size)))
                    pbar.update(1)
                print('\nTraining Loss: {:.4f}'.format(epoch_loss))
                val_qk, val_loss = eval(val_list, transformer)
                logger(cv_num + 1, e + 1, get_lr(optimizer), epoch_loss,
                       val_loss,
                       val_qk.data.cpu().numpy(),
                       'resnet101_dim_256_logger.csv')
                if val_qk > best_qk and val_loss < best_loss:
                    print(' -----------------------------')
                    print('|       New best model!       |')
                    print(' -----------------------------')
                    best_qk = val_qk
                    best_loss = val_loss
                    torch.save(
                        {
                            'epoch': e,
                            'cv_num': cv_num,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': running_loss,
                            'kappa': best_qk
                        }, 'models/' + model_dir + '/' + model_dir + '_fold_' +
                        str(cv_num) + '.pth')
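train() also relies on triangular_lr, set_lr and get_lr, none of which appear in this excerpt. A sketch of plausible implementations: a standard triangular cyclical learning-rate schedule plus thin wrappers over the optimizer's parameter groups. The meaning of the fifth argument passed above (0.15) is not visible from this snippet, so it is treated as an unused placeholder here:

def triangular_lr(iteration, step_size, base_lr, max_lr, _unused=0.0):
    # Sketch of a triangular cyclical learning rate: the rate ramps linearly
    # from base_lr up to max_lr and back down over 2 * step_size iterations.
    cycle = (iteration // (2 * step_size)) + 1
    x = abs(iteration / step_size - 2 * cycle + 1)
    return base_lr + (max_lr - base_lr) * max(0.0, 1.0 - x)


def set_lr(optimizer, lr):
    for group in optimizer.param_groups:
        group['lr'] = lr


def get_lr(optimizer):
    return optimizer.param_groups[0]['lr']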
def evaluate(results, pred_thr):
    model.eval()
    total = 0.0
    running_loss = 0.0
    running_acc = 0.0
    grapheme_root_out = 0.0
    vowel_diacritic_out = 0.0
    consonant_diacritic_out = 0.0
    pred1 = []
    pred2 = []
    pred3 = []
    pred1_val = []
    pred2_val = []
    pred3_val = []
    lab1 = []
    lab2 = []
    lab3 = []
    names = []
    with torch.no_grad():
        for idx, (name, inputs, labels1, labels2,
                  labels3) in T(enumerate(valid_loader),
                                total=len(valid_loader)):
            inputs = inputs.to(device)
            labels1 = labels1.to(device)
            labels2 = labels2.to(device)
            labels3 = labels3.to(device)
            total += len(inputs)
            outputs1, outputs2, outputs3 = model(inputs.float())
            names.extend(name)
            pred1.extend(torch.argmax(outputs1, dim=1).cpu().numpy())
            pred2.extend(torch.argmax(outputs2, dim=1).cpu().numpy())
            pred3.extend(torch.argmax(outputs3, dim=1).cpu().numpy())

            pred1_val.extend(
                np.max(nn.functional.softmax(outputs1, dim=1).cpu().numpy(),
                       axis=1))
            pred2_val.extend(
                np.max(nn.functional.softmax(outputs2, dim=1).cpu().numpy(),
                       axis=1))
            pred3_val.extend(
                np.max(nn.functional.softmax(outputs3, dim=1).cpu().numpy(),
                       axis=1))

            lab1.extend(labels1.cpu().numpy())
            lab2.extend(labels2.cpu().numpy())
            lab3.extend(labels3.cpu().numpy())

            loss1 = 0.7 * criterion(outputs1, labels1)
            loss2 = 0.20 * criterion(outputs2, labels2)
            loss3 = 0.10 * criterion(outputs3, labels3)
            running_loss += loss1.item() + loss2.item() + loss3.item()

            grapheme_root_out += (outputs1.argmax(1) == labels1).float().mean()
            vowel_diacritic_out += (
                outputs2.argmax(1) == labels2).float().mean()
            consonant_diacritic_out += (
                outputs3.argmax(1) == labels3).float().mean()

    # sklearn's recall_score expects (y_true, y_pred).
    recall_graph = sklearn.metrics.recall_score(lab1, pred1, average='macro')
    recall_vowel = sklearn.metrics.recall_score(lab2, pred2, average='macro')
    recall_consonant = sklearn.metrics.recall_score(lab3,
                                                    pred3,
                                                    average='macro')
    scores = [recall_graph, recall_vowel, recall_consonant]
    total_recall = np.average(scores, weights=[2, 1, 1])
    msg = 'Loss: {:.4f} \n Acc:     \t Root {:.4f}     \t Vowel {:.4f} \t Consonant {:.4f} \nRecall:  \t Root {:.4f}     \t Vowel {:.4f} \t Consonant {:.4f} Total {:.4f}\n'.format(
        running_loss / (len(valid_loader)),
        grapheme_root_out / (len(valid_loader)),
        vowel_diacritic_out / (len(valid_loader)),
        consonant_diacritic_out / (len(valid_loader)), recall_graph,
        recall_vowel, recall_consonant, total_recall)
    print(msg)
    row = 0
    for idx, (i, j) in enumerate(zip(lab1, pred1)):
        if i != j:
            results.loc[row, 'ID'] = names[idx]
            results.loc[row, 'Graph_Actual'] = idx_to_class(i)
            results.loc[row, 'Graph_Pred'] = idx_to_class(j)
            row += 1
    pred_thr['ID'] = names
    pred_thr['Graph_confidence'] = pred1_val
    pred_thr['Vowel_confidence'] = pred2_val
    pred_thr['Consonant_confidence'] = pred3_val

    pred_thr['Graph_pred'] = pred1
    pred_thr['Vowel_pred'] = pred2
    pred_thr['Consonant_pred'] = pred3

    results.to_csv('results.csv', index=False)
    df = results.groupby('Graph_Actual').size().reset_index(
        name='Count').rename(columns={'Graph_Actual': 'Graph_Actual_value'})
    df.to_csv('results_count.csv', index=False)

    return total_recall, pred_thr
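idx_to_class maps a grapheme-root index back to a readable label for the error report but is not defined above. A sketch, assuming the mapping is built from the competition's class_map.csv (path and column names are assumptions):

import pandas as pd

# Hypothetical mapping from grapheme-root index to component string.
_class_map = pd.read_csv('data/class_map.csv')
_root_map = _class_map[_class_map['component_type'] == 'grapheme_root'].set_index(
    'label')['component'].to_dict()


def idx_to_class(idx):
    # Fall back to the raw index if it is missing from the map.
    return _root_map.get(int(idx), int(idx))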
tfrec_dir = f'{data_dir}/tfrecords'
train_dir = f"{data_dir}/malignant"
test_dir = f"{data_dir}/test_768"
filelist = os.listdir(tfrec_dir)
filelist = [f for f in filelist if '.tfrec' in f]
attributes = ['patient_id', 'target', 'sex', 'width', 'height', 'age_approx', 'anatom_site_general_challenge', 'diagnosis', 'image_name', 'image']
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
train = pd.DataFrame()
test = pd.DataFrame()

train_row = 0
test_row = 0

# Creating index files
for fname in T(filelist):
    tfrec_file = os.path.join(tfrec_dir, fname)
    index_file = tfrec_file.replace('.tfrec', '.index')
    if not os.path.exists(index_file):
        os.system(f"python3 -m tfrecord.tools.tfrecord2idx {tfrec_file} {index_file}")

def evaluate(epoch, history):
    model.eval()
    total = 0.0
    running_loss = 0.0
    running_acc = 0.0
    grapheme_root_out = 0.0
    vowel_diacritic_out = 0.0
    consonant_diacritic_out = 0.0
    pred1 = []
    pred2 = []
    pred3 = []
    lab1 = []
    lab2 = []
    lab3 = []
    with torch.no_grad():
        for idx, (inputs, labels1, labels2,
                  labels3) in T(enumerate(valid_loader),
                                total=len(valid_loader)):
            inputs = inputs.to(device)
            labels1 = labels1.to(device)
            labels2 = labels2.to(device)
            labels3 = labels3.to(device)
            total += len(inputs)
            outputs1, outputs2, outputs3 = model(inputs.float())
            pred1.extend(torch.argmax(outputs1, dim=1).cpu().numpy())
            pred2.extend(torch.argmax(outputs2, dim=1).cpu().numpy())
            pred3.extend(torch.argmax(outputs3, dim=1).cpu().numpy())

            lab1.extend(labels1.cpu().numpy())
            lab2.extend(labels2.cpu().numpy())
            lab3.extend(labels3.cpu().numpy())

            loss1 = 0.70 * criterion(outputs1, labels1)
            loss2 = 0.20 * criterion(outputs2, labels2)
            loss3 = 0.10 * criterion(outputs3, labels3)
            running_loss += loss1.item() + loss2.item() + loss3.item()

            grapheme_root_out += (outputs1.argmax(1) == labels1).float().mean()
            vowel_diacritic_out += (
                outputs2.argmax(1) == labels2).float().mean()
            consonant_diacritic_out += (
                outputs3.argmax(1) == labels3).float().mean()

    # sklearn's recall_score expects (y_true, y_pred).
    recall_graph = sklearn.metrics.recall_score(lab1, pred1, average='macro')
    recall_vowel = sklearn.metrics.recall_score(lab2, pred2, average='macro')
    recall_consonant = sklearn.metrics.recall_score(lab3,
                                                    pred3,
                                                    average='macro')
    scores = [recall_graph, recall_vowel, recall_consonant]
    total_recall = np.average(scores, weights=[2, 1, 1])
    writer.add_scalar('Loss/val', running_loss / len(valid_loader), epoch)
    writer.add_scalar('Val Accuracy/Root',
                      grapheme_root_out / len(valid_loader), epoch)
    writer.add_scalar('Val Accuracy/Vowel',
                      vowel_diacritic_out / len(valid_loader), epoch)
    writer.add_scalar('Val Accuracy/Consonant',
                      consonant_diacritic_out / len(valid_loader), epoch)

    writer.add_scalar('Val Recall/Root', recall_graph, epoch)
    writer.add_scalar('Val Recall/Vowel', recall_vowel, epoch)
    writer.add_scalar('Val Recall/Consonant', recall_consonant, epoch)
    writer.add_scalar('Val Recall/Total', total_recall, epoch)

    msg = 'Loss: {:.4f} \n Acc:     \t Root {:.4f}     \t Vowel {:.4f} \t Consonant {:.4f} \nRecall:  \t Root {:.4f}     \t Vowel {:.4f} \t Consonant {:.4f} Total {:.4f}\n'.format(
        running_loss / len(valid_loader),
        grapheme_root_out / len(valid_loader),
        vowel_diacritic_out / len(valid_loader),
        consonant_diacritic_out / len(valid_loader), recall_graph,
        recall_vowel, recall_consonant, total_recall)
    print(msg)
    lr_reduce_scheduler.step(running_loss)
    history.loc[epoch, 'valid_loss'] = running_loss / len(valid_loader)
    history.loc[epoch, 'valid_grapheme_recall'] = recall_graph
    history.loc[epoch, 'valid_vowel_recall'] = recall_vowel
    history.loc[epoch, 'valid_conso_recall'] = recall_consonant
    history.loc[epoch, 'valid_recall'] = total_recall
    history.to_csv(os.path.join(history_dir,
                                'history_{}.csv'.format(model_name)),
                   index=False)
    return running_loss / len(valid_loader), total_recall
for i, (_, test_index) in enumerate(mskf.split(X, y)):
    train_df.iloc[test_index, -1] = i
    
train_df['fold'] = train_df['fold'].astype('int')
idxs = [i for i in range(len(train_df))]
train_idx = []
val_idx = []
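The fold assignment above uses an mskf splitter together with X and y that are created outside this excerpt. A sketch of the assumed setup, using MultilabelStratifiedKFold from the iterative-stratification package to stratify over the three Bengali targets (the exact call and column names are assumptions; n_fold matches the commented code below):

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Hypothetical setup for the splitter used in the fold-assignment loop above.
X = train_df['image_id'].values
y = train_df[['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']].values
train_df['fold'] = -1  # last column, written via iloc[test_index, -1]
mskf = MultilabelStratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)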
model = seresnext(nunique, pretrained_model).to(device)
# model = Dnet(nunique).to(device)
# model = EfficientNetWrapper(pretrained_model).to(device)
# print(summary(model, (3, 128, 128)))
writer.add_graph(model, torch.FloatTensor(np.random.randn(1, 1, 137, 236)).cuda())
# writer.close()

# For stratified split
for i in T(range(len(train_df))):
    if train_df.iloc[i]['fold'] == fold:
        val_idx.append(i)
    else:
        train_idx.append(i)

# train_idx = idxs[:int((n_fold-1)*len(idxs)/(n_fold))]
# train_idx = np.load('train_pseudo_idxs.npy')
# val_idx = idxs[int((n_fold-1)*len(idxs)/(n_fold)):]

train_ds = BanglaDataset(train_df, 'data/numpy_format', train_idx, aug=train_aug)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

valid_ds = BanglaDataset(train_df, 'data/numpy_format', val_idx, aug=None)
valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

writer = SummaryWriter(tb_dir)
## This function for train is copied from @hanjoonchoe