Example #1
 def sequentialSampler(self, batch_size):
     return DataLoader(self,
                       sampler=SequentialSampler(self),
                       batch_size=batch_size,
                       num_workers=8)
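
A minimal, self-contained sketch (my illustration, not part of the snippet above) of what such a loader yields: SequentialSampler walks indices 0, 1, 2, ..., so batches come out in dataset order. The num_workers=8 from the snippet is omitted here since it does not affect ordering.

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

data = TensorDataset(torch.arange(10))
loader = DataLoader(data,
                    sampler=SequentialSampler(data),
                    batch_size=4)

for (batch,) in loader:
    print(batch)  # tensor([0, 1, 2, 3]), tensor([4, 5, 6, 7]), tensor([8, 9])
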
Example #2
 def __iter__(self):
     self.sampler = SequentialSampler(range(self.size))
     self.iter_sampler = iter(self.sampler)
     return self
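
The snippet above only shows __iter__; a matching __next__ presumably pulls indices from iter_sampler. A stripped-down, hypothetical version of the whole pattern (class name and attributes are mine, not the original project's):

from torch.utils.data import SequentialSampler

class OrderedSource:
    def __init__(self, items):
        self.items = items
        self.size = len(items)

    def __iter__(self):
        self.sampler = SequentialSampler(range(self.size))
        self.iter_sampler = iter(self.sampler)
        return self

    def __next__(self):
        idx = next(self.iter_sampler)  # raises StopIteration when exhausted
        return self.items[idx]

print(list(OrderedSource(['a', 'b', 'c'])))  # ['a', 'b', 'c']
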
Example #3
def main(args):
    #with torch.cuda.device(args.gpu):
    layers_map = {
        'relu4_2': '22',
        'relu2_2': '8',
        'relu3_2': '13',
        'relu1_2': '4'
    }

    vis = visdom.Visdom(port=args.display_port)

    loss_graph = {
        "g": [],
        "gd": [],
        "gf": [],
        "gpl": [],
        "gpab": [],
        "gs": [],
        "d": [],
        "gdl": [],
        "dl": [],
    }

    # For RGB, the change is to feed 3 channels to D instead of just 1, and 3 channels to VGG.
    # The pixel losses can stay separate (L vs. ab) for now; assume the user uses the same weights.
    transforms = get_transforms(args)

    if args.color_space == 'rgb':
        args.pixel_weight_ab = args.pixel_weight_rgb
        args.pixel_weight_l = args.pixel_weight_rgb

    rgbify = custom_transforms.toRGB()

    train_dataset = ImageFolder('train', args.data_path, transforms)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)

    val_dataset = ImageFolder('val', args.data_path, transforms)
    indices = torch.randperm(len(val_dataset))
    val_display_size = args.batch_size
    val_display_sampler = SequentialSampler(indices[:val_display_size])
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=val_display_size,
                            sampler=val_display_sampler)
    # renormalize = transforms.Normalize(mean=[+0.5+0.485, +0.5+0.456, +0.5+0.406], std=[0.229, 0.224, 0.225])

    feat_model = models.vgg19(pretrained=True)
    netG, netD, netD_local = get_models(args)

    criterion_gan, criterion_pixel_l, criterion_pixel_ab, criterion_style, criterion_feat, criterion_texturegan = get_criterions(
        args)

    real_label = 1
    fake_label = 0

    optimizerD = optim.Adam(netD.parameters(),
                            lr=args.learning_rate_D,
                            betas=(0.5, 0.999))
    optimizerG = optim.Adam(netG.parameters(),
                            lr=args.learning_rate,
                            betas=(0.5, 0.999))
    optimizerD_local = optim.Adam(netD_local.parameters(),
                                  lr=args.learning_rate_D_local,
                                  betas=(0.5, 0.999))

    with torch.cuda.device(args.gpu):
        netG.cuda()
        netD.cuda()
        netD_local.cuda()
        feat_model.cuda()
        criterion_gan.cuda()
        criterion_pixel_l.cuda()
        criterion_pixel_ab.cuda()
        criterion_feat.cuda()
        criterion_texturegan.cuda()

        input_stack = torch.FloatTensor().cuda()
        target_img = torch.FloatTensor().cuda()
        target_texture = torch.FloatTensor().cuda()
        segment = torch.FloatTensor().cuda()
        label = torch.FloatTensor(args.batch_size).cuda()
        label_local = torch.FloatTensor(args.batch_size).cuda()
        extract_content = FeatureExtractor(feat_model.features,
                                           [layers_map[args.content_layers]])
        extract_style = FeatureExtractor(
            feat_model.features,
            [layers_map[x.strip()] for x in args.style_layers.split(',')])

        model = {
            "netG": netG,
            "netD": netD,
            "netD_local": netD_local,
            "criterion_gan": criterion_gan,
            "criterion_pixel_l": criterion_pixel_l,
            "criterion_pixel_ab": criterion_pixel_ab,
            "criterion_feat": criterion_feat,
            "criterion_style": criterion_style,
            "criterion_texturegan": criterion_texturegan,
            "real_label": real_label,
            "fake_label": fake_label,
            "optimizerD": optimizerD,
            "optimizerD_local": optimizerD_local,
            "optimizerG": optimizerG
        }

        for epoch in range(args.load_epoch, args.num_epoch):
            train(model, train_loader, val_loader, input_stack, target_img,
                  target_texture, segment, label, label_local, extract_content,
                  extract_style, loss_graph, vis, epoch, args)
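
One detail from the snippet above worth spelling out: SequentialSampler(indices[:val_display_size]) only uses the length of its argument, so the random permutation has no effect and val_loader simply shows the first val_display_size validation images in order. A tiny demonstration (mine, not from the repository); if a random subset was intended, SubsetRandomSampler(indices[:val_display_size]) would be the usual choice.

import torch
from torch.utils.data import SequentialSampler

indices = torch.randperm(100)
sampler = SequentialSampler(indices[:8])
print(list(sampler))  # [0, 1, 2, 3, 4, 5, 6, 7], regardless of the permutation
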
Example #4
def get_loader(dataset,
               dataset_root,
               split,
               transform,
               batch_size,
               shuffle,
               num_workers,
               include_eos,
               drop_last=False,
               shuffle_labels=False,
               seed=1234,
               checkpoint=None):

    # reads the file with ids to use for this split
    perm_file = os.path.join('../data/splits/', dataset, split + '.txt')
    with open(perm_file, 'r') as f:
        perm = np.array([int(line.rstrip('\n')) for line in f])

    if dataset == 'coco':
        if split == 'train' or split == 'val':
            annFile = os.path.join(dataset_root, 'annotations',
                                   'instances_train2014.json')
            impath = os.path.join(dataset_root, 'train2014')
        else:
            annFile = os.path.join(dataset_root, 'annotations',
                                   'instances_val2014.json')
            impath = os.path.join(dataset_root, 'val2014')

        dataset = COCO(root=impath,
                       annFile=annFile,
                       transform=transform,
                       shuffle=shuffle_labels,
                       perm=perm,
                       include_eos=include_eos)

    elif dataset == 'voc':
        dataset = VOC(root=dataset_root,
                      year='2007',
                      image_set=split,
                      download=False,
                      transform=transform,
                      shuffle=shuffle_labels,
                      perm=perm,
                      include_eos=include_eos)

    elif dataset == 'nuswide':
        dataset = NUSWIDE(dataset_root,
                          split,
                          transform=transform,
                          shuffle=shuffle_labels,
                          perm=perm,
                          include_eos=include_eos)

    elif dataset == 'ade20k':
        dataset = ADE20K(dataset_root,
                         split,
                         transform=transform,
                         shuffle=shuffle_labels,
                         perm=perm,
                         include_eos=include_eos)

    elif dataset == 'recipe1m':
        dataset = Recipe1M(dataset_root,
                           split,
                           maxnumims=5,
                           shuffle=shuffle_labels,
                           transform=transform,
                           use_lmdb=False,
                           suff='final_',
                           perm=perm,
                           include_eos=include_eos)

    def worker_init_fn(worker_id):
        np.random.seed(seed)

    if shuffle:
        # for training
        sampler = RandomSamplerWithState(dataset, batch_size, seed)
        if checkpoint is not None:
            sampler.set_state(checkpoint['args'].current_epoch,
                              checkpoint['current_step'])
    else:
        # for validation and test
        sampler = SequentialSampler(dataset)

    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              drop_last=drop_last,
                                              pin_memory=True,
                                              collate_fn=collate_fn,
                                              worker_init_fn=worker_init_fn,
                                              sampler=sampler)

    return data_loader, dataset
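
A side note on the call above: a DataLoader accepts either shuffle=True or an explicit sampler, never both, which is why shuffle=False is passed even in the training case. A minimal sketch (mine) of the check PyTorch performs:

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

ds = TensorDataset(torch.arange(6))
try:
    DataLoader(ds, sampler=SequentialSampler(ds), shuffle=True)
except ValueError as err:
    print(err)  # sampler option is mutually exclusive with shuffle
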
Example #5
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_dataset),
    pin_memory=False,
    drop_last=True,
    num_workers=6,
    collate_fn=collate_fn,
)
val_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    num_workers=6,
    shuffle=False,
    sampler=SequentialSampler(validation_dataset),
    pin_memory=False,
    collate_fn=collate_fn,
)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Cuda is available: {}'.format(torch.cuda.is_available()))
cpu_device = torch.device("cpu")
num_classes = 2


model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True, pretrained_backbone=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)
Example #6
    def get_reactions(self, sents, products=None):
        """Extract reactions from sentences.

        For each product detected in a sentence, the role extractor predicts
        the remaining role spans. Returns one dict per sentence with its tokens
        and a list of reactions mapping roles to (text, start, end) spans.
        """
        if products is None:
            tokenized_sents, products = self.get_products(sents)

        assert len(products) == len(tokenized_sents)

        # create dataset
        # for each sent, create #{prod} instances
        examples = []
        num_rxns_per_sent = []
        for guid, (sent,
                   prod_labels) in enumerate(zip(tokenized_sents, products)):
            assert len(sent) == len(prod_labels)
            prods = get_entities(prod_labels)
            num_rxns_per_sent.append(len(prods))
            for i, (etype, ss, se) in enumerate(prods):
                assert etype == "Prod"
                labels = ["O"] * len(sent)
                labels[ss] = "B-Prod"
                labels[ss + 1:se + 1] = ["I-Prod"] * (se - ss)
                examples.append(
                    InputExample(guid=guid, words=sent, labels=labels))

        features = cre.data.role.convert_examples_to_features(
            examples,
            self.role_labels,
            self.role_max_seq_len,
            self.role_tokenizer,
            pad_token=self.role_tokenizer.pad_token_id,
            pad_token_label_id=self.pad_token_label_id)

        dataset = RxnDataset(features)
        data_loader = DataLoader(dataset,
                                 sampler=SequentialSampler(dataset),
                                 batch_size=self.batch_size,
                                 collate_fn=default_data_collator)

        all_preds = []
        for batch in data_loader:
            with torch.no_grad():
                for k, v in batch.items():
                    if isinstance(v, torch.Tensor):
                        batch[k] = v.to(self.device)
                outputs = self.role_extractor(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    prod_start_mask=batch['prod_start_mask'],
                    prod_end_mask=batch['prod_end_mask'],
                    prod_mask=batch['prod_mask'],
                    token_type_ids=batch['token_type_ids'])
                logits = outputs[0]

            preds = self.role_extractor.decode(
                logits, batch['decoder_mask'].bool().to(self.device))
            preds = [[self.role_labels[x] for x in seq] for seq in preds]
            all_preds += preds

        # align predictions with inputs
        example_id = 0
        results = []
        for guid, sent in enumerate(tokenized_sents):
            rxns = {"tokens": sent, "reactions": []}
            for k in range(num_rxns_per_sent[guid]):
                # merge preds with prod labels
                rxn_labels = []
                ex = examples[example_id]
                for j, label in enumerate(ex.labels):
                    if label in ["B-Prod", "I-Prod"]:
                        rxn_labels.append(label)
                    else:
                        rxn_labels.append(all_preds[example_id].pop(0))
                rxn = {}
                for role, ss, se in get_entities(rxn_labels):
                    if role == "Prod":
                        rxn["Product"] = (" ".join(sent[ss:se + 1]), ss, se)
                    else:
                        if role not in rxn:
                            rxn[role] = []  # e.g., multiple reactants
                        rxn[role].append((" ".join(sent[ss:se + 1]), ss, se))
                rxns["reactions"].append(rxn)
                example_id += 1

            results.append(rxns)

        return results
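
The bookkeeping above (example_id, all_preds[example_id].pop(0)) relies on the loader preserving example order. With a SequentialSampler, concatenating per-batch outputs keeps row i aligned with example i; a minimal sketch of that assumption with a stand-in model (not the project's role extractor):

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

examples = TensorDataset(torch.arange(10).float().unsqueeze(1))
loader = DataLoader(examples,
                    sampler=SequentialSampler(examples),
                    batch_size=4)

model = torch.nn.Linear(1, 1)  # stand-in for the real role extractor
model.eval()

all_preds = []
with torch.no_grad():
    for (batch,) in loader:
        all_preds.append(model(batch))

preds = torch.cat(all_preds)  # row i still corresponds to examples[i]
assert preds.shape[0] == len(examples)
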
Example #7
    def fit(self,
            train_dataset,
            validation_dataset,
            epochs,
            train_batch_size,
            validation_batch_size,
            results_base_dir_path,
            epoch_handler=None,
            validation_split=None,
            shuffle_dataset=True):
        dataset_size = None
        train_dataset_size = None
        validation_dataset_size = None
        if validation_split is not None:
            dataset_size = len(train_dataset)
            indices = list(range(dataset_size))
            split = int(numpy.floor(validation_split * dataset_size))
            train_indices, validation_indices = indices[
                split:], indices[:split]
            actual_train_dataset = train_dataset
            actual_validation_dataset = train_dataset
        else:
            train_dataset_size = len(train_dataset)
            validation_dataset_size = len(validation_dataset)
            train_indices = list(range(train_dataset_size))
            validation_indices = list(range(validation_dataset_size))
            actual_train_dataset = train_dataset
            actual_validation_dataset = validation_dataset

        if shuffle_dataset is True:
            train_sampler = SubsetRandomSampler(train_indices)
            validation_sampler = SubsetRandomSampler(validation_indices)
        else:
            train_sampler = SequentialSampler(train_indices)
            validation_sampler = SequentialSampler(validation_indices)

        train_data_loader = DataLoader(actual_train_dataset,
                                       batch_size=train_batch_size,
                                       sampler=train_sampler,
                                       drop_last=False,
                                       num_workers=0)
        validation_data_loader = DataLoader(actual_validation_dataset,
                                            batch_size=validation_batch_size,
                                            sampler=validation_sampler,
                                            drop_last=False,
                                            num_workers=0)

        epochs_text = epochs if epochs is not None else 'infinite'

        ModelTrainer._print_training_configuration('Epochs', epochs_text)
        ModelTrainer._print_training_configuration('Train Batch size',
                                                   train_batch_size)
        ModelTrainer._print_training_configuration('Validation Batch size',
                                                   validation_batch_size)
        ModelTrainer._print_training_configuration('Training dataset length',
                                                   len(train_indices))
        ModelTrainer._print_training_configuration(
            'Training batches per epoch',
            int(numpy.ceil(len(train_indices) / train_batch_size)))
        ModelTrainer._print_training_configuration('Validation dataset length',
                                                   len(validation_indices))
        ModelTrainer._print_training_configuration(
            'Validation batches per epoch',
            int(numpy.ceil(len(validation_indices) / validation_batch_size)))

        results_dir_path = os.path.normpath(
            os.path.join(results_base_dir_path,
                         datetime.now().strftime('%Y-%m-%d-%H-%M-%S')))
        model_file_path = os.path.normpath(
            os.path.join(results_dir_path, 'model.pt'))
        results_file_path = os.path.normpath(
            os.path.join(results_dir_path, 'results.npy'))
        model_architecture_file_path = os.path.normpath(
            os.path.join(results_dir_path, 'model_arch.txt'))
        loss_functions_file_path = os.path.normpath(
            os.path.join(results_dir_path, 'loss_functions.txt'))
        optimizer_file_path = os.path.normpath(
            os.path.join(results_dir_path, 'optimizer.txt'))
        trainer_data_file_path = os.path.normpath(
            os.path.join(results_dir_path, 'trainer_data.txt'))
        Path(results_dir_path).mkdir(parents=True, exist_ok=True)

        with open(model_architecture_file_path, "w") as text_file:
            text_file.write(str(self._model))

        with open(loss_functions_file_path, "w") as text_file:
            for loss_function in self._loss_functions:
                text_file.write(str(loss_function))
                text_file.write('\n')

        with open(optimizer_file_path, "w") as text_file:
            text_file.write(str(self._optimizer))

        with open(trainer_data_file_path, "w") as text_file:
            text_file.write(f'train_batch_size: {train_batch_size}\n')
            text_file.write(
                f'validation_batch_size: {validation_batch_size}\n')
            text_file.write(f'epochs: {epochs_text}\n')
            text_file.write(f'results_dir_path: {results_dir_path}\n')
            if validation_split is not None:
                text_file.write(f'validation_split: {validation_split}\n')
                text_file.write(f'dataset_size: {dataset_size}\n')
            else:
                text_file.write(f'train_dataset_size: {train_dataset_size}\n')
                text_file.write(
                    f'validation_dataset_size: {validation_dataset_size}\n')

        print(f' - Start Training:')
        results = None
        best_validation_average_loss = None
        train_loss_array = numpy.array([])
        validation_loss_array = numpy.array([])
        for epoch_index in itertools.count():
            print(f'    - Training Epoch #{epoch_index+1}:')
            train_loss = self._train_epoch(epoch_index=epoch_index,
                                           data_loader=train_data_loader)
            train_loss_array = numpy.append(train_loss_array,
                                            [numpy.mean(train_loss)])
            print(f'    - Validation Epoch #{epoch_index+1}:')
            validation_loss = self._validation_epoch(
                epoch_index=epoch_index, data_loader=validation_data_loader)
            validation_loss_array = numpy.append(validation_loss_array,
                                                 [numpy.mean(validation_loss)])

            if best_validation_average_loss is None:
                torch.save(self._model.state_dict(), model_file_path)
                best_validation_average_loss = numpy.mean(validation_loss)
            else:
                validation_average_loss = numpy.mean(validation_loss)
                if validation_average_loss < best_validation_average_loss:
                    torch.save(self._model.state_dict(), model_file_path)
                    best_validation_average_loss = validation_average_loss

            latest_model_path = os.path.normpath(
                os.path.join(results_dir_path, f'model_{epoch_index}.pt'))
            torch.save(self._model.state_dict(), latest_model_path)

            if epoch_handler is not None:
                epoch_handler(epoch_index)

            results = {
                'train_loss_array': train_loss_array,
                'validation_loss_array': validation_loss_array,
                'epochs': epochs_text,
                'train_batch_size': train_batch_size,
                'validation_batch_size': validation_batch_size,
                'model_file_path': model_file_path,
                'results_file_path': results_file_path
            }

            numpy.save(file=results_file_path, arr=results, allow_pickle=True)

            if (epochs is not None) and (epoch_index + 1 == epochs):
                break

        return results
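
The validation_split branch above carves two disjoint index lists out of one dataset and feeds the same dataset to both loaders through different samplers. A minimal sketch of that pattern under the shuffled setting (my own reduction, not the trainer itself):

import numpy
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler, TensorDataset

dataset = TensorDataset(torch.arange(20).float())
validation_split = 0.25

indices = list(range(len(dataset)))
split = int(numpy.floor(validation_split * len(dataset)))
train_indices, validation_indices = indices[split:], indices[:split]

train_loader = DataLoader(dataset, batch_size=4,
                          sampler=SubsetRandomSampler(train_indices))
validation_loader = DataLoader(dataset, batch_size=4,
                               sampler=SubsetRandomSampler(validation_indices))

print(len(train_loader), len(validation_loader))  # 4 2  (ceil(15/4), ceil(5/4))
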
Example #8
    transform=transform,
)

num_train = len(train_dataset)
indices = list(range(num_train))

for i in range(100):
    np.random.shuffle(indices)

split1 = int(np.floor(0.14 * num_train))  # first 14% of the shuffled indices -> test split
split2 = int(np.floor(0.23 * num_train))  # next 9% (14%-23%) -> validation split

train_indices, valid_indices, test_indices = indices[split2:], indices[
    split1:split2], indices[:split1]
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SequentialSampler(valid_indices)
test_sampler = SequentialSampler(test_indices)

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           sampler=train_sampler,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           pin_memory=True)
test_loader = torch.utils.data.DataLoader(train_dataset,
                                          sampler=val_sampler,
                                          batch_size=args.test_batch_size,
                                          shuffle=False,
                                          pin_memory=True)

test_loader2 = torch.utils.data.DataLoader(train_dataset,
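
The snippet above is truncated, but one subtlety is worth noting: SequentialSampler(valid_indices) iterates positions 0..len(valid_indices)-1, not the values stored in valid_indices, so the val/test loaders end up reading the first items of train_dataset rather than the shuffled subsets. If an ordered pass over a specific subset is wanted, torch.utils.data.Subset keeps the subset semantics. A small demonstration (mine):

import torch
from torch.utils.data import DataLoader, SequentialSampler, Subset, TensorDataset

ds = TensorDataset(torch.arange(10))
valid_indices = [7, 8, 9]

print(list(SequentialSampler(valid_indices)))  # [0, 1, 2] -- positions, not values

ordered_val = DataLoader(Subset(ds, valid_indices), batch_size=3, shuffle=False)
print(next(iter(ordered_val)))  # [tensor([7, 8, 9])]
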
Example #9
def test_auto_dataloader_warning(distributed_context_single_node_gloo):
    with pytest.warns(UserWarning, match=r"Found batch_sampler in provided kwargs"):
        auto_dataloader(
            DummyDS(), batch_sampler=BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False)
        )
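
The warning being tested exists presumably because auto_dataloader cannot adjust a user-provided batch_sampler for the distributed setting. For reference, a plain-PyTorch sketch (mine) of what that kwarg does: batching is delegated entirely to the batch sampler, and batch_size, shuffle, sampler and drop_last must stay at their defaults.

import torch
from torch.utils.data import BatchSampler, DataLoader, SequentialSampler, TensorDataset

ds = TensorDataset(torch.arange(10))
batch_sampler = BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False)

print(list(batch_sampler))  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

loader = DataLoader(ds, batch_sampler=batch_sampler)
for (batch,) in loader:
    print(batch)  # tensor([0, 1, 2]) ... tensor([9])
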
Example #10
def update_stage1_oof_preds(df, cv_df):
    
    res_file_name = STAGE1_CFGS_TAG+"-train.csv"    
    
    new_feats = get_stage1_columns()
    for f in new_feats:
        df[f] = 0
    
    if os.path.isfile(res_file_name):
        df = pd.read_csv(res_file_name)
        print('img acc:', ((df[new_feats[0]]>0)==df[CFG['image_target_cols'][0]]).mean())
        return df
    
    
    for fold, (train_fold, valid_fold) in enumerate(zip(CFG['train_folds'], CFG['valid_folds'])):
        if fold < 0:
            continue
            
        valid_patients = cv_df.loc[cv_df.fold.isin(valid_fold), 'StudyInstanceUID'].unique()
        filt = df.StudyInstanceUID.isin(valid_patients)
        valid_ = df.loc[filt,:].reset_index(drop=True)

        image_preds_all_list = []
        for cfg in STAGE1_CFGS:
            valid_ds = cfg['dataset_constructor'](valid_, 0.0, CFG['train_img_path'],  image_subsampling=False, transforms=get_valid_transforms(), output_label=True)

            val_loader = torch.utils.data.DataLoader(
                valid_ds, 
                batch_size=256,
                num_workers=CFG['num_workers'],
                shuffle=False,
                pin_memory=False,
                sampler=SequentialSampler(valid_ds)
            )

            device = torch.device(CFG['device'])
            model = cfg['model_constructor']().to(device)
            model.load_state_dict(torch.load('{}/model_fold_{}_{}'.format(CFG['model_path'], fold, cfg['tag'])))
            model.eval()

            image_preds_all = []
            correct_count = 0
            count = 0
            for step, (imgs, target) in enumerate(val_loader):
                imgs = imgs.to(device).float()
                target = target.to(device).float()

                image_preds = model(imgs)   #output = model(input)
                #print(image_preds[:,0], image_preds[:,0].shape)
                #print(target, target.shape)
                
                if len(image_preds.shape) == 1:
                    image_preds = image_preds.view(-1, 1)
                
                correct_count += ((image_preds[:,0]>0) == target[:,0]).sum().detach().item()
                count += imgs.shape[0]
                image_preds_all += [image_preds.cpu().detach().numpy()]
                print('acc: {:.4f}, {}, {}, {}/{}'.format(correct_count/count, correct_count, count, step+1, len(val_loader)), end='\r')
            print()
            
            image_preds_all = np.concatenate(image_preds_all, axis=0)
            image_preds_all_list += [image_preds_all]
        
            del model, val_loader
            torch.cuda.empty_cache()
        
        image_preds_all_list = np.concatenate(image_preds_all_list, axis=1)
        df.loc[filt, new_feats] = image_preds_all_list
        
    df.to_csv(res_file_name, index=False)
    return df
Example #11
def main():
    args = parser.parse_args()

    log_out_dir = opj(RESULT_DIR, 'logs', args.out_dir, f'fold{args.fold}')
    if not ope(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(opj(log_out_dir, 'log.submit.txt'), mode='a')

    if args.ema:
        network_path = opj(RESULT_DIR, 'models', args.out_dir,
                           f'fold{args.fold}', f'{args.predict_epoch}_ema.pth')
    else:
        network_path = opj(RESULT_DIR, 'models', args.out_dir,
                           f'fold{args.fold}', f'{args.predict_epoch}.pth')

    submit_out_dir = opj(RESULT_DIR, 'submissions', args.out_dir,
                         f'fold{args.fold}', f'epoch_{args.predict_epoch}')
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        submit_out_dir))
    if not ope(submit_out_dir):
        os.makedirs(submit_out_dir)

    # setting up the visible GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    args.augment = args.augment.split(',')
    for augment in args.augment:
        if augment not in augment_list:
            raise ValueError(
                'Unsupported or unknown test augmentation: {}!'.format(
                    augment))

    model_params = {}
    model_params['architecture'] = args.arch
    model = init_network(model_params)

    log.write(">> Loading network:\n>>>> '{}'\n".format(network_path))
    checkpoint = torch.load(network_path)
    model.load_state_dict(checkpoint['state_dict'])
    log.write(">>>> loaded network:\n>>>> epoch {}\n".format(
        checkpoint['epoch']))

    # moving network to gpu and eval mode
    model = DataParallel(model)
    model.cuda()
    model.eval()

    # Data loading code
    dataset = args.dataset
    if dataset == 'test':
        steel_test_df = pd.read_csv(opj('..', 'input',
                                        'sample_submission.csv'))
    elif dataset == 'val':
        steel_test_df = pd.read_csv(
            opj(DATA_DIR, args.split_type, args.split_name,
                f'random_valid_cv{args.fold}.csv'))
    else:
        raise ValueError('Unsupported or unknown dataset: {}!'.format(dataset))

    steel_test_df['ImageId'], steel_test_df['ClassId'] = zip(
        *steel_test_df['ImageId_ClassId'].apply(lambda x: x.split('_')))
    imageId = pd.DataFrame(steel_test_df['ImageId'].unique(),
                           columns=['ImageId'])

    test_dataset = SteelDataset(
        imageId,
        img_size=args.img_size,
        mask_size=args.img_size,
        transform=None,
        return_label=False,
        dataset=args.dataset,
    )
    test_loader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    for augment in args.augment:
        test_loader.dataset.transform = eval('augment_%s' % augment)
        unaugment_func = eval('unaugment_%s' % augment)
        sub_submit_out_dir = opj(submit_out_dir, augment)
        if not ope(sub_submit_out_dir):
            os.makedirs(sub_submit_out_dir)
        with torch.no_grad():
            predict(test_loader,
                    model,
                    sub_submit_out_dir,
                    dataset,
                    args,
                    unaugment_func=unaugment_func)
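
The loop above drives test-time augmentation by swapping test_loader.dataset.transform before each pass and undoing the augmentation on the outputs. A stripped-down sketch of that driver (all names here are illustrative, not the repository's functions):

import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

class Images(Dataset):
    def __init__(self, images, transform=None):
        self.images, self.transform = images, transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, i):
        img = self.images[i]
        return self.transform(img) if self.transform else img

def augment_null(x):
    return x

def augment_hflip(x):
    return torch.flip(x, dims=[-1])

unaugment = {'null': augment_null, 'hflip': augment_hflip}  # both self-inverse

images = torch.arange(12).float().reshape(3, 4)  # three fake 1-D "images"
loader = DataLoader(Images(images),
                    sampler=SequentialSampler(range(3)),
                    batch_size=3)

for name, aug in [('null', augment_null), ('hflip', augment_hflip)]:
    loader.dataset.transform = aug          # picked up by the next iterator
    batch = next(iter(loader))
    print(name, unaugment[name](batch)[0])  # back in canonical orientation
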
Example #12
    def __init__(self,
                 data,
                 mode,
                 batch_size,
                 vocabs,
                 topology,
                 bucket_by,
                 max_len=None,
                 bucket_order=None,
                 **kwargs):
        self.datasets = {}
        self.mode = mode
        self.vocabs = vocabs
        self.batch_size = batch_size
        self.topology = topology
        self.bucket_by = bucket_by

        # Disable filtering if not training
        self.max_len = max_len if self.mode == 'train' else None
        self.bucket_order = bucket_order if self.mode == 'train' else None

        # For old models to work, set it to the first source
        if self.bucket_by is None:
            if len(self.topology.get_src_langs()) > 0:
                self.bucket_by = self.topology.get_src_langs()[0]
            elif self.mode != 'beam' and len(
                    self.topology.get_trg_langs()) > 0:
                self.bucket_by = self.topology.get_trg_langs()[0]

        for key, ds in self.topology.all.items():
            if self.mode == 'beam' and ds.trg:
                # Skip target streams
                continue

            if key == self.bucket_by:
                self.bucket_by = ds

            if ds._type == "Text":
                # Prepend <bos> if datasource is on target side
                self.datasets[ds] = TextDataset(data[key],
                                                vocabs[key],
                                                bos=ds.trg)
            elif ds._type == "OneHot":
                self.datasets[ds] = OneHotDataset(data[key], vocabs[key])
            elif ds._type == "ImageFolder":
                self.datasets[ds] = ImageFolderDataset(data[key], **kwargs)
            elif ds._type == "Numpy":
                self.datasets[ds] = NumpyDataset(data[key])
            elif ds._type == "Shelve":
                self.datasets[ds] = ShelveDataset(data[key], **kwargs)
            elif ds._type == "Kaldi":
                self.datasets[ds] = KaldiDataset(data[key])
            elif ds._type == "NumpySequence":
                self.datasets[ds] = NumpySequenceDataset(data[key], **kwargs)
            else:
                raise ValueError("Unknown dataset type: {}.".format(ds))

        # Detect dataset sizes
        sizes = set()
        for dataset in self.datasets.values():
            sizes.add(len(dataset))
        assert len(sizes) == 1, "Non-parallel datasets are not supported."

        # Set dataset size
        self.size = list(sizes)[0]

        # Set list of available datasets
        self.keys = list(self.datasets.keys())

        self.n_sources = len([k for k in self.keys if k.src])
        self.n_targets = len([k for k in self.keys if k.trg])

        self.collate_fn = get_collate(self.keys)
        if self.bucket_by is not None:
            self.sort_lens = self.datasets[self.bucket_by].lengths
            self.sampler = BucketBatchSampler(
                batch_size=self.batch_size,
                sort_lens=self.sort_lens,
                max_len=self.max_len,
                store_indices=self.mode == 'beam',
                order=self.bucket_order)
        else:
            # No modality to sort batches, return sequential data
            # Used for beam-search in image->text tasks
            self.sampler = BatchSampler(SequentialSampler(self),
                                        batch_size=self.batch_size,
                                        drop_last=False)
Example #13
def run_train(args):
    
    out_dir = args.out_dir + '/' + args.model_name
    use_gridmask = args.use_gridmask
    initial_checkpoint = args.initial_checkpoint
    
    if args.scheduler_name == 'null':
        schduler = NullScheduler(lr=0.001)
    else:
        schduler = CyclicScheduler0(min_lr=0.00001, max_lr=0.00005, period=750, ratio=1 )
    
    iter_accum = 1
    batch_size = args.batch_size

    # set-up directories
    for f in ['checkpoint'] : os.makedirs(out_dir +'/'+f, exist_ok=True)

    log = Logger()
    log.open(out_dir+'/log.train.txt',mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')

    log.write('\tSEED         = %u\n' % SEED)
    log.write('\t__file__     = %s\n' % __file__)
    log.write('\tout_dir      = %s\n' % out_dir)
    log.write('\n')


    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')
    files_train = [f'train_image_data_{fid}.feather' for fid in range(4)]
    data = read_data(args.data_dir, files_train)
    
    df = pd.read_csv(args.df_path)
    train_split = np.load(args.data_dir + '/train_b_fold1_184855.npy').tolist()
    valid_split = np.load(args.data_dir + '/valid_b_fold1_15985.npy').tolist()

    train_df = df[df['image_id'].isin(train_split)]
    valid_df = df[df['image_id'].isin(valid_split)]

    train_dataset = KaggleDataset(
        df       = df,
        data     = data,
        idx      = train_df.index.values, 
        augment  = train_augment if use_gridmask else valid_augment,
    )

    train_loader  = DataLoader(
        train_dataset,
        sampler     = RandomSampler(train_dataset),
        batch_size  = batch_size,
        drop_last   = True,
        num_workers = 4,
        pin_memory  = True,
        collate_fn  = null_collate
    )

    valid_dataset = KaggleDataset(
        df       = df,
        data     = data,
        idx      = valid_df.index.values, 
        augment  = valid_augment,
    )

    valid_loader = DataLoader(
        valid_dataset,
        sampler     = SequentialSampler(valid_dataset),
        batch_size  = batch_size,
        drop_last   = False,
        num_workers = 4,
        pin_memory  = True,
        collate_fn  = null_collate
    )

    assert(len(train_dataset)>=batch_size)
    log.write('batch_size = %d\n'%(batch_size))
    log.write('\n')

    ## net ----------------------------------------
    log.write('** net setting **\n')
    
    if args.model_name == 'serex50':
        net = Serex50_Net().cuda()
    elif args.model_name == 'effnetb3':
        net = EfficientNet_3().cuda()
    else:
        raise NotImplementedError(args.model_name)
    
    log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)

    if initial_checkpoint is not None:
        state_dict = torch.load(initial_checkpoint, map_location=lambda storage, loc: storage)
        net.load_state_dict(state_dict,strict=True) 
    else:
        if args.model_name == 'serex50':
            net.load_pretrain(is_print=False)
        else:
            pass

    log.write('net=%s\n'%(type(net)))
    log.write('\n')

    if args.optimizer_name == 'AdamW':
        optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, net.parameters()),lr=schduler(0), weight_decay=1e-4)
    else:
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=schduler(0), momentum=0.0, weight_decay = 1e-4)
    
    num_iters   = 3000*1000
    iter_smooth = 50
    iter_log    = 250
    iter_valid  = 500
    iter_save   = [0, num_iters-1]\
                   + list(range(0, num_iters, 1000))#1*1000

    start_iter = 0
    start_epoch= 0
    rate       = 0

    if initial_checkpoint is not None:
        initial_optimizer = initial_checkpoint.replace('_model.pth','_optimizer.pth')
        if os.path.exists(initial_optimizer):
            checkpoint  = torch.load(initial_optimizer)
            start_iter  = checkpoint['iter' ]
            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
        pass

    log.write('optimizer\n  %s\n'%(optimizer))
    log.write('schduler\n  %s\n'%(schduler))
    log.write('\n')

    ## start training here! ##############################################
    log.write('** start training here! **\n')
    log.write('   batch_size=%d,  iter_accum=%d\n'%(batch_size,iter_accum))
    log.write('   experiment  = %s\n' % str(__file__.split('/')[-2:]))
    log.write('                    |----------------------- VALID------------------------------------|------- TRAIN/BATCH -----------\n')
    log.write('rate    iter  epoch | kaggle                    | loss               acc              | loss             | time       \n')
    log.write('----------------------------------------------------------------------------------------------------------------------\n')

    def message(rate, iter, epoch, kaggle, valid_loss, train_loss, batch_loss, mode='print'):
        if mode == 'print':
            asterisk = ' '
            loss = batch_loss
        if mode == 'log':
            asterisk = '*' if iter in iter_save else ' '
            loss = train_loss

        text = \
            '%0.5f %5.1f%s %4.1f | '%(rate, iter/1000, asterisk, epoch,) +\
            '%0.4f : %0.4f %0.4f %0.4f | '%(kaggle[1],*kaggle[0]) +\
            '%4.4f, %4.4f, %4.4f : %4.4f, %4.4f, %4.4f | '%(*valid_loss,) +\
            '%4.4f, %4.4f, %4.4f |'%(*loss,) +\
            '%s' % (time_to_str((timer() - start_timer),'min'))

        return text

    kaggle = (0,0,0,0)
    valid_loss = np.zeros(6,np.float32)
    train_loss = np.zeros(3,np.float32)
    batch_loss = np.zeros_like(train_loss)
    iter = 0
    i    = 0

    start_timer = timer()
    while  iter<num_iters:
        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        optimizer.zero_grad()
        for t, (input, truth, infor) in enumerate(train_loader):

            input, truth, shuffled_truth, lam = cutmix(input, truth,alpha=0.3)

            batch_size = len(infor)
            iter  = i + start_iter
            epoch = (iter-start_iter)*batch_size/len(train_dataset) + start_epoch

            if (iter % iter_valid==0):
                valid_loss, kaggle = do_valid(net, valid_loader, out_dir) #
                pass

            if (iter % iter_log==0):
                print('\r',end='',flush=True)
                log.write(message(rate, iter, epoch, kaggle, valid_loss, train_loss, batch_loss, mode='log'))
                log.write('\n')

            if iter in iter_save:
                torch.save({
                    'optimizer': optimizer.state_dict(),
                    'iter'     : iter,
                    'epoch'    : epoch,
                }, out_dir +'/checkpoint/%08d_optimizer.pth'%(iter))
                if iter!=start_iter:
                    torch.save(net.state_dict(),out_dir +'/checkpoint/%08d_model.pth'%(iter))
                    pass

            # learning rate schduler -------------
            lr = schduler(iter)
            if lr<0 : break
            adjust_learning_rate(optimizer, lr)
            rate = get_learning_rate(optimizer)

            net.train()
            
            input = input.cuda()
            truth = [t.cuda() for t in truth]
            shuffled_truth = [t.cuda() for t in shuffled_truth]

            logit = net(input) 
            probability = logit_to_probability(logit)

            loss = cutmix_criterion(logit, truth, shuffled_truth, lam)
        
            ((loss[0]+loss[1]+loss[2] )/iter_accum).backward()
        
            if (iter % iter_accum)==0:
                optimizer.step()
                optimizer.zero_grad()

            loss = [l.item() for l in loss]
            l = np.array([ *loss, ])*batch_size
            n = np.array([ 1, 1, 1 ])*batch_size
            batch_loss      = l/(n+1e-8)
            sum_train_loss += l
            sum_train      += n
            if iter%iter_smooth == 0:
                train_loss = sum_train_loss/(sum_train+1e-12)
                sum_train_loss[...] = 0
                sum_train[...]      = 0

            print('\r',end='',flush=True)
            print(message(rate, iter, epoch, kaggle, valid_loss, train_loss, batch_loss, mode='print'), end='',flush=True)
            i=i+1

        pass  #-- end of one data loader --
    pass #-- end of all iterations --

    log.write('\n')
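
Apart from the sequential validation loader, the training loop above also uses gradient accumulation: the loss is divided by iter_accum and the optimizer only steps every iter_accum iterations. A minimal sketch of that pattern in isolation (mine, with a toy model):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
iter_accum = 4

optimizer.zero_grad()
for it in range(1, 17):
    x, y = torch.randn(8, 4), torch.randn(8, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    (loss / iter_accum).backward()   # gradients accumulate across iterations
    if it % iter_accum == 0:
        optimizer.step()             # effective batch size is 8 * iter_accum
        optimizer.zero_grad()
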
Example #14
def train(args):
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        'sst': SstProcessor,
        'aspect': AspectProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    model = BertForMultiLabelClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.0
    }]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        min_loss = 100000000
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, (input_ids, input_mask, segment_ids,
                       label_ids) in enumerate(
                           tqdm(train_dataloader, desc="Iteration")):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # we have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1

            if args.do_eval:
                eval_examples = processor.get_dev_examples(args.data_dir)
                eval_features = convert_examples_to_features(
                    eval_examples, label_list, args.max_seq_length, tokenizer)

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                all_input_ids = torch.tensor(
                    [f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor(
                    [f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)
                if args.local_rank == -1:
                    eval_sampler = SequentialSampler(eval_data)
                else:
                    eval_sampler = DistributedSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                logit_list = []
                labels_eval_list = []
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids,
                                                      input_mask, label_ids)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    logit_list.extend(logits.tolist())
                    labels_eval_list.extend(label_ids.tolist())
                    tmp_eval_accuracy = accuracy(logits, label_ids)
                    # _ = accuracy2(logits, label_ids)
                    # _ = accuracy3(logits, label_ids)
                    # _ = accuracy4(logits, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1
                print(epoch)
                _ = accuracy2(logit_list, labels_eval_list)
                _ = accuracy3(logit_list, labels_eval_list)
                _ = accuracy3_2(logit_list, labels_eval_list)
                _ = accuracy4(logit_list, labels_eval_list)
                _ = accuracy5(logit_list, labels_eval_list)
                _ = accuracy7(logit_list, labels_eval_list)

                eval_loss = eval_loss / nb_eval_steps  # len(eval_dataloader)
                eval_accuracy = eval_accuracy / nb_eval_examples  # len(eval_dataloader)
                print("eval_loss", eval_loss)
                result = {
                    'eval_loss': eval_loss,
                    'eval_accuracy': eval_accuracy,
                    'global_step': global_step,
                    'loss': tr_loss / nb_tr_steps
                }  # 'loss': loss.item()}

                output_eval_file = os.path.join(args.output_dir,
                                                "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    logger.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                if eval_loss < min_loss:
                    np.save(os.path.join(args.data_dir, 'oof_train'),
                            np.asarray(logit_list))
                    np.save(os.path.join(args.data_dir, 'oof_train_y'),
                            np.asarray(labels_eval_list))

                    eval_examples = processor.get_test_examples(args.data_dir)
                    eval_features = convert_examples_to_features(
                        eval_examples, label_list, args.max_seq_length,
                        tokenizer)

                    logger.info("***** Running test *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    all_input_ids = torch.tensor(
                        [f.input_ids for f in eval_features], dtype=torch.long)
                    all_input_mask = torch.tensor(
                        [f.input_mask for f in eval_features],
                        dtype=torch.long)
                    all_segment_ids = torch.tensor(
                        [f.segment_ids for f in eval_features],
                        dtype=torch.long)
                    all_label_ids = torch.tensor(
                        [f.label_id for f in eval_features], dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label_ids)
                    if args.local_rank == -1:
                        eval_sampler = SequentialSampler(eval_data)
                    else:
                        eval_sampler = DistributedSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    logit_test = []
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)
                        with torch.no_grad():
                            _, logits = model(input_ids, segment_ids,
                                              input_mask, label_ids)

                        logits = logits.detach().cpu().numpy()
                        # label_ids = label_ids.to('cpu').numpy()
                        logit_test.extend(logits.tolist())
                        # labels_eval_list.extend(label_ids.tolist())

                        np.save(os.path.join(args.data_dir, 'oof_test'),
                                np.asarray(logit_test))

                    min_loss = eval_loss
Example #15
    if isinstance(data, np.ndarray):
        data = np.reshape(data, (n_data_total, -1, data.shape[-1]))
    elif isinstance(data, list):
        data = [np.reshape(data_slice, (-1, data_slice.shape[-1])) for data_slice in data]
    else:
        raise ValueError("Invalid type of data given")

    # take out only a fraction of the test data
    data = data[idx_start:idx_end]
    labels = labels[idx_start:idx_end]
    n_data_total = len(data)

    input_size = data[0].shape[0]
    output_size = meta['output_size']
    dataset = nu.TimeSeriesDataset(data, labels, transform=nu.ToTensor())
    dataloader = DataLoader(dataset, sampler=SequentialSampler(range(n_data_total)),
        batch_size=batch_size, collate_fn=nu.collate_fn, num_workers=0)

    if device == 'cpu':
        rnn = nu.RNN(input_size=meta['input_size'], hidden_size=meta['hidden_size'],
            output_size=meta['output_size'], n_layers=meta['n_layers'], bidirectional=meta['bidirectional'])
    else:
        rnn = nu.RNN(input_size=meta['input_size'], hidden_size=meta['hidden_size'],
            output_size=meta['output_size'], n_layers=meta['n_layers'], bidirectional=meta['bidirectional']).cuda()
    rnn.load_state_dict(meta['model'][idx_min_loss_epoch])
    del meta
    #criterion = nn.CrossEntropyLoss(reduction='sum') if classifier else nn.MSELoss(reduction='sum')
    #metric = 'cross_entropy_mean' if classifier else 'rmse'
    loss_sum = {}
    loss_metric = {}
    loss_sum = 0.
Example #16
def main():
    parser = MyArgumentParser((InferenceArguments, ))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        (args, ) = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        (args, ) = parser.parse_args_into_dataclasses()

    params = dict(
        pretrained_model_name_or_path=args.model_name_or_path,
        cache_dir=args.cache_dir,
    )

    config = AutoConfig.from_pretrained(**params)
    tokenizer = AutoTokenizer.from_pretrained(**params)
    model = AutoModelForSeq2SeqLM.from_pretrained(config=config, **params)

    if args.model_parameters:
        print("====== MODEL PARAMETER LOADING... ======\n"
              f"   {args.model_parameters}")
        model.load_state_dict(torch.load(args.model_parameters))

    max_length = args.test_max_target_length

    # set num_beams for evaluation
    num_beams = args.num_beams if args.num_beams else model.config.num_beams

    test_dataset = Seq2SeqDataset(
        tokenizer=tokenizer,
        type_path='test',
        data_dir=args.data_dir,
        max_target_length=args.test_max_target_length,
        max_source_length=args.max_source_length,
    )

    test_sampler = SequentialSampler(test_dataset)

    data_collator = Seq2SeqDataCollator(tokenizer, args)

    test_dataloader = DataLoader(
        test_dataset,
        sampler=test_sampler,
        batch_size=args.per_device_test_batch_size,
        collate_fn=data_collator,
        drop_last=False,
    )

    # prediction_loop
    description = "Prediction"

    batch_size = test_dataloader.batch_size
    num_examples = len(test_dataloader.dataset)

    print(f"***** Running {description} *****")
    print(f"  Num examples = {num_examples}")
    print(f"  Batch size = {batch_size}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    res = []
    for step, inputs in enumerate(test_dataloader):
        # prediction_step, generation-based
        has_labels = "labels" in inputs  # the test set carries no labels here
        # _prepare_inputs:
        #  1. move the tensors to the target device
        #  2. load _past into memory
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
        generated_tokens = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            **gen_kwargs,
        )
        # in case the batch is shorter than max length, the output should be padded
        if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
            # If PAD token is not defined at least EOS token has to be defined
            padded_tensor = tokenizer.pad_token_id * torch.ones(
                (generated_tokens.shape[0], gen_kwargs["max_length"]),
                dtype=generated_tokens.dtype,
                device=generated_tokens.device,
            )
            padded_tensor[:, :generated_tokens.shape[-1]] = generated_tokens
            generated_tokens = padded_tensor
        loss = None
        labels = None
        res.extend(list(generated_tokens))
    submit(args, tokenizer, res)
    print("Finished!")
def training(model_name, model_type, optimizer_name, lr_scheduler_name, lr,
             batch_size, valid_batch_size, num_epoch, start_epoch,
             accumulation_steps, train_data_folder, checkpoint_folder,
             train_split, val_split, fold, load_pretrain):

    COMMON_STRING = '@%s:  \n' % os.path.basename(__file__)
    COMMON_STRING += '\tset random seed\n'
    COMMON_STRING += '\t\tSEED = %d\n' % SEED

    torch.backends.cudnn.benchmark = False  # disable the cudnn auto-tuner (which searches for the fastest convolution algorithms) for reproducibility
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True

    COMMON_STRING += '\tset cuda environment\n'
    COMMON_STRING += '\t\ttorch.__version__              = %s\n' % torch.__version__
    COMMON_STRING += '\t\ttorch.version.cuda             = %s\n' % torch.version.cuda
    COMMON_STRING += '\t\ttorch.backends.cudnn.version() = %s\n' % torch.backends.cudnn.version()
    try:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\']     = %s\n' % os.environ[
            'CUDA_VISIBLE_DEVICES']
        NUM_CUDA_DEVICES = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
    except Exception:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\']     = None\n'
        NUM_CUDA_DEVICES = 1

    COMMON_STRING += '\t\ttorch.cuda.device_count()      = %d\n' % torch.cuda.device_count()
    COMMON_STRING += '\n'

    os.makedirs(checkpoint_folder + '/' + model_type + '/' + model_name,
                exist_ok=True)

    log = Logger()
    log.open(checkpoint_folder + '/' + model_type + '/' + model_name + '/' +
             model_name + '_fold_' + str(fold) + '_log_train.txt',
             mode='a+')
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')

    log.write('\tSEED         = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % train_data_folder)
    log.write('\t__file__     = %s\n' % __file__)
    log.write('\tout_dir      = %s\n' % checkpoint_folder)
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    train_dataset = URESDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=[
            'train.csv',
        ],
        split=train_split,
        augment=transform_train,
        size=(1024, 1024),
    )
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size,
                                  drop_last=True,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    valid_dataset = URESDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=[
            'train.csv',
        ],
        split=val_split,
        augment=transform_valid,
        size=(1024, 1024),
    )
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=SequentialSampler(valid_dataset),
                                  batch_size=valid_batch_size,
                                  drop_last=False,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    log.write('train_dataset : \n%s\n' % (train_dataset))
    log.write('valid_dataset : \n%s\n' % (valid_dataset))
    log.write('\n')

    ############################################################################## define unet model with backbone
    def load(model, pretrain_file, skip=[]):
        pretrain_state_dict = torch.load(pretrain_file)
        state_dict = model.state_dict()
        keys = list(state_dict.keys())
        for key in keys:
            if any(s in key for s in skip): continue
            try:
                state_dict[key] = pretrain_state_dict[key]
            except KeyError:
                print('missing key in pretrained checkpoint:', key)
        model.load_state_dict(state_dict)

        return model

    def get_deeplab_model(model_name="deep_se101", in_channel=3, num_classes=1, criterion=SoftDiceLoss_binary(), \
            load_pretrain=False, checkpoint_filepath=None):

        if model_name == 'deep_se50':
            model = DeepSRNX50V3PlusD_m1(in_channel=in_channel,
                                         num_classes=num_classes,
                                         criterion=criterion)
        elif model_name == 'deep_se101':
            model = DeepSRNX101V3PlusD_m1(in_channel=in_channel,
                                          num_classes=num_classes,
                                          criterion=criterion)
        elif model_name == 'WideResnet38':
            model = DeepWR38V3PlusD_m1(in_channel=in_channel,
                                       num_classes=num_classes,
                                       criterion=criterion)
        elif model_name == 'unet_ef3':
            model = EfficientNet_3_unet()
        elif model_name == 'unet_ef5':
            model = EfficientNet_5_unet()
        else:
            print('Unknown model name: %s' % model_name)
            model = None

        if (load_pretrain):
            model = load(model, checkpoint_filepath)

        return model

    def get_unet_model(model_name="efficientnet-b3", IN_CHANNEL=3, NUM_CLASSES=1, \
            WIDTH=MASK_WIDTH, HEIGHT=MASK_HEIGHT, load_pretrain=False, checkpoint_filepath=None):

        model = model_iMet(model_name, IN_CHANNEL, NUM_CLASSES, WIDTH, HEIGHT)

        if (load_pretrain):
            model.load_pretrain(checkpoint_filepath)

        return model

    def get_aspp_model(model_name="efficientnet-b3",
                       NUM_CLASSES=1,
                       load_pretrain=False,
                       checkpoint_filepath=None):

        model = Net(model_name, IN_CHANNEL, NUM_CLASSES, WIDTH, HEIGHT)
        if (load_pretrain):
            state_dict = torch.load(checkpoint_filepath,
                                    map_location=lambda storage, loc: storage)
            model.load_state_dict(state_dict, strict=True)

        return model

    ############################################################################### training parameters
    checkpoint_filename = model_type + '/' + model_name + '/' + model_name + "_" + model_type + '_fold_' + str(
        fold) + "_checkpoint.pth"
    checkpoint_filepath = os.path.join(checkpoint_folder, checkpoint_filename)

    ############################################################################### model and optimizer
    if model_type == 'unet':
        model = get_unet_model(model_name=model_name, IN_CHANNEL=3, NUM_CLASSES=NUM_CLASS, \
            WIDTH=MASK_WIDTH, HEIGHT=MASK_HEIGHT, load_pretrain=load_pretrain, checkpoint_filepath=checkpoint_filepath)
    elif model_type == 'deeplab':
        model = get_deeplab_model(model_name=model_name, in_channel=3, num_classes=NUM_CLASS, \
            criterion=BCEDiceLoss(), load_pretrain=load_pretrain, checkpoint_filepath=checkpoint_filepath)
    elif model_type == 'aspp':
        model = get_aspp_model(model_name=model_name, NUM_CLASSES=NUM_CLASS, \
            load_pretrain=load_pretrain, checkpoint_filepath=checkpoint_filepath)

    model = model.cuda()

    if optimizer_name == "Adam":
        if model_type != 'deeplab':
            optimizer = torch.optim.Adam([{
                'params': model.decoder.parameters(),
                'lr': lr,
                'weight_decay': 0.01
            }, {
                'params': model.encoder.parameters(),
                'lr': lr * 0.05
            }])
        else:
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    elif optimizer_name == "adamonecycle":
        flatten_model = lambda m: sum(map(flatten_model, m.children()), []) if num_children(m) else [m]
        get_layer_groups = lambda m: [nn.Sequential(*flatten_model(m))]

        optimizer_func = partial(optim.Adam, betas=(0.9, 0.99))
        optimizer = OptimWrapper.create(optimizer_func,
                                        3e-3,
                                        get_layer_groups(model),
                                        wd=1e-4,
                                        true_wd=True,
                                        bn_wd=True)
    elif optimizer_name == "Ranger":
        if model_type != 'deeplab':
            optimizer = Ranger([{
                'params': model.decoder.parameters(),
                'lr': lr,
                'weight_decay': 0.01
            }, {
                'params': model.encoder.parameters(),
                'lr': lr * 0.05
            }])
        else:
            optimizer = Ranger(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr,
                               weight_decay=1e-5)
    else:
        raise NotImplementedError

    if lr_scheduler_name == "adamonecycle":
        scheduler = lsf.OneCycle(optimizer,
                                 len(train_dataset) * num_epoch, lr,
                                 [0.95, 0.85], 10.0, 0.4)
        lr_scheduler_each_iter = True
    elif lr_scheduler_name == "CosineAnealing":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               num_epoch,
                                                               eta_min=0,
                                                               last_epoch=-1)
        lr_scheduler_each_iter = False
    elif lr_scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        lr_scheduler_each_iter = False
    else:
        raise NotImplementedError

    log.write('net\n  %s\n' % (model_name))
    log.write('optimizer\n  %s\n' % (optimizer_name))
    log.write('scheduler\n  %s\n' % (lr_scheduler_name))
    log.write('\n')

    # mix precision
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    ############################################################################### training
    log.write('** start training here! **\n')
    log.write('   batch_size=%d,  accumulation_steps=%d\n' %
              (batch_size, accumulation_steps))
    log.write('   experiment  = %s\n' % str(__file__.split('/')[-2:]))

    valid_loss = np.zeros(3, np.float32)
    train_loss = np.zeros(3, np.float32)
    valid_metric_optimal = np.inf
    eval_step = len(train_dataloader)  # evaluate once per epoch, after the last training batch
    log_step = 10
    eval_count = 0

    # define tensorboard writer and timer
    writer = SummaryWriter()
    start_timer = timer()

    # define criterion
    criterion = BCEDiceLoss()
    metric = FscoreMetric(activation=None)

    for epoch in range(1, num_epoch + 1):

        torch.cuda.empty_cache()

        # update lr and start from start_epoch
        # if (not lr_scheduler_each_iter):
        #     if epoch < 600:
        #         if epoch != 0:
        #             scheduler.step()
        #             scheduler = warm_restart(scheduler, T_mult=2)
        #     elif epoch > 600 and epoch < 800:
        #         optimizer.param_groups[0]['lr'] = 1e-5
        #     else:
        #         optimizer.param_groups[0]['lr'] = 5e-6

        affect_rate = CosineAnnealingWarmUpRestarts(
            epoch,
            T_0=num_epoch,
            T_warmup=15,
            gamma=0.8,
        )

        # piecewise base lr: keep the configured lr for the first 100 epochs,
        # then drop it to 4e-4 and finally to 1e-4, scaled by the warm-restart factor
        if epoch >= 150:
            lr = 1e-4
        elif epoch >= 100:
            lr = 4e-4
        optimizer.param_groups[0]['lr'] = affect_rate * lr

        # optimizer.param_groups[0]['lr'] = rate * lr
        # optimizer.param_groups[1]['lr'] = rate * lr * 0.01

        if (epoch < start_epoch):
            continue

        log.write("Epoch%s\n" % epoch)
        log.write('\n')

        for param_group in optimizer.param_groups:
            rate = param_group['lr']

        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        seed_everything(SEED + epoch)
        torch.cuda.empty_cache()
        optimizer.zero_grad()

        for tr_batch_i, (X, truth_mask) in enumerate(train_dataloader):

            if (lr_scheduler_each_iter):
                scheduler.step(tr_batch_i)

            model.train()

            X = X.cuda().float()
            truth_mask = truth_mask.cuda()
            prediction = model(X)  # [N, C, H, W]
            # loss = criterion_mask(prediction, truth_mask, weight=None)
            loss = criterion(prediction, truth_mask)

            with amp.scale_loss(loss / accumulation_steps,
                                optimizer) as scaled_loss:
                scaled_loss.backward()

            #loss.backward()

            if ((tr_batch_i + 1) % accumulation_steps == 0):
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=5.0,
                                               norm_type=2)
                optimizer.step()
                optimizer.zero_grad()

                writer.add_scalar(
                    'train_loss_' + str(fold), loss.item(),
                    (epoch - 1) * len(train_dataloader) * batch_size +
                    tr_batch_i * batch_size)

            # print statistics  --------

            # probability_mask  = prediction
            probability_mask = torch.sigmoid(prediction)
            mask_positive = torch.where(truth_mask > 0.5,
                                        torch.ones_like(truth_mask),
                                        truth_mask)
            mask_negative = 1 - mask_positive
            fscore_positive = metric(probability_mask, mask_positive)
            fscore_negative = metric(1 - probability_mask, mask_negative)

            # probability_mask  = torch.sigmoid(prediction)
            # mask_positive = np.where(truth_mask.clone().detach().cpu().numpy().flatten() > 0, 1, 0)
            # mask_negative = 1 - mask_positive
            # mask_pred_positive = np.where(probability_mask.detach().clone().cpu().numpy().flatten() > 0.5, 1, 0)
            # mask_pred_negative = 1 - mask_pred_positive
            # fscore_positive = f1_score(mask_positive, mask_pred_positive)
            # fscore_negative = f1_score(mask_negative, mask_pred_negative)

            l = np.array(
                [loss.item() * batch_size, fscore_positive, fscore_negative])
            n = np.array([batch_size])
            sum_train_loss = sum_train_loss + l
            sum_train = sum_train + n

            # log for training
            if (tr_batch_i + 1) % log_step == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train[...] = 0
                log.write('lr: %f train loss: %f fscore_positive: %f fscore_negative: %f\n' % \
                    (rate, train_loss[0], train_loss[1], train_loss[2]))

            if (tr_batch_i + 1) % eval_step == 0:

                eval_count += 1

                valid_loss = np.zeros(3, np.float32)
                valid_num = np.zeros_like(valid_loss)
                valid_metric = []

                with torch.no_grad():

                    torch.cuda.empty_cache()

                    for val_batch_i, (
                            X, truth_mask) in enumerate(valid_dataloader):

                        model.eval()

                        X = X.cuda().float()
                        truth_mask = truth_mask.cuda()
                        prediction = model(X)  # [N, C, H, W]

                        # loss = criterion_mask(prediction, truth_mask, weight=None)
                        loss = criterion(prediction, truth_mask)

                        writer.add_scalar(
                            'val_loss_' + str(fold), loss.item(),
                            (eval_count - 1) * len(valid_dataloader) *
                            valid_batch_size + val_batch_i * valid_batch_size)

                        # print statistics  --------

                        # probability_mask  = prediction
                        probability_mask = torch.sigmoid(prediction)
                        mask_positive = torch.where(
                            truth_mask > 0.5, torch.ones_like(truth_mask),
                            truth_mask)
                        mask_negative = 1 - mask_positive
                        fscore_positive = metric(probability_mask,
                                                 mask_positive)
                        fscore_negative = metric(1 - probability_mask,
                                                 mask_negative)

                        # if (epoch == 1) and (val_batch_i == 0):
                        #     predict = probability_mask[0, :, :].detach().squeeze().cpu().numpy()
                        #     predict = predict > 0.5 # Threshould
                        #     predict = (1 - predict)*255
                        #     cv2.imwrite('result/0_0.tiff', predict.astype(np.uint8))

                        # probability_mask  = torch.sigmoid(prediction)
                        # mask_positive = np.where(truth_mask.clone().detach().cpu().numpy().flatten() > 0, 1, 0)
                        # mask_negative = 1 - mask_positive
                        # mask_pred_positive = np.where(probability_mask.detach().clone().cpu().numpy().flatten() > 0.5, 1, 0)
                        # mask_pred_negative = 1 - mask_pred_positive
                        # fscore_positive = f1_score(mask_positive, mask_pred_positive)
                        # fscore_negative = f1_score(mask_negative, mask_pred_negative)

                        #---
                        l = np.array([
                            loss.item() * valid_batch_size, fscore_positive,
                            fscore_negative
                        ])
                        n = np.array([valid_batch_size])
                        valid_loss = valid_loss + l
                        valid_num = valid_num + n

                    valid_loss = valid_loss / valid_num

                    log.write('validation loss: %f fscore_positive: %f fscore_negative: %f\n' % \
                    (valid_loss[0], \
                    valid_loss[1], \
                    valid_loss[2]))

        val_metric_epoch = valid_loss[0]

        if (val_metric_epoch <= valid_metric_optimal):

            log.write('Validation metric improved ({:.6f} --> {:.6f}).  Saving model ...'.format(\
                    valid_metric_optimal, val_metric_epoch))

            valid_metric_optimal = val_metric_epoch
            torch.save(model.state_dict(), checkpoint_filepath)
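
The training loop above divides the loss by accumulation_steps, backpropagates through apex's amp.scale_loss, and only clips gradients and steps the optimizer every accumulation_steps batches. A minimal sketch of the same gradient-accumulation pattern in plain PyTorch (no apex), with model, dataloader, optimizer and criterion left as placeholders:

import torch

def train_one_epoch_with_accumulation(model, dataloader, optimizer, criterion,
                                       accumulation_steps=4, max_norm=5.0):
    """Accumulate gradients over several batches before each optimizer step."""
    model.train()
    optimizer.zero_grad()
    for batch_i, (x, y) in enumerate(dataloader):
        loss = criterion(model(x), y)
        # scale so the accumulated gradient matches a single large-batch gradient
        (loss / accumulation_steps).backward()
        if (batch_i + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm, norm_type=2)
            optimizer.step()
            optimizer.zero_grad()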
def get_dataloaders(checkpoint_dir,
                    rsyncing,
                    selective_sampling=False,
                    warmup_trainer=None,
                    batch_size=16,
                    num_workers=os.cpu_count() - 1,
                    data_aug_vec=[0.5, 0.25, 0.5, 0.5],
                    toy=False,
                    notebook=False,
                    cat=False):
    """

    :param checkpoint_dir:
    :param rsyncing:
    :param selective_sampling:
    :param warmup_trainer:
    :param batch_size:
    :param num_workers:
    :param seed:
    :param data_aug_vec: probabilities for rnd flip, rnd gamma, rnd translation and rnd scale
    :param toy:
    :param notebook:
    :return:
    """
    #     if torch.cuda.is_available():
    #         mp.set_start_method('spawn')
    multiprocessing = False
    num_workers = 0
    sampler_size = 3000

    if rsyncing:
        print('Rsynced data! (prepare feat)', flush=True)
    else:
        print('Using symbolic links! (prepare feat)', flush=True)
    print('Getting path ready..', flush=True)
    anno_path_train, anno_path_val, png_path = get_paths(
        rsyncing, toy, notebook)

    # TODO
    # png_path = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset', 'png')
    # anno_path_train = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset',
    #                                'annotations/mscoco_train_full.json')
    # anno_path_val = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset',
    #                                'annotations/mscoco_train_full.json')

    print('Creating Coco Datasets..', flush=True)
    # t.ToTensor()
    if not cat:
        trans_img = torchvision.transforms.Compose([
            t.Normalize(),
            t.BboxCrop(targetsize=224),
            t.RandomFlipImg(prob=data_aug_vec[0]),
            t.RandomGammaImg(prob=data_aug_vec[1],
                             use_normal_distribution=True)
        ])
        trans_bb = torchvision.transforms.Compose([
            t.GetFiveBBs(),
            t.RandomTranslateBB(prob=data_aug_vec[2], pixel_range=10),
            t.RandomScaleBB(prob=data_aug_vec[3], max_percentage=0.1)
        ])
    else:
        trans_img = torchvision.transforms.Compose([
            t.Normalize(),
            t.BboxCropMult(targetsize=224),
            t.RandomFlipImg(prob=data_aug_vec[0]),
            t.RandomGammaImg(prob=data_aug_vec[1],
                             use_normal_distribution=True)
        ])
        trans_bb = torchvision.transforms.Compose([
            t.GetBBsMult(),
            t.RandomTranslateBB(prob=data_aug_vec[2], pixel_range=10,
                                cat=True),
            t.RandomScaleBB(prob=data_aug_vec[3], max_percentage=0.1, cat=True)
        ])

    trainset = u.dataset_coco(png_path,
                              anno_path_train,
                              transform=trans_img,
                              bbox_transform=trans_bb,
                              for_feature=True,
                              cat=cat)
    print('Training set has', len(trainset), 'images', flush=True)

    if not cat:
        valset = u.dataset_coco(
            png_path,
            anno_path_val,
            transform=torchvision.transforms.Compose(
                [t.Normalize(), t.BboxCrop(targetsize=224)]),
            bbox_transform=torchvision.transforms.Compose([t.GetFiveBBs()]),
            for_feature=True,
            cat=cat)
    else:
        valset = u.dataset_coco(
            png_path,
            anno_path_val,
            transform=torchvision.transforms.Compose(
                [t.Normalize(), t.BboxCropMult(targetsize=224)]),
            bbox_transform=torchvision.transforms.Compose([t.GetBBsMult()]),
            for_feature=True,
            cat=cat)
    print('Validation set has', len(valset), 'images', flush=True)

    if selective_sampling:
        if not warmup_trainer:
            print(
                'Cannot calculate weights for selective sampling: no model given. Using normal sampling instead',
                flush=True)
            trainloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=batch_size,
                sampler=RandomSampler(trainset),
                num_workers=num_workers,
                collate_fn=u.mammo_collate,
                pin_memory=multiprocessing)
        else:
            print('Getting weights for sampling..', flush=True)
            trainloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=batch_size,
                sampler=SequentialSampler(trainset),
                num_workers=num_workers,
                collate_fn=u.mammo_collate,
                pin_memory=multiprocessing)
            weights = warmup_trainer.predict_dataset(trainloader)
            pkl.dump(
                weights,
                open(
                    os.path.join(checkpoint_dir,
                                 'weights_selective_train.pkl'), 'wb'))
            trainloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=batch_size,
                sampler=WeightedRandomSampler(weights.double(),
                                              sampler_size,
                                              replacement=False),
                num_workers=num_workers,
                collate_fn=u.mammo_collate,
                pin_memory=multiprocessing)

    else:
        trainloader = torch.utils.data.DataLoader(
            trainset,
            batch_size=batch_size,
            sampler=RandomSampler(trainset),
            num_workers=num_workers,
            collate_fn=u.mammo_collate,
            pin_memory=multiprocessing)

    valloader = torch.utils.data.DataLoader(valset,
                                            batch_size=batch_size,
                                            sampler=SequentialSampler(valset),
                                            num_workers=num_workers,
                                            collate_fn=u.mammo_collate,
                                            pin_memory=multiprocessing)

    print('Training loader has', len(trainloader), 'batches', flush=True)
    print('Validation loader has', len(valloader), 'batches', flush=True)
    return trainloader, valloader
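
When selective sampling is enabled above, the warm-up model is first run over the training set in sequential order to produce per-sample weights, and the training loader is then rebuilt with a WeightedRandomSampler over those weights. A minimal, self-contained sketch of that second step on a toy dataset (the weights below are invented for illustration):

import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

# toy dataset of 8 items; pretend the warm-up model scored the last item as most informative
toy_set = TensorDataset(torch.arange(8))
weights = torch.tensor([0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.5], dtype=torch.double)

sampler = WeightedRandomSampler(weights, num_samples=4, replacement=False)
loader = DataLoader(toy_set, batch_size=2, sampler=sampler)

for (batch,) in loader:
    print(batch)  # items are drawn with probability proportional to their weight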
Exemple #19
0
        type=str,
        default=
        '../k_logs/2020-09-24T15-18-33-duration_extractor/2020-09-24_checkpoint_step15000.pth',
        help="Path to checkpoint of convolutional_cacotron model")
    parser.add_argument(
        "--data_folder",
        type=str,
        default='../code/datasets/data/kss',
        help="Where the data live and where to save durations.")
    parser.add_argument("--durations_filename",
                        default='durations.txt',
                        type=str,
                        help="Name of the final durations file.")
    parser.add_argument("--batch_size",
                        default=256,
                        type=int,
                        help="Batch size")
    args = parser.parse_args()

    # Load pretrained checkpoint and extract alignments to data_folder
    m = DurationExtractor().load(args.checkpoint)
    dataset = K_AudioDataset(root=args.data_folder, durations=False)
    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            collate_fn=Collate(m.device),
                            shuffle=False,
                            sampler=SequentialSampler(dataset))

    save_alignments_as_fertilities(m, dataloader, args.data_folder,
                                   args.durations_filename)
def get_sequential_trainloader(toy,
                               rsyncing,
                               batch_size=16,
                               num_workers=os.cpu_count() - 1,
                               data_aug_vec=[0.5, 0.25, 0.5, 0.5],
                               notebook=False):
    """

    :param toy:
    :param rsyncing:
    :param batch_size:
    :param num_workers:
    :param data_aug_vec:
    :param notebook:
    :return:
    """
    num_workers = 0
    if rsyncing:
        print('Rsynced data! (prepare feat)', flush=True)
    else:
        print('Using symbolic links! (prepare feat)', flush=True)
    print('Getting path ready..', flush=True)
    anno_path_train, _, png_path = get_paths(rsyncing, toy, notebook)

    # TODO
    # png_path = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset', 'png')
    # anno_path_train = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset',
    #                                'annotations/mscoco_train_full.json')

    trans_img = torchvision.transforms.Compose([
        t.Normalize(),
        t.BboxCrop(targetsize=224),
        t.RandomFlipImg(prob=data_aug_vec[0]),
        t.RandomGammaImg(prob=data_aug_vec[1], use_normal_distribution=True)
    ])
    trans_bb = torchvision.transforms.Compose([
        t.GetFiveBBs(),
        t.RandomTranslateBB(prob=data_aug_vec[2], pixel_range=10),
        t.RandomScaleBB(prob=data_aug_vec[3], max_percentage=0.1)
    ])

    start_time = time.time()
    print('Creating Coco Dataset..', flush=True)

    trainset = u.dataset_coco(png_path,
                              anno_path_train,
                              transform=trans_img,
                              bbox_transform=trans_bb,
                              for_feature=True)
    print('Training set has', len(trainset), 'images', flush=True)

    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=batch_size,
        sampler=SequentialSampler(trainset),
        num_workers=num_workers,
        collate_fn=u.mammo_collate)
    print('Training loader has', len(trainloader), 'batches', flush=True)

    total_time = time.time() - start_time
    print('Creating Datasets took {:.0f} seconds.'.format(total_time),
          flush=True)

    return trainloader
Exemple #21
0
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    if args.do_finetune:
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        for name, param in model.named_parameters():
            if name.startswith("distilbert.embeddings."):
                param.requires_grad = False
            for i in range(args.freeze_layer):
                if name.startswith("distilbert.transformer.layer.%s." % i):
                    param.requires_grad = False
        return
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
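
The evaluation branch above writes its predictions to a CSV submission with an Id/Predicted header, one row per sorted uuid. A self-contained sketch of that submission-writing step, with a made-up predictions dict and output path:

import csv

eval_preds = {"uuid-2": "answer two", "uuid-1": "answer one"}  # hypothetical predictions
sub_path = "validation_submission.csv"                         # hypothetical output path

with open(sub_path, "w", newline="", encoding="utf-8") as csv_fh:
    csv_writer = csv.writer(csv_fh, delimiter=",")
    csv_writer.writerow(["Id", "Predicted"])
    for uuid in sorted(eval_preds):
        csv_writer.writerow([uuid, eval_preds[uuid]])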
Exemple #22
0
def main():
    # parse command line options
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "experiment",
        nargs='?',
        default="",
        help=
        "Experiment name (sub-folder for this particular run). Default: test")
    parser.add_argument("-data-dir",
                        default='data/maze/',
                        help="Directory where maze data is located")
    parser.add_argument(
        "-output-dir",
        default='data/mapnet',
        help=
        "Output directory where results will be stored (point OverBoard to this location)"
    )
    parser.add_argument("-device",
                        default="cuda:0",
                        help="Device, cpu or cuda")
    parser.add_argument(
        "-data-loaders",
        default=8,
        type=int,
        help="Number of asynchronous worker threads for data loading")
    parser.add_argument("-epochs",
                        default=40,
                        type=int,
                        help="Number of training epochs")
    parser.add_argument("-bs", default=100, type=int, help="Batch size")
    parser.add_argument("-lr", default=1e-3, type=float, help="Learning rate")
    parser.add_argument("--no-bn",
                        dest="bn",
                        action="store_false",
                        help="Disable batch normalization")
    parser.add_argument(
        "-seq-length",
        default=5,
        type=int,
        help=
        "Sequence length for unrolled RNN (longer creates more long-term maps)"
    )
    parser.add_argument("-map-size",
                        default=15,
                        type=int,
                        help="Spatial size of map memory (always square)")
    parser.add_argument(
        "-embedding",
        default=16,
        type=int,
        help="Size of map embedding (vector stored in each map cell)")
    parser.add_argument(
        "--no-improved-padding",
        dest="improved_padding",
        action="store_false",
        help=
        "Disable improved padding, which ensures softmax is only over valid locations and not edges"
    )
    parser.add_argument("-lstm-forget-bias",
                        default=1.0,
                        type=float,
                        help="Initial value for LSTM forget gate")
    parser.add_argument(
        "-max-speed",
        default=0,
        type=int,
        help=
        "If non-zero, only samples trajectories with this maximum spatial distance between steps"
    )
    parser.add_argument(
        "--spawn",
        action="store_true",
        help=
        "Use spawn multiprocessing method, to work around problem with some debuggers (e.g. VSCode)"
    )

    parser.set_defaults(bn=True, improved_padding=True)
    args = parser.parse_args()

    if not t.cuda.is_available(): args.device = 'cpu'

    if args.spawn:  # workaround for vscode debugging
        import torch.multiprocessing as multiprocessing
        multiprocessing.set_start_method('spawn', True)

    if not args.experiment: args.experiment = 'test'

    # complete directory with experiment name
    args.output_dir = (args.output_dir + '/' + args.experiment)

    if os.path.isdir(args.output_dir):
        input(
            'Directory already exists. Press Enter to overwrite or Ctrl+C to cancel.'
        )

    # repeatable random sequences hopefully
    random.seed(0)
    t.manual_seed(0)

    # initialize dataset
    env_size = (21, 21)
    full_set = Mazes(args.data_dir + '/mazes-10-10-100000.txt',
                     env_size,
                     seq_length=args.seq_length,
                     max_speed=args.max_speed)

    (train_set,
     val_set) = t.utils.data.random_split(full_set,
                                          (len(full_set) - 5000, 5000))

    val_loader = DataLoader(val_set,
                            batch_size=10 * args.bs,
                            shuffle=False,
                            num_workers=args.data_loaders)

    # create base CNN and MapNet
    cnn = get_two_layers_cnn(args)
    mapnet = MapNet(cnn=cnn,
                    embedding_size=args.embedding,
                    map_size=args.map_size,
                    lstm_forget_bias=args.lstm_forget_bias,
                    improved_padding=args.improved_padding,
                    orientations=4)

    # use GPU if needed
    device = t.device(args.device)
    mapnet.to(device)

    # create optimizer
    optimizer = t.optim.Adam(mapnet.parameters(), lr=args.lr)

    with Logger(args.output_dir, meta=args) as logger:
        for epoch in range(args.epochs):
            # refresh subset of mazes every epoch
            train_sampler = BatchSampler(RandomSampler(SequentialSampler(
                range(95000)),
                                                       num_samples=10000,
                                                       replacement=True),
                                         batch_size=args.bs,
                                         drop_last=True)
            train_loader = DataLoader(train_set,
                                      batch_sampler=train_sampler,
                                      num_workers=args.data_loaders)

            # training phase
            mapnet.train()
            for inputs in train_loader:
                #with t.autograd.detect_anomaly():

                optimizer.zero_grad()
                loss = batch_forward(inputs, mapnet, 'train', device, args,
                                     logger)

                loss.backward()
                optimizer.step()

                logger.print(prefix='train', line_prefix=f"ep {epoch+1} ")

            # validation phase
            mapnet.eval()
            with t.no_grad():
                for inputs in val_loader:
                    loss = batch_forward(inputs, mapnet, 'val', device, args,
                                         logger)
                    logger.print(prefix='val', line_prefix=f"ep {epoch+1} ")

            logger.append()

            # save state
            state = {
                'epoch': epoch,
                'state_dict': mapnet.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            try:
                os.replace(args.output_dir + "/state.pt",
                           args.output_dir + "/prev_state.pt")
            except OSError:
                pass  # no previous state file to rotate
            t.save(state, args.output_dir + "/state.pt")
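
The epoch loop above rebuilds its batch sampler every epoch so that a fresh subset of 10,000 trajectories is drawn (with replacement) from the first 95,000 mazes; wrapping the range in a SequentialSampler is not strictly necessary, since RandomSampler accepts any sized data source such as a plain range. A minimal sketch of the same per-epoch resampling, with train_set assumed to be any map-style dataset of at least 95,000 items:

from torch.utils.data import BatchSampler, DataLoader, RandomSampler

def make_epoch_loader(train_set, batch_size, num_workers=0):
    """Build a loader over a fresh random subset of 10,000 indices, drawn with replacement."""
    sampler = BatchSampler(
        RandomSampler(range(95000), num_samples=10000, replacement=True),
        batch_size=batch_size,
        drop_last=True,
    )
    return DataLoader(train_set, batch_sampler=sampler, num_workers=num_workers)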
Exemple #23
0
def load_data(args):
    """Load data from here and return.
    Note:
        Compose Composes several transforms together and if augmentation is chosen you compose an additional
        bunch of transforms to be applied to the train data and you send this to the DataTransformer class
        which returns the data set that is used in the data loader. The data loader then takes in this dataset with a
        batch size and sampler. Sampler is defines the strategy to draw samples from the dataset. Here for training
        data random sampling is used and for validation sequential is used. You can also write a custom sampler class
        if you want.
    :param args:
        main_dir (string)       : path to the main directory from the args.
        image_size (int)        : size of the image to be resized.
        transform_prob (float)  : probability to apply transformations on the data.
        batch_size (int)        : batch size to be used in the data loader.
    :return:
        the train loader and validation loader to be used for training and validating.
    """
    # get data set file path
    data_path = os.path.join(args.main_dir, 'data', 'train-volume.tif')
    labels_path = os.path.join(args.main_dir, 'data', 'train-labels.tif')

    # compose the transforms for the train set
    train_data = Compose([Resize(args.image_size), ToTensor()])

    # choose between augmentations for train data
    if args.augment:
        train_augment = augmentations(args)
        train_transform = DataTransformer(data_path,
                                          labels_path,
                                          image_transform=train_data,
                                          image_augmentation=train_augment)

    else:
        # transforming the train data and returning a 4D tensor
        train_transform = DataTransformer(data_path,
                                          labels_path,
                                          image_transform=train_data,
                                          image_augmentation=None)

    # transform for validation data
    val_data = Compose([Resize(args.image_size), ToTensor()])
    val_transform = DataTransformer(data_path,
                                    labels_path,
                                    image_transform=val_data,
                                    image_augmentation=None)

    # split the train and validation indices
    train_indices, validation_indices = train_test_split(range(
        len(train_transform)),
                                                         test_size=0.15)

    # call the sampler for the train and validation data
    train_samples = RandomSampler(train_indices)
    validation_samples = SequentialSampler(validation_indices)

    # load train and validation data
    train_loader = DataLoader(train_transform,
                              batch_size=args.batch_size,
                              sampler=train_samples)
    val_loader = DataLoader(val_transform,
                            batch_size=args.batch_size,
                            sampler=validation_samples)

    return train_loader, val_loader
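
As the docstring notes, a custom sampling strategy only requires a class with __iter__ and __len__ (conventionally a torch.utils.data.Sampler subclass), which can then be passed to DataLoader exactly like the RandomSampler and SequentialSampler used above. A minimal sketch that yields indices in reverse order:

import torch
from torch.utils.data import DataLoader, Sampler, TensorDataset

class ReverseSampler(Sampler):
    """Yield dataset indices from last to first."""

    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        return iter(range(len(self.data_source) - 1, -1, -1))

    def __len__(self):
        return len(self.data_source)

dataset = TensorDataset(torch.arange(6))
loader = DataLoader(dataset, batch_size=3, sampler=ReverseSampler(dataset))
for (batch,) in loader:
    print(batch)  # tensor([5, 4, 3]) then tensor([2, 1, 0])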
Exemple #24
0
def mnist_classifier_crossentropyloss():
    # paths
    path = dict()
    path['project'] = os.path.dirname(os.path.abspath(__file__))
    path['state'] = os.path.join(path['project'], 'epoch')
    path['dataset'] = os.path.join(path['project'], 'dataset')
    path['graph'] = os.path.join(path['project'], 'graph')
    path['array'] = os.path.join(path['project'], 'array')
    for key, value in path.items():
        if not os.path.exists(path[key]):
            os.mkdir(path[key])

    # parameters
    batch_size = 1000
    number_of_epochs = 20
    learning_rate = 1e-3
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    mean = 0.1307
    std = 0.3081
    loss = nn.CrossEntropyLoss()
    info_per_batch = 6
    validation_ratio = 0.1

    # transform
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=(mean, ), std=(std, ))
    ])

    # dataset
    train_dataset = torchvision.datasets.MNIST(root=path['dataset'],
                                               train=True,
                                               transform=transform,
                                               download=True)
    test_dataset = torchvision.datasets.MNIST(root=path['dataset'],
                                              train=False,
                                              transform=transform,
                                              download=True)

    # validation dataset
    validation_limit = int((1 - validation_ratio) * len(train_dataset))
    index_list = list(range(len(train_dataset)))
    train_indexes = index_list[:validation_limit]
    validation_indexes = index_list[validation_limit:]
    train_sampler = SubsetRandomSampler(train_indexes)
    validation_sampler = SequentialSampler(validation_indexes)

    # dataset loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                    batch_size=batch_size,
                                                    sampler=validation_sampler)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size)

    # model
    model = MnistClassifierCrossEntropyLoss().to(device)

    # optimizer
    optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

    epochs = np.arange(start=1, stop=(number_of_epochs + 1), step=1, dtype=int)

    print('Mnist Classifier CrossEntropyLoss')
    train_losses = []
    train_accuracies = []
    validation_losses = []
    validation_accuracies = []
    test_losses = []
    test_accuracies = []
    for epoch in epochs:
        info = 'Epoch {epoch_index}/{number_of_epochs}'
        print(info.format(epoch_index=epoch,
                          number_of_epochs=number_of_epochs))

        # train
        train_loss, train_accuracy = train(model=model,
                                           device=device,
                                           loader=train_loader,
                                           optimizer=optimizer,
                                           loss=loss,
                                           info_per_batch=info_per_batch)
        info = 'Train: Average Loss: {train_loss:.5f}, Accuracy: % {train_accuracy:.2f}'
        print(
            info.format(train_loss=train_loss,
                        train_accuracy=(100 * train_accuracy)))
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # validation
        validation_loss, validation_accuracy = test(
            model=model,
            loader=validation_loader,
            device=device,
            loss=loss,
            info_per_batch=info_per_batch,
            info_name='Validation')
        info = 'Validation: Average Loss: {validation_loss:.5f}, Accuracy: % {validation_accuracy:.2f}'
        print(
            info.format(validation_loss=validation_loss,
                        validation_accuracy=(100 * validation_accuracy)))
        validation_losses.append(validation_loss)
        validation_accuracies.append(validation_accuracy)

        # test
        test_loss, test_accuracy = test(model=model,
                                        loader=test_loader,
                                        device=device,
                                        loss=loss,
                                        info_per_batch=info_per_batch,
                                        info_name='Test')
        info = 'Test: Average Loss: {test_loss:.5f}, Accuracy: % {test_accuracy:.2f}'
        print(
            info.format(test_loss=test_loss,
                        test_accuracy=(100 * test_accuracy)))
        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)

        # epoch state
        state_file_name = 'mnist_classifier_crossentropyloss_epoch_{epoch_index}.pkl'.format(
            epoch_index=epoch)
        save_state(model=model,
                   directory=path['state'],
                   file_name=state_file_name)

    # train loss
    save_data(array=train_losses,
              directory=path['array'],
              file_name='mnist_classifier_crossentropyloss_train_loss.npy')
    draw_line_graph(
        x=epochs,
        y=train_losses,
        x_label='Epoch',
        y_label='Loss',
        title='Mnist Classifier CrossEntropyLoss Train Loss',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_train_loss.png')

    # train accuracy
    save_data(array=train_accuracies,
              directory=path['array'],
              file_name='mnist_classifier_crossentropyloss_train_accuracy.npy')
    draw_line_graph(
        x=epochs,
        y=train_accuracies,
        x_label='Epoch',
        y_label='Accuracy',
        title='Mnist Classifier CrossEntropyLoss Train Accuracy',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_train_accuracy.png')

    # validation loss
    save_data(
        array=validation_losses,
        directory=path['array'],
        file_name='mnist_classifier_crossentropyloss_validation_loss.npy')
    draw_line_graph(
        x=epochs,
        y=validation_losses,
        x_label='Epoch',
        y_label='Loss',
        title='Mnist Classifier CrossEntropyLoss Validation Loss',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_validation_loss.png')

    # validation accuracy
    save_data(
        array=validation_accuracies,
        directory=path['array'],
        file_name='mnist_classifier_crossentropyloss_validation_accuracy.npy')
    draw_line_graph(
        x=epochs,
        y=validation_accuracies,
        x_label='Epoch',
        y_label='Accuracy',
        title='Mnist Classifier CrossEntropyLoss Validation Accuracy',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_validation_accuracy.png')

    # test loss
    save_data(array=test_losses,
              directory=path['array'],
              file_name='mnist_classifier_crossentropyloss_test_loss.npy')
    draw_line_graph(
        x=epochs,
        y=test_losses,
        x_label='Epoch',
        y_label='Loss',
        title='Mnist Classifier CrossEntropyLoss Test Loss',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_test_loss.png')

    # test accuracy
    save_data(array=test_accuracies,
              directory=path['array'],
              file_name='mnist_classifier_crossentropyloss_test_accuracy.npy')
    draw_line_graph(
        x=epochs,
        y=test_accuracies,
        x_label='Epoch',
        y_label='Accuracy',
        title='Mnist Classifier CrossEntropyLoss Test Accuracy',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_test_accuracy.png')

    # loss
    draw_multi_lines_graph(
        lines=[
            dict(label='Train', data=dict(x=epochs, y=train_losses)),
            dict(label='Validation', data=dict(x=epochs, y=validation_losses)),
            dict(label='Test', data=dict(x=epochs, y=test_losses))
        ],
        x_label='Epoch',
        y_label='Loss',
        title='Mnist Classifier CrossEntropyLoss Loss',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_loss.png')

    # accuracy
    draw_multi_lines_graph(
        lines=[
            dict(label='Train', data=dict(x=epochs, y=train_accuracies)),
            dict(label='Validation',
                 data=dict(x=epochs, y=validation_accuracies)),
            dict(label='Test', data=dict(x=epochs, y=test_accuracies))
        ],
        x_label='Epoch',
        y_label='Accuracy',
        title='Mnist Classifier CrossEntropyLoss Accuracy',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_accuracy.png')
Exemple #25
0
def main():
    args = parser.parse_args()

    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')

    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    if not args.all_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = 1
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder

    model = init_network(model_params)

    if args.load_state_dict_path is not None:
        if args.load_state_dict_path == 'use-img-level-densenet-ckpt':
            model_dir = '../output/models/densenet121_1024_all_data__obvious_neg__gradaccum_20__start_lr_3e6'
            pretrained_ckpt_path = os.path.join(f'{model_dir}',
                                                f'fold{args.fold}',
                                                'final.pth')
        else:
            pretrained_ckpt_path = args.load_state_dict_path
        init_pretrained = torch.load(pretrained_ckpt_path)
        if args.load_as_is:
            model.load_state_dict(init_pretrained['state_dict'])
        else:
            model.load_state_dict({
                key: (val if key not in {'logit.weight', 'logit.bias'} else
                      torch.rand([1, 1024] if key == 'logit.weight' else [1]))
                for key, val in init_pretrained['state_dict'].items()
            })
            torch.nn.init.xavier_uniform_(model.logit.weight)

    if args.all_gpus:
        model = DataParallel(model)
    model.cuda()

    # define loss function (criterion)
    try:
        criterion = eval(args.loss)().cuda()
    except Exception:
        raise RuntimeError("Loss {} not available!".format(args.loss))

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_val_pr_auc_score = 0

    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except Exception:
        raise RuntimeError("Scheduler {} not available!".format(args.scheduler))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # Data loading code
    train_transform = train_multi_augment2

    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]

    train_df = get_train_df_ohe(clean_from_duplicates=True)
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }

    public_hpa_df_17 = get_public_df_ohe(clean_from_duplicates=True)
    public_basepath_2_ohe_vector = {
        img_path: vec
        for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                 public_hpa_df_17.iloc[:, 2:].values)
    }
    basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)

    available_paths = set(
        np.concatenate((train_df['img_base_path'].values,
                        public_hpa_df_17['img_base_path'].values)))
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]
    labels_df = pd.read_hdf(args.cell_level_labels_path)

    # correcting minority-class (Mitotic spindle) labels
    cherrypicked_mitotic_spindle = pd.read_csv(
        '../input/mitotic_cells_selection.csv')

    cherrypicked_mitotic_spindle_img_cell = set(
        cherrypicked_mitotic_spindle[['ID', 'cell_i']].apply(tuple,
                                                             axis=1).values)

    cherrypicked_mitotic_spindle_img_cell = {
        (img, cell_i - 1)
        for img, cell_i in cherrypicked_mitotic_spindle_img_cell
    }

    class_names = get_class_names()
    mitotic_spindle_class_i = class_names.index('Mitotic spindle')

    cherrypicked_mitotic_spindle_based_on_nn = pd.read_csv(
        '../input/mitotic_pos_nn_added.csv')
    cherrypicked_mitotic_spindle_img_cell.update(
        set(cherrypicked_mitotic_spindle_based_on_nn[['ID', 'cell_i'
                                                      ]].apply(tuple,
                                                               axis=1).values))
    mitotic_bool_idx = labels_df.index.isin(
        cherrypicked_mitotic_spindle_img_cell)

    negative_img_ids_cell = labels_df.index[np.logical_not(
        mitotic_bool_idx)].values

    dfs = []
    for pred_fold in range(5):
        dfs.append(pd.read_csv(f'../output/mitotic_pred_fold_{pred_fold}.csv'))
    pred_df = pd.concat(dfs)
    pred_df.set_index(['ID', 'cell_i'], inplace=True)
    positive_img_ids_cell = pred_df.index[pred_df['pred'] < 0.6].values

    if args.ignore_negative:
        raise NotImplementedError

    train_dataset = ProteinMitoticDatasetCellSeparateLoading(
        trn_img_paths,
        positive_img_ids_cell,
        negative_img_ids_cell,
        in_channels=args.in_channels,
        transform=train_transform,
        target_raw_img_size=args.target_raw_img_size)
    train_loader = DataLoader(
        train_dataset,
        sampler=MitoticBalancingSubSampler(train_dataset.img_ids_cell,
                                           train_dataset.id_cell_2_y),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    valid_dataset = ProteinMitoticDatasetCellSeparateLoading(
        val_img_paths,
        positive_img_ids_cell,
        sample(list(negative_img_ids_cell), 10000),
        img_size=args.img_size,
        in_channels=args.in_channels,
        target_raw_img_size=args.target_raw_img_size)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch    iter      rate     |  train_loss/acc  |    valid_loss/acc/pr_auc/---     |best_epoch/best_pr_auc|  min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )
    start_epoch += 1

    if args.eval_at_start:
        with torch.no_grad():
            valid_loss, valid_acc, val_pr_auc_score = validate(
                valid_loader, model, criterion, -1, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f %6.1f  |    %6.4f  %6.4f   | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, val_pr_auc_score, -1,
                   best_epoch, -1, -1))

    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)

        with torch.no_grad():
            valid_loss, valid_acc, val_pr_auc_score = validate(
                valid_loader, model, criterion, epoch, log)

        # remember best loss and save checkpoint
        is_best = val_pr_auc_score > best_val_pr_auc_score
        best_loss = min(valid_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_val_pr_auc_score = val_pr_auc_score if is_best else best_val_pr_auc_score

        print('\r', end='', flush=True)
        log.write('%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f  %6.1f |  %6.4f  %6.4f | %3.1f min \n' % \
                  (epoch, iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc, val_pr_auc_score, -1,
                   best_epoch, best_val_pr_auc_score, (time.time() - end) / 60))

        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_val_pr_auc_score)
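Note: the validation loader in this example uses SequentialSampler so that every validation pass visits the cells in the same order, which keeps the PR-AUC comparable across epochs. A minimal, self-contained sketch of that pattern (the tensors and sizes below are invented for illustration):

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

# toy stand-in for a validation set
toy_features = torch.randn(16, 4)
toy_labels = torch.randint(0, 2, (16,))
toy_dataset = TensorDataset(toy_features, toy_labels)

# SequentialSampler yields indices 0..len(dataset)-1 in order,
# so batches are identical on every pass
val_loader = DataLoader(toy_dataset,
                        sampler=SequentialSampler(toy_dataset),
                        batch_size=4,
                        drop_last=False)

for features, labels in val_loader:
    print(features.shape, labels.shape)  # torch.Size([4, 4]) torch.Size([4])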
Exemple #26
0
    def learn_step(self, updater, network, next_obs, next_internals):
        r = self.exp_cache.read()
        device = r.rewards[0].device
        rollout_len = self.exp_cache.rollout_len

        # estimate value of next state
        with torch.no_grad():
            pred, _, _ = network(next_obs, next_internals)
            last_values = pred['critic'].squeeze(-1).data

        # calc nsteps
        gae = 0.
        next_values = last_values
        gae_returns = []
        for i in reversed(range(rollout_len)):
            rewards = r.rewards[i]
            terminal_mask = 1. - r.terminals[i].float()
            current_values = r.values[i].squeeze(-1)
            # generalized advantage estimation
            delta_t = rewards + self.discount * next_values.data * terminal_mask - current_values
            gae = gae * self.discount * self.gae_discount * terminal_mask + delta_t
            gae_returns.append(gae + current_values)
            next_values = current_values.data
        gae_returns = torch.stack(list(reversed(gae_returns))).data

        # Convert to torch tensors of [seq, num_env]
        old_values = torch.stack(r.values).squeeze(-1)
        adv_targets_batch = (gae_returns - old_values).data
        old_log_probs_batch = torch.stack(r.log_probs).data
        # keep a copy of terminals on the CPU, where indexing is faster
        rollout_terminals = torch.stack(r.terminals).cpu().numpy()

        # Normalize advantage
        if self.normalize_advantage:
            adv_targets_batch = (adv_targets_batch - adv_targets_batch.mean()) / \
                                (adv_targets_batch.std() + 1e-5)

        for e in range(self.nb_rollout_epoch):
            # setup minibatch iterator
            minibatch_inds = list(
                BatchSampler(SequentialSampler(range(rollout_len)),
                             self.rollout_minibatch_len,
                             drop_last=False))
            # randomize sequences to sample NOTE: in-place operation
            np.random.shuffle(minibatch_inds)
            for i in minibatch_inds:
                # TODO: detach internals, no_grad in compute_action_exp takes care of this
                starting_internals = {
                    k: ts[i[0]].unbind(0)
                    for k, ts in r.internals.items()
                }
                gae_return = gae_returns[i]
                old_log_probs = old_log_probs_batch[i]
                sampled_actions = [r.actions[x] for x in i]
                batch_obs = [r.observations[x] for x in i]
                # needs to be seq, batch, broadcast dim
                adv_targets = adv_targets_batch[i].unsqueeze(-1)
                terminals_batch = rollout_terminals[i]

                # forward pass
                cur_log_probs, cur_values, entropies = self.act_batch(
                    network, batch_obs, terminals_batch, sampled_actions,
                    starting_internals, device)
                value_loss = 0.5 * torch.mean((cur_values - gae_return).pow(2))

                # calculate surrogate loss
                surrogate_ratio = torch.exp(cur_log_probs - old_log_probs)
                surrogate_loss = surrogate_ratio * adv_targets
                surrogate_loss_clipped = torch.clamp(
                    surrogate_ratio,
                    min=1 - self.policy_clipping,
                    max=1 + self.policy_clipping) * adv_targets
                policy_loss = torch.mean(
                    -torch.min(surrogate_loss, surrogate_loss_clipped))
                entropy_loss = torch.mean(-self.entropy_weight * entropies)

                losses = {
                    'value_loss': value_loss,
                    'policy_loss': policy_loss,
                    'entropy_loss': entropy_loss
                }
                total_loss = torch.sum(
                    torch.stack(tuple(loss for loss in losses.values())))

                updater.step(total_loss)

        # TODO: metrics: average loss, policy % change, loss over epochs?, value change
        metrics = {'advantage': torch.mean(adv_targets_batch)}
        return losses, metrics
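The rollout update above chains generalized advantage estimation with PPO-style clipped minibatch updates drawn through BatchSampler(SequentialSampler(...)). A toy sketch of just the GAE recursion, using invented reward/value tensors shaped [rollout_len, num_env], may make the backward pass over time steps easier to follow:

import torch

# invented toy rollout: 3 steps, 2 environments
rewards = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
values = torch.tensor([[0.5, 0.2], [0.4, 0.6], [0.3, 0.3]])
terminals = torch.tensor([[0.0, 0.0], [0.0, 1.0], [0.0, 0.0]])  # 1 marks an episode end
last_values = torch.tensor([0.2, 0.1])  # bootstrap value of the state after the rollout
discount, gae_discount = 0.99, 0.95

gae = torch.zeros(2)
next_values = last_values
gae_returns = []
for t in reversed(range(rewards.shape[0])):
    mask = 1.0 - terminals[t]
    delta = rewards[t] + discount * next_values * mask - values[t]
    gae = delta + discount * gae_discount * mask * gae
    gae_returns.append(gae + values[t])
    next_values = values[t]
gae_returns = torch.stack(list(reversed(gae_returns)))
print(gae_returns.shape)  # torch.Size([3, 2])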
Exemple #27
0
def main():
    args = parser.parse_args()

    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')

    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder
    model = init_network(model_params)

    if args.load_state_dict_path is not None:
        init_pretrained = torch.load(args.load_state_dict_path)
        model.load_state_dict(init_pretrained['state_dict'])
    # state_dict = model.state_dict()
    # torch.save({
    #     'state_dict': state_dict
    # }, '../output/densenet121_bestfitting_converted_classes.h5')
    # sys.exit(0)
    # move network to gpu
    # model = DataParallel(model)

    if args.clip_and_replace_grad_explosures:

        def clip_and_replace_explosures(grad):
            grad[torch.logical_or(
                torch.isnan(grad),
                torch.isinf(grad))] = torch.tensor(0.0).cuda()
            grad = torch.clamp(grad, -0.5, 0.5)
            return grad

        for param in model.parameters():
            if param.requires_grad:
                param.register_hook(clip_and_replace_explosures)
    model.cuda()

    # define loss function (criterion)
    try:
        criterion = eval(args.loss)().cuda()
    except Exception:
        raise RuntimeError("Loss {} not available!".format(args.loss))

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_map = 0

    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except Exception:
        raise RuntimeError("Scheduler {} not available!".format(args.scheduler))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # optionally resume from a checkpoint
    if args.resume:
        # args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))

            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            best_map = checkpoint['best_score']
            model.load_state_dict(checkpoint['state_dict'])

            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))

    # Data loading code
    train_transform = train_multi_augment2

    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]

    train_df = get_train_df_ohe(clean_from_duplicates=args.clean_duplicates,
                                clean_mitotic=args.clean_mitotic_samples,
                                clean_aggresome=args.clean_aggresome)
    if args.ignore_negs:
        train_df['Negative'] = 0

    train_paths_set = set(train_df['img_base_path'])

    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }

    if not args.without_public_data:
        public_hpa_df_17 = get_public_df_ohe(
            clean_from_duplicates=args.clean_duplicates,
            clean_mitotic=args.clean_mitotic_samples,
            clean_aggresome=args.clean_aggresome)
        if args.ignore_negs:
            public_hpa_df_17['Negative'] = 0
        public_basepath_2_ohe_vector = {
            img_path: vec
            for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                     public_hpa_df_17.iloc[:, 2:].values)
        }
        basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)
    else:
        trn_img_paths = [
            path for path in trn_img_paths if path in train_paths_set
        ]

    if not args.without_public_data:
        available_paths = set(
            np.concatenate((train_df['img_base_path'].values,
                            public_hpa_df_17['img_base_path'].values)))
    else:
        available_paths = set(train_df['img_base_path'].values)
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]

    if args.copy_paste_augment_mitotic_aggresome:
        train_ids = {os.path.basename(x) for x in trn_img_paths}
        id_2_ohe_vector = {
            os.path.basename(path): ohe
            for path, ohe in basepath_2_ohe_vector.items()
        }

        cherrypicked_mitotic_spindle = pd.read_csv(
            '../input/mitotic_cells_selection.csv')
        cherrypicked_mitotic_spindle = cherrypicked_mitotic_spindle[
            cherrypicked_mitotic_spindle['ID'].isin(train_ids)]

        cherrypicked_aggresome = pd.read_csv(
            '../input/aggressome_cells_selection.csv')
        cherrypicked_aggresome = cherrypicked_aggresome[
            cherrypicked_aggresome['ID'].isin(train_ids)]

        cherrypicked_mitotic_spindle['ohe'] = cherrypicked_mitotic_spindle[
            'ID'].map(id_2_ohe_vector)
        cherrypicked_aggresome['ohe'] = cherrypicked_aggresome['ID'].map(
            id_2_ohe_vector)

        mitotic_idx = [
            idx for idx, colname in enumerate(train_df.columns)
            if colname == 'Mitotic spindle'
        ][0]
        aggresome_idx = [
            idx for idx, colname in enumerate(train_df.columns)
            if colname == 'Aggresome'
        ][0]
        mitotic_ohe = np.zeros_like(cherrypicked_aggresome['ohe'].values[0])
        mitotic_ohe[mitotic_idx] = 1

        aggresome_ohe = np.zeros_like(cherrypicked_aggresome['ohe'].values[0])
        aggresome_ohe[aggresome_idx] = 1

        pure_mitotic_mask = cherrypicked_mitotic_spindle['is_pure'] == 1
        cherrypicked_mitotic_spindle.loc[pure_mitotic_mask, 'ohe'] = pd.Series(
            [mitotic_ohe for _ in range(pure_mitotic_mask.sum())],
            index=cherrypicked_mitotic_spindle.index[pure_mitotic_mask])

        pure_aggresome_mask = cherrypicked_aggresome['is_pure'] == 1
        cherrypicked_aggresome.loc[pure_aggresome_mask, 'ohe'] = pd.Series(
            [aggresome_ohe for _ in range(pure_aggresome_mask.sum())],
            index=cherrypicked_aggresome.index[pure_aggresome_mask])

        class_purity_2_weight = {1: 4, 0: 1}
        cherrypicked_mitotic_spindle[
            'sampling_weight'] = cherrypicked_mitotic_spindle['is_pure'].map(
                class_purity_2_weight)
        cherrypicked_aggresome['sampling_weight'] = cherrypicked_aggresome[
            'is_pure'].map(class_purity_2_weight)
    else:
        cherrypicked_mitotic_spindle = None
        cherrypicked_aggresome = None

    train_dataset = ProteinDatasetImageLevel(
        trn_img_paths,
        basepath_2_ohe=basepath_2_ohe_vector,
        img_size=args.img_size,
        is_trainset=True,
        return_label=True,
        in_channels=args.in_channels,
        transform=train_transform,
        cherrypicked_mitotic_spindle_df=cherrypicked_mitotic_spindle,
        cherrypicked_aggresome_df=cherrypicked_aggresome)

    class_names = get_class_names()
    if args.balance_classes:
        sampler = BalancingSubSampler(trn_img_paths,
                                      basepath_2_ohe_vector,
                                      class_names,
                                      required_class_count=1500)
    else:
        sampler = RandomSampler(train_dataset)

    train_loader = DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=args.batch_size,
        drop_last=True,
        num_workers=args.workers,
        pin_memory=True,
    )

    # val_img_paths = [path for path in val_img_paths if path in train_paths_set]

    valid_dataset = ProteinDatasetImageLevel(
        val_img_paths,
        basepath_2_ohe=basepath_2_ohe_vector,
        img_size=args.img_size,
        is_trainset=True,
        return_label=True,
        in_channels=args.in_channels,
        transform=train_transform)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    focal_loss = FocalLoss().cuda()
    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch    iter      rate     |  train_loss/acc  |    valid_loss/acc/focal/map     |best_epoch/best_map|  min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )
    start_epoch += 1

    if args.eval_at_start:
        with torch.no_grad():
            valid_loss, valid_acc, valid_focal_loss, valid_map = validate(
                valid_loader, model, criterion, -1, focal_loss, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f %6.4f    |  %6.1f    %6.4f   | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, valid_focal_loss, valid_map,
             best_epoch, best_map, -1))

    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)
        if np.isnan(train_loss):
            log.write('!! NaN train loss at epoch %d\n' % epoch)

        with torch.no_grad():
            valid_loss, valid_acc, valid_focal_loss, valid_map = validate(
                valid_loader, model, criterion, epoch, focal_loss, log)

        # remember best loss and save checkpoint
        is_best = valid_map > best_map
        best_loss = min(valid_focal_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_map = valid_map if is_best else best_map

        print('\r', end='', flush=True)
        log.write(
            '%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f %6.4f    |  %6.1f    %6.4f   | %3.1f min \n' % \
            (epoch, iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc, valid_focal_loss, valid_map,
             best_epoch, best_map, (time.time() - end) / 60))

        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_map)
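BalancingSubSampler (and MitoticBalancingSubSampler in the previous example) are project-specific samplers and are not shown here; as a rough, hypothetical stand-in, torch's built-in WeightedRandomSampler can oversample rare classes from per-sample weights:

import numpy as np
import torch
from torch.utils.data import WeightedRandomSampler

# invented one-hot labels: column 0 is a rare class
labels = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]])
class_counts = labels.sum(axis=0)                      # [1, 5]
class_weights = 1.0 / np.maximum(class_counts, 1)      # rare classes get larger weight
sample_weights = (labels * class_weights).max(axis=1)  # weight of each sample

sampler = WeightedRandomSampler(weights=torch.as_tensor(sample_weights, dtype=torch.double),
                                num_samples=len(sample_weights),
                                replacement=True)
print(list(sampler))  # indices drawn with replacement; rare sample 0 appears more often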
Exemple #28
0
def ablation(args, subset, model, checkpoint, dataset):
    logger.info(
        'Beginning ablation study for subset {} on model checkpoint {}'.format(
            subset, checkpoint))

    # to speed up computation, load a mapping dict for good edges: keys are sender nodes, values are lists of receiver nodes
    good_edge_connections = model.good_edge_connections
    sender_good = {}
    for kg in good_edge_connections.keys():
        id1, id2 = kg
        if id1 not in sender_good:
            sender_good[id1] = [id2]
        else:
            sender_good[id1].append(id2)

    all_edge_connections = model.all_edge_connections

    # create the ablation files in their own directory
    ablation_dir = os.path.join(args.output_dir, 'ablation_{}'.format(subset))

    if not os.path.exists(ablation_dir):
        os.makedirs(ablation_dir)

    ablation_filename = os.path.join(ablation_dir,
                                     'checkpoint_{}.txt'.format(checkpoint))

    # load in tokenizer for the myind_to_word dict to help translate the ids back to words
    my_tokenizer = MyTokenizer.load_tokenizer(args, evaluating=True)
    myind_to_word = my_tokenizer.myind_to_word

    # load in lines one at a time
    train_sampler = SequentialSampler(dataset)
    train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=1)

    with open(ablation_filename, 'w') as af:

        # for each question in the subset
        for batch_ind, batch in enumerate(train_dataloader):
            model.eval()

            # get batch
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'input_mask': batch[1],
                'labels': batch[2],
            }

            _, softmaxed_scores = model(training=False, **inputs)

            inputs = {k: v.squeeze() for k, v in inputs.items()}

            # calculate prediction
            prediction = torch.argmax(softmaxed_scores, dim=1)

            input_ids = inputs['input_ids']
            input_masks = inputs['input_mask']

            # print out the question with the prediction, prediction score, and the correct label
            # get up until the index they are all the same
            change_index_list = [
                input_ids[1, i] == input_ids[2, i] == input_ids[3, i] ==
                input_ids[0, i] for i in range(input_ids.shape[1])
            ]
            if False not in change_index_list:
                continue

            change_index = change_index_list.index(False)

            # find the first padding token in each row; it separates the answers
            pad_indices = [
                input_masks[i, :].tolist().index(0)
                for i in range(input_masks.shape[0])
            ]

            if not all([pi > change_index for pi in pad_indices]):
                continue

            question_ids = input_ids[1, :change_index]
            answers_ids = [
                input_ids[i, change_index:pad_ind]
                for i, pad_ind in zip(range(input_ids.shape[0]), pad_indices)
            ]

            question_text = ' '.join(
                [myind_to_word[qi.item()] for qi in question_ids])

            # get all answer features to display
            answers_text = [
                ' '.join([myind_to_word[ai.item()] for ai in answer_id])
                for answer_id in answers_ids
            ]
            answer_choice_text = ['A.', 'B.', 'C.', 'D.']
            correct_label = [
                ' ' if lab == 0 else '*' for lab in inputs['labels']
            ]
            predicted_label = [' '] * 4
            predicted_label[prediction.item()] = '#'
            softmaxed_scores = [
                round(ss, 3) for ss in softmaxed_scores.squeeze().tolist()
            ]

            assert len(correct_label) == len(predicted_label) == len(
                softmaxed_scores) == len(answer_choice_text) == len(
                    answers_text)

            answer_features = list(
                map(
                    tuple,
                    zip(correct_label, predicted_label, softmaxed_scores,
                        answer_choice_text, answers_text)))

            # print out the batch_ind then the question text then newline
            af.write('{}. {}\n'.format(batch_ind + 1, question_text))
            # print out a * for correct answer, # for prediction, rounded softmaxed score, and then answer text for each of four options
            for (cl, pl, ss, act, at) in answer_features:
                af.write('{} {} {} {}{}\n'.format(cl, pl, ss, act, at))

            af.write('\n')

            ## print out the best connections for the subsetted graph
            # get all unique ids
            unique_ids = torch.unique(input_ids)

            # get all first neighbor connections within good and all
            relevant_best_connections = {}
            for ui in unique_ids:
                ui = ui.item()
                if ui in sender_good and sender_good[ui]:
                    for id2 in sender_good[ui]:
                        if id2 == ui:
                            continue
                        assert (ui, id2) in good_edge_connections
                        assert (ui, id2) in all_edge_connections

                        relevant_best_connections[(
                            ui,
                            id2)] = good_edge_connections[(ui, id2)] / float(
                                all_edge_connections[(ui, id2)])

            # print out the top n=10 connections by ratio
            best_connections = [(k, v)
                                for k, v in relevant_best_connections.items()]
            best_connections.sort(key=lambda t: t[1], reverse=True)

            num_to_print = min(10, len(best_connections))
            for i in range(num_to_print):
                id1, id2 = best_connections[i][0]
                val = best_connections[i][1]
                word1 = myind_to_word[id1]
                word2 = myind_to_word[id2]

                af.write(
                    'The connection between "{}" and "{}" has value {}.\n'.
                    format(word1, word2, round(val, 3)))

            af.write('\n\n')

    return -1
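The good/all edge bookkeeping in the ablation loop reduces to a ratio per (sender, receiver) pair, sorted descending. The standalone sketch below isolates that bookkeeping; the dictionaries are invented toy counts, not the model's real good_edge_connections / all_edge_connections attributes:

from collections import defaultdict

# toy co-occurrence counts keyed by (sender_id, receiver_id)
good_edge_connections = {(1, 2): 3, (1, 3): 1, (2, 3): 4}
all_edge_connections = {(1, 2): 4, (1, 3): 10, (2, 3): 5}

# sender -> list of receivers, as in the ablation loop
sender_good = defaultdict(list)
for sender, receiver in good_edge_connections:
    sender_good[sender].append(receiver)

# ratio of "good" occurrences to all occurrences per edge, highest first
ratios = {edge: good_edge_connections[edge] / float(all_edge_connections[edge])
          for edge in good_edge_connections}
for edge, ratio in sorted(ratios.items(), key=lambda kv: kv[1], reverse=True):
    print(edge, round(ratio, 3))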
def predict(model_args, predict_args):
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    # logger = create_logger(name="predict_prod", save_dir=train_args.output_dir)
    logger.info("Predict parameters %s", predict_args)

    # Prepare prod-ext task
    labels = get_labels(predict_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    if model_args.use_crf:
        model = BertCRFForTagging.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
            tagging_schema="BIO"
        )
    else:
        model = BertForTagging.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir
        )

    device = torch.device(
                "cuda"
                if (not predict_args.no_cuda and torch.cuda.is_available())
                else "cpu"
            )
    model = model.to(device)

    # load test dataset
    test_dataset = ProdDataset(
        data_file=predict_args.input_file,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=predict_args.max_seq_length,
        overwrite_cache=predict_args.overwrite_cache,
    )

    sampler = SequentialSampler(test_dataset)
    data_loader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=predict_args.batch_size,
        collate_fn=default_data_collator
    )

    logger.info("***** Running Prediction *****")
    logger.info("  Num examples = {}".format(len(data_loader.dataset)))
    logger.info("  Batch size = {}".format(predict_args.batch_size))

    model.eval()

    all_preds = []
    for inputs in tqdm(data_loader, desc="Predicting"):
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)
        with torch.no_grad():
            outputs = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                token_type_ids=inputs['token_type_ids']
            )
            logits = outputs[0]

        preds = model.decode(logits, inputs['decoder_mask'].bool())
        preds_list = [[label_map[x] for x in seq] for seq in preds]

        all_preds += preds_list

    write_predictions(
        predict_args.input_file,
        predict_args.output_file,
        all_preds
    )
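Because the loader above is built with SequentialSampler, the order of all_preds matches the order of examples in the input file, which is what lets write_predictions line them back up. A minimal sketch of that order-preserving inference loop, with a made-up dataset and a stand-in linear model instead of the BERT tagger:

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

# toy "test set" and a stand-in model; the real code uses ProdDataset and a BERT tagger
dataset = TensorDataset(torch.randn(8, 5))
model = torch.nn.Linear(5, 3).eval()
loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=4)

all_preds = []
with torch.no_grad():
    for (features,) in loader:
        logits = model(features)
        all_preds += logits.argmax(dim=-1).tolist()

# SequentialSampler guarantees all_preds[i] corresponds to dataset[i]
print(all_preds)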
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)

    #### Change Made By Xuran Wang: Comment out original lines #######

    # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    #### Change End #######

    #### Change Made By Xuran Wang: Add custom lines #######

    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')
    finetuned_model_path = 'save/baseline-01/'

    #### Change End #######

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")

        #### Change Made By Xuran Wang: Add custom lines #######

        checkpoint_path = os.path.join(finetuned_model_path, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)

        #### Change End #######
        # if args.reinit_pooler:
        #     encoder_temp = getattr(model, "distilbert")  # Equivalent to model.distilbert
        #     encoder_temp.pooler.dense.weight.data.normal_(mean=0.0, std=encoder_temp.config.initializer_range)
        #     encoder_temp.pooler.dense.bias.data.zero_()  # The change of encoder_temp would affect the model
        #     for p in encoder_temp.pooler.parameters():
        #         p.requires_grad = True

        if args.reinit_layers > 0:
            import torch.nn as nn
            from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention, FFN
            # model_distilbert = getattr(model, "distilbert")  # model.distilbert; change of model_distilbert affects model!
            # Reinitialization for the last few layers
            for layer in model.distilbert.transformer.layer[-args.reinit_layers:]:
                for module in layer.modules():
                    # print(module)
                    model.distilbert._init_weights(
                        module)  # It's the line equivalent to below approach
                    # if isinstance(module, nn.modules.linear.Linear):  # Original form for nn.Linear
                    #     # model.config.initializer_range == model.distilbert.config.initializer_range => True
                    #     module.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #     if module.bias is not None:
                    #         module.bias.data.zero_()
                    # elif isinstance(module, nn.modules.normalization.LayerNorm):
                    #     module.weight.data.fill_(1.0)
                    #     module.bias.data.zero_()
                    # elif isinstance(module, FFN):
                    #     for param in [module.lin1, module.lin2]:
                    #         param.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #         if param.bias is not None:
                    #             param.bias.data.zero_()
                    # elif isinstance(module, MultiHeadSelfAttention):
                    #     for param in [module.q_lin, module.k_lin, module.v_lin, module.out_lin]:
                    #         param.data.weight.normal_(mean=0.0, std=model.distilbert.config.initializer_range)
                    #         if param.bias is not None:
                    #             param.bias.data.zero_()

        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(args.device)

        trainer = Trainer(args, log)

        #### Change Made By Xuran Wang: Add custom lines, comment out original line #######

        # train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')

        train_dataset, _ = get_dataset_eda_revised(args, args.train_datasets,
                                                   args.train_dir, tokenizer,
                                                   'train', train_fraction)

        #### Change End #######

        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
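Earlier in this example, the args.reinit_layers branch re-initializes the last few DistilBERT transformer blocks before fine-tuning. Isolated as a standalone sketch (the layer count here is an illustrative choice, not a value from the original script):

from transformers import DistilBertForQuestionAnswering

model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
reinit_layers = 2  # illustrative; the script reads this from args.reinit_layers

# re-initialize every module in the last `reinit_layers` transformer blocks in place,
# reusing the model's own weight-init routine
for layer in model.distilbert.transformer.layer[-reinit_layers:]:
    for module in layer.modules():
        model.distilbert._init_weights(module)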