コード例 #1
0
# Run hyper-parameters read from an INI-style config object.  config.get
# always returns strings; only NEST is converted to 0/1 via strtobool.
batch = config.get('main', 'BATCH_SIZE_TRAIN')
lr_str = config.get('main', 'LEARNING_RATE')
attention_mask_is = config.get('CNNs', 'ATTENTION_MASK_IS')
TARGET_only_LARGE_NEST_flag = strtobool(config.get('main', 'NEST'))

# Log directories encode every run setting into the path so that outputs
# from different configurations never collide.
# NOTE(review): network_structure, weight and dataname are assumed to be
# defined earlier in this file (not visible here) — confirm.
writer_log_dir = '../../data/TensorboardGraph/span_NER_RE/correcteval_answerchanged2-4_TARGET_only_LARGE_NEST_flag_is_{5}/batch_size_{0}/learning_rate_{1}/network_{2}_{3}/0_logit_weight_{4}'.format(
    batch, lr_str, network_structure, attention_mask_is, weight,
    str(TARGET_only_LARGE_NEST_flag))
# Mirror directory where predictions are written for inspection in brat.
brat_log_dir = '../../brat/brat-v1.3_Crunchy_Frog/data/model_preds/span_NER_RE/correcteval_answerchanged2-4_TARGET_only_LARGE_NEST_flag_is_{5}/batch_size_{0}/learning_rate_{1}/network_{2}_{3}/0_logit_weight_{4}'.format(
    batch, lr_str, network_structure, attention_mask_is, weight,
    str(TARGET_only_LARGE_NEST_flag))

# Debug log directory; appears unused in the visible part of the file.
hoge_dir = '../../data/TensorboardGraph/span_NER_RE/LARGE_NEST_{}_debug'.format(
    str(TARGET_only_LARGE_NEST_flag))

writer = tb.SummaryWriter(logdir=writer_log_dir)

# Fix RNG seeds for reproducibility (CPU-side only; CUDA kernels may
# still be nondeterministic).
np.random.seed(1)
torch.manual_seed(1)
# pdb.set_trace()

print('\nCreate Environment...\n')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the preprocessed corpus from a shelve database, keyed by dataname.
print('\nCreate data...')
database = shelve.open(config.get('path', 'SHELVE_PATH'))
# pdb.set_trace()
vocab, REL_DIC, corpus, filename_lst = database[dataname]
database.close()

# Layout of one corpus record, for reference:
# (doc[0], indx_tokens, output_film_size1, output_film_size2, output_film_size3, output_film_size4, attention_mask, spmed, (n,doc,Entdic, Reldic))
コード例 #2
0
                              shape=tuple(test_shape))

# Siamese Davis datasets/loaders: the training split is shuffled, the test
# split is not.  drop_last=True keeps every batch at exactly
# args.batch_size pairs (some models require a fixed batch dimension).
train_davis_dataset = SimaseDavis(train_davis_json, train_davis_memmap)
train_davis_dataloader = torch.utils.data.DataLoader(
    train_davis_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=True)

test_davis_dataset = SimaseDavis(test_davis_json, test_davis_memmap)
test_davis_dataloader = torch.utils.data.DataLoader(test_davis_dataset,
                                                    batch_size=args.batch_size,
                                                    shuffle=False,
                                                    drop_last=True)

# No log_dir given, so events go to the default ./runs/<timestamp> dir.
tensorboard_writer = tensorboardX.SummaryWriter()

# Destination for the trained encoder weights.
model_save_path = './encoder_dense_1.pth'
# The bare string below is disabled debug code that displayed image pairs.
'''
#showing pairs
itf = next(iter(davis_dataloader))
img1, img2, target, class_1, class_2 = itf
img1 = img1.squeeze(dim=1)
img2 = img2.squeeze(dim=1)

for k in range(4):
  i = img1[k].numpy().transpose(2,1,0)
  cv2_imshow(i)
  j = img2[k].numpy().transpose(2,1,0)
  cv2_imshow(j)
'''
コード例 #3
0
File: densetrain3m.py  Project: cici-ai-club/3M
def train(opt):
    """Main training loop for the captioning model described by ``opt``.

    Builds the data loader and model, optionally restores state from
    ``opt.start_from``, then iterates over training batches until
    ``opt.max_epochs`` is reached (or forever when it is -1).  Training
    statistics are logged to TensorBoard every ``opt.losses_log_every``
    iterations; the model is evaluated on the validation split and
    checkpointed every ``opt.save_checkpoint_every`` iterations.  On
    RuntimeError/KeyboardInterrupt a final checkpoint is written before
    the traceback is printed.
    """
    # Deal with feature things before anything
    opt.use_fc, opt.use_att = utils.if_use_feat(opt.caption_model)
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    # tb may be None if tensorboard is unavailable; `and` keeps it None then.
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')):
            with open(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl'), 'rb') as f:
                histories = utils.pickle_load(f)
    else:
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['iterators'] = loader.iterators
        infos['split_ix'] = loader.split_ix
        infos['vocab'] = loader.get_vocab()
        infos['pix_perss'] = loader.get_personality()
    infos['opt'] = opt
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    print("current epoch: ", epoch)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # BUGFIX: best_val_score was only bound when load_best_score == 1 and
    # best_val_score_vse was never bound at all, so the comparisons in the
    # checkpoint section below raised NameError on other paths.  Default
    # both to what was saved in infos (or None).
    best_val_score = None
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    best_val_score_vse = infos.get('best_val_score_vse', None)

    opt.vocab = loader.get_vocab()
    opt.xpersonality = loader.get_personality()
    if opt.use_joint == 0:
        model = models.setup(opt).cuda()
    elif opt.use_joint == 1:
        model = models.JointModel(opt)
        model.cuda()
    # The vocab is bulky and is stored separately in infos; drop it from
    # opt before opt itself gets pickled at checkpoint time.
    del opt.vocab
    if opt.start_from is not None:
        opt.model = os.path.join(opt.start_from, 'model'+'.pth')
        model.load_state_dict(torch.load(opt.model))
    dp_model = torch.nn.DataParallel(model)
    lw_model = LossWrapper(model, opt)
    dp_lw_model = torch.nn.DataParallel(lw_model)
    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()
    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer([p for p in model.parameters() if p.requires_grad], opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer([p for p in model.parameters() if p.requires_grad], opt)
    # Load the optimizer state when resuming, if one was saved.
    if vars(opt).get('start_from', None) is not None and os.path.isfile(os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
    else:
        print('Optimizer param group number not matched? There must be new parameters. Reinit the optimizer.')

    def save_checkpoint(model, infos, optimizer, histories=None, append=''):
        """Write model/optimizer weights and infos (plus histories, when
        given) into opt.checkpoint_path, suffixing file names with append."""
        if len(append) > 0:
            append = '-' + append
        # if checkpoint_path doesn't exist
        if not os.path.isdir(opt.checkpoint_path):
            os.makedirs(opt.checkpoint_path)
        checkpoint_path = os.path.join(opt.checkpoint_path, 'model%s.pth' % (append))
        torch.save(model.state_dict(), checkpoint_path)
        print("model saved to {}".format(checkpoint_path))
        optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer%s.pth' % (append))
        torch.save(optimizer.state_dict(), optimizer_path)
        with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'%s.pkl' % (append)), 'wb') as f:
            utils.pickle_dump(infos, f)
        if histories:
            with open(os.path.join(opt.checkpoint_path, 'histories_'+opt.id+'%s.pkl' % (append)), 'wb') as f:
                utils.pickle_dump(histories, f)

    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate (step decay after the
                    # configured start epoch).
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate ** frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac, opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob

                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False
                # Assign retrieval loss weight (same step-decay scheme as the lr)
                if epoch > opt.retrieval_reward_weight_decay_start and opt.retrieval_reward_weight_decay_start >= 0:
                    frac = (epoch - opt.retrieval_reward_weight_decay_start) // opt.retrieval_reward_weight_decay_every
                    model.retrieval_reward_weight = opt.retrieval_reward_weight * (opt.retrieval_reward_weight_decay_rate ** frac)
                epoch_done = False

            start = time.time()
            # Load data from train split (0)
            data = loader.get_batch('train')
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()
            # NOTE(review): anomaly detection slows training noticeably;
            # presumably left on for debugging — confirm it is intended.
            with torch.autograd.set_detect_anomaly(True):
                tmp = [data['fc_feats'], data['att_feats'], data['densecap'], data['labels'], data['masks'], data['att_masks'], data['personality']]
                tmp = [_ if _ is None else _.cuda() for _ in tmp]
                fc_feats, att_feats, densecap, labels, masks, att_masks, personality = tmp
                optimizer.zero_grad()
                model_out = dp_lw_model(fc_feats, att_feats, densecap, labels, masks, att_masks, personality, data['gts'], torch.arange(0, len(data['gts'])), sc_flag)

                loss = model_out['loss'].mean()

                loss.backward()
                utils.clip_gradient(optimizer, opt.grad_clip)
                optimizer.step()
                train_loss = loss.item()
                torch.cuda.synchronize()
                end = time.time()
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f},train_loss = {:.3f}" \
                    .format(iteration, epoch, model_out['reward'].mean(), end - start, train_loss))

            if opt.use_joint == 1:
                # BUGFIX: prt_str was used without being initialized, which
                # raised UnboundLocalError on the first joint-model batch.
                prt_str = ""
                for k, v in model.loss().items():
                    prt_str += "{} = {:.3f} ".format(k, v)
                print(prt_str)

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
                add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
                if sc_flag:
                    add_summary_value(tb_summary_writer, 'avg_reward', model_out['reward'].mean(), iteration)

                loss_history[iteration] = train_loss if not sc_flag else model_out['reward'].mean()
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val',
                                'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    dp_model, lw_model.crit, loader, eval_kwargs)

                if opt.reduce_on_plateau:
                    # Step the plateau scheduler on CIDEr when available,
                    # otherwise on the validation loss.
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        add_summary_value(tb_summary_writer, k, v, iteration)
                val_result_history[iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

                # Save model if is improving on validation result.
                # Score used for model selection: language metric when
                # language_eval is on, otherwise negative validation loss.
                if opt.language_eval == 1:
                    if opt.use_joint == 1:
                        current_score = lang_stats['SPICE'] * 100
                    elif opt.use_joint == 0:
                        current_score = lang_stats['CIDEr']  # could use SPICE
                else:
                    if opt.use_joint == 0:
                        current_score = - val_loss
                    elif opt.use_joint == 1:
                        current_score = - val_loss['loss_cap']
                if opt.use_joint == 1:
                    current_score_vse = val_loss.get(opt.vse_eval_criterion, 0) * 100

                best_flag = False
                best_flag_vse = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                if opt.use_joint == 1:
                    if best_val_score_vse is None or current_score_vse > best_val_score_vse:
                        best_val_score_vse = current_score_vse
                        best_flag_vse = True
                    infos['best_val_score_vse'] = best_val_score_vse
                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history

                save_checkpoint(model, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    save_checkpoint(model, infos, optimizer, append=str(iteration))

                if best_flag:
                    save_checkpoint(model, infos, optimizer, append='best')
                if best_flag_vse:
                    save_checkpoint(model, infos, optimizer, append='vse-best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        # Best-effort save before surfacing the error/interrupt.
        print('Save ckpt on exception ...')
        save_checkpoint(model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
コード例 #4
0
# Tokenize the validation file, re-using the training word_dict so ids
# stay consistent between splits.
dataset_test,  word_dict = tokenize(os.path.join(args.data_dir, 'valid.txt'), \
        train=False, word_dict=word_dict, char_level=args.character_level)

# fetch one minibatch of data (shuffle=False makes the batch deterministic)
train_batch = next(minibatch_generator(dataset_train, args, shuffle=False))
test_batch  = next(minibatch_generator(dataset_test,  args, shuffle=False))

# load model that will be evaluated
gen, loaded_epoch = load_model_from_file(args.model_path, epoch=args.model_epoch)
gen.args.alpha_test = args.alpha_test
gen.eval()
print('switching the temperature to {}'.format(gen.args.alpha_test))

# args.model_path = 'our_GAN_stats_for_real_this_time'
# Logging: one TensorBoard dir per evaluation temperature (alpha_test).
writer = tensorboardX.SummaryWriter(log_dir=os.path.join(args.model_path, \
        'A_TB_alpha{}'.format(gen.args.alpha_test)))
# Counter for the number of summary writes performed so far.
writes = 0

if args.cuda:
    gen  = gen.cuda()

def save_samples_for_bleu(gen, input, word_dict, epoch, sample_size=10000):
    maybe_create_dir(os.path.join(args.model_path, 'samples'))
    file_name = os.path.join(args.model_path, 'samples/gen_for_bleu_{}_{:.4f}.txt'.format(epoch, gen.args.alpha_test))
    print('saving in {}'.format(file_name))
    with torch.no_grad():
        with open(file_name, 'w') as f:
            tot_sent=0
            while tot_sent < sample_size:
                _, fake_sentences = gen(input[:, [0]])
                sentences = id_to_words(fake_sentences.cpu().data.numpy(), word_dict)
コード例 #5
0
    sys.exit("Only support MUNIT|UNIT")

trainer.cuda()
train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders(
    config)
train_display_images_a = torch.stack(
    [train_loader_a.dataset[i]['data'] for i in range(display_size)]).cuda()
train_display_images_b = torch.stack(
    [train_loader_b.dataset[i]['data'] for i in range(display_size)]).cuda()
test_display_images_a = torch.stack(
    [test_loader_a.dataset[i]['data'] for i in range(display_size)]).cuda()
test_display_images_b = torch.stack(
    [test_loader_b.dataset[i]['data'] for i in range(display_size)]).cuda()

model_name = os.path.splitext(os.path.basename(opts.config))[0]
train_writer = tensorboardX.SummaryWriter(
    os.path.join(opts.output_path + "/logs", model_name))
output_directory = os.path.join(opts.output_path + "/outputs", model_name)
checkpoint_directory, image_directory = prepare_sub_folder(output_directory)
shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml'))

# training
iterations = trainer.resume(checkpoint_directory,
                            hyperparameters=config) if opts.resume else 0
while True:
    for it, (images_a,
             images_b) in enumerate(zip(train_loader_a, train_loader_b)):
        trainer.update_learning_rate()
        images_a, images_b = images_a['data'].cuda().detach(
        ), images_b['data'].cuda().detach()

        with Timer("Elapsed time in update: %f"):
コード例 #6
0
    def train(self):
        transform = transforms.Compose([
            transforms.Resize(self.config.data.image_size),
            transforms.ToTensor()
        ])

        if self.config.data.dataset == 'CIFAR10':
            dataset = CIFAR10(os.path.join(self.args.run, 'datasets',
                                           'cifar10'),
                              train=True,
                              download=True,
                              transform=transform)
            test_dataset = CIFAR10(os.path.join(self.args.run, 'datasets',
                                                'cifar10'),
                                   train=False,
                                   download=True,
                                   transform=transform)
            num_items = len(dataset)
            indices = list(range(num_items))
            random_state = np.random.get_state()
            np.random.seed(2020)
            np.random.shuffle(indices)
            np.random.set_state(random_state)
            train_indices, val_indices = indices[:int(
                num_items * 0.9)], indices[int(num_items * 0.9):]
            val_dataset = Subset(dataset, val_indices)
            dataset = Subset(dataset, train_indices)
        elif self.config.data.dataset == 'MNIST':
            dataset = MNIST(os.path.join(self.args.run, 'datasets', 'mnist'),
                            train=True,
                            download=True,
                            transform=transform)
            num_items = len(dataset)
            indices = list(range(num_items))
            random_state = np.random.get_state()
            np.random.seed(2020)
            np.random.shuffle(indices)
            np.random.set_state(random_state)
            train_indices, val_indices = indices[:int(
                num_items * 0.9)], indices[int(num_items * 0.9):]
            val_dataset = Subset(dataset, val_indices)
            dataset = Subset(dataset, train_indices)
            test_dataset = MNIST(os.path.join(self.args.run, 'datasets',
                                              'mnist'),
                                 train=False,
                                 download=True,
                                 transform=transform)

        dataloader = DataLoader(dataset,
                                batch_size=self.config.training.batch_size,
                                shuffle=True,
                                num_workers=2)
        val_loader = DataLoader(val_dataset,
                                batch_size=self.config.training.batch_size,
                                shuffle=True,
                                num_workers=2)
        test_loader = DataLoader(test_dataset,
                                 batch_size=self.config.training.batch_size,
                                 shuffle=True,
                                 num_workers=2)

        val_iter = iter(val_loader)
        self.config.input_dim = self.config.data.image_size**2 * self.config.data.channels

        tb_path = os.path.join(self.args.run, 'tensorboard', self.args.doc)
        if os.path.exists(tb_path):
            shutil.rmtree(tb_path)
        model_path = os.path.join(self.args.run, 'results', self.args.doc)
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
        os.makedirs(model_path)

        ## save txt files
        txtfiles = os.path.join('txtresults', self.args.doc)
        if not os.path.exists(txtfiles):
            os.makedirs(txtfiles)

        tb_logger = tensorboardX.SummaryWriter(log_dir=tb_path)

        flow = NICE(self.config.input_dim, self.config.model.hidden_size,
                    self.config.model.num_layers).to(self.config.device)

        optimizer = self.get_optimizer(flow.parameters())

        # Set up test data
        noise_sigma = self.config.data.noise_sigma
        step = 1

        def energy_net(inputs):
            energy, _ = flow(inputs, inv=False)
            return -energy

        def grad_net_kingma(inputs):
            energy, _ = flow(inputs, inv=False)
            grad1, grad2 = flow.grads_backward(inv=False)
            return -grad1, -grad2

        def grad_net_UT(inputs):
            energy, _ = flow(inputs, inv=False)
            grad1, T, U = flow.grads_backward_TU(inv=False)
            grad2 = T * U / 2.
            return -grad1, -grad2

        def grad_net_S(inputs):
            energy, _ = flow(inputs, inv=False)
            grad1, S_r, S_i = flow.grads_backward_S(inv=False)
            grad2 = (S_r**2 - S_i**2)
            return -grad1, -grad2

        def sample_net(z):
            samples, _ = flow(z, inv=True)
            samples, _ = Logit()(samples, mode='inverse')
            return samples

        # Use this to select the sigma for DSM losses
        # if self.config.training.algo == 'dsm':
        #     sigma = self.args.dsm_sigma
        # if noise_sigma is None:
        #     sigma = select_sigma(iter(dataloader), iter(val_loader))
        # else:
        #     sigma = select_sigma(iter(dataloader), iter(val_loader), noise_sigma=noise_sigma)

        if self.args.load_path != "":
            flow.load_state_dict(torch.load(self.args.load_path))

        best_model = {"val": None, "ll": None, "esm": None}
        best_val_loss = {"val": 1e+10, "ll": -1e+10, "esm": 1e+10}
        best_val_iter = {"val": 0, "ll": 0, "esm": 0}

        time_record = []
        time_culm_record = []
        val_logp_record = []
        val_sm_record = []
        for _ in range(self.config.training.n_epochs):
            for _, (X, y) in enumerate(dataloader):
                noises = torch.zeros_like(X)
                X = X + (noises.uniform_(0, 1) - 0.5) / 256.
                flattened_X = X.type(torch.float32).to(
                    self.config.device).view(X.shape[0], -1)
                flattened_X.clamp_(1e-3, 1 - 1e-3)
                flattened_X, _ = Logit()(flattened_X, mode='direct')

                if noise_sigma is not None:
                    flattened_X += torch.randn_like(flattened_X) * noise_sigma

                flattened_X.requires_grad_(True)

                logp = -energy_net(flattened_X)
                logp = logp.mean()

                if self.config.training.algo == 'kingma':
                    t = time.time()
                    loss = approx_backprop_score_matching(
                        grad_net_kingma, flattened_X)
                if self.config.training.algo == 'UT':
                    t = time.time()
                    loss = approx_backprop_score_matching(
                        grad_net_UT, flattened_X)
                if self.config.training.algo == 'S':
                    t = time.time()
                    loss = approx_backprop_score_matching(
                        grad_net_S, flattened_X)
                elif self.config.training.algo == 'mle':
                    t = time.time()
                    loss = energy_net(flattened_X)
                    loss = loss.mean()
                elif self.config.training.algo == 'ssm':
                    t = time.time()
                    loss, *_ = single_sliced_score_matching(
                        energy_net,
                        flattened_X,
                        noise_type=self.config.training.noise_type)
                elif self.config.training.algo == 'ssm_vr':
                    t = time.time()
                    loss, *_ = sliced_VR_score_matching(
                        energy_net,
                        flattened_X,
                        noise_type=self.config.training.noise_type)
                elif self.config.training.algo == 'dsm':
                    t = time.time()
                    loss = dsm(energy_net,
                               flattened_X,
                               sigma=self.args.dsm_sigma)
                elif self.config.training.algo == 'dsm_tracetrick':
                    t = time.time()
                    loss = dsm_tracetrick(energy_net,
                                          flattened_X,
                                          sigma=self.args.dsm_sigma)
                elif self.config.training.algo == 'dsm_tracetrick_FD':
                    t = time.time()
                    loss = dsm_tracetrick_FD(energy_net,
                                             flattened_X,
                                             sigma=self.args.dsm_sigma)
                elif self.config.training.algo == "exact":
                    t = time.time()
                    loss = exact_score_matching(energy_net,
                                                flattened_X,
                                                train=True).mean()
                elif self.config.training.algo == 'efficient_sm':
                    t = time.time()
                    loss = single_efficient_score_matching(
                        energy_net,
                        flattened_X,
                        eps=self.args.ESM_eps,
                        noise_type=self.config.training.noise_type)
                elif self.config.training.algo == 'efficient_sm_conjugate':
                    t = time.time()
                    loss = efficient_score_matching_conjugate(
                        energy_net,
                        flattened_X,
                        eps=self.args.ESM_eps,
                        noise_type=self.config.training.noise_type)
                elif self.config.training.algo == 'MLE_efficient_sm_conjugate':
                    t = time.time()
                    loss = MLE_efficient_score_matching_conjugate(
                        energy_net,
                        flattened_X,
                        eps=self.args.ESM_eps,
                        mle_ratio=self.args.MLE_ratio,
                        noise_type=self.config.training.noise_type)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                t = time.time() - t
                time_record.append(t)

                if step % 100 == 0:
                    try:
                        val_X, _ = next(val_iter)
                    except:
                        val_iter = iter(val_loader)
                        val_X, _ = next(val_iter)
                    noises = torch.zeros_like(val_X)
                    val_X = val_X + (noises.uniform_(0, 1) - 0.5) / 256.
                    val_X = val_X.type(torch.float32).to(self.config.device)
                    val_X.clamp_(1e-3, 1 - 1e-3)
                    val_X, _ = Logit()(val_X, mode='direct')
                    val_X = val_X.view(val_X.shape[0], -1)
                    if noise_sigma is not None:
                        val_X += torch.randn_like(val_X) * noise_sigma

                    val_logp = -energy_net(val_X)
                    val_logp = val_logp.mean()
                    if self.config.training.algo == 'kingma':
                        val_loss = approx_backprop_score_matching(
                            grad_net_kingma, val_X)
                    if self.config.training.algo == 'UT':
                        val_loss = approx_backprop_score_matching(
                            grad_net_UT, val_X)
                    if self.config.training.algo == 'S':
                        val_loss = approx_backprop_score_matching(
                            grad_net_S, val_X)
                    elif self.config.training.algo == 'ssm':
                        val_loss, *_ = single_sliced_score_matching(
                            energy_net,
                            val_X,
                            noise_type=self.config.training.noise_type)
                    elif self.config.training.algo == 'ssm_vr':
                        val_loss, *_ = sliced_VR_score_matching(
                            energy_net,
                            val_X,
                            noise_type=self.config.training.noise_type)
                    elif self.config.training.algo == 'dsm':
                        val_loss = dsm(energy_net,
                                       val_X,
                                       sigma=self.args.dsm_sigma)
                    elif self.config.training.algo == 'dsm_tracetrick':
                        val_loss = dsm_tracetrick(energy_net,
                                                  val_X,
                                                  sigma=self.args.dsm_sigma)
                    elif self.config.training.algo == 'dsm_tracetrick_FD':
                        val_loss = dsm_tracetrick_FD(energy_net,
                                                     val_X,
                                                     sigma=self.args.dsm_sigma)
                    elif self.config.training.algo == 'mle':
                        val_loss = -val_logp
                    elif self.config.training.algo == "exact":
                        val_loss = exact_score_matching(energy_net,
                                                        val_X,
                                                        train=False).mean()
                    elif self.config.training.algo == 'efficient_sm':
                        val_loss = single_efficient_score_matching(
                            energy_net, val_X, eps=self.args.ESM_eps)
                    elif self.config.training.algo == 'efficient_sm_conjugate':
                        val_loss = efficient_score_matching_conjugate(
                            energy_net, val_X, eps=self.args.ESM_eps)
                    elif self.config.training.algo == 'MLE_efficient_sm_conjugate':
                        val_loss = MLE_efficient_score_matching_conjugate(
                            energy_net,
                            val_X,
                            eps=self.args.ESM_eps,
                            mle_ratio=self.args.MLE_ratio)

                    logging.info(
                        "logp: {:.3f}, val_logp: {:.3f}, loss: {:.3f}, val_loss: {:.3f}, time per step: {:.3f} +- {:.3f} ms"
                        .format(logp.item(), val_logp.item(), loss.item(),
                                val_loss.item(),
                                np.mean(time_record) * 1e3,
                                np.std(time_record) * 1e3))
                    tb_logger.add_scalar('logp', logp, global_step=step)
                    tb_logger.add_scalar('loss', loss, global_step=step)
                    tb_logger.add_scalar('val_logp',
                                         val_logp,
                                         global_step=step)
                    tb_logger.add_scalar('val_loss',
                                         val_loss,
                                         global_step=step)

                    # save records in txt
                    val_logp_record.append(val_logp.item())
                    time_culm = sum(time_record)
                    time_culm_record.append(time_culm)
                    np.savetxt(txtfiles + '/val_logp_record.txt',
                               np.array(val_logp_record))
                    np.savetxt(txtfiles + '/time_culm_record.txt',
                               np.array(time_culm_record))

                    if val_loss < best_val_loss['val']:
                        best_val_loss['val'] = val_loss
                        best_val_iter['val'] = step
                        best_model['val'] = copy.deepcopy(flow.state_dict())
                    if val_logp > best_val_loss['ll']:
                        best_val_loss['ll'] = val_logp
                        best_val_iter['ll'] = step
                        best_model['ll'] = copy.deepcopy(flow.state_dict())

                if step % 100 == 0:
                    with torch.no_grad():
                        z = torch.normal(
                            torch.zeros(100,
                                        flattened_X.shape[1],
                                        device=self.config.device))
                        samples = sample_net(z)
                        samples = samples.view(100, self.config.data.channels,
                                               self.config.data.image_size,
                                               self.config.data.image_size)
                        samples = torch.clamp(samples, 0.0, 1.0)
                        image_grid = make_grid(samples, 10)
                        tb_logger.add_image('samples',
                                            image_grid,
                                            global_step=step)
                        data = X
                        data_grid = make_grid(data[:100], 10)
                        tb_logger.add_image('data',
                                            data_grid,
                                            global_step=step)

                    logging.info("Computing exact score matching....")
                    try:
                        val_X, _ = next(val_iter)
                    except:
                        val_iter = iter(val_loader)
                        val_X, _ = next(val_iter)

                    noises = torch.zeros_like(val_X)
                    val_X = val_X + (noises.uniform_(0, 1) - 0.5) / 256.
                    val_X = val_X.type(torch.float32).to(self.config.device)
                    val_X.clamp_(1e-3, 1 - 1e-3)
                    val_X, _ = Logit()(val_X, mode='direct')
                    val_X = val_X.view(val_X.shape[0], -1)
                    if noise_sigma is not None:
                        val_X += torch.randn_like(val_X) * noise_sigma

                    sm_loss = exact_score_matching(energy_net,
                                                   val_X,
                                                   train=False).mean()
                    if sm_loss < best_val_loss['esm']:
                        best_val_loss['esm'] = sm_loss
                        best_val_iter['esm'] = step
                        best_model['esm'] = copy.deepcopy(flow.state_dict())

                    logging.info(
                        'step: {}, exact score matching loss: {}'.format(
                            step, sm_loss.item()))
                    tb_logger.add_scalar('exact_score_matching_loss',
                                         sm_loss,
                                         global_step=step)

                    # save records in txt
                    val_sm_record.append(sm_loss.item())
                    np.savetxt(txtfiles + '/val_smloss_record.txt',
                               np.array(val_sm_record))

                if step % 500 == 0:
                    torch.save(flow.state_dict(),
                               os.path.join(model_path, 'nice.pth'))

                step += 1

        self.results = {}
        self.evaluate_model(flow.state_dict(), "final", val_loader,
                            test_loader, model_path)
        self.evaluate_model(best_model['val'], "best_on_val", val_loader,
                            test_loader, model_path)
        self.evaluate_model(best_model['ll'], "best_on_ll", val_loader,
                            test_loader, model_path)
        self.evaluate_model(best_model['esm'], "best_on_esm", val_loader,
                            test_loader, model_path)
        self.results['final']['num_iters'] = step
        self.results['best_on_val']['num_iters'] = best_val_iter['val']
        self.results['best_on_ll']['num_iters'] = best_val_iter['ll']
        self.results['best_on_esm']['num_iters'] = best_val_iter['esm']

        pickle_out = open(model_path + "/results.pkl", "wb")
        pickle.dump(self.results, pickle_out)
        pickle_out.close()
コード例 #7
0
def _main():
    """Adversarially train one LevelUpFeaturesGenerator block (feature level
    ``args.train_block_input`` -> level-1) against a FeaturesDiscriminator,
    using a frozen pre-trained classifier as the feature extractor.

    Reads configuration from the module-level ``args`` namespace; writes
    checkpoints, sample grids and tensorboard summaries under
    ``features_creation_models/<model_name>/``.
    """
    print_gpu_details()
    device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
    train_root = args.train_path

    # Images are resized then center-cropped to a fixed 256x256.
    image_size = 256
    cropped_image_size = 256
    print("set image folder")
    train_set = dset.ImageFolder(root=train_root,
                                 transform=transforms.Compose([
                                     transforms.Resize(image_size),
                                     transforms.CenterCrop(cropped_image_size),
                                     transforms.ToTensor()
                                 ]))

    # Normalisation applied only to the classifier's input copy of the batch
    # (values look like the standard ImageNet mean/std — TODO confirm).
    normalizer_clf = transforms.Compose([
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    print('set data loader')
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

    # Frozen classifier used as feature extractor (never trained here).
    classifier = torch.load(args.classifier_path)
    classifier.eval()
    classifier.to(device)
    # Pre-trained generator that maps level-1 features back to images; used
    # only for sampling/visualisation below.
    features1_to_image_gen = Features1ToImage()
    features1_to_image_gen.load_state_dict(torch.load(os.path.join(args.features_gens_dir_path, 'features1_to_image')))
    features1_to_image_gen.eval()
    features1_to_image_gen.to(device)
    # One generator per feature level 2..4; generators below the block being
    # trained are loaded frozen, the trained block gets fresh weights.
    features_generators = [LevelUpFeaturesGenerator(input_level_features=i) for i in range(2, 5)]
    for i, features_gen in enumerate(features_generators):
        input_level_features = i + 2
        features_gen.to(device)
        # weights init
        if input_level_features < args.train_block_input:
            features_gen.load_state_dict(torch.load(os.path.join(args.features_gens_dir_path, 'features{}_to_features{}'.format(input_level_features, input_level_features - 1))))
            features_gen.eval()
        else:
            features_gen.init_weights()

    # Discriminator judges features one level below the trained block's input.
    discriminator = FeaturesDiscriminator(args.discriminator_norm, dis_type=args.gen_type, dis_level=args.train_block_input - 1)
    discriminator.to(device)
    discriminator.init_weights()

    # losses + optimizers (WGAN-style criteria; Adam betas 0.5/0.999 as is
    # customary for GAN training)
    criterion_discriminator, criterion_generator = get_wgan_losses_fn()
    next_level_features_criterion = nn.L1Loss()
    criterion_features = nn.L1Loss()
    gen_optimizer = optim.Adam(features_generators[args.train_block_input - 2].parameters(), lr=args.lr, betas=(0.5, 0.999))
    discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=args.lr, betas=(0.5, 0.999))

    num_of_epochs = args.epochs

    starting_time = time.time()
    iterations = 0
    # Output layout: <outputs>/temp_results for sample grids,
    # <outputs>/models_checkpoint for weights, <outputs>/summaries for TB.
    outputs_dir = os.path.join('features_creation_models', args.model_name)
    if not os.path.isdir(outputs_dir):
        os.makedirs(outputs_dir, exist_ok=True)
    temp_results_dir = os.path.join(outputs_dir, 'temp_results')
    if not os.path.isdir(temp_results_dir):
        os.mkdir(temp_results_dir)
    models_dir = os.path.join(outputs_dir, 'models_checkpoint')
    if not os.path.isdir(models_dir):
        os.mkdir(models_dir)
    writer = tensorboardX.SummaryWriter(os.path.join(outputs_dir, 'summaries'))
    # Placeholder; replaced on the first iteration with a cloned feature list
    # so every later sampling call uses the same fixed batch.
    fixed_features = 0
    first_iter = True
    print("Starting Training Loop...")
    features_to_train = args.train_block_input
    for epoch in range(num_of_epochs):
        for data in train_loader:
            iterations += 1
            if iterations % 30 == 1:
                print('epoch:', epoch, ', iter', iterations, 'start, time =', time.time() - starting_time, 'seconds')
                starting_time = time.time()
            images, _ = data
            images = images.to(device)  # change to gpu tensor
            images_clf = normalizer_clf(images)
            _, features = classifier(images_clf)
            features = list(features)
            if first_iter:
                first_iter = False
                # Freeze one batch of real features for consistent sampling.
                fixed_features = [torch.clone(features[x]) for x in range(len(features))]
                grid = vutils.make_grid(images, padding=2, normalize=False, nrow=8)
                vutils.save_image(grid, os.path.join(temp_results_dir, 'original_images.jpg'))

            # Train D for `discriminator_steps` iterations, then G for one.
            if iterations % (args.discriminator_steps + 1) != 1:
                discriminator_loss_dict = train_discriminator(features_generators[features_to_train - 2], discriminator, criterion_discriminator, discriminator_optimizer, features, features_to_train)
                for k, v in discriminator_loss_dict.items():
                    writer.add_scalar('D/%s' % k, v.data.cpu().numpy(), global_step=iterations)
                    if iterations % 30 == 1:
                        print('{}: {:.6f}'.format(k, v))
            else:
                generator_loss_dict = train_generator(features_generators[features_to_train - 2], discriminator, classifier, gen_optimizer, features,
                                                      features_to_train, criterion_generator, criterion_features, next_level_features_criterion)
                for k, v in generator_loss_dict.items():
                    # NOTE(review): `iterations//1` is a no-op — the step is
                    # effectively iterations + 1; confirm intent.
                    writer.add_scalar('G/f' + str(features_to_train) + '_%s' % k, v.data.cpu().numpy(), global_step=iterations//1 + 1)
                    if iterations % 30 == 1:
                        print('{}: {:.6f}'.format(k, v))

            # `and` binds tighter than `or`: snapshot every 2000 iters while
            # iterations < 10000, every 4000 afterwards.
            if iterations < 10000 and iterations % 2000 == 1 or iterations % 4000 == 1:
                for i, features_gen in enumerate(features_generators):
                    features_level = i + 2
                    if features_level == features_to_train: # sample only from the layer being trained (modify this `if` to sample from all layers)
                        torch.save(features_gen.state_dict(),  models_dir + '/' + args.model_name + '_f{}_to_f{}'.format(features_level, features_level - 1))
                        # regular sampling (#batch_size different images)
                        fake_images = sample(features1_to_image_gen, features_generators, fixed_features, features_level)
                        grid = vutils.make_grid(fake_images, padding=2, normalize=True, nrow=8)
                        vutils.save_image(grid, os.path.join(temp_results_dir, 'res_iter_{}_origin_f{}.jpg'.format(iterations // 2000, features_level)))

            # Periodic numbered checkpoint of every generator level.
            if iterations % 20000 == 1:
                for i, features_gen in enumerate(features_generators):
                    features_level = i + 2
                    torch.save(features_gen.state_dict(), models_dir + '/' + args.model_name + '_f{}_to_f{}_'.format(features_level, features_level - 1) + str(iterations // 20000))
コード例 #8
0
# Per-environment results directory; model checkpoints live in a per-seed
# subdirectory so multiple seeds of the same env don't collide.
results_dir = os.path.join(args.results_dir, args.env)
model_dir = os.path.join(results_dir, 'models', 'seed' + str(args.seed))
# exist_ok=True tolerates the directory already existing (e.g. a concurrent
# run) without the bare `except: pass` that previously hid *every* error,
# including permission problems that would only surface later.
os.makedirs(model_dir, exist_ok=True)
output_performance_filename = os.path.join(results_dir, 'seed' + str(args.seed) + '.npz')

# Load loggers and Tensorboard writer

txt_logger = utils.get_txt_logger(model_dir)
csv_file, csv_logger = utils.get_csv_logger(model_dir)
tb_writer = tensorboardX.SummaryWriter(model_dir)

# Log command and all script arguments

txt_logger.info("{}\n".format(" ".join(sys.argv)))
txt_logger.info("{}\n".format(args))

# Set seed for all randomness sources

utils.seed(args.seed)

# Set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
txt_logger.info(f"Device: {device}\n")
コード例 #9
0
ファイル: trainer.py プロジェクト: jsherrah/SSD
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer,
             device, arguments, args):
    """Main SSD training loop.

    Iterates once over ``data_loader`` starting from the iteration stored in
    ``arguments["iteration"]`` (checkpoint resume), logging to console /
    tensorboard every ``args.log_step`` iterations, checkpointing every
    ``args.save_step``, and (on rank 0) running evaluation every
    ``args.eval_step``. Saves a final checkpoint and returns the model.
    """
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    model.train()
    # Only the distributed rank-0 process writes tensorboard summaries.
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX

        summary_writer = tensorboardX.SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    max_iter = len(data_loader)
    # Resume: enumerate offsets iteration numbering by the saved start point.
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration
        # NOTE(review): scheduler is stepped before optimizer.step(), the
        # pre-PyTorch-1.1 ordering — confirm against the torch version in use.
        scheduler.step()

        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())

        #print('loss dict = {}'.format(loss_dict))

        # Guard against NaN/Inf creeping into the input batch.
        assert torch.all(torch.isfinite(images.flatten()))

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            # ETA from the running global average of per-batch time.
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                              1024.0),
                ))
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss',
                                          losses_reduced,
                                          global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name),
                                              loss_item,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)

        # Skip eval on the very last iteration; "model_final" below covers it.
        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            if dist_util.get_rank() == 0 and summary_writer:
                eval_results = do_evaluation(cfg,
                                             model,
                                             distributed=args.distributed,
                                             iteration=iteration)
                print('Logging evaluation results...')
                for eval_result, dataset in zip(eval_results,
                                                cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset,
                                 summary_writer, iteration)
                    # For debugging (disabled; flip `if 0` to enable)
                    if 0:
                        print(
                            'writing backup accuracy to logger, eval_result = {}'
                            .format(eval_result))
                        summary_writer.add_scalar(
                            'accuracyBackup',
                            eval_result['metrics']['mAP'],
                            global_step=iteration)

                # important!!
                summary_writer.flush()

            model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
コード例 #10
0
                          shuffle=True,
                          batch_size=64,
                          transform=train_transform)
    testloader = CIFAR10(train=False,
                         shuffle=False,
                         batch_size=100,
                         transform=test_transform)

    model = get_model(args.model)

    optimizer = nn.SGD(parameters=model.parameters(),
                       lr=args.learning_rate,
                       momentum=0.9,
                       weight_decay=5e-4)

    summary_writer = tensorboardX.SummaryWriter(logdir=args.logdir)

    decay_lr_at = [int(args.epoch_num * i) for i in [0.25, 0.5, 0.75]]

    max_acc = 0.
    for epoch in range(args.epoch_num):
        if epoch in decay_lr_at:
            optimizer.lr *= 0.1
        train_loss, train_acc = train(epoch, model, trainloader, optimizer)
        test_loss, test_acc = test(epoch, model, testloader)

        summary_writer.add_scalar('Train Loss', train_loss, epoch)
        summary_writer.add_scalar('Train Acc', train_acc, epoch)
        summary_writer.add_scalar('Test Loss', test_loss, epoch)
        summary_writer.add_scalar('Test Acc', test_acc, epoch)
コード例 #11
0
ファイル: tee.py プロジェクト: yzhen-li/xnmt
 def set_out_file(self, out_file_name: str, exp_name: str) -> None:
   """Record the output file and experiment names and (re)create the
   tensorboard ``SummaryWriter`` rooted at *out_file_name*.

   Args:
     out_file_name: path used both as a record and as the TB log directory.
     exp_name: experiment name stored for later reference.
   """
   self.out_file_name = out_file_name
   self.exp_name = exp_name
   # The previous f"{out_file_name}" wrapper was a redundant f-string:
   # the parameter is already a str, so pass it through directly.
   self.writer = tensorboardX.SummaryWriter(log_dir=out_file_name)
コード例 #12
0
def main():
    """Entry point for discrepancy-based domain-adaptation training.

    Builds source/target dataloaders, model, criteria and the two optimizers
    (generator and classifier heads), optionally resumes from a checkpoint,
    then runs the epoch loop: adversarial training, per-split evaluation,
    best-metric tracking and checkpointing. If ``configs.evaluate`` is set,
    dispatches to that evaluation function instead.
    """
    configs = prepare()
    if configs.evaluate is not None:
        configs.evaluate.fn(configs)
        return

    # Heavy imports deferred until we know we are actually training.
    import numpy as np
    import tensorboardX
    import torch
    import torch.backends.cudnn as cudnn
    from torch.utils.data import DataLoader
    from tqdm import tqdm

    ################################
    # Train / Eval Kernel Function #
    ################################

    def adjust_learning_rate(optimizer, epoch, args_lr):
        """Sets the learning rate to the initial LR decayed by half by every 5 or 10 epochs"""
        if epoch > 0:
            if epoch <= 30:
                lr = args_lr * (0.5**(epoch // 5))
            else:
                lr = args_lr * (0.5**(epoch // 10))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            # `writer` is the SummaryWriter bound in the training `with` block.
            writer.add_scalar('lr_dis', lr, epoch)

    # train kernel
    def train(model, source_loader, target_loader, criterion, optimizer_g,
              optimizer_cls, scheduler_g, scheduler_cls, current_step, writer,
              cons):
        """One epoch of adversarial training: supervised loss on source
        batches plus a negated discrepancy loss between the two classifier
        heads on target batches; the generator gets ``gen_num_train`` extra
        adversarial updates per step (MCD-style — TODO confirm)."""

        model.train()
        loss_total = 0
        loss_adv_total = 0
        data_total = 0

        # Endless iterators so source/target loaders of different lengths
        # can be consumed in lockstep.
        batch_iterator = zip(loop_iterable(source_loader),
                             loop_iterable(target_loader))

        for _ in trange(len(source_loader)):
            (inputs, targets), (inputs_t, _) = next(batch_iterator)

            if isinstance(inputs, dict):
                for k, v in inputs.items():
                    batch_size = v.size(0)
                    inputs[k] = v.to(configs.device, non_blocking=True)
            else:
                batch_size = inputs.size(0)
                inputs = inputs.to(configs.device, non_blocking=True)

            if isinstance(inputs_t, dict):
                for k, v in inputs_t.items():
                    batch_size = v.size(0)
                    inputs_t[k] = v.to(configs.device, non_blocking=True)
            else:
                batch_size = inputs_t.size(0)
                inputs_t = inputs_t.to(configs.device, non_blocking=True)

            if isinstance(targets, dict):
                for k, v in targets.items():
                    targets[k] = v.to(configs.device, non_blocking=True)
            else:
                targets = targets.to(configs.device, non_blocking=True)

            # Supervised forward pass on the source domain.
            outputs = model(inputs)

            # Two classifier-head predictions on the target domain.
            pred_t1, pred_t2 = model.module.inst_seg_net(
                {
                    'features': inputs_t['features'],
                    'one_hot_vectors': inputs_t['one_hot_vectors']
                },
                constant=cons,
                adaptation=True)

            loss_s = criterion(outputs, targets)

            # Adversarial loss
            loss_adv = -1 * discrepancy_loss(pred_t1, pred_t2)

            loss = loss_s + loss_adv
            loss.backward()
            optimizer_g.step()
            optimizer_cls.step()
            optimizer_g.zero_grad()
            optimizer_cls.zero_grad()

            loss_adv_total += loss_adv.item() * batch_size

            # Gen Training: extra generator-only adversarial updates.
            for _ in range(configs.train.gen_num_train):
                pred_t1, pred_t2 = model.module.inst_seg_net(
                    {
                        'features': inputs_t['features'],
                        'one_hot_vectors': inputs_t['one_hot_vectors']
                    },
                    constant=cons,
                    adaptation=True)
                loss_adv = -1 * discrepancy_loss(pred_t1, pred_t2)
                loss_adv.backward()
                loss_adv_total += loss_adv.item() * batch_size
                optimizer_g.step()
                optimizer_g.zero_grad()

            loss_total += loss_s.item() * batch_size
            data_total += batch_size

            # Running (cumulative-average) losses logged every batch.
            writer.add_scalar('loss_s/train', loss_total / data_total,
                              current_step)
            writer.add_scalar('loss_adv/train', loss_adv_total / data_total,
                              current_step)
            current_step += batch_size

        if scheduler_g is not None:
            scheduler_g.step()

        if scheduler_cls is not None:
            scheduler_cls.step()

    # evaluate kernel
    def evaluate(model, loader, split='test'):
        """Run the model over ``loader`` without gradients and return the
        computed metrics from ``configs.train.meters`` keyed by split name."""
        meters = {}
        for k, meter in configs.train.meters.items():
            meters[k.format(split)] = meter()
        model.eval()
        with torch.no_grad():
            for inputs, targets in tqdm(loader, desc=split, ncols=0):
                if isinstance(inputs, dict):
                    for k, v in inputs.items():
                        inputs[k] = v.to(configs.device, non_blocking=True)
                else:
                    inputs = inputs.to(configs.device, non_blocking=True)
                if isinstance(targets, dict):
                    for k, v in targets.items():
                        targets[k] = v.to(configs.device, non_blocking=True)
                else:
                    targets = targets.to(configs.device, non_blocking=True)
                outputs = model(inputs)
                for meter in meters.values():
                    meter.update(outputs, targets)
        for k, meter in meters.items():
            meters[k] = meter.compute()
        return meters

    ###########
    # Prepare #
    ###########

    if configs.device == 'cuda':
        cudnn.benchmark = True
        if configs.get('deterministic', False):
            cudnn.deterministic = True
            cudnn.benchmark = False
    # Derive a seed from torch if none was configured, then seed everything.
    if ('seed' not in configs) or (configs.seed is None):
        configs.seed = torch.initial_seed() % (2**32 - 1)
    seed = configs.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    print(configs)

    #####################################################################
    # Initialize DataLoaders, Model, Criterion, LRScheduler & Optimizer #
    #####################################################################

    print(f'\n==> loading source dataset "{configs.source_dataset}"')
    source_dataset = configs.source_dataset()
    source_loaders = {
        "train":
        DataLoader(
            source_dataset["train"],
            shuffle=True,
            batch_size=configs.train.batch_size,
            drop_last=True,
            num_workers=configs.data.num_workers,
            pin_memory=True,
            worker_init_fn=lambda worker_id: np.random.seed(seed + worker_id))
    }

    print(f'\n==> loading target dataset "{configs.target_dataset}"')
    target_dataset = configs.target_dataset()
    target_loaders = {}
    for split in target_dataset:
        target_loaders[split] = DataLoader(
            target_dataset[split],
            shuffle=(split == 'train'),
            batch_size=configs.train.batch_size,
            drop_last=True,
            num_workers=configs.data.num_workers,
            pin_memory=True,
            worker_init_fn=lambda worker_id: np.random.seed(seed + worker_id))

    print(f'\n==> creating model "{configs.model}"')
    model = configs.model()
    if configs.device == 'cuda':
        model = torch.nn.DataParallel(model)
    model = model.to(configs.device)
    criterion = configs.train.criterion().to(configs.device)
    #params
    # Generator params exclude the 'pred_offset' head (trained via cls opt).
    gen_params = [{
        'params': v
    } for k, v in model.module.inst_seg_net.g.named_parameters()
                  if 'pred_offset' not in k]

    cls_params = [{
        'params': model.module.inst_seg_net.c1.parameters()
    }, {
        'params': model.module.inst_seg_net.c2.parameters()
    }, {
        'params': model.module.center_reg_net.parameters()
    }, {
        'params': model.module.box_est_net.parameters()
    }]

    optimizer_g = configs.train.optimizer_g(gen_params)
    optimizer_cls = configs.train.optimizer_cls(cls_params)
    # optimizer_dis = configs.train.optimizer_dis(dis_params)

    last_epoch, best_metrics = -1, {m: None for m in configs.train.metrics}

    # Resume from checkpoint if one exists at the configured path.
    if os.path.exists(configs.train.checkpoint_path):

        print(f'==> loading checkpoint "{configs.train.checkpoint_path}"')
        checkpoint = torch.load(configs.train.checkpoint_path)

        print(' => loading model')
        model.load_state_dict(checkpoint.pop('model'))

        if 'optimizer_g' in checkpoint and checkpoint[
                'optimizer_g'] is not None:
            print(' => loading optimizer_g')
            optimizer_g.load_state_dict(checkpoint.pop('optimizer_g'))

        if 'optimizer_cls' in checkpoint and checkpoint[
                'optimizer_cls'] is not None:
            print(' => loading optimizer_cls')
            optimizer_cls.load_state_dict(checkpoint.pop('optimizer_cls'))

        last_epoch = checkpoint.get('epoch', last_epoch)
        meters = checkpoint.get('meters', {})

        for m in configs.train.metrics:
            best_metrics[m] = meters.get(m + '_best', best_metrics[m])

        del checkpoint

    if 'scheduler_g' in configs.train and configs.train.scheduler_g is not None:
        configs.train.scheduler_g.last_epoch = last_epoch
        print(f'==> creating scheduler "{configs.train.scheduler_g}"')
        scheduler_g = configs.train.scheduler_g(optimizer_g)
    else:
        scheduler_g = None

    if 'scheduler_c' in configs.train and configs.train.scheduler_c is not None:
        configs.train.scheduler_c.last_epoch = last_epoch
        print(f'==> creating scheduler "{configs.train.scheduler_c}"')
        scheduler_c = configs.train.scheduler_c(optimizer_cls)
    else:
        scheduler_c = None

    ############
    # Training #
    ############

    # Already fully trained: just evaluate the non-train target splits.
    if last_epoch >= configs.train.num_epochs:
        meters = dict()
        for split, loader in target_loaders.items():
            if split != 'train':
                meters.update(evaluate(model, loader=loader, split=split))
        for k, meter in meters.items():
            print(f'[{k}] = {meter:2f}')
        return

    with tensorboardX.SummaryWriter(configs.train.save_path) as writer:
        step_size = min(len(source_dataset['train']),
                        len(target_dataset['train']))

        for current_epoch in range(last_epoch + 1, configs.train.num_epochs):
            current_step = current_epoch * step_size
            # Adaptation constant ramps 0 -> 1 over training on a sine curve.
            cons = math.sin(
                (current_epoch + 1) / configs.train.num_epochs * math.pi / 2)

            # NOTE(review): crashes if scheduler_g/scheduler_c is None — the
            # config apparently always provides both; confirm.
            writer.add_scalar('lr_g', scheduler_g.get_lr()[0], current_epoch)
            writer.add_scalar('lr_c', scheduler_c.get_lr()[0], current_epoch)

            # train
            print(
                f'\n==> training epoch {current_epoch}/{configs.train.num_epochs}'
            )
            train(model,
                  source_loader=source_loaders['train'],
                  target_loader=target_loaders['train'],
                  criterion=criterion,
                  optimizer_g=optimizer_g,
                  optimizer_cls=optimizer_cls,
                  scheduler_g=scheduler_g,
                  scheduler_cls=scheduler_c,
                  current_step=current_step,
                  writer=writer,
                  cons=cons)
            # NOTE(review): current_step is recomputed at the top of each
            # epoch, so this increment has no observable effect.
            current_step += step_size

            # evaluate
            meters = dict()
            for split, loader in source_loaders.items():
                if split != 'train':
                    meters.update(evaluate(model, loader=loader, split=split))
            for k, meter in meters.items():
                print(f'Source [{k}] = {meter:2f}')

            meters = dict()
            for split, loader in target_loaders.items():
                if split != 'train':
                    meters.update(evaluate(model, loader=loader, split=split))

            # check whether it is the best
            best = {m: False for m in configs.train.metrics}
            for m in configs.train.metrics:
                if best_metrics[m] is None or best_metrics[m] < meters[m]:
                    best_metrics[m], best[m] = meters[m], True
                meters[m + '_best'] = best_metrics[m]
            # log in tensorboard
            for k, meter in meters.items():
                print(f'Target [{k}] = {meter:2f}')
                writer.add_scalar(k, meter, current_step)

            # save checkpoint
            torch.save(
                {
                    'epoch': current_epoch,
                    'model': model.state_dict(),
                    'optimizer_g': optimizer_g.state_dict(),
                    'optimizer_cls': optimizer_cls.state_dict(),
                    'meters': meters,
                    'configs': configs,
                }, configs.train.checkpoint_path)
            shutil.copyfile(
                configs.train.checkpoint_path,
                configs.train.checkpoints_path.format(current_epoch))
            for m in configs.train.metrics:
                if best[m]:
                    shutil.copyfile(configs.train.checkpoint_path,
                                    configs.train.best_checkpoint_paths[m])
            if best.get(configs.train.metric, False):
                shutil.copyfile(configs.train.checkpoint_path,
                                configs.train.best_checkpoint_path)
            print(f'[save_path] = {configs.train.save_path}')
コード例 #13
0
def main():
    """Train a segmentation network with adversarial domain adaptation.

    Each iteration alternates two phases: (1) update the segmentation model
    with the supervised source-domain loss plus an adversarial loss that
    pushes target-domain predictions to fool the discriminator, and
    (2) update the discriminator to distinguish source from target
    prediction maps (on detached scores). Snapshots and TensorBoard logs
    are written under ``args.snapshot_dir``.
    """
    opt = TrainOptions()
    args = opt.initialize()

    _t = {'iter time': Timer()}

    model_name = args.source + '_to_' + args.target
    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)
        os.makedirs(os.path.join(args.snapshot_dir, 'logs'))
    opt.print_options(args)

    sourceloader, targetloader = CreateSrcDataLoader(
        args), CreateTrgDataLoader(args)
    targetloader_iter, sourceloader_iter = iter(targetloader), iter(
        sourceloader)

    model, optimizer = CreateModel(args)
    model_D, optimizer_D = CreateDiscriminator(args)

    # Resume the step counter from a checkpoint named '<source>_<iter>.pth'.
    start_iter = 0
    if args.restore_from is not None:
        start_iter = int(args.restore_from.rsplit('/', 1)[1].rsplit('_')[1])

    train_writer = tensorboardX.SummaryWriter(
        os.path.join(args.snapshot_dir, "logs", model_name))

    bce_loss = torch.nn.BCEWithLogitsLoss()

    cudnn.enabled = True
    cudnn.benchmark = True
    model.train()
    model.cuda()
    model_D.train()
    model_D.cuda()

    def _fetch(iterator, loader):
        """Return (batch, iterator), restarting the loader when exhausted.

        Fixes two defects of the original code: ``iterator.next()`` is a
        Python 2 idiom (Python 3 iterators only support ``next(iterator)``),
        and a plain ``next()`` raises StopIteration once the loader is
        exhausted, which crashed runs where ``num_steps`` exceeds one epoch.
        """
        try:
            return next(iterator), iterator
        except StopIteration:
            iterator = iter(loader)
            return next(iterator), iterator

    _t['iter time'].tic()
    for i in range(start_iter, args.num_steps):

        model.adjust_learning_rate(args, optimizer, i)
        model_D.adjust_learning_rate(args, optimizer_D, i)

        optimizer.zero_grad()
        optimizer_D.zero_grad()
        # Phase 1: freeze the discriminator while the segmenter updates.
        for param in model_D.parameters():
            param.requires_grad = False

        (src_img, src_lbl, _, _), sourceloader_iter = _fetch(
            sourceloader_iter, sourceloader)
        src_img, src_lbl = Variable(src_img).cuda(), Variable(
            src_lbl.long()).cuda()
        src_seg_score = model(src_img, lbl=src_lbl)
        loss_seg_src = model.loss
        loss_seg_src.backward()

        if args.data_label_folder_target is not None:
            # (Pseudo-)labels are available for the target domain.
            (trg_img, trg_lbl, _, _), targetloader_iter = _fetch(
                targetloader_iter, targetloader)
            trg_img, trg_lbl = Variable(trg_img).cuda(), Variable(
                trg_lbl.long()).cuda()
            trg_seg_score = model(trg_img, lbl=trg_lbl)
            loss_seg_trg = model.loss
        else:
            (trg_img, _, name), targetloader_iter = _fetch(
                targetloader_iter, targetloader)
            trg_img = Variable(trg_img).cuda()
            trg_seg_score = model(trg_img)
            loss_seg_trg = 0

        # dim=1 is the class axis of the (N, C, H, W) score map; it matches
        # the axis the legacy implicit-dim F.softmax picked for 4-D input.
        outD_trg = model_D(F.softmax(trg_seg_score, dim=1), 0)
        loss_D_trg_fake = model_D.loss

        loss_trg = args.lambda_adv_target * loss_D_trg_fake + loss_seg_trg
        loss_trg.backward()

        # Phase 2: unfreeze the discriminator and train it on detached
        # scores so no gradient flows back into the segmenter.
        for param in model_D.parameters():
            param.requires_grad = True

        src_seg_score, trg_seg_score = src_seg_score.detach(
        ), trg_seg_score.detach()

        outD_src = model_D(F.softmax(src_seg_score, dim=1), 0)
        loss_D_src_real = model_D.loss / 2
        loss_D_src_real.backward()

        outD_trg = model_D(F.softmax(trg_seg_score, dim=1), 1)
        loss_D_trg_real = model_D.loss / 2
        loss_D_trg_real.backward()

        optimizer.step()
        optimizer_D.step()

        # Explicit mapping replaces the original eval() over variable names,
        # which was fragile and a code-injection smell.
        losses = {
            'loss_seg_src': loss_seg_src,
            'loss_seg_trg': loss_seg_trg,
            'loss_D_trg_fake': loss_D_trg_fake,
            'loss_D_src_real': loss_D_src_real,
            'loss_D_trg_real': loss_D_trg_real,
        }
        for m, value in losses.items():
            train_writer.add_scalar(m, value, i + 1)

        if (i + 1) % args.save_pred_every == 0:
            print('taking snapshot ...')
            torch.save(
                model.state_dict(),
                os.path.join(args.snapshot_dir,
                             '%s_' % (args.source) + str(i + 1) + '.pth'))

        if (i + 1) % args.print_freq == 0:
            _t['iter time'].toc(average=False)
            print('[it %d][src seg loss %.4f][lr %.4f][%.2fs]' % \
                    (i + 1, loss_seg_src.data, optimizer.param_groups[0]['lr']*10000, _t['iter time'].diff))
            if i + 1 > args.num_steps_stop:
                print('finish training')
                break
            _t['iter time'].tic()
コード例 #14
0
                                torch.nn.Linear(128, 64), torch.nn.CELU(0.1),
                                torch.nn.Linear(64, 1))
    return model


nn = torchani.ANIModel([atomic() for _ in range(4)])
print(nn)

if os.path.isfile(model_checkpoint):
    nn.load_state_dict(torch.load(model_checkpoint))
else:
    torch.save(nn.state_dict(), model_checkpoint)

model = torch.nn.Sequential(aev_computer, nn).to(device)

writer = tensorboardX.SummaryWriter(log_dir=log)

training = torchani.data.BatchedANIDataset(
    training_path,
    consts.species_to_tensor,
    batch_size,
    device=device,
    transform=[energy_shifter.subtract_from_dataset])

print(training)

validation = torchani.data.BatchedANIDataset(
    validation_path,
    consts.species_to_tensor,
    batch_size,
    device=device,
コード例 #15
0
def main(_):
    """Entry point for distributed IMPALA training on Breakout.

    Depending on ``FLAGS.job_name`` this process runs either as the single
    learner (task 0, port 8000) that trains from trajectory batches pulled
    off a shared FIFO queue, or as one of ``FLAGS.num_actors`` actors
    (ports 8001+) that roll out trajectories with the learner's policy and
    push them into that queue.
    """
    local_job_device = '/job:{}/task:{}'.format(FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    # NOTE(review): is_actor_fn and filters are built but not referenced
    # again in this function.
    is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'

    # One actor endpoint per port starting at 8001; the learner owns 8000.
    cluster = tf.train.ClusterSpec({
        'actor':
        ['localhost:{}'.format(8001 + i) for i in range(FLAGS.num_actors)],
        'learner': ['localhost:8000']
    })

    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task)

    filters = [shared_job_device, local_job_device]

    # Fixed Atari preprocessing: 84x84 frames with 4 stacked channels,
    # 4 discrete actions for BreakoutDeterministic-v4.
    input_shape = [84, 84, 4]
    output_size = 4
    env_name = 'BreakoutDeterministic-v4'

    # The queue and the learner's variables live on the shared learner
    # device so every actor process talks to the same parameters.
    with tf.device(shared_job_device):
        queue = buffer_queue.FIFOQueue(FLAGS.trajectory, input_shape,
                                       output_size, FLAGS.queue_size,
                                       FLAGS.batch_size, FLAGS.num_actors)
        learner = model.IMPALA(trajectory=FLAGS.trajectory,
                               input_shape=input_shape,
                               num_action=output_size,
                               discount_factor=FLAGS.discount_factor,
                               start_learning_rate=FLAGS.start_learning_rate,
                               end_learning_rate=FLAGS.end_learning_rate,
                               learning_frame=FLAGS.learning_frame,
                               baseline_loss_coef=FLAGS.baseline_loss_coef,
                               entropy_coef=FLAGS.entropy_coef,
                               gradient_clip_norm=FLAGS.gradient_clip_norm)

    sess = tf.Session(server.target)
    queue.set_session(sess)
    learner.set_session(sess)

    if is_learner:
        # ----- Learner: train forever from batches sampled off the queue.
        writer = tensorboardX.SummaryWriter('runs/learner')
        train_step = 0

        while True:
            size = queue.get_size()
            # Only train once the queue holds more than 3 batches' worth of
            # trajectories, so sampling never starves.
            if size > 3 * FLAGS.batch_size:
                train_step += 1
                batch = queue.sample_batch()
                s = time.time()
                pi_loss, baseline_loss, entropy, learning_rate = learner.train(
                    state=np.stack(batch.state),
                    reward=np.stack(batch.reward),
                    action=np.stack(batch.action),
                    done=np.stack(batch.done),
                    behavior_policy=np.stack(batch.behavior_policy))
                writer.add_scalar('data/pi_loss', pi_loss, train_step)
                writer.add_scalar('data/baseline_loss', baseline_loss,
                                  train_step)
                writer.add_scalar('data/entropy', entropy, train_step)
                writer.add_scalar('data/learning_rate', learning_rate,
                                  train_step)
                writer.add_scalar('data/time', time.time() - s, train_step)
    else:
        # ----- Actor: roll out trajectories and push them into the queue.
        trajectory_data = collections.namedtuple('trajectory_data', [
            'state', 'next_state', 'reward', 'done', 'action',
            'behavior_policy'
        ])

        env = wrappers.make_uint8_env(env_name)
        if FLAGS.task == 0:
            # Only actor 0 records video, every 10th episode.
            env = gym.wrappers.Monitor(
                env,
                'save-mov',
                video_callable=lambda episode_id: episode_id % 10 == 0)
        state = env.reset()

        episode = 0
        score = 0
        episode_step = 0
        total_max_prob = 0
        lives = 5  # Breakout starts with 5 lives.

        writer = tensorboardX.SummaryWriter('runs/actor_{}'.format(FLAGS.task))

        while True:

            unroll_data = trajectory_data([], [], [], [], [], [])

            # Collect one fixed-length trajectory segment.
            for _ in range(FLAGS.trajectory):

                action, behavior_policy, max_prob = learner.get_policy_and_action(
                    state)

                episode_step += 1
                total_max_prob += max_prob

                next_state, reward, done, info = env.step(action)

                score += reward

                # Treat a lost life as a terminal step with reward -1 so the
                # learner bootstraps correctly across life boundaries.
                # NOTE(review): assumes an ALE env exposing
                # info['ale.lives'] — confirm for non-Atari envs.
                if lives != info['ale.lives']:
                    r = -1
                    d = True
                else:
                    r = reward
                    d = False

                unroll_data.state.append(state)
                unroll_data.next_state.append(next_state)
                unroll_data.reward.append(r)
                unroll_data.done.append(d)
                unroll_data.action.append(action)
                unroll_data.behavior_policy.append(behavior_policy)

                state = next_state
                lives = info['ale.lives']

                if done:
                    # Episode finished: log stats and reset counters mid-unroll.
                    print(episode, score)
                    writer.add_scalar('data/prob',
                                      total_max_prob / episode_step, episode)
                    writer.add_scalar('data/score', score, episode)
                    writer.add_scalar('data/episode_step', episode_step,
                                      episode)
                    episode += 1
                    score = 0
                    episode_step = 0
                    total_max_prob = 0
                    lives = 5
                    state = env.reset()

            queue.append_to_queue(
                task=FLAGS.task,
                unrolled_state=unroll_data.state,
                unrolled_next_state=unroll_data.next_state,
                unrolled_reward=unroll_data.reward,
                unrolled_done=unroll_data.done,
                unrolled_action=unroll_data.action,
                unrolled_behavior_policy=unroll_data.behavior_policy)
コード例 #16
0
def main():
    """Train a 6D pose-estimation model on YCB-Video data with Chainer.

    Parses CLI arguments, builds concatenated real+synthetic training data
    and validation data, and runs a Chainer Trainer with TensorBoard
    logging, periodic evaluation, and best/latest snapshotting. With
    ``--multi-node`` (ChainerMN, "pure_nccl"), dataset loading, logging,
    and snapshotting happen only on rank 0 and data is scattered to ranks.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument("--multi-node", action="store_true", help="multi node")
    parser.add_argument("--out", help="output directory")
    parser.add_argument("--debug", action="store_true", help="debug mode")
    parser.add_argument("--gpu", type=int, default=0, help="gpu id")
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument(
        "--lr",
        type=float,
        default=0.0001,
        help="learning rate",
    )
    parser.add_argument(
        "--max-epoch",
        type=int,
        default=30,
        help="max epoch",
    )
    parser.add_argument(
        "--call-evaluation-before-training",
        action="store_true",
        help="call evaluation before training",
    )

    def argparse_type_class_ids(string):
        """Convert a --class-ids string into a list of integer class ids.

        Accepts 'all' (every class except background), 'asymmetric',
        'symmetric', or a comma-separated list of ints (e.g. '1,6,9').
        """
        if string == "all":
            n_class = len(morefusion.datasets.ycb_video.class_names)
            class_ids = np.arange(n_class)[1:].tolist()
        elif string == "asymmetric":
            class_ids = (
                morefusion.datasets.ycb_video.class_ids_asymmetric.tolist())
        elif string == "symmetric":
            class_ids = (
                morefusion.datasets.ycb_video.class_ids_symmetric.tolist())
        else:
            class_ids = [int(x) for x in string.split(",")]
        return class_ids

    parser.add_argument(
        "--class-ids",
        type=argparse_type_class_ids,
        default="all",
        help="class id (e.g., 'all', 'asymmetric', 'symmetric', '1,6,9')",
    )
    parser.add_argument(
        "--pretrained-model",
        help="pretrained model",
    )
    parser.add_argument(
        "--note",
        help="note",
    )
    parser.add_argument(
        "--pretrained-resnet18",
        action="store_true",
        help="pretrained resnet18",
    )
    parser.add_argument(
        "--centerize-pcd",
        action="store_true",
        help="centerize pcd",
    )
    parser.add_argument(
        "--resume",
        help="resume",
    )
    parser.add_argument(
        "--loss",
        choices=["add/add_s", "add->add/add_s|1"],
        default="add->add/add_s|1",
        help="loss",
    )
    args = parser.parse_args()

    chainer.global_config.debug = args.debug

    # -------------------------------------------------------------------------

    # device initialization
    if args.multi_node:
        import chainermn

        comm = chainermn.create_communicator("pure_nccl")
        device = comm.intra_rank
        n_gpu = comm.size
    else:
        device = args.gpu
        n_gpu = 1

    # `comm` exists only when --multi-node; the short-circuit `or` keeps
    # this guard (and the identical guards below) safe in single-node runs.
    if not args.multi_node or comm.rank == 0:
        now = datetime.datetime.now(datetime.timezone.utc)
        args.timestamp = now.isoformat()
        args.hostname = socket.gethostname()
        args.githash = morefusion.utils.githash(__file__)

        termcolor.cprint("==> Started training", attrs={"bold": True})

    # Rank 0 picks a timestamped log dir (`now` is set under the same guard
    # above) and broadcasts it so every rank writes to the same place.
    if args.out is None:
        if not args.multi_node or comm.rank == 0:
            args.out = osp.join(here, "logs", now.strftime("%Y%m%d_%H%M%S.%f"))
        else:
            args.out = None
        if args.multi_node:
            args.out = comm.bcast_obj(args.out)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()

    # seed initialization
    random.seed(args.seed)
    np.random.seed(args.seed)
    if device >= 0:
        chainer.cuda.cupy.random.seed(args.seed)

    # dataset initialization (rank 0 only; scattered to other ranks below)
    data_train = None
    data_valid = None
    if not args.multi_node or comm.rank == 0:
        termcolor.cprint("==> Dataset size", attrs={"bold": True})

        # Training data: real YCB frames + an equal-sized random sample of
        # synthetic YCB frames + a custom synthetic set, all augmented.
        data_ycb_trainreal = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "trainreal", class_ids=args.class_ids, augmentation=True)
        data_ycb_syn = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "syn", class_ids=args.class_ids, augmentation=True)
        data_ycb_syn = morefusion.datasets.RandomSamplingDataset(
            data_ycb_syn, len(data_ycb_trainreal))
        data_my_train = morefusion.datasets.MySyntheticYCB20190916RGBDPoseEstimationDatasetReIndexed(  # NOQA
            "train", class_ids=args.class_ids, augmentation=True)
        data_train = chainer.datasets.ConcatenatedDataset(
            data_ycb_trainreal, data_ycb_syn, data_my_train)
        print(f"ycb_trainreal={len(data_ycb_trainreal)}, "
              f"ycb_syn={len(data_ycb_syn)}, my_train={len(data_my_train)}")
        del data_ycb_trainreal, data_ycb_syn, data_my_train

        data_ycb_val = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "val", class_ids=args.class_ids)
        data_my_val = morefusion.datasets.MySyntheticYCB20190916RGBDPoseEstimationDatasetReIndexed(  # NOQA
            "val", class_ids=args.class_ids)
        data_valid = chainer.datasets.ConcatenatedDataset(
            data_ycb_val,
            data_my_val,
        )
        print(f"ycb_val={len(data_ycb_val)}, my_val={len(data_my_val)}")
        del data_ycb_val, data_my_val

        data_train = chainer.datasets.TransformDataset(data_train, transform)
        data_valid = chainer.datasets.TransformDataset(data_valid, transform)

    if args.multi_node:
        data_train = chainermn.scatter_dataset(data_train,
                                               comm,
                                               shuffle=True,
                                               seed=args.seed)
        data_valid = chainermn.scatter_dataset(data_valid,
                                               comm,
                                               shuffle=False,
                                               seed=args.seed)

    args.class_names = morefusion.datasets.ycb_video.class_names.tolist()

    # 'add->add/add_s|1' starts training with the 'add' loss; the
    # update_loss extension below switches it to 'add/add_s' after epoch 1.
    loss = args.loss
    if loss == "add->add/add_s|1":
        loss = "add"

    # model initialization
    model = contrib.models.Model(
        n_fg_class=len(args.class_names) - 1,
        centerize_pcd=args.centerize_pcd,
        pretrained_resnet18=args.pretrained_resnet18,
        loss=loss,
    )
    if args.pretrained_model is not None:
        chainer.serializers.load_npz(args.pretrained_model, model)
    if device >= 0:
        model.to_gpu()

    # optimizer initialization
    optimizer = chainer.optimizers.Adam(alpha=args.lr)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)

    # With a pretrained backbone, freeze the early ResNet blocks and all
    # BatchNormalization layers.
    if args.pretrained_resnet18:
        model.resnet_extractor.init_block.disable_update()
        model.resnet_extractor.res2.disable_update()
        for link in model.links():
            if isinstance(link, chainer.links.BatchNormalization):
                link.disable_update()

    if not args.multi_node or comm.rank == 0:
        termcolor.cprint("==> Link update rules", attrs={"bold": True})
        for name, link in model.namedlinks():
            print(name, link.update_enabled)

    # iterator initialization (per-rank batch 16//n_gpu keeps global batch 16)
    iter_train = chainer.iterators.MultiprocessIterator(
        data_train,
        batch_size=16 // n_gpu,
        repeat=True,
        shuffle=True,
    )
    iter_valid = chainer.iterators.MultiprocessIterator(
        data_valid,
        batch_size=16,
        repeat=False,
        shuffle=False,
    )

    updater = chainer.training.StandardUpdater(
        iterator=iter_train,
        optimizer=optimizer,
        device=device,
    )
    if not args.multi_node or comm.rank == 0:
        writer = tensorboardX.SummaryWriter(log_dir=args.out)
        writer_with_updater = morefusion.training.SummaryWriterWithUpdater(
            writer)
        writer_with_updater.setup(updater)

    # -------------------------------------------------------------------------

    trainer = chainer.training.Trainer(updater, (args.max_epoch, "epoch"),
                                       out=args.out)
    trainer.extend(E.FailOnNonNumber())

    @chainer.training.make_extension(trigger=(1, "iteration"))
    def update_loss(trainer):
        """Switch the model loss from 'add' to 'add/add_s' after epoch 1
        when the 'add->add/add_s|1' schedule is selected."""
        updater = trainer.updater
        optimizer = updater.get_optimizer("main")
        target = optimizer.target
        assert trainer.stop_trigger.unit == "epoch"

        if args.loss == "add->add/add_s|1":
            if updater.epoch_detail < 1:
                assert target._loss == "add"
            else:
                target._loss = "add/add_s"
        else:
            assert args.loss in ["add/add_s"]
            return

    trainer.extend(update_loss)

    log_interval = 10, "iteration"
    eval_interval = 0.25, "epoch"

    # evaluate
    evaluator = morefusion.training.extensions.PoseEstimationEvaluator(
        iterator=iter_valid,
        target=model,
        device=device,
        progress_bar=True,
    )
    if args.multi_node:
        evaluator.comm = comm
    trainer.extend(
        evaluator,
        trigger=eval_interval,
        call_before_training=args.call_evaluation_before_training,
    )

    if not args.multi_node or comm.rank == 0:
        # print arguments
        msg = pprint.pformat(args.__dict__)
        msg = textwrap.indent(msg, prefix=" " * 2)
        termcolor.cprint("==> Arguments", attrs={"bold": True})
        print(f"\n{msg}\n")

        trainer.extend(
            morefusion.training.extensions.ArgsReport(args),
            call_before_training=True,
        )

        # snapshot: keep the latest trainer/model plus the best model by
        # minimum ADD error and by maximum ADD AUC.
        trigger_best_add = chainer.training.triggers.MinValueTrigger(
            key="validation/main/add_or_add_s",
            trigger=eval_interval,
        )
        trigger_best_auc = chainer.training.triggers.MaxValueTrigger(
            key="validation/main/auc/add_or_add_s",
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot(filename="snapshot_trainer_latest.npz"),
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_latest.npz"),
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_best_add.npz"),
            trigger=trigger_best_add,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_best_auc.npz"),
            trigger=trigger_best_auc,
        )

        # log
        trainer.extend(
            morefusion.training.extensions.LogTensorboardReport(
                writer=writer,
                trigger=log_interval,
            ),
            call_before_training=True,
        )
        trainer.extend(
            E.PrintReport(
                [
                    "epoch",
                    "iteration",
                    "elapsed_time",
                    "main/loss",
                    "main/add_or_add_s",
                    "validation/main/auc/add_or_add_s",
                ],
                log_report="LogTensorboardReport",
            ),
            trigger=log_interval,
            call_before_training=True,
        )
        trainer.extend(E.ProgressBar(update_interval=1))

    # -------------------------------------------------------------------------

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
コード例 #17
0
    def train(self):
        """Train ``Net`` on the configured dataset with TensorBoard logging.

        Builds train/test loaders for CIFAR10/CIFAR100, MNIST, or CELEBA
        (selected by ``self.config.data.dataset``), optionally resumes from
        ``checkpoint.pth``, and runs an NLL-loss training loop with a
        cosine-annealing LR schedule. Each training step also evaluates one
        held-out batch, and snapshots are saved every
        ``config.training.snapshot_interval`` epochs.
        """
        # NOTE(review): an unrecognized dataset name leaves `dataset` and
        # `test_dataset` unbound and fails at DataLoader construction below
        # — confirm callers validate the dataset name upstream.
        if 'CIFAR' in self.config.data.dataset:
            if self.config.data.augmentation:
                transform_train = transforms.Compose([
                    transforms.RandomCrop(32, padding=4),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    transforms.Normalize((0.4914, 0.4822, 0.4465),
                                         (0.2023, 0.1994, 0.2010)),
                ])

            else:
                transform_train = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.4914, 0.4822, 0.4465),
                                         (0.2023, 0.1994, 0.2010)),
                ])

            # Test transform never augments, only normalizes.
            transform_test = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
            ])
            if self.config.data.dataset == 'CIFAR10':
                dataset = CIFAR10(os.path.join(self.args.run, 'datasets',
                                               'cifar10'),
                                  train=True,
                                  download=True,
                                  transform=transform_train)
                test_dataset = CIFAR10(os.path.join(self.args.run, 'datasets',
                                                    'cifar10'),
                                       train=False,
                                       download=True,
                                       transform=transform_test)
            elif self.config.data.dataset == 'CIFAR100':
                dataset = CIFAR100(os.path.join(self.args.run, 'datasets',
                                                'cifar100'),
                                   train=True,
                                   download=True,
                                   transform=transform_train)
                test_dataset = CIFAR100(os.path.join(self.args.run, 'datasets',
                                                     'cifar100'),
                                        train=False,
                                        download=True,
                                        transform=transform_test)

        elif self.config.data.dataset == 'MNIST':
            if self.config.data.augmentation:
                transform = transforms.Compose([
                    transforms.RandomCrop(28, padding=2),
                    transforms.ToTensor(),
                    transforms.Normalize((0.5, ), (0.5, ))
                ])
            else:
                transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.5, ), (0.5, ))
                ])

            dataset = MNIST(os.path.join(self.args.run, 'datasets', 'mnist'),
                            train=True,
                            download=True,
                            transform=transform)
            test_dataset = MNIST(os.path.join(self.args.run, 'datasets',
                                              'mnist_test'),
                                 train=False,
                                 download=True,
                                 transform=transform)

        elif self.config.data.dataset == 'CELEBA':
            dataset = ImageFolder(
                root=os.path.join(self.args.run, 'datasets', 'celeba'),
                transform=transforms.Compose([
                    transforms.CenterCrop(140),
                    transforms.Resize(self.config.data.image_size),
                    transforms.ToTensor(),
                    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
                ]))
            # Deterministic 70/10 train/test split: shuffle indices under a
            # fixed seed, then restore the global RNG state so the rest of
            # training is unaffected.
            num_items = len(dataset)
            indices = list(range(num_items))
            random_state = np.random.get_state()
            np.random.seed(2019)
            np.random.shuffle(indices)
            np.random.set_state(random_state)
            train_indices, test_indices = indices[:int(
                num_items * 0.7)], indices[int(num_items * 0.7):int(num_items *
                                                                    0.8)]
            test_dataset = Subset(dataset, test_indices)
            dataset = Subset(dataset, train_indices)

        dataloader = DataLoader(dataset,
                                batch_size=self.config.training.batch_size,
                                shuffle=True,
                                num_workers=4,
                                drop_last=True)
        test_loader = DataLoader(test_dataset,
                                 batch_size=self.config.training.batch_size,
                                 shuffle=False,
                                 num_workers=4,
                                 drop_last=True)
        test_iter = iter(test_loader)

        net = Net(self.config).to(self.config.device)
        #net = ResNet(self.config).to(self.config.device)
        net = torch.nn.DataParallel(net)
        optimizer = self.get_optimizer(net.parameters())

        # A fresh run wipes any previous TensorBoard logs for this doc tag.
        tb_path = os.path.join(self.args.run, 'tensorboard', self.args.doc)
        if os.path.exists(tb_path):
            shutil.rmtree(tb_path)

        tb_logger = tensorboardX.SummaryWriter(log_dir=tb_path)

        if self.args.resume_training:
            # Checkpoint layout: [state_dict, optimizer_state, epoch, step].
            states = torch.load(os.path.join(self.args.run, 'logs',
                                             self.args.doc, 'checkpoint.pth'),
                                map_location=self.config.device)
            net.load_state_dict(states[0])
            optimizer.load_state_dict(states[1])
            begin_epoch = states[2]
            step = states[3]
        else:
            step = 0
            begin_epoch = 0

        # Train the model
        # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[150], gamma=0.3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, self.config.training.n_epochs, eta_min=0.)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, min_lr=1e-08)

        for epoch in range(begin_epoch, self.config.training.n_epochs):
            # NOTE(review): scheduler.step() at the top of the epoch follows
            # the pre-1.1 PyTorch convention; kept as-is to preserve the
            # original LR schedule.
            scheduler.step()
            # manually adjust learning rate
            # self.adjust_learning_rate(optimizer, epoch)
            # total_loss = 0 #for plateau scheduler only
            for batch_idx, (data, target) in enumerate(dataloader):
                net.train()
                data = data.to(device=self.config.device)
                target = target.to(device=self.config.device)
                output = net(data)
                loss = F.nll_loss(output, target)

                pred = torch.argmax(output, dim=1, keepdim=True)
                train_accuracy = float(
                    pred.eq(target.data.view_as(pred)).sum()) / float(
                        target.shape[0])

                # total_loss += loss.data #for plateau scheduler
                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # validation: evaluate one held-out batch per training step
                net.eval()

                with torch.no_grad():
                    try:
                        test_data, test_target = next(test_iter)
                    except StopIteration:
                        # Test loader exhausted; restart it. The original
                        # bare `except:` here swallowed every exception
                        # (including KeyboardInterrupt and real bugs); only
                        # StopIteration is the expected failure.
                        test_iter = iter(test_loader)
                        test_data, test_target = next(test_iter)

                    test_data = test_data.to(device=self.config.device)
                    test_target = test_target.to(device=self.config.device)
                    test_output = net(test_data)
                    test_loss = F.nll_loss(test_output, test_target)
                    test_pred = torch.argmax(test_output, dim=1, keepdim=True)
                    test_accuracy = float(
                        test_pred.eq(test_target.data.view_as(
                            test_pred)).sum()) / test_data.shape[0]

                tb_logger.add_scalar('training_loss', loss, global_step=step)
                tb_logger.add_scalar('training_accuracy',
                                     train_accuracy,
                                     global_step=step)
                tb_logger.add_scalar('test_loss', test_loss, global_step=step)
                tb_logger.add_scalar('test_accuracy',
                                     test_accuracy,
                                     global_step=step)

                if step % self.config.training.log_interval == 0:
                    logging.info(
                        "epoch: {}, batch: {}, training_loss: {}, train_accuracy: {}, test_loss: {}, test_accuracy: {}"
                        .format(epoch, batch_idx, loss.item(), train_accuracy,
                                test_loss.item(), test_accuracy))
                step += 1

            # scheduler.step(total_loss) #for palteau scheduler only
            if (epoch + 1) % self.config.training.snapshot_interval == 0:
                print(self.config.training.snapshot_interval)
                states = [
                    net.state_dict(),
                    optimizer.state_dict(), epoch + 1, step
                ]
                torch.save(
                    states,
                    os.path.join(self.args.run, 'logs', self.args.doc,
                                 'checkpoint_epoch_{}.pth'.format(epoch + 1)))
                torch.save(
                    states,
                    os.path.join(self.args.run, 'logs', self.args.doc,
                                 'checkpoint.pth'))
コード例 #18
0
    actor_delay=1,
    save_interval=100_000,
    name="awac_run",
    render=False,
    save_to_disk=True,
    log_to_disk=True,
    verbosity=0,
    infinite_bootstrap=True,
    **kwargs,
):

    if save_to_disk or log_to_disk:
        save_dir = utils.make_process_dirs(name)
    if log_to_disk:
        # create tb writer, save hparams
        writer = tensorboardX.SummaryWriter(save_dir)
        writer.add_hparams(locals(), {})

    ###########
    ## SETUP ##
    ###########
    agent.to(device)
    agent.train()
    # initialize target networks
    target_agent = copy.deepcopy(agent)
    target_agent.to(device)
    utils.hard_update(target_agent.critic1, agent.critic1)
    utils.hard_update(target_agent.critic2, agent.critic2)
    target_agent.train()
    # set up optimizers
    critic_optimizer = torch.optim.Adam(
コード例 #19
0
def run_train(model, cfg):
    """Train *model* according to the settings in the *cfg* dict.

    Runs the full epoch loop: forward/backward passes over the training
    loader, per-step TensorBoard loss logging, optional ReduceLROnPlateau
    scheduling driven by the validation macro-F1, and a checkpoint to
    ``<model_dir>/model.pth`` whenever either the validation loss or the
    macro-F1 improves.  Mutates ``cfg`` (stores the writer under
    ``cfg['writer']`` and the current epoch under ``cfg['step']``) so that
    ``run_val``/``run_test`` can see them.
    """
    train_loader = DataLoader(cfg['train'], batch_size=cfg['batch'],
                              shuffle=True, num_workers=cfg['nworker'],
                              collate_fn=cfg['collate'])
    checkpoint_path = os.path.join(cfg['model_dir'], "model.pth")
    writer = tensorboardX.SummaryWriter(cfg['model_dir'])
    cfg['writer'] = writer

    criterion = cfg['criterion']
    optimizer = torch.optim.Adam(
        model.parameters(), lr=cfg['lr'], weight_decay=cfg['decay'])
    if cfg['scheduler']:
        # 'max' mode: the scheduler watches the macro-F1, which we want to rise.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'max',
            factor=cfg['factor'], patience=cfg['patience'])

    step = 0
    for epoch in range(cfg['epochs']):
        print("----run train---", cfg['model'], epoch)
        model.train()
        epoch_start = time.time()

        cfg['step'] = epoch
        for batch in train_loader:
            sgene, img, label = batch
            # NOTE(review): inputs are moved to GPU via the legacy
            # torch.cuda.FloatTensor cast, so CUDA is required here.
            inputs = torch.from_numpy(img).type(torch.cuda.FloatTensor)
            target = torch.from_numpy(label).type(torch.cuda.FloatTensor)
            model.zero_grad()
            prediction = model(inputs)
            batch_loss = criterion(prediction, target)
            batch_loss.backward()
            optimizer.step()

            writer.add_scalar("loss", batch_loss, step)
            step += 1

        writer.add_scalar("train time", time.time() - epoch_start, epoch)

        val_loss, lab_f1_macro = run_val(model, cfg)
        print("val loss:", val_loss, "\tf1:", lab_f1_macro)
        if cfg['scheduler']:
            scheduler.step(lab_f1_macro)
            # Log the (possibly reduced) learning rate of every param group.
            for group in optimizer.param_groups:
                writer.add_scalar("lr", group['lr'], epoch)

        if epoch == 0:
            # Baselines for the improvement checks below.
            min_loss = val_loss
            max_f1 = 0.0

        improved_loss = val_loss < min_loss
        improved_f1 = lab_f1_macro > max_f1
        if improved_loss or improved_f1:
            if improved_loss:
                min_loss = val_loss
                print("----save best epoch:%d, loss:%f---" % (epoch, val_loss))
            if improved_f1:
                max_f1 = lab_f1_macro
                print("----save best epoch:%d, f1:%f---" % (epoch, max_f1))
            torch.save(model.state_dict(), checkpoint_path)
            run_test(model, cfg)
コード例 #20
0
 def __init__(self, path):
     """Create a logger that writes TensorBoard events under ``<path>/log``."""
     log_dir = os.path.join(path, "log")
     # Step counter starts at zero; callers advance it as they log.
     self.global_step = 0
     self.logger = tensorboardX.SummaryWriter(log_dir)
コード例 #21
0
ファイル: train_face.py プロジェクト: sarrbranka/pix2pixSC
    print('Resuming from epoch %d at iteration %d' % (start_epoch, epoch_iter))
else:
    start_epoch, epoch_iter = 1, 0

if opt.debug:
    opt.display_freq = 1
    opt.print_freq = 1
    opt.niter = 1
    opt.niter_decay = 0
    opt.max_dataset_size = 10

data_loader = CreateFaceConDataLoader(opt)
dataset = data_loader.load_data()
dataset_size = len(data_loader)
print('#training images = %d' % dataset_size)
train_writer = tensorboardX.SummaryWriter(os.path.join('./logs', opt.name))

model = create_model(opt)
visualizer = Visualizer(opt)

total_steps = (start_epoch - 1) * dataset_size + epoch_iter

display_delta = total_steps % opt.display_freq
print_delta = total_steps % opt.print_freq
save_delta = total_steps % opt.save_latest_freq

for epoch in range(start_epoch, opt.niter + opt.niter_decay + 1):
    epoch_start_time = time.time()
    if epoch != start_epoch:
        epoch_iter = epoch_iter % dataset_size
    for i, data in enumerate(dataset, start=epoch_iter):
コード例 #22
0
ファイル: experiment.py プロジェクト: Rishav1/skeltorch
 def _initialize_tensorboard(self):
     """Attach a TensorBoard writer rooted at the experiment's log path."""
     log_path = self.paths['experiment_tensorboard']
     # Very large flush interval: events are effectively flushed on close
     # rather than periodically.
     self.tbx = tensorboardX.SummaryWriter(log_path, flush_secs=9999)
コード例 #23
0
else:
    device = torch.device('cpu')

print('Using PyTorch version:', torch.__version__, ' Device:', device)
assert (LV(torch.__version__) >= LV("1.0.0"))

# TensorBoard is a tool for visualizing progress during training.  Although TensorBoard was created for TensorFlow, it can also be used with PyTorch.  It is easiest to use it with the tensorboardX module.

# Timestamped run directory keeps successive runs from clobbering each
# other's event files.  Computed outside the try block: it is pure and
# cannot raise ImportError.
logdir = os.path.join(
    os.getcwd(), "logs",
    "gtsrb-" + datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

try:
    # Keep the try body minimal: only the import can raise ImportError.
    # (The original also bound the exception to an unused name `e`.)
    import tensorboardX
except ImportError:
    # tensorboardX is optional; fall back to no TensorBoard logging.
    log = None
else:
    print('TensorBoard log directory:', logdir)
    os.makedirs(logdir)
    log = tensorboardX.SummaryWriter(logdir)

# ## Data
#
# The training dataset consists of 5535 images of traffic signs of varying size. There are 43 different types of traffic signs.
#
# The validation and test sets consist of 999 and 12630 images, respectively.
#
# ### Downloading the data

datapath = os.getenv('DATADIR', '/scratch/project_2005299/data')
datapath = os.path.join(datapath, 'gtsrb/train-5535')

(nimages_train, nimages_validation, nimages_test) = (5535, 999, 12630)
コード例 #24
0
    exp = utils.ExperienceDataset()

    if loaded_from is not None:
        utils.load_checkpoint(loaded_from, dyn, pol, exp)

    # initialize dynamics optimizer
    opt1 = torch.optim.Adam(dyn.parameters(), args.dyn_lr)

    # initialize policy optimizer
    opt2 = torch.optim.Adam(pol.parameters(), args.pol_lr)

    if args.use_cuda and torch.cuda.is_available():
        dyn = dyn.cuda()
        pol = pol.cuda()

    writer = tensorboardX.SummaryWriter(
        logdir=os.path.join(results_folder, "logs"))

    # callbacks
    def on_close():
        writer.close()

    atexit.register(on_close)

    # initial experience data collection
    env.seed(args.seed)
    rnd = lambda x, t: env.action_space.sample()  # noqa: E731
    while exp.n_samples() < initial_experience:
        ret = utils.apply_controller(
            env,
            rnd,
            min(args.control_H, initial_experience - exp.n_samples() + 1),
コード例 #25
0
def run():
    """End-to-end GG-CNN training entry point.

    Parses CLI arguments, builds train/validation data loaders, trains the
    requested network for ``args.epochs`` epochs, logs losses and IoU to
    TensorBoard, and snapshots the network whenever validation IoU improves
    (plus periodic checkpoints at epoch 0 and every 10th epoch).

    Fixes over the original:
      * ``best_iou`` is now only advanced on a genuine improvement; the old
        code overwrote it inside the periodic-save branch, so a worse IoU at
        a 10th epoch silently lowered the recorded best.
      * The ``arch.txt`` dump uses ``with`` + ``try/finally`` so stdout is
        restored and the file closed even if ``summary()`` raises.
    """
    args = parse_args()

    # Vis window
    if args.vis:
        cv2.namedWindow('Display', cv2.WINDOW_NORMAL)

    # Set-up output directories, named by timestamp + run description.
    dt = datetime.datetime.now().strftime('%y%m%d_%H%M')
    net_desc = '{}_{}'.format(dt, '_'.join(args.description.split()))

    save_folder = os.path.join(args.outdir, net_desc)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    tb = tensorboardX.SummaryWriter(os.path.join(args.logdir, net_desc))

    # Load Dataset
    logging.info('Loading {} Dataset...'.format(args.dataset.title()))
    Dataset = get_dataset(args.dataset)

    train_dataset = Dataset(args.dataset_path, start=0.0, end=args.split, ds_rotate=args.ds_rotate,
                            random_rotate=True, random_zoom=True,
                            include_depth=args.use_depth, include_rgb=args.use_rgb)
    train_data = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers
    )
    # Validation uses the tail [split, 1.0] of the dataset, batch size 1.
    val_dataset = Dataset(args.dataset_path, start=args.split, end=1.0, ds_rotate=args.ds_rotate,
                          random_rotate=True, random_zoom=True,
                          include_depth=args.use_depth, include_rgb=args.use_rgb)
    val_data = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=args.num_workers
    )
    logging.info('Done')

    # Load the network; input channel count follows the chosen modalities.
    logging.info('Loading Network...')
    input_channels = 1*args.use_depth + 3*args.use_rgb
    ggcnn = get_network(args.network)

    net = ggcnn(input_channels=input_channels)
    device = torch.device("cuda:0")
    net = net.to(device)
    optimizer = optim.Adam(net.parameters())
    logging.info('Done')

    # Print model architecture to the console, then mirror it to arch.txt.
    summary(net, (input_channels, 300, 300))
    with open(os.path.join(save_folder, 'arch.txt'), 'w') as f:
        sys.stdout = f
        try:
            summary(net, (input_channels, 300, 300))
        finally:
            # Always restore stdout, even if summary() raises.
            sys.stdout = sys.__stdout__

    best_iou = 0.0
    for epoch in range(args.epochs):
        logging.info('Beginning Epoch {:02d}'.format(epoch))
        train_results = train(epoch, net, device, train_data, optimizer, args.batches_per_epoch, vis=args.vis)

        # Log training losses to tensorboard
        tb.add_scalar('loss/train_loss', train_results['loss'], epoch)
        for n, l in train_results['losses'].items():
            tb.add_scalar('train_loss/' + n, l, epoch)

        # Run Validation
        logging.info('Validating...')
        test_results = validate(net, device, val_data, args.val_batches)
        logging.info('%d/%d = %f' % (test_results['correct'], test_results['correct'] + test_results['failed'],
                                     test_results['correct']/(test_results['correct']+test_results['failed'])))

        # Log validation results to tensorboard
        tb.add_scalar('loss/IOU', test_results['correct'] / (test_results['correct'] + test_results['failed']), epoch)
        tb.add_scalar('loss/val_loss', test_results['loss'], epoch)
        for n, l in test_results['losses'].items():
            tb.add_scalar('val_loss/' + n, l, epoch)

        # Save improved networks, plus periodic checkpoints (epoch 0 and
        # every 10th epoch) regardless of quality.
        iou = test_results['correct'] / (test_results['correct'] + test_results['failed'])
        if iou > best_iou or epoch == 0 or (epoch % 10) == 0:
            torch.save(net, os.path.join(save_folder, 'epoch_%02d_iou_%0.2f' % (epoch, iou)))
            torch.save(net.state_dict(), os.path.join(save_folder, 'epoch_%02d_iou_%0.2f_statedict.pt' % (epoch, iou)))
            # BUG FIX: only advance best_iou on a genuine improvement; a
            # periodic save must not lower the recorded best.
            if iou > best_iou:
                best_iou = iou
コード例 #26
0
def train(opt):
    """Full training loop for a captioning model.

    Builds the data loader, optionally restores model/optimizer/bookkeeping
    state from ``opt.start_from``, then iterates batches forever: standard
    cross-entropy (CRF) training until ``opt.self_critical_after``, then
    self-critical RL training.  Logs to TensorBoard every
    ``opt.losses_log_every`` iterations, evaluates and checkpoints every
    ``opt.save_checkpoint_every`` iterations, and stops once
    ``opt.max_epochs`` is reached (if not -1).

    NOTE(review): this is Python 2-era code — it uses ``cPickle`` and opens
    pickle files in text mode; Python 3 would need ``'rb'``/``'wb'`` and
    ``pickle``.  Left byte-identical here.
    """
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    # Propagate sizes discovered by the loader back onto opt so the model
    # constructor below can read them.
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    opt.pos_size = loader.pos_size

    # `tb` is falsy when tensorboard is unavailable, so the writer (and every
    # add_summary_value call below) degrades to a no-op via `tb and ...`.
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # Open old infos and check that the saved model is architecturally
        # compatible with the current command-line options.
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        # Histories are optional; restore them when the file exists.
        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    # Resume iteration/epoch counters (zero on a fresh run).
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    dp_model = torch.nn.DataParallel(model)

    # Forces the lr / scheduled-sampling schedule to be (re)applied on the
    # first iteration and after every epoch boundary.
    update_lr_flag = True
    # Assure in training mode
    dp_model.train()
    crit = utils.CRFModelCriterion()
    rl_crit = utils.RewardCriterion()

    optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer state when resuming, if a snapshot exists.
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate (step decay by epoch).
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(optimizer, opt.current_lr)
            # Assign the scheduled sampling prob (ramps up with epochs,
            # capped at scheduled_sampling_max_prob).
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training (switch from XE to RL loss).
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['pos'],
            data['masks'], data['att_masks']
        ]
        # att_masks may legitimately be None; everything else moves to GPU.
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, pos, masks, att_masks = tmp

        optimizer.zero_grad()
        if not sc_flag:
            # Cross-entropy / CRF phase: supervised loss against labels.
            # [:, 1:] drops the BOS position from targets and masks.
            outputs, crfloss = dp_model(fc_feats, att_feats, labels,
                                        pos[:, 1:], masks[:, 1:], att_masks)
            loss = crit(crfloss, outputs, labels[:, 1:], masks[:, 1:])
        else:
            # Self-critical phase: sample captions, reward them relative to
            # the greedy baseline, and apply the policy-gradient criterion.
            gen_result, sample_logprobs = dp_model(fc_feats,
                                                   att_feats,
                                                   att_masks,
                                                   opt={'sample_max': 0},
                                                   mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                              att_masks, data, gen_result, opt)
            loss = rl_crit(sample_logprobs, gen_result.data,
                           torch.from_numpy(reward).float().cuda())

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag:
            print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))

        # Update the iteration and epoch ('wrapped' marks an epoch boundary,
        # which also re-triggers the schedule block above).
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Model selection score: CIDEr when language eval is on,
            # otherwise negative validation loss (higher is better).
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                # Always snapshot the latest model/optimizer ...
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations (resume bookkeeping).
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                # ... and additionally keep a '-best' copy when the score
                # improved.
                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
コード例 #27
0
def main():
    """Render Shepard-Metzler scenes, encode them with a trained GQN
    representation network, and export the embeddings to TensorBoard.

    For 10 random scenes x 5 random viewpoints each, renders an observation,
    feeds it through ``model.compute_observation_representation``, and logs
    the flattened representations (with scene-id metadata and thumbnail
    images) via ``SummaryWriter.add_embedding``.

    NOTE(review): relies on module-level ``args`` (snapshot path, seed, GPU
    flag) and the ``gqn`` package — not visible in this chunk.
    """
    # Device configuration
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.gpu_enable) else 'cpu')

    # Seed torch; note the python `random` module below is NOT seeded here.
    torch.manual_seed(args.seed)
    if device == torch.device('cuda'):
        torch.cuda.manual_seed(args.seed)

    # Load the trained model snapshot and freeze it in eval mode.
    model = GQN(gpu_enable=args.gpu_enable).to(device)
    model.load_state_dict(torch.load(args.snapshot_path)['state_dict'])
    model.eval()

    # Define the camera matching the model's image size (90-degree FOV).
    screen_size = model.image_size
    camera = gqn.three.PerspectiveCamera(eye=(3, 1, 0),
                                         center=(0, 0, 0),
                                         up=(0, 1, 0),
                                         fov_rad=math.pi / 2.0,
                                         aspect_ratio=screen_size[0] /
                                         screen_size[1],
                                         z_near=0.1,
                                         z_far=10)

    # Preallocate render/input buffers, reused across all scenes and views.
    raw_observed_images = np.zeros(screen_size + (3, ), dtype="uint32")
    observed_image = torch.from_numpy(
        np.zeros((1, 3) + screen_size, dtype="float32")).to(device)
    observed_viewpoint = torch.from_numpy(np.zeros((1, 7),
                                                   dtype="float32")).to(device)
    renderer = gqn.three.Renderer(screen_size[0], screen_size[1])

    features = []
    label_imgs = []
    label_meta = []
    with torch.no_grad():
        for scenenum in range(10):
            # range(7, 8) has a single element, so num_blocks is always 7;
            # the random.choice call still consumes RNG state, so it is kept.
            scene, _ = gqn.environment.shepard_metzler.build_scene(
                num_blocks=random.choice([x for x in range(7, 8)]))
            renderer.set_scene(scene)

            for viewnum in range(5):
                # Pick a random eye position on a radius-3 shell and aim the
                # camera at the origin.
                rad = random.uniform(0, math.pi * 2)
                rad2 = random.uniform(0, math.pi * 2)
                eye = (3.0 * math.cos(rad), 3.0 * math.sin(rad2),
                       3.0 * math.sin(rad))
                center = (0, 0, 0)
                yaw = gqn.math.yaw(eye, center)
                pitch = gqn.math.pitch(eye, center)
                camera.look_at(
                    eye=eye,
                    center=center,
                    up=(0.0, 1.0, 0.0),
                )
                renderer.render(camera, raw_observed_images)

                # Normalize pixels [0, 255] -> [-1, 1] and move HWC -> CHW.
                observed_image[0] = torch.from_numpy(
                    (raw_observed_images.transpose(
                        (2, 0, 1)) / 255 - 0.5) * 2.0).to(device)
                # Viewpoint vector: (x, y, z, cos/sin yaw, cos/sin pitch).
                observed_viewpoint[0] = torch.from_numpy(
                    np.array((eye[0], eye[1], eye[2], math.cos(yaw),
                              math.sin(yaw), math.cos(pitch), math.sin(pitch)),
                             dtype="float32")).to(device)

                # Encode the single observation with the representation
                # network and keep a flattened copy for the embedding.
                tmp_r = model.compute_observation_representation(
                    torch.unsqueeze(observed_image, 0),
                    torch.unsqueeze(observed_viewpoint, 0))

                features.append(tmp_r.view(-1))
                label_imgs.append(observed_image[0].clone())
                label_meta.append(str(scenenum))

    features = torch.stack(features)
    # Undo the [-1, 1] normalization so thumbnails render as [0, 1] images.
    label_imgs = (torch.stack(label_imgs) + 1.0) / 2.0

    # Export everything to the TensorBoard projector.
    writer = tbx.SummaryWriter()
    writer.add_embedding(features, metadata=label_meta, label_img=label_imgs)
    writer.close()
コード例 #28
0
# Scan checkpoint files from the last possible epoch downwards; the first
# one found is the epoch to resume from.
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks, so every worker resumes consistently.
resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch),
                                  root_rank=0,
                                  name='resume_from_epoch').item()

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# Horovod: write TensorBoard logs on first worker only; other ranks get None.
log_writer = tensorboardX.SummaryWriter(
    args.log_dir) if hvd.rank() == 0 else None

# DataLoader tuning only matters on GPU; pinned memory speeds host->device copies.
kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.ImageFolder(args.train_dir,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(224),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             # NOTE: these mean/std values are the standard
                             # ImageNet normalization constants.
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
# Horovod: use DistributedSampler to partition data among workers. Manually specify
# `num_replicas=hvd.size()` and `rank=hvd.rank()`.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
コード例 #29
0
 def __init__(self,config:BasicConfig):
     """Open a TensorBoard writer at ``<log_root>_tbx``, tagged with the
     configured log file name."""
     log_dir = config.log_root + "_tbx"
     self.writer = tb.SummaryWriter(logdir=log_dir, comment=config.log_file)
コード例 #30
0
ファイル: test.py プロジェクト: meipuru344/torch
import keyword
import torch
import tensorboardX as tbx
import numpy as np
import torchvision
import PIL

# Smoke test for tensorboardX: write two scalar points under 'loss', then
# close the writer so the event file is flushed.
writer = tbx.SummaryWriter('runs/exp-1')
for global_step, value in ((1, 0.3), (2, 0.9)):
    writer.add_scalar('loss', torch.tensor([value]).item(), global_step)
writer.close()