Example #1
def main():
    n_samples = 3                           # 1, 3 or 5
    n_hidden = 30

    # target = [1, 2, 6, 7, 8]              # case1
    # target = [3, 4, 5, 9, 10, 11, 12]     # case2
    target = list(range(1, 22))           # total

    train_data, train_labels = gen_seq_data(target, n_samples, is_train=True)
    test_data, test_labels = gen_seq_data(target, n_samples, is_train=False)

    scaler = preprocessing.StandardScaler().fit(train_data)
    train_data = scaler.transform(train_data)
    test_data = scaler.transform(test_data)

    train_dataset = tchdata.TensorDataset(torch.from_numpy(train_data), torch.from_numpy(train_labels))
    test_dataset = tchdata.TensorDataset(torch.from_numpy(test_data), torch.from_numpy(test_labels))

    train_loader = tchdata.DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = tchdata.DataLoader(test_dataset, batch_size=32, shuffle=False)

    model = TEMlp(52 * n_samples, n_hidden, len(target))
    model.cuda()
    torch.backends.cudnn.benchmark = True
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.005)

    for i in range(60):
        train_acc = train(model, optimizer, train_loader)
        test_acc = validate(model, test_loader)
        print('{}\tepoch = {}\ttrain accuracy: {:0.3f}\ttest accuracy: {:0.3f}' \
            .format(datetime.now(), i, train_acc, test_acc))
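The train and validate helpers called above are not shown in this example; a minimal sketch of what they might look like, assuming both return per-epoch accuracy (names and signatures are guesses):

def train(model, optimizer, loader):
    # One pass over the training loader; returns the epoch's accuracy.
    model.train()
    correct, total = 0, 0
    for x, y in loader:
        x, y = x.float().cuda(), y.long().cuda()
        optimizer.zero_grad()
        out = model(x)
        torch.nn.functional.cross_entropy(out, y).backward()
        optimizer.step()
        correct += (out.argmax(dim=1) == y).sum().item()
        total += y.size(0)
    return correct / total

def validate(model, loader):
    # Accuracy over the held-out loader, without gradient tracking.
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in loader:
            out = model(x.float().cuda())
            correct += (out.argmax(dim=1) == y.long().cuda()).sum().item()
            total += y.size(0)
    return correct / total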
Example #2
def main():
    ### Create the torch datasets and get the size of the 'on-the-fly' created vocabulary and the length of the longest caption
    trainDataset = loadData.FlickrTrainDataset(images_folder, captions_folder,
                                               trans, 'TRAIN')
    valDataset = loadData.FlickrValDataset(images_folder, captions_folder,
                                           trans, 'VAL')
    voc_size = trainDataset.getVocabSize()
    max_capt = trainDataset.getMaxCaptionsLength()

    ### Create the models
    Encoder = model.Encoder()
    Decoder = model.Decoder(encoder_dim=2048,
                            decoder_dim=512,
                            attention_dim=256,
                            vocab_size=voc_size)
    Embedding = model.Embedding(vocab_size=voc_size, embedding_dim=128)

    ### Set the optimizer for the decoder (the only component that is actually trained) and the device for the model tensors
    decoder_optimizer = torch.optim.Adam(params=filter(
        lambda p: p.requires_grad, Decoder.parameters()),
                                         lr=1e-3)
    Encoder.to(device)
    Decoder.to(device)
    Embedding.to(device)

    ### Create the data loaders for training and evaluation
    loader_train = DataLoader(trainDataset,
                              32,
                              sampler=sampler.SubsetRandomSampler(
                                  range(30000)))
    val_loader = DataLoader(valDataset,
                            32,
                            sampler=sampler.SubsetRandomSampler(range(30000)))

    best_bleu = 0  # The best BLEU score so far
    for i in range(epochs):
        ## One epoch's training
        train.train(data_loader=loader_train,
                    encoder=Encoder,
                    decoder=Decoder,
                    embedding=Embedding,
                    max_caption_length=max_capt,
                    optim=decoder_optimizer)
        ## One epoch's validation
        new_bleu = train.validate(data_loader=val_loader,
                                  encoder=Encoder,
                                  decoder=Decoder,
                                  embedding=Embedding,
                                  max_caption_length=max_capt)

        if new_bleu > best_bleu:
            best_bleu = new_bleu
        else:
            ## No improvement since last time, so we stop training
            break

    ## Save the model for deploying
    torch.save(Encoder, 'Encoder')
    torch.save(Decoder, 'Decoder')
    torch.save(Embedding, 'Embedding')
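Example #2 stops at the first epoch whose BLEU score does not improve. A slightly more forgiving patience-based variant, shown here as a self-contained sketch rather than part of the original code:

def fit_with_patience(train_one_epoch, validate_bleu, epochs, patience=3):
    # Stop only after `patience` consecutive epochs without improvement.
    best_bleu, stale = 0.0, 0
    for _ in range(epochs):
        train_one_epoch()
        new_bleu = validate_bleu()
        if new_bleu > best_bleu:
            best_bleu, stale = new_bleu, 0
        else:
            stale += 1
            if stale >= patience:
                break
    return best_bleu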
Example #3
def main():
    args = get_args()

    if args.opts:
        cfg.merge_from_list(args.opts)

    cfg.freeze()

    # create model
    print("=> creating model '{}'".format(cfg.MODEL.ARCH))
    model = get_model(model_name=cfg.MODEL.ARCH, pretrained=None)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # load checkpoint
    resume_path = args.resume

    if Path(resume_path).is_file():
        print("=> loading checkpoint '{}'".format(resume_path))
        checkpoint = torch.load(resume_path, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}'".format(resume_path))
    else:
        raise ValueError("=> no checkpoint found at '{}'".format(resume_path))

    if device == "cuda":
        cudnn.benchmark = True

    test_dataset = FaceDataset(args.data_dir, "test", img_size=cfg.MODEL.IMG_SIZE, augment=False)
    test_loader = DataLoader(test_dataset, batch_size=cfg.TEST.BATCH_SIZE, shuffle=False,
                             num_workers=cfg.TRAIN.WORKERS, drop_last=False)

    print("=> start testing")
    _, _, test_mae = validate(test_loader, model, None, 0, device)
    print(f"test mae: {test_mae:.3f}")
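The checkpoint loaded above is a plain dict with a 'state_dict' key. A minimal sketch of the save side these test scripts assume (not shown in the original):

import torch

def save_checkpoint(model, path):
    # Save only the weights, keyed the way the test script expects.
    torch.save({'state_dict': model.state_dict()}, path)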
Example #4
def main():
    args = get_args()

    if args.opts:
        cfg.merge_from_list(args.opts)

    cfg.freeze()

    # create model
    print("=> creating model")
    model = get_model()
    device = "cuda:5" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # load checkpoint
    resume_path = args.resume

    if Path(resume_path).is_file():
        print("=> loading checkpoint '{}'".format(resume_path))
        checkpoint = torch.load(resume_path, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}'".format(resume_path))
    else:
        raise ValueError("=> no checkpoint found at '{}'".format(resume_path))

    if 'cuda' in device:
        cudnn.benchmark = True
    if torch.cuda.device_count() > 1:
        # DataParallel requires the wrapped module to live on device_ids[0],
        # hence cuda:5 first in the list (the model was moved there above).
        model = torch.nn.DataParallel(model, device_ids=[5, 3, 4])

    age_test_dataset = dataset.AgeDataset(args.age_dir,
                                          'test',
                                          cfg.MODEL.IMG_SIZE,
                                          augment=False)
    age_test_loader = DataLoader(dataset=age_test_dataset,
                                 batch_size=cfg.TEST.BATCH_SIZE,
                                 shuffle=True)

    # gender_test_dataset = dataset.GenderDataset(args.gender_dir,'test',cfg.MODEL.IMG_SIZE,augment=False)
    # gender_test_loader = DataLoader(dataset=gender_test_dataset,batch_size=cfg.TEST.BATCH_SIZE,shuffle=True,
    #                                 num_workers=cfg.TEST.WORKERS)
    #
    # hat_test_dataset = dataset.HatDataset(args.hat_dir,'test',cfg.MODEL.IMG_SIZE,augment=False)
    # hat_test_loader = DataLoader(dataset=hat_test_dataset,batch_size=cfg.TEST.BATCH_SIZE,shuffle=True,
    #                              num_workers=cfg.TEST.WORKERS)
    #
    # glasses_test_dataset = dataset.GlassDataset(args.glasses_dir,'test',cfg.MODEL.IMG_SIZE,augment=False)
    # glasses_test_loader = DataLoader(dataset=glasses_test_dataset,batch_size=cfg.TEST.BATCH_SIZE,shuffle=True,
    #                                  num_workers=cfg.TEST.WORKERS)

    print("=> start testing")

    _, _, test_mae = validate(age_test_loader, model, None, 0, device, 'age')
    #gender_loss,gender_acc = validate(gender_test_loader,model,None,0,device,'gender')
    #hat_loss,hat_acc = validate(hat_test_loader,model,None,0,device,'hat')
    #glasses_loss,glasses_acc = validate(glasses_test_loader,model,None,0,device,'glasses')

    print(f"age test mae: {test_mae:.3f}")
Example #5
def test(test_dataloader, args, writer, log):
    model = train.MyNERModel(args).cuda()
    model_param = torch.load(
        os.path.join(args.output_dir, 'global_best_model.bin'))
    model.load_state_dict(model_param)
    model.eval()
    f1 = train.validate(model, test_dataloader, args, log, is_test=True)
    print('-----------Test set, F1-Score: %.4f-----------' % f1)
    log.write('-----------Test set, F1-Score: %.4f----------- \n' % f1)
Example #6
def run_ctc():
    """
    Main function for running the program
    """
    args = parse_args()
    # for the feature transforming
    if args.feature_transform:
        timit_dir = args.timit_dir
        feature_dir = args.feature_dir
        if not os.path.exists(timit_dir):
            print("TIMIT directory does not exist!")
            sys.exit(0)
        if not os.path.exists(feature_dir):
            os.makedirs(feature_dir)
        featpickle_path = os.path.join(feature_dir, "features.pickle")
        featext.compute_and_store_features(timit_dir,
                                           featpickle_path)
    # for training
    elif args.train_unirnn:
        outputdir = args.train_output_dir
        featdir = args.feature_dir
        if not os.path.exists(outputdir):
            os.makedirs(outputdir)
        featpickle_file = os.path.join(featdir, "features.pickle")
        if not os.path.exists(featpickle_file):
            print("Features do not exist. Please run the feature transform first.")
            sys.exit(0)
        model_path = os.path.join(outputdir, "inference_model")
        train.train(model_path, featpickle_file)
    # for validation
    else:
        model_dir = args.model_input_dir
        model_path = os.path.join(model_dir, "inference_model.meta")
        audio_path = args.validation_audio_path
        phoneme_path = args.validation_phoneme_path
        if not os.path.isfile(audio_path):
            print("Audio file does not exist!")
            sys.exit(0)
        if not os.path.isfile(phoneme_path):
            print("Phoneme file does not exist!")
            sys.exit(0)
        if not os.path.isfile(model_path):
            print("Model does not exist! Please train first.")
            sys.exit(0)
        train.validate(audio_path, phoneme_path, model_path)
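parse_args is not shown; since exactly one of the three modes runs per invocation, a plausible sketch with argument names inferred from the attribute accesses above:

import argparse

def parse_args():
    p = argparse.ArgumentParser(description="TIMIT CTC feature/train/validate pipeline")
    mode = p.add_mutually_exclusive_group()
    mode.add_argument("--feature_transform", action="store_true")
    mode.add_argument("--train_unirnn", action="store_true")
    p.add_argument("--timit_dir", default="./TIMIT")
    p.add_argument("--feature_dir", default="./features")
    p.add_argument("--train_output_dir", default="./output")
    p.add_argument("--model_input_dir", default="./output")
    p.add_argument("--validation_audio_path", default="")
    p.add_argument("--validation_phoneme_path", default="")
    return p.parse_args()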
Example #7
def main():
    args = get_args()

    if args.opts:
        cfg.merge_from_list(args.opts)

    cfg.freeze()

    # create model
    print("=> creating model '{}'".format(cfg.MODEL.ARCH))
    model = get_model(model_name=cfg.MODEL.ARCH, pretrained=None)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # TODO: delete
    if torch.cuda.device_count() > 1:
        print("Let's use [1,2,4,5] GPUs!")
        # DataParallel splits each batch along dim 0 across the listed GPUs;
        # the wrapped module must live on device_ids[0], hence the move below.
        model.to("cuda:1")
        model = nn.DataParallel(model, device_ids=[1, 2, 4, 5])

    # load checkpoint
    resume_path = args.resume

    if Path(resume_path).is_file():
        print("=> loading checkpoint '{}'".format(resume_path))
        checkpoint = torch.load(resume_path, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}'".format(resume_path))
    else:
        raise ValueError("=> no checkpoint found at '{}'".format(resume_path))

    if device == "cuda":
        cudnn.benchmark = True

    test_dataset = FaceDataset(args.data_dir,
                               "test",
                               img_size=cfg.MODEL.IMG_SIZE,
                               augment=False)
    test_loader = DataLoader(test_dataset,
                             batch_size=cfg.TEST.BATCH_SIZE,
                             shuffle=False,
                             num_workers=cfg.TRAIN.WORKERS,
                             drop_last=False)
    criterion = nn.CrossEntropyLoss().to(device)

    print("=> start testing")
    _, _, test_mae, gen_acc = validate(test_loader, model, criterion, 0,
                                       device)
    print(f"Test age mae: {test_mae:.3f}")
    print(f"Test gender accuracy: {gen_acc:.2f}")
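A caveat for the DataParallel path above: state dicts saved from a bare model lack the 'module.' prefix that the wrapper adds, so loading can fail with key mismatches. A common stripping helper (an assumption, not part of the original):

def strip_module_prefix(state_dict):
    # Remove the 'module.' prefix that nn.DataParallel adds to parameter names.
    return {k[len('module.'):] if k.startswith('module.') else k: v
            for k, v in state_dict.items()}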
Example #8
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize(args.imsize_pre),
        transforms.CenterCrop(args.imsize),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    if args.dataset == "coco":
        val_dset = CocoDataset(
            root=args.root_path,
            split="val",
            transform=transform,
        )
    val_loader = DataLoader(
        val_dset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.n_cpu,
        collate_fn=collater,
    )

    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    model = SPVSE(
        len(vocab),
        args.emb_size,
        args.out_size,
        args.max_len,
        args.cnn_type,
        args.rnn_type,
        pad_idx=vocab.padidx,
        bos_idx=vocab.bosidx,
    )

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    model = model.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint),
          flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    model.load_state_dict(ckpt["model_state"])
    _ = validate(1000, val_loader, model, vocab, args)
Example #9
def main():
    train_set = SinaDataset(path.join(args.source, 'train.json'), input_dim)
    test_set = SinaDataset(path.join(args.source, 'test.json'), input_dim)
    train_loader = DataLoader(train_set,
                              batch_size=args.bs,
                              shuffle=True,
                              drop_last=True)
    test_loader = DataLoader(test_set,
                             batch_size=args.bs,
                             shuffle=True,
                             drop_last=True)

    model = TextCNN(input_dim, 200)
    # model = MyLSTM(input_dim, hidden_dim=8)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.wd)

    epoch = 0
    train_loss = []
    train_accu = []
    valid_loss = []
    valid_accu = []
    while True:
        epoch += 1
        epoch_loss, epoch_accu = train_one_epoch(epoch, model, optimizer,
                                                 train_loader, device, args.bs)
        val_loss, val_accu = validate(model, test_loader, device, args.bs)
        train_loss += epoch_loss
        train_accu += epoch_accu
        valid_loss += val_loss
        valid_accu += val_accu

        print('saving...')
        torch.save(model.state_dict(),
                   './saved_models/epoch' + str(epoch) + '.pkl')
        print()

        if args.max_epoch and epoch >= args.max_epoch:
            train_result = {
                'batch-size': args.bs,
                'train-loss': train_loss,
                'train-accu': train_accu,
                'valid-loss': valid_loss,
                'valid-accu': valid_accu
            }
            with open('train-result.json', 'w', encoding='utf-8') as f:
                json.dump(train_result, f)

            break
Example #10
def main():
    global args
    args = parser.parse_args()

    print('dataset:', args.root_path)
    print('end2end?:', args.end2end)

    # load image
    train_loader = load_image(
        args.train_list,
        transforms.Compose([
            transforms.CenterCrop(128),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
        ]), True, True)

    val_loader = load_image(
        args.val_list,
        transforms.Compose([
            transforms.CenterCrop(128),
            transforms.ToTensor(),
        ]), False, True)

    # prepare model
    model = create_model(args.end2end)
    params = create_model_parameters(args, model)

    criterion = nn.CrossEntropyLoss().cuda()  # loss function
    optimizer = torch.optim.SGD(params,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    print(validate(val_loader, model, criterion))
Example #11
    def check_ans(self, word, lvl, label, *args):
        correct = False
        result = tk.StringVar()

        if train.validate(word, *args):
            correct = True
            result.set(train.correct(word, lvl))
            label.config(fg='green')
        else:
            result.set(train.incorrect(word, lvl))
            label.config(fg='red')

        label.config(textvariable=result)
        self.elements.append(label)

        if not correct:
            # Iterate over a copy: removing items from a list while iterating
            # it directly skips elements.
            for el in self.elements[:]:
                if el.winfo_class() == 'Button':
                    el.destroy()
                    self.elements.remove(el)

            bt_review = tk.Button(text="Done",
                                  width=10,
                                  height=2,
                                  bg="black",
                                  font=(subtext_font, subtext_size),
                                  fg="white",
                                  anchor="center",
                                  highlightthickness=0,
                                  command=partial(self.homepage))
            self.elements.append(bt_review)
            bt_review.pack(pady=(subtext_size, 0))
            label.pack()
            return

        label.pack()
        self.root.update_idletasks()
        time.sleep(2)
        self.homepage()
Example #12
def train(args, trainer, task, epoch_itr, epoch_aux_itr):
    """Train the model for one epoch."""

    # Update parameters every N batches
    if epoch_itr.epoch <= len(args.update_freq):
        update_freq = args.update_freq[epoch_itr.epoch - 1]
    else:
        update_freq = args.update_freq[-1]

    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus)
    itr = iterators.GroupedIterator(itr, update_freq)
    progress = progress_bar.build_progress_bar(
        args,
        itr,
        epoch_itr.epoch,
        no_progress_bar='simple',
    )

    # Auxiliary iterator
    aux_itr = epoch_aux_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus)
    aux_itr = iterators.GroupedIterator(aux_itr,
                                        update_freq,
                                        restart_when_done=True)

    extra_meters = collections.defaultdict(lambda: AverageMeter())
    first_valid = args.valid_subset.split(',')[0]
    max_update = args.max_update or math.inf
    for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch):
        # Record gradients from auxiliary data
        aux_samples = next(aux_itr)
        trainer.train_step(aux_samples, update_params=False)
        # if hasattr(trainer.optimizer, "save_constraints"):
        trainer.optimizer.save_constraints()

        log_output = trainer.train_step(samples)
        if log_output is None:
            continue

        # log mid-epoch stats
        stats = get_training_stats(trainer)
        for k, v in log_output.items():
            if k in [
                    'loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size'
            ]:
                continue  # these are already logged above
            if 'loss' in k:
                extra_meters[k].update(v, log_output['sample_size'])
            else:
                extra_meters[k].update(v)
            stats[k] = extra_meters[k].avg
        progress.log(stats)

        # ignore the first mini-batch in words-per-second calculation
        if i == 0:
            trainer.get_meter('wps').reset()

        num_updates = trainer.get_num_updates()
        if args.save_interval_updates > 0 and num_updates % args.save_interval_updates == 0 and num_updates > 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    [first_valid])
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if num_updates >= max_update:
            break

    # log end-of-epoch stats
    stats = get_training_stats(trainer)
    for k, meter in extra_meters.items():
        stats[k] = meter.avg
    progress.print(stats)

    # reset training meters
    for k in [
            'train_loss',
            'train_nll_loss',
            'wps',
            'ups',
            'wpb',
            'bsz',
            'gnorm',
            'clip',
    ]:
        meter = trainer.get_meter(k)
        if meter is not None:
            meter.reset()
Example #13
def evaluate_val_acc(model, data):
    crit = nn.CrossEntropyLoss()
    loss, acc = validate(model, data, crit)
    return acc
Example #14
def run(opts):

    rank = opts.local_rank if torch.cuda.device_count() > 1 else 0

    # Set the random seed
    torch.manual_seed(opts.seed + rank)
    random.seed(opts.seed + rank)
    np.random.seed(opts.seed + rank)

    if not os.path.exists(opts.save_dir) and rank == 0:
        os.makedirs(opts.save_dir)

    # Optionally configure wandb
    if not opts.no_wandb and rank == 0:
        # Read the API key from the environment instead of hard-coding it.
        wandb.login(anonymous='never', key=os.environ.get('WANDB_API_KEY'))
        wandb.init(dir=opts.save_dir,
                   config=opts,
                   project='large_scale_tsp',
                   name=opts.run_name,
                   sync_tensorboard=True,
                   save_code=True)

    # Set the device
    if opts.use_cuda:
        torch.cuda.set_device(rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        opts.device = torch.device("cuda", rank)

    else:
        opts.device = torch.device("cpu")

    # Figure out what the problem is
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        if rank == 0:
            print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    model: torch.nn.Module = model_class(
        opts.embedding_dim,
        opts.hidden_dim,
        problem,
        attention_type=opts.attention_type,
        n_encode_layers=opts.n_encode_layers,
        n_heads=opts.n_heads,
        feed_forward_dim=opts.feed_forward_dim,
        encoding_knn_size=opts.encoding_knn_size,
        decoding_knn_size=opts.decoding_knn_size,
        mask_inner=True,
        mask_logits=True,
        normalization=opts.normalization,
        tanh_clipping=opts.tanh_clipping,
        checkpoint_encoder=opts.checkpoint_encoder,
        shrink_size=opts.shrink_size).to(opts.device)

    if opts.init_normalization_parameters:
        for m in model.modules():
            if isinstance(m, Normalization):
                m.init_parameters()

    if opts.use_cuda:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(
            opts.device)
        model = DDP(model, device_ids=[rank])

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    scaler = torch.cuda.amp.GradScaler() if opts.precision == 16 else None

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        if rank == 0:
            print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, scaler, baseline, lr_scheduler,
                        epoch, val_dataset, problem, opts)
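LambdaLR multiplies each parameter group's base learning rate by the lambda's return value, so the schedule above yields lr_model * lr_decay ** epoch. A tiny standalone check:

import torch

param = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.Adam([param], lr=0.1)
sched = torch.optim.lr_scheduler.LambdaLR(opt, lambda epoch: 0.9 ** epoch)
for _ in range(3):
    opt.step()
    sched.step()
print(opt.param_groups[0]['lr'])  # 0.1 * 0.9 ** 3 = 0.0729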
Example #15
def run(opts):
    # start time
    start_time = time()
    train_run = []
    opts.save_hrs.sort()
    run_name = opts.run_name

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what the problem is
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    torch.save(model, os.path.join('.', 'empty.pt'))
    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            avg_time = train_epoch(model, optimizer, baseline, lr_scheduler,
                                   epoch, val_dataset, problem, tb_logger,
                                   opts, start_time)
            train_run.append(avg_time)
            for hr in opts.save_hrs[:]:  # copy: the list is mutated below
                if (time() - start_time) > hr * 3600:
                    opts.save_hrs.remove(hr)
                    print('Saving model and state...')
                    hr_time = int(round((time() - start_time) / 3600))
                    with open(
                            '../models/att/hist_{}_{}hr.pickle'.format(
                                run_name, hr_time), 'wb') as handle:
                        pickle.dump(train_run,
                                    handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                    torch.save(
                        {
                            'model': get_inner_model(model).state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'rng_state': torch.get_rng_state(),
                            'cuda_rng_state': torch.cuda.get_rng_state_all(),
                            'baseline': baseline.state_dict()
                        },
                        os.path.join(
                            '../models/att',
                            '{}_{}hr-model-att-only.pt'.format(
                                run_name, hr_time)))
                    torch.save(
                        model,
                        os.path.join(
                            '../models/att',
                            '{}_{}hr-model.pt'.format(run_name, hr_time)))
Example #16
def main():

    args.eval_iters = [int(val) for val in args.eval_iters.split(',')]
    # args.loss_reset_step = 10
    args.log_step = 10
    args.dataset = args.dataset.lower()
    args.basenet = args.basenet.lower()

    args.bn = abs(args.bn)  # 0 freeze or else use bn
    if args.bn > 0:
        args.bn = 1  # update bn layer set the flag to 1

    args.exp_name = 'FPN{:d}-{:s}sh{:02d}-{:s}-bs{:02d}-{:s}-lr{:05d}-bn{:d}'.format(
        args.input_dim, args.anchor_type, args.shared_heads, args.dataset,
        args.batch_size, args.basenet, int(args.lr * 100000), args.bn)

    args.save_root += args.dataset + '/'
    args.save_root = args.save_root + 'cache/' + args.exp_name + '/'

    if not os.path.isdir(
            args.save_root):  # if save directory doesn't exist create it
        os.makedirs(args.save_root)

    source_dir = args.save_root + '/source/'  # where to save the source
    utils.copy_source(source_dir)

    anchors = 'None'
    with torch.no_grad():
        if args.anchor_type == 'kmeans':
            anchorbox = kanchorBoxes(input_dim=args.input_dim,
                                     dataset=args.dataset)
        else:
            anchorbox = anchorBox(args.anchor_type,
                                  input_dim=args.input_dim,
                                  dataset=args.dataset)
        anchors = anchorbox.forward()
        args.ar = anchorbox.ar

    args.num_anchors = anchors.size(0)
    anchors = anchors.cuda(0, non_blocking=True)
    if args.dataset == 'coco':
        args.train_sets = ['train2017']
        args.val_sets = ['val2017']
    else:
        args.train_sets = ['train2007', 'val2007', 'train2012', 'val2012']
        args.val_sets = ['test2007']

    args.means = [0.485, 0.456, 0.406]
    args.stds = [0.229, 0.224, 0.225]
    val_dataset = Detection(args,
                            train=False,
                            image_sets=args.val_sets,
                            transform=BaseTransform(args.input_dim, args.means,
                                                    args.stds),
                            full_test=False)
    print('Done loading validation dataset :::>>>\n',
          val_dataset.print_str)
    args.data_dir = val_dataset.root
    args.num_classes = len(val_dataset.classes) + 1
    args.classes = val_dataset.classes
    args.bias_heads = args.bias_heads > 0
    args.head_size = 256
    if args.shared_heads > 0:
        net = build_fpn_shared_heads(args.basenet,
                                     args.model_dir,
                                     ar=args.ar,
                                     head_size=args.head_size,
                                     num_classes=args.num_classes,
                                     bias_heads=args.bias_heads)
    else:
        net = build_fpn_unshared(args.basenet,
                                 args.model_dir,
                                 ar=args.ar,
                                 head_size=args.head_size,
                                 num_classes=args.num_classes,
                                 bias_heads=args.bias_heads)

    net = net.cuda()

    if args.ngpu > 1:
        print("\nLet's do DataParallel\n")
        net = torch.nn.DataParallel(net)
    net.eval()

    for iteration in args.eval_iters:
        args.det_itr = iteration
        log_file = open(
            "{:s}/testing-{:d}.log".format(args.save_root, iteration), "w", 1)
        log_file.write(args.exp_name + '\n')

        args.model_path = args.save_root + '/model_' + repr(iteration) + '.pth'
        log_file.write(args.model_path + '\n')

        net.load_state_dict(torch.load(args.model_path))

        print('Finished loading model %d !' % iteration)
        # Load dataset
        val_data_loader = data_utils.DataLoader(val_dataset,
                                                int(args.batch_size / 2),
                                                num_workers=args.num_workers,
                                                shuffle=False,
                                                pin_memory=True,
                                                collate_fn=custum_collate)

        # evaluation
        torch.cuda.synchronize()
        tt0 = time.perf_counter()
        log_file.write('Testing net \n')
        net.eval()  # switch net to evaluation mode
        if args.dataset != 'coco':
            mAP, ap_all, ap_strs, det_boxes = validate(
                args,
                net,
                anchors,
                val_data_loader,
                val_dataset,
                iteration,
                iou_thresh=args.iou_thresh)
        else:
            mAP, ap_all, ap_strs, det_boxes = validate_coco(
                args,
                net,
                anchors,
                val_data_loader,
                val_dataset,
                iteration,
                iou_thresh=args.iou_thresh)

        for ap_str in ap_strs:
            print(ap_str)
            log_file.write(ap_str + '\n')
        ptr_str = '\nMEANAP:::=>' + str(mAP) + '\n'
        print(ptr_str)
        log_file.write(ptr_str)

        torch.cuda.synchronize()
        print('Complete set time {:0.2f}'.format(time.perf_counter() - tt0))
        log_file.close()
Example #17
    np.savetxt(logdir + "test_set.txt",
               sorted([fi for fi in test_set]),
               fmt="%s")

    # Ensure that test set file list is complete
    chs = ("g", "r", "i")
    test_set = [
        (fi, ch)
        for fi, ch in zip(np.repeat(test_set, len(chs)), itertools.cycle(chs))
        if isfile("./data/gals/{}-{}.fits".format(fi, ch))
        and isfile("./data/sbs_gri_noise/{}_{}.txt".format(fi, ch))
    ]

    # Init and load Pix2Prof
    encoder = ResNet18(num_classes=args.encoding_len).to(cuda)
    decoder = GRUNet(input_dim=1,
                     hidden_dim=args.encoding_len,
                     output_dim=1,
                     n_layers=3).to(cuda)
    criterion = nn.MSELoss()
    encoder_op = optim.Adam(encoder.parameters(), lr=0.0002)
    decoder_op = optim.Adam(decoder.parameters(), lr=0.0002)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    decoder_op.load_state_dict(checkpoint["decoder_op"])
    encoder_op.load_state_dict(checkpoint["encoder_op"])

    # Validate
    validate(test_set, encoder, decoder, chk_epoch, criterion, logdir=logdir)
    plot_validation_set("{}/{:04d}".format(logdir, chk_epoch))
Example #18
def main(args):

    # create labeled, validation, and test data loader
    # unlabeled data loader not needed for baseline training
    train_loader, val_loader, args = get_data_loaders_no_ssl(args)

    # create models
    model = create_model(args,
                         model='efficient',
                         efficient_version=args.efficient_version)

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()

    # stats and logger
    logger = Logger(os.path.join(args.out, 'log.txt'))
    logger.set_names(['Train Loss', 'Valid Loss', \
                      'Valid Acc.', 'Train Acc.'])
    start_epoch, best_acc = 0, 0

    # load from checkpoints
    if args.resume:
        print('==> Resuming from checkpoint.')
        load_checkpoint(args, model, optimizer, ema_model=None)

        if args.transfer_learning and start_epoch > args.unfreeze:
            print('Unfreezing layers of model.')
            model = unfreeze_layer(model)

    # initialize useful stats / logger variables
    writer = SummaryWriter(args.out)
    step = 0
    test_accs = []

    # train and val
    for epoch in range(start_epoch, args.epochs):

        # transfer learning approach for the efficientNet model
        # First run only the last layers while keeping pre-trained frozen
        # after args.unfreeze epochs, fine-tune the whole network
        if args.transfer_learning and epoch == args.unfreeze:
            model = unfreeze_layer(model)

        print(f'\nEpoch: [{epoch+1} | {args.epochs}] LR: {args.lr}')

        train_loss, train_acc = train_no_ssl(model=model,
                                             optimizer=optimizer,
                                             criterion=criterion,
                                             train_loader=train_loader,
                                             args=args)

        # get validation loss and accuracy
        val_loss, val_acc = validate(val_loader,
                                     model=model,
                                     criterion=criterion,
                                     epoch=epoch,
                                     mode='Validating',
                                     device=args.device)

        step = args.batch_size * len(train_loader) * (epoch + 1)

        # logging stats
        writer.add_scalar('losses/train_loss', train_loss, step)
        writer.add_scalar('losses/valid_loss', val_loss, step)

        writer.add_scalar('accuracy/train_acc', train_acc, step)
        writer.add_scalar('accuracy/val_acc', val_acc, step)

        # append logger file
        logger.append([train_loss, val_loss, val_acc, train_acc])

        # save model
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': val_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.out)
    logger.close()
    writer.close()

    print('Best acc:')
    print(best_acc)
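The save_checkpoint helper used above is not shown; a minimal sketch matching its call, keeping a rolling latest file plus a copy of the best model (the filenames are assumptions):

import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint, filename='checkpoint.pth.tar'):
    # Always overwrite the latest checkpoint; keep a separate best copy.
    path = os.path.join(checkpoint, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(checkpoint, 'model_best.pth.tar'))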
Example #19
def _run_sl(opts):

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what the problem is
    problem = load_problem(opts.problem)

    assert opts.problem == 'tspsl', "Only TSP is supported for supervised learning"

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {'attention': AttentionModel}.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    encoder_class = {
        'gat': GraphAttentionEncoder,
        'gcn': GCNEncoder,
        'mlp': MLPEncoder
    }.get(opts.encoder, None)
    assert encoder_class is not None, "Unknown encoder: {}".format(
        encoder_class)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        encoder_class,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size,
                        use_cuda=opts.use_cuda).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Compute number of network parameters
    print(model)
    nb_param = 0
    for param in model.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters: ', nb_param)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }])

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    train_dataset = problem.make_dataset(size=opts.graph_size,
                                         filename=opts.train_dataset)
    opts.epoch_size = train_dataset.size
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       filename=opts.val_dataset)
    opts.val_size = val_dataset.size

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch_sl(model, optimizer, lr_scheduler, epoch,
                           train_dataset, val_dataset, problem, tb_logger,
                           opts)
Example #20
print("Dataset and model ready. Starting training ...")

cur_var_not_best = 0

for epoch in range(start_epoch, epochs):

    plot_data['epoch'] = epoch

    # Train for one epoch
    plot_data = train.train(train_loader, model, criterion, optimizer, epoch,
                            print_freq, plot_data, gpu, margin, train_iters,
                            variance)

    # Evaluate on validation set
    plot_data = train.validate(val_loader, model, criterion, optimizer, epoch,
                               print_freq, plot_data, gpu, margin, val_iters,
                               variance)

    # Remember best model and save checkpoint
    is_best = plot_data['val_loss'][epoch] < best_loss
    if is_best:
        best_model = 1
    else:
        best_model = 0
        cur_var_not_best += 1

    if is_best:
        print("New best model by loss. Val Loss = " +
              str(plot_data['val_loss'][epoch]))
        best_loss = plot_data['val_loss'][epoch]
        filename = dataset + '/models/' + training_id + '_epoch_' + str(
Example #21
def run(opts):

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what the problem is
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size,
                        steps=opts.awe_steps,
                        graph_size=opts.graph_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'constant':
        baseline = ConstantBaseline()
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    elif opts.baseline == 'critic_lp':
        assert problem.NAME == 'lp'
        dim_vocab = {2: 2, 3: 5, 4: 15, 5: 52, 6: 203, 7: 877, 8: 4140}
        baseline = CriticBaseline(
            (CriticNetworkLP(dim_vocab[opts.awe_steps], opts.embedding_dim,
                             opts.hidden_dim, opts.n_encode_layers,
                             opts.normalization)).to(opts.device))
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution,
                                       size=opts.graph_size,
                                       degree=opts.degree,
                                       steps=opts.awe_steps,
                                       awe_samples=opts.awe_samples)

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        extra = {'updates': 0, 'avg_reward': 10**8, "best_epoch": -1}
        start = time.time()
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):

            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts, extra)

        finish = time.time()
        with open("experiments.log", "a+") as f:
            f.write("{} {:.4f} {} {:.2f}\n".format(
                '-'.join(opts.train_dataset.split('/')[-2:]),
                extra["avg_reward"], extra["best_epoch"], finish - start))
        print("Took {:.2f} sec for {} epochs".format(finish - start,
                                                     opts.n_epochs))
Example #22
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        raise ValueError("Distibuted training not supported by multiobj "
                         "training")

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest
    # checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    if args.restore_file is not None:
        # Load from checkpoint
        print('| loading model from {}'.format(args.restore_file))
        [model], _model_args = checkpoint_utils.load_model_ensemble(
            [args.restore_file],
            arg_overrides=eval(args.model_overrides),
            task=task,
        )
        # Overwrite architecture arguments
        # (this is very hacky but I don't know a better way)
        for k, v in _model_args.__dict__.items():
            is_model_argument = k == "arch"
            is_model_argument |= k.startswith("encoder_")
            is_model_argument |= k.startswith("decoder_")
            is_model_argument |= k.startswith("share_")
            is_model_argument |= k.startswith("adaptive_")
            if hasattr(args, k) and is_model_argument:
                setattr(args, k, v)
    else:
        # Or build model from scratch
        model = task.build_model(args)

    # Training criterion
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Load auxiliary data
    epoch_aux_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset, idx=1),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            trainer.model.max_positions(),
        ),
        ignore_invalid_inputs=True,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
        epoch=0,
    )

    # Estimate fisher if needed
    if args.inverse_fisher or args.ewc > 0:
        fisher_itr = task.get_batch_iterator(
            dataset=task.dataset(args.train_subset, idx=1),
            max_tokens=args.max_tokens,
            max_sentences=1,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.model.max_positions(),
            ),
            ignore_invalid_inputs=True,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
            epoch=0,
        )
        fim = estimate_diagonal_fisher(args,
                                       trainer,
                                       fisher_itr,
                                       args.n_fisher_samples,
                                       precomputed=args.precomputed_fisher)
        trainer.fim = fim
    # EWC
    if args.ewc > 0.0:
        trainer.prepare_ewc(args.ewc)
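    # EWC (elastic weight consolidation) penalizes drift from the old
    # parameters, weighted by the diagonal Fisher estimated above:
    #   L_total = L_task + (ewc / 2) * sum_i F_i * (theta_i - theta_old_i)^2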

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while (lr > args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr, epoch_aux_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, None)

        if ':' in getattr(args, 'data', ''):
            # sharded data: get train iterator for next epoch
            epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
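
For reference, a minimal sketch of the standard recipe behind estimate_diagonal_fisher above: the diagonal of the empirical Fisher information is approximated by averaging squared gradients of the negative log-likelihood over sampled batches. The function and argument names below are illustrative, not the ones used by the trainer.

import torch

def diagonal_fisher(model, loss_fn, batches, n_samples):
    """Estimate diag(F) as the mean squared gradient of the per-batch
    negative log-likelihood (a standard empirical Fisher approximation)."""
    fisher = {name: torch.zeros_like(p)
              for name, p in model.named_parameters() if p.requires_grad}
    seen = 0
    for batch in batches:
        if seen >= n_samples:
            break
        model.zero_grad()
        loss_fn(model, batch).backward()  # loss_fn returns a scalar NLL
        for name, p in model.named_parameters():
            if p.grad is not None:
                fisher[name] += p.grad.detach() ** 2
        seen += 1
    return {name: grad_sq / max(seen, 1) for name, grad_sq in fisher.items()}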
Example no. 23
def train(args, trainer, task, epoch_itr, epoch_aux_itr, fim=None):
    """Train the model for one epoch."""
    # Update parameters every N batches
    update_freq = args.update_freq[epoch_itr.epoch - 1] \
        if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1]
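    # update_freq implements gradient accumulation: gradients from update_freq
    # consecutive batches are summed before each optimizer step, emulating a
    # batch that is update_freq times larger.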
    print(update_freq)
    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus,
        shuffle=(epoch_itr.epoch >= args.curriculum),
    )
    itr = iterators.GroupedIterator(itr, update_freq)
    progress = progress_bar.build_progress_bar(
        args,
        itr,
        epoch_itr.epoch,
        no_progress_bar='simple',
    )

    # Auxiliary iterator
    aux_itr = epoch_aux_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus)
    aux_itr = iterators.GroupedIterator(aux_itr, update_freq, bottomless=True)

    extra_meters = collections.defaultdict(lambda: AverageMeter())
    valid_subsets = args.valid_subset.split(',')
    max_update = args.max_update or math.inf
    for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch):
        # Record gradients from auxiliary data
        aux_samples = next(aux_itr)
        trainer.train_step(aux_samples, update_params=False)
        # Fisher
        if hasattr(trainer.optimizer, "save_auxiliary"):
            trainer.optimizer.save_auxiliary()
        else:
            print("Warning, the optimizer is ignoring the auxiliary gradients")
        # Take a step on the primary task
        log_output = trainer.train_step(samples, apply_ewc=args.ewc > 0)

        if log_output is None:
            continue

        # log mid-epoch stats
        stats = get_training_stats(trainer)
        for k, v in log_output.items():
            if k in [
                    'loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size'
            ]:
                continue  # these are already logged above
            if 'loss' in k:
                extra_meters[k].update(v, log_output['sample_size'])
            else:
                extra_meters[k].update(v)
            stats[k] = extra_meters[k].avg
        progress.log(stats, tag='train', step=stats['num_updates'])

        # ignore the first mini-batch in words-per-second calculation
        if i == 0:
            trainer.get_meter('wps').reset()

        num_updates = trainer.get_num_updates()
        if (not args.disable_validation and args.save_interval_updates > 0
                and num_updates % args.save_interval_updates == 0
                and num_updates > 0):
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, None)

        if num_updates >= max_update:
            break

    # log end-of-epoch stats
    stats = get_training_stats(trainer)
    for k, meter in extra_meters.items():
        stats[k] = meter.avg
    progress.print(stats, tag='train', step=stats['num_updates'])

    # reset training meters
    for k in [
            'train_loss',
            'train_nll_loss',
            'wps',
            'ups',
            'wpb',
            'bsz',
            'gnorm',
            'clip',
    ]:
        meter = trainer.get_meter(k)
        if meter is not None:
            meter.reset()
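
The bottomless=True auxiliary iterator above never raises StopIteration, so next(aux_itr) is always safe inside the training loop. A rough stand-in with the same calling pattern, assuming nothing about fairseq's internals:

import itertools

def bottomless(iterable):
    """Cycle through an iterable forever, restarting when exhausted.
    itertools.cycle caches items, so a real epoch iterator would rather be
    re-created per epoch, but the always-safe next() pattern is the same."""
    return itertools.cycle(iterable)

aux = bottomless([1, 2, 3])
assert [next(aux) for _ in range(5)] == [1, 2, 3, 1, 2]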
Example no. 24
                         dropout=0.3,
                         lr=args.learning_rate,
                         activation_fn=nn.LeakyReLU(0.2)).to(device)

    print("========== Encoder ==========\n{}".format(enc))

    print("========== Decoder ==========\n{}".format(dec))

    print("========== Discriminator ==========\n{}".format(disc))

    for epoch in range(1, args.num_epochs + 1):
        print("========== Start epoch {} at {} ==========".format(
            epoch,
            datetime.now().strftime("%H:%M:%S")))

        train(epoch, enc, dec, disc, prior_size, train_dl, TEXT.vocab, device)
        validate(epoch, enc, dec, disc, prior_size, valid_dl, TEXT.vocab,
                 device)

        print_decoded(enc, dec, gen_dl, vocab=TEXT.vocab, device=device)
        print_sample(dec,
                     sample_size=prior_size,
                     max_seq_len=41,
                     vocab=TEXT.vocab,
                     style_vocab=LABEL.vocab,
                     device=device)

    torch.save(enc.state_dict(), 'rcaae.enc.pt')
    torch.save(dec.state_dict(), 'rcaae.dec.pt')
    torch.save(disc.state_dict(), 'rcaae.disc.pt')
Example no. 25
                        batch_size=32,
                        shuffle=False,
                        num_workers=4)

model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss().to(device)

if opt.train:
    epoch_num = opt.num_epochs
    best_val_acc = 0
    total_loss_val, total_acc_val = [], []
    for epoch in tqdm(range(1, epoch_num + 1)):
        loss_train, acc_train, total_loss_train, total_acc_train = train(
            train_loader, model, criterion, optimizer, epoch, device)
        loss_val, acc_val = validate(val_loader, model, criterion, optimizer,
                                     epoch, device)
        total_loss_val.append(loss_val)
        total_acc_val.append(acc_val)
        if acc_val > best_val_acc:
            best_val_acc = acc_val
            print('*****************************************************')
            print('best record: [epoch %d], [val loss %.5f], [val acc %.5f]' %
                  (epoch, loss_val, acc_val))
            print('*****************************************************')

    fig = plt.figure(num=2)
    fig1 = fig.add_subplot(2, 1, 1)
    fig2 = fig.add_subplot(2, 1, 2)
    fig1.plot(total_loss_train, label='training loss')
    fig1.plot(total_acc_train, label='training accuracy')
    fig2.plot(total_loss_val, label='validation loss')
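
Note that the loop above only prints the best record without persisting it. A common companion, sketched here with a hypothetical 'best_model.pt' path and the names from the loop, saves the weights whenever validation accuracy improves:

# inside the epoch loop, next to the best-accuracy check
if acc_val > best_val_acc:
    best_val_acc = acc_val
    torch.save({'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_val_acc': best_val_acc}, 'best_model.pt')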
Example no. 26
def main(args):
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Make a dummy batch to (i) warm the caching allocator and (ii) serve as a
    # placeholder for DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(
        args.max_tokens, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))
    print('| Optimizer {}'.format(trainer.optimizer.__class__.__name__))

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Dataloader for the auxiliary task
    epoch_aux_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset, idx=1),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while (lr > args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr, epoch_aux_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
Example no. 27
if not os.path.exists(tensorboard_dir):
    os.makedirs(tensorboard_dir)
configure(tensorboard_dir)

best_valid_acc, patience_counter = 0, 0

for epoch in range(0, hp['EPOCHS']):
    print('\nEpoch: {}/{} - LR: {:.6f}'.format(epoch + 1, hp['EPOCHS'],
                                               hp['LEARNING_RATE']))

    # train for 1 epoch
    train_loss, train_acc = train_one_epoch(model, optimizer, train_loader,
                                            epoch, hp)

    # evaluate on validation set
    valid_loss, valid_acc = validate(model, valid_loader, epoch, hp)

    # # reduce lr if validation loss plateaus
    # self.scheduler.step(valid_loss)

    is_best = valid_acc > best_valid_acc
    msg1 = "train loss: {:.3f} - train acc: {:.3f} "
    msg2 = "- val loss: {:.3f} - val acc: {:.3f}"
    if is_best:
        patience_counter = 0
        msg2 += " [*]"
    msg = msg1 + msg2
    print(msg.format(train_loss, train_acc, valid_loss, valid_acc))

    # check for improvement
    if not is_best:
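
The snippet is cut off at the improvement check. A typical continuation of a patience counter like this one (illustrative only, not the author's code; PATIENCE is an assumed threshold) stops training once validation accuracy has stalled:

if not is_best:
    patience_counter += 1
    if patience_counter > PATIENCE:
        print('[!] No improvement in a while, stopping.')
        break  # leave the epoch loop
else:
    best_valid_acc = valid_acc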
Example no. 28
nodes in its output layer. For example, a 2-4-3 network has 2 input nodes, one
hidden layer with 4 nodes, and 3 output nodes."""

train1 = 'examples/train1.txt'
train2 = 'examples/train2.txt'
train3 = 'examples/train3.txt'
validate1 = 'examples/validation1.txt'
validate2 = 'examples/validation2.txt'
validate3 = 'examples/validation3.txt'
set3Labels = ['upper_left', 'upper_right', 'lower_left', 'lower_right']

p1 = Perceptron(2)
p2 = Perceptron(2)
print("\nPerceptrons with constant learning rates, datasets 1 and 2:")
train(p1, train1, constantLearningRate(1))
validate(p1, validate1)
train(p2, train2, constantLearningRate(1))
validate(p2, validate2)

p1 = Perceptron(2)
p2 = Perceptron(2)
print("\nPerceptrons with inverse time learning rates, datasets 1 and 2:")
train(p1, train1, inverseTimeLearningRate(1))
validate(p1, validate1)
train(p2, train2, inverseTimeLearningRate(1))
validate(p2, validate2)

p1 = Perceptron(2, bias=True)
p2 = Perceptron(2, bias=True)
print("\nPerceptrons with exponential learning rates and bias, "
    + "datasets 1 and 2:")
Example no. 29
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': float(opts.lr_model)
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': float(opts.lr_critic)
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])

    # Initialize the learning rate scheduler (decay by lr_decay once per epoch)
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size)

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, opts)
Example no. 30
                                             pin_memory=True,
                                             num_workers=8)
    test_loader = torch.utils.data.DataLoader(test_dataset_full,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=8)

    if args.PRETRAIN:
        print('pretraining...')
        net = VGG_small().cuda()
        loss_func = torch.nn.CrossEntropyLoss().cuda()

        optimizer = torch.optim.Adam(net.parameters(), lr=5e-2)
        get_accuracy(net, train_loader, loss_func)
        val_accuracy = validate(net, val_loader, loss_func)
        best_acc = val_accuracy[0]
        test(net, test_loader, loss_func)
        save_model_ori(args.model_ori, net, optimizer)

        for epoch in range(100):
            if epoch % 30 == 0:
                optimizer.param_groups[0]['lr'] *= 0.2
            train_fullprecision(net, train_loader, loss_func, optimizer, epoch)
            val_accuracy = validate(net, val_loader, loss_func)
            if val_accuracy[0] > best_acc:
                best_acc = val_accuracy[0]
                test(net, test_loader, loss_func)
                save_model_ori(args.model_ori, net, optimizer)

    if args.ALQ:
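
The manual decay above (multiply the learning rate by 0.2 whenever epoch % 30 == 0) is nearly equivalent to torch's built-in StepLR; the only difference is that the loop above also scales the rate once at epoch 0:

from torch.optim.lr_scheduler import StepLR

# assuming optimizer, net, train_loader, loss_func from the example above
scheduler = StepLR(optimizer, step_size=30, gamma=0.2)
for epoch in range(100):
    train_fullprecision(net, train_loader, loss_func, optimizer, epoch)
    scheduler.step()  # multiplies the lr by 0.2 every 30 epochs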
Example no. 31
def _run_rl(opts):

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out which problem we are solving
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    encoder_class = {
        'gat': GraphAttentionEncoder,
        'gcn': GCNEncoder,
        'mlp': MLPEncoder
    }.get(opts.encoder, None)
    assert encoder_class is not None, "Unknown encoder: {}".format(
        opts.encoder)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        encoder_class,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Compute number of network parameters
    print(model)
    nb_param = sum(p.numel() for p in model.parameters())
    print('Number of parameters: ', nb_param)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
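        # ExponentialBaseline tracks an exponential moving average of batch
        # costs: b <- exp_beta * b + (1 - exp_beta) * mean(cost)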
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 encoder_class, 2, opts.embedding_dim, opts.hidden_dim,
                 opts.n_encode_layers, opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
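        # The rollout baseline uses the cost of a greedy rollout of the best
        # model so far as the baseline value (cf. Kool et al., 2019)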
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load the baseline state from the checkpoint; the script must be called
    # with the same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize the learning rate scheduler (decay by lr_decay once per epoch)
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset)
    opts.val_size = val_dataset.size

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # The random states were restored above; the checkpoint was dumped
        # before the epoch callback, so run the callback now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts)
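
All of these RL runners minimize the expected tour cost with REINFORCE; with a baseline b, the per-batch loss reduces to the sketch below (names are illustrative; the model is assumed to return a cost and the log-probability of the sampled solution):

import torch

def reinforce_loss(cost, log_prob, baseline_value):
    """REINFORCE with baseline: grad E[cost] ~ E[(cost - b) * grad log p].
    cost, log_prob and baseline_value are per-sample 1-D tensors."""
    advantage = (cost - baseline_value).detach()  # no gradient through b
    return (advantage * log_prob).mean()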
Example no. 32
def main():
  now = datetime.datetime.now()
  logger = Logger(args.save_path + '/logs_{}'.format(now.isoformat()))

  model = getModel(args)
  cudnn.benchmark = True
  optimizer = torch.optim.SGD(model.parameters(), args.LR,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)

  valSource_dataset = SourceDataset('test', ref.nValViews)
  valTarget_dataset = TargetDataset('test', ref.nValViews)
  
  valSource_loader = torch.utils.data.DataLoader(valSource_dataset, batch_size = 1, 
                        shuffle=False, num_workers=1, pin_memory=True, collate_fn=collate_fn_cat)
  valTarget_loader = torch.utils.data.DataLoader(valTarget_dataset, batch_size = 1, 
                        shuffle=False, num_workers=1, pin_memory=True, collate_fn=collate_fn_cat)
  
  if args.test:
    f = {}
    for split in splits:
      f['{}'.format(split)] = open('{}/{}.txt'.format(args.save_path, split), 'w')
    test(args, valSource_loader, model, None, f['valSource'], 'valSource')
    test(args, valTarget_loader, model, None, f['valTarget'], 'valTarget')
    return
  
  train_dataset = Fusion(SourceDataset, TargetDataset, nViews = args.nViews, targetRatio = args.targetRatio, totalTargetIm = args.totalTargetIm)
  trainTarget_dataset = train_dataset.targetDataset
  
  train_loader = torch.utils.data.DataLoader(
      train_dataset, batch_size=args.batchSize, shuffle=not args.test,
      num_workers=args.workers if not args.test else 1, pin_memory=True, collate_fn=collate_fn_cat)
  trainTarget_loader = torch.utils.data.DataLoader(
      trainTarget_dataset, batch_size=args.batchSize, shuffle=False,
      num_workers=args.workers if not args.test else 1, pin_memory=True, collate_fn=collate_fn_cat)

  M = None
  if args.shapeWeight > ref.eps:
    print('getY...')
    Y = getY(train_dataset.sourceDataset)
    M = initLatent(trainTarget_loader, model, Y, nViews = args.nViews, S = args.sampleSource, AVG = args.AVG)
  
  print('Start training...')
  for epoch in range(1, args.epochs + 1):
    adjust_learning_rate(optimizer, epoch, args.dropLR)
    train_mpjpe, train_loss, train_unSuploss = train(args, train_loader, model, optimizer, M, epoch)
    valSource_mpjpe, valSource_loss, valSource_unSuploss = validate(args, 'Source', valSource_loader, model, None, epoch)
    valTarget_mpjpe, valTarget_loss, valTarget_unSuploss = validate(args, 'Target', valTarget_loader, model, None, epoch)

    train_loader.dataset.targetDataset.shuffle()
    if args.shapeWeight > ref.eps and epoch % args.intervalUpdateM == 0:
      M = stepLatent(trainTarget_loader, model, M, Y, nViews = args.nViews, lamb = args.lamb, mu = args.mu, S = args.sampleSource)

    logger.write('{} {} {}\n'.format(train_mpjpe, valSource_mpjpe, valTarget_mpjpe))
    
    logger.scalar_summary('train_mpjpe', train_mpjpe, epoch)
    logger.scalar_summary('valSource_mpjpe', valSource_mpjpe, epoch)
    logger.scalar_summary('valTarget_mpjpe', valTarget_mpjpe, epoch)
    
    logger.scalar_summary('train_loss', train_loss, epoch)
    logger.scalar_summary('valSource_loss', valSource_loss, epoch)
    logger.scalar_summary('valTarget_loss', valTarget_loss, epoch)
    
    logger.scalar_summary('train_unSuploss', train_unSuploss, epoch)
    logger.scalar_summary('valSource_unSuploss', valSource_unSuploss, epoch)
    logger.scalar_summary('valTarget_unSuploss', valTarget_unSuploss, epoch)
    
    if epoch % 10 == 0:
      torch.save({
        'epoch': epoch + 1,
        'arch': args.arch,
        'state_dict': model.state_dict(),
        'optimizer' : optimizer.state_dict(),
      }, args.save_path + '/checkpoint_{}.pth.tar'.format(epoch))
  logger.close()
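
adjust_learning_rate is not shown in this example. One common shape for it (an assumption, not the author's code) steps the base rate down by 10x every dropLR epochs:

def adjust_learning_rate(optimizer, epoch, dropLR):
  """Assumed implementation: drop the base LR by 10x every dropLR epochs."""
  lr = args.LR * (0.1 ** (epoch // dropLR))
  for param_group in optimizer.param_groups:
    param_group['lr'] = lr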