class HyperdashCallback(Callback):
    """Keras callback that streams training progress to a Hyperdash experiment.

    Opens an Experiment when training starts, reports per-epoch metrics,
    and closes the experiment when training ends.
    """

    exp = None  # Experiment handle, created in on_train_begin
    last = 1  # previous epoch's loss; used to compute per-epoch improvement

    def on_train_begin(self, logs=None):
        """Open a new Hyperdash experiment session."""
        self.exp = Experiment("Deep Weather")

    def on_train_end(self, logs=None):
        """Close the experiment so pending metrics are flushed."""
        self.exp.end()

    def on_epoch_end(self, epoch, logs=None):
        """Report progress, loss and (if present) val_loss, each clamped."""
        logs = logs or {}  # Keras may pass logs=None
        if 'loss' in logs:
            # "progress" = improvement over the previous epoch, capped at 0.1
            self.exp.metric("progress", min(0.1, self.last - logs["loss"]))
            self.last = logs["loss"]
            self.exp.metric("loss", min(0.5, logs["loss"]))
            # val_loss is absent when no validation data is configured;
            # the original raised KeyError in that case.
            if "val_loss" in logs:
                self.exp.metric("val_loss", min(0.5, logs["val_loss"]))
Exemple #2
0
def run():
    """Evaluate a saved CDSSM model on the shared-task test set.

    Configuration (batch size, data sampling, checkpoint path) is read from
    the module-level ``args`` namespace.  Runs a single pass over the test
    DataLoader, streams running accuracy/loss to Hyperdash, then prints a
    classification report and dumps the true/predicted label arrays to disk.
    """
    BATCH_SIZE = args.batch_size
    DATA_SAMPLING = args.data_sampling
    MODEL = args.model

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Logger creation has the side effect of creating the log directory.
    logger = Logger('./logs/{}'.format(time.localtime()))

    print("Created model...")
    model = cdssm.CDSSM()
    # Move via .to(device) only: the previous unconditional .cuda() call
    # crashed on CUDA-less machines even though `device` handles both cases.
    model = model.to(device)
    if torch.cuda.device_count() > 0:
        print("Let's use", torch.cuda.device_count(), "GPU(s)!")
        model = nn.DataParallel(model)
    model.load_state_dict(torch.load(MODEL))

    print("Created dataset...")
    dataset = pytorch_data_loader.WikiDataset(test,
                                              claims_dict,
                                              data_sampling=DATA_SAMPLING,
                                              testFile="shared_task_dev.jsonl")
    dataloader = DataLoader(dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=3,
                            shuffle=True,
                            collate_fn=pytorch_data_loader.PadCollate())

    # Report roughly every 2% of the epoch, but never less than every batch:
    # the original expression could evaluate to 0 and then divide by zero in
    # the modulo below.
    OUTPUT_FREQ = max(1, int((len(dataset) / BATCH_SIZE) * 0.02))
    criterion = torch.nn.BCEWithLogitsLoss()

    parameters = {
        "batch size": BATCH_SIZE,
        "loss": criterion.__class__.__name__,
        "data batch size": DATA_SAMPLING,
        "data": args.data
    }
    exp_params = {}
    exp = Experiment("CLSM V2")
    for key, value in parameters.items():
        exp_params[key] = exp.param(key, value)

    true = []
    pred = []
    print("Evaluating...")
    model.eval()
    test_running_accuracy = 0.0
    test_running_loss = 0.0
    num_batches = 0
    for batch_num, inputs in enumerate(dataloader):
        num_batches += 1
        claims_tensors, claims_text, evidences_tensors, evidences_text, labels = inputs

        # Fix: the original referenced undefined names `claims`/`evidences`
        # (NameError at runtime); the tensors unpacked above are the inputs.
        y_pred = model(claims_tensors, evidences_tensors)

        y = (labels).float()
        y_pred = y_pred.squeeze()
        y = y.squeeze()
        y = y.view(-1)
        y_pred = y_pred.view(-1)
        # Hard 0/1 predictions from the raw logits.
        bin_acc = torch.sigmoid(y_pred).round()

        loss = criterion(y_pred, y)

        true.extend(y.tolist())
        pred.extend(bin_acc.tolist())

        accuracy = (y == bin_acc).float().mean()
        test_running_accuracy += accuracy.item()
        test_running_loss += loss.item()

        if batch_num % OUTPUT_FREQ == 0 and batch_num > 0:
            print("[{}]: {}".format(batch_num,
                                    test_running_accuracy / OUTPUT_FREQ))

            # 1. Log scalar values (scalar summary)
            info = {
                'test_loss': test_running_loss / OUTPUT_FREQ,
                'test_accuracy': test_running_accuracy / OUTPUT_FREQ
            }

            for tag, value in info.items():
                exp.metric(tag, value, log=False)
                # logger.scalar_summary(tag, value, batch_num+1)

            # reset the running windows for the next reporting interval
            test_running_loss = 0.0
            test_running_accuracy = 0.0

    # Guard against an empty dataset before peeking at the first element.
    if true and pred:
        print(true[0], pred[0])
    true = np.array(true).astype("int")
    pred = np.array(pred).astype("int")
    final_accuracy = accuracy_score(true, pred)
    print("Final accuracy: {}".format(final_accuracy))
    print(classification_report(true, pred))
    filename = "predicted_labels"
    for key, value in parameters.items():
        filename += "_{}-{}".format(key.replace(" ", "_"), value)

    joblib.dump({"true": true, "pred": pred}, filename)
Exemple #3
0
        if len(reward_avg) == 0:
            reward = reward_raw
        else:
            reward = reward_raw - np.mean(reward_avg)

        # update running mean
        reward_avg.append(reward_raw)
        if len(reward_avg) > REWARD_BUF:
            reward_avg.pop(0)

        rewards.append(reward)
        entropies.append(entropy[0])
        log_probs.append(log_prob[0])

    agent.update_parameters(rewards, log_probs, entropies, GAMMA)

    if i_episode % CHKP_FREQ == 0:
        torch.save(
            agent.model.state_dict(),
            os.path.join(exp_dir, 'reinforce-' + str(i_episode) + '.pkl'))

    # print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))
    exp.metric("episode", i_episode)
    exp.metric("rewards", np.mean(reward_raw_log))

    del rewards
    del log_probs
    del entropies
    del state
Exemple #4
0
def train_reconstruction(train_loader, test_loader, encoder, decoder, args):
    """Train an encoder/decoder pair to reconstruct input sentences.

    Streams the per-batch loss to Hyperdash, periodically prints a sample
    reconstruction, evaluates on test_loader, decays the learning rate, and
    saves checkpoints.  The Hyperdash experiment is always closed (finally),
    even if training is interrupted.
    """
    exp = Experiment("Reconstruction Training")
    try:
        lr = args.lr
        encoder_opt = torch.optim.Adam(encoder.parameters(), lr=lr)
        decoder_opt = torch.optim.Adam(decoder.parameters(), lr=lr)

        encoder.train()
        decoder.train()
        steps = 0
        for epoch in range(1, args.epochs+1):
            print("=======Epoch========")
            print(epoch)
            for batch in train_loader:
                feature = Variable(batch)
                if args.use_cuda:
                    # cheap no-op once the models are already on the GPU
                    encoder.cuda()
                    decoder.cuda()
                    feature = feature.cuda()

                encoder_opt.zero_grad()
                decoder_opt.zero_grad()

                h = encoder(feature)
                prob = decoder(h)
                reconstruction_loss = compute_cross_entropy(prob, feature)
                reconstruction_loss.backward()
                encoder_opt.step()
                decoder_opt.step()

                steps += 1
                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                print("Loss: {}".format(reconstruction_loss.data[0] / args.sentence_len))
                exp.metric("Loss", reconstruction_loss.data[0] / args.sentence_len)
                # check reconstructed sentence
                if steps % args.log_interval == 0:
                    print("Test!!")
                    input_data = feature[0]
                    single_data = prob[0]
                    _, predict_index = torch.max(single_data, 1)
                    input_sentence = util.transform_id2word(input_data.data, train_loader.dataset.index2word, lang="en")
                    predict_sentence = util.transform_id2word(predict_index.data, train_loader.dataset.index2word, lang="en")
                    print("Input Sentence:")
                    print(input_sentence)
                    print("Output Sentence:")
                    print(predict_sentence)

            # Fix: evaluate every `test_interval` EPOCHS.  The original tested
            # `steps % args.test_interval` at epoch level, which only fires when
            # the cumulative step count happens to be divisible -- effectively
            # at random epochs (the sibling implementation uses `epoch %`).
            if epoch % args.test_interval == 0:
                eval_reconstruction(encoder, decoder, test_loader, args)

            if epoch % args.lr_decay_interval == 0:
                # decrease learning rate by rebuilding the optimizers
                lr = lr / 5
                encoder_opt = torch.optim.Adam(encoder.parameters(), lr=lr)
                decoder_opt = torch.optim.Adam(decoder.parameters(), lr=lr)
                encoder.train()
                decoder.train()

            if epoch % args.save_interval == 0:
                util.save_models(encoder, args.save_dir, "encoder", steps)
                util.save_models(decoder, args.save_dir, "decoder", steps)

        # finalization: persist the vocabulary mappings
        with open("word2index", "wb") as w2i, open("index2word", "wb") as i2w:
            pickle.dump(train_loader.dataset.word2index, w2i)
            pickle.dump(train_loader.dataset.index2word, i2w)

        # save final models
        util.save_models(encoder, args.save_dir, "encoder", "final")
        util.save_models(decoder, args.save_dir, "decoder", "final")

        print("Finish!!!")
    finally:
        # always close the Hyperdash experiment, even on error/interrupt
        exp.end()
Exemple #5
0
            optimizer.step()

            x_buf = []
            y_buf = []
            epi_x_old = epi_x

            loss_episode = loss.clone().cpu().data.numpy()[0]
            diff_episode = F.mse_loss(x_cat[:, :, :12],
                                      y_cat).clone().cpu().data.numpy()[0]

            loss.detach_()
            net.hidden[0].detach_()
            net.hidden[1].detach_()

            if exp is not None:
                exp.metric("loss episode", loss_episode)
                exp.metric("diff episode", diff_episode)
                exp.metric("epoch", epoch)

            loss_epoch += loss_episode
            diff_epoch += diff_episode

        x_buf.append(x)
        y_buf.append(y)

    printEpochLoss(epoch, epi_x_old, loss_epoch, diff_epoch)
    saveModel(state=net.state_dict(),
              epoch=epoch,
              epoch_len=epi_x_old,
              loss_epoch=loss_epoch,
              diff_epoch=diff_epoch,
def demo(args=None):
    """Run the built-in Hyperdash demo experiment ("Dogs vs. Cats").

    Looks up the API key (environment variable wins over config file),
    prints the program about to run, then executes a short fake training
    loop that reports parameters and metrics to Hyperdash.
    """
    from_file = get_api_key_from_file()
    from_env = get_api_key_from_env()
    # Environment variable takes precedence over the config file.
    api_key = from_env or from_file

    if not api_key:
        print("""
            `hyperdash demo` requires a Hyperdash API key. Try setting your API key in the
            HYPERDASH_API_KEY environment variable, or in a hyperdash.json file in the local
            directory or your user's home directory with the following format:

            {
                "api_key": "<YOUR_API_KEY>"
            }
        """)
        return

    print("""
Running the following program:

    from hyperdash import Experiment
    exp = Experiment("Dogs vs. Cats")

    # Parameters
    estimators = exp.param("Estimators", 500)
    epochs = exp.param("Epochs", 5)
    batch = exp.param("Batch Size", 64)

    for epoch in xrange(1, epochs + 1):
        accuracy = 1. - 1./epoch
        loss = float(epochs - epoch)/epochs
        print("Training model (epoch {})".format(epoch))
        time.sleep(1)

        # Metrics
        exp.metric("Accuracy", accuracy)
        exp.metric("Loss", loss)

    exp.end()
    """)
    from hyperdash import Experiment
    exp = Experiment("Dogs vs. Cats")

    # Parameters
    estimators = exp.param("Estimators", 500)
    epochs = exp.param("Epochs", 5)
    batch = exp.param("Batch Size", 64)

    # Fix: `xrange` is Python 2 only and raises NameError on Python 3;
    # `range` is behaviorally identical for this loop.
    for epoch in range(epochs):
        print("Training model (epoch {})".format(epoch))

        accuracy = 1. - 1. / (epoch + 1)
        loss = float(epochs - epoch) / (epochs + 1)

        # Metrics
        exp.metric("Accuracy", accuracy)
        exp.metric("Loss", loss)

        time.sleep(1)

    exp.end()
Exemple #7
0
}
f.attrs['split'] = H5PYDataset.create_split_array(split_dict)


def match_env(ev1, ev2):
    """Synchronize ev1 (simulator) to the current state of ev2 (real robot).

    Reads qpos/qvel from ev2's underlying MuJoCo model and pushes the
    flattened arrays into ev1 via set_state.
    """
    source_data = ev2.env.env.model.data
    ev1.env.env.set_state(source_data.qpos.ravel(),
                          source_data.qvel.ravel())

i = 0

exp = Experiment("dataset pusher")

for i in tqdm(range(max_steps)):
    exp.metric("episode", i)
    obs = env.reset()
    obs2 = env2.reset()
    match_env(env, env2)

    for j in range(episode_length):
        # env.render()
        # env2.render()

        if j % action_steps == 0:
            action = env.action_space.sample()
        new_obs, reward, done, info = env.step(action)
        new_obs2, reward2, done2, info2 = env2.step(action)

        # print (j, done, new_obs[0][0])
Exemple #8
0
    #Visualize output
    out = model.session.run(net_out, feed_dict={network: X, labels: Y})
    out_ = []
    for j in range(out.shape[1]):
        out_.append(out[0][j])
    out = out_
    out_log.append(out)

    train_acc, train_loss = model.session.run([acc, cost],
                                              feed_dict={
                                                  network: X,
                                                  labels: Y
                                              })

    if hyperdash:
        exp.metric("Accuracy", train_acc)
        exp.metric("Loss", train_loss)

    train_summary = model.session.run(merged,
                                      feed_dict={
                                          network: X,
                                          labels: Y
                                      })
    writer2.add_summary(train_summary, i)

    if i % 200 == 0:
        # Save model and acc/error curves
        os.chdir('/home/mpcr/Desktop/rodrigo/deepcontrol/saved_models')
        model.save(m_save + modelswitch[model_num].__name__)

        # Save model output throughout training
Exemple #9
0
    def test_experiment_handles_numpy_numbers(self):
        """Metrics and params given as numpy scalar types must serialize cleanly.

        Feeds one value of every numpy integer/float scalar type through both
        exp.metric and exp.param, then checks the payloads the fake server
        received match exactly.
        """
        nums_to_test = [
            ("int_", np.int_()),
            ("intc", np.intc()),
            ("intp", np.intp()),
            ("int8", np.int8()),
            ("int16", np.int16()),
            ("int32", np.int32()),
            ("int64", np.int64()),
            ("uint8", np.uint8()),
            ("uint16", np.uint16()),
            ("uint32", np.uint32()),
            ("uint64", np.uint64()),
            ("float16", np.float16()),
            ("float32", np.float32()),
            ("float64", np.float64()),
        ]
        # Make sure the SDK doesn't choke and JSON serialization works
        exp = Experiment("MNIST")
        for name, num in nums_to_test:
            exp.metric("test_metric_{}".format(name), num)
            exp.param("test_param_{}".format(name), num)
        exp.end()

        # Collect the param payloads the fake server recorded.
        params_messages = [
            msg["payload"] for msg in server_sdk_messages
            if "params" in msg["payload"]
        ]

        # One params message per call; all user params are non-internal.
        # (The original also re-assigned "is_internal" after already setting
        # it in the dict literal -- dead code, removed.)
        expected_params = [
            {
                "params": {"test_param_{}".format(name): num},
                "is_internal": False,
            }
            for name, num in nums_to_test
        ]

        assert len(expected_params) == len(params_messages)
        for i, message in enumerate(params_messages):
            print(message)
            print(expected_params[i])
            assert message == expected_params[i]

        # Collect the metric payloads the fake server recorded.
        metrics_messages = [
            msg["payload"] for msg in server_sdk_messages
            if "name" in msg["payload"]
        ]

        expected_metrics = [
            {
                "name": "test_metric_{}".format(name),
                "value": num,
                "is_internal": False,
            }
            for name, num in nums_to_test
        ]

        assert len(expected_metrics) == len(metrics_messages)
        for i, message in enumerate(metrics_messages):
            assert message == expected_metrics[i]
Exemple #10
0
    def test_experiment(self):
        """End-to-end smoke test of the Experiment API against the fake server.

        Runs a tiny job (one param, two iterations, one metric per
        iteration), then verifies: the exact param/metric payloads the
        server received, that no errors were printed, that the correct
        API-name/version headers were sent, and that the log file was
        written (and removes it afterwards).
        """
        # Run a test job via the Experiment API
        # Make sure log file is where is supposed to be
        # look at decorator
        # verify run start/stop is sent
        with patch("sys.stdout", new=StringIO()) as faked_out:
            exp = Experiment("MNIST")
            exp.log("test print")
            exp.param("batch size", 32)
            for i in exp.iter(2):
                time.sleep(1)
                exp.metric("accuracy", i * 0.2)
            time.sleep(0.1)
            exp.end()

        # Test params match what is expected
        params_messages = []
        for msg in server_sdk_messages:
            payload = msg["payload"]
            if "params" in payload:
                params_messages.append(payload)

        # exp.iter() registers an internal "hd_iter_0_epochs" param in
        # addition to the user-supplied "batch size".
        expect_params = [
            {
                "params": {
                    "batch size": 32,
                },
                "is_internal": False,
            },
            {
                "params": {
                    "hd_iter_0_epochs": 2,
                },
                "is_internal": True,
            },
        ]
        assert len(expect_params) == len(params_messages)
        for i, message in enumerate(params_messages):
            assert message == expect_params[i]

        # Test metrics match what is expected
        metrics_messages = []
        for msg in server_sdk_messages:
            payload = msg["payload"]
            if "name" in payload:
                metrics_messages.append(payload)

        # Each exp.iter() step emits an internal "hd_iter_0" progress metric
        # followed by the user metric logged inside the loop body.
        expect_metrics = [
            {
                "is_internal": True,
                "name": "hd_iter_0",
                "value": 0
            },
            {
                "is_internal": False,
                "name": "accuracy",
                "value": 0
            },
            {
                "is_internal": True,
                "name": "hd_iter_0",
                "value": 1
            },
            {
                "is_internal": False,
                "name": "accuracy",
                "value": 0.2
            },
        ]
        assert len(expect_metrics) == len(metrics_messages)
        for i, message in enumerate(metrics_messages):
            assert message == expect_metrics[i]

        captured_out = faked_out.getvalue()
        assert "error" not in captured_out

        # Make sure correct API name / version headers are sent
        assert server_sdk_headers[0][API_KEY_NAME] == API_NAME_EXPERIMENT
        assert server_sdk_headers[0][
            VERSION_KEY_NAME] == get_hyperdash_version()

        # Make sure logs were persisted
        expect_logs = [
            "{ batch size: 32 }",
            "test print",
            "| Iteration 0 of 1 |",
            "| accuracy:   0.000000 |",
        ]

        log_dir = get_hyperdash_logs_home_path_for_job("MNIST")
        # Pick the most recently modified log file for this job.
        latest_log_file = max([
            os.path.join(log_dir, filename) for filename in os.listdir(log_dir)
        ],
                              key=os.path.getmtime)
        with open(latest_log_file, "r") as log_file:
            data = log_file.read()
            for log in expect_logs:
                assert_in(log, data)
        os.remove(latest_log_file)
Exemple #11
0
def main():
    """Train a classification model end-to-end.

    Parses CLI args, seeds RNGs, dynamically imports and builds the model,
    snapshots the scripts and arguments into a result directory, constructs
    train/val loaders, and runs the training loop while logging results to
    both a file logger and a Hyperdash experiment.
    """

    args = parse_args()

    # set random seed
    #logger.info('> set random seed {}'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Set up Devices
    #logger.info('> set gpu device {}'.format(args.gpus))
    num_cuda_devices = utils.set_devices(args.gpus)

    # Load model dynamically: derive a dotted module path from the model file
    # name, import it, then look up the class named by args.model_name.
    #logger.info('> load model {}'.format(args.model_name))
    ext = os.path.splitext(args.model_file)[1]
    model_path = '.'.join(os.path.split(args.model_file)).replace(ext, '')
    model = import_module(model_path)
    model = getattr(model, args.model_name)(args.output_class)
    if num_cuda_devices > 0:
        model = torch.nn.DataParallel(model)
        model.cuda()

    logger.info('> set optimizer')
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.initial_lr,
                          momentum=args.lr_momentum)

    # Create result dir
    result_dir = create_result_dir(args.model_name)

    # Also mirror log output into a file inside the result directory.
    fh_handler = logging.FileHandler(os.path.join(result_dir, "log"))
    fh_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(fh_handler)

    # Snapshot the model file and all local scripts into the result dir for
    # reproducibility, plus the full argument namespace as JSON.
    shutil.copy(args.model_file,
                os.path.join(result_dir, os.path.basename(args.model_file)))
    script_file_list = glob.glob('./*.py') + glob.glob('./*.sh')
    for file_name in script_file_list:
        shutil.copy(file_name,
                    os.path.join(result_dir, os.path.basename(file_name)))
    with open(os.path.join(result_dir, 'args'), 'w') as fp:
        fp.write(json.dumps(vars(args)))
    print(json.dumps(vars(args), sort_keys=True, indent=4))

    # Create Dataset
    logger.info('> Creating DataSet')
    train_transform = partial(transforms.transform_f,
                              random_angle=args.random_angle,
                              expand_ratio=args.expand_ratio,
                              crop_size=args.crop_size,
                              train=True)
    train = getdataset.getCcoreDataset(args.train_json, train_transform,
                                       args.train_mode)

    # NOTE(review): the validation transform uses train=True and the val set
    # reuses args.train_json -- confirm whether validation was meant to use
    # train=False and a separate validation JSON.
    val_transform = partial(transforms.transform_f,
                            random_angle=args.random_angle,
                            expand_ratio=args.expand_ratio,
                            crop_size=args.crop_size,
                            train=True)
    val = getdataset.getCcoreDataset(args.train_json, val_transform,
                                     args.train_mode)

    # Create DataLoader
    logger.info('> create dataloader')
    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=args.batchsize,
                                               shuffle=True,
                                               num_workers=4)
    val_loader = torch.utils.data.DataLoader(val,
                                             batch_size=args.batchsize,
                                             shuffle=False,
                                             num_workers=4)

    # Training
    logger.info('> run training')
    best_prec = 0

    # Create Hyperdash Experiment
    logger.info('> Create Hyperdash Experiment {}'.format(
        args.experiment_name))
    exp = Experiment(args.experiment_name,
                     api_key_getter=utils.get_api_key_from_env)

    for epoch in tqdm(range(args.training_epoch)):

        training_result = training(train_loader, model, criterion, optimizer)
        val_result = validate(val_loader, model, criterion)

        result_str = 'epoch : {} / {}\
        main/loss : {:.3f}\
        main/acc : {:.3f}\
        val/loss : {:.3f}\
        val/acc : {:.3f}'.format(epoch, args.training_epoch,
                                 training_result['loss'],
                                 training_result['acc'], val_result['loss'],
                                 val_result['acc'])
        logger.info(result_str)
        exp.log(result_str)

        prec1 = val_result['acc']

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec
        best_prec = max(prec1, best_prec)
        if is_best:
            save_checkpoint(
                state={
                    'epoch': epoch + 1,
                    #'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec': best_prec,
                    'optimizer': optimizer.state_dict(),
                },
                is_best=is_best,
                result_dir=result_dir)

        exp.metric('main/loss', training_result['loss'])
        exp.metric('val/loss', val_result['loss'])

    logger.info('> end training')
    exp.end()
Exemple #12
0
def main():
    """Train a fully-connected VAE on Duckietown images and plot loss curves.

    Trains for EPOCHS epochs, logging train/test loss to Hyperdash,
    checkpointing the model every epoch, then shows a matplotlib plot of
    both loss curves.
    """
    exp = Experiment("duckietown_vae")
    model_name = 'models/duckietown_vae_fc_model.pt'
    # changed configuration to this instead of argparse for easier interaction
    CUDA = True
    SEED = 1
    BATCH_SIZE = 32
    LOG_INTERVAL = 10
    EPOCHS = 25

    # connections through the autoencoder bottleneck
    # in the pytorch VAE example, this is 20
    ZDIMS = 100

    torch.manual_seed(SEED)
    if CUDA:
        torch.cuda.manual_seed(SEED)

    # DataLoader instances will load tensors directly into GPU memory
    # (only consumed by the commented-out MNIST loaders below)
    kwargs = {'num_workers': 1, 'pin_memory': True} if CUDA else {}

    # Download or load downloaded MNIST dataset
    # shuffle data at every epoch
    #train_loader = torch.utils.data.DataLoader(
    #datasets.MNIST('data', train=True, download=True,
    #transform=transforms.ToTensor()),
    #batch_size=BATCH_SIZE, shuffle=True, **kwargs)

    # Same for test data
    #test_loader = torch.utils.data.DataLoader(
    #datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
    #batch_size=BATCH_SIZE, shuffle=True, **kwargs)

    train_loader = load_dataset('images/train', BATCH_SIZE)
    test_loader = load_dataset('images/test', BATCH_SIZE)

    vae = FC_VAE(input_size=64 * 64 * 3,
                 HIDDEN_1=400,
                 batch_size=BATCH_SIZE,
                 NUM_Z=ZDIMS,
                 learning_rate=1e-3,
                 CUDA=CUDA)

    #vae = Conv_VAE(batchsize=BATCH_SIZE)
    # Fix: only move the model to the GPU when CUDA is enabled; the original
    # called vae.cuda() unconditionally and crashed on CPU-only machines
    # even though the CUDA flag exists and gates the seeding above.
    if CUDA:
        vae.cuda()
    total_test_loss = []
    total_train_loss = []
    for epoch in range(1, EPOCHS + 1):
        train_loss = vae.train_model(epoch, train_loader, LOG_INTERVAL)
        total_train_loss.append(train_loss)
        exp.metric("train_loss", train_loss.item())
        test_loss = vae.test_model(epoch, test_loader)
        total_test_loss.append(test_loss)
        exp.metric("test_loss", test_loss.item())
        # checkpoint after every epoch (overwrites the same file)
        torch.save(vae.state_dict(), model_name)

    # 64 sets of random ZDIMS-float vectors, i.e. 64 locations / MNIST
    # digits in latent space

    #sample = Variable(torch.randn(64, ZDIMS))
    #if CUDA:
    #sample = sample.cuda()
    #sample = vae.decode(sample).cpu()

    # save out as an 8x8 matrix of MNIST digits
    # this will give you a visual idea of how well latent space can generate things
    # that look like digits
    #save_image(sample.data.view(64, 1, 28, 28), 'results/sample_' + str(epoch) + '.png')

    # Plot train vs. test loss per epoch.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    line, = ax.plot(range(EPOCHS),
                    total_train_loss,
                    color='blue',
                    lw=2,
                    label='Train')
    line, = ax.plot(range(EPOCHS),
                    total_test_loss,
                    color='red',
                    lw=2,
                    label='Test')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    ax.legend()
    plt.show()


#main()
Exemple #13
0
            target = Variable(torch.from_numpy(ds.next_real[epi,
                                                            frame])).cuda()

            loss = loss_function(new_obs, target)
            losses += loss
            loss_epi += loss.clone().cpu().data.numpy()[0]
            diff_epi += F.mse_loss(
                tmp_var(ds.current_real[epi, frame]),
                tmp_var(ds.next_real[epi,
                                     frame])).clone().cpu().data.numpy()[0]

        losses.backward()
        optimizer.step()

        losses.detach_()

        del losses
        del loss

        env.net.hidden[0].detach_()
        env.net.hidden[1].detach_()
        env.net.zero_grad()
        env.net.zero_hidden()
        optimizer.zero_grad()

        exp.metric("loss episode", loss_epi)
        exp.metric("diff episode", diff_epi)
        exp.metric("epoch", epoch)

    saveModel(env.net.state_dict())
def train_reconstruction(train_loader, test_loader, encoder, decoder, args):
    """Train an encoder/decoder pair to reconstruct sentences, with visdom plots.

    Streams the per-batch loss to Hyperdash, plots the per-epoch average
    train loss in visdom, periodically evaluates on test_loader, decays the
    learning rate, saves checkpoints, and persists vocabulary/encoder
    snapshots every 20 epochs.  The Hyperdash experiment is always closed
    via the finally block.
    """
    exp = Experiment("Reconstruction Training")
    #vis = Visualizations()
    vis = visdom.Visdom(port=8098)
    try:
        lr = args.lr
        encoder_opt = torch.optim.Adam(encoder.parameters(), lr=lr)
        decoder_opt = torch.optim.Adam(decoder.parameters(), lr=lr)

        encoder.train()
        decoder.train()
        steps = 0
        all_losses = []
        for epoch in range(1, args.epochs + 1):
            epoch_losses = []
            print("=======Epoch========")
            print(epoch)
            for batch in train_loader:
                feature = batch  # Variable
                if args.use_cuda:
                    # cheap no-op once the models are already on the GPU
                    encoder.cuda()
                    decoder.cuda()
                    feature = feature.cuda()

                encoder_opt.zero_grad()
                decoder_opt.zero_grad()

                h = encoder(feature)
                prob = decoder(h)
                reconstruction_loss = compute_cross_entropy(prob, feature)
                reconstruction_loss.backward()
                encoder_opt.step()
                decoder_opt.step()

                print("Epoch: {}".format(epoch))
                print("Steps: {}".format(steps))
                print("Loss: {}".format(reconstruction_loss.item() /
                                        args.sentence_len))
                exp.metric("Loss",
                           reconstruction_loss.item() / args.sentence_len)

                epoch_losses.append(reconstruction_loss.item())

                # check reconstructed sentence
                # (fires on the very first batch too, since steps starts at 0)
                if steps % args.log_interval == 0:
                    print("Test!!")
                    input_data = feature[0]
                    single_data = prob[0]
                    _, predict_index = torch.max(single_data, 1)
                    input_sentence = transform_id2word(
                        input_data.data,
                        train_loader.dataset.index2word,
                        lang="en")
                    predict_sentence = transform_id2word(
                        predict_index.data,
                        train_loader.dataset.index2word,
                        lang="en")
                    print("Input Sentence:")
                    print(input_sentence)
                    print("Output Sentence:")
                    print(predict_sentence)

                steps += 1

            # Visualization data

            # Average batch loss for the epoch, plotted in visdom.  The
            # window handle `win` is created on the first epoch and reused
            # (appended to) on every later epoch and in eval_reconstruction.
            epoch_loss = sum(epoch_losses) / float(len(epoch_losses))
            all_losses.append(epoch_loss)
            if epoch == 1:
                # vis.plot_loss(np.mean(epoch_losses), steps)
                win = vis.line(X=np.array((epoch, )),
                               Y=np.array((epoch_loss, )),
                               name="train_loss",
                               opts=dict(xlabel='Epoch',
                                         ylabel='Loss',
                                         title='Train and Eval Loss'))
            else:
                vis.line(X=np.array((epoch, )),
                         Y=np.array((epoch_loss, )),
                         name="train_loss",
                         update="append",
                         win=win)
            #epoch_losses.clear()

            if epoch % args.test_interval == 0:
                eval_reconstruction(encoder, decoder, test_loader, args, vis,
                                    win, epoch)

            if epoch % args.lr_decay_interval == 0:
                # decrease learning rate by rebuilding the optimizers
                lr = lr / 1.05
                encoder_opt = torch.optim.Adam(encoder.parameters(), lr=lr)
                decoder_opt = torch.optim.Adam(decoder.parameters(), lr=lr)
                encoder.train()
                decoder.train()

            if epoch % args.save_interval == 0:
                save_models(encoder, args.save_dir, "encoder", steps)
                save_models(decoder, args.save_dir, "decoder", steps)

            if epoch % 20 == 0:
                # finalization
                # save vocabulary
                # NOTE(review): hard-coded absolute paths below -- consider
                # deriving them from args.save_dir instead.
                #with open("word2index", "wb") as w2i, open("index2word", "wb") as i2w:
                #    pickle.dump(train_loader.dataset.word2index, w2i)
                #    pickle.dump(train_loader.dataset.index2word, i2w)
                torch.save(train_loader.dataset.index2word,
                           "/home/avshalom/ext/ae_cnn_code/index2word.pt")
                torch.save(train_loader.dataset.word2index,
                           "/home/avshalom/ext/ae_cnn_code/word2index.pt")

                # save models
                #save_models(encoder, args.save_dir, "encoder", "final")
                #save_models(decoder, args.save_dir, "decoder", "final")
                torch.save(
                    encoder,
                    "/home/avshalom/ext/ae_cnn_code/encoder_lsize_%s_epoch_%s.pt"
                    % (args.latent_size, epoch))

        print("Finish!!!")
    finally:
        # always close the Hyperdash experiment, even on error/interrupt
        exp.end()
def calc_accuricy():
    """Accumulate the loss over the whole test loader and report it.

    Note: despite the name, the reported value is the *summed* test loss,
    not a classification accuracy.  Relies on the module-level
    ``loader_test``, ``net``, ``loss_func`` and ``exp`` objects.
    """
    total_error = 0
    for batch_x, batch_y in loader_test:
        inputs = Variable(batch_x)
        targets = Variable(batch_y)
        outputs = net(inputs)
        batch_loss = loss_func(outputs, targets)  # (nn output, target) order
        total_error += float(batch_loss)

    exp.metric('accuracy', total_error)


# start training: EPOCH full passes over loader_train, logging to hyperdash
for epoch in range(EPOCH):
    exp.metric('epoch', epoch)
    for batch_x, batch_y in loader_train:
        inputs, targets = Variable(batch_x), Variable(batch_y)

        # forward pass
        prediction = net(inputs)
        loss = loss_func(prediction, targets)  # (nn output, target) order

        loss_value = float(loss)
        # only log / evaluate below a loss of 15000 (presumably to skip the
        # unstable start of training -- confirm with the original author)
        if loss_value < 15000:
            exp.metric('loss', loss_value)
            calc_accuricy()

        # backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Exemple #16
0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward

            if not load_checkpoint:
                agent.store_transition(observation, action, reward, observation_, int(done))
                agent.learn()

            observation = observation_
            n_steps += 1
        scores.append(score)
        steps_array.append(n_steps)

        avg_score = np.mean(scores[-100:])

        exp.metric("epsilon", agent.epsilon)
        exp.metric("score", score)

        print('episode ', i, 'score: ', score, 'average score %.1f best score %.1f epsilon %.2f' % (avg_score, best_score, agent.epsilon), 'steps ', n_steps)

        if avg_score > best_score:
            if not load_checkpoint:
                agent.save_models()
            best_score = avg_score
        
        eps_history.append(agent.epsilon)

    plot_learning_curve(steps_array, scores, eps_history, figure_file)
    exp.end()
Exemple #17
0
class NeuralNet(object):
    """Fully-connected policy network over per-planet features (TensorFlow 1.x).

    Builds a 6-layer MLP that, per frame, maps PLANET_MAX_NUM planets
    (PER_PLANET_FEATURES features each) to a softmax distribution over the
    planets.  When ``training=True``, hyperparameters and losses are streamed
    to a hyperdash Experiment.
    """

    # NOTE(review): these constants are never referenced below -- the layer
    # widths are hard-coded in __init__ (512/256/128/64/32/1), and LAYER1_SIZE
    # (522) does not even match the hard-coded 512.  Presumably stale; confirm
    # before relying on them.
    LAYER1_SIZE = 522  # 12
    LAYER2_SIZE = 256  # 6
    LAYER3_SIZE = 128
    LAYER4_SIZE = 64
    LAYER5_SIZE = 32
    OUTPUT_SIZE = 1

    def __init__(self,
                 name='nn-model',
                 cached_model=None,
                 seed=None,
                 lr=1e-4,
                 training=False):
        """Build the TF graph and session.

        :param name: hyperdash experiment name (only used when training)
        :param cached_model: checkpoint path to restore; None initializes fresh
        :param seed: optional graph-level random seed for reproducibility
        :param lr: Adam learning rate
        :param training: when True, log the lr param and losses to hyperdash
        """
        self.graph = tf.Graph()
        self.training = training
        if self.training:
            # imported lazily so inference-only deployments don't need hyperdash
            from hyperdash import Experiment

            self.exp = Experiment(name)

        with self.graph.as_default():
            if seed is not None:
                tf.set_random_seed(seed)
            self.session = tf.Session()
            # input: one row of PER_PLANET_FEATURES per planet, per frame
            self.features = tf.placeholder(dtype=tf.float32,
                                           name="input_features",
                                           shape=(None, PLANET_MAX_NUM,
                                                  PER_PLANET_FEATURES))
            # target_distribution describes what the bot did in a real game.
            # For instance, if it sent 20% of the ships to the first planet and 15% of the ships to the second planet,
            # then expected_distribution = [0.2, 0.15 ...]
            self.target_distribution = tf.placeholder(
                dtype=tf.float32,
                name="target_distribution",
                shape=(None, PLANET_MAX_NUM))
            # Combine all the planets from all the frames together, so it's easier to share
            # the weights and biases between them in the network.
            flattened_frames = tf.reshape(self.features,
                                          [-1, PER_PLANET_FEATURES])

            # per-planet MLP; widths are hard-coded (see class-level NOTE)
            layer1 = fully_connected(flattened_frames, 512)
            layer2 = fully_connected(layer1, 256)
            layer3 = fully_connected(layer2, 128)
            # Group back into frames
            layer4 = fully_connected(layer3, 64)
            layer5 = fully_connected(layer4, 32)
            layer6 = fully_connected(layer5, 1, activation_fn=None)
            logits = tf.reshape(layer6, [-1, PLANET_MAX_NUM])

            # softmax over planets: fraction of ships to send to each planet
            self.prediction_normalized = tf.nn.softmax(logits)
            self.loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits, labels=self.target_distribution))

            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=lr)  # returns Op

            self.train_op = self.optimizer.minimize(self.loss_op)

            # self.acc_op = tf.reduce_mean(tf.reduce_min(tf.cast(self.prediction_normalized, tf.float32), 1))
            # self.acc, self.update_acc_op = tf.metrics.mean_per_class_accuracy(self.target_distribution, self.prediction_normalized, 28)
            # multilabel_accuracy(self.prediction_normalized, self.target_distribution)
            self.saver = tf.train.Saver()
            if self.training:
                self.exp.param("lr", lr)
            if cached_model is None:
                # fresh run: initialize all variables
                self.session.run([
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()
                ])
            else:
                # restore global variables from the checkpoint instead
                self.session.run(tf.local_variables_initializer())
                self.saver.restore(self.session, cached_model)

    def fit(self, input_data, expected_output_data):
        """Run one optimization step and return the batch training loss.

        :param input_data: numpy array (frames, PLANET_MAX_NUM, PER_PLANET_FEATURES)
        :param expected_output_data: numpy array (frames, PLANET_MAX_NUM)
        :return: scalar training loss for this batch
        """
        loss, _ = self.session.run(
            [self.loss_op, self.train_op],
            feed_dict={
                self.features: normalize_input(input_data),
                self.target_distribution: expected_output_data
            })

        if self.training:
            self.exp.metric("training_loss", loss)
        return loss

    def predict(self, input_data):
        """
        Given data from 1 frame, predict where the ships should be sent.

        :param input_data: numpy array of shape (PLANET_MAX_NUM, PER_PLANET_FEATURES)
        :return: 1-D numpy array of length (PLANET_MAX_NUM) describing percentage of ships
        that should be sent to each planet
        """
        # wrap the single frame in a batch dimension, then unwrap the result
        return self.session.run(
            self.prediction_normalized,
            feed_dict={self.features:
                       normalize_input(np.array([input_data]))})[0]

    def compute_loss(self, input_data, expected_output_data):
        """
        Compute loss on the input data without running any training.

        :param input_data: numpy array of shape (number of frames, PLANET_MAX_NUM, PER_PLANET_FEATURES)
        :param expected_output_data: numpy array of shape (number of frames, PLANET_MAX_NUM)
        :return: training loss on the input data
        """
        loss = self.session.run(self.loss_op,
                                feed_dict={
                                    self.features: normalize_input(input_data),
                                    self.target_distribution:
                                    expected_output_data
                                })
        if self.training:
            self.exp.metric("val_loss", loss)
        return loss

    def save(self, path):
        """
        Serializes this neural net to given path.
        :param path:
        """
        self.saver.save(self.session, path)
Exemple #18
0
    # idn_u_stars = idn_data["idn_u_stars"]

    # idn_u_preds, idn_f_preds = model.idn_predict(idn_t_stars, idn_x_stars)
    # idn_error_us = [
    #     np.linalg.norm(star - pred, 2) / np.linalg.norm(star, 2)
    #     for star, pred in zip(idn_u_stars, idn_u_preds)
    # ]
    # for i, e in enumerate(idn_error_us):
    #     model.logger.info(f"Data{i} Error u: {e:.3e}")

    sol_t_star = sol_data["sol_t_star"]
    sol_x_star = sol_data["sol_x_star"]
    sol_u_star = sol_data["sol_u_star"]

    model.train_solver(args.niter * 2, args.scipyopt)
    u_pred, f_pred = model.solver_predict(sol_t_star, sol_x_star)
    error_u = np.linalg.norm(sol_u_star - u_pred, 2) / np.linalg.norm(
        sol_u_star, 2)
    model.logger.info(f"Error u: {error_u:.3e}")
    exp.metric("Error u", error_u)

    sol_X_star = sol_data["sol_X_star"]
    sol_T = sol_data["sol_T"]
    sol_X = sol_data["sol_X"]
    sol_exact = sol_data["sol_exact"]
    U_pred = griddata(sol_X_star,
                      u_pred.flatten(), (sol_T, sol_X),
                      method="cubic")
    plt_saver(U_pred, sol_exact, sol_lb, sol_ub, figurename)
    exp.end()
Exemple #19
0
            loss += criterion_contrast(internals[0], internals[1 + answers[b]],
                                       0)

            # add contrastive loss for the non-matching answers
            loss += criterion_contrast(internals[0],
                                       internals[1 + other_answers[0]], 1)
            loss += criterion_contrast(internals[0],
                                       internals[1 + other_answers[1]], 1)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        exp.metric("epoch", epoch)
        exp.metric("loss", loss.item())

        if (i + 1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                epoch + 1, EPOCHS, i + 1, total_step, loss.item()))

    # Save the model checkpoint
    torch.save(
        model.state_dict(),
        'model-siam2-s{}-e{}-b{}-lr{}.ckpt'.format(SIZE, EPOCHS, BATCH,
                                                   LEARNING_RATE))

# # Test the model
# model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
# with torch.no_grad():
Exemple #20
0
        baseline = np.mean(rewards_raw[:k] + rewards_raw[k + 1:])
        rewards.append(rewards_raw[k] - baseline)

    # calculate additional VAE loss
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * torch.sum(1 + torch.log(latent_stddev.pow(2)) -
                           latent_mu.pow(2) - latent_stddev.pow(2))

    # update model
    returns = torch.Tensor(rewards).to(device)
    returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = []
    for log_prob, R in zip(log_probs, returns):
        policy_loss.append(-log_prob * R)

    optimizer.zero_grad()
    policy_loss_sum = torch.cat(policy_loss).sum() + KLD
    loss_copy = policy_loss_sum.detach().cpu().numpy().copy()
    policy_loss_sum.backward()
    optimizer.step()

    if i_episode % CHKP_FREQ == 0:
        torch.save(
            policy.state_dict(),
            os.path.join(exp_dir, 'reinforce-' + str(i_episode) + '.pkl'))

    exp.metric("episode", i_episode)
    exp.metric("rewards", np.mean(rewards_raw))
    exp.metric("loss", float(loss_copy))
Exemple #21
0
        opt_dec.zero_grad()


        z, mean = encoder(inputs)
        z, mu, logvar = ca(z)
        recon_batch = decoder(z)

        loss = loss_function(recon_batch, inputs, mu, logvar)
        loss.backward()


        opt_enc.step()
        opt_ca.step()
        opt_dec.step()

        exp.metric('loss', loss.item())

        end_t = time.time()
        if idx % 15 == 0:
          print('''[%d/%d][%d/%d] Loss: %.2f Time: %.2fs''' % (i, args.epochs, idx, len(loader), loss.item(), end_t - start_t))
          e_time = 100 * len(loader) * (end_t - start_t) / 60 / 60
          print(e_time, "h")

    if i % 3 == 0:
        show_plts_mono(inputs[0],recon_batch[0], f"/content/drive/My Drive/RMSprop/vae_16/images/reconstruction_epoch{i}_1.png")
        show_plts_mono(inputs[1],recon_batch[1], f"/content/drive/My Drive/RMSprop/vae_16/images/reconstruction_epoch{i}_2.png")
        show_plts_mono(inputs[2],recon_batch[2], f"/content/drive/My Drive/RMSprop/vae_16/images/reconstruction_epoch{i}_3.png")
        torch.save(encoder.module.state_dict(), f"/content/drive/My Drive/RMSprop/vae_16/models/encoder_epoch{i}_{idx}.pth")
        torch.save(ca.module.state_dict(), f"/content/drive/My Drive/RMSprop/vae_16/models/ca_epoch{i}_{idx}.pth")
        torch.save(decoder.module.state_dict(), f"/content/drive/My Drive/RMSprop/vae_16/models/decoder_epoch{i}_{idx}.pth")
Exemple #22
0
def train(train_list,
          test_list,
          lr,
          epoch,
          batchsize,
          insize,
          outsize,
          save_interval=10,
          weight_decay=5e-4,
          lr_step=10,
          model_name='resnet34',
          loss_name='focal_loss',
          metric_name='arc_margin',
          optim_name='adam',
          num_workers=4,
          print_freq=1e+6,
          debug=False):
    """Train a face-embedding backbone with a margin-based metric head.

    Builds train/test loaders from ``train_list``/``test_list``, constructs
    the backbone named by ``model_name`` and the metric layer named by
    ``metric_name``, then trains for ``epoch`` epochs, logging per-epoch
    metrics to a CSV history file and a hyperdash experiment, and saving
    checkpoints every ``save_interval`` epochs and after the final epoch.

    :param train_list: sample list consumed by ``Dataset`` in 'train' mode
    :param test_list: sample list consumed by ``Dataset`` in 'test' mode
    :param lr: initial learning rate
    :param epoch: number of training epochs
    :param batchsize: mini-batch size for both loaders
    :param insize: model input size
    :param outsize: embedding dimension produced by the backbone
    :param save_interval: checkpoint frequency in epochs
    :param weight_decay: L2 penalty for the optimizer
    :param lr_step: StepLR period (NOTE: the scheduler is never stepped)
    :param model_name: resnet18/34/50/101/152, 'shuffle' or 'simplev1'
    :param loss_name: 'focal_loss', else cross-entropy is used
    :param metric_name: 'add_margin', 'arc_margin', 'sphere', else linear head
    :param optim_name: 'sgd' or 'adam'
    :param num_workers: DataLoader worker processes
    :param print_freq: iteration interval for progress printing
    :param debug: propagate debug flag to datasets and force printing
    :raises ValueError: for an unknown ``model_name``
    """

    device = torch.device("cuda")

    # ----- data ------------------------------------------------------------
    train_dataset = Dataset(train_list,
                            mode='train',
                            insize=insize,
                            debug=debug)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=batchsize,
                                              shuffle=True,
                                              num_workers=num_workers)
    test_dataset = Dataset(test_list, mode='test', insize=insize, debug=debug)
    testloader = torch.utils.data.DataLoader(test_dataset,
                                             batch_size=batchsize,
                                             shuffle=False,
                                             num_workers=num_workers)
    class_num = train_dataset.get_classnum()

    print('{} train iters per epoch:'.format(len(trainloader)))
    print('{} test iters per epoch:'.format(len(testloader)))

    # ----- loss ------------------------------------------------------------
    if loss_name == 'focal_loss':
        criterion = FocalLoss(gamma=2)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    # ----- backbone --------------------------------------------------------
    if model_name == 'resnet18':
        model = resnet_face18(insize, outsize)
    elif model_name == 'resnet34':
        model = resnet34(insize, outsize)
    elif model_name == 'resnet50':
        model = resnet50(insize, outsize)
    elif model_name == 'resnet101':
        model = resnet101(insize, outsize)
    elif model_name == 'resnet152':
        model = resnet152(insize, outsize)
    elif model_name == 'shuffle':
        model = ShuffleFaceNet(outsize)
    elif model_name == 'simplev1':
        model = CNNv1(insize, outsize, activation='relu', kernel_pattern='v1')
    else:
        raise ValueError('Invalid model name: {}'.format(model_name))

    # ----- metric head -----------------------------------------------------
    if metric_name == 'add_margin':
        metric_fc = AddMarginProduct(outsize, class_num, s=30, m=0.35)
    elif metric_name == 'arc_margin':
        metric_fc = ArcMarginProduct(outsize,
                                     class_num,
                                     s=30,
                                     m=0.5,
                                     easy_margin=False)
    elif metric_name == 'sphere':
        metric_fc = SphereProduct(outsize, class_num, m=4)
    else:
        metric_fc = nn.Linear(outsize, class_num)

    # view_model(model, opt.input_shape)
    print(model)
    model.to(device)
    model = DataParallel(model)
    metric_fc.to(device)
    metric_fc = DataParallel(metric_fc)

    assert optim_name in ['sgd', 'adam']
    if optim_name == 'sgd':
        optimizer = torch.optim.SGD([{
            'params': model.parameters()
        }, {
            'params': metric_fc.parameters()
        }],
                                    lr=lr,
                                    weight_decay=weight_decay)
    elif optim_name == 'adam':
        optimizer = torch.optim.Adam([{
            'params': model.parameters()
        }, {
            'params': metric_fc.parameters()
        }],
                                     lr=lr,
                                     weight_decay=weight_decay)
    # NOTE: the scheduler is created but never stepped, so the learning rate
    # stays constant for the whole run.  Kept as-is to preserve behavior.
    scheduler = StepLR(optimizer, step_size=lr_step, gamma=0.1)

    start = time.time()
    training_id = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    hyperdash_exp = Experiment(training_id)
    checkpoints_dir = os.path.join('logs', training_id)
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)
    logging_path = os.path.join(checkpoints_dir, 'history.csv')

    # Persist the full run configuration (hyperdash params + JSON file).
    config = {}
    config['train_list'] = train_list
    config['test_list'] = test_list
    config['lr'] = lr
    config['epoch'] = epoch
    config['batchsize'] = batchsize
    config['insize'] = insize
    config['outsize'] = outsize
    config['save_interval'] = save_interval
    config['weight_decay'] = weight_decay
    config['lr_step'] = lr_step
    config['model_name'] = model_name
    config['loss_name'] = loss_name
    config['metric_name'] = metric_name
    config['optim_name'] = optim_name
    config['num_workers'] = num_workers
    config['debug'] = debug
    for k, v in config.items():
        hyperdash_exp.param(k, v, log=False)
    with open(os.path.join(checkpoints_dir, 'train_config.json'), 'w') as f:
        json.dump(config, f, indent=4)

    with open(logging_path, 'w') as f:
        f.write('epoch,time_elapsed,train_loss,train_acc,test_loss,test_acc\n')

    prev_time = datetime.datetime.now()
    try:
        for i in range(epoch):
            # --- one training epoch ---
            model.train()
            for ii, data in enumerate(tqdm(trainloader, disable=True)):
                data_input, label = data
                data_input = data_input.to(device)
                label = label.to(device).long()
                feature = model(data_input)
                output = metric_fc(feature, label)
                loss = criterion(output, label)
                pred_classes = np.argmax(output.data.cpu().numpy(), axis=1)
                acc = np.mean(
                    (pred_classes == label.data.cpu().numpy()).astype(int))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                iters = i * len(trainloader) + ii

                if iters % print_freq == 0 or debug:
                    speed = print_freq / (time.time() - start)
                    time_str = time.asctime(time.localtime(time.time()))
                    print('{} train epoch {} iter {} {} iters/s loss {} acc {}'.
                          format(time_str, i, ii, speed, loss.item(), acc))

                    start = time.time()

            # --- evaluation (inference only; no_grad saves memory, values
            # are identical) ---
            model.eval()
            with torch.no_grad():
                for ii, data in enumerate(tqdm(testloader, disable=True)):
                    data_input, label = data
                    data_input = data_input.to(device)
                    label = label.to(device).long()
                    feature = model(data_input)
                    output = metric_fc(feature, label)
                    test_loss = criterion(output, label)
                    output = np.argmax(output.data.cpu().numpy(), axis=1)
                    test_acc = np.mean(
                        (output == label.data.cpu().numpy()).astype(int))
            # NOTE(review): test_loss/test_acc reflect only the LAST test
            # batch, not an average over the test set -- confirm intent.

            # BUGFIX: was "i == epoch", which is never true for
            # i in range(epoch); the final epoch's checkpoint could be lost.
            if i % save_interval == 0 or i == epoch - 1:
                save_model(model.module, checkpoints_dir, model_name, i)
                save_model(metric_fc.module, checkpoints_dir, metric_name, i)

            new_time = datetime.datetime.now()
            with open(logging_path, 'a') as f:
                f.write('{},{},{},{},{},{}\n'.format(
                    i, (new_time - prev_time).total_seconds(), loss.item(), acc,
                    test_loss.item(), test_acc))
            prev_time = datetime.datetime.now()

            hyperdash_exp.metric('train_loss', loss.item(), log=False)
            hyperdash_exp.metric('train_acc', acc, log=False)
            hyperdash_exp.metric('test_loss', test_loss.item(), log=False)
            hyperdash_exp.metric('test_acc', test_acc, log=False)
    finally:
        # Always close the hyperdash experiment, even if training raises.
        hyperdash_exp.end()
    print('Finished {}'.format(training_id))
# digits.py
# Minimal hyperdash example: train an SVM on the sklearn digits dataset and
# log a hyperparameter plus the held-out accuracy to a hyperdash experiment.
from sklearn import svm, datasets
from hyperdash import Experiment

# Preprocess data: hold out the last `test_cases` samples for evaluation.
# BUGFIX: load_digits(100) is invalid -- the first parameter is n_class,
# which must be between 1 and 10 (and is keyword-only in recent sklearn),
# so the call raised instead of loading data.
digits = datasets.load_digits()
test_cases = 50
X_train, y_train = digits.data[:-test_cases], digits.target[:-test_cases]
X_test, y_test = digits.data[-test_cases:], digits.target[-test_cases:]

# Create an experiment with a model name, then autostart
exp = Experiment("Digits Classifier")
# Record the value of hyperparameter gamma for this experiment
gamma = exp.param("gamma", 0.1)
# Param can record any basic type (Number, Boolean, String)

classifier = svm.SVC(gamma=gamma)
classifier.fit(X_train, y_train)

# Record a numerical performance metric
exp.metric("accuracy", classifier.score(X_test, y_test))

# Cleanup and mark that the experiment successfully completed
exp.end()
                loss.detach_()
                net.hidden[0].detach_()
                net.hidden[1].detach_()
                net.zero_grad()
                net.zero_hidden()
                optimizer.zero_grad()
                loss_buffer.zero_()

            del loss_concat

            x_buf = []
            y_buf = []
            epi_x_old = epi_x

            if exp is not None:
                exp.metric("loss episode la", loss_episode_la)
                exp.metric("loss episode lb", loss_episode_lb)
                exp.metric("loss episode cc", loss_episode_cc)
                exp.metric("diff episode", diff_episode)
                exp.metric("epoch", epoch)

        x_buf.append(x.squeeze(0))
        y_buf.append(y.squeeze(0))


    # Validation step
    loss_total_la = []
    loss_total_lb = []
    loss_total_cc = []
    diff_total = []
Exemple #25
0
class GAN(object):
    """DCGAN-style training harness.

    __init__ wires up the run directory (tensorboard, checkpoints, CSV logs),
    optional hyperdash logging, the dataloader, and the Generator /
    Discriminator with their Adam optimizers; train() runs the adversarial
    loop and logs losses to tensorboard, CSV and (optionally) hyperdash.
    """
    def __init__(self):
        """Build the run directory tree, loggers, data pipeline and models.

        All configuration comes from ``get_args()``; no parameters.
        """
        warnings.filterwarnings('ignore')
        self.start_time = time()

        self.args = get_args()
        # reuse an existing run directory when resuming from a checkpoint,
        # otherwise name a fresh one after the current timestamp
        if self.args.checkpoint_dir_name:
            dir_name = self.args.checkpoint_dir_name
        else:
            dir_name = datetime.datetime.now().strftime('%y%m%d%H%M%S')
        self.path_to_dir = Path(__file__).resolve().parents[1]
        self.path_to_dir = os.path.join(self.path_to_dir, *['log', dir_name])
        os.makedirs(self.path_to_dir, exist_ok=True)

        # tensorboard
        path_to_tensorboard = os.path.join(self.path_to_dir, 'tensorboard')
        os.makedirs(path_to_tensorboard, exist_ok=True)
        self.writer = SummaryWriter(path_to_tensorboard)

        # model saving
        # NOTE(review): path_to_model is computed but never used in this
        # class -- presumably leftover from a save/load feature; confirm.
        os.makedirs(os.path.join(self.path_to_dir, 'model'), exist_ok=True)
        path_to_model = os.path.join(self.path_to_dir, *['model', 'model.tar'])

        # csv
        os.makedirs(os.path.join(self.path_to_dir, 'csv'), exist_ok=True)
        self.path_to_results_csv = os.path.join(self.path_to_dir,
                                                *['csv', 'results.csv'])
        path_to_args_csv = os.path.join(self.path_to_dir, *['csv', 'args.csv'])
        # only dump args on a fresh run (a resumed run already has args.csv)
        if not self.args.checkpoint_dir_name:
            with open(path_to_args_csv, 'a') as f:
                args_dict = vars(self.args)
                param_writer = csv.DictWriter(f, list(args_dict.keys()))
                param_writer.writeheader()
                param_writer.writerow(args_dict)

        # logging by hyperdash
        if not self.args.no_hyperdash:
            from hyperdash import Experiment
            self.exp = Experiment('Generation task on ' + self.args.dataset +
                                  ' dataset with GAN')
            # mirror every CLI arg into hyperdash as a param.
            # NOTE(review): exec on interpolated attribute names is fragile;
            # setattr(self.args, key, self.exp.param(key, ...)) would do the
            # same without exec -- left unchanged to preserve behavior.
            for key in vars(self.args).keys():
                exec("self.args.%s = self.exp.param('%s', self.args.%s)" %
                     (key, key, key))
        else:
            self.exp = None

        self.dataloader = get_dataloader(self.args.dataset,
                                         self.args.image_size,
                                         self.args.batch_size)
        # peek at one batch to discover the image channel count
        sample_data = self.dataloader.__iter__().__next__()[0]
        image_channels = sample_data.shape[1]

        # fixed latent batch, reshaped to NCHW, reused for sample plotting
        z = torch.randn(self.args.batch_size, self.args.z_dim)
        self.sample_z = z.view(z.size(0), z.size(1), 1, 1)

        self.Generator = Generator(self.args.z_dim, image_channels,
                                   self.args.image_size)
        self.Generator_optimizer = optim.Adam(self.Generator.parameters(),
                                              lr=self.args.lr_Generator,
                                              betas=(self.args.beta1,
                                                     self.args.beta2))
        self.writer.add_graph(self.Generator, self.sample_z)
        self.Generator.to(self.args.device)

        self.Discriminator = Discriminator(image_channels,
                                           self.args.image_size)
        self.Discriminator_optimizer = optim.Adam(
            self.Discriminator.parameters(),
            lr=self.args.lr_Discriminator,
            betas=(self.args.beta1, self.args.beta2))
        self.writer.add_graph(self.Discriminator, sample_data)
        self.Discriminator.to(self.args.device)

        self.BCELoss = nn.BCELoss()

        self.sample_z = self.sample_z.to(self.args.device)

    def train(self):
        """Run the adversarial training loop for ``args.n_epoch`` epochs.

        Each iteration updates the Discriminator on a real and a fake batch,
        then updates the Generator with the non-saturating loss, logging both
        losses to tensorboard, the results CSV and (optionally) hyperdash,
        and plotting samples every 10 batches.
        """
        # train_hist holds only the most recent batch's losses (one CSV row
        # is written per batch)
        self.train_hist = {}
        self.train_hist['Generator_loss'] = 0.0
        self.train_hist['Discriminator_loss'] = 0.0

        # real ---> y = 1
        # fake ---> y = 0
        self.y_real = torch.ones(self.args.batch_size, 1).to(self.args.device)
        self.y_fake = torch.zeros(self.args.batch_size, 1).to(self.args.device)

        self.Discriminator.train()

        global_step = 0
        #  -----training -----
        for epoch in range(1, self.args.n_epoch + 1):
            self.Generator.train()
            for idx, (x, _) in enumerate(self.dataloader):
                # drop the trailing partial batch (label tensors are sized
                # for a full batch)
                if idx == self.dataloader.dataset.__len__(
                ) // self.args.batch_size:
                    break

                z = torch.randn(self.args.batch_size, self.args.z_dim)
                z = z.view(z.size(0), z.size(1), 1, 1)
                z = z.to(self.args.device)
                x = x.to(self.args.device)

                # ----- update Discriminator -----
                # minimize: -{ log[D(x)] + log[1-D(G(z))] }
                self.Discriminator_optimizer.zero_grad()
                # real
                # ---> log[D(x)]
                Discriminator_real, _ = self.Discriminator(x)
                Discriminator_real_loss = self.BCELoss(Discriminator_real,
                                                       self.y_real)
                # fake
                # ---> log[1-D(G(z))]
                Discriminator_fake, _ = self.Discriminator(self.Generator(z))
                Discriminator_fake_loss = self.BCELoss(Discriminator_fake,
                                                       self.y_fake)

                Discriminator_loss = Discriminator_real_loss + Discriminator_fake_loss
                self.train_hist[
                    'Discriminator_loss'] = Discriminator_loss.item()

                Discriminator_loss.backward()
                self.Discriminator_optimizer.step()

                # ----- update Generator -----
                # As stated in the original paper,
                # we want to train the Generator
                # by minimizing log(1−D(G(z)))
                # in an effort to generate better fakes.
                # As mentioned, this was shown by Goodfellow
                # to not provide sufficient gradients,
                # especially early in the learning process.
                # As a fix, we instead wish to maximize log(D(G(z))).
                # ---> minimize: -log[D(G(z))]

                self.Generator_optimizer.zero_grad()
                Discriminator_fake, _ = self.Discriminator(self.Generator(z))
                Generator_loss = self.BCELoss(Discriminator_fake, self.y_real)
                self.train_hist['Generator_loss'] = Generator_loss.item()
                Generator_loss.backward()
                self.Generator_optimizer.step()

                # ----- logging by tensorboard, csv and hyperdash
                # tensorboard
                self.writer.add_scalar('loss/Generator_loss',
                                       Generator_loss.item(), global_step)
                self.writer.add_scalar('loss/Discriminator_loss',
                                       Discriminator_loss.item(), global_step)
                # csv
                # NOTE(review): the CSV file is re-opened for every batch;
                # opening it once per epoch (or per run) would avoid the
                # repeated open/close overhead -- left as-is.
                with open(self.path_to_results_csv, 'a') as f:
                    result_writer = csv.DictWriter(
                        f, list(self.train_hist.keys()))
                    if epoch == 1 and idx == 0: result_writer.writeheader()
                    result_writer.writerow(self.train_hist)
                # hyperdash
                if self.exp:
                    self.exp.metric('Generator loss', Generator_loss.item())
                    self.exp.metric('Discriminator loss',
                                    Discriminator_loss.item())

                if (idx % 10) == 0:
                    self._plot_sample(global_step)
                global_step += 1

        elapsed_time = time() - self.start_time
        print('\nTraining Finish, elapsed time ---> %f' % (elapsed_time))

    def _plot_sample(self, global_step):
        """Render a grid of images from the fixed ``sample_z`` batch and log
        the figure to tensorboard under the given ``global_step``."""
        with torch.no_grad():
            total_n_sample = min(self.args.n_sample, self.args.batch_size)
            image_frame_dim = int(np.floor(np.sqrt(total_n_sample)))
            samples = self.Generator(self.sample_z)
            # NHWC for matplotlib; rescale tanh output from [-1, 1] to [0, 1]
            samples = samples.cpu().data.numpy().transpose(0, 2, 3, 1)
            samples = (samples + 1) / 2
            fig = plt.figure(figsize=(24, 15))
            for i in range(image_frame_dim * image_frame_dim):
                ax = fig.add_subplot(
                    image_frame_dim,
                    image_frame_dim * 2,
                    (int(i / image_frame_dim) + 1) * image_frame_dim + i + 1,
                    xticks=[],
                    yticks=[])
                if samples[i].shape[2] == 3:
                    ax.imshow(samples[i])
                else:
                    ax.imshow(samples[i][:, :, 0], cmap='gray')
            self.writer.add_figure('sample images generated by GAN', fig,
                                   global_step)
Exemple #26
0
def main():
    """Train and evaluate a CNN on CIFAR10 with TensorBoard/CSV/hyperdash logging.

    Creates (or resumes into) a checkpoint directory under ``../log``, records
    the run arguments to CSV on the first run, trains for ``args.n_epoch``
    epochs, and checkpoints the model and optimizer after every epoch.
    """
    start_time = time()
    args = get_args()
    # Reuse the directory when resuming; otherwise make a timestamped one.
    if args.checkpoint_dir_name:
        dir_name = args.checkpoint_dir_name
    else:
        dir_name = datetime.datetime.now().strftime('%y%m%d%H%M%S')
    path_to_dir = Path(__file__).resolve().parents[1]
    path_to_dir = os.path.join(path_to_dir, 'log', dir_name)
    os.makedirs(path_to_dir, exist_ok=True)
    # tensorboard
    path_to_tensorboard = os.path.join(path_to_dir, 'tensorboard')
    os.makedirs(path_to_tensorboard, exist_ok=True)
    writer = SummaryWriter(path_to_tensorboard)
    # model saving
    os.makedirs(os.path.join(path_to_dir, 'model'), exist_ok=True)
    path_to_model = os.path.join(path_to_dir, 'model', 'model.tar')
    # csv
    os.makedirs(os.path.join(path_to_dir, 'csv'), exist_ok=True)
    path_to_results_csv = os.path.join(path_to_dir, 'csv', 'results.csv')
    path_to_args_csv = os.path.join(path_to_dir, 'csv', 'args.csv')
    # Only record the arguments for a fresh run (a resumed run keeps the old file).
    if not args.checkpoint_dir_name:
        with open(path_to_args_csv, 'a') as f:
            args_dict = vars(args)
            param_writer = csv.DictWriter(f, list(args_dict.keys()))
            param_writer.writeheader()
            param_writer.writerow(args_dict)

    # logging using hyperdash
    if not args.no_hyperdash:
        from hyperdash import Experiment
        exp = Experiment('Classification task on CIFAR10 dataset with CNN')
        # Register every CLI argument as a hyperdash parameter.
        # setattr/getattr replaces the original exec() string-eval: same
        # effect, without dynamically executing generated code.
        for key in vars(args).keys():
            setattr(args, key, exp.param(key, getattr(args, key)))
    else:
        exp = None

    path_to_dataset = os.path.join(
        Path(__file__).resolve().parents[2], 'datasets')
    os.makedirs(path_to_dataset, exist_ok=True)
    train_loader, eval_loader, classes = get_loader(
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        path_to_dataset=path_to_dataset)

    # show some of the training images, for fun.
    dataiter = iter(train_loader)
    # DataLoader iterators dropped the .next() method; use the builtin next().
    images, labels = next(dataiter)
    img_grid = torchvision.utils.make_grid(images)
    matplotlib_imshow(img_grid)
    writer.add_image('four_CIFAR10_images', img_grid)

    # define a network, loss function and optimizer
    model = CNN()
    writer.add_graph(model, images)
    model = torch.nn.DataParallel(model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)
    start_epoch = 0
    # resume training
    if args.checkpoint_dir_name:
        print('\nLoading the model...')
        checkpoint = torch.load(path_to_model)
        # load_state_dict() actually restores the weights; the original called
        # state_dict() which just returns the current (untrained) parameters.
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    summary(model, input_size=(3, 32, 32))
    model.to(args.device)

    # train the network
    print('\n--------------------')
    print('Start training and evaluating the CNN')
    for epoch in range(start_epoch, args.n_epoch):
        start_time_per_epoch = time()
        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, args.device, writer, epoch,
                                      classes)
        eval_loss, eval_acc = eval(eval_loader, model, criterion, args.device)
        elapsed_time_per_epoch = time() - start_time_per_epoch
        result_dict = {
            'epoch': epoch,
            'train_loss': train_loss,
            'eval_loss': eval_loss,
            'train_acc': train_acc,
            'eval_acc': eval_acc,
            'elapsed time': elapsed_time_per_epoch
        }
        with open(path_to_results_csv, 'a') as f:
            result_writer = csv.DictWriter(f, list(result_dict.keys()))
            # Header only once, at the very start of a fresh run.
            if epoch == 0: result_writer.writeheader()
            result_writer.writerow(result_dict)
        # checkpoint every epoch so training can be resumed from the last one
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, path_to_model)
        if exp:
            exp.metric('train loss', train_loss)
            exp.metric('eval loss', eval_loss)
            exp.metric('train acc', train_acc)
            exp.metric('eval acc', eval_acc)
        else:
            print(result_dict)

        writer.add_scalar('loss/train_loss', train_loss,
                          epoch * len(train_loader))
        writer.add_scalar('loss/eval_loss', eval_loss,
                          epoch * len(eval_loader))
        writer.add_scalar('acc/train_acc', train_acc,
                          epoch * len(train_loader))
        writer.add_scalar('acc/eval_acc', eval_acc, epoch * len(eval_loader))

    elapsed_time = time() - start_time
    print('\nFinished Training, elapsed time ===> %f' % elapsed_time)
    if exp:
        exp.end()
    writer.close()
# Example #27
class BaseTrainer(_BaseTrainer):
    """ Base trainer to make pytorch training be easier.

    Args:
        data-augmentation (bool): Crop randomly and add random noise for data augmentation.
        epoch (int): Number of epochs to train.
        opt (str): Optimization method.
        gpu (bool): Use GPU.
        seed (str): Random seed to train.
        train (str): Path to training image-pose list file.
        val (str): Path to validation image-pose list file.
        batchsize (int): Learning minibatch size.
        out (str): Output directory.
        resume (str): Initialize the trainer from given file.
            The file name is 'epoch-{epoch number}.iter'.
        resume_model (str): Load model definition file to use for resuming training
            (it\'s necessary when you resume a training).
            The file name is 'epoch-{epoch number}.model'.
        resume_opt (str): Load optimization states from this file
            (it\'s necessary when you resume a training).
            The file name is 'epoch-{epoch number}.state'.
    """

    def __init__(self, **kwargs):
        self.data_augmentation = kwargs['data_augmentation']
        self.epoch = kwargs['epoch']
        self.gpu = (kwargs['gpu'] >= 0)
        self.opt = kwargs['opt']
        self.seed = kwargs['seed']
        self.train = kwargs['train']
        self.val = kwargs['val']
        self.batchsize = kwargs['batchsize']
        self.out = kwargs['out']
        self.resume = kwargs['resume']
        self.resume_model = kwargs['resume_model']
        self.resume_opt = kwargs['resume_opt']
        self.hyperdash = kwargs['hyperdash']
        if self.hyperdash:
            # Mirror every constructor argument as a hyperdash parameter.
            self.experiment = Experiment(self.hyperdash)
            for key, val in kwargs.items():
                self.experiment.param(key, val)
        # validate arguments.
        self._validate_arguments()
        # 0 is a sentinel meaning "no validation loss recorded yet".
        self.lowest_loss = 0
        self.device = torch.device('cuda' if kwargs['gpu'] >= 0 else 'cpu')
        self.dataloader = torch.utils.data.DataLoader

    def _validate_arguments(self):
        """ Raise if the argument combination is unsupported or files are missing. """
        if self.seed is not None and self.data_augmentation:
            raise NotSupportedError('It is not supported to fix random seed for data augmentation.')
        if self.gpu and not torch.cuda.is_available():
            raise GPUNotFoundError('GPU is not found.')
        if self.opt not in ('MomentumSGD', 'Adam'):
            raise UnknownOptimizationMethodError(
                '{0} is unknown optimization method.'.format(self.opt))
        if self.resume is not None:
            for path in (self.resume, self.resume_model, self.resume_opt):
                if not os.path.isfile(path):
                    raise FileNotFoundError('{0} is not found.'.format(path))

    # TODO: make it acceptable multiple optimizer, or define out of this trainer.
    def _get_optimizer(self, model, **kwargs):
        """ Build the optimizer named by ``self.opt`` for *model*'s parameters. """
        if self.opt == 'MomentumSGD':
            optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
        elif self.opt == "Adam":
            optimizer = optim.Adam(model.parameters())
        else:
            # getattr raises AttributeError for unknown optimizer names; the
            # original caught a nonexistent exception type and could fall
            # through to an unbound `optimizer`. Optimizers also require the
            # parameter list as their first argument.
            try:
                optimizer = getattr(optim, self.opt)(model.parameters(), **kwargs)
            except AttributeError as err:
                raise UnknownOptimizationMethodError(
                    'This optim is not available. See https://pytorch.org/docs/stable/optim.html') from err
        return optimizer

    def forward(self, batch, model, criterion, isTest=False):
        """ Run one forward pass over *batch* and return the criterion loss.

        ``isTest`` is accepted because _train/_test pass it explicitly; it is
        unused here, but subclasses may branch on it.
        """
        data, target = map(lambda d: d.to(self.device), batch)
        output = model(data)
        loss = criterion(output, target)
        return loss

    def _train(self, model, optimizer, criterion, train_iter, logger, start_time, log_interval=10):
        """ Train for one epoch and return the mean training loss (float). """
        model.train()
        loss_sum = 0.0
        for iteration, batch in enumerate(tqdm(train_iter, desc='this epoch'), 1):
            optimizer.zero_grad()
            loss = self.forward(batch, model, criterion, isTest=False)
            # .item() keeps a plain float; accumulating tensors would retain
            # every iteration's autograd graph.
            loss_sum += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 500)
            optimizer.step()
            if self.hyperdash:
                # Log the float loss; int() would truncate any loss < 1 to 0.
                self.experiment.metric("loss", loss.item(), log=False)
            if iteration % log_interval == 0:
                # loss.data[0] is invalid on modern PyTorch; .item() is the
                # supported scalar accessor.
                log = 'elapsed_time: {0}, loss: {1}'.format(time.time() - start_time, loss.item())
                logger.write(log)
        return loss_sum / len(train_iter)

    def _test(self, model, test_iter, criterion, logger, start_time):
        """ Evaluate on *test_iter* and return the mean validation loss (float). """
        model.eval()
        test_loss = 0
        with torch.no_grad():  # no gradients needed during evaluation
            for batch in test_iter:
                loss = self.forward(batch, model, criterion, isTest=True)
                print('Test loss: {}'.format(loss.data))
                test_loss += loss.item()
        test_loss /= len(test_iter)
        log = 'elapsed_time: {0}, validation/loss: {1}'.format(time.time() - start_time, test_loss)
        if self.hyperdash:
            # test_loss is already a Python float; it has no .cpu()/.numpy().
            self.experiment.metric('test_loss', test_loss)
        logger.write(log)
        return test_loss

    def _checkpoint(self, epoch, model, optimizer, logger):
        """ Save trainer/model/optimizer state as 'epoch-{n}.{iter,model,state}'. """
        filename = os.path.join(self.out, 'epoch-{0}'.format(epoch + 1))
        torch.save({'epoch': epoch + 1, 'logger': logger.state_dict()}, filename + '.iter')
        torch.save(model.state_dict(), filename + '.model')
        torch.save(optimizer.state_dict(), filename + '.state')

    def _best_checkpoint(self, epoch, model, optimizer, logger):
        """ Save the best-so-far state as 'best_model.{iter,model,state}'. """
        filename = os.path.join(self.out, 'best_model')
        torch.save({'epoch': epoch + 1, 'logger': logger.state_dict()}, filename + '.iter')
        torch.save(model.state_dict(), filename + '.model')
        torch.save(optimizer.state_dict(), filename + '.state')

    def fit(self, model, train_data, val_data, criterion):
        """ Execute training """
        # set random seed.
        if self.seed is not None:
            random.seed(self.seed)
            torch.manual_seed(self.seed)
            if self.gpu:
                torch.cuda.manual_seed(self.seed)
        # initialize model to train.
        if self.resume_model:
            model.load_state_dict(torch.load(self.resume_model))
        # prepare gpu.
        if self.gpu:
            model.cuda()
        # load the datasets.
        train_iter = self.dataloader(train_data, batch_size=self.batchsize, shuffle=True)
        val_iter = self.dataloader(val_data, batch_size=3, shuffle=False)
        # set up an optimizer.
        optimizer = self._get_optimizer(model)
        if self.resume_opt:
            optimizer.load_state_dict(torch.load(self.resume_opt))
        # set intervals.
        val_interval = 3
        # Integer interval; max(1, ...) keeps the modulo meaningful when
        # training for fewer than 10 epochs (the original float division
        # produced fractional intervals).
        resume_interval = max(1, self.epoch // 10)
        log_interval = 10
        # set logger and start epoch.
        logger = TrainLogger(self.out)
        start_epoch = 0
        if self.resume:
            resume = torch.load(self.resume)
            start_epoch = resume['epoch']
            logger.load_state_dict(resume['logger'])
        # start training.
        start_time = time.time()
        loss = 0
        for epoch in trange(start_epoch, self.epoch, initial=start_epoch, total=self.epoch, desc='     total'):
            # Arguments must match _train's signature: logger and start_time
            # come before log_interval (the original passed them shuffled).
            self._train(model, optimizer, criterion, train_iter, logger, start_time, log_interval)
            if (epoch) % val_interval == 0:
                loss = self._test(model, val_iter, criterion, logger, start_time)
                if self.lowest_loss == 0 or self.lowest_loss > loss:
                    logger.write('Best model updated. loss: {} => {}'.format(self.lowest_loss, loss))
                    self._best_checkpoint(epoch, model, optimizer, logger)
                    self.lowest_loss = loss
            if (epoch + 1) % resume_interval == 0:
                self._checkpoint(epoch, model, optimizer, logger)

        if self.hyperdash:
            self.experiment.end()

    @staticmethod
    def get_args():
        """ Parse and return the command-line arguments for this trainer. """
        # arg definition
        parser = argparse.ArgumentParser(
            description='Training pose net for comparison \
            between chainer and pytorch about implementing DeepPose.')
        parser.add_argument(
            '--data-augmentation', '-a', action='store_true', help='Crop randomly and add random noise for data augmentation.')
        parser.add_argument(
            '--epoch', '-e', type=int, default=100, help='Number of epochs to train.')
        parser.add_argument(
            '--opt', '-o', type=str, default='Adam',
            choices=['MomentumSGD', 'Adam'], help='Optimization method.')
        parser.add_argument(
            '--gpu', '-g', type=int, default=0, help='GPU ID (negative value indicates CPU).')
        parser.add_argument(
            '--seed', '-s', type=int, help='Random seed to train.')
        parser.add_argument(
            '--train', type=str, default='data/train', help='Path to training image-pose list file.')
        parser.add_argument(
            '--val', type=str, default='data/test', help='Path to validation image-pose list file.')
        parser.add_argument(
            '--batchsize', type=int, default=32, help='Learning minibatch size.')
        parser.add_argument(
            '--out', default='result', help='Output directory')
        parser.add_argument(
            '--resume', default=None,
            help='Initialize the trainer from given file. \
            The file name is "epoch-{epoch number}.iter".')
        parser.add_argument(
            '--resume-model', type=str, default=None,
            help='Load model definition file to use for resuming training \
            (it\'s necessary when you resume a training). \
            The file name is "epoch-{epoch number}.mode"')
        parser.add_argument(
            '--resume-opt', type=str, default=None,
            help='Load optimization states from this file \
            (it\'s necessary when you resume a training). \
            The file name is "epoch-{epoch number}.state"')
        parser.add_argument(
            '--hyperdash', type=str, default=None,
            help='If you use hyperdash logging, enter here the name of experiment. Before using, you have to login to hyperdash with "hyperdash login --github". The default is None that means no logging with hyperdash')
        args = parser.parse_args()
        return args
# Set seeds
# NOTE(review): top-level script fragment. It relies on names defined
# elsewhere (args, env, seed, DDPG, utils, evaluate_policy, exp, robot,
# net, ds, to_var, data_to_var, double_squeeze, loss_function, save_model)
# and appears to splice together two different scripts (a DDPG training
# loop and a sim-to-real correction loop) — verify against the full
# source before running.
seed(args.seed)

# Environment dimensions used to size the policy networks.
state_dim = env.observation_space.shape
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy
policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")

replay_buffer = utils.ReplayBuffer(args.replay_buffer_max_size)

# Evaluate untrained policy
evaluations = [evaluate_policy(env, policy)]

# Log the baseline reward before any training.
exp.metric("rewards", evaluations[0])

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
episode_reward = None
env_counter = 0
# NOTE(review): total_timesteps is never incremented inside this loop, so
# it cannot terminate via the condition below — confirm the increment
# exists in the omitted part of the script.
while total_timesteps < args.max_timesteps:

    if done:

        if total_timesteps != 0:
            # NOTE(review): episode_timesteps, current_real, epi, frame,
            # losses, diffs and epoch are never initialized in this
            # snippet — presumably set in omitted code; verify.
            print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                  (total_timesteps, episode_num, episode_timesteps,
                   episode_reward))
            next_sim = robot.observe()

            # Predict the sim-to-real correction (delta) for this step.
            variable = data_to_var(next_sim, current_real, ds.action[epi,
                                                                     frame])
            delta = double_squeeze(net.forward(variable))
            next_real = to_var(next_sim).float() + delta
            robot.set(next_real.data.cpu().numpy())

            target = to_var(ds.next_real[epi, frame], volatile=True)

            loss = loss_function(next_real, target)
            losses += loss

            # Baseline gap between raw simulation and reality (no correction).
            diffs += F.mse_loss(
                to_var(next_sim).float(),
                to_var(ds.next_real[epi, frame])).clone().cpu().data.numpy()[0]

        exp.metric("loss episode", losses.cpu().data.numpy()[0])
        exp.metric("diff episode", diffs)
        exp.metric("epoch", epoch)

        # Backpropagate the accumulated episode loss once per episode.
        losses.backward()
        optimizer.step()

        del losses
        del loss

    save_model(net.state_dict())

robot.close()
# Example #30
class condGANTrainer(object):
    def __init__(self, output_dir, data_loader, n_words, ixtoword):
        """Prepare output directories, cudnn, data handles and hyperdash logging."""
        train_cfg = cfg.TRAIN
        if train_cfg.FLAG:
            # One sub-directory per run for checkpoints and sample images.
            self.model_dir = os.path.join(output_dir, 'Model')
            self.image_dir = os.path.join(output_dir, 'Image')
            mkdir_p(self.model_dir)
            mkdir_p(self.image_dir)

        # torch.cuda.set_device(cfg.GPU_ID)
        cudnn.benchmark = True

        # Training hyper-parameters from the global config.
        self.batch_size = train_cfg.BATCH_SIZE
        self.max_epoch = train_cfg.MAX_EPOCH
        self.snapshot_interval = train_cfg.SNAPSHOT_INTERVAL

        # Vocabulary and data loader handles.
        self.n_words = n_words
        self.ixtoword = ixtoword
        self.data_loader = data_loader
        self.num_batches = len(self.data_loader)

        self.start_epoch = 0
        # Hyperdash experiment for live metric logging.
        self.exp = Experiment("t2s", capture_io=False, api_key_getter=get_api_key_from_env)

    def build_models(self):
        """Load the frozen text/image encoders and build G, D and their optimizers.

        When ``cfg.TRAIN.NET_G`` is set, G and D weights are restored from the
        checkpoint pair and ``self.start_epoch`` is derived from the file name.

        Returns:
            list: ``[text_encoder, image_encoder]`` — both frozen and in eval mode.
        """
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # The image encoder checkpoint lives next to the text encoder one.
        image_encoder = CNN_ENCODER(cfg.TEXT.EMBEDDING_DIM)
        img_encoder_path = cfg.TRAIN.NET_E.replace('text_encoder', 'image_encoder')
        state_dict = torch.load(img_encoder_path, map_location=lambda storage, loc: storage)
        image_encoder.load_state_dict(state_dict)
        for p in image_encoder.parameters():
            p.requires_grad = False  # frozen: only used for matching losses
        print('Load image encoder from:', img_encoder_path)
        image_encoder.eval()
        image_encoder = image_encoder.to(device)

        text_encoder = RNN_ENCODER(self.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
        state_dict = torch.load(cfg.TRAIN.NET_E, map_location=lambda storage, loc: storage)
        text_encoder.load_state_dict(state_dict)
        for p in text_encoder.parameters():
            p.requires_grad = False  # frozen as well
        print('Load text encoder from:', cfg.TRAIN.NET_E)

        text_encoder.eval()
        text_encoder = text_encoder.to(device)

        netG = G_NET().to(device)
        netD = D_NET_32().to(device)

        self.netG = torch.nn.DataParallel(netG)
        self.netD = torch.nn.DataParallel(netD)

        # BUGFIX: the learning rates were swapped — the generator optimizer
        # used DISCRIMINATOR_LR and vice versa. Each network now uses its own
        # configured rate.
        self.optimizerG = optim.Adam(self.netG.parameters(),
                                lr=cfg.TRAIN.GENERATOR_LR,
                                betas=(0.5, 0.999))

        self.optimizerD = optim.Adam(self.netD.parameters(),
                                lr=cfg.TRAIN.DISCRIMINATOR_LR,
                                betas=(0.5, 0.999))

        epoch = 0
        if cfg.TRAIN.NET_G != '':
            # Resume: epoch number is parsed from 'netG_epoch_<n>.pth'.
            state_dict = torch.load(cfg.TRAIN.NET_G, map_location=lambda storage, loc: storage)
            self.netG.load_state_dict(state_dict)
            print('Load G from: ', cfg.TRAIN.NET_G)

            istart = cfg.TRAIN.NET_G.rfind('_') + 1
            iend = cfg.TRAIN.NET_G.rfind('.')
            epoch = cfg.TRAIN.NET_G[istart:iend]
            epoch = int(epoch) + 1
            self.start_epoch = epoch

            # The discriminator checkpoint is the same path with 'G' -> 'D'.
            state_dict = torch.load(cfg.TRAIN.NET_G.replace('G', 'D'), map_location=lambda storage, loc: storage)
            self.netD.load_state_dict(state_dict)
            print('Load D from: ', cfg.TRAIN.NET_G.replace('G', 'D'))


        return [text_encoder, image_encoder]

    def train(self):
        text_encoder, image_encoder = self.build_models()
        netG, netD = self.netG, self.netD
        optimizerG = self.optimizerG
        optimizerD = self.optimizerD
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        start_epoch = 0

        nz = cfg.GAN.Z_DIM
        noise = Variable(torch.FloatTensor(self.batch_size, nz)).to(device)
        fixed_noise = Variable(torch.FloatTensor(self.batch_size, nz).normal_(0, 1)).to(device)

        # gen_iterations = start_epoch * self.num_batches
        for epoch in range(start_epoch, self.max_epoch):
            epoch = epoch + self.start_epoch
            for step, data in enumerate(self.data_loader):
                start_t = time.time()

                shape, cap, cap_len, cls_id, key = data

                sorted_cap_lens, sorted_cap_indices = torch.sort(cap_len, 0, True)

                #sort
                shapes = shape[sorted_cap_indices].squeeze().to(device)
                captions = cap[sorted_cap_indices].squeeze().to(device)
                class_ids = cls_id[sorted_cap_indices].squeeze().numpy()

                hidden = text_encoder.init_hidden(self.batch_size)
                # words_embs: batch_size x nef x seq_len
                # sent_emb: batch_size x nef
                words_embs, sent_emb = text_encoder(captions, sorted_cap_lens, hidden)
                # words_embs, sent_emb = words_embs.detach(), sent_emb.detach()
                mask = (captions == 0)
                num_words = words_embs.size(2)
                if mask.size(1) > num_words:
                    mask = mask[:, :num_words]
                #######################################################
                # (2) Generate fake images
                ######################################################
                noise.data.normal_(0, 1)
                fake_shapes, mu, logvar = netG(noise, sent_emb)

                #######################################################
                # (3) Update D network
                ######################################################

                real_labels = torch.FloatTensor(self.batch_size).fill_(1).to(device)
                fake_labels = torch.FloatTensor(self.batch_size).fill_(0).to(device)

                netD.zero_grad()

                real_features = netD(shapes).to(device)
                cond_real_errD = nn.BCELoss()(real_features, real_labels)
                fake_features = netD(fake_shapes).to(device)
                cond_fake_errD = nn.BCELoss()(fake_features, fake_labels)



                errD_total = cond_real_errD + cond_fake_errD / 2.

                d_real_acu = torch.ge(real_features.squeeze(), 0.5).float()
                d_fake_acu = torch.le(fake_features.squeeze(), 0.5).float()
                d_total_acu = torch.mean(torch.cat((d_real_acu, d_fake_acu),0))

                if d_total_acu < 0.85:
                    errD_total.backward(retain_graph=True)
                    optimizerD.step()

                # #######################################################
                # # (4) Update G network: maximize log(D(G(z)))
                # ######################################################
                # # compute total loss for training G
                # step += 1
                # gen_iterations += 1

                # # do not need to compute gradient for Ds
                # # self.set_requires_grad_value(netsD, False)
                netG.zero_grad()
                # errG_total, G_logs = \
                #     generator_loss(netsD, image_encoder, fake_imgs, real_labels,
                #                    words_embs, sent_emb, match_labels, cap_lens, class_ids)

                labels = Variable(torch.LongTensor(range(self.batch_size)))
                real_labels = torch.FloatTensor(self.batch_size).fill_(1).to(device)

                real_features = netD(fake_shapes)
                cond_real_errG = nn.BCELoss()(real_features, real_labels)

                kl_loss = KL_loss(mu, logvar)
                errG_total = kl_loss + cond_real_errG



                if step % 10 == 0:
                    region_features, cnn_code = image_encoder(fake_shapes)

                    w_loss0, w_loss1, _ = words_loss(region_features, words_embs,
                                                    labels, sorted_cap_lens,
                                                    class_ids, self.batch_size)
                    w_loss = (w_loss0 + w_loss1) * \
                        cfg.TRAIN.SMOOTH.LAMBDA

                    s_loss0, s_loss1 = sent_loss(cnn_code, sent_emb,
                                                labels, class_ids, self.batch_size)
                    s_loss = (s_loss0 + s_loss1) * \
                        cfg.TRAIN.SMOOTH.LAMBDA

                    errG_total += s_loss + w_loss
                    self.exp.metric('s_loss', s_loss.item())
                    self.exp.metric('w_loss', w_loss.item())

                # print('kl: %.2f w s, %.2f %.2f, cond %.2f' % (kl_loss.item(), w_loss.item(), s_loss.item(), cond_real_errG.item()))
                # # backward and update parameters
                errG_total.backward()


                optimizerG.step()

                end_t = time.time()

                self.exp.metric('d_loss', errD_total.item())
                self.exp.metric('g_loss', errG_total.item())
                self.exp.metric('act', d_total_acu.item())

                print('''[%d/%d][%d/%d] Loss_D: %.2f Loss_G: %.2f Time: %.2fs'''
                    % (epoch, self.max_epoch, step, self.num_batches,
                        errD_total.item(), errG_total.item(),
                        end_t - start_t))

                if step % 500 == 0:
                    fullpath = '%s/lean_%d_%d.png' % (self.image_dir,epoch, step)
                    build_images(fake_shapes, captions, self.ixtoword, fullpath)

            torch.save(netG.state_dict(),'%s/netG_epoch_%d.pth' % (self.model_dir, epoch))
            torch.save(netD.state_dict(),'%s/netD_epoch_%d.pth' % (self.model_dir, epoch))
            print('Save G/Ds models.')