Example #1
def visualize_pc_with_svd():
    num_points = 1024
    model_idx = 5
    gpu_idx = 0

    original_points, _ = provider.load_single_model(model_idx=model_idx,
                                                    test_train='train',
                                                    file_idxs=0,
                                                    num_points=num_points)
    original_points = provider.rotate_point_cloud_by_angle(
        original_points, np.pi / 2)

    # #Simple plane sanity check
    # original_points = np.concatenate([np.random.rand(2, 1024), np.zeros([1, 1024])],axis=0)
    # R = np.array([[0.7071, 0, 0.7071],
    #               [0, 1, 0],
    #               [-0.7071, 0, 0.7071]])
    # original_points = np.transpose(np.dot(R ,original_points))

    original_points = np.expand_dims(original_points, 0)
    pc_util.pyplot_draw_point_cloud(original_points[0, :, :])

    sess = tf_util.get_session(gpu_idx, limit_gpu=True)
    points_pl = tf.placeholder(tf.float32, shape=(1, num_points, 3))
    svd_op = tf_util.pc_svd(points_pl)
    rotated_points = sess.run(svd_op, feed_dict={points_pl: original_points})

    pc_util.pyplot_draw_point_cloud(rotated_points[0, :, :])
    plt.show()
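A minimal NumPy sketch of the alignment that tf_util.pc_svd presumably performs (an assumption: center the cloud and rotate it onto its principal axes via SVD); the planar sanity check mirrors the commented-out block above.

import numpy as np

def align_with_svd(points):
    # points: (N, 3) array; subtract the centroid, then rotate onto the principal axes.
    centered = points - points.mean(axis=0, keepdims=True)
    _, _, vt = np.linalg.svd(centered, full_matrices=False)  # rows of vt are principal directions
    return centered @ vt.T

# Sanity check: a rotated plane should come back with (near-)zero extent along its last axis.
rng = np.random.default_rng(0)
plane = np.c_[rng.random((1024, 2)), np.zeros(1024)]
theta = np.pi / 4
R = np.array([[np.cos(theta), 0.0, np.sin(theta)],
              [0.0, 1.0, 0.0],
              [-np.sin(theta), 0.0, np.cos(theta)]])
aligned = align_with_svd(plane @ R.T)
print(aligned.var(axis=0))  # variance on the third axis is ~0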
Example #2
    def test(self, env, num_rollouts, max_steps, render=False):
        returns = []
        observations = []
        for i in range(num_rollouts):
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = tf_util.get_session().run(
                    self.est_act, feed_dict={self.obs: (obs[None, :])})
                observations.append(obs)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('[{0}] mean return: {1:.2f}'.format(self._name,
                                                  np.mean(returns)))
        print('[{0}] std of return: {1:.2f}'.format(self._name,
                                                    np.std(returns)))

        return observations
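For context, this rollout helper is what the DAgger loop in Example #13 feeds back into training; a minimal call could look like the following (the environment name and the constructed `agent` are hypothetical):

import gym
env = gym.make('Hopper-v2')  # hypothetical choice of environment
observations = agent.test(env, num_rollouts=20, max_steps=1000, render=False)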
Example #3
def visualize_fv_pc_clas():
    num_points = 1024
    n_classes = 40
    clas = 'person'
    #Create new gaussian
    subdev = 5
    variance = 0.04
    export = False
    display = True
    exp_path = '/home/itzikbs/PycharmProjects/fisherpointnet/paper_images/'

    shape_names = provider.getDataFiles( \
        os.path.join(BASE_DIR, 'data/modelnet' + str(n_classes) + '_ply_hdf5_2048/shape_names.txt'))
    shape_dict = {shape_names[i]: i for i in range(len(shape_names))}

    gmm = utils.get_grid_gmm(subdivisions=[subdev, subdev, subdev], variance=variance)
    # compute fv
    w = tf.constant(gmm.weights_, dtype=tf.float32)
    mu = tf.constant(gmm.means_, dtype=tf.float32)
    sigma = tf.constant(gmm.covariances_, dtype=tf.float32)

    for clas in shape_dict:
        points = provider.load_single_model_class(clas=clas, ind=0, test_train='train', file_idxs=0, num_points=1024,
                                                  n_classes=n_classes)
        points = np.expand_dims(points,0)

        points_tensor = tf.constant(points, dtype=tf.float32)  # convert points into a tensor
        fv_tensor = tf_util.get_fv_minmax(points_tensor, w, mu, sigma, flatten=False)

        sess = tf_util.get_session(2)
        with sess:
            fv = fv_tensor.eval()
        #
        # visualize_single_fv_with_pc(fv_train, points, label_title=clas,
        #                      fig_title='fv_pc', type='paper', pos=[750, 800, 0, 0], export=export,
        #                      filename=BASE_DIR + '/paper_images/fv_pc_' + clas)

        visualize_fv(fv, gmm, label_title=[clas], max_n_images=5, normalization=True, export=export, display=display,
                     filename=exp_path + clas+'_fv', n_scales=1, type='none', fig_title='Figure')
        visualize_pc(points, label_title=clas, fig_title='figure', export=export, filename=exp_path +clas+'_pc')
        plt.close('all')
Example #4
    def call(self, dict_obs, new, istate, agent_idx, update_obs_stats=False):
        for ob in dict_obs.values():
            if ob is not None:
                if update_obs_stats:
                    raise NotImplementedError
                    ob = ob.astype(np.float32)
                    ob = ob.reshape(-1, *self.ob_space.shape)
                    self.ob_rms.update(ob)
        # Note: if it fails here with ph vs observations inconsistency, check if you're loading agent from disk.
        # It will use whatever observation spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        #feed1.update({self.ph_mean: self.ob_rms.mean, self.ph_std: self.ob_rms.var ** 0.5})

        feed1.update({self.ph_agent_idx: agent_idx})
        # for f in feed1:
        #     print(f)
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf_util.get_session().run(
            [self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
             self.nlp_samp, self.snext_rollout, self.entropy_rollout],
            feed_dict={**feed1, **feed2})

        base_vpred_ext = np.ones_like(vpred_ext)

        return (a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate,
                ent[:, 0], base_vpred_ext[:, 0])
Example #5
    def train(self, obs_data, act_data, batch_size=64):

        print("obs_data.shape:", obs_data.shape)
        print("act_data.shape:", act_data.shape)
        n_total = obs_data.shape[0]
        assert n_total == act_data.shape[0], \
            "Sizes do not match ({} vs {})".format(n_total, act_data.shape[0])
        print("training data size = {}".format(n_total))
        iter_per_epoch = int(
            np.ceil(1.0 * self._train_epoch * n_total / batch_size))
        print("iter_per_epoch = {}".format(iter_per_epoch))
        n_epoch = 0
        while n_epoch < self._train_epoch:
            n_iter = 0
            while n_iter < iter_per_epoch:
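                # Sample a random minibatch; draw without replacement whenever the
                # dataset is larger than the batch, otherwise fall back to replacement.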
                rand_idx = np.random.choice(
                    n_total,
                    size=batch_size,
                    replace=False if batch_size < n_total else True)
                obs_batch = obs_data[rand_idx]
                act_batch = act_data[rand_idx]
                train_loss, summary, _ = tf_util.get_session().run(
                    [self.loss, self.merged, self.train_op],
                    feed_dict={
                        self.obs: obs_batch,
                        self.exp_act: act_batch.squeeze()
                    })

                n_iter += 1
            n_epoch += 1
            print("epoch={0}/{1} loss={2:.4f}".format(n_epoch,
                                                      self._train_epoch,
                                                      train_loss))
            if self.writer is not None:
                self.writer.add_summary(summary, global_step=n_epoch)
Example #6
def predict(gmm):

    with tf.device('/gpu:' + str(GPU_IDX)):
        points_pl, normal_pl, w_pl, mu_pl, sigma_pl, n_effective_points = MODEL.placeholder_inputs(
            BATCH_SIZE, NUM_POINT, gmm, PATCH_RADIUS)

        is_training_pl = tf.placeholder(tf.bool, shape=())

        # Get model and loss
        experts_prob, n_pred, fv = MODEL.get_model(
            points_pl,
            w_pl,
            mu_pl,
            sigma_pl,
            is_training_pl,
            PATCH_RADIUS,
            original_n_points=n_effective_points,
            n_experts=N_EXPERTS,
            expert_dict=EXPERT_DICT)
        loss, cos_ang = MODEL.get_loss(n_pred,
                                       normal_pl,
                                       experts_prob,
                                       loss_type=LOSS_TYPE,
                                       n_experts=N_EXPERTS,
                                       expert_type=EXPERT_LOSS_TYPE)
        tf.summary.scalar('loss', loss)
        ops = {
            'points_pl': points_pl,
            'normal_pl': normal_pl,
            'n_effective_points': n_effective_points,
            'experts_prob': experts_prob,
            'cos_ang': cos_ang,
            'w_pl': w_pl,
            'mu_pl': mu_pl,
            'sigma_pl': sigma_pl,
            'is_training_pl': is_training_pl,
            'fv': fv,
            'n_pred': n_pred,
            'loss': loss
        }

    saver = tf.train.Saver()
    sess = tf_util.get_session(GPU_IDX, limit_gpu=True)

    flog = open(os.path.join(output_dir, 'log.txt'), 'w')

    # Restore model variables from disk.
    printout(flog, 'Loading model %s' % pretrained_model_path)
    saver.restore(sess, pretrained_model_path)
    printout(flog, 'Model restored.')

    # PCPNet data loaders
    testnset_loader, dataset = provider.get_data_loader(
        dataset_name=TEST_FILES,
        batchSize=BATCH_SIZE,
        indir=PC_PATH,
        patch_radius=PATCH_RADIUS,
        points_per_patch=NUM_POINT,
        outputs=[],
        patch_point_count_std=0,
        seed=3627473,
        identical_epochs=False,
        use_pca=False,
        patch_center='point',
        point_tuple=1,
        cache_capacity=100,
        patch_sample_order='full',
        workers=0,
        dataset_type='test',
        sparse_patches=SPARSE_PATCHES)

    is_training = False

    shape_ind = 0
    shape_patch_offset = 0
    shape_patch_count = dataset.shape_patch_count[shape_ind]
    normal_prop = np.zeros([shape_patch_count, 3])
    expert_prop = np.zeros([
        shape_patch_count,
    ], dtype=np.uint64)
    expert_prob_props = np.zeros([shape_patch_count, N_EXPERTS])
    num_batchs = len(testnset_loader)

    for batch_idx, data in enumerate(testnset_loader, 0):

        current_data = data[0]
        n_effective_points = data[-1]

        if current_data.shape[0] < BATCH_SIZE:
            # compensate for last batch
            pad_size = current_data.shape[0]
            current_data = np.concatenate([
                current_data,
                np.zeros([BATCH_SIZE - pad_size, n_rad * NUM_POINT, 3])
            ],
                                          axis=0)
            n_effective_points = np.concatenate(
                [n_effective_points,
                 np.zeros([BATCH_SIZE - pad_size, n_rad])],
                axis=0)

        feed_dict = {
            ops['points_pl']: current_data,
            ops['n_effective_points']: n_effective_points,
            ops['w_pl']: gmm.weights_,
            ops['mu_pl']: gmm.means_,
            ops['sigma_pl']: np.sqrt(gmm.covariances_),
            ops['is_training_pl']: is_training,
        }
        n_est, experts_prob = sess.run([ops['n_pred'], ops['experts_prob']],
                                       feed_dict=feed_dict)

        expert_to_use = np.argmax(experts_prob, axis=0)
        experts_prob = np.transpose(experts_prob)
        n_est = n_est[expert_to_use, range(len(expert_to_use))]

        # Save estimated normals to file
        batch_offset = 0

        print('Processing batch  [%d/%d]...' % (batch_idx, num_batchs - 1))

        while batch_offset < n_est.shape[0] and shape_ind + 1 <= len(
                dataset.shape_names):
            shape_patches_remaining = shape_patch_count - shape_patch_offset
            batch_patches_remaining = n_est.shape[0] - batch_offset

            # append estimated patch properties batch to properties for the current shape on the CPU
            n_copy = min(shape_patches_remaining, batch_patches_remaining)
            normal_prop[shape_patch_offset:shape_patch_offset + n_copy, :] = \
                n_est[batch_offset:batch_offset + n_copy, :]

            expert_prop[shape_patch_offset:shape_patch_offset + n_copy] = \
                expert_to_use[batch_offset:batch_offset + n_copy]

            expert_prob_props[shape_patch_offset:shape_patch_offset + n_copy, :] = \
                experts_prob[batch_offset:batch_offset + n_copy, :]

            batch_offset = batch_offset + n_copy
            shape_patch_offset = shape_patch_offset + n_copy

            if shape_patches_remaining <= batch_patches_remaining:

                np.savetxt(
                    os.path.join(output_dir,
                                 dataset.shape_names[shape_ind] + '.normals'),
                    normal_prop)
                print('saved normals for ' + dataset.shape_names[shape_ind])
                np.savetxt(os.path.join(
                    output_dir, dataset.shape_names[shape_ind] + '.experts'),
                           expert_prop.astype(int),
                           fmt='%i')
                np.savetxt(
                    os.path.join(
                        output_dir,
                        dataset.shape_names[shape_ind] + '.experts_probs'),
                    expert_prob_props)
                print('saved experts for ' + dataset.shape_names[shape_ind])
                shape_patch_offset = 0
                shape_ind += 1
                if shape_ind < len(dataset.shape_names):
                    shape_patch_count = dataset.shape_patch_count[shape_ind]
                    normal_prop = np.zeros([shape_patch_count, 3])
                    expert_prop = np.zeros([
                        shape_patch_count,
                    ],
                                           dtype=np.uint64)
                    expert_prob_props = np.zeros(
                        [shape_patch_count, N_EXPERTS])
                sys.stdout.flush()
Example #7
def export_visualizations(gmm, log_dir):
    """
    Visualizes and saves the images of the confusion matrix and fv representations

    :param gmm: instance of sklearn GaussianMixture (GMM) object
    :param log_dir: path to the trained model
    :return None (exports images)
    """

    # load the model
    model_checkpoint = os.path.join(log_dir, "model.ckpt")
    if not (os.path.isfile(model_checkpoint + ".meta")):
        raise ValueError("No log folder available with name " + str(log_dir))
    # reBuild Graph
    with tf.Graph().as_default():
        with tf.device('/gpu:' + str(GPU_INDEX)):

            points_pl, labels_pl, w_pl, mu_pl, sigma_pl, = MODEL.placeholder_inputs(
                BATCH_SIZE,
                NUM_POINT,
                gmm,
            )
            is_training_pl = tf.placeholder(tf.bool, shape=())

            # Get model and loss
            pred, fv = MODEL.get_model(points_pl,
                                       w_pl,
                                       mu_pl,
                                       sigma_pl,
                                       is_training_pl,
                                       num_classes=NUM_CLASSES)

            ops = {
                'points_pl': points_pl,
                'labels_pl': labels_pl,
                'w_pl': w_pl,
                'mu_pl': mu_pl,
                'sigma_pl': sigma_pl,
                'is_training_pl': is_training_pl,
                'pred': pred,
                'fv': fv
            }
            # Add ops to save and restore all the variables.
            saver = tf.train.Saver()

            # Create a session
            sess = tf_util.get_session(GPU_INDEX, limit_gpu=LIMIT_GPU)

            # Restore variables from disk.
            saver.restore(sess, model_checkpoint)
            print("Model restored.")

            # Load the test data
            for fn in range(len(TEST_FILES)):
                log_string('----' + str(fn) + '-----')
                current_data, current_label = provider.loadDataFile(
                    TEST_FILES[fn])
                current_data = current_data[:, 0:NUM_POINT, :]
                current_label = np.squeeze(current_label)

                file_size = current_data.shape[0]
                num_batches = file_size // BATCH_SIZE

                for batch_idx in range(num_batches):
                    start_idx = batch_idx * BATCH_SIZE
                    end_idx = (batch_idx + 1) * BATCH_SIZE

                    feed_dict = {
                        ops['points_pl']:
                        current_data[start_idx:end_idx, :, :],
                        ops['labels_pl']: current_label[start_idx:end_idx],
                        ops['w_pl']: gmm.weights_,
                        ops['mu_pl']: gmm.means_,
                        ops['sigma_pl']: np.sqrt(gmm.covariances_),
                        ops['is_training_pl']: False
                    }

                    pred_label, fv_data = sess.run([ops['pred'], ops['fv']],
                                                   feed_dict=feed_dict)
                    pred_label = np.argmax(pred_label, 1)

                    all_fv_data = fv_data if (
                        fn == 0 and batch_idx == 0) else np.concatenate(
                            [all_fv_data, fv_data], axis=0)
                    true_labels = current_label[start_idx:end_idx] if (
                        fn == 0 and batch_idx == 0) else np.concatenate(
                            [true_labels, current_label[start_idx:end_idx]],
                            axis=0)
                    all_pred_labels = pred_label if (
                        fn == 0 and batch_idx == 0) else np.concatenate(
                            [all_pred_labels, pred_label], axis=0)

    # Export Confusion Matrix
    visualization.visualize_confusion_matrix(true_labels,
                                             all_pred_labels,
                                             classes=LABEL_MAP,
                                             normalize=False,
                                             export=True,
                                             display=False,
                                             filename=os.path.join(
                                                 log_dir, 'confusion_mat'),
                                             n_classes=NUM_CLASSES)

    # Export Fisher Vector Visualization
    label_tags = [LABEL_MAP[i] for i in true_labels]
    visualization.visualize_fv(all_fv_data,
                               gmm,
                               label_tags,
                               export=True,
                               display=False,
                               filename=os.path.join(log_dir,
                                                     'fisher_vectors'))
    # plt.show() #uncomment this to see the images in addition to saving them
    print("Confusion matrix and Fisher vectores were saved to /" +
          str(log_dir))
Example #8
def train(gmm):
    global MAX_ACCURACY, MAX_CLASS_ACCURACY
    # n_fv_features = 7 * len(gmm.weights_)

    # Build Graph, train and classify
    with tf.Graph().as_default():
        with tf.device('/gpu:' + str(GPU_INDEX)):
            points_pl, labels_pl, w_pl, mu_pl, sigma_pl = MODEL.placeholder_inputs(
                BATCH_SIZE, NUM_POINT, gmm)
            is_training_pl = tf.placeholder(tf.bool, shape=())

            # Note the global_step=batch parameter to minimize.
            # That tells the optimizer to helpfully increment the 'batch' parameter for you every time it trains.
            batch = tf.Variable(0)
            bn_decay = get_bn_decay(batch)
            tf.summary.scalar('bn_decay', bn_decay)

            # Get model and loss
            pred, fv = MODEL.get_model(points_pl,
                                       w_pl,
                                       mu_pl,
                                       sigma_pl,
                                       is_training_pl,
                                       bn_decay=bn_decay,
                                       weigth_decay=WEIGHT_DECAY,
                                       add_noise=False,
                                       num_classes=NUM_CLASSES)
            loss = MODEL.get_loss(pred, labels_pl)
            tf.summary.scalar('loss', loss)
            # Get accuracy
            correct = tf.equal(tf.argmax(pred, 1), tf.to_int64(labels_pl))
            accuracy = tf.reduce_sum(tf.cast(correct,
                                             tf.float32)) / float(BATCH_SIZE)
            tf.summary.scalar('accuracy', accuracy)

            # Get training operator
            learning_rate = get_learning_rate(batch)
            tf.summary.scalar('learning_rate', learning_rate)
            if OPTIMIZER == 'momentum':
                optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                       momentum=MOMENTUM)
            elif OPTIMIZER == 'adam':
                optimizer = tf.train.AdamOptimizer(learning_rate)
            train_op = optimizer.minimize(
                loss, global_step=batch
            )  #, aggregation_method = tf.AggregationMethod.EXPERIMENTAL_TREE) #consider using: tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N

            # Add ops to save and restore all the variables.
            saver = tf.train.Saver()

        # Create a session
        sess = tf_util.get_session(GPU_INDEX, limit_gpu=LIMIT_GPU)

        # Add summary writers
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'train'),
                                             sess.graph)
        test_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'test'))

        # Init variables
        init = tf.global_variables_initializer()
        sess.run(init, {is_training_pl: True})

        ops = {
            'points_pl': points_pl,
            'labels_pl': labels_pl,
            'w_pl': w_pl,
            'mu_pl': mu_pl,
            'sigma_pl': sigma_pl,
            'is_training_pl': is_training_pl,
            'fv': fv,
            'pred': pred,
            'loss': loss,
            'train_op': train_op,
            'merged': merged,
            'step': batch
        }

        for epoch in range(MAX_EPOCH):
            log_string('**** EPOCH %03d ****' % (epoch))
            sys.stdout.flush()

            train_one_epoch(sess, ops, gmm, train_writer)
            acc, acc_avg_cls = eval_one_epoch(sess, ops, gmm, test_writer)

            # Save the variables to disk.
            if epoch % 10 == 0:
                save_path = saver.save(sess,
                                       os.path.join(LOG_DIR, "model.ckpt"))
                log_string("Model saved in file: %s" % save_path)

            if acc > MAX_ACCURACY:
                MAX_ACCURACY = acc
                MAX_CLASS_ACCURACY = acc_avg_cls

        log_string("Best test accuracy: %f" % MAX_ACCURACY)
        log_string("Best test class accuracy: %f" % MAX_CLASS_ACCURACY)
Example #9
def train_net(model, mode, img_dir, dataset, chkfile_name, logfile_name, validatefile_name, entangled_feat, max_epoch = 300, check_every_n = 500, loss_check_n = 10, save_model_freq = 5, batch_size = 512, lr = 0.001):
    img1 = U.get_placeholder_cached(name="img1")
    img2 = U.get_placeholder_cached(name="img2")

    vae_loss = U.mean(model.vaeloss)

    latent_z1_tp = model.latent_z1
    latent_z2_tp = model.latent_z2

    losses = [U.mean(model.vaeloss),
            U.mean(model.siam_loss),
            U.mean(model.kl_loss1),
            U.mean(model.kl_loss2),
            U.mean(model.reconst_error1),
            U.mean(model.reconst_error2),
            ]

    siam_normal = losses[1]/entangled_feat
    siam_max = U.mean(model.max_siam_loss)

    tf.summary.scalar('Total Loss', losses[0])
    tf.summary.scalar('Siam Loss', losses[1])
    tf.summary.scalar('kl1_loss', losses[2])
    tf.summary.scalar('kl2_loss', losses[3])
    tf.summary.scalar('reconst_err1', losses[4])
    tf.summary.scalar('reconst_err2', losses[5])
    tf.summary.scalar('Siam Normal', siam_normal)
    tf.summary.scalar('Siam Max', siam_max)

    compute_losses = U.function([img1, img2], vae_loss)
    optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon = 0.01/batch_size)

    all_var_list = model.get_trainable_variables()

    img1_var_list = all_var_list
    optimize_expr1 = optimizer.minimize(vae_loss, var_list=img1_var_list)
    merged = tf.summary.merge_all()
    train = U.function([img1, img2],
                        [losses[0], losses[1], losses[2], losses[3], losses[4], losses[5], latent_z1_tp, latent_z2_tp, merged], updates = [optimize_expr1])
    get_reconst_img = U.function([img1, img2], [model.reconst1, model.reconst2, latent_z1_tp, latent_z2_tp])
    get_latent_var = U.function([img1, img2], [latent_z1_tp, latent_z2_tp])

    cur_dir = get_cur_dir()
    chk_save_dir = os.path.join(cur_dir, chkfile_name)
    log_save_dir = os.path.join(cur_dir, logfile_name)
    validate_img_saver_dir = os.path.join(cur_dir, validatefile_name)
    if dataset == 'chairs' or dataset == 'celeba':
        test_img_saver_dir = os.path.join(cur_dir, "test_images")
        testing_img_dir = os.path.join(cur_dir, "dataset/{}/test_img".format(dataset))

    train_writer = U.summary_writer(dir = log_save_dir)

    U.initialize()

    saver, chk_file_epoch_num = U.load_checkpoints(load_requested = True, checkpoint_dir = chk_save_dir)
    if dataset == 'chairs' or dataset == 'celeba':
        validate_img_saver = Img_Saver(Img_dir = validate_img_saver_dir)
    elif dataset == 'dsprites':
        validate_img_saver = BW_Img_Saver(Img_dir = validate_img_saver_dir) # Black and White, temporary usage
    else:
        warn("Unknown dataset Error")
        # break

    warn(img_dir)
    if dataset == 'chairs' or dataset == 'celeba':
        training_images_list = read_dataset(img_dir)
        n_total_train_data = len(training_images_list)
        testing_images_list = read_dataset(testing_img_dir)
        n_total_testing_data = len(testing_images_list)
    elif dataset == 'dsprites':
        cur_dir = osp.join(cur_dir, 'dataset')
        cur_dir = osp.join(cur_dir, 'dsprites')
        img_dir = osp.join(cur_dir, 'dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz')
        manager = DataManager(img_dir, batch_size)
    else:
        warn("Unknown dataset Error")
        # break

    meta_saved = False

    if mode == 'train':
        for epoch_idx in range(chk_file_epoch_num+1, max_epoch):
            t_epoch_start = time.time()
            num_batch = manager.get_len()

            for batch_idx in range(num_batch):
                if dataset == 'chairs' or dataset == 'celeba':
                    idx = random.sample(range(n_total_train_data), 2*batch_size)
                    batch_files = [training_images_list[i] for i in idx]
                    [images1, images2] = load_image(dir_name = img_dir, img_names = batch_files)
                elif dataset == 'dsprites':
                    [images1, images2] = manager.get_next()
                img1, img2 = images1, images2
                [l1, l2, _, _] = get_reconst_img(img1, img2)

                [loss0, loss1, loss2, loss3, loss4, loss5, latent1, latent2, summary] = train(img1, img2)

                if batch_idx % 50 == 1:
                    header("******* epoch: {}/{} batch: {}/{} *******".format(epoch_idx, max_epoch, batch_idx, num_batch))
                    warn("Total Loss: {}".format(loss0))
                    warn("Siam loss: {}".format(loss1))
                    warn("kl1_loss: {}".format(loss2))
                    warn("kl2_loss: {}".format(loss3))
                    warn("reconst_err1: {}".format(loss4))
                    warn("reconst_err2: {}".format(loss5))

                if batch_idx % check_every_n == 1:
                    if dataset == 'chairs' or dataset == 'celeba':
                        idx = random.sample(range(len(training_images_list)), 2*5)
                        validate_batch_files = [training_images_list[i] for i in idx]
                        [images1, images2] = load_image(dir_name = img_dir, img_names = validate_batch_files)
                    elif dataset == 'dsprites':
                        [images1, images2] = manager.get_next()

                    [reconst1, reconst2, _, _] = get_reconst_img(images1, images2)

                    if dataset == 'chairs':
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}".format(batch_idx)

                            save_img = np.squeeze(images1[img_idx])
                            save_img = Image.fromarray(save_img)
                            img_file_name = "{}_ori.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = Image.fromarray(save_img)
                            img_file_name = "{}_rec.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                    elif dataset == 'celeba':
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}".format(batch_idx)

                            save_img = np.squeeze(images1[img_idx])
                            save_img = Image.fromarray(save_img, 'RGB')
                            img_file_name = "{}_ori.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = Image.fromarray(save_img, 'RGB')
                            img_file_name = "{}_rec.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                    elif dataset == 'dsprites':
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}".format(batch_idx)

                            # save_img = images1[img_idx].reshape(64, 64)
                            save_img = np.squeeze(images1[img_idx])
                            save_img = save_img.astype(np.float32)
                            img_file_name = "{}_ori.jpg".format(img_idx)
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

                            # save_img = reconst1[img_idx].reshape(64, 64)
                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = save_img.astype(np.float32)
                            img_file_name = "{}_rec.jpg".format(img_idx)
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

                if batch_idx % loss_check_n == 1:
                    train_writer.add_summary(summary, batch_idx)

            t_epoch_end = time.time()
            t_epoch_run = t_epoch_end - t_epoch_start
            if dataset == 'dsprites':
                t_check = manager.sample_size / t_epoch_run

                warn("==========================================")
                warn("Run {} th epoch in {} sec: {} images / sec".format(epoch_idx+1, t_epoch_run, t_check))
                warn("==========================================")

            # if epoch_idx % save_model_freq == 0:
            if meta_saved == True:
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = epoch_idx, write_meta_graph = False)
            else:
                print "Save  meta graph"
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = epoch_idx, write_meta_graph = True)
                meta_saved = True

    # Testing
    elif mode == 'test':
        test_file_name = testing_images_list[0]
        test_img = load_single_img(dir_name = testing_img_dir, img_name = test_file_name)
        test_feature = 31
        test_variation = np.arange(-5, 5, 0.1)

        z = test(test_img)
        for idx in range(len(test_variation)):
            z_test = np.copy(z)
            z_test[0, test_feature] = z_test[0, test_feature] + test_variation[idx]
            reconst_test = test_reconst(z_test)
            test_save_img = np.squeeze(reconst_test[0])
            test_save_img = Image.fromarray(test_save_img)
            img_file_name = "test_feat_{}_var_({}).png".format(test_feature, test_variation[idx])
            test_img_saver.save(test_save_img, img_file_name, sub_dir = None)
        reconst_test = test_reconst(z)
        test_save_img = np.squeeze(reconst_test[0])
        test_save_img = Image.fromarray(test_save_img)
        img_file_name = "test_feat_{}_var_original.png".format(test_feature)
        test_img_saver.save(test_save_img, img_file_name, sub_dir = None)
Example #10
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.
    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.
    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = get_session()
    #set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space

    def make_obs_ph(name, Num_action):

        return tf.placeholder(tf.float32, shape=[None, Num_action], name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
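    # With the defaults above (total_timesteps=100000, exploration_fraction=0.1,
    # exploration_final_eps=0.02), exploration.value(t) decays linearly from 1.0 to 0.02
    # over the first 10000 steps and then stays at 0.02.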

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        for t in range(total_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
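                # With prioritized replay, transitions with a larger |TD error| get a
                # higher replay priority (plus a small eps so none drops to zero).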
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_variables(model_file)

    return act
Example #11
def train(gmm):

    # Build Graph, train and classify
    with tf.Graph().as_default():
        with tf.device('/gpu:' + str(GPU_INDEX)):

            points_pl, normal_pl, w_pl, mu_pl, sigma_pl, n_effective_points = \
                MODEL.placeholder_inputs(BATCH_SIZE, NUM_POINT, gmm, PATCH_RADIUS)
            is_training_pl = tf.placeholder(tf.bool, shape=())

            # Note the global_step=batch parameter that tells the optimizer to helpfully increment the 'batch' parameter
            # for you every time it trains.
            batch = tf.Variable(0)
            bn_decay = get_bn_decay(batch)
            tf.summary.scalar('bn_decay', bn_decay)

            # Get model and loss
            experts_prob, n_pred, fv = MODEL.get_model(
                points_pl,
                w_pl,
                mu_pl,
                sigma_pl,
                is_training_pl,
                PATCH_RADIUS,
                original_n_points=n_effective_points,
                bn_decay=bn_decay,
                weight_decay=WEIGHT_DECAY,
                n_experts=N_EXPERTS,
                expert_dict=EXPERT_DICT)
            loss, cos_ang = MODEL.get_loss(n_pred,
                                           normal_pl,
                                           experts_prob,
                                           loss_type=LOSS_TYPE,
                                           n_experts=N_EXPERTS,
                                           expert_type=EXPERT_LOSS_TYPE)
            tf.summary.scalar('loss', loss)

            # Get training operator
            learning_rate = get_learning_rate(batch)
            tf.summary.scalar('learning_rate', learning_rate)
            if OPTIMIZER == 'momentum':
                optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                       momentum=MOMENTUM)
            elif OPTIMIZER == 'adam':
                optimizer = tf.train.AdamOptimizer(learning_rate)
            train_op = optimizer.minimize(
                loss, global_step=batch
            )  #, aggregation_method = tf.AggregationMethod.EXPERIMENTAL_TREE) #consider using: tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N

            # Add ops to save and restore all the variables.
            saver = tf.train.Saver()

        # Create a session
        sess = tf_util.get_session(GPU_INDEX, limit_gpu=LIMIT_GPU)

        # Add summary writers
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'train'),
                                             sess.graph)
        test_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'test'))

        # Init variables
        init = tf.global_variables_initializer()
        sess.run(init, {is_training_pl: True})

        ops = {
            'points_pl': points_pl,
            'normal_gt_pl': normal_pl,
            'experts_prob': experts_prob,
            'normal_pred': n_pred,
            'n_effective_points': n_effective_points,
            'w_pl': w_pl,
            'mu_pl': mu_pl,
            'sigma_pl': sigma_pl,
            'is_training_pl': is_training_pl,
            'fv': fv,
            'loss': loss,
            'cos_ang': cos_ang,
            'train_op': train_op,
            'merged': merged,
            'step': batch
        }

        trainset, _ = provider.get_data_loader(
            dataset_name=TRAIN_FILES,
            batchSize=BATCH_SIZE,
            indir=PC_PATH,
            patch_radius=PATCH_RADIUS,
            points_per_patch=NUM_POINT,
            outputs=OUTPUTS,
            patch_point_count_std=0,
            seed=3627473,
            identical_epochs=IDENTICAL_EPOCHS,
            use_pca=False,
            patch_center='point',
            point_tuple=1,
            cache_capacity=100,
            patches_per_shape=PATCHES_PER_SHAPE,
            patch_sample_order='random',
            workers=0,
            dataset_type='training')
        validationset, validation_dataset = provider.get_data_loader(
            dataset_name=VALIDATION_FILES,
            batchSize=BATCH_SIZE,
            indir=PC_PATH,
            patch_radius=PATCH_RADIUS,
            points_per_patch=NUM_POINT,
            outputs=OUTPUTS,
            patch_point_count_std=0,
            seed=3627473,
            identical_epochs=IDENTICAL_EPOCHS,
            use_pca=False,
            patch_center='point',
            point_tuple=1,
            cache_capacity=100,
            patches_per_shape=PATCHES_PER_SHAPE,
            patch_sample_order='random',
            workers=0,
            dataset_type='validation')

        for epoch in range(MAX_EPOCH):
            log_string('**** EPOCH %03d ****' % (epoch))
            sys.stdout.flush()

            train_one_epoch(sess, ops, gmm, train_writer, trainset, epoch)
            eval_one_epoch(sess, ops, gmm, test_writer, validationset,
                           validation_dataset)

            # Save the variables to disk.
            if epoch % 10 == 0:
                save_path = saver.save(sess,
                                       os.path.join(LOG_DIR, "model.ckpt"))
                log_string("Model saved in file: %s" % save_path)
Example #12
    def clone_behavior(self, obs_data, act_data):
        # initialize all variables
        tf_util.get_session().run(self.init_op)
        # training
        self.train(obs_data, act_data)
Example #13
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts',
                        type=int,
                        default=20,
                        help='Number of expert roll outs')
    ### added for hw1
    parser.add_argument("--clone_expert", action="store_true")
    parser.add_argument("--train_epoch",
                        type=int,
                        default=10,
                        help="Number of epoches to train")
    parser.add_argument("--init_lr",
                        type=float,
                        default=0.002,
                        help="Initial learning rate")
    parser.add_argument("--reg_coef",
                        type=float,
                        default=0.0,
                        help="Coefficient for L2 regularization")
    parser.add_argument("--logdir", type=str, default="log")
    parser.add_argument("--dagger", action="store_true")
    parser.add_argument("--dagger_iter",
                        type=int,
                        default=20,
                        help="Number of dagger iterations")
    ###
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit
        returns, observations, actions = expert_test(env, args.num_rollouts,
                                                     policy_fn, max_steps,
                                                     args.render)

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }

        print('#samples={}'.format(len(actions)))

        if args.clone_expert:
            logdir = args.logdir
            logdir += "/" + args.envname + "_" + str(args.num_rollouts) + "_" +\
                        str(args.train_epoch) + "_" + str(args.init_lr) + "_" +\
                        str(args.reg_coef)
            bc = HW1_sol(args.expert_policy_file, logdir, args.train_epoch,
                         args.init_lr, args.reg_coef)
            bc.clone_behavior(expert_data["observations"],
                              expert_data["actions"])
            bc.test(env, args.num_rollouts, max_steps, args.render)
            expert_test(env, args.num_rollouts, policy_fn, max_steps,
                        args.render)

        if args.dagger:
            logdir = args.logdir
            logdir += "/DAgger_" + args.envname + "_" + str(args.num_rollouts) + "_" +\
                        str(args.train_epoch) + "_" + str(args.init_lr) + "_" +\
                        str(args.reg_coef)

            bc = HW1_sol(args.expert_policy_file, logdir, args.train_epoch,
                         args.init_lr, args.reg_coef, "BC")
            dagger = HW1_sol(args.expert_policy_file, logdir, args.train_epoch,
                             args.init_lr, args.reg_coef, "DAgger")

            writer = tf.summary.FileWriter(logdir, tf_util.get_session().graph)
            bc.writer = writer
            dagger.writer = writer

            bc_obs = expert_data["observations"]
            bc_act = expert_data["actions"]
            n_total = bc_obs.shape[0]
            n_data_each_iter = int(np.round(1.0 * n_total / args.dagger_iter))
            dagger_obs = bc_obs[:n_data_each_iter]
            dagger_act = bc_act[:n_data_each_iter]

            n_dagger_iter = 0
            while n_dagger_iter < args.dagger_iter:

                # train bc
                bc.clone_behavior(bc_obs, bc_act)
                # train dagger
                if n_dagger_iter == 0:
                    dagger.clone_behavior(dagger_obs, dagger_act)
                else:
                    dagger.train(dagger_obs, dagger_act)

                print("Test at dagger_iter={}".format(n_dagger_iter))

                # test expert
                expert_test(env, args.num_rollouts, policy_fn, max_steps,
                            args.render)
                # test bc
                bc.test(env, args.num_rollouts, max_steps, args.render)
                # test dagger
                bc_obs = dagger.test(env, args.num_rollouts, max_steps,
                                     args.render)
                bc_act = []
                for obs in bc_obs:
                    bc_act.append(policy_fn(obs[None, :]))

                bc_obs = np.array(bc_obs)
                bc_act = np.array(bc_act)
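                # DAgger aggregation: relabel the learner's rollout observations with the
                # expert's actions and append them to the dataset for the next iteration.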
                dagger_obs = np.concatenate(
                    [dagger_obs, bc_obs[:n_data_each_iter]])
                dagger_act = np.concatenate(
                    [dagger_act, bc_act[:n_data_each_iter]])

                n_dagger_iter += 1
Example #14
    def __init__(self):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            train_model = policy(nbatch_train, nsteps, sess)


        # Create Placeholders
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped loss
        vf_losses1 = tf.square(vpred - R)
        # Clipped loss
        vf_losses2 = tf.square(vpredclipped - R)
        # Average them
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        # Calculate ratio (current policy / old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
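        # Taking the elementwise maximum of the two negated surrogates gives the pessimistic (clipped) PPO objective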
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Calculate total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters (weights)
        params = tf.trainable_variables('ppo2_model')
        # 2. Build an optimizer (trainer)
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Compute the gradient using the trainer (gradient and variables to update to minimise loss)
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
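
        # A minimal sketch of a corresponding train step (assumed names such as
        # `actions`, `returns`, `values`, `neglogpacs` and `train_model.X`; not part
        # of the original snippet):
        #   td_map = {self.A: actions, self.ADV: advs, self.R: returns,
        #             self.OLDNEGLOGPAC: neglogpacs, self.OLDVPRED: values,
        #             self.LR: lr, self.CLIPRANGE: cliprange, train_model.X: obs}
        #   stats = sess.run(self.stats_list + [self._train_op], td_map)[:-1]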
    def step(self):
        #Does a rollout.
        t = self.I.step_count % self.nsteps
        epinfos = []

        self.check_goto_next_policy()

        for l in range(self.I.nlump):
            obs, prevrews, ec_rews, news, infos, ram_states, monitor_rews = self.env_get(l)

            for env_pos_in_lump, info in enumerate(infos):
                if 'episode' in info:
                    #Information like rooms visited is added to info on end of episode.
                    epinfos.append(info['episode'])
                    info_with_places = info['episode']
                    try:
                        info_with_places['places'] = info['episode']['visited_rooms']
                    except:
                        import ipdb; ipdb.set_trace()
                    self.I.buf_epinfos[env_pos_in_lump+l*self.I.lump_stride][t] = info_with_places


                    self.check_episode(env_pos_in_lump+l*self.I.lump_stride)

            sli = slice(l * self.I.lump_stride, (l + 1) * self.I.lump_stride)
            memsli = slice(None) if self.I.mem_state is NO_STATES else sli

            dict_obs = self.stochpol.ensure_observation_is_dict(obs)
            with logger.ProfileKV("policy_inference"):
                #Calls the policy and value function on current observation.
                acs, vpreds_int, vpreds_ext, nlps, self.I.mem_state[memsli], ent = self.stochpol.call(dict_obs, news, self.I.mem_state[memsli],
                                                                                                               update_obs_stats=self.update_ob_stats_every_step)
            self.env_step(l, acs)

            #Update buffer with transition.
            for k in self.stochpol.ph_ob_keys:
                self.I.buf_obs[k][sli, t] = dict_obs[k]
            self.I.buf_news[sli, t] = news
            self.I.buf_vpreds_int[sli, t] = vpreds_int
            self.I.buf_vpreds_ext[sli, t] = vpreds_ext
            self.I.buf_nlps[sli, t] = nlps
            self.I.buf_acs[sli, t] = acs
            self.I.buf_ent[sli, t] = ent

            if t > 0:

                prevrews = [self.filter_rew(prevrews[k], infos[k]['unclip_rew'], infos[k]['position'], infos[k]['open_door_type'],k)
                                for k in range(self.I.nenvs)]
    
                prevrews = np.asarray(prevrews)
                #print(prevrews)

                self.I.buf_rews_ext[sli, t-1] = prevrews
                self.I.buf_rews_ec[sli, t-1] = ec_rews

                if self.rnd_type=='oracle':
                    #buf_rews_int = [
                    #    self.I.oracle_visited_count.update_position(infos[k]['position'])
                    #    for k in range(self.I.nenvs)]

                    buf_rews_int = [
                        self.update_rnd(infos[k]['position'], k)
                        for k in range(self.I.nenvs)]

                    #print(buf_rews_int)

                    buf_rews_int = np.array(buf_rews_int)
                    self.I.buf_rews_int[sli, t-1] = buf_rews_int

        self.I.step_count += 1
        if t == self.nsteps - 1 and not self.disable_policy_update:
            #We need to take one extra step so every transition has a reward.
            for l in range(self.I.nlump):
                sli = slice(l * self.I.lump_stride, (l + 1) * self.I.lump_stride)
                memsli = slice(None) if self.I.mem_state is NO_STATES else sli
                nextobs, rews, ec_rews, nextnews, infos, ram_states, monitor_rews = self.env_get(l)
                dict_nextobs = self.stochpol.ensure_observation_is_dict(nextobs)
                for k in self.stochpol.ph_ob_keys:
                    self.I.buf_ob_last[k][sli] = dict_nextobs[k]
                self.I.buf_new_last[sli] = nextnews
                with logger.ProfileKV("policy_inference"):
                    _, self.I.buf_vpred_int_last[sli], self.I.buf_vpred_ext_last[sli], _, _, _ = self.stochpol.call(dict_nextobs, nextnews, self.I.mem_state[memsli], update_obs_stats=False)
                

                rews = [self.filter_rew(rews[k], infos[k]['unclip_rew'], infos[k]['position'], infos[k]['open_door_type'],k)
                                for k in range(self.I.nenvs)]
    
                rews = np.asarray(rews)

                self.I.buf_rews_ext[sli, t] = rews
                self.I.buf_rews_ec[sli, t] = ec_rews

                if self.rnd_type=='oracle':
                    #buf_rews_int = [
                    #    self.I.oracle_visited_count.update_position(infos[k]['position'])
                    #    for k in range(self.I.nenvs)]
                    
                    buf_rews_int = [
                        self.update_rnd(infos[k]['position'], k)
                        for k in range(self.I.nenvs)]

                    buf_rews_int = np.array(buf_rews_int)
                    self.I.buf_rews_int[sli, t] = buf_rews_int

            if self.rnd_type =='rnd':
                #compute RND
                fd = {}
                fd[self.stochpol.ph_ob[None]] = np.concatenate([self.I.buf_obs[None], self.I.buf_ob_last[None][:,None]], 1)
                fd.update({self.stochpol.ph_mean: self.stochpol.ob_rms.mean,
                               self.stochpol.ph_std: self.stochpol.ob_rms.var ** 0.5})
                fd[self.stochpol.ph_ac] = self.I.buf_acs
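                # intrinsic reward = the policy's intrinsic-reward op (RND prediction error), scaled elementwise by the episodic-curiosity reward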
                self.I.buf_rews_int[:] = tf_util.get_session().run(self.stochpol.int_rew, fd) * self.I.buf_rews_ec
            elif self.rnd_type == 'oracle':
                #compute oracle count-based reward
                fd = {}
            else:
                raise ValueError('Unknown exploration reward: {}'.format(
                    self.rnd_type))
            #Calculate the intrinsic rewards for the rollout (for each step).
            '''
            envsperbatch = self.I.nenvs // self.nminibatches

            #fd = {}

            
            #[nenvs, nstep+1, h,w,stack]
            #fd[self.stochpol.ph_ob[None]] = np.concatenate([self.I.buf_obs[None], self.I.buf_ob_last[None][:,None]], 1)
            start = 0
            while start < self.I.nenvs:
                end = start + envsperbatch
                mbenvinds = slice(start, end, None)
    
                fd = {}
        
                fd[self.stochpol.ph_ob[None]] = np.concatenate([self.I.buf_obs[None][mbenvinds],  self.I.buf_ob_last[None][mbenvinds, None]], 1)
    
    
                fd.update({self.stochpol.ph_mean: self.stochpol.ob_rms.mean,
                           self.stochpol.ph_std: self.stochpol.ob_rms.var ** 0.5})
                fd[self.stochpol.ph_ac] = self.I.buf_acs[mbenvinds]


    


                # if dead,  we set rew_int to zero
                #self.I.buf_rews_int[mbenvinds] = (1 -self.I.buf_news[mbenvinds]) * self.sess.run(self.stochpol.int_rew, fd)
                
                rews_int = tf_util.get_session().run(self.stochpol.int_rew, fd)
                self.I.buf_rews_int[mbenvinds] = rews_int * self.I.buf_rews_ec[mbenvinds]

                start +=envsperbatch

            '''
            if not self.update_ob_stats_every_step:
                #Update observation normalization parameters after the rollout is completed.
                obs_ = self.I.buf_obs[None].astype(np.float32)
                self.stochpol.ob_rms.update(obs_.reshape((-1, *obs_.shape[2:]))[:,:,:,-1:])
                feed = {self.stochpol.ph_mean: self.stochpol.ob_rms.mean, self.stochpol.ph_std: self.stochpol.ob_rms.var ** 0.5,
                        self.stochpol.ph_count: self.stochpol.ob_rms.count}
                self.sess.run(self.assign_op, feed)

            if not self.testing:


                logger.info(self.I.cur_gen_idx,self.I.rews_found_by_contemporary)
                update_info = self.update()


                self.I.oracle_visited_count.sync()

                self.I.cur_oracle_visited_count.sync()
                self.I.cur_oracle_visited_count_for_next_gen.sync()

            else:
                update_info = {}
            self.I.seg_init_mem_state = copy(self.I.mem_state)
            global_i_stats = dict_gather(self.comm_log, self.I.stats, op='sum')
            global_deque_mean = dict_gather(self.comm_log, { n : np.mean(dvs) for n,dvs in self.I.statlists.items() }, op='mean')
            update_info.update(global_i_stats)
            update_info.update(global_deque_mean)
            self.global_tcount = global_i_stats['tcount']
            for infos_ in self.I.buf_epinfos:
                infos_.clear()
        else:
            update_info = {}

        #Some reporting logic.
        for epinfo in epinfos:
            if self.testing:
                self.I.statlists['eprew_test'].append(epinfo['r'])
                self.I.statlists['eplen_test'].append(epinfo['l'])
            else:
                if "visited_rooms" in epinfo:
                    self.local_rooms += list(epinfo["visited_rooms"])
                    self.local_rooms = sorted(list(set(self.local_rooms)))
                    score_multiple = self.I.venvs[0].score_multiple
                    if score_multiple is None:
                        score_multiple = 1000
                    rounded_score = int(epinfo["r"] / score_multiple) * score_multiple
                    self.scores.append(rounded_score)
                    self.scores = sorted(list(set(self.scores)))
                    self.I.statlists['eprooms'].append(len(epinfo["visited_rooms"]))

                self.I.statlists['eprew'].append(epinfo['r'])
                if self.local_best_ret is None:
                    self.local_best_ret = epinfo["r"]
                elif epinfo["r"] > self.local_best_ret:
                    self.local_best_ret = epinfo["r"]

                self.I.statlists['eplen'].append(epinfo['l'])
                self.I.stats['epcount'] += 1
                self.I.stats['tcount'] += epinfo['l']
                self.I.stats['rewtotal'] += epinfo['r']
                # self.I.stats["best_ext_ret"] = self.best_ret


        return {'update' : update_info}
    def update(self):

        #Some logic gathering best ret, rooms etc using MPI.
        temp = sum(MPI.COMM_WORLD.allgather(self.local_rooms), [])
        temp = sorted(list(set(temp)))
        self.rooms = temp

        temp = sum(MPI.COMM_WORLD.allgather(self.scores), [])
        temp = sorted(list(set(temp)))
        self.scores = temp

        temp = sum(MPI.COMM_WORLD.allgather([self.local_best_ret]), [])
        self.best_ret = max(temp)

        eprews = MPI.COMM_WORLD.allgather(np.mean(list(self.I.statlists["eprew"])))
        local_best_rets = MPI.COMM_WORLD.allgather(self.local_best_ret)
        n_rooms = sum(MPI.COMM_WORLD.allgather([len(self.local_rooms)]), [])

        if MPI.COMM_WORLD.Get_rank() == 0 and self.I.stats["n_updates"] % self.log_interval ==0: 
            logger.info("Rooms visited {}".format(self.rooms))
            logger.info("Best return {}".format(self.best_ret))
            logger.info("Best local return {}".format(sorted(local_best_rets)))
            logger.info("eprews {}".format(sorted(eprews)))
            logger.info("n_rooms {}".format(sorted(n_rooms)))
            logger.info("Extrinsic coefficient {}".format(self.ext_coeff))
            logger.info("Intrinsic coefficient {}".format(self.int_coeff))
            logger.info("Gamma {}".format(self.gamma))
            logger.info("Gamma ext {}".format(self.gamma_ext))
            logger.info("All scores {}".format(sorted(self.scores)))


        '''
        to do:  
        '''
        #Normalize intrinsic rewards.
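        # rff_int keeps a discounted running sum of intrinsic rewards; its running std (rff_rms_int) is used as the normalization scale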
        rffs_int = np.array([self.I.rff_int.update(rew) for rew in self.I.buf_rews_int.T])
        self.I.rff_rms_int.update(rffs_int.ravel())
        rews_int = self.I.buf_rews_int / np.sqrt(self.I.rff_rms_int.var)
        self.mean_int_rew = np.mean(rews_int)
        self.max_int_rew = np.max(rews_int)

        #Don't normalize extrinsic rewards.
        rews_ext = self.I.buf_rews_ext

        rewmean, rewstd, rewmax = self.I.buf_rews_int.mean(), self.I.buf_rews_int.std(), np.max(self.I.buf_rews_int)

        #Calculate intrinsic returns and advantages.
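        # GAE: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t);  A_t = delta_t + gamma * lam * (1 - done) * A_{t+1}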
        lastgaelam = 0
        for t in range(self.nsteps-1, -1, -1): # nsteps-1 ... 0
            if self.use_news:
                nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last
            else:
                nextnew = 0.0 #No dones for intrinsic reward.
            nextvals = self.I.buf_vpreds_int[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_int_last
            nextnotnew = 1 - nextnew
            delta = rews_int[:, t] + self.gamma * nextvals * nextnotnew - self.I.buf_vpreds_int[:, t]
            self.I.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
        rets_int = self.I.buf_advs_int + self.I.buf_vpreds_int

        #Calculate extrinsic returns and advantages.
        lastgaelam = 0
        for t in range(self.nsteps-1, -1, -1): # nsteps-1 ... 0
            nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last
            #Use dones for extrinsic reward.
            nextvals = self.I.buf_vpreds_ext[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_ext_last
            nextnotnew = 1 - nextnew
            delta = rews_ext[:, t] + self.gamma_ext * nextvals * nextnotnew - self.I.buf_vpreds_ext[:, t]
            self.I.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam
        rets_ext = self.I.buf_advs_ext + self.I.buf_vpreds_ext

        #Combine the extrinsic and intrinsic advantages.
        self.I.buf_advs = self.int_coeff*self.I.buf_advs_int + self.ext_coeff*self.I.buf_advs_ext

        #Collects info for reporting.
        info = dict(
            advmean = self.I.buf_advs.mean(),
            advstd  = self.I.buf_advs.std(),
            retintmean = rets_int.mean(), # previously retmean
            retintstd  = rets_int.std(), # previously retstd
            retextmean = rets_ext.mean(), # previously not there
            retextstd  = rets_ext.std(), # previously not there

            rewec_mean = self.I.buf_rews_ec.mean(),
            rewec_max = np.max(self.I.buf_rews_ec),

            rewintmean_unnorm = rewmean, # previously rewmean
            rewintmax_unnorm = rewmax, # previously not there
            rewintmean_norm = self.mean_int_rew, # previously rewintmean
            rewintmax_norm = self.max_int_rew, # previously rewintmax
            rewintstd_unnorm  = rewstd, # previously rewstd
            vpredintmean = self.I.buf_vpreds_int.mean(), # previously vpredmean
            vpredintstd  = self.I.buf_vpreds_int.std(), # previously vpredstd
            vpredextmean = self.I.buf_vpreds_ext.mean(), # previously not there
            vpredextstd  = self.I.buf_vpreds_ext.std(), # previously not there
            ev_int = np.clip(explained_variance(self.I.buf_vpreds_int.ravel(), rets_int.ravel()), -1, None),
            ev_ext = np.clip(explained_variance(self.I.buf_vpreds_ext.ravel(), rets_ext.ravel()), -1, None),
            rooms = SemicolonList(self.rooms),
            n_rooms = len(self.rooms),
            best_ret = self.best_ret,
            reset_counter = self.I.reset_counter
        )

        info['mem_available'] = psutil.virtual_memory().available

        to_record = {'acs': self.I.buf_acs,
                     'rews_int': self.I.buf_rews_int,
                     'rews_int_norm': rews_int,
                     'rews_ext': self.I.buf_rews_ext,
                     'rews_ec': self.I.buf_rews_ec,
                     'vpred_int': self.I.buf_vpreds_int,
                     'vpred_ext': self.I.buf_vpreds_ext,
                     'adv_int': self.I.buf_advs_int,
                     'adv_ext': self.I.buf_advs_ext,
                     'ent': self.I.buf_ent,
                     'ret_int': rets_int,
                     'ret_ext': rets_ext,
                     }
        if self.I.venvs[0].record_obs:
            to_record['obs'] = self.I.buf_obs[None]
        self.recorder.record(bufs=to_record,
                             infos=self.I.buf_epinfos)


        #Create feeddict for optimization.
        envsperbatch = self.I.nenvs // self.nminibatches
        ph_buf = [
            (self.stochpol.ph_ac, self.I.buf_acs),
            (self.ph_ret_int, rets_int),
            (self.ph_ret_ext, rets_ext),
            (self.ph_oldnlp, self.I.buf_nlps),
            (self.ph_adv, self.I.buf_advs),
        ]
        if self.I.mem_state is not NO_STATES:
            ph_buf.extend([
                (self.stochpol.ph_istate, self.I.seg_init_mem_state),
                (self.stochpol.ph_new, self.I.buf_news),
            ])

        verbose = False
        if verbose and self.is_log_leader:
            samples = np.prod(self.I.buf_advs.shape)
            logger.info("buffer shape %s, samples_per_mpi=%i, mini_per_mpi=%i, samples=%i, mini=%i " % (
                    str(self.I.buf_advs.shape),
                    samples, samples//self.nminibatches,
                    samples*self.comm_train_size, samples*self.comm_train_size//self.nminibatches))
            logger.info(" "*6 + fmt_row(13, self.loss_names))


        epoch = 0
        start = 0
        #Optimizes on current data for several epochs.
        while epoch < self.nepochs:
            end = start + envsperbatch
            mbenvinds = slice(start, end, None)

            fd = {ph : buf[mbenvinds] for (ph, buf) in ph_buf}
            fd.update({self.ph_lr : self.lr, self.ph_cliprange : self.cliprange})
            fd[self.stochpol.ph_ob[None]] = np.concatenate([self.I.buf_obs[None][mbenvinds], self.I.buf_ob_last[None][mbenvinds, None]], 1)
            assert list(fd[self.stochpol.ph_ob[None]].shape) == [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape), \
                [fd[self.stochpol.ph_ob[None]].shape, [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape)]
            fd.update({self.stochpol.ph_mean:self.stochpol.ob_rms.mean, self.stochpol.ph_std:self.stochpol.ob_rms.var**0.5})

            ret = tf_util.get_session().run(self._losses+[self._train], feed_dict=fd)[:-1]
            if not self.testing:
                lossdict = dict(zip(self.loss_names, ret))
            else:
                lossdict = {}
            #Synchronize the lossdict across mpi processes, otherwise weights may be rolled back on one process but not another.
            _maxkl = lossdict.pop('maxkl')
            lossdict = dict_gather(self.comm_train, lossdict, op='mean')
            maxmaxkl = dict_gather(self.comm_train, {"maxkl":_maxkl}, op='max')
            lossdict["maxkl"] = maxmaxkl["maxkl"]
            if verbose and self.is_log_leader:
                logger.info("%i:%03i %s" % (epoch, start, fmt_row(13, [lossdict[n] for n in self.loss_names])))
            start += envsperbatch
            if start == self.I.nenvs:
                epoch += 1
                start = 0

        if self.is_train_leader:
            self.I.stats["n_updates"] += 1
            info.update([('opt_'+n, lossdict[n]) for n in self.loss_names])
            tnow = time.time()
            info['tps'] = self.nsteps * self.I.nenvs / (tnow - self.I.t_last_update)
            info['time_elapsed'] = time.time() - self.t0
            self.I.t_last_update = tnow
        self.stochpol.update_normalization( # Necessary for continuous control tasks with odd obs ranges, only implemented in mlp policy,
            ob=self.I.buf_obs               # NOTE: not shared via MPI
            )
        return info
    def __init__(self, *, scope,
                 ob_space, ac_space,
                 stochpol_fn,
                 nsteps, nepochs=4, nminibatches=1,
                 gamma=0.99,
                 gamma_ext=0.99,
                 lam=0.95,
                 ent_coef=0,
                 cliprange=0.2,
                 max_grad_norm=1.0,
                 vf_coef=1.0,
                 lr=30e-5,
                 adam_hps=None,
                 testing=False,
                 comm=None, comm_train=None, use_news=False,
                 update_ob_stats_every_step=True,
                 int_coeff=None,
                 ext_coeff=None,
                 log_interval = 1,
                 only_train_r = True,
                 rnd_type = 'rnd',
                 reset=False, reset_prob=0.2,dynamics_sample=False, save_path=''
                 ):
        self.lr = lr
        self.ext_coeff = ext_coeff
        self.int_coeff = int_coeff
        self.use_news = use_news
        self.update_ob_stats_every_step = update_ob_stats_every_step
        self.abs_scope = (tf.get_variable_scope().name + '/' + scope).lstrip('/')
        
        self.rnd_type = rnd_type

        self.sess = sess = tf_util.get_session()

        self.testing = testing

        self.only_train_r = only_train_r

        self.log_interval = log_interval

        self.reset = reset
        self.reset_prob = reset_prob
        self.dynamics_sample = dynamics_sample

        self.save_path = save_path

        self.random_weight_path = '{}_{}'.format(save_path,str(1))

        self.comm_log = MPI.COMM_SELF
        if comm is not None and comm.Get_size() > 1:
            self.comm_log = comm
            assert not testing or comm.Get_rank() != 0, "Worker number zero can't be testing"
        if comm_train is not None:
            self.comm_train, self.comm_train_size = comm_train, comm_train.Get_size()
        else:
            self.comm_train, self.comm_train_size = self.comm_log, self.comm_log.Get_size()
        self.is_log_leader = self.comm_log.Get_rank()==0
        self.is_train_leader = self.comm_train.Get_rank()==0
        with tf.variable_scope(scope):
            self.best_ret = -np.inf
            self.local_best_ret = - np.inf
            self.rooms = []
            self.local_rooms = []
            self.scores = []
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.stochpol = stochpol_fn()
            self.nepochs = nepochs
            self.cliprange = cliprange
            self.nsteps = nsteps
            self.nminibatches = nminibatches
            self.gamma = gamma
            self.gamma_ext = gamma_ext
            self.lam = lam
            self.adam_hps = adam_hps or dict()
            self.ph_adv = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_int = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_ext = tf.placeholder(tf.float32, [None, None])
            self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
            self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
            self.ph_lr = tf.placeholder(tf.float32, [])
            self.ph_lr_pred = tf.placeholder(tf.float32, [])
            self.ph_cliprange = tf.placeholder(tf.float32, [])

            #Define loss.
            neglogpac = self.stochpol.pd_opt.neglogp(self.stochpol.ph_ac)
            entropy = tf.reduce_mean(self.stochpol.pd_opt.entropy())
            vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean(tf.square(self.stochpol.vpred_int_opt - self.ph_ret_int))
            vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean(tf.square(self.stochpol.vpred_ext_opt - self.ph_ret_ext))
            vf_loss = vf_loss_int + vf_loss_ext
            ratio = tf.exp(self.ph_oldnlp - neglogpac) # p_new / p_old
            negadv = - self.ph_adv
            pg_losses1 = negadv * ratio
            pg_losses2 = negadv * tf.clip_by_value(ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
            ent_loss =  (- ent_coef) * entropy
            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.ph_oldnlp))
            maxkl    = .5 * tf.reduce_max(tf.square(neglogpac - self.ph_oldnlp))
            clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.ph_cliprange)))
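            # total loss: clipped policy loss + entropy penalty + both value losses + the policy's auxiliary loss (e.g. the RND predictor loss)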
            loss = pg_loss + ent_loss + vf_loss + self.stochpol.aux_loss

            #Create optimizer.
            params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.abs_scope)
            logger.info("PPO: using MpiAdamOptimizer connected to %i peers" % self.comm_train_size)
            trainer = MpiAdamOptimizer(self.comm_train, learning_rate=self.ph_lr, **self.adam_hps)
            grads_and_vars = trainer.compute_gradients(loss, params)
            grads, vars = zip(*grads_and_vars)
            if max_grad_norm:
                grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            global_grad_norm = tf.global_norm(grads)
            grads_and_vars = list(zip(grads, vars))
            self._train = trainer.apply_gradients(grads_and_vars)

        #assign ph_mean and ph_var
        self.assign_op=[]
        self.assign_op.append(self.stochpol.var_ph_mean.assign(self.stochpol.ph_mean))
        self.assign_op.append(self.stochpol.var_ph_std.assign(self.stochpol.ph_std))
        self.assign_op.append(self.stochpol.var_ph_count.assign(self.stochpol.ph_count))

        #Quantities for reporting.
        self._losses = [loss, pg_loss, vf_loss, entropy, clipfrac, approxkl, maxkl, self.stochpol.aux_loss,
                        self.stochpol.feat_var, self.stochpol.max_feat, global_grad_norm]
        self.loss_names = ['tot', 'pg', 'vf', 'ent', 'clipfrac', 'approxkl', 'maxkl', "auxloss", "featvar",
                           "maxfeat", "gradnorm"]
        self.I = None
        self.disable_policy_update = None
        allvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.abs_scope)
        if self.is_log_leader:
            tf_util.display_var_info(allvars)
        self.sess.run(tf.variables_initializer(allvars))
        sync_from_root(self.sess, allvars) #Syncs initialization across mpi workers.
        self.t0 = time.time()
        self.global_tcount = 0


        #save & load
        self.save = functools.partial(tf_util.save_state)
        self.load = functools.partial(tf_util.load_state)
Example #18
0
def predict(gmm):

    with tf.device('/gpu:' + str(GPU_IDX)):
        points_pl, noise_gt_pl, n_gt_pl, w_pl, mu_pl, sigma_pl, n_effective_points = MODEL.placeholder_inputs(BATCH_SIZE, NUM_POINT, gmm, PATCH_RADIUS)

        is_training_pl = tf.placeholder(tf.bool, shape=())

        # simple model
        # Get model and loss
        noise_pred, n_pred, fv = MODEL.get_model(points_pl, w_pl, mu_pl, sigma_pl, is_training_pl, PATCH_RADIUS, original_n_points=n_effective_points)
        loss, cos_ang = MODEL.get_loss(noise_pred, noise_gt_pl, n_pred, n_gt_pl)
        tf.summary.scalar('loss', loss)
        ops = {'points_pl': points_pl,
               'n_gt_pl': n_gt_pl,
               'noise_gt_pl': noise_gt_pl,
               'n_effective_points': n_effective_points,
               'cos_ang': cos_ang,
               'w_pl': w_pl,
               'mu_pl': mu_pl,
               'sigma_pl': sigma_pl,
               'is_training_pl': is_training_pl,
               'fv': fv,
               'n_pred': n_pred,
               'loss': loss
               }

    saver = tf.train.Saver()
    sess = tf_util.get_session(GPU_IDX, limit_gpu=True)

    flog = open(os.path.join(output_dir, 'log.txt'), 'w')

    # Restore model variables from disk.
    printout(flog, 'Loading model %s' % pretrained_model_path)
    saver.restore(sess, pretrained_model_path)
    printout(flog, 'Model restored.')

    # PCPNet data loaders
    testnset_loader, dataset = provider.get_data_loader(dataset_name=TEST_FILES, batchSize=BATCH_SIZE, indir=PC_PATH,
                                             patch_radius=PATCH_RADIUS,
                                             points_per_patch=NUM_POINT, outputs=['unoriented_normals', 'noise'],
                                             patch_point_count_std=0,
                                             seed=3627473, identical_epochs=False, use_pca=False, patch_center='point',
                                             point_tuple=1, cache_capacity=100,
                                             patch_sample_order='full',
                                             workers=0, dataset_type='test', sparse_patches=True)

    is_training = False

    shape_ind = 0
    shape_patch_offset = 0
    shape_patch_count = dataset.shape_patch_count[shape_ind]
    normal_prop = np.zeros([shape_patch_count, 3])

    # ang_err = []
    for batch_idx, data in enumerate(testnset_loader, 0):

        current_data = data[0]
        target = tuple(t.data.numpy() for t in data[1:-1])
        current_normals = target[0]
        current_noise = target[1]
        n_effective_points = data[-1]

        if current_data.shape[0] < BATCH_SIZE:
            # compensate for last batch
            pad_size = current_data.shape[0]
            current_data = np.concatenate([current_data,
                                           np.zeros([BATCH_SIZE - pad_size, n_rad*NUM_POINT, 3])], axis=0)
            current_normals = np.concatenate([current_normals,
                                              np.zeros([BATCH_SIZE - pad_size, 3])], axis=0)
            current_noise = np.concatenate([current_noise,
                                              np.zeros([BATCH_SIZE - pad_size])], axis=0)
            n_effective_points = np.concatenate([n_effective_points,
                                           np.zeros([BATCH_SIZE - pad_size, n_rad])], axis=0)

        feed_dict = {ops['points_pl']: current_data,
                     ops['n_gt_pl']: current_normals,
                     ops['noise_gt_pl']: current_noise,
                     ops['n_effective_points']: n_effective_points,
                     ops['w_pl']: gmm.weights_,
                     ops['mu_pl']: gmm.means_,
                     ops['sigma_pl']: np.sqrt(gmm.covariances_),
                     ops['is_training_pl']: is_training, }
        loss_val, n_est, cos_ang = sess.run([ops['loss'], ops['n_pred'], ops['cos_ang']], feed_dict=feed_dict)

        # Save estimated normals to file
        batch_offset = 0

        while batch_offset < n_est.shape[0] and shape_ind + 1 <= len(dataset.shape_names):
            shape_patches_remaining = shape_patch_count - shape_patch_offset
            batch_patches_remaining = n_est.shape[0] - batch_offset

            # append estimated patch properties batch to properties for the current shape on the CPU
            normal_prop[shape_patch_offset:shape_patch_offset + min(shape_patches_remaining,
                                                                          batch_patches_remaining), :] = \
                n_est[batch_offset:batch_offset + min(shape_patches_remaining, batch_patches_remaining), :]

            batch_offset = batch_offset + min(shape_patches_remaining, batch_patches_remaining)
            shape_patch_offset = shape_patch_offset + min(shape_patches_remaining, batch_patches_remaining)


            if shape_patches_remaining <= batch_patches_remaining:

                np.savetxt(os.path.join(output_dir, dataset.shape_names[shape_ind] + '.normals'),
                           normal_prop)
                print('saved normals for ' + dataset.shape_names[shape_ind])
                shape_patch_offset = 0
                shape_ind += 1
                if shape_ind < len(dataset.shape_names):
                    shape_patch_count = dataset.shape_patch_count[shape_ind]
                    normal_prop = np.zeros([shape_patch_count, 3])
Example #19
0
def train_net(model, manager, chkfile_name, logfile_name, validatefile_name, entangled_feat, max_iter = 6000001, check_every_n = 1000, loss_check_n = 10, save_model_freq = 5000, batch_size = 32):
	img1 = U.get_placeholder_cached(name="img1")
	img2 = U.get_placeholder_cached(name="img2")


	# Testing
	# img_test = U.get_placeholder_cached(name="img_test")
	# reconst_tp = U.get_placeholder_cached(name="reconst_tp")


	vae_loss = U.mean(model.vaeloss)

	latent_z1_tp = model.latent_z1
	latent_z2_tp = model.latent_z2

	losses = [U.mean(model.vaeloss),
			U.mean(model.siam_loss),
			U.mean(model.kl_loss1), 
			U.mean(model.kl_loss2), 
			U.mean(model.reconst_error1), 
			U.mean(model.reconst_error2), 
			]
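	# losses logged below: total VAE loss, siamese latent loss, per-branch KL terms, and per-branch reconstruction errors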

	siam_normal = losses[1]/entangled_feat		
	siam_max = U.mean(model.max_siam_loss)

	tf.summary.scalar('Total Loss', losses[0])
	tf.summary.scalar('Siam Loss', losses[1])
	tf.summary.scalar('kl1_loss', losses[2])
	tf.summary.scalar('kl2_loss', losses[3])
	tf.summary.scalar('reconst_err1', losses[4])
	tf.summary.scalar('reconst_err2', losses[5])
	tf.summary.scalar('Siam Normal', siam_normal)
	tf.summary.scalar('Siam Max', siam_max)

	# decoded_img = [model.reconst1, model.reconst2]


	compute_losses = U.function([img1, img2], vae_loss)
	lr = 0.005
	optimizer=tf.train.AdagradOptimizer(learning_rate=lr)

	all_var_list = model.get_trainable_variables()

	# print all_var_list
	img1_var_list = all_var_list
	#[v for v in all_var_list if v.name.split("/")[1].startswith("proj1") or v.name.split("/")[1].startswith("unproj1")]
	optimize_expr1 = optimizer.minimize(vae_loss, var_list=img1_var_list)
	merged = tf.summary.merge_all()
	train = U.function([img1, img2], 
						[losses[0], losses[1], losses[2], losses[3], losses[4], losses[5], latent_z1_tp, latent_z2_tp, merged], updates = [optimize_expr1])
	get_reconst_img = U.function([img1, img2], [model.reconst1_mean, model.reconst2_mean, latent_z1_tp, latent_z2_tp])
	get_latent_var = U.function([img1, img2], [latent_z1_tp, latent_z2_tp])


	# testing
	# test = U.function([img_test], model.latent_z_test)
	# test_reconst = U.function([reconst_tp], [model.reconst_test])

	cur_dir = get_cur_dir()
	chk_save_dir = os.path.join(cur_dir, chkfile_name)
	log_save_dir = os.path.join(cur_dir, logfile_name)
	validate_img_saver_dir = os.path.join(cur_dir, validatefile_name)
	# test_img_saver_dir = os.path.join(cur_dir, "test_images")
	# testing_img_dir = os.path.join(cur_dir, "dataset/test_img")
	
	train_writer = U.summary_writer(dir = log_save_dir)


	U.initialize()

	saver, chk_file_num = U.load_checkpoints(load_requested = True, checkpoint_dir = chk_save_dir)
	validate_img_saver = BW_Img_Saver(validate_img_saver_dir)

	# testing
	# test_img_saver = Img_Saver(test_img_saver_dir)

	meta_saved = False

	iter_log = []
	loss1_log = []
	loss2_log = []

	loss3_log = []

	training_images_list = manager.imgs
	# read_dataset(img_dir)
	n_total_train_data = len(training_images_list)

	# testing_images_list = read_dataset(testing_img_dir)
	# n_total_testing_data = len(testing_images_list)

	training = True
	testing = False

	if training == True:
		for num_iter in range(chk_file_num+1, max_iter):
			header("******* {}th iter: *******".format(num_iter))

			idx = random.sample(range(n_total_train_data), 2*batch_size)
			batch_files = idx
			# print batch_files
			[images1, images2] = manager.get_images(indices = idx)
			img1, img2 = images1, images2
			[l1, l2, _, _] = get_reconst_img(img1, img2)

			[loss0, loss1, loss2, loss3, loss4, loss5, latent1, latent2, summary] = train(img1, img2)	

			warn("Total Loss: {}".format(loss0))
			warn("Siam loss: {}".format(loss1))
			warn("kl1_loss: {}".format(loss2))
			warn("kl2_loss: {}".format(loss3))
			warn("reconst_err1: {}".format(loss4))
			warn("reconst_err2: {}".format(loss5))

			# warn("num_iter: {} check: {}".format(num_iter, check_every_n))
			# warn("Total Loss: {}".format(loss6))
			if num_iter % check_every_n == 1:
				header("******* {}th iter: *******".format(num_iter))
				idx = random.sample(range(len(training_images_list)), 2*5)
				[images1, images2] = manager.get_images(indices = idx)
				[reconst1, reconst2, _, _] = get_reconst_img(images1, images2)
				# for i in range(len(latent1[0])):
				# 	print "{} th: {:.2f}".format(i, np.mean(np.abs(latent1[:, i] - latent2[:, i])))
				for img_idx in range(len(images1)):
					sub_dir = "iter_{}".format(num_iter)

					save_img = images1[img_idx].reshape(64, 64)
					save_img = save_img.astype(np.float32)
					img_file_name = "{}_ori.jpg".format(img_idx)				
					validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

					save_img = reconst1[img_idx].reshape(64, 64)
					save_img = save_img.astype(np.float32)
					img_file_name = "{}_rec.jpg".format(img_idx)				
					validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

			if num_iter % loss_check_n == 1:
				train_writer.add_summary(summary, num_iter)

			if num_iter > 11 and num_iter % save_model_freq == 1:
				if meta_saved == True:
					saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = num_iter, write_meta_graph = False)
				else:
					print("Save meta graph")
					saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = num_iter, write_meta_graph = True)
					meta_saved = True
Example #20
0
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts',
                        type=int,
                        default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }
        obs_shape = expert_data['observations'].shape
        actions_shape = expert_data['actions'].shape
        num_examples = obs_shape[0]
        print('observations shape:{}'.format(obs_shape))
        print('actions shape:{}'.format(actions_shape))

        # define placeholders (set first dim to None to signal we want the network to be able to run any number of training examples at once)
        X = tf.placeholder(shape=(None, expert_data['observations'].shape[1]),
                           dtype=tf.float32)
        Y = tf.placeholder(shape=(None, expert_data['actions'].shape[-1]),
                           dtype=tf.float32)

        # define layers
        l1 = tf.layers.dense(X, 60, activation=tf.nn.relu)
        l1 = tf.nn.dropout(l1, 0.3)
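        # note: in TF1, the second positional argument of tf.nn.dropout is keep_prob, so 0.3 keeps only 30% of the activations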
        l2 = tf.layers.dense(l1, 60, activation=tf.nn.relu)
        l2 = tf.nn.dropout(l2, 0.3)
        l3 = tf.layers.dense(l2, 60, activation=tf.nn.relu)
        l3 = tf.nn.dropout(l3, 0.3)
        l4 = tf.layers.dense(l3, 40, activation=tf.nn.relu)
        output = tf.layers.dense(l4, Y.shape[-1], activation=None)

        cost = tf.reduce_mean(
            tf.losses.mean_squared_error(labels=Y, predictions=output))
        optimizer = tf.train.AdamOptimizer(0.01).minimize(cost)

        tf_util.initialize()
        for epoch in range(2000):
            batch_size = args.num_rollouts * 10
            for minibatch in range(int(num_examples / batch_size)):
                minibatch_X = np.reshape(
                    expert_data['observations'][batch_size *
                                                minibatch:batch_size *
                                                (minibatch + 1)],
                    (batch_size, obs_shape[1]))
                minibatch_Y = np.reshape(
                    expert_data['actions'][batch_size * minibatch:batch_size *
                                           (minibatch + 1)],
                    (batch_size, actions_shape[-1]))
                _, val = tf_util.get_session().run([optimizer, cost],
                                                   feed_dict={
                                                       X: minibatch_X,
                                                       Y: minibatch_Y
                                                   })

            if epoch % 100 == 0:
                print("epoch: {}, value: {}".format(epoch, val))

        # Test out our trained network on the same environment and report results
        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = tf_util.get_session().run(output,
                                                   feed_dict={X: obs[None, :]})
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
Example #21
0
def train(*, env_id, num_env, hps, num_timesteps, seed):

    venv = VecFrameStack(
        make_atari_env(env_id,
                       num_env,
                       seed,
                       wrapper_kwargs=dict(),
                       start_index=num_env * MPI.COMM_WORLD.Get_rank(),
                       max_episode_steps=hps.pop('max_episode_steps')),
        hps.pop('frame_stack'))

    # Size of states when stored in the memory.
    only_train_r = hps.pop('only_train_r')

    online_r_training = hps.pop('online_train_r') or only_train_r

    r_network_trainer = None
    save_path = hps.pop('save_path')
    r_network_weights_path = hps.pop('r_path')
    '''
    ec_type = 'none' # hps.pop('ec_type')

    venv = CuriosityEnvWrapperFrameStack(
        make_atari_env(env_id, num_env, seed, wrapper_kwargs=dict(),
                       start_index=num_env * MPI.COMM_WORLD.Get_rank(),
                       max_episode_steps=hps.pop('max_episode_steps')),
        vec_episodic_memory = None,
        observation_embedding_fn = None,
        exploration_reward = ec_type,
        exploration_reward_min_step = 0,
        nstack = hps.pop('frame_stack'),
        only_train_r = only_train_r
        )
    '''

    # venv.score_multiple = {'Mario': 500,
    #                        'MontezumaRevengeNoFrameskip-v4': 100,
    #                        'GravitarNoFrameskip-v4': 250,
    #                        'PrivateEyeNoFrameskip-v4': 500,
    #                        'SolarisNoFrameskip-v4': None,
    #                        'VentureNoFrameskip-v4': 200,
    #                        'PitfallNoFrameskip-v4': 100,
    #                        }[env_id]
    venv.score_multiple = 1
    venv.record_obs = True if env_id == 'SolarisNoFrameskip-v4' else False
    ob_space = venv.observation_space
    ac_space = venv.action_space
    gamma = hps.pop('gamma')

    log_interval = hps.pop('log_interval')

    nminibatches = hps.pop('nminibatches')

    play = hps.pop('play')

    if play:
        nsteps = 1

    rnd_type = hps.pop('rnd_type')
    div_type = hps.pop('div_type')

    num_agents = hps.pop('num_agents')

    load_ram = hps.pop('load_ram')

    debug = hps.pop('debug')

    rnd_mask_prob = hps.pop('rnd_mask_prob')

    rnd_mask_type = hps.pop('rnd_mask_type')

    indep_rnd = hps.pop('indep_rnd')
    logger.info("indep_rnd:", indep_rnd)
    indep_policy = hps.pop('indep_policy')

    sd_type = hps.pop('sd_type')

    from_scratch = hps.pop('from_scratch')

    use_kl = hps.pop('use_kl')

    save_interval = 100

    policy = {'rnn': CnnGruPolicy, 'cnn': CnnPolicy}[hps.pop('policy')]
    agent = PpoAgent(
        scope='ppo',
        ob_space=ob_space,
        ac_space=ac_space,
        stochpol_fn=functools.partial(
            policy,
            scope='pol',
            ob_space=ob_space,
            ac_space=ac_space,
            update_ob_stats_independently_per_gpu=hps.pop(
                'update_ob_stats_independently_per_gpu'),
            proportion_of_exp_used_for_predictor_update=hps.pop(
                'proportion_of_exp_used_for_predictor_update'),
            dynamics_bonus=hps.pop("dynamics_bonus"),
            num_agents=num_agents,
            rnd_type=rnd_type,
            div_type=div_type,
            indep_rnd=indep_rnd,
            indep_policy=indep_policy,
            sd_type=sd_type,
            rnd_mask_prob=rnd_mask_prob),
        gamma=gamma,
        gamma_ext=hps.pop('gamma_ext'),
        gamma_div=hps.pop('gamma_div'),
        lam=hps.pop('lam'),
        nepochs=hps.pop('nepochs'),
        nminibatches=nminibatches,
        lr=hps.pop('lr'),
        cliprange=0.1,
        nsteps=5 if debug else 128,
        ent_coef=0.001,
        max_grad_norm=hps.pop('max_grad_norm'),
        use_news=hps.pop("use_news"),
        comm=MPI.COMM_WORLD if MPI.COMM_WORLD.Get_size() > 1 else None,
        update_ob_stats_every_step=hps.pop('update_ob_stats_every_step'),
        int_coeff=hps.pop('int_coeff'),
        ext_coeff=hps.pop('ext_coeff'),
        log_interval=log_interval,
        only_train_r=only_train_r,
        rnd_type=rnd_type,
        reset=hps.pop('reset'),
        dynamics_sample=hps.pop('dynamics_sample'),
        save_path=save_path,
        num_agents=num_agents,
        div_type=div_type,
        load_ram=load_ram,
        debug=debug,
        rnd_mask_prob=rnd_mask_prob,
        rnd_mask_type=rnd_mask_type,
        sd_type=sd_type,
        from_scratch=from_scratch,
        use_kl=use_kl,
        indep_rnd=indep_rnd)

    load_path = hps.pop('load_path')
    base_load_path = hps.pop('base_load_path')

    agent.start_interaction([venv])
    if load_path is not None:

        if play:
            agent.load(load_path)
        else:
            #agent.load(load_path)
            #agent.load_help_info(0, load_path)
            #agent.load_help_info(1, load_path)

            #load diversity agent
            #base_agent_idx = 1
            #logger.info("load base  agents weights from {}  agent {}".format(base_load_path, str(base_agent_idx)))
            #agent.load_agent(base_agent_idx, base_load_path)
            #agent.clone_baseline_agent(base_agent_idx)
            #agent.load_help_info(0, dagent_load_path)
            #agent.clone_agent(0)

            #load main agen1
            src_agent_idx = 1

            logger.info("load main agent weights from {} agent {}".format(
                load_path, str(src_agent_idx)))
            agent.load_agent(src_agent_idx, load_path)

            if indep_rnd == False:
                rnd_agent_idx = 1
            else:
                rnd_agent_idx = src_agent_idx
            #rnd_agent_idx = 0
            logger.info("load rnd weights from {} agent {}".format(
                load_path, str(rnd_agent_idx)))
            agent.load_rnd(rnd_agent_idx, load_path)
            agent.clone_agent(rnd_agent_idx,
                              rnd=True,
                              policy=False,
                              help_info=False)

            logger.info("load help info from {} agent {}".format(
                load_path, str(src_agent_idx)))
            agent.load_help_info(src_agent_idx, load_path)

            agent.clone_agent(src_agent_idx,
                              rnd=False,
                              policy=True,
                              help_info=True)

            #logger.info("load main agent weights from {} agent {}".format(load_path, str(2)))

            #load_path = '/data/xupeifrom7700_1000/seed1_log0.5_clip-0.5~0.5_3agent_hasint4_2divrew_-1~1/models'
            #agent.load_agent(1, load_path)

            #agent.clone_baseline_agent()
            #if sd_type =='sd':
            #    agent.load_sd("save_dir/models_sd_trained")

        #agent.initialize_discriminator()

    update_ob_stats_from_random_agent = hps.pop(
        'update_ob_stats_from_random_agent')
    if play == False:

        if load_path is not None:
            pass  #agent.collect_statistics_from_model()
        else:
            if update_ob_stats_from_random_agent and rnd_type == 'rnd':
                agent.collect_random_statistics(num_timesteps=128 *
                                                5 if debug else 128 * 50)
        assert len(hps) == 0, "Unused hyperparameters: %s" % list(hps.keys())

        #agent.collect_rnd_info(128*50)
        '''
        if sd_type=='sd':
            agent.train_sd(max_nepoch=300, max_neps=5)
            path = '{}_sd_trained'.format(save_path)
            logger.log("save model:",path)
            agent.save(path)

            return
            #agent.update_diverse_agent(max_nepoch=1000)
            #path = '{}_divupdated'.format(save_path)
            #logger.log("save model:",path)
            #agent.save(path)
        '''
        counter = 0
        while True:
            info = agent.step()

            n_updates = agent.I.stats["n_updates"]
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
                counter += 1

            if info['update'] and save_path is not None and (
                    n_updates % save_interval == 0 or n_updates == 1):
                path = '{}_{}'.format(save_path, str(n_updates))
                logger.log("save model:", path)
                agent.save(path)
                agent.save_help_info(save_path, n_updates)

            if agent.I.stats['tcount'] > num_timesteps:
                path = '{}_{}'.format(save_path, str(n_updates))
                logger.log("save model:", path)
                agent.save(path)
                agent.save_help_info(save_path, n_updates)
                break
        agent.stop_interaction()
    else:
        '''
        check_point_rews_list_path ='{}_rewslist'.format(load_path)
        check_point_rnd_path ='{}_rnd'.format(load_path)
        oracle_rnd = oracle.OracleExplorationRewardForAllEpisodes()
        oracle_rnd.load(check_point_rnd_path)
        #print(oracle_rnd._collected_positions_writer)
        #print(oracle_rnd._collected_positions_reader)

        rews_list = load_rews_list(check_point_rews_list_path)
        print(rews_list)
        '''

        istate = agent.stochpol.initial_state(1)
        #ph_mean, ph_std = agent.stochpol.get_ph_mean_std()

        last_obs, prevrews, ec_rews, news, infos, ram_states, _ = agent.env_get(
            0)
        agent.I.step_count += 1

        flag = False
        show_cam = True

        last_xr = 0

        restore = None
        '''
        #path = 'ram_state_500_7room'
        #path='ram_state_400_6room'
        #path='ram_state_6700' 
        path='ram_state_7700_10room'
        f = open(path,'rb')
        restore = pickle.load(f)
        f.close()
        last_obs[0] = agent.I.venvs[0].restore_full_state_by_idx(restore,0)
        print(last_obs.shape)

        #path = 'ram_state_400_monitor_rews_6room'
        #path = 'ram_state_500_monitor_rews_7room'
        #path='ram_state_6700_monitor_rews'
        path='ram_state_7700_monitor_rews_10room'
        f = open(path,'rb')
        monitor_rews = pickle.load(f)
        f.close()
        
        agent.I.venvs[0].set_cur_monitor_rewards_by_idx(monitor_rews,0)
        '''

        agent_idx = np.asarray([0])
        sample_agent_prob = np.asarray([0.5])

        ph_mean = agent.stochpol.ob_rms_list[0].mean
        ph_std = agent.stochpol.ob_rms_list[0].var**0.5

        buf_ph_mean = np.zeros(
            ([1, 1] + list(agent.stochpol.ob_space.shape[:2]) + [1]),
            np.float32)
        buf_ph_std = np.zeros(
            ([1, 1] + list(agent.stochpol.ob_space.shape[:2]) + [1]),
            np.float32)

        buf_ph_mean[0, 0] = ph_mean
        buf_ph_std[0, 0] = ph_std

        vpreds_ext_list = []

        ep_rews = np.zeros((1))
        divexp_flag = False
        step_count = 0
        stage_prob = True

        last_rew_ob = np.full_like(last_obs, 128)

        clusters = Clusters(1.0)

        #path = '{}_sd_rms'.format(load_path)
        #agent.I.sd_rms.load(path)

        while True:

            dict_obs = agent.stochpol.ensure_observation_is_dict(last_obs)

            #acs= np.random.randint(low=0, high=15, size=(1))
            acs, vpreds_int, vpreds_ext, nlps, istate, ent = agent.stochpol.call(
                dict_obs, news, istate, agent_idx[:, None])

            step_acs = acs
            t = ''
            #if show_cam==True:
            t = input("input:")
            if t != '':
                t = int(t)
                if t <= 17:
                    step_acs = [t]

            agent.env_step(0, step_acs)

            obs, prevrews, ec_rews, news, infos, ram_states, monitor_rews = agent.env_get(
                0)

            if news[0] and restore is not None:
                obs[0] = agent.I.venvs[0].restore_full_state_by_idx(restore, 0)
                agent.I.venvs[0].set_cur_monitor_rewards_by_idx(
                    monitor_rews, 0)

            ep_rews = ep_rews + prevrews

            print(ep_rews)

            last_rew_ob[prevrews > 0] = obs[prevrews > 0]

            room = infos[0]['position'][2]
            vpreds_ext_list.append([vpreds_ext, room])
            #print(monitor_rews[0])
            #print(len(monitor_rews[0]))
            #print(infos[0]['open_door_type'])

            stack_obs = np.concatenate([last_obs[:, None], obs[:, None]], 1)

            fd = {}

            fd[agent.stochpol.ph_ob[None]] = stack_obs

            fd.update({
                agent.stochpol.sep_ph_mean: buf_ph_mean,
                agent.stochpol.sep_ph_std: buf_ph_std
            })
            fd[agent.stochpol.ph_agent_idx] = agent_idx[:, None]
            fd[agent.stochpol.sample_agent_prob] = sample_agent_prob[:, None]

            fd[agent.stochpol.last_rew_ob] = last_rew_ob[:, None]
            fd[agent.stochpol.game_score] = ep_rews[:, None]

            fd[agent.stochpol.sd_ph_mean] = agent.I.sd_rms.mean
            fd[agent.stochpol.sd_ph_std] = agent.I.sd_rms.var**0.5

            div_prob = 0

            all_div_prob = tf_util.get_session().run(
                [agent.stochpol.all_div_prob], fd)
            '''
            if prevrews[0] > 0:
                clusters.update(rnd_em,room)
    
                num_clusters = len(clusters._cluster_list)
                for i in range(num_clusters):
                    print("{} {}".format(str(i),list(clusters._room_set[i])))
            '''
            print("vpreds_int: ", vpreds_int, "vpreds_ext:", vpreds_ext,
                  "ent:", ent, "all_div_prob:", all_div_prob, "room:", room,
                  "step_count:", step_count)

            #aaaa = np.asarray(vpreds_ext_list)
            #print(aaaa[-100:])
            '''
            if step_acs[0]==0:
                ram_state = ram_states[0]
                path='ram_state_7700_10room'
                f = open(path,'wb')
                pickle.dump(ram_state,f)
                f.close()

                path='ram_state_7700_monitor_rews_10room'
                f = open(path,'wb')
                pickle.dump(monitor_rews[0],f)
                f.close()
            '''
            '''
            if  restore is None:
                restore = ram_states[0]

            
            if np.random.rand() < 0.1:
                print("restore")
                obs = agent.I.venvs[0].restore_full_state_by_idx(restore,0)
                prevrews = None
                ec_rews = None
                news= True
                infos = {}
                ram_states = ram_states[0]

                #restore = ram_states[0]
            '''

            img = agent.I.venvs[0].render()

            last_obs = obs

            step_count = step_count + 1

            time.sleep(0.04)
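The interactive loop above resumes play from a pickled emulator RAM state whenever an episode ends, and the commented blocks show how such a state is captured in the first place. A minimal sketch of that save/restore round trip, assuming a venv exposing restore_full_state_by_idx and set_cur_monitor_rewards_by_idx as used above; the file paths are placeholders:

import pickle

def save_state(path, ram_state, monitor_rews):
    # Persist the emulator RAM state and the monitor's reward history.
    with open(path + '_ram', 'wb') as f:
        pickle.dump(ram_state, f)
    with open(path + '_monitor_rews', 'wb') as f:
        pickle.dump(monitor_rews, f)

def restore_state(path, venv, env_idx=0):
    # Reload both pickles and push them back into environment env_idx.
    with open(path + '_ram', 'rb') as f:
        ram_state = pickle.load(f)
    with open(path + '_monitor_rews', 'rb') as f:
        monitor_rews = pickle.load(f)
    obs = venv.restore_full_state_by_idx(ram_state, env_idx)
    venv.set_cur_monitor_rewards_by_idx(monitor_rews, env_idx)
    return obs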
Exemple #22
0
def main():
    import argparse
    #Parse Terminal Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('run_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('Loading and building regular policy')
    
    with open('rollouts/'+args.envname+'-'+str(args.num_rollouts)+'-expert.pkl', 'rb') as f:
        data = pickle.load(f)
        n_in, n_out = data['observations'].shape[1], data['actions'].shape[2]
    
    x, _ = pol.placeholder_inputs(None, n_in, n_out, pol.batch_size)
    policy_fn = pol.inference(x, n_in, n_out, pol.n_h1, pol.n_h2, pol.n_h3)
    saver = tf.train.Saver()
    print('Loaded and Built')

    with tf.Session():
        tf_util.initialize()
        saver.restore(tf_util.get_session(), "trained/"+args.envname)

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0
            steps = 0
            while not done:
                action = np.array(tf_util.get_session().run(
                    [policy_fn], feed_dict={x: obs[None, :]}))
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:  # stop at max_steps even without --render
                    break
            returns.append(totalr)

        print('Returns', returns)
        print('Mean return', np.mean(returns))
        print('Std. of return', np.std(returns))

        reg_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}
        # save regular policy observations
        with open('rollouts/'+args.envname+'-regular.pkl', 'wb+') as f:
            pickle.dump(reg_data, f)
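A minimal sketch, assuming the rollouts/<envname>-regular.pkl file written by main() above, of reloading the saved rollouts and checking their shapes (the keys match the dump at the end of the script):

import pickle
import numpy as np

def load_rollouts(envname):
    # Reload the observations/actions pickled at the end of main().
    with open('rollouts/' + envname + '-regular.pkl', 'rb') as f:
        data = pickle.load(f)
    obs = np.asarray(data['observations'])
    acts = np.asarray(data['actions'])
    print('observations:', obs.shape, 'actions:', acts.shape)
    return obs, acts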
Exemple #23
0
def mgpu_train_net(models, num_gpus, mode, img_dir, dataset, chkfile_name, logfile_name, validatefile_name, entangled_feat, max_epoch = 300, check_every_n = 500, loss_check_n = 10, save_model_freq = 5, batch_size = 512, lr = 0.001):
    img1 = U.get_placeholder_cached(name="img1")
    img2 = U.get_placeholder_cached(name="img2")

    feat_cls = U.get_placeholder_cached(name="feat_cls")

    # batch size must be a multiple of ntowers (# of GPUs)
    ntowers = len(models)
    # Note: in graph mode these assert ops only take effect if they are actually
    # executed (e.g. via tf.control_dependencies); as written they are only built.
    tf.assert_equal(tf.shape(img1)[0], tf.shape(img2)[0])
    tf.assert_equal(tf.floormod(tf.shape(img1)[0], ntowers), 0)

    img1splits = tf.split(img1, ntowers, 0)
    img2splits = tf.split(img2, ntowers, 0)

    tower_vae_loss = []
    tower_latent_z1_tp = []
    tower_latent_z2_tp = []
    tower_losses = []
    tower_siam_max = []
    tower_reconst1 = []
    tower_reconst2 = []
    tower_cls_loss = []
    for gid, model in enumerate(models):
        with tf.name_scope('gpu%d' % gid) as scope:
            with tf.device('/gpu:%d' % gid):

                vae_loss = U.mean(model.vaeloss)
                latent_z1_tp = model.latent_z1
                latent_z2_tp = model.latent_z2
                losses = [U.mean(model.vaeloss),
                          U.mean(model.siam_loss),
                          U.mean(model.kl_loss1),
                          U.mean(model.kl_loss2),
                          U.mean(model.reconst_error1),
                          U.mean(model.reconst_error2),
                          ]
                siam_max = U.mean(model.max_siam_loss)
                cls_loss = U.mean(model.cls_loss)

                tower_vae_loss.append(vae_loss)
                tower_latent_z1_tp.append(latent_z1_tp)
                tower_latent_z2_tp.append(latent_z2_tp)
                tower_losses.append(losses)
                tower_siam_max.append(siam_max)
                tower_reconst1.append(model.reconst1)
                tower_reconst2.append(model.reconst2)
                tower_cls_loss.append(cls_loss)

                tf.summary.scalar('Total Loss', losses[0])
                tf.summary.scalar('Siam Loss', losses[1])
                tf.summary.scalar('kl1_loss', losses[2])
                tf.summary.scalar('kl2_loss', losses[3])
                tf.summary.scalar('reconst_err1', losses[4])
                tf.summary.scalar('reconst_err2', losses[5])
                tf.summary.scalar('Siam Max', siam_max)

    vae_loss = U.mean(tower_vae_loss)
    siam_max = U.mean(tower_siam_max)
    latent_z1_tp = tf.concat(tower_latent_z1_tp, 0)
    latent_z2_tp = tf.concat(tower_latent_z2_tp, 0)
    model_reconst1 = tf.concat(tower_reconst1, 0)
    model_reconst2 = tf.concat(tower_reconst2, 0)
    cls_loss = U.mean(tower_cls_loss)

    # one empty accumulator per loss term (length taken from the last tower's losses)
    losses = [[] for _ in range(len(losses))]
    for tl in tower_losses:
        for i, l in enumerate(tl):
            losses[i].append(l)

    losses = [U.mean(l) for l in losses]
    siam_normal = losses[1] / entangled_feat

    tf.summary.scalar('total/Total Loss', losses[0])
    tf.summary.scalar('total/Siam Loss', losses[1])
    tf.summary.scalar('total/kl1_loss', losses[2])
    tf.summary.scalar('total/kl2_loss', losses[3])
    tf.summary.scalar('total/reconst_err1', losses[4])
    tf.summary.scalar('total/reconst_err2', losses[5])
    tf.summary.scalar('total/Siam Normal', siam_normal)
    tf.summary.scalar('total/Siam Max', siam_max)

    compute_losses = U.function([img1, img2], vae_loss)

    all_var_list = model.get_trainable_variables()
    vae_var_list = [v for v in all_var_list if v.name.split("/")[2].startswith("vae")]
    cls_var_list = [v for v in all_var_list if v.name.split("/")[2].startswith("cls")]

    warn("{}".format(all_var_list))
    warn("==========================")
    warn("{}".format(vae_var_list))
    # warn("==========================")
    # warn("{}".format(cls_var_list))

    # with tf.device('/cpu:0'):
    optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon = 0.01/batch_size)
    optimize_expr1 = optimizer.minimize(vae_loss, var_list=vae_var_list)

    feat_cls_optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
    optimize_expr2 = feat_cls_optimizer.minimize(cls_loss, var_list=cls_var_list)


    merged = tf.summary.merge_all()
    train = U.function([img1, img2],
                        [losses[0], losses[1], losses[2], losses[3], losses[4], losses[5], latent_z1_tp, latent_z2_tp, merged], updates = [optimize_expr1])


    get_reconst_img = U.function([img1, img2], [model_reconst1, model_reconst2, latent_z1_tp, latent_z2_tp])
    get_latent_var = U.function([img1, img2], [latent_z1_tp, latent_z2_tp])

    cur_dir = get_cur_dir()
    chk_save_dir = os.path.join(cur_dir, chkfile_name)
    log_save_dir = os.path.join(cur_dir, logfile_name)
    validate_img_saver_dir = os.path.join(cur_dir, validatefile_name)
    if dataset == 'chairs' or dataset == 'celeba':
        test_img_saver_dir = os.path.join(cur_dir, "test_images")
        testing_img_dir = os.path.join(cur_dir, "dataset/{}/test_img".format(dataset))

    train_writer = U.summary_writer(dir = log_save_dir)

    U.initialize()

    saver, chk_file_epoch_num = U.load_checkpoints(load_requested = True, checkpoint_dir = chk_save_dir)
    if dataset == 'chairs' or dataset == 'celeba':
        validate_img_saver = Img_Saver(Img_dir = validate_img_saver_dir)
    elif dataset == 'dsprites':
        validate_img_saver = BW_Img_Saver(Img_dir = validate_img_saver_dir) # Black and White, temporary usage
    else:
        warn("Unknown dataset Error")
        # break

    warn("dataset: {}".format(dataset))
    if dataset == 'chairs' or dataset == 'celeba':
        training_images_list = read_dataset(img_dir)
        n_total_train_data = len(training_images_list)
        testing_images_list = read_dataset(testing_img_dir)
        n_total_testing_data = len(testing_images_list)
    elif dataset == 'dsprites':
        cur_dir = osp.join(cur_dir, 'dataset')
        cur_dir = osp.join(cur_dir, 'dsprites')
        img_dir = osp.join(cur_dir, 'dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz')
        manager = DataManager(img_dir, batch_size)
    else:
        warn("Unknown dataset Error")
        # break

    meta_saved = False

    if mode == 'train':
        for epoch_idx in range(chk_file_epoch_num+1, max_epoch):
            t_epoch_start = time.time()
            # NOTE: manager only exists on the dsprites branch above; the
            # chairs/celeba path would need its own batch count here.
            num_batch = manager.get_len()

            for batch_idx in range(num_batch):
                if dataset == 'chairs' or dataset == 'celeba':
                    idx = random.sample(range(n_total_train_data), 2*batch_size)
                    batch_files = [training_images_list[i] for i in idx]
                    [images1, images2] = load_image(dir_name = img_dir, img_names = batch_files)
                elif dataset == 'dsprites':
                    [images1, images2] = manager.get_next()
                img1, img2 = images1, images2
                [l1, l2, _, _] = get_reconst_img(img1, img2)

                [loss0, loss1, loss2, loss3, loss4, loss5, latent1, latent2, summary] = train(img1, img2)

                if batch_idx % 50 == 1:
                    header("******* epoch: {}/{} batch: {}/{} *******".format(epoch_idx, max_epoch, batch_idx, num_batch))
                    warn("Total Loss: {}".format(loss0))
                    warn("Siam loss: {}".format(loss1))
                    warn("kl1_loss: {}".format(loss2))
                    warn("kl2_loss: {}".format(loss3))
                    warn("reconst_err1: {}".format(loss4))
                    warn("reconst_err2: {}".format(loss5))

                if batch_idx % check_every_n == 1:
                    if dataset == 'chairs' or dataset == 'celeba':
                        idx = random.sample(range(len(training_images_list)), 2*5)
                        validate_batch_files = [training_images_list[i] for i in idx]
                        [images1, images2] = load_image(dir_name = img_dir, img_names = validate_batch_files)
                    elif dataset == 'dsprites':
                        [images1, images2] = manager.get_next()

                    [reconst1, reconst2, _, _] = get_reconst_img(images1, images2)

                    if dataset == 'chairs':
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}_{}".format(epoch_idx, batch_idx)

                            save_img = np.squeeze(images1[img_idx])
                            save_img = Image.fromarray(save_img)
                            img_file_name = "{}_ori.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = Image.fromarray(save_img)
                            img_file_name = "{}_rec.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                    elif dataset == 'celeba':
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}_{}".format(epoch_idx, batch_idx)

                            save_img = np.squeeze(images1[img_idx])
                            save_img = Image.fromarray(save_img, 'RGB')
                            img_file_name = "{}_ori.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = Image.fromarray(save_img, 'RGB')
                            img_file_name = "{}_rec.png".format(validate_batch_files[img_idx].split('.')[0])
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)
                    elif dataset == 'dsprites':
                        for img_idx in range(len(images1)):
                            sub_dir = "iter_{}_{}".format(epoch_idx, batch_idx)

                            # save_img = images1[img_idx].reshape(64, 64)
                            save_img = np.squeeze(images1[img_idx])
                            save_img = save_img.astype(np.float32)
                            img_file_name = "{}_ori.jpg".format(img_idx)
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

                            # save_img = reconst1[img_idx].reshape(64, 64)
                            save_img = np.squeeze(reconst1[img_idx])
                            save_img = save_img.astype(np.float32)
                            img_file_name = "{}_rec.jpg".format(img_idx)
                            validate_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

                if batch_idx % loss_check_n == 1:
                    train_writer.add_summary(summary, batch_idx)

            t_epoch_end = time.time()
            t_epoch_run = t_epoch_end - t_epoch_start
            if dataset == 'dsprites':
                t_check = manager.sample_size / t_epoch_run

                warn("==========================================")
                warn("Run {} th epoch in {} sec: {} images / sec".format(epoch_idx+1, t_epoch_run, t_check))
                warn("==========================================")


            if meta_saved == True:
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = epoch_idx, write_meta_graph = False)
            else:
                print "Save  meta graph"
                saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = epoch_idx, write_meta_graph = True)
                meta_saved = True
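mgpu_train_net above splits each input batch into one slice per GPU tower, builds the losses on each device, and then averages the per-tower values. A minimal sketch of that split-and-average pattern, assuming a hypothetical build_loss(x) graph-building function:

import tensorflow as tf

def build_towers(x, ntowers, build_loss):
    # Split the batch across GPUs, build one loss per tower, then average them.
    splits = tf.split(x, ntowers, axis=0)
    tower_losses = []
    for gid, xs in enumerate(splits):
        with tf.name_scope('gpu%d' % gid), tf.device('/gpu:%d' % gid):
            tower_losses.append(build_loss(xs))
    return tf.reduce_mean(tf.stack(tower_losses))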
Exemple #24
0
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts',
                        type=int,
                        default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')

    with open('dagger_database/' + args.envname, 'rb') as f:
        data = pickle.load(f)
        tempx = data['observations']
        temp = tempx.shape
        nin = temp[1]
        tempy = data['actions']
        temp = tempy.shape
        nout = temp[2]

    policy_expert = load_policy.load_policy(args.expert_policy_file)
    x, y = imit.placeholder_inputs(None, nin, nout, par.batch_size)
    policy_fn = imit.inference(x, nin, nout, par.n_h1, par.n_h2, par.n_h3)
    saver = tf.train.Saver()
    print('loaded and built')

    #init = tf.global_variables_initializer()
    with tf.Session():
        #tf_util.get_session().run(init)
        tf_util.initialize()
        saver.restore(tf_util.get_session(), "trainedNN/" + args.envname)

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        actions_expert = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action_expert = policy_expert(obs[None, :])
                action = tf_util.get_session().run([policy_fn],
                                                   feed_dict={x: obs[None, :]})
                observations.append(obs)
                actions.append(action)
                actions_expert.append(action_expert)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:  # stop at max_steps even without --render
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
        '''
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}
        '''
        expert_data = {
            'observations': np.concatenate((tempx, np.array(observations))),
            'actions': np.concatenate((tempy, np.array(actions_expert)))
        }
        # save expert policy observations
        with open('dagger_database/' + args.envname, 'wb') as f:
            pickle.dump(expert_data, f)
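The DAgger-style script above queries the expert on every observation visited by the learned policy and merges those expert labels back into the stored database. A minimal sketch of that aggregation step, assuming the same pickle layout ('observations', 'actions') used by the script:

import pickle
import numpy as np

def aggregate_dagger_data(path, new_obs, expert_actions):
    # Append expert-labeled observations to the existing DAgger database.
    with open(path, 'rb') as f:
        data = pickle.load(f)
    merged = {
        'observations': np.concatenate((data['observations'], np.array(new_obs))),
        'actions': np.concatenate((data['actions'], np.array(expert_actions))),
    }
    with open(path, 'wb') as f:
        pickle.dump(merged, f)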
Exemple #25
0
def train_net(model, img_dir, max_iter = 100000, check_every_n = 20, save_model_freq = 1000, batch_size = 128):
	img1 = U.get_placeholder_cached(name="img1")
	img2 = U.get_placeholder_cached(name="img2")

	mean_loss1 = U.mean(model.match_error)
	mean_loss2 = U.mean(model.reconst_error1)
	mean_loss3 = U.mean(model.reconst_error2)

	decoded_img = [model.reconst1, model.reconst2]

	weight_loss = [1, 1, 1]

	compute_losses = U.function([img1, img2], [mean_loss1, mean_loss2, mean_loss3])
	lr = 0.00001
	optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon = 0.01/batch_size)

	all_var_list = model.get_trainable_variables()

	img1_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("proj1") or v.name.split("/")[1].startswith("unproj1")]
	img2_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("proj2") or v.name.split("/")[1].startswith("unproj2")]


	img1_loss = mean_loss1 + mean_loss2
	img2_loss = mean_loss1 + mean_loss3

	optimize_expr1 = optimizer.minimize(img1_loss, var_list=img1_var_list)
	optimize_expr2 = optimizer.minimize(img2_loss, var_list=img2_var_list)

	img1_train = U.function([img1, img2], [mean_loss1, mean_loss2, mean_loss3], updates = [optimize_expr1])
	img2_train = U.function([img1, img2], [mean_loss1, mean_loss2, mean_loss3], updates = [optimize_expr2])

	get_reconst_img = U.function([img1, img2], decoded_img)

	U.initialize()

	name = "test"
	cur_dir = get_cur_dir()
	chk_save_dir = os.path.join(cur_dir, "chkfiles")
	log_save_dir = os.path.join(cur_dir, "log")
	test_img_saver_dir = os.path.join(cur_dir, "test_images")

	saver, chk_file_num = U.load_checkpoints(load_requested = True, checkpoint_dir = chk_save_dir)
	test_img_saver = Img_Saver(test_img_saver_dir)

	meta_saved = False

	iter_log = []
	loss1_log = []
	loss2_log = []
	loss3_log = []

	training_images_list = read_dataset(img_dir)

	for num_iter in range(chk_file_num+1, max_iter):
		header("******* {}th iter: Img {} side *******".format(num_iter, num_iter%2 + 1))

		idx = random.sample(range(len(training_images_list)), batch_size)
		batch_files = [training_images_list[i] for i in idx]
		[images1, images2] = load_image(dir_name = img_dir, img_names = batch_files)
		img1, img2 = images1, images2
		# args = images1, images2
		if num_iter%2 == 0:
			[loss1, loss2, loss3] = img1_train(img1, img2)
		elif num_iter%2 == 1:
			[loss1, loss2, loss3] = img2_train(img1, img2)		
		warn("match_error: {}".format(loss1))
		warn("reconst_err1: {}".format(loss2))
		warn("reconst_err2: {}".format(loss3))
		warn("num_iter: {} check: {}".format(num_iter, check_every_n))
		if num_iter % check_every_n == 1:
			idx = random.sample(range(len(training_images_list)), 10)
			test_batch_files = [training_images_list[i] for i in idx]
			[images1, images2] = load_image(dir_name = img_dir, img_names = test_batch_files)
			[reconst1, reconst2] = get_reconst_img(images1, images2)
			for img_idx in range(len(images1)):
				sub_dir = "iter_{}".format(num_iter)

				save_img = np.squeeze(images1[img_idx])
				save_img = Image.fromarray(save_img)
				img_file_name = "{}_ori_2d.jpg".format(test_batch_files[img_idx])				
				test_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

				save_img = np.squeeze(images2[img_idx])
				save_img = Image.fromarray(save_img)
				img_file_name = "{}_ori_3d.jpg".format(test_batch_files[img_idx])				
				test_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

				save_img = np.squeeze(reconst1[img_idx])
				save_img = Image.fromarray(save_img)
				img_file_name = "{}_rec_2d.jpg".format(test_batch_files[img_idx])				
				test_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

				save_img = np.squeeze(reconst2[img_idx])
				save_img = Image.fromarray(save_img)
				img_file_name = "{}_rec_3d.jpg".format(test_batch_files[img_idx])				
				test_img_saver.save(save_img, img_file_name, sub_dir = sub_dir)

		if num_iter > 11 and num_iter % save_model_freq == 1:
			if meta_saved == True:
				saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = num_iter, write_meta_graph = False)
			else:
				print "Save  meta graph"
				saver.save(U.get_session(), chk_save_dir + '/' + 'checkpoint', global_step = num_iter, write_meta_graph = True)
				meta_saved = True
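Both training loops above write the TensorFlow meta graph only on the first checkpoint and skip it afterwards, since the graph itself does not change between saves. A minimal sketch of that pattern, assuming U.get_session() and a tf.train.Saver as in the examples; the helper name is hypothetical:

def save_checkpoint(saver, chk_save_dir, step, meta_saved):
    # Write the (large) meta graph only on the first save; later saves store weights only.
    saver.save(U.get_session(), chk_save_dir + '/checkpoint',
               global_step=step, write_meta_graph=not meta_saved)
    return True  # caller passes this back in as meta_saved on the next save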