Code Example #1
def restore_cpc(cpc, epoch, test_data, batch_size, n, k=1, folder=''):
    test_loss = 0
    test_loader = prep_data(test_data, batch_size, k, n)
    batch_num = test_loader[0].shape[0]
    batch_num = 1  # override: only restore and evaluate a single batch
    idx = 0
    obs, actions, obs_pos = test_loader[0][idx], test_loader[1][
        idx], test_loader[2][idx]
    obs_neg = get_neg_samples(obs, idx * batch_size, (idx + 1) * batch_size,
                              test_loader[0], n, cpc.type)
    obs, actions, obs_pos = np.concatenate(obs), np.concatenate(
        actions), np.concatenate(obs_pos)

    obs = obs[:real_batch_size]
    actions = actions[:real_batch_size]
    obs_pos = obs_pos[:real_batch_size]
    obs_neg = obs_neg[:real_batch_size]
    obs_neg = obs_neg.reshape((-1, *obs_neg.shape[-2:]))

    test_loss = cpc.test(obs, obs_pos, actions, obs_neg)
    logger.logkv("cpc restored loss", test_loss)

    with open(folder + 'data.pickle', 'wb') as pickle_file:
        pickle.dump([obs, obs_pos, actions, obs_neg], pickle_file)
Code Example #2
def test_decoder(decoder,
                 encoder,
                 epoch,
                 test_data,
                 batch_size,
                 include_action,
                 n,
                 k=1):
    global real_batch_size
    test_loader = prep_data(test_data, batch_size, k, n, decode=True)
    total_loss = 0
    batch_num = test_loader[0].shape[0]
    for idx in range(batch_num):
        obs = test_loader[0][idx]
        obs = np.concatenate(
            obs)  #real_batch_size x fixed_num_of_contact x contact_dim
        obs = obs[:real_batch_size]
        obs = torch.from_numpy(obs)
        obs = obs.cuda()  # b x 9 * contact_dim
        recon = decoder(encoder(obs))
        object_info = test_loader[3][idx]
        object_info = torch.from_numpy(
            np.concatenate(object_info)[:real_batch_size]).cuda()
        loss = ((object_info - recon)**2).mean()
        total_loss += loss.item()
    logger.logkv("decoder testing loss", total_loss)
    return total_loss / batch_num
Code Example #3
 def test(self, input_data, position, log_info = '', print_msg = False):
     feed_dict = {self.input: input_data,
                  self.positions: position}
     position_loss = self.sess.run([self.position_loss], feed_dict=feed_dict)[0]
     logger.logkv(log_info + 'test_position_loss', position_loss)
     if print_msg:
         print(log_info + 'test_position_loss', position_loss)
Code Example #4
 def restore_predict(self, inputs, labels):
     prediction, test_loss = self.sess.run([self.output_ph,
                                            self.loss_ph],
                                           feed_dict={self.input_ph: inputs,
                                                      self.label_ph: labels})
     logger.logkv('test_loss', test_loss)
     return prediction, test_loss
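
Note: every example on this page reports metrics through the same baselines-style logging API: logger.logkv queues key/value pairs for the current row and logger.dumpkvs flushes them to the configured outputs. A minimal, illustrative sketch of that pattern (the import path is an assumption; the snippets above only reference logger directly):

from tactile_baselines import logger  # assumed import; use whatever module the project exposes as `logger`

logger.configure(dir='./data/demo', format_strs=['stdout', 'log', 'csv'])
for i in range(3):
    logger.logkv('iter', i)                  # queue key/value pairs for this row
    logger.logkv('test_loss', 1.0 / (i + 1))
    logger.dumpkvs()                         # write one row to stdout/log/csv and clear the queue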
Code Example #5
def test_cpc(cpc, epoch, test_data, batch_size, n, k=1):
    start = time.time()

    test_loss = 0
    test_loader = prep_data(test_data, batch_size, k, n)
    batch_num = test_loader[0].shape[0]
    batch_num = 20  # override: only evaluate the first 20 batches
    for idx in range(batch_num):
        obs, actions, obs_pos = test_loader[0][idx], test_loader[1][
            idx], test_loader[2][idx]
        obs_neg = get_neg_samples(obs, idx * batch_size,
                                  (idx + 1) * batch_size, test_loader[0], n,
                                  cpc.type)  # n x 9 * contact_dim
        obs, actions, obs_pos = np.concatenate(obs), np.concatenate(
            actions), np.concatenate(obs_pos)  # b x 9 * contact_dim

        obs_neg = obs_neg[:real_batch_size]  # n x fixed_num_of_contact * contact_dim
        obs = obs[:real_batch_size]
        actions = actions[:real_batch_size]
        obs_pos = obs_pos[:real_batch_size]
        obs_neg = obs_neg.reshape((-1, *obs_neg.shape[-2:]))

        loss = cpc.test_encoder(obs, obs_pos, actions, obs_neg)
        test_loss += loss

    test_loss /= batch_num
    logger.logkv("cpc testing loss", test_loss)
    logger.logkv("cpc testing time", time.time() - start)
Code Example #6
def train_cpc(cpc, epoch, train_data, batch_size, n, k=1):
    """predict the next k steps. """
    start = time.time()
    train_losses = []
    train_loader = prep_data(train_data, batch_size, k, n)
    batch_num = train_loader[0].shape[0]
    batch_num = 100  # override: only train on the first 100 batches per call
    for idx in range(batch_num):
        obs, actions, obs_pos = train_loader[0][idx], train_loader[1][
            idx], train_loader[2][idx]
        obs_neg = get_neg_samples(obs, idx * batch_size,
                                  (idx + 1) * batch_size, train_loader[0], n,
                                  cpc.type)
        obs, actions, obs_pos = np.concatenate(obs), np.concatenate(
            actions), np.concatenate(
                obs_pos)  # b x fixed_num_of_contact * contact_dim
        obs_neg = obs_neg[:real_batch_size]  # b x n x fixed_num_of_contact * contact_dim
        obs = obs[:real_batch_size]
        actions = actions[:real_batch_size]
        obs_pos = obs_pos[:real_batch_size]
        obs_neg = obs_neg.reshape((-1, *obs_neg.shape[-2:]))
        loss = cpc.train_encoder(obs, obs_pos, actions, obs_neg)
        train_losses.append(loss)

    losses = np.mean(train_losses[-50:])
    logger.logkv("cpc training loss", losses)
    logger.logkv("cpc training time", time.time() - start)
Code Example #7
 def log_diagnostics(self, paths, prefix=''):
     """
     Log extra information per iteration based on the collected paths
     """
     log_stds = np.vstack(
         [path["agent_infos"]["log_std"] for path in paths])
     logger.logkv(prefix + 'AveragePolicyStd', np.mean(np.exp(log_stds)))
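
The method above expects each path to carry per-step policy information under "agent_infos". A small illustrative sketch of that structure and of the logged quantity, using made-up numbers (real paths come from a sampler such as the one in Code Example #19):

import numpy as np

# two fake paths, each with per-step log-stds for a 3-dimensional action space
paths = [
    {"agent_infos": {"log_std": np.full((5, 3), -0.5)}},
    {"agent_infos": {"log_std": np.full((7, 3), -1.0)}},
]
log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])  # shape (12, 3)
print(np.mean(np.exp(log_stds)))  # the value logged as 'AveragePolicyStd'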
Code Example #8
    def train(self, input_data, position):
        feed_dict = {self.input: input_data,
                     self.positions: position}

        for _ in range(20):
            position_loss, _ = self.sess.run([self.position_loss,
                                             self.pos_op], feed_dict=feed_dict)
        # predictions = self.sess.run([self.predicted_pos], feed_dict=feed_dict)[0]
        logger.logkv('train_position_loss', position_loss)
Code Example #9
 def test(self, input_data, position, rot_matrix):
     feed_dict = {
         self.input: input_data,
         self.rotations: rot_matrix,  #batch*3*3
         self.positions: position
     }
     position_loss, rotation_loss = self.sess.run(
         [self.position_cls_loss, self.rotation_cls_loss],
         feed_dict=feed_dict)
     logger.logkv('test_position_loss', position_loss)
     logger.logkv('test_rotation_loss', rotation_loss)
Code Example #10
def test_cpc(encoder,
             trans,
             epoch,
             test_data,
             batch_size,
             n,
             k=1,
             include_action=True):
    global real_batch_size
    start = time.time()
    encoder.eval()
    trans.eval()

    test_loss = 0
    test_loader = prep_data(test_data, batch_size, k, n)
    batch_num = test_loader[0].shape[0]
    batch_num = 20  # override: only evaluate the first 20 batches
    for idx in range(batch_num):
        obs, obs_pos = test_loader[0][idx], test_loader[2][idx]
        obs, obs_pos = np.concatenate(obs), np.concatenate(
            obs_pos)  #real_batch_size x fixed_num_of_contact x contact_dim
        obs, obs_pos = obs[:real_batch_size], obs_pos[:real_batch_size]
        obs_neg = get_neg_samples(
            idx * batch_size, (idx + 1) * batch_size,
            test_loader[0],
            n=n,
            b=real_batch_size)  # b x n x fixed_num_of_contact x contact_dim
        obs, obs_pos, obs_neg = torch.from_numpy(obs), torch.from_numpy(
            obs_pos), torch.from_numpy(obs_neg)
        obs, obs_pos = obs.cuda(), obs_pos.cuda()  # b x 9 * contact_dim
        obs_neg = obs_neg.cuda()  # (b x n) x 9 * contact_dim
        if include_action:
            actions = test_loader[1][idx]
            actions = np.concatenate(actions)[:real_batch_size]
            actions = torch.from_numpy(actions)
            actions = actions.cuda()
            loss = compute_cpc_loss(obs,
                                    obs_pos,
                                    obs_neg,
                                    encoder,
                                    trans,
                                    actions=actions)
        else:
            loss = compute_cpc_loss(obs,
                                    obs_pos,
                                    obs_neg,
                                    encoder,
                                    trans,
                                    actions=None)
        test_loss += loss.item()
    avg_loss = test_loss / batch_num
    logger.logkv("cpc testing loss", avg_loss)
    logger.logkv("cpc testing time", time.time() - start)
Code Example #11
 def train(self, input_data, position, rot_matrix):
     feed_dict = {
         self.input: input_data,
         self.rotations: rot_matrix,  #batch*3*3
         self.positions: position
     }
     position_loss, _, rotation_loss, _ = self.sess.run(
         [self.position_cls_loss, self.pos_op, self.rotation_cls_loss, self.rot_op],
         feed_dict=feed_dict)
     logger.logkv('train_position_loss', position_loss)
     logger.logkv('train_rotation_loss', rotation_loss)
Code Example #12
 def optimize_policy(self, buffer, timestep, grad_steps, log=True):
     sess = tf.get_default_session()
     for i in range(grad_steps):
         value_dict = buffer.random_batch(self.batch_size)
         feed_dict = create_feed_dict(placeholder_dict=self.op_phs_dict,
                                      value_dict=value_dict)
         sess.run(self.training_ops, feed_dict)
         if log:
             diagnostics = sess.run({**self.diagnostics_ops}, feed_dict)
             for k, v in diagnostics.items():
                 logger.logkv(k, v)
         if timestep % self.target_update_interval == 0:
             self._update_target()
Code Example #13
 def restore_predict(self, inputs, position, rot_matrix):
     pos, rot, position_loss, rotation_loss = self.sess.run(
         [
             self.predicted_pos_ph, self.predicted_rot_ph,
             self.position_loss_ph, self.rotation_loss_ph
         ],
         feed_dict={
             self.input_ph: inputs,
             self.rotations_ph: rot_matrix,  #batch*3*3
             self.positions_ph: position
         })
     logger.logkv('restore_position_loss', position_loss)
     logger.logkv('restore_rotation_loss', rotation_loss)
Code Example #14
def main(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + str(kwargs['seed'])
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        folder = './data/policy/' + kwargs['env']
        paths = pickle.load(open(folder + '/paths.pickle', 'rb'))
        niters = paths.get_current_episode_size() // 100
        train_data, test_data = split_data(paths, niters)

        dimo = train_data[0]['o'].shape[-1]

        dims = [dimo]
        env = gym.make(kwargs['env'],
                       obs_type=kwargs['obs_type'],
                       fixed_num_of_contact=kwargs['fixed_num_of_contact'])

        feature_net = FeatureNet(
            dims,
            fixed_num_of_contact=kwargs['fixed_num_of_contact'],
            contact_dim=env.contact_dim,
            sess=sess,
            output=kwargs['prediction'],
            process_type=kwargs['process_type'],
            feature_dim=kwargs['feature_dim'],
            feature_layer=kwargs['feature_layer'])

        sess.run(tf.global_variables_initializer())
        for i in range(niters):
            start = timer.time()
            feature_net.train(train_data[i])
            feature_net.test(test_data[i])
            logger.logkv("iter", i)
            logger.logkv("iter_time", timer.time() - start)
            logger.dumpkvs()
            if i == 0:
                sess.graph.finalize()
Code Example #15
def test_decoder(decoder, epoch, test_data, batch_size, n, k=1):
    start = time.time()
    decoder_test_loss = 0
    test_loader = prep_data(test_data, batch_size, k, n, decode = True)
    batch_num = test_loader[0].shape[0]
    batch_num = 50  # override: only evaluate the first 50 batches
    for idx in range(batch_num):
        obs = test_loader[0][idx]
        object_info = test_loader[3][idx]
        obs = np.concatenate(obs)[:real_batch_size] # b x 9 * contact_dim
        object_info = np.concatenate(object_info)[:real_batch_size]

        loss = decoder.test(obs, object_info)
        decoder_test_loss += loss
    decoder_test_loss /= batch_num
    logger.logkv("decoder testing loss", decoder_test_loss)
    logger.logkv("decoder testing time", time.time() - start)
Code Example #16
def train_decoder(decoder, epoch, train_data, batch_size, n, k=1):
    """predict the next k steps. """
    start = time.time()
    train_decoder_losses = []
    train_loader = prep_data(train_data, batch_size, k, n, decode = True)
    batch_num = train_loader[0].shape[0]
    batch_num = 300  # override: only train on the first 300 batches per call
    for idx in range(batch_num):
        obs = train_loader[0][idx]
        object_info = train_loader[3][idx]
        obs = np.concatenate(obs)[:real_batch_size]
        object_info = np.concatenate(object_info)[:real_batch_size]

        decoder_loss = decoder.train(obs, object_info)
        train_decoder_losses.append(decoder_loss)
    avg_loss = np.mean(train_decoder_losses[-50:])
    logger.logkv("decoder training loss", avg_loss)
    logger.logkv("decoder training time", time.time() - start)
Code Example #17
 def test(self, data):
     feed_dict = {self.o_tf: data['o'].reshape((-1, self.dimo))}
     accuracy = self.sess.run([self.pred_loss], feed_dict=feed_dict)
     logger.logkv('test_pred_loss', accuracy[0])
Code Example #18
 def train(self, data):
     feed_dict = {self.o_tf: data['o'].reshape((-1, self.dimo))}
     loss, _ = self.sess.run([self.total_loss, self.op],
                             feed_dict=feed_dict)
     logger.logkv('train_classify_loss', loss)
Code Example #19
    def obtain_samples(self,
                       log=False,
                       log_prefix='',
                       random=False,
                       deterministic=False,
                       eval=False,
                       multiple_trajectory=1,
                       dynamics_model=None):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str) : prefix for logger
            random (boolean): whether the actions are random

        Returns:
            (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
        """

        # initial setup / preparation
        multiple_trajectories = []

        for _ in range(multiple_trajectory):
            paths = []
            n_samples = 0
            running_paths = _get_empty_running_paths_dict()

            if log: pbar = ProgBar(self.total_samples)
            policy_time, env_time = 0, 0

            policy = self.policy
            policy.reset(dones=[True])

            # initial reset of meta_envs
            obs = np.asarray(self.env.reset())

            ts = 0

            while n_samples < self.total_samples:

                # execute policy
                t = time.time()
                if eval:
                    H = self.mpc.horizon
                    mean_list = []
                    std_list = []
                    observation = obs
                    for _ in range(H + 1):  # loop index unused; '_' avoids shadowing the timer t above
                        action, agent_info = policy.get_action(observation)
                        action = agent_info['mean']
                        mean_list.append(action)
                        std_list.append(agent_info['log_std'])
                        if self.policy.squashed:
                            action = np.tanh(action)
                        if observation.ndim == 1:
                            observation = observation[None]
                        if action.ndim == 1:
                            action = action[None]
                        observation = dynamics_model.predict(
                            observation, action)
                        observation = observation.reshape((-1))
                    action, _ = self.mpc.get_actions(obs[None], mean_list,
                                                     std_list)
                    if action.ndim == 2:
                        action = action[0]
                else:
                    obs = obs.reshape((-1))
                    if random:
                        action = self.env.action_space.sample()
                        agent_info = {}
                    elif deterministic:
                        action, agent_info = policy.get_action(obs)
                        action = agent_info['mean']
                        if self.policy.squashed:
                            action = np.tanh(action)
                    else:
                        action, agent_info = policy.get_action(obs)
                        if action.ndim == 2:
                            action = action[0]
                    policy_time += time.time() - t

                # step environments
                t = time.time()
                next_obs, reward, done, env_info = self.env.step(action)

                ts += 1

                env_time += time.time() - t

                new_samples = 0

                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths["observations"].append(obs)
                running_paths["actions"].append(action)
                running_paths["rewards"].append(reward)
                running_paths["dones"].append(done)
                running_paths["env_infos"].append(env_info)
                running_paths["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done or ts >= self.max_path_length:
                    paths.append(
                        dict(
                            observations=np.asarray(
                                running_paths["observations"]),
                            actions=np.asarray(running_paths["actions"]),
                            rewards=np.asarray(running_paths["rewards"]),
                            dones=np.asarray(running_paths["dones"]),
                            env_infos=[],
                            agent_infos=[],
                            # env_infos=utils.stack_tensor_dict_list(running_paths["env_infos"]),
                            # agent_infos=utils.stack_tensor_dict_list(running_paths["agent_infos"]),
                        ))
                    new_samples += len(running_paths["rewards"])
                    running_paths = _get_empty_running_paths_dict()
                    next_obs = self.env.reset()
                    ts = 0

                if log: pbar.update(new_samples)
                n_samples += new_samples
                obs = next_obs
            multiple_trajectories.append(paths)
        if log: pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return multiple_trajectories
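
The sampler above relies on a small helper, _get_empty_running_paths_dict, that is not shown. A sketch inferred from how the running path is filled in the loop (the real helper may differ):

def _get_empty_running_paths_dict():
    # one growable list per field accumulated while a trajectory is in flight
    return dict(observations=[],
                actions=[],
                rewards=[],
                dones=[],
                env_infos=[],
                agent_infos=[])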
Code Example #20
    def _log_path_stats(self,
                        multiple_trajectories,
                        log=False,
                        log_prefix='',
                        return_avg_return=False,
                        trajectory_num=1):
        # compute log stats
        trajectory_num = len(multiple_trajectories)
        if trajectory_num == 1:
            paths = multiple_trajectories[0]
            average_discounted_return = np.mean(
                [path["returns"][0] for path in paths])
            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            if log == 'reward':
                logger.logkv(log_prefix + 'AverageReturn',
                             np.mean(undiscounted_returns))

            elif log == 'all' or log is True:
                logger.logkv(log_prefix + 'AverageDiscountedReturn',
                             average_discounted_return)
                logger.logkv(log_prefix + 'AverageReturn',
                             np.mean(undiscounted_returns))
                logger.logkv(log_prefix + 'NumTrajs', len(paths))
                logger.logkv(log_prefix + 'StdReturn',
                             np.std(undiscounted_returns))
                logger.logkv(log_prefix + 'MaxReturn',
                             np.max(undiscounted_returns))
                logger.logkv(log_prefix + 'MinReturn',
                             np.min(undiscounted_returns))

            return np.mean(undiscounted_returns)
        else:
            lst = [
                np.mean([path["returns"][0] for path in paths])
                for paths in multiple_trajectories
            ]
            average_discounted_return = sum(lst) / len(lst)
            lst = [[sum(path["rewards"]) for path in paths]
                   for paths in multiple_trajectories]
            maxreturn = [np.max(r) for r in lst]
            minreturn = [np.min(r) for r in lst]
            stdreturn = [np.std(r) for r in lst]
            meanreturn = [np.mean(r) for r in lst]

            if log == 'reward':
                logger.logkv(log_prefix + 'AverageReturn', np.mean(meanreturn))

            elif log == 'all' or log is True:
                logger.logkv(log_prefix + 'AverageDiscountedReturn',
                             average_discounted_return)
                logger.logkv(log_prefix + 'AverageReturn', np.mean(meanreturn))
                logger.logkv(
                    log_prefix + 'NumTrajs',
                    np.mean([len(paths) for paths in multiple_trajectories]))
                logger.logkv(log_prefix + 'StdReturn', np.mean(stdreturn))
                logger.logkv(log_prefix + 'MaxReturn', np.mean(maxreturn))
                logger.logkv(log_prefix + 'MinReturn', np.mean(minreturn))

            return np.mean(meanreturn)
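
The statistics above read path["returns"][0] as the discounted return of a trajectory. A hedged sketch of how a sample processor typically fills in that field (the helper name and the default gamma are assumptions, not taken from this code):

import numpy as np

def discount_cumsum(rewards, gamma=0.99):
    # returns[t] = sum over t' >= t of gamma**(t' - t) * rewards[t']
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# e.g. path["returns"] = discount_cumsum(path["rewards"]); path["returns"][0]
# is then the discounted return from the start of the trajectory.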
Code Example #21
def main(**kwargs):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get(
        'gpu_frac', 0.95)
    sess = tf.Session(config=config)
    exp_dir = os.getcwd() + '/data/feature_net/' + kwargs['input_label'][
        0] + kwargs['output_label'][0] + '/'
    mode = kwargs['mode'][0]

    if mode == 'restore':
        rotation_saver = tf.train.import_meta_graph(exp_dir + '-999.meta')
        rotation_saver.restore(sess, tf.train.latest_checkpoint(exp_dir))
        graph = tf.get_default_graph()

    with sess.as_default() as sess:

        input_label = kwargs['input_label'][0]
        output_label = kwargs['output_label'][0]
        buffer = {}
        name = '1'
        paths, fixed_num_of_contact = pickle.load(
            open(
                '../saved/trained/SoftHandManipulateEgg-v080-' + name +
                '-dict.pickle', 'rb'))
        for key in paths:
            buffer[key] = paths[key]

        for name in [str(i) for i in range(2, 17)]:
            paths, fixed_num_of_contact = pickle.load(
                open(
                    '../saved/trained/SoftHandManipulateEgg-v080-' + name +
                    '-dict.pickle', 'rb'))
            for key in paths:
                buffer[key] = np.concatenate([buffer[key], paths[key]], axis=0)

        env = gym.make(kwargs['env'][0],
                       obs_type=kwargs['obs_type'][0],
                       fixed_num_of_contact=fixed_num_of_contact)
        batch_size = 100
        paths = data_filter(buffer, fixed_num_of_contact, batch_size)
        niters = paths['positions'].shape[0] // batch_size
        print("total iteration: ", niters)
        print("total number of data: ", paths['positions'].shape[0])

        train_data, test_data, _, _ = split_data(paths, niters)
        train_data['object_position'] = train_data['object_position'][:, :, :3]
        test_data['object_position'] = test_data['object_position'][:, :, :3]

        labels_to_dims = {}
        labels_to_dims['positions'] = 3

        rotation_model = RotationModel(
            dims=[labels_to_dims[input_label]],
            sess=sess,
            fixed_num_of_contact=fixed_num_of_contact,
            feature_layers=kwargs['feature_layers'][0],
            output_layers=kwargs['output_layers'][0],
            learning_rate=kwargs['learning_rate'][0])

        if mode == 'train':
            sess.run(tf.global_variables_initializer())
            for i in range(niters):
                input, out = train_data[input_label][i], train_data[
                    output_label][i]
                pred = rotation_model.train(input, out)
                logger.logkv("iter", i)
                logger.dumpkvs()
            rotation_model.save_model(exp_dir, 999)

        if mode == 'restore':
            rotation_model.restore()
            for i in range(1):
                logger.logkv("iter", i)
                _, _ = rotation_model.restore_predict(
                    train_data[input_label][i], train_data[output_label][i])
                logger.dumpkvs()
Code Example #22
 def test(self, input_data, labels):
     feed_dict = {self.input: input_data,
                  self.labels: labels}
     accuracy = self.sess.run([self.geodesic_loss], feed_dict=feed_dict)[0]
     logger.logkv('test_pred_loss', accuracy)
Code Example #23
def main(**kwargs):
    exp_dir = os.getcwd() + '/cpc_model/' + kwargs['process_type'][0] + '/n200-8'
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    obs, acts, fixed_num_of_contact = pickle.load(
        open('../untrained/HandManipulateEgg-v0/5seeds-dict.pickle', 'rb'))

    include_action = kwargs['include_action'][0]

    env = gym.make(kwargs['env'][0],
                   obs_type=kwargs['obs_type'][0],
                   fixed_num_of_contact=[fixed_num_of_contact, True])

    ngeoms = env.sim.model.ngeom
    obs, object_info = expand_data(obs, ngeoms, fixed_num_of_contact)
    next_obs = obs[:, 1:]
    obs = obs[:, :-1]
    N, L, _, contact_point_dim = obs.shape
    N, L, action_dim = acts.shape

    obs_dim = (fixed_num_of_contact, contact_point_dim)

    z_dim = 8
    lr = 1e-3
    epochs = 100
    batch_size = 2
    n = 200
    k = 1

    encoder = Encoder(z_dim, obs_dim[1], fixed_num_of_contact).cuda()
    if include_action:
        trans = Transition(z_dim, action_dim).cuda()
    else:
        trans = Transition(z_dim, 0).cuda()
    decoder = Decoder(z_dim, 3).cuda()

    optim_cpc = optim.Adam(list(encoder.parameters()) +
                           list(trans.parameters()),
                           lr=lr)
    optim_dec = optim.Adam(decoder.parameters(), lr=lr)
    train_data, test_data = split_data([obs, acts, next_obs])

    for epoch in range(epochs):
        train_cpc(encoder, trans, optim_cpc, epoch, train_data, batch_size, n,
                  k, include_action)
        test_cpc(encoder, trans, epoch, test_data, batch_size, n, k,
                 include_action)

        logger.logkv("epoch", epoch)
        logger.dumpkvs()

    train_data, test_data = split_data([obs, acts, next_obs, object_info])
    for epoch in range(100):
        train_decoder(decoder,
                      encoder,
                      optim_dec,
                      epoch,
                      train_data,
                      batch_size,
                      include_action,
                      n,
                      k=1)
        test_decoder(decoder,
                     encoder,
                     epoch,
                     test_data,
                     batch_size,
                     include_action,
                     n,
                     k=1)
        logger.logkv("epoch", epoch)
        logger.dumpkvs()
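
Code Example #23 instantiates Encoder(z_dim, contact_point_dim, fixed_num_of_contact), Transition(z_dim, action_dim) and Decoder(z_dim, 3) without showing the modules. Below are minimal PyTorch skeletons consistent with those constructor calls and with the compute_cpc_loss sketch after Code Example #10; hidden sizes and layer choices are assumptions, and the TensorFlow classes of the same names in Code Example #24 are a separate implementation:

import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, z_dim, contact_point_dim, fixed_num_of_contact):
        super().__init__()
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(fixed_num_of_contact * contact_point_dim, 256),
            nn.ReLU(),
            nn.Linear(256, z_dim))

    def forward(self, obs):  # obs: b x fixed_num_of_contact x contact_point_dim
        return self.net(obs.float())

class Transition(nn.Module):
    def __init__(self, z_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z_dim + action_dim, 256),
            nn.ReLU(),
            nn.Linear(256, z_dim))

    def forward(self, z, actions=None):  # predict the next latent state
        if actions is not None:
            z = torch.cat([z, actions.float()], dim=-1)
        return self.net(z)

class Decoder(nn.Module):
    def __init__(self, z_dim, out_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z_dim, 256),
            nn.ReLU(),
            nn.Linear(256, out_dim))

    def forward(self, z):  # map a latent back to object info (e.g. a 3-D position)
        return self.net(z)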
Code Example #24
def main(**kwargs):
    z_dim = kwargs['z_dim']
    trans_mode = kwargs['trans_mode']
    epochs = kwargs['epochs']
    include_action = kwargs['include_action']
    label = kwargs['label']

    dataset = kwargs['data_path']
    feature_dims = kwargs['feature_dims']
    mode = kwargs['mode']
    n = kwargs['n']
    k = kwargs['k']
    encoder_lr = kwargs['encoder_lr']
    decoder_lr = kwargs['decoder_lr']
    decoder_feature_dims = kwargs['decoder_feature_dims']
    process_type = kwargs['process_type']

    if kwargs['data_path'] == '../dataset/sequence/HandManipulateEgg-v0/5seeds-dict.pickle':
        kwargs['dataset'] = 'trained_5seeds'
    elif kwargs['data_path'] == '../dataset/untrained/HandManipulateEgg-v0/5seeds-dict.pickle':
        kwargs['dataset'] = 'untrained_5seeds'
    elif kwargs['data_path'] == '../dataset/HandManipulateEgg-v09-dict.pickle':
        kwargs['dataset'] = 'trained_1seed'
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + str(kwargs['seed'])
    if kwargs['debug']:
        save_dir = '../saved_cpc/' + str(label) + '/' +  str(kwargs['normalize_data']) + '/' + str(process_type)+ '/trained/debug'
        # save_dir = '../saved_cpc/' + str(label) + '/' + str(process_type)+ '/trained/debug'
    else:
        save_dir = '../saved_cpc/' + str(label) + '/' +  str(kwargs['normalize_data']) + '/' + str(process_type)+ '/trained'
        # save_dir = '../saved_cpc/' + str(label) + '/' + str(process_type)+ '/trained'
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    obs, acts, fixed_num_of_contact = pickle.load(open(dataset, 'rb'))

    env = gym.make(kwargs['env'],
                   obs_type = kwargs['obs_type'],
                   fixed_num_of_contact = [fixed_num_of_contact, True])

    ngeoms = env.sim.model.ngeom
    obs, object_info = expand_data(obs, ngeoms, fixed_num_of_contact)
    if kwargs['normalize_data']:
        obs = normalize_obs(obs)
    next_obs = obs[:, 1:]
    obs = obs[:, :-1]
    N, L, _, contact_point_dim = obs.shape
    N, L, action_dim = acts.shape

    obs_dim = (fixed_num_of_contact, contact_point_dim)
    train_data, test_data = split_data([obs, acts, next_obs, object_info])

    batch_size = 2

    if mode in ['restore', 'store_weights']:
        saver = tf.train.import_meta_graph(save_dir + '-999.meta')
        pur_save_dir = save_dir[:-8]
        saver.restore(sess, tf.train.latest_checkpoint(pur_save_dir))
        graph = tf.get_default_graph()

    with sess.as_default() as sess:
        encoder = Encoder(z_dim,
                          fixed_num_of_contact,
                          contact_point_dim,
                          feature_dims)
        trans = Transition(z_dim, action_dim, mode = trans_mode)
        cpc = CPC(sess,
                  encoder,
                  trans,
                  encoder_lr,
                  fixed_num_of_contact,
                  contact_point_dim,
                  action_dim,
                  include_action = include_action,
                  type = 1*(label=='cpc1') + 2*(label=='cpc2'),
                  n_neg = n,
                  process_type = process_type,
                  mode = mode)

        cpc_epochs, decoder_epochs = epochs
        if mode == 'train':
            sess.run(tf.global_variables_initializer())
            logger.log("training started")
            for epoch in range(cpc_epochs):
                # train_cpc(cpc, epoch, train_data, batch_size, n, k)
                test_cpc(cpc, epoch, test_data, batch_size, n, k)

                logger.logkv("epoch", epoch)
                logger.dumpkvs()
            cpc.save_model(save_dir, 999)

            """decoder"""
            logger.log("Done with cpc training.")

            decoder = Decoder(cpc,
                              sess,
                              z_dim,
                              decoder_feature_dims,
                              fixed_num_of_contact,
                              contact_point_dim,
                              decoder_lr)
            uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))
            for epoch in range(decoder_epochs):
                train_decoder(decoder, epoch, train_data, batch_size, n, k)
                test_decoder(decoder, epoch, test_data, batch_size, n, k)

                logger.logkv("epoch", (epoch + cpc_epochs))
                logger.dumpkvs()
            print("model saved in", save_dir)

        elif mode == 'restore':
            decoder = Decoder(cpc,
                              sess,
                              z_dim,
                              decoder_feature_dims,
                              fixed_num_of_contact,
                              contact_point_dim,
                              decoder_lr)
            uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))
            print("initialized")
            for epoch in range(100):
                train_decoder(decoder, epoch, train_data, batch_size, n, k)
                test_decoder(decoder, epoch, test_data, batch_size, n, k)

                logger.logkv("epoch", epoch)
                logger.dumpkvs()
                print("logging to", exp_dir)

        elif mode == 'store_weights':
            old = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='')
            old = sess.run(old)
            save_dir = './saved_model/' +  str(label) + '/' + str(process_type)+ '/trained/'
            with open(save_dir + 'weights.pickle', 'wb') as pickle_file:
                pickle.dump(old, pickle_file)
            print("weights saved to", save_dir)

            save_dir = '/home/vioichigo/try/tactile-baselines/saved_model/cpc2/trained'
            with open(save_dir + '/params.pickle', 'wb') as pickle_file:
                pickle.dump([z_dim, fixed_num_of_contact, contact_point_dim, action_dim, encoder_lr, feature_dims, trans_mode, label, include_action], pickle_file)

        tf.reset_default_graph()
        print("graph reset successfully")
Code Example #25
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      this should be less than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from tactile_baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
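
A minimal call sketch for the learn function above, assuming the baselines-style helpers its docstring refers to (gym and DummyVecEnv); the environment id and hyperparameters are illustrative only, not taken from the source:

import gym
from baselines.common.vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])  # vectorize a single environment copy
model = learn(network='mlp',
              env=venv,
              total_timesteps=100000,
              nsteps=128,
              nminibatches=4,
              log_interval=1)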
Code Example #26
def train(*, policy, rollout_worker, evaluator,
          n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval,
          save_path, demo_file, exp_dir, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()


    logger.info("Training...")
    best_success_rate = -1

    # num_timesteps = n_epochs * n_cycles * rollout_length * number of rollout workers
    if policy.pre_train_model == 'supervised':
        # test_input, test_output = pickle.load(open(policy.feature_net_path + 'data.pickle', 'rb'))
        stored_weights = pickle.load(open(policy.feature_net_path + 'weights.pickle', 'rb'))
        restored_weights = [tf.constant(w) for w in stored_weights]
        """assign weights for main"""
        new_scope_main = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/main/pi/process/predicted_pos')
        update_weights_main = [tf.assign(new, old) for (new, old) in zip(new_scope_main, restored_weights)]
        policy.sess.run(update_weights_main)
        """assign weights for target"""
        new_scope_target = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/target/pi/process/predicted_pos')
        update_weights_target = [tf.assign(new, old) for (new, old) in zip(new_scope_target, restored_weights)]
        policy.sess.run(update_weights_target)
    elif policy.pre_train_model == 'cpc':
        path = '/home/vioichigo/try/tactile-baselines/saved_model/cpc2/max_pool/trained/'
        stored_weights = pickle.load(open(path + 'weights.pickle', 'rb'))
        restored_weights = [tf.constant(w) for w in stored_weights]
        """assign weights for main"""
        new_scope_main = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/main/pi/process/new_cpc')
        update_weights_main = [tf.assign(new, old) for (new, old) in zip(new_scope_main, restored_weights)]
        policy.sess.run(update_weights_main)
        """assign weights for target"""
        new_scope_target = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/target/pi/process/new_cpc')
        update_weights_target = [tf.assign(new, old) for (new, old) in zip(new_scope_target, restored_weights)]
        policy.sess.run(update_weights_target)




    for epoch in range(n_epochs): #200

        if policy.pre_train_model != 'none':
            auxiliary_loss = []
        start_time = time.time()
        # train
        rollout_worker.clear_history()
        for n_cycle in range(n_cycles): #50
            episode = rollout_worker.generate_rollouts()
            obs = policy.store_episode(episode)
            start_here = time.time()

            for i in range(n_batches): #40
                if policy.pre_train_model == 'none':
                    policy.train()
                else:
                    _, _, loss = policy.train()
                    auxiliary_loss.append(loss)
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        logger.logkv('itr time', time.time() - start_time)

        if policy.pre_train_model == 'supervised':
            logger.logkv('auxiliary loss', np.array(auxiliary_loss).mean())

        if rank == 0:
            log_dict = dict([])
            for k in logger.Logger.CURRENT.name2val:
                value = logger.Logger.CURRENT.name2val[k]
                log_dict[k] = np.mean([value])
            wandb.log(log_dict)

        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        # can't pickle SwigPyObject objects
        # if rank == 0 and success_rate >= best_success_rate and save_path and epoch % 10 == 0:
        #     best_success_rate = success_rate
            # batch = policy.sample_batch()
            # policy.sess.run(policy.stage_op, feed_dict=dict(zip(policy.buffer_ph_tf, batch)))
            # critic_loss, actor_loss, Q_grad, pi_grad = policy._grads()
            # print("calculated")
            # if policy.pre_train_model == 'supervised':
            #     policy.sess.run(policy.stage_op, feed_dict=dict(zip(policy.buffer_ph_tf, batch)))
            #     feature_loss, feature_grad = policy.sess.run([policy.feature_loss_tf, policy.feature_grad_tf])
            #     with open(save_path + '/stats.pickle', 'wb') as pickle_file:
            #         pickle.dump([batch, critic_loss, actor_loss, Q_grad, pi_grad, feature_loss, feature_grad], pickle_file)
            # else:
            #     with open(save_path + '/stats.pickle', 'wb') as pickle_file:
            #         pickle.dump([batch, critic_loss, actor_loss, Q_grad, pi_grad], pickle_file)

            # policy.o_stats.save(save_path + '/o-stats' + str(epoch) + '.pickle')
            # if policy.pre_train_model == 'cpc':
            #     policy.feature_stats.save(save_path + '/feature-stats' + str(epoch) + '.pickle')
            # print("model saved")

            # actually includes the two steps above
            # tf_util.save_variables(save_path + '/saved' + str(epoch) + '.pkl', sess=policy.sess)
        if save_path and success_rate >= best_success_rate and epoch % 10 == 0:
            best_success_rate = success_rate
            tf_util.save_variables(save_path + '/saved' + str(epoch) + '-seed' + str(rank) + '.pkl', sess=policy.sess)
            # print("vars saved")




        policy.sess.run(policy.increment_global_step)
        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)


        if rank != 0:
            assert local_uniform[0] != root_uniform[0]



    return policy
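
The training loop above averages logged values across workers with mpi_average, which is not shown here. A hedged sketch of one straightforward implementation (the actual helper may differ, e.g. in how it handles empty lists):

import numpy as np
from mpi4py import MPI

def mpi_average(value):
    # mean of a scalar or a list of scalars, averaged over all MPI ranks
    comm = MPI.COMM_WORLD
    local = float(np.mean(value)) if np.size(value) > 0 else 0.0
    return comm.allreduce(local, op=MPI.SUM) / comm.Get_size()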
Code Example #27
def train_cpc(encoder,
              trans,
              optimizer,
              epoch,
              train_data,
              batch_size,
              n,
              k=1,
              include_action=True):
    """predict the next k steps. """
    global real_batch_size
    start = time.time()
    encoder.train()
    trans.train()

    train_losses = []
    train_loader = prep_data(train_data, batch_size, k, n)
    batch_num = train_loader[0].shape[0]
    batch_num = 100  # override: only train on the first 100 batches per call
    for idx in range(batch_num):
        obs, obs_pos = train_loader[0][idx], train_loader[2][idx]
        """add batch here, so that each sample in the batch get different neg samples. """
        obs, obs_pos = np.concatenate(obs), np.concatenate(
            obs_pos)  #real_batch_size x fixed_num_of_contact x contact_dim
        obs, obs_pos = obs[:real_batch_size], obs_pos[:real_batch_size]
        obs_neg = get_neg_samples(
            idx * batch_size, (idx + 1) * batch_size,
            train_loader[0],
            n=n,
            b=real_batch_size)  # b x n x fixed_num_of_contact x contact_dim

        obs, obs_pos, obs_neg = torch.from_numpy(obs), torch.from_numpy(
            obs_pos), torch.from_numpy(obs_neg)
        obs, obs_pos = obs.cuda(), obs_pos.cuda()  # b x 9 * contact_dim
        obs_neg = obs_neg.cuda()  # (b x n) x 9 * contact_dim
        if include_action:
            actions = train_loader[1][idx]
            actions = np.concatenate(actions)
            actions = actions[:real_batch_size]
            actions = torch.from_numpy(actions)
            actions = actions.cuda()
            loss = compute_cpc_loss(obs,
                                    obs_pos,
                                    obs_neg,
                                    encoder,
                                    trans,
                                    actions=actions)
        else:
            loss = compute_cpc_loss(obs,
                                    obs_pos,
                                    obs_neg,
                                    encoder,
                                    trans,
                                    actions=None)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
    avg_loss = np.mean(train_losses[-50:])
    logger.logkv("cpc training loss", avg_loss)
    logger.logkv("cpc training time", time.time() - start)
Code Example #28
 def train(self, input_data, labels):
     feed_dict = {self.input: input_data,
                  self.labels: labels}
     loss, _ = self.sess.run([self.geodesic_loss, self.op], feed_dict=feed_dict)
     logger.logkv('train_classify_loss', loss)
Code Example #29
    def train(self):
        """
        Trains policy on env using algo
        Pseudocode:
            for itr in n_itr:
                for step in num_inner_grad_steps:
                    sampler.sample()
                    algo.compute_updated_dists()
                algo.optimize_policy()
                sampler.update_goals()
        """

        with self.sess.as_default() as sess:
            # initialize uninitialized vars  (only initialize vars that were not loaded)
            sess.run(tf.global_variables_initializer())
            start_time = time.time()

            if self.start_itr == 0:
                self.algo._update_target(tau=1.0)
                if self.n_initial_exploration_steps > 0:
                    while self.replay_buffer._size < self.n_initial_exploration_steps:
                        paths = self.sampler.obtain_samples(
                            log=True, log_prefix='train-', random=True)
                        samples_data = self.sample_processor.process_samples(
                            paths, log='all', log_prefix='train-')[0]
                        self.replay_buffer.add_samples(
                            samples_data['observations'],
                            samples_data['actions'],
                            samples_data['rewards'],
                            samples_data['dones'],
                            samples_data['next_observations'],
                        )

            for itr in range(self.start_itr, self.n_itr):
                itr_start_time = time.time()
                logger.log(
                    "\n ---------------- Iteration %d ----------------" % itr)
                logger.log(
                    "Sampling set of tasks/goals for this meta-batch...")
                """ -------------------- Sampling --------------------------"""

                logger.log("Obtaining samples...")
                time_env_sampling_start = time.time()
                paths = self.sampler.obtain_samples(log=True,
                                                    log_prefix='train-')
                sampling_time = time.time() - time_env_sampling_start
                """ ----------------- Processing Samples ---------------------"""
                # check how the samples are processed
                logger.log("Processing samples...")
                time_proc_samples_start = time.time()
                samples_data = self.sample_processor.process_samples(
                    paths, log='all', log_prefix='train-')[0]
                self.replay_buffer.add_samples(
                    samples_data['observations'],
                    samples_data['actions'],
                    samples_data['rewards'],
                    samples_data['dones'],
                    samples_data['next_observations'],
                )
                proc_samples_time = time.time() - time_proc_samples_start

                paths = self.sampler.obtain_samples(log=True,
                                                    log_prefix='eval-',
                                                    deterministic=True)
                _ = self.sample_processor.process_samples(
                    paths, log='all', log_prefix='eval-')[0]

                # self.log_diagnostics(paths, prefix='train-')
                """ ------------------ Policy Update ---------------------"""

                logger.log("Optimizing policy...")

                # This needs to take all samples_data so that it can construct the graph for meta-optimization.
                time_optimization_step_start = time.time()

                self.algo.optimize_policy(self.replay_buffer,
                                          itr * self.epoch_length,
                                          self.num_grad_steps)
                """ ------------------- Logging Stuff --------------------------"""
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps',
                             self.sampler.total_timesteps_sampled)

                logger.logkv('Time-Optimization',
                             time.time() - time_optimization_step_start)
                logger.logkv('Time-SampleProc', proc_samples_time)
                logger.logkv('Time-Sampling', sampling_time)

                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.dumpkvs()
                if itr == 0:
                    sess.graph.finalize()

        logger.log("Training finished")
        self.sess.close()
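The trainer touches the replay buffer only through its _size attribute and add_samples(observations, actions, rewards, dones, next_observations); the buffer class itself is not part of this excerpt. A minimal FIFO sketch, assuming flat observation/action vectors and a hypothetical class name:

import numpy as np

class RingReplayBufferSketch:
    """Hypothetical fixed-size FIFO buffer matching the add_samples() calls above."""

    def __init__(self, max_size, obs_dim, act_dim):
        self._max_size = max_size
        self._obs = np.zeros((max_size, obs_dim), dtype=np.float32)
        self._acts = np.zeros((max_size, act_dim), dtype=np.float32)
        self._rews = np.zeros(max_size, dtype=np.float32)
        self._dones = np.zeros(max_size, dtype=np.bool_)
        self._next_obs = np.zeros((max_size, obs_dim), dtype=np.float32)
        self._top = 0      # next write position
        self._size = 0     # number of stored transitions (checked by the trainer)

    def add_samples(self, observations, actions, rewards, dones, next_observations):
        for o, a, r, d, no in zip(observations, actions, rewards, dones, next_observations):
            self._obs[self._top] = o
            self._acts[self._top] = a
            self._rews[self._top] = r
            self._dones[self._top] = d
            self._next_obs[self._top] = no
            self._top = (self._top + 1) % self._max_size
            self._size = min(self._size + 1, self._max_size)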
Code example #30
    def obtain_samples(self,
                       log=False,
                       log_prefix='',
                       random=False,
                       deterministic=False,
                       verbose=False):
        """
        Collect batch_size trajectories from each task

        Args:
            log (boolean): whether to log sampling times
            log_prefix (str): prefix for logger
            random (boolean): whether to sample random actions
            deterministic (boolean): whether to use the mean action instead of sampling
            verbose (boolean): whether to display a progress bar

        Returns:
            (list): a list of path dicts, each containing the observations, actions,
                rewards, dones, env_infos and agent_infos of one completed trajectory
        """

        # initial setup / preparation
        paths = []

        n_samples = 0
        running_paths = [
            _get_empty_running_paths_dict()
            for _ in range(self.vec_env.num_envs)
        ]

        if verbose: pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True] * self.vec_env.num_envs)

        # initial reset of meta_envs
        obses = np.asarray(self.vec_env.reset())

        while n_samples < self.total_samples:

            # execute policy
            t = time.time()
            if self.vae is not None:
                obses = np.array(obses)
                obses = self.vae.encode(obses)
            if random:
                actions = np.stack([
                    self.env.action_space.sample()
                    for _ in range(self.vec_env.num_envs)
                ],
                                   axis=0)
                agent_infos = {}
            elif deterministic:
                actions, agent_infos = policy.get_actions(obses)
                actions = [a_i['mean'] for a_i in agent_infos]
                if self.policy.squashed:
                    actions = np.tanh(actions)
            else:
                actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            # stack agent_infos; if no infos were provided (--> None), create empty dicts
            agent_infos, env_infos = self._handle_info_dicts(
                agent_infos, env_infos)

            new_samples = 0
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                # append new samples to running paths
                if isinstance(reward, np.ndarray):
                    reward = reward[0]
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["dones"].append(done)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                # if running path is done, add it to paths and empty the running path
                if done:
                    paths.append(
                        dict(
                            observations=np.asarray(
                                running_paths[idx]["observations"]),
                            actions=np.asarray(running_paths[idx]["actions"]),
                            rewards=np.asarray(running_paths[idx]["rewards"]),
                            dones=np.asarray(running_paths[idx]["dones"]),
                            env_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    new_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = _get_empty_running_paths_dict()

            if verbose: pbar.update(self.vec_env.num_envs)
            n_samples += new_samples
            obses = next_obses
        if verbose: pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

        return paths
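obtain_samples relies on a module-level helper _get_empty_running_paths_dict that is not shown in this excerpt; given how running_paths[idx] is appended to and later stacked, it presumably returns a dict of empty lists, roughly:

def _get_empty_running_paths_dict():
    # Presumed helper: one empty list per field that the sampling loop appends to.
    return dict(observations=[], actions=[], rewards=[], dones=[],
                env_infos=[], agent_infos=[])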