Example #1
def slave():
  global env, test_env
  env = make_env(args=config_args, dream_env=config_args.dream_env)
  # doom env doesn't support mpi testing so don't bother loading
  if 'DoomTakeCover-v0' != config_args.env_name:
    test_env = make_env(args=config_args, dream_env=False, render_mode=False)

  packet = np.empty(SOLUTION_PACKET_SIZE, dtype=np.int32)
  while 1:
    comm.Recv(packet, source=0)
    assert(len(packet) == SOLUTION_PACKET_SIZE)
    solutions = decode_solution_packet(packet)
    results = []
    for solution in solutions:
      worker_id, jobidx, seed, train_mode, max_len, weights = solution
      assert (train_mode == 1 or train_mode == 0 or train_mode == -1), str(train_mode)
      worker_id = int(worker_id)
      possible_error = "work_id = " + str(worker_id) + " rank = " + str(rank)
      assert worker_id == rank, possible_error
      jobidx = int(jobidx)
      seed = int(seed)
      fitness, timesteps = worker(weights, seed, train_mode, max_len)
      results.append([worker_id, jobidx, fitness, timesteps])
    result_packet = encode_result_packet(results)
    assert len(result_packet) == RESULT_PACKET_SIZE
    comm.Send(result_packet, dest=0)
Example #2
def slave():
    global env
    if env_name == 'CarRacing-v0':
        env = make_env(args=config_args, dream_env=False,
                       with_obs=True)  # training in dreams not supported yet
    else:
        env = make_env(args=config_args, dream_env=True, render_mode=False)

    packet = np.empty(SOLUTION_PACKET_SIZE, dtype=np.int32)
    while 1:
        comm.Recv(packet, source=0)
        assert (len(packet) == SOLUTION_PACKET_SIZE)
        solutions = decode_solution_packet(packet)
        results = []
        for solution in solutions:
            worker_id, jobidx, seed, train_mode, max_len, weights = solution
            assert (train_mode == 1 or train_mode == 0), str(train_mode)
            worker_id = int(worker_id)
            possible_error = "work_id = " + str(worker_id) + " rank = " + str(
                rank)
            assert worker_id == rank, possible_error
            jobidx = int(jobidx)
            seed = int(seed)
            fitness, timesteps = worker(weights, seed, train_mode, max_len)
            results.append([worker_id, jobidx, fitness, timesteps])
        result_packet = encode_result_packet(results)
        assert len(result_packet) == RESULT_PACKET_SIZE
        comm.Send(result_packet, dest=0)
Example #3
def main():
    args = get_argparser().parse_args()
    init_logging('logs')

    env = make_env(args.env, args.seed, num_envs=args.num_envs, num_processes=args.num_processes)
    agent = ActorCritic(env.observation_space, env.action_space, args)

    train(agent, env, args, max_reward=args.max_reward)

    test_env = make_env(args.env, args.seed, num_envs=1, num_processes=1)
    make_fun(agent, test_env, render=True)
Example #4
def initialize_settings(sigma_init=0.1, sigma_decay=0.9999):
    global population, filebase, game, controller, env, num_params, es, PRECISION, SOLUTION_PACKET_SIZE, RESULT_PACKET_SIZE
    population = num_worker * num_worker_trial
    filebase = 'log/' + gamename + '.' + optimizer + '.' + str(
        num_episode) + '.' + str(population)
    controller = make_model()
    env = make_env()
    num_params = controller.param_count
    print("size of model", num_params)

    if optimizer == 'ses':
        ses = PEPG(num_params,
                   sigma_init=sigma_init,
                   sigma_decay=sigma_decay,
                   sigma_alpha=0.2,
                   sigma_limit=0.02,
                   elite_ratio=0.1,
                   weight_decay=0.005,
                   popsize=population)
        es = ses
    elif optimizer == 'ga':
        ga = SimpleGA(num_params,
                      sigma_init=sigma_init,
                      sigma_decay=sigma_decay,
                      sigma_limit=0.02,
                      elite_ratio=0.1,
                      weight_decay=0.005,
                      popsize=population)
        es = ga
    elif optimizer == 'cma':
        cma = CMAES(num_params, sigma_init=sigma_init, popsize=population)
        es = cma
    elif optimizer == 'pepg':
        pepg = PEPG(num_params,
                    sigma_init=sigma_init,
                    sigma_decay=sigma_decay,
                    sigma_alpha=0.20,
                    sigma_limit=0.02,
                    learning_rate=0.01,
                    learning_rate_decay=1.0,
                    learning_rate_limit=0.01,
                    weight_decay=0.005,
                    popsize=population)
        es = pepg
    else:
        oes = OpenES(num_params,
                     sigma_init=sigma_init,
                     sigma_decay=sigma_decay,
                     sigma_limit=0.02,
                     learning_rate=0.01,
                     learning_rate_decay=1.0,
                     learning_rate_limit=0.01,
                     antithetic=antithetic,
                     weight_decay=0.005,
                     popsize=population)
        es = oes

    PRECISION = 10000
    SOLUTION_PACKET_SIZE = (5 + num_params) * num_worker_trial
    RESULT_PACKET_SIZE = 4 * num_worker_trial
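The slave() examples above (Example #1 and #2) rely on decode_solution_packet and encode_result_packet, which are not shown. A minimal sketch consistent with the int32 packet layout and PRECISION defined here, assuming fixed-point encoding (not necessarily the repository's exact helpers):

import numpy as np

def decode_solution_packet(packet):
    # One chunk of (5 + num_params) ints per trial:
    # worker_id, jobidx, seed, train_mode, max_len, then fixed-point weights.
    packets = np.split(packet, num_worker_trial)
    result = []
    for p in packets:
        result.append([p[0], p[1], p[2], p[3], p[4],
                       p[5:].astype(np.float64) / PRECISION])
    return result

def encode_result_packet(results):
    # results: list of [worker_id, jobidx, fitness, timesteps];
    # float fields are stored as fixed-point int32 scaled by PRECISION.
    r = np.array(results, dtype=np.float64)
    r[:, 2:4] *= PRECISION
    return r.astype(np.int32).flatten()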
Example #5
 def make_env(self, seed=-1, render_mode=False, load_model=True, lock=None):
     self.render_mode = render_mode
     self.env = make_env(self.env_name,
                         seed=seed,
                         render_mode=render_mode,
                         load_model=load_model,
                         lock=lock)
Example #6
 def make_env(self, seed=-1, render_mode=False):
     self.render_mode = render_mode
     self.env = make_env(self.env_name,
                         self.encoder,
                         self.max_features,
                         seed=seed,
                         render_mode=render_mode)
Example #7
    def __init__(self, **kwarg):
        """
        Args:
            kwarg: configurations for the environment.
        """
        config = get_default_config()

        name = kwarg['name']
        for key, value in kwarg.items():
            if hasattr(config, key):
                setattr(config, key, value)

        # create an environment
        self.env = make_env(name, config)

        # convert observation space
        obs_space = self.env.observation_space
        obs_size = sum([np.prod(v) for v in obs_space.values()])
        low = -1 * np.ones(obs_size)
        high = np.ones(obs_size)
        self.observation_space = gym.spaces.Box(low=low, high=high)

        # convert action space
        dof = self.env.dof
        low = -1 * np.ones(dof)
        high = np.ones(dof)
        self.action_space = gym.spaces.Box(low=low, high=high)
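The wrapper above only converts the spaces; a matching observation flattener (a hypothetical helper, assuming the dict observation's values are array-like) would concatenate every entry into the flat Box layout:

import numpy as np

def flatten_observation(obs_dict):
    # Concatenate each dict entry into one 1-D vector, matching the Box
    # observation_space built from sum(np.prod(v)) in __init__.
    return np.concatenate([np.asarray(v).ravel() for v in obs_dict.values()])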
Example #8
def run(args):
    # env = make_env(args.env_id)
    env = gym.make(args.env_id)
    env_test = make_env(args.env_id)
    buffer_exp = SerializedBuffer(
        path=args.buffer, device=torch.device("cuda" if args.cuda else "cpu"))

    algo = AIRL(buffer_exp=buffer_exp,
                state_shape=env.observation_space.shape,
                action_shape=env.action_space.shape,
                device=torch.device("cuda" if args.cuda else "cpu"),
                seed=args.seed,
                rollout_length=args.rollout_length)

    time = datetime.now().strftime("%Y%m%d-%H%M")
    log_dir = os.path.join('logs', args.env_id, args.algo,
                           f'seed{args.seed}-{time}')

    trainer = Trainer(env=env,
                      env_test=env_test,
                      algo=algo,
                      log_dir=log_dir,
                      num_steps=args.num_steps,
                      eval_interval=args.eval_interval,
                      seed=args.seed)
    trainer.train()
Example #9
 def make_env(self, env_name, seed=-1, render_mode=False, model=None):
     self.render_mode = render_mode
     self.env_name = env_name
     self.env = make_env(env_name,
                         seed=seed,
                         render_mode=render_mode,
                         model=model)
Example #10
    def __init__(self, env_id, lr, nstep, batch_size, n_epochs, gamma, gae_lam,
                 clip_range, ent_coef, vf_coef, max_grad_norm):

        self.env_id = env_id

        self.env = make_env(env_id, n_envs=4)

        self.num_envs = self.env.num_envs if isinstance(self.env,
                                                        VecEnv) else 1
        self.state_dim = self.env.observation_space.shape[0]
        self.action_converter = ActionConverter(self.env.action_space)

        self.lr = lr
        self.nstep = nstep
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.gamma = gamma
        self.gae_lam = gae_lam
        self.clip_range = clip_range
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm

        self.ep_info_buffer = deque(maxlen=50)
        self._n_updates = 0
        self.num_timesteps = 0
        self.num_episodes = 0

        self.obs_rms = RunningMeanStd()
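RunningMeanStd is not defined in this snippet; a common implementation (in the spirit of OpenAI Baselines, shown here only as a sketch) tracks a running mean and variance of observations with batched moment updates:

import numpy as np

class RunningMeanStd:
    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, x):
        # Merge the moments of a batch of samples into the running estimates.
        batch_mean, batch_var = np.mean(x, axis=0), np.var(x, axis=0)
        batch_count = x.shape[0]
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count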
Example #11
 def make_env(self,
              seed=-1,
              render_mode=False,
              full_episode=False,
              worker_id=0):
     self.render_mode = render_mode
     self.env = make_env(self.env_name,
                         seed=seed,
                         render_mode=render_mode,
                         full_episode=full_episode,
                         worker_id=worker_id)
Example #12
def main(args):

    env_name = args.env_name
    total_episodes = args.total_episodes
    start_batch = args.start_batch
    time_steps = args.time_steps

    obs_data = []
    action_data = []

    env = make_env(env_name)
    s = 0
    batch = start_batch

    while s < total_episodes:

        for i_episode in range(200):
            print('-----')
            observation = env.reset()
            env.render()
            done = False
            action = env.action_space.sample()
            t = 0
            obs_sequence = []
            action_sequence = []
            while t < time_steps:
                t = t + 1
                
                action = config.generate_data_action(t, action)
                observation = config.adjust_obs(observation)
                      
                obs_sequence.append(observation)
                action_sequence.append(action)
                
                observation, reward, done, info = env.step(action)
            
            obs_data.append(obs_sequence)
            action_data.append(action_sequence)
            
            print("Batch {} Episode {} finished after {} timesteps".format(batch, i_episode, t+1))
            print("Current dataset contains {} observations".format(sum(map(len, obs_data))))

            s = s + 1

        print("Saving dataset for batch {}".format(batch))
        np.save('./data/obs_data_' + str(batch), obs_data)
        np.save('./data/action_data_' + str(batch), action_data)

        batch = batch + 1

        obs_data = []
        action_data = []
Example #13
def test_render():
    env = make_env(10000, visualize=True)
    obs = env.reset()
    #while True:
    #    env.render()
    zero_action = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
    #start_time = time.time()
    new_obs, rew, done, _ = env.step(zero_action)
    action = np.array([[0.0, 0.0, 4.0], [0.0, 0.0, 0.0]])
    while True:
        new_obs, rew, done, _ = env.step(action)
        env.render()
        if done:
            break
Example #14
def init_gym(env_name):
    """
    Initialize gym environment, return dimension of observation
    and action spaces.

    Args:
        env_name: str environment name (e.g. "Humanoid-v1")

    Returns: 3-tuple
        gym environment (object)
        number of observation dimensions (int)
        number of action dimensions (int)
    """
    #env = gym.make(env_name)
    env = make_env(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    return env, obs_dim, act_dim
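A short usage sketch of init_gym ("Humanoid-v1" is just the id from the docstring):

env, obs_dim, act_dim = init_gym("Humanoid-v1")
print("obs_dim:", obs_dim, "act_dim:", act_dim)
obs = env.reset()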
Example #15
 def __init__(self, type="CarRacing", history_pick=4, seed=None, detect_edges=False, detect_grass=False, flip=False):
     self.name = type + str(time.time())
     random.seed(30)
     self.env = make_env('CarRacing-v0', random.randint(1,10000000), render_mode = False, full_episode = True)
     self.image_dimension = [64,64]
     self.history_pick = history_pick
     self.state_space_size = history_pick * np.prod(self.image_dimension)
     self.action_space_size = 5
     self.state_shape = [None, self.history_pick] + list(self.image_dimension)
     self.history = []
     self.action_dict = {0: [-1, 0, 0], 1: [1, 0, 0], 2: [0, 1, 0], 3: [0, 0, 0.8], 4: [0, 0, 0]}
     self.seed = seed
     self.detect_edges = detect_edges
     self.detect_grass = detect_grass
     self.flip = flip
     self.flip_episode = False
     self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True)
     self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True)
     self.vae.load_json('vae/vae.json')
     self.rnn.load_json('rnn/rnn.json')
Example #16
def test():
    env = make_env()
    obs = env.reset()
    # Test zero action
    zero_action = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
    print(f"Before zero action = {obs}")
    new_obs, rew, done, _ = env.step(zero_action)
    print(f"After zero action = {new_obs}, is equal = {obs == new_obs}")
    obs = new_obs
    # Test dynamics
    dt = 0.002
    action = np.array([[1.0, 1.0, 1.0], [0.0, 0.0, 0.0]])
    new_pos = deepcopy(obs)
    new_pos[0] = obs[0] + (1000 * action[0, 0] * np.cos(obs[2]) +
                           1000 * action[0, 1] * np.sin(obs[2])) * dt
    new_pos[1] = obs[1] + (1000 * action[0, 1] * np.cos(obs[2]) -
                           1000 * action[0, 0] * np.sin(obs[2])) * dt
    new_pos[2] = obs[2] + action[0, 2] * dt
    new_obs, rew, done, _ = env.step(action)
    print(f"Is done = {done}, Is dynamics correct = {new_pos == new_obs}")
Example #17
def main(_):
    display = Display(visible=0, size=(1400, 900))
    #display.start()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session() as sess:

        global_step = tf.Variable(0, name='global_step', trainable=False)
        env = make_env(ENV_NAME, 876, render_mode=False, full_episode=True)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        state_dim = [32]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        print('state_dim: ', state_dim)
        print('action_dim: ', action_dim)
        print('action_bound: ', action_bound)
        # Ensure action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim, TAU,
                               actor.get_num_trainable_vars())

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env.monitor.start(MONITOR_DIR,
                                  video_callable=False,
                                  force=True)
            else:
                env.monitor.start(MONITOR_DIR, force=True)

        train(sess, env, actor, critic, global_step)

        if GYM_MONITOR_EN:
            env.monitor.close()
Example #18
def slave():
    env = make_env()
    packet = np.empty(SOLUTION_PACKET_SIZE, dtype=np.int32)
    while 1:
        comm.Recv(packet, source=0)
        assert (len(packet) == SOLUTION_PACKET_SIZE)
        solutions = decode_solution_packet(packet)
        results = []
        for solution in solutions:
            worker_id, jobidx, seed, train_mode, max_len, weights = solution
            assert (train_mode == 1 or train_mode == 0), str(train_mode)
            worker_id = int(worker_id)
            possible_error = "work_id = " + str(worker_id) + " rank = " + str(
                rank)
            assert worker_id == rank, possible_error
            jobidx = int(jobidx)
            seed = int(seed)
            fitness, timesteps = worker(weights, seed, train_mode, max_len)
            results.append([worker_id, jobidx, fitness, timesteps])
        result_packet = encode_result_packet(results)
        assert len(result_packet) == RESULT_PACKET_SIZE
        comm.Send(result_packet, dest=0)
Example #19
def main(args):
  exp_path = mkdir_exp(f'{args.env_id}_PPO')
  export_args(args, os.path.join(exp_path, 'config.json'))

  np.random.seed(args.seed)
  pt.random.manual_seed(args.seed)

  print("== Creating a training environment...")
  env = make_env(args.env_id, NormalizeObservation, num_envs=args.num_envs)

  print("== Creating a evaluation environment...")
  eval_env = make_env(args.env_id, NormalizeObservation, num_envs=1)
  obs_dim = eval_env.observation_space.shape[0]
  act_dim = eval_env.action_space.shape[0]
  
  print("== Creating an agent....")
  device = pt.device('cuda' if pt.cuda.is_available() else 'cpu')
  agent = ContinuousPolicyAgent(obs_dim, act_dim, args.hid_dim).to(device)
  
  print("== Creating a data storage...")
  data = TensorBook(args.env_id, args.rollout_steps)
  
  print("== Creating a PPO optimizer...")
  optimizer = ProximalPolicyOptimization(
    agent, 
    device,
    num_epochs=args.num_epochs,
    batch_size=args.batch_size,
    lr_max=args.lr_max,
    lr_min=args.lr_min,
    eps=args.eps,
    gamma=args.gamma,
    lam=args.lam,
    alpha=args.alpha,
    value_coef=args.value_coef,
    entropy_coef=args.entropy_coef,
    max_grad_norm=args.max_grad_norm,
    target_kldiv=args.target_kldiv
  )

  print("== Creating a TensorBoard summary writer...")
  writer = SummaryWriter(log_dir=exp_path)
  
  print("IT'S DANGEROUS TO GO ALONE! TAKE THIS.")
  obs = env.reset().to(device)
  best_perf = -np.inf

  num_updates = args.num_steps // args.rollout_steps // args.num_envs
  for i in tqdm(range(num_updates)):
    obs = agent.rollout(obs, env, data)
    info = optimizer.update(data)
    lr = optimizer.update_lr(i, num_updates)
    
    # Compute mean total reward during the rollout.
    reward = data.reward.sum(dim=0).mean(dim=0).item()

    # Evaluate the agent.
    perf = play(eval_env, agent, device, repeat=args.num_eval)
    if perf > best_perf:
      model_path = os.path.join(exp_path, f'{agent.__class__.__name__}.pt')
      pt.save(agent.state_dict(), model_path)
      best_perf = perf

    # Log training progress.
    step = i * args.rollout_steps * args.num_envs
    
    writer.add_scalar('Train/lr', lr, step)
    writer.add_scalar('Train/epochs', info['num_epochs'], step)
    writer.add_scalar('Train/loss/policy', info['policy_loss'], step)
    writer.add_scalar('Train/loss/value', info['value_loss'], step)
    writer.add_scalar('Train/loss/entropy', info['entropy'], step)
    writer.add_scalar('Train/loss/total', info['total_loss'], step)
    writer.add_scalar('Train/reward/mean', reward, step)

    writer.add_scalar('Eval/reward/mean', perf, step)
    writer.add_scalar('Eval/reward/best', best_perf, step)

  env.close()
  eval_env.close()
  writer.close()
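optimizer.update_lr(i, num_updates) is not shown; a schedule consistent with the lr_max and lr_min arguments would anneal the learning rate linearly over the training run (hypothetical sketch):

def linear_lr(i, num_updates, lr_max, lr_min):
  # Fraction of training completed; guard against num_updates == 1.
  frac = i / max(1, num_updates - 1)
  return lr_max + frac * (lr_min - lr_max)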
Example #20
def master():
    global test_env
    if env_name == 'CarRacing-v0':
        test_env = make_env(args=config_args, dream_env=False, with_obs=True)
    else:
        test_env = make_env(args=config_args,
                            dream_env=False,
                            render_mode=False)

    start_time = int(time.time())
    sprint("training", env_name)
    sprint("population", es.popsize)
    sprint("num_worker", num_worker)
    sprint("num_worker_trial", num_worker_trial)
    sys.stdout.flush()

    seeder = Seeder(seed_start)

    filename = filebase + '.json'
    filename_log = filebase + '.log.json'
    filename_hist = filebase + '.hist.json'
    filename_eval_hist = filebase + '.eval_hist.json'
    filename_hist_best = filebase + '.hist_best.json'
    filename_best = filebase + '.best.json'

    t = 0

    history = []
    history_best = []  # stores evaluation averages every 25 steps or so
    eval_log = []
    eval_hist = []
    best_reward_eval = 0
    best_model_params_eval = None

    max_len = -1  # max time steps (-1 means ignore)
    while True:
        solutions = es.ask()

        if antithetic:
            seeds = seeder.next_batch(int(es.popsize / 2))
            seeds = seeds + seeds
        else:
            seeds = seeder.next_batch(es.popsize)
        packet_list = encode_solution_packets(seeds,
                                              solutions,
                                              max_len=max_len)

        send_packets_to_slaves(packet_list)
        reward_list_total = receive_packets_from_slaves()

        reward_list = reward_list_total[:, 0]  # get rewards

        mean_time_step = int(np.mean(reward_list_total[:, 1]) *
                             100) / 100.  # get average time step
        max_time_step = int(np.max(reward_list_total[:, 1]) *
                            100) / 100.  # get max time step
        avg_reward = int(
            np.mean(reward_list) * 100) / 100.  # get average reward
        std_reward = int(
            np.std(reward_list) * 100) / 100.  # get reward std

        es.tell(reward_list)

        es_solution = es.result()
        model_params = es_solution[0]  # best historical solution
        reward = es_solution[1]  # best reward
        curr_reward = es_solution[2]  # best of the current batch
        controller.set_model_params(np.array(model_params).round(4))

        r_max = int(np.max(reward_list) * 100) / 100.
        r_min = int(np.min(reward_list) * 100) / 100.

        curr_time = int(time.time()) - start_time

        h = (t, curr_time, avg_reward, r_min, r_max, std_reward,
             int(es.rms_stdev() * 100000) / 100000., mean_time_step + 1.,
             int(max_time_step) + 1)

        if cap_time_mode:
            max_len = 2 * int(mean_time_step + 1.0)
        else:
            max_len = -1

        history.append(h)

        with open(filename, 'wt') as out:
            res = json.dump([np.array(es.current_param()).round(4).tolist()],
                            out,
                            sort_keys=True,
                            indent=2,
                            separators=(',', ': '))

        with open(filename_hist, 'wt') as out:
            res = json.dump(history,
                            out,
                            sort_keys=False,
                            indent=0,
                            separators=(',', ':'))

        sprint(env_name, h)

        if (t == 1):
            best_reward_eval = avg_reward
        if (t % eval_steps == 0):  # evaluate on actual task at hand

            prev_best_reward_eval = best_reward_eval
            model_params_quantized = np.array(es.current_param()).round(4)
            reward_eval_list = evaluate_batch(model_params_quantized,
                                              max_len=-1,
                                              test_seed=t)
            reward_eval = np.mean(reward_eval_list)
            r_eval_std = np.std(reward_eval_list)
            r_eval_min = np.min(reward_eval_list)
            r_eval_max = np.max(reward_eval_list)
            model_params_quantized = model_params_quantized.tolist()
            improvement = reward_eval - best_reward_eval
            eval_log.append([t, reward_eval, model_params_quantized])
            e_h = (t, reward_eval, r_eval_std, r_eval_min, r_eval_max)
            eval_hist.append(e_h)
            with open(filename_eval_hist, 'wt') as out:
                res = json.dump(eval_hist,
                                out,
                                sort_keys=False,
                                indent=0,
                                separators=(',', ':'))
            with open(filename_log, 'wt') as out:
                res = json.dump(eval_log, out)
            if (len(eval_log) == 1 or reward_eval > best_reward_eval):
                best_reward_eval = reward_eval
                best_model_params_eval = model_params_quantized
            else:
                if retrain_mode:
                    sprint(
                        "reset to previous best params, where best_reward_eval =",
                        best_reward_eval)
                    es.set_mu(best_model_params_eval)
            with open(filename_best, 'wt') as out:
                res = json.dump([best_model_params_eval, best_reward_eval],
                                out,
                                sort_keys=True,
                                indent=0,
                                separators=(',', ': '))
            # dump history of best
            curr_time = int(time.time()) - start_time
            best_record = [
                t, curr_time, "improvement", improvement, "curr", reward_eval,
                "prev", prev_best_reward_eval, "best", best_reward_eval
            ]
            history_best.append(best_record)
            with open(filename_hist_best, 'wt') as out:
                res = json.dump(history_best,
                                out,
                                sort_keys=False,
                                indent=0,
                                separators=(',', ':'))

            sprint("Eval", t, curr_time, "improvement", improvement, "curr",
                   reward_eval, "prev", prev_best_reward_eval, "best",
                   best_reward_eval)

        # increment generation
        t += 1
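send_packets_to_slaves and receive_packets_from_slaves are not part of this snippet; a sketch that mirrors the slave() examples (one int32 solution packet per worker rank over mpi4py, one result packet back), under the assumption that rank 0 is the master:

def send_packets_to_slaves(packet_list):
    assert len(packet_list) == num_worker
    for i in range(1, num_worker + 1):
        packet = np.asarray(packet_list[i - 1], dtype=np.int32)
        assert len(packet) == SOLUTION_PACKET_SIZE
        comm.Send(packet, dest=i)

def receive_packets_from_slaves():
    result_packet = np.empty(RESULT_PACKET_SIZE, dtype=np.int32)
    reward_list_total = np.zeros((population, 2))
    for i in range(1, num_worker + 1):
        comm.Recv(result_packet, source=i)
        results = result_packet.reshape(-1, 4).astype(np.float64)
        results[:, 2:4] /= PRECISION                # undo fixed-point scaling
        for r in results:
            jobidx = int(r[1])
            reward_list_total[jobidx, 0] = r[2]     # fitness
            reward_list_total[jobidx, 1] = r[3]     # timesteps
    return reward_list_total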
Example #21
global_env = make_env(
    [
        ("[]", (TYPE_NIL,)),
        ("true", (TYPE_BOOL, True)),
        ("false", (TYPE_BOOL, False)),
        ("cons", (TYPE_BUILTIN_FUNCTION, tagged_cons)),
        ("head", (TYPE_BUILTIN_FUNCTION, tagged_head)),
        ("tail", (TYPE_BUILTIN_FUNCTION, tagged_tail)),
        ("+", num_op(lambda x, y: x + y)),
        ("*", num_op(lambda x, y: x * y)),
        ("-", num_op(lambda x, y: x - y)),
        ("/", num_op(lambda x, y: x // y)),
        ("or", bool_op(lambda x, y: x or y)),
        ("and", bool_op(lambda x, y: x and y)),
        ("xor", bool_op(lambda x, y: x ^ y)),
        ("not", (TYPE_BUILTIN_FUNCTION, neg)),
        ("<", comp_op(lambda x, y: x < y)),
        (">", comp_op(lambda x, y: x > y)),
        ("<=", comp_op(lambda x, y: x <= y)),
        (">=", comp_op(lambda x, y: x >= y)),
        ("=", (TYPE_BUILTIN_FUNCTION, equal)),
        ("!=", (TYPE_BUILTIN_FUNCTION, unequal)),
        ("int_of_string", (TYPE_BUILTIN_FUNCTION, int_of_string)),
        ("string_of_int", (TYPE_BUILTIN_FUNCTION, string_of_int)),
        ("int_of_char", (TYPE_BUILTIN_FUNCTION, int_of_char)),
        ("char_of_int", (TYPE_BUILTIN_FUNCTION, char_of_int)),
        ("error", (TYPE_BUILTIN_FUNCTION, error)),
        ("concat", (TYPE_BUILTIN_FUNCTION, concat)),
    ]
)
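num_op, bool_op and comp_op are assumed helpers in this table; in the tagged-tuple convention used above they would wrap a plain Python binary function into a builtin. A sketch only (TYPE_INT is a guessed tag name that does not appear in the snippet):

def num_op(fn):
    # int x int -> int, operating on the payload of tagged values
    return (TYPE_BUILTIN_FUNCTION, lambda x, y: (TYPE_INT, fn(x[1], y[1])))

def comp_op(fn):
    # int x int -> bool comparison
    return (TYPE_BUILTIN_FUNCTION, lambda x, y: (TYPE_BOOL, fn(x[1], y[1])))

def bool_op(fn):
    # bool x bool -> bool
    return (TYPE_BUILTIN_FUNCTION, lambda x, y: (TYPE_BOOL, fn(x[1], y[1])))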
Example #22
 def make_env(self, seed=-1, render_mode=False):
     self.render_mode = render_mode
     self.env = make_env(self.env_name, seed=seed, render_mode=render_mode)
Example #23
def main(args):
    """
    Inputs type of agent, observation types and simulates the environment.
    """
    print("The observation tutorial will show you the various observation configurations available.")

    background_name = background_names[1]

    # load demo file for playback
    demo = args.load_demo = input('Input path to demo file, such as demos/Sawyer_7.pkl: ')
    if demo == '':
        demo = args.load_demo = 'demos/Sawyer_7.pkl'

    agent_name, furniture_id = demo.split('/')[-1].split('.')[0].split('_')
    agent_name = agent_name[0].upper() + agent_name[1:]
    furniture_id = int(furniture_id)
    furniture_name = furniture_names[furniture_id]

    # choose robot observation
    print()
    print("Include robot observation?\n")
    try:
        s = input("Put 1 for True or 0 for False: ")
        k = int(s) == 1
    except:
        print("Input is not valid. Use 0 by default.")
        k = False

    args.robot_ob = k

    # choose furniture observation
    print()
    print("Include furniture observation?\n")
    try:
        s = input("Put 1 for True or 0 for False: ")
        k = int(s) == 1
    except:
        print("Input is not valid. Use 0 by default.")
        k = False

    args.object_ob = k

    # choose segmentation
    print()
    print("Use segmentation?\n")
    try:
        s = input("Put 1 for True or 0 for False: ")
        k = int(s) == 1
    except:
        print("Input is not valid. Use 0 by default.")
        k = False

    use_seg = k

    # choose depth
    print()
    print("Use depth map?\n")
    try:
        s = input("Put 1 for True or 0 for False: ")
        k = int(s) == 1
    except:
        print("Input is not valid. Use 0 by default.")
        k = False

    use_depth = k

    # set parameters for the environment (env, furniture_id, background)
    env_name = 'Furniture{}Env'.format(agent_name)
    args.env = env_name
    args.furniture_id = furniture_id
    args.background = background_name

    print()
    print("Creating environment (robot: {}, furniture: {}, background: {})".format(
        env_name, furniture_name, background_name))


    # make environment with rgb, depth map, and segmentation
    args.depth_ob = True
    args.segmentation_ob = True

    # make environment following arguments
    env = make_env(env_name, args)
    ob = env.reset(args.furniture_id, args.background)

    # tell user about environment observation space
    print('-' * 80)
    print('Observation configuration:')
    print(f"Robot ob: {args.robot_ob}, Furniture ob: {args.object_ob}")
    print(f"Depth Map: {use_depth}, Segmentation Map: {use_seg}")
    print()
    print("Observation Space:\n")
    print("The observation space is a dictionary. For furniture (object) observations, it is "+
          "a multiple of 7 because each part has 3 dims for position and 4 dims for quaternion. "+
          "The robot_ob is dependent on the agent, and contains position, velocity, or angles of "+
          "the current robot.\n")
    print(env.observation_space)
    print()
    input("Type anything to record an episode's visual observations")


    # run the trajectory, save the video
    rgb_frames = []
    depth_frames = []
    seg_frames = []

    # load demo from pickle file
    with open(env._load_demo, 'rb') as f:
        demo = pickle.load(f)
        all_qpos = demo['qpos']

    # playback first 100 frames
    for qpos in all_qpos:
        # set furniture part positions
        for i, body in enumerate(env._object_names):
            pos = qpos[body][:3]
            quat = qpos[body][3:]
            env._set_qpos(body, pos, quat)
            env._stop_object(body, gravity=0)
        # set robot positions
        if env._agent_type == 'Sawyer':
            env.sim.data.qpos[env._ref_joint_pos_indexes] = qpos['sawyer_qpos']
            env.sim.data.qpos[env._ref_gripper_joint_pos_indexes] = qpos['l_gripper']
        elif env._agent_type == 'Baxter':
            env.sim.data.qpos[env._ref_joint_pos_indexes] = qpos['baxter_qpos']
            env.sim.data.qpos[env._ref_gripper_right_joint_pos_indexes] = qpos['r_gripper']
            env.sim.data.qpos[env._ref_gripper_left_joint_pos_indexes] = qpos['l_gripper']
        elif env._agent_type == 'Cursor':
            env._set_pos('cursor0', qpos['cursor0'])
            env._set_pos('cursor1', qpos['cursor1'])

        env.sim.forward()
        env._update_unity()

        img, depth = env.render('rgbd_array')
        seg = I.color_segmentation(env.render('segmentation'))
        rgb_frames.append(img)
        depth_frames.append(depth)
        seg_frames.append(seg)

    env.close()

    # concatenate available observation frames together and render video
    wide_frames = []
    L = max(len(rgb_frames), len(depth_frames), len(seg_frames))
    for l in range(L):
        rgb = rgb_frames[l]
        f = [rgb * 255]
        if use_depth:
            depth = depth_frames[l]
            f.append(depth * 255)
        if use_seg:
            seg = seg_frames[l]
            f.append(seg)
        wide = np.concatenate(f, axis=1)
        wide_frames.append(wide)

    vr = VideoRecorder()
    vr._frames = wide_frames
    vr.save_video('observations.mp4')
Example #24
def main(args):
    print("main")
    env_name = args.env_name
    total_episodes = args.total_episodes
    start_batch = args.start_batch
    time_steps = args.time_steps
    render = args.render
    batch_size = args.batch_size
    run_all_envs = args.run_all_envs

    store_folder = args.store_folder
    if not os.path.exists(store_folder):
        os.makedirs(store_folder)

    if run_all_envs:
        envs_to_generate = config.train_envs
    else:
        envs_to_generate = [env_name]

    print("envs:", envs_to_generate)
    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))

        env = make_env(current_env_name)
        s = 0
        batch = start_batch

        batch_size = min(batch_size, total_episodes)

        total_frames = 0
        while s < total_episodes:
            obs_data = []
            action_data = []

            for i_episode in range(batch_size):
                print('-----')
                observation = env._reset()
                #observation = config.adjust_obs(observation)

                # plt.imshow(observation)
                # plt.show()

                env.render()
                done = False
                action = np.random.rand() *2.0 -1.0
                t = 0
                obs_sequence = []
                action_sequence = []
                repeat = np.random.randint(1, 11)

                while t < time_steps:  # and not done:
                    t = t + 1
                    if t % repeat == 0:
                        action = np.random.rand() * 2.0 - 1.0
                        repeat = np.random.randint(1, 11)

                    obs_sequence.append(observation)
                    action_sequence.append(action)

                    observation, reward, done, info = env._step(action)

                    if render:
                        env.render()

                    if done: #If we were killed
                        break

                total_frames += t
                print("dead at", t, "total recorded frames for this worker", total_frames)


                obs_data.append(obs_sequence)
                action_data.append(action_sequence)

                print("Batch {} Episode {} finished after {} timesteps".format(batch, i_episode, t + 1))
                print("Current dataset contains {} observations".format(sum(map(len, obs_data))))

                s = s + 1

            print("Saving dataset for batch {}".format(batch))
            np.save(store_folder+'/obs_data_' + current_env_name + '_' + str(batch), obs_data)
            print("Saving actions for batch {}".format(batch))
            np.save(store_folder+'/action_data_' + current_env_name + '_' + str(batch), action_data)

            batch = batch + 1

        env.close()
Example #25
 def make_env(self, env_name, seed=-1, render_mode=False):
   self.render_mode = render_mode
   self.env_name = env_name
   self.env = make_env(env_name, seed=seed, render_mode=render_mode)
Example #26
 def _make_env(self):
     self.render_mode = render_mode
     self.env = make_env(self.env_name)
     self.num_actions = self.env.action_space.n
Example #27
def ddpg(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak,
         batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len,
         test_max_ep_len, number_of_tests_per_epoch, act_noise, logger_kwargs,
         seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    actor_critic = core.get_ddpg_actor_critic(ac_type)
    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        pi_a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    actions = []
    epoch_actions = []
    rewards = []
    rets = []
    test_rets = []
    max_ret = None
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            pi_a, real_a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1
        epoch_actions.append(pi_a)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            actions.append(np.mean(epoch_actions))
            epoch_actions = []
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('QVals', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            max_ep_len += inc_ep
            util.plot_actions(test_actions, act_high,
                              logger.output_dir + '/actions%s.png' % epoch)

    logger.save_state(
        {
            "actions": actions,
            "rewards": rewards,
            "best_test_actions": best_test_actions,
            "rets": rets,
            "test_rets": test_rets,
            "max_ret": max_ret
        }, None)

    util.plot_actions(best_test_actions, act_high,
                      logger.output_dir + '/best_test_actions.png')
    logger.log("max ret: %f" % max_ret)
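The replay buffer class is resolved via get_replay_buffer(rb_type); a minimal uniform buffer compatible with the batch keys used above ('obs1', 'obs2', 'acts', 'rews', 'done') could look like this sketch (not the repository's code):

import numpy as np

class SimpleReplayBuffer:
    def __init__(self, obs_dim, act_dim, size=int(1e6)):
        self.obs1 = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2 = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts = np.zeros((size, act_dim), dtype=np.float32)
        self.rews = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, o, a, r, o2, d):
        self.obs1[self.ptr] = o
        self.obs2[self.ptr] = o2
        self.acts[self.ptr] = a
        self.rews[self.ptr] = r
        self.done[self.ptr] = d
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=100):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1[idxs], obs2=self.obs2[idxs],
                    acts=self.acts[idxs], rews=self.rews[idxs],
                    done=self.done[idxs])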
Example #28
exp_name = "control"
seed = 1007
env_config = '1x1_mix_ms_una.json'
iterations = 1
max_ep_len = 600

wp = 0
wi = 0.1
wd = 0.1

initial_bound = 1.1
final_bound = 1.01
bound_decay = 0.98

env = make_env(util.ENV_CONFIG_DIR + env_config)

obs = []
actions = []
action_sign = np.array([-1, -1])
for i in range(iterations):
    current_bound = initial_bound
    o = env.reset()
    real_action = env.action_space.default() * 0.5
    for t in range(max_ep_len):
        o, r, d, _ = env.step(real_action)
        obs.append(o)
        actions.append(real_action)

        vp = o
        vi = np.mean(obs[-5:])
Example #29
    model.make_env(render_mode=render_mode)
    model.load_model(filename)
  else:
    model = make_model(load_model=False)
    print('model size', model.param_count)
    model.make_env(render_mode=render_mode)
    model.init_random_model_params(stdev=np.random.rand()*0.01)

  N_episode = 100
  if render_mode:
    N_episode = 1
  reward_list = []
  for i in range(N_episode):
    reward, steps_taken = simulate(model,
      train_mode=False, render_mode=render_mode, num_episode=1)
    if render_mode:
      print("terminal reward", reward, "average steps taken", np.mean(steps_taken)+1)
    else:
      print(reward[0])
    reward_list.append(reward[0])
  if not render_mode:
    print("seed", the_seed, "average_reward", np.mean(reward_list), "stdev", np.std(reward_list))

if __name__ == "__main__":
  import env
  e = env.make_env()
  c = Controller()
  import pdb; pdb.set_trace()
  r, t = simulate(c, e, render_mode=False)
  #main()
Example #30
def learn(sess, n_tasks, z_size, data_dir, num_steps, max_seq_len,
          batch_size_per_task=16, rnn_size=256,
          grad_clip=1.0, v_lr=0.0001, vr_lr=0.0001,
          min_v_lr=0.00001, v_decay=0.999, kl_tolerance=0.5,
          lr=0.001, min_lr=0.00001, decay=0.999,
          view="transposed",
          model_dir="tf_rnn", layer_norm=False,
          rnn_mmd=False, no_cor=False,
          w_mmd=1.0,
          alpha=1.0, beta=0.1,
          recurrent_dp=1.0,
          input_dp=1.0,
          output_dp=1.0):
  batch_size = batch_size_per_task * n_tasks

  wrapper = WrapperFactory.get_wrapper(view)
  if wrapper is None:
    raise Exception("Such view is not available")

  print("Batch size for each taks is", batch_size_per_task)
  print("The total batch size is", batch_size)

  check_dir(model_dir)
  lf = open(model_dir + '/log_%s' % datetime.now().isoformat(), "w")
  # define env
  na = make_env(config.env_name).action_space.n
  input_size = z_size + na
  output_size = z_size
  print("the environment", config.env_name, "has %i actions" % na)

  seq_len = max_seq_len

  fns = os.listdir(data_dir)
  fns = [fn for fn in fns if '.npz' in fn]
  random.shuffle(fns)
  dm = get_dm(wrapper, seq_len, na, data_dir, fns, not no_cor)
  tf_vrct_lr = tf.placeholder(tf.float32,
                              shape=[])  # learn from reconstruction.
  vaes, vcomps = build_vaes(n_tasks, na, z_size, seq_len, tf_vrct_lr,
                            kl_tolerance)
  vae_losses = [vcomp.loss for vcomp in vcomps]
  transform_loss = get_transform_loss(vcomps[0], vaes[1], wrapper)

  old_vae0 = ConvVAE(name="old_vae0", z_size=z_size)
  old_vcomp0 = build_vae("old_vae0", old_vae0, na, z_size, seq_len,
                         tf_vrct_lr, kl_tolerance)
  assign_old_eq_new = tf.group([tf.assign(oldv, newv)
                                for (oldv, newv) in
                                zip(old_vcomp0.var_list, vcomps[0].var_list)])

  vmmd_losses = get_vmmd_losses(n_tasks, old_vcomp0, vcomps, alpha, beta)
  vrec_ops = get_vae_rec_ops(n_tasks, vcomps, vmmd_losses, w_mmd)
  vrec_all_op = tf.group(vrec_ops)

  # Meta RNN.
  rnn = VRNN("rnn", max_seq_len, input_size, output_size, batch_size_per_task,
             rnn_size, layer_norm, recurrent_dp, input_dp, output_dp)

  global_step = tf.Variable(0, name='global_step', trainable=False)
  tf_rpred_lr = tf.placeholder(tf.float32, shape=[])
  rcomp0 = build_rnn("rnn", rnn, na, z_size, batch_size_per_task, seq_len)

  print("The basic rnn has been built")

  rcomps = build_rnns(n_tasks, rnn, vaes, vcomps, kl_tolerance)
  rnn_losses = [rcomp.loss for rcomp in rcomps]

  if rnn_mmd:
    rmmd_losses = get_rmmd_losses(n_tasks, old_vcomp0, vcomps, alpha, beta)
    for i in range(n_tasks):
      rnn_losses[i] += 0.1 * rmmd_losses[i]

  ptransform_loss = get_predicted_transform_loss(vcomps[0], rcomps[0],
                                                 vaes[1],
                                                 wrapper, batch_size_per_task,
                                                 seq_len)
  print("RNN has been connected to each VAE")

  rnn_total_loss = tf.reduce_mean(rnn_losses)
  rpred_opt = tf.train.AdamOptimizer(tf_rpred_lr, name="rpred_opt")
  gvs = rpred_opt.compute_gradients(rnn_total_loss, rcomp0.var_list)
  clip_gvs = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var) for
              grad, var in gvs if grad is not None]
  rpred_op = rpred_opt.apply_gradients(clip_gvs, global_step=global_step,
                                       name='rpred_op')

  # VAE in prediction phase
  vpred_ops, tf_vpred_lrs = get_vae_pred_ops(n_tasks, vcomps, rnn_losses)
  vpred_all_op = tf.group(vpred_ops)

  rpred_lr = lr
  vrct_lr = v_lr
  vpred_lr = vr_lr
  sess.run(tf.global_variables_initializer())

  for i in range(num_steps):

    step = sess.run(global_step)
    rpred_lr = (rpred_lr - min_lr) * decay + min_lr
    vrct_lr = (vrct_lr - min_v_lr) * v_decay + min_v_lr
    vpred_lr = (vpred_lr - min_v_lr) * v_decay + min_v_lr

    ratio = 1.0

    data_buffer = []

    for it in range(config.psteps_per_it):
      raw_obs_list, raw_a_list = dm.random_batch(batch_size_per_task)
      data_buffer.append((raw_obs_list, raw_a_list))

      feed = {tf_rpred_lr: rpred_lr, tf_vrct_lr: vrct_lr,
              tf_vpred_lrs[0]: vpred_lr,
              tf_vpred_lrs[1]: vpred_lr * ratio}
      feed[old_vcomp0.x] = raw_obs_list[0]
      for j in range(n_tasks):
        vcomp = vcomps[j]
        feed[vcomp.x] = raw_obs_list[j]
        feed[vcomp.a] = raw_a_list[j][:, :-1, :]

      (rnn_cost, rnn_cost2, vae_cost, vae_cost2,
       transform_cost, ptransform_cost, _, _) = sess.run(
        [rnn_losses[0], rnn_losses[1],
         vae_losses[0], vae_losses[1],
         transform_loss, ptransform_loss,
         rpred_op, vpred_all_op], feed)
      ratio = rnn_cost2 / rnn_cost

    if i % config.log_interval == 0:
      output_log = get_output_log(step, rpred_lr, [vae_cost], [rnn_cost], [transform_cost], [ptransform_cost])
      lf.write(output_log)

    data_order = np.arange(len(data_buffer))
    nd = len(data_order)
    np.random.shuffle(data_order)

    for it in range(config.rsteps_per_it):
      if (it + 1) % nd == 0:
        np.random.shuffle(data_order)
      rid = data_order[it % nd]

      raw_obs_list, raw_a_list = data_buffer[rid]
      # raw_obs_list, raw_a_list = dm.random_batch(batch_size_per_task)

      feed = {tf_rpred_lr: rpred_lr, tf_vrct_lr: vrct_lr}
      feed[old_vcomp0.x] = raw_obs_list[0]
      for j in range(n_tasks):
        vcomp = vcomps[j]
        feed[vcomp.x] = raw_obs_list[j]
        feed[vcomp.a] = raw_a_list[j][:, :-1, :]

      (rnn_cost, rnn_cost2, vae_cost, vae_cost2, transform_cost,
       ptransform_cost, _) = sess.run([
        rnn_losses[0], rnn_losses[1],
        vae_losses[0], vae_losses[1],
        transform_loss, ptransform_loss,
        vrec_all_op], feed)

    if i % config.log_interval == 0:
      output_log = get_output_log(step, rpred_lr, [vae_cost], [rnn_cost], [transform_cost], [ptransform_cost])
      lf.write(output_log)

    lf.flush()

    if (i + 1) % config.target_update_interval == 0:
      sess.run(assign_old_eq_new)

    if i % config.model_save_interval == 0:
      tmp_dir = model_dir + '/it_%i' % i
      check_dir(tmp_dir)
      saveToFlat(rcomp0.var_list, tmp_dir + '/rnn.p')
      for j in range(n_tasks):
        vcomp = vcomps[j]
        saveToFlat(vcomp.var_list, tmp_dir + '/vae%i.p' % j)

  saveToFlat(rcomp0.var_list, model_dir + '/final_rnn.p')
  for i in range(n_tasks):
    vcomp = vcomps[i]
    saveToFlat(vcomp.var_list, model_dir + '/final_vae%i.p' % i)
Example #31
def actor():
    print(f"STARTING ACTOR with rank {rank}")
    sys.stdout.flush()

    # GAE hyper-parameters
    lam = 0.95
    gamma = 0.99

    # Build network architecture
    nav = Navigation(1, training=False)
    nav.call_build()

    # Get agent type
    agent_type = np.where(np.array(actors) == rank)[0][0]

    # Setup environment
    env = make_env()
    obs = env.reset()
    dones = False

    while True:
        weights = comm.recv(source=learners[agent_type])
        nav.set_weights(weights)

        mb_rewards = np.zeros([nsteps, 1], dtype=np.float32)
        mb_values = np.zeros([nsteps, 1], dtype=np.float32)
        mb_neglogpacs = np.zeros([nsteps, 1], dtype=np.float32)
        mb_dones = np.zeros([nsteps, 1], dtype=np.float32)
        mb_obs = np.zeros([nsteps, 16], dtype=np.float32)
        mb_actions = {
            'x1': np.zeros([nsteps, 1], dtype=np.int32),
            'x2': np.zeros([nsteps, 1], dtype=np.int32),
            'y1': np.zeros([nsteps, 1], dtype=np.int32),
            'y2': np.zeros([nsteps, 1], dtype=np.int32),
            'w1': np.zeros([nsteps, 1], dtype=np.int32),
            'w2': np.zeros([nsteps, 1], dtype=np.int32)
        }
        mb_logits = {
            'x1': np.zeros([nsteps, 21], dtype=np.float32),
            'x2': np.zeros([nsteps, 21], dtype=np.float32),
            'y1': np.zeros([nsteps, 21], dtype=np.float32),
            'y2': np.zeros([nsteps, 21], dtype=np.float32),
            'w1': np.zeros([nsteps, 21], dtype=np.float32),
            'w2': np.zeros([nsteps, 21], dtype=np.float32)
        }

        for i in range(nsteps):
            # Get actions of training agent
            actions, neglogp, entropy, value, logits = nav(
                np.expand_dims(obs, axis=0))

            mb_values[i] = value
            mb_neglogpacs[i] = neglogp
            mb_obs[i] = obs
            for k in actions.keys():
                mb_actions[k][i] = actions[k]
                mb_logits[k][i] = logits[k]
            mb_dones[i] = dones

            # Take actions in env and look at the results
            actions = {k: (v[0] - 10) / 10 for k, v in actions.items()}
            agent_actions = np.array(
                [[actions['x1'], actions['y1'], actions['w1']],
                 [actions['x2'], actions['y2'], actions['w2']]])
            obs, rewards, dones, infos = env.step(agent_actions)

            # Handle rewards
            mb_rewards[i] = rewards

            if dones:
                obs = env.reset()

        # get last value for bootstrap
        _, _, _, last_values, _ = nav(np.expand_dims(obs, axis=0))

        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        # perform GAE calculation
        for t in reversed(range(nsteps)):
            if t == nsteps - 1:
                nextnonterminal = 1.0 - dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[
                t] + gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[
                t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values

        # Send trajectory to learner
        mb_values = np.squeeze(mb_values, axis=-1)
        mb_rewards = np.squeeze(mb_rewards, axis=-1)
        mb_neglogpacs = np.squeeze(mb_neglogpacs, axis=-1)
        mb_returns = np.squeeze(mb_returns, axis=-1)
        mb_dones = np.squeeze(mb_dones, axis=-1)

        trajectory = {
            'mb_obs': mb_obs,
            'mb_actions': mb_actions,
            'mb_logits': mb_logits,
            'mb_returns': mb_returns,
            'mb_dones': mb_dones,
            'mb_values': mb_values,
            'mb_neglogpacs': mb_neglogpacs,
            'mb_rewards': mb_rewards
        }

        comm.send(trajectory, dest=learners[agent_type])
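The matching learner process is not shown; given how the actor uses comm.recv and comm.send, a minimal learner loop (a sketch only, assuming a Keras-style nav.get_weights() counterpart and a hypothetical my_actors list of paired actor ranks) would push weights out and gather trajectories before each update:

while True:
    for a in my_actors:
        comm.send(nav.get_weights(), dest=a)              # broadcast current policy
    trajectories = [comm.recv(source=a) for a in my_actors]
    # ...compute PPO losses from mb_obs, mb_actions, mb_returns, etc. and update nav...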
Example #32
def main(args):

    env_name = args.env_name
    total_episodes = args.total_episodes
    time_steps = args.time_steps
    render = args.render
    run_all_envs = args.run_all_envs
    action_refresh_rate = args.action_refresh_rate

    if run_all_envs:
        envs_to_generate = config.train_envs
    else:
        envs_to_generate = [env_name]

    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))

        env = make_env(current_env_name)  # <1>
        s = 0

        while s < total_episodes:

            episode_id = random.randint(0, 2**31 - 1)
            filename = DIR_NAME + str(episode_id) + ".npz"

            observation = env.reset()

            env.render()

            t = 0

            obs_sequence = []
            action_sequence = []
            reward_sequence = []
            done_sequence = []

            reward = -0.1
            done = False

            while t < time_steps:  # and not done:
                if t % action_refresh_rate == 0:
                    action = config.generate_data_action(t, env)  # <2>

                observation = config.adjust_obs(observation)  # <3>

                obs_sequence.append(observation)
                action_sequence.append(action)
                reward_sequence.append(reward)
                done_sequence.append(done)

                observation, reward, done, info = env.step(action)  # <4>

                t = t + 1

                if render:
                    env.render()

            print("Episode {} finished after {} timesteps".format(s, t))

            np.savez_compressed(filename,
                                obs=obs_sequence,
                                action=action_sequence,
                                reward=reward_sequence,
                                done=done_sequence)  # <4>

            s = s + 1

        env.close()
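The episode files written above can be read back with np.load; the keys follow the np.savez_compressed call (the file name below is only a placeholder):

data = np.load(DIR_NAME + "1234567.npz")
obs, action = data["obs"], data["action"]
reward, done = data["reward"], data["done"]
print(obs.shape, action.shape, reward.shape, done.shape)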
Example #33
def main(args):

    env_name = args.env_name
    total_episodes = args.total_episodes
    start_batch = args.start_batch
    time_steps = args.time_steps
    render = args.render
    batch_size = args.batch_size
    run_all_envs = args.run_all_envs

    if run_all_envs:
        envs_to_generate = config.train_envs
    else:
        envs_to_generate = [env_name]


    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))

        env = make_env(current_env_name)
        s = 0
        batch = start_batch

        batch_size = min(batch_size, total_episodes)

        while s < total_episodes:
            obs_data = []
            action_data = []

            for i_episode in range(batch_size):
                print('-----')
                observation = env.reset()
                observation = config.adjust_obs(observation)

                # plt.imshow(observation)
                # plt.show()

                env.render()
                done = False
                action = env.action_space.sample()
                t = 0
                obs_sequence = []
                action_sequence = []

                while t < time_steps: #and not done:
                    t = t + 1
                    
                    action = config.generate_data_action(t, action)
                    
                    obs_sequence.append(observation)
                    action_sequence.append(action)

                    observation, reward, done, info = env.step(action)
                    observation = config.adjust_obs(observation)

                    if render:
                        env.render()

                obs_data.append(obs_sequence)
                action_data.append(action_sequence)
                
                print("Batch {} Episode {} finished after {} timesteps".format(batch, i_episode, t+1))
                print("Current dataset contains {} observations".format(sum(map(len, obs_data))))

                s = s + 1

            print("Saving dataset for batch {}".format(batch))
            np.save('./data/obs_data_' + current_env_name + '_' + str(batch), obs_data)
            np.save('./data/action_data_' + current_env_name + '_' + str(batch), action_data)

            batch = batch + 1

        env.close()