Example 1
    def on_policy(self):
        """Perform n_steps on-policy, and return the data necessary for on-policy update,
        and updates shared_counter.

        Returns:
            training_data (list): A list of TraceTrainingData objects, one for each episode run.
                Only the last object may contain a last_state attribute corresponding to the
                state at which the last episode was cut.
        """
        t = 0
        training_data = [] if self.done else [TraceTrainingData()]
        while t < self.n_steps:
            if self.done:  # Re-initialize objects for new episode
                self.cur_state = utils.state_to_tensor(self.env.reset())
                self.done = False
                training_data.append(TraceTrainingData())
                if len(self.episode_rewards) > 0:
                    self.rewards.append(sum(self.episode_rewards))
                    self.episode_lengths.append(len(self.episode_rewards))
                    self.episode_rewards = []

            # Compute policy and q_values. Elements used in training are deliberately not detached,
            # since that avoids recomputing them in _train()
            policy, q_values = self.model(self.cur_state)
            value = (policy * q_values).sum(dim=1, keepdim=True)
            with torch.no_grad():
                avg_policy, _ = self.shared_avg_model(self.cur_state)
            action = torch.multinomial(policy, num_samples=1)[0, 0]

            next_state, reward, done, _ = self.env.step(action.item())
            next_state = utils.state_to_tensor(next_state)

            # Save transition in replay buffer
            self.replay_buffer.append_transition(
                (self.cur_state, torch.LongTensor([[action.item()]]),
                 policy.detach(), torch.LongTensor([[reward]]), done))
            # Save data for training (all tensors have first dimension 1)
            training_data[-1].append(action=torch.LongTensor([[action]]),
                                     policy=policy,
                                     q_values=q_values,
                                     value=value,
                                     reward=torch.Tensor([[reward]]),
                                     average_policy=avg_policy)

            # Update loop data
            t += 1
            self.done = done
            self.cur_state = next_state
            self.episode_rewards.append(reward)

        if not self.done:
            training_data[-1].last_state = self.cur_state
            self.replay_buffer.cutoff(
                self.cur_state)  # Notify the replay buffer that the trajectory was cut off
        self.shared_counter.increment(t)
        return training_data
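
The `on_policy` method above assumes a `TraceTrainingData` container that accumulates per-step tensors and, when an episode is cut at `n_steps`, carries a `last_state`. The class is defined elsewhere in that project; the following is only a minimal sketch of the interface implied by the calls above, and the attribute names are assumptions:

class TraceTrainingData:
    """Per-episode buffer of on-policy training tensors (sketch of the assumed interface)."""

    def __init__(self):
        # One list per quantity appended in on_policy(); every element keeps a batch dimension of 1
        self.actions, self.policies, self.q_values = [], [], []
        self.values, self.rewards, self.average_policies = [], [], []
        self.last_state = None  # Set only when the episode was cut short at n_steps

    def append(self, action, policy, q_values, value, reward, average_policy):
        self.actions.append(action)
        self.policies.append(policy)
        self.q_values.append(q_values)
        self.values.append(value)
        self.rewards.append(reward)
        self.average_policies.append(average_policy)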
Example 2
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)

    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 selects the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes,
            args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)

                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))

                # The shared average model lives on the CPU, so the state must be moved there
                if gpu_id >= 0:
                    to_avg_state = state.cpu()
                else:
                    to_avg_state = state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))
                # if gpu_id >= 0:
                #     average_policies = average_policies.cuda()
                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()

                reward = args.reward_clip and min(max(
                    reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward,
                                  policy.detach())  # Save just tensors
                # Save outputs for online training
                [
                    arr.append(el) for arr, el in zip((
                        policies, Qs, Vs, actions, rewards,
                        average_policies), (policy, Q, V,
                                            torch.LongTensor([[action]]),
                                            torch.Tensor([[reward]]),
                                            average_policy))
                ]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy
            if gpu_id >= 0:
                Qs = list(map(lambda x: x.cpu(), Qs))
                Vs = list(map(lambda x: x.cpu(), Vs))
                policies = list(map(lambda x: x.cpu(), policies))
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i]), 0)
                    action = torch.LongTensor([
                        trajectory.action for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    reward = torch.Tensor([
                        trajectory.reward for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(trajectory.policy
                              for trajectory in trajectories[i]), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    average_policy, _, _, (avg_hx,
                                           avg_cx) = shared_average_model(
                                               state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [
                        arr.append(el)
                        for arr, el in zip((policies, Qs, Vs, actions, rewards,
                                            average_policies, old_policies), (
                                                policy, Q, V, action, reward,
                                                average_policy, old_policy))
                    ]

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i + 1]), 0)
                    done = torch.Tensor([
                        trajectory.action is None
                        for trajectory in trajectories[i + 1]
                    ]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = list(map(lambda x: x.cpu(), Qs))
                    Vs = list(map(lambda x: x.cpu(), Vs))
                    policies = list(map(lambda x: x.cpu(), policies))
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
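
Both the loop above and Example 6 below draw the number of off-policy replay updates from `_poisson(args.replay_ratio)`, a helper not shown on this page. A minimal sketch, assuming a Knuth-style Poisson sampler (the actual implementation may differ):

import math
import random


def _poisson(lmbd):
    # Knuth's algorithm: multiply uniform samples until the running product drops below exp(-lambda)
    L, k, p = math.exp(-lmbd), 0, 1.0
    while True:
        k += 1
        p *= random.uniform(0, 1)
        if p <= L:
            return k - 1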
Example 3
File: test.py Project: Luo1996/ACER
def test(rank, args, T, shared_model):
  torch.manual_seed(args.seed + rank)

  env = gym.make(args.env)
  env.seed(args.seed + rank)
  model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
  model.eval()

  can_test = True  # Test flag
  t_start = 1  # Test step counter to check against global counter
  rewards, steps = [], []  # Rewards and steps for plotting
  l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
  done = True  # Start new episode

  while T.value() <= args.T_max:
    if can_test:
      t_start = T.value()  # Reset counter

      # Evaluate over several episodes and average results
      avg_rewards, avg_episode_lengths = [], []
      for _ in range(args.evaluation_episodes):
        while True:
          # Reset or pass on hidden state
          if done:
            # Sync with shared model every episode
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            cx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            done, episode_length = False, 0
            reward_sum = 0

          # Optionally render validation states
          if args.render:
            env.render()

          # Calculate policy
          policy, _, _, (hx, cx) = model(Variable(state, volatile=True), (hx.detach(), cx.detach()))  # Break graph for memory efficiency

          # Choose action greedily
          action = policy.max(1)[1].data[0, 0]

          # Step
          state, reward, done, _ = env.step(action)
          state = state_to_tensor(state)
          reward_sum += reward
          done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
          episode_length += 1  # Increase episode counter

          # Log and reset statistics at the end of every episode
          if done:
            avg_rewards.append(reward_sum)
            avg_episode_lengths.append(episode_length)
            break

      print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
            datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
            t_start,
            sum(avg_rewards) / args.evaluation_episodes,
            sum(avg_episode_lengths) / args.evaluation_episodes))

      if args.evaluate:
        return

      rewards.append(avg_rewards)  # Keep all evaluations
      steps.append(t_start)
      plot_line(steps, rewards)  # Plot rewards
      torch.save(model.state_dict(), 'model.pth')  # Save model params
      can_test = False  # Finish testing
    else:
      if T.value() - t_start >= args.evaluation_interval:
        can_test = True

    time.sleep(0.001)  # Check if available to test every millisecond

  env.close()
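
These test and train loops all convert raw gym observations with `state_to_tensor`, which is imported from the projects' utility modules and not shown here. For flat observation vectors it presumably just casts to float and adds a batch dimension; a plausible sketch (an assumption, not the actual helper):

import torch


def state_to_tensor(state):
    # Wrap a NumPy observation as a float tensor with a leading batch dimension of 1
    return torch.from_numpy(state).float().unsqueeze(0)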
Example 4
def test(rank, args, T, shared_model):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.eval()

    save_dir = os.path.join('results', args.name)

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    # stores step, reward, avg_steps and time
    results_dict = {'t': [], 'reward': [], 'avg_steps': [], 'time': []}

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = torch.zeros(1, args.hidden_size)
                        cx = torch.zeros(1, args.hidden_size)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        done, episode_length = False, 0
                        reward_sum = 0

                    # Optionally render validation states
                    if args.render:
                        env.render()

                    # Calculate policy
                    with torch.no_grad():
                        policy, _, _, (hx, cx), _ = model(state, (hx, cx))

                    # Choose action greedily
                    action = policy.max(1)[1][0]

                    # Step
                    state, reward, done, _ = env.step(action.item())
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break
            print(('[{}] Step: {:<' + l +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
                       t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))
            fields = [
                t_start,
                sum(avg_rewards) / args.evaluation_episodes,
                sum(avg_episode_lengths) / args.evaluation_episodes,
                str(datetime.now())
            ]

            # storing data in the dictionary.
            results_dict['t'].append(t_start)
            results_dict['reward'].append(
                sum(avg_rewards) / args.evaluation_episodes)
            results_dict['avg_steps'].append(
                sum(avg_episode_lengths) / args.evaluation_episodes)
            results_dict['time'].append(str(datetime.now()))

            # Dumping the results in pickle format
            with open(os.path.join(save_dir, 'results.pck'), 'wb') as f:
                pickle.dump(results_dict, f)

            # Saving the data in csv format
            with open(os.path.join(save_dir, 'results.csv'), 'a') as f:
                writer = csv.writer(f)
                writer.writerow(fields)

            if args.evaluate:
                return

            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards, save_dir)  # Plot rewards
            torch.save(model.state_dict(),
                       os.path.join(save_dir,
                                    'model.pth'))  # Save model params
            #   torch.save(model.state_dict(), os.path.join(save_dir, 'model_{}.pth'.format(t_start)))  # Save model params
            can_test = False  # Finish testing
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Check if available to test every millisecond

    # Dumping the results in pickle format
    with open(os.path.join(save_dir, 'results.pck'), 'wb') as f:
        pickle.dump(results_dict, f)

    env.close()
Example 5
def train(rank, args, T, shared_model, optimiser):
	torch.manual_seed(args.seed + rank)

	env = gym.make(args.env)
	env.seed(args.seed + rank)
	model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
	model.train()

	t = 1  # Thread step counter
	epr, eploss, done  = 0, 0, True # Start new episode

	while T.value() <= args.T_max:
		while True:
			model.load_state_dict(shared_model.state_dict()) # sync with shared model
			# Get starting timestep
			t_start = t

			policies, Vs, actions, rewards = [], [], [], []  # Save values for computing gradients

			# Reset or pass on hidden state
			if done:
				hx, avg_hx = Variable(torch.zeros(1, args.hidden_size)), Variable(torch.zeros(1, args.hidden_size))
				cx, avg_cx = Variable(torch.zeros(1, args.hidden_size)), Variable(torch.zeros(1, args.hidden_size))
				# Reset environment and done flag
				state = state_to_tensor(env.reset())
				done, episode_length = False, 0
			else:
				# Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
				hx = hx.detach()
				cx = cx.detach()

			while not done and t - t_start < args.t_max:
				# Calculate policy and values
				policy, V, (hx, cx) = model(Variable(state), (hx, cx))

				# Sample action
				action = policy.multinomial().data[0, 0]

				# Step
				next_state, reward, done, _ = env.step(action)
				next_state = state_to_tensor(next_state)
				reward = args.reward_clip and min(max(reward, -1), 1) or reward  # Optionally clamp rewards
				done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
				episode_length += 1  # Increase episode counter
								
				# Save outputs for online training
				[arr.append(el) for arr, el in zip((policies, Vs, actions, rewards),
									 (policy, V, Variable(torch.LongTensor([[action]])), Variable(torch.Tensor([[reward]]))))]

				# Increment counters
				t += 1
				T.increment()

				# Update state
				state = next_state

			if done:
				R = Variable(torch.zeros(1, 1))
			else:
				# R = V(s_i; θ) for non-terminal s
				_, R, _ = model(Variable(state), (hx, cx))
				R = R.detach()

			# Train the network on-policy
			p_loss, v_loss = _train(args, T, model, shared_model, optimiser, policies, Vs, actions, rewards, R)

			# Finish episode
			if done:
				break
Example 6
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.train()

    if not args.on_policy:
        memory = EpisodicReplayMemory(args.memory_capacity,
                                      args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                hx, avg_hx = Variable(torch.zeros(1,
                                                  args.hidden_size)), Variable(
                                                      torch.zeros(
                                                          1, args.hidden_size))
                cx, avg_cx = Variable(torch.zeros(1,
                                                  args.hidden_size)), Variable(
                                                      torch.zeros(
                                                          1, args.hidden_size))
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                action, reward, done, episode_length = 0, 0, False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                input = extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward)
                policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    Variable(input), (avg_hx, avg_cx))

                # Sample action
                action = policy.multinomial().data[
                    0,
                    0]  # Graph broken as loss for stochastic action calculated manually

                # Step
                next_state, reward, done, _ = env.step(action)
                next_state = state_to_tensor(next_state)
                reward = args.reward_clip and min(max(
                    reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(input, action, reward,
                                  policy.data)  # Save just tensors
                # Save outputs for online training
                [
                    arr.append(el) for arr, el in zip((
                        policies, Qs, Vs, actions, rewards, average_policies
                    ), (policy, Q, V, Variable(torch.LongTensor([[action]])),
                        Variable(torch.Tensor([[reward]])), average_policy))
                ]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = Variable(torch.zeros(1, 1))

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(
                        extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward), None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(Variable(input), (hx, cx))
                Qret = Qret.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                hx, avg_hx = Variable(
                    torch.zeros(args.batch_size, args.hidden_size)), Variable(
                        torch.zeros(args.batch_size, args.hidden_size))
                cx, avg_cx = Variable(
                    torch.zeros(args.batch_size, args.hidden_size)), Variable(
                        torch.zeros(args.batch_size, args.hidden_size))

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    input = torch.cat(tuple(trajectory.state
                                            for trajectory in trajectories[i]), 0)
                    action = Variable(
                        torch.LongTensor([
                            trajectory.action for trajectory in trajectories[i]
                        ])).unsqueeze(1)
                    reward = Variable(
                        torch.Tensor([
                            trajectory.reward for trajectory in trajectories[i]
                        ])).unsqueeze(1)
                    old_policy = Variable(
                        torch.cat(tuple(trajectory.policy
                                        for trajectory in trajectories[i]), 0))

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
                    average_policy, _, _, (avg_hx,
                                           avg_cx) = shared_average_model(
                                               Variable(input),
                                               (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [
                        arr.append(el)
                        for arr, el in zip((policies, Qs, Vs, actions, rewards,
                                            average_policies, old_policies), (
                                                policy, Q, V, action, reward,
                                                average_policy, old_policy))
                    ]

                    # Unpack second half of transition
                    next_input = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i + 1]), 0)
                    done = Variable(
                        torch.Tensor([
                            trajectory.action is None
                            for trajectory in trajectories[i + 1]
                        ]).unsqueeze(1))

                # Do forward pass for all transitions
                _, _, Qret, _ = model(Variable(next_input), (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach()

                # Train the network off-policy
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
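
The off-policy branches in Examples 2 and 6 rely on `EpisodicReplayMemory`, whose interface can be read off the calls above: `append(state, action, reward, policy)` with a `None` action marking episode termination, `sample_batch(batch_size, maxlen)` returning a list indexed by timestep (each entry holding one transition per sampled episode), and `len(memory)`. The sketch below is only one way to satisfy that interface; the `Transition` namedtuple and the `_sample_trajectory` helper are assumptions, not the projects' code:

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'policy'))


class EpisodicReplayMemory:
    def __init__(self, capacity, max_episode_length):
        # Capacity bounds the number of transitions, so keep at most capacity // max_episode_length episodes
        self.memory = deque(maxlen=max(capacity // max_episode_length, 1))
        self.memory.append([])  # Current (open) episode

    def append(self, state, action, reward, policy):
        self.memory[-1].append(Transition(state, action, reward, policy))
        if action is None:  # Terminal marker: open a new episode list
            self.memory.append([])

    def _sample_trajectory(self, maxlen):
        mem = random.choice([e for e in self.memory if len(e) > 0])
        if maxlen > 0 and len(mem) > maxlen + 1:
            t = random.randrange(len(mem) - maxlen - 1)
            return mem[t:t + maxlen + 1]
        return mem

    def sample_batch(self, batch_size, maxlen=0):
        batch = [self._sample_trajectory(maxlen) for _ in range(batch_size)]
        min_len = min(len(trajectory) for trajectory in batch)
        batch = [trajectory[:min_len] for trajectory in batch]
        return list(map(list, zip(*batch)))  # Transpose: index by timestep, then by episode

    def __len__(self):
        return sum(len(episode) for episode in self.memory)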
Example 7
def test(rank, args, T, shared_model):
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width,
                  args.height,
                  args.frame_skip,
                  args.rewarding_distance,
                  args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)
    if args.render:
        (_, _, obs_rgb_view2) = env.reset()
        plt.ion()
        f, ax = plt.subplots()
        im = ax.imshow(obs_rgb_view2)

    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.eval()
    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    n_digits = str(
        len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        cx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        action, reward, done, episode_length = (0, 0, 0, 0, 0,
                                                                0), 0, False, 0
                        reward_sum = 0

                    # Calculate policy
                    policy, _, (hx, cx) = model(
                        Variable(
                            state[0], volatile=True),
                        Variable(
                            state[1], volatile=True),
                        (hx.detach(),
                         cx.detach()))  # Break graph for memory efficiency

                    # Choose action greedily
                    action = [p.max(1)[1].data[0, 0] for p in policy]

                    # Step
                    state, reward, done = env.step(action)
                    obs_rgb_view1 = state[1]
                    obs_rgb_view2 = state[2]
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter

                    # Optionally render validation states
                    if args.render:
                        # rendering the first camera view
                        im.set_data(obs_rgb_view1)
                        plt.draw()
                        plt.pause(0.05)

                        # rendering mujoco simulation
                        # viewer = mujoco_py.MjViewer(env.sim)
                        # viewer.render()

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break

            print(('[{}] Step: {:<' + n_digits +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime(
                           '%Y-%m-%d %H:%M:%S,%f')[:-3], t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))

            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards)  # Plot rewards
            torch.save(model.state_dict(),
                       os.path.join('results', str(t_start) +
                                    '_model.pth'))  # Checkpoint model params
            can_test = False  # Finish testing
            if args.evaluate:
                return
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Check if available to test every millisecond
Example 8
def train(rank, args, T, shared_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size, args.no_noise, args.noise_entropy)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            action, reward, done, episode_length = 0, 0, False, 0
        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()
        model.sample_noise()  # Pick a new noise vector (until next optimisation step)

        # Lists of outputs for training
        values, log_probs, rewards, entropies = [], [], [], []

        while not done and t - t_start < args.t_max:
            input = extend_input(state, action_to_one_hot(action, action_size),
                                 reward, episode_length)
            # Calculate policy and value
            policy, value, (hx, cx) = model(Variable(input), (hx, cx))
            log_policy = policy.log()
            entropy = -(log_policy * policy).sum(1)

            # Sample action
            action = policy.multinomial()
            log_prob = log_policy.gather(
                1, action.detach()
            )  # Graph broken as loss for stochastic action calculated manually
            action = action.data[0, 0]

            # Step
            state, reward, done, _ = env.step(action)
            state = state_to_tensor(state)
            reward = args.reward_clip and min(max(
                reward, -1), 1) or reward  # Optionally clamp rewards
            done = done or episode_length >= args.max_episode_length
            episode_length += 1  # Increase episode counter

            # Save outputs for training
            [
                arr.append(el)
                for arr, el in zip((values, log_probs, rewards,
                                    entropies), (value, log_prob, reward,
                                                 entropy))
            ]

            # Increment counters
            t += 1
            T.increment()

        # Return R = 0 for terminal s or V(s_i; θ) for non-terminal s
        if done:
            R = Variable(torch.zeros(1, 1))
        else:
            _, R, _ = model(Variable(input), (hx, cx))
            R = R.detach()
        values.append(R)

        # Train the network
        policy_loss = 0
        value_loss = 0
        A_GAE = torch.zeros(1, 1)  # Generalised advantage estimator Ψ
        # Calculate n-step returns in forward view, stepping backwards from the last state
        trajectory_length = len(rewards)
        for i in reversed(range(trajectory_length)):
            # R ← r_i + γR
            R = rewards[i] + args.discount * R
            # Advantage A = R - V(s_i; θ)
            A = R - values[i]
            # dθ ← dθ - ∂A^2/∂θ
            value_loss += 0.5 * A**2  # Least squares error

            # TD residual δ = r + γV(s_i+1; θ) - V(s_i; θ)
            td_error = rewards[i] + args.discount * values[
                i + 1].data - values[i].data
            # Generalised advantage estimator Ψ (roughly of form ∑(γλ)^t∙δ)
            A_GAE = A_GAE * args.discount * args.trace_decay + td_error
            # dθ ← dθ + ∇θ∙log(π(a_i|s_i; θ))∙Ψ
            policy_loss -= log_probs[i] * Variable(
                A_GAE)  # Policy gradient loss
            if args.no_noise or args.noise_entropy:
                # dθ ← dθ + β∙∇θH(π(s_i; θ))
                policy_loss -= args.entropy_weight * entropies[
                    i]  # Entropy maximisation loss

        # Optionally normalise loss by number of time steps
        if not args.no_time_normalisation:
            policy_loss /= trajectory_length
            value_loss /= trajectory_length

        # Zero shared and local grads
        optimiser.zero_grad()
        # Note that losses were defined as negatives of normal update rules for gradient descent
        (policy_loss + value_loss).backward()
        # Gradient L2 normalisation
        nn.utils.clip_grad_norm(model.parameters(), args.max_gradient_norm, 2)

        # Transfer gradients to shared model and update
        _transfer_grads_to_shared_model(model, shared_model)
        optimiser.step()
        if not args.no_lr_decay:
            # Linearly decay learning rate
            _adjust_learning_rate(
                optimiser,
                max(args.lr * (args.T_max - T.value()) / args.T_max, 1e-32))

    env.close()
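
This A3C-style worker finishes each update by calling `_transfer_grads_to_shared_model` and `_adjust_learning_rate`, neither of which appears on this page. The sketches below show the usual Hogwild-style pattern for such helpers; they are assumptions rather than the project's own definitions:

def _transfer_grads_to_shared_model(model, shared_model):
    # Copy local gradients into the shared model; bail out if another worker already set them
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad


def _adjust_learning_rate(optimiser, lr):
    # Overwrite the learning rate of every parameter group
    for param_group in optimiser.param_groups:
        param_group['lr'] = lr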
Example 9
def train(rank, args, T, shared_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width, args.height, args.frame_skip,
                  args.rewarding_distance, args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)

    # TODO: pass in the observation and action space
    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            action, reward, done, episode_length = (0, 0, 0, 0, 0,
                                                    0), 0, False, 0

        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()

        # Lists of outputs for training
        policies, Vs, actions, rewards = [], [], [], []

        while not done and t - t_start < args.t_max:
            # Calculate policy and value
            policy, V, (hx, cx) = model(Variable(state[0]), Variable(state[1]),
                                        (hx, cx))

            # Sample action
            action = [
                p.multinomial().data[0, 0] for p in policy
            ]  # Graph broken as loss for stochastic action calculated manually

            # Step
            state, reward, done = env.step(action)
            state = state_to_tensor(state)
            done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
            episode_length += 1  # Increase episode counter

            # Save outputs for online training
            [
                arr.append(el)
                for arr, el in zip((policies, Vs, actions, rewards), (
                    policy, V, Variable(torch.LongTensor(action)), reward))
            ]

            # Increment counters
            t += 1
            T.increment()

        # Break graph for last values calculated (used for targets, not directly as model outputs)
        if done:
            # R = 0 for terminal s
            R = Variable(torch.zeros(1, 1))

        else:
            # R = V(s_i; θ) for non-terminal s
            _, R, _ = model(Variable(state[0]), Variable(state[1]), (hx, cx))
            R = R.detach()
        Vs.append(R)

        # Train the network
        _train(args, T, model, shared_model, optimiser, policies, Vs, actions,
               rewards, R)
Example 10
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)

    # env = gym.make(args.env)
    # env.seed(args.seed + rank)
    # model = ActorCritic(STATE_SPACE, ACTION_SPACE, args.hidden_size, NUM_LAYERS)
    model = torch.load('training_cps/training1_2_layer2_1-0_270000.pt')
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        # memory = EpisodicReplayMemory(args.memory_capacity // args.num_processes, args.max_episode_length)
        parser = Parser()
        several_csvs = [
            'initial_csvs/Task1_3.csv', 'initial_csvs/Task1_4.csv',
            'initial_csvs/Task1_5.csv'
        ]
        parser.parseInit(several_csvs)

        # parser.generateRandomDataset(100)
        # parser.writeToFile('outputs/output1_several_layer{0}_0-4.csv'.format(parser.layer))

        parser.readAMTBatch('AMT_rewards/AMT1_345_layer2_0-8.csv')
        several_outputs = [
            'outputs/output1_3_layer2_0-8.csv',
            'outputs/output1_4_layer2_0-8.csv',
            'outputs/output1_5_layer2_0-8.csv'
        ]
        parser.writeBackMemory(several_outputs)

        memory = parser.memory
        # pdb.set_trace()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if T.value() % 10000 == 0:  # 500 iterations take about 1 min, so 10000 iterations take about 20 mins
            torch.save(
                model,
                'training_cps/training1_2_layer2_1-0_{0}.pt'.format(T.value() +
                                                                    270000))
        # On-policy episode loop (disabled here: the loop condition is False, so this variant trains purely off-policy below)
        while False:
            # Sync with shared model at least every t_max steps
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                hx, avg_hx = Variable(torch.zeros(1,
                                                  args.hidden_size)), Variable(
                                                      torch.zeros(
                                                          1, args.hidden_size))
                cx, avg_cx = Variable(torch.zeros(1,
                                                  args.hidden_size)), Variable(
                                                      torch.zeros(
                                                          1, args.hidden_size))
                # Reset environment and done flag
                # state = state_to_tensor(env.reset())
                state = state_to_tensor(parser.states[0]).view(1, STATE_SPACE)
                print(state)
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(Variable(state), (hx, cx))
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    Variable(state), (avg_hx, avg_cx))

                # Sample action
                action = policy.multinomial().data[
                    0,
                    0]  # Graph broken as loss for stochastic action calculated manually

                # Step
                # next_state, reward, done, _ = env.step(action)
                next_state = parser.states[1]
                reward = parser.rewards[0]
                done = True

                next_state = state_to_tensor(next_state).view(1, 24)
                reward = args.reward_clip and min(max(
                    reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward,
                                  policy.data)  # Save just tensors
                # Save outputs for online training
                [
                    arr.append(el) for arr, el in zip((
                        policies, Qs, Vs, actions, rewards, average_policies
                    ), (policy, Q, V, Variable(torch.LongTensor([[action]])),
                        Variable(torch.Tensor([[reward]])), average_policy))
                ]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = Variable(torch.zeros(1, 1))

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(Variable(state), (hx, cx))
                Qret = Qret.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        # print(len(memory))
        # print(args.replay_start)
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                hx, avg_hx = Variable(
                    torch.zeros(NUM_LAYERS, args.batch_size,
                                args.hidden_size)), Variable(
                                    torch.zeros(NUM_LAYERS, args.batch_size,
                                                args.hidden_size))
                cx, avg_cx = Variable(
                    torch.zeros(NUM_LAYERS, args.batch_size,
                                args.hidden_size)), Variable(
                                    torch.zeros(NUM_LAYERS, args.batch_size,
                                                args.hidden_size))

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []
                # print(len(trajectories))

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(tuple(trajectory.state
                                            for trajectory in trajectories[i]), 0)
                    action = Variable(
                        torch.LongTensor([
                            trajectory.action for trajectory in trajectories[i]
                        ])).unsqueeze(1)
                    reward = Variable(
                        torch.Tensor([
                            trajectory.reward for trajectory in trajectories[i]
                        ])).unsqueeze(1)
                    old_policy = Variable(
                        torch.cat(tuple(trajectory.policy
                                        for trajectory in trajectories[i]), 0))

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(Variable(state), (hx, cx))
                    average_policy, _, _, (avg_hx,
                                           avg_cx) = shared_average_model(
                                               Variable(state),
                                               (avg_hx, avg_cx))

                    # Save outputs for offline training
                    [
                        arr.append(el)
                        for arr, el in zip((policies, Qs, Vs, actions, rewards,
                                            average_policies, old_policies), (
                                                policy, Q, V, action, reward,
                                                average_policy, old_policy))
                    ]

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i + 1]), 0)
                    done = Variable(
                        torch.Tensor([
                            trajectory.action is None
                            for trajectory in trajectories[i + 1]
                        ]).unsqueeze(1))

                # Do forward pass for all transitions
                _, _, Qret, _ = model(Variable(next_state), (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach()

                # Train the network off-policy
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True
        T.increment()
Example 11
  model.train()

  t = 1  # Thread step counter
  done = True  # Start new episode

  while True:  # TODO: Need to receive kill signal from server
    # Sync with server model at least every t_max steps
    _sync_params(socket, model)
    # Get starting timestep
    t_start = t

    # Reset or pass on hidden state
    if done:
      hx, cx = Variable(torch.zeros(1, args.hidden_size)), Variable(torch.zeros(1, args.hidden_size))
      # Reset environment and done flag
      state = state_to_tensor(env.reset())
      action, reward, done, episode_length = 0, 0, False, 0
    else:
      # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
      hx, cx = hx.detach(), cx.detach()

    # Lists of outputs for training
    values, log_probs, rewards, entropies = [], [], [], []

    while not done and t - t_start < args.t_max:
      # Calculate policy and value
      policy, value, (hx, cx) = model(Variable(state), (hx, cx))
      log_policy = policy.log()
      entropy = -(log_policy * policy).sum(1)

      # Sample action