Ejemplo n.º 1
0
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    """Worker loop: collect on-policy rollouts, train, then replay off-policy.

    Each outer iteration plays one episode in segments of at most
    ``args.t_max`` steps and calls ``_train`` after every segment. When
    ``args.on_policy`` is false, transitions are also pushed to an
    ``EpisodicReplayMemory``; once ``len(memory)`` reaches
    ``args.replay_start``, ``_poisson(args.replay_ratio)`` replay batches are
    trained on after each episode.

    Args:
        rank: Worker index; added to ``args.seed`` to seed torch and the env.
        args: Options object. Reads ``seed``, ``env``, ``hidden_size``,
            ``on_policy``, ``memory_capacity``, ``max_episode_length``,
            ``T_max``, ``t_max``, ``reward_clip``, ``replay_start``,
            ``replay_ratio`` and ``batch_size``.
        T: Shared step counter exposing ``value()`` and ``increment()``.
        shared_model: Model whose ``state_dict`` is loaded into the local
            model before every segment.
        shared_average_model: Model evaluated next to the local model to
            produce the average policies handed to ``_train``.
        optimiser: Forwarded unchanged to ``_train``.
    """
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.train()

    if not args.on_policy:
        memory = EpisodicReplayMemory(args.memory_capacity,
                                      args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                hx = Variable(torch.zeros(1, args.hidden_size))
                avg_hx = Variable(torch.zeros(1, args.hidden_size))
                cx = Variable(torch.zeros(1, args.hidden_size))
                avg_cx = Variable(torch.zeros(1, args.hidden_size))
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                action, reward, done, episode_length = 0, 0, False, 0
            else:
                # Perform truncated backpropagation-through-time (allows
                # freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs = [], [], []
            actions, rewards, average_policies = [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values from the state extended with
                # the previous action (one-hot) and previous reward
                model_input = extend_input(
                    state, action_to_one_hot(action, action_size), reward)
                policy, Q, V, (hx, cx) = model(Variable(model_input),
                                               (hx, cx))
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    Variable(model_input), (avg_hx, avg_cx))

                # Sample action; graph broken as the loss for the stochastic
                # action is calculated manually
                action = policy.multinomial().data[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action)
                next_state = state_to_tensor(next_state)
                if args.reward_clip:
                    reward = min(max(reward, -1), 1)  # Optionally clamp
                # Stop episodes at a max length
                done = done or episode_length >= args.max_episode_length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline
                    # training; save just tensors
                    memory.append(model_input, action, reward, policy.data)

                # Save outputs for online training
                policies.append(policy)
                Qs.append(Q)
                Vs.append(V)
                actions.append(Variable(torch.LongTensor([[action]])))
                rewards.append(Variable(torch.Tensor([[reward]])))
                average_policies.append(average_policy)

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not
            # directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = Variable(torch.zeros(1, 1))

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(
                        extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward), None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s. The input must be
                # rebuilt from the state reached by the last step (plus the
                # last action and reward); reusing the loop's input would
                # evaluate the value of the previous state instead.
                bootstrap_input = extend_input(
                    state, action_to_one_hot(action, action_size), reward)
                _, _, Qret, _ = model(Variable(bootstrap_input), (hx, cx))
                Qret = Qret.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been
        # collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                hx = Variable(torch.zeros(args.batch_size, args.hidden_size))
                avg_hx = Variable(
                    torch.zeros(args.batch_size, args.hidden_size))
                cx = Variable(torch.zeros(args.batch_size, args.hidden_size))
                avg_cx = Variable(
                    torch.zeros(args.batch_size, args.hidden_size))

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards = [], [], [], [], []
                old_policies, average_policies = [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    step = trajectories[i]
                    # Unpack first half of transition. torch.cat takes a
                    # sequence of tensors, so materialise the generator.
                    batch_input = torch.cat(
                        tuple(transition.state for transition in step), 0)
                    action = Variable(
                        torch.LongTensor(
                            [transition.action
                             for transition in step])).unsqueeze(1)
                    reward = Variable(
                        torch.Tensor(
                            [transition.reward
                             for transition in step])).unsqueeze(1)
                    old_policy = Variable(
                        torch.cat(
                            tuple(transition.policy for transition in step),
                            0))

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(Variable(batch_input),
                                                   (hx, cx))
                    average_policy, _, _, (avg_hx, avg_cx) = \
                        shared_average_model(Variable(batch_input),
                                             (avg_hx, avg_cx))

                    # Save outputs for offline training
                    policies.append(policy)
                    Qs.append(Q)
                    Vs.append(V)
                    actions.append(action)
                    rewards.append(reward)
                    average_policies.append(average_policy)
                    old_policies.append(old_policy)

                    # Unpack second half of transition; a stored action of
                    # None marks the terminal entry of an episode
                    next_step = trajectories[i + 1]
                    next_input = torch.cat(
                        tuple(transition.state for transition in next_step),
                        0)
                    done = Variable(
                        torch.Tensor([
                            transition.action is None
                            for transition in next_step
                        ]).unsqueeze(1))

                # Do forward pass for all transitions
                _, _, Qret, _ = model(Variable(next_input), (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach()

                # Train the network off-policy
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
Ejemplo n.º 2
0
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    """Worker loop with optional CUDA: on-policy rollouts plus replay training.

    Each outer iteration plays one episode in segments of at most
    ``args.t_max`` steps and calls ``_train`` after every segment. When
    ``args.on_policy`` is false, transitions are also stored in an
    ``EpisodicReplayMemory``; once ``len(memory)`` reaches
    ``args.replay_start``, ``_poisson(args.replay_ratio)`` replay batches are
    trained on after each episode.

    With ``args.use_cuda`` the local model and environment states live on the
    first GPU, while the shared average model is fed CPU tensors and every
    tensor handed to ``_train`` is moved to the CPU first.

    Args:
        rank: Worker index; added to ``args.seed`` to seed torch and the env.
        args: Options object. Reads ``seed``, ``use_cuda``, ``env``,
            ``hidden_size``, ``on_policy``, ``memory_capacity``,
            ``num_processes``, ``max_episode_length``, ``T_max``, ``t_max``,
            ``reward_clip``, ``replay_start``, ``replay_ratio`` and
            ``batch_size``.
        T: Shared step counter exposing ``value()`` and ``increment()``.
        shared_model: Model whose ``state_dict`` is loaded into the local
            model before every segment.
        shared_average_model: Model evaluated next to the local model to
            produce the average policies handed to ``_train``.
        optimiser: Forwarded unchanged to ``_train``.
    """
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)

    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 means the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes,
            args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                # Hidden state of the average model stays on the CPU
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)

                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows
                # freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs = [], [], []
            actions, rewards, average_policies = [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))

                # The shared model is on the CPU, so the state must be
                # converted before it is passed in
                to_avg_state = state.cpu() if gpu_id >= 0 else state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))

                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()

                if args.reward_clip:
                    reward = min(max(reward, -1), 1)  # Optionally clamp
                # Stop episodes at a max length
                done = done or episode_length >= args.max_episode_length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline
                    # training; save just tensors
                    memory.append(state, action, reward, policy.detach())

                # Save outputs for online training
                policies.append(policy)
                Qs.append(Q)
                Vs.append(V)
                actions.append(torch.LongTensor([[action]]))
                rewards.append(torch.Tensor([[reward]]))
                average_policies.append(average_policy)

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not
            # directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy; model outputs are moved to the CPU
            # to sit alongside the CPU-built actions/rewards/average policies
            if gpu_id >= 0:
                Qs = [q.cpu() for q in Qs]
                Vs = [v.cpu() for v in Vs]
                policies = [p.cpu() for p in policies]
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been
        # collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episode
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards = [], [], [], [], []
                old_policies, average_policies = [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    step = trajectories[i]
                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(transition.state for transition in step), 0)
                    action = torch.LongTensor(
                        [transition.action
                         for transition in step]).unsqueeze(1)
                    reward = torch.Tensor(
                        [transition.reward
                         for transition in step]).unsqueeze(1)
                    # Stored policies were produced by the (possibly CUDA)
                    # local model; bring them to the CPU like the freshly
                    # computed policies below. `.cpu()` is a no-op on CPU.
                    old_policy = torch.cat(
                        tuple(transition.policy for transition in step),
                        0).cpu()

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    # The shared model is on the CPU, same as in the
                    # on-policy loop, so give it a CPU copy of the state
                    average_policy, _, _, (avg_hx, avg_cx) = \
                        shared_average_model(state.cpu(), (avg_hx, avg_cx))

                    # Save outputs for offline training
                    policies.append(policy)
                    Qs.append(Q)
                    Vs.append(V)
                    actions.append(action)
                    rewards.append(reward)
                    average_policies.append(average_policy)
                    old_policies.append(old_policy)

                    # Unpack second half of transition; a stored action of
                    # None marks the terminal entry of an episode
                    next_step = trajectories[i + 1]
                    next_state = torch.cat(
                        tuple(transition.state for transition in next_step),
                        0)
                    done = torch.Tensor([
                        transition.action is None for transition in next_step
                    ]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise. Move Qret to
                # the CPU before masking: `done` is built on the CPU, so
                # multiplying first would mix devices under CUDA.
                Qret = (1 - done) * Qret.detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = [q.cpu() for q in Qs]
                    Vs = [v.cpu() for v in Vs]
                    policies = [p.cpu() for p in policies]
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
Ejemplo n.º 3
0
    def _getStorage(self, several_outputs):
        """Rebuild per-episode transitions from logged CSV files.

        For every entry of ``self.states`` the file at the same index of
        ``several_outputs`` is read (first line skipped). Each ordinary row is
        turned into ``[tensorState, actionSingleVal, policy.data]`` by running
        a freshly constructed ``ActorCritic`` on the current state; a row whose
        first column is ``'-1'`` or ``'50'`` closes the episode by appending
        ``[tensorState, self.rewardsWithoutE[episodeI] + entropy]``.

        Every episode starts from a private copy of ``self.states[index]``, so
        the node flags set while replaying one episode neither leak into the
        next episode nor modify ``self.states``.

        Also assigns a new ``EpisodicReplayMemory`` to ``self.memory``.

        Args:
            several_outputs: Sequence of file paths, indexed in parallel with
                ``self.states``.

        Returns:
            A list of episodes; each episode is a list of step entries
            followed by one closing ``[tensorState, reward]`` entry.

        Raises:
            ZeroDivisionError: If an end-of-episode row is met before any
                action row of that episode.
        """
        import copy  # function-scope import; needed only for the state copy

        scalaHm = {0: 1, 1: 9, 2: 15}
        sizeHm = {0: 1.45, 1: 2.15, 2: 3.15}
        hidden_size = 32
        # NOTE(review): hard-coded episode cap; presumably the number of
        # entries in self.rewardsWithoutE - confirm.
        max_episodes = 1583
        model = ActorCritic(STATE_SPACE, ACTION_SPACE, hidden_size, NUM_LAYERS)
        memory_capacity = 10000
        max_episode_length = 10
        self.memory = EpisodicReplayMemory(memory_capacity, max_episode_length)

        states = self.states

        # storage has the format [[step, step, ..., closing], [episode2], ...];
        # the last element of each episode is [tensorState, final reward]
        storage = [[]]
        episodeI = 0

        for index, initial_state in enumerate(states):
            # Work on a copy: `state` is mutated in place below, and aliasing
            # the stored initial state would make the per-episode reset a
            # no-op.
            state = copy.deepcopy(initial_state)
            hx = Variable(torch.zeros(1, hidden_size))
            cx = Variable(torch.zeros(1, hidden_size))
            tensorState = torch.zeros(1, STATE_SPACE)
            minS = float("inf")
            with open(several_outputs[index]) as f:
                next(f)  # skip the first line of the file
                for line in f:
                    curRow = line.rstrip().split(',')
                    if curRow[0] in ('-1', '50'):  # end of an episode
                        sigma = minS / len(storage[episodeI])
                        entropy = 1 + 0.5 * math.log(39.4 * sigma)
                        # plugging in the same reward for all turns in the
                        # episode... the ACER code uses lambda return to
                        # scale them
                        storage[episodeI].append([
                            tensorState,
                            self.rewardsWithoutE[episodeI] + entropy
                        ])
                        episodeI += 1
                        if episodeI >= max_episodes:
                            # Leaves only this file's loop; no new episode
                            # list is appended after the cap is reached.
                            break
                        # prepare for next episode
                        hx = Variable(torch.zeros(1, hidden_size))
                        cx = Variable(torch.zeros(1, hidden_size))
                        state = copy.deepcopy(initial_state)
                        storage.append([])
                        minS = float("inf")
                    else:
                        tensorState = torch.FloatTensor(np.array(state)).view(
                            1, STATE_SPACE)
                        action_node_id, bubble_size, bubble_scale = map(
                            int, curRow[2:5])
                        # Flatten (node, size, scale) into one action index
                        actionSingleVal = (
                            (action_node_id - 1) * self.bubbleScale *
                            self.bubbleSize +
                            bubble_size * self.bubbleScale + bubble_scale)

                        # Track the smallest scale^2 * size^2 of the episode
                        minS = min(
                            minS,
                            pow(scalaHm[bubble_scale], 2) *
                            pow(sizeHm[bubble_size], 2))
                        policy, _, _, (hx, cx) = model(Variable(tensorState),
                                                       (hx, cx))
                        storage[episodeI].append(
                            [tensorState, actionSingleVal, policy.data])
                        # Set the fifth feature of the chosen node's 5-slot
                        # group in the working state
                        state[(action_node_id - 1) * 5 + 4] = 1
        return storage