def __init__(self):
    self.policy_net = PolicyNetwork()
    self.n_actions = Config.N_ACTIONS
    self.states, self.actions, self.rewards = [], [], []
    self.gamma = Config.REWARD_DECAY
    self.load_weights(self.policy_net)
    self.optimizer = torch.optim.Adam(self.policy_net.parameters(), Config.LR, (0.9, 0.99))
Code Example #2
    def __init__(self, action_dim, state_dim, discount, lam, value_lr,
                 policy_lr, value_path, policy_path):
        self.discount = discount
        self.lam = lam

        tf.reset_default_graph()

        self.value_network = ValueNetwork(state_dim, value_lr, value_path)
        self.policy_network = PolicyNetwork(action_dim, state_dim, policy_lr,
                                            policy_path)
Code Example #3
File: dagger.py  Project: murthy95/DDPG_tf
    def __init__(self, sess, state_shape, action_dim, batch_size, LR, buffer_size):

        print("Building a Policy Network.")
        policy_network = PolicyNetwork(sess, state_shape, action_dim, batch_size, LR)

        # now we need to generate data to train the policy using the master policy actor network
        # loading the saved agent
        trained_agent =

        self.buffer_size = buffer_size
Code Example #4

import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# PolicyNetwork and Config are assumed to come from the project's own modules.
class PolicyGradient:
    def __init__(self):
        self.policy_net = PolicyNetwork()
        self.n_actions = Config.N_ACTIONS
        self.states, self.actions, self.rewards = [], [], []
        self.gamma = Config.REWARD_DECAY
        self.load_weights(self.policy_net)
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), Config.LR, (0.9, 0.99))

    def load_weights(self, net):
        # the parameter names from net.state_dict() look like 'layers.1.weight'
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0)

    def store_trajectory(self, s, a, r):
        self.states.append(s)
        self.actions.append(a)
        self.rewards.append(r)

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        # probability of each action
        actions_probs = F.softmax(self.policy_net(s).detach(), dim=1)

        # sample an action according to its probability
        action = random.choices(range(self.n_actions), weights=actions_probs.squeeze(0))[0]
        return action

    def learn(self):
        # discount and normalize episode reward
        discount_and_norm_rewards = torch.from_numpy(self._discount_and_norm_rewards()).float()
        states = torch.from_numpy(np.concatenate(self.states, axis=0)).float()
        actions = torch.from_numpy(np.concatenate(self.actions, axis=0)).long()
        # NLLLoss with reduction='none' returns -log pi(a_t|s_t) per step,
        # so minimizing sum(-log pi * G_t) is the REINFORCE update
        neg_log_probs = nn.NLLLoss(reduction='none')(nn.LogSoftmax(dim=1)(self.policy_net(states)), actions)
        loss = torch.sum(neg_log_probs * discount_and_norm_rewards)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _discount_and_norm_rewards(self):
        # discount episode rewards
        discount_and_norm_rewards = np.array([])
        for episode_rewards in self.rewards:
            discounted_episode_rewards = np.zeros_like(episode_rewards, dtype=np.float64)
            running_reward = 0
            for t in reversed(range(0, len(episode_rewards))):
                running_reward = self.gamma * running_reward + episode_rewards[t]
                discounted_episode_rewards[t] = running_reward
            # normalize episode rewards
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            discounted_episode_rewards /= np.std(discounted_episode_rewards)
            discount_and_norm_rewards = np.concatenate(
                (discount_and_norm_rewards, discounted_episode_rewards), axis=0)
        return discount_and_norm_rewards

    def draw_curve(self, loss):
        x = np.arange(1, len(loss)+1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
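
The class above omits the surrounding training loop. Below is a minimal usage sketch that is not part of the original project: it assumes a Gym-style `env` compatible with the project's `Config` and `PolicyNetwork`, stores one trajectory per episode (matching how learn() concatenates per-episode arrays), and clears the buffers manually since learn() does not reset them; the episode count is illustrative.

# Minimal usage sketch (assumes a Gym-style `env`; the episode count is illustrative)
agent = PolicyGradient()
losses = []
for episode in range(300):
    s = env.reset()
    ep_states, ep_actions, ep_rewards = [], [], []
    done = False
    while not done:
        a = agent.chose_action(s)
        s_next, r, done, _ = env.step(a)
        ep_states.append(np.expand_dims(s, axis=0))
        ep_actions.append(a)
        ep_rewards.append(float(r))
        s = s_next
    # one store_trajectory() call per episode: learn() concatenates per-episode arrays
    agent.store_trajectory(np.concatenate(ep_states, axis=0),
                           np.array(ep_actions), ep_rewards)
    losses.append(agent.learn().item())
    # learn() does not reset the buffers, so clear them between episodes
    agent.states, agent.actions, agent.rewards = [], [], []
agent.draw_curve(losses)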
Code Example #5
import numpy as np
import tensorflow as tf

# prep_dataset, fetch_model_name and PolicyNetwork are assumed to come from the
# project's own modules.

# set seeds for np and tf
seed = 1
np.random.seed(seed)
tf.random.set_seed(seed)

epochs = 8

# paths for the KG and QA files
path_KB = "./datasets/3H-kb.txt"
path_QA = "./datasets/PQ-3H.txt"

# Experiment Settings
T = 3                   # To change according to QA type
attention = True        # Use Attention Model or not
perceptron = True       # Use Perceptron for semantic similarity scores

# Prep Data
KG, dataset = prep_dataset(path_KB, path_QA)
inputs = (KG, dataset, T)

# Run Experiments
print('\n\n*********** Policy Network with Perceptron & Attention ***********')
model_name = fetch_model_name('combined')
policy_network = PolicyNetwork(T, model_name)    # Model uses the attention layer
train_att_per, val_att_per = policy_network.train(inputs, epochs=epochs)

print('\n\n*********** Policy Network with Perceptron Only ***********')
model_name = fetch_model_name('perceptron')
policy_network = PolicyNetwork(T, model_name)
train_per, val_per = policy_network.train(inputs, epochs=epochs, attention=False)    # Model does not use the attention layer
Code Example #6
        running_add = running_add * gamma + r[i]
        discounted_r[i] = running_add

    if normalization:
        mean = np.mean(discounted_r)
        std = np.std(discounted_r)
        discounted_r = (discounted_r - mean) / (std)

    return discounted_r
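
# Worked example (assuming the truncated loop above runs backwards over the
# rewards, as is standard for discounting): with gamma = 0.9, r = [1, 1, 1] and
# normalization disabled, the function returns discounted_r = [2.71, 1.9, 1.0].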


#==============================
#  Initialize network and session
#==============================
# Instantiate the PolicyNetwork (note: this rebinds the class name to the instance)
PolicyNetwork = PolicyNetwork(state_size, action_size, learning_rate)
# Initialize session
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)


#============================
# run policy
#============================
def make_batch(batch_size):
    """
    Run the policy and generate a batch of episodes.
    We keep track of (st, at, rt, st+1) for each episode; these are used for training.
    :param batch_size: number of episodes in a batch
    :return: 3 lists: states, actions, rewards of the batch (each reward value is the accumulated discounted total reward)
    """
Code Example #7
import random

import numpy as np
import tensorflow as tf

# ValueNetwork and PolicyNetwork are assumed to come from the project's own modules.
class PPOLearner:
    def __init__(self, action_dim, state_dim, discount, lam, value_lr,
                 policy_lr, value_path, policy_path):
        self.discount = discount
        self.lam = lam

        tf.reset_default_graph()

        self.value_network = ValueNetwork(state_dim, value_lr, value_path)
        self.policy_network = PolicyNetwork(action_dim, state_dim, policy_lr,
                                            policy_path)

    def collect_data(self, env, N, T):
        state_data, action_data, reward_data = [], [], []
        for i in range(N):
            states, actions, rewards = self.run_timesteps(env, T)
            state_data.append(states)
            action_data.append(actions)
            reward_data.append(rewards)

        return state_data, action_data, reward_data

    def run_timesteps(self, env, T):
        # states will have one more entry than actions and rewards
        states, actions, rewards = [], [], []
        state = env.reset()
        state = np.squeeze(state)
        # state = np.reshape(state, (1, -1))
        states.append(state)
        for j in range(T):
            state = np.reshape(state, (1, -1))
            action = self.policy_network.get_action(state)
            next_state, reward, done, _ = env.step(action)
            env.render()

            # print (action)

            # reward = np.asscalar(reward)
            next_state = np.squeeze(next_state)

            if done:
                reward = -10

            states.append(next_state)
            actions.append(action)
            rewards.append(reward)

            if done:
                break

            state = next_state
        return states, actions, rewards

    def compute_targets_and_advantages(self, state_data, reward_data):
        target_data = []
        advantage_data = []
        # check that the trajectory data is in the correct format
        assert len(state_data[0]) == len(reward_data[0]) + 1

        n = len(state_data)
        for i in range(n):
            states = state_data[i]
            rewards = reward_data[i]

            targets = []
            advantages = []

            states = np.vstack(states)
            # states = np.reshape(states, (-1, len(states[0])))
            values = self.value_network.get_value(states)
            for t in range(len(rewards)):
                target, advantage = self.compute_timestep_target_and_advantage(
                    values[t:], rewards[t:])
                targets.append(target)
                advantages.append(advantage)

            target_data.append(targets)
            advantage_data.append(advantages)

        return target_data, advantage_data

    def compute_timestep_target_and_advantage(self, values, rewards):
        original_value = values[0]
        original_value = np.asscalar(original_value)
        rewards_term = [
            rewards[i] * (self.lam * self.discount)**(i)
            for i in range(len(rewards))
        ]
        values_term = [
            values[i + 1] * (1 - self.lam) * self.discount *
            (self.lam * self.discount)**(i) for i in range(len(values) - 1)
        ]
        target = sum(rewards_term) + sum(values_term)
        try:
            target = np.asscalar(target)
        except (AttributeError, TypeError, ValueError):
            pass  # target is already a Python scalar
        advantage = target - original_value

        return target, advantage
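
    # The target computed above is a truncated lambda-return:
    #   target_t = sum_i (discount*lam)^i * r_{t+i}
    #            + sum_i (1 - lam) * discount * (discount*lam)^i * V(s_{t+i+1})
    # and advantage_t = target_t - V(s_t), i.e. (up to truncation at the end of
    # the trajectory) the generalized advantage estimate with discount factor
    # `discount` and mixing parameter `lam`.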

    def train(self, env, iterations, N, T, epochs, M):
        for i in range(iterations):
            print('\nCollecting data...')
            state_data, action_data, reward_data = self.collect_data(env, N, T)

            average_reward = np.sum(np.sum(reward_data)) / len(reward_data)
            print('Iteration {}: Average reward = {}'.format(
                i, average_reward))
            if np.isnan(average_reward):
                self.policy_network.print_for_debug()

            print('Calculating targets and advantages...')
            target_data, advantage_data = self.compute_targets_and_advantages(
                state_data, reward_data)

            print('Updating...')
            for j in range(epochs):
                batch_generator = self.get_batches(M, state_data, action_data,
                                                   target_data, advantage_data)
                for s, a, t, adv in batch_generator:
                    if (len(s) > 0):
                        # print (s)
                        s = np.vstack(s)
                    else:
                        print('state')
                        print(s)
                        continue
                    self.value_network.update(s, t)

                    # if (len(a) > 1):
                    # 	a = np.vstack(a)
                    # else:
                    # 	print (a)
                    a = np.vstack(a)
                    self.policy_network.update(s, a, adv)

            if i % 100 == 0:
                self.policy_network.save_model()
                self.value_network.save_model()

    def get_batches(self, M, state_data, action_data, target_data, advantage_data):
        # concatenate the per-episode states, actions, targets and advantages into
        # flat arrays; drop the last entry of each state trajectory so the states
        # line up with the actions and rewards
        flat_state_data = np.concatenate([states[:-1] for states in state_data], axis=0)
        flat_action_data = np.concatenate(action_data, axis=0)
        flat_target_data = np.concatenate(target_data, axis=0)
        flat_advantage_data = np.concatenate(advantage_data, axis=0)

        # print (flat_state_data)
        # print ('--------------------------------')

        # flat_state_data = np.array([s for states in state_data for s in states[:-1]])
        # flat_action_data = np.array([a for actions in action_data for a in actions])
        # flat_target_data = np.array([t for targets in target_data for t in targets])
        # flat_advantage_data = np.array([adv for advantages in advantage_data for adv in advantages])

        n_data = len(flat_state_data)
        randperm = random.sample(range(n_data), n_data)

        for i in range(0, n_data, M):
            end_i = min(i + M, n_data)  # end of this minibatch
            ii = randperm[i:end_i]
            yield (flat_state_data[ii], flat_action_data[ii],
                   flat_target_data[ii], flat_advantage_data[ii])

    def save_model(self):
        # note: relies on self.saver, self.sess and self.save_path being set
        # elsewhere; train() instead calls the networks' own save_model() methods
        save_name = self.saver.save(self.sess, self.save_path)
        print("Model checkpoint saved in file: %s" % save_name)