예제 #1
0
    def _create_graph(self):
        """Build the counters, the win-rate summary and the mini PPO policy.

        All ops are placed through a ``replica_device_setter`` built from
        ``self.index``, ``self.device`` and ``self.cluster``.  If ``self.reuse``
        is set, the current variable scope is switched to reuse mode first.
        """
        if self.reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        def zero_scalar(var_name):
            # Shape-[] variable initialised with tf.zeros_initializer.
            return tf.get_variable(name=var_name, shape=[],
                                   initializer=tf.zeros_initializer)

        worker_device = "/job:worker/task:%d" % self.index + self.device
        placement = tf.train.replica_device_setter(worker_device=worker_device,
                                                   cluster=self.cluster)
        with tf.device(placement):
            # Bookkeeping scalars: wins, games played, global step counter.
            self.results_sum = zero_scalar("results_sum")
            self.game_num = zero_scalar("game_num")
            self.global_steps = zero_scalar("global_steps")

            self.win_rate = self.results_sum / self.game_num

            # The summary gets its own division op, exactly as before.
            self.mean_win_rate = tf.summary.scalar(
                'mean_win_rate_dis', self.results_sum / self.game_num)
            self.merged = tf.summary.merge([self.mean_win_rate])

            with tf.variable_scope("MiniPolicyNN"):
                self.policy = Policy_net(
                    'policy', self.sess, _SIZE_MINI_INPUT, _SIZE_MINI_ACTIONS)
                self.policy_old = Policy_net(
                    'old_policy', self.sess, _SIZE_MINI_INPUT, _SIZE_MINI_ACTIONS)
                self.policy_ppo = PPOTrain(
                    'PPO', self.sess, self.policy, self.policy_old,
                    lr=P.mini_lr, epoch_num=P.mini_epoch_num)

            # No scope filter: every trainable variable in the graph is saved.
            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            self.policy_saver = tf.train.Saver(var_list=trainable_vars)
예제 #2
0
    def _create_graph(self):
        """Build the counters, win-rate summary, dynamic model and PPO policy.

        All ops are placed through a ``replica_device_setter`` built from
        ``self.index``, ``self.device`` and ``self.cluster``.  If ``self.reuse``
        is set, the current variable scope is switched to reuse mode first.
        """
        if self.reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        def frozen_zero_scalar(var_name):
            # Non-trainable shape-[] variable initialised with zeros.
            return tf.get_variable(trainable=False, name=var_name, shape=[],
                                   initializer=tf.zeros_initializer)

        worker_device = "/job:worker/task:%d" % self.index + self.device
        print("worker_device:", worker_device)
        placement = tf.train.replica_device_setter(worker_device=worker_device,
                                                   cluster=self.cluster)
        with tf.device(placement):
            # Bookkeeping scalars: wins, games played, global step counter.
            self.results_sum = frozen_zero_scalar("results_sum")
            self.game_num = frozen_zero_scalar("game_num")
            self.global_steps = frozen_zero_scalar("global_steps")

            self.mean_win_rate = tf.summary.scalar(
                'mean_win_rate_dis', self.results_sum / self.game_num)
            self.merged = tf.summary.merge([self.mean_win_rate])

            self.dynamic_net = DynamicNetwork(
                'train', self.sess,
                load_path=self.dynamic_load_path,
                save_path=self.dynamic_save_path)

            policy_scope = "PolicyNN"
            with tf.variable_scope(policy_scope):
                self.policy = Policy_net(
                    'policy', self.sess, C._SIZE_SIMPLE_INPUT, C._SIZE_MAX_ACTIONS)
                self.policy_old = Policy_net(
                    'old_policy', self.sess, C._SIZE_SIMPLE_INPUT, C._SIZE_MAX_ACTIONS)
                self.policy_ppo = PPOTrain(
                    'PPO', self.sess, self.policy, self.policy_old,
                    epoch_num=P.src_epoch_num)

            # Only the trainable variables under the policy scope are saved.
            policy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=policy_scope)
            self.policy_saver = tf.train.Saver(var_list=policy_vars)
예제 #3
0
    def main(args):
        # Set up an Environment for one scene/task and build the policy, PPO
        # trainer and discriminator objects.
        # NOTE(review): this snippet looks truncated and is not runnable as
        # shown: it mixes tab and space indentation, uses `self` inside a
        # function that takes only `args`, and references `bathroom_02`, `env`
        # and `Old_Policy`, none of which are defined in the visible code.
    	self.scene_scope=bathroom_02
    	self.task_scope=37  #26 43 53 32 41
    	self.env = Environment({'scene_name': self.scene_scope,'terminal_state_id': int(self.task_scope)})
    	self.env.reset()
        Policy = Policy_net('policy', env) # building the actor-critic graph / object

        PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) # gradient updater object (graph)
        # NOTE(review): leftover debugger breakpoint; it halts execution here.
        pdb.set_trace()
        D = Discriminator(env) # GAN-style discriminator
예제 #4
0
def main(args):
    """Train a PPO agent on ``MineRLNavigateDense-v0`` and log to TensorBoard.

    Each iteration rolls out one full episode, writes the episode length and
    reward as summaries, and then runs six PPO mini-batch updates on the
    collected trajectory.  Training stops early, saving a checkpoint, once
    ten episodes have reached a total reward of at least 1.

    Args:
        args: Parsed options. Reads ``gamma``, ``logdir``, ``iteration`` and
            ``savedir``.
    """
    #env.seed(0)
    env = gym.make('MineRLNavigateDense-v0')
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()

        success_num = 0
        render = False
        for iteration in range(args.iteration):
            observations = []
            actions = []
            v_preds = []
            rewards = []
            episode_length = 0
            while True:  # roll out one episode
                episode_length += 1
                # Keep the raw frame under its own name.  The original rebound
                # `obs` to the stacked array and then indexed it with 'pov',
                # which raises IndexError on an ndarray.
                pov = obs['pov']
                policy_input = np.stack([pov]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=policy_input, stochastic=True)

                # np.asscalar was removed in NumPy 1.23; .item() is equivalent.
                act = np.asarray(act).item()
                v_pred = np.asarray(v_pred).item()

                # Split the flat action index into the pair passed to the env.
                act_group = int(act / 3)
                next_obs, reward, done, info = env.step(
                    [act_group + 1, act - act_group * 3])

                # Record (s_t, a_t, v_t, r_t) after the step so the reward
                # belongs to the action that earned it.  Appending before the
                # step shifted rewards by one, dropped the terminal reward and
                # carried the last reward into the next episode.
                observations.append(pov)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                if (episode_length % 2500 == 0):
                    print(sum(rewards))
                if render:
                    env.render()
                if done:
                    # next state of terminate state has 0 state value
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    print('done')
                    break
                else:
                    obs = next_obs

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=episode_length)
                ]), iteration)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward',
                                     simple_value=sum(rewards))
                ]), iteration)

            if sum(rewards) >= 1:
                success_num += 1
                render = True
                if success_num >= 10:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, [-1, 64, 64, 3])
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # NOTE(review): yields NaN when every advantage is identical
            # (std == 0), e.g. for a one-step episode.
            gaes = (gaes - gaes.mean()) / gaes.std()
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, rewards, v_preds_next]

            # train
            for epoch in range(6):
                # sample indices from [low, high)
                sample_indices = np.random.randint(low=0,
                                                   high=observations.shape[0],
                                                   size=32)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()
def init_train(args):
    """Build the PPO/GAIL objects, load expert data and start a TF session.

    Populates the module-level globals declared below, reads every
    ``driving_log.csv`` found under ``args.trjdir``, restores a saved model
    from ``args.savedir`` when one exists, and finally calls
    ``extract_agent_trajectory()``.

    Args:
        args: Parsed options. Reads ``iteration``, ``gamma``, ``trjdir``,
            ``logdir`` and ``savedir``.
    """
    global writer, sess
    global Policy, Old_Policy, PPO, Disc
    global max_iteration, iteration
    global observation_space, action_space
    global expert_observations, expert_actions

    print("###### INITIALIZING ######")
    max_iteration = args.iteration
    iteration = 0

    # PPO
    Policy = Policy_net('policy', observation_space)
    Old_Policy = Policy_net('old_policy', observation_space)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)

    # GAIL
    Disc = Discriminator(observation_space)

    # Expert trajectories, capped per steering direction for data balancing.
    expert_observations = []
    expert_actions = []
    ZERO_LIMIT = 300  # cap for straight (zero steering) samples
    LEFT_LIMIT = 776  # cap for left-turn (negative steering) samples
    cnt_zero_trj = 0
    cnt_left_trj = 0
    cnt_right_trj = 0

    for _dir in os.listdir(args.trjdir):
        raw_filename = os.path.join(os.getcwd(), args.trjdir, _dir,
                                    'driving_log.csv')
        with open(raw_filename) as csvfile:
            for row in csv.reader(csvfile):  # each row is a list
                steering = float(row[3])
                if steering == 0.0:  # going straight
                    if cnt_zero_trj > ZERO_LIMIT:
                        continue
                    cnt_zero_trj += 1
                elif steering < 0.0:  # left turn
                    if cnt_left_trj > LEFT_LIMIT:
                        continue
                    cnt_left_trj += 1
                else:  # right turn, never capped
                    cnt_right_trj += 1

                # row[0] is passed to image_to_feature; row[3] is the action.
                expert_observations.append(
                    np.squeeze(image_to_feature(row[0])))
                expert_actions.append(round(steering, 2))

    print("###### READ TRAJECTORY: {} ######".format(len(expert_actions)))
    print("center:{}, left:{}, right:{}".format(cnt_zero_trj, cnt_left_trj,
                                                cnt_right_trj))

    # initialize Tensorflow
    sess = tf.Session()
    writer = tf.summary.FileWriter(args.logdir, sess.graph)
    sess.run(tf.global_variables_initializer())

    checkpoint_prefix = args.savedir + '/model.ckpt'
    if os.path.isfile(args.savedir + '/model.ckpt.meta'):
        print("###### LOAD SAVED MODEL !!!!! ######")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_prefix)

    extract_agent_trajectory()
예제 #6
0
class HierNetwork(object):
    """PPO policy plus a ``DynamicNetwork`` model built on a TF1 graph.

    The constructor stores its configuration, builds the graph through
    ``_create_graph`` and creates the savers.  The remaining methods run
    updates, summaries and checkpoint operations on ``self.sess``.
    """

    def __init__(self, sess=None, summary_writer=None, rl_training=False,
                 reuse=False, cluster=None, index=0, device='/gpu:0', policy_path=None,
                 ppo_load_path=None, dynamic_load_path=None, ppo_save_path=None, dynamic_save_path=None,):
        """Store the configuration and build the graph.

        Args:
            sess: TensorFlow session used by every method of this object.
            summary_writer: Writer that receives summaries.  When ``None``, a
                ``tf.summary.FileWriter("logs/")`` is created here.  The
                writer used to be built in the signature itself, which
                created it as soon as the class was defined, even when the
                caller supplied a writer.
            rl_training: Stored on the instance; not used in this class.
            reuse: When true, the current variable scope is set to reuse.
            cluster: Cluster spec passed to ``replica_device_setter``.
            index: Worker task index used to build the device string.
            device: Device suffix appended to the worker device string.
            policy_path: If given, used as both policy load and save path.
            ppo_load_path: Prefix for the policy load path (``+ "probe"``);
                must be a string when ``policy_path`` is ``None``.
            dynamic_load_path: Load path passed to ``DynamicNetwork``.
            ppo_save_path: Prefix for the policy save path (``+ "probe"``);
                must be a string when ``policy_path`` is ``None``.
            dynamic_save_path: Save path passed to ``DynamicNetwork``.
        """
        self.system = platform.system()

        if policy_path is not None:
            self.policy_model_path_load = policy_path
            self.policy_model_path_save = policy_path
        else:
            self.policy_model_path_load = ppo_load_path + "probe"
            self.policy_model_path_save = ppo_save_path + "probe"

        self.dynamic_load_path = dynamic_load_path
        self.dynamic_save_path = dynamic_save_path

        self.rl_training = rl_training

        self.use_norm = True

        self.reuse = reuse
        self.sess = sess
        self.cluster = cluster
        self.index = index
        self.device = device

        self._create_graph()

        self.rl_saver = tf.train.Saver()
        if summary_writer is None:
            # Resolve the default here, not at class-definition time.
            summary_writer = tf.summary.FileWriter("logs/")
        self.summary_writer = summary_writer

    def initialize(self):
        """Run the global variable initializer in ``self.sess``."""
        init_op = tf.global_variables_initializer()
        self.sess.run(init_op)

    def reset_old_network(self):
        """Sync the old policy, reset mean returns and zero the win counters."""
        self.policy_ppo.assign_policy_parameters()
        self.policy_ppo.reset_mean_returns()

        self.sess.run(self.results_sum.assign(0))
        self.sess.run(self.game_num.assign(0))

    def _create_graph(self):
        """Build the counters, win-rate summary, dynamic model and PPO policy.

        All ops are placed through a ``replica_device_setter`` built from
        ``self.index``, ``self.device`` and ``self.cluster``.
        """
        if self.reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        worker_device = "/job:worker/task:%d" % self.index + self.device
        print("worker_device:", worker_device)
        with tf.device(tf.train.replica_device_setter(worker_device=worker_device, cluster=self.cluster)):
            # Non-trainable bookkeeping scalars: wins, games, global steps.
            self.results_sum = tf.get_variable(trainable=False, name="results_sum", shape=[], initializer=tf.zeros_initializer)
            self.game_num = tf.get_variable(trainable=False, name="game_num", shape=[], initializer=tf.zeros_initializer)

            self.global_steps = tf.get_variable(trainable=False, name="global_steps", shape=[], initializer=tf.zeros_initializer)

            self.mean_win_rate = tf.summary.scalar('mean_win_rate_dis', self.results_sum / self.game_num)
            self.merged = tf.summary.merge([self.mean_win_rate])

            self.dynamic_net = DynamicNetwork('train', self.sess, load_path=self.dynamic_load_path, save_path=self.dynamic_save_path)

            scope = "PolicyNN"
            with tf.variable_scope(scope):
                ob_space = C._SIZE_SIMPLE_INPUT
                act_space_array = C._SIZE_MAX_ACTIONS
                self.policy = Policy_net('policy', self.sess, ob_space, act_space_array)
                self.policy_old = Policy_net('old_policy', self.sess, ob_space, act_space_array)
                self.policy_ppo = PPOTrain('PPO', self.sess, self.policy, self.policy_old, epoch_num=P.src_epoch_num)
            # Only trainable variables under the policy scope are checkpointed.
            var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
            self.policy_saver = tf.train.Saver(var_list=var_list)

    def Update_result(self, result_list):
        """Add the wins (entries equal to 1) and game count of ``result_list``."""
        # NOTE(review): assign_add builds a new graph op on every call.
        self.sess.run(self.results_sum.assign_add(result_list.count(1)))
        self.sess.run(self.game_num.assign_add(len(result_list)))

    def Update_summary(self, counter):
        """Write the policy and win-rate summaries at step ``counter``.

        Also stores ``counter`` in the ``global_steps`` variable.
        """
        print("Update summary........")

        policy_summary = self.policy_ppo.get_summary_dis()
        self.summary_writer.add_summary(policy_summary, counter)

        summary = self.sess.run(self.merged)
        self.summary_writer.add_summary(summary, counter)
        print("counter:", counter)
        self.sess.run(self.global_steps.assign(counter))

        print("Update summary finished!")

    def Update_policy(self, buffer):
        """Run a PPO training pass on the trajectories held in ``buffer``."""
        self.policy_ppo.ppo_train_dis(buffer.observations, buffer.tech_actions,
                                      buffer.rewards, buffer.values, buffer.values_next, buffer.gaes, buffer.returns)

    def Update_internal_model(self, buffer):
        """Train the dynamic model on the transitions held in ``buffer``."""
        self.dynamic_net.model_train_dis(buffer.observations, buffer.tech_actions,
                                         buffer.next_observations)

    def get_global_steps(self):
        """Return the current value of ``global_steps`` as an int."""
        return int(self.sess.run(self.global_steps))

    def save_policy(self):
        """Save the policy variables to ``self.policy_model_path_save``."""
        self.policy_saver.save(self.sess, self.policy_model_path_save)
        print("policy has been saved in", self.policy_model_path_save)

    def restore_policy(self):
        """Restore the policy variables from ``self.policy_model_path_load``."""
        self.policy_saver.restore(self.sess, self.policy_model_path_load)
        print("Restore policy from", self.policy_model_path_load)

    def restore_dynamic(self, model_path):
        """Restore the dynamic model from ``model_path``."""
        self.dynamic_net.restore_sl_model(model_path)
        print("Restore internal_model")
예제 #7
0
def main(args):
    """Train a PPO agent on ``CartPole-v0`` and log to TensorBoard.

    Each iteration rolls out one full episode, writes the episode length and
    reward as summaries, and then runs six PPO mini-batch updates.  Training
    stops, saving a checkpoint, after 100 consecutive episodes with a total
    reward of at least 195.

    Args:
        args: Parsed options. Reads ``logdir``, ``savedir``, ``gamma``,
            ``gpu_num`` and ``iteration``.
    """
    # prepare log dir
    if not os.path.exists(args.logdir):
        os.makedirs(args.logdir)
    if not os.path.exists(args.savedir):
        os.makedirs(args.savedir)
    # create the gym environment
    env = gym.make("CartPole-v0")
    env.seed(0)
    ob_space = env.observation_space
    # policy net
    Policy = Policy_net("policy", env)
    Old_Policy = Policy_net("old_policy", env)
    # PPO training instance
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)

    # tensorflow saver
    saver = tf.train.Saver()
    # session config
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(visible_device_list=args.gpu_num, allow_growth=True)
    )
    # start session
    with tf.Session(config=config) as sess:
        # summary writer
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # initialize the session variables
        sess.run(tf.global_variables_initializer())
        # initial state
        obs = env.reset()
        # number of consecutive successful episodes
        success_num = 0

        # episode loop
        for iteration in tqdm(range(args.iteration)):
            # trajectory buffers for this episode
            observations = []
            actions = []
            v_preds = []
            rewards = []
            # number of steps in this episode
            episode_length = 0
            # run episode
            while True:
                episode_length += 1
                # add a batch axis for the placeholder
                obs = np.stack([obs]).astype(dtype=np.float32)
                # estimate the action and the state value
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # Convert one-element arrays to Python scalars.
                # np.asscalar was removed in NumPy 1.23; .item() is equivalent.
                act = np.asarray(act).item()
                v_pred = np.asarray(v_pred).item()

                # advance the environment with the policy's action
                next_obs, reward, done, _ = env.step(act)

                # record this step: (s_t, a_t, v_t, r_t)
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                # when the episode is over, reset for the next one
                if done:
                    # v_{t+1} values; the terminal state has value 0
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    # The former `reward = -1` here was dead code: reward is
                    # reassigned by env.step before it is read again.
                    break
                else:
                    obs = next_obs

            # add summaries
            writer.add_summary(
                tf.Summary(
                    value=[
                        tf.Summary.Value(
                            tag="episode_length", simple_value=episode_length
                        )
                    ]
                ),
                iteration,
            )
            writer.add_summary(
                tf.Summary(
                    value=[
                        tf.Summary.Value(
                            tag="episode_reward", simple_value=sum(rewards)
                        )
                    ]
                ),
                iteration,
            )

            # episode success check
            if sum(rewards) >= 195:
                success_num += 1
                # stop after 100 successes in a row
                if success_num >= 100:
                    saver.save(sess, args.savedir + "/model.ckpt")
                    print("Clear!! Model saved.")
                    break
            else:
                success_num = 0

            # convert the trajectory for the placeholders
            observations = np.reshape(
                observations, [-1] + list(ob_space.shape)
            )
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)

            # generalized advantage estimates, normalized
            gaes = PPO.get_gaes(
                rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next
            )
            gaes = np.array(gaes).astype(dtype=np.float32)
            # NOTE(review): yields NaN when every advantage is identical
            # (std == 0), e.g. for a one-step episode.
            gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # the agent's experience
            inp = [observations, actions, gaes, rewards, v_preds_next]
            # copy the current parameters into Old_Policy
            PPO.assign_policy_parameters()

            # PPO training
            for epoch in range(6):
                # indices of the sampled training data, drawn from [low, high)
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0], size=32
                )
                # sample the PPO training data
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]
                PPO.train(
                    obs=sampled_inp[0],
                    actions=sampled_inp[1],
                    gaes=sampled_inp[2],
                    rewards=sampled_inp[3],
                    v_preds_next=sampled_inp[4],
                )

            # fetch the summary over the whole trajectory
            summary = PPO.get_summary(
                obs=inp[0],
                actions=inp[1],
                gaes=inp[2],
                rewards=inp[3],
                v_preds_next=inp[4],
            )

            writer.add_summary(summary, iteration)
        writer.close()
def main(args):
    """Train a GAIL agent: a discriminator provides the reward for PPO.

    Each iteration rolls out one full episode, trains the discriminator on
    expert versus agent state-action pairs, then runs six PPO mini-batch
    updates using the discriminator's rewards.  The model is saved on the
    final iteration.

    Args:
        args: Parsed options. Reads ``logdir``, ``savedir``, ``tradir``,
            ``env``, ``optimizer``, ``gamma`` and ``iteration``.  ``logdir``,
            ``savedir`` and ``tradir`` are rewritten in place to their
            ``<dir>/<env>/<optimizer>`` sub-directories.
    """
    # init directories
    args.logdir = '/'.join([args.logdir, args.env, args.optimizer])
    args.savedir = '/'.join([args.savedir, args.env, args.optimizer])
    args.tradir = args.tradir + '/' + args.env + '/' + args.optimizer
    for directory in (args.logdir, args.savedir):
        # makedirs also creates missing parents, which the previous
        # level-by-level os.mkdir chain could not do for a nested base path.
        os.makedirs(directory, exist_ok=True)

    # init classes
    env = gym.make(args.env)
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env, args.env)
    Old_Policy = Policy_net('old_policy', env, args.env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer)
    D = Discriminator(env, args.env, _optimizer=args.optimizer)

    expert_observations = np.genfromtxt(args.tradir + '/observations.csv')
    expert_actions = np.genfromtxt(args.tradir + '/actions.csv', dtype=np.int32)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        obs = env.reset()
        reward = 0  # do NOT use rewards to update policy

        for iteration in range(args.iteration):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs

                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # np.asscalar was removed in NumPy 1.23; .item() is equivalent.
                act = np.asarray(act).item()
                v_pred = np.asarray(v_pred).item()

                observations.append(obs)
                actions.append(act)
                # Environment rewards are only logged below, never trained on.
                # They are recorded before the step, so each entry is the
                # reward of the previous step (or the -1 set at episode end).
                rewards.append(reward)
                v_preds.append(v_pred)

                next_obs, reward, done, info = env.step(act)

                if done:
                    v_preds_next = v_preds[1:] + [0]  # next state of terminate state has 0 state value
                    obs = env.reset()
                    reward = -1
                    break
                else:
                    obs = next_obs

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)])
                               , iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))])
                               , iteration)

            print('iteration:', iteration, ',rewards:', sum(rewards))

            if iteration == (args.iteration - 1):
                saver.save(sess, args.savedir + '/model.ckpt')
                print('Clear!! Model saved.')
                break

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, [-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train discriminator
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # output of this discriminator is reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, [-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()
예제 #9
0
class MiniNetwork(object):
    """PPO policy ("MiniPolicyNN" scope) with win-rate bookkeeping on a TF1 graph.

    The constructor stores its configuration, builds the graph through
    ``_create_graph`` and creates the savers.  The remaining methods run
    updates, summaries and checkpoint operations on ``self.sess``.
    """

    def __init__(self,
                 sess=None,
                 summary_writer=None,
                 rl_training=False,
                 reuse=False,
                 cluster=None,
                 index=0,
                 device='/gpu:0',
                 ppo_load_path=None,
                 ppo_save_path=None):
        """Store the configuration and build the graph.

        Args:
            sess: TensorFlow session used by every method of this object.
            summary_writer: Writer that receives summaries.  When ``None``, a
                ``tf.summary.FileWriter("logs/")`` is created here.  The
                writer used to be built in the signature itself, which
                created it as soon as the class was defined, even when the
                caller supplied a writer.
            rl_training: Stored on the instance; not used in this class.
            reuse: When true, the current variable scope is set to reuse.
            cluster: Cluster spec passed to ``replica_device_setter``.
            index: Worker task index used to build the device string.
            device: Device suffix appended to the worker device string.
            ppo_load_path: Prefix of the policy load path (``+ "mini"``);
                must be a string.
            ppo_save_path: Prefix of the policy save path (``+ "mini"``);
                must be a string.
        """
        self.policy_model_path_load = ppo_load_path + "mini"
        self.policy_model_path_save = ppo_save_path + "mini"

        self.rl_training = rl_training

        self.use_norm = True

        self.reuse = reuse
        self.sess = sess
        self.cluster = cluster
        self.index = index
        self.device = device

        self._create_graph()

        self.rl_saver = tf.train.Saver()
        if summary_writer is None:
            # Resolve the default here, not at class-definition time.
            summary_writer = tf.summary.FileWriter("logs/")
        self.summary_writer = summary_writer

    @staticmethod
    def _safe_win_rate(win_game, all_game):
        """Return ``win_game / all_game`` as a float, or 0.0 with no games."""
        if all_game == 0:
            return 0.0
        return win_game / float(all_game)

    def initialize(self):
        """Run the global variable initializer in ``self.sess``."""
        init_op = tf.global_variables_initializer()
        self.sess.run(init_op)

    def reset_old_network(self):
        """Sync the old policy, reset mean returns and zero the win counters."""
        self.policy_ppo.assign_policy_parameters()
        self.policy_ppo.reset_mean_returns()

        self.sess.run(self.results_sum.assign(0))
        self.sess.run(self.game_num.assign(0))

    def _create_graph(self):
        """Build the counters, the win-rate ops and the mini PPO policy.

        All ops are placed through a ``replica_device_setter`` built from
        ``self.index``, ``self.device`` and ``self.cluster``.
        """
        if self.reuse:
            tf.get_variable_scope().reuse_variables()
            assert tf.get_variable_scope().reuse

        worker_device = "/job:worker/task:%d" % self.index + self.device
        with tf.device(
                tf.train.replica_device_setter(worker_device=worker_device,
                                               cluster=self.cluster)):
            # Bookkeeping scalars: wins, games played, global step counter.
            self.results_sum = tf.get_variable(
                name="results_sum", shape=[], initializer=tf.zeros_initializer)
            self.game_num = tf.get_variable(name="game_num",
                                            shape=[],
                                            initializer=tf.zeros_initializer)

            self.global_steps = tf.get_variable(
                name="global_steps",
                shape=[],
                initializer=tf.zeros_initializer)
            self.win_rate = self.results_sum / self.game_num

            self.mean_win_rate = tf.summary.scalar(
                'mean_win_rate_dis', self.results_sum / self.game_num)
            self.merged = tf.summary.merge([self.mean_win_rate])

            mini_scope = "MiniPolicyNN"
            with tf.variable_scope(mini_scope):
                ob_space = _SIZE_MINI_INPUT
                act_space_array = _SIZE_MINI_ACTIONS
                self.policy = Policy_net('policy', self.sess, ob_space,
                                         act_space_array)
                self.policy_old = Policy_net('old_policy', self.sess, ob_space,
                                             act_space_array)
                self.policy_ppo = PPOTrain('PPO',
                                           self.sess,
                                           self.policy,
                                           self.policy_old,
                                           lr=P.mini_lr,
                                           epoch_num=P.mini_epoch_num)
            # No scope filter: every trainable variable in the graph is saved.
            var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            self.policy_saver = tf.train.Saver(var_list=var_list)

    def Update_result(self, result_list):
        """Add the wins (entries greater than 0) and game count of ``result_list``."""
        win = sum(1 for result in result_list if result > 0)
        # NOTE(review): assign_add builds a new graph op on every call.
        self.sess.run(self.results_sum.assign_add(win))
        self.sess.run(self.game_num.assign_add(len(result_list)))

    def Update_summary(self, counter):
        """Write the summaries at step ``counter`` and report progress.

        Also stores ``counter`` in the ``global_steps`` variable.

        Returns:
            A ``(steps, win_rate)`` tuple.  ``win_rate`` is 0.0 when no game
            has been recorded; this case used to raise ZeroDivisionError.
        """
        print("Update summary........")

        policy_summary = self.policy_ppo.get_summary_dis()
        self.summary_writer.add_summary(policy_summary, counter)

        summary = self.sess.run(self.merged)
        self.summary_writer.add_summary(summary, counter)
        self.sess.run(self.global_steps.assign(counter))

        print("Update summary finished!")

        steps = int(self.sess.run(self.global_steps))
        win_game = int(self.sess.run(self.results_sum))
        all_game = int(self.sess.run(self.game_num))
        win_rate = self._safe_win_rate(win_game, all_game)

        return steps, win_rate

    def get_win_rate(self):
        """Return the value of the in-graph ``win_rate`` op as a float."""
        return float(self.sess.run(self.win_rate))

    def Update_policy(self, buffer):
        """Run a PPO training pass on the trajectories held in ``buffer``."""
        self.policy_ppo.ppo_train_dis(buffer.observations,
                                      buffer.tech_actions,
                                      buffer.rewards,
                                      buffer.values,
                                      buffer.values_next,
                                      buffer.gaes,
                                      buffer.returns,
                                      verbose=False)

    def get_global_steps(self):
        """Return the current value of ``global_steps`` as an int."""
        return int(self.sess.run(self.global_steps))

    def save_policy(self):
        """Save the policy variables to ``self.policy_model_path_save``."""
        self.policy_saver.save(self.sess, self.policy_model_path_save)
        print("policy has been saved in", self.policy_model_path_save)

    def restore_policy(self):
        """Restore the policy variables from ``self.policy_model_path_load``."""
        self.policy_saver.restore(self.sess, self.policy_model_path_load)
        print("Restore policy from", self.policy_model_path_load)
예제 #10
0
        Old_Policy = SNGANPolicy(
                'old_policy',
                obs_shape=obs_shape,
                batch_size=args.batch_size,
                decode=True)

    # Build reinforcement agent
>>>>>>> c79cfc48f93b70a6c24e29d063cb881ff88f5fde
    if args.algo == 'ppo':
        print('Building PPO Agent')
        Agent = PPOTrain(
                Policy,
                Old_Policy,
                obs_shape=obs_shape,
                gamma=args.gamma,
                c_vf=args.c_vf,
                c_entropy=args.c_entropy,
                c_l1=args.c_l1,
                obs_size=args.obs_size,
                vf_clip=args.vf_clip,
                optimizer=args.g_optimizer)
    elif args.algo == 'trpo':
        print('Building TRPO Agent')
        Agent = TRPOTrain(
                Policy,
                Old_Policy,
                obs_shape=obs_shape,
                gamma=args.gamma,
                c_vf=args.c_vf,
                c_entropy=args.c_entropy,
                c_l1=args.c_l1,
예제 #11
0
def main(args):
    """Train a PPO agent on ``CustomEnv``, logging summaries once per episode.

    Each iteration rolls out one episode, computes generalized advantage
    estimates, copies the current policy into the old policy and performs six
    minibatch PPO updates. Training stops early once 100 consecutive episodes
    reach a return of at least 195, at which point the model is saved under
    ``weight_path``.

    Args:
        args: Not read by this function; ``gamma``, ``iterations``,
            ``log_path`` and ``weight_path`` are taken from names defined
            outside this function.
    """
    env = CustomEnv()
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=gamma)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(log_path, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs, acs, target_video = env.reset()
        success_num = 0

        for iteration in range(iterations):
            observations = []
            actions = []
            pred_actions = []
            rewards = []
            v_preds = []
            episode_length = 0
            while True:  # roll out until the environment reports `done`
                episode_length += 1
                # Add a leading batch axis before feeding the policy.
                obs = np.array([obs]).astype(dtype=np.float32)
                # NOTE(review): `acs` is re-wrapped on every step but is only
                # refreshed by env.reset(), so it gains one leading axis per
                # step within an episode -- confirm this is intended.
                acs = np.array([acs]).astype(dtype=np.float32)
                pred_act, v_pred = Policy.act(obs=obs, acs=acs, stochastic=True)

                # np.asscalar was removed from NumPy; .item() is its replacement.
                v_pred = np.asarray(v_pred).item()

                next_obs, reward, done, info = env.step(acs)

                observations.append(obs)
                actions.append(acs)
                pred_actions.append(pred_act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    # Bootstrap the value of the final state from the policy.
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asarray(v_pred).item()]
                    obs, acs, target_video = env.reset()
                    break
                else:
                    obs = next_obs

            episode_reward = sum(rewards)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)])
                               , iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=episode_reward)])
                               , iteration)

            if episode_reward >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, weight_path + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)

            # Convert the rollout lists to numpy arrays for the placeholders.
            observations = np.array(observations).astype(dtype=np.float32)

            actions = np.array(actions).astype(dtype=np.float32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # Standardise the advantages. Skip the division when the spread is
            # zero (e.g. a one-step episode) so NaNs never reach the optimiser.
            gaes_std = gaes.std()
            gaes = gaes - gaes.mean()
            if gaes_std > 0:
                gaes = gaes / gaes_std
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, rewards, v_preds_next]

            # Train on six minibatches of 32 transitions sampled with replacement.
            for epoch in range(6):
                # np.random.randint samples indices from [low, high).
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()
예제 #12
0
def main(args):
    """Reload a trained energy model and train a PPO policy on CartPole with it.

    The function first records the run configuration, restores the energy
    network from ``args.modeldir`` and reports its energy on the expert and
    noise state-action pairs. It then trains a PPO policy for
    ``args.iteration`` episodes, using a reward derived from the energy of each
    state-action pair (``args.reward_function`` selects the transform), and
    finally plots true and energy-based episode returns.

    Args:
        args: Parsed options. Reads ``noise_type``, ``iteration``, ``model``,
            ``gamma``, ``sanoise``, ``noise_sigma``, ``reward_function``,
            ``modeldir``, ``alg`` and ``logdir``.
    """
    print(date)
    energyPolicy_training_data.append("Energy poilcy training")
    energyPolicy_training_data.append(
        "Date:                                                          " +
        str(date))
    energyPolicy_training_data.append(
        "Noise type:                                                    " +
        str(args.noise_type))
    energyPolicy_training_data.append(
        "Policy Training max episodes:                                  " +
        str(args.iteration))
    energyPolicy_training_data.append(
        "Number of iterations the energy model have ben trained:        " +
        str(args.model))
    energyPolicy_training_data.append(
        "PPO gamma:                                                     " +
        str(args.gamma))
    energyPolicy_training_data.append(
        "Do we add noise to sapair for calculating energy               " +
        str(args.sanoise))
    energyPolicy_training_data.append(
        "The noise we add to sapair                                     " +
        str(args.noise_sigma))
    energyPolicy_training_data.append(
        "h(energy)                                                      " +
        str(args.reward_function))
    energyPolicy_training_data.append(" \n\n")

    env = gym.make('CartPole-v0')
    Energy = Energy_net('energy', 'CartPole-v0')
    # Created before the policy networks, so it only covers the variables
    # that exist at this point.
    energy_saver = tf.train.Saver()

    sapairs = np.genfromtxt('training_data/sapairs.csv')
    noise_sapairs = np.genfromtxt('training_data/noise_sapairs.csv')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if args.model == '':
            energy_saver.restore(
                sess, args.modeldir + '/' + args.alg + '/' + args.noise_type +
                '/' + 'model.ckpt')
        else:
            energy_saver.restore(
                sess, args.modeldir + '/' + args.alg + '/' + args.noise_type +
                '/' + 'model.ckpt-' + args.model)
        print("As for model after ", args.model, "training iterations")
        print("Energy for expert sapairs looks like:",
              Energy.get_energy(sapairs))
        print(
            "Energy for noise sapairs (not corresponding to the noise trained for Energy) looks like:",
            Energy.get_energy(noise_sapairs))

        energyPolicy_training_data.append(
            ["As for model after ", args.model, "training iterations"])

        energyPolicy_training_data.append(
            "Energy for expert sapairs looks like:\n" +
            str(Energy.get_energy(sapairs)))
        energyPolicy_training_data.append(
            "Energy for noise sapairs (not corresponding to the noise trained for Energy) looks like:\n"
            + str(Energy.get_energy(noise_sapairs)))
        energyPolicy_training_data.append(" \n\n\n\n\n\n\n\n\n")
        energyPolicy_training_data.append(
            "Done with reloading Energy. Start RL")

        open_file_and_save(
            args.logdir + '/' + args.model + "_iter_" + args.noise_type +
            '_Policy' + date, energyPolicy_training_data)
        print("Done with reloading Energy. Start RL")

        # Start RL

        env.seed(0)
        ob_space = env.observation_space
        # Remember which variables already exist (the restored energy model)
        # so that only the variables created below get initialised.
        restored_var_names = {v.name for v in tf.global_variables()}
        Policy = Policy_net('policy', env)
        Old_Policy = Policy_net('old_policy', env)
        PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
        saver = tf.train.Saver()

        # tf.global_variables_initializer() would also re-initialise the energy
        # weights restored above, so initialise the new variables only.
        new_vars = [
            v for v in tf.global_variables()
            if v.name not in restored_var_names
        ]
        sess.run(tf.variables_initializer(new_vars))
        obs = env.reset()

        reward = 0
        alter_reward = 0
        success_num = 0
        render = False

        # Per-trajectory records, kept for a final summary.
        Summary_after_max_episodes_training = []
        Trajectory_rewards = []
        Trajectory_alter_rewards = []
        # Like success_num but never reset; meant for judging how well this
        # energy model works for training.
        Trajectory_success_num = 0

        plot_rewards = []
        plot_alter_rewards = []
        plot_iteration = []
        for iteration in range(args.iteration):
            observations = []
            actions = []
            v_preds = []
            rewards = []
            alter_rewards = []
            episode_length = 0
            while True:  # roll out until the environment reports `done`
                episode_length += 1

                # Add a leading batch axis before feeding the policy.
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # np.asscalar was removed from NumPy; .item() is its replacement.
                act = np.asarray(act).item()
                v_pred = np.asarray(v_pred).item()

                # The rewards stored here are the ones produced by the
                # previous step (or the initial / post-reset values).
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                alter_rewards.append(alter_reward)
                rewards.append(reward)

                next_obs, reward, done, info = env.step(act)

                # Energy-based replacement reward for this state-action pair.
                sapair = np.append(obs, np.array([[act]]), axis=1)
                energy = Energy.get_energy(sapair)[0][0]
                print("Energy for this sapair", energy)
                if args.sanoise == True:
                    # Zero-mean Gaussian noise with scale args.noise_sigma,
                    # one sample per column of the state-action pair.
                    mu, sigma = 0, args.noise_sigma
                    saShape = sapair.shape[1]
                    noise = np.random.normal(mu, sigma, saShape)
                    noise_sapair = sapair + noise
                    print("noise_sapair:", noise_sapair)
                    noise_energy = Energy.get_energy(noise_sapair)[0][0]
                    print("Noise Energy for this sapair", noise_energy)
                    energy = noise_energy

                if args.reward_function == "-energy":
                    alter_reward = -energy
                elif args.reward_function == "-energy+1":
                    alter_reward = -energy + 1
                elif args.reward_function == "exp(-energy-1)":
                    alter_reward = np.exp(-energy - 1)
                elif args.reward_function == "exp(-energy)":
                    alter_reward = np.exp(-energy)
                else:
                    # NOTE(review): alter_reward keeps its previous value here.
                    print("No such reward_function")

                if done:
                    # The state after a terminal state has value 0.
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    reward = -1
                    alter_reward = -1

                    break
                else:
                    obs = next_obs

            sum_rewards = sum(rewards)
            sum_alter_rewards = sum(alter_rewards)
            Trajectory_rewards.append(sum_rewards)
            Trajectory_alter_rewards.append(sum_alter_rewards)
            # Data for the final plot.
            plot_rewards.append(sum_rewards)
            plot_alter_rewards.append(sum_alter_rewards)
            plot_iteration.append(iteration)

            energyPolicy_training_data_for_this_episode = []
            energyPolicy_training_data_for_this_episode.append(" ")
            energyPolicy_training_data_for_this_episode.append(
                "Trajectory:     " + str(iteration))
            energyPolicy_training_data_for_this_episode.append(
                "episode_len:    " + str(episode_length))
            energyPolicy_training_data_for_this_episode.append(
                "True rewards:   " + str(sum_rewards))
            energyPolicy_training_data_for_this_episode.append(
                "alter_rewards:  " + str(sum_alter_rewards))
            open_file_and_save(
                args.logdir + '/' + args.model + "_iter_" + args.noise_type +
                '_Policy' + date, energyPolicy_training_data_for_this_episode)
            print()
            print("Trajectory", iteration, ":")
            print("episode_len: ", episode_length)
            print("rewards: ", sum(rewards))
            print("alter_rewards: ", sum(alter_rewards))

            # Advantages are computed from the energy-based rewards.
            gaes = PPO.get_gaes(rewards=alter_rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)
            # Convert the rollout lists to numpy arrays for the placeholders.
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # Standardise the advantages. Skip the division when the spread is
            # zero (e.g. a one-step episode) so NaNs never reach the optimiser.
            gaes_std = gaes.std()
            gaes = gaes - gaes.mean()
            if gaes_std > 0:
                gaes = gaes / gaes_std
            rewards = np.array(rewards).astype(dtype=np.float32)
            alter_rewards = np.array(alter_rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, alter_rewards, v_preds_next]

            # Train on six minibatches of 32 transitions sampled with replacement.
            for epoch in range(6):
                # np.random.randint samples indices from [low, high).
                sample_indices = np.random.randint(low=0,
                                                   high=observations.shape[0],
                                                   size=32)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

        # Plot true vs. energy-based episode returns.
        plt.title('Noise:' + str(args.sanoise))
        plt.plot(plot_iteration,
                 plot_rewards,
                 color='red',
                 label='True_rewards')
        plt.plot(plot_iteration,
                 plot_alter_rewards,
                 color='green',
                 label='alter_rewards')
        plt.legend()  # show the legend

        plt.xlabel('Episodes')
        plt.ylabel('Rewards')
        plt.show()
def main(args):
    """Train a PPO agent on the project ``Environment`` with curriculum rewards.

    Each episode is rolled out to termination, its shaped rewards are turned
    into generalized advantage estimates, and two minibatch PPO updates are
    run. Results are reported through ``ResultLogger`` and a checkpoint is
    written to ``args.savedir`` every 1000 episodes.

    Args:
        args: Parsed options. Reads ``logdir``, ``gamma``, ``continue_train``,
            ``continue_meta``, ``continue_modeldir``, ``episode`` and
            ``savedir``; the whole object is also passed to ``PPOTrain``.
    """
    writer = SummaryWriter(args.logdir)
    logger = ResultLogger(writer)

    env = Environment()  # project-specific environment
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy,
                   Old_Policy,
                   gamma=args.gamma,
                   args=args,
                   logger=logger)
    saver = tf.train.Saver()

    if args.continue_train:
        # NOTE(review): resetting the default graph after the networks above
        # were built means the session below is bound to the imported graph,
        # not the one holding Policy/PPO -- confirm this path works as intended.
        tf.reset_default_graph()
        tf.train.import_meta_graph(args.continue_meta)

    with tf.Session() as sess:
        # Initialise first and restore afterwards: running the initializer
        # after saver.restore() would overwrite the restored weights.
        sess.run(tf.global_variables_initializer())
        if args.continue_train:
            saver.restore(sess, args.continue_modeldir)
        reward = 0
        winnum = 0
        drawnum = 0
        for episode in range(args.episode):

            observations = []
            actions = []
            v_preds = []
            rewards = []

            run_policy_steps = 0

            total_reward = 0
            obs = env.reset()
            while True:  # roll out until the environment reports `done`
                run_policy_steps += 1

                # Add a leading batch axis before feeding the policy.
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # np.asscalar was removed from NumPy; .item() is its replacement.
                act = np.asarray(act).item()
                v_pred = np.asarray(v_pred).item()

                # `reward` stored here is the one produced by the previous
                # step (or the initial / post-episode value).
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                next_obs, reward, sparse_rew, done, info = env.step(act)
                if reward < -1000:
                    reward = -10

                reward = utils.get_curriculum_reward(reward, sparse_rew, 1.0,
                                                     run_policy_steps)

                obs = next_obs
                if done:
                    total_reward = sum(rewards)
                    total_reward /= run_policy_steps
                    total_reward += reward
                    # The state after a terminal state has value 0.
                    v_preds_next = v_preds[1:] + [0]

                    reward = -1
                    if info == 3:
                        winnum += 1
                    if info == 2:
                        drawnum += 1

                    break

            # Win/draw counters restart every 100 episodes.
            if episode % 100 == 0:
                winnum = 0
                drawnum = 0

            logger.log_result(total_reward, winnum, drawnum, episode)
            print(episode, total_reward)
            if episode % 1000 == 0:
                saver.save(sess, args.savedir + '/model.ckpt')

            # Generalized advantage estimation.
            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # Convert the rollout lists to numpy arrays for the placeholders.
            observations = np.reshape(observations, newshape=(-1, ob_space))
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # Standardise the advantages. Skip the division when the spread is
            # zero (e.g. a one-step episode) so NaNs never reach the optimiser.
            gaes_std = gaes.std()
            gaes = gaes - gaes.mean()
            if gaes_std > 0:
                gaes = gaes / gaes_std
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            logger.log_gaes(gaes.mean(), episode)
            PPO.log_parameter(observations, actions, gaes, rewards,
                              v_preds_next)
            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, rewards, v_preds_next]

            # Train on two minibatches of 32 transitions sampled with replacement.
            for epoch in range(2):
                # np.random.randint samples indices from [low, high).
                sample_indices = np.random.randint(low=0,
                                                   high=observations.shape[0],
                                                   size=32)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])
예제 #14
0
def main(args):
    """Train a policy adversarially against a discriminator on expert data.

    Expert observations and actions are loaded from the CSV at
    ``args.expertdir``. Each episode is rolled out with the current policy,
    after which ``train_discriminator`` produces discriminator rewards and
    ``train_PPO`` updates the policy with them. A checkpoint is written to
    ``args.savedir`` every 1000 episodes.

    Args:
        args: Parsed options. Reads ``batchsize``, ``logdir``, ``gamma``,
            ``expertdir``, ``episode`` and ``savedir``; the whole object is
            also passed to ``PPOTrain`` and ``Discriminator``.
    """
    env = Environment()
    batch_size = args.batchsize
    writer = SummaryWriter(args.logdir)
    logger = ResultLogger(writer)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy,
                   Old_Policy,
                   gamma=args.gamma,
                   logger=logger,
                   args=args)
    D = Discriminator(env, batch_size, logger=logger, args=args)

    expert_ds = pd.read_csv(args.expertdir)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray on old and new versions.
    # Select the observation feature columns.
    expert_observations = expert_ds[utils.observation_field].values
    # Map the raw action columns onto a single action space
    # (environment-specific; implemented in utils).
    expert_actions = utils.merge_to_one_action(
        expert_ds[utils.action_field].values)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        obs = env.reset()
        reward = 0  # do NOT use rewards to update policy

        # Defined up front so logger.log_result() below cannot hit an
        # undefined name when an episode ends on its very first step.
        winnum = 0
        drawnum = 0

        for episode in range(args.episode):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0

            while True:  # roll out until the environment reports `done`
                run_policy_steps += 1
                # Add a leading batch axis before feeding the policy.
                obs = np.stack([obs]).astype(dtype=np.float32)

                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # np.asscalar was removed from NumPy; .item() is its replacement.
                act = np.asarray(act).item()
                v_pred = np.asarray(v_pred).item()

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                next_obs, reward, sparse_rew, done, info = env.step(act)
                reward = utils.get_curriculum_reward(reward, sparse_rew, 1.0,
                                                     run_policy_steps)

                if done:
                    total_reward = sum(rewards)
                    total_reward /= run_policy_steps
                    total_reward += reward

                    print("[episode]: ", episode)
                    print('[Policy Reward]: ', total_reward)

                    # The state after a terminal state has value 0.
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()

                    reward = 0
                    break
                else:
                    obs = next_obs

            # Counters restart every 100 episodes.
            # NOTE(review): nothing in this function increments them.
            if episode % 100 == 0:
                winnum = 0
                drawnum = 0

            logger.log_result(total_reward, winnum, drawnum, episode)
            if episode % 1000 == 0:
                saver.save(sess, args.savedir + '/model.ckpt')

            observations = np.reshape(observations, newshape=(-1, ob_space))
            actions = np.array(actions).astype(dtype=np.int32)

            # Train the discriminator.
            d_rewards = train_discriminator(expert_observations,
                                            expert_actions, observations,
                                            actions, D, batch_size, episode,
                                            logger)
            # Train PPO on the discriminator rewards.
            train_PPO(PPO, observations, actions, d_rewards, v_preds,
                      v_preds_next, batch_size, episode, logger)
예제 #15
0
def main(args):
    """Pre-train a behavioural-cloning policy, then train PPO against it.

    Phase one fits ``BCPolicy`` to the expert trajectory stored under
    ``trajectory/`` for ``args.iteration`` iterations. Phase two trains a PPO
    policy on CartPole for up to ``5 * args.iteration`` episodes, using
    ``log(1 / (KL + 1e-5))`` between the BC policy and the PPO policy as the
    reward that drives the advantage estimates. Training stops early once 100
    consecutive episodes reach a true return of at least 195.

    Args:
        args: Parsed options. Reads ``gamma``, ``max_to_keep``, ``logdir``,
            ``iteration``, ``epoch_num``, ``minibatch_size``, ``interval``
            and ``savedir``.
    """
    env = gym.make('CartPole-v0')
    BCPolicy = Policy_net('bcpolicy', env)
    BC = BehavioralCloning(BCPolicy)
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    # Saver used for the periodic checkpoints written below.
    saver = tf.train.Saver(max_to_keep=args.max_to_keep)

    # Keep only the first exp_len expert samples (original note: exp_len=200).
    exp_obs = np.genfromtxt('trajectory/observations.csv')[0:exp_len]
    exp_acts = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)[0:exp_len]

    with tf.Session() as sess:
        # Event file for the graph and for summaries added via add_summary().
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        inp = [exp_obs, exp_acts]  # inp[0]: observations, inp[1]: actions

        for iteration in range(args.iteration):  # episode

            # train
            for epoch in range(args.epoch_num):
                # np.random.randint samples indices from [low, high).
                sample_indices = np.random.randint(low=0, high=exp_obs.shape[0], size=args.minibatch_size)

                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                BC.train(obs=sampled_inp[0], actions=sampled_inp[1])

            bc_summary = BC.get_summary(obs=inp[0], actions=inp[1])

            if (iteration+1) % args.interval == 0:
                saver.save(sess, args.savedir + '/model.ckpt', global_step=iteration+1)

            writer.add_summary(bc_summary, iteration)

        print("Done with BC. Start RL")
        # Start RL
        obs = env.reset()
        ob_space = env.observation_space

        reward = 0
        alter_reward = 0
        success_num = 0
        render = False
        ep_reward = []
        for iteration in range(5*args.iteration):
            print("iter:{}".format(iteration))
            observations = []
            actions = []
            v_preds = []
            rewards = []
            alter_rewards = []
            episode_length = 0
            while True:  # roll out until the environment reports `done`
                episode_length += 1
                # Add a leading batch axis before feeding the policy.
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # np.asscalar was removed from NumPy; .item() is its replacement.
                act = np.asarray(act).item()
                v_pred = np.asarray(v_pred).item()

                # The rewards stored here are the ones produced by the
                # previous step (or the initial / post-reset values).
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                alter_rewards.append(alter_reward)
                rewards.append(reward)

                next_obs, reward, done, info = env.step(act)
                # The small constant keeps the ratio finite when the KL is 0.
                alter_reward = np.log(1/(kl_divergence(obs, BCPolicy, Policy)+0.00001))
                if render:
                    #env.render()
                    pass
                if done:
                    # The state after a terminal state has value 0.
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    reward = -1
                    alter_reward = -1
                    print("episode_len: ",episode_length)
                    break
                else:
                    obs = next_obs

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)])
                               , iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))])
                               , iteration)

            if sum(rewards) >= 195:
                success_num += 1
                render = True
                if success_num >= 100:
                    saver.save(sess, args.savedir+'/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0
            ep_reward.append(sum(rewards))
            print("rewards: ",sum(rewards))
            print("alter_rewards: ",sum(alter_rewards))
            print("Sample done in one traj.")
            # Advantages are computed from the KL-based rewards.
            gaes = PPO.get_gaes(rewards=alter_rewards, v_preds=v_preds, v_preds_next=v_preds_next)

            # Convert the rollout lists to numpy arrays for the placeholders.
            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # Standardise the advantages. Skip the division when the spread is
            # zero (e.g. a one-step episode) so NaNs never reach the optimiser.
            gaes_std = gaes.std()
            gaes = gaes - gaes.mean()
            if gaes_std > 0:
                gaes = gaes / gaes_std
            rewards = np.array(rewards).astype(dtype=np.float32)
            alter_rewards = np.array(alter_rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, alter_rewards, v_preds_next]

            print("Begin Training")
            # Train on six minibatches of 32 transitions sampled with replacement.
            for epoch in range(6):
                # np.random.randint samples indices from [low, high).
                sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                PPO.train(obs=sampled_inp[0],
                        actions=sampled_inp[1],
                        gaes=sampled_inp[2],
                        rewards=sampled_inp[3],
                        v_preds_next=sampled_inp[4])

        writer.close()
    # NOTE(review): the figure is neither shown nor saved in this function.
    plt.plot(ep_reward)
예제 #16
0
def main(args):
    """Train a PPO agent on CartPole-v0, one episode per iteration.

    Every iteration rolls out a single episode with the current stochastic
    policy, writes the episode length and summed reward to the summary
    writer, and then runs six PPO updates on random minibatches of 32
    transitions.  Training stops early, after saving a checkpoint, once 100
    consecutive episodes have a summed reward of at least 195.

    Args:
        args: Parsed options; this function reads ``gamma``, ``logdir``,
            ``savedir`` and ``iteration``.
    """
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space

    Policy = Policy_net('policy', env)  # network that is being optimised
    Old_Policy = Policy_net('old_policy', env)  # receives Policy's parameters before each update
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        # Initial observation: [cart position, cart velocity, pole angle,
        # pole rotation rate].
        obs = env.reset()

        reward = 0
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            v_preds = []
            rewards = []
            episode_length = 0

            # Roll out one episode with the current policy.
            while True:
                episode_length += 1
                # Add a leading batch axis before feeding the policy.
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # ndarray.item() is the direct replacement for np.asscalar,
                # which is deprecated and missing from newer NumPy releases.
                act = act.item()
                v_pred = v_pred.item()

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                # NOTE: appended before env.step, so the stored reward is the
                # one carried over from the previous step (0 at the very
                # start, -1 for the first step after a finished episode).
                rewards.append(reward)

                next_obs, reward, done, info = env.step(act)

                if done:
                    # The state after a terminal state is given value 0.
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    reward = -1
                    break
                obs = next_obs

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=episode_length)
                ]), iteration)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward',
                                     simple_value=sum(rewards))
                ]), iteration)

            # Stop once 100 episodes in a row reach the reward threshold.
            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # Convert the episode buffers to arrays for the placeholders.
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # Normalise the advantages.  When they are all equal the standard
            # deviation is 0 and dividing would give NaN, so only centre them.
            gaes_std = gaes.std()
            gaes = gaes - gaes.mean()
            if gaes_std > 0:
                gaes = gaes / gaes_std
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # Copy the current policy parameters into the old policy before
            # the current policy is updated.
            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, rewards, v_preds_next]

            for epoch in range(6):
                # Indices are drawn with replacement from [0, episode length).
                sample_indices = np.random.randint(low=0,
                                                   high=observations.shape[0],
                                                   size=32)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            # The summary is computed on the whole episode, not a minibatch.
            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()
예제 #17
0
def main(args):
    """Train a CartPole-v0 policy with GAIL (PPO plus a discriminator).

    Every iteration rolls out one episode, trains the discriminator twice on
    the expert trajectory versus the agent trajectory, and then uses the
    discriminator's rewards (not the environment's) for six PPO updates on
    random minibatches of 32 transitions.  Environment rewards are only used
    for logging and for the stopping rule: training ends, after saving a
    checkpoint, once 100 consecutive episodes sum to at least 195.

    Args:
        args: Parsed options; this function reads ``logdir``, ``savedir``,
            ``gamma``, ``gpu_num`` and ``iteration``.
    """
    # Prepare the log and checkpoint directories.
    os.makedirs(args.logdir, exist_ok=True)
    os.makedirs(args.savedir, exist_ok=True)

    # Create the gym environment.
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space

    # Policy networks, the PPO trainer and the discriminator.
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    # Load the expert trajectory.
    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    saver = tf.train.Saver()
    # Session configuration: chosen GPU(s), memory allocated on demand.
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            visible_device_list=args.gpu_num,
            allow_growth=True))

    with tf.Session(config=config) as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        # Initial state.
        obs = env.reset()
        success_num = 0

        # Episode loop.
        for iteration in tqdm(range(args.iteration)):
            # Per-episode buffers.
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0

            # Run one episode.
            while True:
                run_policy_steps += 1
                # Add a leading batch axis before feeding the network.
                obs = np.stack([obs]).astype(dtype=np.float32)

                # Estimate the action and the state value.
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # Convert the one-element arrays to Python scalars.
                # ndarray.item() replaces the deprecated np.asscalar.
                act = act.item()
                v_pred = v_pred.item()

                # Advance the environment with the policy's action.
                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                if done:
                    # The state after a terminal state is given value 0.
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    break
                obs = next_obs

            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(
                    tag='episode_length',
                    simple_value=run_policy_steps)]),
                iteration)
            writer.add_summary(
                tf.Summary(value=[tf.Summary.Value(
                    tag='episode_reward',
                    simple_value=sum(rewards))]),
                iteration)

            # Success check: leave the episode loop after 100 successful
            # episodes in a row.
            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            # Convert the agent trajectory to arrays for the placeholders.
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # --- The only part that differs from plain PPO (GAIL) ---
            # Train the discriminator twice on expert versus agent data.
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # Rewards assigned by the discriminator, flattened for the
            # placeholder.
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards,
                                   newshape=[-1]).astype(dtype=np.float32)
            # ---------------------------------------------------------

            # Generalized advantage estimates from the discriminator rewards
            # (left unnormalised, as in the original).
            gaes = PPO.get_gaes(rewards=d_rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # PPO inputs; the rewards are the discriminator's.
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            # Copy the current policy parameters into the old policy.
            PPO.assign_policy_parameters()

            for epoch in range(6):
                # Indices are drawn with replacement from [0, episode length).
                sample_indices = np.random.randint(
                    low=0,
                    high=observations.shape[0],
                    size=32)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]
                PPO.train(
                    obs=sampled_inp[0],
                    actions=sampled_inp[1],
                    gaes=sampled_inp[2],
                    rewards=sampled_inp[3],
                    v_preds_next=sampled_inp[4])

            # The summary is computed on the whole episode.
            summary = PPO.get_summary(
                obs=inp[0],
                actions=inp[1],
                gaes=inp[2],
                rewards=inp[3],
                v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()
예제 #18
0
def main(args):
    """Train a PPO agent on ``args.env`` for a fixed number of iterations.

    Logs and checkpoints go to ``<logdir>/<env>/<optimizer>`` and
    ``<savedir>/<env>/<optimizer>``; ``args.logdir`` and ``args.savedir``
    are overwritten with those full paths.  Every iteration rolls out one
    episode, logs it, and runs six PPO updates on random minibatches of 32
    transitions.  The model is saved on the last iteration.

    Args:
        args: Parsed options; this function reads ``logdir``, ``savedir``,
            ``env``, ``optimizer``, ``lr``, ``gamma`` and ``iteration``.
    """
    # Create the per-environment / per-optimizer output directories.
    # makedirs builds every missing level, including parents of the base dir.
    args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer
    os.makedirs(args.logdir, exist_ok=True)
    args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer
    os.makedirs(args.savedir, exist_ok=True)

    env = gym.make(args.env)
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env, args.env)
    Old_Policy = Policy_net('old_policy', env, args.env)
    PPO = PPOTrain(Policy,
                   Old_Policy,
                   gamma=args.gamma,
                   _optimizer=args.optimizer,
                   _lr=args.lr)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            v_preds = []
            rewards = []
            episode_length = 0

            # Roll out one episode with the current policy.
            while True:
                episode_length += 1
                # Add a leading batch axis before feeding the policy.
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # ndarray.item() replaces the deprecated np.asscalar.
                act = act.item()
                v_pred = v_pred.item()

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                # NOTE: appended before env.step, so the stored reward is the
                # one carried over from the previous step (0 at the very
                # start, -1 for the first step after a finished episode).
                rewards.append(reward)

                next_obs, reward, done, info = env.step(act)

                if done:
                    # The state after a terminal state is given value 0.
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    reward = -1
                    break
                obs = next_obs

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=episode_length)
                ]), iteration)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward',
                                     simple_value=sum(rewards))
                ]), iteration)

            # Save on the final iteration; no update follows that episode.
            if iteration == (args.iteration - 1):
                saver.save(sess, args.savedir + '/model.ckpt')
                print('Clear!! Model saved.')
                break

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # Convert the episode buffers to arrays for the placeholders.
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # Normalise the advantages.  When they are all equal the standard
            # deviation is 0 and dividing would give NaN, so only centre them.
            gaes_std = gaes.std()
            gaes = gaes - gaes.mean()
            if gaes_std > 0:
                gaes = gaes / gaes_std
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # Copy the current policy parameters into the old policy.
            PPO.assign_policy_parameters()

            inp = [observations, actions, gaes, rewards, v_preds_next]

            print('iteration:', iteration, ',rewards:', sum(rewards))

            for epoch in range(6):
                # Indices are drawn with replacement from [0, episode length).
                sample_indices = np.random.randint(low=0,
                                                   high=observations.shape[0],
                                                   size=32)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            # The summary is computed on the whole episode.
            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()
예제 #19
0
def main(args):
    """Adversarially train a navigation policy (GAIL with PPO) in one scene.

    Every iteration rolls out one episode of at most 100 steps, trains the
    discriminator for 100 minibatches of expert versus agent samples, and
    then trains the policy for 100 PPO minibatches using the discriminator's
    rewards.  Training stops, after saving a checkpoint, once the agent has
    reached the terminal state 5000 times in total.

    Args:
        args: Parsed options; this function reads ``gamma``, ``restore``,
            ``model``, ``modeldir``, ``alg``, ``logdir`` and ``savedir``.
    """
    scene_scope = 'bathroom_02'
    task_scope = 26  # other ids listed by the author: 43, 53, 32, 41
    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
    })

    # One SIAMESE object is passed to both policies and the discriminator.
    S_Class = SIAMESE()

    Policy = Policy_net('policy', S_Class)
    Old_Policy = Policy_net('old_policy', S_Class)

    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(S_Class)

    # Load the expert demonstrations (states, targets, actions).
    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_targets = np.genfromtxt('trajectory/targets.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    expert_observations = np.reshape(expert_observations,
                                     newshape=[-1, 2048, 4])
    expert_targets = np.reshape(expert_targets, newshape=[-1, 2048, 4])

    saver = tf.train.Saver()
    if args.restore:
        # Separate saver restricted to the 'policy' and 'Siamese' scopes,
        # used to load weights from a behaviour-cloning checkpoint.  Saver
        # expects one flat list of variables, so the two collections are
        # concatenated rather than nested.
        saver2 = tf.train.Saver(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='policy') +
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese'))

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # Initialises the variables of both the policy and the old policy.
        sess.run(tf.global_variables_initializer())

        if args.restore:
            if args.model == '':
                saver2.restore(
                    sess,
                    args.modeldir + '/' + args.alg + '/' + 'shamane.ckpt')
                print("Model Reastored")
            else:
                saver.restore(
                    sess, args.modeldir + '/' + args.alg + '/' +
                    'model.ckpt-' + args.model)

        # Counts how many times the agent reached the terminal state; it is
        # never reset, so it accumulates over all iterations.
        success_num = 0

        # NOTE: the iteration count is hard-coded; args.iteration is not used.
        for iteration in range(100000):
            print(
                "Starting ........ The Iteration---------------------------------------------------- :",
                iteration)
            observations = []
            actions = []
            targets = []
            v_preds = []
            run_policy_steps = 0

            # Sample one trajectory from the current policy.
            while True:
                run_policy_steps += 1
                # Add a leading batch axis to the state and the target.
                obs = np.stack([env.s_t]).astype(dtype=np.float32)
                target = np.stack([env.s_target]).astype(dtype=np.float32)

                act, v_pred, _ = Policy.act(state=obs,
                                            target=target,
                                            stochastic=True)

                # ndarray.item() replaces the deprecated np.asscalar.
                act = act.item()
                v_pred = v_pred.item()

                observations.append(obs)
                targets.append(target)
                actions.append(act)
                v_preds.append(v_pred)

                # The next state is read from env.s_t on the next pass.
                _, is_terminal, is_collided = env.step(act)

                if is_terminal:
                    success_num = success_num + 1
                    print(
                        "Congratz yoy just reach the terminal state which is:",
                        env.terminal_state_id)

                if is_collided:
                    print(
                        "Bad Luck your agent just collided couldn't made it  to the terminal state which is :",
                        env.terminal_state_id)

                # End the episode on success, collision, or after 100 steps.
                if is_terminal or is_collided or run_policy_steps == 100:
                    print("Number Of Exploration by the AGENT:",
                          run_policy_steps)
                    # The state after the last state is given value 0.
                    v_preds_next = v_preds[1:] + [0]
                    print(
                        "Environment is resetting after the collition/Terminal"
                    )
                    env.reset()
                    break

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=run_policy_steps)
                ]), iteration)

            if success_num >= 5000:
                saver.save(sess, args.savedir + '/model.ckpt')
                print('Clear!! Model saved.')
                break

            # Convert the episode buffers to arrays for the placeholders.
            observations = np.reshape(observations, newshape=[-1, 2048, 4])
            targets = np.reshape(targets, newshape=[-1, 2048, 4])
            actions = np.array(actions).astype(dtype=np.int32)

            # Train the discriminator.  The same indices select rows from the
            # expert and the agent arrays, so they must be valid for the
            # shorter of the two.
            Dis_input = [
                expert_observations, expert_targets, expert_actions,
                observations, targets, actions
            ]
            sample_limit = min(observations.shape[0],
                               expert_observations.shape[0])
            for i in range(100):
                sample_indices = np.random.randint(low=0,
                                                   high=sample_limit,
                                                   size=32)
                sampled_inp_D = [
                    np.take(a=a, indices=sample_indices, axis=0)
                    for a in Dis_input
                ]

                D.train(expert_s=sampled_inp_D[0],
                        expert_t=sampled_inp_D[1],
                        expert_a=sampled_inp_D[2],
                        agent_s=sampled_inp_D[3],
                        agent_t=sampled_inp_D[4],
                        agent_a=sampled_inp_D[5])

            # Discriminator rewards for every step of the agent trajectory.
            # (Author's idea: an RNN could provide per-time-step rewards.)
            d_rewards = D.get_rewards(agent_s=observations,
                                      agent_t=targets,
                                      agent_a=actions)
            d_rewards = np.reshape(d_rewards,
                                   newshape=[-1]).astype(dtype=np.float32)

            # Advantage estimates (left unnormalised, as in the original).
            gaes = PPO.get_gaes(rewards=d_rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # Train the policy on the discriminator rewards.
            inp = [
                observations, targets, actions, gaes, d_rewards, v_preds_next
            ]
            # Copy the current policy parameters into the old policy.
            PPO.assign_policy_parameters()
            for epoch in range(100):
                # Indices are drawn with replacement from [0, episode length).
                sample_indices = np.random.randint(low=0,
                                                   high=observations.shape[0],
                                                   size=32)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]

                PPO.train(state=sampled_inp[0],
                          targets=sampled_inp[1],
                          actions=sampled_inp[2],
                          gaes=sampled_inp[3],
                          rewards=sampled_inp[4],
                          v_preds_next=sampled_inp[5])

            # NOTE(review): PPO.train takes state=/targets= while get_summary
            # is called with obs=/target= -- confirm against PPOTrain that
            # these keyword names are correct.
            summary = PPO.get_summary(obs=inp[0],
                                      target=inp[1],
                                      actions=inp[2],
                                      gaes=inp[3],
                                      rewards=inp[4],
                                      v_preds_next=inp[5])

            writer.add_summary(summary, iteration)
        writer.close()
def main(args):
    """Run the GAIL loop on CartPole-v0 with ``Policy_net_quantum`` policies.

    Every iteration rolls out one episode, logs its length and summed
    environment reward, trains the discriminator twice on expert versus
    agent data, and computes advantages from the discriminator's rewards.
    The PPO policy update itself is disabled in this variant, so the policy
    parameters are never changed here.  The loop ends, after saving a
    checkpoint, once 100 consecutive episodes sum to at least 195.

    Args:
        args: Parsed options; this function reads ``gamma``, ``logdir``,
            ``savedir`` and ``iteration``.
    """
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net_quantum('policy', env, 32)
    Old_Policy = Policy_net_quantum('old_policy', env, 32)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        obs = env.reset()
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            # Environment rewards are only logged and used for the stopping
            # rule; they are not used to update the policy.
            rewards = []
            v_preds = []
            run_policy_steps = 0

            # Roll out one episode with the current policy.
            while True:
                run_policy_steps += 1
                # Add a leading batch axis before feeding the policy.
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                # ndarray.item() replaces the deprecated np.asscalar.
                act = act.item()
                v_pred = v_pred.item()
                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    # Unlike a fixed 0, the value after the last step is the
                    # policy's own estimate for the final observation.
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [v_pred.item()]
                    obs = env.reset()
                    break
                obs = next_obs

            print("Iteration: " + str(iteration))

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=run_policy_steps)
                ]), iteration)
            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_reward',
                                     simple_value=sum(rewards))
                ]), iteration)

            # Stop once 100 episodes in a row reach the reward threshold.
            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            # Convert the episode buffers to arrays for the placeholders.
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # Train the discriminator twice on expert versus agent data.
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # The discriminator's output is used as the reward.
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards,
                                   newshape=[-1]).astype(dtype=np.float32)

            # Advantage estimates (left unnormalised, as in the original).
            gaes = PPO.get_gaes(rewards=d_rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # NOTE: the policy update is intentionally left disabled here, as
            # in the original.  The usual sequence -- assign_policy_parameters,
            # six PPO.train calls on minibatches of 32 drawn from
            # [observations, actions, gaes, d_rewards, v_preds_next], then
            # PPO.get_summary and writer.add_summary -- is not executed.
        writer.close()
예제 #21
0
def main(args):
    """Train a policy with GAIL (PPO generator + discriminator) on ``myTGym``.

    Every iteration rolls out one full episode with the current policy,
    trains the discriminator on expert versus agent (state, action) pairs and
    then runs PPO updates that use the discriminator output as the reward.

    Args:
        args: Namespace providing ``gamma``, ``logdir``, ``savedir`` and
            ``iteration``.
    """
    env = myTGym(episode_type='0', percent_goal_profit=1, percent_stop_loss=1)
    obs = env.reset()
    action_space = np.array([0, 1])
    Policy = Policy_net('policy', env, action_space)
    Old_Policy = Policy_net('old_policy', env, action_space)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    expert_observations = pd.read_csv('trajectory/expert_obs/000520.csv', index_col=0)
    expert_actions = pd.read_csv('trajectory/expert_actions/action000520.csv', index_col=0)
    # Remap action label 2 to 0, then keep only the column named '0'.
    expert_actions = expert_actions.replace(2, 0)['0']

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # try/finally guarantees the event file is flushed and closed even if
        # the environment or a training step raises.
        try:
            sess.run(tf.global_variables_initializer())

            obs = env.reset()
            reward = 0  # do NOT use rewards to update policy
            success_num = 0

            for iteration in range(args.iteration):
                observations = []
                actions = []
                rewards = []
                v_preds = []
                run_policy_steps = 0
                # Roll out the policy until the environment reports `done`.
                while True:
                    run_policy_steps += 1
                    # Add a leading batch axis so the observation can be fed
                    # to the Policy.obs placeholder.
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=obs, stochastic=True)

                    # ndarray.item() replaces np.asscalar, which was removed
                    # in NumPy 1.23.
                    act = act.item()
                    v_pred = v_pred.item()

                    observations.append(obs)
                    actions.append(act)
                    # NOTE(review): `reward` here is the value produced by the
                    # previous step (or the 0 / -1 seed), because it is
                    # appended before env.step is called.
                    rewards.append(reward)
                    v_preds.append(v_pred)

                    next_obs, reward, done, info = env.step(act)

                    if done:
                        # next state of terminate state has 0 state value
                        v_preds_next = v_preds[1:] + [0]
                        obs = env.reset()
                        reward = -1
                        break
                    else:
                        obs = next_obs

                # Episode statistics are only logged every 10th iteration.
                if iteration % 10 == 0:
                    writer.add_summary(
                        tf.Summary(value=[tf.Summary.Value(tag='episode_length',
                                                           simple_value=run_policy_steps)]),
                        iteration)
                    writer.add_summary(
                        tf.Summary(value=[tf.Summary.Value(tag='episode_reward',
                                                           simple_value=sum(rewards))]),
                        iteration)

                if sum(rewards) >= 195:
                    success_num += 1
                    if success_num >= 100:
                        saver.save(sess, args.savedir + '/model.ckpt')
                        print('Clear!! Model saved.')
                else:
                    success_num = 0

                # convert list to numpy array for feeding tf.placeholder
                # (shape is passed positionally: the `newshape=` keyword is
                # deprecated in recent NumPy releases).
                observations = np.reshape(observations, [-1] + list(obs.shape))
                actions = np.array(actions).astype(dtype=np.int32)

                # train discriminator
                for _ in range(2):
                    D.train(expert_s=expert_observations,
                            expert_a=expert_actions,
                            agent_s=observations,
                            agent_a=actions)

                # output of this discriminator is the reward used by PPO
                d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
                d_rewards = np.reshape(d_rewards, [-1]).astype(dtype=np.float32)

                # Advantages are passed to PPO without normalisation.
                gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
                gaes = np.array(gaes).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

                # train policy
                inp = [observations, actions, gaes, d_rewards, v_preds_next]
                PPO.assign_policy_parameters()
                for _ in range(6):
                    # 32 indices drawn with replacement from [low, high)
                    sample_indices = np.random.randint(low=0,
                                                       high=observations.shape[0],
                                                       size=32)
                    # sample training data
                    sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          gaes=inp[2],
                                          rewards=inp[3],
                                          v_preds_next=inp[4])

                writer.add_summary(summary, iteration)
        finally:
            writer.close()
def main(args):
    """Train a PPO agent on the gym environment named by ``args.env``.

    Log and checkpoint directories are created under
    ``<logdir>/<env>/<optimizer>/lr_<lr>`` and ``<savedir>/<env>/<optimizer>``
    and ``args.logdir`` / ``args.savedir`` are rewritten to those paths.
    Each iteration rolls out one full episode and performs six PPO updates on
    random mini-batches; the model is saved on the final iteration.

    Args:
        args: Namespace providing ``logdir``, ``savedir``, ``env``,
            ``optimizer``, ``lr``, ``gamma`` and ``iteration``.
    """
    # init directories
    # os.makedirs creates every missing level at once (including parents of
    # args.logdir / args.savedir) and is a no-op when the directory exists.
    args.logdir = args.logdir + '/' + args.env + '/' + args.optimizer + '/lr_' + str(args.lr)
    os.makedirs(args.logdir, exist_ok=True)
    args.savedir = args.savedir + '/' + args.env + '/' + args.optimizer
    os.makedirs(args.savedir, exist_ok=True)

    # init classes
    env = gym.make(args.env)
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env, args.env)
    Old_Policy = Policy_net('old_policy', env, args.env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma, _optimizer=args.optimizer, _lr=args.lr)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # try/finally guarantees the event file is flushed and closed even if
        # the environment or a training step raises.
        try:
            sess.run(tf.global_variables_initializer())
            obs = env.reset()
            reward = 0

            for iteration in range(args.iteration):
                observations = []
                actions = []
                v_preds = []
                rewards = []
                episode_length = 0
                # Roll out the policy until the environment reports `done`.
                while True:
                    episode_length += 1
                    # Add a leading batch axis so the observation can be fed
                    # to the Policy.obs placeholder.
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=obs, stochastic=True)

                    # ndarray.item() replaces np.asscalar, which was removed
                    # in NumPy 1.23.
                    act = act.item()
                    v_pred = v_pred.item()

                    observations.append(obs)
                    actions.append(act)
                    v_preds.append(v_pred)
                    # NOTE(review): `reward` here is the value produced by the
                    # previous step (or the 0 / -1 seed), because it is
                    # appended before env.step is called.
                    rewards.append(reward)

                    next_obs, reward, done, info = env.step(act)

                    if done:
                        # next state of terminate state has 0 state value
                        v_preds_next = v_preds[1:] + [0]
                        obs = env.reset()
                        reward = -1
                        break
                    else:
                        obs = next_obs

                writer.add_summary(
                    tf.Summary(value=[tf.Summary.Value(tag='episode_length',
                                                       simple_value=episode_length)]),
                    iteration)
                writer.add_summary(
                    tf.Summary(value=[tf.Summary.Value(tag='episode_reward',
                                                       simple_value=sum(rewards))]),
                    iteration)

                # The last iteration only checkpoints; no further update runs.
                if iteration == (args.iteration - 1):
                    saver.save(sess, args.savedir + '/model' + str(args.lr) + '.ckpt')
                    print('Clear!! Model saved.')
                    break

                gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)

                # convert list to numpy array for feeding tf.placeholder
                # (shape is passed positionally: the `newshape=` keyword is
                # deprecated in recent NumPy releases).
                observations = np.reshape(observations, [-1] + list(ob_space.shape))
                actions = np.array(actions).astype(dtype=np.int32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                # Standardise the advantages. When every advantage is equal
                # (e.g. a one-step episode) the standard deviation is 0 and
                # dividing would produce NaN, so only centre in that case.
                gaes_std = gaes.std()
                gaes = gaes - gaes.mean()
                if gaes_std > 0:
                    gaes = gaes / gaes_std
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

                PPO.assign_policy_parameters()

                inp = [observations, actions, gaes, rewards, v_preds_next]

                print('iteration:', iteration, ',rewards:', sum(rewards))

                # train
                for _ in range(6):
                    # 32 indices drawn with replacement from [low, high)
                    sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32)
                    # sample training data
                    sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          gaes=inp[2],
                                          rewards=inp[3],
                                          v_preds_next=inp[4])

                writer.add_summary(summary, iteration)
        finally:
            writer.close()
예제 #23
0
def main(args):
    """Train a PPO agent on ``CartPole-v0`` up to a chosen skill level.

    ``args.train_level`` selects the reward threshold and the output
    directories ('expert' -> 195, 'med' -> 100). Training stops and the model
    is saved once the episode reward has reached the threshold in 100
    consecutive iterations.

    Args:
        args: Namespace providing ``gamma``, ``train_level`` and
            ``iteration``.

    Raises:
        SystemExit: With status 1 when ``args.train_level`` is not recognised.
    """
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    saver = tf.train.Saver()
    tl = args.train_level

    if tl == 'expert':
        threshold = 195
        savedir = 'trained_models/ppo/expert'
        logdir = 'log/train/ppo/expert/'
    elif tl == 'med':
        threshold = 100
        savedir = 'trained_models/ppo/med'
        logdir = 'log/train/ppo/med/'
    else:
        print("[run_ppo.py] Error: Unrecognized train level: {}".format(tl))
        # The `exit` builtin is injected by the `site` module and is not
        # guaranteed to exist; raising SystemExit is the equivalent that
        # always works.
        raise SystemExit(1)

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(logdir, sess.graph)
        # try/finally guarantees the event file is flushed and closed even if
        # the environment or a training step raises.
        try:
            sess.run(tf.global_variables_initializer())
            obs = env.reset()
            success_num = 0

            for iteration in range(args.iteration):
                observations = []
                actions = []
                rewards = []
                v_preds = []
                episode_length = 0
                # Roll out the policy until the environment reports `done`.
                while True:
                    episode_length += 1
                    # Add a leading batch axis so the observation can be fed
                    # to the Policy.obs placeholder.
                    obs = np.stack([obs]).astype(dtype=np.float32)
                    act, v_pred = Policy.act(obs=obs, stochastic=True)

                    # ndarray.item() replaces np.asscalar, which was removed
                    # in NumPy 1.23.
                    act = act.item()
                    v_pred = v_pred.item()

                    next_obs, reward, done, info = env.step(act)

                    observations.append(obs)
                    actions.append(act)
                    rewards.append(reward)
                    v_preds.append(v_pred)

                    if done:
                        # Use the value prediction of the final observation as
                        # the last entry of v_preds_next.
                        next_obs = np.stack([next_obs]).astype(dtype=np.float32)
                        _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                        v_preds_next = v_preds[1:] + [v_pred.item()]
                        obs = env.reset()
                        break
                    else:
                        obs = next_obs

                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_length',
                                         simple_value=episode_length)
                    ]), iteration)
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='episode_reward',
                                         simple_value=sum(rewards))
                    ]), iteration)

                # Count consecutive episodes at or above the threshold; any
                # episode below it resets the streak.
                if sum(rewards) >= threshold:
                    success_num += 1
                    if success_num >= 100:
                        saver.save(sess, savedir + '/model.ckpt')
                        print('Clear!! Model saved.')
                        break
                else:
                    success_num = 0

                print("Iteration: {}, Rewards: {}".format(iteration, sum(rewards)),
                      end='\r')
                gaes = PPO.get_gaes(rewards=rewards,
                                    v_preds=v_preds,
                                    v_preds_next=v_preds_next)

                # convert list to numpy array for feeding tf.placeholder
                # (shape is passed positionally: the `newshape=` keyword is
                # deprecated in recent NumPy releases).
                observations = np.reshape(observations, (-1, ) + ob_space.shape)
                actions = np.array(actions).astype(dtype=np.int32)
                gaes = np.array(gaes).astype(dtype=np.float32)
                # Standardise the advantages. When every advantage is equal
                # (e.g. a one-step episode) the standard deviation is 0 and
                # dividing would produce NaN, so only centre in that case.
                gaes_std = gaes.std()
                gaes = gaes - gaes.mean()
                if gaes_std > 0:
                    gaes = gaes / gaes_std
                rewards = np.array(rewards).astype(dtype=np.float32)
                v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

                PPO.assign_policy_parameters()

                inp = [observations, actions, gaes, rewards, v_preds_next]

                # train
                for _ in range(6):
                    # 32 indices drawn with replacement from [low, high)
                    sample_indices = np.random.randint(low=0,
                                                       high=observations.shape[0],
                                                       size=32)
                    # sample training data
                    sampled_inp = [
                        np.take(a=a, indices=sample_indices, axis=0) for a in inp
                    ]
                    PPO.train(obs=sampled_inp[0],
                              actions=sampled_inp[1],
                              gaes=sampled_inp[2],
                              rewards=sampled_inp[3],
                              v_preds_next=sampled_inp[4])

                summary = PPO.get_summary(obs=inp[0],
                                          actions=inp[1],
                                          gaes=inp[2],
                                          rewards=inp[3],
                                          v_preds_next=inp[4])

                writer.add_summary(summary, iteration)
        finally:
            writer.close()