params.log_dir = "../logs/logs/{}".format(params.env_name) params.model_dir = "../logs/models/{}".format(params.env_name) env = gym.make(params.env_name) # set seed env.seed(params.seed) tf.compat.v1.random.set_random_seed(params.seed) agent = DDPG(Actor, Critic, env.action_space.shape[0], params) replay_buffer = ReplayBuffer(params.memory_size) reward_buffer = deque(maxlen=params.reward_buffer_ep) summary_writer = tf.contrib.summary.create_file_writer(params.log_dir) init_state = env.reset() # reset agent.predict(init_state) # burn the format of the input matrix to get the weight matrices!! gp_model, update = create_bayes_net() optimiser = tf.compat.v1.train.AdamOptimizer() num_sample = 100 # number of sampling get_ready(agent.params) global_timestep = tf.compat.v1.train.get_or_create_global_step() time_buffer = deque(maxlen=agent.params.reward_buffer_ep) log = logger(agent.params) with summary_writer.as_default(): # for summary purpose, we put all codes in this context with tf.contrib.summary.always_record_summaries(): policies, scores = list(), list() for i in itertools.count():
with summary_writer.as_default():
    # keep all the training code in this context so summaries are recorded
    with tf.contrib.summary.always_record_summaries():
        policies, scores = list(), list()
        for i in itertools.count():
            state = env.reset()
            traj = list()  # states visited in this episode
            total_reward = 0
            self_rewards = 0
            start = time.time()
            agent.random_process.reset_states()
            done = False
            episode_len = 0
            while not done:
                traj.append(state)

                # act randomly until enough steps have been collected to start learning
                if global_timestep.numpy() < agent.params.learning_start:
                    action = env.action_space.sample()
                else:
                    action = agent.predict(state)

                next_state, reward, done, info = env.step(np.clip(action, -1.0, 1.0))
                replay_buffer.add(state, action, reward, next_state, done)

                """
                === Update the models ===
                """
                if global_timestep.numpy() > agent.params.learning_start:
                    states, actions, rewards, next_states, dones = replay_buffer.sample(agent.params.batch_size)
                    loss = agent.update(states, actions, rewards, next_states, dones)
                    soft_target_model_update_eager(agent.target_actor, agent.actor,
                                                   tau=agent.params.soft_update_tau)
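                    # --------------------------------------------------------------
                    # `soft_target_model_update_eager` is a project utility; as an
                    # illustration only (assuming the actor/target models expose
                    # aligned `variables` lists, Keras-style), the standard Polyak
                    # averaging it performs could be sketched as:
                    #
                    #   def soft_update(target, source, tau):
                    #       # w_target <- tau * w_source + (1 - tau) * w_target
                    #       for t_var, s_var in zip(target.variables, source.variables):
                    #           t_var.assign(tau * s_var + (1.0 - tau) * t_var)
                    # --------------------------------------------------------------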