def playonce(remote_env):
    from multi import fastenv
    fenv = fastenv(remote_env, 2)
    # print(type(agent))
    agent.play(fenv)
    remote_env.rel()
    del fenv

def playonce(nl, remote_env):
    from multi import fastenv
    fenv = fastenv(remote_env, 2)
    # print(type(agent))
    agent.play(fenv, realtime=False, max_steps=-1, noise_level=nl)  # keyword fixed: was noise_Level
    remote_env.rel()
    del fenv

def playonce(nl, env):
    from multi import fastenv
    # global noise_level
    # env = farmer.acq_env()
    fenv = fastenv(env, 4)
    agent.play(fenv, realtime=False, max_steps=-1, noise_level=nl)
    # epl.rel_env(env)
    env.rel()
    del fenv

def test(skip=1):
    # e = p.env
    te = RunEnv(visualize=True, max_obstacles=10)
    from multi import fastenv
    fenv = fastenv(te, skip)  # skip is the frame-skip factor
    agent.render = True
    try:
        agent.play(fenv, realtime=True, max_steps=-1, noise_level=1e-11)
    except:
        pass
    finally:
        del te

def get_env(self):
    # obtain a new environment on demand
    global farmer
    while 1:
        remote_env = farmer.acq_env()
        if remote_env == False:  # no free environment
            time.sleep(0.1)
        else:
            if hasattr(self, 'remote_env'):
                del self.remote_env  # release the previous one before allocating a new one
            self.remote_env = remote_env
            from multi import fastenv
            fenv = fastenv(remote_env, 2)  # a skip of 2; also performs observation processing
            return fenv

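# ---------------------------------------------------------------------------
# Note: the snippets above and below all wrap their environment in
# `fastenv(env, skip)` from `multi.py`, which is not included here. Judging
# from the comments ("skip factor", "also performs observation processing")
# and from `args.action_repeat` in the training loops, it behaves like an
# action-repeat wrapper. The sketch below is only an assumption about its
# shape, not the real implementation; `FastEnvSketch` and
# `process_observation` are hypothetical names.

import numpy as np


def process_observation(obs):
    # placeholder for whatever feature extraction the real wrapper performs
    return np.asarray(obs, dtype=np.float32)


class FastEnvSketch:
    """Repeat each action `skip` times, sum the rewards, post-process observations."""

    def __init__(self, env, skip):
        self.env = env
        self.skip = skip

    def reset(self):
        return process_observation(self.env.reset())

    def step(self, action):
        total_reward = 0.0
        obs, done, info = None, False, {}
        for _ in range(self.skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return process_observation(obs), total_reward, done, info
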
def test(skip=1):
    # e = p.env
    te = RunEnv(visualize=False)
    from multi import fastenv
    fenv = fastenv(te, skip)  # skip is the frame-skip factor
    agent.render = True
    agent.training = False
    try:
        # print('playing')
        # agent.play(fenv, realtime=True, max_steps=-1, noise_level=1e-11)
        playifavailable(0)
    except:
        pass
    finally:
        del te

def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis, args.atari)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    validate_episodes = args.validate_episodes
    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)
    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    while step <= num_iterations:
        # reset at the start of an episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation, args.pic)
            agent.reset(observation)
        # agent picks an action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            # print("observation shape:", observation.shape)
            action = agent.select_action(observation, noise_level=noise_level)
        # env responds with next_observation, reward, terminate_info
        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation, args.pic)
        # print("observation = ", observation)
        # print("reward = ", reward)
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length and max_episode_length)):
            # end of episode
            # [optional] save
            # if args.env == "Paint":
            #     writer.add_image(str(episode) + '.png', env.canvas)
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(fenv, agent.select_action, debug=debug, visualize=False)
                    if debug:
                        prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                            step, np.mean(validate_reward), np.var(validate_reward)))
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    writer.add_image(str(step) + '.png', env.canvas)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'
                        .format(episode, episode_reward, step, noise_level,
                                train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
    # save the final model once training finishes
    if debug:
        prRed('[Save model] #{}'.format(save_num))
    agent.save_model(output, save_num)

def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis, args.atari)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    validate_episodes = args.validate_episodes
    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)
    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    while step <= num_iterations:
        # reset at the start of an episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation, args.pic)
            agent.reset(observation)
        # agent picks an action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            # print("observation shape:", observation.shape)
            action = agent.select_action(observation, noise_level=noise_level)
        # env responds with next_observation, reward, terminate_info
        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation, args.pic)
        # print("observation = ", observation)
        # print("reward = ", reward)
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length and max_episode_length)):
            # end of episode
            # [optional] save
            # if args.env == "Paint":
            #     writer.add_image(str(episode) + '.png', env.canvas)
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(fenv, agent.select_action, debug=debug, visualize=False)
                    if debug:
                        prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                            step, np.mean(validate_reward), np.var(validate_reward)))
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    if args.env == "Paint":
                        writer.add_image(str(step) + '.png', env.canvas)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'
                        .format(episode, episode_reward, step, noise_level,
                                train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
    # save the final model once training finishes
    if debug:
        prRed('[Save model] #{}'.format(save_num))
    agent.save_model(output, save_num)

env = NormalizedEnv(gym.make(args.env))

# input random seed
if args.seed > 0:
    np.random.seed(args.seed)
    env.seed(args.seed)

# input states count & actions count
print(env.observation_space.shape, env.action_space.shape)
nb_states = env.observation_space.shape[0]
if args.discrete:
    nb_actions = env.action_space.n
else:
    nb_actions = env.action_space.shape[0]

env = fastenv(env, args.action_repeat, args.vis)

agent = DDPG(nb_states, nb_actions, args, args.discrete, args.cuda)
evaluate = Evaluator(args.validate_episodes,
                     max_episode_length=args.max_episode_length)

if args.vis and args.env == 'HalfCheetahBulletEnv-v0':
    env.render()

if args.test is False:
    train(args.train_iter, agent, env, evaluate,
          args.validate_interval, args.output, args.window_length,
          max_episode_length=args.max_episode_length, debug=args.debug,
          visualize=args.vis, traintimes=args.traintimes, resume=args.resume)
else:
    test(args.validate_episodes, agent, env, evaluate, args.resume,
         args.window_length, visualize=True, debug=args.debug)

def playonce(self, env, T):
    from multi import fastenv
    fenv = fastenv(env, 4)
    self.play(fenv, T)
    env.rel()
    del fenv

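# The `playonce`/`get_env` helpers above acquire environments from a global
# `farmer` (`farmer.acq_env()` returns False when no environment is free) and
# hand them back with `.rel()`. The farming code itself is not shown here; the
# sketch below only illustrates that acquire/release pattern with a simple
# in-process pool. `FarmerSketch` and `PooledEnv` are hypothetical names, not
# the actual implementation.

import threading


class PooledEnv:
    """Hypothetical handle returned by the farmer; .rel() returns it to the pool."""

    def __init__(self, pool, env):
        self._pool = pool
        self.env = env

    def reset(self):
        return self.env.reset()

    def step(self, action):
        return self.env.step(action)

    def rel(self):
        self._pool.release(self)


class FarmerSketch:
    """Hands out at most `capacity` environments; acq_env() returns False when none are free."""

    def __init__(self, make_env, capacity):
        self._lock = threading.Lock()
        self._free = [PooledEnv(self, make_env()) for _ in range(capacity)]

    def acq_env(self):
        with self._lock:
            return self._free.pop() if self._free else False

    def release(self, handle):
        with self._lock:
            self._free.append(handle)
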
def train(num_iterations, agent, env):
    fenv = fastenv(env, args.action_repeat)
    window_length = args.window_length
    save_interval = args.save_interval
    debug = args.debug
    output = args.output
    time_stamp = 0.
    log = 0
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = args.noise_level * random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    while step <= num_iterations:
        # reset at the start of an episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation)
            agent.reset(observation)
        # agent picks an action ...
        if step <= args.warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
        # print('step = ', step)
        # env responds with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done:
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug:
                        prRed('[Save model] #{} in {}'.format(save_num, args.output))
                    agent.save_model(output, save_num)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} real noise_level:{:.2f} interval_time:{:.2f} train_time:{:.2f}'
                        .format(episode, episode_reward, step, noise_level,
                                train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = args.noise_level * random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

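# The training loops in this listing build their network input through
# `queue()` / `episode_memory.getObservation(window_length, observation[, pic])`,
# which is not defined here. It evidently keeps recent observations and returns
# the last `window_length` of them as one stacked input. Below is a minimal
# sketch under that assumption; `ObservationQueue` is a hypothetical name and
# the image (`pic`) branch of the real helper is not reproduced.

from collections import deque

import numpy as np


class ObservationQueue:
    """Keeps recent observations; getObservation() returns the last
    `window_length` of them concatenated, padding a fresh episode with
    copies of its first frame."""

    def __init__(self):
        self.frames = deque()

    def clear(self):
        self.frames.clear()

    def append(self, observation):
        self.frames.append(np.asarray(observation, dtype=np.float32))

    def getObservation(self, window_length, observation, pic=False):
        # `observation` has already been append()ed by the caller, so it is
        # not used again here; `pic` would select image stacking in the real code.
        frames = list(self.frames)[-window_length:]
        while len(frames) < window_length:
            frames.insert(0, frames[0])  # pad the start of an episode
        return np.concatenate(frames, axis=0)
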
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    ace = args.ace
    validate_episodes = args.validate_episodes

    # [optional] Actor-Critic Ensemble, https://arxiv.org/pdf/1712.08987.pdf
    if ace != 1:
        ensemble = ACE(nb_status, nb_actions, args)
    if resume is not None:
        print('load weight')
        if ace != 1:
            ensemble.load(output)
        agent.load_weights(output)
        agent.memory.load(output)

    def sigint_handler(signum, frame):
        print('memory saving...')
        agent.memory.save(output)
        agent.save_model(output, 0)
        print('done')
        exit()

    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    while step <= num_iterations:
        # reset at the start of an episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation, args.pic)
            agent.reset(observation)
        # agent picks an action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
        # print('step = ', step)
        # env responds with next_observation, reward, terminate_info
        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation, args.pic)
        # print("observation shape = ", np.shape(observation))
        # print("observation = ", observation)
        # print("reward = ", reward)
        # exit()
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length and max_episode_length)):
            # end of episode
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug:
                        prRed('[Save model] #{}'.format(save_num))
                    agent.save_model(output, save_num)
                    if ace != 1:
                        ensemble.append(output, save_num)
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(env, agent.select_action, debug=debug, visualize=False)
                    if debug:
                        prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                            step, np.mean(validate_reward), np.var(validate_reward)))
                    if ace != 1 and save_num >= 1:
                        validate_reward2 = evaluate(env, ensemble, debug=debug, visualize=False)
                        if debug:
                            prRed('ACE Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                                step, np.mean(validate_reward2), np.var(validate_reward2)))
                    # for i in range(validate_episodes):
                    #     validate_num += 1
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    if ace != 1 and save_num >= 1:
                        writer.add_scalar('validate/ACE_reward', np.mean(validate_reward2), step)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'
                        .format(episode, episode_reward, step, noise_level,
                                train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
    # also save memory and model on normal exit
    sigint_handler(0, 0)

def real_test(skip=1):
    def obg(plain_obs):
        # observation generator
        # derivatives of observations extracted here.
        # print('pg multi.py 21, plain_obs:', len(plain_obs))
        # processed_observation, self.old_observation = go(plain_obs, self.old_observation, step=self.stepcount)
        observation = plain_obs
        obs = []
        obs.extend(observation['misc']['mass_center_pos'])  # x, y, z
        obs.extend(observation['misc']['mass_center_vel'])  # x, y, z
        obs.extend(observation['misc']['mass_center_acc'])  # x, y, z
        # joint body, positions and vels relative to pelvis
        # Absolute Joint Positions
        obs.extend(observation['joint_pos']['ground_pelvis'])
        obs.extend(observation['joint_pos']['hip_r'])
        obs.extend(observation['joint_pos']['knee_r'])
        obs.extend(observation['joint_pos']['ankle_r'])
        obs.extend(observation['joint_pos']['hip_l'])
        obs.extend(observation['joint_pos']['knee_l'])
        obs.extend(observation['joint_pos']['ankle_l'])
        '''
        # Relative Joint Positions
        # print(observation['joint_pos']['ground_pelvis'])
        obs.extend(observation['joint_pos']['ground_pelvis'])  # 6 elements
        # print(rel_to_A(observation['joint_pos']['hip_r'], observation['body_pos']['pelvis']))
        obs.extend(rel_to_A(observation['joint_pos']['hip_r'], observation['body_pos']['pelvis']))  # 3e
        obs.extend(rel_to_A(observation['joint_pos']['knee_r'], observation['body_pos']['pelvis']))  # 1e
        obs.extend(rel_to_A(observation['joint_pos']['ankle_r'], observation['body_pos']['pelvis']))  # 1e
        obs.extend(rel_to_A(observation['joint_pos']['hip_l'], observation['body_pos']['pelvis']))  # 3e
        obs.extend(rel_to_A(observation['joint_pos']['knee_l'], observation['body_pos']['pelvis']))  # 1e
        obs.extend(rel_to_A(observation['joint_pos']['ankle_l'], observation['body_pos']['pelvis']))  # 1e
        '''
        # Absolute Joint Vel
        obs.extend(observation['joint_vel']['ground_pelvis'])
        obs.extend(observation['joint_vel']['hip_r'])
        obs.extend(observation['joint_vel']['knee_r'])
        obs.extend(observation['joint_vel']['ankle_r'])
        obs.extend(observation['joint_vel']['hip_l'])
        obs.extend(observation['joint_vel']['knee_l'])
        obs.extend(observation['joint_vel']['ankle_l'])
        # Absolute Joint Acc
        obs.extend(observation['joint_acc']['ground_pelvis'])
        obs.extend(observation['joint_acc']['hip_r'])
        obs.extend(observation['joint_acc']['knee_r'])
        obs.extend(observation['joint_acc']['ankle_r'])
        obs.extend(observation['joint_acc']['hip_l'])
        obs.extend(observation['joint_acc']['knee_l'])
        obs.extend(observation['joint_acc']['ankle_l'])
        # body-part kinematics, flattened in a fixed order
        b = ['body_pos', 'body_vel', 'body_acc',
             'body_pos_rot', 'body_vel_rot', 'body_acc_rot']
        parts = ['pelvis', 'femur_r', 'pros_tibia_r', 'pros_foot_r', 'femur_l',
                 'tibia_l', 'talus_l', 'calcn_l', 'toes_l', 'torso', 'head']
        for i in b:
            for j in parts:
                obs.extend(observation[i][j])
        forces_subkeys = observation['forces'].keys()
        for k in forces_subkeys:
            obs.extend(observation['forces'][k])
        # print('pg multi.py 25, proc_obs:', len(processed_observation))
        return np.array(obs)

    import opensim as osim
    from osim.env import ProstheticsEnv as RunEnv
    # te = RunEnv(visualize=False)
    from multi import fastenv
    # env = fastenv(te, skip)
    remote_env = farmer.acq_env()
    env = fastenv(remote_env, skip)
    observation = env.reset()
    # print(observation)
    stepno = 0
    epino = 0
    total_reward = 0
    old_observation = None
    while True:
        proc_observation = observation
        a = [float(i) for i in list(agent.act(proc_observation)[0])]
        # print(a)
        observation, reward, done, info, real_reward = env.step(a)
        stepno += 1
        total_reward += reward
        print('step', stepno, 'total reward', total_reward)
        if done:
            print('>>>>>>>episode', epino, ' DONE after', stepno,
                  'got_reward', total_reward)
            break
