def play_func(params, net, cuda, exp_queue):
    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    device = torch.device("cuda" if cuda else "cpu")
    writer = SummaryWriter(comment="-" + params.run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma, steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0
    with common.RewardTracker(writer, params.stop_reward) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break
    exp_queue.put(None)
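# The play_func variants in this section run in a child process and push
# experience through exp_queue to the training process. A minimal sketch of
# the consumer side is shown below, assuming torch.multiprocessing; the names
# BATCH_SIZE and train_on_batch are hypothetical placeholders, not from the
# original source.
import torch.multiprocessing as mp

def train_loop(params, net, cuda):
    exp_queue = mp.Queue(maxsize=2)
    play_proc = mp.Process(target=play_func,
                           args=(params, net, cuda, exp_queue))
    play_proc.start()

    batch = []
    while play_proc.is_alive():
        exp = exp_queue.get()
        if exp is None:  # sentinel: the play process reached stop_reward
            break
        batch.append(exp)
        if len(batch) >= BATCH_SIZE:   # BATCH_SIZE: hypothetical constant
            train_on_batch(batch)      # hypothetical training step
            batch.clear()

    play_proc.join()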
def play_func(params, net, cuda, exp_queue):
    env = make_env(params)
    writer = SummaryWriter(comment="-" + params['run_name'] + "-05_new_wrappers")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break
    exp_queue.put(None)
def play_func(params, net, cuda, fsa, exp_queue, fsa_nvec=None):
    device = torch.device("cuda" if cuda else "cpu")
    env = make_env(params)
    writer = SummaryWriter(comment="-" + params['run_name'] + "-05_new_wrappers")

    if not fsa:
        selector = ptan.actions.EpsilonGreedyActionSelector(
            epsilon=params['epsilon_start'])
        epsilon_tracker = common.EpsilonTracker(selector, params)
        agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa)
    else:
        if 'Index' in net.__class__.__name__:
            # Indexed networks keep a separate epsilon per FSA state
            selector = ptan.actions.EpsilonGreedyActionSelectorFsa(
                fsa_nvec, epsilon=params['epsilon_start'])
            epsilon_tracker = common.IndexedEpsilonTracker(selector, params, fsa_nvec)
            agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa,
                                        epsilon_tracker=epsilon_tracker)
        else:
            selector = ptan.actions.EpsilonGreedyActionSelector(
                epsilon=params['epsilon_start'])
            epsilon_tracker = common.EpsilonTracker(selector, params)
            agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa)
            # epsilon_tracker = common.IndexedEpsilonTrackerNoStates(selector, params, fsa_nvec)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    frame_idx = 0
    with common.RewardTracker(writer, params['stop_reward'], params['telemetry'],
                              params['plot']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            if not fsa or 'Index' not in net.__class__.__name__:
                epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            new_scores = exp_source.pop_total_scores()
            if new_rewards:
                new_score = [] if not new_scores else new_scores[0]
                if not fsa or 'Index' not in net.__class__.__name__:
                    if reward_tracker.reward(new_rewards[0], new_score, frame_idx,
                                             selector.epsilon, params['plot']):
                        break
                else:
                    # Indexed selectors expose a per-state epsilon dict, not a scalar
                    if reward_tracker.reward(new_rewards[0], new_score, frame_idx,
                                             selector.epsilon_dict, params['plot']):
                        break
    exp_queue.put(None)
def play_func(params, net, cuda, exp_queue):
    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    env.seed(common.SEED)
    device = torch.device("cuda" if cuda else "cpu")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params.gamma)

    for frame_idx, exp in enumerate(exp_source):
        epsilon_tracker.frame(frame_idx / BATCH_MUL)
        exp_queue.put(exp)
        for reward, steps in exp_source.pop_rewards_steps():
            exp_queue.put(EpisodeEnded(reward, steps, selector.epsilon))
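# EpisodeEnded is referenced above but not defined in this snippet. A minimal
# sketch of a compatible record and of how the training process might tell it
# apart from ordinary experience items (an assumption, not the original code):
import collections

EpisodeEnded = collections.namedtuple(
    "EpisodeEnded", ("reward", "steps", "epsilon"))

def drain_queue(exp_queue, buffer, tracker, frame_idx):
    # Pull everything currently queued; route episode-end records to the
    # reward tracker and real transitions to the replay buffer.
    while not exp_queue.empty():
        exp = exp_queue.get()
        if isinstance(exp, EpisodeEnded):
            tracker.reward(exp.reward, frame_idx, exp.epsilon)
        else:
            buffer._add(exp)  # ptan buffers expose _add() for single items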
def play_func(params, net, cuda, exp_queue, device_id):
    """
    The paper suggests sampling actions from the learner net, so this needs
    only small changes relative to the multi-env implementation.

    *** The envs are reinitialized inside this function on purpose: it runs
    in a child process, and the envs must be created there for the
    parallelization to work. ***
    """
    run_name = params['run_name']
    max_games = params.get('max_games', 16000)

    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)
    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)
    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")
    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ExperienceSourceFirstLast_AM(
        [envSI, envDA], agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('mimic_models/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    mean_rewards = []
    # Initialized here so the final save below is safe even if no episode finished
    num_games, mean_reward, epsilon_str = 0, 0.0, ''
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % 500 == 0):
                    # write to disk
                    print("Saving model...")
                    model_name = 'mimic_models/{}_{}.pth'.format(run_name, game_idx)
                    net.to(torch.device('cpu'))
                    torch.save(net, model_name)
                    net.to(device)
                    new_row = [model_name, num_games, mean_reward, epsilon_str]
                    out_csv.writerow(new_row)
                    np.savetxt('mimic_models/{}_reward.txt'.format(run_name),
                               np.array(mean_rewards))
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'mimic_models/{}_{}.pth'.format(run_name, game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('mimic_models/{}_reward.txt'.format(run_name),
               np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('mimic_models/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()
    exp_queue.put(None)
action="store_true", help="Enable double dqn") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params.env_name) env = ptan.common.wrappers.wrap_dqn(env) env.seed(common.SEED) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params.epsilon_start) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params.replay_size) optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) def process_batch(engine, batch): optimizer.zero_grad() loss_v = calc_loss_double_dqn(batch, net, tgt_net.target_model, gamma=params.gamma,
def play_func(params, net, cuda, exp_queue, device_id):
    env_name = params['env_name']
    run_name = params['run_name']
    max_games = params.get('max_games', 16000)
    save_iter = params.get('save_iter', 500)

    env = gym.make(env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")
    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('models/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    mean_rewards = []
    best_reward = 0
    # Initialized here so the final save below is safe even if no episode finished
    num_games, mean_reward, epsilon_str = 0, 0.0, ''
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % save_iter == 0):
                    # write to disk
                    np.savetxt('models/{}_reward.txt'.format(run_name),
                               np.array(mean_rewards))
                    # Only checkpoint when the mean reward improved
                    if mean_reward > best_reward:
                        print("Saving model...")
                        model_name = 'models/{}_{}.pth'.format(run_name, game_idx)
                        torch.save(net, model_name)
                        new_row = [model_name, num_games, mean_reward, epsilon_str]
                        out_csv.writerow(new_row)
                        best_reward = mean_reward
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'models/{}_{}.pth'.format(run_name, game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('models/{}_reward.txt'.format(run_name), np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('models/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()
    exp_queue.put(None)
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")

env = gym.make(params['env_name'])
env = ptan.common.wrappers.wrap_dqn(env)

writer = SummaryWriter(comment="-" + params['run_name'] + "-basic")
net = dqn_model.DQN(env.observation_space.shape,
                    env.action_space.n).to(device)
# Target network (a copy of net, synchronized from time to time)
tgt_net = ptan.agent.TargetNet(net)
# Epsilon-greedy selector of actions
selector = ptan.actions.EpsilonGreedyActionSelector(
    epsilon=params['epsilon_start'])
# Schedules epsilon according to the current frame number
epsilon_tracker = common.EpsilonTracker(selector, params)
# Agent class combining the Q-network and the selector
agent = ptan.agent.DQNAgent(net, selector, device=device)
# Generates tuples from the environment in the form (s, a, r, s')
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params['gamma'], steps_count=1)
# Buffer of experiences for experience replay
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params['replay_size'])
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

frame_idx = 0
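# common.EpsilonTracker is used throughout these snippets but never shown.
# A minimal sketch of the usual implementation (linear decay of epsilon from
# epsilon_start to epsilon_final over epsilon_frames); the real common module
# may differ:
class EpsilonTracker:
    def __init__(self, epsilon_greedy_selector, params):
        self.selector = epsilon_greedy_selector
        self.epsilon_start = params['epsilon_start']
        self.epsilon_final = params['epsilon_final']
        self.epsilon_frames = params['epsilon_frames']
        self.frame(0)

    def frame(self, frame_idx):
        # Linearly anneal epsilon, clamped at epsilon_final
        self.selector.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - frame_idx / self.epsilon_frames)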
net_deer = model.DQNModel(
    deer_obs.spaces[0].shape, deer_obs.spaces[1].shape,
    m_env.get_action_space(deer_handle)[0]).to(device)
tgt_net_deer = ptan.agent.TargetNet(net_deer)
print(net_deer)

net_tiger = model.DQNModel(
    tiger_obs.spaces[0].shape, tiger_obs.spaces[1].shape,
    m_env.get_action_space(tiger_handle)[0]).to(device)
tgt_net_tiger = ptan.agent.TargetNet(net_tiger)
print(net_tiger)

action_selector = ptan.actions.EpsilonGreedyActionSelector(
    epsilon=PARAMS.epsilon_start)
epsilon_tracker = common.EpsilonTracker(action_selector, PARAMS)
preproc = model.MAgentPreprocessor(device)
agent = model.GroupDQNAgent([net_deer, net_tiger], action_selector,
                            device, preprocessor=preproc)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, PARAMS.gamma, vectorized=True)
deer_buffer = ptan.experience.ExperienceReplayBuffer(
    None, PARAMS.replay_size)
tiger_buffer = ptan.experience.ExperienceReplayBuffer(
    None, PARAMS.replay_size)
deer_optimizer = optim.Adam(net_deer.parameters(), lr=PARAMS.learning_rate)
def main():
    global params_save_file
    game = 'spaceinvaders'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]

    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay")

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.PrioritizedReplayBuffer(
        exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)
            # Anneal the importance-sampling exponent beta towards 1.0
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon, last_dq_losses):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(
                params['batch_size'] * params['steps'], beta)
            loss_v, sample_prios = calc_loss(batch, batch_weights, net,
                                             tgt_net.target_model,
                                             params["gamma"], cuda=args.cuda)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios)

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
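# The calc_loss used above returns both the weighted loss and new per-sample
# priorities. A minimal sketch of such a loss for prioritized replay;
# unpack_batch and the exact signature are assumptions:
def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = unpack_batch(batch)

    device = "cuda" if cuda else "cpu"
    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    weights_v = torch.tensor(batch_weights).to(device)

    q_vals = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_q = tgt_net(next_states_v).max(1)[0]
        next_q[done_mask] = 0.0
        target_q = rewards_v + gamma * next_q

    # Importance-sampling weights correct the bias of prioritized sampling;
    # the per-sample squared TD errors become the new priorities.
    losses_v = weights_v * (q_vals - target_q) ** 2
    return losses_v.mean(), (losses_v + 1e-5).data.cpu().numpy()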
def play_func(params, net, cuda, exp_queue, device_id):
    """
    With multiple envs, the exp_source class returns experiences (each a
    tuple of (state_framestack, action, reward, last_state_framestack)),
    alternating between the two environments; otherwise it returns
    experiences from a single env. Even if the games have different frame
    shapes, they will be reduced to 84x84.

    *** The envs are reinitialized inside this function on purpose: it runs
    in a child process, and the envs must be created there for the
    parallelization to work. ***
    """
    run_name = 'demon_invaders'
    max_games = params.get('max_games', 16000)
    save_iter = params.get('save_iter', 500)

    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)
    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)
    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")
    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")

    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        [envSI, envDA], agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)

    fh = open('models_multi/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)

    frame_idx = 0
    game_idx = 1
    mean_rewards = []
    # Initialized here so the final save below is safe even if no episode finished
    num_games, mean_reward, epsilon_str = 0, 0.0, ''
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % save_iter == 0):
                    # write to disk
                    print("Saving model...")
                    model_name = 'models_multi/{}_{}_{}.pth'.format(
                        run_name, params['secondary'], game_idx)
                    net.to(torch.device('cpu'))
                    torch.save(net, model_name)
                    net.to(device)
                    new_row = [model_name, num_games, mean_reward, epsilon_str]
                    out_csv.writerow(new_row)
                    np.savetxt('models_multi/{}_{}_reward.txt'.format(
                        run_name, params['secondary']), np.array(mean_rewards))
                if game_idx == max_games:
                    break
                game_idx += 1

    print("Saving final model...")
    model_name = 'models_multi/{}_{}_{}.pth'.format(run_name,
                                                    params['secondary'],
                                                    game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('models_multi/{}_{}_reward.txt'.format(run_name,
                                                      params['secondary']),
               np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('models_multi/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()
    exp_queue.put(None)
def main():
    global params_save_file
    game = 'revenge'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]

    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-dqfd(PDD DQN)")

    net = dqn_model.DuelingDQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    demo_data = demo_data_reader.get_demo_data(env, game,
                                               num_states=params['demo_size'],
                                               skip=params['skip-frames'])
    exp_source = ptan.experience.ExperienceSourceNFirstLast(
        env, agent, gamma=params['gamma'], steps_count=params['n-steps'],
        demo_data=demo_data)
    buffer = ptan.experience.PrioritizedReplayBuffer(
        exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    buffer.populate_demo_data()
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'],
                           weight_decay=L2_REG_LAMBDA)

    print("Demo data size: {}".format(buffer.demo_samples))
    sys.stdout.flush()

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            if frame_idx > params['pretrain_steps']:
                buffer.populate(params['steps'])
            else:
                # Pure pre-training on demo data: only log progress
                if frame_idx % 500 == 0:
                    writer.add_scalar("beta", beta, frame_idx)
                    reward_tracker.record_training(frame_idx, selector.epsilon,
                                                   last_dq_losses, last_n_losses,
                                                   last_e_losses, last_demo_sizes)
            epsilon_tracker.frame(frame_idx)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon, last_dq_losses,
                                         last_n_losses, last_e_losses,
                                         last_demo_sizes):
                    break

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(
                params['batch_size'] * params['steps'], beta)
            # Mask marking which samples in the batch come from demo data
            batch_demo_mask = (np.array(batch_indices) < buffer.demo_samples).astype(np.uint8)
            loss_v, sample_prios = calc_loss(batch, batch_demo_mask, batch_weights,
                                             net, tgt_net.target_model,
                                             params["gamma"],
                                             params["gamma"] ** params['n-steps'],
                                             cuda=args.cuda)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios)

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
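# The DQfD variant above adds, for demo samples only, a large-margin
# supervised loss on top of the 1-step and n-step TD losses:
#     J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E)
# where l(a_E, a) is a positive margin for a != a_E and 0 otherwise.
# A minimal sketch of that term; the function name and MARGIN value are
# assumptions, not the original code:
MARGIN = 0.8

def calc_margin_loss(q_vals_v, actions_v, demo_mask_v):
    # l(a_E, a): MARGIN everywhere except at the expert action
    margins = torch.full_like(q_vals_v, MARGIN)
    margins.scatter_(1, actions_v.unsqueeze(-1), 0.0)

    q_expert = q_vals_v.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    q_best_margin = (q_vals_v + margins).max(1)[0]

    # Apply only to transitions that came from demonstration data
    return ((q_best_margin - q_expert) * demo_mask_v).mean()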
def main():
    global params_save_file
    game = 'spaceinvaders'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]

    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    # NOTE: with default=True and store_true, this flag is always True as written
    parser.add_argument("--double", default=True, action="store_true",
                        help="Enable double DQN")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])

    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] +
                           "-double=" + str(args.double))

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    eval_states = None

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            if eval_states is None:
                # Fix a random set of states to track mean Q-values over training
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [np.array(transition.state, copy=False)
                               for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'] * params['steps'])
            loss_v = calc_loss(batch, net, tgt_net.target_model,
                               gamma=params['gamma'], cuda=args.cuda,
                               double=args.double)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % EVAL_EVERY_FRAME == 0:
                mean_val = calc_values_of_states(eval_states, net, cuda=args.cuda)
                writer.add_scalar("values_mean", mean_val, frame_idx)
            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    torch.save(net.state_dict(), params_save_file + str(frame_idx))
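# calc_values_of_states is referenced above but not shown. A minimal sketch
# of a helper that reports the mean best-action Q-value over a fixed set of
# states; the real helper in the accompanying common module may differ:
def calc_values_of_states(states, net, cuda=False):
    device = "cuda" if cuda else "cpu"
    mean_vals = []
    # Evaluate in 64 chunks to keep GPU memory bounded (arbitrary choice)
    for batch in np.array_split(states, 64):
        states_v = torch.tensor(batch).to(device)
        action_values_v = net(states_v)
        best_action_values_v = action_values_v.max(1)[0]
        mean_vals.append(best_action_values_v.mean().item())
    return np.mean(mean_vals)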