def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    if engine.state.iteration % EVAL_EVERY_FRAME == 0:
        eval_states = getattr(engine.state, "eval_states", None)
        if eval_states is None:
            eval_states = buffer.sample(STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            eval_states = np.array(eval_states, copy=False)
            engine.state.eval_states = eval_states
        evaluate_states(eval_states, net, device, engine)
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }
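# This processing function plugs into a PyTorch Ignite Engine. A minimal
# sketch of the wiring, assuming a ptan-style infinite batch generator
# (batch_generator below is an assumption, not shown in these snippets):
from ignite.engine import Engine

def batch_generator():
    buffer.populate(params.replay_initial)  # warm up the replay buffer
    while True:
        buffer.populate(1)                  # one environment step per batch
        yield buffer.sample(params.batch_size)

engine = Engine(process_batch)
engine.run(batch_generator())  # runs until a handler terminates training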
def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    return {
        "loss": loss_v.item(),
        "epsilon": batch_generator.epsilon,
    }
def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0:
        for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
            engine.state.metrics[f'snr_{layer_idx+1}'] = sigma_l2
    return {
        "loss": loss_v.item(),
    }
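# net.noisy_layers_sigma_snr() reports the signal-to-noise ratio of each
# NoisyLinear layer. A minimal sketch of how such a method can be written
# (self.noisy_layers holding the NoisyLinear instances is an assumption):
def noisy_layers_sigma_snr(self):
    # RMS(weight) / RMS(sigma) per layer: a rising value means the learned
    # noise is shrinking relative to the weights
    return [
        ((layer.weight ** 2).mean().sqrt() /
         (layer.sigma_weight ** 2).mean().sqrt()).item()
        for layer in self.noisy_layers
    ]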
def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    epsilon_tracker.frame(engine.state.iteration)
    if engine.state.iteration % params.target_net_sync == 0:
        print('syncing...')
        tgt_net.sync()
    return {"loss": loss_v.item(), "epsilon": batch_generator.epsilon}
def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params.gamma, device=device)
    loss_v.backward()
    optimizer.step()
    # with multiple environments, each iteration consumes args.envs frames
    epsilon_tracker.frame(engine.state.iteration * args.envs)
    if engine.state.iteration % params.target_net_sync == 0:
        tgt_net.sync()
    return {
        'loss': loss_v.item(),
        'epsilon': selector.epsilon,
    }
while True:
    frame_idx += 1
    buffer.populate(1)
    new_rewards = exp_source.pop_total_rewards()
    if new_rewards:
        if reward_tracker.reward(new_rewards[0], frame_idx):
            break
    if len(buffer) < params['replay_initial']:
        continue
    optimizer.zero_grad()
    batch = buffer.sample(params['batch_size'])
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params['gamma'], cuda=args.cuda)
    loss_v.backward()
    optimizer.step()
    if frame_idx % params['target_net_sync'] == 0:
        tgt_net.sync()
    if frame_idx % 500 == 0:
        for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
            writer.add_scalar("sigma_snr_layer_%d" % (layer_idx + 1),
                              sigma_l2, frame_idx)
# don't train the networks until the buffer holds more than replay_initial samples
if len(buffer) < params["replay_initial"]:
    continue
optimizer.zero_grad()
# sample from the experience buffer
batch = buffer.sample(params["batch_size"])
# calculate the loss between:
# 1. the Q-values of the taken actions from the main network on the current
#    state (note: not just the max over the outputs)
# 2. the Bellman target: this step's reward + gamma * max of the TARGET
#    network's outputs on the next state
loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                              gamma=params["gamma"], device=device)
loss_v.backward()
optimizer.step()
# sync the target network with the main network every target_net_sync steps
if frame_idx % params["target_net_sync"] == 0:
    tgt_net.sync()
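# For reference, a minimal sketch of what a calc_loss_dqn-style function
# computes, assuming ptan-style first-last transitions (the field names
# state/action/reward/last_state and the batch unpacking are assumptions;
# the real common implementation may differ):
import numpy as np
import torch
import torch.nn as nn

def calc_loss_dqn_sketch(batch, net, tgt_net, gamma, device="cpu"):
    states = torch.tensor(np.array([e.state for e in batch])).to(device)
    actions = torch.tensor([e.action for e in batch]).to(device)
    rewards = torch.tensor([e.reward for e in batch],
                           dtype=torch.float32).to(device)
    dones = torch.tensor([e.last_state is None for e in batch]).to(device)
    next_states = torch.tensor(np.array(
        [e.state if e.last_state is None else e.last_state
         for e in batch])).to(device)

    # Q(s, a) of the actions actually taken (not the max over actions)
    q_taken = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
    # Bellman target: r + gamma * max_a' Q_target(s', a'), zeroed at episode end
    with torch.no_grad():
        next_q = tgt_net(next_states).max(1)[0]
        next_q[dones] = 0.0
        target = rewards + gamma * next_q
    return nn.MSELoss()(q_taken, target)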
buffer = ptan.experience.ExperienceReplayBuffer(experience_source=None,
                                                buffer_size=params['replay_size'])
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

exp_queue = mp.Queue(maxsize=PLAY_STEPS * 2)
play_proc = mp.Process(target=play_func,
                       args=(params, net, args.cuda, exp_queue, cuda_id))
play_proc.start()

frame_idx = 0
while play_proc.is_alive():
    frame_idx += PLAY_STEPS
    for _ in range(PLAY_STEPS):
        exp = exp_queue.get()
        if exp is None:
            play_proc.join()
            break
        buffer._add(exp)
    if len(buffer) < params['replay_initial']:
        continue
    optimizer.zero_grad()
    batch = buffer.sample(params['batch_size'])
    loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                  gamma=params['gamma'], cuda=args.cuda,
                                  cuda_async=True, cuda_id=cuda_id)
    loss_v.backward()
    optimizer.step()
    if frame_idx % params['target_net_sync'] < PLAY_STEPS:
        tgt_net.sync()
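# play_func runs in the child process and streams transitions into exp_queue.
# The signature matches the mp.Process call above; the body is a sketch under
# that assumption (termination logic, e.g. stopping at stop_reward, omitted):
def play_func(params, net, cuda, exp_queue, cuda_id):
    device = torch.device("cuda:%d" % cuda_id if cuda else "cpu")
    env = make_env(params)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    for exp in exp_source:
        exp_queue.put(exp)
    exp_queue.put(None)  # sentinel: tells the trainer loop to stop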
with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)                # put 1 sample in the experience buffer
        epsilon_tracker.frame(frame_idx)  # set epsilon decay for this frame
        # check for finished episodes and monitor their total reward
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx,
                                     selector.epsilon):
                break
        if len(buffer) < params['replay_initial']:
            continue
        optimizer.zero_grad()
        # get a batch from the experience buffer
        batch = buffer.sample(params['batch_size'])
        loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                      gamma=params['gamma'],
                                      device=device)  # compute the loss
        loss_v.backward()  # compute gradients
        optimizer.step()   # optimization step
        if frame_idx % params['target_net_sync'] == 0:
            tgt_net.sync()  # synchronize the target network
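# epsilon_tracker is not defined in these snippets; a minimal sketch of a
# linear-decay tracker consistent with the frame() calls above (class and
# parameter names are assumptions):
class EpsilonTracker:
    def __init__(self, selector, eps_start=1.0, eps_final=0.02,
                 eps_frames=10**5):
        self.selector = selector  # e.g. a ptan EpsilonGreedyActionSelector
        self.eps_start = eps_start
        self.eps_final = eps_final
        self.eps_frames = eps_frames

    def frame(self, frame_idx):
        # anneal epsilon linearly from eps_start to eps_final over eps_frames
        eps = self.eps_start - frame_idx / self.eps_frames
        self.selector.epsilon = max(self.eps_final, eps)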
if len(buffer) < params['replay_initial']:
    continue
if eval_states is None:
    eval_states = buffer.sample(STATES_TO_EVALUATE)
    eval_states = [
        np.array(transition.state, copy=False)
        for transition in eval_states
    ]
    eval_states = np.array(eval_states, copy=False)
optimizer.zero_grad()
batch = buffer.sample(params['batch_size'])
loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                              gamma=params['gamma'], double=False,
                              device=device)
loss_v.backward()
optimizer.step()
if frame_idx % params['target_net_sync'] == 0:
    tgt_net.sync()
if frame_idx % 500 == 0:
    for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()):
        writer.add_scalar("sigma_snr_layer_%d" % (layer_idx + 1),
                          sigma_l2, frame_idx)
if frame_idx % EVAL_EVERY_FRAME == 0:
if len(buffer) < params['replay_initial']:
    continue
if eval_states is None:
    eval_states = buffer.sample(STATES_TO_EVALUATE)
    eval_states = [
        np.array(transition.state, copy=False)
        for transition in eval_states
    ]
    eval_states = np.array(eval_states, copy=False)
optimizer.zero_grad()
batch = buffer.sample(params['batch_size'])
loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                              gamma=params['gamma']**args.n,
                              device=device, double=args.double)
loss_v.backward()
optimizer.step()
if frame_idx % params['target_net_sync'] == 0:
    tgt_net.sync()
    save_model(tgt_net.target_model, args.model)
if frame_idx % EVAL_EVERY_FRAME == 0:
    mean_val = calc_values_of_states(eval_states, net, device=device)
    writer.add_scalar("values_mean", mean_val, frame_idx)
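# calc_values_of_states tracks the mean predicted value of a fixed set of
# held-out states, a smoother progress signal than per-episode reward. A
# minimal sketch of what such a helper typically computes (the real
# implementation may differ):
def calc_values_of_states(states, net, device="cpu"):
    mean_vals = []
    for batch in np.array_split(states, 64):
        states_v = torch.tensor(batch).to(device)
        action_values_v = net(states_v)  # Q-values, one per action
        best_action_values_v = action_values_v.max(1)[0]
        mean_vals.append(best_action_values_v.mean().item())
    return float(np.mean(mean_vals))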
with RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)
        # get the latest rewards; new_rewards stays empty until an episode
        # ends, so check the list before passing it to the reward tracker
        new_rewards = experience_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx):
                break
        # until the buffer fills up enough to sample, keep looping
        if len(buffer) < params['replay_initial']:
            continue
        optimizer.zero_grad()
        batch = buffer.sample(params['batch_size'])
        loss = calc_loss_dqn(batch, net, target_net.target_model,
                             params['gamma']**args.n, device)
        loss.backward()
        optimizer.step()
        if frame_idx % params['target_net_sync'] == 0:
            target_net.sync()
        if frame_idx % 500 == 0:
            for layer_idx, sigma_l2 in \
                    enumerate(net.noisy_layers_sigma_snr()):
                writer.add_scalar("sigma_snr_layer_%d" % (layer_idx + 1),
                                  sigma_l2, frame_idx)
buffer.populate(1)  # where all the magic happens!
# 1- the buffer asks the experience source to produce a transition (s, a, R, s')
# 2- the experience source feeds the current observation s to the agent
# 3- the agent feeds the observation to the network, gets the Q-values of the
#    observation and asks the action selector to decide on an action
# 4- the action selector generates a random value, compares it to epsilon and
#    decides whether to act greedily or pick a random action
# 5- the chosen action is passed to the experience source, which feeds it to
#    the environment to get the reward R and new state s'; now (s, a, R, s')
#    is passed to the buffer
# 6- the buffer stores the (s, a, R, s') tuple and evicts an old one to keep
#    its size constant
epsilon_tracker.frame(frame_idx)
new_rewards = exp_source.pop_total_rewards()
if new_rewards:
    # if the reward tracker returns True, the mean reward has reached the
    # score boundary and we can stop training
    if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
        break
# fill the buffer before training so that we have transitions to train on
if len(buffer) < params['replay_initial']:
    continue
if eval_states is None:
    eval_states = buffer.sample(STATES_TO_EVALUATE)
    eval_states = [np.array(transition.state, copy=False)
                   for transition in eval_states]
    eval_states = np.array(eval_states, copy=False)
if frame_idx % EVAL_EVERY_FRAME == 0:
    mean_val = common.calc_values_of_states(eval_states, net, device=device)
    writer.add_scalar("values_mean", mean_val, frame_idx)
# take a training step!
optimizer.zero_grad()
batch = buffer.sample(params['batch_size'])
loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                              gamma=params['gamma']**unrolling_steps,
                              double=double, device=device)
loss_v.backward()
optimizer.step()
# when to sync our target network
if frame_idx % params['target_net_sync'] == 0:
    tgt_net.sync()
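# Step 4 above is the epsilon-greedy rule; a minimal sketch of that decision
# (ptan's actual EpsilonGreedyActionSelector differs in detail, and this
# class name is an assumption):
import numpy as np

class EpsilonGreedySelectorSketch:
    def __init__(self, epsilon=1.0):
        self.epsilon = epsilon

    def __call__(self, q_values):
        # q_values: array of shape (batch, n_actions)
        batch_size, n_actions = q_values.shape
        actions = q_values.argmax(axis=1)  # greedy action per state...
        mask = np.random.random(batch_size) < self.epsilon
        # ...replaced by a random action with probability epsilon
        actions[mask] = np.random.randint(0, n_actions, mask.sum())
        return actions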
frame_idx += PLAY_STEPS
for _ in range(PLAY_STEPS):
    exp = exp_queue.get()
    if exp is None:
        play_proc.join()
        break
    buffer._add(exp)
if len(buffer) < params['replay_initial']:
    continue

# train on the experience replay buffer
optimizer.zero_grad()
optimizer_tm.zero_grad()
batch = buffer.sample(params['batch_size'])
loss_v, tm_loss = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                       gamma=params['gamma'], cuda=args.cuda,
                                       cuda_async=True, fsa=args.fsa,
                                       tm_net=tm_net)
loss_v.backward()
optimizer.step()
tm_loss.backward()
optimizer_tm.step()

if frame_idx > counter * params['video_interval'] and args.video:
    test_env = wrappers.Monitor(make_env(params),
                                "{}/frame{}".format(video_path, counter),
                                video_callable=lambda ep_id: ep_id < 3,
                                force=True)
    obs = test_env.reset()
    test_agent = ptan.agent.PolicyAgent(
        net, action_selector=ptan.actions.ArgmaxActionSelector(),
        device=device, fsa=args.fsa)
    real_done = False
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent,
                                                       gamma=params['gamma'],
                                                       steps_count=1)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                buffer_size=params['replay_size'])
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
frame_idx = 0
with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)
        epsilon_tracker.frame(frame_idx)
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx,
                                     selector.epsilon):
                break
        if len(buffer) < params['replay_initial']:
            continue
        optimizer.zero_grad()
        batch = buffer.sample(params['batch_size'])
        loss_v = common.calc_loss_dqn(HVALUE, H_map, batch, net,
                                      tgt_net.target_model,
                                      gamma=params['gamma'], device=device)
        time.sleep(2)
        loss_v.backward()
        optimizer.step()
        if frame_idx % params['target_net_sync'] == 0:
            tgt_net.sync()
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent,
                                                       gamma=params['gamma'],
                                                       steps_count=args.n)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                buffer_size=params['replay_size'])
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
frame_idx = 0
with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)
        epsilon_tracker.frame(frame_idx)
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx,
                                     selector.epsilon):
                break
        if len(buffer) < params['replay_initial']:
            continue
        optimizer.zero_grad()
        batch = buffer.sample(params['batch_size'])
        loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                      gamma=params['gamma']**args.n,
                                      device=device)
        loss_v.backward()
        optimizer.step()
        if frame_idx % params['target_net_sync'] == 0:
            tgt_net.sync()
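# Note on gamma**args.n above: with steps_count=args.n the experience source
# already folds the n intermediate rewards into each transition's reward
# field, R = r_t + gamma*r_{t+1} + ... + gamma**(n-1)*r_{t+n-1}, so the
# Bellman target must discount the bootstrapped tail value by gamma**n:
#     y_t = R + gamma**n * max_a' Q_tgt(s_{t+n}, a')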