def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list_np = sample_initial_context_normal(args.num_ensembles)
    if initial_training:
        train_on_initial(improved_context_list_np)
    for i_iter in range(args.max_iter_num):
        # generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        if len(replay_memory) == 0 or not args.rm_as_context:
            context_list_np = improved_context_list_np
        else:
            context_list_np = replay_memory.data
        batch_np, log_np = agent_np.collect_episodes(context_list_np,
                                                     args.num_req_steps,
                                                     args.num_ensembles)
        disc_rew_np = discounted_rewards(batch_np.memory, args.gamma)
        iter_dataset_np = BaseDataset(batch_np.memory, disc_rew_np,
                                      args.device_np, args.dtype,
                                      max_len=max_episode_len)
        print('np avg actions: ', log_np['action_mean'])
        advantages_np = estimate_v_a(iter_dataset_np, disc_rew_np,
                                     value_replay_memory, value_np, args)
        improved_context_list_np = improvement_step_all(iter_dataset_np,
                                                        advantages_np,
                                                        args.max_kl_np, args)
        # training
        value_replay_memory.add(iter_dataset_np)
        train_value_np(value_replay_memory)

        tn0 = time.time()
        replay_memory.add(iter_dataset_np)
        train_np(replay_memory)
        tn1 = time.time()

        tot_steps_np.append(tot_steps_np[-1] + log_np['num_steps'])
        avg_rewards_np.append(log_np['avg_reward'])
        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('np: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log_np['min_reward'], log_np['max_reward'], log_np['avg_reward']))
            print('new sigma', args.fixed_sigma)
            plot_rewards_history(tot_steps_np, avg_rewards_np)
        store_avg_rewards(
            tot_steps_np[-1], avg_rewards_np[-1],
            np_file.replace(str(args.seed) + '.csv',
                            'avg' + str(args.seed) + '.csv'))
        if tot_steps_np[-1] > args.tot_steps:
            break
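# ---------------------------------------------------------------------------
# Hypothetical helper sketch (not the project's discounted_rewards): the loops
# in this file call discounted_rewards(batch.memory, args.gamma) to obtain
# per-episode discounted returns G_t = r_t + gamma * G_{t+1}. A minimal
# version, assuming each episode's rewards are available as a 1-D tensor,
# could look like this; the actual batch/memory layout may differ.
import torch


def discounted_returns_sketch(episode_rewards, gamma):
    """episode_rewards: list of 1-D reward tensors, one entry per episode."""
    all_returns = []
    for rewards in episode_rewards:
        returns = torch.zeros_like(rewards)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running  # backward recursion
            returns[t] = running
        all_returns.append(returns)
    return all_returns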
def main_loop():
    improved_context_list_np = sample_initial_context_normal(env, init_sigma=0.05)
    train_on_initial(improved_context_list_np)
    # policy_np.apply(InitFunc.init_zero)
    for i_iter in range(args.max_iter_num):
        # define context set
        policy_np.training = False
        if len(replay_memory) == 0 or not args.rm_as_context:
            context_list_np = improved_context_list_np
        else:
            context_list_np = replay_memory.data

        # collect samples
        batch_np, log_np = agent_np.collect_episodes(context_list_np,
                                                     args.num_req_steps,
                                                     args.num_ensembles)

        # compute discounted rewards
        disc_rew_np = discounted_rewards(batch_np.memory, args.gamma)
        iter_dataset_np = BaseDataset(batch_np.memory, disc_rew_np,
                                      args.device_np, args.dtype,
                                      max_len=max_episode_len)

        # estimate advantages
        if args.learn_baseline:
            if args.value_net:
                state_list = [ep['states'][:ep['real_len']] for ep in iter_dataset_np]
                advantages_np, returns = critic_estimate(value_net, state_list,
                                                         disc_rew_np, args)
                update_critic(value_net, torch.cat(state_list, dim=0),
                              torch.cat(returns, dim=0))
            else:
                advantages_np = estimate_v_a(iter_dataset_np, disc_rew_np,
                                             value_replay_memory, value_np, args)
                value_replay_memory.add(iter_dataset_np)
                train_value_np(value_replay_memory)
        else:
            advantages_np = disc_rew_np

        # update step
        improved_context_list_np = improvement_step_all(iter_dataset_np,
                                                        advantages_np,
                                                        args.max_kl_np, args)

        # training
        replay_memory.add(iter_dataset_np)
        train_np(replay_memory)

        # prints & plots
        tot_steps_np.append(tot_steps_np[-1] + log_np['num_steps'])
        avg_rewards_np.append(log_np['avg_reward'])
        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('np avg actions: ', log_np['action_mean'])
            print('np: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log_np['min_reward'], log_np['max_reward'], log_np['avg_reward']))
            # plot_pca_proj(iter_dataset_np, advantages_np, policy_np)
            print('new sigma', args.fixed_sigma)
            plot_rewards_history(tot_steps_np, avg_rewards_np)
        store_avg_rewards(tot_steps_np[-1], avg_rewards_np[-1],
                          np_file.replace(str(args.seed) + '.csv',
                                          'avg' + str(args.seed) + '.csv'))
        if tot_steps_np[-1] > args.tot_steps or log_np['avg_reward'] < -5000:
            break

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop(improved_context_list):
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1) generate multiple trajectories that reach the minimum batch_size
        batch, log = agent.collect_episodes(improved_context_list,
                                            render=(i_iter % 10 == 0))
        disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory, disc_rew, args.device_np,
                                       args.dtype, max_len=max_episode_len)
        advantages = estimate_v_a(complete_dataset, disc_rew)

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset, advantages)
        t1 = time.time()

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()

        tv0 = time.time()
        if i_iter % args.plot_every == 0:
            # plot_initial_context(improved_context_list, colors, env, args, i_iter)
            # plot_training_set(i_iter, replay_memory, env, args)
            plot_policy(model, improved_context_list, replay_memory, i_iter,
                        log['avg_reward'], env, args, colors)
            plot_improvements(complete_dataset, disc_rew, env, i_iter, args, colors)
        tv1 = time.time()

        tot_steps.append(tot_steps[-1] + log['num_steps'])
        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f} \tT_update {:.4f} \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'
                  .format(i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                          log['max_reward'], log['avg_reward']))
            print('Training: \tT_policy {:.2f} \nT_plots {:.2f}'.format(tn1 - tn0, tv1 - tv0))
        if i_iter % args.plot_every == 0:
            plot_rewards_history(avg_rewards, tot_steps, args)
    plot_rewards_history(avg_rewards, tot_steps, args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    improved_context = sample_initial_context()
    avg_rewards = []
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1) generate multiple trajectories that reach the minimum batch_size.
        # context=None when the NP is the policy; these are the context points used to predict.
        policy_np.training = False
        # batch of batch_size transitions from multiple episodes (separated by
        # mask=0), stored in Memory
        batch, log, memory = agent.collect_samples(args.min_batch_size,
                                                   context=improved_context)
        print(log['num_steps'], log['num_episodes'])
        disc_rew = discounted_rewards(batch, args.gamma)
        complete_dataset = BaseDataset(batch, disc_rew, args.device_np, args.dtype)
        value_replay_memory.add(complete_dataset)
        train_value_np(value_replay_memory)
        estimated_disc_rew, values_stdevs = estimate_disc_rew(complete_dataset, i_iter)
        memory.set_disc_rew(estimated_disc_rew)

        t0 = time.time()
        all_improved_context_0 = improvement_step(batch)
        all_improved_context = imporove_mean_stdv(complete_dataset,
                                                  estimated_disc_rew, values_stdevs)
        t1 = time.time()
        key = 'means' if args.improve_mean else 'actions'
        improved_context = [all_improved_context['states'], all_improved_context[key]]

        # plot improved context and actions' discounted rewards
        plot_improvements(batch, improved_context, env, i_iter, args)

        # create training set
        training_set = all_improved_context['means']
        frac_action_in_training = int(frac_replace_actions * training_set.shape[1])
        training_set[:, :frac_action_in_training, :] = \
            all_improved_context['actions'][:, :frac_action_in_training, :]
        dataset = MemoryDatasetNP(batch, training_set, args.device_np, args.dtype, max_len=999)
        replay_memory.add(dataset)
        plot_training_set(i_iter, replay_memory, env, args)
        print('replay memory size:', len(replay_memory))
        train_np(replay_memory)

        plot_NP_policy(policy_np, improved_context, i_iter, log['avg_reward'],
                       env, args, num_samples=1)

        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                log['max_reward'], log['avg_reward']))
    plot_rewards_history(avg_rewards, args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    for i_iter in range(args.max_iter_num):
        # collect samples
        batch, log = agent.collect_episodes(args.num_req_steps)

        # compute discounted rewards
        disc_rew_mlp = discounted_rewards(batch.memory, args.gamma)
        iter_dataset = BaseDataset(batch.memory, disc_rew_mlp, args.device_np,
                                   args.dtype, max_len=max_episode_len)

        # estimate advantages
        if args.value_net:
            state_list = [ep['states'][:ep['real_len']] for ep in iter_dataset]
            advantages, returns = critic_estimate(value_net, state_list, disc_rew_mlp, args)
            update_critic(value_net, torch.cat(state_list, dim=0), torch.cat(returns, dim=0))
        else:
            advantages = estimate_v_a(iter_dataset, disc_rew_mlp,
                                      value_replay_memory, value_net, args)
            value_replay_memory.add(iter_dataset)
            train_value(value_replay_memory)

        # update step: the returned context is not used here, but it is added
        # to iter_dataset inside the function
        improved_context_list = improvement_step_all(iter_dataset, advantages,
                                                     args.max_kl_mlp, args)

        # training
        replay_memory.add(iter_dataset)
        train_policy(replay_memory)

        # prints & plots
        tot_steps.append(tot_steps[-1] + log['num_steps'])
        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('mlp: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log['min_reward'], log['max_reward'], log['avg_reward']))
            print('new sigma', args.fixed_sigma)
        store_avg_rewards(tot_steps[-1], avg_rewards[-1],
                          mlp_file.replace(str(args.seed) + '.csv',
                                           'avg' + str(args.seed) + '.csv'))
        if i_iter % args.plot_every == 0:
            plot_rewards_history(tot_steps, avg_rewards)
        if tot_steps[-1] > args.tot_steps:
            break

    """clean up gpu memory"""
    torch.cuda.empty_cache()
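# ---------------------------------------------------------------------------
# Hypothetical sketch of a critic-based advantage estimate. The loop above
# calls critic_estimate(value_net, state_list, disc_rew, args); a simple
# variant (assumed here, the repo's version may differ) predicts V(s_t) with
# the value network and uses A_t = G_t - V(s_t), returning the discounted
# returns unchanged as regression targets for update_critic.
import torch


def critic_estimate_sketch(value_net, state_list, return_list):
    """state_list / return_list: per-episode state tensors and 1-D returns."""
    advantages, returns = [], []
    with torch.no_grad():
        for states, rets in zip(state_list, return_list):
            values = value_net(states).squeeze(-1)  # V(s_t) for every step
            advantages.append(rets - values)        # A_t = G_t - V(s_t)
            returns.append(rets)
    return advantages, returns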
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    # print('sampling initial context')
    if args.init_normal:
        improved_context_list = sample_initial_context_normal(args.num_ensembles)
    else:
        improved_context_list = sample_initial_context_uniform(args.num_ensembles)
    plot_initial_context(improved_context_list, colors, env, args, '00')
    if initial_training:
        train_on_initial(improved_context_list)
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1) generate multiple trajectories that reach the minimum batch_size.
        # context=None when the NP is the policy; these are the context points used to predict.
        policy_np.training = False
        # batch of batch_size transitions from multiple episodes (separated by
        # mask=0), stored in Memory
        batch, log = agent.collect_episodes(improved_context_list)
        # print(log['num_steps'], log['num_episodes'])
        disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory, disc_rew, args.device_np,
                                       args.dtype, max_len=max_episode_len)
        value_replay_memory.add(complete_dataset)
        train_value_np(value_replay_memory)
        estimated_disc_rew, values_stdevs = estimate_disc_rew(
            complete_dataset, i_iter, episode_specific_value=args.episode_specific_value)

        t0 = time.time()
        improved_context_list = improvement_step(complete_dataset,
                                                 estimated_disc_rew, values_stdevs)
        t1 = time.time()
        # plot_initial_context(improved_context_list, colors, env, args, i_iter)

        # plot improved context and actions' discounted rewards
        if i_iter % args.plot_every == 0:
            plot_improvements(complete_dataset, estimated_disc_rew, env, i_iter, args, colors)

        # create training set
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        # plot_training_set(i_iter, replay_memory, env, args)

        if i_iter % args.plot_every == 0:
            plot_NP_policy(policy_np, improved_context_list, i_iter,
                           log['avg_reward'], env, args, colors)

        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0 and False:
            print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                log['max_reward'], log['avg_reward']))
    plot_rewards_history(avg_rewards, args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    for i_iter in range(args.max_iter_num):
        # (1) generate multiple trajectories that reach the minimum batch_size.
        # batch of batch_size transitions from multiple episodes (separated by
        # mask=0), stored in Memory
        batch, log, memory = agent.collect_samples(args.min_batch_size)
        print(log['num_steps'], log['num_episodes'])
        disc_rew = discounted_rewards(batch, args.gamma)
        memory.set_disc_rew(disc_rew)
        complete_dataset = BaseDataset(batch, disc_rew, args.device, dtype,
                                       max_len=max_episode_len)

        t0 = time.time()
        # estimate advantages from the samples and update the policy by a TRPO step
        update_params_trpo(batch, i_iter)
        t1 = time.time()

        plot_policy(policy_net, (i_iter, log['avg_reward'], 'policies'))

        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                log['max_reward'], log['avg_reward']))

        if not args.episode_specific_value:
            iter_dataset = {}
            iter_states, iter_q = merge_padded_lists(
                [episode['states'] for episode in complete_dataset],
                [episode['discounted_rewards'] for episode in complete_dataset],
                max_lens=[episode['real_len'] for episode in complete_dataset])
            iter_dataset['states'] = iter_states
            iter_dataset['discounted_rewards'] = iter_q
            iter_dataset['real_len'] = iter_states.shape[-2]
            complete_dataset = [iter_dataset]
        value_replay_memory.add(complete_dataset)
        train_value_np(value_replay_memory)
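# ---------------------------------------------------------------------------
# Hypothetical sketch of the merge performed by merge_padded_lists above:
# each episode is stored padded to max_episode_len, so merging trims every
# episode to its real length and concatenates along the time dimension,
# producing a single "episode" for a shared (non episode-specific) value
# function. The real helper may also return masks or keep extra dimensions.
import torch


def merge_padded_sketch(state_eps, reward_eps, real_lens):
    """state_eps: list of [T_pad, D] tensors; reward_eps: list of [T_pad] tensors."""
    states = torch.cat([s[:l] for s, l in zip(state_eps, real_lens)], dim=0)
    rewards = torch.cat([r[:l] for r, l in zip(reward_eps, real_lens)], dim=0)
    return states.unsqueeze(0), rewards.unsqueeze(0)  # add a leading batch dim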
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list = sample_initial_context_normal(args.num_ensembles)
    if initial_training:
        train_on_initial(improved_context_list)
    for i_iter in range(args.max_iter_num):
        if tot_steps_trpo[-1] - tot_steps_np[-1] < 1000:
            # batch of batch_size transitions from multiple trajectories that
            # reach the minimum batch_size
            batch_trpo, log_trpo, memory_trpo = agent_trpo.collect_samples(args.min_batch_size)
            update_params_trpo(batch_trpo)
            tot_steps_trpo.append(tot_steps_trpo[-1] + log_trpo['num_steps'])
            avg_rewards_trpo.append(log_trpo['avg_reward'])

        # generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        # batch of batch_size transitions from multiple episodes (separated by
        # mask=0), stored in Memory
        batch, log = agent.collect_episodes(improved_context_list)
        # print(log['num_steps'], log['num_episodes'])
        disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory, disc_rew, args.device_np,
                                       args.dtype, max_len=max_episode_len)
        value_replay_memory.add(complete_dataset)
        advantages = estimate_v_a(complete_dataset, disc_rew)

        tv0 = time.time()
        train_value_np(value_replay_memory)
        tv1 = time.time()

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset, advantages)
        t1 = time.time()
        # plot_initial_context(improved_context_list, colors, env, args, i_iter)

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()

        tot_steps_np.append(tot_steps_np[-1] + log['num_steps'])
        # plot_training_set(i_iter, replay_memory, env, args)
        if i_iter % args.plot_every == 0 and False:
            plot_NP_policy(policy_np, improved_context_list, replay_memory, i_iter,
                           log['avg_reward'], env, args, colors)
            plot_improvements(complete_dataset, advantages, env, i_iter, args, colors)

        avg_rewards_np.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f} \tT_update {:.4f} \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                log['max_reward'], log['avg_reward']))
            print('Training: \tT_policy {:.2f} \tT_value {:.2f}'.format(tn1 - tn0, tv1 - tv0))
        if i_iter % args.plot_every == 0:
            plot_rewards_history(trpo=[tot_steps_trpo, avg_rewards_trpo],
                                 mi=[tot_steps_np, avg_rewards_np], args=args)
        plot_rewards_history(trpo=[tot_steps_trpo, avg_rewards_trpo],
                             mi=[tot_steps_np, avg_rewards_np], args=args)
        args.max_kl = args.max_kl * 0.99
        args.fixed_sigma = args.fixed_sigma * 0.96

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    improved_context_list_mi = sample_initial_context_normal(env)
    for i_iter in range(args.max_iter_num):
        # define context set
        if len(replay_memory_mi) == 0 or not args.rm_as_context:
            context_list_np = improved_context_list_mi
        else:
            context_list_np = replay_memory_mi.data

        # collect samples
        batch_mi, log_mi = agent_mi.collect_episodes(context_list_np,
                                                     args.num_req_steps,
                                                     args.num_ensembles)

        # compute discounted rewards
        disc_rew_mi = discounted_rewards(batch_mi.memory, args.gamma)
        iter_dataset_mi = BaseDataset(batch_mi.memory, disc_rew_mi,
                                      args.device_np, args.dtype,
                                      max_len=max_episode_len)

        # estimate advantages
        if args.value_net:
            state_list = [ep['states'][:ep['real_len']] for ep in iter_dataset_mi]
            advantages_mi, returns = critic_estimate(value_net, state_list,
                                                     disc_rew_mi, args)
            update_critic(value_net, torch.cat(state_list, dim=0),
                          torch.cat(returns, dim=0))
        else:
            advantages_mi = estimate_v_a(iter_dataset_mi, disc_rew_mi,
                                         value_replay_memory, model, args)
            value_replay_memory.add(iter_dataset_mi)

        # update step
        improved_context_list_mi = improvement_step_all(iter_dataset_mi,
                                                        advantages_mi,
                                                        args.max_kl_mi, args)

        # training
        replay_memory_mi.add(iter_dataset_mi)
        train_mi(replay_memory_mi)

        # prints & plots
        tot_steps_mi.append(tot_steps_mi[-1] + log_mi['num_steps'])
        avg_rewards_mi.append(log_mi['avg_reward'])
        if i_iter % 1 == 0:
            plot_pca_proj(iter_dataset_mi, advantages_mi, model)
        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('mi: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log_mi['min_reward'], log_mi['max_reward'], log_mi['avg_reward']))
            print('new sigma', args.fixed_sigma)
        store_avg_rewards(tot_steps_mi[-1], log_mi['avg_reward'],
                          mi_file.replace(str(args.seed) + '.csv',
                                          'avg' + str(args.seed) + '.csv'))
        if i_iter % args.plot_every == 0:
            plot_rewards_history(tot_steps_mi, avg_rewards_mi)
        if tot_steps_mi[-1] > args.tot_steps:
            break

    """clean up gpu memory"""
    torch.cuda.empty_cache()
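# ---------------------------------------------------------------------------
# Hypothetical sketch of the bookkeeping done by store_avg_rewards above:
# append one (total_steps, avg_reward) row per iteration to a CSV file whose
# name is derived from the per-seed results file. The project's helper may
# format the row differently.
import csv


def store_avg_rewards_sketch(tot_steps, avg_reward, csv_path):
    with open(csv_path, 'a', newline='') as f:
        csv.writer(f).writerow([tot_steps, float(avg_reward)])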
def main_loop(improved_context_list):
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    for i_iter in range(args.max_iter_num):
        if tot_steps_trpo[-1] - tot_steps_mi[-1] < 1000:
            # batch of batch_size transitions from multiple trajectories that
            # reach the minimum batch_size
            batch_trpo, log_trpo, memory_trpo = agent_trpo.collect_samples(args.min_batch_size)
            update_params_trpo(batch_trpo)
            tot_steps_trpo.append(tot_steps_trpo[-1] + log_trpo['num_steps'])
            avg_rewards_trpo.append(log_trpo['avg_reward'])

        batch_mi, log_mi = agent_mi.collect_episodes(improved_context_list,
                                                     render=(i_iter % 10 == 0))
        disc_rew = discounted_rewards(batch_mi.memory, args.gamma)
        complete_dataset = BaseDataset(batch_mi.memory, disc_rew, args.device_np,
                                       args.dtype, max_len=max_episode_len)
        advantages = estimate_v_a(complete_dataset, disc_rew)

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset, advantages)
        t1 = time.time()

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()

        tv0 = time.time()
        if False and i_iter % args.plot_every == 0:
            # plot_initial_context(improved_context_list, colors, env, args, i_iter)
            # plot_training_set(i_iter, replay_memory, env, args)
            # plot_policy(model, improved_context_list, replay_memory, i_iter, log['avg_reward'], env, args, colors)
            plot_improvements(complete_dataset, disc_rew, env, i_iter, args, colors)
        tv1 = time.time()

        tot_steps_mi.append(tot_steps_mi[-1] + log_mi['num_steps'])
        avg_rewards_mi.append(log_mi['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\n R_min_trpo {:.2f} \tR_max_trpo {:.2f} \tR_avg_trpo {:.2f}\nR_min_mi {:.2f} \tR_max_mi {:.2f} \tR_avg_mi {:.2f} '.format(
                i_iter, log_trpo['min_reward'], log_trpo['max_reward'],
                log_trpo['avg_reward'], log_mi['min_reward'],
                log_mi['max_reward'], log_mi['avg_reward']))
        if i_iter % args.plot_every == 0:
            plot_rewards_history(trpo=[tot_steps_trpo, avg_rewards_trpo],
                                 mi=[tot_steps_mi, avg_rewards_mi], args=args)
    plot_rewards_history(trpo=[tot_steps_trpo, avg_rewards_trpo],
                         mi=[tot_steps_mi, avg_rewards_mi], args=args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    improved_context = sample_initial_context(args.min_batch_size, dtype=dtype)
    avg_rewards = []
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1) generate multiple trajectories that reach the minimum batch_size.
        # context=None when the NP is the policy; these are the context points used to predict.
        policy_np.training = False
        # batch of batch_size transitions from multiple episodes (separated by
        # mask=0), stored in Memory
        batch, log, memory = agent.collect_samples(args.min_batch_size,
                                                   context=improved_context)
        print(log['num_steps'], log['num_episodes'])
        disc_rew = discounted_rewards(batch, args.gamma)
        memory.set_disc_rew(disc_rew)

        t0 = time.time()
        all_improved_context = improvement_step(batch)
        t1 = time.time()
        key = 'means' if improve_mean else 'actions'
        improved_context = [all_improved_context['states'], all_improved_context[key]]

        # plot improved context and actions' discounted rewards
        plot_improvements(batch,
                          [all_improved_context['states'], all_improved_context['means']],
                          i_iter)

        # create training set
        training_set = all_improved_context['means']
        num_action_in_training = int(frac_replace_actions * training_set.shape[1])
        print('replacing {} means with actions'.format(num_action_in_training))
        training_set[:, :num_action_in_training, :] = \
            all_improved_context['actions'][:, :num_action_in_training, :]
        dataset = MemoryDatasetNP(batch, training_set, device_np, dtype, max_len=999)
        replay_memory.add(dataset)
        plot_training_set(i_iter)
        print('replay memory size:', len(replay_memory))
        train_np(replay_memory)

        plot_NP_policy(improved_context, i_iter, log['avg_reward'])

        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                log['max_reward'], log['avg_reward']))

        fig_rew, ax_rew = plt.subplots(1, 1)
        ax_rew.plot(np.arange(len(avg_rewards)), avg_rewards)
        ax_rew.set_xlabel('iterations')
        ax_rew.set_ylabel('average reward')
        fig_rew.savefig(directory_path + '/average reward')
        plt.close(fig_rew)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list = sample_initial_context_normal(args.num_ensembles)
    if initial_training:
        train_on_initial(improved_context_list)
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1) generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        # batch of batch_size transitions from multiple episodes (separated by
        # mask=0), stored in Memory
        batch, log = agent.collect_episodes(improved_context_list)
        # print(log['num_steps'], log['num_episodes'])
        disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory, disc_rew, args.device_np,
                                       args.dtype, max_len=max_episode_len)

        if not args.episode_specific_value:
            iter_dataset = {}
            iter_states, iter_q = merge_padded_lists(
                [episode['states'] for episode in complete_dataset],
                [episode['discounted_rewards'] for episode in complete_dataset],
                max_lens=[episode['real_len'] for episode in complete_dataset])
            iter_dataset['states'] = iter_states
            iter_dataset['discounted_rewards'] = iter_q
            iter_dataset['real_len'] = iter_states.shape[-2]
            value_replay_memory.add([iter_dataset])
        else:
            value_replay_memory.add(complete_dataset)
        estimated_disc_rew, values_stdevs = estimate_disc_rew(
            complete_dataset, i_iter, episode_specific_value=args.episode_specific_value)

        tv0 = time.time()
        train_value_np(value_replay_memory)
        tv1 = time.time()

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset, estimated_disc_rew)
        t1 = time.time()
        # plot_initial_context(improved_context_list, colors, env, args, i_iter)

        # plot improved context and actions' discounted rewards
        # if i_iter % args.plot_every == 0:
        plot_improvements(complete_dataset, estimated_disc_rew, env, i_iter, args, colors)

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()
        # if i_iter % args.plot_every == 0:
        #     plot_NP_policy(policy_np, improved_context_list, replay_memory, i_iter, log['avg_reward'], env, args, colors)

        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f} \tT_update {:.4f} \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                log['max_reward'], log['avg_reward']))
            print('Training: \tT_policy {:.2f} \tT_value {:.2f}'.format(tn1 - tn0, tv1 - tv0))
        if log['avg_reward'] > 195:
            print('converged')
            plot_rewards_history(avg_rewards, args)
        # if i_iter % args.plot_every == 0:
        plot_rewards_history(avg_rewards, args)
        # args.fixed_sigma = args.fixed_sigma * args.gamma
    plot_rewards_history(avg_rewards, args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list = sample_initial_context_normal(args.num_ensembles)
    plot_initial_context(improved_context_list, colors, env, args, '00')
    if initial_training:
        train_on_initial(improved_context_list)
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1) generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        # batch of batch_size transitions from multiple episodes (separated by
        # mask=0), stored in Memory
        batch, log = agent.collect_episodes(improved_context_list,
                                            args.num_req_steps,
                                            args.num_ensembles)
        # print(log['num_steps'], log['num_episodes'])
        estimated_disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory, estimated_disc_rew,
                                       args.device_np, args.dtype,
                                       max_len=max_episode_len)

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset, estimated_disc_rew)
        t1 = time.time()
        # plot_initial_context(improved_context_list, colors, env, args, i_iter)

        # plot improved context and actions' discounted rewards
        if i_iter % args.plot_every == 0:
            plot_improvements(complete_dataset, estimated_disc_rew, env, i_iter, args, colors)

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()
        # plot_training_set(i_iter, replay_memory, env, args)

        if i_iter % args.plot_every == 0:
            plot_NP_policy(policy_np, improved_context_list, replay_memory, i_iter,
                           log['avg_reward'], env, args, colors)

        avg_rewards.append(log['avg_reward'])
        tot_steps.append(tot_steps[-1] + log['num_steps'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f} \tT_update {:.4f} \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                log['max_reward'], log['avg_reward']))
            print('Training: \tT_policy {:.2f}'.format(tn1 - tn0))
        if log['avg_reward'] > 95:
            print('converged')
            plot_rewards_history(avg_rewards, args)
        if i_iter % args.plot_every == 0:
            plot_rewards_history(avg_rewards, tot_steps, args)
            # if args.pick_context:
            plot_chosen_context(improved_context_list, args.num_context, i_iter, args, env)
            plot_all_training_set(i_iter, replay_memory, env, args)
        if args.fixed_sigma is not None:
            args.fixed_sigma = args.fixed_sigma * args.gamma
    plot_rewards_history(avg_rewards, args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
model = MultiLayerPerceptron(state_dim, action_dim, 512).to(device).double()
agent = AgentMLP(env, model, num_epi, device, fixed_sigma=args.fixed_sigma)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
model_trainer = MLPTrainer(device, model, optimizer, print_freq=50)
replay_memory = ReplayMemoryDataset(20)
tot_steps = [0]
avg_rewards = [0]
for i_iter in range(500):
    # batch of batch_size transitions from multiple episodes
    batch, log = agent.collect_episodes()
    disc_rew = discounted_rewards(batch.memory, 0.999)
    complete_dataset = BaseDataset(batch.memory, disc_rew, device,
                                   torch.float64, max_len=max_episode_len)
    print('average reward at', i_iter, log['avg_reward'].item())

    t0 = time.time()
    improved_context_list_mi = improvement_step_all(complete_dataset, disc_rew, 0.01, args)
    t1 = time.time()

    # create training set
    tn0 = time.time()
    replay_memory.add(complete_dataset)
    data_loader = DataLoader(replay_memory, batch_size=1, shuffle=True)
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list_np = sample_initial_context_normal(args.num_ensembles)
    for i_iter in range(args.max_iter_num):
        # generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        if len(replay_memory) == 0 or not args.rm_as_context:
            context_list_np = improved_context_list_np
        else:
            context_list_np = replay_memory.data

        ts0 = time.time()
        batch_np, log_np = agent_np.collect_episodes(context_list_np,
                                                     args.num_req_steps,
                                                     args.num_ensembles)
        print('sampling:', time.time() - ts0)
        disc_rew_np = discounted_rewards(batch_np.memory, args.gamma)
        iter_dataset_np = BaseDataset(batch_np.memory, disc_rew_np,
                                      args.device_np, args.dtype,
                                      max_len=max_episode_len)
        print('np avg actions: ', log_np['action_mean'])

        if args.value_net:
            state_list = [ep['states'][:ep['real_len']] for ep in iter_dataset_np]
            advantages_np, returns = critic_estimate(state_list, disc_rew_np, args)
            update_critic(torch.cat(state_list, dim=0), torch.cat(returns, dim=0))
        else:
            advantages_np = estimate_v_a(iter_dataset_np, disc_rew_np,
                                         value_replay_memory, value_np, args)
            value_replay_memory.add(iter_dataset_np)
            train_value_np(value_replay_memory)

        improved_context_list_np = improvement_step_all(iter_dataset_np,
                                                        advantages_np,
                                                        args.max_kl_np, args)

        # training
        tn0 = time.time()
        replay_memory.add(iter_dataset_np)
        train_np(replay_memory)
        tn1 = time.time()

        tot_steps_np.append(tot_steps_np[-1] + log_np['num_steps'])
        avg_rewards_np.append(log_np['avg_reward'])

        if i_iter % args.plot_every in [0, 1]:
            if 'CartPole' in args.env_name:
                plot_NP_policy_CP(policy_np, replay_memory, i_iter, env, args,
                                  use_np_sigma=args.plot_np_sigma)
                plot_rm(replay_memory, i_iter, args)
                plot_improvements_CP(iter_dataset_np, advantages_np, env, i_iter, args, colors)
            elif 'MountainCar' in args.env_name:
                plot_NP_policy_MC(policy_np, replay_memory, i_iter, env, args,
                                  use_np_sigma=args.plot_np_sigma)
                plot_improvements_MC(iter_dataset_np, advantages_np, env, i_iter, args, colors)
                plot_improvements_MC_all(iter_dataset_np, advantages_np, env, i_iter, args, colors)

        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('np: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log_np['min_reward'], log_np['max_reward'], log_np['avg_reward']))
            print('new sigma', args.fixed_sigma)
            plot_rewards_history(tot_steps_np, avg_rewards_np)
        store_avg_rewards(
            tot_steps_np[-1], avg_rewards_np[-1],
            np_file.replace(str(args.seed) + '.csv',
                            'avg' + str(args.seed) + '.csv'))

        if args.fixed_sigma is not None:
            sigma_history.append(torch.tensor(args.fixed_sigma))
        else:
            sigma_history.append(
                torch.cat([ep['stddevs'] for ep in iter_dataset_np.data]).mean(dim=0))
        plot_sigma_history(sigma_history)

        if tot_steps_np[-1] > args.tot_steps:
            break

    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list_np = sample_initial_context_normal(args.num_ensembles)
    improved_context_list_mi = improved_context_list_np
    if args.use_np:
        if initial_training:
            train_on_initial(improved_context_list_np)
    for i_iter in range(args.max_iter_num):
        if args.use_trpo and tot_steps_trpo[-1] < args.tot_steps:  # and tot_steps_trpo[-1] - max(tot_steps_mi[-1], tot_steps_np[-1]) < 1000:
            # batch of batch_size transitions from multiple trajectories that
            # reach the minimum batch_size
            batch_trpo, log, memory_trpo = agent_trpo.collect_samples(args.min_batch_size)
            store_rewards_trpo(memory_trpo.memory, trpo_file)
            update_params_trpo(batch_trpo)
            tot_steps_trpo.append(tot_steps_trpo[-1] + log['num_steps'])
            avg_rewards_trpo.append(log['avg_reward'])
            print('trpo avg actions: ', log['action_mean'])

        if args.use_np and tot_steps_np[-1] < args.tot_steps:
            # generate multiple trajectories that reach the minimum batch_size
            policy_np.training = False
            # batch of batch_size transitions from multiple episodes
            batch_np, log_np = agent_np.collect_episodes(improved_context_list_np)
            store_rewards(batch_np.memory, np_file)
            disc_rew_np = discounted_rewards(batch_np.memory, args.gamma)
            complete_dataset_np = BaseDataset(batch_np.memory, disc_rew_np,
                                              args.device_np, args.dtype,
                                              max_len=max_episode_len)
            print('np avg actions: ', log_np['action_mean'])
            advantages_np = estimate_v_a(complete_dataset_np, disc_rew_np)
            improved_context_list_np = improvement_step_all(complete_dataset_np,
                                                            advantages_np,
                                                            args.max_kl_np)
            # training
            value_replay_memory.add(complete_dataset_np)
            train_value_np(value_replay_memory)

            tn0 = time.time()
            replay_memory.add(complete_dataset_np)
            train_np(replay_memory)
            tn1 = time.time()

            tot_steps_np.append(tot_steps_np[-1] + log_np['num_steps'])
            avg_rewards_np.append(log_np['avg_reward'])

        if args.use_mi and tot_steps_mi[-1] < args.tot_steps:
            # generate multiple trajectories that reach the minimum batch_size.
            # batch of batch_size transitions from multiple episodes (separated
            # by mask=0), stored in Memory
            batch_mi, log_mi = agent_mi.collect_episodes(improved_context_list_mi)
            store_rewards(batch_mi.memory, mi_file)
            # print(log['num_steps'], log['num_episodes'])
            print('mi avg actions: ', log_mi['action_mean'])
            disc_rew_mi = discounted_rewards(batch_mi.memory, args.gamma)
            complete_dataset_mi = BaseDataset(batch_mi.memory, disc_rew_mi,
                                              args.device_np, args.dtype,
                                              max_len=max_episode_len)
            advantages_mi = estimate_v_a_mi(complete_dataset_mi, disc_rew_mi)

            t0 = time.time()
            improved_context_list_mi = improvement_step_all(complete_dataset_mi,
                                                            advantages_mi,
                                                            args.max_kl_mi)
            t1 = time.time()

            # create training set
            tn0 = time.time()
            replay_memory_mi.add(complete_dataset_mi)
            train_mi(replay_memory_mi)
            tn1 = time.time()

            tot_steps_mi.append(tot_steps_mi[-1] + log_mi['num_steps'])
            avg_rewards_mi.append(log_mi['avg_reward'].item())

        if i_iter % args.log_interval == 0:
            print(i_iter)
            if args.use_trpo:
                print('trpo: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                    log['min_reward'], log['max_reward'], log['avg_reward']))
            if args.use_np:
                print('np: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                    log_np['min_reward'], log_np['max_reward'], log_np['avg_reward']))
            if args.use_mi:
                print('mi: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                    log_mi['min_reward'], log_mi['max_reward'], log_mi['avg_reward']))
            print(args.fixed_sigma)
        if i_iter % args.plot_every == 0:
            plot_rewards_history([tot_steps_trpo, tot_steps_np, tot_steps_mi],
                                 [avg_rewards_trpo, avg_rewards_np, avg_rewards_mi])

    """clean up gpu memory"""
    torch.cuda.empty_cache()