def train_dqn(env, args, workdir):
    action_space_size = env.action_space.n
    if not os.path.exists('train_log'):
        os.mkdir('train_log')
    writer = TensorBoard(f'train_log/{args.run_name}')
    dqn_config = dqn_config_default.copy()
    dqn_config.update({
        "batch_size": 4096,
        "min_replay_history": 40960,
        "training_steps": 4000,
        "lr": 0.0001,
        "target_update_period": 500
    })
    policy = DQNTorchPolicy(env.observation_space, env.action_space,
                            env.config, dqn_config)
    dqn_trainer = Trainer(env, policy, dqn_config)
    max_mean_reward = -1000
    debug = False
    mean_cost_list = []
    total_cost_list = []
    for i in range(args.iters):
        result = dqn_trainer.train(i)
        now_mean_reward = print_result(action_space_size, result, writer, i)
        mean_cost_list.append(now_mean_reward)
        if (i + 1) % 5 == 0:
            _total_value = draw_route(args, dqn_trainer, env, mean_cost_list, workdir)
            total_cost_list.append(_total_value)
            list_to_figure([total_cost_list], ['total_cost'], 'total_cost',
                           f'{workdir}/dqn_total_cost_{args.problem}.png')
def train_dqn(dqn_trainer, env, args, workdir, suffix):
    """Train a DQN trainer on the VRP env; periodically evaluate, render routes,
    and write cost curves under `workdir`, tagging output files with `suffix`."""
    action_space_size = env.action_space.n
    if not os.path.exists('train_log'):
        os.mkdir('train_log')
    writer = TensorBoard(f'train_log/{args.problem}_{args.run_name}')
    max_mean_reward = -1000
    mean_cost_list = []
    total_cost_list = []
    total_valid_cost_list = []
    # min_route_cost = 10000000
    for i in range(args.iters):
        print(suffix)
        dqn_trainer.switch_mode(eval_mode=False)
        result = dqn_trainer.train(i)
        now_mean_reward = print_result(result, writer, i,
                                       dqn_trainer.policies_to_train,
                                       action_space_size)
        # if now_mean_reward > max_mean_reward:
        #     dqn_trainer.save(f"{args.problem}_{suffix}", i)
        mean_cost_list.append(now_mean_reward)
        if (i + 1) % 5 == 0 or (now_mean_reward > max_mean_reward):
            reset_sequence = ((i + 1) % args.sequence_update_freq == 0)
            dqn_trainer.switch_mode(eval_mode=True)
            _total_cost, _valid_route = draw_route(
                args, dqn_trainer, env, mean_cost_list, workdir, suffix,
                (now_mean_reward > max_mean_reward), reset_sequence)
            total_cost_list.append(_total_cost)
            if _valid_route:
                total_valid_cost_list.append(_total_cost)
            list_to_figure([total_cost_list], ['total_cost'], 'total_cost',
                           f'{workdir}/dqn_total_cost_{args.problem}_{suffix}.png')
            if len(total_valid_cost_list) > 0:
                list_to_figure([total_valid_cost_list], ['total_valid_cost'],
                               'total_valid_cost',
                               f'{workdir}/dqn_total_valid_cost_{args.problem}_{suffix}.png')
        max_mean_reward = max(max_mean_reward, now_mean_reward)
    return mean_cost_list
def draw_route(args, trainer, env, mean_cost_list, workdir, suffix, is_render):
    plt.figure(figsize=(30, 30))
    plt.axis("on")
    G, pos, route_edges, total_cost, valid_route = rl_solution_to_graph(trainer, env)
    if is_render:
        labels = {}
        for node in G.nodes():
            labels[node] = node
        nx.draw_networkx_nodes(G, pos, node_size=1000)
        nx.draw_networkx_labels(G, pos, labels, font_size=30, font_color="black")
        cmap = matplotlib.cm.get_cmap('Spectral')
        max_vehicle_id = np.max(list(route_edges.keys())) + 1.0
        for vehicle_id in route_edges.keys():
            if len(route_edges[vehicle_id]) <= 0:
                continue
            nx.draw_networkx_edges(G, pos, width=2, arrows=True, arrowsize=100,
                                   edgelist=route_edges[vehicle_id],
                                   edge_color=cmap(vehicle_id / max_vehicle_id))
        # Save before show(): on interactive backends show() can clear the
        # current figure, which would leave an empty image on disk.
        plt.savefig(f'{workdir}/dqn_vrp_{args.problem}_{suffix}.png')
        plt.show()
    plt.close()
    list_to_figure([mean_cost_list], ['mean_cost'], 'mean_cost',
                   f'{workdir}/dqn_cost_{args.problem}_{suffix}.png')
    return total_cost, valid_route
def draw_route(args, trainer, env, mean_cost_list, workdir):
    """Roll out the trained policy deterministically on the knapsack env, report the
    packed items against the OR-Tools value, and plot the mean-cost curve."""
    items_in_pack = []
    total_value = 0.0
    total_weight = 0.0
    policy = trainer.get_policy()
    state = env.reset()
    for i in range(env.num_items):
        action, _, _ = policy.compute_single_action(state, info={}, explore=False)
        state, _, _, _ = env.step(action)
        if action == 1 and total_weight + env.weights[env.items_in_sequence[i]] <= env.capacity:
            items_in_pack.append(env.items_in_sequence[i])
            total_value += env.values[env.items_in_sequence[i]]
            total_weight += env.weights[env.items_in_sequence[i]]
    print('Total values = ', total_value, ', ortool values = ', env.get_ortool_value())
    print('Total weight = ', total_weight)
    print('Capacity = ', env.capacity)
    print('Packed items: ', items_in_pack)
    list_to_figure([mean_cost_list], ['mean_cost'], 'mean_cost',
                   f'{workdir}/dqn_cost_{args.problem}.png')
    return total_value
def train_dqn(dqn_trainer, env, args, workdir, suffix):
    action_space_size = env.action_space.n
    if not os.path.exists('train_log'):
        os.mkdir('train_log')
    writer = TensorBoard(f'train_log/{args.run_name}')
    max_mean_reward = -1000
    mean_cost_list = []
    total_cost_list = []
    total_valid_cost_list = []
    min_route_cost = 10000000
    for i in range(args.iters):
        print(suffix)
        result = dqn_trainer.train(i)
        now_mean_reward = print_result(action_space_size, result, writer, i)
        if now_mean_reward > max_mean_reward:
            dqn_trainer.policy.save_param(f"{args.problem}_{suffix}_best")
        mean_cost_list.append(now_mean_reward)
        if (i + 1) % 5 == 0 or (now_mean_reward > max_mean_reward):
            _total_cost, _valid_route = draw_route(
                args, dqn_trainer, env, mean_cost_list, workdir, suffix,
                is_render=(now_mean_reward > max_mean_reward))
            total_cost_list.append(_total_cost)
            if _valid_route:
                total_valid_cost_list.append(_total_cost)
            list_to_figure([total_cost_list], ['total_cost'], 'total_cost',
                           f'{workdir}/dqn_total_cost_{args.problem}_{suffix}.png')
            if len(total_valid_cost_list) > 0:
                list_to_figure([total_valid_cost_list], ['total_valid_cost'],
                               'total_valid_cost',
                               f'{workdir}/dqn_total_valid_cost_{args.problem}_{suffix}.png')
        max_mean_reward = max(max_mean_reward, now_mean_reward)
    return mean_cost_list
def train_ppo(args, env, knapsack_config, workdir, n_iterations):
    ext_conf = ppo.DEFAULT_CONFIG.copy()
    ext_conf.update({
        "num_workers": 2,
        "num_cpus_per_worker": 1,
        "vf_share_layers": True,
        "vf_loss_coeff": 1.0,
        "vf_clip_param": 100.0,
        "use_critic": True,
        "use_gae": True,
        "framework": "torch",
        "lambda": 1.0,
        "gamma": 1.0,
        "env_config": knapsack_config,
        "timesteps_per_iteration": knapsack_config['episode_len'],
        "batch_mode": "complete_episodes",
        # Size of batches collected from each worker.
        "rollout_fragment_length": args.rollout,
        # Number of timesteps collected for each SGD round. This defines the
        # size of each SGD epoch.
        "train_batch_size": args.batch_size * args.rollout,
        # Total SGD batch size across all devices for SGD. This defines the
        # minibatch size within each epoch.
        "sgd_minibatch_size": args.min_batch_size * args.rollout,
        # Number of SGD iterations in each outer loop (i.e., number of epochs
        # to execute per train batch).
        "num_sgd_iter": 100,
        "shuffle_sequences": True,
        "lr": 1e-4,
        "_fake_gpus": True,
        "num_gpus": 0,
        "num_gpus_per_worker": 0,
        "model": {"custom_model": "knapsack_model"},
        "explore": True,
        # "exploration_config": {
        #     # The Exploration class to use.
        #     "type": "EpsilonGreedy",
        #     # Config for the Exploration class' constructor:
        #     "initial_epsilon": 1.0,
        #     "final_epsilon": 0.02,
        #     # Timesteps over which to anneal epsilon.
        #     "epsilon_timesteps": args.rollout * args.batch_size * args.iters // 3,
        # },
        "exploration_config": {
            "type": StochasticSampling,
            "random_timesteps": args.rollout * args.batch_size * args.iters // 4,
        },
    })
    print(f"Environment: action space {env.action_space}, observation space {env.observation_space}")

    ppo_trainer = ppo.PPOTrainer(env=KnapsackEnv, config=ext_conf)
    # ppo_trainer.restore('/root/ray_results/PPO_CVRPEnv_2020-12-29_11-50-29uylrljyr/checkpoint_100/checkpoint-100')

    mean_cost_list = []
    total_cost_list = []
    for i in range(n_iterations):
        print("== Iteration", i, "==")
        trainer_result = ppo_trainer.train()
        print_training_results(trainer_result)
        # cost = env.total_cost - (trainer_result['episode_reward_mean'] * env.total_cost) / trainer_result['episode_len_mean']
        # cost = (1.0 - trainer_result['episode_reward_mean'] / trainer_result['episode_len_mean']) * env.max_cost * env.num_nodes
        cost = trainer_result['episode_reward_mean']
        mean_cost_list.append(cost)
        print('cost: ', cost)
        if (i + 1) % 5 == 0:
            checkpoint = ppo_trainer.save()
            print("checkpoint saved at", checkpoint)
            _total_value = draw_route(args, ppo_trainer, env, mean_cost_list, workdir)
            total_cost_list.append(_total_value)
            list_to_figure([total_cost_list], ['total_cost'], 'total_cost',
                           f'{workdir}/rl_knapsack_total_cost_{args.problem}.png')
    return ppo_trainer, mean_cost_list
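# Usage sketch (illustrative, not from the original source): one way train_ppo()
# above might be driven end to end. The PPO config references
# "custom_model": "knapsack_model", so that model must be registered with RLlib's
# ModelCatalog before the trainer is built. `KnapsackModel` is an assumed class
# name for that custom model; `KnapsackEnv` and the argparse flags come from
# elsewhere in this repository.
def run_ppo_knapsack(args, knapsack_config, workdir):
    import ray
    from ray.rllib.models import ModelCatalog

    ray.init(ignore_reinit_error=True)
    # Register the custom model under the name used in the PPO config above.
    ModelCatalog.register_custom_model("knapsack_model", KnapsackModel)  # assumed class name
    env = KnapsackEnv(knapsack_config)  # local instance, used for logging and greedy eval
    ppo_trainer, mean_cost_list = train_ppo(args, env, knapsack_config, workdir, args.iters)
    ray.shutdown()
    return ppo_trainer, mean_cost_list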
# python ma_dp_dqn_vrp.py --iters 1000 --problem A-n64-k9 --training-step 512 --num-agents 9 --priori-memory 1 --run-name pm1 --episode 128
# python ma_dp_dqn_vrp.py --iters 3000 --problem A-n32-k5 --training-step 320 --num-agents 5 --run-name tt --episode 31 --mode dp
if __name__ == "__main__":
    args = parser.parse_args()
    vrp_config = env_config.copy()
    vrp_config.update({
        'problem': args.problem,
        "constraint_id": args.constraint_id,
        "episode_len": args.episode
    })
    env = CVRPEnv(vrp_config)
    if args.pt:
        workdir = f"{os.environ['PT_OUTPUT_DIR']}/{args.problem}_{args.run_name}/"
    else:
        workdir = f"output/vrp/{args.problem}_{args.run_name}/"
    os.makedirs(workdir, exist_ok=True)

    metric_list = []
    metric_labels = []

    env.reset()
    env.is_constraint_imposed = False
    trainer_woc = create_trainer(env, args, workdir, 'woc')
    total_cost_list = train_dqn(trainer_woc, env, args, workdir, 'woc')
    metric_list.append(total_cost_list)
    metric_labels.append('mean_reward_without_constraint')
    list_to_figure(metric_list, metric_labels, 'mean reward of policies',
                   f'{workdir}/dqn_reward_{args.problem}.png', smoothed=False)
def train_dqn(rank, total_num_process, models, env, args, workdir, dqn_config, suffix):
    """Worker entry point for distributed DQN training: each process builds its own
    env/trainer around the shared `models`; the highest-ranked process additionally
    logs results, evaluates against OR-Tools, and renders routes."""
    torch.manual_seed(args.seed + rank)
    agent_policies = {}
    policies_to_train = []
    _env = CVRPEnv(env.config)
    for agent_id, vehicle_id in enumerate([0]):
        policy = DistDQNDPTorchPolicy(0, env.observation_space, env.action_space,
                                      dqn_config, models)
        agent_policies[vehicle_id] = policy
        policies_to_train.append(vehicle_id)
    dqn_trainer = Trainer(_env, agent_policies, policies_to_train, dqn_config)
    action_space_size = env.action_space.n
    # if not os.path.exists('train_log'):
    #     os.mkdir('train_log')
    # writer = TensorBoard(f'train_log/{args.problem}_{args.run_name}')
    max_mean_reward = -1000
    mean_cost_list = []
    total_cost_list = []
    total_valid_cost_list = []
    ortool_val_list = []
    cost_diff_list = []
    # pool = mp.Pool(8)
    for i in range(args.iters):
        print(f"===={suffix}=======iters: {i}======rank: {rank}===")
        dqn_trainer.switch_mode(eval_mode=False)
        result = dqn_trainer.train(i)
        if rank == total_num_process:
            now_mean_reward, _ = print_result(result, None, i,
                                              dqn_trainer.policies_to_train,
                                              action_space_size)
            # if now_mean_reward > max_mean_reward:
            #     dqn_trainer.save(f"{args.problem}_{suffix}", 'best')
            mean_cost_list.append(now_mean_reward)
            max_mean_reward = max(max_mean_reward, now_mean_reward)
        reset_sequence = False  # ((i+1) % args.sequence_update_freq == 0)
        if (rank == total_num_process) and ((i + 1) % args.render_freq == 0):
            dqn_trainer.switch_mode(eval_mode=True)
            tmp_total_cost_list = []
            tmp_total_valid_cost_list = []
            tmp_ortool_val_list = []
            for _ in range(args.eval_rounds):
                _total_cost, _valid_route, _ortool_val = draw_route(
                    args, dqn_trainer, env, mean_cost_list, workdir, suffix,
                    True, reset_sequence)
                tmp_total_cost_list.append(_total_cost)
                tmp_ortool_val_list.append(_ortool_val)
                if _valid_route:
                    tmp_total_valid_cost_list.append(_total_cost)
            total_cost_list.append(np.mean(tmp_total_cost_list))
            ortool_val_list.append(np.mean(tmp_ortool_val_list))
            if len(tmp_total_valid_cost_list) > 0:
                total_valid_cost_list.append(np.mean(tmp_total_valid_cost_list))
                cost_diff_list.append(total_valid_cost_list[-1] - ortool_val_list[-1])
            elif len(total_valid_cost_list) > 0:
                total_valid_cost_list.append(total_valid_cost_list[-1])
            else:
                total_valid_cost_list.append(0.0)
            list_to_figure([total_cost_list, ortool_val_list, total_valid_cost_list],
                           ['total_cost', 'ortool_cost', 'total_valid_cost'],
                           'total_cost',
                           f'{workdir}/dqn_total_cost_{args.problem}_{suffix}.png')
            list_to_figure([mean_cost_list], ['mean_cost'], 'mean_cost',
                           f'{workdir}/dqn_cost_{args.problem}_{suffix}.png')
            if len(cost_diff_list) > 0:
                list_to_figure([cost_diff_list], ['cost_diff'], 'cost_diff',
                               f'{workdir}/dqn_cost_diff_{args.problem}_{suffix}.png')
        sys.stdout.flush()
    return mean_cost_list
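# Launch sketch (illustrative, not from the original source): the rank /
# total_num_process / shared-`models` signature of the train_dqn() above suggests
# a Hogwild-style launch via torch.multiprocessing, where every worker updates the
# shared networks and only the highest rank evaluates and plots.
# `build_shared_models` is a hypothetical helper that constructs the policy
# networks and calls .share_memory() on them; the actual repo may differ.
def launch_distributed_dqn(env, args, workdir, dqn_config, suffix, num_workers=4):
    import torch.multiprocessing as mp

    models = build_shared_models(env, dqn_config)  # hypothetical helper
    processes = []
    # Ranks run from 1 to num_workers, so rank == total_num_process holds only
    # for the last worker, which does the logging and evaluation.
    for rank in range(1, num_workers + 1):
        p = mp.Process(target=train_dqn,
                       args=(rank, num_workers, models, env, args,
                             workdir, dqn_config, suffix))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()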