def main(): logger.info("-----------------Carla_SAC-------------------") logger.set_dir('./{}_eval'.format(args.env)) # env for eval eval_env_params = EnvConfig['test_env_params'] eval_env = LocalEnv(args.env, eval_env_params) obs_dim = eval_env.obs_dim action_dim = eval_env.action_dim # Initialize model, algorithm, agent if args.framework == 'torch': CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent elif args.framework == 'paddle': CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent model = CarlaModel(obs_dim, action_dim) algorithm = SAC( model, gamma=GAMMA, tau=TAU, alpha=ALPHA, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) agent = CarlaAgent(algorithm) # restore trained agent agent.restore('./{}'.format(args.restore_model)) # Evaluate episode for episode in range(args.eval_episodes): episode_reward = run_episode(agent, eval_env) tensorboard.add_scalar('eval/episode_reward', episode_reward, episode) logger.info('Evaluation episode reward: {}'.format(episode_reward))
def __init__(self, port, monitor_port=None):
    self.ctx = zmq.Context()
    self.master_ip = get_ip_address()
    self.monitor_url = "http://{}:{}".format(self.master_ip, monitor_port)
    logger.set_dir(
        os.path.expanduser('~/.parl_data/master/{}_{}'.format(
            self.master_ip, port)))
    self.client_socket = self.ctx.socket(zmq.REP)
    self.client_socket.bind("tcp://*:{}".format(port))
    self.client_socket.linger = 0
    self.port = port

    self.job_center = JobCenter(self.master_ip)
    self.cluster_monitor = ClusterMonitor()
    self.master_is_alive = True
    self.client_hostname = defaultdict(int)
def _reply_heartbeat(self, target):
    """Worker will kill its jobs when it loses connection with the master."""

    socket = self.ctx.socket(zmq.REP)
    socket.linger = 0
    socket.setsockopt(zmq.RCVTIMEO,
                      remote_constants.HEARTBEAT_RCVTIMEO_S * 1000)
    heartbeat_master_port = socket.bind_to_random_port("tcp://*")
    self.master_heartbeat_address = "{}:{}".format(self.worker_ip,
                                                   heartbeat_master_port)

    logger.set_dir(
        os.path.expanduser('~/.parl_data/worker/{}'.format(
            self.master_heartbeat_address.replace(':', '_'))))
    self.heartbeat_socket_initialized.set()

    logger.info("[Worker] Connect to the master node successfully. "
                "({} CPUs)".format(self.cpu_num))
    while self.master_is_alive and self.worker_is_alive:
        try:
            message = socket.recv_multipart()
            worker_status = self._get_worker_status()
            socket.send_multipart([
                remote_constants.HEARTBEAT_TAG,
                to_byte(str(worker_status[0])),
                to_byte(str(worker_status[1])),
                to_byte(worker_status[2]),
                to_byte(str(worker_status[3]))
            ])
        except zmq.error.Again as e:
            self.master_is_alive = False
        except zmq.error.ContextTerminated as e:
            break
    socket.close(0)
    logger.warning(
        "[Worker] lost connection with the master, will exit reply heartbeat for master."
    )
    self.worker_status.clear()
    self.log_server_proc.kill()
    self.log_server_proc.wait()
    # exit the worker
    self.worker_is_alive = False
    self.exit()
def run(self, job_address_sender, job_id_sender):
    """An infinite loop waiting for a new task.

    Args:
        job_address_sender(sending end of multiprocessing.Pipe): send the
            address of this job's reply_socket to the main process.
        job_id_sender(sending end of multiprocessing.Pipe): send the job id
            to the main process.
    """
    ctx = zmq.Context()

    # create the reply_socket
    reply_socket = ctx.socket(zmq.REP)
    job_port = reply_socket.bind_to_random_port(addr="tcp://*")
    reply_socket.linger = 0
    job_ip = get_ip_address()
    job_address = "{}:{}".format(job_ip, job_port)

    job_id = job_address.replace(':', '_') + '_' + str(int(time.time()))
    self.log_dir = os.path.expanduser('~/.parl_data/job/{}'.format(job_id))
    logger.set_dir(self.log_dir)
    logger.info(
        "[Job] Job {} initialized. Reply heartbeat socket Address: {}.".format(
            job_id, job_address))

    job_address_sender.send(job_address)
    job_id_sender.send(job_id)

    try:
        # receive source code from the actor and append it to the
        # environment variables.
        envdir = self.wait_for_files(reply_socket, job_address)
        sys.path.insert(0, envdir)
        os.chdir(envdir)

        obj = self.wait_for_connection(reply_socket)
        assert obj is not None
        self.single_task(obj, reply_socket, job_address)
    except Exception as e:
        logger.error(
            "Error occurs when running a single task. We will reset this job. "
            "\nReason: {}".format(e))
        traceback_str = str(traceback.format_exc())
        logger.error("traceback:\n{}".format(traceback_str))
deterministic = True
if deterministic:
    seed = 772

    def seed_torch(seed):
        torch.manual_seed(seed)
        if torch.backends.cudnn.enabled:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

    np.random.seed(seed)
    random.seed(seed)
    seed_torch(seed)
    env.seed(seed)

logger.set_dir('./logs')

# parameters
num_frames = 20000
memory_size = 1000
batch_size = 32
target_update = 100

atom_size = 51
v_min = 0.0
v_max = 200.0

n_step = 3
gamma = 0.99
alpha = 0.2
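# ---------------------------------------------------------------------------
# Note (not part of the original file): atom_size, v_min and v_max are the
# usual Categorical-DQN (C51) settings. A minimal sketch of how the value
# distribution support is typically built from them, assuming the surrounding
# Rainbow example follows the standard construction:
support = torch.linspace(v_min, v_max, atom_size)  # atom locations in [v_min, v_max]
delta_z = (v_max - v_min) / (atom_size - 1)        # spacing between adjacent atoms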
        save_path = os.path.join(
            logger.get_dir(),
            'model_every_100_episodes/episodes-{}'.format(T))
        self.agent.save(save_path)

    def restore(self, model_path):
        logger.info('restore model from {}'.format(model_path))
        self.agent.restore(model_path)

    def add_episode_rpm(self, episode_rpm):
        for x in episode_rpm:
            self.rpm.append(
                obs=x[0], act=x[1], reward=x[2], next_obs=x[3], terminal=x[4])

    def pred_batch(self, obs):
        batch_obs = np.expand_dims(obs, axis=0)
        with self.model_lock:
            action = self.agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action, axis=0)
        return action


if __name__ == '__main__':
    from train_args import get_args
    args = get_args()
    if args.logdir is not None:
        logger.set_dir(args.logdir)
    learner = Learner(args)
parser = argparse.ArgumentParser()
parser.add_argument(
    '--env', help='Mujoco environment name', default='HalfCheetah-v1')
parser.add_argument(
    '--train_total_steps',
    type=int,
    default=int(55e5),
    help='maximum training steps')
parser.add_argument(
    '--test_every_steps',
    type=int,
    default=int(1e4),
    help='the step interval between two consecutive evaluations')
parser.add_argument('--kappa', type=float, default=float(5), help='kappa')
parser.add_argument('--epoch', type=float, default=float(10000), help='epoch')
parser.add_argument('--alpha', type=float, default=float(2), help='alpha')
parser.add_argument('--seed', type=int, default=int(1), help='env seed')
args = parser.parse_args()

logger.set_dir('./train_log/{}_k_{}_e_{}_a_{}_s_{}_{}'.format(
    args.env, str(args.kappa), str(args.epoch), str(args.alpha),
    str(args.seed), time.strftime("%H%M%S")))
csv_logger = CSVLogger(
    os.path.join(logger.get_dir(),
                 'ADER_{}_{}.csv'.format(args.env, str(args.seed))))

main()
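# ---------------------------------------------------------------------------
# Note (not part of the original file): csv_logger is presumably consumed
# inside main(). A minimal sketch of how an evaluation result could be written
# to it, assuming parl.utils.CSVLogger exposes log_dict() and using
# illustrative field names:
def log_eval_result_sketch(csv_logger, total_steps, avg_reward):
    # hypothetical helper: write one evaluation result as a CSV row
    csv_logger.log_dict({'Steps': total_steps, 'Value': avg_reward})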
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # env for eval
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps

        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))
    help='Batch Size', default=256)
parser.add_argument(
    '-d', '--gamma', type=float, help='Discount Factor', default=0.99)
parser.add_argument(
    '-c', '--critic_lr', default=3e-4, type=float, help='Critic Learning_rate')
parser.add_argument(
    '-a', '--actor_lr', default=3e-4, type=float, help='Actor Learning_rate')
args = parser.parse_args()

al, cl, dc, bs = args.actor_lr, args.critic_lr, args.gamma, args.batch_size
print(
    "Start Training: actor/critic lr = %.4f / %.4f, batch size = %d, and gamma = %.2f"
    % (al, cl, bs, dc))
print("#" * 60)

# batch size + discount + actor lr + critic lr
log_path = os.path.join('./log_dir', "%s_%s_%s_%s" % (bs, dc, al, cl))
logger.set_dir(log_path)
tb_logger = LogWriter(log_path)

main()
def main():
    # Prepare environments
    # env = get_player(
    #     args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    # test_env = get_player(
    #     args.rom,
    #     image_size=IMAGE_SIZE,
    #     frame_skip=FRAME_SKIP,
    #     context_len=CONTEXT_LEN)
    env = gym.make("pseudoslam:RobotExploration-v0")
    env = MonitorEnv(env, param={'goal': args.goal, 'obs': args.obs})

    # obs = env.reset()
    # print(obs.shape)
    # raise NotImplementedError

    # Init Prioritized Replay Memory
    per = ProportionalPER(alpha=0.6, seg_num=args.batch_size, size=MEMORY_SIZE)

    suffix = args.suffix + "_Rp{}_Goal{}_Obs{}".format(args.Rp, args.goal,
                                                       args.obs)
    logdir = os.path.join(args.logdir, suffix)
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    logger.set_dir(logdir)
    modeldir = os.path.join(args.modeldir, suffix)
    if not os.path.exists(modeldir):
        os.mkdir(modeldir)

    # Prepare PARL agent
    act_dim = env.action_space.n
    model = AtariModel(act_dim)
    if args.alg == 'ddqn':
        algorithm = PrioritizedDoubleDQN(
            model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    elif args.alg == 'dqn':
        algorithm = PrioritizedDQN(
            model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = AtariAgent(algorithm, act_dim=act_dim, update_freq=UPDATE_FREQ)
    if os.path.exists(args.load):
        agent.restore(args.load)

    # Replay memory warmup
    total_step = 0
    with tqdm(total=MEMORY_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        mem = []
        while total_step < MEMORY_WARMUP_SIZE:
            total_reward, steps, _, _ = run_episode(
                env, agent, per, mem=mem, warmup=True)
            total_step += steps
            pbar.update(steps)
    per.elements.from_list(mem[:int(MEMORY_WARMUP_SIZE)])

    # env_name = args.rom.split('/')[-1].split('.')[0]
    test_flag = 0
    total_steps = 0
    pbar = tqdm(total=args.train_total_steps)
    save_steps = 0
    while total_steps < args.train_total_steps:
        # start epoch
        total_reward, steps, loss, info = run_episode(
            env, agent, per, train=True)
        total_steps += steps
        save_steps += steps
        pbar.set_description('[train]exploration:{}'.format(agent.exploration))
        summary.add_scalar('train/score', total_reward, total_steps)
        summary.add_scalar('train/loss', loss, total_steps)  # mean of total loss
        summary.add_scalar('train/exploration', agent.exploration, total_steps)
        summary.add_scalar('train/steps', steps, total_steps)
        for key in info.keys():
            summary.add_scalar('train/' + key, info[key], total_steps)
        pbar.update(steps)

        if total_steps // args.test_every_steps >= test_flag:
            print('start test!')
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            pbar.write("testing")
            test_rewards = []
            for _ in tqdm(range(3), desc='eval agent'):
                eval_reward = run_evaluate_episode(env, agent)
                test_rewards.append(eval_reward)
            eval_reward = np.mean(test_rewards)
            logger.info(
                "eval_agent done, (steps, eval_reward): ({}, {})".format(
                    total_steps, eval_reward))
            summary.add_scalar('eval/reward', eval_reward, total_steps)

        if save_steps >= 100000:
            modeldir_ = os.path.join(modeldir, 'itr_{}'.format(total_steps))
            if not os.path.exists(modeldir_):
                os.mkdir(modeldir_)
            print('save model!', modeldir_)
            agent.save(modeldir_)
            save_steps = 0

    pbar.close()
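# ---------------------------------------------------------------------------
# Note (not part of the original file): the os.mkdir calls above fail if the
# parent of args.logdir / args.modeldir does not exist. If that can happen in
# your setup, a small helper along these lines is a more robust alternative:
def ensure_dir_sketch(path):
    # hypothetical helper: create the directory (and any missing parents),
    # tolerating the case where it already exists
    os.makedirs(path, exist_ok=True)
    return path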
                       total_steps)
    summary.add_scalar('dqn/Q value', evaluate_fixed_Q(agent, fixed_obs),
                       total_steps)
    summary.add_scalar('dqn/grad_norm', get_grad_norm(agent.alg.model),
                       total_steps)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--rom', default='rom_files/breakout.bin')
    parser.add_argument(
        '--batch_size', type=int, default=32, help='batch size for training')
    parser.add_argument('--lr', default=3e-4, type=float, help='learning_rate')
    parser.add_argument('--algo', default='DQN', help='DQN/Double/Dueling DQN')
    parser.add_argument(
        '--train_total_steps',
        type=int,
        default=int(1e7),
        help='maximum environmental steps of games')
    parser.add_argument(
        '--test_every_steps',
        type=int,
        default=int(1e5),
        help='the step interval between two consecutive evaluations')
    args = parser.parse_args()

    rom_name = args.rom.split('/')[-1].split('.')[0]
    logger.set_dir(os.path.join('./train_log', rom_name))
    main()
        total_reward = run_episode(env, agent, rpm)
        episode += 1

        # test part
        eval_reward = evaluate(env, agent, render=False)  # set render=True to visualize the rollout
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))

    # training finished, save the model
    save_path = 'saved_model/dqn_model_%s_%s.ckpt' % (args.model, args.lr)
    agent.save(save_path)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-l', '--lr', default=3e-4, type=float, help='learning_rate')
    parser.add_argument('-m', dest='model', default='v1', help='neural_network')
    args = parser.parse_args()

    print("Start Training: learning rate = %.4f, model = model %s" %
          (args.lr, args.model))
    print("#" * 50)
    logger.set_dir(os.path.join('./train_log', "%s_%s" % (args.model, args.lr)))
    main()
from smac.env import StarCraft2Env
from env_wrapper import SC2EnvWrapper
from replay_buffer import EpisodeExperience, EpisodeReplayBuffer
from qmixer_model import QMixerModel
from rnn_model import RNNModel
from parl.algorithms import QMIX
from qmix_agent import QMixAgent
import parl
from parl.utils import logger
from parl.utils import summary
import numpy as np
from copy import deepcopy
from qmix_config import QMixConfig

logger.set_dir('./log_path')


def run_train_episode(env, agent, rpm, config):
    episode_limit = config['episode_limit']
    agent.reset_agent()
    episode_reward = 0.0
    episode_step = 0
    terminated = False
    state, obs = env.reset()
    episode_experience = EpisodeExperience(episode_limit)

    while not terminated:
        available_actions = env.get_available_actions()
        actions = agent.sample(obs, available_actions)