Example #1
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_eval'.format(args.env))

    # env for eval
    eval_env_params = EnvConfig['test_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    else:
        raise ValueError('unsupported framework: {}'.format(args.framework))
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    # restore trained agent
    agent.restore('./{}'.format(args.restore_model))

    # Evaluate episode
    for episode in range(args.eval_episodes):
        episode_reward = run_episode(agent, eval_env)
        tensorboard.add_scalar('eval/episode_reward', episode_reward, episode)
        logger.info('Evaluation episode reward: {}'.format(episode_reward))
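
The run_episode helper is defined elsewhere in the example. A minimal sketch of what it plausibly looks like, assuming LocalEnv exposes the usual Gym-style reset/step API (a hypothetical reconstruction, not the shipped implementation):

def run_episode(agent, env):
    # roll out one evaluation episode with deterministic actions
    obs = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        action = agent.predict(obs)
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    return episode_reward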
Example #2
    def __init__(self, port, monitor_port=None):
        self.ctx = zmq.Context()
        self.master_ip = get_ip_address()
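        # cluster monitor URL; monitor_port is optional, so this may be
        # "http://<ip>:None" when no monitor is attached (assumption from the default)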
        self.monitor_url = "http://{}:{}".format(self.master_ip, monitor_port)
        logger.set_dir(
            os.path.expanduser('~/.parl_data/master/{}_{}'.format(
                self.master_ip, port)))
        self.client_socket = self.ctx.socket(zmq.REP)
        self.client_socket.bind("tcp://*:{}".format(port))
        self.client_socket.linger = 0
        self.port = port

        self.job_center = JobCenter(self.master_ip)
        self.cluster_monitor = ClusterMonitor()
        self.master_is_alive = True
        self.client_hostname = defaultdict(int)
Example #3
    def _reply_heartbeat(self, target):
        """Worker will kill its jobs when it lost connection with the master.
        """

        socket = self.ctx.socket(zmq.REP)
        socket.linger = 0
        socket.setsockopt(zmq.RCVTIMEO,
                          remote_constants.HEARTBEAT_RCVTIMEO_S * 1000)
        heartbeat_master_port = socket.bind_to_random_port("tcp://*")
        self.master_heartbeat_address = "{}:{}".format(self.worker_ip,
                                                       heartbeat_master_port)

        logger.set_dir(
            os.path.expanduser('~/.parl_data/worker/{}'.format(
                self.master_heartbeat_address.replace(':', '_'))))

        self.heartbeat_socket_initialized.set()
        logger.info("[Worker] Connect to the master node successfully. "
                    "({} CPUs)".format(self.cpu_num))
        while self.master_is_alive and self.worker_is_alive:
            try:
                message = socket.recv_multipart()
                worker_status = self._get_worker_status()
                socket.send_multipart([
                    remote_constants.HEARTBEAT_TAG,
                    to_byte(str(worker_status[0])),
                    to_byte(str(worker_status[1])),
                    to_byte(worker_status[2]),
                    to_byte(str(worker_status[3]))
                ])
            except zmq.error.Again as e:
                self.master_is_alive = False
            except zmq.error.ContextTerminated as e:
                break
        socket.close(0)
        logger.warning(
            "[Worker] Lost connection with the master; exiting the reply-heartbeat loop."
        )
        self.worker_status.clear()
        self.log_server_proc.kill()
        self.log_server_proc.wait()
        # exit the worker
        self.worker_is_alive = False
        self.exit()
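
For context, a minimal sketch of the probe this REP socket answers. The real sender lives in the master process; this hypothetical client only illustrates the wire protocol implied above:

req = ctx.socket(zmq.REQ)
req.linger = 0
req.setsockopt(zmq.RCVTIMEO, remote_constants.HEARTBEAT_RCVTIMEO_S * 1000)
req.connect("tcp://{}".format(master_heartbeat_address))
req.send_multipart([remote_constants.HEARTBEAT_TAG])
reply = req.recv_multipart()  # HEARTBEAT_TAG followed by the four worker_status fields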
Example #4
    def run(self, job_address_sender, job_id_sender):
        """An infinite loop waiting for a new task.

        Args:
            job_address_sender(sending end of multiprocessing.Pipe): send the job address of reply_socket to the main process.
            job_id_sender(sending end of multiprocessing.Pipe): send the job id to the main process.
        """
        ctx = zmq.Context()

        # create the reply_socket
        reply_socket = ctx.socket(zmq.REP)
        job_port = reply_socket.bind_to_random_port(addr="tcp://*")
        reply_socket.linger = 0
        job_ip = get_ip_address()
        job_address = "{}:{}".format(job_ip, job_port)

        job_id = job_address.replace(':', '_') + '_' + str(int(time.time()))
        self.log_dir = os.path.expanduser('~/.parl_data/job/{}'.format(job_id))
        logger.set_dir(self.log_dir)
        logger.info(
            "[Job] Job {} initialized. Reply heartbeat socket Address: {}.".
            format(job_id, job_address))

        job_address_sender.send(job_address)
        job_id_sender.send(job_id)

        try:
            # receive source files from the actor and add their directory to the Python path
            envdir = self.wait_for_files(reply_socket, job_address)
            sys.path.insert(0, envdir)
            os.chdir(envdir)

            obj = self.wait_for_connection(reply_socket)
            assert obj is not None
            self.single_task(obj, reply_socket, job_address)
        except Exception as e:
            logger.error(
                "Error occurred while running a single task; this job will be reset.\nReason: {}"
                .format(e))
            traceback_str = str(traceback.format_exc())
            logger.error("traceback:\n{}".format(traceback_str))
Example #5
deterministic = True
if deterministic:
    seed = 772

    def seed_torch(seed):
        torch.manual_seed(seed)
        if torch.backends.cudnn.enabled:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

    np.random.seed(seed)
    random.seed(seed)
    seed_torch(seed)
    env.seed(seed)

logger.set_dir('./logs')

# parameters
num_frames = 20000
memory_size = 1000
batch_size = 32
target_update = 100

atom_size = 51
v_min = 0.0
v_max = 200.0

n_step = 3

gamma = 0.99
alpha = 0.2
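
The atom_size/v_min/v_max trio above defines the support of a categorical (C51-style) value distribution. A small sketch of what it implies, reusing the torch import assumed by the seeding code:

support = torch.linspace(v_min, v_max, atom_size)  # 51 evenly spaced atoms on [0, 200]
delta_z = (v_max - v_min) / (atom_size - 1)        # spacing between neighboring atoms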
Example #6
        save_path = os.path.join(
            logger.get_dir(), 'model_every_100_episodes/episodes-{}'.format(T))
        self.agent.save(save_path)

    def restore(self, model_path):
        logger.info('restore model from {}'.format(model_path))
        self.agent.restore(model_path)

    def add_episode_rpm(self, episode_rpm):
        for x in episode_rpm:
            self.rpm.append(
                obs=x[0], act=x[1], reward=x[2], next_obs=x[3], terminal=x[4])

    def pred_batch(self, obs):
        batch_obs = np.expand_dims(obs, axis=0)

        with self.model_lock:
            action = self.agent.predict(batch_obs.astype('float32'))

        action = np.squeeze(action, axis=0)
        return action


if __name__ == '__main__':
    from train_args import get_args
    args = get_args()
    if args.logdir is not None:
        logger.set_dir(args.logdir)

    learner = Learner(args)
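
train_args.get_args is not shown; a minimal hypothetical sketch covering only the flag used above:

# hypothetical train_args.py
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', default=None, help='directory for logger output')
    return parser.parse_args()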
Example #7
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        help='Mujoco environment name',
                        default='HalfCheetah-v1')
    parser.add_argument('--train_total_steps',
                        type=int,
                        default=int(55e5),
                        help='maximum training steps')
    parser.add_argument(
        '--test_every_steps',
        type=int,
        default=int(1e4),
        help='the step interval between two consecutive evaluations')
    parser.add_argument('--kappa', type=float, default=5.0, help='kappa')
    parser.add_argument('--epoch',
                        type=float,
                        default=10000.0,
                        help='epoch')
    parser.add_argument('--alpha', type=float, default=2.0, help='alpha')
    parser.add_argument('--seed', type=int, default=1, help='env seed')

    args = parser.parse_args()

    logger.set_dir('./train_log/{}_k_{}_e_{}_a_{}_s_{}_{}'.format(
        args.env, str(args.kappa), str(args.epoch), str(args.alpha),
        str(args.seed), time.strftime("%H%M%S")))
    csv_logger = CSVLogger(
        os.path.join(logger.get_dir(),
                     'ADER_{}_{}.csv'.format(args.env, str(args.seed))))
    main()
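
With the defaults above, the log directory resolves to something like ./train_log/HalfCheetah-v1_k_5.0_e_10000.0_a_2.0_s_1_143052, where the final component is the HHMMSS launch time.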
Example #8
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # env for eval
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    else:
        raise ValueError('unsupported framework: {}'.format(args.framework))
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps
        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))
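
run_evaluate_episodes is defined elsewhere in the example. A minimal sketch of its likely shape, assuming the same Gym-style LocalEnv API as in Example #1 (hypothetical):

def run_evaluate_episodes(agent, env, eval_episodes):
    avg_reward = 0.0
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = agent.predict(obs)  # greedy action for evaluation
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    return avg_reward / eval_episodes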
Example #9
                        help='Batch Size',
                        default=256)
    parser.add_argument('-d',
                        '--gamma',
                        type=float,
                        help='Discount Factor',
                        default=0.99)
    parser.add_argument('-c',
                        '--critic_lr',
                        default=3e-4,
                        type=float,
                        help='Critic Learning_rate')
    parser.add_argument('-a',
                        '--actor_lr',
                        default=3e-4,
                        type=float,
                        help='Actor Learning_rate')

    args = parser.parse_args()
    al, cl, dc, bs = args.actor_lr, args.critic_lr, args.gamma, args.batch_size
    print(
        "Start Training: actor/critic lr = %.4f / %.4f, batch size = %d, and gamma = %.2f"
        % (al, cl, bs, dc))
    print("#" * 60)

    # batch size + discount + actor lr + critic lr
    log_path = os.path.join('./log_dir', "%s_%s_%s_%s" % (bs, dc, al, cl))
    logger.set_dir(log_path)
    tb_logger = LogWriter(log_path)
    main()
Example #10
def main():
    # Prepare environments
    env = gym.make("pseudoslam:RobotExploration-v0")
    env = MonitorEnv(env, param={'goal': args.goal, 'obs': args.obs})

    # Init Prioritized Replay Memory
    per = ProportionalPER(alpha=0.6, seg_num=args.batch_size, size=MEMORY_SIZE)
    suffix = args.suffix + "_Rp{}_Goal{}_Obs{}".format(args.Rp, args.goal,
                                                       args.obs)
    logdir = os.path.join(args.logdir, suffix)
    os.makedirs(logdir, exist_ok=True)  # also creates missing parent directories
    logger.set_dir(logdir)
    modeldir = os.path.join(args.modeldir, suffix)
    os.makedirs(modeldir, exist_ok=True)

    # Prepare PARL agent
    act_dim = env.action_space.n
    model = AtariModel(act_dim)
    if args.alg == 'ddqn':
        algorithm = PrioritizedDoubleDQN(model,
                                         act_dim=act_dim,
                                         gamma=GAMMA,
                                         lr=LEARNING_RATE)
    elif args.alg == 'dqn':
        algorithm = PrioritizedDQN(model,
                                   act_dim=act_dim,
                                   gamma=GAMMA,
                                   lr=LEARNING_RATE)
    else:
        raise ValueError('unsupported algorithm: {}'.format(args.alg))
    agent = AtariAgent(algorithm, act_dim=act_dim, update_freq=UPDATE_FREQ)
    if os.path.exists(args.load):
        agent.restore(args.load)
    # Replay memory warmup
    total_step = 0
    with tqdm(total=MEMORY_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        mem = []
        while total_step < MEMORY_WARMUP_SIZE:
            total_reward, steps, _, _ = run_episode(env,
                                                    agent,
                                                    per,
                                                    mem=mem,
                                                    warmup=True)
            total_step += steps
            pbar.update(steps)
    per.elements.from_list(mem[:int(MEMORY_WARMUP_SIZE)])

    test_flag = 0
    total_steps = 0
    pbar = tqdm(total=args.train_total_steps)
    save_steps = 0
    while total_steps < args.train_total_steps:
        # start epoch
        total_reward, steps, loss, info = run_episode(env,
                                                      agent,
                                                      per,
                                                      train=True)
        total_steps += steps
        save_steps += steps
        pbar.set_description('[train]exploration:{}'.format(agent.exploration))
        summary.add_scalar('train/score', total_reward, total_steps)
        summary.add_scalar('train/loss', loss,
                           total_steps)  # mean of total loss
        summary.add_scalar('train/exploration', agent.exploration, total_steps)
        summary.add_scalar('train/steps', steps, total_steps)
        for key in info.keys():
            summary.add_scalar('train/' + key, info[key], total_steps)
        pbar.update(steps)

        if total_steps // args.test_every_steps >= test_flag:
            print('start test!')
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            pbar.write("testing")
            test_rewards = []
            for _ in tqdm(range(3), desc='eval agent'):
                eval_reward = run_evaluate_episode(env, agent)
                test_rewards.append(eval_reward)
            eval_reward = np.mean(test_rewards)
            logger.info(
                "eval_agent done, (steps, eval_reward): ({}, {})".format(
                    total_steps, eval_reward))
            summary.add_scalar('eval/reward', eval_reward, total_steps)
        if save_steps >= 100000:
            modeldir_ = os.path.join(modeldir, 'itr_{}'.format(total_steps))
            os.makedirs(modeldir_, exist_ok=True)
            print('save model!', modeldir_)
            agent.save(modeldir_)
            save_steps = 0

    pbar.close()
Example #11
                                   total_steps)
                summary.add_scalar('dqn/Q value',
                                   evaluate_fixed_Q(agent, fixed_obs),
                                   total_steps)
                summary.add_scalar('dqn/grad_norm',
                                   get_grad_norm(agent.alg.model), total_steps)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--rom', default='rom_files/breakout.bin')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size for training')
    parser.add_argument('--lr', type=float, default=3e-4, help='learning rate')
    parser.add_argument('--algo', default='DQN', help='DQN/Double/Dueling DQN')
    parser.add_argument('--train_total_steps',
                        type=int,
                        default=int(1e7),
                        help='maximum environmental steps of games')
    parser.add_argument(
        '--test_every_steps',
        type=int,
        default=int(1e5),
        help='the step interval between two consecutive evaluations')
    args = parser.parse_args()
    rom_name = args.rom.split('/')[-1].split('.')[0]
    logger.set_dir(os.path.join('./train_log', rom_name))
    main()
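
evaluate_fixed_Q and get_grad_norm are helpers defined elsewhere in the script. A plausible sketch of the gradient-norm helper, assuming a PyTorch model (hypothetical, not the shipped implementation):

def get_grad_norm(model):
    # global L2 norm over all parameter gradients
    total = 0.0
    for p in model.parameters():
        if p.grad is not None:
            total += p.grad.norm(2).item() ** 2
    return total ** 0.5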
Example #12
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(env, agent, render=False)  # set render=True to visualize
        logger.info('episode:{}    e_greed:{}   test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))

    # training finished; save the model
    save_path = 'saved_model/dqn_model_%s_%s.ckpt' % (args.model, args.lr)
    agent.save(save_path)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-l',
                        '--lr',
                        default=3e-4,
                        type=float,
                        help='learning_rate')
    parser.add_argument('-m',
                        dest='model',
                        default='v1',
                        help='neural_network')
    args = parser.parse_args()
    print("Start Training: learning rate = %.4f, model = model %s" %
          (args.lr, args.model))
    print("#" * 50)
    logger.set_dir(os.path.join('./train_log',
                                "%s_%s" % (args.model, args.lr)))
    main()
Example #13
from smac.env import StarCraft2Env
from env_wrapper import SC2EnvWrapper
from replay_buffer import EpisodeExperience, EpisodeReplayBuffer
from qmixer_model import QMixerModel
from rnn_model import RNNModel
from parl.algorithms import QMIX
from qmix_agent import QMixAgent
import parl
from parl.utils import logger
from parl.utils import summary
import numpy as np
from copy import deepcopy
from qmix_config import QMixConfig

logger.set_dir('./log_path')


def run_train_episode(env, agent, rpm, config):
    episode_limit = config['episode_limit']
    agent.reset_agent()
    episode_reward = 0.0
    episode_step = 0
    terminated = False
    state, obs = env.reset()

    episode_experience = EpisodeExperience(episode_limit)

    while not terminated:
        available_actions = env.get_available_actions()
        actions = agent.sample(obs, available_actions)
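
The excerpt ends mid-loop. One plausible continuation of the episode loop, assuming env.step returns the next state, next observations, reward, and a terminated flag, and that EpisodeExperience exposes an add() method (a hypothetical reconstruction; the real API may differ):

        next_state, next_obs, reward, terminated = env.step(actions)
        # hypothetical add() signature; the real buffer API may differ
        episode_experience.add(state, actions, reward, terminated, obs,
                               available_actions)
        state, obs = next_state, next_obs
        episode_reward += reward
        episode_step += 1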