Example #1
def main():
    mp.set_start_method('spawn')
    config = Config()
    # 1. Initialize the environment
    env = NormalizedEnv(gym.make('Pendulum-v0'))

    # 2. Initialize the agent
    agent = DDPGAgent(env=env,
                      seed=config.seed,
                      batch_size=config.batch_size,
                      learning_rate_actor=config.learning_rate_actor,
                      learning_rate_critic=config.learning_rate_critic,
                      weight_decay=config.weight_decay)
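    # Put the target actor's parameters into shared memory so the collector
    # and test processes started below read the learner's latest weights.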
    agent.target_actor.share_memory()
    # 3. Initialize the replay memory
    memory = ReplayMemory(config.capacity)

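    # Bounded queue through which the collector processes stream transitions
    # back to this learner process.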
    q = mp.Queue(10)

    process_collect_list = []
    for i in range(config.agent_num):
        process_name = "collect_process_" + str(i)
        process = mp.Process(name=process_name,
                             target=collect_porcess,
                             args=(i, q, agent.target_actor))
        process.start()
        process_collect_list.append(process)

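    # Shared counter ('d' = double): incremented by the learner loop, polled by
    # the test process to schedule evaluation, and used below to schedule saves.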
    steps = mp.Value('d', 0)
    test_p = mp.Process(name="test_process",
                        target=test_process,
                        args=(config, steps, agent.target_actor))
    test_p.start()
    process_collect_list.append(test_p)

    try:
        while True:
            # qsize() is approximate across processes, but close enough to
            # drain whatever is currently queued.
            pending = q.qsize()
            while pending:
                mem = q.get()
                memory.push(mem[0], mem[1], mem[2], mem[3], mem[4])
                pending -= 1
            # 4.4 Learn once enough transitions are stored
            if memory.len > config.batch_size:
                agent.learning(memory)
            # save model
            if steps.value > 1 and steps.value % config.save_steps == 0:
                agent.save_models(steps.value / config.save_steps)
            steps.value += 1
    except Exception as e:
        print(e)
    except KeyboardInterrupt:
        # Ctrl-C also reaches the child processes; wait for them to exit.
        for process in process_collect_list:
            process.join()
            print(process.name + " stopped")
    env.close()
Example #2
def collect_porcess(agent_index, queue_mem, acrot_param):
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    agent = Action(state_dim=env.observation_space.shape[0],
                   action_dim=env.action_space.shape[0])
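    # Each episode: reload the latest shared-memory actor parameters, roll out
    # one episode with exploration, and push every transition
    # (state, action, next_state, reward, mask) into the queue.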
    try:
        while True:
            done = False
            state = env.reset()
            state = (state - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            agent.load_param(acrot_param)
            print("agent {} load param".format(agent_index))

            while not done:
                action = agent.chose_action(state, explort=True)
                next_state, reward, done, _ = env.step(action)
                # env.render()
                next_state = (next_state - env.observation_space.low) / (
                    env.observation_space.high - env.observation_space.low)
                # Store a continuation mask: 0 when the episode terminated.
                is_done = 0 if done else 1
                queue_mem.put((state, action, next_state, reward, is_done))
                state = next_state
    except Exception as e:
        print(e)
        print("agent {} exit".format(agent_index))
        env.close()
Example #3
def test_process(config, steps, target_actor):
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    agent = Action(state_dim=env.observation_space.shape[0],
                   action_dim=env.action_space.shape[0])
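    # Periodically copy the shared target-actor parameters, run a batch of
    # evaluation episodes without exploration, and plot the average reward.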
    reward_list = []
    try:
        while True:
            # for test
            if (steps.value) != 0 and (steps.value % config.test_every_eposide
                                       == 0):
                agent.load_param(target_actor)
                print("test agent load param ")
                et_reward = 0
                for index in range(config.num_eposide_test):
                    eposide = 0
                    state = env.reset()
                    state = (state - env.observation_space.low) / (
                        env.observation_space.high - env.observation_space.low)

                    while True:
                        action = agent.chose_action(state, explort=False)
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                        next_state = (next_state - env.observation_space.low
                                      ) / (env.observation_space.high -
                                           env.observation_space.low)
                        eposide += reward
                        state = next_state
                        if done:
                            break
                    et_reward += eposide
                print("\033[93m [ test ] episode average reward : {}\033[00m".
                      format(et_reward / config.num_eposide_test))
                reward_list.append(et_reward / config.num_eposide_test)

                # Redraw the curve from scratch so repeated calls do not stack
                # lines on the same axes.
                plt.clf()
                x = np.arange(len(reward_list))
                y = np.array(reward_list)
                plt.plot(x, y)
                plt.savefig("./eposide_reward.png")

    except Exception as e:
        print(e)
        print("test process exit")
        env.close()
Example #4
def main():
    env = NormalizedEnv(gym.make('Pendulum-v0'))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    agent = Actor(state_dim, action_dim).to('cuda')

    agent.load_state_dict(torch.load('./Models/78.0_actor.pt'))

    eposide = 0
    done = False
    eposide_list = []
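    # Roll out 100 evaluation episodes with the loaded actor and record the
    # return of each episode.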
    while eposide < 100:
        eposide_reward = 0
        state = env.reset()
        state = (state - env.observation_space.low) / (
            env.observation_space.high - env.observation_space.low)
        state = to_tensor(state)
        while not done:
            action = agent(state).detach().cpu().numpy()
            state_, reward, done, _ = env.step(action)
            state_ = (state_ - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            env.render()
            state = to_tensor(state_)
            eposide_reward += reward

        eposide_list.append(eposide_reward)
        eposide += 1
        done = False
        print("{} : {}".format(eposide, eposide_reward))

    import matplotlib.pyplot as plt
    x = np.arange(100)
    y = np.array(eposide_list)
    plt.plot(x, y)
    plt.savefig("./test_eposide_reward.png")

    env.close()
Example #5
            action = trainer.select_action(observation)
            observation2, reward, done, info = env.step(action)
            observation2 = deepcopy(observation2)
            if step >= MAX_STEP_PER_EPISODE - 1:
                done = True

            # trainer store transitions and update all networks
            trainer.observe(reward, observation2, done)
            trainer.update_all()
            episode_reward += reward
            observation = deepcopy(observation2)
        print('Training Episode {}, Episode Reward is:{}'.format(
            episode, episode_reward))
        if episode % EVALUATING_EPISODE_INTERVAL == 0:
            policy = lambda x: trainer.select_action(x, decay_epsilon=False)
            evaluator(env,
                      policy,
                      debug=True,
                      visualize=OPEN_VISUALIZATION_EVA,
                      save=True)


'''
    Entrance of Main Program
'''
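# Build the normalized environment, infer state/action dimensions, and train a
# DDPG agent with periodic evaluation.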
env = NormalizedEnv(gym.make(ENVIRONMENT))
nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]

ddpg_trainer = DDPG_trainer(nb_states, nb_actions)
train(ddpg_trainer, env)
Example #6
parser.add_argument('--log_interval', default=50, type=int) #
parser.add_argument('--load', default=False, type=bool) # load model
parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work
parser.add_argument('--exploration_noise', default=0.1, type=float)
parser.add_argument('--max_episode', default=10000, type=int) # num of games
parser.add_argument('--num_episode', default=0, type=int)
parser.add_argument('--print_log', default=5, type=int)
parser.add_argument('--update_iteration', default=200, type=int)
args = parser.parse_args()

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print('env:', args.env_name)
print('seed:', args.random_seed)
script_name = os.path.basename(__file__)
eps = np.finfo(np.float32).eps
env = NormalizedEnv(gym.make(args.env_name))

if args.random_seed:
    env.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
min_Val = torch.tensor(1e-7).float().to(device) # min value

directory = './exp' + script_name + 'Seed' + str(args.random_seed) + args.env_name + './'

def normal_R_V(R_, current_Q, reward):
    R_ = np.array(R_)
Example #7
                        type=int,
                        help='linear decay of exploration policy')
    parser.add_argument('--seed', default=-1, type=int, help='')
    parser.add_argument('--resume',
                        default='default',
                        type=str,
                        help='Resuming model path for testing')
    # parser.add_argument('--l2norm', default=0.01, type=float, help='l2 weight decay') # TODO
    # parser.add_argument('--cuda', dest='cuda', action='store_true') # TODO

    args = parser.parse_args()
    args.output = get_output_folder(args.output, args.env)
    if args.resume == 'default':
        args.resume = 'output/{}-run0'.format(args.env)

    env = NormalizedEnv(gym.make(args.env))

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(args.validate_episodes,
                         args.validate_steps,
                         args.output,
                         max_episode_length=args.max_episode_length)

    if args.mode == 'train':
Example #8
    parser.add_argument('--l2norm', default=0.01, type=float, help='l2 weight decay') # TODO

    args = parser.parse_args()
    # Concatenate args.output with args.env
    if args.resume is None:
        args.output = get_output_folder(args.output, args.env)
    else:
        args.output = args.resume

    if args.env == "KukaGym":
        env = KukaGymEnv(renders=False, isDiscrete=True)
    elif args.discrete:
        env = gym.make(args.env)
        env = env.unwrapped
    else:
        env = NormalizedEnv(gym.make(args.env))

    # input random seed
    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    # input states count & actions count
    print(env.observation_space.shape, env.action_space.shape)
    nb_states = env.observation_space.shape[0]
    if args.discrete:
        nb_actions = env.action_space.n
    else:
        nb_actions = env.action_space.shape[0]

    env = fastenv(env, args.action_repeat, args.vis)
Example #9
                stats["Surrogate loss"] = surrafter
                summary = tf.Summary()
                for k, v in stats.items():
                    print(k + ": " + " " * (40 - len(k)) + str(v))
                    if k != "Time elapsed":
                        summary.value.add(tag=k, simple_value=float(v))
                # save stats
                self.summary_writer.add_summary(summary, i)
                self.summary_writer.flush()
                # entropy != entropy is only true for NaN; abort on divergence.
                if entropy != entropy:
                    exit(-1)
                """
                if exp > 0.8:
                    self.train = False
                """
            i += 1


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.DEBUG)

    args = parser.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)
    env = gym.make(args.env_id)
    env = NormalizedEnv(env, normalize_obs=True)

    agent = TRPOAgent(env, args)
    agent.learn()
Example #10
                [path["rewards"].sum() for path in paths])

            print("\n********** Iteration %i ************" % i)
            if episoderewards.mean() >= self.env._spec.reward_threshold:
                print("Solved Env")
                self.solved = True

            stats = {}
            numeptotal += len(episoderewards)
            stats["Total number of episodes"] = numeptotal
            stats["Average sum of rewards per episode"] = episoderewards.mean()
            for k, v in stats.items():
                print(k + ": " + " " * (40 - len(k)) + str(v))

            i += 1


if __name__ == '__main__':

    args = parser.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)
    env = gym.make(args.env_id)
    if args.use_pixels:
        env = JacoCombiEnv(env, is_rgb=True, is_depth=True)
    else:
        env = NormalizedEnv(env)
    agent = AsyncNGAgent(env, args)
    agent.deploy()
Example #11
    from util import get_output_folder, setup_logger
    from wolp_agent import WolpertingerAgent

    args.save_model_dir = get_output_folder('../output', args.env)

    env = gym.make(args.env)
    continuous = None
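    # Probe the action space: a Box space exposes shape[0], while a Discrete
    # space raises IndexError there, which routes to the discrete branch below.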
    try:
        # continuous action
        nb_states = env.observation_space.shape[0]
        nb_actions = env.action_space.shape[0]
        action_high = env.action_space.high
        action_low = env.action_space.low
        continuous = True
        env = NormalizedEnv(env)
    except IndexError:
        # discrete action for 1 dimension
        nb_states = env.observation_space.shape[0]
        nb_actions = 1  # dimensionality of the action; usually 1, but it depends on the environment.
        max_actions = env.action_space.n
        continuous = False

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    if continuous:
        agent_args = {
            'continuous': continuous,
            'max_actions': None,
Example #12
    args.output = get_output_folder(args.output, args.env)

    if args.debug:
        print('Writing to {}'.format(args.output))

    writer = SummaryWriter(args.output)
    with open(os.path.join(args.output, 'cmdline.txt'), 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    bullet = ("Bullet" in args.env)
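    # Importing pybullet_envs registers the Bullet environments with gym.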
    if bullet:
        import pybullet
        import pybullet_envs
        
    env = NormalizedEnv(gym.make(args.env))

    # input random seed
    if args.seed > 0:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        env.seed(args.seed)
        if args.cuda:
            torch.cuda.manual_seed(args.seed)

    # state and action dimensions
    print('observation_space', env.observation_space.shape, 'action_space', env.action_space.shape)
    nb_status = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]
    
Example #13
def main(args):

    env = make_env('simple_tag')
    env = NormalizedEnv(env)

    kwargs = dict()
    kwargs['config'] = args

    predator_model = Predators(16, 2, num_agent=3, **kwargs)
    preyer_model = Preyer(14, 2, **kwargs)
    if args.tensorboard:
        writer = SummaryWriter(log_dir='runs/' + args.log_dir)
    episode = 0
    total_step = 0

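    # Training loop: predators act from their learned policy, the preyer acts
    # randomly; predator transitions are stored and trained on periodically.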
    while episode < args.max_episodes:

        state = env.reset()
        episode += 1
        step = 0
        predator_accum_reward = []
        preyer_accum_reward = 0

        while True:
            state_predator, state_prayer = split_obs(state)

            predator_model.prep_eval()
            action_predator = predator_model.choose_action(state_predator)
            action_prayer = preyer_model.random_action()
            #action_prayer = preyer_model.choose_action(state_prayer)

            action = merge_action(action_predator, action_prayer)

            next_state, reward, done, info = env.step(action)
            step += 1
            total_step += 1

            predator_accum_reward.append(np.mean(reward[:3]))
            preyer_accum_reward = reward[3]

            if step > args.episode_length:
                done = [True, True, True, True]

            if args.render and (episode % 10 == 1):
                env.render(mode='rgb_array')

            predator_model.memory(state[:3], action[:3], reward[:3],
                                  next_state[:3], done[:3])
            # preyer_model.memory(state[3], action[3], reward[3], next_state[3], done[3])

            if (len(predator_model.replay_buffer) >= args.batch_size
                    and total_step % args.steps_per_update == 0):
                predator_model.prep_train()
                predator_model.train()
                # preyer_model.train()

            if True in done:
                predator_c_loss, predator_a_loss = predator_model.getLoss()
                preyer_c_loss, preyer_a_loss = preyer_model.getLoss()
                print("[Episode %05d] reward_predator %3.1f reward_preyer %3.1f predator_c_loss %3.1f predator_a_loss %3.1f preyer_c_loss %3.1f preyer_a_loss %3.1f" % \
                      (episode, np.mean(predator_accum_reward).item(), preyer_accum_reward, predator_c_loss, predator_a_loss, preyer_c_loss, preyer_a_loss))
                if args.tensorboard:
                    # writer.add_scalar(tag='debug/memory_length', global_step=episode, scalar_value=len(predator_model.replay_buffer))
                    # writer.add_scalar(tag='debug/predator_epsilon', global_step=episode, scalar_value=predator_model.epsilon)
                    # writer.add_scalar(tag='debug/preyer_epsilon', global_step=episode, scalar_value=preyer_model.epsilon)
                    writer.add_scalar(
                        tag='agent/reward_predator',
                        global_step=episode,
                        scalar_value=np.mean(predator_accum_reward).item())
                    # writer.add_scalar(tag='perf/reward_preyer', global_step=episode, scalar_value=preyer_accum_reward)
                    if predator_c_loss and predator_a_loss:
                        writer.add_scalars('agent/predator_loss',
                                           global_step=episode,
                                           tag_scalar_dict={
                                               'actor': -predator_a_loss,
                                               'critic': predator_c_loss
                                           })
                    # writer.add_scalar(tag='loss/preyer_c_loss', global_step=episode, scalar_value=preyer_c_loss)
                    # writer.add_scalar(tag='loss/preyer_a_loss', global_step=episode, scalar_value=preyer_a_loss)

                predator_model.reset()
                preyer_model.reset()
                break

            state = next_state
    if args.tensorboard:
        writer.close()