Example #1
# Imports assumed by this example (standard library and third-party only);
# get_args, StateVelCentr, build_model, Agent, ReplayMemory, run_agent and
# test_agent come from the repository's own modules.
import os
import queue
import shutil
import numpy as np
import lasagne
from time import time, sleep
from multiprocessing import Process, Queue, Value


def main():
    args = get_args()

    # create save directory
    save_dir = os.path.join('weights', args.exp_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    else:
        shutil.move(save_dir, save_dir + '.backup')
        os.makedirs(save_dir)

    state_transform = StateVelCentr(obstacles_mode='standard',
                                    exclude_centr=True,
                                    vel_states=[])
    num_actions = 18

    # build model
    model_params = {
        'state_size': state_transform.state_size,
        'num_act': num_actions,
        'gamma': args.gamma,
        'actor_lr': args.actor_lr,
        'critic_lr': args.critic_lr,
        'layer_norm': args.layer_norm
    }
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)

    if args.weights is not None:
        actor.load(args.weights)

    actor_lr_step = (args.actor_lr - args.actor_lr_end) / args.max_steps
    critic_lr_step = (args.critic_lr - args.critic_lr_end) / args.max_steps

    # snapshot the initial actor weights to broadcast to the worker processes
    weights = [p.get_value() for p in params_actor]

    # build replay memory (capacity of 5M transitions)
    memory = ReplayMemory(state_transform.state_size, num_actions, 5000000)

    # init shared variables
    global_step = Value('i', 0)
    updates = Value('i', 0)
    best_reward = Value('f', -1e8)
    testing = Value('i', 0)

    # init agents
    data_queue = Queue()
    workers = []
    weights_queues = []
    num_agents = args.n_threads - 2
    print('starting {} agents'.format(num_agents))
    for i in range(num_agents):
        w_queue = Queue()
        worker = Process(target=run_agent,
                         args=(model_params, weights, state_transform, data_queue, w_queue,
                               i, global_step, updates, best_reward,
                               args.param_noise_prob, save_dir, args.max_steps)
                         )
        worker.daemon = True
        worker.start()
        sleep(args.sleep)
        workers.append(worker)
        weights_queues.append(w_queue)

    prev_steps = 0
    start_save = time()
    start_test = time()
    weights_rew_to_check = []
    while global_step.value < args.max_steps:

        # get all data
        try:
            i, batch, weights_check, reward = data_queue.get_nowait()
            if weights_check is not None:
                weights_rew_to_check.append((weights_check, reward))
            weights_queues[i].put(weights)
            # add data to memory
            memory.add_samples(*batch)
        except queue.Empty:
            pass

        # training step
        # TODO: consider pausing training while a test process is running
        if len(memory) > args.start_train_steps:
            batch = memory.random_batch(args.batch_size)

            if np.random.rand() < args.flip_prob:
                states, actions, rewards, terminals, next_states = batch

                states_flip = state_transform.flip_states(states)
                next_states_flip = state_transform.flip_states(next_states)
                actions_flip = np.zeros_like(actions)
                actions_flip[:, :num_actions//2] = actions[:, num_actions//2:]
                actions_flip[:, num_actions//2:] = actions[:, :num_actions//2]

                states_all = np.concatenate((states, states_flip))
                actions_all = np.concatenate((actions, actions_flip))
                rewards_all = np.tile(rewards.ravel(), 2).reshape(-1, 1)
                terminals_all = np.tile(terminals.ravel(), 2).reshape(-1, 1)
                next_states_all = np.concatenate((next_states, next_states_flip))
                batch = (states_all, actions_all, rewards_all, terminals_all, next_states_all)

            actor_loss, critic_loss = train_fn(*batch)
            updates.value += 1
            if np.isnan(actor_loss):
                raise ValueError('actor loss is nan')
            if np.isnan(critic_loss):
                raise ValueError('critic loss is nan')
            target_update_fn()
            weights = actor.get_actor_weights()

        delta_steps = global_step.value - prev_steps
        prev_steps += delta_steps

        actor_lr.set_value(lasagne.utils.floatX(
            max(actor_lr.get_value() - delta_steps * actor_lr_step, args.actor_lr_end)))
        critic_lr.set_value(lasagne.utils.floatX(
            max(critic_lr.get_value() - delta_steps * critic_lr_step, args.critic_lr_end)))

        # check if need to save and test
        if (time() - start_save)/60. > args.save_period_min:
            fname = os.path.join(save_dir, 'weights_updates_{}.pkl'.format(updates.value))
            actor.save(fname)
            start_save = time()

        # start new test process
        weights_rew_to_check = [(w, r) for w, r in weights_rew_to_check if r > best_reward.value and r > 0]
        weights_rew_to_check = sorted(weights_rew_to_check, key=lambda x: x[1])
        if ((time() - start_test) / 60. > args.test_period_min or len(weights_rew_to_check) > 0) and testing.value == 0:
            testing.value = 1
            print('start test')
            if len(weights_rew_to_check) > 0:
                _weights, _ = weights_rew_to_check.pop()
            else:
                _weights = weights
            worker = Process(target=test_agent,
                             args=(testing, state_transform, args.num_test_episodes,
                                   model_params, _weights, best_reward,
                                   updates, global_step, save_dir)
                             )
            worker.daemon = True
            worker.start()
            start_test = time()

    # end all processes
    for w in workers:
        w.join()
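The training loop above doubles each sampled batch with a mirrored copy whenever `flip_prob` fires. Below is a minimal, self-contained sketch of the action half-swap only (the matching state flip is done by the repository's `state_transform.flip_states`, omitted here); the assumption that the 18 muscle activations are laid out as two symmetric halves is taken from the half-swap in the code above, and the batch size of 4 is purely illustrative.

import numpy as np

num_actions = 18
actions = np.random.rand(4, num_actions).astype(np.float32)  # dummy batch of 4

# swap the two halves of each action vector to obtain the mirrored action
actions_flip = np.empty_like(actions)
actions_flip[:, :num_actions // 2] = actions[:, num_actions // 2:]
actions_flip[:, num_actions // 2:] = actions[:, :num_actions // 2]

# the mirrored samples are appended to the original batch, doubling its size
actions_all = np.concatenate((actions, actions_flip))
assert actions_all.shape == (2 * 4, num_actions)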
Example #2
def run_agent(model_params, weights, state_transform, data_queue, weights_queue,
              process, global_step, updates, best_reward, param_noise_prob, save_dir,
              max_steps=10000000):

    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(state_transform, max_obstacles=config.num_obstacles, skip_frame=config.skip_frames)
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.2, size=env.noutput,
                                              sigma_min=0.05, n_steps_annealing=1e6)
    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32-2)
        state = env.reset(seed=seed, difficulty=2)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0
        
        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env.step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (states_np,
                np.asarray(actions).astype(np.float32),
                np.asarray(rewards).astype(np.float32),
                np.asarray(terminals),
                )
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights
        weights = weights_queue.get()

        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len {}, ' \
                     'reward: {:.2f}, original_reward {:.4f}; best reward: {:.2f} noise {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps,
                   total_reward, total_reward_original, best_reward.value, 'actions' if action_noise else 'params')
        print(report_str)

        with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
            f.write(report_str + '\n')

        actor.set_actor_weights(weights)
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        if total_episodes % 100 == 0:
            env = RunEnv2(state_transform, max_obstacles=config.num_obstacles, skip_frame=config.skip_frames)
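Both run_agent variants add temporally correlated exploration noise to the policy output via the repository's OrnsteinUhlenbeckProcess. The following is a simplified stand-in showing only the core Ornstein-Uhlenbeck update; the class name SimpleOUNoise and its defaults are illustrative, and the sigma_min / n_steps_annealing schedule used above is deliberately omitted.

import numpy as np

class SimpleOUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck).

    Core update: x += theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1).
    """
    def __init__(self, theta=0.1, mu=0.0, sigma=0.2, dt=0.01, size=18):
        self.theta, self.mu, self.sigma, self.dt, self.size = theta, mu, sigma, dt, size
        self.reset_states()

    def reset_states(self):
        # restart the process at its mean at the beginning of every episode
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x = self.x + dx
        return self.x

# noise = SimpleOUNoise(size=env.noutput)  # added to the actor's action each step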
Example #3
def run_agent(args,
              model_params,
              weights,
              data_queue,
              weights_queue,
              process,
              global_step,
              updates,
              best_reward,
              param_noise_prob,
              save_dir,
              max_steps=10000000):

    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = build_model(
        **model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(model=args.modeldim,
                  prosthetic=args.prosthetic,
                  difficulty=args.difficulty,
                  skip_frame=config.skip_frames)
    env.spec.timestep_limit = 3000  # ndrw
    # random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.3, size=env.noutput, sigma_min=0.05, n_steps_annealing=1e6)

    sigma_rand = random.uniform(0.05, 0.5)
    dt_rand = random.uniform(0.002, 0.02)
    param_noise_prob = random.uniform(param_noise_prob * 0.25,
                                      min(param_noise_prob * 1.5, 1.))

    random_process = OrnsteinUhlenbeckProcess(theta=.1,
                                              mu=0.,
                                              sigma=sigma_rand,
                                              dt=dt_rand,
                                              size=env.noutput,
                                              sigma_min=0.05,
                                              n_steps_annealing=1e6)

    print('OUProcess_sigma = ' + str(sigma_rand) + '    OUProcess_dt = ' +
          str(dt_rand) + '    param_noise_prob = ' + str(param_noise_prob))

    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0

        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env._step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (
            states_np,
            np.asarray(actions).astype(np.float32),
            np.asarray(rewards).astype(np.float32),
            np.asarray(terminals),
        )
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights
        weights = weights_queue.get()

        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, ' \
                     'pelvis_X: {:.2f}, pelvis_Z: {:.2f}, reward: {:.2f}, best reward: {:.2f}, noise: {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps,
                   info['pelvis'][0], info['pelvis'][2], total_reward, best_reward.value,
                   'actions' if action_noise else 'params')
        print(report_str)

        try:
            with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
                f.write(report_str + '\n')
        except IOError:
            print('could not append to {}'.format(os.path.join(save_dir, 'train_report.log')))

        actor.set_actor_weights(weights)
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        if total_episodes % 100 == 0:
            env = RunEnv2(model=args.modeldim,
                          prosthetic=args.prosthetic,
                          difficulty=args.difficulty,
                          skip_frame=config.skip_frames)
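In all three examples the trainer and the agent processes communicate through multiprocessing queues: each worker pushes a rollout onto the shared data queue together with its index, then blocks on its private weights queue until the trainer answers with fresh actor weights. Below is a minimal sketch of that hand-shake; the payloads and names are placeholders, not the repository's actual data structures.

from multiprocessing import Process, Queue

def worker(idx, data_queue, weights_queue):
    rollout = {'worker': idx, 'reward': 0.0}   # stand-in for (states, actions, rewards, terminals)
    data_queue.put((idx, rollout))             # hand the rollout to the trainer
    new_weights = weights_queue.get()          # block until the trainer replies
    print('worker {} received {}'.format(idx, new_weights))

if __name__ == '__main__':
    data_queue = Queue()
    weights_queues = [Queue(), Queue()]
    procs = [Process(target=worker, args=(i, data_queue, weights_queues[i]))
             for i in range(2)]
    for p in procs:
        p.start()
    for _ in procs:
        i, _rollout = data_queue.get()               # one rollout per worker
        weights_queues[i].put('fresh-actor-weights') # reply on that worker's own queue
    for p in procs:
        p.join()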