Example #1
def test_worker(args, shared_model, total_steps, optimizer):
    args.environment.clip_rewards = False
    env = make_env(args)

    log_path = '{}/{}'.format(args.train.experiment_path, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.info("STARTED TRAINING PROCESS {}".format(
        time.strftime("%Y.%m.%d_%H:%M", time.localtime())))

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.eval()

    start_time = time.time()

    reward_history = []
    while True:
        model.load_state_dict(shared_model.state_dict())
        if (len(reward_history) + 1) % args.train.save_frequency == 0:
            save_progress(args, model, optimizer, total_steps.value)
        total_reward, _ = play_game(model, env)
        reward_history.append(total_reward)

        log_message = "Time {}, num steps {}, FPS {:.0f}, curr episode reward {}, mean episode reward: {}".format(
            time.strftime("%Hh %Mm %Ss",
                          time.gmtime(time.time() - start_time)),
            total_steps.value,
            total_steps.value / (time.time() - start_time),
            total_reward,
            np.mean(reward_history[-60:]),
        )
        print(log_message)
        logging.info(log_message)
        time.sleep(60)
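Example #1's test_worker only evaluates: it periodically copies the shared parameters, plays one episode, and logs running statistics. It is meant to run beside several train_worker processes (Example #3) that update the same shared model. The entry point is not part of these listings, so the launcher below is only a minimal sketch under assumptions: NUM_WORKERS, config, shared_model and optimizer are placeholder names for whatever the project actually constructs (Example #6 shows part of that construction), and torch.multiprocessing is assumed as the process backend.

# Hypothetical launcher sketch; NUM_WORKERS, config, shared_model, optimizer are assumed names.
import torch.multiprocessing as mp

if __name__ == '__main__':
    shared_model.share_memory()       # expose parameters to every worker process
    total_steps = mp.Value('i', 0)    # global step counter; workers call its get_lock()
    lock = mp.Lock()                  # serializes optimizer.step() across train workers

    processes = [mp.Process(target=test_worker,
                            args=(config, shared_model, total_steps, optimizer))]
    for _ in range(NUM_WORKERS):
        processes.append(mp.Process(target=train_worker,
                                    args=(config, shared_model, total_steps, optimizer, lock)))
    for p in processes:
        p.start()
    for p in processes:
        p.join()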
Example #2
def test_worker(args, shared_model, total_steps, optimizer):
    args.environment.clip_rewards = False
    env = make_env(args.environment)

    log_path = '{}/{}'.format(args.train.experiment_folder, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.info("STARTED TRAINING PROCESS {}".format(time.strftime("%Y.%m.%d_%H:%M", time.localtime())))

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if (args.train.use_pixel_control or
            args.train.use_reward_prediction):
        model = ExperienceWrapper(model)
    if args.train.use_pixel_control:
        model = PixelControlWrapper(model, args.train.gamma, args.train.pc_coef)
    if args.train.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.train.rp_coef)
    if args.train.use_value_replay:
        model = ValueReplayWrapper(model)
    model.config = args
    model.eval()

    start_time = time.time()

    reward_history = []
    while True:
        model.load_state_dict(shared_model.state_dict())
        if (len(reward_history) + 1) % args.train.save_frequency == 0:
            save_progress(args, model, optimizer, total_steps.value)
        stats = play_game(model, env)
        reward_history.append(stats['total_reward'])

        log_message = (
            'Time {}, num steps {}, FPS {:.0f}, '
            'curr episode reward {:.2f}, mean episode reward: {:.2f}, '
            'mean policy loss {:.2f}, mean value loss {:.2f}, '
            'mean entropy percentage {:.2f}'
        ).format(
            time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
            total_steps.value,
            total_steps.value / (time.time() - start_time),
            stats['total_reward'],
            np.mean(reward_history[-60:]),
            stats['policy_loss'],
            stats['value_loss'],
            stats['entropy']
        )
        if args.train.use_pixel_control:
            log_message += ', pixel control loss %.2f' % stats['pc_loss']
        if args.train.use_reward_prediction:
            log_message += ', reward prediction loss %.2f' % stats['rp_loss']
        if args.train.use_value_replay:
            log_message += ', value replay loss %.2f' % stats['vr_loss']
        print(log_message)
        logging.info(log_message)
        time.sleep(60)
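Judging by the coefficients handed to the wrappers here and by the model.get_loss() call in the training worker of Example #7, the stack assembles a weighted UNREAL-style objective of roughly the following shape (the unit weight on the value-replay term is an assumption, since ValueReplayWrapper takes no coefficient):

    L_{\text{total}} = L_{\text{A3C}} + \mathrm{pc\_coef} \cdot L_{\text{pixel control}} + \mathrm{rp\_coef} \cdot L_{\text{reward prediction}} + L_{\text{value replay}}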
Example #3
def train_worker(args, shared_model, total_steps, optimizer, lock):
    env = make_env(args)
    args = args.train

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.train()

    state = env.reset()
    state = torch.FloatTensor(state)

    while True:
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.update_agent_frequency):
            value, logit = model(state.unsqueeze(0))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())

            with total_steps.get_lock():
                total_steps.value += 1

            if done:
                state = env.reset()
                model.reset_hidden()

            state = torch.FloatTensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(state.unsqueeze(0))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = (policy_loss - log_probs[i] * gae.detach()
                           - args.entropy_weight * entropies[i])

        optimizer.zero_grad()

        (policy_loss + args.value_weight * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        with lock:
            ensure_shared_grads(model, shared_model)
            optimizer.step()
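Written out, the rollout loop and the backward pass above implement n-step returns with Generalized Advantage Estimation; the code's tau is the GAE parameter usually written \lambda, entropy_weight is the entropy bonus \beta, and value_weight scales the critic term:

    R_t = r_t + \gamma R_{t+1}, \qquad R_T = V(s_T) \text{ if the rollout was truncated, else } 0
    \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad \hat{A}_t = \sum_{l \ge 0} (\gamma \lambda)^l \, \delta_{t+l}
    L_{\text{value}} = \tfrac{1}{2} \sum_t \bigl(R_t - V(s_t)\bigr)^2, \qquad
    L_{\text{policy}} = -\sum_t \Bigl( \log \pi(a_t \mid s_t)\, \hat{A}_t + \beta\, H\bigl(\pi(\cdot \mid s_t)\bigr) \Bigr)

The backward for-loop evaluates \hat{A}_t through the recursion gae = gae * gamma * tau + delta_t, and the gradient step minimizes L_policy + value_weight * L_value.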
Example #4
    cmd_args = parser.parse_args()
    config = Config.fromYamlFile('{}/{}'.format(cmd_args.experiment_folder,
                                                'config.yaml'))

    log_path = '{}/{}'.format(cmd_args.experiment_folder, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)

    config.environment.clip_rewards = False
    if config.environment.env_type == 'dmlab':
        config.environment.episode_length_sec = min(
            config.environment.episode_length_sec, 60)
        config.environment.prev_frame_h = config.environment.frame_h
        config.environment.prev_frame_w = config.environment.frame_w
        config.environment.frame_h = max(config.environment.frame_h, 256)
        config.environment.frame_w = max(config.environment.frame_w, 256)
    env = make_env(config.environment, recording=True)
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if (config.train.use_pixel_control or config.train.use_reward_prediction):
        model = ExperienceWrapper(model)
    if config.train.use_pixel_control:
        model = PixelControlWrapper(model, config.train.gamma,
                                    config.train.pc_coef)
    if config.train.use_reward_prediction:
        model = RewardPredictionWrapper(model, config.train.rp_coef)
    if config.train.use_value_replay:
        model = ValueReplayWrapper(model)
    model.config = config
    if cmd_args.pretrained_weights is not None:
        model.load_state_dict(torch.load(cmd_args.pretrained_weights))
    else:
Example #5
from config import Config
from envs.utils import make_env
from models.actor_critic_rnn import ActorCriticRNN as ActorCritic
from utils import record_video

parser = argparse.ArgumentParser(description='A3C')
parser.add_argument('--experiment-path', required=True,
                    help='path to folder with config')
parser.add_argument('--pretrained-weights', default=None,
                    help='path to pretrained weights (default: None – evaluate random model)')

if __name__ == '__main__':
    cmd_args = parser.parse_args()
    config_path = '{}/{}'.format(cmd_args.experiment_path, 'config.yaml')
    config = Config.fromYamlFile(config_path)
    
    log_path = '{}/{}'.format(cmd_args.experiment_path, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)

    env = make_env(config)
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    if cmd_args.pretrained_weights is not None:
        model.load_state_dict(torch.load(cmd_args.pretrained_weights))
    else:
        print("You have not specified path to model weights, random plays will be performed")
    model.eval()
    results = record_video(model, env)
    log_message = "evaluated on pretrained weights: {}, results: {}".format(cmd_args.pretrained_weights, results)
    print(log_message)
    logging.info(log_message)
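A typical invocation of this evaluation script would look like the lines below; the file name evaluate.py and both paths are hypothetical, only the two flags come from the argparse definitions above:

    python evaluate.py --experiment-path experiments/my_run \
                       --pretrained-weights experiments/my_run/weights.pt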
Example #6
    default=None,
    help='path to pretrained weights (default: if None – train from scratch)')
parser.add_argument(
    '--pretrained-optimizer',
    default=None,
    help='path to pretrained optimizer params (default: if None – train from scratch)'
)

if __name__ == '__main__':
    cmd_args = parser.parse_args()
    config = Config.fromYamlFile('{}/{}'.format(cmd_args.experiment_folder,
                                                'config.yaml'))
    config.train.__dict__.update(vars(cmd_args))

    env = make_env(config.environment)

    shared_model = ActorCritic(env.observation_space.shape, env.action_space.n)
    shared_model = BaseWrapper(shared_model)
    if (config.train.use_pixel_control or config.train.use_reward_prediction
            or config.train.use_value_replay):
        shared_model = ExperienceWrapper(shared_model)
    if config.train.use_pixel_control:
        shared_model = PixelControlWrapper(shared_model, config.train.gamma,
                                           config.train.pc_coef)
    if config.train.use_reward_prediction:
        shared_model = RewardPredictionWrapper(shared_model,
                                               config.train.rp_coef)
    if config.train.use_value_replay:
        shared_model = ValueReplayWrapper(shared_model)
    if config.train.pretrained_weights is not None:
Example #7
def train_worker(args, shared_model, total_steps, optimizer, lock):
    env = make_env(args.environment)
    args = args.train
    if args.sample_entropy:
        args.entropy_weight = np.exp(
            np.random.uniform(np.log(0.0005), np.log(0.01)))
    if args.sample_lr:
        args.learning_rate = np.exp(
            np.random.uniform(np.log(0.0001), np.log(0.005)))

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if (args.use_pixel_control or
            args.use_reward_prediction):
        model = ExperienceWrapper(model)
    if args.use_pixel_control:
        model = PixelControlWrapper(model, args.gamma, args.pc_coef)
    if args.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.rp_coef)
    if args.use_value_replay:
        model = ValueReplayWrapper(model)
    model.train()

    curiosity_rewarder = CuriosityRewarder(env.observation_space.shape, env.action_space.n)
    curiosity_rewarder.train()

    curiosity_optimizer = optim.Adam(curiosity_rewarder.parameters())

    state = env.reset()
    state = torch.FloatTensor(state)
    last_act = 0
    sum_reward = 0
    last_reward = 0

    while True:
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()

        values = []
        log_probs = []
        rewards = []
        curiosity_rewards = []
        entropies = []

        for step in range(args.update_agent_frequency):
            value, logit = model((state.unsqueeze(0), last_act, sum_reward))
            prob = F.relu(F.softmax(logit, dim=-1))
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            act = action.numpy()[0][0]
            next_state, reward, done, _ = env.step(act)

            if (args.use_pixel_control or
                    args.use_reward_prediction or
                    args.use_value_replay):
                tr = Transaction(state, next_state, act, reward, done,
                                 last_act, last_reward, sum_reward)
                model.add_frame(tr)

            last_reward = reward
            last_act = act
            sum_reward += reward

            with total_steps.get_lock():
                total_steps.value += 1

            if done:
                sum_reward = 0
                last_act = 0
                last_reward = 0
                next_state = env.reset()
                model.reset_hidden()

            next_state = torch.FloatTensor(next_state)
            curiosity_reward = curiosity_rewarder.get_reward(state.unsqueeze(0), action, next_state.unsqueeze(0))
            state = next_state

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            curiosity_rewards.append(curiosity_reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model((state.unsqueeze(0), last_act, sum_reward))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            # print(rewards[i], args.curiosity_weight * curiosity_rewards[i].detach())
            R = args.gamma * R + rewards[i] + args.curiosity_weight * curiosity_rewards[i].detach()
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            # print('lp:', log_probs[i], 'gae:', gae.detach(), 'ent:', entropies[i])
            policy_loss = policy_loss - log_probs[i] * gae.detach() - args.entropy_weight * entropies[i]

        curiosity_optimizer.zero_grad()
        curiosity_loss = sum(map(lambda x: x**2, curiosity_rewards)) / len(curiosity_rewards)
        curiosity_loss.backward()
        curiosity_optimizer.step()

        optimizer.zero_grad()
        (policy_loss + args.value_weight * value_loss + model.get_loss()).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        with lock:
            ensure_shared_grads(model, shared_model)
            optimizer.step()
        model.reset()
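The curiosity terms in this worker reduce to the following; \beta_{\text{cur}} is args.curiosity_weight, and reading get_reward as a forward-model prediction error (an ICM-style design) is an assumption these listings do not confirm:

    r^{\text{total}}_t = r_t + \beta_{\text{cur}}\, r^{\text{cur}}_t, \qquad
    L_{\text{curiosity}} = \frac{1}{T} \sum_{t=1}^{T} \bigl(r^{\text{cur}}_t\bigr)^2

The bonus is detached inside the return, so the policy gradient never updates the rewarder; only curiosity_optimizer does, by driving the mean squared bonus down.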