Code example #1
File: workers.py Project: granilace/rita
def train_worker(args, shared_model, total_steps, optimizer, lock):
    env = make_env(args)
    args = args.train

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.train()

    state = env.reset()
    state = torch.FloatTensor(state)

    while True:
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.update_agent_frequency):
            value, logit = model(state.unsqueeze(0))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())

            with total_steps.get_lock():
                total_steps.value += 1

            if done:
                state = env.reset()
                model.reset_hidden()

            state = torch.FloatTensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(state.unsqueeze(0))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = (policy_loss - log_probs[i] * gae.detach()
                           - args.entropy_weight * entropies[i])

        optimizer.zero_grad()

        (policy_loss + args.value_weight * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        with lock:
            ensure_shared_grads(model, shared_model)
            optimizer.step()
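
Note: ensure_shared_grads is called above but is not part of this listing. The following is a minimal sketch of the usual A3C helper that such a call corresponds to, assuming the standard hogwild-style gradient hand-off; the actual rita implementation may differ.

def ensure_shared_grads(model, shared_model):
    # Hand the worker's gradients to the shared model so the shared optimizer
    # can apply them; if a shared gradient buffer already exists, leave it
    # alone (the usual A3C behaviour).
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad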
Code example #2
)

if __name__ == '__main__':
    cmd_args = parser.parse_args()
    config = Config.fromYamlFile('config.yaml')
    args = config.train
    args.__dict__.update(vars(cmd_args))

    env = make_atari(args.env_name)

    shared_model = ActorCritic(env.observation_space.shape, env.action_space.n)
    if args.pretrained_weights is not None:
        shared_model.load_weights(args.pretrained_weights)
    shared_model.share_memory()

    optimizer = SharedAdam(shared_model.parameters(), lr=args.learning_rate)
    if args.pretrained_weights is not None:
        optimizer.load_params(
            args.pretrained_weights.replace('weights/', 'optimizer_params/'))
    optimizer.share_memory()

    processes = []

    lock = mp.Lock()
    total_steps = Value('i', 0)

    p = mp.Process(target=test_worker,
                   args=(args, shared_model, total_steps, optimizer))
    p.start()
    processes.append(p)
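
The listing stops after starting the test worker. A typical continuation would spawn the training workers defined in workers.py (code example #1) and then join all processes; the num_processes field below is an assumed config option, not taken from the listing.

    for rank in range(args.num_processes):  # assumed config field
        p = mp.Process(target=train_worker,
                       args=(args, shared_model, total_steps, optimizer, lock))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()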
Code example #3
File: train.py Project: dasimagin/rita
            or config.train.use_value_replay):
        shared_model = ExperienceWrapper(shared_model)
    if config.train.use_pixel_control:
        shared_model = PixelControlWrapper(shared_model, config.train.gamma,
                                           config.train.pc_coef)
    if config.train.use_reward_prediction:
        shared_model = RewardPredictionWrapper(shared_model,
                                               config.train.rp_coef)
    if config.train.use_value_replay:
        shared_model = ValueReplayWrapper(shared_model)
    if config.train.pretrained_weights is not None:
        shared_model.load_state_dict(
            torch.load(config.train.pretrained_weights))
    shared_model.share_memory()

    optimizer = SharedAdam(shared_model.parameters(),
                           lr=config.train.learning_rate)
    if config.train.pretrained_optimizer is not None:
        optimizer.load_state_dict(torch.load(
            config.train.pretrained_optimizer))
    optimizer.share_memory()

    processes = []

    lock = mp.Lock()
    total_steps = Value('i', 0)

    p = mp.Process(target=test_worker,
                   args=(config, shared_model, total_steps, optimizer))
    p.start()
    processes.append(p)
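
Both launchers wrap the shared parameters in a SharedAdam optimizer whose definition is not included in these listings. Below is a minimal sketch in the spirit of the widely used pytorch-a3c shared optimizer: the Adam state is allocated up front and moved into shared memory so every worker process updates the same moment buffers. The default hyperparameters and the step() body are assumptions; the rita class additionally exposes load_params (code example #2), which is omitted here.

import math

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    # Adam whose state tensors live in shared memory, so gradient updates
    # from several worker processes accumulate into one optimizer state.

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super().__init__(params, lr=lr, betas=betas, eps=eps,
                         weight_decay=weight_decay)
        # Allocate the state up front so it exists before the workers fork.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # In-place updates keep the moments in the shared buffers.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = (group['lr'] * math.sqrt(bias_correction2)
                             / bias_correction1)

                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss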
Code example #4
File: workers.py Project: dasimagin/rita
def train_worker(args, shared_model, total_steps, optimizer, lock):
    env = make_env(args.environment)
    args = args.train
    if args.sample_entropy:
        args.entropy_weight = np.exp(
            np.random.uniform(np.log(0.0005), np.log(0.01)))
    if args.sample_lr:
        args.learning_rate = np.exp(
            np.random.uniform(np.log(0.0001), np.log(0.005)))

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if (args.use_pixel_control or
            args.use_reward_prediction):
        model = ExperienceWrapper(model)
    if args.use_pixel_control:
        model = PixelControlWrapper(model, args.gamma, args.pc_coef)
    if args.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.rp_coef)
    if args.use_value_replay:
        model = ValueReplayWrapper(model)
    model.train()

    curiosity_rewarder = CuriosityRewarder(env.observation_space.shape, env.action_space.n)
    curiosity_rewarder.train()

    curiosity_optimizer = optim.Adam(curiosity_rewarder.parameters())

    state = env.reset()
    state = torch.FloatTensor(state)
    last_act = 0
    sum_reward = 0
    last_reward = 0

    while True:
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()

        values = []
        log_probs = []
        rewards = []
        curiosity_rewards = []
        entropies = []

        for step in range(args.update_agent_frequency):
            value, logit = model((state.unsqueeze(0), last_act, sum_reward))
            prob = F.relu(F.softmax(logit, dim=-1))
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            act = action.numpy()[0][0]
            next_state, reward, done, _ = env.step(act)

            if (args.use_pixel_control or
                    args.use_reward_prediction or
                    args.use_value_replay):
                tr = Transaction(state, next_state, act,
                        reward, done, last_act, last_reward, sum_reward)
                model.add_frame(tr)

            last_reward = reward
            last_act = act
            sum_reward += reward

            with total_steps.get_lock():
                total_steps.value += 1

            if done:
                sum_reward = 0
                last_act = 0
                last_reward = 0
                next_state = env.reset()
                model.reset_hidden()

            next_state = torch.FloatTensor(next_state)
            curiosity_reward = curiosity_rewarder.get_reward(state.unsqueeze(0), action, next_state.unsqueeze(0))
            state = next_state

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            curiosity_rewards.append(curiosity_reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model((state.unsqueeze(0), last_act, sum_reward))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            # print(rewards[i], args.curiosity_weight * curiosity_rewards[i].detach())
            R = (args.gamma * R + rewards[i]
                 + args.curiosity_weight * curiosity_rewards[i].detach())
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            # print('lp:', log_probs[i], 'gae:', gae.detach(), 'ent:', entropies[i])
            policy_loss = (policy_loss - log_probs[i] * gae.detach()
                           - args.entropy_weight * entropies[i])

        curiosity_optimizer.zero_grad()
        curiosity_loss = sum(map(lambda x: x**2, curiosity_rewards)) / len(curiosity_rewards)
        curiosity_loss.backward()
        curiosity_optimizer.step()

        optimizer.zero_grad()
        (policy_loss + args.value_weight * value_loss + model.get_loss()).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        with lock:
            ensure_shared_grads(model, shared_model)
            optimizer.step()
        model.reset()
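
CuriosityRewarder is likewise only used, never defined, in this listing. The sketch below is a hypothetical forward-dynamics module that fits the get_reward(state, action, next_state) call and the squared-reward training loss above: the reward is the forward model's prediction error, so minimizing the squared reward trains the model while the policy is paid for transitions the model still predicts poorly. The MLP encoder and layer sizes are illustrative assumptions, not the rita architecture.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class CuriosityRewarder(nn.Module):
    # Hypothetical forward-model curiosity module (sketch, not the rita code).
    def __init__(self, obs_shape, n_actions, hidden=256):
        super().__init__()
        self.n_actions = n_actions
        obs_size = int(np.prod(obs_shape))
        self.encoder = nn.Sequential(nn.Linear(obs_size, hidden), nn.ReLU())
        self.forward_model = nn.Sequential(
            nn.Linear(hidden + n_actions, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden))

    def get_reward(self, state, action, next_state):
        # Predict the embedding of the next state from the current state
        # embedding and the chosen action; the error is the curiosity reward.
        phi = self.encoder(state.flatten(1))
        phi_next = self.encoder(next_state.flatten(1)).detach()
        one_hot = F.one_hot(action.view(-1), self.n_actions).float()
        pred = self.forward_model(torch.cat([phi, one_hot], dim=1))
        # One scalar per sample, shaped (batch, 1) like the values above.
        return 0.5 * (pred - phi_next).pow(2).mean(dim=1, keepdim=True)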