Code Example #1
        memory = deque()

        steps = 0
        scores = []
        while steps < 2048:
            episodes += 1
            state = env.reset()
            state = running_state(state)
            score = 0
            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1
                mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std, not args.categorical)[0]
                next_state, reward, done, _ = env.step(action)
                next_state = running_state(next_state)

                # mask = 0 stops return/advantage bootstrapping at episode end
                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                score += reward
                state = next_state

                if done:
                    break
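Examples #1 and #2 sample actions through a get_action(mu, std) helper that this page does not include. The sketch below is an illustration of how such a helper is commonly written for a continuous-control policy, assuming a diagonal Gaussian and a NumPy return value (the extra flag passed in example #1 is not reproduced); it is not the projects' actual code.

from torch.distributions import Normal

def get_action(mu, std):
    # sample one action per row from the diagonal Gaussian defined by the actor's outputs
    action = Normal(mu, std).sample()
    # convert to NumPy so the result can be passed straight to env.step()
    return action.cpu().numpy()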
Code Example #2
        memory = deque()

        steps = 0
        scores = []
        while steps < 2048:
            episodes += 1
            state = env.reset()
            state = running_state(state)
            score = 0
            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1
                mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                next_state = running_state(next_state)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                score += reward
                state = next_state

                if done:
                    break
Code Example #3
    scores = []
    score_avg = 0

    for iter in range(args.max_iter):
        actor.eval(), critic.eval()
        memory = [Memory() for _ in range(num_agent)]

        steps = 0
        score = 0

        while steps < args.time_horizon:
            steps += 1

            mu, std, _ = actor(to_tensor(states))
            actions = get_action(mu, std)
            env_info = env.step(actions)[default_brain]

            next_states = running_state(env_info.vector_observations)
            rewards = env_info.rewards
            dones = env_info.local_done
            # invert the done flags: mask == True keeps bootstrapping across this step
            masks = list(~(np.array(dones)))

            for i in range(num_agent):
                memory[i].push(states[i], actions[i], rewards[i], masks[i])

            score += rewards[0]
            states = next_states

            if dones[0]:
                scores.append(score)
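Example #3 pushes (state, action, reward, mask) tuples into one Memory object per agent, but the class itself is not shown here. A minimal sketch of such a rollout buffer, assuming the training code later reads the whole rollout back through a sample() method; the Transition namedtuple is an illustrative choice rather than the project's definition.

from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'mask'))

class Memory:
    # per-agent rollout buffer; a fresh instance is created at every iteration
    def __init__(self):
        self._storage = []

    def push(self, state, action, reward, mask):
        self._storage.append(Transition(state, action, reward, mask))

    def sample(self):
        # return the whole on-policy rollout (not a random mini-batch)
        return Transition(*zip(*self._storage))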
Code Example #4
File: main.py Project: HarunaHaju/IRL
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
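Example #4 normalizes observations with running_state = ZFilter((num_inputs,), clip=5) and saves and restores the fields rs.n, rs.mean and rs.sum_square from the checkpoint. A minimal sketch of such a filter, assuming Welford-style running statistics and the same field names; the epsilon and the exact update rule are assumptions rather than the project's implementation.

import numpy as np

class RunningStat:
    # Welford-style running mean/variance with the n / mean / sum_square fields
    # that example #4 stores in its checkpoints
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        if self.n == 1:
            self.mean = x.copy()
        else:
            old_mean = self.mean.copy()
            self.mean = old_mean + (x - old_mean) / self.n
            self.sum_square = self.sum_square + (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        if self.n > 1:
            return np.sqrt(self.sum_square / (self.n - 1))
        return np.ones_like(self.mean)


class ZFilter:
    # normalize observations to roughly zero mean / unit variance, then clip
    def __init__(self, shape, clip=5.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x):
        self.rs.push(x)
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)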
Code Example #5
File: main.py Project: sevenquarkoniums/miniDota
def test(interval, runs):
    print('Testing..')
    numAgent = 10
    numGame = 1
    assert numGame == 1  # the rest of this function assumes a single game.
    env = {0: miniDotaEnv(args, numAgent)}
    net = ac(args)
    if not args.cpuSimulation:
        net = net.to(device)
    saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                   str(args.load_model))
    ckpt = torch.load(saved_ckpt_path)
    net.load_state_dict(ckpt['net'])
    net.eval()
    observations = {0: env[0].reset(0)['observations']}

    for iteration in range(runs):
        start = time.time()
        print()
        print('Start iteration %d ..' % iteration)
        if args.cpuSimulation:
            net = net.cpu()
        steps = 0
        teamscore = 0
        gameEnd = np.zeros(numGame).astype(bool)
        record = []
        teamLabel = env[0].getState().reshape((12, 4))[:10, 0]

        while steps <= args.time_horizon:  # loop for one round of games.
            if np.all(gameEnd):
                break
            steps += 1
            stateList = []
            for game in range(numGame):
                for agent in range(numAgent):
                    stateList.append(
                        np.expand_dims(observations[game][agent], axis=0))
            stateCombined = np.concatenate(stateList, axis=0)
            with torch.no_grad():
                actionDistr = net(to_tensor(
                    stateCombined,
                    args.cpuSimulation))  # calculate all envs together.
            actions = get_action(actionDistr)

            for game in range(numGame):
                if not gameEnd[game]:
                    thisGameAction = actions[
                        10 * game:10 *
                        (game + 1), :]  # contain actions from all agents.
                    #                    for player in range(10):
                    #                        if teamLabel[player] == 0 and steps < 100:
                    #                            thisGameAction[player] = [0, 1, 1, 0] # ablation test.
                    envInfo = env[game].step(
                        thisGameAction
                    )  # environment runs one step given the action.
                    nextObs = envInfo['observations']  # get the next state.
                    allAction = np.concatenate(
                        [actionDistr[x] for x in range(1, 5)], axis=1)
                    record.append(
                        np.concatenate([
                            env[0].getState(), actions[0:10, :].reshape(-1),
                            allAction.reshape(-1)
                        ]))
                    rewards = envInfo['rewards']
                    dones = envInfo['local_done']
                    teamscore += sum([rewards[x] for x in env[0].getTeam0()])
                    observations[game] = nextObs

                    gameEnd[game] = np.all(dones)
                    if gameEnd[game]:
                        print('Team 0 score: %f' % teamscore)
                        simEnd = time.time()
                        print('Simulation time: %.f' % (simEnd - start))
                        recordMat = np.stack(
                            record
                        )  # stack will expand the dimension before concatenate.
                        draw(recordMat, iteration, env[game].getUnitRange(),
                             interval)
                        observations[game] = env[game].reset(iteration +
                                                             1)['observations']

        # assumes at least one game finished inside the loop, so simEnd is defined
        drawEnd = time.time()
        print('Drawing time: %.f' % (drawEnd - simEnd))
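Examples #5 and #6 convert batched observations with to_tensor(stateCombined, args.cpuSimulation), another helper that is not shown. A minimal sketch under the assumption that the second argument only decides whether the tensor stays on the CPU; the device global mirrors the one referenced in those examples.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def to_tensor(x, cpu_simulation=False):
    # convert the stacked NumPy observations to a float32 tensor
    t = torch.as_tensor(x, dtype=torch.float32)
    # keep the tensor on the CPU while simulating there, otherwise move it next to the network
    return t if cpu_simulation else t.to(device)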
Code Example #6
File: main.py Project: sevenquarkoniums/miniDota
def train():
    numAgent = 10  # multiple agents run synchronously.
    # each agent has a different type with different properties.
    # only one network is created; each agent derives its own behavior
    # from the embedding input.
    numGame = 20  # multiple games running simultaneously.
    print('agent count:', numAgent)
    print('Env num:', numGame)

    env = {}
    for game in range(numGame):
        env[game] = miniDotaEnv(args, numAgent)

    # initialize the neural networks.
    # use a single network to share the knowledge.
    net = ac(args)
    if not args.cpuSimulation:
        net = net.to(device)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        net.load_state_dict(ckpt['net'])

    observations, lastDone = {}, {}
    for game in range(numGame):
        observations[game] = env[game].reset(0)[
            'observations']  # get initial state.
        lastDone[game] = [
            False
        ] * 10  # to record whether game is done at the previous step.

    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    for iteration in range(args.max_iter):  # playing-training iteration.
        start = time.time()
        print()
        print('Start iteration %d ..' % iteration)
        if args.cpuSimulation:
            net = net.cpu()
        net.eval()  # switch to evaluation mode.
        memory = []
        for i in range(numGame):
            memory.append([Memory() for j in range(numAgent)])
            # memory is recreated every iteration, so only the current iteration's samples are used in training.
            # a separate memory per game is necessary because each game's rollout
            # must be processed on its own.

        steps = 0
        teamscore = 0  # only for game 0.
        record = []  # record the states for visualization.
        gameEnd = np.zeros(numGame).astype(bool)

        while steps <= args.time_horizon:  # loop for one game.
            if np.all(gameEnd):
                break
            steps += 1
            stateList = []
            for game in range(numGame):
                for agent in range(numAgent):
                    stateList.append(
                        np.expand_dims(observations[game][agent], axis=0))
            stateCombined = np.concatenate(stateList, axis=0)
            # concatenate the states of all games and process them by the network together.
            with torch.no_grad():
                actionDistr = net(to_tensor(stateCombined, args.cpuSimulation))
            actions = get_action(actionDistr)

            for game in range(numGame):
                if not gameEnd[game]:
                    # purely random actions do not work here: their probability density under
                    # the policy is too small, which leads to strange bugs.
                    #                    sample = random.random()
                    #                    if sample > args.randomActionRatio * (1 - min(1, iteration/1000) ):
                    #                        thisGameAction = actions[10*game:10*(game+1), :] # contain actions from all agents.
                    #                        check(thisGameAction)
                    #                    else:
                    #                        actionmove = np.random.randint(0, 3, size=(10,3))
                    #                        target = np.random.randint(0, 12, size=(10,1))
                    #                        thisGameAction = np.concatenate([actionmove, target], axis=1)
                    thisGameAction = actions[10 * game:10 * (
                        game + 1
                    ), :]  # select the actions from all agents of this env.
                    envInfo = env[game].step(
                        thisGameAction
                    )  # environment runs one step given the action.
                    nextObs = envInfo['observations']  # get the next state.
                    if game == 0:
                        record.append(
                            np.concatenate([
                                env[game].getState(),
                                actions[0:10, :].reshape(-1)
                            ]))
                    rewards = envInfo['rewards']
                    dones = envInfo['local_done']
                    #                    masks = list(~dones) # cut the return calculation at the done point.
                    masks = [
                        True
                    ] * numAgent  # no need to mask out the last state-action pair,
                    # because the last reward is useful to us.

                    for i in range(numAgent):
                        if not lastDone[game][i]:
                            memory[game][i].push(observations[game][i],
                                                 thisGameAction[i], rewards[i],
                                                 masks[i])
                    lastDone[game] = dones
                    if game == 0:
                        teamscore += sum(
                            [rewards[x] for x in env[game].getTeam0()])
                    observations[game] = nextObs

                    gameEnd[game] = np.all(dones)
                    if gameEnd[game]:
                        if game == 0:
                            print('Game 0 score: %f' % teamscore)


#                            recordMat = np.stack(record)# stack will expand the dimension before concatenate.
#                            draw(recordMat, iteration, env[game].getUnitRange(), 10)
                        observations[game] = env[game].reset(iteration +
                                                             1)['observations']
                        lastDone[game] = [False] * 10

        simEnd = time.time()
        print('Simulation time: %.f' % (simEnd - start))

        net.train()  # switch to training mode.
        net = net.cuda()

        sts, ats, returns, advants, old_policy, old_value = [], [], [], [], [], []

        for game in range(numGame):
            for i in range(numAgent):
                batch = memory[game][i].sample()
                st, at, rt, adv, old_p, old_v = process_memory(
                    net, batch, args)
                sts.append(st)
                ats.append(at)
                returns.append(rt)
                advants.append(adv)
                old_policy.append(old_p)
                old_value.append(old_v)

        sts = torch.cat(sts)
        ats = torch.cat(ats)
        returns = torch.cat(returns)
        advants = torch.cat(advants)
        old_policy = torch.cat(old_policy)
        old_value = torch.cat(old_value)

        train_model(net, optimizer, sts, ats, returns, advants, old_policy,
                    old_value, args)
        # training is based on the state-action pairs from all games of the current iteration.

        trainEnd = time.time()
        print('Training time: %.f' % (trainEnd - simEnd))

        if iteration % 10 == 0:
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     'ckpt_%.3f.pth.tar' % teamscore)

            save_checkpoint(
                {
                    'net': net.state_dict(),
                    'args': args,
                    'score': teamscore
                },
                filename=ckpt_path)
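The training phase of example #6 calls process_memory(net, batch, args) to turn each agent's rollout into states, actions, returns, GAE advantages, and the old policy/value outputs. The sketch below covers only the return/advantage part, assuming gamma and lamda coefficients (hard-coded here as defaults instead of being read from args) and float tensors in which mask = 0 marks an episode boundary; compute_gae is an illustrative name, not the project's function.

import torch

def compute_gae(rewards, masks, values, gamma=0.99, lamda=0.98):
    # rewards, masks, values: 1-D float tensors of equal length for one rollout
    returns = torch.zeros_like(rewards)
    advants = torch.zeros_like(rewards)
    running_return = 0.0
    previous_value = 0.0
    running_advant = 0.0
    for t in reversed(range(len(rewards))):
        # discounted return, used as the critic's regression target
        running_return = rewards[t] + gamma * running_return * masks[t]
        returns[t] = running_return
        # GAE: exponentially weighted sum of TD errors, cut at episode boundaries by the mask
        td_error = rewards[t] + gamma * previous_value * masks[t] - values[t]
        running_advant = td_error + gamma * lamda * running_advant * masks[t]
        advants[t] = running_advant
        previous_value = values[t]
    # normalizing advantages is a common extra step (also an assumption here)
    advants = (advants - advants.mean()) / (advants.std() + 1e-8)
    return returns, advants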
Code Example #7
File: test.py Project: BinaryKR/GAIL_Sketch
    else:
        # a pretrained checkpoint is required for testing
        raise AssertionError("Specify a pretrained checkpoint in the save_model folder, "
                             "e.g. python3 test_algo.py --load_model ppo_max.tar")


    actor.eval(), critic.eval()
    for episode in range(args.iter):
        state = env.reset()
        steps = 0
        score = 0
        for _ in range(500):
            env.render()

            # mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
            mu, std = actor(torch.Tensor(state).unsqueeze(0))
            # the argmax of the sampled vector is used as a discrete action index;
            # `action` is sampled again below but never passed to the environment
            action2 = np.argmax(get_action(mu, std)[0])
            action = get_action(mu, std)[0]
            print('mu, std :', mu, std)
            next_state, reward, done, _ = env.step(action2)
            print('1','next_state :', next_state)
            next_state = running_state(next_state)  # role of ZFilter: return the state normalized to the env's running statistics
            print('2','next_state :', next_state)

            state = next_state
            score += reward

            if done:
                print("{} cumulative reward: {}".format(episode, score))
                break
    print(torch.Tensor(state).unsqueeze(0))