def fitness(self, x):
        # Returns the fitness of a given individual (hyperparameter vector x).
        N = math.floor(x[-4])

        env = self.CHSH(self.n_questions, self.game_type, self.max_gates, reward_function=x[-2], anneal=True)

        if self.agent_type == BasicAgent:
            agent = BasicAgent(state_size=len(env.repr_state), action_size=len(self.ALL_POSSIBLE_ACTIONS), gamma=x[0], eps=x[1], eps_min=x[2],
                               eps_decay=x[3], alpha=x[4], momentum=x[5], ALL_POSSIBLE_ACTIONS=self.ALL_POSSIBLE_ACTIONS,
                               model_type=LinearModel)
            scaler = get_scaler(env, N, ALL_POSSIBLE_ACTIONS=self.ALL_POSSIBLE_ACTIONS)

        else:
            # transform actions to a non-correlated (one-hot) encoding
            encoder = OneHotEncoder(drop='first', sparse=False)
            # transform data
            onehot = encoder.fit_transform(self.ALL_POSSIBLE_ACTIONS)
            onehot_to_action = dict()
            action_to_onehot = dict()
            for a, a_encoded in enumerate(onehot):
                onehot_to_action[str(a_encoded)] = a
                action_to_onehot[a] = str(a_encoded)

            HIDDEN_LAYERS = x[-3]
            agent = DQNAgent(state_size=env.state_size, action_size=len(self.ALL_POSSIBLE_ACTIONS), gamma=x[0], eps=x[1], eps_min=x[2],
                             eps_decay=x[3], ALL_POSSIBLE_ACTIONS=self.ALL_POSSIBLE_ACTIONS, learning_rate=x[4], hidden_layers=len(HIDDEN_LAYERS),
                             hidden_dim=HIDDEN_LAYERS, onehot_to_action=onehot_to_action, action_to_onehot=action_to_onehot)
            scaler = None

        game = Game(scaler, batch_size=x[-1])
        game.evaluate_train(N, agent, env)

        fitness_individual = game.evaluate_test(agent, env)
        return fitness_individual
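
# For orientation, a hypothetical layout of an individual x, reconstructed only
# from the indices read in fitness() above; the concrete values are
# illustrative, not taken from the original project:
#
#   example_x = [0.9,        # x[0]  gamma
#                1.0,        # x[1]  eps
#                0.01,       # x[2]  eps_min
#                0.9995,     # x[3]  eps_decay
#                0.001,      # x[4]  alpha (BasicAgent) / learning_rate (DQNAgent)
#                0.9,        # x[5]  momentum (BasicAgent only)
#                2000,       # x[-4] N, passed to game.evaluate_train (training episodes)
#                [64, 32],   # x[-3] HIDDEN_LAYERS, hidden layer sizes for the DQN branch
#                reward_fn,  # x[-2] reward_function handed to the CHSH environment
#                128]        # x[-1] batch_size for Game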
Example 3
    def __init__(self, env_name, dqn_variant='nature_dqn', mode='train'):
        """
        Classic control class for training and testing gym's classic control problems
        :param env_name: gym environment name (e.g. CartPole)
        :param dqn_variant: which DQN variant to use ('nature_dqn', 'double_dqn' or 'prioritized_dqn')
        :param mode: either 'train' or 'test'
        """
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.action_size = self.env.action_space.n
        if dqn_variant == "nature_dqn":
            self.rl_agent = DQNAgent(self.action_size,
                                     environment_type='atari',
                                     mode=mode,
                                     min_replay_buffer_size=5000,
                                     update_target_network_after=5000)
        elif dqn_variant == "double_dqn":
            self.rl_agent = DoubleDQNAgent(self.action_size,
                                           environment_type='atari',
                                           mode=mode,
                                           min_replay_buffer_size=5000,
                                           update_target_network_after=5000)
        elif dqn_variant == "prioritized_dqn":
            self.rl_agent = PrioritizedDoubleDQNAgent(
                self.action_size,
                environment_type='atari',
                mode=mode,
                min_replay_buffer_size=5000,
                update_target_network_after=5000)

        self.save_model_frequency = 20
        self.total_episode_counter = 0
        self.total_action_counter = 0
        self.episode_reward = 0

        if LOGGING:
            reward_log_dir = 'logs/gradient_tape/' + dqn_variant + '_' + mode + '/' + current_time + 'reward'
            self.reward_writer = tf.summary.create_file_writer(reward_log_dir)
            self.reward_metric = DQNMetric()

        if mode == 'train':
            self.train()
        elif mode == 'test':
            self.test()
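
# Note: the constructor above immediately calls self.train() or self.test(), so
# simply instantiating the class starts a run. A minimal, hypothetical usage
# (assuming the enclosing class is named ClassicControl):
#
#   ClassicControl('CartPole-v1', dqn_variant='double_dqn', mode='train')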
Example 4
        )
        
        # agents
        agents = {}
        agents['DQN'] = DQNAgent(num_of_clusters, num_of_clusters*6,
            learning_rate=0.01,
            reward_decay=0.9,

            # Epsilon greedy
            e_greedy_min=(0.0, 0.1),
            e_greedy_max=(0.2, 0.8),
            e_greedy_init=(0.1, 0.5),
            e_greedy_increment=(0.005, 0.01),
            e_greedy_decrement=(0.005, 0.001),

            history_size=50,
            dynamic_e_greedy_iter=25,
            reward_threshold=3,
            explore_mentor='LRU',

            replace_target_iter=100,
            memory_size=10000,
            batch_size=128,

            output_graph=False,
            verbose=0
        )
        for (name, agent) in agents.items():

            print("-------------------- %s --------------------" % name)
            step = 0
    epsi = [1.0]
    epsi_decay = [0.05, 0.01, 0.005, 0.001]
    reward = 0
    XlearnRate = 0
    Xdiscount = 0
    Xepsi = 0
    Xdec = 0
    rewards = []
    for j in learnRates:
        for k in discounts:
            for ep in epsi:
                for epdec in epsi_decay:
                    params = [episode_count, j, k, ep, epdec]
                    #params = [5000, 0.001,0.95, 1.1, 0.005]
                    #params = [5000, 0.0005,0.99, 0.1]
                    agent = DQNAgent(env.action_space, env.observation_space,
                                     params)

                    agent._render = False
                    rewardList, stepList = agent.train(env)
                    rewards.append(rewardList)
                    if reward < sum(rewards[-1]) / episode_count:
                        reward = sum(rewards[-1]) / episode_count
                        mytrainedAgent = agent
                        XlearnRate = j
                        Xdiscount = k
                        Xepsi = ep
                        Xdec = epdec
                    plt.plot(low_pass(rewardList),
                             label=("Test: " + str(params)))

    ## Train Agent
Example 6
            os.remove(f)
    
    # Config
    config = Config()
    config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Special Configuration
    config.SIGMA_INIT = 0.0
    config.N_STEPS = 3


    # Env
    env = PrepareAtariEnv(env_id, log_dir)

    # Agent
    agent = DQNAgent(config, env, log_dir, static_policy=False)

    # Begin Interaction & Learning
    
    episode_reward = 0
    observation = env.reset()

    for frame_idx in tqdm(range(1, config.MAX_FRAMES+1)):
        # Prepare to explore
        eps = agent.epsilon_by_frame(frame_idx)

        # Explore or Exploit
        action = agent.get_action(observation, eps)
        agent.save_action(action, frame_idx)

        # Execute
class DQNController(object):
    def __init__(self, **kwargs):
        args = kwargs.get('args')
        # Number of steps of training before training network's weights are
        # copied to target network (C)
        self.copy_steps = 10000
        # Number of frames to be stacked for a state representation (m)
        self.stack_num = 4
        # Number of times actions are to be repeated (k)
        self.repeat_action = 1
        # Size of minibatch
        self.minibatch_size = 32
        # Lower than this, epsilon is kept constant
        self.min_epsilon = 0.1
        # Epsilon's starting value
        self.max_epsilon = 1.0
        self.epsilon = 1.0
        # Number of steps to anneal epsilon
        self.anneal_till = 1000000
        # Discount factor
        self.discount = 0.99
        # Variable that holds the current Environment
        self.environment = AtariEnvironment(args=args)
        self.action_space = self.environment.getPossibleActions()
        # For how long should the network observe before playing?
        self.observation_time_steps = 50000
        # The network
        self.network = DQNAgent(self.action_space, self.discount, args)
        self.train_frequency = 4
        self.record_frequency = 10000
        # The current state of the environment (stacked)
        self.current_state = deque(maxlen=self.stack_num)
        self.current_state.append(self.environment.getObservation())
        # Experience replay
        self.memory_limit = 50000
        self.experience_replay = ExperienceReplay(self.memory_limit, (84, 84),
                                                  self.minibatch_size,
                                                  self.stack_num)
        # Maximum no-ops
        self.num_no_op = 0
        self.max_no_op = 30
        self.steps = 0

        self.num_epochs = 120
        self.train_steps_per_epoch = 250000
        self.num_test_epochs = 10
        self.test_steps_per_epoch = 1000

    def __anneal_epsilon__(self):
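        # Linear annealing: decrease epsilon by (max_epsilon - min_epsilon) / anneal_till
        # on each call, so it reaches min_epsilon after anneal_till steps, then clamp it there.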
        self.epsilon = max(
            self.epsilon -
            ((self.max_epsilon - self.min_epsilon) / self.anneal_till),
            self.min_epsilon)
        return

    def __sample_epsilon_action__(self):
        action = None
        if random.random() < self.epsilon:
            action = self.environment.sampleRandomAction()
        else:
            # Use the current state of the emulator and predict an action which gets
            # added to replay memory (use playing_network)
            q_values = self.network.predict(
                self.experience_replay.getCurrentState())
            action = np.argmax(q_values, axis=1)[0]
        return action

    def __supply_action_to_environment__(self, action):
        self.environment.performAction(action)
        # Add current state, action, reward, consequent state to experience replay
        self.experience_replay.add(
            (self.environment.getObservation(), action,
             self.environment.getReward(), self.environment.isTerminalState()))
        return

    def __observe__(self):
        observe_start = time.time()
        for _ in xrange(self.observation_time_steps):
            action = self.environment.sampleRandomAction()
            self.__supply_action_to_environment__(action)
        observe_duration = time.time() - observe_start
        logger.info('Finished observation. Steps=%d; Time taken=%.2f',
                    self.observation_time_steps, observe_duration)

    def isGameOver(self):
        if self.environment.isTerminalState():
            self.environment.reset()
            for _ in xrange(self.stack_num):
                action = self.environment.sampleRandomAction()
                self.__supply_action_to_environment__(action)

    def run(self):
        """This method will be called from the main() method."""
        # Observe the game by randomly sampling actions from the environment
        # and performing those actions
        self.__observe__()
        for i in xrange(self.num_epochs):
            self.environment.resetStatistics()
            time_now = time.time()
            for j in xrange(self.train_steps_per_epoch):
                # Get action using epsilon-greedy strategy
                action = self.__sample_epsilon_action__()
                # Perform action based on epsilon-greedy search and store the transitions
                # in experience replay
                self.__supply_action_to_environment__(action)
                # If the environment is in the terminal state, reset the environment, and
                # perform self.stack_num actions to reset the environment
                self.isGameOver()
                if j % self.train_frequency == 0:
                    # print "Started training"
                    # Sample minibatch of size self.minibatch_size from experience replay
                    minibatch = self.experience_replay.sample()
                    minibatch_states, minibatch_action, minibatch_reward, minibatch_next_states, \
                            minibatch_terminals = minibatch
                    cost = self.network.train_network(minibatch_states,
                                                      minibatch_action,
                                                      minibatch_reward,
                                                      minibatch_terminals,
                                                      minibatch_next_states)
                if j % self.record_frequency == 0:
                    total_score, num_games = self.environment.getStatistics()
                    avg_score = total_score / num_games
                    self.network.record_average_qvalue(
                        self.experience_replay.getCurrentState(),
                        i * self.train_steps_per_epoch + j, self.epsilon,
                        avg_score)
                # Epsilon annealing
                self.__anneal_epsilon__()
                # if self.time_step % 1000 == 0:
                #     print "Cost at iteration", self.time_step, " is", cost
                #     print "Value of epsilon is", self.epsilon
                self.steps += 1
                if j % self.copy_steps == 0:
                    self.network.copy_weights()
            total_score, num_games = self.environment.getStatistics()
            time_taken = (time.time() - time_now)
            logger.info("Finished epoch %d: Steps=%d; Time taken=%.2f", i, j,
                        time_taken)
            logger.info("\tNumber of games: %d; Average reward: %.2f",
                        num_games, (total_score / num_games))
            logger.info("\tFinal epsilon value for epoch: %f", self.epsilon)
            self.network.create_checkpoint()

    def run_testing_stage(self):
        for i in xrange(self.num_test_epochs):
            self.environment.resetStatistics()
            for _ in xrange(self.stack_num):
                action = self.environment.sampleRandomAction()
                self.__supply_action_to_environment__(action)
            for j in xrange(self.test_steps_per_epoch):
                q_values = self.network.predict(
                    self.experience_replay.getCurrentState())
                action = np.argmax(q_values, axis=1)[0]
                self.environment.performAction(action)
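
# A minimal driver sketch for the controller above, assuming an `args` object as
# produced by this project's argument parsing (hypothetical, for illustration);
# run() is documented as being called from main(), and run_testing_stage()
# would follow training:
#
#   controller = DQNController(args=args)
#   controller.run()
#   controller.run_testing_stage()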
from agents.DQNAgent import DQNAgent
from utils.Environments import DiscEnv
import matplotlib.pyplot as plt
import numpy as np

num_act = 3
input_dim = 1
batch_size = 100
sender = DQNAgent(input_dim=input_dim,
                  num_actions=num_act,
                  batch_size=batch_size,
                  lr=0.01,
                  eps_start=0.99,
                  intermed_nodes=num_act,
                  eps_min=0.01,
                  eps_dec=5e-5,
                  capacity=7000)  #5
receiver = DQNAgent(input_dim=input_dim,
                    num_actions=num_act,
                    batch_size=batch_size,
                    lr=0.01,
                    eps_start=0.99,
                    intermed_nodes=num_act,
                    eps_min=0.01,
                    eps_dec=5e-5,
                    capacity=7000)
env = DiscEnv(num_obs=num_act, num_actions=num_act)
returns = []

print("sender action probabilities")
for s in range(num_act):
Example 9
eval_env = DoudizhuEnv(config)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# initialize random agent for evaluation
random_agent = RandomAgent(action_num=eval_env.action_num)

rule_agent = DouDizhuRuleAgentV1()

# initialize DQN agents
dqn_agents = []

for i in range(env.player_num):
    dqn_agents.append(DQNAgent(num_actions=env.action_num,
                               state_shape=env.state_shape,
                               lr=.000001,
                               use_conv=True,
                               dueling=False,
                               soft_update=True))

env.set_agents(dqn_agents)
eval_env.set_agents([dqn_agents[0], rule_agent, rule_agent])
print(dqn_agents[0].q_net)

eval_every = 500
eval_num = 1000
episode_num = 100_000

log_dir = './experiments/dqn_conv/'
logger = Logger(log_dir)

save_dir = './experiments/dqn_conv/models'
Example 10
    evaluation_tactic = [[1, 0, 0, 1],
                         [1, 0, 0, 1],
                         [1, 0, 0, 1],
                         [0, 1, 1, 0]]
    max_gates = 10
    round_to = 3
    env = Environment(n_questions, evaluation_tactic, max_gates)

    # (state_size, action_size, gamma, eps, eps_min, eps_decay, alpha, momentum)
    # agent = BasicAgent(state_size=len(env.repr_state), action_size=len(ALL_POSSIBLE_ACTIONS), gamma=0.1, eps=1, eps_min=0.01,
    #                    eps_decay=0.9998, alpha=0.001, momentum=0.9, ALL_POSSIBLE_ACTIONS=ALL_POSSIBLE_ACTIONS, model_type=LinearModel)

    hidden_dim = [len(env.repr_state), len(env.repr_state) // 2]
    #
    agent = DQNAgent(state_size=len(env.repr_state), action_size=len(ALL_POSSIBLE_ACTIONS), gamma=0.1, eps=1, eps_min=0.01,
                     eps_decay=0.9998, ALL_POSSIBLE_ACTIONS=ALL_POSSIBLE_ACTIONS, learning_rate=0.001, hidden_layers=len(hidden_dim),
                     hidden_dim=hidden_dim)

    # scaler = get_scaler(env, N, ALL_POSSIBLE_ACTIONS, round_to=round_to)
    batch_size = 128

    # store the final value of the portfolio (end of episode)
    game = Game(round_to=round_to)
    portfolio_value, rewards = game.evaluate_train(N, agent, env)

    # plot relevant information
    NonLocalGame.show_plot_of(rewards, "reward")

    if agent.model.losses is not None:
        NonLocalGame.show_plot_of(agent.model.losses, "loss")
Example 11
def DQN_Exploration(args, log_dir, device, initial_state):
    env = NqubitEnvDiscrete(args.nbit,
                            initial_state)  # env.get_easy_T() remained to do
    agent = DQNAgent(args, env, log_dir, device)
    writer = SummaryWriter(log_dir)

    Temp = args.Temp
    totalstep = 0
    epsilon = 1.0
    obs = env.reset()
    print('initial_reward{0}'.format(env.get_current_threshold(obs)))

    for episode in tqdm(range(args.num_episodes)):
        Temp = Temp * 10.0**(-0.1)
        obs = env.reset()

        for step in tqdm(range(args.episode_length)):

            # choose large stepsize action number
            action = agent.get_action(obs, epsilon)
            # action is an int

            # execute large stepsize number if it satisfies the strong constraint
            next_obs, reward, done, info = env.step(obs, action,
                                                    args.action_delta)
            #agent.buffer.push((obs, action, reward, next_obs))

            # judge the large action stepsize effect
            # if ep = 0 : large stepsize is useless

            ep, action_delta = agent.prob(obs, next_obs, action)
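            # Simulated-annealing (Metropolis-style) acceptance: always accept an
            # improvement (ep > 0), otherwise accept with probability exp(ep / Temp),
            # where Temp shrinks by a factor of 10**(-0.1) every episode (see above).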

            accept_probability = 1 if (ep > 0) else np.exp(ep / Temp)
            u = random.random()

            if u <= accept_probability:  # take a small stepsize
                #agent.buffer.push((obs, action, reward, next_obs))

                next_obs, reward, done, info = env.step(
                    obs, action, action_delta)
            else:  # No operation, the transition will be (obs, 0, reward, obs)
                action = 0
                next_obs, reward, done, info = env.step(
                    obs, action, action_delta)

            # record
            writer.add_scalar('threshold_rew', reward, totalstep)

            agent.buffer.push((obs, action, reward, next_obs))

            if (totalstep > args.learn_start_steps) and (
                    totalstep % args.update_freq == 0):
                loss = agent.update()
                writer.add_scalar('loss', loss, totalstep)
                epsilon = agent.epsilon_by_step(totalstep)
                if epsilon < args.epsilon_min:
                    epsilon = args.epsilon_min

            obs = next_obs
            totalstep += 1
            if (reward >= -1.0):
                return reward, obs

            # Test_DQN_Agent
            if (totalstep % args.test_freq == 0):
                test_epsilon = 0.0
                test_obs = env.reset()
                #T = env.get_easy_T(args.nbits)
                reward_recorder = -2.0
                obs_recorder = test_obs

                for step in range(args.test_step):
                    test_action = agent.get_action(test_obs, test_epsilon)

                    # execute large stepsize number
                    test_next_obs, reward, done, info = env.step(
                        test_obs, test_action, args.action_delta)

                    # judge the large action stepsize effect
                    ep, action_delta = agent.prob(test_obs, test_next_obs,
                                                  test_action)

                    accept_probability = 1 if (ep > 0) else np.exp(ep / Temp)
                    u = random.random()

                    if u <= accept_probability:  # take a small stepsize

                        test_next_obs, reward, done, info = env.step(
                            test_obs, test_action, action_delta)
                    else:
                        test_action = 0
                        test_next_obs = test_obs
                        reward = env.get_current_threshold(test_obs)

                    if reward > reward_recorder:
                        reward_recorder = reward
                        obs_recorder = test_next_obs
                    if (reward >= -1.0):
                        return reward, test_next_obs

                    agent.buffer.push(
                        (test_obs, test_action, reward, test_next_obs))
                    test_obs = test_next_obs

                writer.add_scalar('test_max_reward', reward_recorder,
                                  totalstep)
                writer.add_scalars(
                    'solution', {
                        's0': obs_recorder[0],
                        's1': obs_recorder[1],
                        's2': obs_recorder[2],
                        's3': obs_recorder[3],
                        's4': obs_recorder[4],
                        's5': obs_recorder[5]
                    }, totalstep)
Example 13
class NFSPAgent:
    """
    Parameters:
        scope : identifier for this agent (used in logging)
        num_actions (int) : how many possible actions
        state_shape (list) : tensor shape of state
        sl_lr (float) : learning rate to use for training average policy net
        rl_lr (float) : learning rate to use for training action value net
        batch_size (int) : batch size to use when training networks
        train_every (int) : train the average policy net every this many timesteps
        sl_memory_init_size (int) : minimum number of experiences in the supervised learning buffer before training starts
        sl_memory_size (int) : max number of experiences to store in supervised learning memory buffer
        q_train_every (int) : train the action value net every this many timesteps
        epsilon_decay_steps (int) : number of steps over which the epsilon schedule is spread
        eta (float) : anticipatory parameter for NFSP
        gamma (float) : discount parameter
        device (torch.device) : device to put models on
    """
    def __init__(self,
                 # these are the parameters in nfsp paper
                 scope,
                 num_actions,
                 state_shape,
                 sl_lr=.005,
                 rl_lr=.1,
                 batch_size=256,
                 train_every=128,
                 sl_memory_init_size=1000,
                 sl_memory_size=int(2e6),
                 q_train_every=128,
                 epsilon_decay_steps=int(1e5),
                 eta=.2,
                 gamma=.99,
                 device=None):
        if device is None:
            self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device
        self.scope = scope
        self.num_actions = num_actions
        self.state_shape = state_shape
        self.rl_lr = rl_lr
        self.sl_lr = sl_lr
        self.batch_size = batch_size
        self.train_every = train_every
        self.discount_factor = gamma
        self.sl_memory_init_size = sl_memory_init_size
        self.q_train_every = q_train_every
        self.anticipatory_param = eta
        self.use_raw = False

        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilons = np.linspace(0.08, 0.0, epsilon_decay_steps)
        # self.epsilons = np.linspace(1.0, 0.1, epsilon_decay_steps)

        # average policy can be modeled as a Deep Q Network and we take softmax after final layer
        self.average_policy = AveragePolicyNet(state_shape=state_shape,
                                               num_actions=num_actions,
                                               use_conv=True,).to(self.device)
        self.average_policy.eval()

        # action value and target network are Deep Q Networks
        self.rl_agent = DQNAgent(state_shape=self.state_shape,
                                 num_actions=self.num_actions,
                                 lr=self.rl_lr,
                                 batch_size=128,
                                 train_every=64,
                                 epsilons=self.epsilons,
                                 )

        # initialize optimizers
        """
        in the paper: using sgd optim, eta = 0.1.
        rl_lr = 0.1, sl_lr = 0.005,
        epsilon decay from 0.06 to 0
        """
        self.sl_optim = torch.optim.Adam(self.average_policy.parameters(), lr=self.sl_lr)

        # initialize memory buffers
        self.sl_buffer = ReservoirMemoryBuffer(sl_memory_size, batch_size)

        # current policy
        self.policy = None

        self.softmax = torch.nn.Softmax(dim=1)

        self.timestep = 0

        # for plotting
        self.loss = 0
        self.actions = []
        self.predictions = []

    def set_policy(self, policy=None):
        """
            Set policy parameter
            Input :
                policy (str) : policy to use. sets according to anticipatory parameter on default.
            Output :
                None, sets policy parameter
        """
        # set policy according to string
        if policy and policy in ['average_policy', 'best_response', 'greedy_average_policy']:
            self.policy = policy
        else:
            self.policy = 'best_response' if np.random.uniform() <= self.anticipatory_param else 'average_policy'
        return self.policy

    def ap_pick_action(self, state):
        """

         Pick an action given a state using the average policy network
         Input:
             state (dict)
                 'obs' : actual state representation
                 'legal_actions' : possible legal actions to be taken from this state
         Output:
             action (int) : integer representing action id
         """
        with torch.no_grad():
            state_obs = torch.FloatTensor(state['obs']).unsqueeze(0).to(self.device)
            q_values = self.average_policy(state_obs)[0].cpu().detach().numpy()
            probs = remove_illegal(q_values, state['legal_actions'])
            action = np.random.choice(self.num_actions, p=probs)
            # print('sl: ', action)
            # print(q_values, action)
            return action, probs

    def greedy_ap_pick_action(self, state):
        """
        Pick an action greedily given a state using the average policy network
        Input:
            state (dict)
                'obs' : actual state representation
                'legal_actions' : possible legal actions to be taken from this state
        Output:
            action (int) : integer representing action id
        """
        with torch.no_grad():
            state_obs = torch.FloatTensor(state['obs']).unsqueeze(0).to(self.device)
            q_values = self.average_policy(state_obs)[0].cpu().detach().numpy()
            probs = remove_illegal(q_values, state['legal_actions'])
            action = np.argmax(probs)
            return action, probs

    def step(self, state):
        """
        Given state, produce actions to generate training data. Choose action according to set policy parameter.
        Input:
            state (dict)
                'obs' : actual state representation
                'legal_actions' : possible legal actions to be taken from this state
        Output:
            action (int) : integer representing action id
        """

        if self.policy == 'average_policy':
            action = self.ap_pick_action(state)[0]
        elif self.policy == 'best_response':
            action = self.rl_agent.step(state)

        return action

    def eval_step(self, state):
        """
           Pick an action given a state according to set policy. This is to be used during evaluation, so no epsilon greedy.
           Makes call to eval_pick_action or average_policy to actually select the action
           Input:
               state (dict)
                   'obs' : actual state representation
                   'legal_actions' : possible legal actions to be taken from this state
           Output:
               action (int) : integer representing action id
               probs (np.array) : softmax distribution over the actions
        """
        if self.policy == 'average_policy':
            action, probs = self.ap_pick_action(state)
        elif self.policy == 'greedy_average_policy':
            action, probs = self.greedy_ap_pick_action(state)
        elif self.policy == 'best_response':
            action, probs = self.rl_agent.eval_step(state)
        self.actions.append(action)

        return action, probs

    def add_transition(self, transition):
        """"
        Add transition to our memory buffers and train the networks one batch.
        Input:
            transition (tuple) : tuple representation of a transition --> (state, action, reward, next state, done)
        Output:
            Nothing. Stores transition in the buffers, updates networks using memory buffers, and updates target network
            depending on what timestep we're at.
        """

        state, action, reward, next_state, done = transition
        self.rl_agent.add_transition(transition)
        self.timestep += 1

        if self.policy == 'best_response':
            # this version saves the predicted action from dqn_eval instead of the action that was actually taken by the agent.
            self.sl_buffer.add_sa(state['obs'], action)

        if len(self.sl_buffer.memory) >= self.sl_memory_init_size and self.timestep % self.train_every == 0:
            sl_loss = self.train_sl()
            print(f'\rAgent {self.scope}, step: {self.timestep}, sl_loss on batch: {sl_loss}', end='')
            # print(f'step: {self.timestep} average policy updated')

    def train_sl(self):
        """
        Samples from supervised learning memory buffer and trains the average policy network one step.
        Input:
            Nothing. Draws sample from sl buffer to train the network
        Output:
            loss (float) : loss on training batch
        """

        samples = self.sl_buffer.sample()

        states = [s[0] for s in samples]
        actions = [s[1] for s in samples]

        self.average_policy.train()
        self.sl_optim.zero_grad()

        # [batch, state_shape(450)]
        states = torch.FloatTensor(states).to(self.device)
        # [batch, 1]
        actions = torch.LongTensor(actions).to(self.device)

        #### optimizing the log-prob of past actions taken as in NFSP paper
        # [batch, action_num(309)]
        probs = self.average_policy(states)
        # [batch, 1]
        prob = probs.gather(1, actions.unsqueeze(1)).squeeze(1)
        # adding a small eps to torch.log(), avoiding nan in the log_prob
        eps = 1e-7
        log_prob = torch.log(prob + eps)

        ### look into torch.nll_loss
        loss = -log_prob.mean()

        loss.backward()
        self.sl_optim.step()
        self.average_policy.eval()

        return loss.item()

    def save_state_dict(self, file_path):
        """
        Save state dict for networks of NFSP agent
        Input:
            file_path (str) : string filepath to save agent at
        """
        state_dict = dict()
        state_dict['average_policy'] = self.average_policy.state_dict()
        state_dict['dqn_net'] = self.rl_agent.q_net.state_dict()
        state_dict['dqn_target'] = self.rl_agent.target_net.state_dict()

        torch.save(state_dict, file_path)

    def load_from_state_dict(self, filepath):
        """
           Load agent parameters from filepath
           Input:
               file_path (str) : string filepath to load parameters from
        """

        state_dict = torch.load(filepath, map_location=self.device)
        self.average_policy.load_state_dict(state_dict['average_policy'])
        self.rl_agent.q_net.load_state_dict(state_dict['dqn_net'])
        self.rl_agent.target_net.load_state_dict(state_dict['dqn_target'])
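
# A minimal sketch of how the NFSPAgent above might be driven for one episode.
# The environment interface (reset()/step() returning state dicts with 'obs' and
# 'legal_actions') and the helper name run_nfsp_episode are assumptions for
# illustration; only agent methods defined above are used.
def run_nfsp_episode(agent, env):
    # Sample this episode's policy (best response vs. average policy).
    agent.set_policy()
    state = env.reset()
    done = False
    while not done:
        action = agent.step(state)
        next_state, reward, done = env.step(action)
        # Store the transition; when the best-response policy is active this also
        # feeds the supervised-learning buffer that trains the average policy net.
        agent.add_transition((state, action, reward, next_state, done))
        state = next_state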
Example 14
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--type",
                        choices=["Atari", "Classic"],
                        help="Select the Type of Game from OpenAI gym",
                        required=True)
    parser.add_argument("--name",
                        help="Select the Name of Game eg. Breakout-v0",
                        required=True)
    parser.add_argument("--mode",
                        choices=["train", "test"],
                        help="Choose to Train or Test",
                        default="train",
                        required=False)
    args = parser.parse_args()

    if args.type == "Classic":
        environment = BaseGymEnvironment(args.type, args.name)
    elif args.type == "Atari":
        environment = AtariGymEnvironment(args.type, args.name)

    input_shape = environment.observation_shape()
    nb_actions = environment.nb_actions()

    agent = DQNAgent(args.type, args.name, input_shape, nb_actions)
    if args.mode == "train":
        agent.learn(environment)
    else:
        agent.play(environment)
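
# Hypothetical invocation (the actual entry-point script name depends on the
# repository):
#
#   python main.py --type Atari --name Breakout-v0 --mode train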