Example #1
def play(env_name, seed=42, model=None, render=True):
    # Create the environment
    env = make_env(env_name, seed)
    # Get PyTorch device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Load the checkpoint and rebuild the Q-network
    state = torch.load(
        model, map_location="cuda" if torch.cuda.is_available() else "cpu")
    net = QNetwork(env.observation_space,
                   env.action_space,
                   arch=state['arch'],
                   dueling=state.get('dueling', False)).to(device)
    net.load_state_dict(state['state_dict'])

    total_returns = []
    obs, ep_return, ep_len = env.reset(), 0, 0
    # Roll out greedy episodes until 42 returns have been collected
    while len(total_returns) < 42:

        action = net(torch.from_numpy(np.expand_dims(
            obs, 0)).to(device)).argmax(dim=1)[0].item()
        obs, reward, done, _ = env.step(action)
        if render:
            env.render()
        ep_return += reward
        ep_len += 1

        if done:
            total_returns.append(ep_return)
            print("Episode return:", ep_return, '\tlength:', ep_len)
            obs, ep_return, ep_len = env.reset(), 0, 0

        if render:
            time.sleep(0.01)

    print("Mean return", sum(total_returns) / len(total_returns))
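
A hedged usage sketch for the function above; the environment id and checkpoint filename are placeholders, not taken from the source:

if __name__ == "__main__":
    # Evaluate a saved checkpoint greedily for 42 episodes without rendering.
    play("PongNoFrameskip-v4", seed=42, model="checkpoint.pth", render=False)
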
Example #2
    def __init__(self, 
                 env: 'Environment',
                 input_frame: ('int: the number of channels of input image'),
                 input_dim: ('int: the width and height of pre-processed input image'),
                 num_frames: ('int: Total number of frames'),
                 eps_decay: ('float: Epsilon Decay_rate'),
                 gamma: ('float: Discount Factor'),
                 target_update_freq: ('int: Target Update Frequency (by frames)'),
                 update_type: ('str: Update type for target network. Hard or Soft')='hard',
                 soft_update_tau: ('float: Soft update ratio')=None,
                 batch_size: ('int: Update batch size')=32,
                 buffer_size: ('int: Replay buffer size')=1000000,
                 update_start_buffer_size: ('int: Update starting buffer size')=50000,
                 learning_rate: ('float: Learning rate')=0.0004,
                 eps_min: ('float: Epsilon Min')=0.1,
                 eps_max: ('float: Epsilon Max')=1.0,
                 device_num: ('int: GPU device number')=0,
                 rand_seed: ('int: Random seed')=None,
                 plot_option: ('str: Plotting option')=False,
                 model_path: ('str: Model saving path')='./'):
        
        action_dim = env.action_space.n
        self.device = torch.device(f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path
        
        self.env = env
        self.input_frames = input_frame
        self.input_dim = input_dim
        self.num_frames = num_frames
        self.epsilon = eps_max
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.update_cnt = 0
        self.update_type = update_type
        self.tau = soft_update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.update_start = update_start_buffer_size
        self.seed = rand_seed
        self.plot_option = plot_option
        
        self.q_current = QNetwork((self.input_frames, self.input_dim, self.input_dim), action_dim).to(self.device)
        self.q_target = QNetwork((self.input_frames, self.input_dim, self.input_dim), action_dim).to(self.device)
        self.q_target.load_state_dict(self.q_current.state_dict())
        self.q_target.eval()
        self.optimizer = optim.Adam(self.q_current.parameters(), lr=learning_rate) 

        self.memory = ReplayBuffer(self.buffer_size, (self.input_frames, self.input_dim, self.input_dim), self.batch_size)
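
The constructor stores update_type ('hard' or 'soft') and soft_update_tau, but the update itself is not shown here. A minimal sketch of what such a target-network update typically looks like (the function name and the default tau are assumptions):

import torch.nn as nn

def update_target(q_current: nn.Module, q_target: nn.Module,
                  update_type: str = 'hard', tau: float = 0.005) -> None:
    """Copy (hard) or Polyak-average (soft) the online weights into the target network."""
    if update_type == 'hard':
        q_target.load_state_dict(q_current.state_dict())
    else:
        # Soft update: theta_target <- tau * theta_online + (1 - tau) * theta_target
        for tgt, src in zip(q_target.parameters(), q_current.parameters()):
            tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)
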
Example #3
    def __init__(self, task):
        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        self.w = np.random.normal(
            size=(self.state_size, self.action_size),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size))) # start producing actions in a decent range

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        
        self.learning_rate = 0.0001
        self.hidden_size = 64
        tf.reset_default_graph()
        self.mainQN = QNetwork(name='main', hidden_size=self.hidden_size, learning_rate=self.learning_rate)

        # Episode variables
        self.reset_episode()
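
The act() method of this hill-climbing-style agent is not shown; given the weight shape and the scale comment above, it presumably projects the state through the weight matrix, roughly as in this sketch (the function name and the clipping are assumptions):

import numpy as np

def act(state, w, action_low, action_high):
    """Linear policy: project the state through the weights and clip to the action range."""
    action = np.dot(state, w)            # shape: (action_size,)
    return np.clip(action, action_low, action_high)
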
Example #4
 def __init__(self, state_space, action_space, seed, update_every, batch_size, buffer_size, learning_rate):
     self.action_space = action_space
     self.state_space = state_space
     self.seed = random.seed(seed)
     self.batch_size = batch_size
     self.buffer_size = buffer_size
     self.learning_rate = learning_rate
     self.update_every = update_every
     
     self.qnetwork_local = QNetwork(state_space, action_space)
     self.qnetwork_target = QNetwork(state_space, action_space)
     self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)
     # Initialize the replay buffer
     self.memory = ReplayBuffer(action_space, buffer_size, batch_size, seed)
     # Initialize time step (for updating every UPDATE_EVERY steps)
     self.t_step = 0
Example #5
def main(flags):
    '''
        Runs an agent in an environment.
        params:
            flags (dict): configuration
    '''
    env = gym.make('FrozenLake-v0')
    #    env.seed(42)
    #
    #    import numpy as np
    #    np.random.seed(42)
    #
    #    import tensorflow as tf
    #    tf.set_random_seed(42)

    agent = QNetwork(env,
                     gamma=flags.gamma,
                     learning_rate=flags.learning_rate,
                     num_units=flags.num_units,
                     num_layers=flags.num_layers)
    #    agent = QTable(env,
    #                   gamma=flags.gamma,
    #                   alpha=flags.learning_rate)

    trainer = Trainer(env, agent, flags)
    rewards, lengths = trainer.train(flags.num_episodes, flags.max_steps)

    plot_results(rewards, lengths)
Example #6
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 GAMMA=GAMMA,
                 TAU=TAU,
                 LR=LR,
                 UPDATE_EVERY=UPDATE_EVERY,
                 BUFFER_SIZE=BUFFER_SIZE,
                 BATCH_SIZE=BATCH_SIZE):
        """ Initialize the agent.
        ==========
        PARAMETERS 
        ==========
            state_size (int) = observation dimension of the environment
            action_size (int) = dimension of each action
            seed (int) = random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.gamma = GAMMA
        self.tau = TAU
        self.lr = LR
        self.update_every = UPDATE_EVERY
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # instantiate online local and target network for weight updates
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        # create a replay buffer
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # time step counter; learn (and soft-update the target) every update_every steps
        self.t_step = 0
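
This constructor only wires up the networks, optimizer, and buffer; the learning step it prepares for is the standard DQN update, sketched below with illustrative names (the actual learn() of this agent appears in Example #17):

import torch.nn.functional as F

def dqn_loss(q_local, q_target, states, actions, rewards, next_states, dones, gamma=0.99):
    """TD-error loss: Q(s, a) against r + gamma * max_a' Q_target(s', a') * (1 - done)."""
    q_expected = q_local(states).gather(1, actions)                    # Q(s, a)
    q_next = q_target(next_states).detach().max(1)[0].unsqueeze(1)     # max_a' Q_target(s', a')
    q_targets = rewards + gamma * q_next * (1 - dones)
    return F.mse_loss(q_expected, q_targets)
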
Example #7
 def __init__(self, state_size, action_size, seed):
     """
     Params
     ======
         state_size (int): dimension of each state
         action_size (int): dimension of each action
         seed (int): random seed
     """    
     self.state_size = state_size
     self.action_size = action_size
     self.seed = random.seed(seed)
     
     # Q_Network
     self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
     self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
     self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
     
     # Replay Buffer
     self.replay_buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
     self.t_step = 0
Example #8
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #9
def main(flags):
    '''
        Runs an agent in an environment.
        params:
            flags (dict): configuration
    '''
    env = gym.make('FrozenLake-v0')

    agent = QNetwork(env,
                     gamma=flags.gamma,
                     learning_rate=flags.learning_rate,
                     num_units=flags.num_units,
                     num_layers=flags.num_layers)

    trainer = Trainer(env, agent, flags)
    rewards, lengths = trainer.train(flags.num_episodes, flags.max_steps)

    plot_results(rewards, lengths)
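
main() expects a flags object exposing gamma, learning_rate, num_units, num_layers, num_episodes, and max_steps; a hedged sketch of supplying them via argparse (the default values are placeholders):

import argparse

parser = argparse.ArgumentParser(description='Train a Q-network agent on FrozenLake.')
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--num_units', type=int, default=64)
parser.add_argument('--num_layers', type=int, default=2)
parser.add_argument('--num_episodes', type=int, default=5000)
parser.add_argument('--max_steps', type=int, default=100)

if __name__ == '__main__':
    main(parser.parse_args())
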
Example #10
def train(gamepath, n_episodes, display_screen, record_weights,
          reduce_exploration_prob_amount, n_frames_to_skip, exploration_prob,
          verbose, discount, learning_rate, load_weights,
          frozen_target_update_period, use_replay_mem):
    """
    :description: trains an agent to play a game 

    :type gamepath: string 
    :param gamepath: path to the binary of the game to be played

    :type n_episodes: int 
    :param n_episodes: number of episodes of the game on which to train

    display_screen : whether or not to display the screen of the game 
    
    record_weights : whether or not to save the weights of the network
    
    reduce_exploration_prob_amount : amount by which to reduce exploration_prob each episode
                                     (set to 0 to keep exploration_prob constant)
    
    n_frames_to_skip : how frequently to determine a new action to use
    
    exploration_prob : probability of choosing a random action
    
    verbose : whether or not to print information about the run periodically
    
    discount : discount factor used in learning 
    
    learning_rate : the scaling factor for the SGD update
    
    load_weights : whether or not to load weights for the network (set the files directly below)
    
    frozen_target_update_period : the number of episodes between resetting the target of the network
    """

    # load the ale interface to interact with
    ale = ALEInterface()
    ale.setInt('random_seed', 42)

    # display/recording settings, doesn't seem to work currently
    recordings_dir = './recordings/breakout/'
    # previously "USE_SDL"
    if display_screen:
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False) # Sound doesn't work on OSX
            #ale.setString("record_screen_dir", recordings_dir);
        elif sys.platform.startswith('linux'):
            ale.setBool('sound', True)
        ale.setBool('display_screen', True)

    ale.loadROM(gamepath)
    ale.setInt("frame_skip", n_frames_to_skip)
    # real actions for breakout are [0,1,3,4]
    real_actions = ale.getMinimalActionSet()

    # use a list of actions [0,1,2,3] to index into the array of real actions
    actions = np.arange(len(real_actions))

    # these theano variables are used to define the symbolic input of the network
    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')

    # load weights by file name
    # currently must be loaded by individual hidden layers
    if load_weights:
        hidden_layer_1 = file_utils.load_model('weights/hidden0_replay.pkl')
        hidden_layer_2 = file_utils.load_model('weights/hidden1_replay.pkl')
    else:
        # defining the hidden layer network structure
        # the n_hid of a prior layer must equal the n_vis of a subsequent layer
        # for q-learning the output layer must be of len(actions)
        hidden_layer_1 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, 
            n_hid=NNET_INPUT_DIMENSION, layer_name='hidden1', activation='relu')
        hidden_layer_2 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, 
            n_hid=NNET_INPUT_DIMENSION, layer_name='hidden2', activation='relu')
    hidden_layer_3 = HiddenLayer(n_vis=NNET_INPUT_DIMENSION, 
            n_hid=len(actions), layer_name='hidden3', activation='relu') 
    # the output layer is currently necessary when using tanh units in the
    # hidden layer in order to prevent a theano warning
    # currently the relu unit setting of the hidden and output layers is leaky w/ alpha=0.01
    output_layer = OutputLayer(layer_name='output', activation='relu')

    # pass a list of layers to the constructor of the network (here called "mlp")
    layers = [hidden_layer_1, hidden_layer_2, hidden_layer_3, output_layer]
    qnetwork = QNetwork(layers, discount=discount, learning_rate=learning_rate)

    # this call gets the symbolic output of the network
    # along with the parameter updates expected
    loss, updates = qnetwork.get_loss_and_updates(features, action, reward, next_features)

    # this defines the theano symbolic function used to train the network
    # 1st argument is a list of inputs, here the symbolic variables above
    # 2nd argument is the symbolic output expected
    # 3rd argument is the dictionary of parameter updates
    # 4th argument is the compilation mode
    train_model = theano.function(
                    [theano.Param(features, default=np.zeros(NNET_INPUT_DIMENSION)),
                    theano.Param(action, default=0),
                    theano.Param(reward, default=0),
                    theano.Param(next_features, default=np.zeros(NNET_INPUT_DIMENSION))],
                    outputs=loss,
                    updates=updates,
                    mode='FAST_RUN')

    sym_action = qnetwork.get_action(features)
    get_action = theano.function([features], sym_action)

    # some containers for collecting information about the training processes 
    rewards = []
    losses = []
    best_reward = 4
    sequence_examples = []
    sampled_examples = []

    # the preprocessor and feature extractor to use
    preprocessor = screen_utils.RGBScreenPreprocessor()
    feature_extractor = feature_extractors.NNetOpenCVBoundingBoxExtractor(max_features=MAX_FEATURES)

    if use_replay_mem:
        replay_mem = ReplayMemory()
    # main training loop, each episode is a full playthrough of the game
    for episode in xrange(n_episodes):

        # this implements the frozen target component of the network
        # by setting the frozen layers of the network to a copy of the current layers
        if episode % frozen_target_update_period == 0:
            qnetwork.frozen_layers = copy.deepcopy(qnetwork.layers)


        # some variables for collecting information about this particular run of the game
        total_reward = 0
        action = 1
        counter = 0
        reward = 0
        loss = 0
        previous_param_0 = None

        # lives is used for a reward-shaping heuristic: subtract 1 from the reward
        # when a life is lost. The original author notes this heuristic may not
        # actually be helpful.
        lives = ale.lives()

        # the initial state of the screen and state
        screen = np.zeros((preprocessor.dim, preprocessor.dim, preprocessor.channels))
        state = { "screen" : screen, "objects" : None, "prev_objects": None, "features": np.zeros(MAX_FEATURES)}
        
        # start the actual play through of the game
        while not ale.game_over():
            counter += 1

            # get the current features, which is the representation of the state provided to 
            # the "agent" (here just the network directly)
            features = state["features"]

            # epsilon greedy action selection (note that exploration_prob is reduced by
            # reduce_exploration_prob_amount after every game)
            if random.random() < exploration_prob: 
                action = random.choice(actions)
            else:
                # to choose an action from the network, we fprop 
                # the current state and take the argmax of the output
                # layer (i.e., the action that corresponds to the 
                # maximum q value)
                action = get_action(features)

            # take the action and receive the reward
            reward += ale.act(real_actions[action])

            # reward-shaping heuristic: penalize losing a life (may not be helpful)
            if ale.lives() < lives: 
                 lives = ale.lives()
                 reward -= 1


            # get the next screen, preprocess it, initialize the next state
            next_screen = ale.getScreenRGB()
            next_screen = preprocessor.preprocess(next_screen)
            next_state = {"screen": next_screen, "objects": None, "prev_objects": state["objects"]}

            # get the features for the next state
            next_features = feature_extractor(next_state, action=None)

            if use_replay_mem:
                sars_tuple = (features, action, reward, next_features)
                replay_mem.store(sars_tuple)
                num_samples = 5 if replay_mem.isFull() else 1
                for i in range(0, num_samples):
                    random_train_tuple = replay_mem.sample()
                    loss += train_model(*random_train_tuple)

                # collect for pca
                sequence_examples.append(list(sars_tuple[0]) + [sars_tuple[1]] \
                         + [sars_tuple[2]] + sars_tuple[3])
                sequence_examples = sequence_examples[-100:]
                sampled_examples.append(list(random_train_tuple[0]) + [random_train_tuple[1]] \
                        + [random_train_tuple[2]] + random_train_tuple[3])
                sampled_examples = sampled_examples[-100:]
            else:
                # call the train model function
                loss += train_model(features, action, reward, next_features)
            # prepare for the next loop through the game
            next_state["features"] = next_features
            state = next_state
                
            # print training information periodically; the period is chosen so it does
            # not interact with any other counter-based loop that might be added later
            if verbose and counter % PRINT_TRAINING_INFO_PERIOD == 0:
                print('*' * 15 + ' training information ' + '*' * 15) 
                print('episode: {}'.format(episode))
                print('reward: \t{}'.format(reward))
                print('avg reward: \t{}'.format(np.mean(rewards)))
                print('avg reward (last 25): \t{}'.format(np.mean(rewards[-NUM_EPISODES_AVERAGE_REWARD_OVER:])))
                print('action: \t{}'.format(real_actions[action]))
                print('exploration prob: {}'.format(exploration_prob))
                
                param_info = [(p.eval(), p.name) for p in qnetwork.get_params()]
                for index, (val, name) in enumerate(param_info):
                    if previous_param_0 is None and index == 0:
                        previous_param_0 = val
                    print('parameter {} value: \n{}'.format(name, val))
                    if index == 0:
                        diff = val - previous_param_0
                        print('difference from previous param {}: \n{}'.format(name, diff))

                print('features: \t{}'.format(features))
                print('next_features: \t{}'.format(next_features))

                scaled_sequence = preprocessing.scale(np.array(sequence_examples))
                scaled_sampled = preprocessing.scale(np.array(sampled_examples))
                pca = PCA()
                _ = pca.fit_transform(scaled_sequence)
                print('variance explained by first component for sequence: {}%'.format(
                    pca.explained_variance_ratio_[0] * 100))
                _ = pca.fit_transform(scaled_sampled)
                print('variance explained by first component for sampled: {}%'.format(
                    pca.explained_variance_ratio_[0] * 100))

                print('*' * 52)
                print('\n')

            # collect info and total reward and also reset the reward to 0 if we reach this point
            total_reward += reward
            reward = 0
        # collect stats from this game run    
        losses.append(loss)
        rewards.append(total_reward)
    
        # if we got a best reward, inform the user 
        if total_reward > best_reward:
            best_reward = total_reward
            print("best reward!: {}".format(total_reward))

        # record the weights if record_weights=True
        # must record the weights of the individual layers
        # only save hidden layers b/c output layer does not have weights
        if episode != 0 and episode % RECORD_WEIGHTS_PERIOD == 0 and record_weights:
            file_utils.save_rewards(rewards)
            file_utils.save_model(qnetwork.layers[0], 'weights/hidden0_{}.pkl'.format(episode))
            file_utils.save_model(qnetwork.layers[1], 'weights/hidden1_{}.pkl'.format(episode))

        # reduce exploration policy over time
        if exploration_prob > MINIMUM_EXPLORATION_EPSILON:
            exploration_prob -= reduce_exploration_prob_amount
        
        # inform user of how the episode went and reset the game
        print('episode: {} ended with score: {}\tloss: {}'.format(episode, rewards[-1], losses[-1]))
        ale.reset_game()

    # return the list of rewards attained
    return rewards
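
The training loop above assumes a ReplayMemory exposing store(), sample(), and isFull(); a minimal sketch of such a buffer (the capacity default is an assumption, not from the source):

import random

class ReplayMemory(object):
    """Minimal ring-buffer replay memory with uniform sampling."""

    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def store(self, sars_tuple):
        # Append until full, then overwrite the oldest entry.
        if len(self.memory) < self.capacity:
            self.memory.append(sars_tuple)
        else:
            self.memory[self.position] = sars_tuple
        self.position = (self.position + 1) % self.capacity

    def sample(self):
        # Uniformly sample a single stored transition.
        return random.choice(self.memory)

    def isFull(self):
        return len(self.memory) >= self.capacity
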
Example #11
def simulate_symbolic_online_RL_algorithm(mdp, num_episodes, max_iterations):
    
    real_actions = mdp.actions(None)
    actions = np.arange(len(real_actions))

    # these theano variables are used to define the symbolic input of the network
    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')
    learning_rate_symbol = T.dscalar('learning_rate')

    h1 = HiddenLayer(n_vis=INPUT_DIM,  n_hid=HIDDEN_DIM, layer_name='h1')
    h2 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h2')
    h3 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h3')
    h4 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=OUTPUT_DIM, layer_name='h4')
    # h5 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h5')
    # h6 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=OUTPUT_DIM, layer_name='h6')
        
    layers = [h1, h2, h3, h4] #, h3, h4, h5, h6]
    learning_rate = 1e-2
    explorationProb = .4
    regularization_weight = 1e-5
    momentum_rate = 9e-1
    qnetwork = QNetwork(layers, discount=mdp.discount,  
                            momentum_rate=momentum_rate,
                            regularization_weight=regularization_weight)

    exploration_reduction = (explorationProb - MIN_EXPLORATION_PROB) / num_episodes
    learning_rate_reduction = (learning_rate - MIN_LEARNING_RATE) / num_episodes

    # this call gets the symbolic output of the network along with the parameter updates
    loss, updates = qnetwork.get_loss_and_updates(features, action, reward, next_features, learning_rate_symbol)

    print('Building Training Function...')
    # this defines the theano symbolic function used to train the network
    # 1st argument is a list of inputs, here the symbolic variables above
    # 2nd argument is the symbolic output expected
    # 3rd argument is the dictionary of parameter updates
    # 4th argument is the compilation mode
    train_model = theano.function(
                    [theano.Param(features, default=np.zeros(INPUT_DIM)),
                    theano.Param(action, default=0),
                    theano.Param(reward, default=0),
                    theano.Param(next_features, default=np.zeros(HIDDEN_DIM)),
                    learning_rate_symbol],
                    outputs=loss,
                    updates=updates,
                    mode='FAST_RUN')

    get_action = theano.function([features], qnetwork.get_action(features))

    total_rewards = []
    total_losses = []
    weight_magnitudes = []

    print('Starting Training...')
    replay_mem = replay_memory.ReplayMemory()
    for episode in xrange(num_episodes):
        
        state = np.array(mdp.start_state)
        total_reward = 0
        total_loss = 0
        for iteration in xrange(max_iterations):
            
            if random.random() < explorationProb:
                action = random.choice(actions)
            else:
                action = get_action(state)

            real_action = real_actions[action]
            transitions = mdp.succAndProbReward(state, real_action)

            if len(transitions) == 0:
                # loss += train_model(state, action, 0, next_features)
                break

            # Choose a random transition
            i = sample([prob for newState, prob, reward in transitions])
            newState, prob, reward = transitions[i]
            newState = np.array(newState)

            sars_tuple = (state, action, np.clip(reward,-1,1), newState)
            replay_mem.store(sars_tuple)
            num_samples = 5 if replay_mem.isFull() else 1
            for i in range(0, num_samples):
                random_train_tuple = replay_mem.sample()
                sample_state = random_train_tuple[0]
                sample_action = random_train_tuple[1]
                sample_reward = random_train_tuple[2]
                sample_new_state = random_train_tuple[3]

                total_loss += train_model(sample_state, sample_action, sample_reward, sample_new_state, learning_rate)

            total_reward += reward
            state = newState

        explorationProb -= exploration_reduction
        learning_rate -= learning_rate_reduction


        total_rewards.append(total_reward * mdp.discount ** iteration)
        total_losses.append(total_loss)
        weight_magnitude = qnetwork.get_weight_magnitude()
        weight_magnitudes.append(weight_magnitude)

        print('episode: {}\t\t loss: {}\t\t reward: {}\t\tweight magnitude: {}'.format(episode, round(total_loss, 2), total_reward, weight_magnitude))

    # return the list of rewards attained
    return total_rewards, total_losses
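
Both this function and Example #10 call a sample() helper that draws an index from a list of transition probabilities; a hedged sketch of such a helper:

import random

def sample(probs):
    """Return index i with probability probs[i]; probs are assumed to sum to (roughly) one."""
    target = random.random()
    accum = 0.0
    for i, p in enumerate(probs):
        accum += p
        if accum >= target:
            return i
    return len(probs) - 1
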
Example #12
    def __init__(
            self,
            env: 'Environment',
            input_frame: ('int: the number of channels of input image'),
            input_dim: (
                'int: the width and height of pre-processed input image'),
            num_frames: ('int: Total number of frames'),
            skipped_frame: ('int: The number of skipped frames'),
            eps_decay: ('float: Epsilon Decay_rate'),
            gamma: ('float: Discount Factor'),
            target_update_freq: ('int: Target Update Frequency (by frames)'),
            update_type: (
                'str: Update type for target network. Hard or Soft') = 'hard',
            soft_update_tau: ('float: Soft update ratio') = None,
            batch_size: ('int: Update batch size') = 32,
            buffer_size: ('int: Replay buffer size') = 1000000,
            update_start_buffer_size: (
                'int: Update starting buffer size') = 50000,
            learning_rate: ('float: Learning rate') = 0.0004,
            eps_min: ('float: Epsilon Min') = 0.1,
            eps_max: ('float: Epsilon Max') = 1.0,
            device_num: ('int: GPU device number') = 0,
            rand_seed: ('int: Random seed') = None,
            plot_option: ('str: Plotting option') = False,
            model_path: ('str: Model saving path') = './'):

        self.action_dim = env.action_space.n
        self.device = torch.device(
            f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path

        self.env = env
        self.input_frames = input_frame
        self.input_dim = input_dim
        self.num_frames = num_frames
        self.skipped_frame = skipped_frame
        self.epsilon = eps_max
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.update_cnt = 0
        self.update_type = update_type
        self.tau = soft_update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.update_start = update_start_buffer_size
        self.seed = rand_seed
        self.plot_option = plot_option

        self.q_current = QNetwork(
            (self.input_frames, self.input_dim, self.input_dim),
            self.action_dim).to(self.device)
        self.q_target = QNetwork(
            (self.input_frames, self.input_dim, self.input_dim),
            self.action_dim).to(self.device)
        # self.q_current.load_state_dict(torch.load("/home/ubuntu/playground/MacaronRL_prev/Value_Based/DuelingDQN/model_save/890_BreakoutNoFrameskip-v4_num_f:10000000_eps_dec:8.3e-07f_gamma:0.99_tar_up_frq:150f_up_type:hard_soft_tau:0.002f_batch:32_buffer:750000f_up_start:50000_lr:0.0001f_eps_min:0.1_device:0_rand:None_0/2913941_Score:37.6.pt"))
        # print("load completed.")
        self.q_target.load_state_dict(self.q_current.state_dict())
        self.q_target.eval()
        self.optimizer = optim.Adam(self.q_current.parameters(),
                                    lr=learning_rate)

        self.memory = ReplayBuffer(
            self.buffer_size,
            (self.input_frames, self.input_dim, self.input_dim),
            self.batch_size)
Example #13
def train(game,
          num_steps=60000000,
          lr=0.00025,
          gamma=0.99,
          C=20000,
          batch_size=32):

    env = wrappers.wrap(gym.make(GAMES[game]))
    num_actions = env.action_space.n

    Q1 = QNetwork(num_actions)
    Q2 = QNetwork(num_actions)
    Q2.load_state_dict(Q1.state_dict())

    if torch.cuda.is_available():
        Q1.cuda()
        Q2.cuda()

    epsilon = Epsilon(1, 0.05, 1000000)
    optimizer = torch.optim.Adam(Q1.parameters(), lr=lr)
    optimizer.zero_grad()

    state1 = env.reset()

    t, last_t, loss, episode, score = 0, 0, 0, 0, 0
    last_ts, scores = datetime.now(), collections.deque(maxlen=100)

    while t < num_steps:

        qvalues = Q1(state1)
        if random() < epsilon(t):
            action = env.action_space.sample()
        else:
            action = qvalues.data.max(dim=1)[1][0]

        q = qvalues[0][action]

        state2, reward, done, _info = env.step(action)
        score += reward

        if not done:
            y = gamma * Q2(state2).detach().max(dim=1)[0][0] + reward
            state1 = state2
        else:
            reward = FloatTensor([reward])
            y = torch.autograd.Variable(reward, requires_grad=False)
            state1 = env.reset()
            scores.append(score)
            score = 0
            episode += 1

        loss += torch.nn.functional.smooth_l1_loss(q, y)

        t += 1

        if done or t % batch_size == 0:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss = 0

        if t % C == 0:
            Q2.load_state_dict(Q1.state_dict())
            torch.save(Q1.state_dict(), 'qlearning_{}.pt'.format(game))

        if t % 1000 == 0:
            ts = datetime.now()
            datestr = ts.strftime('%Y-%m-%dT%H:%M:%S.%f')
            avg = mean(scores) if scores else float('nan')
            steps_per_sec = (t - last_t) / (ts - last_ts).total_seconds()
            l = '{} step {} episode {} avg last 100 scores: {:.2f} ε: {:.2f}, steps/s: {:.0f}'
            print(l.format(datestr, t, episode, avg, epsilon(t),
                           steps_per_sec))
            last_t, last_ts = t, ts
Example #14
def train(env_name,
          arch,
          timesteps=1,
          init_timesteps=0,
          seed=42,
          er_capacity=1,
          epsilon_start=1.0,
          epsilon_stop=0.05,
          epsilon_decay_stop=1,
          batch_size=16,
          target_sync=16,
          lr=1e-3,
          gamma=1.0,
          dueling=False,
          play_steps=1,
          lr_steps=1e4,
          lr_gamma=0.99,
          save_steps=5e4,
          logger=None,
          experiment_name='test'):
    """
        Main training function. Calls the subprocesses to get experience and
        train the network.
    """

    # Casting params which are expressible in scientific notation
    def int_scientific(x):
        return int(float(x))

    timesteps, init_timesteps = map(int_scientific,
                                    [timesteps, init_timesteps])
    lr_steps, epsilon_decay_stop = map(int_scientific,
                                       [lr_steps, epsilon_decay_stop])
    er_capacity, target_sync, save_steps = map(
        int_scientific, [er_capacity, target_sync, save_steps])
    lr = float(lr)

    # Multiprocessing method
    mp.set_start_method('spawn')

    # Get PyTorch device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the Q network
    _env = make_env(env_name, seed)
    net = QNetwork(_env.observation_space,
                   _env.action_space,
                   arch=arch,
                   dueling=dueling).to(device)
    # Create the target network as a copy of the Q network
    tgt_net = ptan.agent.TargetNet(net)
    # Create buffer and optimizer
    buffer = ptan.experience.ExperienceReplayBuffer(experience_source=None,
                                                    buffer_size=er_capacity)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=lr_steps, gamma=lr_gamma)

    # Multiprocessing queue
    epsilon_schedule = (epsilon_start, epsilon_stop, epsilon_decay_stop)
    exp_queue = mp.Queue(maxsize=play_steps * 2)
    play_proc = mp.Process(target=play_func,
                           args=(env_name, net, exp_queue, seed, timesteps,
                                 epsilon_schedule, gamma))
    play_proc.start()

    # Main training loop
    timestep = 0
    while play_proc.is_alive() and timestep < timesteps:
        timestep += play_steps
        # Query the environments and log results if the episode has ended
        for _ in range(play_steps):
            exp, info = exp_queue.get()
            if exp is None:
                play_proc.join()
                break
            buffer._add(exp)
            logger.log_kv('internals/epsilon', info['epsilon'][0],
                          info['epsilon'][1])
            if 'ep_reward' in info.keys():
                logger.log_kv('performance/return', info['ep_reward'],
                              timestep)
                logger.log_kv('performance/length', info['ep_length'],
                              timestep)
                logger.log_kv('performance/speed', info['speed'], timestep)

        # Check if we are in the starting phase
        if len(buffer) < init_timesteps:
            continue

        scheduler.step()
        logger.log_kv('internals/lr', scheduler.get_lr()[0], timestep)
        # Get a batch from experience replay
        optimizer.zero_grad()
        batch = buffer.sample(batch_size * play_steps)
        # Unpack the batch
        states, actions, rewards, dones, next_states = unpack_batch(batch)
        states_v = torch.tensor(states).to(device)
        next_states_v = torch.tensor(next_states).to(device)
        actions_v = torch.tensor(actions).to(device)
        rewards_v = torch.tensor(rewards).to(device)
        done_mask = torch.ByteTensor(dones).to(device)
        # Optimize defining the loss function
        state_action_values = net(states_v).gather(
            1, actions_v.unsqueeze(-1)).squeeze(-1)
        next_state_values = tgt_net.target_model(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        expected_state_action_values = next_state_values.detach(
        ) * gamma + rewards_v
        loss = F.mse_loss(state_action_values, expected_state_action_values)
        logger.log_kv('internals/loss', loss.item(), timestep)
        loss.backward()
        # Clip the gradients to avoid too-abrupt changes (roughly equivalent to a Huber loss)
        for param in net.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()

        # Check if the target network needs to be synced
        if timestep % target_sync == 0:
            tgt_net.sync()

        # Check if we need to save a checkpoint
        if timestep % save_steps == 0:
            torch.save(net.get_extended_state(), experiment_name + '.pth')
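
Clamping every parameter gradient to [-1, 1] after an MSE loss has roughly the effect the comment above describes; the more direct alternative is to compute the Huber (smooth L1) loss on the TD error instead, as in this one-line sketch reusing the tensors defined above:

import torch.nn.functional as F

# Alternative to MSE + gradient clamping: Huber (smooth L1) loss on the TD error.
loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
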
Example #15
def train(env_name, seed=42, timesteps=1, epsilon_decay_last_step=1000,
            er_capacity=1e4, batch_size=16, lr=1e-3, gamma=1.0,  update_target=16,
            exp_name='test', init_timesteps=100, save_every_steps=1e4, arch='nature',
            dueling=False, play_steps=2, n_jobs=2):
    """
        Main training function. Calls the subprocesses to get experience and
        train the network.
    """
    # Multiprocessing method
    mp.set_start_method('spawn')

    # Get PyTorch device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Set random seed for PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Create logger
    logger = Logger(exp_name, loggers=['tensorboard'])

    # Create the Q network
    _env = make_env(env_name, seed)
    net = QNetwork(_env.observation_space, _env.action_space, arch=arch, dueling=dueling).to(device)
    # Create the target network as a copy of the Q network
    target_net = copy.deepcopy(net)
    # Create buffer and optimizer
    buffer = ExperienceReplay(capacity=int(er_capacity))
    optimizer = optim.Adam(net.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=LR_STEPS, gamma=0.99)

    # Multiprocessing queue
    obs_queue = mp.Queue(maxsize=n_jobs)
    transition_queue = mp.Queue(maxsize=n_jobs)
    workers, action_queues = [], []
    for i in range(n_jobs):
        action_queue = mp.Queue(maxsize=1)
        _seed = seed + i * 1000
        play_proc = mp.Process(target=play_func, args=(i, env_name, obs_queue, transition_queue, action_queue, _seed))
        play_proc.start()
        workers.append(play_proc)
        action_queues.append(action_queue)

    # Vars to keep track of performances and time
    timestep = 0
    current_reward, current_len = np.zeros(play_steps), np.zeros(play_steps, dtype=np.int64)
    current_time = [time.time() for _ in range(play_steps)]
    # Training loop
    while timestep < timesteps:
        # Compute the current epsilon
        epsilon = EPSILON_STOP + max(0, (EPSILON_START - EPSILON_STOP)*(epsilon_decay_last_step-timestep)/epsilon_decay_last_step)
        logger.log_kv('internals/epsilon', epsilon, timestep)
        # Gather observation N_STEPS
        ids, obs_batch = zip(*[obs_queue.get() for _ in range(play_steps)])
        # Pre-process observation_batch for PyTorch
        obs_batch = torch.from_numpy(np.array(obs_batch)).to(device)
        # Select greedy action from policy, apply epsilon-greedy selection
        greedy_actions = net(obs_batch).argmax(dim=1).cpu().detach().numpy()
        probs = torch.rand(greedy_actions.shape)
        actions = np.where(probs < epsilon, _env.action_space.sample(), greedy_actions)
        # Send actions
        for id, action in zip(ids, actions):
            action_queues[id].put(action)
        # Add transitions to experience replay
        transitions = [transition_queue.get() for _ in range(play_steps)]
        buffer.pushTransitions(transitions)
        # Check if we need to update rewards, time and lengths
        _, _, _, reward, done, _ = zip(*transitions)
        current_reward += reward
        current_len += 1
        for i, done in enumerate(done):
            if done:
                # Log quantities
                logger.log_kv('performance/return', current_reward[i], timestep)
                logger.log_kv('performance/length', current_len[i], timestep)
                logger.log_kv('performance/speed', current_len[i] / (time.time() - current_time[i]), timestep)
                # Reset counters
                current_reward[i] = 0.0
                current_len[i] = 0
                current_time[i] = time.time()

        # Update number of steps
        timestep += play_steps

        # Check if we are in the warm-up phase, otherwise go on with policy update
        if timestep < init_timesteps:
            continue
        # Learning rate update and log
        scheduler.step()
        logger.log_kv('internals/lr', scheduler.get_lr()[0], timestep)
        # Clear grads
        optimizer.zero_grad()
        # Get a batch from experience replay
        batch = buffer.sampleTransitions(batch_size)
        def batch_preprocess(batch_item):
            return torch.tensor(batch_item, dtype=(torch.long if isinstance(batch_item[0], np.int64) else None)).to(device)
        ids, states_batch, actions_batch, rewards_batch, done_batch, next_states_batch = map(batch_preprocess, zip(*batch))
        # Compute the loss function
        state_action_values = net(states_batch).gather(1, actions_batch.unsqueeze(-1)).squeeze(-1)
        next_state_values = target_net(next_states_batch).max(1)[0]
        next_state_values[done_batch] = 0.0
        expected_state_action_values = next_state_values.detach() * gamma + rewards_batch
        loss = F.mse_loss(state_action_values, expected_state_action_values)
        logger.log_kv('internals/loss', loss.item(), timestep)
        loss.backward()
        # Clip the gradients to avoid too-abrupt changes (roughly equivalent to a Huber loss)
        for param in net.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()

        if timestep % update_target == 0:
            target_net.load_state_dict(net.state_dict())

        # Check if we need to save a checkpoint
        if timestep % save_every_steps == 0:
            torch.save(net.get_extended_state(), exp_name + '.pth')

    # Ending
    for i, worker in enumerate(workers):
        action_queues[i].put(None)
        worker.join()
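
The epsilon expression in the loop above decays linearly from EPSILON_START to EPSILON_STOP over epsilon_decay_last_step steps and then stays flat; the same schedule as a standalone sketch (the constant values are illustrative):

EPSILON_START, EPSILON_STOP = 1.0, 0.02

def linear_epsilon(timestep, decay_last_step):
    """Linear decay from EPSILON_START to EPSILON_STOP, then constant at EPSILON_STOP."""
    frac = max(0.0, float(decay_last_step - timestep) / decay_last_step)
    return EPSILON_STOP + (EPSILON_START - EPSILON_STOP) * frac
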
Example #16
                    type=int,
                    help='Number of Episodes the Agent should play')

if __name__ == "__main__":
    args = vars(parser.parse_args())

    env = UnityEnvironment(file_name="Banana_Linux/Banana.x86")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    action_size = brain.vector_action_space_size
    state = env_info.vector_observations[0]
    state_size = len(state)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    q_policy = QNetwork(len(env_info.vector_observations[0]),
                        brain.vector_action_space_size, seed).to(device)
    q_policy.load_state_dict(torch.load('checkpoint.pth'))

    for i in range(args['episodes']):
        env_info = env.reset(train_mode=False)[brain_name]
        state = torch.tensor(
            env_info.vector_observations[0]).float().to(device)
        score = 0
        while True:
            action_values = q_policy(state.unsqueeze(0))
            action = action_values.max(1)[1]
            env_info = env.step(action.item())[brain_name]

            reward = env_info.rewards[0]
            score += reward
            done = float(env_info.local_done[0])
Example #17
class DQN_Agent():
    """ Interacts with and learns from the environment. """
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 GAMMA=GAMMA,
                 TAU=TAU,
                 LR=LR,
                 UPDATE_EVERY=UPDATE_EVERY,
                 BUFFER_SIZE=BUFFER_SIZE,
                 BATCH_SIZE=BATCH_SIZE):
        """ Initialize the agent.
        ==========
        PARAMETERS 
        ==========
            state_size (int) = observation dimension of the environment
            action_size (int) = dimension of each action
            seed (int) = random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.gamma = GAMMA
        self.tau = TAU
        self.lr = LR
        self.update_every = UPDATE_EVERY
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # instantiate online local and target network for weight updates
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        # create a replay buffer
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # time step counter; learn (and soft-update the target) every update_every steps
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        ''' Append a SARS sequence to memory, then every update_every steps learn from experiences'''
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # in case enough samples are available in internal memory, sample and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """ Choose action from an epsilon-greedy policy
        ==========
        PARAMETERS
        ==========
            state (array) = current state space
            eps (float) = epsilon, for epsilon-greedy action choice """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """ Update the value parameters using experience tuples sampled from ReplayBuffer
        ==========
        PARAMETERS
        ==========
          experiences = Tuple of torch.Variable: SARS', done
          gamma (float) = discount factor to weight rewards
        """

        states, actions, rewards, next_states, dones = experiences

        # calculate max predicted Q values for the next states using target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # calculate expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # compute MSE Loss
        loss = F.mse_loss(Q_expected, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update for model parameters, every update steps as defined above
        theta_target = tau * theta_local + (1-tau)*theta_target 

        ==========
        PARAMETERS 
        ==========
          local_model, target_model = PyTorch Models, weights will be copied from-to
          tau = interpolation parameter, type=float 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
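
A hedged sketch of driving this agent in a classic gym control task; the environment id, episode count, and epsilon value are placeholders, and the module-level constants GAMMA, TAU, LR, UPDATE_EVERY, BUFFER_SIZE, and BATCH_SIZE are assumed to be defined elsewhere:

import gym

env = gym.make('CartPole-v1')
agent = DQN_Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0)

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action, then store the transition and (periodically) learn.
        action = agent.act(state, eps=0.05)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
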
Example #18
def simulate_symbolic_online_RL_algorithm(mdp, num_episodes, max_iterations):

    real_actions = mdp.actions(None)
    actions = np.arange(len(real_actions))

    # these theano variables are used to define the symbolic input of the network
    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')
    learning_rate_symbol = T.dscalar('learning_rate')

    h1 = HiddenLayer(n_vis=INPUT_DIM, n_hid=HIDDEN_DIM, layer_name='h1')
    h2 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h2')
    h3 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h3')
    h4 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=OUTPUT_DIM, layer_name='h4')
    # h5 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=HIDDEN_DIM, layer_name='h5')
    # h6 = HiddenLayer(n_vis=HIDDEN_DIM, n_hid=OUTPUT_DIM, layer_name='h6')

    layers = [h1, h2, h3, h4]  #, h3, h4, h5, h6]
    learning_rate = 1e-2
    explorationProb = .4
    regularization_weight = 1e-5
    momentum_rate = 9e-1
    qnetwork = QNetwork(layers,
                        discount=mdp.discount,
                        momentum_rate=momentum_rate,
                        regularization_weight=regularization_weight)

    exploration_reduction = (explorationProb -
                             MIN_EXPLORATION_PROB) / num_episodes
    learning_rate_reduction = (learning_rate -
                               MIN_LEARNING_RATE) / num_episodes

    # this call gets the symbolic output of the network along with the parameter updates
    loss, updates = qnetwork.get_loss_and_updates(features, action, reward,
                                                  next_features,
                                                  learning_rate_symbol)

    print('Building Training Function...')
    # this defines the theano symbolic function used to train the network
    # 1st argument is a list of inputs, here the symbolic variables above
    # 2nd argument is the symbolic output expected
    # 3rd argument is the dictionary of parameter updates
    # 4th argument is the compilation mode
    train_model = theano.function([
        theano.Param(features, default=np.zeros(INPUT_DIM)),
        theano.Param(action, default=0),
        theano.Param(reward, default=0),
        theano.Param(next_features, default=np.zeros(HIDDEN_DIM)),
        learning_rate_symbol
    ],
                                  outputs=loss,
                                  updates=updates,
                                  mode='FAST_RUN')

    get_action = theano.function([features], qnetwork.get_action(features))

    total_rewards = []
    total_losses = []
    weight_magnitudes = []

    print('Starting Training...')
    replay_mem = replay_memory.ReplayMemory()
    for episode in xrange(num_episodes):

        state = np.array(mdp.start_state)
        total_reward = 0
        total_loss = 0
        for iteration in xrange(max_iterations):

            if random.random() < explorationProb:
                action = random.choice(actions)
            else:
                action = get_action(state)

            real_action = real_actions[action]
            transitions = mdp.succAndProbReward(state, real_action)

            if len(transitions) == 0:
                # loss += train_model(state, action, 0, next_features)
                break

            # Choose a random transition
            i = sample([prob for newState, prob, reward in transitions])
            newState, prob, reward = transitions[i]
            newState = np.array(newState)

            sars_tuple = (state, action, np.clip(reward, -1, 1), newState)
            replay_mem.store(sars_tuple)
            num_samples = 5 if replay_mem.isFull() else 1
            for i in range(0, num_samples):
                random_train_tuple = replay_mem.sample()
                sample_state = random_train_tuple[0]
                sample_action = random_train_tuple[1]
                sample_reward = random_train_tuple[2]
                sample_new_state = random_train_tuple[3]

                total_loss += train_model(sample_state, sample_action,
                                          sample_reward, sample_new_state,
                                          learning_rate)

            total_reward += reward
            state = newState

        explorationProb -= exploration_reduction
        learning_rate -= learning_rate_reduction

        total_rewards.append(total_reward * mdp.discount**iteration)
        total_losses.append(total_loss)
        weight_magnitude = qnetwork.get_weight_magnitude()
        weight_magnitudes.append(weight_magnitude)

        print('episode: {}\t\t loss: {}\t\t reward: {}\t\tweight magnitude: {}'.format(
            episode, round(total_loss, 2), total_reward, weight_magnitude))

    # return the list of rewards attained
    return total_rewards, total_losses
Example #19
    def __init__(self,
                 config,
                 num_actions,
                 width,
                 height,
                 num_channels,
                 memory_size,
                 load_model=None,
                 target_network_update_tau=None):
        self.graph = tf.Graph()
        self.session = tf.Session(graph=self.graph)

        self.num_actions = num_actions
        self.width = width
        self.height = height
        self.num_channels = num_channels
        self.memory_size = memory_size
        self.target_network_update_tau = target_network_update_tau

        layers = config["layers"]
        self.clip_max = config["clip_max"]
        self.clip_grad_norm = None
        if "clip_grad_norm" in config:
            self.clip_grad_norm = config["clip_grad_norm"]
            print("Clipping gradient norm to {}".format(self.clip_grad_norm))
        self.lr = config["learning_rate"]
        self.rms_decay = config["rms_decay"]
        self.gamma = config["gamma"]
        self.loss_type = config["loss"]
        self.optimizer_type = config["optimizer"]

        #placeholders
        with self.graph.as_default():
            self.state_train_placeholder = tf.placeholder(
                tf.float32, [None, self.width, self.height, self.num_channels])
            self.state_target_placeholder = tf.placeholder(
                tf.float32,
                shape=[None, self.width, self.height, self.num_channels])
            self.action_index_placeholder = tf.placeholder(tf.int32,
                                                           shape=[None])
            self.reward_placeholder = tf.placeholder(tf.float32,
                                                     shape=[None, 1])
            self.terminal_placeholder = tf.placeholder(tf.float32,
                                                       shape=[None, 1])
            self.beta_placeholder = tf.placeholder(tf.float32, shape=[])
            self.p_placeholder = tf.placeholder(tf.float32, shape=[None, 1])

        #create q networks
        self.train_network = QNetwork(layers,
                                      "train",
                                      self.graph,
                                      self.num_actions,
                                      self.state_train_placeholder,
                                      trainable=True)
        self.target_network = QNetwork(layers,
                                       "target",
                                       self.graph,
                                       self.num_actions,
                                       self.state_target_placeholder,
                                       trainable=False)
        self.add_training_ops()
        self.create_target_update_operations(
            tau=self.target_network_update_tau)
        self.add_saver()

        #load variables from file
        if load_model is not None:
            self.load_model(load_model)
            self.variables_initialized = True
        #initialize variables
        else:
            with self.graph.as_default():
                self.init_op = tf.global_variables_initializer()
            self.run_operations(self.init_op)
            self.update_target_network()
            self.variables_initialized = True
def test_nnet_numberline_mdp(n_episodes,  exploration_prob=0.9, learning_rate=.0005, target_freeze_period=500):

    reduce_explore = 0.0001
    size = 20.
    mdp = GridSearchMDP(size)
    actions = mdp.actions(mdp.startState())

    print actions

    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')

    n_vis = 2 # for chain mdp
    hidden_layer_1 = HiddenLayer(n_vis=n_vis, n_hid=len(actions), layer_name='hidden', activation='tanh')
    output_layer = OutputLayer(layer_name='out', activation='relu')
    layers = [hidden_layer_1, output_layer]
    mlp = QNetwork(layers, discount=mdp.discount(), learning_rate=learning_rate)
    loss, updates = mlp.get_loss_and_updates(features, action, reward, next_features)

    train_model = theano.function(
                    [theano.Param(features, default=np.zeros(MAX_FEATURES_TEST)),
                    theano.Param(action, default=0),
                    theano.Param(reward, default=0),
                    theano.Param(next_features, default=np.zeros(MAX_FEATURES_TEST))],
                    outputs=loss,
                    updates=updates,
                    mode='FAST_RUN')

    rewards = []
    counter = 0
    for episode in xrange(n_episodes):
        curDiscount = mdp.discount()
        totalReward = 0
        cur_state = mdp.startState()
        print cur_state
        while not mdp.isEnd(cur_state):
            counter += 1
            if counter % 1000 == 0:
                mlp.frozen_layers = copy.deepcopy(mlp.layers)

            if (counter % 100 == 0):
                print 'cur_state: {}'.format(cur_state)

            if random.random() < exploration_prob:
                action = random.choice(actions)
                action_index = actions.index(action)
            else:
                action_index = T.argmax(mlp.fprop([cur_state[0]/mdp.n, cur_state[1]/mdp.n])).eval()
                action = actions[action_index]
                if (counter % 100 == 0):
                    print 'action: {}'.format(action)
            # realAction = action
            # if action == 0: realAction = -1
            transitions = mdp.succAndProbReward(cur_state, action) # previously realAction)
            if len(transitions) == 0:
                break
            # Choose a random transition
            i = sample([prob for newState, prob, reward in transitions])
            newState, prob, reward = transitions[i]
            #print 'newState: {}'.format(newState)
            #print 'reward: {}'.format(reward)
            #print [(p.eval(), p.name) for p in mlp.get_params()]
            #print [(p.eval(), p.name) for p in mlp.get_params(freeze=True)]
            #print '\n'
            reward *= curDiscount
            totalReward += reward
            curDiscount *= mdp.discount()
            loss = train_model([cur_state[0]/mdp.n, cur_state[1]/mdp.n], action_index, reward, [newState[0]/mdp.n, newState[1]/mdp.n]) # previously action
            cur_state = newState
            exploration_prob -= reduce_explore
            if (exploration_prob < 0.25):
                exploration_prob = 0.25
        rewards.append(totalReward)

        print('*' * 30)
        print('episode: {} ended with score: {}'.format(episode, rewards[-1]))
        print('avg reward: {}'.format(np.mean(rewards[-25:])))
        print('explore: {}'.format(exploration_prob))
        print('*' * 30)
        print('\n')
        
    return rewards
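# A small check (hypothetical rewards, not taken from the function above) of the
# running-discount bookkeeping it uses: multiplying each reward by curDiscount and
# then scaling curDiscount by the MDP discount accumulates sum_t gamma**(t + 1) * r_t.
gamma = 0.9
rewards_seq = [1.0, 2.0, 3.0]
cur_discount, total = gamma, 0.0
for r in rewards_seq:
    total += r * cur_discount   # reward at step t weighted by gamma**(t + 1)
    cur_discount *= gamma
print(round(total, 4))  # 0.9 + 1.62 + 2.187 = 4.707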
Example #21
0
class DQN(object):
    def __init__(self,state_space,action_space,seed,update_every,batch_size,buffer_size,learning_rate):
        self.action_space = action_space
        self.state_space = state_space
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.update_every = update_every
        
        self.qnetwork_local = QNetwork(state_space,action_space)
        self.qnetwork_target = QNetwork(state_space,action_space)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),lr=learning_rate)
        # Initialize replaybuffer
        self.memory = ReplayBuffer(action_space, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
    def step(self,state,action,reward,next_state,done,GAMMA):
        # Save the experience
        self.memory.add_experience(state,action,reward,next_state,done)
        
        # learn from the experience
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences,GAMMA)
        
    def act(self,state,eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_space))
        
    def learn(self,experiences,GAMMA):
        
        states,actions,rewards,next_states,dones = experiences
        
        target_values = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        targets = rewards + (GAMMA * target_values * (1 - dones))
        action_values = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(action_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(TAU)
        
    def soft_update(self,tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for local_param, target_param in zip(self.qnetwork_local.parameters(),
                                             self.qnetwork_target.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
#         self.qnetwork_local.parameters() = TAU*self.qnetwork_local.parameters() + (1-TAU)*self.qnetwork_target.parameters()
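# A minimal standalone sketch (hypothetical parameter values, not part of the class
# above) checking the soft-update rule quoted in the docstring,
# theta_target = tau * theta_local + (1 - tau) * theta_target:
# the target parameters are the ones that get overwritten.
import torch

local_param = torch.tensor([1.0, 2.0])
target_param = torch.tensor([0.0, 0.0])
tau = 0.1
target_param.copy_(tau * local_param + (1.0 - tau) * target_param)
print(target_param)  # tensor([0.1000, 0.2000])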
Example #22
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0, training_mode=True):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        if training_mode is True:
            self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            action = np.argmax(action_values.cpu().data.numpy())
        else:
            action = random.choice(np.arange(self.action_size))

        action = np.int32(action)
        return action

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
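# A small numeric check (hypothetical values, not from the class above) of the
# target used in learn(): Q_targets = r + gamma * max_a Q_target(s', a) * (1 - done).
# The (1 - done) factor removes the bootstrap term on terminal transitions.
import torch

rewards = torch.tensor([[1.0], [1.0]])
q_next_max = torch.tensor([[5.0], [5.0]])
dones = torch.tensor([[0.0], [1.0]])   # second transition is terminal
gamma = 0.99
print(rewards + gamma * q_next_max * (1 - dones))  # tensor([[5.9500], [1.0000]])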
def test_nnet_numberline_mdp(n_episodes,
                             exploration_prob=0.9,
                             learning_rate=.0005,
                             target_freeze_period=500):

    reduce_explore = 0.0001
    size = 20.
    mdp = GridSearchMDP(size)
    actions = mdp.actions(mdp.startState())

    print actions

    features = T.dvector('features')
    action = T.lscalar('action')
    reward = T.dscalar('reward')
    next_features = T.dvector('next_features')

    n_vis = 2  # for chain mdp
    hidden_layer_1 = HiddenLayer(n_vis=n_vis,
                                 n_hid=len(actions),
                                 layer_name='hidden',
                                 activation='tanh')
    output_layer = OutputLayer(layer_name='out', activation='relu')
    layers = [hidden_layer_1, output_layer]
    mlp = QNetwork(layers,
                   discount=mdp.discount(),
                   learning_rate=learning_rate)
    loss, updates = mlp.get_loss_and_updates(features, action, reward,
                                             next_features)

    train_model = theano.function([
        theano.Param(features, default=np.zeros(MAX_FEATURES_TEST)),
        theano.Param(action, default=0),
        theano.Param(reward, default=0),
        theano.Param(next_features, default=np.zeros(MAX_FEATURES_TEST))
    ],
                                  outputs=loss,
                                  updates=updates,
                                  mode='FAST_RUN')

    rewards = []
    counter = 0
    for episode in xrange(n_episodes):
        curDiscount = mdp.discount()
        totalReward = 0
        cur_state = mdp.startState()
        print cur_state
        while not mdp.isEnd(cur_state):
            counter += 1
            if counter % 1000 == 0:
                mlp.frozen_layers = copy.deepcopy(mlp.layers)

            if (counter % 100 == 0):
                print 'cur_state: {}'.format(cur_state)

            if random.random() < exploration_prob:
                action = random.choice(actions)
                action_index = actions.index(action)
            else:
                action_index = T.argmax(
                    mlp.fprop([cur_state[0] / mdp.n,
                               cur_state[1] / mdp.n])).eval()
                action = actions[action_index]
                if (counter % 100 == 0):
                    print 'action: {}'.format(action)
            # realAction = action
            # if action == 0: realAction = -1
            transitions = mdp.succAndProbReward(
                cur_state, action)  # previously realAction)
            if len(transitions) == 0:
                break
            # Choose a random transition
            i = sample([prob for newState, prob, reward in transitions])
            newState, prob, reward = transitions[i]
            #print 'newState: {}'.format(newState)
            #print 'reward: {}'.format(reward)
            #print [(p.eval(), p.name) for p in mlp.get_params()]
            #print [(p.eval(), p.name) for p in mlp.get_params(freeze=True)]
            #print '\n'
            reward *= curDiscount
            totalReward += reward
            curDiscount *= mdp.discount()
            loss = train_model([cur_state[0] / mdp.n, cur_state[1] / mdp.n],
                               action_index, reward,
                               [newState[0] / mdp.n, newState[1] / mdp.n
                                ])  # previously action
            cur_state = newState
            exploration_prob -= reduce_explore
            if (exploration_prob < 0.25):
                exploration_prob = 0.25
        rewards.append(totalReward)

        print('*' * 30)
        print('episode: {} ended with score: {}'.format(episode, rewards[-1]))
        print('avg reward: {}'.format(np.mean(rewards[-25:])))
        print('explore: {}'.format(exploration_prob))
        print('*' * 30)
        print('\n')

    return rewards
Example #24
0
    def __init__(
            self,
            env: 'Environment',
            input_frame: ('int: the number of channels of input image'),
            input_dim: (
                'int: the width and height of pre-processed input image'),
            input_type: ('str: the type of input dimension'),
            num_frames: ('int: Total number of frames'),
            skipped_frame: ('int: The number of skipped frames'),
            eps_decay: ('float: Epsilon Decay_rate'),
            gamma: ('float: Discount Factor'),
            target_update_freq: ('int: Target Update Frequency (by frames)'),
            update_type: (
                'str: Update type for target network. Hard or Soft') = 'hard',
            soft_update_tau: ('float: Soft update ratio') = None,
            batch_size: ('int: Update batch size') = 32,
            buffer_size: ('int: Replay buffer size') = 1000000,
            alpha: (
                'float: Hyperparameter for how large prioritization is applied'
            ) = 0.5,
            beta: (
                'float: Hyperparameter for the annealing factor of importance sampling'
            ) = 0.5,
            epsilon_for_priority: (
                'float: Hyperparameter for adding small increment to the priority'
            ) = 1e-6,
            update_start_buffer_size: (
                'int: Update starting buffer size') = 50000,
            learning_rate: ('float: Learning rate') = 0.0004,
            eps_min: ('float: Epsilon Min') = 0.1,
            eps_max: ('float: Epsilon Max') = 1.0,
            device_num: ('int: GPU device number') = 0,
            rand_seed: ('int: Random seed') = None,
            plot_option: ('str: Plotting option') = False,
            model_path: ('str: Model saving path') = './'):

        self.action_dim = env.action_space.n
        self.device = torch.device(
            f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path

        self.env = env
        self.input_frames = input_frame
        self.input_dim = input_dim
        self.num_frames = num_frames
        self.skipped_frame = skipped_frame
        self.epsilon = eps_max
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.update_cnt = 0
        self.update_type = update_type
        self.tau = soft_update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.update_start = update_start_buffer_size
        self.seed = rand_seed
        self.plot_option = plot_option

        # hyper parameters for PER
        self.alpha = alpha
        self.beta = beta
        self.beta_step = (1.0 - beta) / num_frames
        self.epsilon_for_priority = epsilon_for_priority

        if input_type == '1-dim':
            self.q_current = QNetwork_1dim(self.input_dim,
                                           self.action_dim).to(self.device)
            self.q_target = QNetwork_1dim(self.input_dim,
                                          self.action_dim).to(self.device)
        else:
            self.q_current = QNetwork(
                (self.input_frames, self.input_dim, self.input_dim),
                self.action_dim).to(self.device)
            self.q_target = QNetwork(
                (self.input_frames, self.input_dim, self.input_dim),
                self.action_dim).to(self.device)
        self.q_target.load_state_dict(self.q_current.state_dict())
        self.q_target.eval()
        self.optimizer = optim.Adam(self.q_current.parameters(),
                                    lr=learning_rate)

        if input_type == '1-dim':
            self.memory = PrioritizedReplayBuffer(self.buffer_size,
                                                  self.input_dim,
                                                  self.batch_size, self.alpha,
                                                  input_type)
        else:
            self.memory = PrioritizedReplayBuffer(
                self.buffer_size,
                (self.input_frames, self.input_dim, self.input_dim),
                self.batch_size, self.alpha, input_type)
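# A minimal sketch (hypothetical numbers, not from the constructor above) of the
# annealing schedule implied by `self.beta_step = (1.0 - beta) / num_frames`:
# beta moves linearly toward 1.0, so the prioritized-replay importance-sampling
# weights w_i = (N * P(i)) ** (-beta) apply the full correction by the last frame.
beta, num_frames = 0.5, 1000000
beta_step = (1.0 - beta) / num_frames
for _ in range(num_frames):
    beta = min(beta + beta_step, 1.0)   # stepped once per environment frame
print(round(beta, 6))  # 1.0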
Example #25
0
class Agent:
    def __init__(
            self,
            env: 'Environment',
            input_frame: ('int: the number of channels of input image'),
            input_dim: (
                'int: the width and height of pre-processed input image'),
            num_frames: ('int: Total number of frames'),
            eps_decay: ('float: Epsilon Decay_rate'),
            gamma: ('float: Discount Factor'),
            target_update_freq: ('int: Target Update Frequency (by frames)'),
            update_type: (
                'str: Update type for target network. Hard or Soft') = 'hard',
            soft_update_tau: ('float: Soft update ratio') = None,
            batch_size: ('int: Update batch size') = 32,
            buffer_size: ('int: Replay buffer size') = 1000000,
            update_start_buffer_size: (
                'int: Update starting buffer size') = 50000,
            learning_rate: ('float: Learning rate') = 0.0004,
            eps_min: ('float: Epsilon Min') = 0.1,
            eps_max: ('float: Epsilon Max') = 1.0,
            device_num: ('int: GPU device number') = 0,
            rand_seed: ('int: Random seed') = None,
            plot_option: ('str: Plotting option') = False,
            model_path: ('str: Model saving path') = './'):

        self.action_dim = env.action_space.n
        self.device = torch.device(
            f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path

        self.env = env
        self.input_frames = input_frame
        self.input_dim = input_dim
        self.num_frames = num_frames
        self.epsilon = eps_max
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.update_cnt = 0
        self.update_type = update_type
        self.tau = soft_update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.update_start = update_start_buffer_size
        self.seed = rand_seed
        self.plot_option = plot_option

        self.q_current = QNetwork(
            (self.input_frames, self.input_dim, self.input_dim),
            self.action_dim).to(self.device)
        self.q_target = QNetwork(
            (self.input_frames, self.input_dim, self.input_dim),
            self.action_dim).to(self.device)
        self.q_target.load_state_dict(self.q_current.state_dict())
        self.q_target.eval()
        self.optimizer = optim.Adam(self.q_current.parameters(),
                                    lr=learning_rate)

        self.memory = ReplayBuffer(
            self.buffer_size,
            (self.input_frames, self.input_dim, self.input_dim),
            self.batch_size)

    def select_action(
        self, state:
        'Must be pre-processed in the same way while updating current Q network. See def _compute_loss'
    ):

        if np.random.random() < self.epsilon:
            return np.zeros(self.action_dim), self.env.action_space.sample()
        else:
            # If normalization such as division by 255 is applied to the image, it MUST also be expressed as 'state/255' below.
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0) / 255
            Qs = self.q_current(state)
            action = Qs.argmax()
            return Qs.detach().cpu().numpy(), action.detach().item()

    def processing_resize_and_gray(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)  # Pure
        # frame = cv2.cvtColor(frame[:177, 32:128, :], cv2.COLOR_RGB2GRAY) # Boxing
        # frame = cv2.cvtColor(frame[2:198, 7:-7, :], cv2.COLOR_RGB2GRAY) # Breakout
        frame = cv2.resize(frame,
                           dsize=(self.input_dim, self.input_dim)).reshape(
                               self.input_dim, self.input_dim).astype(np.uint8)
        return frame

    def get_state(self, action, skipped_frame=0):
        '''
        num_frames: how many frames to be merged
        input_size: hight and width of input resized image
        skipped_frame: how many frames to be skipped
        '''
        next_state = np.zeros(
            (self.input_frames, self.input_dim, self.input_dim))
        rewards = 0
        dones = 0
        for i in range(self.input_frames):
            for j in range(skipped_frame):
                state, reward, done, _ = self.env.step(action)
                rewards += reward
                dones += int(done)
            state, reward, done, _ = self.env.step(action)
            next_state[i] = self.processing_resize_and_gray(state)
            rewards += reward
            dones += int(done)
        return rewards, next_state, dones

    def get_init_state(self):
        state = self.env.reset()
        action = self.env.action_space.sample()
        _, state, _ = self.get_state(action, skipped_frame=0)
        return state

    def store(self, state, action, reward, next_state, done):
        self.memory.store(state, action, reward, next_state, done)

    def update_current_q_net(self):
        batch = self.memory.batch_load()
        loss = self._compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def target_soft_update(self):
        for target_param, current_param in zip(self.q_target.parameters(),
                                               self.q_current.parameters()):
            target_param.data.copy_(self.tau * current_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def target_hard_update(self):
        self.update_cnt = (self.update_cnt + 1) % self.target_update_freq
        if self.update_cnt == 0:
            self.q_target.load_state_dict(self.q_current.state_dict())

    def train(self):
        tic = time.time()
        losses = []
        scores = []
        epsilons = []
        avg_scores = [-1000]

        score = 0

        print("Storing initial buffer..")
        state = self.get_init_state()
        for frame_idx in range(1, self.update_start + 1):
            _, action = self.select_action(state)
            reward, next_state, done = self.get_state(action, skipped_frame=0)
            self.store(state, action, reward, next_state, done)
            state = next_state
            if done: state = self.get_init_state()

        print("Done. Start learning..")
        history_store = []
        for frame_idx in range(1, self.num_frames + 1):
            Qs, action = self.select_action(state)
            reward, next_state, done = self.get_state(action, skipped_frame=0)
            self.store(state, action, reward, next_state, done)
            history_store.append([state, Qs, action, reward, next_state, done])
            loss = self.update_current_q_net()

            if self.update_type == 'hard': self.target_hard_update()
            elif self.update_type == 'soft': self.target_soft_update()

            score += reward
            losses.append(loss)

            if done:
                scores.append(score)
                if np.mean(scores[-10:]) > max(avg_scores):
                    torch.save(
                        self.q_current.state_dict(),
                        self.model_path + '{}_Score:{}.pt'.format(
                            frame_idx, np.mean(scores[-10:])))
                    training_time = round((time.time() - tic) / 3600, 1)
                    np.save(
                        self.model_path +
                        '{}_history_Score_{}_{}hrs.npy'.format(
                            frame_idx, score, training_time),
                        np.array(history_store))
                    print(
                        "          | Model saved. Recent scores: {}, Training time: {}hrs"
                        .format(scores[-10:], training_time),
                        ' /'.join(os.getcwd().split('/')[-3:]))
                avg_scores.append(np.mean(scores[-10:]))

                if self.plot_option == 'inline':
                    epsilons.append(self.epsilon)
                    self._plot(frame_idx, scores, losses, epsilons)
                elif self.plot_option == 'wandb':
                    wandb.log({
                        'Score': score,
                        'loss(10 frames avg)': np.mean(losses[-10:]),
                        'Epsilon': self.epsilon
                    })
                    print(score, end='\r')
                else:
                    print(score, end='\r')

                score = 0
                state = self.get_init_state()
                history_store = []
            else:
                state = next_state

            self._epsilon_step()

        print("Total training time: {}(hrs)".format(
            (time.time() - tic) / 3600))

    def _epsilon_step(self):
        ''' Epsilon decay control '''
        eps_decay_init = 1 / 1200000
        eps_decay = [
            eps_decay_init, eps_decay_init / 2.5, eps_decay_init / 3.5,
            eps_decay_init / 5.5
        ]

        if self.epsilon > 0.35:
            self.epsilon = max(self.epsilon - eps_decay[0], 0.1)
        elif self.epsilon > 0.27:
            self.epsilon = max(self.epsilon - eps_decay[1], 0.1)
        elif self.epsilon > 0.17:
            self.epsilon = max(self.epsilon - eps_decay[2], 0.1)
        else:
            self.epsilon = max(self.epsilon - eps_decay[3], 0.1)

    def _compute_loss(self, batch: "Dictionary (S, A, R', S', Dones)"):
        # If normalization is used, it must be applied to 'state' and 'next_state' here. ex) state/255
        states = torch.FloatTensor(batch['states']).to(self.device) / 255
        next_states = torch.FloatTensor(batch['next_states']).to(
            self.device) / 255
        actions = torch.LongTensor(batch['actions'].reshape(-1,
                                                            1)).to(self.device)
        rewards = torch.FloatTensor(batch['rewards'].reshape(-1, 1)).to(
            self.device)
        dones = torch.FloatTensor(batch['dones'].reshape(-1,
                                                         1)).to(self.device)

        current_q = self.q_current(states).gather(1, actions)
        # The next line is the only difference from vanilla DQN.
        next_q = self.q_target(next_states).gather(
            1,
            self.q_current(next_states).argmax(axis=1, keepdim=True)).detach()
        mask = 1 - dones
        target = (rewards + (mask * self.gamma * next_q)).to(self.device)

        loss = F.smooth_l1_loss(current_q, target)
        return loss

    def _plot(self, frame_idx, scores, losses, epsilons):
        clear_output(True)
        plt.figure(figsize=(20, 5), facecolor='w')
        plt.subplot(131)
        plt.title('frame %s. score: %s' % (frame_idx, np.mean(scores[-10:])))
        plt.plot(scores)
        plt.subplot(132)
        plt.title('loss')
        plt.plot(losses)
        plt.subplot(133)
        plt.title('epsilons')
        plt.plot(epsilons)
        plt.show()
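# A minimal standalone sketch (hypothetical tensors, not part of the class above)
# contrasting the vanilla DQN target with the Double DQN target built in
# _compute_loss: vanilla bootstraps from max_a Q_target(s', a), while Double DQN
# picks the greedy action with the current network and evaluates it with the target.
import torch

def dqn_targets(q_target_next, q_current_next, rewards, dones, gamma=0.99):
    mask = 1 - dones                                        # zero out terminal transitions
    vanilla = rewards + mask * gamma * q_target_next.max(dim=1, keepdim=True)[0]
    greedy_a = q_current_next.argmax(dim=1, keepdim=True)   # action chosen by current net
    double = rewards + mask * gamma * q_target_next.gather(1, greedy_a)  # value from target net
    return vanilla, double

B, A = 4, 6
vanilla_t, double_t = dqn_targets(torch.randn(B, A), torch.randn(B, A),
                                  torch.zeros(B, 1), torch.zeros(B, 1))
print(vanilla_t.shape, double_t.shape)  # torch.Size([4, 1]) torch.Size([4, 1])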
def updateTargetGraph(tfVars, tau):
    total_vars = len(tfVars)
    op_holder = []
    for idx, var in enumerate(tfVars[0:total_vars // 2]):
        op_holder.append(
            tfVars[idx + total_vars // 2].assign((var.value() * tau) + (
                (1 - tau) * tfVars[idx + total_vars // 2].value())))
    return op_holder


def updateTarget(op_holder, sess):
    for op in op_holder:
        sess.run(op)


tf.logging.set_verbosity(tf.logging.INFO)
tf.reset_default_graph()
mainQN = QNetwork(sensor_size, action_size, h_size, l_rate, init_value)
targetQN = QNetwork(sensor_size, action_size, h_size, l_rate, init_value)
init = tf.initialize_all_variables()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
targetOps = updateTargetGraph(trainables, tau)
myBuffer = ExperienceBuffer(total_size, resolution, buffersize)
env = VrepEnvironment(motor_speed, turn_speed, resolution, reset_distance,
                      pub_rate, dvs_queue, resize_factor, crop)

# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / anneling_steps
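# Sanity check (hypothetical values, not this script's config) of the linear
# annealing set up above: subtracting stepDrop once per step walks e from startE
# down to endE over exactly anneling_steps steps.
_startE, _endE, _steps = 1.0, 0.1, 10000
_e = _startE
for _ in range(_steps):
    _e -= (_startE - _endE) / _steps
print(round(_e, 6))  # 0.1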

jList = []
rList = []