def run_ES(population, i):
    model = population[0].genome

    es = EvolutionStrategy(model,
                           get_reward,
                           population_size=POPULATION_SIZE,
                           sigma=0.25,
                           learning_rate=0.03,
                           decay=0.998,
                           num_threads=2)
    es.run(5000, print_step=5, start=i)
    optimized = es.get_weights()
Example #2
def find_shapelets_es(timeseries, labels, max_len=100, min_len=1, population_size=100,
                       iterations=25, verbose=True, sigma=0.1, learning_rate=0.001):

    def cost(shapelet):
        return check_candidate(timeseries, labels, shapelet)[0]

    candidates = np.array(generate_candidates(timeseries, labels, max_len, min_len))
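    # start the ES search from one randomly drawn candidate shapelet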

    es = EvolutionStrategy(candidates[np.random.choice(range(len(candidates)), size=population_size)][0][0],
                           cost, population_size=population_size, sigma=sigma, learning_rate=learning_rate)
    es.run(iterations, print_step=1)

    best_shapelet = es.get_weights()
    return best_shapelet
Example #3
import argparse
import pickle

parser = argparse.ArgumentParser(description='Evolution Strategies. ')
parser.add_argument('--env', default="Humanoid-v2")
parser.add_argument('--render', action='store_true')  # type=bool would treat any non-empty string as True

args = parser.parse_args()

observationSpace, actionSpace = env_info(args.env)

# A feed-forward neural network: input size equal to the observation space, two hidden layers of 32 and 16 units, and an output the size of the action space
model = FeedForwardNetwork(layer_sizes=[observationSpace, 32, 16, actionSpace])

get_reward = make_get_reward(args.env, model, args.render)
# If your task is computationally expensive, you can set num_threads > 1 to evaluate rewards in multiple processes;
# num_threads=-1 uses all the cores available on the machine. Here we use a single process because the task is
# cheap and extra processes would only slow things down due to IPC overhead.
es = EvolutionStrategy(model.get_weights(),
                       get_reward,
                       population_size=20,
                       sigma=0.1,
                       learning_rate=0.03,
                       decay=0.995,
                       num_threads=1)
es.run(1000, print_step=100)
with open(args.env + ".pkl", 'wb') as fp:
    pickle.dump(es.get_weights(), fp)
#while True:
#   print(get_reward(es.get_weights(),True))
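If you later want to reload the pickled weights and watch the trained policy, a minimal sketch reusing this example's model and get_reward (not part of the original script):

# load the trained weights back into the same FeedForwardNetwork and evaluate one rendered episode
with open(args.env + ".pkl", 'rb') as fp:
    model.set_weights(pickle.load(fp))
print(get_reward(model.get_weights(), True))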
Example #4
class Agent:

    AGENT_HISTORY_LENGTH = 1
    NUM_OF_ACTIONS = 2
    POPULATION_SIZE = 15
    EPS_AVG = 1
    SIGMA = 0.1
    LEARNING_RATE = 0.03
    INITIAL_EXPLORATION = 0.0
    FINAL_EXPLORATION = 0.0
    EXPLORATION_DEC_STEPS = 100000


    def __init__(self):
        self.model = Model()
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE)
        self.exploration = self.INITIAL_EXPLORATION


    def get_predicted_action(self, sequence):
        prediction = self.model.predict(np.array(sequence))
        x = np.argmax(prediction)
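        # index 1 means "flap" (119 is the keyboard code PLE's FlappyBird expects); None is a no-op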
        return 119 if x == 1 else None


    def load(self, filename='weights.pkl'):
        with open(filename,'rb') as fp:
            self.model.set_weights(pickle.load(fp))
        self.es.weights = self.model.get_weights()


    def get_observation(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))
    

    def save(self, filename='weights.pkl'):
        with open(filename, 'wb') as fp:
            pickle.dump(self.es.get_weights(), fp)

    
    def play(self, episodes):
        self.env.display_screen = True
        self.model.set_weights(self.es.weights)
        for episode in range(episodes):
            self.env.reset_game()
            observation = self.get_observation()
            sequence = [observation]*self.AGENT_HISTORY_LENGTH
            done = False
            score = 0
            while not done:
                action = self.get_predicted_action(sequence)
                reward = self.env.act(action)
                observation = self.get_observation()
                sequence = sequence[1:]
                sequence.append(observation)
                done = self.env.game_over()
                if self.game.getScore() > score:
                    score = self.game.getScore()
                    print "score: %d" % score
        self.env.display_screen = False


    def train(self, iterations):
        self.es.run(iterations, print_step=1)


    def get_reward(self, weights):
        total_reward = 0.0
        self.model.set_weights(weights)

        for episode in range(self.EPS_AVG):
            self.env.reset_game()
            observation = self.get_observation()
            sequence = [observation]*self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                self.exploration = max(self.FINAL_EXPLORATION, self.exploration - self.INITIAL_EXPLORATION/self.EXPLORATION_DEC_STEPS)
                if random.random() < self.exploration:
                    action = random.choice([119, None])
                else:
                    action = self.get_predicted_action(sequence)
                reward = self.env.act(action)
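                # a tiny random jitter, presumably to break ties between otherwise identical rollouts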
                reward += random.choice([0.0001, -0.0001])
                total_reward += reward
                observation = self.get_observation()
                sequence = sequence[1:]
                sequence.append(observation)
                done = self.env.game_over()

        return total_reward/self.EPS_AVG
Example #5
            idx_map += 1

    # we save the score along with the name of the run
    score_str = str(round(best, 2))
    with open(f'{name}_{score_str}.pickle', 'wb') as handle:
        pickle.dump(list_data, handle, protocol=pickle.HIGHEST_PROTOCOL)


# create the circuit_mapping object
circuit_mapping = CircuitMapping(library_data)
# create the neural net
net = m.MODEL(20, library_data, args.path_json)

# run the evolution strategies (ES)
# note: if you want, you can play with the parameters.
# these seem to give reasonable results
es = EvolutionStrategy(
    net.model.get_weights(),
    get_reward,
    population_size=5,
    sigma=0.01,  # noise std deviation
    learning_rate=0.001,
    decay=0.995,
    num_threads=1)

es.run(args.n_epoch)
save_dict(args.path_json, args.name, verbose=True)
done = time.time()
elapsed = done - start
print(f'elapsed time: {elapsed}')
Example #6
class Agent:
    AGENT_HISTORY_LENGTH = 1
    POPULATION_SIZE = 25
    EPS_AVG = 1
    SIGMA = 0.5
    LEARNING_RATE = 0.1
    INITIAL_EXPLORATION = 1.0
    FINAL_EXPLORATION = 0.0
    EXPLORATION_DEC_STEPS = 100000

    def __init__(self):
        self.env = gym.make('BipedalWalker-v2')
        self.model = Model()
        self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward,
                                    self.POPULATION_SIZE, self.SIGMA,
                                    self.LEARNING_RATE)
        self.exploration = self.INITIAL_EXPLORATION

    def get_predicted_action(self, sequence):
        prediction = self.model.predict(np.array(sequence))
        return prediction

    def load(self, filename='weights.pkl'):
        with open(filename, 'rb') as fp:
            self.model.set_weights(pickle.load(fp))
        self.es.weights = self.model.get_weights()

    def save(self, filename='weights.pkl'):
        with open(filename, 'wb') as fp:
            pickle.dump(self.es.get_weights(), fp)

    def play(self, episodes, render=True):
        self.model.set_weights(self.es.weights)
        for episode in range(episodes):
            total_reward = 0
            observation = self.env.reset()
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                if render:
                    self.env.render()
                action = self.get_predicted_action(sequence)
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
                sequence = sequence[1:]
                sequence.append(observation)
            print("total reward:", total_reward)

    def train(self, iterations):
        self.es.run(iterations, print_step=1)

    def get_reward(self, weights):
        total_reward = 0.0
        self.model.set_weights(weights)

        for episode in range(self.EPS_AVG):
            observation = self.env.reset()
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                self.exploration = max(
                    self.FINAL_EXPLORATION, self.exploration -
                    self.INITIAL_EXPLORATION / self.EXPLORATION_DEC_STEPS)
                if random.random() < self.exploration:
                    action = self.env.action_space.sample()
                else:
                    action = self.get_predicted_action(sequence)
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
                sequence = sequence[1:]
                sequence.append(observation)

        return total_reward / self.EPS_AVG
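A typical driver for an Agent class like this one would be a hypothetical `__main__` block such as the sketch below (the original example only defines the class):

if __name__ == '__main__':
    agent = Agent()
    agent.train(100)           # run 100 ES iterations
    agent.save('weights.pkl')  # persist the evolved weights
    agent.play(3)              # render a few episodes with the trained policy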
Example #7
class Agent:
	AGENT_HISTORY_LENGTH = 1
	POPULATION_SIZE = 20
	EPS_AVG = 1
	SIGMA = 0.1
	LEARNING_RATE = 0.01
	INITIAL_EXPLORATION = 1.0
	FINAL_EXPLORATION = 0.01
	EXPLORATION_DEC_STEPS = 50000

	plotScores = []
	plotEpisodes = []
	plotMaxTiles = []
	plotEpiCounter = 0

	GRID_SIZE = 3

	action_space = [0, 1, 2, 3]

	def __init__(self):
		random.seed(int(time.time()))
		np.random.seed(int(time.time()))

		window_length = 1
		nb_hidden = 256
		nb_actions = 4

		self.env = GameLogic(size = self.GRID_SIZE)

		# input_layer = Input(shape=(1, self.GRID_SIZE * self.GRID_SIZE))

		# layer = Dense(8)(input_layer)
		# output_layer = Dense(3)(layer)
		
		# self.model = Model(input_layer, output_layer)
		# self.model.compile(Adam(), 'mse')

		self.model = Sequential()
		self.model.add(Flatten(input_shape=(window_length, self.GRID_SIZE * self.GRID_SIZE)))
		self.model.add(Dense(nb_hidden))
		self.model.add(Activation('relu'))
		self.model.add(Dense(nb_hidden))
		self.model.add(Activation('relu'))
		self.model.add(Dense(nb_actions, activation='linear'))
		self.model.summary()

		self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE)
		self.exploration = self.INITIAL_EXPLORATION

	def get_predicted_action(self, sequence):
		prediction = self.model.predict(np.array(sequence))
		return prediction

	def load(self, filename='data/weights.pkl'):
		self.model.load_weights(filename)
		self.es.weights = self.model.get_weights()

	def save(self, filename='data/weights.pkl'):
		self.model.save_weights(filename, overwrite=True)

	def play(self, episodes, render=True):
		self.model.set_weights(self.es.weights)
		for episode in range(episodes):
			total_reward = 0
			observation = self.env.reset()
			done = False
			while not done:
				action = self.model.predict(np.array(observation))
				observation, reward, done, _ = self.env.step(action)
				total_reward += reward
		print("total reward: " + str(total_reward))

	def train(self, iterations):
		self.es.run(iterations, print_step=1)

	def get_reward(self, weights):
		total_reward = 0.0
		self.model.set_weights(weights)

		for episode in range(self.EPS_AVG):
			observation = self.env.reset()
			observation = np.reshape(observation, [1, self.GRID_SIZE * self.GRID_SIZE])
			done = False
			while not done:
				self.exploration = max(self.FINAL_EXPLORATION, self.exploration - self.INITIAL_EXPLORATION/self.EXPLORATION_DEC_STEPS)
				if random.random() < self.exploration:
					action = random.randint(0, 3)
				else:
					action = np.argmax(self.model.predict(np.array([observation]))[0])

				observation, reward, done, _ = self.env.step(action)
				observation = np.reshape(observation, [1, self.GRID_SIZE * self.GRID_SIZE])
				total_reward += reward

		self.plotEpiCounter += 1
		self.plotEpisodes.append(self.plotEpiCounter)
		self.plotScores.append(self.env._score)
		self.plotMaxTiles.append(2**self.env._getMaxNumber())

		pylab.plot(self.plotEpisodes, self.plotScores, '-b', label='Score')
		pylab.plot(self.plotEpisodes, self.plotMaxTiles, '-r', label='Max Tile')
		pylab.savefig('data/evostra_{}_{}x.png'.format(ENV_NAME, self.GRID_SIZE))

		print("Game Score: {} Max Tile: {} Exploration: {}".format(self.env._score, 2**self.env._getMaxNumber(), self.exploration))
		return total_reward/self.EPS_AVG
Example #8
class Agent:
    agent_hist = 1
    population = 50
    eps_avg = 1
    sigma = 0.2
    #learning Rate
    lr = 0.1
    init_explore = 0.9
    final_explore = 0.1
    explore_steps = 1E+5

    def __init__(self):
        # Initializes environment, Model, Algorithm and Exploration
        self.env = gym.make(GYM_ENV)
        self.model = Model()
        self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward,
                                    self.population, self.sigma, self.lr)
        self.exploration = self.init_explore

    def get_predicted_action(self, sequence):
        # Retrieve the predicted action
        prediction = self.model.predict(np.array(sequence))
        return prediction

    def load(self, filename='weights.pkl'):
        # Loads weights for agent_play
        with open(filename, 'rb') as fp:
            self.model.set_weights(pickle.load(fp))
        self.es.weights = self.model.get_weights()

    def save(self, filename='weights.pkl'):
        # Saves weights to a pickle file
        with open(filename, 'wb') as fp:
            pickle.dump(self.es.get_weights(), fp)

    def play(self, episodes, render=True):
        # Run the model in the OpenAI environment
        self.model.set_weights(self.es.weights)
        for episode in range(episodes):
            total_reward = 0
            observation = self.env.reset()
            sequence = [observation] * self.agent_hist
            done = False
            while not done:
                if render:
                    self.env.render()
                action = self.get_predicted_action(sequence)
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
                sequence = sequence[1:]
                sequence.append(observation)
            print("Total reward:", total_reward)

    def train(self, iterations):
        # Begin training
        self.es.run(iterations, print_step=1)

    def get_reward(self, weights):
        # Initialize reward
        total_reward = 0.0
        self.model.set_weights(weights)

        # Calculate reward
        for episode in range(self.eps_avg):
            observation = self.env.reset()
            sequence = [observation] * self.agent_hist
            done = False
            while not done:
                self.exploration = max(
                    self.final_explore,
                    self.exploration - self.init_explore / self.explore_steps)
                if random.random() < self.exploration:
                    action = self.env.action_space.sample()
                else:
                    action = self.get_predicted_action(sequence)
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
                sequence = sequence[1:]
                sequence.append(observation)
        return total_reward / self.eps_avg
Example #9
    for i in range(NUM_PARAMETERS):
        weights['w' + str(i)] = (-W_MAX, W_MAX)
        weights_explore['w' + str(i)] = [-W_MAX, W_MAX]

    gp_params = {
        'alpha': 1e-5,
        'xi': 0.01
    }

    kappa = 1.0
    bo = BayesianOptimization(optimization_function_BAYESIAN, weights)
    bo.maximize(init_points=NUM_PARAMETERS, n_iter=99999999999, acq='ucb', kappa=kappa, **gp_params)
elif SIMPLE_ES:
    server.player = -1
    es = EvolutionStrategy(np.zeros(NUM_PARAMETERS), optimization_function_simple_es, population_size=NUM_KID, sigma=0.4, learning_rate=0.2, decay=0.98, num_threads=1)  # 0.4 0.2 0.99
    es.run(N_GENERATIONS, print_step=9999999999)

else:

    if MAX_PARAMETERS:
        predator_sigma = PREDATOR_SIGMA_LOW
        wall_sigma = WALL_SIGMA_LOW
        PW_RATIO = PW_RATIO_LOW
        update_parameters(predator_sigma, wall_sigma, PW_RATIO)
        min_predator_fitness = 100000
        best_sigma = (predator_sigma, wall_sigma, PW_RATIO)

    while True:
        if not server.playing:
            if player != -1:
                kids_fitness[player] = server.fitness
Example #10
class Agent:
    """The agent class."""

    ENV_ID = 'BipedalWalker-v2'
    # This is the number of past observations used for action prediction.
    AGENT_HISTORY_LENGTH = 1
    POPULATION_SIZE = 20
    EPS_AVG = 1
    SIGMA = 0.1
    LEARNING_RATE = 0.01
    # The following three parameters control the exploration probability.
    # It starts at INITIAL_EXPLORATION and decays to FINAL_EXPLORATION over
    # EXPLORATION_DEC_STEPS steps.
    INITIAL_EXPLORATION = 1.0
    FINAL_EXPLORATION = 0.0
    EXPLORATION_DEC_STEPS = 1000000

    def __init__(self):
        """Initialize the agent."""
        # Initialize the openai-gym environment.
        self.env = gym.make(self.ENV_ID)

        # uncomment following lines if you want to record the video
        # self.env = gym.wrappers.Monitor(self.env, "{}_monitor".format(self.ENV_ID),
        #     lambda episode_id: True, force=True)

        # Initialize the training model.
        self.model = Model()
        # Initialize the evolution strategy of evostra
        self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward,
                                    self.POPULATION_SIZE, self.SIGMA,
                                    self.LEARNING_RATE)
        self.exploration = self.INITIAL_EXPLORATION
        self.exploration_dec = self.INITIAL_EXPLORATION / self.EXPLORATION_DEC_STEPS

    def train(self, iterations=100, print_step=1, filename='weights.pkl'):
        """Train the model."""
        self.es.run(iterations, print_step=print_step)
        self.save(filename)

    def load(self, filename='weights.pkl'):
        """Load the model weights from file."""
        with open(filename, 'rb') as fp:
            self.model.set_weights(pickle.load(fp, encoding='bytes'))
        self.es.weights = self.model.get_weights()

    def save(self, filename='weights.pkl'):
        """Save the weights of current model into file."""
        with open(filename, 'wb') as fp:
            pickle.dump(self.es.get_weights(), fp)

    def play(self, episodes=1, render=True):
        """Play the agent for episodes."""
        self.model.set_weights(self.es.weights)

        for episode in range(episodes):
            total_reward = 0
            # Get the initial observation.
            observation = self.env.reset()
            # Fill the observation sequence with the initial observation
            # repeated AGENT_HISTORY_LENGTH times.
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                if render:
                    # Visualize.
                    self.env.render()
                action = self.get_predicted_action(sequence)
                # Get the results of the action.
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
                # Shift the observation sequence to include the new one.
                sequence = sequence[1:]
                sequence.append(observation)

            print("total reward: ", total_reward)

    def get_predicted_action(self, sequence):
        """Get the model's predicted action based on sequence of states."""
        prediction = self.model.predict(np.array(sequence))
        return prediction

    def get_reward(self, weights):
        """Get the reward of the current model based on EPS_AVG times of
        tests."""
        total_reward = 0.0
        self.model.set_weights(weights)

        # Run tests for EPS_AVG times.
        for episode in range(self.EPS_AVG):
            # Get the initial observation.
            observation = self.env.reset()
            # Fill the observation sequence with the initial observation
            # repeated AGENT_HISTORY_LENGTH times.
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                self.exploration = max(self.FINAL_EXPLORATION,
                                       self.exploration - self.exploration_dec)
                # Randomize exploration.
                if random.random() < self.exploration:
                    action = self.env.action_space.sample()
                else:
                    action = self.get_predicted_action(sequence)
                # Get the results of the action.
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
                # Shift the observation sequence to include the new one.
                sequence = sequence[1:]
                sequence.append(observation)

        return total_reward / self.EPS_AVG
Example #11
prediction = Dense(2, activation='softmax')(x)

model = Model(inputs=inputs, outputs=prediction)


def get_reward(weights):
    env = gym.make("CartPole-v0")

    model.set_weights(weights)
    ob = env.reset()
    done = False
    total_reward = 0
    while not done:
        batch = ob[np.newaxis, ...]
        prediction = model.predict(batch)
        action = np.argmax(prediction)
        ob, reward, done, _ = env.step(action)

        total_reward += reward

    return total_reward


es = EvolutionStrategy(model.get_weights(),
                       get_reward,
                       population_size=100,
                       sigma=0.1,
                       learning_rate=0.001,
                       render_test=False)
es.run(300)
model.save('cartpole.h5')
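To reuse the saved Keras policy later, one option is the standard Keras load_model API; the snippet below is a sketch and not part of the original example:

from keras.models import load_model

# reload the policy saved above and score one CartPole episode with it
saved = load_model('cartpole.h5')
print(get_reward(saved.get_weights()))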
Example #12
class EvolutionStrategy:
    def __init__(self, model, weights, env):
        self.model = model
        self.weights = weights
        self.POPULATION_SIZE = 20
        self.SIGMA = 0.1
        self.LEARNING_RATE = 0.01
        self.decay = 0.999
        self.env = env

        self.es = Eee(self.weights, self.__get_reward)

    def __update_weights(self):
        pass

    def __get_population_rewards(self, population_weights):
        # self.env.step
        solution = 0  # target
        rewards = []
        for w in population_weights:
            reward = self.__get_reward(w)
            rewards.append(reward)

        normalized_rewards = (rewards - np.mean(rewards)) / np.std(rewards)

        return normalized_rewards, rewards

    def __get_reward(self, current_weights):
        self.model.set_weights(current_weights)

        rewards = 0
        episodes = 1
        for _ in range(episodes):
            done = False
            obs = self.env.reset()
            obs = obs.reshape([1, 4])
            while not done:
                prediction = self.model.predict(obs)
                prediction = np.argmax(prediction)
                obs, reward, done, _ = self.env.step(prediction)
                # self.env.render()
                obs = obs.reshape([1, 4])
                rewards += reward

        return rewards / episodes

    def __generate_population_weights(self):
        population_weights = []
        for i in range(self.POPULATION_SIZE):
            weights_jitter = []
            for w in self.weights:
                weights_jitter.append(np.random.randn(*w.shape) * self.SIGMA)
            current_weights = [w + j for w, j in zip(self.weights, weights_jitter)]  # element-wise: base weights + Gaussian noise
            population_weights.append(current_weights)
        return population_weights

    def update(self):
        self.es.run(600, print_step=1)
        # update the weights

        # generate population weights
        population_weights = self.__generate_population_weights()
        population_norm_rewards, rewards = self.__get_population_rewards(
            population_weights)

        # update self.weights
        for index, w in enumerate(self.weights):
            current_weight = np.array([
                population_weight[index]
                for population_weight in population_weights
            ])
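            # Because the normalized rewards have zero mean, the base weights cancel
            # out of this dot product, leaving sum_i(noise_i * normalized_reward_i):
            # the standard ES gradient estimate, scaled below by lr / (sigma * population size).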
            obj_func = np.dot(current_weight.T, population_norm_rewards).T
            self.weights[index] = w + self.LEARNING_RATE / (
                self.SIGMA * self.POPULATION_SIZE) * obj_func
            self.LEARNING_RATE = self.LEARNING_RATE * self.decay

        return np.max(rewards)
Example #13
class Agent:
    def __init__(self,
                 model,
                 training_steps=500,
                 environment='BipedalWalker-v2',
                 AGENT_HISTORY_LENGTH=1,
                 POPULATION_SIZE=50,
                 EPS_AVG=1,
                 SIGMA=0.1,
                 LEARNING_RATE=0.01,
                 INITIAL_EXPLORATION=1.0,
                 FINAL_EXPLORATION=0.0,
                 EXPLORATION_DEC_STEPS=10000,
                 num_thread=1,
                 LR_mode=0):
        self.env = gym.make(environment)
        self.model = model
        self.exploration = INITIAL_EXPLORATION
        self.training_steps = training_steps
        self.AGENT_HISTORY_LENGTH = AGENT_HISTORY_LENGTH
        self.POPULATION_SIZE = POPULATION_SIZE
        self.EPS_AVG = EPS_AVG
        self.SIGMA = SIGMA
        self.LEARNING_RATE = LEARNING_RATE
        self.INITIAL_EXPLORATION = INITIAL_EXPLORATION
        self.FINAL_EXPLORATION = FINAL_EXPLORATION
        self.EXPLORATION_DEC_STEPS = EXPLORATION_DEC_STEPS
        self.num_thread = num_thread
        self.LR_mode = LR_mode
        self.es = EvolutionStrategy(self.model.get_weights(),
                                    self.get_reward,
                                    self.POPULATION_SIZE,
                                    self.SIGMA,
                                    self.LEARNING_RATE,
                                    num_threads=num_thread,
                                    LR_mode=self.LR_mode)

    def get_predicted_action(self, sequence):
        prediction = self.model.predict(np.array(sequence))
        return prediction

    def load(self, model_file):
        with open(model_file, 'rb') as fp:
            self.model.set_weights(pickle.load(fp))
        self.es.weights = self.model.get_weights()

    def save(self, model_file):
        with open(model_file, 'wb') as fp:
            pickle.dump(self.es.get_weights(), fp)

    def train(self, iterations):
        print('Training')
        self.es.run(iterations, print_step=1)
        optimized_weights = self.es.get_weights()
        self.model.set_weights(optimized_weights)

    def play(self, episodes, render=True):
        self.model.set_weights(self.es.weights)
        for episode in range(episodes):
            print('On episode number {}'.format(episode))
            total_reward = 0
            observation = self.env.reset()
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                if render:
                    self.env.render()
                action = self.get_predicted_action(sequence)
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
                sequence = sequence[1:]
                sequence.append(observation)
            print("total reward:", total_reward)

    def get_reward(self, weights):
        total_reward = 0.0
        self.model.set_weights(weights)

        for episode in range(self.EPS_AVG):
            start_time = time.time()
            observation = self.env.reset()
            sequence = [observation] * self.AGENT_HISTORY_LENGTH
            done = False
            while not done:
                self.exploration = max(
                    self.FINAL_EXPLORATION, self.exploration -
                    self.INITIAL_EXPLORATION / self.EXPLORATION_DEC_STEPS)
                if random.random() < self.exploration:
                    action = self.env.action_space.sample()
                else:
                    action = self.get_predicted_action(sequence)
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
                sequence = sequence[1:]
                sequence.append(observation)
        #print("total reward: ", total_reward)
        #print('Finished in {} seconds'.format(time.time() - start_time))
        return total_reward / self.EPS_AVG