import random
import signal
import sys
import time
from pathlib import Path
from typing import Any, Callable, Iterable, Tuple

import cv2
import numpy as np
from keras.models import Model, load_model

# Project-local imports. The module paths below are assumptions based on the
# names used in this file; adjust them to the actual package layout.
from environment import EnvironmentInterface, Experience, State, StateAssembler, StateBare
from actions import LeftRightAction
from memory import Memory
from random_action_policy import RandomActionPolicy
from running_average import RunningAverage
from training_info import TrainingInfo
class TerminalDistanceRAPolicy(RandomActionPolicy):
    """Exploration policy that ties the random-action probability to how long
    the last episode lasted relative to the running average of episode
    durations: short episodes (early failures) keep exploration high."""

    def __init__(self, running_average_count: int):
        # The large start value makes early episodes look short relative to
        # the average, so exploration starts out at its maximum.
        self.running_average = RunningAverage(running_average_count, start_value=10000)
        self.epoch_started_time = None
        self.last_epoch_duration = 0.0

    def epoch_started(self):
        self.epoch_started_time = time.perf_counter()

    def epoch_ended(self):
        self.last_epoch_duration = time.perf_counter() - self.epoch_started_time
        self.running_average.add(self.last_epoch_duration)

    def get_probability(self, frame: int) -> float:
        # The longer the last episode lasted relative to the average,
        # the lower the probability of taking a random action.
        ratio = self.last_epoch_duration / self.running_average.get()
        return min(4 ** (-ratio + 0.8), 1.0)
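
# Illustrative sketch (not part of the original file): how get_probability
# behaves. With a running average of 10 s, an 8 s episode gives ratio 0.8 and
# probability min(4 ** 0, 1.0) = 1.0, i.e. fully random actions; a 20 s episode
# gives ratio 2.0 and min(4 ** -1.2, 1.0) ≈ 0.19, so the agent mostly exploits.
#
#     >>> min(4 ** (-(8.0 / 10.0) + 0.8), 1.0)
#     1.0
#     >>> min(4 ** (-(20.0 / 10.0) + 0.8), 1.0)
#     0.189...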
class QLearner:
    MODEL_PATH = 'actionValue.model'

    def __init__(self, environment: EnvironmentInterface, memory: Memory, image_size: int,
                 random_action_policy: RandomActionPolicy, batch_size: int, discount: float,
                 should_load_model: bool, should_save: bool, action_type: Any,
                 create_model: Callable[[Any, int], Model], batches_per_frame: int):
        self.environment = environment
        self.random_action_policy = random_action_policy
        self.memory = memory
        self.image_size = image_size
        self.batch_size = batch_size
        self.discount = discount
        self.action_type = action_type
        self.should_save = should_save
        self.should_exit = False
        self.default_sigint_handler = signal.getsignal(signal.SIGINT)
        self.training_info = TrainingInfo(should_load_model)
        self.mean_training_time = RunningAverage(1000, self.training_info['mean_training_time'])
        if batches_per_frame:
            self.training_info['batches_per_frame'] = batches_per_frame
        if should_load_model and Path(self.MODEL_PATH).is_file():
            self.model = load_model(self.MODEL_PATH)
            print('LOADED')
        else:
            self.model = create_model((self.image_size, self.image_size, StateAssembler.FRAME_COUNT),
                                      action_type.COUNT)
            print('CREATED')

    def stop(self, sig, frame):
        print('Exiting...')
        self.should_exit = True

    def _predict(self, state: State) -> np.ndarray:
        # Add a batch dimension, predict, and strip the batch dimension again
        x = np.expand_dims(state.data, axis=0)
        return self.model.predict_on_batch(x)[0]

    def _predict_multiple(self, states: Iterable[State]) -> np.ndarray:
        # np.stack needs a sequence, not a generator
        x = np.stack([state.data for state in states])
        return self.model.predict_on_batch(x)

    def _generate_minibatch(self) -> Tuple[np.ndarray, np.ndarray]:
        batch = self.memory.random_sample(self.batch_size)
        # Augment the sample with a horizontally mirrored copy of each experience
        augmented_batch = []
        for experience in batch:
            augmented_batch.append(experience)
            augmented_batch.append(self.mirror_experience(experience))

        # Estimate Q values for the augmented batch using the current model
        from_state_estimates = self._predict_multiple(e.from_state for e in augmented_batch)
        to_state_estimates = self._predict_multiple(e.to_state for e in augmented_batch)

        # Create arrays to hold input and expected output
        x = np.stack([e.from_state.data for e in augmented_batch])
        y = from_state_estimates

        # Re-estimate y values where the new reward is known (Bellman target);
        # all other entries keep the current estimates and contribute no error
        for index, experience in enumerate(augmented_batch):
            new_y = experience.reward
            if not experience.to_state.is_terminal:
                new_y += self.discount * np.max(to_state_estimates[index])
            y[index, experience.action.get_code()] = new_y
        return x, y

    def mirror_experience(self, experience):
        # Flip every frame of both states horizontally; matching the source
        # arrays avoids hardcoding the image size and frame count
        new_from_state_data = np.empty_like(experience.from_state.data)
        new_to_state_data = np.empty_like(experience.to_state.data)
        for i in range(StateAssembler.FRAME_COUNT):
            new_from_state_data[:, :, i] = self.mirror_image(experience.from_state.data[:, :, i])
            new_to_state_data[:, :, i] = self.mirror_image(experience.to_state.data[:, :, i])
        # Mirroring the image negates the horizontal action component
        # (0 stays 0, -1 becomes +1, +1 becomes -1); the reward is unchanged
        action_horizontal = -experience.action.horizontal
        return Experience(StateBare(new_from_state_data, experience.from_state.is_terminal),
                          LeftRightAction(action_horizontal),
                          experience.reward,
                          StateBare(new_to_state_data, experience.to_state.is_terminal))

    def mirror_image(self, image):
        return cv2.flip(image, 1)

    def _train_minibatch(self):
        if len(self.memory) < 1:
            return
        start = time.perf_counter()
        x, y = self._generate_minibatch()
        self.model.train_on_batch(x, y)
        end = time.perf_counter()
        self.mean_training_time.add(end - start)

    def predict(self):
        signal.signal(signal.SIGINT, self.stop)
        while True:
            state = self.environment.read_sensors(self.image_size, self.image_size)[0]
            while not state.is_terminal:
                action = self.action_type.from_code(np.argmax(self._predict(state)))
                print('ACTION', action.get_code())
                self.environment.write_action(action)
                # Wait as long as we usually need to wait due to training
                time.sleep(self.training_info['batches_per_frame'] * self.training_info['mean_training_time'])
                new_state, reward = self.environment.read_sensors(self.image_size, self.image_size)
                experience = Experience(state, action, reward, new_state)
                self.memory.append_experience(experience)
                state = new_state
                if self.should_exit:
                    sys.exit(0)

    def start_training(self, episodes: int):
        signal.signal(signal.SIGINT, self.stop)
        start_episode = self.training_info['episode']
        frames_passed = self.training_info['frames']
        for episode in range(start_episode, episodes + 1):
            self.random_action_policy.epoch_started()
            # Set initial state
            state = self.environment.read_sensors(self.image_size, self.image_size)[0]
            episode_start_time = time.time()
            while not state.is_terminal:
                random_probability = self.random_action_policy.get_probability(frames_passed)
                # ε-greedy action selection
                if random.random() < random_probability:
                    action = self.random_action_policy.sample_action(self.action_type)
                else:
                    # noinspection PyTypeChecker
                    action = self.action_type.from_code(np.argmax(self._predict(state)))
                self.environment.write_action(action)
                for _ in range(self.training_info['batches_per_frame']):
                    self._train_minibatch()
                new_state, reward = self.environment.read_sensors(self.image_size, self.image_size)
                experience = Experience(state, action, reward, new_state)
                self.memory.append_experience(experience)
                if new_state.is_terminal:
                    self.memory.report_failure()
                state = new_state
                frames_passed += 1
                # Print status
                time_since_failure = time.time() - episode_start_time
                print('Episode {}, Total frames {}, ε={:.4f}, Action (v={:+d}, h={:+d}), Reward {:.4f}, '
                      '{:.0f}s since failure'
                      .format(episode, frames_passed, random_probability, action.vertical,
                              action.horizontal, reward, time_since_failure), end='\r')
                # Save model after a fixed number of frames
                if self.should_save and frames_passed % 2000 == 0:
                    self.training_info['episode'] = episode
                    self.training_info['frames'] = frames_passed
                    self.training_info['mean_training_time'] = self.mean_training_time.get()
                    self.training_info.save()
                    self.model.save(self.MODEL_PATH)
                if self.should_exit:
                    sys.exit(0)
            self.random_action_policy.epoch_ended()
        signal.signal(signal.SIGINT, self.default_sigint_handler)
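
# Illustrative sketch (not part of the original file): the Bellman target that
# _generate_minibatch writes into y, on toy numbers and without a model.
# With discount = 0.9, a reward of 1.0, a non-terminal to-state with Q estimates
# [0.2, 0.5], and action code 0, the target becomes
# 1.0 + 0.9 * max(0.2, 0.5) = 1.45, while the other action's entry keeps the
# current estimate so it contributes no error to train_on_batch.
#
#     >>> y = np.array([[0.3, 0.1]])         # current estimate for the from-state
#     >>> to_state_q = np.array([0.2, 0.5])  # current estimate for the to-state
#     >>> y[0, 0] = 1.0 + 0.9 * np.max(to_state_q)
#     >>> y
#     array([[1.45, 0.1 ]])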
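
# Hypothetical usage sketch (not from the original project): one way QLearner
# could be wired up. `environment`, `memory`, `create_model`, and the concrete
# hyperparameter values are assumptions; only the constructor signature above
# is taken from this file.
#
#     policy = TerminalDistanceRAPolicy(running_average_count=100)
#     learner = QLearner(environment=environment,      # an EnvironmentInterface
#                        memory=memory,                # a Memory instance
#                        image_size=64,
#                        random_action_policy=policy,
#                        batch_size=32,
#                        discount=0.975,
#                        should_load_model=True,
#                        should_save=True,
#                        action_type=LeftRightAction,
#                        create_model=create_model,    # Callable[[Any, int], Model]
#                        batches_per_frame=1)
#     learner.start_training(episodes=10000)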