def loop(n):
    logger_her.info("***************************")
    logger_her.info("**** Bit flipping game ****")
    logger_her.info("***************************")
    logger_her.info("Start main loop with size {}".format(n))
    logger_her.info("HER STATUS: {}".format(HER))

    actor = QModel(n, HER)
    critic = QModel(n, HER)

    if not TRAIN_FROM_SCRATCH:
        actor.load()
        critic.load()
    else:
        logger_her.info("Training QNetworks from scratch")

    re_buffer = Buffer(BUFFER_SIZE)

    for epoch in range(EPOCHS):
        logger_her.info("Start epoch {}".format(epoch + 1))

        for episode_idx in range(EPISODES):
            goal = State.sample_status(n)
            start = State.sample_status(n)

            # store start and goal together in a State object
            state = State(start, goal)

            _, episode = sample_episode(actor, state, epsilon_greedy=True)
            re_buffer.add(episode)

            if HER:
                new_experience = []
                for s, a, r, sn in episode:
                    for t in _sample(n, HER_NEW_GOALS):
                        _g = episode[t][-1].status
                        _sn = State(sn.status.copy(), _g.copy())
                        exp = (State(s.status.copy(), _g.copy()), a,
                               0 if _sn.is_final else -1, _sn)
                        new_experience.append(exp)

                re_buffer.add(new_experience)

        for training_step in range(TRAINING_STEPS):
            minibatch = re_buffer.sample(BATCH_SIZE)
            train(critic, actor, minibatch)

        if (epoch + 1) % UPDATE_ACTOR == 0:
            actor.update(critic)

        success_rate = evaluate_actor(actor)
        re_buffer.log_stats()

        if success_rate >= 1. - 1e-9:
            logger_her.info("Learned policy (QAction-Value) for {} bits in {} epochs".format(n, epoch + 1))
            break
class TestBufferBasic(unittest.TestCase):
    def setUp(self):
        self.n_samples = 10
        self.d_state = 3
        self.d_action = 2
        self.buffer_size = 10
        self.batch_size = 4
        self.ensemble_size = 3

        self.buf = Buffer(d_state=self.d_state, d_action=self.d_action,
                          buffer_size=self.buffer_size, ensemble_size=self.ensemble_size)

        self.samples = [(np.random.random(self.d_state),
                         np.random.random(self.d_action),
                         np.random.random(self.d_state))
                        for _ in range(self.n_samples)]

        for state, action, next_state in self.samples:
            self.buf.add(state, action, next_state)

    def test_insertion(self):
        for i, (state, action, next_state) in enumerate(self.samples):
            self.assertTrue(np.allclose(self.buf.states[i], state))
            self.assertTrue(np.allclose(self.buf.actions[i], action))
            self.assertTrue(np.allclose(self.buf.state_deltas[i], next_state - state))

    def test_sampling_size(self):
        for states, actions, state_deltas in self.buf.train_batches(batch_size=self.batch_size):
            self.assertEqual(states.shape[0], self.ensemble_size)
            self.assertEqual(states.shape[1], self.batch_size)
            self.assertEqual(states.shape[2], self.d_state)

            self.assertEqual(actions.shape[0], self.ensemble_size)
            self.assertEqual(actions.shape[1], self.batch_size)
            self.assertEqual(actions.shape[2], self.d_action)

            self.assertEqual(state_deltas.shape[0], self.ensemble_size)
            self.assertEqual(state_deltas.shape[1], self.batch_size)
            self.assertEqual(state_deltas.shape[2], self.d_state)
            break

    def test_sampling(self):
        for e_state, e_action, e_state_delta in self.buf.train_batches(batch_size=3):
            for b_state, b_action, b_state_delta in zip(e_state, e_action, e_state_delta):
                for s_state, s_action, s_state_delta in zip(b_state, b_action, b_state_delta):
                    found = False
                    for state, action, next_state in self.samples:
                        if np.allclose(s_state, state) and np.allclose(s_action, action) and \
                                np.allclose(s_state_delta, next_state - state):
                            found = True
                            break
                    assert found
def main():
    config = Config()
    env = Environment(config)            # for training
    eval_env = Eval_Environment(config)  # for testing
    num_actions = env.action_size()
    config.setaction_set_size(num_actions)
    brain = Control(config)
    plt = Plotter()
    plt.writesummary(0)

    # progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')

    episode_buffer = Buffer(config)
    episode_length = 0
    eval_count = 1

    while env.frame_history <= config.MAX_FRAMES:
        if env.frame_history / (config.EVAL_FREQ * eval_count) == 1:
            evaluate(eval_env, config, brain, env.frame_history, plt)  # testing happens now
            eval_count += 1

        past_num_frames = env.frame_history

        # algorithm begins now
        if episode_length == 0:
            env.reset()
            s, a, r, t = env.act(0)
            episode_buffer.add(s, a, r)
            episode_length += 1

        s, a, r, t = env.act(brain.getaction(s))
        episode_length += 1
        episode_buffer.add(s, a, r)

        if (env.START_NEW_GAME or episode_length >= config.T) and not episode_buffer.isempty():
            # the episode ends here
            episode_values = episode_buffer.get_returns()
            brain.update_table(episode_values)
            episode_buffer.reset()
            episode_length = 0

        pbar.update(env.frame_history - past_num_frames)

    env.close_render()
def test_complete_replace_twice(self):
    n_samples = 9
    d_state = 3
    d_action = 2
    buffer_size = 3
    ensemble_size = 5

    buf = Buffer(d_state=d_state, d_action=d_action,
                 buffer_size=buffer_size, ensemble_size=ensemble_size)

    samples = [(np.random.random(d_state),
                np.random.random(d_action),
                np.random.random(d_state))
               for _ in range(n_samples)]

    for state, action, next_state in samples:
        buf.add(state, action, next_state)

    for i, (state, action, next_state) in enumerate(samples[-buffer_size:]):
        self.assertTrue(np.allclose(buf.states[i], state))
        self.assertTrue(np.allclose(buf.actions[i], action))
        self.assertTrue(np.allclose(buf.state_deltas[i], next_state - state))
def test_partial_replacement(self):
    n_samples = 17
    d_state = 3
    d_action = 2
    buffer_size = 7
    ensemble_size = 3

    buf = Buffer(d_state=d_state, d_action=d_action,
                 buffer_size=buffer_size, ensemble_size=ensemble_size)

    samples = [(np.random.random(d_state),
                np.random.random(d_action),
                np.random.random(d_state))
               for _ in range(n_samples)]

    for state, action, next_state in samples:
        buf.add(state, action, next_state)

    r = n_samples % buffer_size
    for i, (state, action, next_state) in enumerate(samples[-r:]):
        self.assertTrue(np.allclose(buf.states[i], state))
        self.assertTrue(np.allclose(buf.actions[i], action))
        self.assertTrue(np.allclose(buf.state_deltas[i], next_state - state))
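# The three tests above pin down an overwrite-on-overflow buffer that stores state
# deltas and yields per-ensemble batches. Below is a minimal, hypothetical sketch of
# a buffer with that contract ("RingBufferSketch" is an illustrative name, not the
# implementation under test; its train_batches yields a single random batch per call).
import numpy as np

class RingBufferSketch:
    def __init__(self, d_state, d_action, buffer_size, ensemble_size):
        self.buffer_size = buffer_size
        self.ensemble_size = ensemble_size
        self.states = np.zeros((buffer_size, d_state))
        self.actions = np.zeros((buffer_size, d_action))
        self.state_deltas = np.zeros((buffer_size, d_state))
        self.ptr = 0  # next slot to write (wraps around)
        self.n = 0    # number of valid samples currently stored

    def add(self, state, action, next_state):
        # overwrite the oldest sample once the buffer is full,
        # which reproduces the index layout checked by the tests
        i = self.ptr
        self.states[i] = state
        self.actions[i] = action
        self.state_deltas[i] = next_state - state  # deltas, as the tests expect
        self.ptr = (self.ptr + 1) % self.buffer_size
        self.n = min(self.n + 1, self.buffer_size)

    def train_batches(self, batch_size):
        # one independently drawn batch per ensemble member,
        # shaped (ensemble_size, batch_size, dim)
        idx = np.random.randint(0, self.n, size=(self.ensemble_size, batch_size))
        yield self.states[idx], self.actions[idx], self.state_deltas[idx]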
class DCOACH:
    def __init__(self, dim_a, action_upper_limits, action_lower_limits, e,
                 buffer_min_size, buffer_max_size, buffer_sampling_rate,
                 buffer_sampling_size, train_end_episode):
        # Initialize variables
        self.h = None
        self.state_representation = None
        self.policy_action_label = None
        self.e = np.array(str_2_array(e, type_n='float'))
        self.dim_a = dim_a
        self.action_upper_limits = str_2_array(action_upper_limits, type_n='float')
        self.action_lower_limits = str_2_array(action_lower_limits, type_n='float')
        self.count = 0
        self.buffer_sampling_rate = buffer_sampling_rate
        self.buffer_sampling_size = buffer_sampling_size
        self.train_end_episode = train_end_episode

        # Initialize DCOACH buffer
        self.buffer = Buffer(min_size=buffer_min_size, max_size=buffer_max_size)

    def _generate_policy_label(self, action):
        if np.any(self.h):
            error = np.array(self.h * self.e).reshape(1, self.dim_a)
            self.policy_action_label = []

            for i in range(self.dim_a):
                self.policy_action_label.append(
                    np.clip(action[i] / self.action_upper_limits[i] + error[0, i], -1, 1))

            self.policy_action_label = np.array(self.policy_action_label).reshape(1, self.dim_a)
        else:
            self.policy_action_label = np.reshape(action, [1, self.dim_a])

    def _single_update(self, neural_network, state_representation):
        neural_network.sess.run(
            neural_network.train_policy,
            feed_dict={'policy/state_representation:0': state_representation,
                       'policy/policy_label:0': self.policy_action_label})

    def _batch_update(self, neural_network, transition_model, batch):
        observation_sequence_batch = [np.array(pair[0]) for pair in batch]  # state(t) sequence
        action_sequence_batch = [np.array(pair[1]) for pair in batch]
        current_observation_batch = [np.array(pair[2]) for pair in batch]  # last
        action_label_batch = [np.array(pair[3]) for pair in batch]

        state_representation_batch = transition_model.get_state_representation_batch(
            neural_network, observation_sequence_batch, action_sequence_batch,
            current_observation_batch)

        neural_network.sess.run(
            neural_network.train_policy,
            feed_dict={'policy/state_representation:0': state_representation_batch,
                       'policy/policy_label:0': action_label_batch})

    def feed_h(self, h):
        self.h = h

    def action(self, neural_network, state_representation):
        self.count += 1
        self.state_representation = state_representation

        action = neural_network.sess.run(
            neural_network.policy_output,
            feed_dict={'policy/state_representation:0': self.state_representation})

        out_action = []
        for i in range(self.dim_a):
            action[0, i] = np.clip(action[0, i], -1, 1) * self.action_upper_limits[i]
            out_action.append(action[0, i])

        return np.array(out_action)

    def train(self, neural_network, transition_model, action, t, done):
        self._generate_policy_label(action)

        # Policy training
        if np.any(self.h):  # if any element is not 0
            self._single_update(neural_network, self.state_representation)
            print("feedback:", self.h)

            # Add last step to memory buffer
            if transition_model.last_step(self.policy_action_label) is not None:
                self.buffer.add(transition_model.last_step(self.policy_action_label))

            # Train sampling from buffer
            if self.buffer.initialized():
                batch = self.buffer.sample(batch_size=self.buffer_sampling_size)  # TODO: probably this config thing should not be here
                self._batch_update(neural_network, transition_model, batch)

        # Train policy every k time steps from buffer
        if self.buffer.initialized() and (t % self.buffer_sampling_rate == 0 or (self.train_end_episode and done)):
            batch = self.buffer.sample(batch_size=self.buffer_sampling_size)
            self._batch_update(neural_network, transition_model, batch)
class HG_DAGGER:
    def __init__(self, dim_a, action_upper_limits, action_lower_limits,
                 buffer_min_size, buffer_max_size, buffer_sampling_rate,
                 buffer_sampling_size, number_training_iterations, train_end_episode):
        # Initialize variables
        self.dim_a = dim_a
        self.action_upper_limits = str_2_array(action_upper_limits, type_n='float')
        self.action_lower_limits = str_2_array(action_lower_limits, type_n='float')
        self.count = 0
        self.buffer_sampling_rate = buffer_sampling_rate
        self.buffer_sampling_size = buffer_sampling_size
        self.number_training_iterations = number_training_iterations
        self.train_end_episode = train_end_episode

        # Initialize HG_DAgger buffer
        self.buffer = Buffer(min_size=buffer_min_size, max_size=buffer_max_size)

    def feed_h(self, h):
        self.h = np.reshape(h, [1, self.dim_a])

    def action(self, neural_network, state_representation):
        self.count += 1

        if np.any(self.h):  # if feedback, human teleoperates
            action = self.h
            print("feedback:", self.h[0])
        else:
            action = neural_network.sess.run(
                neural_network.policy_output,
                feed_dict={'policy/state_representation:0': state_representation})

        out_action = []
        for i in range(self.dim_a):
            action[0, i] = np.clip(action[0, i], -1, 1) * self.action_upper_limits[i]
            out_action.append(action[0, i])

        return np.array(out_action)

    def train(self, neural_network, transition_model, action, t, done):
        # Add last step to memory buffer
        if transition_model.last_step(action) is not None and np.any(self.h):
            # if the human teleoperates, add the action to the database
            self.buffer.add(transition_model.last_step(action))

        # Train policy every k time steps from buffer
        if self.buffer.initialized() and (t % self.buffer_sampling_rate == 0 or (self.train_end_episode and done)):
            for i in range(self.number_training_iterations):
                if i % (self.number_training_iterations / 20) == 0:
                    print('Progress Policy training: %i %%' % (i / self.number_training_iterations * 100))

                batch = self.buffer.sample(batch_size=self.buffer_sampling_size)
                observation_sequence_batch = [np.array(pair[0]) for pair in batch]  # state(t) sequence
                action_sequence_batch = [np.array(pair[1]) for pair in batch]
                current_observation_batch = [np.array(pair[2]) for pair in batch]  # last
                action_label_batch = [np.array(pair[3]) for pair in batch]

                state_representation_batch = transition_model.get_state_representation_batch(
                    neural_network, observation_sequence_batch,
                    action_sequence_batch, current_observation_batch)

                neural_network.sess.run(
                    neural_network.train_policy,
                    feed_dict={'policy/state_representation:0': state_representation_batch,
                               'policy/policy_label:0': action_label_batch})
class TransitionModel:
    def __init__(self, training_sequence_length, lstm_hidden_state_size,
                 crop_observation, image_width, show_transition_model_output,
                 show_observation, resize_observation, occlude_observation, dim_a,
                 buffer_sampling_rate, buffer_sampling_size,
                 number_training_iterations, train_end_episode):
        self.lstm_h_size = lstm_hidden_state_size
        self.dim_a = dim_a
        self.training_sequence_length = training_sequence_length
        self.number_training_iterations = number_training_iterations
        self.train_end_episode = train_end_episode

        # System model parameters
        self.lstm_hidden_state = np.zeros([1, 2 * self.lstm_h_size])
        self.image_width = image_width  # we assume that images are squares

        # High-dimensional observation initialization
        self.resize_observation = resize_observation
        self.show_observation = show_observation
        self.show_ae_output = show_transition_model_output
        self.t_counter = 0
        self.crop_observation = crop_observation
        self.occlude_observation = occlude_observation

        # Buffers
        self.last_actions = Buffer(min_size=self.training_sequence_length + 1,
                                   max_size=self.training_sequence_length + 1)
        self.last_actions.add(np.zeros([1, self.dim_a]))
        self.last_states = Buffer(min_size=self.training_sequence_length + 1,
                                  max_size=self.training_sequence_length + 1)
        self.last_states.add(np.zeros([1, self.image_width, self.image_width, 1]))
        self.transition_model_buffer_sampling_rate = buffer_sampling_rate
        self.transition_model_sampling_size = buffer_sampling_size

        if self.show_observation:
            self.state_plot = FastImagePlot(1, np.zeros([self.image_width, self.image_width]),
                                            self.image_width, 'Image State', vmax=1.0)

        if self.show_ae_output:
            self.ae_output_plot = FastImagePlot(3, np.zeros([self.image_width, self.image_width]),
                                                self.image_width, 'Autoencoder Output', vmax=1.0)

    def _preprocess_observation(self, observation):
        if self.occlude_observation:
            observation[48:, :, :] = np.zeros([48, 96, 3]) + 127  # TODO: occlusion should be a function of the input size

        if self.crop_observation:
            observation = observation[:, 80:-80]  # TODO: these numbers should not be hard coded

        if self.resize_observation:
            observation = cv2.resize(observation, (self.image_width, self.image_width),
                                     interpolation=cv2.INTER_AREA)

        self.processed_observation = observation_to_gray(observation, self.image_width)
        self.last_states.add(self.processed_observation)
        self.network_input = np.array(self.last_states.buffer)

    def _refresh_image_plots(self, neural_network):
        if self.t_counter % 4 == 0 and self.show_observation:
            self.state_plot.refresh(self.processed_observation)

        if (self.t_counter + 2) % 4 == 0 and self.show_ae_output:
            ae_model_output = neural_network.transition_model_output.eval(
                session=neural_network.sess,
                feed_dict={'transition_model/lstm_hidden_state_out:0': self.lstm_hidden_state,
                           'transition_model/autoencoder_mode:0': True,
                           'transition_model/transition_model_input:0': self.network_input[-1],
                           'transition_model/sequence_length:0': 1,
                           'transition_model/batch_size:0': 1})
            self.ae_output_plot.refresh(ae_model_output)

    def _train_model_from_database(self, neural_network, database):
        episodes_num = len(database)

        print('Training Transition Model...')
        for i in range(self.number_training_iterations):  # Train
            if i % (self.number_training_iterations / 20) == 0:
                print('Progress Transition Model training: %i %%' % (i / self.number_training_iterations * 100))

            observations, actions, predictions = [], [], []

            # Sample batch from database
            for i in range(self.transition_model_sampling_size):
                count = 0
                while True:
                    count += 1
                    if count > 1000:  # check if it is possible to sample
                        print('Database too small for training!')
                        return
                    selected_episode = round(np.random.uniform(-0.49, episodes_num - 1))  # select an episode from the database randomly
                    episode_trajectory_length = len(database[selected_episode])
                    if episode_trajectory_length > self.training_sequence_length + 2:
                        break

                sequence_start = round(np.random.uniform(0, episode_trajectory_length - self.training_sequence_length - 1))
                sequence = database[selected_episode][sequence_start:sequence_start + self.training_sequence_length + 1]  # get samples from database

                observation_seq = []
                action_seq = []

                # Separate observations, actions and expected observation predictions from sampled batch
                for i in range(len(sequence)):
                    observation_seq.append(sequence[i][0])
                    action_seq.append(sequence[i][1])

                observations.append(observation_seq[:-1])
                actions.append(action_seq[:-1])
                predictions.append(observation_seq[-1])

            observations = np.array(observations)
            actions = np.array(actions)
            predictions = np.array(predictions)

            # Train transition model
            neural_network.sess.run(
                neural_network.train_transition_model,
                feed_dict={'transition_model/transition_model_input:0': np.reshape(
                               observations,
                               [self.transition_model_sampling_size * self.training_sequence_length,
                                self.image_width, self.image_width, 1]),
                           'transition_model/action_in:0': np.reshape(
                               actions,
                               [self.transition_model_sampling_size * self.training_sequence_length, self.dim_a]),
                           'transition_model/transition_model_label:0': np.reshape(
                               predictions,
                               [self.transition_model_sampling_size, self.image_width, self.image_width, 1]),
                           'transition_model/batch_size:0': self.transition_model_sampling_size,
                           'transition_model/sequence_length:0': self.training_sequence_length,
                           'transition_model/autoencoder_mode:0': True})

    def train(self, neural_network, t, done, database):
        # Transition model training
        # Sim pendulum: 200; mountain car: done  TODO: check if use done
        if (t % self.transition_model_buffer_sampling_rate == 0 and t != 0) or (self.train_end_episode and done):
            self._train_model_from_database(neural_network, database)

    def get_state_representation(self, neural_network, observation):
        self._preprocess_observation(np.array(observation))

        state_representation = neural_network.sess.run(
            neural_network.state_representation,
            feed_dict={'transition_model/transition_model_input:0': self.network_input[-1],
                       'transition_model/lstm_hidden_state_out:0': self.lstm_hidden_state,
                       'transition_model/batch_size:0': 1,
                       'transition_model/sequence_length:0': 1})

        self._refresh_image_plots(neural_network)  # refresh image plots
        self.t_counter += 1
        return state_representation

    def get_state_representation_batch(self, neural_network, observation_sequence_batch,
                                       action_sequence_batch, current_observation):
        batch_size = len(observation_sequence_batch)

        lstm_hidden_state_batch = neural_network.sess.run(
            neural_network.lstm_hidden_state,
            feed_dict={'transition_model/transition_model_input:0': np.reshape(
                           observation_sequence_batch,
                           [batch_size * self.training_sequence_length, self.image_width, self.image_width, 1]),
                       'transition_model/action_in:0': np.reshape(
                           action_sequence_batch,
                           [batch_size * self.training_sequence_length, self.dim_a]),
                       'transition_model/batch_size:0': batch_size,
                       'transition_model/sequence_length:0': self.training_sequence_length})

        state_representation_batch = neural_network.sess.run(
            neural_network.state_representation,
            feed_dict={'transition_model/transition_model_input:0': np.reshape(
                           current_observation,
                           [batch_size, self.image_width, self.image_width, 1]),
                       'transition_model/lstm_hidden_state_out:0': lstm_hidden_state_batch,
                       'transition_model/batch_size:0': batch_size,
                       'transition_model/sequence_length:0': 1})

        return state_representation_batch

    def compute_lstm_hidden_state(self, neural_network, action):
        action = np.reshape(action, [1, self.dim_a])

        self.lstm_hidden_state = neural_network.sess.run(
            neural_network.lstm_hidden_state,
            feed_dict={'transition_model/transition_model_input:0': self.network_input[-1],
                       'transition_model/action_in:0': action,
                       'transition_model/lstm_hidden_state_in:0': self.lstm_hidden_state,
                       'transition_model/batch_size:0': 1,
                       'transition_model/sequence_length:0': 1})

        self.last_actions.add(action)

    def last_step(self, action_label):
        if self.last_states.initialized() and self.last_actions.initialized():
            return [self.network_input[:-1],
                    self.last_actions.buffer[:-1],
                    self.network_input[-1],
                    action_label.reshape(self.dim_a)]
        else:
            return None

    def new_episode(self):
        self.lstm_hidden_state = np.zeros([1, 2 * self.lstm_h_size])
        self.last_states = Buffer(min_size=self.training_sequence_length + 1,
                                  max_size=self.training_sequence_length + 1)
        self.last_actions = Buffer(min_size=self.training_sequence_length + 1,
                                   max_size=self.training_sequence_length + 1)
        self.last_actions.add(np.zeros([1, self.dim_a]))
        self.last_states.add(np.zeros([1, self.image_width, self.image_width, 1]))
    action = np.random.multivariate_normal(u, cov)
else:
    assert False
#print "action:", action, "Q:", Q(x, np.array([action])), "V:", V(x)
#print "action:", action, "advantage:", A(x, np.array([action]))
#print "mu:", u, "action:", action
#print "Q(mu):", Q(x, np.array([u])), "Q(action):", Q(x, np.array([action]))

# take the action and record reward
observation, reward, done, info = env.step(action)
episode_reward += reward
#print "reward:", reward
#print "poststate:", observation

# add experience to replay memory
R.add(x[0], action, reward, observation, done)

loss = 0
# perform train_repeat Q-updates
for k in xrange(args.train_repeat):
    preobs, actions, rewards, postobs, terminals = R.sample(args.batch_size)

    # Q-update
    v = V(postobs)
    y = rewards + args.gamma * np.squeeze(v)
    loss += model.train_on_batch([preobs, actions], y)

    # copy weights to target model, averaged by tau
    weights = model.get_weights()
    target_weights = target_model.get_weights()
if np.random.random() < args.exploration:
    action = env.action_space.sample()
else:
    s = np.array([observation])
    q = model.predict_on_batch(s)
    #print "q:", q
    action = np.argmax(q[0])
    maxqs.append(np.max(q[0]))
#print "action:", action

prev_observation = observation
observation, reward, done, info = env.step(action)
#print info
episode_reward += reward
#print "reward:", reward

mem.add(prev_observation, np.array([action]), reward, observation, done)

for k in xrange(args.train_repeat):
    prestates, actions, rewards, poststates, terminals = mem.sample(args.batch_size)

    qpre = model.predict_on_batch(prestates)
    qpost = target_model.predict_on_batch(poststates)
    for i in xrange(qpre.shape[0]):
        if terminals[i]:
            qpre[i, actions[i]] = rewards[i]
        else:
            qpre[i, actions[i]] = rewards[i] + args.gamma * np.amax(qpost[i])
    cost = model.train_on_batch(prestates, qpre)
    costs.append(cost)

total_train_steps += 1
    env.render()

if np.random.random() < args.exploration:
    action = env.action_space.sample()
else:
    s = np.array([observation])
    q = model.predict_on_batch(s)
    #print "q:", q
    action = np.argmax(q[0])
#print "action:", action

prev_observation = observation
observation, reward, done, info = env.step(action)
episode_reward += reward
#print "reward:", reward

mem.add(prev_observation, np.array([action]), reward, observation, done)

for k in xrange(args.train_repeat):
    prestates, actions, rewards, poststates, terminals = mem.sample(args.batch_size)

    qpre = model.predict_on_batch(prestates)
    qpost = target_model.predict_on_batch(poststates)
    for i in xrange(qpre.shape[0]):
        if terminals[i]:
            qpre[i, actions[i]] = rewards[i]
        else:
            qpre[i, actions[i]] = rewards[i] + args.gamma * np.amax(qpost[i])
    model.train_on_batch(prestates, qpre)

    weights = model.get_weights()
    target_weights = target_model.get_weights()
    action = np.random.multivariate_normal(u, cov)
else:
    assert False
#print "action:", action, "Q:", Q(x, np.array([action])), "V:", V(x)
#print "action:", action, "advantage:", A(x, np.array([action]))
#print "mu:", u, "action:", action
#print "Q(mu):", Q(x, np.array([u])), "Q(action):", Q(x, np.array([action]))

# take the action and record reward
observation, reward, done, info = env.step(action)
episode_reward += reward
#print "reward:", reward
#print "poststate:", observation

# add experience to replay memory
R.add(x[0], action, reward, observation, done)

loss = 0
# perform train_repeat Q-updates
for k in range(args.train_repeat):
    preobs, actions, rewards, postobs, terminals = R.sample(args.batch_size)

    # Q-update
    v = V(postobs)
    y = rewards + args.gamma * np.squeeze(v)
    loss += model.train_on_batch([preobs, actions], y)

    # copy weights to target model, averaged by tau
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    for i in range(len(weights)):
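# The fragment above is cut off right where the target network is updated. The
# comment "copy weights to target model, averaged by tau" describes a standard
# soft (polyak) update of Keras weights; a self-contained sketch of that step is
# shown below. The helper name and the tau value are illustrative assumptions.
def soft_update(model, target_model, tau=0.01):
    # target <- tau * online + (1 - tau) * target, layer by layer
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    for i in range(len(weights)):
        target_weights[i] = tau * weights[i] + (1.0 - tau) * target_weights[i]
    target_model.set_weights(target_weights)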
agent = DDPG(2, 1)
buf = Buffer(BUF_SIZE)
noise = OUStrategy(env.action_space, min_sigma=1e-4)
updates_noise = 0

for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    done = False
    total_reward = 0

    while not done:
        action = agent.act(state)
        action = noise.get_action_from_raw_action(action, updates_noise)
        updates_noise += 1
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        buf.add((state, action, reward, next_state, done))
        if len(buf) >= BATCH_SIZE:
            agent.update(buf.sample(BATCH_SIZE))
        state = next_state

    print(f"I did {episode}th episode. Result: {total_reward}, sigma = {noise.sigma}")

# I decided to train for up to 150 episodes, although with this seed the result
# hovers around 90 starting from episode 30.
# Output for the last 10 episodes:
# I did 139th episode. Result: 91.13059676792551, sigma = 0.17022727199999999
# I did 140th episode. Result: 90.62383628427916, sigma = 0.16973243699999999
# I did 141th episode. Result: 94.36829967370625, sigma = 0.16948352
# I did 142th episode. Result: 87.05158580519061, sigma = 0.168778755
# I did 143th episode. Result: 89.52206836735917, sigma = 0.16824493299999999
# I did 144th episode. Result: 92.20854623030216, sigma = 0.167951031
def train_one_update(step, epochs, tracing_on):
    # initialize replay buffer
    buffer = Buffer(batch_size, minibatch_size, MINIMAP_RES, MINIMAP_RES, env.action_spec()[0])

    # initial observation
    timestep = env.reset()
    step_type, reward, _, obs = timestep[0]
    obs = preprocess(obs)

    ep_ret = []  # episode return (score)
    ep_rew = 0

    # fill in recorded trajectories
    while True:
        tf_obs = (tf.constant(each_obs, shape=(1, *each_obs.shape)) for each_obs in obs)
        val, act_id, arg_spatial, arg_nonspatial, logp_a = actor_critic.step(*tf_obs)

        sc2act_args = translateActionToSC2(arg_spatial, arg_nonspatial, MINIMAP_RES, MINIMAP_RES)
        act_mask = get_mask(act_id.numpy().item(), actor_critic.action_spec)
        buffer.add(*obs, act_id.numpy().item(), sc2act_args, act_mask,
                   logp_a.numpy().item(), val.numpy().item())

        step_type, reward, _, obs = env.step([actions.FunctionCall(act_id.numpy().item(), sc2act_args)])[0]
        # print("action:{}: {} reward {}".format(act_id.numpy().item(), sc2act_args, reward))
        buffer.add_rew(reward)

        obs = preprocess(obs)
        ep_rew += reward

        if step_type == step_type.LAST or buffer.is_full():
            if step_type == step_type.LAST:
                buffer.finalize(0)
            else:
                # trajectory is cut off, bootstrap last state with estimated value
                tf_obs = (tf.constant(each_obs, shape=(1, *each_obs.shape)) for each_obs in obs)
                val, _, _, _, _ = actor_critic.step(*tf_obs)
                buffer.finalize(val)
                ep_rew += reward

            ep_ret.append(ep_rew)
            ep_rew = 0

            if buffer.is_full():
                break

            # respawn env
            env.render(True)
            timestep = env.reset()
            _, _, _, obs = timestep[0]
            obs = preprocess(obs)

    # train in minibatches
    buffer.post_process()

    mb_loss = []
    for ep in range(epochs):
        buffer.shuffle()

        for ind in range(batch_size // minibatch_size):
            (player, available_act, minimap,  # screen,
             act_id, act_args, act_mask, logp, val, ret, adv) = buffer.minibatch(ind)

            assert ret.shape == val.shape
            assert logp.shape == adv.shape

            if tracing_on:
                tf.summary.trace_on(graph=True, profiler=False)

            mb_loss.append(
                actor_critic.train_step(
                    tf.constant(step, dtype=tf.int64),
                    player, available_act, minimap,  # screen,
                    act_id, act_args, act_mask, logp, val, ret, adv,
                ))
            step += 1

            if tracing_on:
                tracing_on = False
                with train_summary_writer.as_default():
                    tf.summary.trace_export(name="train_step", step=0)

    batch_loss = np.mean(mb_loss)

    return (batch_loss, ep_ret, buffer.batch_ret,
            np.asarray(buffer.batch_vals, dtype=np.float32))
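# When the trajectory above is cut off, the code bootstraps with the critic's value
# estimate via buffer.finalize(val). The Buffer's finalize/post_process internals are
# not shown here; a generic sketch of the kind of computation such a step performs
# (discounted returns with a bootstrap tail) is given below as an assumption, with a
# hypothetical helper name and a gamma of 0.99.
import numpy as np

def discounted_returns(rewards, bootstrap_value, gamma=0.99):
    # returns[t] = r_t + gamma * r_{t+1} + ... + gamma^(T-t) * bootstrap_value
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns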
def infer_on_stream(args, client, stats):
    """
    Initialize the inference network, stream video to the network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network()
    buffer = Buffer()

    # Set probability threshold for detections
    prob_threshold = args.prob_threshold

    ### Load the model through `infer_network` ###
    infer_network.load_model(args.model, args.device, args.cpu_extension)
    net_input_shape = infer_network.get_input_shape()  ## e.g. net_input_shape = [1, 3, 600, 600]
    net_output_name = infer_network.get_output_name()
    net_input_name = infer_network.get_input_blob_name()
    net_input_shape = infer_network.get_input_shape()
    net_output_info = infer_network.get_output_info()
    log.info("network output name")
    log.info(net_output_name)
    log.info("network output info")
    log.info(net_output_info.shape)
    log.info("network input shape")
    log.info(net_input_name)
    log.info(net_input_shape)

    ### Handle the input stream ###
    iflag = False
    input_stream_arg = 0 if args.input == "cam" else args.input
    if input_stream_arg.endswith('.jpg') or input_stream_arg.endswith('.bmp'):
        iflag = True

    width = 0
    height = 0
    frame = None
    cap = None
    captureOpen = False

    ## Handle image or stream or CAM
    if iflag:
        frame = cv2.imread(input_stream_arg)
        log.info("single frame shape: %s", frame.shape)
        width = frame.shape[1]
        height = frame.shape[0]
    else:
        log.info("attempting VideoCapture for: %s", input_stream_arg)
        cap = cv2.VideoCapture(input_stream_arg)
        cap.open(args.input)
        captureOpen = True
        width = int(cap.get(3))
        height = int(cap.get(4))

    log.info("input image width: %s, height: %s", width, height)

    ## stream input shape
    input_width = 0
    input_height = 0
    total_person_count = 0
    duration = 0
    cur_request_id = 0
    next_request_id = 1
    render_time = 0
    parsing_time = 0
    waitingOnInference = False

    ### Loop until stream is over ###
    while captureOpen or iflag or waitingOnInference:
        ### Read from the video capture ###
        flag = True
        key_pressed = None
        if not iflag:
            flag, frame = cap.read()
            if not cap.isOpened():
                captureOpen = False
            key_pressed = cv2.waitKey(60)
        if not flag:
            break

        ### Pre-process the image as needed ###
        input_width = net_input_shape[2]
        input_height = net_input_shape[3]
        p_frame = cv2.resize(frame, (net_input_shape[3], net_input_shape[2]))
        p_frame = p_frame.transpose((2, 0, 1))
        p_frame = p_frame.reshape(1, *p_frame.shape)

        ### Start asynchronous inference for the specified request ###
        start_time = time()
        infer_network.exec_net(p_frame)
        waitingOnInference = True
        render_time = 0
        inf_time = 0

        ### Wait for the result ###
        if infer_network.wait() == 0:
            ### Get the results of the inference request ###
            result = infer_network.get_output()
            inf_time = time() - start_time

            ### restart clock to capture evaluate/draw time
            start_time = time()

            boxes = post_process(result, width, height, PERSON_CLASS)
            ## if len(boxes) > 1:
            ##     log.info("initial boxes: %s", boxes)
            boxes = list(boxes.values())
            boxes = nms(boxes)
            buffer_avg = 0
            if iflag:
                boxes = filter_confidence(boxes, args.prob_threshold)

            if len(boxes) > 0:
                ## we have a person in frame (maybe)
                first_prop = boxes[0]
                confidence = first_prop[4]
                buffer.add(confidence)
                buffer_avg = buffer.average()

                if confidence > args.prob_threshold:
                    if duration > 0:
                        ## this is not the first time they have been in the frame
                        ## increase duration and move along
                        duration = duration + 1
                    else:
                        ## very first time this person has entered the frame
                        ## pulse out new count
                        total_person_count = total_person_count + 1
                        duration = duration + 1
                        client.publish("person", json.dumps({"count": 1, "total": total_person_count}))
                    draw_box(frame, boxes, inf_time)
                else:
                    ## we have a person in frame, but they don't meet the confidence threshold
                    if duration > 0:
                        ## we know we were tracking someone last frame
                        ## so check our rolling buffer average
                        if buffer_avg > BUFFER_AVERAGE_CUTOFF:
                            ## same person, keep counting, move along
                            duration = duration + 1
                            client.publish("person", json.dumps({"count": 1, "total": total_person_count}))
                            draw_box(frame, boxes, inf_time)
                        else:
                            ## log.info("NO-DRAW: c:%s, b:%s, d:%s : else:if:else", confidence, buffer_avg, duration)
                            ## no longer meets the confidence or buffer average
                            client.publish("person", json.dumps({"count": 0, "total": total_person_count}))
                            client.publish("person/duration", json.dumps({"duration": duration}))
                            duration = 0
                            buffer.flush()
                    else:
                        ## log.info("NO-DRAW: c:%s, b:%s, d:%s : else:else", confidence, buffer_avg, duration)
                        ## also nobody in the last frame (duration == 0)
                        client.publish("person", json.dumps({"count": 0, "total": total_person_count}))
            else:
                ## no boxes with our target class were found; make sure we didn't see one in the last frame (or so)
                buffer.add(0)
                buffer_avg = buffer.average()
                if buffer_avg > BUFFER_AVERAGE_CUTOFF:
                    ## we had someone previously, keep counting, move along
                    duration = duration + 1
                else:
                    ## nobody previously, nobody now, make sure we say so
                    client.publish("person", json.dumps({"count": 0, "total": total_person_count}))
                    if duration > 0:
                        ## we were previously tracking someone, pulse out duration before zeroing out
                        client.publish("person/duration", json.dumps({"duration": duration}))
                        duration = 0

            render_time = time() - start_time
            render_time_message = "OpenCV rendering time: {:.3f} ms".format(render_time * 1e3)
            cv2.putText(frame, render_time_message, (15, 45),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)
            stats.append(dict(it=inf_time, rt=render_time))

        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

        if key_pressed == 27:
            break

        if iflag and not waitingOnInference:
            iflag = False
        if infer_network.wait() == 0:
            iflag = False
            waitingOnInference = False

    if cap:
        cap.release()
    cv2.destroyAllWindows()
    client.disconnect()
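# The detection loop above only relies on three operations from its Buffer:
# add(confidence), average(), and flush(). A minimal rolling-average sketch with
# that interface is shown below; the class name, deque backing, and window size
# are illustrative assumptions, not the project's actual implementation.
from collections import deque

class ConfidenceBufferSketch:
    def __init__(self, window=10):
        self.values = deque(maxlen=window)  # keep only the most recent confidences

    def add(self, confidence):
        self.values.append(confidence)

    def average(self):
        # rolling mean over the window; 0.0 when nothing has been seen yet
        return sum(self.values) / len(self.values) if self.values else 0.0

    def flush(self):
        self.values.clear()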
    len(video_buffer.q),
    len(clip) if clip is not None else 0,
    building,
    # face_locations
)

point = {
    'time': datetime.now(),
    # 'face_locations': face_locations,
    'frame': frame,
    'current_weight': weight,
}

if building:
    clip.append(point)
else:
    video_buffer.add(point)

if not building and enough_diff:
    building = True
    clip = copy(video_buffer.q)
    video_buffer.clear()
elif building and datetime.now() >= last_weight_event + timedelta(seconds=TIMEOUT):
    frames = list(clip)
    clip = None
    building = False
    print("creating clip of len", len(frames))
    print(archive.create_from_clip(frames))

previous_weight = weight
def sac(args):
    # set seed if a non-default value is entered
    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    env, test_env = TorchEnv(args.env_name, args.max_ep_len), TorchEnv(args.env_name, args.max_ep_len)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = ActorCritic(state_dim, action_dim, action_limit, args.hidden_size,
                     args.gamma, args.alpha, device=args.device)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (saved for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    buffer = Buffer(state_dim, action_dim, buffer_size=args.buffer_size, device=args.device)

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=args.lr)
    q_optimizer = Adam(q_params, lr=args.lr)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q = ac.compute_loss_q(data, ac_targ)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = ac.compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize them at the next step.
        for p in q_params:
            p.requires_grad = True

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(args.polyak)
                p_targ.data.add_((1 - args.polyak) * p.data)

    def test_agent(deterministic=True):
        for j in range(args.num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == args.max_ep_len)):
                # Take deterministic actions at test time
                o, r, d = test_env.step(
                    ac.act(torch.as_tensor(o, dtype=torch.float32).to(args.device), deterministic))
                ep_ret += r
                ep_len += 1

    # Prepare for interaction with environment
    total_steps = args.steps_per_epoch * args.epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        # Until start_steps have elapsed, randomly sample actions from a uniform
        # distribution for better exploration. Afterwards, use the learned policy.
        if t > args.start_steps:
            a = ac.act(torch.as_tensor(o, dtype=torch.float32).to(args.device))
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d = env.step(a)
        if args.render_env:
            env.render()
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time horizon
        # (that is, when it's an artificial terminal signal that isn't based
        # on the agent's state)
        d = False if ep_len == args.max_ep_len else d

        # Store experience to replay buffer
        buffer.add(o, a, r, o2, d)

        o = o2

        # End of trajectory handling
        if d or (ep_len == args.max_ep_len):
            print("EPISODE REWARD: ", ep_ret)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= args.update_after and t % args.update_every == 0:
            batch_generator = buffer.get_train_batches(args.batch_size)
            for j in range(args.update_every):
                # my_batch = my_buffer.get_train_batches(args.batch_size).__next__()
                try:
                    batch = batch_generator.__next__()
                except:
                    batch_generator = buffer.get_train_batches(args.batch_size)
                    batch = batch_generator.__next__()
                update(batch)

        # End of epoch handling
        if (t + 1) % args.steps_per_epoch == 0:
            epoch = (t + 1) // args.steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_agent()