Example #1
def loop(n):
    logger_her.info("***************************")
    logger_her.info("**** Bit flipping game ****")
    logger_her.info("***************************")

    logger_her.info("Start main loop with size {}".format(n))
    logger_her.info("HER STATUS: {}".format(HER))

    actor = QModel(n, HER)
    critic = QModel(n, HER)

    if not TRAIN_FROM_SCRATCH:
        actor.load()
        critic.load()
    else:
        logger_her.info("Training QNetworks from scratch")

    re_buffer = Buffer(BUFFER_SIZE)

    for epoch in range(EPOCHS):
        logger_her.info("Start epoch {}".format(epoch + 1))

        for episode_idx in range(EPISODES):
            goal = State.sample_status(n)
            start = State.sample_status(n)

            # store the start and goal configurations together in a State object
            state = State(start, goal)

            _, episode = sample_episode(actor, state, epsilon_greedy=True)
            re_buffer.add(episode)

            if HER:
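                # Hindsight Experience Replay: relabel the stored transitions with substitute
                # goals drawn from states visited during this episode; rewards are recomputed
                # as 0 for a terminal (goal-reaching) next state and -1 otherwise.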
                new_experience = []
                for s, a, r, sn in episode:
                    for t in _sample(n, HER_NEW_GOALS):
                        _g = episode[t][-1].status
                        _sn = State(sn.status.copy(), _g.copy())

                        exp = (State(s.status.copy(), _g.copy()), a, 0 if _sn.is_final else -1, _sn)

                        new_experience.append(exp)

                re_buffer.add(new_experience)

        for training_step in range(TRAINING_STEPS):
            minibatch = re_buffer.sample(BATCH_SIZE)
            train(critic, actor, minibatch)

        if (epoch + 1) % UPDATE_ACTOR == 0:
            actor.update(critic)

            success_rate = evaluate_actor(actor)

            re_buffer.log_stats()

            if success_rate >= 1. - 1e-9:
                logger_her.info("Learned policy (QAction-Value) for {} bits in {} epochs".format(n, epoch + 1))
                break
Example #2
class TestBufferBasic(unittest.TestCase):
    def setUp(self):
        self.n_samples = 10
        self.d_state = 3
        self.d_action = 2
        self.buffer_size = 10
        self.batch_size = 4
        self.ensemble_size = 3

        self.buf = Buffer(d_state=self.d_state,
                          d_action=self.d_action,
                          buffer_size=self.buffer_size,
                          ensemble_size=self.ensemble_size)

        self.samples = [(np.random.random(self.d_state),
                         np.random.random(self.d_action),
                         np.random.random(self.d_state)) for _ in range(self.n_samples)]

        for state, action, next_state in self.samples:
            self.buf.add(state, action, next_state)

    def test_insertion(self):
        for i, (state, action, next_state) in enumerate(self.samples):
            self.assertTrue(np.allclose(self.buf.states[i], state))
            self.assertTrue(np.allclose(self.buf.actions[i], action))
            self.assertTrue(np.allclose(self.buf.state_deltas[i], next_state - state))

    def test_sampling_size(self):
        for states, actions, state_deltas in self.buf.train_batches(batch_size=self.batch_size):
            self.assertEqual(states.shape[0], self.ensemble_size)
            self.assertEqual(states.shape[1], self.batch_size)
            self.assertEqual(states.shape[2], self.d_state)

            self.assertEqual(actions.shape[0], self.ensemble_size)
            self.assertEqual(actions.shape[1], self.batch_size)
            self.assertEqual(actions.shape[2], self.d_action)

            self.assertEqual(state_deltas.shape[0], self.ensemble_size)
            self.assertEqual(state_deltas.shape[1], self.batch_size)
            self.assertEqual(state_deltas.shape[2], self.d_state)

            break

    def test_sampling(self):
        for e_state, e_action, e_state_delta in self.buf.train_batches(batch_size=3):
            for b_state, b_action, b_state_delta in zip(e_state, e_action, e_state_delta):
                for s_state, s_action, s_state_delta in zip(b_state, b_action, b_state_delta):
                    found = False
                    for state, action, next_state in self.samples:
                        if np.allclose(s_state, state) and np.allclose(s_action, action) and np.allclose(s_state_delta, next_state - state):
                            found = True
                            break

                    self.assertTrue(found)
Example #3
def main():
    config = Config()
    env = Environment(config)  # for training
    eval_env = Eval_Environment(config)  # for testing
    num_actions = env.action_size()
    config.setaction_set_size(num_actions)
    brain = Control(config)
    plt = Plotter()
    plt.writesummary(0)
    #adding progress bar for training
    pbar = tqdm(total = config.MAX_FRAMES, desc='Training Progress')


    episode_buffer = Buffer(config)
    episode_length = 0

    eval_count = 1
    while(env.frame_history <= config.MAX_FRAMES):
        if env.frame_history/(config.EVAL_FREQ*eval_count) == 1:
            evaluate(eval_env,config,brain,env.frame_history,plt)#testing happens now
            eval_count+=1
        past_num_frames = env.frame_history
        # the algorithm begins now

        if episode_length == 0:
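            # start of a new episode: reset the environment and take an initial
            # default action (index 0) to obtain the first transition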
            env.reset()
            s,a,r,t = env.act(0)
            episode_buffer.add(s,a,r)
            episode_length += 1

        s,a,r,t = env.act(brain.getaction(s))
        episode_length += 1
        episode_buffer.add(s,a,r)

        if (env.START_NEW_GAME or episode_length >= config.T) and not episode_buffer.isempty():  # then the episode ends
            episode_values = episode_buffer.get_returns()
            brain.update_table(episode_values)
            episode_buffer.reset()
            episode_length = 0

        pbar.update(env.frame_history-past_num_frames)

    env.close_render()
Example #4
    def test_complete_replace_twice(self):
        n_samples = 9
        d_state = 3
        d_action = 2
        buffer_size = 3
        ensemble_size = 5

        buf = Buffer(d_state=d_state,
                     d_action=d_action,
                     buffer_size=buffer_size,
                     ensemble_size=ensemble_size)

        samples = [(np.random.random(d_state),
                    np.random.random(d_action),
                    np.random.random(d_state)) for _ in range(n_samples)]

        for state, action, next_state in samples:
            buf.add(state, action, next_state)

        for i, (state, action, next_state) in enumerate(samples[-buffer_size:]):
            self.assertTrue(np.allclose(buf.states[i], state))
            self.assertTrue(np.allclose(buf.actions[i], action))
            self.assertTrue(np.allclose(buf.state_deltas[i], next_state - state))
Example #5
    def test_partial_replacement(self):
        n_samples = 17
        d_state = 3
        d_action = 2
        buffer_size = 7
        ensemble_size = 3

        buf = Buffer(d_state=d_state,
                     d_action=d_action,
                     buffer_size=buffer_size,
                     ensemble_size=ensemble_size)

        samples = [(np.random.random(d_state),
                    np.random.random(d_action),
                    np.random.random(d_state)) for _ in range(n_samples)]

        for state, action, next_state in samples:
            buf.add(state, action, next_state)

        r = n_samples % buffer_size
        for i, (state, action, next_state) in enumerate(samples[-r:]):
            self.assertTrue(np.allclose(buf.states[i], state))
            self.assertTrue(np.allclose(buf.actions[i], action))
            self.assertTrue(np.allclose(buf.state_deltas[i], next_state - state))
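The tests in Examples #2, #4 and #5 together specify the interface of the model-learning Buffer they exercise: positional ring-buffer storage of (state, action, next_state - state) triples exposed through the states, actions and state_deltas arrays, and a train_batches(batch_size) generator yielding one independently sampled mini-batch per ensemble member. The code below is a minimal sketch consistent with those tests, not the project's actual implementation; the ptr and n attribute names are invented here.

import numpy as np


class Buffer:
    """Minimal ring buffer satisfying the tests above (illustrative sketch only)."""

    def __init__(self, d_state, d_action, buffer_size, ensemble_size):
        self.buffer_size = buffer_size
        self.ensemble_size = ensemble_size
        self.states = np.zeros((buffer_size, d_state))
        self.actions = np.zeros((buffer_size, d_action))
        self.state_deltas = np.zeros((buffer_size, d_state))
        self.ptr = 0  # next write position (wraps around)
        self.n = 0    # number of valid entries, capped at buffer_size

    def add(self, state, action, next_state):
        # store (s, a, s' - s), overwriting the oldest entry once full
        self.states[self.ptr] = state
        self.actions[self.ptr] = action
        self.state_deltas[self.ptr] = next_state - state
        self.ptr = (self.ptr + 1) % self.buffer_size
        self.n = min(self.n + 1, self.buffer_size)

    def train_batches(self, batch_size):
        # yield mini-batches of shape [ensemble_size, batch_size, d_*],
        # sampled independently (with replacement) for each ensemble member
        for _ in range(max(1, self.n // batch_size)):
            idx = np.random.randint(0, self.n, size=(self.ensemble_size, batch_size))
            yield self.states[idx], self.actions[idx], self.state_deltas[idx]

Any class with this storage layout and sampling shape would pass the tests equally well.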
Example #6
class DCOACH:
    def __init__(self, dim_a, action_upper_limits, action_lower_limits, e,
                 buffer_min_size, buffer_max_size, buffer_sampling_rate,
                 buffer_sampling_size, train_end_episode):
        # Initialize variables
        self.h = None
        self.state_representation = None
        self.policy_action_label = None
        self.e = np.array(str_2_array(e, type_n='float'))
        self.dim_a = dim_a
        self.action_upper_limits = str_2_array(action_upper_limits,
                                               type_n='float')
        self.action_lower_limits = str_2_array(action_lower_limits,
                                               type_n='float')
        self.count = 0
        self.buffer_sampling_rate = buffer_sampling_rate
        self.buffer_sampling_size = buffer_sampling_size
        self.train_end_episode = train_end_episode

        # Initialize DCOACH buffer
        self.buffer = Buffer(min_size=buffer_min_size,
                             max_size=buffer_max_size)

    def _generate_policy_label(self, action):
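        # Turn the human corrective signal h into a supervised label: shift the
        # (normalized) executed action by e * h and clip the result to [-1, 1].
        # Without feedback, the executed action itself is used as the label.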
        if np.any(self.h):
            error = np.array(self.h * self.e).reshape(1, self.dim_a)
            self.policy_action_label = []

            for i in range(self.dim_a):
                self.policy_action_label.append(
                    np.clip(
                        action[i] / self.action_upper_limits[i] + error[0, i],
                        -1, 1))

            self.policy_action_label = np.array(
                self.policy_action_label).reshape(1, self.dim_a)
        else:
            self.policy_action_label = np.reshape(action, [1, self.dim_a])

    def _single_update(self, neural_network, state_representation):
        neural_network.sess.run(neural_network.train_policy,
                                feed_dict={
                                    'policy/state_representation:0':
                                    state_representation,
                                    'policy/policy_label:0':
                                    self.policy_action_label
                                })

    def _batch_update(self, neural_network, transition_model, batch):
        observation_sequence_batch = [np.array(pair[0])
                                      for pair in batch]  # state(t) sequence
        action_sequence_batch = [np.array(pair[1]) for pair in batch]
        current_observation_batch = [np.array(pair[2])
                                     for pair in batch]  # last
        action_label_batch = [np.array(pair[3]) for pair in batch]

        state_representation_batch = transition_model.get_state_representation_batch(
            neural_network, observation_sequence_batch, action_sequence_batch,
            current_observation_batch)

        neural_network.sess.run(neural_network.train_policy,
                                feed_dict={
                                    'policy/state_representation:0':
                                    state_representation_batch,
                                    'policy/policy_label:0': action_label_batch
                                })

    def feed_h(self, h):
        self.h = h

    def action(self, neural_network, state_representation):
        self.count += 1
        self.state_representation = state_representation

        action = neural_network.sess.run(neural_network.policy_output,
                                         feed_dict={
                                             'policy/state_representation:0':
                                             self.state_representation
                                         })
        out_action = []

        for i in range(self.dim_a):
            action[0, i] = np.clip(action[0, i], -1,
                                   1) * self.action_upper_limits[i]
            out_action.append(action[0, i])

        return np.array(out_action)

    def train(self, neural_network, transition_model, action, t, done):
        self._generate_policy_label(action)

        # Policy training
        if np.any(self.h):  # if any element is not 0
            self._single_update(neural_network, self.state_representation)
            print("feedback:", self.h)

            # Add last step to memory buffer
            if transition_model.last_step(
                    self.policy_action_label) is not None:
                self.buffer.add(
                    transition_model.last_step(self.policy_action_label))

            # Train sampling from buffer
            if self.buffer.initialized():
                batch = self.buffer.sample(
                    batch_size=self.buffer_sampling_size
                )  # TODO: probably this config thing should not be here
                self._batch_update(neural_network, transition_model, batch)

        # Train policy every k time steps from buffer
        if self.buffer.initialized() and (t % self.buffer_sampling_rate == 0
                                          or (self.train_end_episode and done)):
            batch = self.buffer.sample(batch_size=self.buffer_sampling_size)
            self._batch_update(neural_network, transition_model, batch)
Example #7
class HG_DAGGER:
    def __init__(self, dim_a, action_upper_limits, action_lower_limits, buffer_min_size, buffer_max_size,
                 buffer_sampling_rate, buffer_sampling_size, number_training_iterations, train_end_episode):
        # Initialize variables
        self.h = None
        self.dim_a = dim_a
        self.action_upper_limits = str_2_array(action_upper_limits, type_n='float')
        self.action_lower_limits = str_2_array(action_lower_limits, type_n='float')
        self.count = 0
        self.buffer_sampling_rate = buffer_sampling_rate
        self.buffer_sampling_size = buffer_sampling_size
        self.number_training_iterations = number_training_iterations
        self.train_end_episode = train_end_episode

        # Initialize HG_DAgger buffer
        self.buffer = Buffer(min_size=buffer_min_size, max_size=buffer_max_size)

    def feed_h(self, h):
        self.h = np.reshape(h, [1, self.dim_a])

    def action(self, neural_network, state_representation):
        self.count += 1

        if np.any(self.h):  # if feedback, human teleoperates
            action = self.h
            print("feedback:", self.h[0])
        else:
            action = neural_network.sess.run(neural_network.policy_output,
                                             feed_dict={'policy/state_representation:0': state_representation})

        out_action = []

        for i in range(self.dim_a):
            action[0, i] = np.clip(action[0, i], -1, 1) * self.action_upper_limits[i]
            out_action.append(action[0, i])

        return np.array(out_action)

    def train(self, neural_network, transition_model, action, t, done):
        # Add last step to memory buffer
        if transition_model.last_step(action) is not None and np.any(self.h):  # if human teleoperates, add action to database
            self.buffer.add(transition_model.last_step(action))

        # Train policy every k time steps from buffer
        if self.buffer.initialized() and (t % self.buffer_sampling_rate == 0 or (self.train_end_episode and done)):
            for i in range(self.number_training_iterations):
                if i % (self.number_training_iterations / 20) == 0:
                    print('Progress Policy training: %i %%' % (i / self.number_training_iterations * 100))

                batch = self.buffer.sample(batch_size=self.buffer_sampling_size)
                observation_sequence_batch = [np.array(pair[0]) for pair in batch]  # state(t) sequence
                action_sequence_batch = [np.array(pair[1]) for pair in batch]
                current_observation_batch = [np.array(pair[2]) for pair in batch]  # last
                action_label_batch = [np.array(pair[3]) for pair in batch]

                state_representation_batch = transition_model.get_state_representation_batch(neural_network,
                                                                                             observation_sequence_batch,
                                                                                             action_sequence_batch,
                                                                                             current_observation_batch)

                neural_network.sess.run(neural_network.train_policy,
                                        feed_dict={'policy/state_representation:0': state_representation_batch,
                                                   'policy/policy_label:0': action_label_batch})
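Examples #6 and #7, as well as the TransitionModel in Example #8 below, construct their buffers as Buffer(min_size=..., max_size=...) and rely only on add(), sample(batch_size=...), initialized() and the underlying buffer list. A minimal FIFO sketch consistent with that usage follows; it is an assumption-based reconstruction, and the real class may differ, for example in how sample() behaves when fewer than batch_size items are stored.

import random


class Buffer:
    """Minimal FIFO buffer matching the usage in the classes above (sketch only)."""

    def __init__(self, min_size, max_size):
        self.min_size = min_size
        self.max_size = max_size
        self.buffer = []  # exposed directly, e.g. np.array(self.last_states.buffer)

    def add(self, item):
        # append and drop the oldest item once max_size is exceeded
        self.buffer.append(item)
        if len(self.buffer) > self.max_size:
            self.buffer.pop(0)

    def initialized(self):
        # considered ready once at least min_size items have been collected
        return len(self.buffer) >= self.min_size

    def sample(self, batch_size):
        # uniform mini-batch without replacement
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))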
Example #8
class TransitionModel:
    def __init__(self, training_sequence_length, lstm_hidden_state_size,
                 crop_observation, image_width, show_transition_model_output,
                 show_observation, resize_observation, occlude_observation,
                 dim_a, buffer_sampling_rate, buffer_sampling_size,
                 number_training_iterations, train_end_episode):

        self.lstm_h_size = lstm_hidden_state_size
        self.dim_a = dim_a
        self.training_sequence_length = training_sequence_length
        self.number_training_iterations = number_training_iterations
        self.train_end_episode = train_end_episode

        # System model parameters
        self.lstm_hidden_state = np.zeros([1, 2 * self.lstm_h_size])
        self.image_width = image_width  # we assume that images are squares

        # High-dimensional observation initialization
        self.resize_observation = resize_observation
        self.show_observation = show_observation
        self.show_ae_output = show_transition_model_output
        self.t_counter = 0
        self.crop_observation = crop_observation
        self.occlude_observation = occlude_observation

        # Buffers
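        # Sliding windows holding the last (training_sequence_length + 1) actions and
        # observations; they provide the recurrent transition model with its input sequence.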
        self.last_actions = Buffer(min_size=self.training_sequence_length + 1,
                                   max_size=self.training_sequence_length + 1)
        self.last_actions.add(np.zeros([1, self.dim_a]))
        self.last_states = Buffer(min_size=self.training_sequence_length + 1,
                                  max_size=self.training_sequence_length + 1)
        self.last_states.add(
            np.zeros([1, self.image_width, self.image_width, 1]))
        self.transition_model_buffer_sampling_rate = buffer_sampling_rate
        self.transition_model_sampling_size = buffer_sampling_size

        if self.show_observation:
            self.state_plot = FastImagePlot(
                1,
                np.zeros([self.image_width, self.image_width]),
                self.image_width,
                'Image State',
                vmax=1.0)

        if self.show_ae_output:
            self.ae_output_plot = FastImagePlot(
                3,
                np.zeros([self.image_width, self.image_width]),
                self.image_width,
                'Autoencoder Output',
                vmax=1.0)

    def _preprocess_observation(self, observation):
        if self.occlude_observation:
            observation[48:, :, :] = np.zeros([
                48, 96, 3
            ]) + 127  # TODO: occlusion should be a function of the input size

        if self.crop_observation:
            observation = observation[:, 80:-80]  # TODO: these numbers should not be hard coded

        if self.resize_observation:
            observation = cv2.resize(observation,
                                     (self.image_width, self.image_width),
                                     interpolation=cv2.INTER_AREA)

        self.processed_observation = observation_to_gray(
            observation, self.image_width)
        self.last_states.add(self.processed_observation)
        self.network_input = np.array(self.last_states.buffer)

    def _refresh_image_plots(self, neural_network):
        if self.t_counter % 4 == 0 and self.show_observation:
            self.state_plot.refresh(self.processed_observation)

        if (self.t_counter + 2) % 4 == 0 and self.show_ae_output:
            ae_model_output = neural_network.transition_model_output.eval(
                session=neural_network.sess,
                feed_dict={
                    'transition_model/lstm_hidden_state_out:0':
                    self.lstm_hidden_state,
                    'transition_model/autoencoder_mode:0':
                    True,
                    'transition_model/transition_model_input:0':
                    self.network_input[-1],
                    'transition_model/sequence_length:0':
                    1,
                    'transition_model/batch_size:0':
                    1
                })

            self.ae_output_plot.refresh(ae_model_output)

    def _train_model_from_database(self, neural_network, database):
        episodes_num = len(database)

        print('Training Transition Model...')
        for i in range(self.number_training_iterations):  # Train
            if i % (self.number_training_iterations / 20) == 0:
                print('Progress Transition Model training: %i %%' %
                      (i / self.number_training_iterations * 100))

            observations, actions, predictions = [], [], []

            # Sample batch from database
            for _ in range(self.transition_model_sampling_size):
                count = 0
                while True:
                    count += 1
                    if count > 1000:  # check if it is possible to sample
                        print('Database too small for training!')
                        return

                    selected_episode = round(
                        np.random.uniform(-0.49, episodes_num - 1)
                    )  # select an episode from the database at random
                    episode_trajectory_length = len(database[selected_episode])

                    if episode_trajectory_length > self.training_sequence_length + 2:
                        break

                sequence_start = round(
                    np.random.uniform(
                        0, episode_trajectory_length -
                        self.training_sequence_length - 1))

                sequence = database[selected_episode][
                    sequence_start:sequence_start +
                    self.training_sequence_length +
                    1]  # get samples from database

                observation_seq = []
                action_seq = []

                # Separate observations, actions and expected observation predictions from sampled batch
                for step in sequence:
                    observation_seq.append(step[0])
                    action_seq.append(step[1])

                observations.append(observation_seq[:-1])
                actions.append(action_seq[:-1])
                predictions.append(observation_seq[-1])

            observations = np.array(observations)
            actions = np.array(actions)
            predictions = np.array(predictions)

            # Train transition model
            neural_network.sess.run(
                neural_network.train_transition_model,
                feed_dict={
                    'transition_model/transition_model_input:0':
                    np.reshape(observations, [
                        self.transition_model_sampling_size *
                        self.training_sequence_length, self.image_width,
                        self.image_width, 1
                    ]),
                    'transition_model/action_in:0':
                    np.reshape(actions, [
                        self.transition_model_sampling_size *
                        self.training_sequence_length, self.dim_a
                    ]),
                    'transition_model/transition_model_label:0':
                    np.reshape(predictions, [
                        self.transition_model_sampling_size, self.image_width,
                        self.image_width, 1
                    ]),
                    'transition_model/batch_size:0':
                    self.transition_model_sampling_size,
                    'transition_model/sequence_length:0':
                    self.training_sequence_length,
                    'transition_model/autoencoder_mode:0':
                    True
                })

    def train(self, neural_network, t, done, database):
        # Transition model training
        if ((t % self.transition_model_buffer_sampling_rate == 0 and t != 0)
                or (self.train_end_episode and done)):  # Sim pendulum: 200; mountain car: done. TODO: check whether done should be used
            self._train_model_from_database(neural_network, database)

    def get_state_representation(self, neural_network, observation):
        self._preprocess_observation(np.array(observation))

        state_representation = neural_network.sess.run(
            neural_network.state_representation,
            feed_dict={
                'transition_model/transition_model_input:0':
                self.network_input[-1],
                'transition_model/lstm_hidden_state_out:0':
                self.lstm_hidden_state,
                'transition_model/batch_size:0': 1,
                'transition_model/sequence_length:0': 1
            })

        self._refresh_image_plots(neural_network)  # refresh image plots
        self.t_counter += 1
        return state_representation

    def get_state_representation_batch(self, neural_network,
                                       observation_sequence_batch,
                                       action_sequence_batch,
                                       current_observation):
        batch_size = len(observation_sequence_batch)

        lstm_hidden_state_batch = neural_network.sess.run(
            neural_network.lstm_hidden_state,
            feed_dict={
                'transition_model/transition_model_input:0':
                np.reshape(observation_sequence_batch, [
                    batch_size * self.training_sequence_length,
                    self.image_width, self.image_width, 1
                ]),
                'transition_model/action_in:0':
                np.reshape(
                    action_sequence_batch,
                    [batch_size * self.training_sequence_length, self.dim_a]),
                'transition_model/batch_size:0':
                batch_size,
                'transition_model/sequence_length:0':
                self.training_sequence_length
            })

        state_representation_batch = neural_network.sess.run(
            neural_network.state_representation,
            feed_dict={
                'transition_model/transition_model_input:0':
                np.reshape(
                    current_observation,
                    [batch_size, self.image_width, self.image_width, 1]),
                'transition_model/lstm_hidden_state_out:0':
                lstm_hidden_state_batch,
                'transition_model/batch_size:0':
                batch_size,
                'transition_model/sequence_length:0':
                1
            })

        return state_representation_batch

    def compute_lstm_hidden_state(self, neural_network, action):
        action = np.reshape(action, [1, self.dim_a])

        self.lstm_hidden_state = neural_network.sess.run(
            neural_network.lstm_hidden_state,
            feed_dict={
                'transition_model/transition_model_input:0':
                self.network_input[-1],
                'transition_model/action_in:0': action,
                'transition_model/lstm_hidden_state_in:0':
                self.lstm_hidden_state,
                'transition_model/batch_size:0': 1,
                'transition_model/sequence_length:0': 1
            })
        self.last_actions.add(action)

    def last_step(self, action_label):
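        # Package the latest fully observed step as
        # [observation sequence, action sequence, current observation, action label],
        # the layout unpacked later by the policy batch updates; returns None until
        # the internal sequence buffers are full.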
        if self.last_states.initialized() and self.last_actions.initialized():
            return [
                self.network_input[:-1], self.last_actions.buffer[:-1],
                self.network_input[-1],
                action_label.reshape(self.dim_a)
            ]
        else:
            return None

    def new_episode(self):
        self.lstm_hidden_state = np.zeros([1, 2 * self.lstm_h_size])
        self.last_states = Buffer(min_size=self.training_sequence_length + 1,
                                  max_size=self.training_sequence_length + 1)
        self.last_actions = Buffer(min_size=self.training_sequence_length + 1,
                                   max_size=self.training_sequence_length + 1)
        self.last_actions.add(np.zeros([1, self.dim_a]))
        self.last_states.add(
            np.zeros([1, self.image_width, self.image_width, 1]))
Example #9
                action = np.random.multivariate_normal(u, cov)
        else:
            assert False
        #print "action:", action, "Q:", Q(x, np.array([action])), "V:", V(x)
        #print "action:", action, "advantage:", A(x, np.array([action]))
        #print "mu:", u, "action:", action
        #print "Q(mu):", Q(x, np.array([u])), "Q(action):", Q(x, np.array([action]))

        # take the action and record reward
        observation, reward, done, info = env.step(action)
        episode_reward += reward
        #print "reward:", reward
        #print "poststate:", observation

        # add experience to replay memory
        R.add(x[0], action, reward, observation, done)

        loss = 0
        # perform train_repeat Q-updates
        for k in xrange(args.train_repeat):
            preobs, actions, rewards, postobs, terminals = R.sample(
                args.batch_size)

            # Q-update
            v = V(postobs)
            y = rewards + args.gamma * np.squeeze(v)
            loss += model.train_on_batch([preobs, actions], y)

            # copy weights to target model, averaged by tau
            weights = model.get_weights()
            target_weights = target_model.get_weights()
Example #10
        if np.random.random() < args.exploration:
            action = env.action_space.sample()
        else:
            s = np.array([observation])
            q = model.predict_on_batch(s)
            #print "q:", q
            action = np.argmax(q[0])
            maxqs.append(np.max(q[0]))
        #print "action:", action

        prev_observation = observation
        observation, reward, done, info = env.step(action)
        #print info
        episode_reward += reward
        #print "reward:", reward
        mem.add(prev_observation, np.array([action]), reward, observation, done)

        for k in xrange(args.train_repeat):
            prestates, actions, rewards, poststates, terminals = mem.sample(args.batch_size)

            qpre = model.predict_on_batch(prestates)
            qpost = target_model.predict_on_batch(poststates)
            for i in xrange(qpre.shape[0]):
                if terminals[i]:
                    qpre[i, actions[i]] = rewards[i]
                else:
                    qpre[i, actions[i]] = rewards[i] + args.gamma * np.amax(qpost[i])
            cost = model.train_on_batch(prestates, qpre)
            costs.append(cost)
            total_train_steps += 1
Example #11
            env.render()

        if np.random.random() < args.exploration:
            action = env.action_space.sample()
        else:
            s = np.array([observation])
            q = model.predict_on_batch(s)
            #print "q:", q
            action = np.argmax(q[0])
        #print "action:", action

        prev_observation = observation
        observation, reward, done, info = env.step(action)
        episode_reward += reward
        #print "reward:", reward
        mem.add(prev_observation, np.array([action]), reward, observation, done)

        for k in xrange(args.train_repeat):
            prestates, actions, rewards, poststates, terminals = mem.sample(args.batch_size)

            qpre = model.predict_on_batch(prestates)
            qpost = target_model.predict_on_batch(poststates)
            for i in xrange(qpre.shape[0]):
                if terminals[i]:
                    qpre[i, actions[i]] = rewards[i]
                else:
                    qpre[i, actions[i]] = rewards[i] + args.gamma * np.amax(qpost[i])
            model.train_on_batch(prestates, qpre)

            weights = model.get_weights()
            target_weights = target_model.get_weights()
Example #12
            action = np.random.multivariate_normal(u, cov)
        else:
          assert False
        #print "action:", action, "Q:", Q(x, np.array([action])), "V:", V(x)
        #print "action:", action, "advantage:", A(x, np.array([action]))
        #print "mu:", u, "action:", action
        #print "Q(mu):", Q(x, np.array([u])), "Q(action):", Q(x, np.array([action]))

        # take the action and record reward
        observation, reward, done, info = env.step(action)
        episode_reward += reward
        #print "reward:", reward
        #print "poststate:", observation

        # add experience to replay memory
        R.add(x[0], action, reward, observation, done)

        loss = 0
        # perform train_repeat Q-updates
        for k in range(args.train_repeat):
          preobs, actions, rewards, postobs, terminals = R.sample(args.batch_size)

          # Q-update
          v = V(postobs)
          y = rewards + args.gamma * np.squeeze(v)
          loss += model.train_on_batch([preobs, actions], y)

          # copy weights to target model, averaged by tau
          weights = model.get_weights()
          target_weights = target_model.get_weights()
          for i in range(len(weights)):
Example #13
agent = DDPG(2, 1)
buf = Buffer(BUF_SIZE)
noise = OUStrategy(env.action_space, min_sigma=1e-4)
updates_noise = 0
for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    done = False
    total_reward = 0
    while not done:
        action = agent.act(state)
        action = noise.get_action_from_raw_action(action, updates_noise)
        updates_noise += 1
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        buf.add((state, action, reward, next_state, done))
        if len(buf) >= BATCH_SIZE:
            agent.update(buf.sample(BATCH_SIZE))
        state = next_state
    print(
        f"I did {episode}th episode. Result: {total_reward}, sigma = {noise.sigma}"
    )
# I decided to train for up to 150 episodes, although with this seed it hovers around 90, starting from episode 30.

# Output over the last 10 episodes:
# I did 139th episode. Result: 91.13059676792551, sigma = 0.17022727199999999
# I did 140th episode. Result: 90.62383628427916, sigma = 0.16973243699999999
# I did 141th episode. Result: 94.36829967370625, sigma = 0.16948352
# I did 142th episode. Result: 87.05158580519061, sigma = 0.168778755
# I did 143th episode. Result: 89.52206836735917, sigma = 0.16824493299999999
# I did 144th episode. Result: 92.20854623030216, sigma = 0.167951031
Example #14
        def train_one_update(step, epochs, tracing_on):
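            # Roll out the current policy until the buffer is full, bootstrapping
            # cut-off trajectories with the critic's value estimate, then run several
            # epochs of shuffled minibatch updates (an actor-critic scheme storing
            # log-probabilities, values, returns and advantages).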
            # initialize replay buffer
            buffer = Buffer(
                batch_size,
                minibatch_size,
                MINIMAP_RES,
                MINIMAP_RES,
                env.action_spec()[0],
            )

            # initial observation
            timestep = env.reset()
            step_type, reward, _, obs = timestep[0]
            obs = preprocess(obs)

            ep_ret = []  # episode return (score)
            ep_rew = 0

            # fill in recorded trajectories
            while True:
                tf_obs = (
                    tf.constant(each_obs, shape=(1, *each_obs.shape))
                    for each_obs in obs
                )

                val, act_id, arg_spatial, arg_nonspatial, logp_a = actor_critic.step(
                    *tf_obs
                )

                sc2act_args = translateActionToSC2(
                    arg_spatial, arg_nonspatial, MINIMAP_RES, MINIMAP_RES
                )

                act_mask = get_mask(act_id.numpy().item(), actor_critic.action_spec)
                buffer.add(
                    *obs,
                    act_id.numpy().item(),
                    sc2act_args,
                    act_mask,
                    logp_a.numpy().item(),
                    val.numpy().item()
                )
                step_type, reward, _, obs = env.step(
                    [actions.FunctionCall(act_id.numpy().item(), sc2act_args)]
                )[0]
                # print("action:{}: {} reward {}".format(act_id.numpy().item(), sc2act_args, reward))
                buffer.add_rew(reward)
                obs = preprocess(obs)

                ep_rew += reward

                if step_type == step_type.LAST or buffer.is_full():
                    if step_type == step_type.LAST:
                        buffer.finalize(0)
                    else:
                        # trajectory is cut off, bootstrap last state with estimated value
                        tf_obs = (
                            tf.constant(each_obs, shape=(1, *each_obs.shape))
                            for each_obs in obs
                        )
                        val, _, _, _, _ = actor_critic.step(*tf_obs)
                        buffer.finalize(val)

                    ep_rew += reward
                    ep_ret.append(ep_rew)
                    ep_rew = 0

                    if buffer.is_full():
                        break

                    # respawn env
                    env.render(True)
                    timestep = env.reset()
                    _, _, _, obs = timestep[0]
                    obs = preprocess(obs)

            # train in minibatches
            buffer.post_process()

            mb_loss = []
            for ep in range(epochs):
                buffer.shuffle()

                for ind in range(batch_size // minibatch_size):
                    (
                        player,
                        available_act,
                        minimap,
                        # screen,
                        act_id,
                        act_args,
                        act_mask,
                        logp,
                        val,
                        ret,
                        adv,
                    ) = buffer.minibatch(ind)

                    assert ret.shape == val.shape
                    assert logp.shape == adv.shape
                    if tracing_on:
                        tf.summary.trace_on(graph=True, profiler=False)

                    mb_loss.append(
                        actor_critic.train_step(
                            tf.constant(step, dtype=tf.int64),
                            player,
                            available_act,
                            minimap,
                            # screen,
                            act_id,
                            act_args,
                            act_mask,
                            logp,
                            val,
                            ret,
                            adv,
                        )
                    )
                    step += 1

                    if tracing_on:
                        tracing_on = False
                        with train_summary_writer.as_default():
                            tf.summary.trace_export(name="train_step", step=0)

            batch_loss = np.mean(mb_loss)

            return (
                batch_loss,
                ep_ret,
                buffer.batch_ret,
                np.asarray(buffer.batch_vals, dtype=np.float32),
            )
Example #15
def infer_on_stream(args, client, stats):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network()
    buffer = Buffer()
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold
    ### Load the model through `infer_network` ###
    infer_network.load_model(args.model, args.device, args.cpu_extension)
    net_input_shape = infer_network.get_input_shape()
    ##net_input_shape = [1, 3, 600, 600]
    net_output_name = infer_network.get_output_name()
    net_input_name = infer_network.get_input_blob_name()
    net_output_info = infer_network.get_output_info()
    log.info("network output name")
    log.info(net_output_name)
    log.info("network output info")
    log.info(net_output_info.shape)
    log.info("network input shape")
    log.info(net_input_name)
    log.info(net_input_shape)

    ### Handle the input stream ###
    iflag = False
    input_stream_arg = 0 if args.input == "cam" else args.input
    if input_stream_arg.endswith('.jpg') or input_stream_arg.endswith('.bmp'):
        iflag = True

    width = 0
    height = 0
    frame = None
    cap = None
    captureOpen = False
    ## Handle image or stream or CAM
    if iflag:
        frame = cv2.imread(input_stream_arg)
        log.info("single frame shape: %s", frame.shape)
        width = frame.shape[1]
        height = frame.shape[0]
    else:
        log.info("attempting VideoCapture for: %s", input_stream_arg)
        cap = cv2.VideoCapture(input_stream_arg)
        cap.open(args.input)
        captureOpen = True
        width = int(cap.get(3))
        height = int(cap.get(4))

    log.info("input image width: %s, height: %s", width, height)
    # stream input shape:
    input_width = 0
    input_height = 0
    total_person_count = 0
    duration = 0

    cur_request_id = 0
    next_request_id = 1
    render_time = 0
    parsing_time = 0
    waitingOnInference = False
    ### Loop until stream is over ###
    while (captureOpen or iflag or waitingOnInference):
        ### Read from the video capture ###
        flag = True
        key_pressed = None
        if not iflag:
            flag, frame = cap.read()
            if not cap.isOpened():
                captureOpen = False
            key_pressed = cv2.waitKey(60)
        if not flag:
            break
        ### Pre-process the image as needed ###
        input_height = net_input_shape[2]
        input_width = net_input_shape[3]
        p_frame = cv2.resize(frame, (net_input_shape[3], net_input_shape[2]))
        p_frame = p_frame.transpose((2, 0, 1))
        p_frame = p_frame.reshape(1, *p_frame.shape)

        ### Start asynchronous inference for specified request ###
        start_time = time()
        infer_network.exec_net(p_frame)
        waitingOnInference = True
        render_time = 0
        inf_time = 0

        ### Wait for the result ###
        if infer_network.wait() == 0:
            ### Get the results of the inference request ###
            result = infer_network.get_output()
            inf_time = time() - start_time
            ###restart clock to capture evaluate/draw time
            start_time = time()
            boxes = post_process(result, width, height, PERSON_CLASS)
            ##if len(boxes) > 1:
            ##log.info("initial boxes: %s", boxes)
            boxes = list(boxes.values())
            boxes = nms(boxes)
            buffer_avg = 0

            if (iflag):
                boxes = filter_confidence(boxes, args.prob_threshold)

            if len(boxes) > 0:
                ##we have a person in frame (maybe)
                first_prop = boxes[0]
                confidence = first_prop[4]
                buffer.add(confidence)
                buffer_avg = buffer.average()
                if confidence > args.prob_threshold:
                    if duration > 0:
                        ##this is not the first time they have been in the frame
                        ##increase duration and move along
                        duration = duration + 1
                    else:
                        ##very first time this person has entered the frame
                        ##pulse out new count
                        total_person_count = total_person_count + 1
                        duration = duration + 1
                    client.publish(
                        "person",
                        json.dumps({
                            "count": 1,
                            "total": total_person_count
                        }))
                    draw_box(frame, boxes, inf_time)
                else:
                    ##we have a person in frame, but they don't meet confidence threshold
                    if duration > 0:
                        ##we know we were tracking someone last frame
                        ##so check our rolling buffer average
                        if buffer_avg > BUFFER_AVERAGE_CUTOFF:
                            ##same person, keep counting, move along
                            duration = duration + 1
                            client.publish(
                                "person",
                                json.dumps({
                                    "count": 1,
                                    "total": total_person_count
                                }))
                            draw_box(frame, boxes, inf_time)
                        else:
                            ##log.info("NO-DRAW: c:%s, b:%s, d:%s : else:if:else", confidence, buffer_avg, duration)
                            ##no longer meet confidence or buffer avg
                            client.publish(
                                "person",
                                json.dumps({
                                    "count": 0,
                                    "total": total_person_count
                                }))
                            client.publish("person/duration",
                                           json.dumps({"duration": duration}))
                            duration = 0
                            buffer.flush()
                    else:
                        ##log.info("NO-DRAW: c:%s, b:%s, d:%s : else:else", confidence, buffer_avg, duration)
                        ##also nobody in the last frame (duration == 0)
                        client.publish(
                            "person",
                            json.dumps({
                                "count": 0,
                                "total": total_person_count
                            }))
            else:
                ##no boxes with our target class were found, make sure we didn't see one in the last frame (or so)
                buffer.add(0)
                buffer_avg = buffer.average()
                if buffer_avg > BUFFER_AVERAGE_CUTOFF:
                    ##we had someone previously, keep counting, move along
                    duration = duration + 1
                else:
                    ##nobody previously, nobody now, make sure we say so
                    client.publish(
                        "person",
                        json.dumps({
                            "count": 0,
                            "total": total_person_count
                        }))
                    if duration > 0:
                        ##we were previously tracking someone, pulse out duration before zeroing out
                        client.publish("person/duration",
                                       json.dumps({"duration": duration}))
                        duration = 0

            render_time = time() - start_time
            render_time_message = "OpenCV rendering time: {:.3f} ms".format(
                render_time * 1e3)
            cv2.putText(frame, render_time_message, (15, 45),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1)
            stats.append(dict(it=inf_time, rt=render_time))
            sys.stdout.buffer.write(frame)
            sys.stdout.flush()
        if key_pressed == 27:
            break
        if iflag and not waitingOnInference:
            iflag = False
        if infer_network.wait() == 0:
            iflag = False
            waitingOnInference = False
    if cap:
        cap.release()
        cv2.destroyAllWindows()
    client.disconnect()
Example #16
        len(video_buffer.q),
        len(clip) if clip is not None else 0,
        building,
        #face_locations
    )

    point = {
        'time': datetime.now(),
        #'face_locations': face_locations,
        'frame': frame,
        'current_weight': weight,
    }
    if building:
        clip.append(point)
    else:
        video_buffer.add(point)

    if not building and enough_diff:
        building = True
        clip = copy(video_buffer.q)
        video_buffer.clear()
    elif building and datetime.now() >= last_weight_event + timedelta(seconds=TIMEOUT):
        frames = list(clip)

        clip = None
        building = False

        print("creating clip of len", len(frames))
        print(archive.create_from_clip(frames))

    previous_weight = weight
Example #17
def sac(args):
    # set the seed if a non-default value was given
    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    env, test_env = TorchEnv(args.env_name, args.max_ep_len), TorchEnv(
        args.env_name, args.max_ep_len)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_limit = env.action_space.high[0]
    # Create actor-critic module and target networks
    ac = ActorCritic(state_dim,
                     action_dim,
                     action_limit,
                     args.hidden_size,
                     args.gamma,
                     args.alpha,
                     device=args.device)
    ac_targ = deepcopy(ac)
    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())
    # Experience buffer
    buffer = Buffer(state_dim,
                    action_dim,
                    buffer_size=args.buffer_size,
                    device=args.device)
    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=args.lr)
    q_optimizer = Adam(q_params, lr=args.lr)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q = ac.compute_loss_q(data, ac_targ)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-networks so you don't waste computational effort computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = ac.compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: we use the in-place operations "mul_" and "add_" to update the target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(args.polyak)
                p_targ.data.add_((1 - args.polyak) * p.data)

    def test_agent(deterministic=True):
        for j in range(args.num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == args.max_ep_len)):
                # Take deterministic actions at test time
                o, r, d = test_env.step(
                    ac.act(
                        torch.as_tensor(o,
                                        dtype=torch.float32).to(args.device),
                        deterministic))
                ep_ret += r
                ep_len += 1

    # Prepare for interaction with environment
    total_steps = args.steps_per_epoch * args.epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        # Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy.
        if t > args.start_steps:
            a = ac.act(torch.as_tensor(o, dtype=torch.float32).to(args.device))
        else:
            a = env.action_space.sample()
        # Step the env
        o2, r, d = env.step(a)
        if args.render_env:
            env.render()
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time horizon (that is, when it's an artificial terminal signal that isn't based on the agent's state)
        d = False if ep_len == args.max_ep_len else d
        # Store experience to replay buffer
        buffer.add(o, a, r, o2, d)
        o = o2
        # End of trajectory handling
        if d or (ep_len == args.max_ep_len):
            print("EPISODE REWARD: ", ep_ret)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= args.update_after and t % args.update_every == 0:
            batch_generator = buffer.get_train_batches(args.batch_size)
            for j in range(args.update_every):
                #my_batch = my_buffer.get_train_batches(args.batch_size).__next__()
                try:
                    batch = next(batch_generator)
                except StopIteration:
                    # the generator is exhausted: start a fresh pass over the buffer
                    batch_generator = buffer.get_train_batches(args.batch_size)
                    batch = next(batch_generator)
                update(batch)

        # End of epoch handling
        if (t + 1) % args.steps_per_epoch == 0:
            epoch = (t + 1) // args.steps_per_epoch
            # Test the performance of the deterministic version of the agent.
            test_agent()