Example #1
    def _random_turn(self):
        """ Apply a random turn and pitch, then stop motion. """
        turn = numpy.random.random() * random.choice([-1, 1])
        pitch = numpy.random.random() * random.choice([-0.5, 0.5])
        self.act(["turn {0}".format(turn)])
        self.act(["pitch {0}".format(pitch)])
        time.sleep(0.5)
        stop_motion(self.mc)
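
Every example below pairs a short timed action burst with a call to stop_motion(mc). That helper is not shown in this section; a minimal sketch, assuming a Malmo-style agent host whose sendCommand accepts continuous-movement command strings (only the helper name and sendCommand come from the examples, the command list is an assumption):

def stop_motion(mc):
    # zero every continuous-motion channel so the agent holds still
    # between decision steps (assumed command set)
    for cmd in ("move 0", "turn 0", "pitch 0", "strafe 0",
                "jump 0", "attack 0"):
        mc.sendCommand(cmd)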
Example #2
    def run_episode(self):
        """ Deep Q-Learning episode
        """
        self.agent.clear_state()
        mc = self.mc
        # apply random turn and pitch
        self._random_turn()
        logging.debug('memory: %i', self.agent.memory.position)
        self.agent.train()

        max_t = 60
        eps_start = self.eps
        eps_end = 0.05
        eps_decay = 0.99

        eps = eps_start

        total_reward = 0

        t = 0

        # pitch, yaw, xpos, ypos, zpos
        prev_pos = None
        prev_target_dist = None
        prev_life = 20
        solved = False

        mean_loss = numpy.mean(
            [self.learn(self.agent, self.optimizer) for _ in range(5)])
        logging.info('loss %f', mean_loss)
        while True:
            t += 1
            reward = 0
            try:
                data = self.collect_state()
                new_pos = data['position']
                target = self.is_tree_visible()
            except DeadException:
                stop_motion(mc)
                # should not die in this mission
                # so don't add this event to the replay buffer
                reward = 0
                logging.warning("died at step %i", t)
                break
            if prev_pos is None:
                prev_pos = new_pos
            else:
                # use only dist change for now
                life = mc.getLife()
                logging.debug('current life %f', life)
                if life == 0:
                    # should not die in this mission
                    continue
                reward += (life - prev_life) * 2
                prev_life = life
                if target is not None:
                    if target[0] == 'log':
                        reward = 100
                        self.agent.push_final(reward)
                        logging.debug('solved in %i steps', t)
                        mc.sendCommand("quit")
                        solved = True
                        break
                    elif target[0] == 'leaves':
                        reward = -0.05
                if reward == 0:
                    reward -= 1
            data['prev_pos'] = prev_pos
            logging.debug("current reward %f", reward)
            new_actions = self.agent(data, reward=reward, epsilon=eps)
            eps = max(eps * eps_decay, eps_end)
            logging.debug('epsilon %f', eps)
            self.act(new_actions)
            time.sleep(0.4)
            stop_motion(mc)
            time.sleep(0.1)
            prev_pos = new_pos
            if t == max_t:
                logging.debug("too long")
                stop_motion(mc)
                reward = -10
                self.agent.push_final(-10)
                self.mc.sendCommand("quit")
                self.learn(self.agent, self.optimizer)
                break
            total_reward += reward
        # the terminal-step reward is not added inside the loop because of the break
        total_reward += reward
        logging.info("Final reward: %d" % reward)

        return total_reward, t, solved
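
Example #2 warms up by averaging the loss over five calls to self.learn(self.agent, self.optimizer). The learning step itself is outside this section; a minimal one-step DQN update consistent with that call signature might look like the sketch below. Only policy_net and memory appear in the examples; target_net, memory.sample, the batch size, and the discount factor are assumptions.

import torch
import torch.nn.functional as F

def learn(agent, optimizer, batch_size=32, gamma=0.99):
    # sample a batch of transitions from the agent's replay memory and take
    # one gradient step on the Huber loss (assumed layout: states, actions,
    # next_states, rewards, done flags)
    if len(agent.memory) < batch_size:
        return 0.0
    states, actions, next_states, rewards, done = agent.memory.sample(batch_size)
    q = agent.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        q_next = agent.target_net(next_states).max(1).values
        target = rewards + gamma * q_next * (1 - done)
    loss = F.smooth_l1_loss(q, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()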
Example #3
    def run_episode(self):
        """ Deep Q-Learning episode
        """
        self.agent.clear_state()
        max_t = 250
        self._random_turn()

        mc = self.mc
        logging.debug('memory: %i', self.agent.memory.position)
        self.agent.train()

        eps_start = self.eps
        eps_end = 0.05
        eps_decay = 0.999

        eps = eps_start

        total_reward = 0

        t = 0

        state = self.collect_state()
        if state['ypos'] < 0:
            raise DeadException('started in a pit')
        # pitch, yaw, xpos, ypos, zpos
        prev_pos = None
        prev_target_dist = None
        prev_life = 20
        solved = False

        while True:
            t += 1
            logging.debug('\n\n\nstep %i', t)
            # target = search4blocks(mc, ['lapis_block'], run=False)
            reward = 0
            try:
                data = self.collect_state()
            except DeadException:
                stop_motion(mc)
                self.agent.push_final(-100)
                reward = -100
                logging.debug("failed at step %i", t)
                self.learn(self.agent, self.optimizer)
                break
            if self.state_queue:
                life = mc.getLife()
                logging.debug('current life %f', life)
                if life == 0:
                    reward = -100
                    stop_motion(mc)
                    if t > 2:
                        self.agent.push_final(reward)
                    self.learn(self.agent, self.optimizer)
                    break
                if 'visible' in self.state_queue[-1]:
                    prev_action = self.agent.policy_net.actions[0].to_string(
                        self.state_queue[-1]['action'])
                    prev_item = self.state_queue[-1]['visible']
                    logging.debug(prev_item)
                    prev_dist = prev_item[DIST]
                    prev_block = prev_item[BLOCK_TYPE]
                    if prev_block in ('water', 'lava', 'flowing_lava'):
                        reward -= 2
                    if prev_action == 'attack 1':
                        h_target = 24
                        if prev_dist <= 4:
                            reward += 0.5
                        else:
                            reward -= 1
                        if 'visible' in data:
                            current_dist = data['visible'][-1]
                            # if block is removed visible would change
                            if (0.1 < (current_dist - prev_dist)):
                                logging.debug('distance is more than before!')
                                reward += 1
                                if prev_block in ('double_plant', 'tallgrass'):
                                    reward -= 0.5
                                tmp = ((30 - h_target) -
                                       abs(prev_item[HEIGHT] - h_target))
                                logging.info('tmp dist %f', tmp)
                                tmp = max(tmp, 0)**2
                                if h_target < prev_item[HEIGHT]:
                                    tmp /= 3
                                reward += tmp
                                if prev_block not in ('dirt', 'grass', 'stone',
                                                      'double_plant',
                                                      'tallgrass', 'leaves',
                                                      'log'):
                                    reward += 25
                            else:
                                # give small reward for removing block under self
                                prev_height = 30 - self.state_queue[-1]['ypos']
                                curr_height = 30 - data['ypos']
                                if curr_height < prev_height:
                                    tmp = ((30 - h_target) -
                                           abs(prev_height - h_target))
                                    logging.debug('removed block!')
                                    logging.debug('tmp dist %f', tmp)
                                    reward += max(tmp, 0)**2 / 2
                else:
                    if 'visible' not in data:
                        logging.debug('not visible')
                        reward -= 1
                reward -= 1
                reward += (life - prev_life) * 2
                prev_life = life
                if not mc.is_mission_running():
                    logging.debug('failed in %i steps', t)
                    reward = -100
            logging.debug("current reward %f", reward)
            new_actions = self.agent(data, reward=reward, epsilon=eps)
            eps = max(eps * eps_decay, eps_end)
            logging.debug('epsilon %f', eps)
            data['action'] = self.agent.prev_action
            self.state_queue.append(copy.copy(data))
            if 'visible' in data:
                data.pop('visible')
            self.act(new_actions)
            time.sleep(0.4)
            stop_motion(mc)
            time.sleep(0.1)
            if t == max_t:
                logging.debug("too long")
                stop_motion(mc)
                self.agent.push_final(reward)
                self.mc.sendCommand("quit")
                self.learn(self.agent, self.optimizer)
                break
            total_reward += reward
        # the terminal-step reward is not added inside the loop because of the break
        total_reward += reward

        aPos = self.mc.getAgentPos()
        if aPos is not None and aPos[1] <= 25:
            solved = True
        logging.debug("Final reward: %f", reward)
        self._end()
        return total_reward, t, solved
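
The digging reward in Example #3 is shaped around a target depth h_target = 24: a removed block earns a squared bonus that grows as its height approaches the target and is reduced for blocks above it. A standalone restatement of that shaping term (the constants 30 and 24 and the divide-by-3 rule come from the code; the function wrapper itself is illustrative):

def height_bonus(block_height, h_target=24, ceiling=30):
    # bonus peaks when the removed block sits at the target depth
    tmp = (ceiling - h_target) - abs(block_height - h_target)
    tmp = max(tmp, 0) ** 2
    if h_target < block_height:
        # blocks above the target depth count for less
        tmp /= 3
    return tmp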
Example #4
    def run_episode(self):
        """ Deep Q-Learning episode
        """
        self.agent.clear_state()
        mc = self.mc
        logging.debug('memory: %i', self.agent.memory.position)
        self.agent.train()

        max_t = 80
        eps_start = self.eps
        eps_end = 0.05
        eps_decay = 0.99

        eps = eps_start

        total_reward = 0

        t = 0

        # pitch, yaw, xpos, ypos, zpos
        prev_pos = None
        prev_target_dist = None
        prev_life = 20
        solved = False

        mean_loss = numpy.mean(
            [self.learn(self.agent, self.optimizer) for _ in range(5)])
        logging.info('loss %f', mean_loss)
        while True:
            t += 1
            # target = search4blocks(mc, ['lapis_block'], run=False)
            reward = 0
            try:
                data = self.collect_state()
                target_enc = data['target']
                new_pos = data['pos']
            except DeadException:
                stop_motion(mc)
                self.agent.push_final(-100)
                reward = -100
                logging.debug("failed at step %i", t)
                self.learn(self.agent, self.optimizer)
                break
            if prev_pos is None:
                prev_pos = new_pos
            else:
                # use only dist change for now
                life = mc.getLife()
                logging.debug('current life %f', life)
                if life == 0:
                    reward = -100
                    stop_motion(mc)
                    self.agent.push_final(reward)
                    self.learn(self.agent, self.optimizer)
                    break
                reward += (prev_target_dist -
                           target_enc)[2] + (life - prev_life) * 2
                prev_life = life
                grid = mc.getNearGrid()
                if target_enc[2] < 0.53:
                    reward = 100
                    self.agent.push_final(reward)
                    logging.debug('solved in %i steps', t)
                    mc.sendCommand("quit")
                    solved = True
                    break
                if not mc.is_mission_running():
                    logging.debug('failed in %i steps', t)
                    reward = -100
                    self.agent.push_final(reward)
                    break
                if reward == 0:
                    reward -= 2
            logging.debug("current reward %f", reward)
            new_actions = self.agent(data, reward=reward, epsilon=eps)
            eps = max(eps * eps_decay, eps_end)
            logging.debug('epsilon %f', eps)
            self.act(new_actions)
            time.sleep(0.4)
            stop_motion(mc)
            time.sleep(0.1)
            prev_pos = new_pos
            prev_target_dist = target_enc
            if t == max_t:
                logging.debug("too long")
                stop_motion(mc)
                reward = -10
                self.agent.push_final(-10)
                self.mc.sendCommand("quit")
                self.learn(self.agent, self.optimizer)
                break
            total_reward += reward
        # the terminal-step reward is not added inside the loop because of the break
        total_reward += reward
        logging.debug("Final reward: %d" % reward)

        return total_reward, t, solved
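
Every terminal branch in these episodes calls agent.push_final(reward) before quitting the mission, which suggests the final transition is stored with no successor state so its Q-target reduces to the reward alone. A minimal sketch of that pattern, assuming a ring-buffer replay memory (only memory.position, prev_action, and push_final appear in the examples; everything else is an assumption):

import torch

class ReplayMemory:
    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.buffer = []
        self.position = 0          # logged as agent.memory.position above

    def push(self, *transition):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

def push_final(agent, reward):
    # terminal transition: next state is None, so the target is the reward
    agent.memory.push(agent.prev_state, agent.prev_action, None,
                      torch.as_tensor(float(reward)))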
Example #5
    def run_episode(self):
        """ Deep Q-Learning episode
        """
        self.agent.clear_state()
        start = self._start()
        end = self.end
        max_t = 3000
        self._random_turn()

        logging.info('current target (%i, %i)', self.target_x, self.target_y)

        mc = self.mc
        logging.debug('memory: %i', self.agent.memory.position)
        self.agent.train()

        logging.info('max steps %i', max_t)
        eps_start = self.eps
        eps_end = 0.05
        eps_decay = 0.999

        eps = eps_start

        total_reward = 0

        t = 0

        # pitch, yaw, xpos, ypos, zpos
        prev_pos = None
        prev_target_dist = None
        prev_life = 20
        solved = False

        while True:
            t += 1
            # target = search4blocks(mc, ['lapis_block'], run=False)
            reward = 0
            try:
                data = self.collect_state()
            except DeadException:
                stop_motion(mc)
                self.agent.push_final(-100)
                reward = -100
                logging.debug("failed at step %i", t)
                self.learn(self.agent, self.optimizer)
                break
            if self.state_queue:
                life = mc.getLife()
                logging.debug('current life %f', life)
                if life == 0:
                    reward = -100
                    stop_motion(mc)
                    if t > 2:
                        self.agent.push_final(reward)
                    self.learn(self.agent, self.optimizer)
                    break
                logging.debug('distance %f', data['dist'])
                prev_target_dist = self.state_queue[-1]['dist']
                dist_diff = (prev_target_dist - data['dist'])
                if dist_diff > 1:
                    reward += 1
                if dist_diff < 0:
                    reward -= 1
                reward += dist_diff + (life - prev_life) * 2
                prev_life = life
                grid = mc.getNearGrid()
                if not mc.is_mission_running():
                    logging.debug('failed in %i steps', t)
                    reward = -100
                if data['dist'] < 0.88:
                    time.sleep(1)
                    mc.observeProc()
                    life = mc.getLife()
                    mc.sendCommand("quit")
                    if life == prev_life:
                        reward += 25
                        self.agent.push_final(reward)
                    logging.debug('solved in %i steps', t)
                    solved = True
                    break
                if reward == 0:
                    reward -= 0.5
                if 'visible' in data:
                    d = data['visible'][-1]
                    if d < 1:
                        logging.debug('visible {0}'.format(d))
                        reward -= 3
            logging.debug("current reward %f", reward)
            new_actions = self.agent(data, reward=reward, epsilon=eps)
            eps = max(eps * eps_decay, eps_end)
            logging.debug('epsilon %f', eps)
            data['action'] = self.agent.prev_action
            if 'visible' in data:
                data.pop('visible')
            self.state_queue.append(data)
            self.act(new_actions)
            time.sleep(0.4)
            stop_motion(mc)
            time.sleep(0.1)
            if t == max_t or total_reward < -200:
                reward -= 1
                logging.debug("too long")
                stop_motion(mc)
                self.agent.push_final(reward)
                self.mc.sendCommand("quit")
                self.learn(self.agent, self.optimizer)
                break
            total_reward += reward
        # the terminal-step reward is not added inside the loop because of the break
        total_reward += reward
        logging.debug("Final reward: %f", reward)
        self._end(start, end, solved, t, total_reward)
        return total_reward, t, solved
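
Example #5 runs for up to max_t = 3000 steps with a slower decay (eps_decay = 0.999); since 0.999 ** 3000 is roughly 0.05, epsilon reaches the eps_end floor only near the step cap when eps_start is 1.0. A standalone sketch of the same per-step schedule (the loop is illustrative, the constants match the example):

def epsilon_schedule(eps_start=1.0, eps_end=0.05, eps_decay=0.999, steps=3000):
    # multiplicative decay clamped at the floor, as in the episode loop
    eps = eps_start
    trace = []
    for _ in range(steps):
        trace.append(eps)
        eps = max(eps * eps_decay, eps_end)
    return trace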
Example #6
    def run_episode(self):
        """ Deep Q-Learning episode
        """
        self.agent.clear_state()
        mc = self.mc
        # apply random turn and pitch
        self._random_turn()
        logging.debug('memory: %i', self.agent.memory.position)
        self.agent.train()

        max_t = 80
        eps_start = self.eps
        eps_end = 0.05
        eps_decay = 0.99

        eps = eps_start

        total_reward = 0

        t = 0

        # pitch, yaw, xpos, ypos, zpos
        prev_pos = None
        prev_target_dist = None
        prev_life = 20
        solved = False

        while True:
            t += 1
            reward = -0.2
            try:
                data = self.collect_state()
                target = self.is_tree_visible()
            except DeadException:
                stop_motion(mc)
                # should not die in this mission
                # so don't add this event to the replay buffer
                reward = 0
                logging.warning("died at step %i", t)
                break
            if self.state_queue:
                # use only dist change for now
                life = mc.getLife()
                logging.debug('current life %f', life)
                if life == 0:
                    # should not die in this mission
                    break
                solved, reward = self.is_solved(target)
                if solved:
                    logging.info('solved in %i steps', t)
                    self.agent.push_final(reward)
                    self.mc.sendCommand("quit")
                    break
                else:
                    r = reward
                    if reward > 0:
                        for item in self.state_queue:
                            if 'reward' in item and item['reward'] > 0:
                                r += item['reward']
                        if r >= 3:
                            solved = True
                            logging.info('solved in %i steps', t)
                            reward = 45
                            self.agent.push_final(reward)
                            self.mc.sendCommand("quit")
                            break
                data['reward'] = torch.as_tensor(reward)
                #if 'stop' in new_actions:
                #    # either it solved, or the tree is blocked
                #    if not solved:
                #        mc.sendCommand('move 1')
                #        time.sleep(5)
                #        stop_motion(mc)
                #        self.collect_state()
                #        target = self.is_tree_visible()
                #        solved, reward = self.is_solved(target)
                #    if solved:
                #        logging.debug('solved in %i steps', t)
                #        reward += 5
                #    else:
                #        logging.debug('actually not solved!')
                #        reward -= 2
                #    self.mc.sendCommand("quit")
                #    self.agent.push_final(reward)
                #    break
                #elif solved:
                #    logging.debug('solved but not signaling about that')
                #    reward -= 2
                #    self.mc.sendCommand("quit")
                #    self.agent.push_final(reward)
                #    break
            logging.debug('reward %f', reward)
            new_actions = self.agent(data, reward=reward, epsilon=eps)
            data['action'] = self.agent.prev_action
            eps = max(eps * eps_decay, eps_end)
            logging.debug('epsilon %f', eps)
            self.state_queue.append(data)
            if t == max_t and reward <= 0:
                reward -= 1
                logging.debug("too long")
                stop_motion(mc)
                self.agent.push_final(reward)
                self.mc.sendCommand("quit")
                self.learn(self.agent, self.optimizer)
                break
            self.act(new_actions)
            time.sleep(0.4)
            stop_motion(mc)
            time.sleep(0.1)

            total_reward += reward
        # the terminal-step reward is not added inside the loop because of the break
        total_reward += reward
        logging.info("Final reward: %f" % reward)

        mean_loss = numpy.mean([self.learn(self.agent, self.optimizer) for _ in range(3)])
        logging.info('loss %f', mean_loss)
        return total_reward, t, solved
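
Example #6 records each step's reward in state_queue and declares the episode solved once the accumulated positive reward reaches 3 (the semantics of those positive rewards come from is_solved, which is outside this section). The same check as a standalone helper (the threshold comes from the code; the helper name and signature are assumptions):

def accumulated_success(state_queue, latest_reward, threshold=3):
    # sum all positive per-step rewards recorded so far plus the latest one
    total = latest_reward
    for item in state_queue:
        if 'reward' in item and item['reward'] > 0:
            total += item['reward']
    return total >= threshold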
Example #7
    def run_episode(self):
        """ Deep Q-Learning episode
        """
        from_queue = False
        self.agent.clear_state()

        if random.random() < 0.4 and (self.failed_queue or self.episode_stats):
            self.mc.sendCommand("quit")
            time.sleep(1)
            if self.failed_queue:
                from_queue = True
                start, end = self.failed_queue.pop()
                x, y = end
                start_x = x + random.choice(numpy.arange(-20, 20))
                start_y = y + random.choice(numpy.arange(-20, 20))
                logging.info('evaluating from queue {0}, {1}'.format(
                    start, end))
            else:
                pairs = list(self.episode_stats.items())
                r = numpy.asarray([p[1][0] for p in pairs])
                idx = inverse_priority_sample(r)
                logging.debug('priority sample idx=%i', idx)
                start, end = pairs[idx][0]
                logging.info('evaluating from stats {0}, {1}'.format(
                    start, end))
                start_x, start_y = start
            # start somewhere near end
            self.mc = self.init_mission(0,
                                        self.mc,
                                        start_x=start_x,
                                        start_y=start_y)
            self.mc.safeStart()
            self.target_x, self.target_y = end
        else:
            self.mc.observeProc()
            aPos = self.mc.getAgentPos()
            while aPos is None:
                time.sleep(0.05)
                self.mc.observeProc()
                aPos = self.mc.getAgentPos()

            XPos, _, YPos = aPos[:3]
            self.target_x = XPos + random.choice(
                numpy.arange(-self.dist, self.dist))
            self.target_y = YPos + random.choice(
                numpy.arange(-self.dist, self.dist))

            start = (XPos, YPos)
            end = self.target_x, self.target_y
        logging.info('current target (%i, %i)', self.target_x, self.target_y)

        mc = self.mc
        logging.debug('memory: %i', self.agent.memory.position)
        self.agent.train()

        max_t = self.dist * 4
        eps_start = self.eps
        eps_end = 0.05
        eps_decay = 0.9999

        eps = eps_start

        total_reward = 0

        t = 0

        # pitch, yaw, xpos, ypos, zpos
        prev_pos = None
        prev_target_dist = None
        prev_life = 20
        solved = False

        mean_loss = numpy.mean(
            [self.learn(self.agent, self.optimizer) for _ in range(10)])
        logging.info('loss %f', mean_loss)
        while True:
            t += 1
            # target = search4blocks(mc, ['lapis_block'], run=False)
            reward = 0
            try:
                data = self.collect_state()
                target_enc = data['target']
                new_pos = data['pos']
            except DeadException:
                stop_motion(mc)
                self.agent.push_final(-100)
                reward = -100
                logging.debug("failed at step %i", t)
                self.learn(self.agent, self.optimizer)
                break
            if prev_pos is None:
                prev_pos = new_pos
            else:
                # use only dist change for now
                life = mc.getLife()
                logging.debug('current life %f', life)
                if life == 0:
                    reward = -100
                    stop_motion(mc)
                    if t > 2:
                        self.agent.push_final(reward)
                    self.learn(self.agent, self.optimizer)
                    break
                logging.debug('distance %f', target_enc[2])
                reward += (prev_target_dist -
                           target_enc)[2] + (life - prev_life) * 2
                prev_life = life
                grid = mc.getNearGrid()
                if target_enc[2] < 0.58:
                    time.sleep(1)
                    mc.observeProc()
                    life = mc.getLife()
                    mc.sendCommand("quit")
                    if life == prev_life:
                        self.agent.push_final(reward)
                    logging.debug('solved in %i steps', t)
                    solved = True
                    break
                if not mc.is_mission_running():
                    logging.debug('failed in %i steps', t)
                    reward = -100
            if reward == 0:
                reward -= 0.5
            logging.debug("current reward %f", reward)
            data['prev_pos'] = prev_pos
            new_actions = self.agent(data, reward=reward, epsilon=eps)
            eps = max(eps * eps_decay, eps_end)
            logging.debug('epsilon %f', eps)
            self.act(new_actions)
            time.sleep(0.4)
            stop_motion(mc)
            time.sleep(0.1)
            prev_pos = new_pos
            prev_target_dist = target_enc
            if t == max_t:
                logging.debug("too long")
                stop_motion(mc)
                self.agent.push_final(reward)
                self.mc.sendCommand("quit")
                self.learn(self.agent, self.optimizer)
                break
            total_reward += reward
        # the terminal-step reward is not added inside the loop because of the break
        total_reward += reward
        logging.debug("Final reward: %f", reward)

        if from_queue:
            """
            If episode failed check length, if 15 < t
                start from a different point near the target
                if success add to episode_stats
            """
            # failed
            if not solved:
                pass
            else:
                logging.info('from queue run succeeded')
                self.episode_stats[(start, end)] = [0, 0]
        else:
            if (start, end) in self.episode_stats:
                r, l = self.episode_stats[(start, end)]
                r = iterative_avg(r, total_reward)
                l = iterative_avg(l, t)
                self.episode_stats[(start, end)] = (r, l)
                logging.info('new episode stats reward {0} length {1}'.format(
                    r, l))
            elif 10 < t and not solved:
                self.failed_queue.append((start, end))
                logging.info('adding to failed queue {0}, {1}'.format(
                    start, end))
        return total_reward, t, solved
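
Example #7 relies on two small utilities that are not shown here: iterative_avg, used to smooth the per-pair reward and length statistics, and inverse_priority_sample, used to pick a past (start, end) pair with a bias toward poorly performing ones. Plausible sketches consistent with how they are called (both bodies, including the smoothing factor and the weighting scheme, are assumptions):

import numpy

def iterative_avg(old, new, alpha=0.1):
    # exponential moving average of the per-episode statistic
    return old + alpha * (new - old)

def inverse_priority_sample(rewards):
    # sample an index with probability inversely related to reward,
    # so low-reward (hard) start/end pairs are revisited more often
    r = numpy.asarray(rewards, dtype=float)
    weights = r.max() - r + 1e-3
    p = weights / weights.sum()
    return int(numpy.random.choice(len(r), p=p))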