Example 1
def load_model(filename):
    model = DQN()

    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['state_dict'])

    return model
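
For reference, the checkpoint format expected by load_model (a dict carrying a 'state_dict' key) would be produced by a save routine along these lines. This is only a sketch: it assumes DQN is a torch.nn.Module, and save_model is an illustrative name, not part of the original example.

import torch

def save_model(model, filename):
    # Write the weights under the 'state_dict' key that load_model expects.
    checkpoint = {'state_dict': model.state_dict()}
    torch.save(checkpoint, filename)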
Example 2
def test(episodes=20, agent=None, load_path=None, ifrender=False, log=False):
    if log:
        logger.configure(dir="./log/", format_strs="stdout")
    if agent is None:
        agent = DQN(num_state=16, num_action=4)
        if load_path:
            agent.load(load_path)
        else:
            agent.load()

    env = Game2048Env()
    score_list = []
    highest_list = []

    for i in range(episodes):
        state, _, done, info = env.reset()
        state = log2_shaping(state)

        start = time.time()
        while True:
            action = agent.select_action(state, deterministic=True)
            next_state, _, done, info = env.step(action)
            next_state = log2_shaping(next_state)
            state = next_state

            if ifrender:
                env.render()

            if done:
                print(env.Matrix)
                if log:
                    logger.logkv('episode number', i + 1)
                    logger.logkv('episode reward', info['score'])
                    logger.logkv('episode steps', info['steps'])
                    logger.logkv('highest', info['highest'])
                    logger.dumpkvs()
                break

        end = time.time()
        if log:
            print('episode time:{} s\n'.format(end - start))

        score_list.append(info['score'])
        highest_list.append(info['highest'])

    print('mean score:{}, mean highest:{}'.format(np.mean(score_list),
                                                  np.mean(highest_list)))
    print('max score:{}, max highest:{}'.format(np.max(score_list),
                                                 np.max(highest_list)))
    result_info = {
        'mean': np.mean(score_list),
        'max': np.max(score_list),
        'list': score_list
    }
    print(highest_list)
    return result_info
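
A possible way to call this evaluation routine (the checkpoint path below is hypothetical; it only assumes, as the code above does, that agent.load() accepts a single path argument):

# Evaluate a saved agent for 10 rendered episodes and log per-episode stats.
result_info = test(episodes=10, load_path='./save/dqn.pkl', ifrender=True, log=True)
print(result_info['mean'], result_info['max'])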
Example 3
    def _Learning_init(self):

        #tag_velK,anzenK,uv_velK=self.tag_detector.getGain()
        '''
        env = Environment(anzenK,uv_velK)
        self.QLagent = QLearningAgent(env)
        '''
        #v1
        self.env = Environment2()
        self.dqn_agent = DQN(self.env.observation_space, self.env.action_space)
        """#v2
        self.env = Environment2_z()
        self.dqn_agent = DQNv2(self.env.observation_space, self.env.action_space)
        """

        self._episode = -2
        self._episode_reward = None
        self._episode_reward_count = 0
        self.__episode_reward_store = []
        #self.__episode_reward_store.append(self._episode_reward)
        self._Learning_reset()
Example 4
    def __init__(self, model=0):
        self.movie_db = pickle.load(open('data/movie_db.pkl', 'rb'), encoding='latin1')
        self.movie_dict = pickle.load(open('data/movie_dict.pkl', 'rb'), encoding='latin1')
        self.user_goals = pickle.load(open('data/movie_user_goals.pkl', 'rb'), encoding='latin1')
        remove_empty_slots(self.movie_db)
        self.dst = DST(self.movie_db)
        self.emc = EMC(self.movie_dict)
        if model == 0:
            self.dqn = DQN(self.dst.state_size)
        elif model == 1:
            self.dqn = DDQN(self.dst.state_size)
        else:
            self.dqn = PerDQN(self.dst.state_size)
Example 5
def train_dqn(episode, net):

    if not os.path.exists('./checkpoints'):
        os.mkdir('./checkpoints')

    loss = []
    count_is_solved = 0  # ensure the solved-model checkpoint is written only once
    if net == "ddqn":
        agent = DDQN(env.action_space.n, env.observation_space.shape[0])
    elif net == "dqn_fixed":
        agent = DQN_F(env.action_space.n, env.observation_space.shape[0])
    else:
        agent = DQN(env.action_space.n, env.observation_space.shape[0])

    current_time = datetime.datetime.now().strftime("%d%m%Y-%H%M%S")
    log_dir = 'logs/' + net + '/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    for e in range(episode):
        state = env.reset()
        state = np.reshape(state, (1, agent.state_space))
        score = 0
        done = False
        i = 0
        while not done:
            action = agent.act_greedy_policy(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, (1, agent.state_space))
            score += reward
            agent.remember(state, action, reward, next_state, done)
            #env.render()
            state = next_state
            agent.experience_replay()
            i += 1
            if i % 100 == 0 and not isinstance(agent, DQN):
                agent.copy_weights()
                # if you want to try a soft_update with the DDQN, substitute with agent.soft_update_weights()
                #agent.soft_update_weights()
            if done:
                print("Episode: {}/{}, score: {}, eps: {}".format(
                    e + 1, episode, np.round(score, decimals=2),
                    np.round(agent.epsilon, decimals=2)))
                break
        loss.append(score)

        # Average score of the last 100 episodes
        is_solved = np.mean(loss[-100:])
        with summary_writer.as_default():
            tf.summary.scalar('Episode reward', score, step=e)
            tf.summary.scalar('Avg reward (last 100):', is_solved, step=e)

        print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
        if is_solved >= 200:
            print('\n Task Completed! \n')
            if count_is_solved == 0:
                count_is_solved += 1
                agent.dqn_network.model.save_weights('./checkpoints/' + net +
                                                     '_' + current_time +
                                                     '.h5')
                if isinstance(agent, DDQN) or isinstance(agent, DQN_F):
                    agent.target_network.model.save_weights('./checkpoints/' +
                                                            net + 'target' +
                                                            '_' +
                                                            current_time +
                                                            '.h5')

    return loss
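
train_dqn relies on a module-level env created elsewhere in the file; the 200-point "solved" threshold suggests a LunarLander-style task, but that is an assumption. A minimal driver could look like this sketch:

import gym
import matplotlib.pyplot as plt

# Assumption: this is the module-level `env` used inside train_dqn; LunarLander-v2
# is a guess based on the 200-point threshold, not stated in the original code.
env = gym.make('LunarLander-v2')

scores = train_dqn(episode=400, net='ddqn')
plt.plot(scores)        # per-episode scores returned by train_dqn
plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()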
Example 6
class LApriltags_ros(Apriltags_ros, object):
    contact_img_converter = True
    detected_flag = False
    frame = np.array([[0, 0], [Camera.image_size[0], Camera.image_size[1]]])
    pure_frame_sizes = []

    def __init__(self, bcc, set_learn=True):
        super(LApriltags_ros, self).__init__(callback=False)
        self.bcc = bcc
        #self.tag_detector = ApriltagsDetector()

        self._Learning_init()

        self.__detected = 0
        self.__nodetected = 0

        self.set_learn = set_learn
        self.__go_learn = False

        self.sub_tag = rospy.Subscriber('/tag_topic', AprilTagDetectionArray,
                                        self.LtagDetectedCallback)
        self.setup_time = self.begin = self.now = self.pre_time = time.time()
        self.TCR = TCR()

        self.count = 0
        self.err_count = 0
        self.detect_count = 0
        self.__continuous_detect_count = 0

        self.pure_frame_size = np.array([0, 0])
        self.__pure_frame = np.array([[0, 0], [0, 0]])

    def LtagDetectedCallback(self, msg):
        ##########################pre_measure############################
        response = self.TCR.response()
        lefttop, rightbottom = LApriltags_ros.frame
        pure_lefttop, pure_rightbottom = self.__pure_frame
        ##########################measure2############################
        """
        recoder2.count += 1
        recoder2.time = self.TCR.now()
        recoder2.response = response
        w = rightbottom[0] - lefttop[0]
        h = rightbottom[1] - lefttop[1]
        recoder2.pixel_w = w
        recoder2.pixel_h = h
        w,h =  self.pure_frame_size
        recoder2.pure_pixel_w = w
        recoder2.pure_pixel_h = h
        try:
            recoder2.pos_x = self.tag_detector.getApriltag(0).pose[0][0]
            recoder2.pos_y = self.tag_detector.getApriltag(0).pose[1][0]
            recoder2.pos_z = self.tag_detector.getApriltag(0).pose[2][0]
            recoder2.speed_x = self.tag_detector.getApriltag(0).speed[0][0]
            recoder2.speed_y = self.tag_detector.getApriltag(0).speed[1][0]
            recoder2.speed_z = self.tag_detector.getApriltag(0).speed[2][0]
            recoder2.euler_x = self.tag_detector.getApriltag(0).euler[0]
            recoder2.euler_y = self.tag_detector.getApriltag(0).euler[1]
            recoder2.euler_z = self.tag_detector.getApriltag(0).euler[2]
            recoder2.speed_eulerx = self.tag_detector.getApriltag(0).d_euler[0]
            recoder2.speed_eulery = self.tag_detector.getApriltag(0).d_euler[1]
            recoder2.speed_eulerz = self.tag_detector.getApriltag(0).d_euler[2]
        except:
            recoder2.pos_x = 0
            recoder2.pos_y = 0
            recoder2.pos_z = 0
            recoder2.speed_x = 0
            recoder2.speed_y = 0
            recoder2.speed_z = 0
            recoder2.euler_x = 0
            recoder2.euler_y = 0
            recoder2.euler_z = 0
            recoder2.speed_eulerx = 0
            recoder2.speed_eulery = 0
            recoder2.speed_eulerz = 0

        recoder2.save()
        """
        ##########################measure3############################
        recoder_rl2.time = self.TCR.between()
        recoder_rl2.response = response
        tag_velK, anzenK, uv_velK = self.tag_detector.getGain()
        recoder_rl2.tag_velK = tag_velK
        recoder_rl2.anzenK = anzenK
        recoder_rl2.uv_velK = uv_velK

        recoder_rl2.bbox_lefttop_x = lefttop[0]
        recoder_rl2.bbox_lefttop_y = lefttop[1]
        recoder_rl2.bbox_rightbottom_x = rightbottom[0]
        recoder_rl2.bbox_rightbottom_y = rightbottom[1]
        recoder_rl2.pure_bbox_lefttop_x = pure_lefttop[0]
        recoder_rl2.pure_bbox_lefttop_y = pure_lefttop[1]
        recoder_rl2.pure_bbox_rightbottom_x = pure_rightbottom[0]
        recoder_rl2.pure_bbox_rightbottom_y = pure_rightbottom[1]

        recoder_rl2.episode = self._episode
        recoder_rl2.reward = self._episode_reward

        recoder_rl2.save()

        ###########################!--measure--#########################
        if (self.bcc.need_switch_fase()):
            recoder_rl.episode = self._episode
            recoder_rl.reward = self._episode_reward
            if not self._episode_reward_count == 0:
                recoder_rl.reward_ave = self._episode_reward / self._episode_reward_count

            if self._episode_learn_count == 0:
                recoder_rl.loss = None
            else:
                recoder_rl.loss = self._episode_loss / self._episode_learn_count
            recoder_rl.learn_count = self._episode_learn_count
            recoder_rl.save()
            """
            recoder.to_csv()
            recoder2.to_csv()
            recoder.reset()
            recoder2.reset()
            """
            #print(termcolor.colored("switch",'red'))
            self.TCR.reset()

            self._Learning_reset()
            self.__go_learn = False
            self.begin = time.time()
            self.count = 0
            self.err_count = 0
            self.detect_count = 0
            self.__continuous_detect_count = 0

            #LApriltags_ros.detected_flag = False
        """
        > detect_t-1 > learning_t-1 > k_t-1 > frame_t
        >  detect_t  >  learning_t  >  k_t  > frame_t+1
        > detect_t+1 > learning_t+1 > k_t+1 > frame_t+2
        """
        ##########################learning###########################
        if (self.set_learn and self.__go_learn):
            tag_velK, anzenK, uv_velK = self.tag_detector.getGain()
            #v1
            k2,k3 = self._GainWithLearning((anzenK,uv_velK),LApriltags_ros.detected_flag,\
                    self.__pure_frame,LApriltags_ros.frame)
            """#v2
            z = self.tag_detector.getApriltag(0).pose[2][0]
            k2,k3 = self._GainWithLearning((anzenK,uv_velK,z),LApriltags_ros.detected_flag,\
                    self.__pure_frame,LApriltags_ros.frame)
            """
            self.tag_detector.setGain(anzenK=k2, uv_velK=k3)
        #######################!--learning--#########################

        ids = []
        #LApriltags_ros.pure_frame_sizes.clear()
        LApriltags_ros.pure_frame_sizes = []
        if len(msg.detections) > 0:

            self.__continuous_detect_count += 1
            LApriltags_ros.detected_flag = True

            for i in range(len(msg.detections)):
                ids.append(msg.detections[i].id[0])
                self.tag_detector.setApriltag(msg.detections[i])

                if (self.__continuous_detect_count >= 2
                        and self.TCR.between() >= 1.0):
                    self.__go_learn = True

            self.tag_detector.reset_tag_vels(ids)
            for i in range(len(msg.detections)):
                iid = msg.detections[i].id[0]
                self.pure_frame_size = self.tag_detector._getUvPureApriltagSize(
                    iid)
                LApriltags_ros.frame, self.__pure_frame = self.tag_detector.getUvApriltag(
                    msg.detections[i])
                #LApriltags_ros.pure_frame_sizes.append(self.pure_frame_size)
                #LApriltags_ros.frames.append(Apriltags_ros.frame)
            #print(termcolor.colored("detect"+str(self.count),'blue'))
        else:

            self.__continuous_detect_count = 0
            self.tag_detector.all_clear_tags()
            LApriltags_ros.detected_flag = False
            self.tag_detector.reset_tag_vels(ids)
            self.pure_frame_size = np.array([0, 0])
            self.__pure_frame = np.array([[0, 0], [0, 0]])
            LApriltags_ros.frame = np.array(
                [[0, 0], [Camera.image_size[0], Camera.image_size[1]]])
            #LApriltags_ros.pure_frame_sizes.append(self.pure_frame_size)
            #            LApriltags_ros.frames.append(Apriltags_ros.frame)
            self.err_count += 1
            #print(termcolor.colored("nondetect"+str(self.err_count),'yellow'))

        self.count += 1
        self.recode_count(LApriltags_ros.detected_flag)

        if not (LApriltags_ros.detected_flag):
            self.__go_learn = False

    def recode_count(self, detect_flag):
        recoder_rl.count = self.count
        recoder_rl2.count = self.count
        if (detect_flag):
            self.detect_count += 1
            recoder_rl.detect_count = self.detect_count
            recoder_rl2.detect_count = self.detect_count
            self.__detected += 1
            self.__nodetected = 0
        else:
            self.__detected = 0
            self.__nodetected += 1

    def _GainWithLearning(self, state, detected_flag, pure_frame, frame):
        #v1
        k2, k3 = self._Learning(state,detected_flag\
                            ,pure_frame,frame)
        """#v2
            k2, k3,_ = self._Learning(state,detected_flag\
                                ,pure_frame,frame)
            """
        return k2, k3

    def _Learning_init(self):

        #tag_velK,anzenK,uv_velK=self.tag_detector.getGain()
        '''
        env = Environment(anzenK,uv_velK)
        self.QLagent = QLearningAgent(env)
        '''
        #v1
        self.env = Environment2()
        self.dqn_agent = DQN(self.env.observation_space, self.env.action_space)
        """#v2
        self.env = Environment2_z()
        self.dqn_agent = DQNv2(self.env.observation_space, self.env.action_space)
        """

        self._episode = -2
        self._episode_reward = None
        self._episode_reward_count = 0
        self.__episode_reward_store = []
        #self.__episode_reward_store.append(self._episode_reward)
        self._Learning_reset()

    def _Learning_reset(self):
        '''
        self.QLagent.reset_episode()
        '''
        self._episode += 1

        if not self._episode_reward_count == 0:
            print(termcolor.colored("Episode: "+str(self._episode)\
                    +", reward_sum: "+str(self._episode_reward)\
                    +", reward_ave:"+str(self._episode_reward/self._episode_reward_count)\
                    ,'yellow'))
        self._episode_reward = 0.
        self._episode_reward_count = 0
        self._episode_learn_count = 0
        self._episode_loss = 0.
        s = self.env.reset()
        return s

    def _Learning(self, state, detect_flag, pure_frame, frame):
        s = state
        a = self.dqn_agent.choose_action(s)
        n_s, r, done, info = self.env.step(s, a)

        pure_lt, pure_rb = pure_frame
        lt, rb = frame  #lefttop,rightbottom
        if(lt[0] < pure_lt[0]\
            and pure_rb[0] < rb[0]\
            and lt[1] < pure_lt[1]\
            and pure_rb[1] < rb[1]\
            ):
            pure_frame_in_frame = 1.0
        else:
            pure_frame_in_frame = 0.0

        pixel = (rb[0] - lt[0]) * (rb[1] - lt[1])
        pure_pixel = (pure_rb[0] - pure_lt[0]) * (pure_rb[1] - pure_lt[1])
        #v1
        anzenK, uv_velK = n_s
        anzenK = anzenK**2
        """#v2
        anzenK,uv_velK,_ = n_s
        """
        if detect_flag:
            """
            r1 =pure_frame_in_frame*\
                   100.* (1.-((pixel-anzenK*pure_pixel)/(pixel)))
            """

            z = self.tag_detector.getApriltag(0).pose[2][0]
            if (anzenK * pure_pixel >= pixel):
                r1 = 1
            else:
                r1 = 1 * 2 / (1 + np.exp(1 * (pixel - anzenK * pure_pixel) /
                                         (anzenK * pure_pixel)))

            print(r1)
        else:
            r1 = 0.

        r = r1  # + (r2 + r3)
        if done:
            self.env.reset()
            r = -10.0

            r2 = -2.
            r3 = -2
        else:
            minth, maxth = self.env._k1_threshold
            r2 = (maxth - anzenK) / (maxth - minth)
            minth, maxth = self.env._k2_threshold
            r3 = (maxth - uv_velK) / (maxth - minth)

        #if not detect_flag : r=-0.1
        self.dqn_agent.store_transition(s, a, r, n_s)
        self._episode_reward += r
        self._episode_reward_count += 1

        if (self.dqn_agent.memory_counter > self.dqn_agent._MEMORY_CAPACITY):
            self._episode_loss += self.dqn_agent.learn()
            self._episode_learn_count += 1
            if done:
                #print("Ep : ",self._episode," r: ",self._episode_reward)
                ppp = 0
        return n_s
        """
        if(self.__detected >= count or self.__nodetected==1):
            return self.QLagent.learn(detect_flag\
                    , pure_pixel, pixel) #anzenk, uv_velk
        else:
            return [0,0]
        """

    def plot_reward(self):
        plt.figure(1)
        plt.clf()
        sum_reward = torch.tensor(self.__episode_reward_store,
                                  dtype=torch.float)
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('reward')
        plt.plot(sum_reward.numpy())
        if len(sum_reward) > 100:
            # unfold expects integer dim/size/step arguments
            means = sum_reward.unfold(0, 100, 1).mean(1).view(-1)
            means = torch.cat((torch.zeros(99), means))
            plt.plot(means.numpy())
        plt.pause(0.01)

        if self.is_ipython:
            display.clear_output(wait=True)
            display.display(plt.gcf())
Example 7
from flask import Flask, request, render_template, redirect, url_for, abort, jsonify, make_response
import os
import numpy as np
from gym_2048 import Game2048Env
from dqn_agent import DQN
from utils import log2_shaping

env = Game2048Env()

app = Flask(__name__, static_folder="static", static_url_path="/static")

agent = DQN(num_state=16, num_action=4)
agent.load(path='./save/', name='dqn_9017.pkl')

# @app.route('/1')
# def hello_world():
#     with open('./templates/index.html', encoding='utf-8')as f:
#         text = f.read()
#     return text


@app.route('/')
def hello_world2():
    with open('./templates/py2048.html', encoding='utf-8') as f:
        text = f.read()
    return text


@app.route('/move', methods=['POST'])
def tile_move():
    json = request.get_json()
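    # NOTE: the original excerpt stops at request.get_json(). The lines below are a
    # hypothetical continuation; the 'board' field name, the 4x4 reshape, and the
    # response format are assumptions, not part of the original handler.
    board = np.array(json['board']).reshape(4, 4)            # assumed 4x4 grid payload
    state = log2_shaping(board)                              # same preprocessing as training
    action = agent.select_action(state, deterministic=True)
    return jsonify({'action': int(action)})                  # direction chosen by the agent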
Example 8
    to before, the action it took, the reward it got and the new state it reached
    at that time
    """
    def __init__(self, size):
        self.size = size
        self.memory = deque(maxlen=self.size)

    def update(self, SARS):
        self.memory.append(SARS)

    def sample(self, batch_size):
        return zip(*random.sample(self.memory, batch_size))


r_memory = ReplayMemory(memory_size)
agent = DQN(12, 12, 16)
target = DQN(12, 12, 16)
target.load_state_dict(agent.state_dict())
optimizer = Adam(agent.parameters())


def update_target():
    if len(r_memory.memory) < batch_size:
        return
    observation, action, reward, observation_next, done = r_memory.sample(
        batch_size)
    observations = torch.cat(observation)
    observation_next = torch.cat(observation_next)
    actions = index_action(torch.LongTensor(action))
    rewards = torch.LongTensor(reward)
    done = torch.FloatTensor(done)
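    # NOTE: the original excerpt ends here. A standard DQN update built from these
    # tensors would continue roughly as below; the discount factor and the use of
    # gather with `actions` are assumptions about the surrounding code.
    gamma = 0.99                                             # assumed discount factor
    q_values = agent(observations).gather(1, actions.view(-1, 1)).squeeze(1)
    with torch.no_grad():
        q_next = target(observation_next).max(1)[0]          # target-network bootstrap
        q_target = rewards.float() + gamma * q_next * (1 - done)
    loss = ((q_values - q_target) ** 2).mean()               # MSE on the TD error
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()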
Example 9
def train():
    episodes = train_episodes
    logger.configure(dir="./log/", format_strs="stdout,tensorboard,log")
    agent = DQN(num_state=16, num_action=4)
    env = Game2048Env()

    pf_saver = Perfomance_Saver()
    model_saver = Model_Saver(num=10)

    eval_max_score = 0
    for i in range(episodes):
        state, reward, done, info = env.reset()
        state = log2_shaping(state)

        start = time.time()
        loss = None
        while True:
            if agent.buffer.memory_counter <= agent.memory_capacity:
                action = agent.select_action(state, random=True)
            else:
                action = agent.select_action(state)

            next_state, reward, done, info = env.step(action)
            next_state = log2_shaping(next_state)
            reward = log2_shaping(reward, divide=1)

            agent.store_transition(state, action, reward, next_state)
            state = next_state

            if ifrender:
                env.render()

            if agent.buffer.memory_counter % agent.train_interval == 0 and agent.buffer.memory_counter > agent.memory_capacity:  # i.e. start updating only once the buffer is full
                loss = agent.update()

            if done:
                if i % log_interval == 0:
                    if loss:
                        logger.logkv('loss', loss)
                    logger.logkv('training progress', (i+1) / episodes)
                    logger.logkv('episode reward', info['score'])
                    logger.logkv('episode steps', info['steps'])
                    logger.logkv('highest', info['highest'])
                    logger.logkv('epsilon', agent.epsilon)
                    logger.dumpkvs()

                    loss = None

                if i % epsilon_decay_interval == 0:   # epsilon decay
                    agent.epsilon_decay(i, episodes)
                break
        
        end = time.time()
        print('episode time:{} s\n'.format(end - start))

        # eval 
        if i % eval_interval == 0 and i:
            eval_info = test(episodes=test_episodes, agent=agent)
            average_score, max_score, score_lis = eval_info['mean'], eval_info['max'], eval_info['list']

            pf_saver.save(score_lis, info=f'episode:{i}')

            if int(average_score) > eval_max_score:
                eval_max_score = int(average_score)
                name = 'dqn_{}.pkl'.format(int(eval_max_score))
                agent.save(name=name)
                model_saver.save("./save/" + name)

            logger.logkv('eval average score', average_score)
            logger.logkv('eval max score', max_score)
            logger.dumpkvs()
Example 10
    loss = criterion(preds, Y_batch)
    loss.backward()
    optimizer.step()

    return loss.item()  # scalar loss value (loss.data[0] is the pre-0.4 PyTorch idiom)


def save_checkpoint(state, filename):
    torch.save(state, filename)


#def load_model():
#torch.load(

if __name__ == '__main__':
    model = DQN()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    # Load data
    data = np.load('training_data.npy')
    X_train = np.stack(data[:, 0], axis=0).reshape(
        (len(data), 1, len(data[0][0]), len(data[0][0][0])))
    Y_train = data[:, 3]

    # Training loop
    for epoch in range(100):
        # Randomize and batch training data
        batchsize = 8
        # Randomly shuffle each epoch (permute inputs and labels together so they stay aligned)
        perm = np.random.permutation(len(X_train))
        X_train, Y_train = X_train[perm], Y_train[perm]
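        # NOTE: the original excerpt ends at the shuffle above. A hypothetical
        # continuation, mirroring the training-step fragment at the top of this
        # example: iterate over mini-batches and run one optimizer step per batch.
        # The slicing and dtype conversions are assumptions about the data layout.
        for b in range(0, len(X_train), batchsize):
            X_batch = torch.from_numpy(X_train[b:b + batchsize]).float()
            Y_batch = torch.from_numpy(Y_train[b:b + batchsize].astype(np.int64))
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, Y_batch)
            loss.backward()
            optimizer.step()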