Example #1
class DN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
#	Initialization of Chainer 1.1.0 or older.
#        print "CUDA init"
#        cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions, initialW=np.zeros((self.num_of_actions, 256),
                                               dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias = True)
        ).to_gpu()
        
        if args.resumemodel:
            # load saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from resume.model"
        

        self.model_target = copy.deepcopy(self.model)

        print "Initizlizing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'],
                      npz_tmp1['D1'],
                      npz_tmp1['D2'],
                      npz_tmp2['D3'],
                      npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded stored all D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value
        # Generate Target Signals
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmaxQ(s',a)
        tmp = self.Q_func_target(s_dash)  # Q'(s',*)
        tmp = list(tmp.data.get())
        # select Q'(s', a*) where a* = argmax_a Q(s', a)
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])

        #max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)
        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_
        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time,
                        state, action, reward, state_dash,
                        episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):

        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        print 'Q_func called'
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))  # left branch: state-value stream
        h5 = F.relu(self.model.l5(h3))  # right branch: advantage stream
        h6 = self.model.l6(h4)  # V(s)
        h7 = self.model.l7(h5)  # A(s, a)
        Q = self.model.q_value(h6, h7)  # Q(s, a)
        return Q

    def Q_func_target(self, state):
        print 'Q_func_target called'
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs in [0.0 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # left branch: state-value stream
        h5 = F.relu(self.model_target.l5(h3))  # right branch: advantage stream
        h6 = self.model_target.l6(h4)  # V(s)
        h7 = self.model_target.l7(h5)  # A(s, a)
        Q = self.model_target.q_value(h6, h7)  # Q(s, a)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
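
The q_value layer above wraps DN_out.DN_out, which is not included in this snippet. Assuming it performs the standard dueling-network aggregation, Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), a minimal NumPy stand-in (an illustration, not the actual DN_out implementation) looks like this:

import numpy as np

def dueling_aggregate(v, a):
    # v: (batch, 1) state values; a: (batch, num_actions) advantages.
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    return v + a - a.mean(axis=1, keepdims=True)

v = np.array([[1.0]], dtype=np.float32)
a = np.array([[0.5, -0.5, 1.0, -1.0]], dtype=np.float32)
print(dueling_aggregate(v, a))  # -> [[1.5  0.5  2.  0.]]
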
Example #2
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor for rewards

    initial_exploration = 10**3  # Initial exploration. original: 5x10^4

    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 4  # original: 4
    save_model_freq = 10**4  # How often to save the model

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")
        print("Input Dim of Q-Network : ",self.dim*self.hist_size)

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim*self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            l5=F.Linear(hidden_dim,hidden_dim,wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                    initialW=np.zeros((self.num_of_actions, hidden_dim),
                    dtype=np.float32))
        )

        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim),
                    dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim),
                    dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time,state, action, reward,
                        state_dash,episode_end_flag):
        data_index = time % self.data_size  # indexing by time modulo data_size makes the buffer a circular queue
        if episode_end_flag is True:  # when the episode ends, state_dash is left as all zeros
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            if time < self.data_size: #during the first sweep of the History
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = F.relu(self.model.l5(h4))
        q = self.model.q_value(h5)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = F.relu(self.model_target.l5(h4))
        q = self.model_target.q_value(h5)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data
        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random")
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy")
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def save_model(self,folder,time):
        try:
            model_path = "./%s/%dmodel"%(folder,time)
            serializers.save_npz(model_path,self.model)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model is saved!!(Model_Path=%s)"%(model_path)
        print "----------------------------------------------"


    def load_model(self,folder,model_num):
        try:
            model_path = "./%s/%dmodel"%(folder,model_num)
            serializers.load_npz(model_path,self.model)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

        print "model load is done!!(Model_Path=%s)"%(model_path)
        print "----------------------------------------------"
        self.target_model_update()
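
The two-line TD-error clipping used in the forward() methods above is an element-wise clip of the TD error to [-1, 1]; the boolean masks are computed from td.data so the clipped expression stays inside the Chainer graph, and td_tmp only guards against division by zero. A small NumPy check of that identity on plain arrays (not Chainer Variables):

import numpy as np

td = np.array([-3.0, -0.5, 0.0, 0.7, 2.5], dtype=np.float32)
td_tmp = td + 1000.0 * (np.abs(td) <= 1)                      # avoid division by zero
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)
assert np.allclose(td_clip, np.clip(td, -1.0, 1.0))           # identical to a plain clip
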
Example #3
class DQN_class:
	gamma = 0.99
	#initial_exploration = 10**2
	initial_exploration = 10
	replay_size = 32  # Replay (batch) size
	target_model_update_freq = 10**4  # Target update frequency. original: 10^4
	#data_size = 10**6
	data_size = 10**5

	def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8]):
		#	"""	[ 0, 0],
		#		[ 0, 1],
		#		[ 0,-1],
		#		[ 1, 0],
		#		[ 1, 1],
		#		[ 1,-1],
		#		[-1, 0],
		#		[-1, 1],
		#		[-1,-1]]):"""
		self.num_of_actions = len(enable_controller)
		self.enable_controller = enable_controller

		print "Initializing DQN..."
		print "CUDA init"
		#cuda.init()

		print "Model Building"
		self.model = FunctionSet(
			l1 = F.Linear(INPUT_SIZE, 5000),	# input map[100, 100] + v[2] + w[1] + wp[2]
			#l1 = F.Linear(INPUT_SIZE, 100),	# input map[100, 100] + v[2] + w[1] + wp[2]
			l2 = F.Linear(5000, 1000),
			l3 = F.Linear(1000, 500),
			l4 = F.Linear(500, 100),
			l5 = F.Linear(100, self.num_of_actions,
			#l2 = F.Linear(100, self.num_of_actions,
						initialW=np.zeros((self.num_of_actions, 100), dtype=np.float32))
		).to_gpu()

		self.model_target = copy.deepcopy(self.model)
		
		print "Initizlizing Optimizer"
		self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)  ### Important: RMSProp
		self.optimizer.setup(self.model.collect_parameters())

		# History Data :  D=[s, a, r, s_dash, end_episode_flag]
		self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
				  np.zeros(self.data_size, dtype=np.uint8),
				  np.zeros((self.data_size, 1), dtype=np.float32),
				  np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
				  np.zeros((self.data_size, 1), dtype=np.bool)]
		#self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
		#		  np.zeros(self.data_size, dtype=np.uint8),
		#		  np.zeros((self.data_size, 1), dtype=np.int8),
		#		  np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
		#		  np.zeros((self.data_size, 1), dtype=np.bool)]

	def forward(self, state, action, Reward, state_dash, episode_end):
		num_of_batch = state.shape[0]
		s = Variable(state)
		s_dash = Variable(state_dash)

		Q = self.Q_func(s)  # Get Q-value

		# Generate Target Signals
		tmp = self.Q_func_target(s_dash)  # Q(s',*)
		tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
		max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
		target = np.asanyarray(Q.data.get(), dtype=np.float32)

		for i in xrange(num_of_batch):
			if not episode_end[i][0]:
				tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
			else:
				tmp_ = np.sign(Reward[i])

			#action_index = self.action_to_index(action[i])
			#target[i, action_index] = tmp_
			target[i, action[i]] = tmp_

		# TD-error clipping
		td = Variable(cuda.to_gpu(target)) - Q  # TD error
		#print "td-error"
		#print "np.max(td.data) : ",
		#print np.max(td.data.get())
		# Note: the next two lines clip the TD error element-wise to [-1, 1]; when |td| <= 1, td_clip is just td
		td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
		td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)
		#print "td_clip.data :",
		#print td_clip.data

		zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))).astype(np.float32))
		#zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))))
		loss = F.mean_squared_error(td_clip, zero_val)
		return loss, Q

	# Store experience data
	def stockExperience(self, time,
						state, action, reward, state_dash,
						episode_end_flag):
		data_index = time % self.data_size

		if episode_end_flag is True:
			self.D[0][data_index] = state
			self.D[1][data_index] = action
			self.D[2][data_index] = reward
		else:
			self.D[0][data_index] = state
			self.D[1][data_index] = action
			self.D[2][data_index] = reward
			self.D[3][data_index] = state_dash
		self.D[4][data_index] = episode_end_flag

	# Mini-batch learning
	def experienceReplay(self, time):
		if self.initial_exploration < time:
			# Pick up replay_size number of samples from the Data
			if time < self.data_size:  # during the first sweep of the History Data
				replay_index = np.random.randint(0, time, (self.replay_size, 1))
			else:
				replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

			#s_replay = np.ndarray(shape=(self.replay_size, 100, 100), dtype=np.float32)
			s_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
			a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
			r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
			s_dash_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
			episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
			for i in xrange(self.replay_size):
				s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
				a_replay[i] = self.D[1][replay_index[i]]
				r_replay[i] = self.D[2][replay_index[i]]
				s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
				episode_end_replay[i] = self.D[4][replay_index[i]]
				if i == 0:
					print "s", s_replay[0]
					print "a", a_replay[0]
					print "s\'", s_dash_replay[0]
					print "r", r_replay[0]

			s_replay = cuda.to_gpu(s_replay)
			s_dash_replay = cuda.to_gpu(s_dash_replay)

			# Gradient-based update
			self.optimizer.zero_grads()
			loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
			loss.backward()  ### backpropagation
			self.optimizer.update()  ### learning step: update the network parameters

	def Q_func(self, state):
		h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0 1.0]
		#h1 = F.relu(self.model.l1(state))  # scale inputs into [0.0 1.0]
		h2 = F.relu(self.model.l2(h1))
		h3 = F.relu(self.model.l3(h2))
		h4 = F.relu(self.model.l4(h3))
		Q = self.model.l5(h4)
		#Q = self.model.l2(h1)
		return Q

	def Q_func_target(self, state):
		h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0 1.0]
		#h1 = F.relu(self.model_target.l1(state))  # scale inputs into [0.0 1.0]
		h2 = F.relu(self.model_target.l2(h1))
		h3 = F.relu(self.model_target.l3(h2))
		h4 = F.relu(self.model_target.l4(h3))
		Q = self.model_target.l5(h4)
		#Q = self.model.l2(h1)
		return Q

	def e_greedy(self, state, epsilon):
		s = Variable(state)
		Q = self.Q_func(s)
		Q = Q.data

		if np.random.rand() < epsilon:
			#index_action = np.random.randint(0, self.num_of_actions)
			action = np.random.randint(0, self.num_of_actions)
			print "RANDOM"
		else:
			#index_action = np.argmax(Q.get())
			action = np.argmax(Q.get())
			print "GREEDY"
		#return self.index_to_action(index_action), Q
		return action, Q

	def action_to_vec(self, action, vec):
		#	"""	[ 0, 0],
		#		[ 0, 1],
		#		[ 0,-1],
		#		[ 1, 0],
		#		[ 1, 1],
		#		[ 1,-1],
		#		[-1, 0],
		#		[-1, 1],
		#		[-1,-1]]):"""
		#vec = Twist()
		if action == 3 or action == 4 or action == 5:
			#vec.linear.x += 0.1
			vec.linear.x = 0.3
		elif action == 6 or action == 7 or action == 8:
			#vec.linear.x -= 0.1
			vec.linear.x = -0.3
		else:
			vec.linear.x = 0.0

		if action == 1 or action == 4 or action == 7:
			#vec.angular.z += 0.1
			vec.angular.z = 0.3
		elif action == 2 or action == 5 or action == 8:
			#vec.angular.z -= 0.1
			vec.angular.z = -0.3
		else:
			vec.angular.z = 0.0

		if vec.linear.x > 1:
			vec.linear.x = 1
		elif vec.linear.x < -1:
			vec.linear.x = -1

		if vec.angular.z > 1:
			vec.angular.z = 1
		elif vec.angular.z < -1:
			vec.angular.z = -1

		return vec
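
action_to_vec encodes the nine-action table from the comment block as a chain of if/elif branches. An equivalent, dependency-free sketch that returns plain (linear.x, angular.z) tuples instead of filling a ROS Twist message:

def action_to_velocity(action):
    # Same mapping as DQN_class.action_to_vec, written as lookup tables.
    linear_x = {3: 0.3, 4: 0.3, 5: 0.3, 6: -0.3, 7: -0.3, 8: -0.3}.get(action, 0.0)
    angular_z = {1: 0.3, 4: 0.3, 7: 0.3, 2: -0.3, 5: -0.3, 8: -0.3}.get(action, 0.0)
    return linear_x, angular_z

print(action_to_velocity(4))  # -> (0.3, 0.3): drive forward while turning
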
Example #4
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  #original: 4
    time_M = 11

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim1 = 64
        #hidden_dim1 = 32
        hidden_dim2 = 128
        hidden_dim3 = 10
        hidden_cont = 100

        self.model = FunctionSet(
            l4=linearL4_link.LinearL4_link(self.dim * self.hist_size *
                                           self.time_M,
                                           hidden_cont,
                                           wscale=np.sqrt(2)),
            l5=MU_l6.memory_unit_link(self.dim * self.hist_size * self.time_M,
                                      hidden_dim3 * hidden_cont,
                                      wscale=np.sqrt(2)),
            l6=MU_l6.memory_unit_link(self.dim * self.hist_size * self.time_M,
                                      hidden_dim3 * hidden_cont,
                                      wscale=np.sqrt(2)),
            l7=attention.Attention(hidden_cont, hidden_dim3 * hidden_cont,
                                   hidden_dim3),
            l8=retrieval.Retrieval(hidden_dim3, hidden_dim3 * hidden_cont,
                                   hidden_cont),
            l9=F.Bilinear(hidden_cont, hidden_cont, hidden_dim2),
            q_value=F.Linear(hidden_dim2,
                             self.num_of_actions,
                             initialW=np.zeros(
                                 (self.num_of_actions, hidden_dim2),
                                 dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s(now & 10history), a, r, s_dash, end_episode_flag]
        # modified to MQN
        self.d = [
            np.zeros((self.data_size, self.hist_size * self.time_M, self.dim),
                     dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions),
                            dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash,
                         episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))
            #modify s_replay for MQN
            s_replay = np.ndarray(shape=(self.replay_size,
                                         self.hist_size * self.time_M,
                                         self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                              self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            # expand s_dash_replay to time_M (11) frames for the MQN model
            #print 'now state 1'
            s_dash_tmp = s_dash_replay.reshape(len(s_dash_replay),
                                               -1).astype(dtype=np.float32)
            #print 'now state 2'
            s_dash_M = np.ndarray(shape=(self.replay_size,
                                         self.hist_size * self.time_M,
                                         self.dim),
                                  dtype=np.float32)
            #print 'now state 3'
            s_dash_M[:, 0] = s_dash_tmp
            #print 'now state 4'
            for i in range(self.time_M - 1):
                s_dash_M[:, i + 1] = s_replay[:, i]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_M)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = self.model.l5(state / 255.0)
        h6 = self.model.l6(state / 255.0)
        h7 = F.softmax(self.model.l7(h4, h5))
        h8 = self.model.l8(h7, h6)
        h9 = F.relu(self.model.l9(h4, h8))
        q = self.model.q_value(h9)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = self.model_target.l5(state / 255.0)
        h6 = self.model_target.l6(state / 255.0)
        h7 = F.softmax(self.model_target.l7(h4, h5))
        h8 = self.model_target.l8(h7, h6)
        h9 = F.relu(self.model_target.l9(h4, h8))
        q = self.model_target.q_value(h9)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
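
attention.Attention and retrieval.Retrieval are external modules that are not shown here. Assuming they implement the usual key-value memory read (softmax attention over memory keys followed by a weighted sum over memory values, matching the h4/h5/h6/h7/h8 wiring in q_func), a single-sample NumPy sketch of that step, with the shapes used above (10 memory slots of width 100) taken as assumptions:

import numpy as np

def memory_read(context, keys, values):
    # context: (d,) query vector (h4); keys: (m, d) (h5); values: (m, d_v) (h6).
    scores = keys.dot(context)          # one score per memory slot
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()            # softmax attention weights (h7)
    return weights.dot(values)          # retrieved memory content (h8)

keys = np.random.randn(10, 100).astype(np.float32)
values = np.random.randn(10, 100).astype(np.float32)
context = np.random.randn(100).astype(np.float32)
print(memory_read(context, keys, values).shape)  # -> (100,)
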
Example #5
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  #original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        #hidden_dim = 256
        hidden_dim128 = 128

        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size,
                        hidden_dim128,
                        wscale=np.sqrt(2)),
            l5=F.Linear(self.dim * self.hist_size,
                        hidden_dim128,
                        wscale=np.sqrt(2)),
            l6=F.Linear(hidden_dim128,
                        1,
                        wscale=np.sqrt(2),
                        initialW=np.zeros((1, hidden_dim128),
                                          dtype=np.float32)),  # V(s)
            l7=F.Linear(hidden_dim128,
                        self.num_of_actions,
                        wscale=np.sqrt(2),
                        initialW=np.zeros((self.num_of_actions, hidden_dim128),
                                          dtype=np.float32)),  # A(s, a)
            q_value=DN_out.DN_out(1,
                                  self.num_of_actions,
                                  self.num_of_actions,
                                  nobias=True))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025,
                                                  alpha=0.95,
                                                  momentum=0.95,
                                                  eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data :  D=[s, a, r, s_dash, end_episode_flag]
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim),
                     dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) >
                                                                 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions),
                            dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash,
                         episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time,
                                                 (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size,
                                                 (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                         self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1),
                                  dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size,
                                              self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1),
                                            dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]],
                                         dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]],
                                            dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay,
                                   episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = F.relu(self.model.l5(state / 255.0))
        #h6 = F.relu(self.model.l6(h4))
        #h7 = relu_l7.relu(self.model.l7(h5))
        h6 = self.model.l6(h4)
        h7 = self.model.l7(h5)
        q = self.model.q_value(h6, h7)
        return q

    def q_func_target(self, state):
        #h4 = F.relu(self.model_target.l4(state / 255.0))
        #q = self.model_target.q_value(h4)
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = F.relu(self.model_target.l5(state / 255.0))
        #h6 = F.relu(self.model_target.l6(h4))
        #h7 = relu_l7.relu(self.model_target.l7(h5))
        h6 = self.model_target.l6(h4)
        h7 = self.model_target.l7(h5)
        q = self.model_target.q_value(h6, h7)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
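
All of the stockExperience / stock_experience methods above write into fixed-size arrays at index time % data_size, so the replay memory is a ring buffer that silently overwrites the oldest samples once it is full. A tiny standalone illustration (the toy capacity of 5 is arbitrary; the classes above use 10**5):

import numpy as np

data_size = 5
buf = np.zeros(data_size, dtype=np.int32)
for time in range(12):        # store 12 experiences, labelled by their timestep
    buf[time % data_size] = time
print(buf)                    # -> [10 11  7  8  9]: only the five newest remain
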
Example #6
class ConvQAgent(Agent):
	def __init__(self, frames_per_action=4):
		super(ConvQAgent, self).__init__()
		cuda.init()
		self.epsilon = 1.0
		self.gamma = 0.99
		self.iterations = 0
		
		self.model = FunctionSet(
			l1 = F.Convolution2D(frames_per_action, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
			l2 = F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
			l3 = F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
			l4 = F.Linear(64 * 7 * 7, 512),
			l5 = F.Linear(512, 2)
		).to_gpu()

		self.optimizer = optimizers.RMSprop(lr=1e-5)
		self.optimizer.setup(self.model)
		self.update_target()

		self.num_frames = 0
		self.frames_per_action = frames_per_action
		self.prev_reward = 0.0

		self.history = ConvHistory((frames_per_action, 84, 84))

	def update_target(self):
		self.target_model = copy.deepcopy(self.model)
		self.target_model = self.target_model.to_gpu()

	def act(self, state):
		self.update_state_vector(state)

		if self.num_frames < self.frames_per_action - 1 or self.num_frames % self.frames_per_action != 0:
			return None

		if random.random() < 0.001:
			print 'Epsilon: {}'.format(self.epsilon)

		if self.epsilon > 0.05:
			self.epsilon -= (0.95 / 300000)

		if random.random() < self.epsilon:
			return random.random() > 0.375

		q = self.get_q(Variable(cuda.to_gpu(self.curr_state[np.newaxis, :, :, :])))

		if random.random() < 0.01:
			if q.data[0,1] > q.data[0,0]:
				print 'On: {}'.format(q.data)
			else:
				print 'Off: {}'.format(q.data)

		return q.data[0,1] > q.data[0,0]

	def update_state_vector(self, state):
		if self.num_frames < self.frames_per_action:
			if self.num_frames == 0:
				self.curr_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
			self.curr_state[self.num_frames, :, :] = state
		else:
			if self.num_frames == self.frames_per_action:
				self.prev_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
			self.prev_state[1:, :, :] = self.prev_state[:-1, :, :]
			self.prev_state[0, :, :] = self.curr_state[-1, :, :]

			self.curr_state[1:, :, :] = self.curr_state[:-1, :, :]
			self.curr_state[0, :, :] = state

		self.num_frames += 1

	def accept_reward(self, state, action, reward, new_state, is_terminal):
		self.prev_reward += reward

		if not (is_terminal or self.num_frames % self.frames_per_action == 0):
			return

		if self.num_frames == self.frames_per_action:
			self.prev_reward = 0.0
			self.prev_action = action
			return

		self.history.add((self.prev_state, self.prev_action, self.prev_reward,
			self.curr_state, is_terminal))
		self.prev_reward = 0.0
		self.prev_action = action

		self.iterations += 1
		if self.iterations % 10000 == 0:
			print '*** UPDATING TARGET NETWORK ***'
			self.update_target()
		
		state, action, reward, new_state, is_terminal = self.history.get(num=32)

		state = cuda.to_gpu(state)
		action = cuda.to_gpu(action)
		new_state = cuda.to_gpu(new_state)
		reward = cuda.to_gpu(reward)

		loss, q = self.forward(state, action, reward, new_state, is_terminal)
		self.optimizer.zero_grads()
		loss.backward()
		self.optimizer.update()

	def forward(self, state, action, reward, new_state, is_terminal):
		q = self.get_q(Variable(state))
		q_target = self.get_target_q(Variable(new_state))

		max_target_q = cp.max(q_target.data, axis=1)

		target = cp.copy(q.data)

		for i in xrange(target.shape[0]):
			curr_action = int(action[i, 0])
			if is_terminal[i]:
				target[i, curr_action] = reward[i]
			else:
				target[i, curr_action] = reward[i] + self.gamma * max_target_q[i]
		
		loss = F.mean_squared_error(Variable(target), q)
		return loss, 0.0 #cp.mean(q.data[:, action[i]])

	def get_q(self, state):
		h1 = F.relu(self.model.l1(state))
		h2 = F.relu(self.model.l2(h1))
		h3 = F.relu(self.model.l3(h2))
		h4 = self.model.l4(h3)
		return self.model.l5(h4)

	def get_target_q(self, state):
		h1 = F.relu(self.target_model.l1(state))
		h2 = F.relu(self.target_model.l2(h1))
		h3 = F.relu(self.target_model.l3(h2))
		h4 = self.target_model.l4(h3)
		return self.target_model.l5(h4)

	def save(self, file_name):
		with open(file_name, 'wb') as out_file:
			pickle.dump((self.model, self.optimizer), out_file)

	def load(self, file_name):
		self.epsilon = 0.0

		with open(file_name, 'rb') as in_file:
			model, optimizer = pickle.load(in_file)
			self.model.copy_parameters_from(model.parameters)
			self.optimizer = optimizer

	def start_new_game(self):
		self.num_frames = 0
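
ConvQAgent.act anneals epsilon linearly from 1.0 toward a 0.05 floor, subtracting 0.95 / 300000 on each decision. A quick arithmetic check of how long that schedule runs (assuming exactly one decrement per decision step):

epsilon_start, epsilon_floor, decrement = 1.0, 0.05, 0.95 / 300000
steps_to_floor = (epsilon_start - epsilon_floor) / decrement
print(steps_to_floor)  # ~300000 decisions before epsilon stops decreasing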