class DN_class:
    # Assumes module-level imports: numpy as np, copy, chainer (FunctionSet, Variable,
    # cuda, optimizers, serializers) and chainer.functions as F, plus the project-specific
    # DN_out layer and the launcher's `args` namespace.

    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 100          # was 10**4; Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**5                  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
        # Initialization of Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 256), dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias=True)
        ).to_gpu()

        if args.resumemodel:
            # load saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from resume.model"

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:
            # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'], npz_tmp1['D1'], npz_tmp1['D2'],
                      npz_tmp2['D3'], npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded all stored D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmax_a Q(s',a)
        tmp = self.Q_func_target(s_dash)              # Q'(s',*)
        tmp = list(tmp.data.get())
        # select Q'(s', argmax_a Q(s',a))  (Double DQN target)
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])

        #max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        print 'now Q_func is implemented'
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))  # left stream, feeds the state-value V(s)
        h5 = F.relu(self.model.l5(h3))  # right stream, feeds the advantage A(s,a)
        h6 = self.model.l6(h4)          # state value
        h7 = self.model.l7(h5)          # advantage values
        Q = self.model.q_value(h6, h7)  # Q value
        return Q

    def Q_func_target(self, state):
        print 'now Q_func_target is implemented'
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # left stream, feeds the state-value V(s)
        h5 = F.relu(self.model_target.l5(h3))  # right stream, feeds the advantage A(s,a)
        h6 = self.model_target.l6(h4)          # state value
        h7 = self.model_target.l7(h5)          # advantage values
        Q = self.model_target.q_value(h6, h7)  # Q value
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
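# DN_out is a project-specific output layer whose source is not included here; judging by
# its inputs (a 1-unit state-value stream and a num_of_actions advantage stream), it
# presumably performs the dueling aggregation of Wang et al. (2016),
# Q(s,a) = V(s) + A(s,a) - mean_a A(s,a). A minimal NumPy sketch of that aggregation,
# under that assumption (function and variable names here are illustrative only):
import numpy as np

def dueling_q(v, a):
    """Combine state values v (batch, 1) and advantages a (batch, n_actions) into
    Q-values, subtracting the mean advantage for identifiability."""
    return v + a - a.mean(axis=1, keepdims=True)

# Example: batch of 2 states, 4 actions.
v = np.array([[1.0], [0.5]], dtype=np.float32)
a = np.array([[0.2, -0.2, 0.0, 0.4],
              [0.1,  0.3, -0.4, 0.0]], dtype=np.float32)
q = dueling_q(v, a)          # shape (2, 4); each row's mean equals the row of v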
class QNet:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 10**3        # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**5                  # Data size of history. original: 10^6
    hist_size = 4                      # original: 4
    save_model_freq = 10**4            # How often to save the model

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")
        print("Input Dim of Q-Network : ", self.dim * self.hist_size)

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            l5=F.Linear(hidden_dim, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))        # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size  # time acts as a circular index, giving a FIFO queue

        if episode_end_flag is True:
            # When the episode ends, state_dash is left as all zeros
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = F.relu(self.model.l5(h4))
        q = self.model.q_value(h5)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = F.relu(self.model_target.l5(h4))
        q = self.model_target.q_value(h5)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random")
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy")
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def save_model(self, folder, time):
        try:
            model_path = "./%s/%dmodel" % (folder, time)
            serializers.save_npz(model_path, self.model)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model is saved!!(Model_Path=%s)" % (model_path)
        print "----------------------------------------------"

    def load_model(self, folder, model_num):
        try:
            model_path = "./%s/%dmodel" % (folder, model_num)
            serializers.load_npz(model_path, self.model)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model load is done!!(Model_Path=%s)" % (model_path)
        print "----------------------------------------------"
        self.target_model_update()
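# A minimal sketch of the control loop this class is written to be driven by. The real
# launcher script is not part of this file, so the toy environment below (random
# transitions, the epsilon schedule, and the names run_toy_loop / steps) is purely an
# illustrative assumption about how the public methods fit together.
import numpy as np

def run_toy_loop(dim=8, steps=2000):
    q_net = QNet(use_gpu=-1, enable_controller=[0, 1, 2], dim=dim)  # CPU mode
    epsilon = 1.0
    state = np.zeros((q_net.hist_size, dim), dtype=np.float32)
    for time in xrange(steps):
        # choose an action from the current stacked observation
        action, _ = q_net.e_greedy(state[np.newaxis].astype(np.float32), epsilon)
        # toy transition: random next observation, random reward, episode never ends
        state_dash = np.random.randint(0, 256, (q_net.hist_size, dim)).astype(np.float32)
        reward = np.random.choice([-1, 0, 1])
        q_net.stock_experience(time, state, action, reward, state_dash, False)
        q_net.experience_replay(time)                 # trains once initial_exploration is passed
        if time % q_net.target_model_update_freq == 0:
            q_net.target_model_update()
        epsilon = max(0.1, epsilon - 1.0 / 10**4)     # example linear annealing
        state = state_dash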
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99
    #initial_exploration = 10**2
    initial_exploration = 10
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    #data_size = 10**6
    data_size = 10**5

    def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8]):
        # Action table (linear, angular):
        #   [ 0, 0], [ 0, 1], [ 0,-1],
        #   [ 1, 0], [ 1, 1], [ 1,-1],
        #   [-1, 0], [-1, 1], [-1,-1]
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller

        print "Initializing DQN..."
        print "CUDA init"
        #cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Linear(INPUT_SIZE, 5000),  # input map[100, 100] + v[2] + w[1] + wp[2]
            #l1=F.Linear(INPUT_SIZE, 100),  # input map[100, 100] + v[2] + w[1] + wp[2]
            l2=F.Linear(5000, 1000),
            l3=F.Linear(1000, 500),
            l4=F.Linear(500, 100),
            l5=F.Linear(100, self.num_of_actions,
            #l2=F.Linear(100, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 100), dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        # Important: RMSpropGraves, as in the DQN paper
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]
        #self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
        #          np.zeros(self.data_size, dtype=np.uint8),
        #          np.zeros((self.data_size, 1), dtype=np.int8),
        #          np.zeros((self.data_size, INPUT_SIZE), dtype=np.uint8),
        #          np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)         # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])

            #action_index = self.action_to_index(action[i])
            #target[i, action_index] = tmp_
            target[i, action[i]] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        #print "td-error"
        #print "np.max(td.data) : ",
        #print np.max(td.data.get())
        # Purpose unclear: where |td| <= 1 this leaves td == td_clip
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)
        #print "td_clip.data :",
        #print td_clip.data

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))).astype(np.float32))
        #zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions))))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    # Store experience data
    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    # Mini-batch learning
    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            #s_replay = np.ndarray(shape=(self.replay_size, 100, 100), dtype=np.float32)
            s_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]
                if i == 0:
                    print "s", s_replay[0]
                    print "a", a_replay[0]
                    print "s\'", s_dash_replay[0]
                    print "r", r_replay[0]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()          # backpropagation
            self.optimizer.update()  # learning step: update the network

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        #h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        Q = self.model.l5(h4)
        #Q = self.model.l2(h1)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 254.0))  # scale inputs into [0.0, 1.0]
        #h1 = F.relu(self.model_target.l1(state))
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))
        Q = self.model_target.l5(h4)  # output layer of the target network
        #Q = self.model.l2(h1)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            #index_action = np.random.randint(0, self.num_of_actions)
            action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            #index_action = np.argmax(Q.get())
            action = np.argmax(Q.get())
            print "GREEDY"
        #return self.index_to_action(index_action), Q
        return action, Q

    def action_to_vec(self, action, vec):
        # Action table (linear, angular):
        #   [ 0, 0], [ 0, 1], [ 0,-1],
        #   [ 1, 0], [ 1, 1], [ 1,-1],
        #   [-1, 0], [-1, 1], [-1,-1]
        #vec = Twist()
        if action == 3 or action == 4 or action == 5:
            #vec.linear.x += 0.1
            vec.linear.x = 0.3
        elif action == 6 or action == 7 or action == 8:
            #vec.linear.x -= 0.1
            vec.linear.x = -0.3
        else:
            vec.linear.x = 0.0

        if action == 1 or action == 4 or action == 7:
            #vec.angular.z += 0.1
            vec.angular.z = 0.3
        elif action == 2 or action == 5 or action == 8:
            #vec.angular.z -= 0.1
            vec.angular.z = -0.3
        else:
            vec.angular.z = 0.0

        if vec.linear.x > 1:
            vec.linear.x = 1
        elif vec.linear.x < -1:
            vec.linear.x = -1
        if vec.angular.z > 1:
            vec.angular.z = 1
        elif vec.angular.z < -1:
            vec.angular.z = -1
        return vec
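# For reference, the discretization implemented by action_to_vec above is equivalent to
# indexing the action by a // 3 (forward / stop / backward) and a % 3 (straight / left /
# right). A quick standalone check of the resulting (linear.x, angular.z) table, with no
# ROS message object needed (variable names here are illustrative):
for a in range(9):
    lin = [0.0, 0.3, -0.3][a // 3]
    ang = [0.0, 0.3, -0.3][a % 3]
    print a, lin, ang
# 0:(0.0, 0.0)   1:(0.0, 0.3)   2:(0.0, -0.3)
# 3:(0.3, 0.0)   4:(0.3, 0.3)   5:(0.3, -0.3)
# 6:(-0.3, 0.0)  7:(-0.3, 0.3)  8:(-0.3, -0.3)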
class QNet:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 10**3        # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**5                  # Data size of history. original: 10^6
    hist_size = 1                      # original: 4
    time_M = 11                        # number of stacked time steps (current + 10 past)

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim1 = 64
        #hidden_dim1 = 32
        hidden_dim2 = 128
        hidden_dim3 = 10
        hidden_cont = 100
        self.model = FunctionSet(
            l4=linearL4_link.LinearL4_link(self.dim * self.hist_size * self.time_M, hidden_cont,
                                           wscale=np.sqrt(2)),
            l5=MU_l6.memory_unit_link(self.dim * self.hist_size * self.time_M,
                                      hidden_dim3 * hidden_cont, wscale=np.sqrt(2)),
            l6=MU_l6.memory_unit_link(self.dim * self.hist_size * self.time_M,
                                      hidden_dim3 * hidden_cont, wscale=np.sqrt(2)),
            l7=attention.Attention(hidden_cont, hidden_dim3 * hidden_cont, hidden_dim3),
            l8=retrieval.Retrieval(hidden_dim3, hidden_dim3 * hidden_cont, hidden_cont),
            l9=F.Bilinear(hidden_cont, hidden_cont, hidden_dim2),
            q_value=F.Linear(hidden_dim2, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim2),
                                               dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s (now & 10-step history), a, r, s_dash, end_episode_flag]
        # modified for MQN
        self.d = [
            np.zeros((self.data_size, self.hist_size * self.time_M, self.dim), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))        # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            # modify s_replay for MQN
            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size * self.time_M, self.dim),
                                  dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim),
                                       dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            # modify s_dash_replay into a time_M-step stack for the MQN model
            #print 'now state 1'
            s_dash_tmp = s_dash_replay.reshape(len(s_dash_replay), -1).astype(dtype=np.float32)
            #print 'now state 2'
            s_dash_M = np.ndarray(shape=(self.replay_size, self.hist_size * self.time_M, self.dim),
                                  dtype=np.float32)
            #print 'now state 3'
            s_dash_M[:, 0] = s_dash_tmp
            #print 'now state 4'
            for i in range(self.time_M - 1):
                s_dash_M[:, i + 1] = s_replay[:, i]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_M)
            else:
                s_dash_replay = s_dash_M  # keep the stacked shape on the CPU path as well

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = self.model.l5(state / 255.0)
        h6 = self.model.l6(state / 255.0)
        h7 = F.softmax(self.model.l7(h4, h5))
        h8 = self.model.l8(h7, h6)
        h9 = F.relu(self.model.l9(h4, h8))
        q = self.model.q_value(h9)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = self.model_target.l5(state / 255.0)
        h6 = self.model_target.l6(state / 255.0)
        h7 = F.softmax(self.model_target.l7(h4, h5))
        h8 = self.model_target.l8(h7, h6)
        h9 = F.relu(self.model_target.l9(h4, h8))
        q = self.model_target.q_value(h9)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
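# Shape sanity check for the MQN next-state stacking in experience_replay above: slot 0
# receives the new observation, slots 1..time_M-1 receive the current stack shifted back
# by one step (the oldest frame is dropped). A small self-contained NumPy sketch with
# illustrative sizes (hist_size == 1, as in this class):
import numpy as np

replay_size, time_M, dim = 2, 11, 3
s_replay = np.arange(replay_size * time_M * dim, dtype=np.float32).reshape(replay_size, time_M, dim)
s_dash = np.full((replay_size, 1, dim), -1.0, dtype=np.float32)   # the new observation

s_dash_M = np.empty_like(s_replay)
s_dash_M[:, 0] = s_dash.reshape(replay_size, -1)
for i in range(time_M - 1):
    s_dash_M[:, i + 1] = s_replay[:, i]

assert s_dash_M.shape == (replay_size, time_M, dim)
assert (s_dash_M[:, 0] == -1.0).all()
assert (s_dash_M[:, 1:] == s_replay[:, :-1]).all()   # shifted copy of the old stack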
class QNet:
    # Hyper-Parameters
    gamma = 0.99                       # Discount factor
    initial_exploration = 10**3        # Initial exploration. original: 5x10^4
    replay_size = 32                   # Replay (batch) size
    target_model_update_freq = 10**4   # Target update frequency. original: 10^4
    data_size = 10**5                  # Data size of history. original: 10^6
    hist_size = 1                      # original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        #hidden_dim = 256
        hidden_dim128 = 128
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim128, wscale=np.sqrt(2)),
            l5=F.Linear(self.dim * self.hist_size, hidden_dim128, wscale=np.sqrt(2)),
            l6=F.Linear(hidden_dim128, 1, wscale=np.sqrt(2),
                        initialW=np.zeros((1, hidden_dim128), dtype=np.float32)),  # V(s)
            l7=F.Linear(hidden_dim128, self.num_of_actions, wscale=np.sqrt(2),
                        initialW=np.zeros((self.num_of_actions, hidden_dim128),
                                          dtype=np.float32)),  # A(s,a)
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias=True))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))        # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]

            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size

        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        h5 = F.relu(self.model.l5(state / 255.0))
        #h6 = F.relu(self.model.l6(h4))
        #h7 = relu_l7.relu(self.model.l7(h5))
        h6 = self.model.l6(h4)
        h7 = self.model.l7(h5)
        q = self.model.q_value(h6, h7)
        return q

    def q_func_target(self, state):
        #h4 = F.relu(self.model_target.l4(state / 255.0))
        #q = self.model_target.q_value(h4)
        h4 = F.relu(self.model_target.l4(state / 255.0))
        h5 = F.relu(self.model_target.l5(state / 255.0))
        #h6 = F.relu(self.model_target.l6(h4))
        #h7 = relu_l7.relu(self.model_target.l7(h5))
        h6 = self.model_target.l6(h4)
        h7 = self.model_target.l7(h5)
        q = self.model_target.q_value(h6, h7)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
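# Numeric check of the TD-error clipping expression used in forward() above (the same
# trick appears in the other classes in this collection): element-wise, the value of
# td_clip equals np.clip(td, -1, 1). The "+ 1000.0" term only prevents a division by
# zero for entries with |td| <= 1, which the (abs(td) > 1) mask discards anyway. The
# sample values below are illustrative.
import numpy as np

td = np.array([[-3.2, -0.4, 0.0, 0.7, 2.5]], dtype=np.float32)
td_tmp = td + 1000.0 * (np.abs(td) <= 1)                                   # avoid zero division
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)  # clip to [-1, 1]
assert np.allclose(td_clip, np.clip(td, -1, 1))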
class ConvQAgent(Agent):
    # Assumes module-level imports: numpy as np, cupy as cp, copy, random, pickle,
    # chainer (FunctionSet, Variable, cuda, optimizers) and chainer.functions as F,
    # plus the project-specific Agent base class and ConvHistory replay buffer.

    def __init__(self, frames_per_action=4):
        super(ConvQAgent, self).__init__()
        cuda.init()
        self.epsilon = 1.0
        self.gamma = 0.99
        self.iterations = 0
        self.model = FunctionSet(
            l1=F.Convolution2D(frames_per_action, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(64 * 7 * 7, 512),
            l5=F.Linear(512, 2)
        ).to_gpu()
        self.optimizer = optimizers.RMSprop(lr=1e-5)
        self.optimizer.setup(self.model)
        self.update_target()

        self.num_frames = 0
        self.frames_per_action = frames_per_action
        self.prev_reward = 0.0
        self.history = ConvHistory((frames_per_action, 84, 84))

    def update_target(self):
        self.target_model = copy.deepcopy(self.model)
        self.target_model = self.target_model.to_gpu()

    def act(self, state):
        self.update_state_vector(state)
        if self.num_frames < self.frames_per_action - 1 or self.num_frames % self.frames_per_action != 0:
            return None

        if random.random() < 0.001:
            print 'Epsilon: {}'.format(self.epsilon)
        if self.epsilon > 0.05:
            self.epsilon -= (0.95 / 300000)
        if random.random() < self.epsilon:
            return random.random() > 0.375

        q = self.get_q(Variable(cuda.to_gpu(self.curr_state[np.newaxis, :, :, :])))
        if random.random() < 0.01:
            if q.data[0, 1] > q.data[0, 0]:
                print 'On: {}'.format(q.data)
            else:
                print 'Off: {}'.format(q.data)
        return q.data[0, 1] > q.data[0, 0]

    def update_state_vector(self, state):
        if self.num_frames < self.frames_per_action:
            if self.num_frames == 0:
                self.curr_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
            self.curr_state[self.num_frames, :, :] = state
        else:
            if self.num_frames == self.frames_per_action:
                self.prev_state = np.zeros((self.frames_per_action, 84, 84), dtype=np.float32)
            self.prev_state[1:, :, :] = self.prev_state[:-1, :, :]
            self.prev_state[0, :, :] = self.curr_state[-1, :, :]
            self.curr_state[1:, :, :] = self.curr_state[:-1, :, :]
            self.curr_state[0, :, :] = state
        self.num_frames += 1

    def accept_reward(self, state, action, reward, new_state, is_terminal):
        self.prev_reward += reward
        if not (is_terminal or self.num_frames % self.frames_per_action == 0):
            return
        if self.num_frames == self.frames_per_action:
            self.prev_reward = 0.0
            self.prev_action = action
            return

        self.history.add((self.prev_state, self.prev_action, self.prev_reward, self.curr_state, is_terminal))
        self.prev_reward = 0.0
        self.prev_action = action

        self.iterations += 1
        if self.iterations % 10000 == 0:
            print '*** UPDATING TARGET NETWORK ***'
            self.update_target()

        state, action, reward, new_state, is_terminal = self.history.get(num=32)
        state = cuda.to_gpu(state)
        action = cuda.to_gpu(action)
        new_state = cuda.to_gpu(new_state)
        reward = cuda.to_gpu(reward)

        loss, q = self.forward(state, action, reward, new_state, is_terminal)
        self.optimizer.zero_grads()
        loss.backward()
        self.optimizer.update()

    def forward(self, state, action, reward, new_state, is_terminal):
        q = self.get_q(Variable(state))
        q_target = self.get_target_q(Variable(new_state))
        max_target_q = cp.max(q_target.data, axis=1)

        target = cp.copy(q.data)
        for i in xrange(target.shape[0]):
            curr_action = int(action[i, 0])
            if is_terminal[i]:
                target[i, curr_action] = reward[i]
            else:
                target[i, curr_action] = reward[i] + self.gamma * max_target_q[i]

        loss = F.mean_squared_error(Variable(target), q)
        return loss, 0.0  #cp.mean(q.data[:, action[i]])

    def get_q(self, state):
        h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = self.model.l4(h3)
        return self.model.l5(h4)

    def get_target_q(self, state):
        h1 = F.relu(self.target_model.l1(state))
        h2 = F.relu(self.target_model.l2(h1))
        h3 = F.relu(self.target_model.l3(h2))
        h4 = self.target_model.l4(h3)
        return self.target_model.l5(h4)

    def save(self, file_name):
        with open(file_name, 'wb') as out_file:
            pickle.dump((self.model, self.optimizer), out_file)

    def load(self, file_name):
        self.epsilon = 0.0
        with open(file_name, 'rb') as in_file:
            model, optimizer = pickle.load(in_file)
            self.model.copy_parameters_from(model.parameters)
            self.optimizer = optimizer

    def start_new_game(self):
        self.num_frames = 0
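# The epsilon schedule in act() above decays linearly by 0.95 / 300000 per action
# decision (one decision every frames_per_action frames) from 1.0 down to the 0.05
# floor. A quick standalone check of how long that exploration phase lasts (variable
# names here are illustrative):
decay_per_decision = 0.95 / 300000
decisions_to_floor = int(round((1.0 - 0.05) / decay_per_decision))
print decisions_to_floor       # 300000 action decisions before epsilon reaches 0.05
print decisions_to_floor * 4   # ~1.2M frames at the default frames_per_action=4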