def main():
    experiment = 'InvertedPendulum-v1'  # specify environment here
    env = gym.make(experiment)
    steps = env.spec.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps

    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
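# --- Hedged example --------------------------------------------------------
# All of the training loops in this file rely on an OUNoise class for
# temporally correlated exploration noise. A minimal sketch of such a class is
# shown below with standard Ornstein-Uhlenbeck parameters; the OUNoise actually
# imported by these scripts may use different defaults or a constructor that
# also takes a per-dimension mean and sigma.
import numpy as np

class OUNoiseSketch(object):
    """Ornstein-Uhlenbeck process for action exploration (illustrative only)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Re-center the process; called at the end of every episode above.
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state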
def s2l():
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    num_states = feature_size
    # num_states = env.observation_space.shape[0]
    num_actions = num_controls
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)

    # upper bounds must be >= lower bounds (the original listing had them swapped)
    action_space_high = [0.5, 0.5, 0.5]
    action_space_low = [0.0, 0.0, 0.0]
    print("Action space highest values:", action_space_high)
    print("Action space lowest values:", action_space_low)

    robot = RoboControl()

    agent = DDPG(is_batch_norm, num_states, num_actions, action_space_high, action_space_low)
    exploration_noise = OUNoise(num_actions)
    counter = 0
    total_reward = 0

    print("Number of Rollouts per episode:", num_rollouts)
    print("Number of Steps per roll out:", steps)
    reward_st = np.array([0])       # saving reward
    eval_metric_st = np.array([0])
    reward_st_all = np.array([0])   # saving reward after every step

    activity_obj = Vid_Feature()
    demo_vid_array = demo_array_extractor(demo_folder)
    demo_features = activity_obj.feature_extractor(demo_vid_array)

    frame_obj = Frame_Feature()
    camera_obj = Camera()

    for episode in range(num_episodes):
        print("==== Starting episode no:", episode, "====", "\n")
        robot.reset()  # Reset env in the beginning of each episode
        obs_img = camera_obj.render()  # Get the observation
        # obs_img = np.array(misc.imresize(obs_img, [112, 112, 3]))
        observation = np.array(frame_obj.frame_feature_extractor(obs_img))
        observation = observation.reshape(-1)
        reward_per_episode = 0

        for t in range(num_rollouts):
            reward_per_rollout = 0
            vid_robo_ = []

            for i in range(steps):
                x = observation
                action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
                noise = exploration_noise.noise()
                action = action[0] + noise  # Select action according to current policy and exploration noise
                print('Action at episode-', episode, 'rollout-', t, 'step-', i, " :", action)

                robot.publish_control(action)

                obs_robo = camera_obj.render()  # Get the observation
                # obs_robo = misc.imresize(obs_robo, [112, 112, 3])
                vid_robo_.append(obs_robo)
                observation = np.array(frame_obj.frame_feature_extractor(np.array(obs_robo)))
                observation = observation.reshape(-1)
                # pause()

                if i == 15:
                    vid_robo = np.array(vid_robo_)
                    robo_features = activity_obj.feature_extractor(vid_robo)
                    reward = -(distance(demo_features, robo_features))
                    reward = np.array(reward)
                    print('reward: ', reward)
                else:
                    reward = 0
                    reward = np.array(reward)
                    print('reward: ', reward)

                # Storing reward after every step
                reward_st_all = np.append(reward_st_all, reward)
                np.savetxt('reward_all.txt', reward_st_all, newline="\n")

                # add s_t, s_t+1, action, reward to experience memory
                agent.add_experience(x, observation, action, reward, False)
                reward_per_rollout += reward
                counter += 1

                # train critic and actor network
                if counter > start_training:
                    agent.train()

            print('\n\n')

            # Saving policy
            if (episode % 100) == 0 and t == num_rollouts - 1:
                print('saving policy...........................!')
                agent.save_actor(episode)

            reward_per_episode += reward_per_rollout

        # check if episode ends:
        print('EPISODE: ', episode, ' Total Reward: ', reward_per_episode)
        print("Printing reward to file")
        exploration_noise.reset()  # reinitializing random noise for action exploration
        reward_st = np.append(reward_st, reward_per_episode)
        np.savetxt('episode_reward.txt', reward_st, fmt='%f', newline="\n")
        print('\n\n')
        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / num_episodes))
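# --- Hedged example --------------------------------------------------------
# s2l() scores a rollout with reward = -(distance(demo_features, robo_features)),
# but distance() itself is defined elsewhere in the project. A plausible minimal
# version is a Euclidean distance between the flattened feature tensors; this is
# an assumption about that helper, not the project's actual metric.
import numpy as np

def distance_sketch(demo_features, robo_features):
    """Euclidean distance between flattened demo and robot activity features."""
    demo = np.asarray(demo_features, dtype=np.float32).reshape(-1)
    robo = np.asarray(robo_features, dtype=np.float32).reshape(-1)
    return float(np.linalg.norm(demo - robo))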
def main():
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG()
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0

    # saving reward:
    reward_st = np.array([0])

    # network setup
    s = socket.socket()              # Create a socket object
    # host = socket.gethostname()    # Get local machine name
    host = ''                        # Bind to all interfaces
    port = 21567                     # Reserve a port for your service.
    s.bind((host, port))
    s.listen(5)

    imgorigin_t = np.zeros((300, 400))
    imgorigin_t_1 = np.zeros((300, 400))
    action_t = np.zeros(6)
    action_t_1 = np.zeros(6)

    index = 0

    # the first time
    c, addr = s.accept()  # Establish connection with client.
    print 'Got connection from', addr
    print "Receiving..."
    l = c.recv(1024)
    f = open('temp.tif', 'wb')
    while l:
        f.write(l)
        l = c.recv(1024)
    f.close()
    print "Done Receiving"

    imgorigin_t = np.array(Image.open('temp.tif'))
    tempimg = imgorigin_t[np.newaxis, :, :, np.newaxis]
    tempimg = tempimg.transpose([0, 2, 1, 3])

    test_pred = agent.evaluate_actor(tempimg)
    action_t = test_pred[0]
    print action_t

    str_buf = ''
    str_buf = str_buf + str(action_t[0, 0]) + " "
    str_buf = str_buf + str(action_t[0, 1]) + " "
    str_buf = str_buf + str(action_t[0, 2]) + " "
    str_buf = str_buf + str(action_t[0, 3]) + " "
    str_buf = str_buf + str(action_t[0, 4]) + " "
    str_buf = str_buf + str(action_t[0, 5]) + " "

    # carry the first frame and action forward as the "previous" step
    imgorigin_t_1 = imgorigin_t
    action_t_1 = action_t

    c.send(str_buf)
    c.close()
    index = 1

    while True:
        # update imgorigin_t and action_t with the previous step's values
        imgorigin_t = imgorigin_t_1
        action_t = action_t_1

        c, addr = s.accept()  # Establish connection with client.
        print 'Got connection from', addr
        print "Receiving..."
        l = c.recv(1024)
        f = open('temp.tif', 'wb')
        while l:
            f.write(l)
            l = c.recv(1024)
        f.close()
        print "Done Receiving"

        imgorigin_t_1 = np.array(Image.open('temp.tif'))
        tempimg = imgorigin_t_1[np.newaxis, :, :, np.newaxis]
        tempimg = tempimg.transpose([0, 2, 1, 3])

        test_pred = agent.evaluate_actor(tempimg)
        action_t_1 = test_pred[0]
        print action_t_1

        reward = compute_reward(imgorigin_t_1)
        # add s_t, s_t+1, action, reward to experience memory
        agent.add_experience(imgorigin_t, imgorigin_t_1, action_t, reward, index)

        # train critic and actor network
        if index > 32:
            agent.train()

        str_buf = ''
        str_buf = str_buf + str(action_t_1[0, 0]) + " "
        str_buf = str_buf + str(action_t_1[0, 1]) + " "
        str_buf = str_buf + str(action_t_1[0, 2]) + " "
        str_buf = str_buf + str(action_t_1[0, 3]) + " "
        str_buf = str_buf + str(action_t_1[0, 4]) + " "
        str_buf = str_buf + str(action_t_1[0, 5]) + " "

        c.send(str_buf)
        print "send action finished!"
        c.close()
        index = index + 1
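# --- Hedged example --------------------------------------------------------
# The server loop above reads image bytes until recv() returns an empty string,
# which only happens once the client shuts down its sending side; replying on
# the same connection afterwards is fragile. A common alternative (not
# necessarily what the original client uses) is a length-prefixed frame, as in
# this sketch:
import struct

def recv_exact(conn, n):
    """Read exactly n bytes from a socket, or raise on early EOF."""
    chunks = []
    while n > 0:
        chunk = conn.recv(min(n, 4096))
        if not chunk:
            raise EOFError("socket closed before the full frame was received")
        chunks.append(chunk)
        n -= len(chunk)
    return b"".join(chunks)

def recv_image_frame(conn):
    """Receive one image framed as a 4-byte big-endian length plus payload."""
    (length,) = struct.unpack(">I", recv_exact(conn, 4))
    return recv_exact(conn, length)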
def main():
    experiment = 'quadruped-robot-v0'  # specify environment here
    backupNameFile = "quadruped_robot_0"
    backupPathFile = "storage/" + backupNameFile
    bFullPath = os.path.join(os.path.split(os.path.abspath(__file__))[0], backupPathFile)

    env = gym.make(experiment)
    steps = env.spec.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    global agent
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            # print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                # print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                # print("Printing reward to file")
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break

            # Save some episodes
            # print(episodes)
            # if (episodes == 10):
            #     with open(bFullPath + "_EP_" + episodes + ".pkl", 'wb') as file:
            #         pickle.dump(agent, file)
            #     pickle.dump_session(bFullPath + "_EP_" + episodes + ".pkl")
            #     print('SAVE EPISODE ', episodes)
            #     break

        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
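# --- Hedged example --------------------------------------------------------
# The commented-out block above pickles the whole agent every so often. A
# TensorFlow-backed DDPG object is usually not picklable, so a lighter-weight
# option is to checkpoint only simple training artefacts such as the reward
# history. The helper below is illustrative; the path prefix follows the
# bFullPath convention used above.
import pickle

def save_reward_history(path_prefix, episode_idx, reward_history):
    """Dump the reward history so far to <path_prefix>_EP_<episode>.pkl."""
    with open("%s_EP_%d.pkl" % (path_prefix, episode_idx), "wb") as f:
        pickle.dump(reward_history, f)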
def main():
    sess = tf.Session()
    setting.load_data(setting.currency, train_test_data.file_list, train_test_data.test_file)
    agent = DDPG(sess, CURRENCY, CHART, TIMELINE, LENGTH)
    counter = 0
    reward_for_episode = 0
    total_reward = 0
    epsilon = 1.0   # parameter defining the ratio between random actions and network decisions
    time_step = 0   # frame number

    # saving reward
    reward_st = np.array([0])

    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state('./trade_model')
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print('model has been loaded successfully!')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        print('start new progress.')
        sess.run(tf.global_variables_initializer())

    for idx in range(MAX_EPISODE):
        terminal = False
        print('Starting episode no: %d' % idx)
        state = setting.reset()
        reward_for_episode = 0
        step_on_episode = 0

        while not terminal:
            present_state = state
            if np.random.rand() < epsilon:
                selected_currency = np.random.choice(CURRENCY)
                ratio = 2 * (np.random.rand() - 0.5)
                action = setting.action_value(CURRENCY, selected_currency, ratio)
            else:
                action = agent.evaluate_actor(present_state)

            if idx > OBSERVE:
                epsilon -= 1 / 50000

            state, reward, terminal, _ = setting.step(action)
            # add s_t, s_(t+1), action, reward to experience memory
            agent.add_experience(present_state, state, action, reward, terminal)
            # train critic and actor network
            if time_step > 2000 and time_step % TRAIN_INTERVAL == 0:
                agent.train()
            reward_for_episode += reward
            time_step += 1
            step_on_episode += 1

        # check if episode ends
        print('at %s, EPISODE: %d, Steps: %d, Reward: %d' %
              (str(datetime.datetime.now()), idx, step_on_episode, reward_for_episode))
        reward_st = np.append(reward_st, reward_for_episode)
        if idx % 500 == 0 and idx != 0:
            saver.save(sess, 'trade_model/actor_critic_network.ckpt', global_step=time_step)
        total_reward += reward_for_episode

    print('Average reward per episode: {}'.format(total_reward / MAX_EPISODE))
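# --- Hedged example --------------------------------------------------------
# The trading loop above decays epsilon by 1/50000 per step once idx > OBSERVE
# and never clamps it, so epsilon can eventually drift below zero (at which
# point the random branch simply never fires). A bounded schedule keeps a small
# amount of exploration; the floor value here is an assumption, not part of the
# original setup.
def decay_epsilon(epsilon, decay=1.0 / 50000, floor=0.05):
    """Linearly decay epsilon but never let it drop below a fixed floor."""
    return max(floor, epsilon - decay)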
def main():
    experiment = 'wob.mini.ClickTest-v0'  # specify environment here
    env = gym.make(experiment)
    print "Observation space for ", experiment, ": ", env.observation_space
    print "Action space for ", experiment, ": ", env.action_space
    steps = env.spec.timestep_limit  # steps per episode
    env.configure(remotes=1, fps=5, vnc_driver='go',
                  vnc_kwargs={'encoding': 'tight', 'compress_level': 90,
                              'fine_quality_level': 100, 'subsample_level': 0})

    # assert isinstance(env.observation_space, Box), "observation space must be continuous"
    # assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    num_states = 1   # env.observation_space.shape[0]
    num_actions = 3  # env.action_space.shape[0]
    exploration_noise = OUNoise(num_actions)
    counter = 0
    reward_per_episode = 0
    total_reward = 0

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps

    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        reward_per_episode = 0
        observation = env.reset()
        print "OBSERVATION: ", observation

        # initialize xcoord and ycoord randomly for each episode
        xcoord = np.random.randint(0, 160) + 10
        ycoord = np.random.randint(0, 160) + 75 + 50

        for t in xrange(steps):
            # rendering environment
            env.render()

            for ob in observation:
                if ob is not None:
                    x = ob['vision']
                    crop = x[75:75 + 210, 10:10 + 160, :]
                    print "Previous observation: ", crop
                    print "Shape? ", crop.shape
                else:
                    crop = None

            ## Original code for action
            # action = agent.evaluate_actor(np.reshape(prevobv, [1, num_states]))  # currently returning [nan nan nan nan nan nan]
            # noise = exploration_noise.noise()
            # action = action[0] + noise  # Select action according to current policy and exploration noise
            # print "Noise: ", noise
            action = move(xcoord, ycoord, choose_distance(), choose_angle())
            print "Action at step", t, " :", action, "\n"

            observation, reward, done, info = env.step(action)
            env.render()
            print "Done?", done

            # add previous observation, current observation, action and reward to the agent's experience memory
            agent.add_experience(crop, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:  # why 64? Perhaps to account for initialisation?
                agent.train()
            reward_per_episode += reward[0]
            counter += 1

            # check if episode ends:
            if done[0] == True or (t == steps - 1):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
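# --- Hedged example --------------------------------------------------------
# The wob loop above builds its action with move(xcoord, ycoord,
# choose_distance(), choose_angle()), helpers defined elsewhere in that
# project. The sketch below is only a guess at what such a helper could look
# like for OpenAI Universe VNC tasks: it interprets distance/angle as a polar
# offset (angle assumed in radians) and emits press/release pointer events.
# Depending on how env.step() is called, the returned event list may still
# need to be wrapped in an outer list (one entry per remote).
import math
from universe import spaces as vnc_spaces

def move_sketch(xcoord, ycoord, dist, angle):
    """Move the cursor by a polar offset from (xcoord, ycoord) and click."""
    new_x = int(xcoord + dist * math.cos(angle))
    new_y = int(ycoord + dist * math.sin(angle))
    return [vnc_spaces.PointerEvent(new_x, new_y, 0),   # move with button up
            vnc_spaces.PointerEvent(new_x, new_y, 1),   # press left button
            vnc_spaces.PointerEvent(new_x, new_y, 0)]   # release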
class PlayAgent:
    def __init__(self, ip="127.0.0.1", id=28888):
        # ip is the server address, id is the agent identifier
        self.ar = ClientActionRobotJava(ip)
        self.se = GameStateExtractor()   # game state extractor
        self.tp = TrajectoryPlanner()    # trajectory planning module
        self.firstShot = True
        self.solved = []                 # holds 1 for cleared levels, 0 otherwise
        self.currentLevel = -1
        self.failedCounter = 0
        self.id = id
        self.width = 840                 # game screen width
        self.height = 480                # game screen height

        # Size of the game screenshot (= state); the number of state values
        # equals the number of pixels on screen.
        # [height, width, RGB 3 channels]
        self.num_states = [self.height, self.width, 3]

        # Action space definition
        # [distance (0~90 px), angle (0~90 degrees), tapTime (0~5000 ms)]
        self.num_actions = 3
        self.action_space_high = [90, 75, 50]
        self.action_space_low = [0, 0, 0]
        self.noise_mean = [20, -20, 0]
        self.noise_sigma = [10, 30, 20]

        self.ddpg = DDPG(self.num_states, self.num_actions,
                         self.action_space_high, self.action_space_low, is_batch_norm)

    def getNextLevel(self):
        # Fetch the next level to play.
        level = 0
        unsolved = False

        for i in range(len(self.solved)):
            if self.solved[i] == 0:
                unsolved = True
                level = i + 1
                if level <= self.currentLevel and self.currentLevel < len(self.solved):
                    continue
                else:
                    return level

        if unsolved:
            return level
        level = (self.currentLevel + 1) % len(self.solved)
        if level == 0:
            level = len(self.solved)
        return level

    def checkMyScore(self):
        scores = self.ar.checkMyScore()  # check the current scores
        level = 1
        for s in scores:  # check the score of each level
            print "||\tlevel %d score is : %d\t||" % (level, s)
            if s > 0:
                self.solved[level - 1] = 1
            level += 1

    def getScreenBuffer(self, buffer, width=840, height=480):
        """Grab the current gameplay screenshot and store each RGB channel separately."""
        print "## Get ScreenBuffer"
        # returnBuffer's size = (480, 840, 3)
        returnBuffer = np.zeros((height, width, 3))
        for i in range(height):
            for j in range(width):
                RGB = buffer.getRGB(j, i)
                returnBuffer[i, j, 0] = RGB & 0x0000ff
                returnBuffer[i, j, 1] = RGB & 0x00ff00
                returnBuffer[i, j, 2] = RGB & 0xff0000
        print "## Return ScreenBuffer"
        return returnBuffer

    def shoot(self, action):
        """Shoot the bird and return the game state after the shot."""
        # Slingshot detection
        screenshot = self.ar.doScreenShot()
        vision = Vision(screenshot)
        sling = vision.findSlingshotMBR()

        # Current game state
        pigs = vision.findPigsMBR()
        state = self.ar.checkState()

        # Play only if the slingshot was detected, otherwise skip.
        if sling != None:
            # If there are pigs on the map, pick one as the target and shoot.
            if len(pigs) != 0:
                refPoint = self.tp.getReferencePoint(sling)
                print "## Ref Sling Point : ", refPoint

                # Take the action provided by the DDPG agent.
                releaseDistance = action[0]
                releaseAngle = action[1]
                tapTime = action[2]
                print "## Release Distance : ", releaseDistance
                print "## Release Angle : ", releaseAngle

                self.ar.fullyZoomOut()
                screenshot = self.ar.doScreenShot()
                vision = Vision(screenshot)
                _sling = vision.findSlingshotMBR()  # slingshot detected after zooming out

                if _sling != None:
                    # Compare the slingshot before and after zooming out. If the
                    # difference is too large, do not shoot; take a new
                    # screenshot and re-analyse it instead.
                    scale_diff = (sling.width - _sling.width) ** 2 + (sling.height - _sling.height) ** 2
                    if scale_diff < 25:
                        self.ar.shoot(int(refPoint.x), int(refPoint.y), int(releaseDistance),
                                      int(releaseAngle), 0, int(tapTime), True)
                        print "## Shooting is Done"
                        state = self.ar.checkState()
                        if state == state.PLAYING:
                            self.firstShot = False
                    else:
                        print "## Scale is changed. So sling can not execute the shot and will re-segment the image"
                else:
                    print "## No sling was detected. So agent can not execute the shot and will re-segment the image"
        return state

    def ddpg_run(self):
        """Run the DDPG algorithm on raw pixel data (screenshots)."""
        info = self.ar.configure(ClientActionRobot.intToByteArray(self.id))
        self.solved = np.zeros(info[2])
        self.checkMyScore()
        print "## current level : %d" % self.currentLevel

        # DDPG
        # The critic, actor, target critic and target actor networks are
        # initialized randomly, and the experience memory is initialized as a deque.
        exploration_noise = OUNoise(self.num_actions, self.noise_mean, self.noise_sigma)
        counter = 1
        reward_per_episode = 0  # an episode corresponds to one level attempt
        total_reward = 0

        print "# of States : ", self.num_states
        print "# of Actions : ", self.num_actions

        # reward history
        reward_st = np.array([0])

        # Train for as many episodes as configured by the `episodes` parameter.
        for i in xrange(episodes):
            # Fetch the next level.
            self.currentLevel = self.getNextLevel()
            # Load the level if it is 1~3, otherwise reset to level 1 and load it.
            if self.currentLevel < 4:
                self.ar.loadLevel(self.currentLevel)
            else:
                self.currentLevel = 1
                self.ar.loadLevel(self.currentLevel)

            prevscore = 0
            reward_per_episode = 0
            steps = 0
            print "======== Starting Episode No : ", (i + 1), "========", "\n"

            # Loop over a single episode.
            while True:
                # Grab the gameplay screenshot.
                screenshot = self.ar.doScreenShot()
                x = self.getScreenBuffer(screenshot, self.width, self.height)

                # Evaluate the actor to get the next action.
                action = self.ddpg.evaluate_actor(np.reshape(
                    x, [1, self.num_states[0], self.num_states[1], self.num_states[2]]))
                print "## Get Action from network!! : ", action
                action = action[0]
                noise = exploration_noise.noise()

                # Follow the current policy, but add noise so that exploratory
                # actions are taken stochastically.
                action = action + noise
                print action

                # If the distance is negative, flip it to positive.
                action[0] = action[0] if action[0] > self.action_space_low[0] else -action[0]
                # If the distance exceeds the maximum range, clamp it to the maximum.
                action[0] = action[0] if action[0] < self.action_space_high[0] else self.action_space_high[0]
                # Apply the same treatment to the angle...
                action[1] = action[1] if action[1] > self.action_space_low[1] else -action[1]
                action[1] = action[1] if action[1] < self.action_space_high[1] else self.action_space_high[1]
                # ...and to the tap time.
                action[2] = action[2] if action[2] > self.action_space_low[2] else -action[2]
                action[2] = action[2] if action[2] < self.action_space_high[2] else self.action_space_high[2]
                print "## Action at step ", steps, " :", action, "\n"

                # shoot() waits a little after the shot for the score to stabilize.
                state = self.shoot(action)

                if state == state.WON or state == state.LOST:
                    # The episode (one level) is over.
                    print "## Episode End"
                    screenshot = self.ar.doScreenShot()
                    observation = self.getScreenBuffer(screenshot, self.width, self.height)

                    # A reward is given only for winning.
                    if state == state.WON:
                        score = self.se.getScoreEndGame(screenshot)
                        # Use the score gained in this episode divided by 1000 as the reward.
                        reward = (score - prevscore) / 1000.0
                    else:
                        score = prevscore
                        reward = 0.00
                    self.firstShot = True  # reset the first-shot flag at the end of an episode
                    done = True            # mark the episode as done

                    # Store s(t), s(t+1), action and reward in the experience memory.
                    print "######## SCORE : ", score
                    print "######## REWARD : ", reward
                    # x = state (screenBuffer) at t
                    # observation = state (screenBuffer) at (t + 1)
                    self.ddpg.add_experience(x, observation, action, reward, done)

                    # Train the critic and actor networks, but only after a fixed
                    # number of steps so that enough experience has been collected.
                    if counter > TRAIN_STEP:
                        self.ddpg.train()
                    counter += 1
                    steps += 1

                    print "==== EPISODE: ", i, ' Steps: ', steps, ' Total Reward: ', reward_per_episode
                    print "Writing reward info into file..."
                    exploration_noise.reset()

                    # reward_st is an array: append the total reward obtained in
                    # this episode as its last element and write it to a file.
                    reward_st = np.append(reward_st, reward_per_episode)
                    np.savetxt("episodes_reward.txt", reward_st, newline="\n")
                    print "\n\n"
                    break

                elif state == state.PLAYING:
                    screenshot = self.ar.doScreenShot()
                    vision = Vision(screenshot)
                    sling = vision.findSlingshotMBR()
                    while sling == None and self.ar.checkState() == state.PLAYING:
                        print "## No slingshot was detected. Please remove pop up or zoom out"
                        self.ar.fullyZoomOut()
                        screenshot = self.ar.doScreenShot()
                        vision = Vision(screenshot)
                        sling = vision.findSlingshotMBR()

                    # Get S(t + 1).
                    observation = self.getScreenBuffer(screenshot, self.width, self.height)

                    # Store S(t), S(t + 1), action and reward in the experience memory.
                    score = self.ar.getInGameScore(screenshot)
                    reward = (score - prevscore) / 1000.0
                    prevscore = score
                    done = False
                    reward_st = np.append(reward_st, reward)
                    self.ddpg.add_experience(x, observation, action, reward, done)
                    print "## Add experience (action) (reward) (done)", action, reward, done

                    # Train the critic and actor networks.
                    if counter > TRAIN_STEP:
                        self.ddpg.train()
                    reward_per_episode += reward
                    counter += 1
                    steps += 1

                # Exception handling for unexpected game states.
                elif state == state.LEVEL_SELECTION:
                    print "unexpected level selection page, go to the last current level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)
                elif state == state.MAIN_MENU:
                    print "unexpected main menu page, reload the level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)
                elif state == state.EPISODE_MENU:
                    print "unexpected episode menu page, reload the level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)

            total_reward += reward_per_episode  # accumulate the reward over episodes

        avg_reward = total_reward / episodes
        print "## Average reward per episode is : ", avg_reward
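# --- Hedged example --------------------------------------------------------
# The per-component ternaries in ddpg_run() first reflect an action component
# that falls below its lower bound and then clamp it at its upper bound. With
# lower bounds of zero, as in this agent, the same bounding can be written more
# compactly with NumPy; this sketch is an equivalent reformulation, not a
# change to the agent above.
import numpy as np

def bound_action(action, low, high):
    """Reflect negative components, then clamp each one into [low, high]."""
    return np.clip(np.abs(np.asarray(action, dtype=np.float64)), low, high)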
def main():
    experiment = 'InvertedPendulum-v1'
    env = gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            # env.render()
            x = observation
            # select action using actor network model
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            print 'Agent.Action :', action
            print '\n'
            print '\n'

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done:
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n'
                print '\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
def main():
    '''
    In this file, we first load the system state parameters from the .mat files.
    Then, for each slot, we observe the state parameters and make the action,
    and save this state-action record into the memory for later training.
    Finally, the system moves on to the next episode.
    '''
    # load the state parameters from the .mat file
    task_size = sio.loadmat('./data/data')['input_data_size']            # load the task size
    CPU_density = sio.loadmat('./data/data')['input_CPU_density']        # load the required CPU cycles of each task bit
    task_delay_re = sio.loadmat('./data/data')['input_task_delay_re']    # load the maximum tolerated delay of each task
    task_gain = sio.loadmat('./data/data')['input_task_gain']            # load the gain of each task
    user_power = sio.loadmat('./data/data')['input_user_power']          # load the transmit power of each user
    user_chan_gain = sio.loadmat('./data/data')['input_user_chan_gain']  # load the wireless channel gain of each user
    bs_capacity = sio.loadmat('./data/data')['input_bs_capacity']        # load the computing capacity of each base station

    # set the number of users in each base station
    bs_1_user_num = 10
    bs_2_user_num = 20
    bs_3_user_num = 10

    # set the wireless channel noise, the channel bandwidth and the transmission rate of the wired connection
    chan_noise = 10**(-8)
    chan_band = 10**6
    wired_rate = 10

    # set the number of time slots
    slot_len = 10000

    # set the record counter of the replay buffer, the total reward and the reward record over all time slots
    counter = 0
    total_reward = 0
    reward_st = np.array([0])

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    num_states, num_actions = len(task_size[:, 1]) * 7, len(task_size[:, 1])
    agent = DDPG(num_states, num_actions, is_batch_norm)

    # set the exploration noise used to perturb the actions
    exploration_noise = OUNoise(1)

    # traverse each slot and make the action decision
    for i in range(slot_len):
        print("==== Starting episode no:", i, "====", "\n")

        # obtain the current system state
        current_state = np.hstack((task_size[:, i], CPU_density[:, i], task_delay_re[:, i], task_gain[:, i],
                                   user_power[:, 0], user_chan_gain[:, i], bs_capacity[:, i]))
        current_state = np.reshape(current_state, [1, -1])

        actor_input = current_state                       # set the input of the actor network
        actor_output = agent.evaluate_actor(actor_input)  # predict the action in this slot
        noise = exploration_noise.noise()                 # obtain the noise added to the action
        action = actor_output[0] + noise                  # Select action according to current policy and exploration noise
        # print("Action at slot", i, " :", action, "\n")

        reward = 1  # function(action, current_state): obtain the reward in this slot (placeholder)

        # obtain the system state in the next slot
        next_state = np.hstack((task_size[:, i+1], CPU_density[:, i+1], task_delay_re[:, i+1], task_gain[:, i+1],
                                user_power[:, 0], user_chan_gain[:, i+1], bs_capacity[:, i+1]))
        next_state = np.reshape(next_state, [1, -1])

        # add s_t, s_t+1, action, reward to experience memory
        agent.add_experience(current_state, next_state, action, reward)

        # train critic and actor network
        if counter > 64:
            agent.train()
        counter += 1

        # print('EPISODE: ', i, 'Reward: ', reward)
        reward_st = np.append(reward_st, reward)
        np.savetxt('episode_reward.txt', reward_st, newline="\n")
        total_reward += reward

    print("Average reward per episode {}".format(total_reward / slot_len))
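# --- Hedged example --------------------------------------------------------
# The offloading loop above builds the per-slot state by stacking seven
# per-user quantities, once for slot i and once for slot i+1. A small helper
# makes that construction reusable; it restates the code above rather than
# adding anything to the system model.
import numpy as np

def build_state(i, task_size, CPU_density, task_delay_re, task_gain,
                user_power, user_chan_gain, bs_capacity):
    """Stack the seven per-user feature columns of slot i into one row vector."""
    state = np.hstack((task_size[:, i], CPU_density[:, i], task_delay_re[:, i],
                       task_gain[:, i], user_power[:, 0], user_chan_gain[:, i],
                       bs_capacity[:, i]))
    return np.reshape(state, [1, -1])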
def main():
    enable_actuator_dynamics = True
    env = ControlSystem(enable_actuator_dynamics=enable_actuator_dynamics)
    steps = env.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    # agent.load_model()
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # saving reward:
    reward_st = np.array([0])

    # use a single timestamp for all log directories of this run
    run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'action')
    if enable_actuator_dynamics == True:
        filtered_log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'filtered_action')
    y_hat_log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'y_hat')
    y_ref_log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'y_ref')
    gen_function_log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'function')

    os.makedirs(log_dir)
    if enable_actuator_dynamics == True:
        os.makedirs(filtered_log_dir)
    os.makedirs(y_hat_log_dir)
    os.makedirs(y_ref_log_dir)
    os.makedirs(gen_function_log_dir)

    for i in range(episodes):
        print("==== Starting episode no:", i, "====")
        observation = env.reset()
        reward_per_episode = 0
        actions_per_episode = []
        if enable_actuator_dynamics == True:
            filtered_action_per_episode = []

        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            actions_per_episode.append(action)
            # if i % 100 == 0:
            #     print("Action at step", t, " :", action, "\n")
            # print("#", action[0])

            # clamp the action into [0, 1]
            if action[0] < 0:
                action = [0]
            elif action[0] > 1:
                action = [1]
            # print("Step", t, 'action', action)

            if enable_actuator_dynamics == False:
                observation, reward, Y_plot, t_plot, y_ref, random_function = env.step(action, t)
            elif enable_actuator_dynamics == True:
                observation, reward, filtered_action, Y_plot, t_plot, y_ref, random_function = env.step(action, t)
                filtered_action_per_episode.append(filtered_action)
            # print("Reward at step", t, " :", reward, "\n")

            # add y_t, y_t-1, action, reward, timestep to experience memory
            agent.add_experience(x, observation, action, reward, t)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if t == steps - 1:
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                # print("Printing reward to file")
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")

                # print("Y_plot")
                # plt.step(t_plot, Y_plot)
                # plt.grid()
                # plt.xlabel('t')
                # plt.ylabel('y')
                # plt.show()

                # Save actions
                np.savetxt(log_dir + '/' + str(i).zfill(7) + '.txt', actions_per_episode)
                if enable_actuator_dynamics == True:
                    np.savetxt(filtered_log_dir + '/' + str(i).zfill(7) + '.txt', filtered_action_per_episode)
                np.savetxt(y_hat_log_dir + '/' + str(i).zfill(7) + '.txt', Y_plot)
                np.savetxt(y_ref_log_dir + '/' + str(i).zfill(7) + '.txt', y_ref)
                # np.savetxt(gen_function_log_dir + '/' + str(i).zfill(7) + '.txt', random_function)

                # save model
                if i % 100 == 0:
                    print('save')
                    agent.save_model()
                # print('\n\n')
                break

        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
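# --- Hedged example --------------------------------------------------------
# The control-system loop above clamps the scalar action into [0, 1] with an
# if/elif that replaces the action by the violated bound. An equivalent,
# vectorized alternative is np.clip, shown here only as a sketch; the training
# loop above keeps its original formulation.
import numpy as np

def clamp_unit_interval(action):
    """Clamp every component of the action into [0, 1]."""
    return np.clip(np.asarray(action, dtype=np.float64), 0.0, 1.0)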