Example #1
def main():
    experiment= 'InvertedPendulum-v1' #specify environments here
    env= gym.make(experiment)
    steps= env.spec.timestep_limit #steps per episode    
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    reward_per_episode = 0    
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]    
    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])
      
    
    for i in xrange(episodes):
        print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            print "Action at step", t ," :",action,"\n"
            
            observation,reward,done,info=env.step(action)
            
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print 'EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset() #reinitializing random noise for action exploration
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                print '\n\n'
                break
    total_reward+=reward_per_episode            
    print "Average reward per episode {}".format(total_reward / episodes)    
Example #2
def s2l():

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    num_states = feature_size  #num_states = env.observation_space.shape[0]
    num_actions = num_controls
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    action_space_high = [0.5, 0.5, 0.5]
    action_space_low = [0.0, 0.0, 0.0]
    print("Action space highest values", action_space_high)
    print("Action space lowest values:", action_space_low)
    robot = RoboControl()

    agent = DDPG(is_batch_norm, num_states, num_actions, action_space_high,
                 action_space_low)
    exploration_noise = OUNoise(num_actions)
    counter = 0
    total_reward = 0
    print("Number of Rollouts per episode:", num_rollouts)
    print("Number of Steps per roll out:", steps)
    reward_st = np.array([0])  #saving reward
    eval_metric_st = np.array([0])
    reward_st_all = np.array([0])  #saving reward after every step

    activity_obj = Vid_Feature()
    demo_vid_array = demo_array_extractor(demo_folder)
    demo_features = activity_obj.feature_extractor(demo_vid_array)

    frame_obj = Frame_Feature()
    camera_obj = Camera()

    for episode in range(num_episodes):
        print("==== Starting episode no:", episode, "====", "\n")

        robot.reset()  # Reset env at the beginning of each episode
        obs_img = camera_obj.render()  # Get the observation
        #obs_img=np.array(misc.imresize(obs_img,[112,112,3]))
        observation = np.array(frame_obj.frame_feature_extractor(obs_img))
        observation = observation.reshape(-1)
        reward_per_episode = 0

        for t in range(num_rollouts):

            reward_per_rollout = 0
            vid_robo_ = []

            for i in range(steps):

                x = observation

                action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
                noise = exploration_noise.noise()
                action = action[
                    0] + noise  #Select action according to current policy and exploration noise
                print('Action at episode-', episode, 'rollout-', t, 'step-', i,
                      " :", action)

                robot.publish_control(action)
                obs_robo = camera_obj.render()  # Get the observation
                #obs_robo=misc.imresize(obs_robo,[112,112,3])
                vid_robo_.append(obs_robo)
                observation = np.array(
                    frame_obj.frame_feature_extractor(np.array(obs_robo)))
                observation = observation.reshape(-1)
                #pause()

                if (i == 15):
                    vid_robo = np.array(vid_robo_)
                    robo_features = activity_obj.feature_extractor(vid_robo)
                    reward = -(distance(demo_features, robo_features))
                    reward = np.array(reward)
                    print('reward: ', reward)
                else:
                    reward = 0
                    reward = np.array(reward)
                    print('reward: ', reward)

                # Storing reward after every step
                reward_st_all = np.append(reward_st_all, reward)
                np.savetxt('reward_all.txt', reward_st_all, newline="\n")

                #add s_t,s_t+1,action,reward to experience memory
                agent.add_experience(x, observation, action, reward, False)
                reward_per_rollout += reward
                counter += 1

            #train critic and actor network
            if counter > start_training:
                agent.train()
            print('\n\n')

            #Saving policy
            if ((episode % 100) == 0 and t == num_rollouts - 1):
                print('saving policy...........................!')
                agent.save_actor(episode)

            reward_per_episode += reward_per_rollout

        #check if episode ends:

        print('EPISODE: ', episode, ' Total Reward: ', reward_per_episode)
        print("Printing reward to file")
        exploration_noise.reset(
        )  #reinitializing random noise for action exploration
        reward_st = np.append(reward_st, reward_per_episode)
        np.savetxt('episode_reward.txt', reward_st, fmt='%f', newline="\n")
        print('\n\n')

        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / num_episodes))
Example #3
def main():
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG()
    exploration_noise = OUNoise(6)  # the action is 6-dimensional (see action_t below); no gym env here
    counter=0
    reward_per_episode = 0    
    total_reward=0
    #saving reward:
    reward_st = np.array([0])
      
    # network setup
    s = socket.socket()         # Create a socket object
    #host = socket.gethostname() # Get local machine name
    host = ''                    # Get local machine name
    port = 21567  # Reserve a port for your service.
    s.bind((host, port))
    
    s.listen(5)
    imgorigin_t = np.zeros((300, 400))
    imgorigin_t_1 = np.zeros((300, 400))
    action_t = np.zeros(6)
    action_t_1 = np.zeros(6)
    index = 0


    #the first time
    c, addr = s.accept()     # Establish connection with client.
    print ('Got connection from'), addr
    print ("Receiving...")
    l = c.recv(1024)
    f = open('temp.tif','wb')
    while (l):
        f.write(l)
        l = c.recv(1024)
    f.close()
    print ("Done Receiving")
    imgorigin_t = np.array(Image.open('temp.tif'))
    tempimg = imgorigin_t[np.newaxis,:,:,np.newaxis]
    tempimg = tempimg.transpose([0,2,1,3])
    test_pred = agent.evaluate_actor(tempimg)
    action_t = test_pred[0]

    print action_t

    str_buf = ''
    str_buf = str_buf+str(action_t[0,0])+" "
    str_buf = str_buf+str(action_t[0,1])+" "
    str_buf = str_buf+str(action_t[0,2])+" "
    str_buf = str_buf+str(action_t[0,3])+" "
    str_buf = str_buf+str(action_t[0,4])+" "
    str_buf = str_buf+str(action_t[0,5])+" "
    
    imgorigin_t_1 = imgorigin_t
    action_t_1 = action_t

    c.send(str_buf)
    c.close()
    
    index =1
    while True:
        #update imgorigin_t and action_t
        imgorigin_t = imgorigin_t_1
        action_t = action_t_1
        c, addr = s.accept()     # Establish connection with client.
        print ('Got connection from'), addr
        print ("Receiving...")
        l = c.recv(1024)
        f = open('temp.tif','wb')
        while (l):
            f.write(l)
            l = c.recv(1024)
        f.close()
        print ("Done Receiving")
        imgorigin_t_1 = np.array(Image.open('temp.tif'))
        tempimg = imgorigin_t_1[np.newaxis,:,:,np.newaxis]
        tempimg = tempimg.transpose([0,2,1,3])
        test_pred = agent.evaluate_actor(tempimg)
        action_t_1 = test_pred[0]
        print action_t_1

        reward = compute_reward(imgorigin_t_1)
        agent.add_experience(imgorigin_t,imgorigin_t_1,action_t,reward,index)

        if index > 32:
            agent.train()

        str_buf = ''
        str_buf = str_buf+str(action_t_1[0,0])+" "
        str_buf = str_buf+str(action_t_1[0,1])+" "
        str_buf = str_buf+str(action_t_1[0,2])+" "
        str_buf = str_buf+str(action_t_1[0,3])+" "
        str_buf = str_buf+str(action_t_1[0,4])+" "
        str_buf = str_buf+str(action_t_1[0,5])+" "
        c.send(str_buf)
        print("send action finished!")
        c.close()

        index = index+1
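Example #3 is the server side of a simple TCP loop: it receives a temp.tif image, evaluates the actor, and replies with six space-separated control values. A hypothetical client for the same protocol (the host, port 21567 and the half-close via shutdown are assumptions chosen to match the recv loop above) might look like:

import socket

def send_image_and_get_action(image_path, host='127.0.0.1', port=21567):
    # Hypothetical client: stream the image, half-close the connection so the
    # server's recv() loop terminates, then read back the action string.
    s = socket.socket()
    s.connect((host, port))
    with open(image_path, 'rb') as f:
        chunk = f.read(1024)
        while chunk:
            s.sendall(chunk)
            chunk = f.read(1024)
    s.shutdown(socket.SHUT_WR)            # server's recv() now returns an empty string
    action_str = s.recv(1024).decode()    # e.g. "a0 a1 a2 a3 a4 a5 "
    s.close()
    return [float(v) for v in action_str.split()]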
Example #4
def main():
    experiment = 'quadruped-robot-v0'  #specify environments here
    backupNameFile = "quadruped_robot_0"

    backupPathFile = "storage/" + backupNameFile
    bFullPath = os.path.join(
        os.path.split(os.path.abspath(__file__))[0], backupPathFile)

    env = gym.make(experiment)
    steps = env.spec.timestep_limit  #steps per episode
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    global agent
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            # print ("Action at step", t ," :",action,"\n")

            observation, reward, done, info = env.step(action)

            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                # print ('EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode)
                # print ("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        # Save some episodes
        # print(episodes)
        # if (episodes == 10):
        # with open(bFullPath+"_EP_"+episodes+".pkl", 'wb') as file:
        #     pickle.dump(agent, file)
        # pickle.dump_session(bFullPath+"_EP_"+episodes+".pkl")
        # print ('SAVE EPISODE ',episodes)
        # break;
    total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
Example #5
def main():
    sess = tf.Session()

    setting.load_data(setting.currency, train_test_data.file_list,
                      train_test_data.test_file)
    agent = DDPG(sess, CURRENCY, CHART, TIMELINE, LENGTH)
    counter = 0
    reward_for_episode = 0
    total_reward = 0

    epsilon = 1.0  # probability of taking a random action instead of the actor's output
    time_step = 0  # frame number

    # saving reward
    reward_st = np.array([0])

    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state('./trade_model')

    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print('model has been loaded successfully!')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        print('start new progress.')
        sess.run(tf.global_variables_initializer())

    for idx in range(MAX_EPISODE):
        terminal = False
        print('Starting episode no: %d' % idx)
        state = setting.reset()
        reward_for_episode = 0
        step_on_episode = 0

        while not terminal:
            present_state = state
            if np.random.rand() < epsilon:
                selected_currency = np.random.choice(CURRENCY)
                ratio = 2 * (np.random.rand() - 0.5)
                action = setting.action_value(CURRENCY, selected_currency,
                                              ratio)

            else:
                action = agent.evaluate_actor(present_state)

            if idx > OBSERVE:
                epsilon -= 1 / 50000

            state, reward, terminal, _ = setting.step(action)

            # add s_t, s_(t+1), action, reward to experience memory
            agent.add_experience(present_state, state, action, reward,
                                 terminal)

            # train critic and actor network
            if time_step > 2000 and time_step % TRAIN_INTERVAL == 0:
                agent.train()

            reward_for_episode += reward
            time_step += 1
            step_on_episode += 1

        # check if episode ends
        print('at %s, EPISODE: %d, Steps: %d, Reward: %d' %
              (str(datetime.datetime.now()), idx, step_on_episode,
               reward_for_episode))
        reward_st = np.append(reward_st, reward_for_episode)

        if idx % 500 == 0 and idx != 0:
            saver.save(sess,
                       'trade_model/actor_critic_network.ckpt',
                       global_step=time_step)

    total_reward += reward_for_episode
    print('Average reward per episode: {}'.format(total_reward / MAX_EPISODE))
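Example #5 references several module-level constants that are not shown (CURRENCY, CHART, TIMELINE, LENGTH, MAX_EPISODE, OBSERVE, TRAIN_INTERVAL). The values below are purely illustrative placeholders, not taken from the original project:

# Hypothetical module-level configuration for Example #5 (illustrative values only).
CURRENCY = ['EUR/USD', 'GBP/USD', 'USD/JPY']   # tradable instruments
CHART = 30            # candles per state window
TIMELINE = 60         # candle granularity in minutes
LENGTH = 4            # features per candle (e.g. OHLC)
MAX_EPISODE = 10000   # number of training episodes
OBSERVE = 100         # episodes before epsilon starts decaying
TRAIN_INTERVAL = 4    # train the networks every 4 steps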
Example #6
def main():
    experiment = 'wob.mini.ClickTest-v0'  #specify environments here
    env = gym.make(experiment)
    print "Observation space for ", experiment, ": ", env.observation_space
    print "Action space for ", experiment, ": ", env.action_space
    steps = env.spec.timestep_limit  #steps per episode
    env.configure(remotes=1,
                  fps=5,
                  vnc_driver='go',
                  vnc_kwargs={
                      'encoding': 'tight',
                      'compress_level': 90,
                      'fine_quality_level': 100,
                      'subsample_level': 0
                  })
    #assert isinstance(env.observation_space, Box), "observation space must be continuous"
    #assert isinstance(env.action_space, Box), "action space must be continuous"
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm)
    num_states = 1  #env.observation_space.shape[0]
    num_actions = 3  #env.action_space.shape[0]
    exploration_noise = OUNoise(num_actions)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        reward_per_episode = 0
        observation = env.reset()
        print "OBSERVATION: ", observation
        #initialize xcoord and ycoord randomly for each episode
        xcoord = np.random.randint(0, 160) + 10
        ycoord = np.random.randint(0, 160) + 75 + 50
        for t in xrange(steps):
            #rendering environment
            env.render()
            for ob in observation:
                if ob is not None:
                    x = ob['vision']
                    crop = x[75:75 + 210, 10:10 + 160, :]
                    print "Previous observation: ", crop
                    print "Shape? ", crop.shape
                else:
                    crop = None

            ##Original code for action
            # action = agent.evaluate_actor(np.reshape(prevobv,[1,num_states])) #currently returning [ nan  nan  nan  nan  nan  nan]
            # noise = exploration_noise.noise()
            # action = action[0] + noise #Select action according to current policy and exploration noise
            # print "Noise: ", noise

            action = move(xcoord, ycoord, choose_distance(), choose_angle())

            print "Action at step", t, " :", action, "\n"

            observation, reward, done, info = env.step(action)
            env.render()
            print "Done?", done

            #add previous observation,current observation,action and reward to agent's experience memory
            agent.add_experience(crop, observation, action, reward, done)

            #train critic and actor network
            if counter > 64:  #why 64? Perhaps to account for initialisation?
                agent.train()

            reward_per_episode += reward[0]
            counter += 1
            #check if episode ends:
            if (done[0] == True or (t == steps - 1)):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
    total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
Example #7
class PlayAgent:

    def __init__(self, ip="127.0.0.1", id=28888):   # ip is the server address, id is the agent identifier
        self.ar = ClientActionRobotJava(ip)
        self.se = GameStateExtractor()  # game state extractor
        self.tp = TrajectoryPlanner()   # trajectory planning module
        self.firstShot = True
        self.solved = []            # cleared levels hold 1, uncleared levels hold 0
        self.currentLevel = -1
        self.failedCounter = 0
        self.id = id
        self.width = 840            # game screen width
        self.height = 480           # game screen height
        # Size of the game screenshot (= state); the number of state values
        # equals the number of pixels on the screen.
        # [height, width, RGB 3 channels]
        self.num_states = [self.height, self.width, 3]
        # Action space definition
        # [distance (0~90 px), angle (0~90 degrees), tapTime (0~5000 ms)]
        self.num_actions = 3
        self.action_space_high = [90, 75, 50]
        self.action_space_low = [0, 0, 0]
        self.noise_mean = [20, -20, 0]
        self.noise_sigma = [10, 30, 20]
        self.ddpg = DDPG(self.num_states, self.num_actions,
                         self.action_space_high, self.action_space_low, is_batch_norm)

    def getNextLevel(self):     # fetch the next level to play

        level = 0
        unsolved = False

        for i in range(len(self.solved)):
            if self.solved[i] == 0:
                unsolved = True
                level = i + 1
                if level <= self.currentLevel and self.currentLevel < len(self.solved):
                    continue
                else:
                    return level

        if unsolved:
            return level

        level = (self.currentLevel + 1) % len(self.solved)
        if level == 0:
            level = len(self.solved)

        return level

    def checkMyScore(self):

        scores = self.ar.checkMyScore()     # check the current scores
        level = 1
        for s in scores:    # check the score for each level
            print "||\tlevel %d score is : %d\t||" % (level, s)
            if s > 0:
                self.solved[level - 1] = 1
            level += 1

    def getScreenBuffer(self, buffer, width=840, height=480):
        """
            Fetch the current gameplay screenshot.
            The R, G and B channels are stored separately.
        """
        print "## Get ScreenBuffer"
        # returnBuffer's size = (480, 840, 3)
        returnBuffer = np.zeros((height, width, 3))
        for i in range(height):
            for j in range(width):
                RGB = buffer.getRGB(j, i)
                returnBuffer[i, j, 0] = RGB & 0x0000ff
                returnBuffer[i, j, 1] = RGB & 0x00ff00
                returnBuffer[i, j, 2] = RGB & 0xff0000

        print "## Return ScreenBuffer"
        return returnBuffer

    def shoot(self, action):
        """
            Shoot a bird and return the game state
            after the shot.
        """
        # slingshot detection
        screenshot = self.ar.doScreenShot()
        vision = Vision(screenshot)
        sling = vision.findSlingshotMBR()

        # current game state
        pigs = vision.findPigsMBR()
        state = self.ar.checkState()

        # play if the slingshot is detected, otherwise skip
        if sling != None:

            # if there are pigs on the map, pick one at random as the target and shoot
            if len(pigs) != 0:

                refPoint = self.tp.getReferencePoint(sling)
                print "## Ref Sling Point : ", refPoint

                # get the action to take from the DDPG actor
                releaseDistance = action[0]
                releaseAngle = action[1]
                tapTime = action[2]
                print "## Release Distance : ", releaseDistance
                print "## Release Angle : ", releaseAngle

                self.ar.fullyZoomOut()
                screenshot = self.ar.doScreenShot()
                vision = Vision(screenshot)
                _sling = vision.findSlingshotMBR()  # slingshot detected after zooming out

                if _sling != None:
                    # compare the slingshot size before and after zooming out;
                    # if the difference is too large, skip the shot and re-take a screenshot for analysis
                    scale_diff = (sling.width - _sling.width) ** 2 + \
                        (sling.height - _sling.height) ** 2

                    if scale_diff < 25:
                        self.ar.shoot(int(refPoint.x), int(refPoint.y), int(
                            releaseDistance), int(releaseAngle), 0, int(tapTime), True)
                        print "## Shooting is Done"
                        state = self.ar.checkState()

                        if state == state.PLAYING:
                            self.firstShot = False

                    else:
                        print "## Scale is changed. So sling can not execute the shot and will re-segment the image"
                else:
                    print "## No sling was detected. So agent can not execute the shot and will re-segment the image"

        return state

    def ddpg_run(self):
        """
            Run the DDPG algorithm on raw pixel data (screenshots).
        """

        info = self.ar.configure(ClientActionRobot.intToByteArray(self.id))
        self.solved = np.zeros(info[2])
        self.checkMyScore()
        print "## current level : %d" % self.currentLevel

        # DDPG
        # randomly initialize the critic, actor, target critic and target actor networks,
        # and initialize the experience memory as a deque
        exploration_noise = OUNoise(
            self.num_actions, self.noise_mean, self.noise_sigma)
        counter = 1
        reward_per_episode = 0      # an episode corresponds to one round of play
        total_reward = 0
        print "# of States : ", self.num_states
        print "# of Actions : ", self.num_actions

        # saving reward
        reward_st = np.array([0])

        # run training for the number of episodes set by the parameter
        for i in xrange(episodes):

            # fetch the next level
            self.currentLevel = self.getNextLevel()
            # if the fetched level is 1~3, load it; otherwise reset to level 1 and load that
            if self.currentLevel < 4:
                self.ar.loadLevel(self.currentLevel)
            else:
                self.currentLevel = 1
                self.ar.loadLevel(self.currentLevel)

            prevscore = 0
            reward_per_episode = 0
            steps = 0
            print "======== Starting Episode No : ", (i + 1), "========", "\n"

            # loop over a single episode
            while True:

                # grab a gameplay screenshot
                screenshot = self.ar.doScreenShot()
                x = self.getScreenBuffer(screenshot, self.width, self.height)
                # get the next action by evaluating the actor
                action = self.ddpg.evaluate_actor(np.reshape(
                    x, [1, self.num_states[0], self.num_states[1], self.num_states[2]]))
                print "## Get Action from network!! : ", action
                action = action[0]
                noise = exploration_noise.noise()
                # select the action according to the current policy,
                # but take exploratory actions stochastically
                # depending on the magnitude of the noise
                action = action + noise
                print action
                # if the distance is negative, flip it to positive
                action[0] = action[0] if action[0] > self.action_space_low[0] else -action[0]
                # if the distance exceeds the maximum range, clamp it to the maximum
                action[0] = action[0] if action[0] < self.action_space_high[0] else self.action_space_high[0]
                # apply the same handling to the angle
                action[1] = action[1] if action[1] > self.action_space_low[1] else -action[1]
                action[1] = action[1] if action[1] < self.action_space_high[1] else self.action_space_high[1]
                # and the same for the tap time
                action[2] = action[2] if action[2] > self.action_space_low[2] else -action[2]
                action[2] = action[2] if action[2] < self.action_space_high[2] else self.action_space_high[2]
                print "## Action at step ", steps, " :", action, "\n"
                # shoot() waits briefly after the shot until the score stabilizes
                state = self.shoot(action)

                if state == state.WON or state == state.LOST:
                    # when the episode ends (one level is finished)
                    print "## Episode End"

                    screenshot = self.ar.doScreenShot()
                    observation = self.getScreenBuffer(
                        screenshot, self.width, self.height)

                    # a reward is given on a win, none on a loss
                    if state == state.WON:
                        score = self.se.getScoreEndGame(screenshot)
                        # use the score gained in this episode divided by 1000 as the reward
                        reward = (score - prevscore) / 1000.0
                    else:
                        score = prevscore   # no new score on a loss; keeps the print below well defined
                        reward = 0.00

                    self.currentLevel = self.currentLevel
                    self.firstShot = True   # reset the first-shot flag when the episode ends
                    done = True             # mark the episode as done

                    # store s(t), s(t + 1), action and reward
                    # in the experience memory
                    print "######## SCORE : ", score
                    print "######## REWARD : ", reward
                    # x = state (screenBuffer) at t
                    # observation = state (screenBuffer) at (t + 1)
                    self.ddpg.add_experience(
                        x, observation, action, reward, done)

                    # train the critic and actor networks;
                    # start training only after the preset number of steps,
                    # so that enough experience has been accumulated first
                    if counter > TRAIN_STEP:
                        self.ddpg.train()
                    counter += 1
                    steps += 1

                    print "==== EPISODE: ", i, ' Steps: ', steps, ' Total Reward: ', reward_per_episode
                    print "Writing reward info into file..."
                    exploration_noise.reset()
                    # reward_st is an array: append the total reward
                    # obtained in this round as the last element
                    # and write it out to a file
                    reward_st = np.append(reward_st, reward_per_episode)
                    np.savetxt("episodes_reward.txt", reward_st, newline="\n")
                    print "\n\n"

                    break

                elif state == state.PLAYING:    # while in the PLAYING state
                    screenshot = self.ar.doScreenShot()
                    vision = Vision(screenshot)
                    sling = vision.findSlingshotMBR()

                    while sling == None and self.ar.checkState() == state.PLAYING:
                        print "## No slingshot was detected. Please remove pop up or zoom out"
                        self.ar.fullyZoomOut()
                        screenshot = self.ar.doScreenShot()

                    # get S(t + 1)
                    observation = self.getScreenBuffer(
                        screenshot, self.width, self.height)
                    # store S(t), S(t + 1), action and reward
                    # in the experience memory
                    score = self.ar.getInGameScore(screenshot)
                    reward = (score - prevscore) / 1000.0
                    prevscore = score
                    done = False
                    reward_st = np.append(reward_st, reward)

                    self.ddpg.add_experience(
                        x, observation, action, reward, done)
                    print "## Add experience (action) (reward) (done)", action, reward, done

                    # train the critic and actor networks
                    if counter > TRAIN_STEP:
                        self.ddpg.train()
                    reward_per_episode += reward
                    counter += 1
                    steps += 1

                # exception handling for unexpected game states
                elif state == state.LEVEL_SELECTION:
                    print "unexpected level selection page, go to the last current level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)
                elif state == state.MAIN_MENU:
                    print"unexpected main menu page, reload the level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)
                elif state == state.EPISODE_MENU:
                    print "unexpected episode menu page, reload the level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)

        total_reward += reward_per_episode  # accumulate the reward over episodes
        avg_reward = total_reward / episodes
        print "## Average reward per episode is : ", avg_reward
Example #8
def main():
    experiment= 'InvertedPendulum-v1'
    env= gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    
    #saving reward:
    reward_st = np.array([0])
    
    
    
    for i in xrange(episodes):
        observation = env.reset()
    
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            #env.render()
            
            x = observation
            #select action using actor network model
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            
            noise = exploration_noise.noise()
            
                       
            action = action[0] + noise
            
            
            print 'Agent.Action :',action
            print '\n'
            print '\n'
            
                      
            observation,reward,done,_=env.step(action)
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()            
            
            reward_per_episode+=reward
            
            counter+=1
            #check if episode ends:
            if done:
                print 'EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode
                exploration_noise.reset()
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                print '\n'
                print '\n'
                break
    total_reward+=reward_per_episode            
    print "Average reward per episode {}".format(total_reward / episodes)    
Example #9
def main():
    
    '''
    In this file, we first load the system state parameters from the .mat files. Then, for each
    slot, we observe the state parameters and choose an action, and save this state-action
    record into the memory for later training. Finally, the system moves on to the next episode.
    '''
    #load the state parameters from the .mat file
    task_size = sio.loadmat('./data/data')['input_data_size']   #load the task size 
    CPU_density = sio.loadmat('./data/data')['input_CPU_density']   #load the required CPU cycles of each task bit 
    task_delay_re = sio.loadmat('./data/data')['input_task_delay_re']  #load the maximum toleration delay of each task
    task_gain = sio.loadmat('./data/data')['input_task_gain']  #load the gain of each task
    user_power = sio.loadmat('./data/data')['input_user_power']  #load the transmit power of each user
    user_chan_gain = sio.loadmat('./data/data')['input_user_chan_gain']  #load the wireless channel gain of each user
    bs_capacity = sio.loadmat('./data/data')['input_bs_capacity']  #load the computing capacity of each base station    
    
    
    #set the number of users in these base station
    bs_1_user_num = 10
    bs_2_user_num = 20
    bs_3_user_num = 10
    
    #set the wireless channel noise, channel bandwidth, and transmission rate of the wired connection
    chan_noise =   10**(-8)
    chan_band = 10**6
    wired_rate = 10
    
    #set the length of time slot 
    slot_len = 10000
    
    #Set the record counter for the replay buffer, the total reward, and the reward record over all time slots
    counter = 0 
    total_reward = 0
    reward_st = np.array([0])
    
    #Randomly initialize critic,actor,target critic, target actor network and replay buffer
    num_states, num_actions = len(task_size[:,1]) * 7, len(task_size[:,1])
    agent = DDPG(num_states, num_actions, is_batch_norm)
    
    #set the exploration noise that encourages the algorithm to explore
    exploration_noise = OUNoise(1)
    
    #iterate over each slot and make the action decision
    for i in range(slot_len):
        print ("==== Starting episode no:",i,"====","\n")
        current_state = np.hstack((task_size[:,i], CPU_density[:,i], task_delay_re[:,i], task_gain[:,i],\
        user_power[:,0], user_chan_gain[:,i],bs_capacity[:,i]))   #obtain the current system state
        current_state = np.reshape(current_state, [1, -1])
        actor_input = current_state   #set the input of actor network
        actor_output = agent.evaluate_actor(actor_input)   #predict the action in this slot
        noise = exploration_noise.noise()   #obtain the noise added in the action
        action = actor_output[0] + noise #Select action according to current policy and exploration noise
#        print ("Action at slot", i ," :",action,"\n")
        reward = 1  #function(action, current_state)   #obtain the reward in this slot
        next_state = np.hstack((task_size[:,i+1], CPU_density[:,i+1], task_delay_re[:,i+1], task_gain[:,i+1], user_power[:,0],\
        user_chan_gain[:,i+1], bs_capacity[:,i+1]))   #obtain the system state in the next slot
        next_state = np.reshape(next_state, [1, -1])
        agent.add_experience(current_state, next_state, action, reward)   #add s_t,s_t+1,action,reward to experience memory
        #train critic and actor network
        if counter > 64: 
            agent.train()
        counter+=1
#        print ('EPISODE: ',i,'Reward: ',reward)
        reward_st = np.append(reward_st,reward)
        np.savetxt('episode_reward.txt',reward_st, newline="\n")
    total_reward+=reward
    print ("Average reward per episode {}".format(total_reward / slot_len))
Example #10
def main():
    enable_actuator_dynamics = True
    env = ControlSystem(enable_actuator_dynamics=enable_actuator_dynamics)

    steps = env.timestep_limit  #steps per episode
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm)

    # agent.load_model()

    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    log_dir = os.path.join(os.getcwd(), 'logs',
                           datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                           'action')

    if enable_actuator_dynamics == True:
        filtered_log_dir = os.path.join(
            os.getcwd(), 'logs',
            datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), 'filtered_action')

    y_hat_log_dir = os.path.join(os.getcwd(), 'logs',
                                 datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                                 'y_hat')

    y_ref_log_dir = os.path.join(os.getcwd(), 'logs',
                                 datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                                 'y_ref')

    gen_function_log_dir = os.path.join(
        os.getcwd(), 'logs',
        datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), 'function')

    os.makedirs(log_dir)
    if enable_actuator_dynamics == True:
        os.makedirs(filtered_log_dir)
    os.makedirs(y_hat_log_dir)
    os.makedirs(y_ref_log_dir)
    os.makedirs(gen_function_log_dir)

    for i in range(episodes):
        print("==== Starting episode no:", i, "====")
        observation = env.reset()
        reward_per_episode = 0
        actions_per_episode = []
        if enable_actuator_dynamics == True:
            filtered_action_per_episode = []

        for t in range(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))

            noise = exploration_noise.noise()

            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            actions_per_episode.append(action)
            # if i % 100 == 0:
            #     print ("Action at step", t ," :",action,"\n")
            # print("#", action[0])
            if action[0] < 0:
                action = [0]
            elif action[0] > 1:
                action = [1]

            # print("Step", t, 'action', action)

            if enable_actuator_dynamics == False:
                observation, reward, Y_plot, t_plot, y_ref, random_function = env.step(
                    action, t)
            elif enable_actuator_dynamics == True:
                observation, reward, filtered_action, Y_plot, t_plot, y_ref, random_function = env.step(
                    action, t)
                filtered_action_per_episode.append(filtered_action)

            # print ("Reward at step", t ," :",reward,"\n")
            #add y_t,y_t-1,action,reward,timestep to experience memory
            agent.add_experience(x, observation, action, reward, t)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (t == steps - 1):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ',
                      reward_per_episode)
                # print ("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")

                # print("Y_plot")
                # plt.step(t_plot,Y_plot)
                # plt.grid()
                # plt.xlabel('t')
                # plt.ylabel('y')
                # plt.show()

                # Save actions
                np.savetxt(log_dir + '/' + str(i).zfill(7) + '.txt',
                           actions_per_episode)
                if enable_actuator_dynamics == True:
                    np.savetxt(
                        filtered_log_dir + '/' + str(i).zfill(7) + '.txt',
                        filtered_action_per_episode)
                np.savetxt(y_hat_log_dir + '/' + str(i).zfill(7) + '.txt',
                           Y_plot)
                np.savetxt(y_ref_log_dir + '/' + str(i).zfill(7) + '.txt',
                           y_ref)
                # np.savetxt(gen_function_log_dir + '/' + str(i).zfill(7) + '.txt', random_function)

                # save model
                if i % 100 == 0:
                    print('save')
                    agent.save_model()
                # print ('\n\n')

                break

    total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))