Example #1
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        self.local_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                                  device=device,
                                                  network_scope=network_scope,
                                                  scene_scopes=[scene_scope])

        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.env = None

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
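The helpers _local_var_name and _get_accum_grad_name used above are not part of this snippet. Below is a minimal sketch of how they might look, assuming TensorFlow variables follow the usual scope/name:index naming and that AccumTrainer names its accumulators with an _accum_grad suffix; both assumptions come from this illustration, not from the example itself.

    def _local_var_name(self, var):
        # Strip the leading thread/network scope so local and global variables
        # can be matched by their remaining name.
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        # Assumed name of the gradient accumulator created for this variable.
        return self._local_var_name(var).replace(':', '_') + '_accum_grad:0'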
Example #2
def test(rank, scene_scope, task_scope, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    
    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
        })
    
    model = ActorCriticFFNetwork(ACTION_SIZE)

    model.eval()

    height, width, layers = env.observation.shape
    # fourcc of -1 lets OpenCV pick a codec itself (platform dependent)
    video = cv2.VideoWriter('video/' + task_scope + '.mp4', -1, 1, (width, height))

    env.reset()
    state = torch.from_numpy(env.s_t)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0

    img = cv2.cvtColor(env.observation, cv2.COLOR_BGR2RGB)
    video.write(img)
    for i in range(100):
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

        logit, value = model(env.s_t, env.target)
        prob = F.softmax(logit, dim=1)
        action = prob.max(1, keepdim=True)[1].data.numpy()
        env.step(action[0, 0])
        env.update()        
        img = cv2.cvtColor(env.observation, cv2.COLOR_BGR2RGB)
        video.write(img)
        
        reward = env.reward
        state = env.s_t
        done = env.terminal
        print(env.terminal_state_id, env.current_state_id)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            env.reset()
            state = env.s_t
            break

        state = torch.from_numpy(state)
    cv2.destroyAllWindows()
    video.release()
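The test function above is meant to run alongside the training workers. A minimal launch sketch, assuming args has already been parsed elsewhere; the scene name 'bathroom_02' and task id '26' are placeholders, not values taken from this example.

import torch.multiprocessing as mp

if __name__ == '__main__':
    shared_model = ActorCriticFFNetwork(ACTION_SIZE)
    shared_model.share_memory()
    counter = mp.Value('i', 0)  # shared step counter read inside test()

    p = mp.Process(target=test,
                   args=(0, 'bathroom_02', '26', args, shared_model, counter))
    p.start()
    p.join()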
Example #3
from constants import CHECKPOINT_DIR
from constants import NUM_EVAL_EPISODES
from constants import VERBOSE

from constants import TASK_TYPE
from constants import TASK_LIST

if __name__ == '__main__':

    device = "/cpu:0"  # use CPU for display tool
    network_scope = TASK_TYPE
    list_of_tasks = TASK_LIST
    scene_scopes = list_of_tasks.keys()

    global_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                          device=device,
                                          network_scope=network_scope,
                                          scene_scopes=scene_scopes)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver(global_network.get_vars())
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint.model_checkpoint_path))
    else:
        print("Could not find old checkpoint")
Example #4
    def __init__(self,
                 thread_index,
                 global_network,
                 global_discriminator,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 grad_applier_discriminator,
                 max_global_time_step,
                 device,
                 device2,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.network_scope_D = network_scope + "_d"
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        self.scopes_d = [self.network_scope_D, task_scope]

        self.local_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                                  device=device,
                                                  network_scope=network_scope,
                                                  scene_scopes=[scene_scope])

        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)

        self.trainer.prepare_minimize(
            self.local_network.total_loss,  # get the gradients for the local network variables
            self.local_network.get_vars())

        # This part is for the newly added PPO loss (we need to keep both the old and the new parameters)
        new_variable_list = self.local_network.get_vars()
        old_variable_list = self.local_network.get_vars_old()

        # For the PPO loss we need to sync the old parameters with the current ones at the beginning of each iteration
        self.old_new_sync = self.local_network.sync_curre_old()

        self.accum_gradients = self.trainer.accumulate_gradients()  # op that accumulates the gradients
        self.reset_gradients = self.trainer.reset_gradients()  # after applying the grads we need to reset the accumulators

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]  #get the name list of all the grad vars

        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]  # keep only the global_network vars that have an accumulated gradient computed for them
        local_net_vars = [
            x for x in self.local_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
        self.apply_gradients_local = grad_applier.apply_gradients_local_net(
            local_net_vars, self.trainer.get_accum_grad_list())

        # If this is unstable, it is desirable to first apply the gradients to the local network,
        # clip them, and only then apply them to the global network.
        self.sync = self.local_network.sync_from(
            global_network
        )  # sync from the global network: apply the updated global params to the local network

        # This part is for the Discriminator
        #########################################################################################
        self.local_discriminator = Discriminator_WGAN(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])

        self.local_discriminator.prepare_loss_D(ENTROPY_BETA, self.scopes_d)

        self.trainer_D = AccumTrainer_d(device=device,
                                        name="AccumTrainer_d")

        self.trainer_D.prepare_minimize(
            self.local_discriminator.total_loss_d,
            self.local_discriminator.get_vars())

        self.accum_gradients_d = self.trainer_D.accumulate_gradients()
        self.reset_gradients_d = self.trainer_D.reset_gradients()

        accum_grad_names_discrimi = [
            self._local_var_name(x)
            for x in self.trainer_D.get_accum_grad_list()
        ]  # names of all accumulated discriminator gradient vars

        global_discri_vars = [
            x for x in global_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]
        local_discri_vars = [
            x for x in self.local_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]

        self.apply_gradients_discriminator = grad_applier_discriminator.apply_gradients(
            local_discri_vars, self.trainer_D.get_accum_grad_list()
        )  # apply the accumulated gradients to the LOCAL discriminator

        self.clip_local_d_weights = self.local_discriminator.clip_weights(
        )  # WGAN weight clipping, applied to the local discriminator weights

        self.sync_discriminator_l_G = self.local_discriminator.sync_to(
            global_discriminator)
        self.sync_discriminator_G_l = self.local_discriminator.sync_from(
            global_discriminator)

        self.D_var_G = global_discriminator.get_vars()
        self.D_var_l = self.local_discriminator.get_vars()
        #########################################################################################

        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
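The clip_weights() op used above is not shown in this snippet. Here is a rough sketch of what WGAN-style weight clipping could look like in TF1, under the assumption that the discriminator's trainable variables are passed in; the helper name and the 0.01 clip value are illustrative, not the repository's actual implementation.

def clip_discriminator_weights(variables, clip_value=0.01):
    # Clamp every discriminator variable into [-clip_value, clip_value],
    # the weight-clipping trick from the original WGAN paper.
    with tf.name_scope("clip_weights"):
        return tf.group(*[
            tf.assign(v, tf.clip_by_value(v, -clip_value, clip_value))
            for v in variables
        ])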
Example #5
def train(rank,
          scene_scope,
          task_scope,
          args,
          shared_model,
          counter,
          lock,
          optimizer=None):
    torch.manual_seed(args.seed + rank)

    #env = create_atari_env(args.env_name)
    #env.seed(args.seed + rank)

    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
    })

    model = ActorCriticFFNetwork(ACTION_SIZE)

    if optimizer is None:
        # TODO: Discount learning rate based on episode length
        optimizer = my_optim.SharedRMSprop(shared_model.parameters(),
                                           lr=args.lr,
                                           alpha=args.alpha,
                                           eps=args.eps)
        optimizer.share_memory()

    model.train()

    env.reset()
    state = torch.from_numpy(env.s_t)
    done = True

    episode_length = 0
    for i in range(int(args.max_episode_length)):
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        '''
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        '''

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            print('Thread: ', rank, ', step: ', step, 'epochs:', i)
            episode_length += 1
            logit, value = model(env.s_t, env.target)
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            env.step(action)
            #state, reward, done, _ = env.step(action.numpy())
            env.update()
            state = env.s_t
            reward = env.reward
            done = env.terminal

            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                if counter.value % 1000 == 0:
                    print('Now saving data. Please wait.')
                    torch.save(shared_model.state_dict(),
                               CHECKPOINT_DIR + '/' + 'checkpoint.pth.tar')
                counter.value += 1

            if done:
                episode_length = 0
                if env.terminal:
                    print('Task completed')
                counter.value += 1
                env.reset()
                state = env.s_t

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            _, value = model(env.s_t, env.target)
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for t in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[t]
            advantage = R - values[t]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[t] + args.gamma * \
                values[t + 1].data - values[t].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[t] * Variable(gae) - args.entropy_coef * entropies[t]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
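ensure_shared_grads is called above but not defined in this example. The sketch below follows the commonly used pytorch-a3c reference implementation, which hands each worker gradient to the shared model only when the shared model has no gradient attached yet; the repository's own version may differ.

def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            # Gradients already point at shared storage; nothing to copy.
            return
        shared_param._grad = param.grad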
Example #6
                    help='environment to train on (default: bathroom_02)')
parser.add_argument('--no_shared',
                    default=False,
                    help='use an optimizer without shared statistics')

if __name__ == '__main__':
    torch.multiprocessing.set_start_method('spawn')
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = ""
    list_of_tasks = TASK_LIST
    scene_scopes = list_of_tasks.keys()
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    #env = create_atari_env(args.env_name)
    shared_model = ActorCriticFFNetwork(ACTION_SIZE)
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedRMSprop(shared_model.parameters(),
                                           lr=args.lr,
                                           alpha=args.alpha,
                                           eps=args.eps)
        optimizer.share_memory()

    processes = []

    counter = mp.Value('i', 0)
    lock = mp.Lock()
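The snippet ends before any workers are started. A typical continuation, assumed here in the pytorch-a3c style and using the train function from Example #5 (which would have to be imported), spawns one process per scene/task pair and then waits for all of them.

    rank = 0
    for scene_scope in scene_scopes:
        for task_scope in list_of_tasks[scene_scope]:
            p = mp.Process(target=train,
                           args=(rank, scene_scope, str(task_scope), args,
                                 shared_model, counter, lock, optimizer))
            p.start()
            processes.append(p)
            rank += 1

    for p in processes:
        p.join()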
Example #7
def main():
    # disable all v2 behavior
    tf.disable_v2_behavior()
    tf.disable_eager_execution()

    device = "/cpu:0"  # use CPU for display tool
    network_scope = TASK_TYPE  # Always 'navigation'
    list_of_tasks = TASK_LIST
    scene_scopes = list_of_tasks.keys()

    global_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                          device=device,
                                          network_scope=network_scope,
                                          scene_scopes=scene_scopes)
    sess = tf.Session()
    # sess = tf.coSession()
    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    # see if we saved a checkpoint from past training?
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint.model_checkpoint_path))
    else:
        print("Could not find old checkpoint")

    scene_stats = dict()
    for scene_scope in scene_scopes:
        # TODO: remove
        scene_scope = "FloorPlan402"
        scene_stats[scene_scope] = []
        for task_scope in list_of_tasks[scene_scope]:
            # tasks are positions!!!
            # env = ai2thor.controller.Controller(scene="FloorPlan227", gridSize=0.25, width=1000, height=1000)
            with open(GOAL_FILE, 'r') as f:
                GOAL_DATA = json.load(f)

            GOAL_POS = GOAL_DATA["agent_position"]
            env = RLController({
                'scene': scene_scope,
                'terminal_state_id': int(task_scope),
                'goal_pos': GOAL_POS,
                'goal_image_fpath': "data/FP402_goal_towel.png"
            })
            env.docker_enabled = True
            ep_rewards = []
            ep_lengths = []
            ep_collisions = []

            scopes = [network_scope, scene_scope]

            for i_episode in range(NUM_EVAL_EPISODES):
                env.reset()

                terminal = False
                ep_reward = 0
                ep_collision = 0
                ep_t = 0

                while not terminal:
                    # mirrors actions taken in paper
                    # NOTE: rearranged these to mirror code in scene_loader
                    list_of_actions = [
                        "MoveAhead", "RotateRight", "RotateLeft", "MoveBack"
                    ]

                    pi_values = global_network.run_policy(
                        sess, env.curr_state, env.target, scopes)
                    # action returned is an integer -- critical that the list_of_actions is in correct order

                    action = sample_action(pi_values)
                    print(
                        "Ep_t: {} \n\tCollided?: {} \n\tAction: {} \n\tValue: {} \n\tAll Action Values: {}"
                        .format(ep_t, env.collided, list_of_actions[action],
                                pi_values[action], pi_values))
                    env.step(list_of_actions[action])

                    env.update()

                    terminal = env.terminal
                    if ep_t == 10000: break

                    if env.collided: ep_collision += 1
                    ep_reward += env.reward
                    ep_t += 1

                # record per-episode statistics once the episode has terminated
                ep_lengths.append(ep_t)
                ep_rewards.append(ep_reward)
                ep_collisions.append(ep_collision)

                print('evaluation: %s %s' % (scene_scope, task_scope))
                print('mean episode reward: %.2f' % np.mean(ep_rewards))
                print('mean episode length: %.2f' % np.mean(ep_lengths))
                print('mean episode collision: %.2f' % np.mean(ep_collisions))
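sample_action is used above but not defined in the snippet. A minimal sketch, assuming pi_values is the softmax policy vector returned by run_policy; the repository's own helper may be implemented differently.

def sample_action(pi_values):
    # Draw an action index according to the policy probabilities.
    return np.random.choice(len(pi_values), p=pi_values)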