Python DRQN.init_hidden_states Exemples, networks.dqn.DRQN.init_hidden_states Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : pomdp.py Projet : kracon7/Multi-Step-Recurrent-Q-Learning-for-Robotic-Velcro-Peeling

class POMDP:
    def __init__(self, args):
        """Store the run configuration and create the plotting figure.

        Args:
            args: Configuration object. ``max_iter``, ``grip_force`` and
                ``break_thresh`` are read here; the object itself is kept
                on ``self.args`` for the other methods.
        """
        self.args = args

        # Values taken straight from the configuration.
        self.max_iter = args.max_iter
        self.gripping_force = args.grip_force
        self.break_threshold = args.break_thresh

        # Names of the discrete actions, indexed by the action id.
        self.ACTIONS = [
            'left',
            'right',
            'forward',
            'backward',
            'up',
            'down',
        ]

        # Constants of the exponentially decaying exploration threshold
        # computed in select_action.
        self.P_START = 0.999
        self.P_END = 0.05
        self.P_DECAY = 600

        # One figure with two side-by-side axes, stored as a flat triple.
        figure, axes = plt.subplots(1, 2)
        first_axis, second_axis = axes
        self.figure = (figure, first_axis, second_axis)

    # Function to select an action from our policy or a random one
    def select_action(self, observation, hidden_state, cell_state):
        sample = random.random()
        p_threshold = self.P_END + (self.P_START - self.P_END) * math.exp(
            -1. * self.steps_done / self.P_DECAY)

        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            self.policy_net_1.eval()
            torch_observation = torch.from_numpy(observation).float().to(
                self.args.device).unsqueeze(0)
            model_out = self.policy_net_1(torch_observation,
                                          batch_size=1,
                                          time_step=1,
                                          hidden_state=hidden_state,
                                          cell_state=cell_state)
            out = model_out[0]
            hidden_state = model_out[1][0]
            cell_state = model_out[1][1]
            self.policy_net_1.train()

            if sample > p_threshold:
                action = int(torch.argmax(out[0]))
                return action, hidden_state, cell_state
            else:
                return random.randrange(
                    0, self.args.outdim), hidden_state, cell_state

    def optimize_model(self):
        args = self.args
        if len(self.memory) < (args.batch_size):
            return

        hidden_batch_1, cell_batch_1 = self.policy_net_1.init_hidden_states(
            args.batch_size, args.device)
        hidden_batch_2, cell_batch_2 = self.policy_net_2.init_hidden_states(
            args.batch_size, args.device)
        hidden_batch_3, cell_batch_3 = self.policy_net_3.init_hidden_states(
            args.batch_size, args.device)
        batch = self.memory.sample(args.batch_size, args.time_step)
        if not batch:
            return

        current_states, actions, rewards, next_states = [], [], [], []
        for b in batch:
            cs, ac, rw, ns = [], [], [], []
            for element in b:
                cs.append(element.state)
                ac.append(element.action)
                rw.append(element.reward)
                ns.append(element.next_state)
            current_states.append(cs)
            actions.append(ac)
            rewards.append(rw)
            next_states.append(ns)

        current_states = torch.from_numpy(np.array(current_states)).float().to(
            args.device)
        actions = torch.from_numpy(np.array(actions)).long().to(args.device)
        rewards = torch.from_numpy(np.array(rewards)).float().to(args.device)
        next_states = torch.from_numpy(np.array(next_states)).float().to(
            args.device)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        Q_s_1, _ = self.policy_net_1.forward(current_states,
                                             batch_size=args.batch_size,
                                             time_step=args.time_step,
                                             hidden_state=hidden_batch_1,
                                             cell_state=cell_batch_1)
        Q_s_a_1 = Q_s_1.gather(
            dim=1,
            index=actions[:,
                          args.time_step - 1].unsqueeze(dim=1)).squeeze(dim=1)

        Q_s_2, _ = self.policy_net_2.forward(current_states,
                                             batch_size=args.batch_size,
                                             time_step=args.time_step,
                                             hidden_state=hidden_batch_2,
                                             cell_state=cell_batch_2)
        Q_s_a_2 = Q_s_2.gather(
            dim=1,
            index=actions[:,
                          args.time_step - 1].unsqueeze(dim=1)).squeeze(dim=1)

        Q_s_3, _ = self.policy_net_3.forward(current_states,
                                             batch_size=args.batch_size,
                                             time_step=args.time_step,
                                             hidden_state=hidden_batch_3,
                                             cell_state=cell_batch_3)
        Q_s_a_3 = Q_s_3.gather(
            dim=1,
            index=actions[:,
                          args.time_step - 1].unsqueeze(dim=1)).squeeze(dim=1)

        Q_next_1, _ = self.policy_net_1.forward(next_states,
                                                batch_size=args.batch_size,
                                                time_step=args.time_step,
                                                hidden_state=hidden_batch_1,
                                                cell_state=cell_batch_1)
        Q_next_max_1 = Q_next_1.detach().max(dim=1)[0]

        Q_next_2, _ = self.policy_net_2.forward(next_states,
                                                batch_size=args.batch_size,
                                                time_step=args.time_step,
                                                hidden_state=hidden_batch_2,
                                                cell_state=cell_batch_2)
        Q_next_max_2 = Q_next_2.detach().max(dim=1)[0]

        Q_next_3, _ = self.policy_net_3.forward(next_states,
                                                batch_size=args.batch_size,
                                                time_step=args.time_step,
                                                hidden_state=hidden_batch_3,
                                                cell_state=cell_batch_3)
        Q_next_max_3 = Q_next_3.detach().max(dim=1)[0]

        Q_next_max = torch.min(torch.min(Q_next_max_1, Q_next_max_2),
                               Q_next_max_3)

        # Compute the expected Q values
        target_values = rewards[:,
                                args.time_step - 1] + (args.gamma * Q_next_max)

        # Compute Huber loss
        loss_1 = F.smooth_l1_loss(Q_s_a_1, target_values)
        loss_2 = F.smooth_l1_loss(Q_s_a_2, target_values)
        loss_3 = F.smooth_l1_loss(Q_s_a_3, target_values)

        # Optimize the model
        self.optimizer_1.zero_grad()
        self.optimizer_2.zero_grad()
        self.optimizer_3.zero_grad()
        loss_1.backward()
        loss_2.backward()
        loss_3.backward()
        for param in self.policy_net_1.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_2.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_3.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_1.step()
        self.optimizer_2.step()
        self.optimizer_3.step()
        return [loss_1, loss_2, loss_3]

    def train_POMDP(self):
        """Train the three recurrent Q-networks and return the logged metrics.

        The method creates the output directory and dumps ``args`` to
        ``args.txt``, builds the three ``DRQN`` networks with their RMSprop
        optimizers and a ``RecurrentMemory(800)``, optionally restores state
        from ``args.checkpoint_file``, ``args.normalizer_file`` and
        ``args.memory``, sets up the simulation objects, and then runs
        episodes ``start_episode`` through ``args.epochs - 1``.

        Each episode collects transitions in a local list that is pushed to
        the replay memory when the episode ends; ``optimize_model`` is called
        every tenth step.  A checkpoint is written whenever the episode index
        is a multiple of ``args.save_freq`` and the replay memory is saved
        after each episode that reaches the end of the loop body.

        Returns:
            dict: ``print_variables`` with the keys ``'durations'``,
            ``'rewards'`` and ``'loss'``.  Nothing in this method appends to
            ``'loss'``.
        """
        args = self.args
        ROOT_DIR = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))  # corl2019
        PARENT_DIR = os.path.dirname(ROOT_DIR)  # research
        # Create the output directory if it does not exist
        output_dir = os.path.join(PARENT_DIR, 'multistep_pomdp',
                                  args.output_dir)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # write args to file
        with open(os.path.join(output_dir, 'args.txt'), 'w+') as f:
            json.dump(args.__dict__, f, indent=2)
        # NOTE(review): redundant, the with-block above already closed f.
        f.close()

        # Create the three policy networks (no separate target network is
        # created in this method)
        self.policy_net_1 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_3 = DRQN(args.indim, args.outdim).to(args.device)

        # Set up the optimizers, the replay memory and the step counter
        self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters())
        self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters())
        self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters())
        self.memory = RecurrentMemory(800)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        # Resume networks, optimizers, counters and plot history from a
        # checkpoint when one is given and exists on disk.
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.policy_net_3.load_state_dict(checkpoint['policy_net_3'])
                self.optimizer_1.load_state_dict(checkpoint['optimizer_1'])
                self.optimizer_2.load_state_dict(checkpoint['optimizer_2'])
                self.optimizer_3.load_state_dict(checkpoint['optimizer_3'])
                start_episode = checkpoint['epochs']
                self.steps_done = checkpoint['steps_done']
                # The plot history is read from 'results_pomdp.pkl' next to
                # the checkpoint file.
                # NOTE(review): pickle.load must only be used on trusted
                # files.
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        # Optionally restore the normalizer statistics
        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        # Optionally restore a previously saved replay memory
        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        action_space = ActionSpace(dp=0.06, df=10)

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            # The counter that drives the exploration decay in select_action
            # advances once per episode here.
            self.steps_done += 1
            act_sequence = []
            # NOTE(review): sim_params, localMemory, the hidden/cell states,
            # observation_space and broken_so_far are only assigned inside
            # this branch; with a falsy args.sim the step loop below would
            # use names that were never bound in this method.
            if args.sim:
                sim_params = init_model(robot.mj_sim)
                robot.reset_simulation()
                ret = robot.grasp_handle()
                # Skip the episode when the grasp fails; this also skips the
                # checkpoint and memory saving at the end of the loop body.
                if not ret:
                    continue

                # Local memory for current episode
                localMemory = []

                # Get current observation
                hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                    batch_size=1, device=args.device)
                hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                    batch_size=1, device=args.device)
                hidden_state_3, cell_state_3 = self.policy_net_3.init_hidden_states(
                    batch_size=1, device=args.device)
                observation_space = TactileObs(
                    robot.get_gripper_xpos(),  # 24
                    robot.get_all_touch_buffer(args.hap_sample))  # 30 x 7
                broken_so_far = 0

            for t in count():
                if not args.quiet and t % 50 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                observation = observation_space.get_state()
                # Optional feature selection on the flat observation vector:
                # drop the first 6 entries, drop entries 6..165, or keep only
                # the first 166 entries.
                if args.position:
                    observation = observation[6:]
                if args.shear:
                    indices = np.ones(len(observation), dtype=bool)
                    indices[6:166] = False
                    observation = observation[indices]
                if args.force:
                    observation = observation[:166]
                normalizer.observe(observation)
                observation = normalizer.normalize(observation)
                # Only the first network's recurrent state is advanced here;
                # select_action evaluates policy_net_1.
                action, hidden_state_1, cell_state_1 = self.select_action(
                    observation, hidden_state_1, cell_state_1)

                # record actions in this epoch
                act_sequence.append(action)

                # Perform action: add the first three delta components to the
                # first three gripper joint values, keep the remaining ones.
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(robot.get_gripper_jpos()[:3],
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))

                if args.sim:
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Get reward: the increase of the count returned by
                    # update_tendons since the previous step, or 0 when it
                    # did not grow.
                    done, num = robot.update_tendons()
                    failure = robot.check_slippage()
                    if num > broken_so_far:
                        reward = num - broken_so_far
                        broken_so_far = num
                    else:
                        reward = 0

                    # # Add a movement reward
                    # reward -= 0.05 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)

                    # Observe new state
                    observation_space.update(
                        robot.get_gripper_xpos(),  # 24
                        robot.get_all_touch_buffer(args.hap_sample))  # 30x7

                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                # Check if done; terminal transitions store None as their
                # next state, otherwise the next state gets the same feature
                # selection and normalization as the observation above.
                if not done and not failure:
                    next_state = observation_space.get_state()
                    if args.position:
                        next_state = next_state[6:]
                    if args.shear:
                        indices = np.ones(len(next_state), dtype=bool)
                        indices[6:166] = False
                        next_state = next_state[indices]
                    if args.force:
                        next_state = next_state[:166]
                    normalizer.observe(next_state)
                    next_state = normalizer.normalize(next_state)
                else:
                    next_state = None

                # Push new Transition into memory
                localMemory.append(
                    Transition(observation, action, next_state, reward))

                # Optimize the model every tenth step of the episode
                # NOTE(review): loss is assigned but not used afterwards.
                if t % 10 == 0:
                    loss = self.optimize_model()
        #        if loss:
        #            print_variables['loss'].append(loss.item())

                # If we are done, store the episode and log its statistics
                if done or failure:
                    self.memory.push(localMemory)
                    # A failed episode is logged with the maximum duration.
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(sim_params))
                    print("Actions in this epoch are: {}".format(act_sequence))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'policy_net_3': self.policy_net_3.state_dict(),
                        'optimizer_1': self.optimizer_1.state_dict(),
                        'optimizer_2': self.optimizer_2.state_dict(),
                        'optimizer_3': self.optimizer_3.state_dict(),
                    }, save_path)

            # Persist the replay memory after the episode
            self.memory.save_memory(os.path.join(output_dir, 'memory.pickle'))

        # Optionally save the training figure; the file name is built from
        # the current month, day and hour.
        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables

Exemple #2

0

Afficher le fichier

Fichier : multimodal_pomdp.py Projet : kracon7/Multi-Step-Recurrent-Q-Learning-for-Robotic-Velcro-Peeling

class POMDP:
    def __init__(self, args):
        """Store the run configuration and create the plotting figure.

        Args:
            args: Configuration object. ``max_iter``, ``grip_force`` and
                ``break_thresh`` are read here; the object itself is kept
                on ``self.args`` for the other methods.
        """
        self.args = args

        # Values taken straight from the configuration.
        self.max_iter = args.max_iter
        self.gripping_force = args.grip_force
        self.break_threshold = args.break_thresh

        # Names of the discrete actions, indexed by the action id.
        self.ACTIONS = [
            'left',
            'right',
            'forward',
            'backward',
            'up',
            'down',
        ]

        # Constants of the exponentially decaying exploration threshold
        # computed in select_action.
        self.P_START = 0.999
        self.P_END = 0.1
        self.P_DECAY = 60000

        # One figure with two side-by-side axes, stored as a flat triple.
        figure, axes = plt.subplots(1, 2)
        first_axis, second_axis = axes
        self.figure = (figure, first_axis, second_axis)

    # Function to select an action from our policy or a random one
    def select_action(self, observation, hidden_state, cell_state):
        args = self.args
        sample = random.random()
        p_threshold = self.P_END + (self.P_START - self.P_END) * math.exp(
            -1. * self.steps_done / self.P_DECAY)
        self.steps_done += 1

        tactile_obs, img_norm = observation

        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            self.policy_net_1.eval()
            torch_tactile_obs = torch.from_numpy(tactile_obs).float().to(
                args.device).unsqueeze(0)
            torch_img_norm = torch.from_numpy(img_norm).float().to(
                args.device).unsqueeze(0)
            img_ft = self.conv_net_1.forward(torch_img_norm)

            torch_observation = torch.cat((torch_tactile_obs, img_ft), dim=1)
            model_out = self.policy_net_1(torch_observation,
                                          batch_size=1,
                                          time_step=1,
                                          hidden_state=hidden_state,
                                          cell_state=cell_state)
            out = model_out[0]
            hidden_state = model_out[1][0]
            cell_state = model_out[1][1]
            self.policy_net_1.train()

            if sample > p_threshold:
                action = int(torch.argmax(out[0]))
                return action, hidden_state, cell_state
            else:
                return random.randrange(0,
                                        args.outdim), hidden_state, cell_state

    def sample_memory(self):
        batch = self.memory.sample(self.args.batch_size, self.args.time_step)
        if not batch:
            return

        current_states, actions, rewards, next_states = [], [], [], []
        for b in batch:
            cs, ac, rw, ns = [], [], [], []
            for element in b:
                cs.append(element.state)
                ac.append(element.action)
                rw.append(element.reward)
                ns.append(element.next_state)
            current_states.append(cs)
            actions.append(ac)
            rewards.append(rw)
            next_states.append(ns)
        return current_states, actions, rewards, next_states

    def extract_ft(self, conv_net, obs_batch):
        args = self.args
        assert len(obs_batch) == args.batch_size
        assert len(obs_batch[0]) == args.time_step
        result = []
        for b in obs_batch:
            obs_squence = torch.tensor([]).to(args.device)
            for item in b:
                tactile_obs, img_norm = item
                torch_tactile_obs = torch.from_numpy(tactile_obs).float().to(
                    args.device)
                torch_img_norm = torch.from_numpy(img_norm).float().to(
                    args.device).unsqueeze(0)
                img_ft = conv_net.forward(torch_img_norm)
                torch_full_obs = torch.cat((torch_tactile_obs, img_ft[0]))
                obs_squence = torch.cat((obs_squence, torch_full_obs))
            torch_obs_squence = obs_squence.view(args.time_step, -1)
            result.append(obs_squence)
        return torch.stack(result)

    def bootstrap(self, policy_net, conv_net, memory_subset):
        args = self.args
        if len(self.memory) < (args.batch_size):
            return

        current_states, actions, rewards, next_states = memory_subset

        # process observation (tactile_obs, img) to 1d tensor of (tactile_obs, feature)
        # and then stack them to corresponding dimension: (batch_size, time_step, *)
        current_states = self.extract_ft(conv_net, current_states)
        next_states = self.extract_ft(conv_net, next_states)

        # convert all to torch tensors
        actions = torch.from_numpy(np.array(actions)).long().to(args.device)
        rewards = torch.from_numpy(np.array(rewards)).float().to(args.device)

        hidden_batch, cell_batch = policy_net.init_hidden_states(
            args.batch_size, args.device)

        Q_s, _ = policy_net.forward(current_states,
                                    batch_size=args.batch_size,
                                    time_step=args.time_step,
                                    hidden_state=hidden_batch,
                                    cell_state=cell_batch)
        Q_s_a = Q_s.gather(dim=1,
                           index=actions[:, args.time_step -
                                         1].unsqueeze(dim=1)).squeeze(dim=1)

        Q_next, _ = policy_net.forward(next_states,
                                       batch_size=args.batch_size,
                                       time_step=args.time_step,
                                       hidden_state=hidden_batch,
                                       cell_state=cell_batch)
        Q_next_max = Q_next.detach().max(dim=1)[0]
        return Q_s_a, Q_next_max

    def optimize(self):
        args = self.args
        if len(self.memory) < (args.batch_size):
            return

        memory_subset = self.sample_memory()
        _, _, rewards, _ = memory_subset
        rewards = torch.from_numpy(np.array(rewards)).float().to(args.device)

        Q_s_a_1, Q_next_max_1 = self.bootstrap(self.policy_net_1,
                                               self.conv_net_1, memory_subset)
        Q_s_a_2, Q_next_max_2 = self.bootstrap(self.policy_net_2,
                                               self.conv_net_2, memory_subset)

        Q_next_max = torch.min(Q_next_max_1, Q_next_max_2)

        # Compute the expected Q values
        target_values = rewards[:,
                                args.time_step - 1] + (args.gamma * Q_next_max)

        # Compute Huber loss
        loss_1 = F.smooth_l1_loss(Q_s_a_1, target_values)
        loss_2 = F.smooth_l1_loss(Q_s_a_2, target_values)

        # Optimize the model
        self.policy_optimizer_1.zero_grad()
        self.policy_optimizer_2.zero_grad()
        self.conv_optimizer_1.zero_grad()
        self.conv_optimizer_2.zero_grad()
        loss_1.backward()
        loss_2.backward()
        for param in self.policy_net_1.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_2.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.conv_net_2.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.conv_net_2.parameters():
            param.grad.data.clamp_(-1, 1)

        self.policy_optimizer_1.step()
        self.policy_optimizer_2.step()
        self.conv_optimizer_1.step()
        self.conv_optimizer_2.step()

    def train_POMDP(self):
        args = self.args
        # Create the output directory if it does not exist
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)

        # Create our policy net and a target net
        self.policy_net_1 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.indim, args.outdim).to(args.device)
        self.conv_net_1 = ConvNet(args.ftdim, args.depth).to(args.device)
        self.conv_net_2 = ConvNet(args.ftdim, args.depth).to(args.device)

        # Set up the optimizer
        self.policy_optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                                lr=args.lr)
        self.policy_optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                                lr=args.lr)
        self.conv_optimizer_1 = optim.RMSprop(self.conv_net_1.parameters(),
                                              lr=1e-5)
        self.conv_optimizer_2 = optim.RMSprop(self.conv_net_2.parameters(),
                                              lr=1e-5)
        self.memory = RecurrentMemory(70)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim - args.ftdim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.conv_net_1.load_state_dict(checkpoint['conv_net_1'])
                self.conv_net_2.load_state_dict(checkpoint['conv_net_2'])
                self.policy_optimizer_1.load_state_dict(
                    checkpoint['policy_optimizer_1'])
                self.policy_optimizer_2.load_state_dict(
                    checkpoint['policy_optimizer_2'])
                self.conv_optimizer_1.load_state_dict(
                    checkpoint['conv_optimizer_1'])
                self.conv_optimizer_2.load_state_dict(
                    checkpoint['conv_optimizer_2'])
                start_episode = checkpoint['epoch']
                self.steps_done = checkpoint['steps_done']
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        if args.weight_conv:
            checkpoint = torch.load(args.weight_conv)
            self.conv_net_1.load_state_dict(checkpoint['conv_net'])
            self.conv_optimizer_1.load_state_dict(checkpoint['conv_optimizer'])
            self.conv_net_2.load_state_dict(checkpoint['conv_net'])
            self.conv_optimizer_2.load_state_dict(checkpoint['conv_optimizer'])

        action_space = ActionSpace(dp=0.06, df=10)

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)
        tactile_obs_space = TactileObs(
            robot.get_gripper_jpos(),  # 6
            robot.get_all_touch_buffer(args.hap_sample))  # 30 x 12

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            act_sequence = []
            velcro_params = init_model(robot.mj_sim)
            robot.reset_simulation()
            ret = robot.grasp_handle()
            if not ret:
                continue

            # Local memory for current episode
            localMemory = []

            # Get current observation
            hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                batch_size=1, device=args.device)
            hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                batch_size=1, device=args.device)

            broken_so_far = 0

            for t in count():
                if not args.quiet and t % 50 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                tactile_obs = tactile_obs_space.get_state()
                normalizer.observe(tactile_obs)
                tactile_obs = normalizer.normalize(tactile_obs)
                # Get image and normalize it
                img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
                if args.depth:
                    depth = norm_depth(img[1])
                    img = norm_img(img[0])
                    img_norm = np.empty((4, args.img_w, args.img_h))
                    img_norm[:3, :, :] = img
                    img_norm[3, :, :] = depth
                else:
                    img_norm = norm_img(img)

                observation = [tactile_obs, img_norm]
                action, hidden_state_1, cell_state_1 = self.select_action(
                    observation, hidden_state_1, cell_state_1)

                # record actions in this epoch
                act_sequence.append(action)

                # Perform action
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(robot.get_gripper_jpos()[:3],
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))
                robot.move_joint(target_pose,
                                 True,
                                 self.gripping_force,
                                 hap_sample=args.hap_sample)

                # Get reward
                done, num = robot.update_tendons()
                failure = robot.check_slippage()
                if num > broken_so_far:
                    reward = num - broken_so_far
                    broken_so_far = num
                else:
                    reward = 0

                # Observe new state
                tactile_obs_space.update(
                    robot.get_gripper_jpos(),  # 6
                    robot.get_all_touch_buffer(args.hap_sample))  # 30x12

                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                # Check if done
                if not done and not failure:
                    next_tactile_obs = tactile_obs_space.get_state()
                    normalizer.observe(next_tactile_obs)
                    next_tactile_obs = normalizer.normalize(next_tactile_obs)
                    # Get image and normalize it
                    next_img = robot.get_img(args.img_w, args.img_h, 'c1',
                                             args.depth)
                    if args.depth:
                        next_depth = norm_depth(next_img[1])
                        next_img = norm_img(next_img[0])
                        next_img_norm = np.empty((4, args.img_w, args.img_h))
                        next_img_norm[:3, :, :] = next_img
                        next_img_norm[3, :, :] = next_depth
                    else:
                        next_img_norm = norm_img(next_img)
                    next_state = [next_tactile_obs, next_img_norm]
                else:
                    next_state = None

                # Push new Transition into memory
                localMemory.append(
                    Transition(observation, action, next_state, reward))

                # Optimize the model
                if self.steps_done % 10 == 0:
                    self.optimize()

                # If we are done, reset the model
                if done or failure:
                    self.memory.push(localMemory)
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(velcro_params))
                    print("Actions in this epoch are: {}".format(act_sequence))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save checkpoints every vew iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    args.output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'conv_net_1': self.conv_net_1.state_dict(),
                        'conv_net_2': self.conv_net_2.state_dict(),
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'conv_optimizer_1': self.conv_optimizer_1.state_dict(),
                        'conv_optimizer_2': self.conv_optimizer_2.state_dict(),
                        'policy_optimizer_1':
                        self.policy_optimizer_1.state_dict(),
                        'policy_optimizer_2':
                        self.policy_optimizer_2.state_dict(),
                    }, save_path)

        # Save normalizer state for inference
        normalizer.save_state(
            os.path.join(args.output_dir, 'normalizer_state.pickle'))

        self.memory.save_memory(os.path.join(args.output_dir, 'memory.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables

Exemple #3

0

Afficher le fichier

Fichier : multistep_pomdp.py Projet : kracon7/Multi-Step-Recurrent-Q-Learning-for-Robotic-Velcro-Peeling

class POMDP:
    def __init__(self, args):
        """Store the run configuration and create the plotting figure.

        Args:
            args: configuration namespace; ``max_iter``, ``grip_force`` and
                ``break_thresh`` are read here, the rest is kept for later.
        """
        self.args = args

        # Discrete action vocabulary; the gripper 'open'/'close' actions are
        # intentionally left out.
        self.ACTIONS = [
            'left',
            'right',
            'forward',
            'backward',
            'up',
            'down',
        ]

        # Exploration schedule constants used by select_action().
        self.P_START = 0.999
        self.P_END = 0.05
        self.P_DECAY = 500

        # Episode / simulator settings copied from the configuration.
        self.max_iter = args.max_iter
        self.gripping_force = args.grip_force
        self.break_threshold = args.break_thresh

        # One figure with two side-by-side axes, kept as (figure, axis, axis).
        canvas, axes = plt.subplots(1, 2)
        left_axis, right_axis = axes
        self.figure = (canvas, left_axis, right_axis)

    # Function to select an action from our policy or a random one
    def select_action(self, observation, hidden_state, cell_state):
        args = self.args
        sample = random.random()
        p_threshold = self.P_END + (self.P_START - self.P_END) * math.exp(
            -1. * self.steps_done / self.P_DECAY)

        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            self.tactile_net_1.eval()
            self.policy_net_1.eval()
            torch_obs = torch.from_numpy(observation).float().to(
                args.device).unsqueeze(0)
            h_tac, c_tac = self.tactile_net_1.init_hidden_states(args.device)
            tactile_ft = self.tactile_net_1.forward(torch_obs,
                                                    hidden_state=h_tac,
                                                    cell_state=c_tac)

            model_out = self.policy_net_1(tactile_ft.unsqueeze(1),
                                          batch_size=1,
                                          time_step=1,
                                          hidden_state=hidden_state,
                                          cell_state=cell_state)
            out = model_out[0]
            hidden_state = model_out[1][0]
            cell_state = model_out[1][1]
            self.tactile_net_1.train()
            self.policy_net_1.train()

            if sample > p_threshold:
                action = int(torch.argmax(out[0]))
                return action, hidden_state, cell_state
            else:
                return random.randrange(0,
                                        args.outdim), hidden_state, cell_state

    def sample_memory(self):
        batch = self.memory.sample(self.args.batch_size, self.args.time_step)
        if not batch:
            return

        current_states, actions, rewards, next_states = [], [], [], []
        for b in batch:
            cs, ac, rw, ns = [], [], [], []
            for element in b:
                cs.append(element.state)
                ac.append(element.action)
                rw.append(element.reward)
                ns.append(element.next_state)
            current_states.append(cs)
            actions.append(ac)
            rewards.append(rw)
            next_states.append(ns)
        return current_states, actions, rewards, next_states

    def extract_ft(self, tactile_net, batch):
        args = self.args
        h_tac, c_tac = tactile_net.init_hidden_states(args.device)
        result = []
        for b in batch:
            obs_sequence = []
            for item in b:
                torch_obs = torch.from_numpy(item).float().to(
                    args.device).unsqueeze(0)
                tactile_ft = tactile_net.forward(torch_obs,
                                                 hidden_state=h_tac,
                                                 cell_state=c_tac)
                obs_sequence.append(tactile_ft)
            torch_obs_sequence = torch.stack(obs_sequence)
            result.append(torch_obs_sequence)
        return torch.stack(result)

    def bootstrap(self, policy_net, tactile_net, memory_subset):
        args = self.args
        if len(self.memory) < (args.batch_size):
            return

        current_states, actions, rewards, next_states = memory_subset

        # padded_current_states, current_lengths = self.pad_batch(current_states)
        # padded_next_states, next_lengths = self.pad_batch(next_states)

        # process observation (tactile_obs, img) to 1d tensor of (tactile_obs, feature)
        # and then stack them to corresponding dimension: (batch_size, time_step, *)
        current_features = self.extract_ft(tactile_net, current_states)
        next_features = self.extract_ft(tactile_net, next_states)

        # convert all to torch tensors
        actions = torch.from_numpy(np.array(actions)).long().to(args.device)
        rewards = torch.from_numpy(np.array(rewards)).float().to(args.device)

        hidden_batch, cell_batch = policy_net.init_hidden_states(
            args.batch_size, args.device)

        Q_s, _ = policy_net.forward(current_features,
                                    batch_size=args.batch_size,
                                    time_step=args.time_step,
                                    hidden_state=hidden_batch,
                                    cell_state=cell_batch)
        Q_s_a = Q_s.gather(dim=1,
                           index=actions[:, args.time_step -
                                         1].unsqueeze(dim=1)).squeeze(dim=1)

        Q_next, _ = policy_net.forward(next_features,
                                       batch_size=args.batch_size,
                                       time_step=args.time_step,
                                       hidden_state=hidden_batch,
                                       cell_state=cell_batch)
        Q_next_max = Q_next.detach().max(dim=1)[0]
        return Q_s_a, Q_next_max

    def optimize(self):
        args = self.args
        if len(self.memory) < (args.batch_size):
            return

        memory_subset = self.sample_memory()
        _, _, rewards, _ = memory_subset
        rewards = torch.from_numpy(np.array(rewards)).float().to(args.device)

        Q_s_a_1, Q_next_max_1 = self.bootstrap(self.policy_net_1,
                                               self.tactile_net_1,
                                               memory_subset)
        Q_s_a_2, Q_next_max_2 = self.bootstrap(self.policy_net_2,
                                               self.tactile_net_2,
                                               memory_subset)

        Q_next_max = torch.min(Q_next_max_1, Q_next_max_2)

        # Compute the expected Q values
        target_values = rewards[:,
                                args.time_step - 1] + (args.gamma * Q_next_max)

        # Compute Huber loss
        loss_1 = F.smooth_l1_loss(Q_s_a_1, target_values)
        loss_2 = F.smooth_l1_loss(Q_s_a_2, target_values)

        # Optimize the model
        self.policy_optimizer_1.zero_grad()
        self.policy_optimizer_2.zero_grad()
        self.tactile_optimizer_1.zero_grad()
        self.tactile_optimizer_2.zero_grad()
        loss_1.backward()
        loss_2.backward()
        for param in self.policy_net_1.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.policy_net_2.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.tactile_net_2.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.tactile_net_2.parameters():
            param.grad.data.clamp_(-1, 1)

        self.policy_optimizer_1.step()
        self.policy_optimizer_2.step()
        self.tactile_optimizer_1.step()
        self.tactile_optimizer_2.step()

    def train_POMDP(self):
        """Train the two policy/tactile network pairs in the simulator.

        Steps performed, in order:
          1. Create the output directory and dump ``args`` to ``args.txt``.
          2. Build both DRQN policy nets, both TactileNets, their RMSprop
             optimizers, the replay memory and the observation normalizer.
          3. Optionally restore policy weights, normalizer state, replay
             memory and tactile weights from the paths given in ``args``.
          4. Build the MuJoCo simulation and the robot wrapper.
          5. For every episode: repeat the current action for up to
             ``args.len_ub`` simulator steps while stacking tactile
             observations, choose the next action from the stacked
             observation, store the transition and call ``optimize()``.
          6. Periodically save checkpoints, results and the replay memory.

        Returns:
            The ``print_variables`` dict with the keys ``'durations'``,
            ``'rewards'`` and ``'loss'`` (``'loss'`` is never filled here).
        """
        args = self.args
        ROOT_DIR = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))  # corl2019
        PARENT_DIR = os.path.dirname(ROOT_DIR)  # research
        # Create the output directory if it does not exist
        output_dir = os.path.join(PARENT_DIR, 'multistep_pomdp',
                                  args.output_dir)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # write args to file
        with open(os.path.join(output_dir, 'args.txt'), 'w+') as f:
            json.dump(args.__dict__, f, indent=2)
        # NOTE(review): redundant - the with-block above already closed f.
        f.close()

        # Create the two policy nets. No separate target net is built here:
        # bootstrap() evaluates next states with the same policy net.
        self.policy_net_1 = DRQN(args.ftdim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.ftdim, args.outdim).to(args.device)
        # The tactile encoder input width depends on which part of the
        # observation is kept (see the matching slicing in the episode loop).
        if args.position:
            self.tactile_net_1 = TactileNet(args.indim - 6,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim - 6,
                                            args.ftdim).to(args.device)
        elif args.force:
            self.tactile_net_1 = TactileNet(args.indim - 390,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim - 390,
                                            args.ftdim).to(args.device)
        else:
            self.tactile_net_1 = TactileNet(args.indim,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim,
                                            args.ftdim).to(args.device)

        # Set up the optimizer
        self.policy_optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                                lr=args.lr)
        self.policy_optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                                lr=args.lr)
        self.tactile_optimizer_1 = optim.RMSprop(
            self.tactile_net_1.parameters(), lr=args.lr)
        self.tactile_optimizer_2 = optim.RMSprop(
            self.tactile_net_2.parameters(), lr=args.lr)
        self.memory = RecurrentMemory(800)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        # Resume policy nets, their optimizers and the progress counters from
        # a checkpoint; the plotted history is read from results_pomdp.pkl in
        # the same directory as the checkpoint.
        if args.weight_policy:
            if os.path.exists(args.weight_policy):
                checkpoint = torch.load(args.weight_policy)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.policy_optimizer_1.load_state_dict(
                    checkpoint['policy_optimizer_1'])
                self.policy_optimizer_2.load_state_dict(
                    checkpoint['policy_optimizer_2'])
                start_episode = checkpoint['epochs']
                self.steps_done = checkpoint['steps_done']
                with open(
                        os.path.join(os.path.dirname(args.weight_policy),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        # NOTE(review): unlike the loaders above, this path is not checked
        # with os.path.exists before torch.load.
        if args.weight_tactile:
            checkpoint = torch.load(args.weight_tactile)
            self.tactile_net_1.load_state_dict(checkpoint['tactile_net_1'])
            self.tactile_optimizer_1.load_state_dict(
                checkpoint['tactile_optimizer_1'])
            self.tactile_net_2.load_state_dict(checkpoint['tactile_net_2'])
            self.tactile_optimizer_2.load_state_dict(
                checkpoint['tactile_optimizer_2'])

        action_space = ActionSpace(dp=0.06, df=10)

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)

        tactile_obs_space = TactileObs(
            robot.get_gripper_xpos(),  # 24
            robot.get_all_touch_buffer(args.hap_sample))  # 30 x 6

        # Main training loop
        for ii in range(start_episode, args.epochs):
            # steps_done is incremented once per episode, so both the
            # exploration decay in select_action() and the
            # `steps_done % 10` optimisation gate below advance per episode.
            self.steps_done += 1
            start_time = time.time()
            act_sequence = []
            act_length = []
            velcro_params = init_model(robot.mj_sim)
            robot.reset_simulation()
            ret = robot.grasp_handle()
            # A failed grasp skips the whole episode, including the
            # checkpoint block at the end of this loop body.
            if not ret:
                continue

            # Local memory for current episode
            localMemory = []

            # Fresh recurrent state for the episode. Only the *_1 pair is
            # passed to select_action(); the *_2 pair is not used below.
            hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                batch_size=1, device=args.device)
            hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                batch_size=1, device=args.device)

            broken_so_far = 0

            # pick a random action initially
            # NOTE(review): randrange(0, 5) yields 0..4, so index 5 ('down')
            # is never the initial action - confirm this is intended.
            action = random.randrange(0, 5)
            current_state = None
            next_state = None

            t = 0

            while t < args.max_iter:
                # NOTE(review): the format string has one placeholder, so the
                # extra argument t is ignored.
                if not args.quiet and t == 0:
                    print("Running training episode: {}".format(ii, t))

                # Empty (0, width) buffer that the per-step observations are
                # stacked onto; the width mirrors the TactileNet input size.
                if args.position:
                    multistep_obs = np.empty((0, args.indim - 6))
                elif args.force:
                    multistep_obs = np.empty((0, args.indim - 390))
                else:
                    multistep_obs = np.empty((0, args.indim))

                prev_action = action

                # Repeat the same action for up to args.len_ub simulator
                # steps, recording one observation before each move.
                for k in range(args.len_ub):
                    # Observe tactile features and stack them
                    tactile_obs = tactile_obs_space.get_state()
                    normalizer.observe(tactile_obs)
                    tactile_obs = normalizer.normalize(tactile_obs)

                    if args.position:
                        tactile_obs = tactile_obs[6:]
                    elif args.force:
                        tactile_obs = tactile_obs[:6]

                    multistep_obs = np.vstack((multistep_obs, tactile_obs))

                    # current jpos
                    current_pos = robot.get_gripper_jpos()[:3]

                    # Perform action
                    delta = action_space.get_action(
                        self.ACTIONS[action])['delta'][:3]
                    target_position = np.add(robot.get_gripper_jpos()[:3],
                                             np.array(delta))
                    target_pose = np.hstack(
                        (target_position, robot.get_gripper_jpos()[3:]))
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Observe new state
                    tactile_obs_space.update(
                        robot.get_gripper_xpos(),  # 24
                        robot.get_all_touch_buffer(args.hap_sample))  # 30x6

                    displacement = la.norm(robot.get_gripper_jpos()[:3] -
                                           current_pos)

                    # Stop repeating once the gripper moved less than 70% of
                    # 0.06 (presumably the commanded step; it matches the dp
                    # passed to ActionSpace above - TODO confirm).
                    if displacement / 0.06 < 0.7:
                        break

                # Feed the stitched multi-step tactile observation through the
                # tactile net (inside select_action) to choose the next action
                action, hidden_state_1, cell_state_1 = self.select_action(
                    multistep_obs, hidden_state_1, cell_state_1)

                # Shift the observation window: the previous stacked
                # observation becomes the current state of the transition.
                if t == 0:
                    next_state = multistep_obs.copy()
                else:
                    current_state = next_state.copy()
                    next_state = multistep_obs.copy()

                # record actions in this epoch
                act_sequence.append(prev_action)
                act_length.append(k)

                # Get reward
                done, num = robot.update_tendons()
                failure = robot.check_slippage()
                # Reward is the increase in `num` since the last check;
                # otherwise -20 on slippage failure and 0 when nothing changed.
                if num > broken_so_far:
                    reward = num - broken_so_far
                    broken_so_far = num
                else:
                    if failure:
                        reward = -20
                    else:
                        reward = 0

                # k + 1 simulator steps were executed in the inner loop.
                t += k + 1
                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                # NOTE(review): terminal transitions are stored with
                # next_state=None, while extract_ft() calls torch.from_numpy
                # on every sampled next_state - confirm the memory never
                # returns these entries or handles them otherwise.
                if done or failure:
                    next_state = None

                # Push new Transition into memory; the first macro-step of an
                # episode is skipped because current_state is still None then.
                if t > k + 1:
                    localMemory.append(
                        Transition(current_state, prev_action, next_state,
                                   reward))

                # Optimize the model
                if self.steps_done % 10 == 0:
                    self.optimize()

                # If we are done, reset the model
                if done or failure:
                    self.memory.push(localMemory)
                    # Failed episodes are logged with the maximum duration.
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(velcro_params))
                    print(
                        "{} of Actions in this epoch are: {} \n Action length are: {}"
                        .format(len(act_sequence), act_sequence, act_length))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(output_dir,
                                         'policy_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'policy_optimizer_1':
                        self.policy_optimizer_1.state_dict(),
                        'policy_optimizer_2':
                        self.policy_optimizer_2.state_dict(),
                    }, save_path)
                # Tactile nets and their optimizers go to a separate file.
                save_path = os.path.join(output_dir,
                                         'tactile_' + str(ii) + '.pth')
                torch.save(
                    {
                        'tactile_net_1':
                        self.tactile_net_1.state_dict(),
                        'tactile_net_2':
                        self.tactile_net_2.state_dict(),
                        'tactile_optimizer_1':
                        self.tactile_optimizer_1.state_dict(),
                        'tactile_optimizer_2':
                        self.tactile_optimizer_2.state_dict(),
                    }, save_path)

                write_results(os.path.join(output_dir, 'results_pomdp.pkl'),
                              print_variables)

                self.memory.save_memory(
                    os.path.join(output_dir, 'memory.pickle'))

        # Optionally save the training figure, named by month_day_hour.
        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables