def main(args):

    policy_net = Geom_DQN(args.indim, args.outdim).to(args.device)
    policy_net.load_state_dict(torch.load(args.weight_expert)['policy_net_1'])
    policy_net.eval()

    normalizer = Geom_Normalizer(args.indim, device=args.device)
    normalizer.restore_state(args.norm_expert)

    tactile_normalizer = Multimodal_Normalizer(num_inputs=args.tactile_indim,
                                               device=args.device)
    tactile_normalizer.restore_state(args.tactile_normalizer)

    action_space = ActionSpace(dp=0.06, df=10)

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None

    # load all velcro parameters
    model_dir = os.path.dirname(args.model_path)
    param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    with open(param_path, 'rb') as f:
        velcro_params = pickle.load(f)

    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)
    velcro_util = VelcroUtil(robot, robot.mj_sim_param)

    traj_logger = TrajLogger(args, robot, velcro_util, policy_net, normalizer, tactile_normalizer)

    all_trajectories = []
    all_success = [None for i in range(len(velcro_params))]
    all_time = [None for i in range(len(velcro_params))]

    for i in range(len(velcro_params)):
        geom_type, origin_offset, euler, radius = velcro_params[i]
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        performance = {'time':[], 'success':[], 'action_hist':[0,0,0,0,0,0]}
        min_time = args.max_iter
        all_trajectories.append(None)
    
        for j in range(args.num_try):
            robot.reset_simulation()
            ret = robot.grasp_handle()
            performance, expert_traj = traj_logger.test_network(performance)
            if performance['time'][-1] <= min_time:
                all_trajectories[i] = expert_traj
                min_time = performance['time'][-1]
                all_success[i] = performance['success'][j]
                all_time[i] = performance['time'][-1]


        print('\n\nFinished trajectory {}'.format(i))
        print('Velcro parameters are:{} {} {} {}'.format(geom_type, origin_offset, euler, radius))
        print(performance)
        success = np.array(performance['success'])
        time = np.array(performance['time'])
        print('Successfully opened the velcro in: {}% of cases'.format(100 * np.sum(success) / len(performance['success'])))
        print('Average time to open: {}'.format(np.average(time[success>0])))
        print('Action histogram for the test is: {}'.format(performance['action_hist']))

    print('\nCollected {} successful expert trajectories in total'.format(np.sum(np.array(all_success))))
    print('Total success and time: {}, {}'.format(all_success, all_time))
    output = {'args': args, 'traj': all_trajectories, 'success': all_success, 'all_time': all_time}
    output_path = os.path.join(args.result_dir, 'oneshot_expert_traj.pkl')
    write_results(output_path, output)
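A minimal sketch (not part of the original code) for inspecting the saved trajectories, assuming write_results simply pickles the `output` dict built above to that path:

import pickle

with open('oneshot_expert_traj.pkl', 'rb') as f:
    data = pickle.load(f)  # keys: 'args', 'traj', 'success', 'all_time'
num_success = sum(1 for s in data['success'] if s)
print('{} / {} trajectories were successful'.format(num_success, len(data['traj'])))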
class RobotEnv:
    def __init__(self, args):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']  # 'open', 'close']
        self.gripping_force = args.grip_force
        self.break_threshold = args.break_thresh
        self.action_space = ActionSpace(dp=0.06, df=10)
        self.broken_so_far = 0
        self.robot = None
        self.model_params = None

    def reset(self):
        args = self.args
        model, self.model_params = init_model(args.model_path, geom=False)
        sim = MjSim(model)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        sim_param = SimParameter(sim)
        robot = RobotSim(sim, viewer, sim_param, args.render, self.break_threshold)
        self.robot = robot
        robot.reset_simulation()
        ret = robot.grasp_handle()
        if not ret:
            return None

        self.obs_space = Observation(robot.get_gripper_jpos(),
                                     robot.get_num_broken_buffer(args.hap_sample),
                                     robot.get_main_touch_buffer(args.hap_sample)) 

        # Get current observation
        self.broken_so_far = 0

        self.obs_space.update(robot.get_gripper_jpos(),            # 6
                                 robot.get_num_broken_buffer(args.hap_sample),         # 30x1
                                 robot.get_main_touch_buffer(args.hap_sample))     # 30x12

        obs = self.obs_space.get_state()

        reward = 0
        done = False
        info = {'slippage': False}
        return obs, reward, done, info

    def step(self, action):
        robot = self.robot
        args = self.args
        delta = self.action_space.get_action(self.ACTIONS[action])['delta'][:3]
        target_position = np.add(robot.get_gripper_jpos()[:3], np.array(delta))
        target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
        robot.move_joint(target_pose, True, self.gripping_force, hap_sample=args.hap_sample)
        
        # Get reward
        done, num = robot.update_tendons()
        slippage = robot.check_slippage()
        if num > self.broken_so_far:
            reward = num - self.broken_so_far
            self.broken_so_far = num
        else:
            reward = 0

        if slippage:
            done = True

        info = {'slippage': slippage}

        self.obs_space.update(robot.get_gripper_jpos(),            # 6
                                 robot.get_num_broken_buffer(args.hap_sample),         # 30x2
                                 robot.get_main_touch_buffer(args.hap_sample))     # 30x7
        obs = self.obs_space.get_state()
        return obs, reward, done, info
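A short usage sketch (an assumption, not part of the original source): rolling RobotEnv out with uniformly random actions as a smoke test, assuming `args` provides the fields referenced above (model_path, render, grip_force, break_thresh, hap_sample).

import random

def random_rollout(args, max_steps=50):
    # Drive RobotEnv with random discrete actions and accumulate the reward.
    env = RobotEnv(args)
    result = env.reset()
    if result is None:  # grasp_handle() failed
        return None
    obs, reward, done, info = result
    total_reward = 0
    for _ in range(max_steps):
        if done:
            break
        obs, reward, done, info = env.step(random.randrange(len(env.ACTIONS)))
        total_reward += reward
    return total_reward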
def main(args):
    if not os.path.isdir(args.result_dir):
        os.makedirs(args.result_dir)

    conv_net = ConvNet(args.outdim, args.depth).to(args.device)
    if os.path.exists(args.weight_convnet):
        checkpoint = torch.load(args.weight_convnet)
        conv_net.load_state_dict(checkpoint['conv_net'])

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)
    # load all velcro parameters
    # model_dir = os.path.dirname(args.model_path)
    # param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    param_path = '/home/jc/research/corl2019_learningHaptics/tests/test_xmls/case_{}.pickle'.format(
        args.case)
    with open(param_path, 'rb') as f:
        velcro_params = pickle.load(f)
    if args.shuffle:
        random.shuffle(velcro_params)

    velcro_util = VelcroUtil(robot, sim_param)
    state_space = Observation(
        robot.get_gripper_jpos(),  # 6
        velcro_util.break_center(),  # 6
        velcro_util.break_norm())
    action_space = ActionSpace(dp=0.06, df=10)
    performance = {
        'time': [],
        'success': [],
        'num_broken': [],
        'tendon_hist': [0, 0, 0, 0, 0]
    }

    for n, item in enumerate(velcro_params):
        geom_type, origin_offset, euler, radius = item
        print('\n\nTest {} Velcro parameters are: {}, {}, {}, {}'.format(
            n, geom_type, origin_offset, euler, radius))
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        broken_so_far = 0

        # ax.clear()

        for t in range(args.max_iter):
            # take an image and predict the norm direction
            # Get image and normalize it
            img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
            if args.depth:
                depth = norm_depth(img[1])
                img = norm_img(img[0])
                img_norm = np.empty((4, args.img_w, args.img_h))
                img_norm[:3, :, :] = img
                img_norm[3, :, :] = depth
            else:
                img_norm = norm_img(img)

            torch_img = torch.from_numpy(img_norm).float().to(
                args.device).unsqueeze(0)
            pred = conv_net.forward(torch_img).detach().cpu()
            fl_norm = pred[0][3:6].numpy()
            break_dir_norm = pred[0][6:9].numpy()
            # normalize these vectors
            fl_norm = fl_norm / la.norm(fl_norm)
            break_dir_norm = break_dir_norm / la.norm(break_dir_norm)

            ################ choose action and get action direction vector ################
            action_direction = args.act_mag * (-0.5 * fl_norm +
                                               0.5 * break_dir_norm)
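            # action_vec is assumed to be a (6, 3) array of unit direction vectors, one per
            # entry of ACTIONS; the dot product below selects the discrete action that is
            # best aligned with the desired continuous direction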
            action_key = (action_vec @ action_direction).argmax()
            action_direction = action_space.get_action(
                ACTIONS[action_key])['delta'][:3]

            gripper_pose = robot.get_gripper_jpos()[:3]

            # Perform action
            target_position = np.add(robot.get_gripper_jpos()[:3],
                                     action_direction)
            target_pose = np.hstack(
                (target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose, True, 300, hap_sample=30)

            # check tendons and slippage
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num

            if done or failure:
                ratio_broken = float(num) / float(NUM_TENDON)
                if ratio_broken < 0.2:
                    performance['tendon_hist'][0] += 1
                elif ratio_broken >= 0.2 and ratio_broken < 0.4:
                    performance['tendon_hist'][1] += 1
                elif ratio_broken >= 0.4 and ratio_broken < 0.6:
                    performance['tendon_hist'][2] += 1
                elif ratio_broken >= 0.6 and ratio_broken < 0.8:
                    performance['tendon_hist'][3] += 1
                else:
                    performance['tendon_hist'][4] += 1
                performance['num_broken'].append(num)
                if done:
                    performance['success'].append(1)
                    performance['time'].append(t + 1)
                if failure:
                    performance['success'].append(0)
                    performance['time'].append(t + 1)
                break

            if t == args.max_iter - 1:

                ################## exceed max iterations ####################
                performance['success'].append(0)
                performance['time'].append(args.max_iter)
                ratio_broken = float(num) / float(NUM_TENDON)
                performance['num_broken'].append(num)
                if ratio_broken < 0.2:
                    performance['tendon_hist'][0] += 1
                elif ratio_broken >= 0.2 and ratio_broken < 0.4:
                    performance['tendon_hist'][1] += 1
                elif ratio_broken >= 0.4 and ratio_broken < 0.6:
                    performance['tendon_hist'][2] += 1
                elif ratio_broken >= 0.6 and ratio_broken < 0.8:
                    performance['tendon_hist'][3] += 1
                else:
                    performance['tendon_hist'][4] += 1

        # print episode performance
        print('Success: {}, time: {}, num_broken: {} '.format(
            performance['success'][-1], performance['time'][-1],
            performance['num_broken'][-1]))

    print('Finished opening velcro with haptics test \n')
    success = np.array(performance['success'])
    time = np.array(performance['time'])
    print('Successfully opened the velcro in: {}% of cases'.format(
        100 * np.sum(success) / len(performance['success'])))
    print('Average time to open: {}'.format(np.average(time[success > 0])))

    out_fname = 'vision_case{}.txt'.format(args.case)
    with open(os.path.join(args.result_dir, out_fname), 'w+') as f:
        f.write('Time: {}\n'.format(performance['time']))
        f.write('Success: {}\n'.format(performance['success']))
        f.write('Successfully opened the velcro in: {}% of cases\n'.format(
            100 * np.sum(success) / len(performance['success'])))
        f.write('Average time to open: {}\n'.format(
            np.average(time[success > 0])))
        f.write('Num_broken: {}\n'.format(performance['num_broken']))
        f.write('Tendon histogram: {}\n'.format(performance['tendon_hist']))
class TrajLogger:
    def __init__(self, args, robot, velcro_util, tactile_obs_space):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']
        self.action_space = ActionSpace(dp=2.5 * args.act_mag, df=10)
        self.robot = robot
        self.velcro_util = velcro_util
        self.tactile_obs_space = tactile_obs_space
        self.normalizer = Multimodal_Normalizer(num_inputs=args.indim,
                                                device=args.device)
        self.normalizer.restore_state(args.normalizer_file)

    def select_action(self):
        sample = random.random()
        p_threshold = self.args.p
        if sample > p_threshold:
            return self.expert_action()
        else:
            action = random.randrange(6)
            return self.action_space.get_action(
                self.ACTIONS[action])['delta'][:3]

    def expert_action(self):
        norms = self.velcro_util.break_norm()
        centers = self.velcro_util.break_center()
        fl_center = centers[:3]
        fs_center = centers[3:]
        fl_norm = norms[:3]
        fs_norm = norms[3:6]
        break_dir_norm = norms[6:9]
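        # Expert heuristic (as written): move opposite fl_norm and along break_dir_norm,
        # scaled by act_mag; fl_center, fs_center and fs_norm are extracted above but not
        # used in this blend.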

        action_direction = self.args.act_mag * (-0.5 * fl_norm +
                                                0.5 * break_dir_norm)
        return action_direction

    def test_network(self, performance):
        args = self.args
        max_iterations = args.max_iter

        broken_so_far = 0
        expert_traj = []

        for t in range(max_iterations):
            delta = self.select_action()

            # sample tactile feedback and norm info
            tactile_obs = self.tactile_obs_space.get_state()
            tactile_obs = self.normalizer.normalize(tactile_obs)

            expert_traj.append({
                'tactile': tactile_obs[:366],
                'norm': self.velcro_util.break_norm(),
                'center': self.velcro_util.break_center(),
                'gpos': self.robot.get_gripper_jpos()[:3]
            })

            # perform action
            target_position = np.add(self.robot.get_gripper_jpos()[:3],
                                     np.array(delta))
            target_pose = np.hstack(
                (target_position, self.robot.get_gripper_jpos()[3:]))
            self.robot.move_joint(target_pose,
                                  True,
                                  args.grip_force,
                                  hap_sample=args.hap_sample)

            self.tactile_obs_space.update(
                self.robot.get_gripper_jpos(),  # 6
                self.robot.get_all_touch_buffer(args.hap_sample))  # 30x12

            # Get reward
            done, num = self.robot.update_tendons()
            failure = self.robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num

            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
                return performance, expert_traj
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
                return performance, expert_traj
        # exceed max iterations
        performance['success'].append(0)
        performance['time'].append(max_iterations)
        return performance, expert_traj
Example #7
    def train(self):
        args = self.args

        self.load_all()

        print_variable = {'loss': None}
        start_epoch = 0
        if args.weight_policy:
            if os.path.exists(args.weight_policy):
                checkpoint = torch.load(args.weight_policy)
                start_epoch = checkpoint['epoch']
                self.steps_done = start_epoch * args.num_steps
                with open(
                        os.path.join(os.path.dirname(args.weight_policy),
                                     'results_imitation.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variable['loss'] = plot_dict['loss']

        self.set_lr()

        action_space = ActionSpace(dp=0.06, df=10)
        BCELoss = nn.BCELoss(reduction='sum')

        for ii in range(start_epoch, start_epoch + args.epochs):
            start_time = time.time()
            epoch_loss = []

            # sample expert trajectories and labeled actions
            batch, labels = self.memory.sample(args.batch_size, args.time_step)

            # encode labels
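            # (encode_label is assumed to turn the labeled actions into one-hot vectors,
            # matching the BCELoss target shape used below)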
            onehot_labels = self.encode_label(labels)

            torch_labels = torch.tensor(onehot_labels).float().to(args.device)

            for step in range(args.num_steps):
                # forward networks and predict action
                states = self.extract_ft(self.conv_net, batch)

                hidden_batch, cell_batch = self.policy_net.init_hidden_states(
                    args.batch_size, args.device)

                prediction, h = self.policy_net.forward(
                    states,
                    batch_size=args.batch_size,
                    time_step=args.time_step,
                    hidden_state=hidden_batch,
                    cell_state=cell_batch)

                ground_truth = torch_labels[:,
                                            args.time_step - 1, :].squeeze(1)

                loss = BCELoss(prediction, ground_truth)

                # print and plot
                detached_loss = loss.detach().cpu()
                if print_variable['loss'] is not None:
                    print_variable['loss'] = torch.cat(
                        (print_variable['loss'], detached_loss.unsqueeze(0)))
                else:
                    print_variable['loss'] = detached_loss.unsqueeze(0)
                self.plot_variables(self.figure, print_variable,
                                    'Training Imitation Agent')
                epoch_loss.append(detached_loss.item())

                # Optimize the model
                self.policy_optimizer.zero_grad()
                self.conv_optimizer.zero_grad()
                loss.backward()
                for param in self.policy_net.parameters():
                    param.grad.data.clamp_(-1, 1)
                for param in self.conv_net.parameters():
                    param.grad.data.clamp_(-1, 1)

                self.policy_optimizer.step()
                self.conv_optimizer.step()

            # end of training epoch
            print(
                "Epoch {} took {}s, average loss of this epoch: {}\n\n".format(
                    ii,
                    time.time() - start_time, np.average(epoch_loss)))

            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                policy_save_path = os.path.join(
                    args.output_dir, 'policy_checkpoint_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epoch': ii,
                        'policy_net': self.policy_net.state_dict(),
                        'policy_optimizer': self.policy_optimizer.state_dict(),
                    }, policy_save_path)

                conv_save_path = os.path.join(
                    args.output_dir, 'conv_checkpoint_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epoch': ii,
                        'conv_net': self.conv_net.state_dict(),
                        'conv_optimizer': self.conv_optimizer.state_dict(),
                    }, conv_save_path)

        savefig_path = os.path.join(args.output_dir, 'training_imitation')
        self.figure[0].savefig(savefig_path, format='png')

        print('Training done')
        plt.show()
        return print_variable
    def train_POMDP(self):
        args = self.args
        ROOT_DIR = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))  # corl2019
        PARENT_DIR = os.path.dirname(ROOT_DIR)  # research
        # Create the output directory if it does not exist
        output_dir = os.path.join(PARENT_DIR, 'multistep_pomdp',
                                  args.output_dir)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # write args to file
        with open(os.path.join(output_dir, 'args.txt'), 'w+') as f:
            json.dump(args.__dict__, f, indent=2)

        # Create our policy net and a target net
        self.policy_net_1 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_3 = DRQN(args.indim, args.outdim).to(args.device)

        # Set up the optimizer
        self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters())
        self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters())
        self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters())
        self.memory = RecurrentMemory(800)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.policy_net_3.load_state_dict(checkpoint['policy_net_3'])
                self.optimizer_1.load_state_dict(checkpoint['optimizer_1'])
                self.optimizer_2.load_state_dict(checkpoint['optimizer_2'])
                self.optimizer_3.load_state_dict(checkpoint['optimizer_3'])
                start_episode = checkpoint['epochs']
                self.steps_done = checkpoint['steps_done']
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        action_space = ActionSpace(dp=0.06, df=10)

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            self.steps_done += 1
            act_sequence = []
            if args.sim:
                sim_params = init_model(robot.mj_sim)
                robot.reset_simulation()
                ret = robot.grasp_handle()
                if not ret:
                    continue

                # Local memory for current episode
                localMemory = []

                # Get current observation
                hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                    batch_size=1, device=args.device)
                hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                    batch_size=1, device=args.device)
                hidden_state_3, cell_state_3 = self.policy_net_3.init_hidden_states(
                    batch_size=1, device=args.device)
                observation_space = TactileObs(
                    robot.get_gripper_xpos(),  # 24
                    robot.get_all_touch_buffer(args.hap_sample))  # 30 x 7
                broken_so_far = 0

            for t in count():
                if not args.quiet and t % 50 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                observation = observation_space.get_state()
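                # Ablation flags (as written): args.position drops the first 6 entries,
                # args.shear masks entries 6:166, args.force keeps only the first 166.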
                if args.position:
                    observation = observation[6:]
                if args.shear:
                    indices = np.ones(len(observation), dtype=bool)
                    indices[6:166] = False
                    observation = observation[indices]
                if args.force:
                    observation = observation[:166]
                normalizer.observe(observation)
                observation = normalizer.normalize(observation)
                action, hidden_state_1, cell_state_1 = self.select_action(
                    observation, hidden_state_1, cell_state_1)

                # record actions in this epoch
                act_sequence.append(action)

                # Perform action
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(robot.get_gripper_jpos()[:3],
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))

                if args.sim:
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Get reward
                    done, num = robot.update_tendons()
                    failure = robot.check_slippage()
                    if num > broken_so_far:
                        reward = num - broken_so_far
                        broken_so_far = num
                    else:
                        reward = 0

                    # # Add a movement reward
                    # reward -= 0.05 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)

                    # Observe new state
                    observation_space.update(
                        robot.get_gripper_xpos(),  # 24
                        robot.get_all_touch_buffer(args.hap_sample))  # 30x7

                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                # Check if done
                if not done and not failure:
                    next_state = observation_space.get_state()
                    if args.position:
                        next_state = next_state[6:]
                    if args.shear:
                        indices = np.ones(len(next_state), dtype=bool)
                        indices[6:166] = False
                        next_state = next_state[indices]
                    if args.force:
                        next_state = next_state[:166]
                    normalizer.observe(next_state)
                    next_state = normalizer.normalize(next_state)
                else:
                    next_state = None

                # Push new Transition into memory
                localMemory.append(
                    Transition(observation, action, next_state, reward))

                # Optimize the model
                if t % 10 == 0:
                    loss = self.optimize_model()
                #     if loss:
                #         print_variables['loss'].append(loss.item())

                # If we are done, reset the model
                if done or failure:
                    self.memory.push(localMemory)
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(sim_params))
                    print("Actions in this epoch are: {}".format(act_sequence))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'policy_net_3': self.policy_net_3.state_dict(),
                        'optimizer_1': self.optimizer_1.state_dict(),
                        'optimizer_2': self.optimizer_2.state_dict(),
                        'optimizer_3': self.optimizer_3.state_dict(),
                    }, save_path)

            self.memory.save_memory(os.path.join(output_dir, 'memory.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables
Example #9
    def train_MDP(self):
        args = self.args
        # Create the output directory if it does not exist
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)

        # Create our policy net and a target net
        self.policy_net_1 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_3 = DQN(args.indim, args.outdim).to(args.device)

        self.target_net = DQN(args.indim, args.outdim).to(args.device)
        self.target_net.load_state_dict(self.policy_net_1.state_dict())
        self.target_net.eval()

        # Set up the optimizer
        self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                         args.lr)
        self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                         args.lr)
        self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters(),
                                         args.lr)
        self.memory = ReplayMemory(500000)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Normalizer(args.indim, device=args.device)
        print_variables = {'durations': [], 'rewards': [], 'loss': []}

        # Load old checkpoint if provided
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(
                    checkpoint['model_state_dict'])
                self.policy_net_2.load_state_dict(
                    checkpoint['model_state_dict'])
                self.policy_net_3.load_state_dict(
                    checkpoint['model_state_dict'])
                self.target_net.load_state_dict(checkpoint['model_state_dict'])
                start_episode = checkpoint['epoch']
                self.steps_done = start_episode
                self.optimizer_1.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                self.optimizer_2.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                self.optimizer_3.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_geom_mdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        action_space = ActionSpace(dp=0.06, df=10)

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            if args.sim:
                # Create robot, reset simulation and grasp handle
                model, model_params = init_model(args.model_path)
                sim = MjSim(model)
                sim.step()
                if args.render:
                    viewer = MjViewer(sim)
                else:
                    viewer = None

                sim_param = SimParameter(sim)
                robot = RobotSim(sim, viewer, sim_param, args.render,
                                 self.breaking_threshold)
                robot.reset_simulation()
                ret = robot.grasp_handle()
                if not ret:
                    continue

                # Get current state
                state_space = Observation(
                    robot.get_gripper_jpos(),
                    robot.get_shear_buffer(args.hap_sample),
                    robot.get_all_touch_buffer(args.hap_sample))
                broken_so_far = 0

            for t in count():
                if not args.quiet and t % 20 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                state = state_space.get_state()
                if args.position:
                    state = state[6:]
                if args.shear:
                    indices = np.ones(len(state), dtype=bool)
                    indices[6:166] = False
                    state = state[indices]
                if args.force:
                    state = state[:166]
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = self.select_action(state)

                # Perform action
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(state_space.get_current_position(),
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))

                if args.sim:
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Get reward
                    done, num = robot.update_tendons()
                    failure = robot.check_slippage()
                    if num > broken_so_far:
                        reward = num - broken_so_far
                        broken_so_far = num
                    else:
                        reward = 0

                    # # Add a movement reward
                    # reward -= 0.1 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)

                    # Observe new state
                    state_space.update(
                        robot.get_gripper_jpos(),
                        robot.get_shear_buffer(args.hap_sample),
                        robot.get_all_touch_buffer(args.hap_sample))

                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                # Check if done
                if not done and not failure:
                    next_state = state_space.get_state()
                    if args.position:
                        next_state = next_state[6:]
                    if args.shear:
                        indices = np.ones(len(next_state), dtype=bool)
                        indices[6:166] = False
                        next_state = next_state[indices]
                    if args.force:
                        next_state = next_state[:166]
                    normalizer.observe(next_state)
                    next_state = normalizer.normalize(next_state)
                else:
                    next_state = None

                # Push new Transition into memory
                self.memory.push(state, action, next_state, reward)

                # Optimize the model
                loss = self.optimize_model()
                #        if loss:
                #            print_variables['loss'].append(loss.item())

                # If we are done, reset the model
                if done or failure:
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   'Training MDP')
                    print("Model parameters: {}".format(model_params))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))
                    break

            # Update the target network, every x iterations
            if ii % 10 == 0:
                self.target_net.load_state_dict(self.policy_net_1.state_dict())

            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    args.output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epoch': ii,
                        'model_state_dict': self.target_net.state_dict(),
                        'optimizer_state_dict': self.optimizer_1.state_dict(),
                    }, save_path)

        # Save normalizer state for inference
        normalizer.save_state(
            os.path.join(args.output_dir, 'normalizer_state.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables
Example #11
class TrajLogger:
    def __init__(self, args, robot, velcro_util):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']
        self.action_space = ActionSpace(dp=2.5 * args.act_mag, df=10)
        self.robot = robot
        self.velcro_util = velcro_util

    def select_action(self):
        sample = random.random()
        p_threshold = self.args.p
        if sample > p_threshold:
            return self.expert_action()
        else:
            action = random.randrange(6)
            return self.action_space.get_action(
                self.ACTIONS[action])['delta'][:3]

    def expert_action(self):
        norms = self.velcro_util.break_norm()
        centers = self.velcro_util.break_center()
        fl_center = centers[:3]
        fs_center = centers[3:]
        fl_norm = norms[:3]
        fs_norm = norms[3:6]
        break_dir_norm = norms[6:9]

        action_direction = self.args.act_mag * (-0.5 * fl_norm +
                                                0.5 * break_dir_norm)
        return action_direction

    def test_network(self, performance):
        args = self.args
        max_iterations = args.max_iter

        broken_so_far = 0
        expert_traj = []

        for t in range(max_iterations):
            delta = self.select_action()

            # sample images and norm info
            sample = random.random()
            if sample < args.sample_ratio:
                img = self.robot.get_img(args.img_w, args.img_h, 'c1',
                                         args.depth)
                if args.depth:
                    depth = norm_depth(img[1])
                    img = norm_img(img[0])
                    img_norm = np.empty((4, args.img_w, args.img_h))
                    img_norm[:3, :, :] = img
                    img_norm[3, :, :] = depth
                else:
                    img_norm = norm_img(img)

                expert_traj.append({
                    'image': img_norm,
                    'norm': self.velcro_util.break_norm(),
                    'center': self.velcro_util.break_center(),
                    'gpos': self.robot.get_gripper_jpos()[:3]
                })

            # perform action
            target_position = np.add(self.robot.get_gripper_jpos()[:3],
                                     np.array(delta))
            target_pose = np.hstack(
                (target_position, self.robot.get_gripper_jpos()[3:]))
            self.robot.move_joint(target_pose,
                                  True,
                                  args.grip_force,
                                  hap_sample=args.hap_sample)

            # Get reward
            done, num = self.robot.update_tendons()
            failure = self.robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num

            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
                return performance, expert_traj
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
                return performance, expert_traj
        # exceed max iterations
        performance['success'].append(0)
        performance['time'].append(max_iterations)
        return performance, expert_traj
    def train_POMDP(self):
        args = self.args
        # Create the output directory if it does not exist
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)

        # Create our policy net and a target net
        self.policy_net_1 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.indim, args.outdim).to(args.device)
        self.conv_net_1 = ConvNet(args.ftdim, args.depth).to(args.device)
        self.conv_net_2 = ConvNet(args.ftdim, args.depth).to(args.device)

        # Set up the optimizer
        self.policy_optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                                lr=args.lr)
        self.policy_optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                                lr=args.lr)
        self.conv_optimizer_1 = optim.RMSprop(self.conv_net_1.parameters(),
                                              lr=1e-5)
        self.conv_optimizer_2 = optim.RMSprop(self.conv_net_2.parameters(),
                                              lr=1e-5)
        self.memory = RecurrentMemory(70)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim - args.ftdim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.conv_net_1.load_state_dict(checkpoint['conv_net_1'])
                self.conv_net_2.load_state_dict(checkpoint['conv_net_2'])
                self.policy_optimizer_1.load_state_dict(
                    checkpoint['policy_optimizer_1'])
                self.policy_optimizer_2.load_state_dict(
                    checkpoint['policy_optimizer_2'])
                self.conv_optimizer_1.load_state_dict(
                    checkpoint['conv_optimizer_1'])
                self.conv_optimizer_2.load_state_dict(
                    checkpoint['conv_optimizer_2'])
                start_episode = checkpoint['epoch']
                self.steps_done = checkpoint['steps_done']
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        if args.weight_conv:
            checkpoint = torch.load(args.weight_conv)
            self.conv_net_1.load_state_dict(checkpoint['conv_net'])
            self.conv_optimizer_1.load_state_dict(checkpoint['conv_optimizer'])
            self.conv_net_2.load_state_dict(checkpoint['conv_net'])
            self.conv_optimizer_2.load_state_dict(checkpoint['conv_optimizer'])

        action_space = ActionSpace(dp=0.06, df=10)

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)
        tactile_obs_space = TactileObs(
            robot.get_gripper_jpos(),  # 6
            robot.get_all_touch_buffer(args.hap_sample))  # 30 x 12

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            act_sequence = []
            velcro_params = init_model(robot.mj_sim)
            robot.reset_simulation()
            ret = robot.grasp_handle()
            if not ret:
                continue

            # Local memory for current episode
            localMemory = []

            # Get current observation
            hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                batch_size=1, device=args.device)
            hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                batch_size=1, device=args.device)

            broken_so_far = 0

            for t in count():
                if not args.quiet and t % 50 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                tactile_obs = tactile_obs_space.get_state()
                normalizer.observe(tactile_obs)
                tactile_obs = normalizer.normalize(tactile_obs)
                # Get image and normalize it
                img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
                if args.depth:
                    depth = norm_depth(img[1])
                    img = norm_img(img[0])
                    img_norm = np.empty((4, args.img_w, args.img_h))
                    img_norm[:3, :, :] = img
                    img_norm[3, :, :] = depth
                else:
                    img_norm = norm_img(img)

                observation = [tactile_obs, img_norm]
                action, hidden_state_1, cell_state_1 = self.select_action(
                    observation, hidden_state_1, cell_state_1)

                # record actions in this epoch
                act_sequence.append(action)

                # Perform action
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(robot.get_gripper_jpos()[:3],
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))
                robot.move_joint(target_pose,
                                 True,
                                 self.gripping_force,
                                 hap_sample=args.hap_sample)

                # Get reward
                done, num = robot.update_tendons()
                failure = robot.check_slippage()
                if num > broken_so_far:
                    reward = num - broken_so_far
                    broken_so_far = num
                else:
                    reward = 0

                # Observe new state
                tactile_obs_space.update(
                    robot.get_gripper_jpos(),  # 6
                    robot.get_all_touch_buffer(args.hap_sample))  # 30x12

                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                # Check if done
                if not done and not failure:
                    next_tactile_obs = tactile_obs_space.get_state()
                    normalizer.observe(next_tactile_obs)
                    next_tactile_obs = normalizer.normalize(next_tactile_obs)
                    # Get image and normalize it
                    next_img = robot.get_img(args.img_w, args.img_h, 'c1',
                                             args.depth)
                    if args.depth:
                        next_depth = norm_depth(next_img[1])
                        next_img = norm_img(next_img[0])
                        next_img_norm = np.empty((4, args.img_w, args.img_h))
                        next_img_norm[:3, :, :] = next_img
                        next_img_norm[3, :, :] = next_depth
                    else:
                        next_img_norm = norm_img(next_img)
                    next_state = [next_tactile_obs, next_img_norm]
                else:
                    next_state = None

                # Push new Transition into memory
                localMemory.append(
                    Transition(observation, action, next_state, reward))

                # Optimize the model
                if self.steps_done % 10 == 0:
                    self.optimize()

                # If we are done, reset the model
                if done or failure:
                    self.memory.push(localMemory)
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(velcro_params))
                    print("Actions in this epoch are: {}".format(act_sequence))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    args.output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'conv_net_1': self.conv_net_1.state_dict(),
                        'conv_net_2': self.conv_net_2.state_dict(),
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'conv_optimizer_1': self.conv_optimizer_1.state_dict(),
                        'conv_optimizer_2': self.conv_optimizer_2.state_dict(),
                        'policy_optimizer_1':
                        self.policy_optimizer_1.state_dict(),
                        'policy_optimizer_2':
                        self.policy_optimizer_2.state_dict(),
                    }, save_path)

        # Save normalizer state for inference
        normalizer.save_state(
            os.path.join(args.output_dir, 'normalizer_state.pickle'))

        self.memory.save_memory(os.path.join(args.output_dir, 'memory.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables
Example #13
def test_network(args, policy_net, tactile_net, normalizer, robot, obs_space,
                 performance):
    hidden_state, cell_state = policy_net.init_hidden_states(1, args.device)

    action_space = ActionSpace(dp=0.06, df=10)
    ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']
    broken_so_far = 0

    t = 0
    action = 4
    collision = 0

    while t < args.max_iter:

        if args.position:
            multistep_obs = np.empty((0, args.indim - 6))
        elif args.force:
            multistep_obs = np.empty((0, args.indim - 390))
        else:
            multistep_obs = np.empty((0, args.indim))

        prev_action = action

        for k in range(args.len_ub):
            # Observe tactile features and stack them
            tactile_obs = obs_space.get_state()
            normalizer.observe(tactile_obs)
            tactile_obs = normalizer.normalize(tactile_obs)

            if args.position:
                tactile_obs = tactile_obs[6:]
            elif args.force:
                tactile_obs = tactile_obs[:6]

            multistep_obs = np.vstack((multistep_obs, tactile_obs))

            # current jpos
            current_pos = robot.get_gripper_jpos()[:3]

            # Perform action
            delta = action_space.get_action(ACTIONS[action])['delta'][:3]
            target_position = np.add(robot.get_gripper_jpos()[:3],
                                     np.array(delta))
            target_pose = np.hstack(
                (target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose,
                             True,
                             args.grip_force,
                             hap_sample=args.hap_sample)

            # check collision number
            collision += robot.feedback_buffer['collision']

            # Observe new state
            obs_space.update(
                robot.get_gripper_xpos(),  # 24
                robot.get_all_touch_buffer(args.hap_sample))  # 30x6

            displacement = la.norm(robot.get_gripper_jpos()[:3] - current_pos)

            if displacement / 0.06 < 0.7:
                break

        # Feed the stitched multi-step tactile observation into the tactile net to generate a tactile feature
        action, hidden_state, cell_state = select_action(
            args, multistep_obs, policy_net, tactile_net, hidden_state,
            cell_state)

        # record actions in this epoch
        # act_sequence.append(prev_action)

        # Get reward
        done, num = robot.update_tendons()
        failure = robot.check_slippage()
        if num > broken_so_far:
            broken_so_far = num

        t += k + 1

        if done or failure:
            ratio_broken = float(num) / float(NUM_TENDON)
            if ratio_broken < 0.2:
                performance['tendon_hist'][0] += 1
            elif ratio_broken >= 0.2 and ratio_broken < 0.4:
                performance['tendon_hist'][1] += 1
            elif ratio_broken >= 0.4 and ratio_broken < 0.6:
                performance['tendon_hist'][2] += 1
            elif ratio_broken >= 0.6 and ratio_broken < 0.8:
                performance['tendon_hist'][3] += 1
            else:
                performance['tendon_hist'][4] += 1
            performance['num_broken'].append(num)
            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
            performance['collision'].append(collision)
            return performance

    ################## exceed max iterations ####################
    performance['success'].append(0)
    performance['time'].append(args.max_iter)
    ratio_broken = float(num) / float(NUM_TENDON)
    performance['num_broken'].append(num)
    if ratio_broken < 0.2:
        performance['tendon_hist'][0] += 1
    elif ratio_broken >= 0.2 and ratio_broken < 0.4:
        performance['tendon_hist'][1] += 1
    elif ratio_broken >= 0.4 and ratio_broken < 0.6:
        performance['tendon_hist'][2] += 1
    elif ratio_broken >= 0.6 and ratio_broken < 0.8:
        performance['tendon_hist'][3] += 1
    else:
        performance['tendon_hist'][4] += 1
    performance['collision'].append(collision)
    return performance
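
# test_network above assumes the caller pre-populates `performance`. A sketch
# of the expected layout, inferred from the keys used in this snippet; the
# five tendon-histogram bins cover 0.2-wide ranges of the broken-tendon ratio.
performance = {
    'success': [],               # 1 if the velcro was fully opened, else 0
    'time': [],                  # steps taken, capped at args.max_iter
    'num_broken': [],            # tendons broken when the episode ended
    'tendon_hist': [0, 0, 0, 0, 0],
    'collision': [],             # accumulated collision count per episode
}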
    def test_network(self, performance):
        args = self.args
        max_iterations = args.max_iter

        # Get current state
        state_space = Observation(self.robot.get_gripper_jpos(),     # 6
                                  self.velcro_util.break_center(),   # 6
                                  self.velcro_util.break_norm())     # 12

        tactile_obs_space = TactileObs(self.robot.get_gripper_jpos(),  # 6
                                       self.robot.get_all_touch_buffer(args.hap_sample))  # 30 x 12

        action_space = ActionSpace(dp=0.06, df=10)
        
        broken_so_far = 0
        expert_traj = {'image': None, 'tactile': [], 'action': [], 'position': []}

        img = self.robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
        if args.depth:
            depth = norm_depth(img[1])
            img = norm_img(img[0])
            img_norm = np.empty((4, args.img_w, args.img_h))
            img_norm[:3,:,:] = img
            img_norm[3,:,:] = depth
        else:
            img_norm = norm_img(img)
        expert_traj['image'] = img_norm

        
        for t in range(max_iterations):
            # Observe state and normalize
            state = state_space.get_state()
            # self.normalizer.observe(state[:12])
            state[:12] = self.normalizer.normalize(state[:12])
            action = self.select_action(self.policy_net, state)
            performance['action_hist'][action] += 1

            # record tactile and visual observation, corresponding action
            tactile_obs_space.update(self.robot.get_gripper_jpos(),            # 6
                                        self.robot.get_all_touch_buffer(args.hap_sample))
            tactile_obs = tactile_obs_space.get_state()
            tactile_obs = self.tactile_normalizer.normalize(tactile_obs)
            
            expert_traj['tactile'].append(tactile_obs.tolist())
            expert_traj['action'].append(action)
            expert_traj['position'].append(self.robot.get_gripper_jpos()[:3].tolist())

            # perform action
            delta = action_space.get_action(self.ACTIONS[action])['delta'][:3]
            target_position = np.add(self.robot.get_gripper_jpos()[:3], np.array(delta))
            target_pose = np.hstack((target_position, self.robot.get_gripper_jpos()[3:]))
            self.robot.move_joint(target_pose, True, args.grip_force, hap_sample = args.hap_sample)
            
            # Get reward
            done, num = self.robot.update_tendons()
            failure = self.robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num

            if not done and not failure:
                # Observe new state
                state_space.update( self.robot.get_gripper_jpos(),  # 6
                                    self.velcro_util.break_center(),         # 6
                                    self.velcro_util.break_norm())  # 12
            else:
                if done:
                    performance['success'].append(1)
                    performance['time'].append(t + 1)
                if failure:
                    performance['success'].append(0)
                    performance['time'].append(t + 1)
                return performance, expert_traj
        # exceed max iterations
        performance['success'].append(0)
        performance['time'].append(max_iterations)
        return performance, expert_traj
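
# Layout of the expert_traj dict produced above (one per episode):
#   'image'    - normalized RGB image, or a (4, img_w, img_h) RGB-D array
#                when args.depth is set
#   'tactile'  - normalized tactile observation per step
#   'action'   - action index into self.ACTIONS per step
#   'position' - gripper xyz position per step
# One plausible way to persist collected trajectories; the path and the
# `collected_trajectories` list are illustrative, not from this snippet.
import pickle

collected_trajectories = []   # filled by the caller with expert_traj dicts
with open('expert_trajectories.pkl', 'wb') as f:
    pickle.dump(collected_trajectories, f)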
Example #15
def main(args):
    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)
    # load all velcro parameters
    model_dir = os.path.dirname(args.model_path)
    param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    velcro_params = pickle.load(open(param_path, 'rb'))

    velcro_util = VelcroUtil(robot, sim_param)
    state_space = Observation(
        robot.get_gripper_jpos(),  # 6
        velcro_util.break_center(),  # 6
        velcro_util.break_norm())
    action_space = ActionSpace(dp=0.06, df=10)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.clear()

    for item in VELCRO_PARAMS:
        geom_type, origin_offset, euler, radius = item
        print('Velcro parameters are: {}, {}, {}, {}'.format(
            geom_type, origin_offset, euler, radius))
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        broken_so_far = 0

        ax.clear()

        for i in range(args.max_iter):
            # check velcro breakline geometry
            norms = velcro_util.break_norm()
            centers = velcro_util.break_center()
            fl_center = centers[:3]
            fs_center = centers[3:]
            fl_norm = norms[:3]
            fs_norm = norms[3:6]
            break_dir_norm = norms[6:9]

            action_direction = args.act_mag * (-0.5 * fl_norm +
                                               0.5 * break_dir_norm)

            gripper_pose = robot.get_gripper_jpos()[:3]

            # Perform action
            target_position = np.add(robot.get_gripper_jpos()[:3],
                                     action_direction)
            target_pose = np.hstack(
                (target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose, True, 300, hap_sample=30)

            ax.scatter(fl_center[0],
                       fl_center[1],
                       fl_center[2],
                       c='r',
                       marker='o')
            ax.scatter(fs_center[0],
                       fs_center[1],
                       fs_center[2],
                       c='r',
                       marker='o')
            X, Y, Z = lines(fl_center, fl_norm)
            ax.plot(X, Y, Z, c='b')
            X, Y, Z = lines(fs_center, fs_norm)
            ax.plot(X, Y, Z, c='b')

            # plot gripper position and action direction
            ax.scatter(gripper_pose[0],
                       gripper_pose[1],
                       gripper_pose[2],
                       c='r',
                       marker='v')
            X, Y, Z = lines(gripper_pose, action_direction / args.act_mag)
            ax.plot(X, Y, Z, c='g')
            plt.pause(0.001)

            # check tendons and slippage
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num

            if done or failure:
                break
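
# The lines() helper used for plotting above is not defined in this snippet.
# A minimal sketch consistent with how it is called (a short 3D segment from
# a point along a direction); the 0.05 segment length is an arbitrary choice.
import numpy as np

def lines(origin, direction, length=0.05):
    origin = np.asarray(origin, dtype=float)
    end = origin + length * np.asarray(direction, dtype=float)
    # Return per-axis coordinate pairs in the form Axes3D.plot expects.
    return [origin[0], end[0]], [origin[1], end[1]], [origin[2], end[2]]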
def test_network(args, policy_net, normalizer, robot, state_space, performance):
    hidden_state, cell_state = policy_net.init_hidden_states(args.batch_size, args.device)

    action_space = ActionSpace(dp=0.06, df=10)
    ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']
    broken_so_far = 0

    t = 0
    action = 4
    collision = 0
    
    while t < args.max_iter:
        # Observe state and normalize
        state = state_space.get_state()
        normalizer.observe(state)
        state = normalizer.normalize(state)

        torch_observation = torch.from_numpy(state).float().to(args.device).unsqueeze(0)
        model_out = policy_net(torch_observation, batch_size=1, time_step=1, hidden_state=hidden_state, cell_state=cell_state)
        out = model_out[0]
        hidden_state = model_out[1][0]
        cell_state = model_out[1][1]
        # print(out[0])
        action_key = int(torch.argmax(out[0]))
        action = select_action(action_key)
        performance['action_hist'][action] += 1

        # perform action
        delta = action_space.get_action(ACTIONS[action_key])['delta'][:3]
        target_position = np.add(robot.get_gripper_jpos()[:3], np.array(delta))
        target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
        robot.move_joint(target_pose, True, args.grip_force, hap_sample=args.hap_sample)
        
        # Get reward
        done, num = robot.update_tendons()
        failure = robot.check_slippage()

        # Observe new state
        state_space.update(robot.get_gripper_xpos(),            # 24
                                 robot.get_all_touch_buffer(args.hap_sample))     # 30x6
        if num > broken_so_far:
            broken_so_far = num

        t += 1

        if done or failure:
            ratio_broken = float(num) / float(NUM_TENDON)
            if ratio_broken < 0.2:
                performance['tendon_hist'][0] += 1
            elif ratio_broken >= 0.2 and ratio_broken < 0.4:
                performance['tendon_hist'][1] += 1
            elif ratio_broken >= 0.4 and ratio_broken < 0.6:
                performance['tendon_hist'][2] += 1
            elif ratio_broken >= 0.6 and ratio_broken < 0.8:
                performance['tendon_hist'][3] += 1
            else:
                performance['tendon_hist'][4] += 1
            performance['num_broken'].append(num)
            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
            performance['collision'].append(collision)
            return performance

    ################## exceed max iterations ####################
    performance['success'].append(0)
    performance['time'].append(args.max_iter)
    ratio_broken = float(num) / float(NUM_TENDON)
    performance['num_broken'].append(num)
    if ratio_broken < 0.2:
        performance['tendon_hist'][0] += 1
    elif ratio_broken >= 0.2 and ratio_broken < 0.4:
        performance['tendon_hist'][1] += 1
    elif ratio_broken >= 0.4 and ratio_broken < 0.6:
        performance['tendon_hist'][2] += 1
    elif ratio_broken >= 0.6 and ratio_broken < 0.8:
        performance['tendon_hist'][3] += 1
    else:
        performance['tendon_hist'][4] += 1
    performance['collision'].append(collision)
    return performance
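
# The tendon-histogram bucketing above appears twice; an equivalent helper
# (an illustrative refactor, not part of the original code) could replace
# both elif chains without changing behavior.
def tendon_bin(num_broken, num_tendon=NUM_TENDON):
    # Count how many of the 0.2/0.4/0.6/0.8 thresholds the ratio reaches,
    # which reproduces the five bins used above exactly.
    ratio = float(num_broken) / float(num_tendon)
    return sum(ratio >= b for b in (0.2, 0.4, 0.6, 0.8))

# Usage: performance['tendon_hist'][tendon_bin(num)] += 1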
    def train_POMDP(self):
        args = self.args
        ROOT_DIR = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))  # corl2019
        PARENT_DIR = os.path.dirname(ROOT_DIR)  # research
        # Create the output directory if it does not exist
        output_dir = os.path.join(PARENT_DIR, 'multistep_pomdp',
                                  args.output_dir)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # write args to file
        with open(os.path.join(output_dir, 'args.txt'), 'w+') as f:
            json.dump(args.__dict__, f, indent=2)

        # Create our policy net and a target net
        self.policy_net_1 = DRQN(args.ftdim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.ftdim, args.outdim).to(args.device)
        if args.position:
            self.tactile_net_1 = TactileNet(args.indim - 6,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim - 6,
                                            args.ftdim).to(args.device)
        elif args.force:
            self.tactile_net_1 = TactileNet(args.indim - 390,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim - 390,
                                            args.ftdim).to(args.device)
        else:
            self.tactile_net_1 = TactileNet(args.indim,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim,
                                            args.ftdim).to(args.device)

        # Set up the optimizer
        self.policy_optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                                lr=args.lr)
        self.policy_optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                                lr=args.lr)
        self.tactile_optimizer_1 = optim.RMSprop(
            self.tactile_net_1.parameters(), lr=args.lr)
        self.tactile_optimizer_2 = optim.RMSprop(
            self.tactile_net_2.parameters(), lr=args.lr)
        self.memory = RecurrentMemory(800)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        if args.weight_policy:
            if os.path.exists(args.weight_policy):
                checkpoint = torch.load(args.weight_policy)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.policy_optimizer_1.load_state_dict(
                    checkpoint['policy_optimizer_1'])
                self.policy_optimizer_2.load_state_dict(
                    checkpoint['policy_optimizer_2'])
                start_episode = checkpoint['epochs']
                self.steps_done = checkpoint['steps_done']
                with open(
                        os.path.join(os.path.dirname(args.weight_policy),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        if args.weight_tactile:
            checkpoint = torch.load(args.weight_tactile)
            self.tactile_net_1.load_state_dict(checkpoint['tactile_net_1'])
            self.tactile_optimizer_1.load_state_dict(
                checkpoint['tactile_optimizer_1'])
            self.tactile_net_2.load_state_dict(checkpoint['tactile_net_2'])
            self.tactile_optimizer_2.load_state_dict(
                checkpoint['tactile_optimizer_2'])

        action_space = ActionSpace(dp=0.06, df=10)

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)

        tactile_obs_space = TactileObs(
            robot.get_gripper_xpos(),  # 24
            robot.get_all_touch_buffer(args.hap_sample))  # 30 x 6

        # Main training loop
        for ii in range(start_episode, args.epochs):
            self.steps_done += 1
            start_time = time.time()
            act_sequence = []
            act_length = []
            velcro_params = init_model(robot.mj_sim)
            robot.reset_simulation()
            ret = robot.grasp_handle()
            if not ret:
                continue

            # Local memory for current episode
            localMemory = []

            # Get current observation
            hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                batch_size=1, device=args.device)
            hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                batch_size=1, device=args.device)

            broken_so_far = 0

            # pick a random action initially
            action = random.randrange(0, 5)
            current_state = None
            next_state = None

            t = 0

            while t < args.max_iter:
                if not args.quiet and t == 0:
                    print("Running training episode: {}".format(ii, t))

                if args.position:
                    multistep_obs = np.empty((0, args.indim - 6))
                elif args.force:
                    multistep_obs = np.empty((0, args.indim - 390))
                else:
                    multistep_obs = np.empty((0, args.indim))

                prev_action = action

                for k in range(args.len_ub):
                    # Observe tactile features and stack them
                    tactile_obs = tactile_obs_space.get_state()
                    normalizer.observe(tactile_obs)
                    tactile_obs = normalizer.normalize(tactile_obs)

                    if args.position:
                        tactile_obs = tactile_obs[6:]
                    elif args.force:
                        tactile_obs = tactile_obs[:6]

                    multistep_obs = np.vstack((multistep_obs, tactile_obs))

                    # current jpos
                    current_pos = robot.get_gripper_jpos()[:3]

                    # Perform action
                    delta = action_space.get_action(
                        self.ACTIONS[action])['delta'][:3]
                    target_position = np.add(robot.get_gripper_jpos()[:3],
                                             np.array(delta))
                    target_pose = np.hstack(
                        (target_position, robot.get_gripper_jpos()[3:]))
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Observe new state
                    tactile_obs_space.update(
                        robot.get_gripper_xpos(),  # 24
                        robot.get_all_touch_buffer(args.hap_sample))  # 30x6

                    displacement = la.norm(robot.get_gripper_jpos()[:3] -
                                           current_pos)

                    if displacement / 0.06 < 0.7:
                        break

                # Feed the stitched multi-step tactile observation into the tactile net to generate a tactile feature
                action, hidden_state_1, cell_state_1 = self.select_action(
                    multistep_obs, hidden_state_1, cell_state_1)

                if t == 0:
                    next_state = multistep_obs.copy()
                else:
                    current_state = next_state.copy()
                    next_state = multistep_obs.copy()

                # record actions in this epoch
                act_sequence.append(prev_action)
                act_length.append(k)

                # Get reward
                done, num = robot.update_tendons()
                failure = robot.check_slippage()
                if num > broken_so_far:
                    reward = num - broken_so_far
                    broken_so_far = num
                else:
                    if failure:
                        reward = -20
                    else:
                        reward = 0

                t += k + 1
                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                if done or failure:
                    next_state = None

                # Push new Transition into memory
                if t > k + 1:
                    localMemory.append(
                        Transition(current_state, prev_action, next_state,
                                   reward))

                # Optimize the model
                if self.steps_done % 10 == 0:
                    self.optimize()

                # If we are done, reset the model
                if done or failure:
                    self.memory.push(localMemory)
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(velcro_params))
                    print(
                        "{} actions in this epoch: {}\nAction lengths: {}"
                        .format(len(act_sequence), act_sequence, act_length))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save a checkpoint every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(output_dir,
                                         'policy_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'policy_optimizer_1':
                        self.policy_optimizer_1.state_dict(),
                        'policy_optimizer_2':
                        self.policy_optimizer_2.state_dict(),
                    }, save_path)
                save_path = os.path.join(output_dir,
                                         'tactile_' + str(ii) + '.pth')
                torch.save(
                    {
                        'tactile_net_1':
                        self.tactile_net_1.state_dict(),
                        'tactile_net_2':
                        self.tactile_net_2.state_dict(),
                        'tactile_optimizer_1':
                        self.tactile_optimizer_1.state_dict(),
                        'tactile_optimizer_2':
                        self.tactile_optimizer_2.state_dict(),
                    }, save_path)

                write_results(os.path.join(output_dir, 'results_pomdp.pkl'),
                              print_variables)

                self.memory.save_memory(
                    os.path.join(output_dir, 'memory.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables
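
# train_POMDP reads its configuration from args. A hypothetical namespace
# covering the fields referenced above; every value is a placeholder, not
# taken from this snippet. Pass it to the trainer's constructor and then
# call train_POMDP() on the resulting object.
from argparse import Namespace

args = Namespace(
    output_dir='pomdp_run', model_path='models/velcro.xml',
    indim=396, ftdim=128, outdim=6, lr=1e-4, device='cuda',
    epochs=5000, max_iter=200, len_ub=10, hap_sample=30,
    position=False, force=False, quiet=False, render=False,
    save_freq=100, savefig_path='',
    weight_policy='', weight_tactile='', normalizer_file='', memory='',
    break_thresh=0.06, grip_force=300)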