    def reset(self):
        args = self.args
        model, self.model_params = init_model(args.model_path, geom=False)
        sim = MjSim(model)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        sim_param = SimParameter(sim)
        robot = RobotSim(sim, viewer, sim_param, args.render, self.break_threshold)
        self.robot = robot
        robot.reset_simulation()
        ret = robot.grasp_handle()
        if not ret:
            return None

        self.obs_space = Observation(robot.get_gripper_jpos(),
                                     robot.get_num_broken_buffer(args.hap_sample),
                                     robot.get_main_touch_buffer(args.hap_sample))

        self.broken_so_far = 0

        # Get current observation
        self.obs_space.update(robot.get_gripper_jpos(),                          # 6
                              robot.get_num_broken_buffer(args.hap_sample),      # 30x1
                              robot.get_main_touch_buffer(args.hap_sample))      # 30x12

        obs = self.obs_space.get_state()

        reward = 0
        done = False
        info = {'slippage': False}
        return obs, reward, done, info
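# Hedged usage sketch (hypothetical caller, not from the source): reset() above
# returns a Gym-style (obs, reward, done, info) tuple, or None when the initial
# grasp fails, so a training loop would normally retry the reset.
def reset_until_grasped(env, max_attempts=10):
    """Call env.reset() until it returns a valid observation tuple."""
    for _ in range(max_attempts):
        ret = env.reset()
        if ret is not None:
            return ret
    raise RuntimeError('grasp_handle() failed on every reset attempt')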
def main(args):
    parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # test_xml = os.path.join(parent, 'tests/test_xmls/case1_flat_-0.6392_0.2779_0_0_2.6210_inf.xml')
    test_xml = os.path.join(parent, 'models/updateVelcro.xml')
    num_success = 0
    num_trial = 0
    total_time = 0
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3)

    ACTIONS = [
        0.06 * np.array([0, 0, 1]), 0.06 * np.array([0, 1, 0]),
        0.06 * np.array([1, 0, 0]), 0.06 * np.array([0, -1, 0]),
        0.06 * np.array([-1, 0, 0])
    ]

    np.set_printoptions(suppress=True)

    # load model and sim parameters
    model = load_model_from_path(test_xml)
    sim = MjSim(model)
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    sim_param = SimParameter(sim)
    robot = RobotSim(sim, viewer, sim_param, args.render, 0.06)

    for i in range(args.num_try):
        num_trial = num_trial + 1
        # try n times for each test case
        sequence = get_action_sequence()
        print('\n\nAction sequence is: {}'.format(sequence))
        result = run_openloop(args, robot, ACTIONS, sequence, 60,
                              (fig, ax1, ax2, ax3))

        if result['done']:
            num_success = num_success + 1
            total_time = total_time + result['time']
            print(
                'Execution succeeded: it took {} steps, trial number is {}, success rate is {}'
                .format(result['time'], num_trial, num_success / num_trial))

    success_rate = float(num_success) / float(args.num_try)
    if num_success > 0:
        ave_time = float(total_time) / float(num_success)
    else:
        ave_time = 0
    print(
        '\n\nTotal number of trials is: {}, success rate is: {}%, average time it took is: {}'
        .format(num_trial, success_rate * 100, ave_time))
    print(
        '=======================================================================================\n\n\n\n\n\n\n'
    )
def main(args):

    if not os.path.isdir(os.path.dirname(args.result_path)):
        os.makedirs(os.path.dirname(args.result_path))

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None

    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)
    velcro_util = VelcroUtil(robot, robot.mj_sim_param)
    tactile_obs_space = TactileObs(
        robot.get_gripper_jpos(),  # 6
        robot.get_all_touch_buffer(args.hap_sample))  # 30 x 12

    traj_logger = TrajLogger(args, robot, velcro_util, tactile_obs_space)

    all_trajectories = []
    performance = {
        'time': [],
        'success': [],
        'action_hist': [0, 0, 0, 0, 0, 0]
    }

    for i in range(args.num_traj):
        velcro_param = init_model(robot.mj_sim)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        performance, expert_traj = traj_logger.test_network(performance)
        if len(expert_traj) > 0:
            all_trajectories.append(expert_traj)

        print('\n\nFinished trajectory {}, sampled {} steps in this episode'.
              format(i, len(expert_traj)))
        geom_type, origin_offset, euler, radius = velcro_param
        print('Velcro parameters are:{} {} {} {}'.format(
            geom_type, origin_offset, euler, radius))
        print(performance)

    print('\nCollected {} successful expert trajectories in total'.format(
        len(all_trajectories)))
    output = {'args': args, 'traj': all_trajectories}
    write_results(args.result_path, output)
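# Assumption (hedged): write_results is a project helper not shown in this
# excerpt; a thin pickle wrapper along these lines would match how it is used
# above. The name write_results_sketch marks it as a hypothetical stand-in.
import pickle

def write_results_sketch(path, data):
    """Serialize a results dict to disk."""
    with open(path, 'wb') as f:
        pickle.dump(data, f)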
def main(args):
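    # NOTE (assumption): test_xml_dir and test_xml_names are module-level
    # globals (the test-case directory and its XML file names) defined outside
    # this excerpt.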
    num_test = len(test_xml_names)
    num_success = 0
    num_trial = 0
    total_time = 0

    print('                    ++++++++++++++++++++++++++')
    print('                    +++ Now running case {} +++'.format(args.case))
    print('                    ++++++++++++++++++++++++++\n\n')

    for i in range(num_test):
        f = test_xml_names[i]
        if 'case{}'.format(args.case) in f:
            # one more trial for this case
            num_trial = num_trial + 1
            # print('Now testing {}'.format(f))

            # load model and sim parameters
            model = load_model_from_path(os.path.join(test_xml_dir, f))
            sim = MjSim(model)
            if args.render:
                viewer = MjViewer(sim)
            else:
                viewer = None
            sim_param = SimParameter(sim)
            robot = RobotSim(sim, viewer, sim_param, args.render, 0.05)

            # try n times for each test case
            action_mag = 0.05
            actions = [
                action_mag * np.array([1, 0, 0, 0, 0, 0]),
                action_mag * np.array([-1, 0, 0, 0, 0, 0]),
                action_mag * np.array([0, 1, 0, 0, 0, 0]),
                action_mag * np.array([0, -1, 0, 0, 0, 0]),
                action_mag * np.array([0, 0, 1, 0, 0, 0]),
                action_mag * np.array([0, 0, -1, 0, 0, 0]),
            ]

            result = run_closeloop(robot, action_mag, actions)

            if result['done']:
                num_success = num_success + 1
                total_time = total_time + result['time']
                print(
                    'Execution succeeded: it took {} steps, trial number is {}, success rate is {}'
                    .format(result['time'], num_trial,
                            num_success / num_trial))
def main(args):
    parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    model_path = os.path.join(parent, 'models/flat_velcro.xml')
    model = load_model_from_path(model_path)
    sim = MjSim(model)
    viewer = MjViewer(sim)
    sim_param = SimParameter(sim)
    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)

    result = []

    num_tests = 4

    for k in range(num_tests):
        robot.reset_simulation()
        # zero the stiffness of the last k * 36 velcro tendons so that a
        # fraction (k * 36 / 216) of the velcro starts out released
        n_release = k * 36
        tendon_idx = sim_param.velcro_tendon_id[len(sim_param.velcro_tendon_id) - n_release:]
        sim.model.tendon_stiffness[tendon_idx] = 0
        # sim.model.body_pos[sim.model._body_name2id['handle']] = np.array([-0.06* (k+1), 0.05, 0.02])
        # sim.model.tendon_range[sim_param.handle_tendon_id,1] = (k+1) *np.array([.061, .061, .061, .061, .061, .061])

        N = 10
        alpha = np.arange(1 / N, 1, 1 / N)
        for i in range(N - 1):
            dP = np.array([cos(alpha[i] * np.pi), 0,
                           sin(alpha[i] * np.pi)]) * args.act_mag
            ret, feedback = run_openloop(robot, dP, args)

            result.append({
                'percent': k * 36 / 216,
                'alpha': alpha[i],
                'feedback': feedback
            })

    # clean feedback data
    for item in result:
        fb = item['feedback']
        fb['touch'] = fb['touch'][0][:, 1:3]
        fb['touch'][:, 1] -= 300
        fb['touch'] = la.norm(fb['touch'], axis=1)
        fb['pose'] = fb['pose'][0][:, [0, 2]]

    with open(args.output_path, 'wb') as file:
        pickle.dump(result, file)
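# Hedged read-back sketch (hypothetical helper): the list of per-trial dicts
# pickled above (keys 'percent', 'alpha', 'feedback') can be reloaded for
# offline analysis like this.
import pickle

def load_sweep_results(path):
    """Load the pickled sweep results written by the main() above."""
    with open(path, 'rb') as f:
        return pickle.load(f)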
def main(args):
    test_xml = os.path.join(PARENT_DIR, 'models/flat_velcro.xml')

    # load model and sim parameters
    model = load_model_from_path(test_xml)
    sim = MjSim(model)
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    sim_param = SimParameter(sim)
    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)

    act_mag = args.act_mag
    act_z = act_mag * np.array([0, 0, 1])
    act_x = act_mag * np.array([1, 0, 0])

    np.set_printoptions(suppress=True)

    fig, axes = plt.subplots(5, 2)

    success = []
    N = args.num_increment
    for i in range(N):
        velcro_params = gripper_util.init_model(robot.mj_sim)

        theta = float(i + 1) / float(N) * 3.14159
        action = math.sin(theta) * act_z + math.cos(theta) * act_x
        result = run_action(args, robot, action)
        print(i, result['done'], result['num_broken'], sim.data.sensordata[:])
        if result['done']:
            success.append(1)
        else:
            success.append(0)

        axes[i // 2, i % 2].plot(result['tactile'][:, 0], 'r')
        axes[i // 2, i % 2].plot(result['tactile'][:, 1], 'g')
        axes[i // 2, i % 2].plot(result['tactile'][:, 2], 'b')
        axes[i // 2, i % 2].set_ylim(-800, 800)
        plt.pause(0.01)
def main(args):
    if not os.path.isdir(args.result_dir):
        os.makedirs(args.result_dir)
    parent = os.path.dirname(os.path.abspath(__file__))
    # load test xml files
    test_file = os.path.join(parent, 'tests/test_xmls/temp_1_{}.pickle'.format(args.case))
    params = pickle.load(open(test_file, 'rb'))
    # params = params[:6]
    if args.shuffle:
        random.shuffle(params)

    num_test = len(params)
    print('                    ++++++++++++++++++++++++++')
    print('                    +++ Now running case {} +++'.format(args.case))
    print('                    ++++++++++++++++++++++++++\n\n')

    policy_net = DRQN(args.indim, args.outdim)
    policy_net.load_state_dict(torch.load(args.weight_path)['policy_net_1'])
    policy_net.eval()

    # load normalizer
    # Setup the state normalizer
    normalizer = Multimodal_Normalizer(num_inputs = args.indim, device=args.device)
    if args.normalizer_file:
        if os.path.exists(args.normalizer_file):
            normalizer.restore_state(args.normalizer_file)

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None

    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)

    tactile_obs_space = TactileObs(
        robot.get_gripper_xpos(),                      # 24
        robot.get_all_touch_buffer(args.hap_sample))   # 30 x 6

    performance = {'time': [], 'success': [], 'num_broken': [],
                   'tendon_hist': [0, 0, 0, 0, 0], 'collision': [],
                   'action_hist': [0, 0, 0, 0, 0, 0]}

    for i in range(num_test):
        velcro_params = params[i]
        geom, origin_offset, euler, radius = velcro_params
        print('\n\nTest {} Velcro parameters are: {}, {}, {}, {}'.format(i, geom, origin_offset, euler, radius))
        change_sim(robot.mj_sim, geom, origin_offset, euler, radius)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        performance = test_network(args, policy_net, normalizer, robot, tactile_obs_space, performance)
        print('Success: {}, time: {}, num_broken: {}, collision:{} '.format(
                performance['success'][-1], performance['time'][-1], performance['num_broken'][-1], performance['collision'][-1]))

    print('Finished opening velcro with haptics test \n')
    success = np.array(performance['success'])
    time = np.array(performance['time'])
    print('Successfully opened the velcro in: {}% of cases'.format(100 * np.sum(success) / len(performance['success'])))
    print('Average time to open: {}'.format(np.average(time[success>0])))
    print('Action histogram for the test is: {}'.format(performance['action_hist']))

    out_fname = 'case{}.txt'.format(args.case)
    with open(os.path.join(args.result_dir, out_fname), 'w+') as f:
        f.write('Time: {}\n'.format(performance['time']))
        f.write('Success: {}\n'.format(performance['success']))
        f.write('Successfully opened the velcro in: {}% of cases\n'.format(100 * np.sum(success) / len(performance['success'])))
        f.write('Average time to open: {}\n'.format(np.average(time[success>0])))
        f.write('Num_broken: {}\n'.format(performance['num_broken']))
        f.write('Tendon histogram: {}\n'.format(performance['tendon_hist']))
        f.write('collision: {}\n'.format(performance['collision']))
        # f.write('high_success: {} low_success: {} '.format(high_success, low_success))
def main(args):

    policy_net = Geom_DQN(args.indim, args.outdim).to(args.device)
    policy_net.load_state_dict(torch.load(args.weight_expert)['policy_net_1'])
    policy_net.eval()

    normalizer = Geom_Normalizer(args.indim, device=args.device)
    normalizer.restore_state(args.norm_expert)

    tactile_normalizer = Multimodal_Normalizer(num_inputs = args.tactile_indim, device=args.device)
    tactile_normalizer.restore_state(args.tactile_normalizer)

    action_space = ActionSpace(dp=0.06, df=10)

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None

    # load all velcro parameters
    model_dir = os.path.dirname(args.model_path)
    param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    velcro_params = pickle.load(open(param_path, 'rb'))

    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)
    velcro_util = VelcroUtil(robot, robot.mj_sim_param)

    traj_logger = TrajLogger(args, robot, velcro_util, policy_net, normalizer, tactile_normalizer)

    all_trajectories = []
    all_success = [None for i in range(len(velcro_params))]
    all_time = [None for i in range(len(velcro_params))]

    for i in range(len(velcro_params)):
        geom_type, origin_offset, euler, radius = velcro_params[i]
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        performance = {'time':[], 'success':[], 'action_hist':[0,0,0,0,0,0]}
        min_time = args.max_iter
        all_trajectories.append(None)
    
        for j in range(args.num_try):
            robot.reset_simulation()
            ret = robot.grasp_handle()
            performance, expert_traj = traj_logger.test_network(performance)
            if performance['time'][-1] <= min_time:
                all_trajectories[i] = expert_traj
                min_time = performance['time'][-1]
                all_success[i] = performance['success'][j]
                all_time[i] = performance['time'][-1]


        print('\n\nFinished trajectory {}'.format(i))
        print('Velcro parameters are:{} {} {} {}'.format(geom_type, origin_offset, euler, radius))
        print(performance)
        success = np.array(performance['success'])
        time = np.array(performance['time'])
        print('Successfully opened the velcro in: {}% of cases'.format(100 * np.sum(success) / len(performance['success'])))
        print('Average time to open: {}'.format(np.average(time[success>0])))
        print('Action histogram for the test is: {}'.format(performance['action_hist']))

    print('\nCollected {} successful expert trajectories in total'.format(np.sum(np.array(all_success))))
    print('Total success and time: {}, {}'.format(all_success, all_time))
    output = {'args': args, 'traj': all_trajectories, 'success': all_success, 'all_time': all_time}
    output_path = os.path.join(args.result_dir, 'oneshot_expert_traj.pkl')
    write_results(output_path, output)
def main(args):
    if not os.path.isdir(args.result_dir):
        os.makedirs(args.result_dir)

    conv_net = ConvNet(args.outdim, args.depth).to(args.device)
    if os.path.exists(args.weight_convnet):
        checkpoint = torch.load(args.weight_convnet)
        conv_net.load_state_dict(checkpoint['conv_net'])

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)
    # load all velcro parameters
    # model_dir = os.path.dirname(args.model_path)
    # param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    param_path = '/home/jc/research/corl2019_learningHaptics/tests/test_xmls/case_{}.pickle'.format(
        args.case)
    velcro_params = pickle.load(open(param_path, 'rb'))
    if args.shuffle:
        random.shuffle(velcro_params)

    velcro_util = VelcroUtil(robot, sim_param)
    state_space = Observation(
        robot.get_gripper_jpos(),  # 6
        velcro_util.break_center(),  # 6
        velcro_util.break_norm())
    action_space = ActionSpace(dp=0.06, df=10)
    performance = {
        'time': [],
        'success': [],
        'num_broken': [],
        'tendon_hist': [0, 0, 0, 0, 0]
    }

    for n, item in enumerate(velcro_params):
        geom_type, origin_offset, euler, radius = item
        print('\n\nTest {} Velcro parameters are: {}, {}, {}, {}'.format(
            n, geom_type, origin_offset, euler, radius))
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        broken_so_far = 0

        # ax.clear()

        for t in range(args.max_iter):
            # take an image and predict the normal direction
            # Get image and normalize it
            img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
            if args.depth:
                depth = norm_depth(img[1])
                img = norm_img(img[0])
                img_norm = np.empty((4, args.img_w, args.img_h))
                img_norm[:3, :, :] = img
                img_norm[3, :, :] = depth
            else:
                img_norm = norm_img(img)

            torch_img = torch.from_numpy(img_norm).float().to(
                args.device).unsqueeze(0)
            pred = conv_net.forward(torch_img).detach().cpu()
            fl_norm = pred[0][3:6].numpy()
            break_dir_norm = pred[0][6:9].numpy()
            # normalize these vectors
            fl_norm = fl_norm / la.norm(fl_norm)
            break_dir_norm = break_dir_norm / la.norm(break_dir_norm)

            ################ choose action and get action direction vector ################
            action_direction = args.act_mag * (-0.5 * fl_norm +
                                               0.5 * break_dir_norm)
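            # NOTE (assumption): ACTIONS and action_vec are assumed to be
            # module-level arrays of candidate action directions defined
            # outside this excerpt; the dot products below pick the discrete
            # action best aligned with the desired pull direction.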
            action_key = (action_vec @ action_direction).argmax()
            action_direction = action_space.get_action(
                ACTIONS[action_key])['delta'][:3]

            gripper_pose = robot.get_gripper_jpos()[:3]

            # Perform action
            target_position = np.add(robot.get_gripper_jpos()[:3],
                                     action_direction)
            target_pose = np.hstack(
                (target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose, True, 300, hap_sample=30)

            # check tendons and slippage
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num

            if done or failure:
                ratio_broken = float(num) / float(NUM_TENDON)
                if ratio_broken < 0.2:
                    performance['tendon_hist'][0] += 1
                elif ratio_broken >= 0.2 and ratio_broken < 0.4:
                    performance['tendon_hist'][1] += 1
                elif ratio_broken >= 0.4 and ratio_broken < 0.6:
                    performance['tendon_hist'][2] += 1
                elif ratio_broken >= 0.6 and ratio_broken < 0.8:
                    performance['tendon_hist'][3] += 1
                else:
                    performance['tendon_hist'][4] += 1
                performance['num_broken'].append(num)
                if done:
                    performance['success'].append(1)
                    performance['time'].append(t + 1)
                if failure:
                    performance['success'].append(0)
                    performance['time'].append(t + 1)
                break

            if t == args.max_iter - 1:

                ################## exceed max iterations ####################
                performance['success'].append(0)
                performance['time'].append(args.max_iter)
                ratio_broken = float(num) / float(NUM_TENDON)
                performance['num_broken'].append(num)
                if ratio_broken < 0.2:
                    performance['tendon_hist'][0] += 1
                elif ratio_broken >= 0.2 and ratio_broken < 0.4:
                    performance['tendon_hist'][1] += 1
                elif ratio_broken >= 0.4 and ratio_broken < 0.6:
                    performance['tendon_hist'][2] += 1
                elif ratio_broken >= 0.6 and ratio_broken < 0.8:
                    performance['tendon_hist'][3] += 1
                else:
                    performance['tendon_hist'][4] += 1

        # print episode performance
        print('Success: {}, time: {}, num_broken: {} '.format(
            performance['success'][-1], performance['time'][-1],
            performance['num_broken'][-1]))

    print('Finished opening velcro with haptics test \n')
    success = np.array(performance['success'])
    time = np.array(performance['time'])
    print('Successfully opened the velcro in: {}% of cases'.format(
        100 * np.sum(success) / len(performance['success'])))
    print('Average time to open: {}'.format(np.average(time[success > 0])))

    out_fname = 'vision_case{}.txt'.format(args.case)
    with open(os.path.join(args.result_dir, out_fname), 'w+') as f:
        f.write('Time: {}\n'.format(performance['time']))
        f.write('Success: {}\n'.format(performance['success']))
        f.write('Successfully opened the velcro in: {}% of cases\n'.format(
            100 * np.sum(success) / len(performance['success'])))
        f.write('Average time to open: {}\n'.format(
            np.average(time[success > 0])))
        f.write('Num_broken: {}\n'.format(performance['num_broken']))
        f.write('Tendon histogram: {}\n'.format(performance['tendon_hist']))
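# Hedged refactoring sketch (hypothetical helper, not part of the source): the
# 20%-wide bucketing above appears twice as long elif chains; a helper like
# this would cover both sites, e.g.
#     performance['tendon_hist'][tendon_bucket(num, NUM_TENDON)] += 1
def tendon_bucket(num_broken, num_tendon):
    """Map a broken-tendon count to one of five 20%-wide histogram buckets."""
    ratio = float(num_broken) / float(num_tendon)
    for bucket, upper in enumerate((0.2, 0.4, 0.6, 0.8)):
        if ratio < upper:
            return bucket
    return 4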
def main(args):
    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)
    # load all velcro parameters
    model_dir = os.path.dirname(args.model_path)
    param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    velcro_params = pickle.load(open(param_path, 'rb'))

    velcro_util = VelcroUtil(robot, sim_param)
    state_space = Observation(
        robot.get_gripper_jpos(),  # 6
        velcro_util.break_center(),  # 6
        velcro_util.break_norm())
    action_space = ActionSpace(dp=0.06, df=10)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.clear()

    for item in VELCRO_PARAMS:
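        # NOTE (assumption): VELCRO_PARAMS (a module-level list of velcro
        # configurations) and lines() (a plotting helper that returns X, Y, Z
        # arrays for a short segment along a direction) are defined outside
        # this excerpt.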
        geom_type, origin_offset, euler, radius = item
        print('Velcro parameters are: {}, {}, {}, {}'.format(
            geom_type, origin_offset, euler, radius))
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        broken_so_far = 0

        ax.clear()

        for i in range(args.max_iter):
            # check velcro breakline geometry
            norms = velcro_util.break_norm()
            centers = velcro_util.break_center()
            fl_center = centers[:3]
            fs_center = centers[3:]
            fl_norm = norms[:3]
            fs_norm = norms[3:6]
            break_dir_norm = norms[6:9]

            action_direction = args.act_mag * (-0.5 * fl_norm +
                                               0.5 * break_dir_norm)

            gripper_pose = robot.get_gripper_jpos()[:3]

            # Perform action
            target_position = np.add(robot.get_gripper_jpos()[:3],
                                     action_direction)
            target_pose = np.hstack(
                (target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose, True, 300, hap_sample=30)

            ax.scatter(fl_center[0],
                       fl_center[1],
                       fl_center[2],
                       c='r',
                       marker='o')
            ax.scatter(fs_center[0],
                       fs_center[1],
                       fs_center[2],
                       c='r',
                       marker='o')
            X, Y, Z = lines(fl_center, fl_norm)
            ax.plot(X, Y, Z, c='b')
            X, Y, Z = lines(fs_center, fs_norm)
            ax.plot(X, Y, Z, c='b')

            # plot gripper position and action direction
            ax.scatter(gripper_pose[0],
                       gripper_pose[1],
                       gripper_pose[2],
                       c='r',
                       marker='v')
            X, Y, Z = lines(gripper_pose, action_direction / args.act_mag)
            ax.plot(X, Y, Z, c='g')
            plt.pause(0.001)

            # check tendons and slippage
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num

            if done or failure:
                break
    def train_MDP(self):
        args = self.args
        # Create the output directory if it does not exist
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)

        # Create our policy net and a target net
        self.policy_net_1 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DQN(args.indim, args.outdim).to(args.device)
        self.policy_net_3 = DQN(args.indim, args.outdim).to(args.device)

        self.target_net = DQN(args.indim, args.outdim).to(args.device)
        self.target_net.load_state_dict(self.policy_net_1.state_dict())
        self.target_net.eval()

        # Set up the optimizer
        self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                         args.lr)
        self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                         args.lr)
        self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters(),
                                         args.lr)
        self.memory = ReplayMemory(500000)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Normalizer(args.indim, device=args.device)
        print_variables = {'durations': [], 'rewards': [], 'loss': []}

        # Load old checkpoint if provided
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(
                    checkpoint['model_state_dict'])
                self.policy_net_2.load_state_dict(
                    checkpoint['model_state_dict'])
                self.policy_net_3.load_state_dict(
                    checkpoint['model_state_dict'])
                self.target_net.load_state_dict(checkpoint['model_state_dict'])
                start_episode = checkpoint['epoch']
                self.steps_done = start_episode
                self.optimizer_1.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                self.optimizer_2.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                self.optimizer_3.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_geom_mdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        action_space = ActionSpace(dp=0.06, df=10)

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            if args.sim:
                # Create robot, reset simulation and grasp handle
                model, model_params = init_model(args.model_path)
                sim = MjSim(model)
                sim.step()
                if args.render:
                    viewer = MjViewer(sim)
                else:
                    viewer = None

                sim_param = SimParameter(sim)
                robot = RobotSim(sim, viewer, sim_param, args.render,
                                 self.breaking_threshold)
                robot.reset_simulation()
                ret = robot.grasp_handle()
                if not ret:
                    continue

                # Get current state
                state_space = Observation(
                    robot.get_gripper_jpos(),
                    robot.get_shear_buffer(args.hap_sample),
                    robot.get_all_touch_buffer(args.hap_sample))
                broken_so_far = 0

            for t in count():
                if not args.quiet and t % 20 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                state = state_space.get_state()
                if args.position:
                    state = state[6:]
                if args.shear:
                    indices = np.ones(len(state), dtype=bool)
                    indices[6:166] = False
                    state = state[indices]
                if args.force:
                    state = state[:166]
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = self.select_action(state)

                # Perform action
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(state_space.get_current_position(),
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))

                if args.sim:
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Get reward
                    done, num = robot.update_tendons()
                    failure = robot.check_slippage()
                    if num > broken_so_far:
                        reward = num - broken_so_far
                        broken_so_far = num
                    else:
                        reward = 0

                    # # Add a movement reward
                    # reward -= 0.1 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)

                    # Observe new state
                    state_space.update(
                        robot.get_gripper_jpos(),
                        robot.get_shear_buffer(args.hap_sample),
                        robot.get_all_touch_buffer(args.hap_sample))

                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                # Check if done
                if not done and not failure:
                    next_state = state_space.get_state()
                    if args.position:
                        next_state = next_state[6:]
                    if args.shear:
                        indices = np.ones(len(next_state), dtype=bool)
                        indices[6:166] = False
                        next_state = next_state[indices]
                    if args.force:
                        next_state = next_state[:166]
                    normalizer.observe(next_state)
                    next_state = normalizer.normalize(next_state)
                else:
                    next_state = None

                # Push new Transition into memory
                self.memory.push(state, action, next_state, reward)

                # Optimize the model
                loss = self.optimize_model()
                #        if loss:
                #            print_variables['loss'].append(loss.item())

                # If we are done, reset the model
                if done or failure:
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   'Training MDP')
                    print("Model parameters: {}".format(model_params))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))
                    break

            # Update the target network, every x iterations
            if ii % 10 == 0:
                self.target_net.load_state_dict(self.policy_net_1.state_dict())

            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    args.output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epoch': ii,
                        'model_state_dict': self.target_net.state_dict(),
                        'optimizer_state_dict': self.optimizer_1.state_dict(),
                    }, save_path)

        # Save normalizer state for inference
        normalizer.save_state(
            os.path.join(args.output_dir, 'normalizer_state.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables
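# Hedged sketch (hypothetical helper): train_MDP above slices the flat state
# vector according to args.position / args.shear / args.force. The layout the
# slicing implies is [6 gripper pose values, 160 shear samples, remaining
# touch samples]; this helper just makes that masking explicit, applying the
# flags sequentially exactly as the training loop does.
import numpy as np

def mask_state_sketch(state, drop_position=False, drop_shear=False, force_only=False):
    state = np.asarray(state)
    if drop_position:
        state = state[6:]
    if drop_shear:
        keep = np.ones(len(state), dtype=bool)
        keep[6:166] = False
        state = state[keep]
    if force_only:
        state = state[:166]
    return state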
# baseline demo for peeling velcro with the paraGripper
import math
import os
import sys

import numpy as np
import matplotlib.pyplot as plt

PARENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PARENT_DIR)
from utils.gripper_util import *
from sim_param import SimParameter

from robot_sim import RobotSim


model_path = os.path.join(PARENT_DIR, 'models/flat_velcro.xml')
model, sim, viewer = load_model(model_path)
sim_param = SimParameter(sim)
robot = RobotSim(sim, viewer, sim_param, True, 0.06)
np.set_printoptions(suppress=True)

for i in range(3):
    change_sim(sim, 'flat', [0.2, -0.5, 0], [0, 0.3, 0.5])
    robot.reset_simulation()
    robot.grasp_handle()
    
    sim.data.ctrl[sim_param.gripper_pos_ctrl_id] = np.array([0, 0, 0.5, 0, 0, 0])
    sim.data.ctrl[sim_param.gripper_vel_ctrl_id] = np.array([0, 0, 0.5, 0, 0, 0])
    for step in range(1000):
        robot.mj_simulation_step()
        viewer.render()

    for j in range(20000):
    def train_POMDP(self):
        args = self.args
        # Create the output directory if it does not exist
        if not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)

        # Create our policy net and a target net
        self.policy_net_1 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.indim, args.outdim).to(args.device)
        self.conv_net_1 = ConvNet(args.ftdim, args.depth).to(args.device)
        self.conv_net_2 = ConvNet(args.ftdim, args.depth).to(args.device)

        # Set up the optimizer
        self.policy_optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                                lr=args.lr)
        self.policy_optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                                lr=args.lr)
        self.conv_optimizer_1 = optim.RMSprop(self.conv_net_1.parameters(),
                                              lr=1e-5)
        self.conv_optimizer_2 = optim.RMSprop(self.conv_net_2.parameters(),
                                              lr=1e-5)
        self.memory = RecurrentMemory(70)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim - args.ftdim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.conv_net_1.load_state_dict(checkpoint['conv_net_1'])
                self.conv_net_2.load_state_dict(checkpoint['conv_net_2'])
                self.policy_optimizer_1.load_state_dict(
                    checkpoint['policy_optimizer_1'])
                self.policy_optimizer_2.load_state_dict(
                    checkpoint['policy_optimizer_2'])
                self.conv_optimizer_1.load_state_dict(
                    checkpoint['conv_optimizer_1'])
                self.conv_optimizer_2.load_state_dict(
                    checkpoint['conv_optimizer_2'])
                start_episode = checkpoint['epochs']
                self.steps_done = checkpoint['steps_done']
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        if args.weight_conv:
            checkpoint = torch.load(args.weight_conv)
            self.conv_net_1.load_state_dict(checkpoint['conv_net'])
            self.conv_optimizer_1.load_state_dict(checkpoint['conv_optimizer'])
            self.conv_net_2.load_state_dict(checkpoint['conv_net'])
            self.conv_optimizer_2.load_state_dict(checkpoint['conv_optimizer'])

        action_space = ActionSpace(dp=0.06, df=10)

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)
        tactile_obs_space = TactileObs(
            robot.get_gripper_jpos(),  # 6
            robot.get_all_touch_buffer(args.hap_sample))  # 30 x 12

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            act_sequence = []
            velcro_params = init_model(robot.mj_sim)
            robot.reset_simulation()
            ret = robot.grasp_handle()
            if not ret:
                continue

            # Local memory for current episode
            localMemory = []

            # Get current observation
            hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                batch_size=1, device=args.device)
            hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                batch_size=1, device=args.device)

            broken_so_far = 0

            for t in count():
                if not args.quiet and t % 50 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                tactile_obs = tactile_obs_space.get_state()
                normalizer.observe(tactile_obs)
                tactile_obs = normalizer.normalize(tactile_obs)
                # Get image and normalize it
                img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
                if args.depth:
                    depth = norm_depth(img[1])
                    img = norm_img(img[0])
                    img_norm = np.empty((4, args.img_w, args.img_h))
                    img_norm[:3, :, :] = img
                    img_norm[3, :, :] = depth
                else:
                    img_norm = norm_img(img)

                observation = [tactile_obs, img_norm]
                action, hidden_state_1, cell_state_1 = self.select_action(
                    observation, hidden_state_1, cell_state_1)

                # record actions in this epoch
                act_sequence.append(action)

                # Perform action
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(robot.get_gripper_jpos()[:3],
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))
                robot.move_joint(target_pose,
                                 True,
                                 self.gripping_force,
                                 hap_sample=args.hap_sample)

                # Get reward
                done, num = robot.update_tendons()
                failure = robot.check_slippage()
                if num > broken_so_far:
                    reward = num - broken_so_far
                    broken_so_far = num
                else:
                    reward = 0

                # Observe new state
                tactile_obs_space.update(
                    robot.get_gripper_jpos(),  # 6
                    robot.get_all_touch_buffer(args.hap_sample))  # 30x12

                # Set max number of iterations
                if t >= self.max_iter:
                    done = True

                # Check if done
                if not done and not failure:
                    next_tactile_obs = tactile_obs_space.get_state()
                    normalizer.observe(next_tactile_obs)
                    next_tactile_obs = normalizer.normalize(next_tactile_obs)
                    # Get image and normalize it
                    next_img = robot.get_img(args.img_w, args.img_h, 'c1',
                                             args.depth)
                    if args.depth:
                        next_depth = norm_depth(next_img[1])
                        next_img = norm_img(next_img[0])
                        next_img_norm = np.empty((4, args.img_w, args.img_h))
                        next_img_norm[:3, :, :] = next_img
                        next_img_norm[3, :, :] = next_depth
                    else:
                        next_img_norm = norm_img(next_img)
                    next_state = [next_tactile_obs, next_img_norm]
                else:
                    next_state = None

                # Push new Transition into memory
                localMemory.append(
                    Transition(observation, action, next_state, reward))

                # Optimize the model
                if self.steps_done % 10 == 0:
                    self.optimize()

                # If we are done, reset the model
                if done or failure:
                    self.memory.push(localMemory)
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(velcro_params))
                    print("Actions in this epoch are: {}".format(act_sequence))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save checkpoints every few iterations
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    args.output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'conv_net_1': self.conv_net_1.state_dict(),
                        'conv_net_2': self.conv_net_2.state_dict(),
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'conv_optimizer_1': self.conv_optimizer_1.state_dict(),
                        'conv_optimizer_2': self.conv_optimizer_2.state_dict(),
                        'policy_optimizer_1':
                        self.policy_optimizer_1.state_dict(),
                        'policy_optimizer_2':
                        self.policy_optimizer_2.state_dict(),
                    }, save_path)

        # Save normalizer state for inference
        normalizer.save_state(
            os.path.join(args.output_dir, 'normalizer_state.pickle'))

        self.memory.save_memory(os.path.join(args.output_dir, 'memory.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables
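# Assumption (hedged): Transition is defined elsewhere in the project; a plain
# namedtuple with these fields would match the way it is constructed above
# (observation, action, next_state, reward). Shown only as a sketch of the
# expected structure.
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))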
def main(args):
    test_xml_names = []
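    # NOTE (assumption): parent is a module-level path (the repository root)
    # defined outside this excerpt, as in the other test scripts above.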
    test_xml_dir = os.path.join(parent, 'tests/test_xmls/')
    for file in os.listdir(test_xml_dir):
        if 'case' in file:
            test_xml_names.append(file)

    np.set_printoptions(suppress=True)

    num_test = len(test_xml_names)
    num_success = 0
    num_trial = 0
    total_time = 0

    act_mag = args.act_mag
    actions = [
        act_mag * np.array([1, 0, 0, 0, 0, 0]),
        act_mag * np.array([-1, 0, 0, 0, 0, 0]),
        act_mag * np.array([0, 1, 0, 0, 0, 0]),
        act_mag * np.array([0, -1, 0, 0, 0, 0]),
        act_mag * np.array([0, 0, 1, 0, 0, 0]),
        act_mag * np.array([0, 0, -1, 0, 0, 0])
    ]

    tendon_histogram = [0, 0, 0, 0]

    print('                    ++++++++++++++++++++++++++')
    print('                    +++ Now running case {} +++'.format(args.case))
    print('                    +++ Num try per case {} +++'.format(
        args.num_try))
    print('                    ++++++++++++++++++++++++++\n\n')

    for i in range(num_test):
        f = test_xml_names[i]
        if 'case{}'.format(args.case) in f:
            # one more trial for this case
            num_trial = num_trial + 1
            # print('Now testing {}'.format(f))

            # load model and sim parameters
            model = load_model_from_path(os.path.join(test_xml_dir, f))
            sim = MjSim(model)
            if args.render:
                viewer = MjViewer(sim)
            else:
                viewer = None
            sim_param = SimParameter(sim)
            robot = RobotSim(sim, viewer, sim_param, args.render,
                             args.break_thresh)

            # try n times for each test case
            for j in range(args.num_try):
                result = run_reward_greedy(args, robot, actions)
                print('Total number of broken tendon is: {}'.format(
                    result['num_broken']))
                ratio_broken = float(result['num_broken']) / float(
                    args.num_tendon)
                if ratio_broken < 0.25:
                    tendon_histogram[0] += 1
                elif ratio_broken >= 0.25 and ratio_broken < 0.5:
                    tendon_histogram[1] += 1
                elif ratio_broken >= 0.5 and ratio_broken < 0.75:
                    tendon_histogram[2] += 1
                else:
                    tendon_histogram[3] += 1

                if result['done']:
                    num_success = num_success + 1
                    total_time = total_time + result['time']
                    print(
                        'Execution succeeded: it took {} steps, trial number is {}, success rate is {}'
                        .format(result['time'], num_trial,
                                num_success / num_trial))
                    break
    def train_POMDP(self):
        args = self.args
        ROOT_DIR = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))  # corl2019
        PARENT_DIR = os.path.dirname(ROOT_DIR)  # research
        # Create the output directory if it does not exist
        output_dir = os.path.join(PARENT_DIR, 'multistep_pomdp',
                                  args.output_dir)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # write args to file
        with open(os.path.join(output_dir, 'args.txt'), 'w+') as f:
            json.dump(args.__dict__, f, indent=2)

        # Create our policy net and a target net
        self.policy_net_1 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.indim, args.outdim).to(args.device)
        self.policy_net_3 = DRQN(args.indim, args.outdim).to(args.device)

        # Set up the optimizer
        self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters())
        self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters())
        self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters())
        self.memory = RecurrentMemory(800)
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        if args.checkpoint_file:
            if os.path.exists(args.checkpoint_file):
                checkpoint = torch.load(args.checkpoint_file)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.policy_net_3.load_state_dict(checkpoint['policy_net_3'])
                self.optimizer_1.load_state_dict(checkpoint['optimizer_1'])
                self.optimizer_2.load_state_dict(checkpoint['optimizer_2'])
                self.optimizer_3.load_state_dict(checkpoint['optimizer_3'])
                start_episode = checkpoint['epochs']
                self.steps_done = checkpoint['steps_done']
                with open(
                        os.path.join(os.path.dirname(args.checkpoint_file),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        action_space = ActionSpace(dp=0.06, df=10)

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)

        # Main training loop
        for ii in range(start_episode, args.epochs):
            start_time = time.time()
            self.steps_done += 1
            act_sequence = []
            if args.sim:
                sim_params = init_model(robot.mj_sim)
                robot.reset_simulation()
                ret = robot.grasp_handle()
                if not ret:
                    continue

                # Local memory for current episode
                localMemory = []

                # Get current observation
                hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                    batch_size=1, device=args.device)
                hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                    batch_size=1, device=args.device)
                hidden_state_3, cell_state_3 = self.policy_net_3.init_hidden_states(
                    batch_size=1, device=args.device)
                observation_space = TactileObs(
                    robot.get_gripper_xpos(),  # 24
                    robot.get_all_touch_buffer(args.hap_sample))  # 30 x 7
                broken_so_far = 0

            for t in count():
                if not args.quiet and t % 50 == 0:
                    print("Running training episode: {}, iteration: {}".format(
                        ii, t))

                # Select action
                observation = observation_space.get_state()
                if args.position:
                    observation = observation[6:]
                if args.shear:
                    indices = np.ones(len(observation), dtype=bool)
                    indices[6:166] = False
                    observation = observation[indices]
                if args.force:
                    observation = observation[:166]
                normalizer.observe(observation)
                observation = normalizer.normalize(observation)
                action, hidden_state_1, cell_state_1 = self.select_action(
                    observation, hidden_state_1, cell_state_1)

                # record actions in this epoch
                act_sequence.append(action)

                # Perform action
                delta = action_space.get_action(
                    self.ACTIONS[action])['delta'][:3]
                target_position = np.add(robot.get_gripper_jpos()[:3],
                                         np.array(delta))
                target_pose = np.hstack(
                    (target_position, robot.get_gripper_jpos()[3:]))

                if args.sim:
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Get reward
                    done, num = robot.update_tendons()
                    failure = robot.check_slippage()
                    if num > broken_so_far:
                        reward = num - broken_so_far
                        broken_so_far = num
                    else:
                        reward = 0

                    # # Add a movement reward
                    # reward -= 0.05 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)

                    # Observe new state
                    observation_space.update(
                        robot.get_gripper_xpos(),  # 24
                        robot.get_all_touch_buffer(args.hap_sample))  # 30x7

                # Force episode termination once max_iter steps have elapsed
                if t >= self.max_iter:
                    done = True

                # Build the next state unless the episode terminated
                # (success, timeout, or slippage)
                if not done and not failure:
                    next_state = observation_space.get_state()
                    if args.position:
                        next_state = next_state[6:]
                    if args.shear:
                        indices = np.ones(len(next_state), dtype=bool)
                        indices[6:166] = False
                        next_state = next_state[indices]
                    if args.force:
                        next_state = next_state[:166]
                    normalizer.observe(next_state)
                    next_state = normalizer.normalize(next_state)
                else:
                    next_state = None

                # Push new Transition into memory
                localMemory.append(
                    Transition(observation, action, next_state, reward))

                # Optimize the model every 10 environment steps
                if t % 10 == 0:
                    loss = self.optimize_model()
                    # if loss:
                    #     print_variables['loss'].append(loss.item())

                # If the episode ended, store it and reset for the next one
                if done or failure:
                    self.memory.push(localMemory)
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(sim_params))
                    print("Actions in this epoch are: {}".format(act_sequence))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save a checkpoint every few episodes
            if ii % args.save_freq == 0:
                save_path = os.path.join(
                    output_dir, 'checkpoint_model_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'policy_net_3': self.policy_net_3.state_dict(),
                        'optimizer_1': self.optimizer_1.state_dict(),
                        'optimizer_2': self.optimizer_2.state_dict(),
                        'optimizer_3': self.optimizer_3.state_dict(),
                    }, save_path)

            self.memory.save_memory(os.path.join(output_dir, 'memory.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables

    def train_POMDP(self):
        args = self.args
        ROOT_DIR = os.path.dirname(os.path.dirname(
            os.path.abspath(__file__)))  # corl2019
        PARENT_DIR = os.path.dirname(ROOT_DIR)  # research
        # Create the output directory if it does not exist
        output_dir = os.path.join(PARENT_DIR, 'multistep_pomdp',
                                  args.output_dir)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # write args to file
        with open(os.path.join(output_dir, 'args.txt'), 'w+') as f:
            json.dump(args.__dict__, f, indent=2)

        # Create the two recurrent Q-networks (DRQN) and their tactile encoders
        self.policy_net_1 = DRQN(args.ftdim, args.outdim).to(args.device)
        self.policy_net_2 = DRQN(args.ftdim, args.outdim).to(args.device)
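        # The tactile encoder input width matches the masked observation built
        # below: the position flag drops the first 6 entries, while the force
        # flag keeps args.indim - 390 entries (presumably the 6 force channels).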
        if args.position:
            self.tactile_net_1 = TactileNet(args.indim - 6,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim - 6,
                                            args.ftdim).to(args.device)
        elif args.force:
            self.tactile_net_1 = TactileNet(args.indim - 390,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim - 390,
                                            args.ftdim).to(args.device)
        else:
            self.tactile_net_1 = TactileNet(args.indim,
                                            args.ftdim).to(args.device)
            self.tactile_net_2 = TactileNet(args.indim,
                                            args.ftdim).to(args.device)

        # Set up the optimizer
        self.policy_optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(),
                                                lr=args.lr)
        self.policy_optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(),
                                                lr=args.lr)
        self.tactile_optimizer_1 = optim.RMSprop(
            self.tactile_net_1.parameters(), lr=args.lr)
        self.tactile_optimizer_2 = optim.RMSprop(
            self.tactile_net_2.parameters(), lr=args.lr)
        self.memory = RecurrentMemory(800)
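        # Replay memory holds up to 800 episodes; each episode is pushed as a
        # whole trajectory so the recurrent nets can be trained on sequences.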
        self.steps_done = 0

        # Setup the state normalizer
        normalizer = Multimodal_Normalizer(num_inputs=args.indim,
                                           device=args.device)

        print_variables = {'durations': [], 'rewards': [], 'loss': []}
        start_episode = 0
        if args.weight_policy:
            if os.path.exists(args.weight_policy):
                checkpoint = torch.load(args.weight_policy)
                self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
                self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
                self.policy_optimizer_1.load_state_dict(
                    checkpoint['policy_optimizer_1'])
                self.policy_optimizer_2.load_state_dict(
                    checkpoint['policy_optimizer_2'])
                start_episode = checkpoint['epochs']
                self.steps_done = checkpoint['steps_done']
                with open(
                        os.path.join(os.path.dirname(args.weight_policy),
                                     'results_pomdp.pkl'), 'rb') as file:
                    plot_dict = pickle.load(file)
                    print_variables['durations'] = plot_dict['durations']
                    print_variables['rewards'] = plot_dict['rewards']

        if args.normalizer_file:
            if os.path.exists(args.normalizer_file):
                normalizer.restore_state(args.normalizer_file)

        if args.memory:
            if os.path.exists(args.memory):
                self.memory.load(args.memory)

        if args.weight_tactile:
            checkpoint = torch.load(args.weight_tactile)
            self.tactile_net_1.load_state_dict(checkpoint['tactile_net_1'])
            self.tactile_optimizer_1.load_state_dict(
                checkpoint['tactile_optimizer_1'])
            self.tactile_net_2.load_state_dict(checkpoint['tactile_net_2'])
            self.tactile_optimizer_2.load_state_dict(
                checkpoint['tactile_optimizer_2'])

        action_space = ActionSpace(dp=0.06, df=10)
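        # dp: 0.06 m Cartesian step per primitive; df: force increment
        # (assumed; only the translation component is used below).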

        # Create robot, reset simulation and grasp handle
        model = load_model_from_path(args.model_path)
        sim = MjSim(model)
        sim_param = SimParameter(sim)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None

        robot = RobotSim(sim, viewer, sim_param, args.render,
                         self.break_threshold)

        tactile_obs_space = TactileObs(
            robot.get_gripper_xpos(),  # 24
            robot.get_all_touch_buffer(args.hap_sample))  # 30 x 6

        # Main training loop
        for ii in range(start_episode, args.epochs):
            self.steps_done += 1
            start_time = time.time()
            act_sequence = []
            act_length = []
            velcro_params = init_model(robot.mj_sim)
            robot.reset_simulation()
            ret = robot.grasp_handle()
            if not ret:
                continue

            # Local memory for current episode
            localMemory = []

            # Initialise recurrent hidden states and episode bookkeeping
            hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
                batch_size=1, device=args.device)
            hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
                batch_size=1, device=args.device)

            broken_so_far = 0

            # pick a random action initially
            action = random.randrange(0, 5)
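            # Index into self.ACTIONS (five translation primitives); the first
            # macro-step is random because no stacked observation exists yet.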
            current_state = None
            next_state = None

            t = 0

            while t < args.max_iter:
                if not args.quiet and t == 0:
                    print("Running training episode: {}".format(ii, t))

                if args.position:
                    multistep_obs = np.empty((0, args.indim - 6))
                elif args.force:
                    multistep_obs = np.empty((0, args.indim - 390))
                else:
                    multistep_obs = np.empty((0, args.indim))

                prev_action = action

                for k in range(args.len_ub):
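                    # Macro-action: repeat the chosen primitive up to
                    # args.len_ub times, stacking one tactile observation
                    # per repetition.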
                    # Observe tactile features and stack them
                    tactile_obs = tactile_obs_space.get_state()
                    normalizer.observe(tactile_obs)
                    tactile_obs = normalizer.normalize(tactile_obs)

                    if args.position:
                        tactile_obs = tactile_obs[6:]
                    elif args.force:
                        tactile_obs = tactile_obs[:6]

                    multistep_obs = np.vstack((multistep_obs, tactile_obs))

                    # Gripper position before this primitive (used to measure
                    # the achieved displacement below)
                    current_pos = robot.get_gripper_jpos()[:3]

                    # Perform action
                    delta = action_space.get_action(
                        self.ACTIONS[action])['delta'][:3]
                    target_position = np.add(robot.get_gripper_jpos()[:3],
                                             np.array(delta))
                    target_pose = np.hstack(
                        (target_position, robot.get_gripper_jpos()[3:]))
                    robot.move_joint(target_pose,
                                     True,
                                     self.gripping_force,
                                     hap_sample=args.hap_sample)

                    # Observe new state
                    tactile_obs_space.update(
                        robot.get_gripper_xpos(),  # 24
                        robot.get_all_touch_buffer(args.hap_sample))  # 30x6

                    displacement = la.norm(robot.get_gripper_jpos()[:3] -
                                           current_pos)

                    if displacement / 0.06 < 0.7:
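                        # Less than 70% of the commanded 0.06 m step was
                        # achieved, so the motion is blocked; stop repeating
                        # this primitive.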
                        break

                # Feed the stitched multi-step tactile observation into the
                # tactile net to generate a tactile feature and select the next action
                action, hidden_state_1, cell_state_1 = self.select_action(
                    multistep_obs, hidden_state_1, cell_state_1)

                if t == 0:
                    next_state = multistep_obs.copy()
                else:
                    current_state = next_state.copy()
                    next_state = multistep_obs.copy()

                # record actions in this epoch
                act_sequence.append(prev_action)
                act_length.append(k)

                # Get reward
                done, num = robot.update_tendons()
                failure = robot.check_slippage()
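                # +1 reward per newly broken tendon; slipping off the handle is
                # penalised with -20 and ends the episode.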
                if num > broken_so_far:
                    reward = num - broken_so_far
                    broken_so_far = num
                else:
                    if failure:
                        reward = -20
                    else:
                        reward = 0

                # Advance by the number of primitives executed (k is zero-based)
                t += k + 1
                # Force episode termination once max_iter steps have elapsed
                if t >= self.max_iter:
                    done = True

                if done or failure:
                    next_state = None

                # Push a new Transition into the episode memory (skipped on the
                # first macro-step, where current_state is still None)
                if t > k + 1:
                    localMemory.append(
                        Transition(current_state, prev_action, next_state,
                                   reward))

                # Optimize the model
                if self.steps_done % 10 == 0:
                    self.optimize()

                # If we are done, reset the model
                if done or failure:
                    self.memory.push(localMemory)
                    if failure:
                        print_variables['durations'].append(self.max_iter)
                    else:
                        print_variables['durations'].append(t)
                    print_variables['rewards'].append(broken_so_far)
                    plot_variables(self.figure, print_variables,
                                   "Training POMDP")
                    print("Model parameters: {}".format(velcro_params))
                    print(
                        "{} actions taken in this epoch: {}\nAction lengths: {}"
                        .format(len(act_sequence), act_sequence, act_length))
                    print("Epoch {} took {}s, total number broken: {}\n\n".
                          format(ii,
                                 time.time() - start_time, broken_so_far))

                    break

            # Save a checkpoint every few episodes
            if ii % args.save_freq == 0:
                save_path = os.path.join(output_dir,
                                         'policy_' + str(ii) + '.pth')
                torch.save(
                    {
                        'epochs': ii,
                        'steps_done': self.steps_done,
                        'policy_net_1': self.policy_net_1.state_dict(),
                        'policy_net_2': self.policy_net_2.state_dict(),
                        'policy_optimizer_1':
                        self.policy_optimizer_1.state_dict(),
                        'policy_optimizer_2':
                        self.policy_optimizer_2.state_dict(),
                    }, save_path)
                save_path = os.path.join(output_dir,
                                         'tactile_' + str(ii) + '.pth')
                torch.save(
                    {
                        'tactile_net_1':
                        self.tactile_net_1.state_dict(),
                        'tactile_net_2':
                        self.tactile_net_2.state_dict(),
                        'tactile_optimizer_1':
                        self.tactile_optimizer_1.state_dict(),
                        'tactile_optimizer_2':
                        self.tactile_optimizer_2.state_dict(),
                    }, save_path)

                write_results(os.path.join(output_dir, 'results_pomdp.pkl'),
                              print_variables)

                self.memory.save_memory(
                    os.path.join(output_dir, 'memory.pickle'))

        if args.savefig_path:
            now = dt.datetime.now()
            self.figure[0].savefig(
                args.savefig_path +
                '{}_{}_{}.png'.format(now.month, now.day, now.hour),
                format='png')

        print('Training done')
        plt.show()
        return print_variables