Example no. 1
def main():
    map_dir = "/home/i2rlab/shahil_files/shahil_RL_ws_new/src/turtlebot/turtlebot_gazebo/worlds/"
    #os.rename(map_dir+"house_train.world",map_dir+"maze.world")
    rospy.init_node('turtlebot2_human', anonymous=True, log_level=rospy.WARN)
    env = StartOpenAI_ROS_Environment(
            'MyTurtleBot2HumanModel-v1')
    #os.rename(map_dir+"maze.world",map_dir+"house_train.world")
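    # the observation is augmented with the previous one-hot action (hence the +4)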
    s_dim = env.observation_space.shape[0]+4
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high
    #print('a_bound test', a_bound)
    sign_talker = rospy.Publisher('/sign', Int64, queue_size=1)
    ####################### load agent #######################################
    #TD3 = td3(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE, LR_A, LR_C)
    #TD3.loader()

    d = [0.5, 1, 0.5, 1] + list(6 * np.ones([180]))
    #A = np.array([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]])
    #TGREEN =  '\033[32m' # Green Text
    #ENDC = '\033[m' # reset to the defaults

    #################### Start ####################
    i = 0
    while i < 2:
        s = env.reset()
        #print("env printing: ",env)
        t1 = time.time()
        min_distance_ob = s[-2]
        theta_bt_uh_ob = s[-1]
        #a_old = A[0]
        a_old = np.array([1,0,0,0])
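    # prepend the previous action to the scaled observation (the first 3 and last 2 entries are dropped)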
    s = np.hstack((a_old, np.array(s[3:-2]) / d))
        for j in range(MAX_EP_STEPS):
            ################ Mode #########################################
            #if i%2==0:
                #a = TD3.choose_action(s,1)
                #a = [1,0,0,0]
                #if a[1]<=0.9 and min_distance_ob<=0.7 and abs(theta_bt_uh_ob)<1.3:
                    #print (TGREEN + "Going to Crash!" , ENDC)
                    #a = A[1]
            # else:
            #     a = A[0]

            #x_test = env.state_msg.pose.position.x
            #print('x_test',env.collect_data)

            a = np.array([1,0,0,0])
            s_, r, done, info = env.step(a)
            sign_talker.publish(int(r))
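            # a large negative reward (r < -200) does not end the episode here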
            if r < -200:
                done = 0
            min_distance_ob = s_[-2]
            theta_bt_uh_ob = s_[-1]
            a_old = a
            s = np.hstack((a_old, np.array(s_[3:-2]) / d))
            if done or j == MAX_EP_STEPS-1:
                t = time.time() - t1
                if j > 3:
                    i = i + 1
                break
Example no. 2
    def __init__(self,
                 environment_name,
                 n_observations,
                 n_actions,
                 n_episodes=1000,
                 n_win_ticks=195,
                 min_episodes=100,
                 max_env_steps=None,
                 gamma=1.0,
                 epsilon=1.0,
                 epsilon_min=0.01,
                 epsilon_log_decay=0.995,
                 alpha=0.01,
                 alpha_decay=0.01,
                 batch_size=64,
                 monitor=False,
                 quiet=False):
        self.memory = deque(maxlen=100000)

        # self.env = gym.make(environment_name)
        self.env = StartOpenAI_ROS_Environment(environment_name)

        if monitor:
            self.env = gym.wrappers.Monitor(self.env,
                                            '../data/cartpole-1',
                                            force=True)

        self.input_dim = n_observations
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_log_decay
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.n_episodes = n_episodes
        self.n_win_ticks = n_win_ticks
        self.min_episodes = min_episodes
        self.batch_size = batch_size
        self.quiet = quiet
        if max_env_steps is not None:
            self.env._max_episode_steps = max_env_steps

        # Init model
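        # small MLP: two tanh hidden layers and a linear output with one value per action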
        self.model = Sequential()

        self.model.add(Dense(24, input_dim=self.input_dim, activation='tanh'))
        self.model.add(Dense(48, activation='tanh'))
        self.model.add(Dense(self.n_actions, activation='linear'))
        self.model.compile(loss='mse',
                           optimizer=Adam(lr=self.alpha,
                                          decay=self.alpha_decay))
    def test(self):
        # Initialize OpenAI_ROS ENV
        LoadYamlFileParamsTest(rospackage_name="learning_ros", rel_path_from_package_to_file="config", yaml_file_name="aliengo_stand.yaml")
        env = StartOpenAI_ROS_Environment('AliengoStand-v0')
        saver = tf.train.Saver()

        time.sleep(3)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver.restore(sess, 'model_test/model_ddqn.ckpt')
            scores = []
            success_num = 0

            for e in range(self.EPISODES):
                state = env.reset()
                state = np.reshape(state, [1, self.state_size])
                done = False
                i = 0
                rewards = []
                while not rospy.is_shutdown():  # run until ROS is shut down
                    # self.env.render()
                    # the action with the highest predicted value is chosen
                    action = np.argmax(self.model.predict(state))
                    next_state, reward, done, _ = env.step(action)
                    time.sleep(0.01)

                    next_state = np.reshape(next_state, [1, self.state_size])
                    rewards.append(reward)
                    self.remember(state, action, reward, next_state, done)
                    
                    i += 1
                    if done:
                        # update the target model at the end of the episode
                        self.update_target_model()
                        

                        state = env.reset()
                        break 
                    else:
                        state = next_state   
                    self.replay()
                scores.append(sum(rewards))
               
        plt.plot(scores)
        plt.show()        
Example no. 4
def main():
    rospy.init_node('parrotdrone_goto_qlearn',
                    anonymous=True, log_level=rospy.WARN)

    task_and_robot_environment_name = rospy.get_param(
        '/drone/task_and_robot_environment_name')

    env = StartOpenAI_ROS_Environment(
        task_and_robot_environment_name)
Example no. 5
def launch(args):
    # create the ddpg_agent
    task_and_robot_environment_name = rospy.get_param(
        '/fetch/task_and_robot_environment_name')
    # to register our task env to openai env.
    # so that we don't care the output of this method for now.
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    # env = gym.make(args.env_name)

    # set random seeds for reproduce
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    np.random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    torch.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())
    if args.cuda:
        torch.cuda.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())
    # get the environment parameters
    env_params = get_env_params(env)
    # create the ddpg agent to interact with the environment
    ddpg_trainer = ddpg_agent(args, env, env_params)
    ddpg_trainer.learn()
def main():
    task_and_robot_environment_name = rospy.get_param(
        '/fetch/task_and_robot_environment_name')
    # to register our task env to openai env.
    # so that we don't care the output of this method for now.
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    logdir = None
    n_epochs = 50
    num_cpu = 1
    seed = 0
    policy_save_interval = 5
    replay_strategy = "future"
    clip_return = 1
    rospy.logwarn("task_and_robot_environment_name ==>" + str(env))
    rospy.logwarn("HERE")
    rospy.init_node("train_fetch_her")
    launch(task_and_robot_environment_name, logdir, n_epochs, num_cpu, seed,
           replay_strategy, policy_save_interval, clip_return)
from stable_baselines.deepq.policies import MlpPolicy as mlp_dqn
from stable_baselines.sac.policies import MlpPolicy as mlp_sac
from stable_baselines.ddpg.policies import MlpPolicy as mlp_ddpg
from stable_baselines.td3.policies import MlpPolicy as mlp_td3
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

from openai_ros.task_envs.cartpole_stay_up import stay_up
from openai_ros.openai_ros_common import StartOpenAI_ROS_Environment

os.chdir(os.path.expanduser('~/catkin_ws/src/openai_examples/cartpole/cartpole3d/scripts/'))  # os.chdir does not expand '~' on its own

rospy.init_node('cartpole3d_trpo', anonymous=True, log_level=rospy.FATAL)

environment_name = rospy.get_param(
    '/cartpole_v0/task_and_robot_environment_name')
env = StartOpenAI_ROS_Environment(environment_name)
env = DummyVecEnv([lambda: env
                   ])  # The algorithms require a vectorized environment to run

# The noise objects for TD3
n_actions = env.action_space.n
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

model_list = [
    A2C(MlpPolicy,
        env,
        verbose=1,
        tensorboard_log="../results/tensorboard_logs/A2C/"),
    ACKTR(MlpPolicy,
          env,
Example no. 8
params = P()

params.goal = ROBOTICS_ENV_LIST[params.env_name]
params.test_episodes = 10

now = datetime.now()

params.log_dir += "{}".format(params.env_name)
params.model_dir += "{}".format(params.env_name)

rospy.init_node("start_her")
task_and_robot_environment_name = rospy.get_param(
    '/fetch/task_and_robot_environment_name')
# to register our task env to openai env.
# so that we don't care the output of this method for now.
env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)

# params.max_action = env.action_space.high[0]
# params.num_action = env.action_space.shape[0]

# TODO: this is temp solution..... check openai's fetch's implementation!!
params.max_action = 0
params.num_action = 4

# set seed
env.seed(params.seed)
tf.random.set_random_seed(params.seed)

agent = DDPG(Actor, Critic, params.num_action, params)

replay_buffer = ReplayBuffer(params.memory_size)
Example no. 9
from stable_baselines.deepq.policies import MlpPolicy as mlp_dqn
from stable_baselines.sac.policies import MlpPolicy as mlp_sac
from stable_baselines.ddpg.policies import MlpPolicy as mlp_ddpg
from stable_baselines.td3.policies import MlpPolicy as mlp_td3
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

from openai_ros.task_envs.cartpole_stay_up import stay_up
from openai_ros.openai_ros_common import StartOpenAI_ROS_Environment 


os.chdir(os.path.expanduser('~/catkin_ws/src/openai_examples/cartpole/cartpole3d/scripts/'))  # os.chdir does not expand '~' on its own

rospy.init_node('cartpole3d_train_all', anonymous=True, log_level=rospy.FATAL)

environment_name = rospy.get_param('/cartpole_v0/task_and_robot_environment_name')
env = StartOpenAI_ROS_Environment(environment_name)
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run


# The noise objects for TD3
n_actions = env.action_space.n
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model_list = [
        A2C(MlpPolicy, env, verbose=1, tensorboard_log="../results/tensorboard_logs/A2C/"), 
        ACKTR(MlpPolicy, env, verbose=1, tensorboard_log="../results/tensorboard_logs/ACKTR/"), 
        PPO2(MlpPolicy, env, verbose=1, tensorboard_log="../results/tensorboard_logs/PPO2/"), 
        TRPO(MlpPolicy, env, verbose=1, tensorboard_log="../results/tensorboard_logs/TRPO/"),
]

algo_list = ['A2C', 'ACKTR', 'PPO2', 'TRPO']
Example no. 10
                agent.writer.add_scalar('Train/q2_loss', agent.q2_loss.item(),
                                        ep_num)
                agent.writer.add_scalar('Train/policy_loss',
                                        agent.policy_loss.item(), ep_num)
                agent.writer.flush()

            total_rewards.append(ep_reward)
            avg_reward = np.mean(total_rewards[-100:])

            agent.writer.add_scalar('avg_reward', avg_reward, ep_num)
            agent.writer.flush()

            print("Steps:{} Episode:{} Reward:{} Avg Reward:{}".format(
                t, ep_num, ep_reward, avg_reward))

            o, r, d, ep_reward, ep_len = env.reset(), 0, False, 0, 0
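            # sample a new random goal in [0.3, 0.5] along each axis and publish it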
            goal_point.position.x, goal_point.position.y, goal_point.position.z = [
                random.uniform(0.3, 0.5) for _ in xrange(3)
            ]
            goal_pub.publish(goal_point)
            ep_num += 1

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t


rospy.init_node('test_RL_PANDA', anonymous=True, log_level=rospy.WARN)
replay_buffer = ReplayBuffer(int(1e6))
env = NormalizedActions(StartOpenAI_ROS_Environment('MyPandaTrainingEnv-v0'))
agent = SAC(env, replay_buffer, hidden_dim=[256, 256])
train(agent)
Example no. 11
def main():
    rospy.init_node('movingcube_onedisk_walk_gym',
                    anonymous=True,
                    log_level=rospy.WARN)

    # Set the path where learned model will be saved
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('my_moving_cube_pkg')
    models_dir_path = os.path.join(pkg_path, "models_saved")
    if not os.path.exists(models_dir_path):
        os.makedirs(models_dir_path)

    out_model_file_path = os.path.join(models_dir_path, "movingcube_model.pkl")

    max_timesteps = rospy.get_param("/moving_cube/max_timesteps")
    buffer_size = rospy.get_param("/moving_cube/buffer_size")
    # We convert to float because parameters given in scientific (Ye-X) notation are sometimes parsed as strings.
    lr = float(rospy.get_param("/moving_cube/lr"))

    exploration_fraction = rospy.get_param("/moving_cube/exploration_fraction")
    exploration_final_eps = rospy.get_param(
        "/moving_cube/exploration_final_eps")
    print_freq = rospy.get_param("/moving_cube/print_freq")

    reward_task_learned = rospy.get_param("/moving_cube/reward_task_learned")

    def callback(lcl, _glb):
        # stop training once the average reward over the last 100 episodes reaches reward_task_learned
        aux1 = lcl['t'] > 100
        aux2 = sum(lcl['episode_rewards'][-101:-1]) / 100
        is_solved = aux1 and aux2 >= reward_task_learned

        rospy.logdebug("aux1=" + str(aux1))
        rospy.logdebug("aux2=" + str(aux2))
        rospy.logdebug("reward_task_learned=" + str(reward_task_learned))
        rospy.logdebug("IS SOLVED?=" + str(is_solved))

        return is_solved

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param(
        '/moving_cube/task_and_robot_environment_name')
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    rospy.loginfo("Starting Learning")

    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=lr,
        max_timesteps=max_timesteps,
        buffer_size=buffer_size,
        exploration_fraction=exploration_fraction,
        exploration_final_eps=exploration_final_eps,
        print_freq=print_freq,  # how many episodes between printing total rewards and info
        param_noise=False,
        callback=callback)

    env.close()
    rospy.logwarn("Saving model to movingcube_model.pkl")
    act.save(out_model_file_path)
def main():
    name = input('Please input your exp number:')
    folder = "Human_exp_new_" + str(name)
    os.system("mkdir " + folder)

    ####################### init environment ########################################
    map_dir = "/home/i2rlab/shahil_files/shahil_RL_ws_new/src/turtlebot/turtlebot_gazebo/worlds/"
    #os.rename(map_dir+"house_3.world",map_dir+"maze.world")
    rospy.init_node('turtlebot2_human', anonymous=True, log_level=rospy.WARN)
    env = StartOpenAI_ROS_Environment('MyTurtleBot2HumanModel-v1')
    #os.rename(map_dir+"maze.world",map_dir+"house_3.world")
    s_dim = env.observation_space.shape[0] + 4
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high
    sign_talker = rospy.Publisher('/sign', Int64, queue_size=1)
    ####################### load agent #######################################
    TD3 = td3(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE,
              LR_A, LR_C)
    TD3.loader()
    # ddpg = DDPG(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE, LR_A, LR_C)
    # with ddpg.sess.as_default():
    #     with ddpg.g.as_default():
    #         ddpg.loader()
    d = [0.5, 1, 0.5, 1] + list(6 * np.ones([180]))
    A = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
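    # one-hot control modes: A[0] is used for pure-human episodes, A[1] as the override when a crash is predicted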
    TGREEN = '\033[32m'  # Green Text
    ENDC = '\033[m'  # reset to the defaults

    #################### Initialize Data #################################
    n_h = 0  ############## number of eps of pure human control #################
    n_s_TD3 = 0  ############## number of eps of shared control #################
    # n_s_ddpg = 0

    n_crash_h = 0  ############## number of crash in pure human control #################
    n_crash_s_TD3 = 0  ############## number of crash in shared control #################
    # n_crash_s_ddpg = 0

    n_success_h = 0  ############## number of success of pure human control #################
    n_success_s_TD3 = 0  ############## number of success of shared control #################
    # n_success_s_ddpg = 0

    t_h = 0  ############## total time of pure human control #################
    t_s_TD3 = 0  ############## total time of shared control #################
    # t_s_ddpg = 0
    #################### SART #####################
    SA_h = 0
    SA_s_TD3 = 0
    # SA_s_ddpg = 0
    #################### workload #################
    WL_h = 0
    WL_s_TD3 = 0
    # WL_s_ddpg = 0
    #################### Start ####################
    i = 0
    total_ep = 2
    sequence_ep = [0, 1]
    #np.random.shuffle(sequence_ep)
    pureh_ep = sequence_ep[0:int(total_ep / 2)]
    # ddpg_ep = sequence_ep[int(total_ep/3):int(2*total_ep/3)]
    td3_ep = sequence_ep[int(total_ep / 2):]
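    # first half of the (unshuffled) episode sequence uses pure human control, the second half shared TD3 control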
    n_d = 0
    while i < total_ep:
        s = env.reset()
        print(s)
        t1 = time.time()
        min_distance_ob = s[-2]
        theta_bt_uh_ob = s[-1]
        a_old = A[0]
        s = np.hstack((a_old, np.array(s[3:-2]) / d))
        done = 0
        while not done:
            ################ Mode #########################################
            if i in td3_ep:
                a = TD3.choose_action(s, 1)
                if a[1] <= 0.9 and min_distance_ob <= 0.7 and abs(
                        theta_bt_uh_ob) < 1.3:
                    print(TGREEN + "Going to Crash!", ENDC)
                    a = A[1]
            elif i in pureh_ep:
                a = A[0]

            s_, r, done, info = env.step(a)
            sign_talker.publish(int(r))
            if r < -400 and done:
                done = 0
                if i in td3_ep:
                    n_crash_s_TD3 += 1
                elif i in pureh_ep:
                    n_crash_h += 1
            print("n_crash_s_TD3:", n_crash_s_TD3)
            print("n_crash_h:", n_crash_h)
            print("i:", i)
            print("n_d:", n_d)
            min_distance_ob = s_[-2]
            theta_bt_uh_ob = s_[-1]
            a_old = a
            s = np.hstack((a_old, np.array(s_[3:-2]) / d))
            if done:
                t = 0
                n_d = n_d + 1
                if n_d % 5 == 0:
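                    # every 5 completed episodes, collect the ten SART ratings and a TLX workload score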
                    instability = int(
                        input(
                            'How changeable is the situation (1: stable and straightforward -- 7: changing suddenly):'
                        ))
                    variability = int(
                        input(
                            'How many variables are changing within the situation: (1: very few -- 7: large number):'
                        ))
                    complexity = int(
                        input(
                            'How complicated is the situation (1: simple -- 7: complex):'
                        ))
                    arousal = int(
                        input(
                            'How aroused are you in the situation (1: low degree of alertness -- 7: alert and ready for activity):'
                        ))
                    spare = int(
                        input(
                            'How much mental capacity do you have to spare in the situation (1: Nothing to spare -- 7: sufficient):'
                        ))
                    concentration = int(
                        input(
                            'How much are you concentrating on the situation (1: focus on only one thing -- 7: many aspects):'
                        ))
                    attention = int(
                        input(
                            'How much is your attention divided in the situation (1: focus on only one thing -- 7: many aspects):'
                        ))
                    quantity = int(
                        input(
                            'How much information have you gained about the situation (1: little -- 7: much):'
                        ))
                    quality = int(
                        input(
                            'How accessible and usable is the information you have received (1: poor -- 7: good):'
                        ))
                    familiarity = int(
                        input(
                            'How familiar are you with the situation (1: New situation -- 7: a great deal of relevant experience):'
                        ))
                    SA = quantity + quality + familiarity - (
                        (instability + variability + complexity) -
                        (arousal + spare + concentration + attention))
                    if i in td3_ep:
                        SA_s_TD3 += SA
                        WL_s_TD3 += float(
                            input('Please input your workload (from TLX):'))
                        t_s_TD3 += t
                        n_s_TD3 += 1
                        if r > 500:
                            n_success_s_TD3 += 1
                    elif i in pureh_ep:
                        SA_h += SA
                        WL_h += float(
                            input('Please input your workload (from TLX):'))
                        t_h += t
                        n_h += 1
                        if r > 500:
                            n_success_h += 1
                    i = i + 1
                break

    np.savetxt(
        'data.dat',
        np.array([[
            n_s_TD3, t_s_TD3, n_crash_s_TD3, n_success_s_TD3, SA_s_TD3,
            WL_s_TD3
        ], [n_h, t_h, n_crash_h, n_success_h, SA_h, WL_h]]))
    #,[n_s_ddpg,t_s_ddpg,n_crash_s_ddpg,n_success_s_ddpg, SA_s_ddpg, WL_s_ddpg]]))
    ########### shared_TD3: number of eps, total time, number of crashes, number of successes, situation awareness, workload ###########
    ########### human: number of eps, total time, number of crashes, number of successes, situation awareness, workload ###########
    ########### shared_TD3_ddpg: number of eps, total time, number of crashes, number of successes, situation awareness, workload ###########
    os.system('cp -r data.dat ' + folder + '/')
Example no. 13
class DQNRobotSolver():
    def __init__(self,
                 environment_name,
                 n_observations,
                 n_actions,
                 n_episodes=1000,
                 n_win_ticks=195,
                 min_episodes=100,
                 max_env_steps=None,
                 gamma=1.0,
                 epsilon=1.0,
                 epsilon_min=0.01,
                 epsilon_log_decay=0.995,
                 alpha=0.01,
                 alpha_decay=0.01,
                 batch_size=64,
                 monitor=False,
                 quiet=False):
        self.memory = deque(maxlen=100000)

        # self.env = gym.make(environment_name)
        self.env = StartOpenAI_ROS_Environment(environment_name)

        if monitor:
            self.env = gym.wrappers.Monitor(self.env,
                                            '../data/cartpole-1',
                                            force=True)

        self.input_dim = n_observations
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_log_decay
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.n_episodes = n_episodes
        self.n_win_ticks = n_win_ticks
        self.min_episodes = min_episodes
        self.batch_size = batch_size
        self.quiet = quiet
        if max_env_steps is not None:
            self.env._max_episode_steps = max_env_steps

        # Init model
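        # small MLP: two tanh hidden layers and a linear output of one Q-value estimate per action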
        self.model = Sequential()

        self.model.add(Dense(24, input_dim=self.input_dim, activation='tanh'))
        self.model.add(Dense(48, activation='tanh'))
        self.model.add(Dense(self.n_actions, activation='linear'))
        self.model.compile(loss='mse',
                           optimizer=Adam(lr=self.alpha,
                                          decay=self.alpha_decay))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon):
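        # epsilon-greedy: sample a random action with probability epsilon, otherwise take the argmax of the model's predictions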
        return self.env.action_space.sample() if (
            np.random.random() <= epsilon) else np.argmax(
                self.model.predict(state))

    def get_epsilon(self, t):
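        # log10-based epsilon decay, clipped to stay within [epsilon_min, epsilon]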
        return max(
            self.epsilon_min,
            min(self.epsilon, 1.0 - math.log10((t + 1) * self.epsilon_decay)))

    def preprocess_state(self, state):
        return np.reshape(state, [1, self.input_dim])

    def replay(self, batch_size):
        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory),
                                                   batch_size))
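        # DQN target: reward for terminal transitions, reward + gamma * max Q(next_state) otherwise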
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state)
            y_target[0][
                action] = reward if done else reward + self.gamma * np.max(
                    self.model.predict(next_state)[0])
            x_batch.append(state[0])
            y_batch.append(y_target[0])

        self.model.fit(np.array(x_batch),
                       np.array(y_batch),
                       batch_size=len(x_batch),
                       verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def run(self):

        rate = rospy.Rate(30)

        scores = deque(maxlen=100)

        for e in range(self.n_episodes):

            init_state = self.env.reset()

            state = self.preprocess_state(init_state)
            done = False
            i = 0
            while not done:
                # openai_ros doesn't support render for the moment
                #self.env.render()
                action = self.choose_action(state, self.get_epsilon(e))
                next_state, reward, done, _ = self.env.step(action)
                next_state = self.preprocess_state(next_state)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1

            scores.append(i)
            mean_score = np.mean(scores)
            if mean_score >= self.n_win_ticks and e >= self.min_episodes:
                if not self.quiet:
                    print('Ran {} episodes. Solved after {} trials'.format(
                        e, e - self.min_episodes))
                return e - self.min_episodes
            if e % 1 == 0 and not self.quiet:
                print(
                    '[Episode {}] - Mean survival time over last {} episodes was {} ticks.'
                    .format(e, self.min_episodes, mean_score))

            self.replay(self.batch_size)

        if not self.quiet: print('Did not solve after {} episodes'.format(e))
        return e
Example no. 14
def main():
    # Initialize OpenAI_ROS ENV
    LoadYamlFileParamsTest(rospackage_name="learning_ros",
                           rel_path_from_package_to_file="config",
                           yaml_file_name="aliengo_stand.yaml")
    env = StartOpenAI_ROS_Environment('AliengoStand-v0')
    time.sleep(3)
    # Initialize PPO agent
    Policy = Policy_net('policy', ob_space=3, act_space=8)
    Old_Policy = Policy_net('old_policy', ob_space=3, act_space=8)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0
        success_num = 0
        scores = []

        for iteration in range(ITERATION):
            observations = []
            actions = []
            v_preds = []
            rewards = []
            while not rospy.is_shutdown():  # until ros is not shutdown
                # prepare to feed placeholder Policy.obs
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                #print('act: ',act, 'v_pred: ',v_pred )

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)
                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)
                #execute according to action
                next_obs, reward, done, _ = env.step(act)
                time.sleep(0.01)
                if done:
                    # the state following a terminal state has a value of 0
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs
            # store scores for visualization
            scores.append(sum(rewards))
            # end training once there have been enough consecutive high-reward episodes
            if sum(rewards) >= 400:
                success_num += 1
                print("Succes number: " + str(success_num))
                if success_num >= 5:
                    saver.save(sess, 'model_train/model_ppo.ckpt')
                    print('Clear!! Model saved.')
                if success_num >= 10:
                    saver.save(sess, 'model_train/model_ppo.ckpt')
                    print('Finished! ')
                    break

            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards,
                                v_preds=v_preds,
                                v_preds_next=v_preds_next)

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations, [len(observations), 3])
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)
            # compute the generalized advantage estimates (GAE)
            gaes = np.array(gaes).astype(dtype=np.float32)
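            # center the advantages (mean subtraction only; no variance normalization here)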
            gaes = (gaes - gaes.mean())
            print('gaes', gaes)
            #assign current policy params to previous policy params
            PPO.assign_policy_parameters()

            inp = [observations, actions, rewards, v_preds_next, gaes]

            # PPO train
            for epoch in range(4):
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=64)  # indices are in [low, high)
                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          rewards=sampled_inp[2],
                          v_preds_next=sampled_inp[3],
                          gaes=sampled_inp[4])
        plt.plot(scores)
        plt.show()
        env.stop()
Example no. 15
    experience_replay_buffer = ReplayMemory()
    episode_rewards = np.zeros(num_episodes)
    episode_lens = np.zeros(num_episodes)

    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_change = (epsilon - epsilon_min) / 500000
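    # per-step decrement for linearly annealing epsilon from 1.0 down to 0.1 over 500k steps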

    rospy.init_node('sumo_dqlearn', anonymous=True, log_level=rospy.WARN)

    episode_counter_pub = rospy.Publisher('/episode_counter', Int16)

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param(
        '/turtlebot2/task_and_robot_environment_name')
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    rospy.loginfo("Gym environment done")
    obs = env.reset()
    print(obs.shape)

    # Create the Gym environment
    rospy.loginfo("Gym environment done")
    rospy.loginfo("Starting Learning")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('gazebo_sumo')
    #outdir = pkg_path + '/training_results'
    #env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")
Example no. 16
    def run(self):
        # Initialize OpenAI_ROS ENV
        LoadYamlFileParamsTest(rospackage_name="learning_ros",
                               rel_path_from_package_to_file="config",
                               yaml_file_name="aliengo_stand.yaml")
        env = StartOpenAI_ROS_Environment('AliengoStand-v0')
        saver = tf.train.Saver()

        time.sleep(3)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            scores = []
            success_num = 0

            for e in range(self.EPISODES):
                state = env.reset()
                state = np.reshape(state, [1, self.state_size])
                done = False
                rewards = []
                while not rospy.is_shutdown():  # run until ROS is shut down
                    # self.env.render()
                    action = self.act(state)
                    next_state, reward, done, _ = env.step(action)
                    time.sleep(0.01)

                    next_state = np.reshape(next_state, [1, self.state_size])
                    rewards.append(reward)
                    self.remember(state, action, reward, next_state, done)

                    if done:
                        # update the target model at the end of the episode
                        self.update_target_model()

                        state = env.reset()
                        break
                    else:
                        state = next_state
                    self.replay()
                    # end training once there have been enough consecutive high-reward episodes
                scores.append(sum(rewards))
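                # an episode with total reward >= 400 counts towards the success streak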
                if sum(rewards) >= 400:
                    success_num += 1
                    print("Succes number: " + str(success_num))
                    if success_num >= 5:  #checkpoint
                        if (ddqn):
                            saver.save(sess, 'model_train/model_ddqn.ckpt')
                        else:
                            saver.save(sess, 'model_train/model_dqn.ckpt')

                        print('Clear!! Model saved.')
                    if success_num >= 10:
                        if (ddqn):
                            saver.save(sess, 'model_train/model_ddqn.ckpt')
                        else:
                            saver.save(sess, 'model_train/model_dqn.ckpt')

                        print('Clear!! Model saved. AND Finished! ')
                        break

                else:
                    success_num = 0
        plt.plot(scores)
        plt.show()
Example no. 17
        self.model = model_from_yaml(loaded_model_yaml)
        # load weights into new model
        self.model.load_weights(model_name_HDF5_format_path)
        print("Loaded model from disk")


if __name__ == '__main__':
    rospy.init_node('fetch_n1try_algorithm',
                    anonymous=True, log_level=rospy.WARN)

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param(
        '/sia_fetch/task_and_robot_environment_name')
    rospy.logwarn("task_and_robot_environment_name ==>" +
                  str(task_and_robot_environment_name))
    openai_ros_env_object = StartOpenAI_ROS_Environment(
        task_and_robot_environment_name)
    #print("##########################  task_and_robot_environment_name  ########################################")
    #print(task_and_robot_environment_name)  
    rospy.loginfo("Starting Learning")

    rospackage_name = "fetch_train"
    model_name = "sia_fetch_n1try"

    n_observations = rospy.get_param('/sia_fetch/n_observations')
    n_actions = rospy.get_param('/sia_fetch/n_actions')
     

    n_episodes_training = rospy.get_param('/sia_fetch/episodes_training')
    n_episodes_running = rospy.get_param('/sia_fetch/episodes_running')

    n_win_ticks = rospy.get_param('/sia_fetch/n_win_ticks')
Example no. 18
        ep_reward += reward
        rospy.logwarn("Testing: Cumulative Reward of this episode: ")
        rospy.logwarn(ep_reward)
        writer.add_scalar("Test_Cumulative_Rewards", ep_reward, global_step=test_global_step)
    return ep_reward, test_global_step



if __name__ == '__main__':

    rospy.init_node('example_turtlebot2_maze_dqn', anonymous=True, log_level=rospy.WARN)

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param('/turtlebot2/task_and_robot_environment_name')
    # Create the Gym environment
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    rospy.loginfo("Gym environment done")
    rospy.loginfo("Starting Learning")


    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('turtle2_openai_ros_example')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = np.ndarray(0)

    """
    # Loads parameters from the ROS param server
Example no. 19
from gym import wrappers
# ROS packages required
import rospy
import rospkg
from openai_ros.openai_ros_common import StartOpenAI_ROS_Environment


if __name__ == '__main__':

    rospy.init_node('example_turtlebot2_maze_qlearn',
                    anonymous=True, log_level=rospy.WARN)

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param(
        '/turtlebot2/task_and_robot_environment_name')
    env = StartOpenAI_ROS_Environment(
        task_and_robot_environment_name)
    # Create the Gym environment
    rospy.loginfo("Gym environment done")
    rospy.loginfo("Starting Learning")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('turtle2_openai_ros_example')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = numpy.ndarray(0)

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
Example no. 20
# RL packages required
import tensorflow as tf
import cv2

from ForgER.agent import *
from ForgER.replay_buff import AggregatedBuff
from ForgER.model import get_network_builder

rospy.init_node('example_deepsoccer_soccer_qlearn', anonymous=True, log_level=rospy.WARN)

task_and_robot_environment_name = rospy.get_param('/deepsoccer/task_and_robot_environment_name')
save_path = rospy.get_param('/deepsoccer/save_path')
save_file = rospy.get_param('/deepsoccer/save_file')

env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)

def get_dtype_dict(env):
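    # describe the shape and dtype of each field stored per transition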
    action_shape = env.action_space.shape
    action_shape = action_shape if len(action_shape) > 0 else 1
    action_dtype = env.action_space.dtype
    action_dtype = 'int32' if np.issubdtype(action_dtype, int) else action_dtype
    action_dtype = 'float32' if np.issubdtype(action_dtype, float) else action_dtype
    env_dict = {'action': {'shape': action_shape,
                            'dtype': action_dtype},
                'reward': {'dtype': 'float32'},
                'done': {'dtype': 'bool'},
                'n_reward': {'dtype': 'float32'},
                'n_done': {'dtype': 'bool'},
                'actual_n': {'dtype': 'float32'},
                'demo': {'dtype': 'float32'},