def objective(objective_args):
    """Hyper-parameter objective for the altitude (z) controller.

    Unpacks one candidate parameter vector, trains ``parsed_args.p``
    controllers in parallel, and returns the cumulative z-error of the
    look-back simulation result (lower is better).
    """
    (z_scale, zdot_reward, action_reward, exploration, tolerance, max_mem_a,
     k_a, alpha_a, k_c, alpha_c, max_mem_pm, k_pm, pred_tol, lambda_trace,
     gamma) = objective_args
    (df_x, df_z, df_xdot, df_zdot, df_theta, df_thetadot, df_u1,
     df_u3) = QuadRotor2DPlant.get_default_feature_set()

    # Altitude feature carrying the candidate scale; bounds limit z to [-25, 0].
    feature_z = Feature(r"$z$ [m]", scale=z_scale, bounds=np.array([-25, 0]))

    plant = QuadRotor2DPlant(
        1. / FREQUENCY,
        blade_flapping=BLADE_FLAPPING,
        init_mean=DEFAULT_INIT_STATE_MEAN,
        feature_set=FeatureSet([
            df_x, feature_z, df_xdot, df_zdot, df_theta, df_thetadot,
            df_u1, df_u3,
        ]))

    actor = Actor(
        FeatureSet([feature_z, df_zdot]),
        FeatureSet([df_u1]),
        plant.get_feature_set(),
        k_a,
        max_mem_a * 50,
        alpha_a,
        tolerance,
    )
    # NOTE(review): the critic memory size reuses max_mem_a (there is no
    # separate max_mem_c in the parameter vector) — confirm this is intended.
    critic = Critic(
        FeatureSet([feature_z, df_zdot]),
        plant.get_feature_set(),
        k_c,
        max_mem_a * 50,
        lambda_trace,
        alpha_c,
        gamma,
        QuadraticErrorRewardFunction([action_reward, 0],
                                     [0, 10., 0, zdot_reward, 0, 0],
                                     desired_state=DESIRED_STATE),
        tolerance)
    plant_model = PlantModel(
        plant.get_feature_set(),
        FeatureSet([feature_z, df_zdot, df_u1]),
        k_pm,
        max_mem_pm * 50,
        pred_tol,
    )

    train_args = (
        actor,
        critic,
        plant_model,
        plant,
        DEFAULT_LENGTH,
        DEFAULT_ADD_METHOD,
        DEFAULT_PURGE_METHOD,
        ExplorationStrategy({1: exploration}),
    )

    # One training run per seed, fanned out over parsed_args.j workers.
    controllers = ControllerSet(
        parallelize(
            parsed_args.j,
            train,
            [train_args + (SEED + i, ) for i in range(parsed_args.p)],
        ))

    result = SimulationResult(
        controllers.lookback_result(
            LOOK_BACK_WINDOW,
            look_back_metric="median",
        ),
        metric=parsed_args.metric,
    )
    print("Finished training with cumulative z-error {:.2f}".format(
        result.get_cum_state_error().flatten()[1]))
    # Index 1 of the flattened cumulative state error is the z component.
    return result.get_cum_state_error().flatten()[1]
def higher_discount_rate():
    """Sensitivity run: retrain the altitude controller with discount 0.99.

    Trains ``parsed_args.p`` controllers in parallel, reports the
    cumulative z-error of the look-back result, and persists the rewards.
    """
    quad_rotor_plant = QuadRotor2DPlant(
        1. / FREQUENCY,
        blade_flapping=BLADE_FLAPPING,
        init_mean=DEFAULT_INIT_STATE_MEAN,
    )

    actor = Actor(
        FeatureSet([feature_z, feature_zdot]),
        FeatureSet([feature_u1]),
        quad_rotor_plant.get_feature_set(),
        K_ACTOR,
        STAGE_ONE_AC_MEMORY,
        ALPHA_ACTOR,
        TOLERANCE_ACTOR,
    )
    # Identical to the baseline critic except for the discount rate (0.99).
    critic = Critic(
        FeatureSet([feature_z, feature_zdot]),
        quad_rotor_plant.get_feature_set(),
        K_CRITIC,
        STAGE_ONE_AC_MEMORY,
        LAMBDA_TRACE,
        ALPHA_CRITIC,
        0.99,
        QuadraticErrorRewardFunction(
            ACTION_REWARDS, STATE_REWARDS, desired_state=DESIRED_STATE
        ),
        TOLERANCE_CRITIC,
    )
    plant_model = PlantModel(
        quad_rotor_plant.get_feature_set(),
        FeatureSet([feature_z, feature_zdot, feature_u1]),
        K_PLANT_MODEL,
        STAGE_ONE_PM_MEMORY,
        PREDICTION_TOLERANCE,
    )
    actor_critic_args = (
        actor,
        critic,
        plant_model,
        quad_rotor_plant,
        DEFAULT_LENGTH,
        DEFAULT_ADD_METHOD,
        DEFAULT_PURGE_METHOD,
        ExplorationStrategy(EXPLORATION_DICT),
    )

    print("Starting training of quad-rotor with higher discount rate.")
    trained_cs = ControllerSet(
        parallelize(
            parsed_args.j,
            train,
            [actor_critic_args + (SEED + i,) for i in range(parsed_args.p)],
        )
    )

    # Cumulative z-error (row index 1) of the look-back simulation result.
    z_error = SimulationResult(
        trained_cs.lookback_result(LOOK_BACK_WINDOW),
        metric=parsed_args.metric
    ).get_cum_state_error()[1:2].sum()
    print("Finished higher discount rate with {:.2f} (id={})".format(
        z_error,
        trained_cs.get_id(),
    ))

    trained_cs.notes = "Sensitivity analysis: higher discount rate"
    # NOTE: only the reward traces are persisted here; the controller set
    # itself is intentionally not dumped.
    RewardSet(trained_cs).dump()
filemode="w"
)  # closes a logging configuration call opened before this chunk
SEED = 4124135  # base RNG seed; per-run seeds are SEED + i
np.random.seed(SEED)
np.set_printoptions(
    precision=4,
    linewidth=200,
    suppress=True,
)
# Fail fast on numeric problems instead of propagating NaN/inf silently.
np.seterr(divide="raise", invalid="raise")
# Default feature set of the 2D quad-rotor: position (x, z), velocities,
# pitch and pitch rate, plus the two control inputs u1/u3.
(feature_x, feature_z, feature_xdot, feature_zdot, feature_theta,
 feature_thetadot, feature_u1,
 feature_u3) = QuadRotor2DPlant.get_default_feature_set()
LOOK_BACK_WINDOW = 5  # number of trailing episodes aggregated for scoring
FREQUENCY = 50.  # Hz
BLADE_FLAPPING = True  # enable the blade-flapping term in the plant model
DEFAULT_ADD_METHOD = "mean"
DEFAULT_PURGE_METHOD = "age-weighted"
DEFAULT_LENGTH = 3  # seconds
STAGE_ONE_EPISODES = 75
# Exploration magnitude keyed by the episode at which it takes effect.
EXPLORATION_DICT = {1: 2, 51: 3, 101: 3}
# Column vectors over the 6-dim state; presumably ordered
# [x, z, xdot, zdot, theta, thetadot] to match the feature set — TODO confirm.
DEFAULT_INIT_STATE_MEAN = np.array([[0, -9., 0, 0, 0, 0]]).T
DESIRED_STATE = np.array([[0, -10., 0, 0, 0, 0]]).T
AGE_THRESHOLD = 45.
# Actor
ALPHA_ACTOR = 0.15032140063618069
def objective(objective_args):
    """Two-stage hyper-parameter objective for the pitch (theta) controller.

    Stage one trains the basic altitude controller with fixed, previously
    tuned constants; stage two extends each trained controller with the
    pitch features and the candidate parameters. Returns the stage-two
    cumulative z-error (lower is better).
    """
    (
        a2_scale,
        theta_scale,
        thetadot_scale,
        max_mem_ac,
        max_mem_pm,
        theta_spread,
        thetadot_spread,
        exploration,
        theta_reward,
        thetadot_reward,
        u_3_reward,
        k_a,
        k_c,
        k_pm,
    ) = objective_args
    # BUG FIX: a trailing comma previously made feature_theta a 1-tuple
    # (Feature(...),) instead of a Feature, which then leaked into the
    # FeatureSet below and into the stage-two arguments.
    feature_theta = Feature(r"$\theta$ [rad]", scale=theta_scale)
    feature_thetadot = Feature(r"$\dot{\theta}$ [rad/s]",
                               scale=thetadot_scale,
                               derivative=True)
    # NOTE(review): a2_scale is unpacked above but the scale here is
    # hard-coded to 0.760859 — confirm whether a2_scale should be used.
    feature_a2 = Feature(r"$a_2$ [-]",
                         feature_type="action",
                         scale=0.760859,
                         bounds=0.3 * np.array([-1, 1]))
    quad_rotor_plant = QuadRotor2DPlant(
        1. / FREQUENCY,
        blade_flapping=BLADE_FLAPPING,
        init_mean=DEFAULT_INIT_STATE_MEAN,
        feature_set=FeatureSet([
            df_x, df_z, df_xdot, df_zdot, feature_theta, feature_thetadot,
            df_a1, feature_a2
        ]),
    )
    # Stage one uses the fixed, previously tuned constants (not candidates).
    stage_one_args = [
        Actor(
            FeatureSet([df_z, df_zdot]),
            FeatureSet([df_a1]),
            quad_rotor_plant.get_feature_set(),
            K_ACTOR,
            STAGE_ONE_AC_MEMORY,
            ALPHA_ACTOR,
            TOLERANCE_ACTOR,
        ),
        Critic(
            FeatureSet([df_z, df_zdot]),
            quad_rotor_plant.get_feature_set(),
            K_CRITIC,
            STAGE_ONE_AC_MEMORY,
            LAMBDA_TRACE,
            ALPHA_CRITIC,
            DISCOUNT,
            QuadraticErrorRewardFunction(ACTION_REWARDS,
                                         STATE_REWARDS,
                                         desired_state=DESIRED_STATE),
            TOLERANCE_CRITIC,
        ),
        PlantModel(
            quad_rotor_plant.get_feature_set(),
            FeatureSet([df_z, df_zdot, df_a1]),
            K_PLANT_MODEL,
            STAGE_ONE_PM_MEMORY,
            PREDICTION_TOLERANCE,
        ),
        quad_rotor_plant,
        DEFAULT_LENGTH,
        DEFAULT_ADD_METHOD,
        DEFAULT_PURGE_METHOD,
        ExplorationStrategy(STAGE_ONE_EXPLORATION_DICT)
    ]
    print("Training basic quad-rotor")
    # STAGE ONE
    cs_stage_one = ControllerSet(
        parallelize(
            parsed_args.j,
            train_stage_one,
            [stage_one_args + [SEED + i] for i in range(parsed_args.p)],
        ))
    _, z_error_stage_one, _, _, _, _ = SimulationResult(
        cs_stage_one.lookback_result(LOOK_BACK_WINDOW),
        metric=parsed_args.metric).get_cum_state_error().flatten()
    print("Finished stage one with {:s} cumulative z-error of {:.2f}".format(
        parsed_args.metric, z_error_stage_one))
    # STAGE TWO: extend each stage-one controller with the candidate values.
    stage_two_args = [
        max_mem_ac,
        max_mem_pm,
        theta_spread,
        thetadot_spread,
        exploration,
        theta_reward,
        thetadot_reward,
        u_3_reward,
        k_a,
        k_c,
        k_pm,
        feature_theta,
        feature_thetadot,
        feature_a2
    ]
    # deepcopy keeps each stage-one controller reusable across candidates.
    cs_stage_two = ControllerSet(
        parallelize(
            parsed_args.j,
            train_stage_two,
            [stage_two_args + [deepcopy(ac)] for ac in cs_stage_one],
        ))
    # Only the z component of the cumulative state error is scored.
    _, z_error, _, _, _, _ = SimulationResult(
        cs_stage_two.lookback_result(LOOK_BACK_WINDOW),
        metric=parsed_args.metric).get_cum_state_error().flatten()
    return z_error
# Critic
LAMBDA_TRACE = 0.75055692999458412  # eligibility-trace decay
# Quadratic reward weights: per-state and per-action penalty coefficients.
STATE_REWARDS = np.array([0, 10., 0, 0.4491648864, 0, 0])
ACTION_REWARDS = np.array([3.551383408, 0])
# Plant model
K_PLANT_MODEL = 9
STAGE_ONE_PM_MEMORY = 350
PREDICTION_TOLERANCE = 4.9763444056056387e-07
# STAGE TWO
STAGE_TWO_EPISODES = 75
STAGE_TWO_INCR_HOLD = 3
STAGE_TWO_METHOD = "clone-gauss"
# Default feature set: positions, velocities, pitch, pitch rate, actions.
(df_x, df_z, df_xdot, df_zdot, df_theta, df_thetadot, df_a1,
 df_a2) = QuadRotor2DPlant.get_default_feature_set()


def train_stage_one(args):
    """Build an ActorCriticController from *args* and run stage-one training.

    Returns the trained controller.
    """
    actor_critic = ActorCriticController(*args)
    actor_critic.AGE_THRESHOLD = AGE_THRESHOLD
    actor_critic.train(STAGE_ONE_EPISODES)
    return actor_critic


def train_stage_two(args):
    """Extend a stage-one controller with the pitch-rate feature and retrain.

    *args* is the stage-two parameter list with the controller appended as
    the last element. Returns the retrained controller.

    NOTE(review): several unpacked parameters (max_mem_ac, max_mem_pm,
    theta_spread, thetadot_spread, exploration, the reward weights, k_c,
    k_pm, feature_theta, feature_a2) are currently unused — the constants
    STAGE_TWO_AC_MEMORY / STAGE_TWO_PM_MEMORY / PITCH_DOT_SPREAD and the
    "clone-gauss" literal (cf. STAGE_TWO_METHOD) are used instead. Confirm
    whether this is intentional.
    """
    (max_mem_ac, max_mem_pm, theta_spread, thetadot_spread, exploration,
     theta_reward, thetadot_reward, u_3_reward, k_a, k_c, k_pm,
     feature_theta, feature_thetadot, feature_a2, actor_critic) = args
    # Reaches into the actor's private _knn attribute to set k.
    actor_critic._actor._knn = int(k_a)
    actor_critic.set_memory_sizes(STAGE_TWO_AC_MEMORY, STAGE_TWO_AC_MEMORY,
                                  STAGE_TWO_PM_MEMORY)
    # Clone the pitch-rate feature into the controller's feature set.
    actor_critic.change_feature(
        FeatureChange(feature_thetadot, "clone-gauss",
                      spread=PITCH_DOT_SPREAD))
    # Train for half the stage-two episodes, holding increments as configured.
    actor_critic.train(int(np.floor(STAGE_TWO_EPISODES / 2)),
                       train_hold=STAGE_TWO_INCR_HOLD)
    return actor_critic


if __name__ == "__main__":
    try:
        quad_rotor_plant = QuadRotor2DPlant(
            1. / FREQUENCY,
            blade_flapping=BLADE_FLAPPING,
            init_mean=DEFAULT_INIT_STATE_MEAN,
        )
        (feature_x, feature_z, feature_xdot, feature_zdot, feature_theta,
         feature_thetadot, feature_u1,
         feature_u3) = quad_rotor_plant.get_feature_set()
        actor_critic_args = (
            Actor(
                FeatureSet([feature_z, feature_zdot]),
                FeatureSet([feature_u1]),
                quad_rotor_plant.get_feature_set(),
                K_ACTOR,
                STAGE_ONE_AC_MEMORY,
                ALPHA_ACTOR,
                TOLERANCE_ACTOR,