Example #1
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'state': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
agent_params.algorithm.discount = 0.98
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16)
agent_params.algorithm.num_consecutive_training_steps = 40
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
agent_params.algorithm.rate_for_copying_weights_to_target = 0.05
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**6)
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)
agent_params.exploration.evaluation_epsilon = 0

agent_params.memory = EpisodicHindsightExperienceReplayParameters()
agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Final
agent_params.memory.hindsight_transitions_per_regular_transition = 1
agent_params.memory.goals_space = GoalsSpace(
    goal_name='state',
    reward_type=ReachingGoal(distance_from_goal_threshold=0,
                             goal_reaching_reward=0,
                             default_reward=-1),
    distance_metric=GoalsSpace.DistanceMetric.Euclidean)
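# For intuition: a minimal illustrative sketch (not rl_coach's implementation) of what the
# Final goal-selection strategy with one hindsight transition per regular transition amounts to.
# Each transition is duplicated with the episode's final achieved state substituted as the goal,
# and the reward recomputed with the ReachingGoal rule configured above. The episode/transition
# dict format used here is hypothetical.
import numpy as np

def relabel_with_final_goal(episode, distance_threshold=0.0,
                            goal_reaching_reward=0.0, default_reward=-1.0):
    final_achieved = episode[-1]['next_state']
    hindsight = []
    for t in episode:
        # Euclidean distance, matching GoalsSpace.DistanceMetric.Euclidean
        distance = np.linalg.norm(np.asarray(t['next_state']) - np.asarray(final_achieved))
        reward = goal_reaching_reward if distance <= distance_threshold else default_reward
        hindsight.append({**t, 'goal': final_achieved, 'reward': reward})
    return hindsight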

###############
# Environment #
###############
env_params = Mujoco()
env_params.level = 'rl_coach.environments.toy_problems.bit_flip:BitFlip'
env_params.additional_simulator_parameters = {
Example #2
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
agent_params.algorithm.discount = 0.99

# To jump-start the agent's Q values and speed things up, we'll initialize the bias of the
# last Dense layer to something on the order of the discounted return of a random policy.
agent_params.network_wrappers['main'].heads_parameters = \
    [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
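# Back-of-the-envelope check of that -100 (illustrative, assuming the usual Acrobot
# reward of -1 per step, matching the acrobot dataset loaded below): with discount
# gamma = 0.99, a long random rollout earns roughly
#   sum_t gamma**t * (-1)  ~=  -1 / (1 - gamma)  =  -100,
# hence the bias value.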

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# ER - we'll be needing an episodic replay buffer for off-policy evaluation
agent_params.memory = EpisodicExperienceReplayParameters()
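# Why an episodic buffer? The off-policy evaluation referred to above works on whole
# trajectories rather than shuffled transitions. A generic illustrative sketch (not
# rl_coach's own OPE code) of weighted importance sampling, where each episode is a
# list of (reward, behavior_prob, target_prob) tuples:
import numpy as np

def weighted_importance_sampling(episodes, gamma=0.99):
    rhos, returns = [], []
    for episode in episodes:
        # trajectory importance ratio and discounted return
        rhos.append(np.prod([tp / bp for _, bp, tp in episode]))
        returns.append(sum(gamma ** t * r for t, (r, _, _) in enumerate(episode)))
    return float(np.dot(rhos, returns) / np.sum(rhos))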

# E-Greedy schedule - there is no exploration in Batch RL, so E-Greedy is effectively disabled.
agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)
agent_params.exploration.evaluation_epsilon = 0

# We can use either a kNN-based or an NN-based model to predict which actions not to max over in the Bellman equation.
#agent_params.algorithm.action_drop_method_parameters = KNNParameters()
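# The idea behind dropping actions (as in Batch-Constrained Q-learning) is to restrict the
# max in the Bellman target to actions the behavior data actually supports. A generic
# illustrative sketch, not rl_coach's implementation:
def constrained_max_q(q_next, action_likelihoods, likelihood_threshold=0.3):
    # keep only actions whose estimated likelihood under the dataset is close to the best one
    best = max(action_likelihoods)
    allowed = [q for q, p in zip(q_next, action_likelihoods)
               if p >= likelihood_threshold * best]
    return max(allowed)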


DATASET_PATH = 'acrobot_dataset.csv'
agent_params.memory = EpisodicExperienceReplayParameters()
agent_params.memory.load_memory_from_file_path = CsvDataset(DATASET_PATH, is_episodic=True)

spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=6)}),
                          goal=None,
Example #3
from rl_coach.exploration_policies.e_greedy import EGreedyParameters

from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters, DQNNetworkParameters

from rl_coach.agents.rainbow_dqn_agent import RainbowDQNAgentParameters
from rl_coach.base_parameters import PresetValidationParameters, VisualizationParameters, AgentParameters
from rl_coach.core_types import EnvironmentSteps
from rl_coach.environments.second_test import ControlSuiteEnvironmentParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import SimpleSchedule
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters

experience_replay_parameters = ExperienceReplayParameters()
experience_replay_parameters.max_size = (MemoryGranularity.Transitions, 10000)
agent_params = DQNAgentParameters()
agent_params.memory = experience_replay_parameters
schedule_params = SimpleSchedule()
schedule_params.heatup_steps = EnvironmentSteps(10)
preset_validation_params = PresetValidationParameters(
    test=True,
    min_reward_threshold=-50000,
    max_episodes_to_achieve_reward=10,
    num_workers=5)
vis_params = VisualizationParameters(render=False)

env_params = ControlSuiteEnvironmentParameters()
graph_manager = BasicRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=vis_params,
Example #4
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.memory = PrioritizedExperienceReplayParameters()
agent_params.memory.beta = LinearSchedule(
    0.4, 1, 12500000)  # 12.5M training iterations = 50M steps = 200M frames
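# beta is the prioritized-replay importance-sampling exponent, annealed from 0.4 to 1
# over training as in the PER paper. Illustrative sketch of the weights it controls
# (not rl_coach's implementation), with p_i the transition priorities:
#   P(i) = p_i**alpha / sum_j p_j**alpha
#   w_i  = (N * P(i))**(-beta), normalized by max_i w_i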

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [
    SelectedPhaseOnlyDumpMethod(RunPhase.TEST),
    MaxDumpMethod()
]
vis_params.dump_mp4 = False
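# A preset like this is typically completed by wiring the parameters into a graph
# manager, following the same pattern as Example #3. A hedged sketch, assuming the
# preset's usual imports (including BasicRLGraphManager) are in place:
graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=vis_params)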
# DQN params
experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
experience_generating_agent_params.algorithm.discount = 0.99
experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

# NN configuration
experience_generating_agent_params.network_wrappers['main'].learning_rate = 0.00025
experience_generating_agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# ER size
experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()
experience_generating_agent_params.memory.max_size = \
    (MemoryGranularity.Transitions,
     experience_generating_schedule_params.heatup_steps.num_steps +
     experience_generating_schedule_params.improve_steps.num_steps)

# E-Greedy schedule
experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)
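# LinearSchedule(1.0, 0.01, 10000) anneals epsilon linearly from 1.0 to 0.01 over the
# first 10,000 schedule updates and then holds it at 0.01. Illustrative sketch of that
# behavior (not rl_coach's class):
def linear_epsilon(step, initial=1.0, final=0.01, decay_steps=10000):
    fraction = min(step / decay_steps, 1.0)
    return initial + fraction * (final - initial)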

################
#  Environment #
################
env_params = GymVectorEnvironment(level='CartPole-v0')

########