Exemple #1
0
def train_on_pure_random(env_params, n_epochs, dataset_size):
    tf.reset_default_graph(
    )  # just to clean things up; only needed for the tutorial

    schedule_params = set_schedule_params(n_epochs, dataset_size)
    agent_params = set_agent_params(DDQNAgentParameters)

    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(
            dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30)
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
    return
Exemple #2
0
def train_on_csv_file(csv_file, n_epochs, dataset_size, obs_dim, act_dim):
    tf.reset_default_graph(
    )  # just to clean things up; only needed for the tutorial

    schedule_params = set_schedule_params(n_epochs, dataset_size)

    #########
    # Agent #
    #########
    # note that we have moved to BCQ, which will help the training to converge better and faster
    agent_params = set_agent_params(DDQNBCQAgentParameters)
    # additional setting for DDQNBCQAgentParameters agent parameters
    # can use either a kNN or a NN based model for predicting which actions not to max over in the bellman equation
    # agent_params.algorithm.action_drop_method_parameters = KNNParameters()
    agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters(
    )

    DATATSET_PATH = csv_file
    agent_params.memory.load_memory_from_file_path = CsvDataset(
        DATATSET_PATH, is_episodic=True)

    spaces = SpacesDefinition(state=StateSpace(
        {'observation': VectorObservationSpace(shape=obs_dim)}),
                              goal=None,
                              action=DiscreteActionSpace(act_dim),
                              reward=RewardSpace(1))

    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        env_params=None,
        spaces_definition=spaces,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(
            dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.4)
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
    return
Exemple #3
0
# ER - we'll be needing an episodic replay buffer for off-policy evaluation
agent_params.memory = EpisodicExperienceReplayParameters()

# E-Greedy schedule - there is no exploration in Batch RL. Disabling E-Greedy. 
agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)
agent_params.exploration.evaluation_epsilon = 0

# can use either a kNN or a NN based model for predicting which actions not to max over in the bellman equation
#agent_params.algorithm.action_drop_method_parameters = KNNParameters()


DATATSET_PATH = 'acrobot_dataset.csv'
agent_params.memory = EpisodicExperienceReplayParameters()
agent_params.memory.load_memory_from_file_path = CsvDataset(DATATSET_PATH, is_episodic = True)

spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=6)}),
                          goal=None,
                          action=DiscreteActionSpace(3),
                          reward=RewardSpace(1))

graph_manager = BatchRLGraphManager(agent_params=agent_params,
                                    env_params=None,
                                    spaces_definition=spaces,
                                    schedule_params=schedule_params,
                                    vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
                                    reward_model_num_epochs=30,
                                    train_to_eval_ratio=0.4)
graph_manager.create_graph(task_parameters)
graph_manager.improve()
Exemple #4
0
# can use either a kNN or a NN based model for predicting which actions not to max over in the bellman equation
agent_params.algorithm.action_drop_method_parameters = KNNParameters()

#################
# Visualization #
#################

vis_params = VisualizationParameters()
vis_params.dump_gifs = True

################
#  Environment #
################
env_params = GymVectorEnvironment(level='CartPole-v0')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BatchRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=vis_params,
    preset_validation_params=preset_validation_params)
     experience_generating_schedule_params.improve_steps.num_steps + 1)

# E-Greedy schedule
experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, DATASET_SIZE)
experience_generating_agent_params.exploration.evaluation_epsilon = 0


################
#  Environment #
################
env_params = GymVectorEnvironment(level='MountainCar-v0')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 50
preset_validation_params.read_csv_tries = 500

graph_manager = BatchRLGraphManager(agent_params=agent_params,
                                    experience_generating_agent_params=experience_generating_agent_params,
                                    experience_generating_schedule_params=experience_generating_schedule_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
                                    preset_validation_params=preset_validation_params,
                                    reward_model_num_epochs=30,
                                    train_to_eval_ratio=0.4)
spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=4)}),
                          goal=None,
                          action=DiscreteActionSpace(2),
                          reward=RewardSpace(1))

#################
# Visualization #
#################

vis_params = VisualizationParameters()
vis_params.dump_gifs = True

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250


graph_manager = BatchRLGraphManager(agent_params=agent_params,
                                    env_params=None,
                                    spaces_definition=spaces,
                                    schedule_params=schedule_params,
                                    vis_params=vis_params,
                                    reward_model_num_epochs=30,
                                    train_to_eval_ratio=0.4,
                                    preset_validation_params=preset_validation_params)
Exemple #7
0
def train_using_experience_agent(env_params, n_epochs, dataset_size):
    tf.reset_default_graph(
    )  # just to clean things up; only needed for the tutorial

    # Experience Generating Agent parameters
    experience_generating_agent_params = DDQNAgentParameters()
    # schedule parameters
    experience_generating_schedule_params = ScheduleParameters()
    experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
    experience_generating_schedule_params.improve_steps = TrainingSteps(
        dataset_size -
        experience_generating_schedule_params.heatup_steps.num_steps)
    experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(
        10)
    experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(
        1)

    # DQN params
    experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
        100)
    experience_generating_agent_params.algorithm.discount = 0.99
    experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(
        1)

    # NN configuration
    experience_generating_agent_params.network_wrappers[
        'main'].learning_rate = 0.0001
    experience_generating_agent_params.network_wrappers[
        'main'].batch_size = 128
    experience_generating_agent_params.network_wrappers[
        'main'].replace_mse_with_huber_loss = False
    experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # ER size
    experience_generating_agent_params.memory = EpisodicExperienceReplayParameters(
    )
    experience_generating_agent_params.memory.max_size = \
        (MemoryGranularity.Transitions,
         experience_generating_schedule_params.heatup_steps.num_steps +
         experience_generating_schedule_params.improve_steps.num_steps)

    # E-Greedy schedule
    experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(
        1.0, 0.01, DATASET_SIZE)
    experience_generating_agent_params.exploration.evaluation_epsilon = 0

    schedule_params = set_schedule_params(n_epochs, dataset_size)
    # set the agent params as before
    # agent_params = set_agent_params(DDQNAgentParameters)
    agent_params = set_agent_params(DDQNBCQAgentParameters)
    agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters(
    )

    # 50 epochs of training (the entire dataset is used each epoch)
    # schedule_params.improve_steps = TrainingSteps(50)

    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        experience_generating_agent_params=experience_generating_agent_params,
        experience_generating_schedule_params=
        experience_generating_schedule_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(
            dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.5)
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
    return