env = make_multi_env(env_init=env_init, nb_env=int(args.nb_env))

tp = TrainingParam()

# NN training
tp.lr = 1e-5
tp.lr_decay_steps = 300000
tp.minibatch_size = 32 * int(args.nb_env)
tp.update_freq = tp.minibatch_size / 2

# limit the number of time steps played per scenario
tp.step_increase_nb_iter = None  # None to deactivate it
tp.min_iter = None
tp.update_nb_iter = None  # once 100 scenarios are solved, increase by "step_increase_nb_iter"

# oversampling hard scenarios
tp.oversampling_rate = None  # None to deactivate it

# experience replay
tp.buffer_size = 1000000

# just observe the data for a while
tp.min_observe = None  # int(10000)

# e greedy
tp.min_observation = 128
tp.initial_epsilon = 0.2
tp.final_epsilon = 1. / 288.
tp.step_for_final_epsilon = int(1e5)

# TODO add the "I don't do anything for a few time steps at the beginning of the training"

# don't always start at the same hour (if not None), otherwise random sampling, see docs
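In this multi-environment configuration the minibatch size and the update frequency both scale with the number of parallel environments. A minimal arithmetic sketch, assuming args.nb_env = 8 (an example value, not fixed by the snippet above):

nb_env = 8                          # hypothetical value for args.nb_env
minibatch_size = 32 * nb_env        # -> 256 samples per gradient step
update_freq = minibatch_size / 2    # -> one training update every 128 collected steps
print(minibatch_size, update_freq)  # 256 128.0

With nb_env = 8 these are the same 256 / 128 values hard-coded in the single-environment configuration below.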
tp = TrainingParam()

# NN training
tp.lr = 1e-4
tp.lr_decay_steps = 30000
tp.minibatch_size = 256
tp.update_freq = 128

# limit the number of time steps played per scenario
tp.step_increase_nb_iter = 2
tp.min_iter = 10
tp.update_nb_iter(2)

# oversampling hard scenarios
tp.oversampling_rate = 3

# experience replay
tp.buffer_size = 1000000

# e greedy
tp.min_observation = 10000
tp.initial_epsilon = 0.4
tp.final_epsilon = 1. / (2 * 7 * 288.)
tp.step_for_final_epsilon = int(1e5)

# don't always start at the same hour (if not None), otherwise random sampling, see docs
tp.random_sample_datetime_start = None

# saving, logging etc.
tp.save_model_each = 10000
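A TrainingParam instance like this is meant to be handed to one of the baselines' train functions. A minimal usage sketch, assuming the DeepQSimple baseline from l2rpn_baselines and that its train function accepts a training_param keyword (the exact argument names may differ between l2rpn_baselines versions, so check the installed signature before relying on it):

import grid2op
from l2rpn_baselines.utils import TrainingParam
from l2rpn_baselines.DeepQSimple import train

env = grid2op.make("l2rpn_case14_sandbox")  # any grid2op environment can be used here

tp = TrainingParam()
tp.lr = 1e-4
tp.minibatch_size = 256
# ... set the remaining attributes as above ...

# assumption: train() forwards training_param to the agent it builds
train(env,
      name="DeepQSimple",
      iterations=100000,
      save_path="./saved_models",
      logs_dir="./logs",
      training_param=tp)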