def load_leader_weights(leader_path, epoch, _outdir):
    resume_path = leader_path + epoch
    weights_path = resume_path + '.h5'
    monitor_path = resume_path
    params_json = resume_path + '.json'

    # Restore the hyperparameters the leader network was trained with.
    with open(params_json) as file:
        d = json.load(file)
        learnStart = d.get('learnStart')
        learningRate = d.get('learningRate')
        discountFactor = d.get('discountFactor')
        memorySize = d.get('memorySize')
        network_inputs = d.get('network_inputs')
        network_outputs = d.get('network_outputs')
        network_structure = d.get('network_structure')

    # Rebuild the leader Q-network and load the saved weights.
    leader_deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                               discountFactor, learningRate, learnStart)
    leader_deepQ.initNetworks(network_structure)
    leader_deepQ.loadWeights(weights_path)

    # Refresh the monitor directory with the saved monitoring data.
    clear_monitor_files(_outdir)
    copy_tree(monitor_path, _outdir)
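# --- Sketch (not part of the original code): the checkpoint layout load_leader_weights
# expects is '<leader_path><epoch>.h5' (weights), '<leader_path><epoch>.json'
# (hyperparameters) and a monitor directory at '<leader_path><epoch>'. The helper below
# only checks that those artifacts exist before attempting a resume; the path and epoch
# in the usage comment are hypothetical.
import os

def leader_checkpoint_exists(leader_path, epoch):
    resume_path = leader_path + epoch
    return (os.path.isfile(resume_path + '.h5')
            and os.path.isfile(resume_path + '.json')
            and os.path.isdir(resume_path))

# e.g. leader_checkpoint_exists('/tmp/model/leader_dqn_ep', '300')  # hypothetical paths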
    epochs = 10000
    steps = 100
    updateTargetNetwork = 10000
    explorationRate = 1
    minibatch_size = 64
    learnStart = 64
    learningRate = 0.00025
    discountFactor = 0.99
    memorySize = 1000000
    # network_inputs = 100
    network_inputs = 3
    network_outputs = 21
    network_structure = [300, 300]
    current_epoch = 0

    deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                        discountFactor, learningRate, learnStart)
    deepQ.initNetworks(network_structure)
else:
    # Load weights, monitor info and parameter info.
    # TODO: add try/except around this else branch.
    with open(params_json) as outfile:
        d = json.load(outfile)
        epochs = d.get('epochs')
        steps = d.get('steps')
        updateTargetNetwork = d.get('updateTargetNetwork')
        explorationRate = d.get('explorationRate')
        minibatch_size = d.get('minibatch_size')
        learnStart = d.get('learnStart')
        learningRate = d.get('learningRate')
        discountFactor = d.get('discountFactor')
        memorySize = d.get('memorySize')
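# --- Sketch (assumption, not project code): the resume branch above reads hyperparameters
# back with d.get(key), which returns None for any key missing from the JSON instead of
# raising. The round-trip below shows that behavior; the file path is hypothetical.
import json

_params = {'epochs': 10000, 'steps': 100, 'learningRate': 0.00025}
with open('/tmp/example_params.json', 'w') as _f:   # hypothetical path
    json.dump(_params, _f)
with open('/tmp/example_params.json') as _f:
    _d = json.load(_f)
print(_d.get('learningRate'))   # 0.00025
print(_d.get('epsilon_decay'))  # None -- an absent key comes back silently as None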
def start(self):
    ns = '/burger/'
    task_and_robot_environment_name = rospy.get_param(
        ns + "task_and_robot_environment_name")
    env = gym.make(task_and_robot_environment_name)

    filepath = os.path.dirname(os.path.abspath(__file__))
    outdir = filepath + '/model/gazebo_gym_experiments/'
    path = filepath + '/model/burger_war_dqn_ep'
    dt_now = datetime.datetime.now()
    resultpath = filepath + '/model/' + dt_now.strftime(
        '%Y-%m-%d-%H:%M:%S') + '.csv'

    resume_epoch = rospy.get_param(
        ns + "resume_epoch")  # change to the epoch to continue from
    resume_path = path + resume_epoch
    weights_path = resume_path + '.h5'
    monitor_path = resume_path
    params_json = resume_path + '.json'

    if resume_epoch == "0":
        # Each time we take a sample and update our weights it is called a mini-batch.
        # Each time we run through the entire dataset, it's called an epoch.
        # PARAMETER LIST
        save_interval = rospy.get_param(ns + "save_interval")
        epochs = rospy.get_param(ns + "epochs")
        steps = rospy.get_param(ns + "steps")
        explorationRate = rospy.get_param(ns + "explorationRate")
        minibatch_size = rospy.get_param(ns + "minibatch_size")
        learningRate = rospy.get_param(ns + "learningRate")
        discountFactor = rospy.get_param(ns + "discountFactor")
        memorySize = rospy.get_param(ns + "memorySize")
        updateTargetNetwork = rospy.get_param(ns + "updateTargetNetwork")
        learnStart = rospy.get_param(ns + "learnStart")
        network_inputs = rospy.get_param(ns + "network_inputs")
        network_outputs = rospy.get_param(ns + "network_outputs")
        network_structure = rospy.get_param(ns + "network_structure")
        current_epoch = 0
        epsilon_decay = rospy.get_param(ns + "epsilon_decay")
        vel_max_x = rospy.get_param(ns + "vel_max_x")
        vel_min_x = rospy.get_param(ns + "vel_min_x")
        vel_max_z = rospy.get_param(ns + "vel_max_z")

        deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                            discountFactor, learningRate, learnStart)
        deepQ.initNetworks(network_structure)
    else:
        # Load weights, monitor info and parameter info.
        # TODO: add try/except around this else branch.
        with open(params_json) as outfile:
            d = json.load(outfile)
            save_interval = d.get('save_interval')
            epochs = d.get('epochs')
            steps = d.get('steps')
            if self.runMode == 'test':
                explorationRate = 0
            else:
                explorationRate = d.get('explorationRate')
            minibatch_size = d.get('minibatch_size')
            learnStart = d.get('learnStart')
            discountFactor = d.get('discountFactor')
            memorySize = d.get('memorySize')
            updateTargetNetwork = d.get('updateTargetNetwork')
            learningRate = d.get('learningRate')
            network_inputs = d.get('network_inputs')
            network_outputs = d.get('network_outputs')
            network_structure = d.get('network_structure')
            current_epoch = d.get('current_epoch')
            epsilon_decay = d.get('epsilon_decay')
            vel_max_x = d.get('vel_max_x')
            vel_min_x = d.get('vel_min_x')
            vel_max_z = d.get('vel_max_z')

        deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                            discountFactor, learningRate, learnStart)
        deepQ.initNetworks(network_structure)
        deepQ.loadWeights(weights_path)

    if not os.path.exists(outdir):
        os.makedirs(outdir)
    self.clear_monitor_files(outdir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)
    copy_tree(monitor_path, outdir)

    env._max_episode_steps = steps  # env returns done after _max_episode_steps
    prod = False
    if self.runMode == 'test':
        prod = True
    env = gym.wrappers.Monitor(env, outdir, force=not prod, resume=prod)

    lastScores = [0] * save_interval
    lastScoresIndex = 0
    lastFilled = False
    stepCounter = 0
    start_time = time.time()

    # Start iterating from 'current_epoch'.
    env.set_mode(self.side, self.runMode, self.collisionMode,
                 network_outputs, vel_max_x, vel_min_x, vel_max_z)

    for epoch in xrange(current_epoch + 1, epochs + 1, 1):
        observation = env.reset()
        cumulated_reward = 0
        done = False
        episode_step = 0

        # run until env returns done
        while not done:
            # env.render()
            qValues = deepQ.getQValues(observation)
            action = deepQ.selectAction(qValues, explorationRate)
            newObservation, reward, done, info = env.step(action)
            cumulated_reward += reward

            deepQ.addMemory(observation, action, reward, newObservation, done)

            if (self.runMode != 'test') and (stepCounter >= learnStart):
                if stepCounter <= updateTargetNetwork:
                    deepQ.learnOnMiniBatch(minibatch_size, False)
                else:
                    deepQ.learnOnMiniBatch(minibatch_size, True)

            observation = newObservation

            if done:
                lastScores[lastScoresIndex] = episode_step
                lastScoresIndex += 1
                if lastScoresIndex >= save_interval:
                    lastFilled = True
                    lastScoresIndex = 0
                m, s = divmod(int(time.time() - start_time), 60)
                h, m = divmod(m, 60)
                if not lastFilled:
                    print("EP " + str(epoch) + " - " + format(episode_step + 1) +
                          " Episode steps Exploration=" +
                          str(round(explorationRate, 2)))
                else:
                    print("EP " + str(epoch) + " - " + format(episode_step + 1) +
                          " Episode steps - last Steps : " +
                          str(sum(lastScores) / len(lastScores)) +
                          " - Cumulated R: " + str(cumulated_reward) +
                          " Eps=" + str(round(explorationRate, 2)) +
                          " Time: %d:%02d:%02d" % (h, m, s))

                if (epoch % save_interval) == 0:
                    # save model weights and monitoring data every save_interval epochs.
                    deepQ.saveModel(path + str(epoch) + '.h5')
                    env._flush()
                    copy_tree(outdir, path + str(epoch))
                    # save simulation parameters.
                    parameter_keys = [
                        'save_interval', 'epochs', 'steps', 'updateTargetNetwork',
                        'explorationRate', 'minibatch_size', 'learnStart',
                        'learningRate', 'discountFactor', 'memorySize',
                        'network_inputs', 'network_outputs', 'network_structure',
                        'current_epoch', 'epsilon_decay', 'vel_max_x', 'vel_min_x',
                        'vel_max_z'
                    ]
                    parameter_values = [
                        save_interval, epochs, steps, updateTargetNetwork,
                        explorationRate, minibatch_size, learnStart, learningRate,
                        discountFactor, memorySize, network_inputs, network_outputs,
                        network_structure, epoch, epsilon_decay, vel_max_x,
                        vel_min_x, vel_max_z
                    ]
                    parameter_dictionary = dict(
                        zip(parameter_keys, parameter_values))
                    with open(path + str(epoch) + '.json', 'w') as outfile:
                        json.dump(parameter_dictionary, outfile)

                # append one CSV row per finished episode.
                with open(resultpath, mode='a') as f:
                    f.write(str(epoch) + "," + format(episode_step + 1) + "," +
                            str(cumulated_reward) + "," +
                            str(round(explorationRate, 2)) + "," +
                            "%d:%02d:%02d" % (h, m, s) + "\n")

            stepCounter += 1
            if stepCounter % updateTargetNetwork == 0:
                deepQ.updateTargetNetwork()
                print("updating target network")

            episode_step += 1

        if self.runMode == 'train':
            explorationRate *= epsilon_decay  # epsilon decay
            # explorationRate -= (2.0/epochs)
            explorationRate = max(0.05, explorationRate)
        if self.runMode == 'test':
            break

    env.close()
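# --- Sketch (assumption, not project code): the loop above multiplies explorationRate by
# epsilon_decay once per training episode and clamps it at 0.05. The helper below works out
# how many episodes a given decay rate needs to reach that floor; the 0.995 value is
# hypothetical, the real one comes from the '/burger/epsilon_decay' ROS parameter.
import math

def episodes_until_floor(epsilon_decay, start=1.0, floor=0.05):
    # Smallest n such that start * epsilon_decay ** n <= floor.
    return int(math.ceil(math.log(floor / start) / math.log(epsilon_decay)))

print(episodes_until_floor(0.995))  # 598 episodes for a hypothetical decay of 0.995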
def train(env, algorithm, params=None, load=False, loadpath=None, loaditer=None):
    if algorithm == ppo:
        assert args.gym_env == "DubinsCarEnv-v0" or args.gym_env == "PlanarQuadEnv-v0"

        # Initialize policy
        ppo.create_session()
        init_policy = ppo.create_policy('pi', env)
        ppo.initialize()

        if load and loadpath is not None and loaditer is not None:
            # load trained policy
            pi = init_policy
            pi.load_model(loadpath, iteration=loaditer)
            pi.save_model(MODEL_DIR, iteration=0)
        else:
            # init policy
            pi = init_policy
            pi.save_model(MODEL_DIR, iteration=0)

        # init params
        with open(params) as params_file:
            d = json.load(params_file)
            num_iters = d.get('num_iters')
            num_ppo_iters = d.get('num_ppo_iters')
            timesteps_per_actorbatch = d.get('timesteps_per_actorbatch')
            clip_param = d.get('clip_param')
            entcoeff = d.get('entcoeff')
            optim_epochs = d.get('optim_epochs')
            optim_stepsize = d.get('optim_stepsize')
            optim_batchsize = d.get('optim_batchsize')
            gamma = d.get('gamma')
            lam = d.get('lam')
            max_iters = num_ppo_iters

        # record performance data
        overall_perf = list()
        ppo_reward = list()
        ppo_length = list()
        suc_percents = list()
        wall_clock_time = list()

        best_suc_percent = 0
        best_pi = None
        perf_flag = False

        eval_ppo_reward = list()
        eval_suc_percents = list()

        # index for num_iters loop
        i = 1
        while i <= num_iters:
            wall_clock_time.append(time())
            logger.info('overall training iteration %d' % i)
            # each learning step contains "num_ppo_iters" ppo-learning steps.
            # each ppo-learning step == ppo-learning on a single episode.
            # each single episode is a single markov chain which contains many states, actions, rewards.
            pi, ep_mean_length, ep_mean_reward, suc_percent = algorithm.ppo_learn(
                env=env,
                policy=pi,
                timesteps_per_actorbatch=timesteps_per_actorbatch,
                clip_param=clip_param,
                entcoeff=entcoeff,
                optim_epochs=optim_epochs,
                optim_stepsize=optim_stepsize,
                optim_batchsize=optim_batchsize,
                gamma=gamma,
                lam=lam,
                max_iters=max_iters,
                schedule='constant')

            ppo_length.extend(ep_mean_length)
            ppo_reward.extend(ep_mean_reward)
            suc_percents.append(suc_percent)

            # perf_metric = evaluate()
            # overall_perf.append(perf_metric)
            # print('[Overall Iter %d]: perf_metric = %.2f' % (i, perf_metric))

            pi.save_model(MODEL_DIR, iteration=i)
            plot_performance(range(len(ppo_reward)), ppo_reward,
                             ylabel=r'avg reward per ppo-learning step',
                             xlabel='ppo iteration',
                             figfile=os.path.join(FIGURE_DIR, 'ppo_reward'),
                             title='TRAIN')
            plot_performance(
                range(len(suc_percents)),
                suc_percents,
                ylabel=r'overall success percentage per algorithm step',
                xlabel='algorithm iteration',
                figfile=os.path.join(FIGURE_DIR, 'success_percent'),
                title="TRAIN")

            # --- for plotting evaluation perf on success rate using early stopping trick ---
            logger.record_tabular('suc_percent', suc_percent)
            logger.record_tabular('best_suc_percent', best_suc_percent)
            logger.record_tabular('perf_flag', perf_flag)
            logger.dump_tabular()

            if suc_percent > best_suc_percent:
                best_suc_percent = suc_percent
                best_pi = pi
            if suc_percent > 0.6:
                perf_flag = True

            if not perf_flag:  # less timesteps_per_actorbatch to make eval faster.
                _, _, eval_ep_mean_reward, eval_suc_percent = algorithm.ppo_eval(
                    env, pi, timesteps_per_actorbatch // 2,
                    max_iters=5, stochastic=False)
            else:
                _, _, eval_ep_mean_reward, eval_suc_percent = algorithm.ppo_eval(
                    env, best_pi, timesteps_per_actorbatch // 2,
                    max_iters=5, stochastic=False)
            eval_ppo_reward.extend(eval_ep_mean_reward)
            eval_suc_percents.append(eval_suc_percent)

            plot_performance(range(len(eval_ppo_reward)), eval_ppo_reward,
                             ylabel=r'avg reward per ppo-eval step',
                             xlabel='ppo iteration',
                             figfile=os.path.join(FIGURE_DIR, 'eval_ppo_reward'),
                             title='EVAL')
            plot_performance(
                range(len(eval_suc_percents)),
                eval_suc_percents,
                ylabel=r'overall eval success percentage per algorithm step',
                xlabel='algorithm iteration',
                figfile=os.path.join(FIGURE_DIR, 'eval_success_percent'),
                title="EVAL")
            # -------------------------------------------------------------------------------

            # save data accumulated UNTIL iter i
            with open(RESULT_DIR + '/ppo_length_' + 'iter_' + str(i) + '.pickle',
                      'wb') as f1:
                pickle.dump(ppo_length, f1)
            with open(RESULT_DIR + '/ppo_reward_' + 'iter_' + str(i) + '.pickle',
                      'wb') as f2:
                pickle.dump(ppo_reward, f2)
            with open(RESULT_DIR + '/success_percent_' + 'iter_' + str(i) + '.pickle',
                      'wb') as fs:
                pickle.dump(suc_percents, fs)
            with open(RESULT_DIR + '/wall_clock_time_' + 'iter_' + str(i) + '.pickle',
                      'wb') as ft:
                pickle.dump(wall_clock_time, ft)

            # save evaluation data accumulated until iter i
            with open(RESULT_DIR + '/eval_ppo_reward_' + 'iter_' + str(i) + '.pickle',
                      'wb') as f_er:
                pickle.dump(eval_ppo_reward, f_er)
            with open(RESULT_DIR + '/eval_success_percent_' + 'iter_' + str(i) + '.pickle',
                      'wb') as f_es:
                pickle.dump(eval_suc_percents, f_es)

            # increment the algorithm's loop counter
            i += 1

        # plot_performance(range(len(overall_perf)), overall_perf,
        #                  ylabel=r'overall performance per algorithm step',
        #                  xlabel='algorithm iteration',
        #                  figfile=os.path.join(FIGURE_DIR, 'overall_perf'))

        # Overall, we need to plot the time-to-reach for the best policy so far.
        env.close()
        return pi

    elif algorithm == deepq:
        assert args.gym_env == "DubinsCarEnv_dqn-v0" or args.gym_env == "PlanarQuadEnv_dqn-v0"

        # DQN training setup.
        tmp_path = MODEL_DIR + '/ep'
        continue_execution = False
        resume_epoch = '200'
        resume_path = tmp_path + resume_epoch
        weights_path = resume_path + '.h5'
        params_json = resume_path + '.json'

        epochs = steps = updateTargetNetwork = explorationRate = minibatch_size = \
            learnStart = learningRate = discountFactor = memorySize = \
            network_inputs = network_outputs = network_structure = current_epoch = None

        if not continue_execution:
            # Each time we take a sample and update our weights it is called a mini-batch.
            # Each time we run through the entire dataset, it's called an epoch.
            epochs = 1000
            steps = 1000
            updateTargetNetwork = 10000
            explorationRate = 1
            minibatch_size = 128
            learnStart = 64
            learningRate = 0.00025
            discountFactor = 0.99
            memorySize = 1000000
            network_inputs = env.state_dim + env.num_lasers
            # network_outputs = 21
            network_outputs = 25
            network_structure = [300, 300]
            current_epoch = 0

            deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                                discountFactor, learningRate, learnStart)
            deepQ.initNetworks(network_structure)
        else:
            # Load weights and parameter info.
            with open(params_json) as outfile:
                d = json.load(outfile)
                epochs = d.get('epochs')
                steps = d.get('steps')
                updateTargetNetwork = d.get('updateTargetNetwork')
                explorationRate = d.get('explorationRate')
                minibatch_size = d.get('minibatch_size')
                learnStart = d.get('learnStart')
                learningRate = d.get('learningRate')
                discountFactor = d.get('discountFactor')
                memorySize = d.get('memorySize')
                network_inputs = d.get('network_inputs')
                network_outputs = d.get('network_outputs')
                network_structure = d.get('network_structure')
                current_epoch = d.get('current_epoch')

            deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                                discountFactor, learningRate, learnStart)
            deepQ.initNetworks(network_structure)
            deepQ.loadWeights(weights_path)

        env._max_episode_steps = steps

        last100Scores = [0] * 100
        last100ScoresIndex = 0
        last100Filled = False
        stepCounter = 0
        highest_reward = 0
        start_time = time()

        # start iterating from 'current_epoch'.
        for epoch in np.arange(current_epoch + 1, epochs + 1, 1):
            observation = env.reset()
            cumulated_reward = 0
            done = False
            episode_step = 0

            # run until env returns done
            while not done:
                # env.render()
                qValues = deepQ.getQValues(observation)
                action = deepQ.selectAction(qValues, explorationRate)
                newObservation, reward, done, suc, info = env.step(action)
                cumulated_reward += reward
                if highest_reward < cumulated_reward:
                    highest_reward = cumulated_reward

                deepQ.addMemory(observation, action, reward, newObservation, done)

                if stepCounter >= learnStart:
                    if stepCounter <= updateTargetNetwork:
                        deepQ.learnOnMiniBatch(minibatch_size, False)
                    else:
                        deepQ.learnOnMiniBatch(minibatch_size, True)

                observation = newObservation

                if done:
                    last100Scores[last100ScoresIndex] = episode_step
                    last100ScoresIndex += 1
                    if last100ScoresIndex >= 100:
                        last100Filled = True
                        last100ScoresIndex = 0
                    if not last100Filled:
                        print("EP " + str(epoch) + " - " +
                              format(episode_step + 1) + "/" + str(steps) +
                              " Episode steps Exploration=" +
                              str(round(explorationRate, 2)))
                    else:
                        m, s = divmod(int(time() - start_time), 60)
                        h, m = divmod(m, 60)
                        print("EP " + str(epoch) + " - " +
                              format(episode_step + 1) + "/" + str(steps) +
                              " Episode steps - last100 Steps : " +
                              str(sum(last100Scores) / len(last100Scores)) +
                              " - Cumulated R: " + str(cumulated_reward) +
                              " Eps=" + str(round(explorationRate, 2)) +
                              " Time: %d:%02d:%02d" % (h, m, s))

                    if epoch % 100 == 0:
                        # save model weights and monitoring data every 100 epochs.
                        deepQ.saveModel(tmp_path + str(epoch) + '.h5')
                        # save simulation parameters.
                        # convert from numpy int64 type to python int type for json serialization
                        epoch = int(epoch)
                        parameter_keys = [
                            'epochs', 'steps', 'updateTargetNetwork',
                            'explorationRate', 'minibatch_size', 'learnStart',
                            'learningRate', 'discountFactor', 'memorySize',
                            'network_inputs', 'network_outputs',
                            'network_structure', 'current_epoch'
                        ]
                        parameter_values = [
                            epochs, steps, updateTargetNetwork, explorationRate,
                            minibatch_size, learnStart, learningRate,
                            discountFactor, memorySize, network_inputs,
                            network_outputs, network_structure, epoch
                        ]
                        parameter_dictionary = dict(
                            zip(parameter_keys, parameter_values))
                        with open(tmp_path + str(epoch) + '.json', 'w') as outfile:
                            json.dump(parameter_dictionary, outfile)

                stepCounter += 1
                if stepCounter % updateTargetNetwork == 0:
                    deepQ.updateTargetNetwork()
                    print("updating target network")

                episode_step += 1

            explorationRate *= 0.995  # epsilon decay
            # explorationRate -= (2.0/epochs)
            explorationRate = max(0.05, explorationRate)

        env.close()
        return 1
    else:
        raise ValueError("Please input a valid algorithm")
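# --- Sketch (assumption, not project code): why the DQN loop above casts epoch with int()
# before building parameter_dictionary. np.arange yields numpy integer scalars, and the
# stdlib json encoder rejects them, so the checkpoint JSON would fail to serialize without
# the cast.
import json
import numpy as np

_epoch = np.arange(1, 5)[0]                 # numpy integer, as produced by the loop above
try:
    json.dumps({'current_epoch': _epoch})
except TypeError as err:
    print(err)                              # e.g. "Object of type int64 is not JSON serializable"
print(json.dumps({'current_epoch': int(_epoch)}))  # works once cast to a plain Python int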
    epochs = 350
    steps = 1000
    updateTargetNetwork = 5000
    explorationRate = 1
    minibatch_size = 32
    learnStart = 64
    learningRate = 0.00025
    discountFactor = 0.99
    memorySize = 100000
    network_inputs = 9 + 3 * uav_count
    network_outputs = 7
    network_structure = [50, 50]
    current_epoch = 0

    # init and create deepQ
    leader_deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                               discountFactor, learningRate, learnStart)
    leader_deepQ.initNetworks(network_structure)
    follower_deepQ = deepq.DeepQ((network_inputs - 2), network_outputs, memorySize,
                                 discountFactor, learningRate, learnStart)
    follower_deepQ.initNetworks(network_structure)
    leader_deepQ.memory.liveplot = plotter
else:
    # load weights, monitor info and parameter info.
    with open(params_json) as outfile:
        d = json.load(outfile)
        epochs = d.get('epochs')
        steps = d.get('steps')