def main(w_count, success_count):
    env = MyEnv({})
    while True:
        # print(f'step {step}')
        # Select a random action
        # action_index = env.action_space.sample()
        a = np.array([0, 1, 2, 3])
        p = np.array([1, 1, 1, 1])
        # Alternative: nine actions with uniform weights
        # a = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
        # p = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1])
        p = p / np.sum(p)  # normalize weights into a probability distribution
        action_index = np.random.choice(a, p=p)

        # Step the environment once
        observation, reward, done, _ = env.step(action_index)

        if args.render_mode == 'human':
            print(f'\naction is selected at {env.steps}')
            status_print(env, observation, reward, done)

        # Render the environment
        shot = env.render(mode=args.render_mode)

        # Pause with the space key (for debugging)
        pause_for_debug()

        # End-of-episode handling
        if done:
            # print('done')
            w_count, success_count = conunt_results(env, w_count, success_count)
            break

    return w_count, success_count
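# `pause_for_debug` is called above (and in the snippets below) but defined
# elsewhere. A minimal sketch of what it could look like, assuming a pygame
# window where SPACE toggles pause; the implementation details are an
# assumption, not the original helper.
import pygame

def pause_for_debug():
    """Pause the run when SPACE is pressed; resume on the next SPACE."""
    for event in pygame.event.get():
        if event.type == pygame.KEYDOWN and event.key == pygame.K_SPACE:
            paused = True
            while paused:
                for ev in pygame.event.get():
                    if ev.type == pygame.KEYDOWN and ev.key == pygame.K_SPACE:
                        paused = False
                pygame.time.wait(50)  # avoid busy-waiting while paused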
def main():
    # Initialize ray
    ray.init(ignore_reinit_error=True, log_to_driver=False)

    # Generate & check environment
    env = MyEnv({})

    # Define trainer agent
    model_name = MODEL_NAME
    config = ppo.DEFAULT_CONFIG.copy()
    config['env_config'] = {}
    config['num_gpus'] = 0
    config['framework'] = 'tfe'
    config['eager_tracing'] = True

    agent = ppo.PPOTrainer(config=config, env=MyEnv)
    agent.restore(model_name)

    for idx in range(90):
        """ Initialization """
        observation = env.reset()
        frames = []

        """ Save some initial values """
        fighter_0 = env.fighter.ingress
        jammer_0 = env.jammer.ingress

        while True:
            action_index = agent.compute_action(observation)

            # Step the environment once
            observation, reward, done, _ = env.step(action_index)

            # Render the environment and record a video frame
            # shot = env.render(mode=args.render_mode)
            frames.append(env.render(mode=args.render_mode))

            # Pause with the space key (for debugging)
            pause_for_debug()

            # Slow down rendering
            pygame.time.wait(10)

            # End-of-episode handling
            if done:
                status_print(env, observation, reward, done, fighter_0, jammer_0)
                video_name = ALGORITHM + '_' + env.mission_condition + '-' + str(idx)
                make_video(video_name, frames)
                make_jason(env, video_name, fighter_0, jammer_0)
                break
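# `make_video` is defined elsewhere; a minimal sketch of what it might do,
# assuming the frames are H x W x 3 uint8 RGB arrays from
# env.render(mode='rgb_array') and that imageio (with the imageio-ffmpeg
# backend) is installed. The signature simply mirrors the call above.
import imageio
import numpy as np

def make_video(video_name, frames, fps=30):
    """Encode a list of RGB frames into an MP4 file."""
    imageio.mimsave(video_name + '.mp4',
                    [np.asarray(f, dtype=np.uint8) for f in frames],
                    fps=fps)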
MIN_rewards = []
MIN_serveratio = []
MIN_incentives = []
rewards = []

# Number of trials (episodes)
no_episodes = 50
stats = plotting.EpisodeStats(episode_lengths=np.zeros(no_episodes),
                              episode_rewards=np.zeros(no_episodes))

T = 2000
number_of_contents = 10
myenv = MyEnv(density=density, T=T, number_of_contents=number_of_contents)

if RL is False:
    RL = DeepQNetwork(myenv.no_actions,
                      myenv.observation_length,
                      learning_rate=0.001,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=5000,
                      memory_size=2000,
                      batch_size=220,
                      # output_graph=True
                      )

print("No. vehicles: " + str(myenv.number_of_vehicles))
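# A minimal training-loop sketch for the agent built above, assuming the
# widely used Morvan Zhou-style DeepQNetwork API (choose_action /
# store_transition / learn) and a step() returning (observation_, reward,
# done); both are assumptions, since DeepQNetwork and MyEnv are defined
# outside this snippet.
step_counter = 0
for episode in range(no_episodes):
    observation = myenv.reset()
    done = False
    while not done:
        action = RL.choose_action(observation)           # epsilon-greedy selection
        observation_, reward, done = myenv.step(action)  # assumed return signature
        RL.store_transition(observation, action, reward, observation_)
        if step_counter > 200 and step_counter % 5 == 0:
            RL.learn()                                   # sample a batch, update the network
        observation = observation_
        step_counter += 1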
all_rewards_max = []
all_powers_max = []
all_services_max = []
all_upload_max = []

# Q = None
# myenv = MyEnv(density=density, T=100000)
# print("learning:" + str(myenv.number_of_vehicles))
#
# Q, stats = qLearning(myenv, 1)

cache += 1
for iteration in range(iterations):
    myenv = MyEnv(density=density, T=time, number_of_contents=numbers[cache])
    # myenv.RSU_cache_size = caches[cache]
    # myenv.hit_energy_ratio = ws[cache]
    print("Testing: " + str(myenv.number_of_vehicles))

    # 1) Greedy Algorithm
    myenv.reset()
    start = 0
    myenv.i = start
    for i in range(start, myenv.number_of_vehicles):
        if (i not in myenv.available_contents_to_cache
                and myenv.available[i] not in myenv.RSU_cache):
            myenv.step(1)
        else:
            # Assumption: the source is truncated here; take the no-cache
            # action when the content is already in the RSU cache.
            myenv.step(0)
def main():
    # Initialize ray
    ray.init(ignore_reinit_error=True, log_to_driver=False)

    # Generate & check environment
    env = MyEnv({})

    # Define trainer agent
    model_name = MODEL_NAME
    config = ppo.DEFAULT_CONFIG.copy()
    config['env_config'] = {}
    config['num_workers'] = NUM_WORKERS
    config['num_gpus'] = 0
    config['framework'] = 'tfe'
    config['eager_tracing'] = True

    agent = ppo.PPOTrainer(config=config, env=MyEnv)
    agent.restore(model_name)

    success_history = []
    success_count = 0
    for idx in range(N_EVAL_EPISODES):
        """ Initialization """
        observation = env.reset()
        frames = []

        """ Save some initial values """
        fighter_0 = env.fighter.ingress
        jammer_0 = env.jammer.ingress

        while True:
            action_index = agent.compute_action(observation)

            # Step the environment once
            observation, reward, done, info = env.step(action_index)

            # Render the environment and record a video frame
            # shot = env.render(mode=args.render_mode)
            frames.append(env.render(mode=args.render_mode))

            # Slow down rendering
            # pygame.time.wait(10)

            # End-of-episode handling
            if done:
                success_history.append(info['success'])
                if info['success'] > .5:
                    success_count += 1
                break

    n_success = success_count
    n_fail = N_EVAL_EPISODES - n_success
    if np.sum(success_history) != success_count:
        raise Exception('Something is wrong!')

    """ Summarize results """
    print('==================== Summary of the results ====================')
    print(f'Mission conditions = w1 : w2 : w3 = '
          f'{env.mission_probability[0]:.3f} : {env.mission_probability[1]:.3f} : '
          f'{env.mission_probability[2]:.3f}')
    print(f'   Model is < {MODEL_NAME} >')
    print(f'   Number of successful missions: {round(n_success)} / {N_EVAL_EPISODES}, '
          f'   Number of failed missions: {round(n_fail)} / {N_EVAL_EPISODES}')
import numpy as np
from gym.wrappers import Monitor
from stable_baselines import DQN, DDPG
from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.policies import MlpPolicy
# import time
from myenv import MyEnv

log = 'env/'
env1 = Monitor(MyEnv(8), log, force=True)
env = DummyVecEnv([lambda: env1])
# env = gym.make('CartPole-v1')

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.5) * np.ones(n_actions))

# model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model = DDPG(MlpPolicy, env, verbose=1, tensorboard_log='./log/')
model.learn(total_timesteps=10000)
model.save("ddpg_mountain_8")

# del model  # remove to demonstrate saving and loading
#
# model = DDPG.load("ddpg_mountain")
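# The commented-out lines above hint at reloading the saved model. A minimal
# evaluation sketch, assuming the same MyEnv(8) environment and the standard
# stable-baselines DDPG.load / model.predict API.
from stable_baselines import DDPG
from myenv import MyEnv

model = DDPG.load("ddpg_mountain_8")
env = MyEnv(8)

obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action, _states = model.predict(obs, deterministic=True)  # greedy policy
    obs, reward, done, info = env.step(action)
    total_reward += reward
print(f'episode return: {total_reward}')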
all_rewards_greedy = []
all_powers_greedy = []
all_services_greedy = []
all_rewards_random = []
all_powers_random = []
all_services_random = []
all_rewards_most = []
all_powers_most = []
all_services_most = []
all_rewards_static = []
all_powers_static = []
all_services_static = []

for iteration in range(iterations):
    myenv = MyEnv(density=density)
    myenv.RSU_cache_size = 100

    # 1) Greedy Algorithm
    myenv.reset()
    for i in range(myenv.number_of_vehicles):
        myenv.step(1)

    # Store the best reward for one iteration
    all_rewards_greedy.append(myenv.total_reward)
    all_powers_greedy.append(myenv.total_energy)
    all_services_greedy.append(myenv.total_download / myenv.total_request_amount)

    # 2) Random Algorithm
    myenv.reset()
    # Assumption: the source is truncated here; a binary action set
    # (cache / don't cache) matches how step() is used above.
    actions = [0, 1]
if __name__ == "__main__":
    densities = [0.002]
    for density in densities:
        rewards = []

        # Number of trials (episodes)
        no_episodes = 4000
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(no_episodes),
                                      episode_rewards=np.zeros(no_episodes))

        myenv = MyEnv(density=density)
        print(myenv.number_of_vehicles)

        RL = DeepQNetwork(myenv.no_actions,
                          myenv.number_of_contents + 2,
                          learning_rate=0.01,
                          reward_decay=0.9,
                          e_greedy=0.9,
                          replace_target_iter=200,
                          memory_size=100,
                          # output_graph=True
                          )

        for e in range(no_episodes):
            # Reset the environment
            observation = myenv.reset()
def main():
    # Initialize ray
    ray.init(ignore_reinit_error=True, log_to_driver=False)

    # Define trainer agent
    config = ppo.DEFAULT_CONFIG.copy()
    config['env_config'] = {}
    config['num_gpus'] = 0
    config['num_workers'] = NUM_WORKERS
    config['num_cpus_per_worker'] = 1
    config['framework'] = 'tfe'
    config['eager_tracing'] = True
    # config['model']['fcnet_hiddens'] = [64, 64, 64]
    print(pretty_print(config))

    trainer = ppo.PPOTrainer(config=config,
                             env=MyEnv,
                             logger_creator=custom_log_creator(
                                 os.path.expanduser("./" + PROJECT + "/logs"), TRIAL))
    logdir = trainer.logdir
    print(f'\n********************** logdir = {logdir}\n')

    # Check trainer agent
    policy = trainer.get_policy()
    policy.model.base_model.summary()

    # Define evaluator agent
    eval_env = MyEnv({})
    # obs = eval_env.reset()

    # Train agent
    max_episode = MAX_EPISODE
    eval_freq = EVAL_FREQ
    n_eval_episode = N_EVAL_EPISODE

    best_success_count = -100
    best_checkpoint_dir = os.path.join('./' + PROJECT + '/checkpoints/', TRIAL + '_best')
    success_history = []
    iteration_history = []

    for i in range(max_episode):
        print(f'{i}th iteration is starting.')

        # Training
        result = trainer.train()
        # print(pretty_print(result))

        # Evaluation
        if i % eval_freq == 0:
            print(f'\n--------------- Evaluation results at {i}th iteration ---------------')
            print(pretty_print(result))

            total_return = 0
            success_count = 0
            info = {}
            return_list = []
            for j in range(n_eval_episode):
                # Test the trained agent
                obs = eval_env.reset()
                done = False
                while not done:
                    action = trainer.compute_action(obs)
                    obs, reward, done, info = eval_env.step(action)
                    total_return += reward
                return_list.append(total_return)

                if info['success'] > 0.5:
                    success_count += 1

            print(f'\niteration {i} success_count: {success_count} / {n_eval_episode}')
            # print(f'return list: {return_list}')

            success_history.append(success_count / n_eval_episode)
            iteration_history.append(i)
            success_history_np = np.array(success_history)
            file_name = './' + PROJECT + '/learning_history/trial_' + str(ID)
            # np.savez('./learning_history', iteration_history, success_history)
            np.savez(file_name, iteration_history, success_history)

            if success_count >= best_success_count:
                best_checkpoint = trainer.save(checkpoint_dir=best_checkpoint_dir)
                print(f'best checkpoint saved at {best_checkpoint}\n')
                best_success_count = success_count

            print(f'------------------------------------------------------------\n')
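# Because np.savez is called with positional arrays, the learning history is
# stored under the default keys 'arr_0' (iterations) and 'arr_1' (success
# rate). A minimal plotting sketch, assuming matplotlib and the same
# PROJECT / ID values as above.
import numpy as np
import matplotlib.pyplot as plt

data = np.load('./' + PROJECT + '/learning_history/trial_' + str(ID) + '.npz')
iterations, success_rate = data['arr_0'], data['arr_1']

plt.plot(iterations, success_rate)
plt.xlabel('training iteration')
plt.ylabel('evaluation success rate')
plt.show()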