def main(training=True):
    # Load an environment (opticalTweezers wrapped to stack 4 consecutive frames)
    env = Environment(opticalTweezers(), 4)
    env.reset()

    # State/action placeholders
    States = tf.placeholder(tf.float32, shape=[None, 4, 6], name='States')
    Actions = tf.placeholder(tf.int32, shape=[None], name='Actions')
    Rewards = tf.placeholder(tf.float32, shape=[None, 1], name='Rewards')
    Advantages = tf.placeholder(tf.float32, shape=[None, 1], name='Advantages')
    Entropy_coefficient = tf.placeholder(tf.float32, shape=(), name='Entropy_coefficient')

    # Load a saved model if one exists, otherwise build a fresh actor/critic.
    # NOTE: the loading branch is unimplemented; as written, actor and critic
    # are only defined when no saved model is found.
    if os.path.isfile(os.path.join(os.getcwd(), 'model')):
        # load model
        pass
    else:
        max_grad_norm = 0.5
        actor = Actor(States, Actions, Advantages, Rewards,
                      Entropy_coefficient, max_grad_norm)
        critic = Critic(Rewards, actor)

    if training:
        train_model2(env, actor, critic, States, Actions, Rewards,
                     Advantages, Entropy_coefficient)
    else:
        pass
def main(args=None):
    # Parse arguments (args.type, args.env and args.consecutive_frames are
    # used unconditionally below, so parse them like the other entry points)
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    summary_writer = tf.summary.FileWriter(args.type + "/tensorboard_" + args.env)

    env = Environment(gym.make('LunarLanderContinuous-v2'), args.consecutive_frames)
    env.reset()
    state_dim = env.get_state_size()
    action_space = gym.make(args.env).action_space
    action_dim = action_space.high.shape[0]
    act_range = action_space.high

    algo = DDPG(action_dim, state_dim, act_range, args.consecutive_frames)
    stats = algo.train(env, args, summary_writer)

    df = pd.DataFrame(np.array(stats))
    df.to_csv(args.type + "/logs.csv",
              header=['Episode', 'Mean', 'Stddev'],
              float_format='%10.5f')

    # Save weights and close environments
    exp_dir = 'models/'
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    algo.save_weights(exp_dir)
    env.env.close()
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_session(get_session())

    # Environment Initialization
    if args.is_atari:
        # Atari Environment Wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
    elif args.type == "DDPG":
        # Continuous Environments Wrapper
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_space = gym.make(args.env).action_space
        action_dim = action_space.high.shape[0]
        act_range = action_space.high
    else:
        # Standard Environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n

    # Pick algorithm and load pre-trained weights
    if args.type == "DDQN":
        algo = DDQN(action_dim, state_dim, args)
        algo.load_weights(args.model_path)
    elif args.type == "A2C":
        algo = A2C(action_dim, state_dim, args.consecutive_frames)
        algo.load_weights(args.actor_path, args.critic_path)
    elif args.type == "A3C":
        algo = A3C(action_dim, state_dim, args.consecutive_frames,
                   is_atari=args.is_atari)
        algo.load_weights(args.actor_path, args.critic_path)
    elif args.type == "DDPG":
        algo = DDPG(action_dim, state_dim, act_range, args.consecutive_frames)
        algo.load_weights(args.actor_path, args.critic_path)

    # Display agent (renamed the step counter to `t` to avoid shadowing the
    # `time` module used elsewhere in these scripts)
    old_state, t = env.reset(), 0
    while True:
        env.render()
        a = algo.policy_action(old_state)
        old_state, r, done, _ = env.step(a)
        t += 1
        if done:
            old_state = env.reset()  # restart from the new initial state
    env.env.close()
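# The entry points above all rely on a parse_args() helper that is not shown.
# Below is a minimal sketch of what it could look like, reconstructed only
# from the flags actually referenced in these scripts (--type, --env,
# --consecutive_frames, --n_threads, ...). All default values are
# illustrative assumptions, not the original settings.
def parse_args(args):
    parser = argparse.ArgumentParser(description='Training parameters')
    parser.add_argument('--type', type=str, default='A3C',
                        help='Algorithm to train: DDQN, A2C, A3C or DDPG')
    parser.add_argument('--env', type=str, default='CartPole-v1',
                        help='OpenAI Gym environment id')
    parser.add_argument('--consecutive_frames', type=int, default=4,
                        help='Number of consecutive frames stacked into one state')
    parser.add_argument('--n_threads', type=int, default=8,
                        help='Number of A3C worker threads')
    parser.add_argument('--nb_episodes', type=int, default=5000,
                        help='Number of training episodes')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Mini-batch size')
    parser.add_argument('--training_interval', type=int, default=30,
                        help='Network training frequency (in steps)')
    parser.add_argument('--gather_stats', action='store_true',
                        help='Compute and export training statistics')
    parser.add_argument('--render', action='store_true',
                        help='Render the environment during training')
    parser.add_argument('--is_atari', action='store_true',
                        help='Use the Atari environment wrapper')
    parser.add_argument('--gpu', type=str, default='',
                        help='GPU id to expose via CUDA_VISIBLE_DEVICES')
    parser.add_argument('--model_path', type=str, default=None,
                        help='Pre-trained DDQN weights (.h5)')
    parser.add_argument('--actor_path', type=str, default=None,
                        help='Pre-trained actor weights (.h5)')
    parser.add_argument('--critic_path', type=str, default=None,
                        help='Pre-trained critic weights (.h5)')
    return parser.parse_args(args)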
def main():
    args = Arg()
    env = Environment(gym.make(args.env), args.consecutive_frames)
    env.reset()
    state_dim = env.get_state_size()
    action_dim = gym.make(args.env).action_space.n

    algo = A3C(action_dim, state_dim, args.consecutive_frames,
               is_atari=args.is_atari)
    set_session(get_session())
    summary_writer = tf.summary.FileWriter("./tensorboard_" + args.env)

    stats = algo.train(env, args, summary_writer)
    print(stats)
    algo.save_weights('./' + args.env + '.h5')
    env.env.close()
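# get_session()/set_session() appear throughout these scripts but are not
# defined here. A minimal sketch under TF 1.x, assuming the intent is the
# usual Keras-backend pattern: a session with dynamic GPU memory growth.
def get_session():
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True  # claim GPU memory on demand
    return tf.Session(config=config)

# Typical usage, with set_session imported from the Keras TF backend:
#   from keras.backend.tensorflow_backend import set_session
#   set_session(get_session())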
def train(self, args, summary_writer):
    # Instantiate one environment per thread
    envs = [Environment(gym.make(args.env), args.consecutive_frames)
            for i in range(args.n_threads)]
    [e.reset() for e in envs]
    state_dim = envs[0].get_state_size()
    action_dim = gym.make(args.env).action_space.n

    # Create threads
    tqdm_e = tqdm(range(int(args.nb_episodes)), desc='Score',
                  leave=True, unit=" episodes")
    threads = [threading.Thread(
        target=training_thread,
        daemon=True,
        args=(self, args.nb_episodes, envs[i], action_dim,
              args.training_interval, summary_writer, tqdm_e, args.render))
        for i in range(args.n_threads)]

    for t in threads:
        t.start()
        time.sleep(0.5)
    try:
        [t.join() for t in threads]
    except KeyboardInterrupt:
        print("Exiting all threads...")
    return None
def train(self, env, args, summary_writer):
    # Instantiate one environment per thread
    if args.is_atari:
        envs = [AtariEnvironment(args) for i in range(args.n_threads)]
        state_dim = envs[0].get_state_size()
        action_dim = envs[0].get_action_size()
    else:
        envs = [Environment(gym.make(args.env), args.consecutive_frames)
                for i in range(args.n_threads)]
        [e.reset() for e in envs]
        state_dim = envs[0].get_state_size()
        action_dim = gym.make(args.env).action_space.n

    # Create threads
    factor = 100.0 / args.nb_episodes
    tqdm_e = tqdm(range(args.nb_episodes), desc='Score',
                  leave=True, unit=" episodes")
    threads = [threading.Thread(
        target=training_thread,
        args=(self, args.nb_episodes, envs[i], action_dim,
              args.training_interval, summary_writer, tqdm_e, factor))
        for i in range(args.n_threads)]

    for t in threads:
        t.start()
        time.sleep(1)
    [t.join() for t in threads]
    return None
def train(self, env, args, summary_writer):
    # Instantiate one environment per thread
    if args.is_ai2thor:
        config_dict = {'max_episode_length': 500}
        envs = [AI2ThorEnv(config_dict=config_dict)
                for i in range(args.n_threads)]
        env.reset()
        state = envs[0].reset()
        state_dim = state.shape
        action_dim = envs[0].action_space.n
    elif args.is_atari:
        envs = [AtariEnvironment(args) for i in range(args.n_threads)]
        state_dim = envs[0].get_state_size()
        action_dim = envs[0].get_action_size()
    else:
        envs = [Environment(gym.make(args.env), args.consecutive_frames)
                for i in range(args.n_threads)]
        [e.reset() for e in envs]
        state_dim = envs[0].get_state_size()
        action_dim = gym.make(args.env).action_space.n

    # Create threads
    tqdm_e = tqdm(range(int(args.nb_episodes)), desc='Score',
                  leave=True, unit=" episodes")
    threads = [threading.Thread(
        target=training_thread,
        daemon=True,
        args=(self, args.nb_episodes, envs[i], action_dim,
              args.training_interval, summary_writer, tqdm_e, args.render))
        for i in range(args.n_threads)]

    for t in threads:
        t.start()
        time.sleep(0.5)
    try:
        [t.join() for t in threads]
    except KeyboardInterrupt:
        print("Exiting all threads...")
    return None
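# training_thread() is the worker target used by the threaded train() methods
# above but is not shown. The sketch below is an assumption about the usual
# A3C worker loop, reconstructed from the arguments passed at the call sites;
# agent.policy_action() mirrors how the agent is used elsewhere in these
# scripts, while agent.train_models() and the to_categorical action encoding
# (from keras.utils) are assumed details, not confirmed by the source.
def training_thread(agent, nb_episodes, env, action_dim,
                    training_interval, summary_writer, tqdm_e, render):
    for _ in range(nb_episodes):
        t, cumul_reward, done = 0, 0, False
        old_state = env.reset()
        actions, states, rewards = [], [], []
        while not done:
            if render:
                env.render()
            # Sample an action from the current policy
            a = agent.policy_action(old_state)
            new_state, r, done, _ = env.step(a)
            actions.append(to_categorical(a, action_dim))
            rewards.append(r)
            states.append(old_state)
            old_state = new_state
            cumul_reward += r
            t += 1
            # Update actor and critic on the gathered mini-batch every
            # training_interval steps, and at episode end
            if t % training_interval == 0 or done:
                agent.train_models(states, actions, rewards, done)
                actions, states, rewards = [], [], []
        tqdm_e.set_description("Score: " + str(cumul_reward))
        tqdm_e.update(1)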
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_session(get_session())

    # Environment Initialization
    if args.is_ai2thor:
        config_dict = {'max_episode_length': 2000}
        env = AI2ThorEnv(config_dict=config_dict)
        state = env.reset()  # single reset; the original reset twice
        state_dim = state.shape
        action_dim = env.action_space.n
    elif args.is_atari:
        # Atari Environment Wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
    else:
        # Standard Environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n

    algo = A3C(action_dim, state_dim, args.consecutive_frames,
               is_atari=args.is_atari, is_ai2thor=args.is_ai2thor)
    algo.load_weights(args.actor_path, args.critic_path)

    # Display agent
    old_state, t = env.reset(), 0
    while True:
        a = algo.policy_action(old_state)
        old_state, r, done, _ = env.step(a)
        t += 1
        if done:
            print('----- done, resetting env -----')
            old_state = env.reset()
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    if args.wandb:
        wandb.init(entity=args.wandb_id, project=args.wandb_project)

    # Environment Initialization
    if args.is_atari:
        # Atari Environment Wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
    elif args.type == "DDPG":
        # Continuous Environments Wrapper
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_space = gym.make(args.env).action_space
        action_dim = action_space.high.shape[0]
        act_range = action_space.high
    else:
        # Standard Environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n

    # Pick algorithm to train
    if args.type == "DDQN":
        algo = DDQN(action_dim, state_dim, args)
    elif args.type == "A2C":
        algo = A2C(action_dim, state_dim, args.consecutive_frames)
    elif args.type == "A3C":
        algo = A3C(action_dim, state_dim, args.consecutive_frames,
                   is_atari=args.is_atari)
    elif args.type == "DDPG":
        algo = DDPG(action_dim, state_dim, act_range, args.consecutive_frames)

    # Train; stats rows are (episode, mean, stddev)
    stats = algo.train(env, args)

    # Export results to CSV
    if args.gather_stats:
        df = pd.DataFrame(np.array(stats))
        df.to_csv(args.type + "/logs.csv",
                  header=['Episode', 'Mean', 'Stddev'],
                  float_format='%10.5f')

    # Save weights and close environments
    exp_dir = '{}/models/'.format(args.type)
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    export_path = '{}{}_ENV_{}_NB_EP_{}_BS_{}'.format(exp_dir, args.type,
                                                      args.env,
                                                      args.nb_episodes,
                                                      args.batch_size)
    algo.save_weights(export_path)
    env.env.close()
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_session(get_session())

    # The original format string logged M1 twice; M2 is used here, matching
    # the model directory name below
    summary_writer = tf.summary.FileWriter(
        "{}/tensorboard_M1_{}_M2_{}_snr1_{}_snr2_{}".format(
            args.out_dir, args.M1, args.M2, args.snr_M1, args.snr_M2))

    # Initialize the wireless environment
    users_env = UsersEnvCluster(args.M1, args.M2, args.snr_M1, args.snr_M2,
                                fixed_channel=False)
    print(users_env)

    # Wrap the environment to use consecutive frames
    env = Environment(users_env, args.consecutive_frames)
    env.reset()

    # Define parameters for the DDQN and DDPG algorithms
    state_dim = env.get_state_size()
    action_dim = users_env.action_dim
    act_range = 1
    act_min = 0

    # Initialize the DDQN algorithm for the clustering optimization
    n_clusters = users_env.n_clusters
    algo_clustering = DDQN(n_clusters, state_dim, args)

    # Initialize the DDPG algorithm for the beamforming optimization
    algo = DDPG(action_dim, state_dim, act_range, act_min,
                args.consecutive_frames, algo_clustering,
                episode_length=args.episode_length)

    if args.step == "train":
        # Train
        stats = algo.train(env, args, summary_writer)

        # Export results to CSV
        if args.gather_stats:
            df = pd.DataFrame(np.array(stats))
            df.to_csv(args.out_dir + "/logs.csv",
                      header=['Episode', 'Mean', 'Stddev'],
                      float_format='%10.5f')

        # Save weights and close environments
        exp_dir = '{}/models_M1_{}_M2_{}_snr1_{}_snr2_{}/'.format(
            args.out_dir, args.M1, args.M2, args.snr_M1, args.snr_M2)
        if not os.path.exists(exp_dir):
            os.makedirs(exp_dir)

        # Save DDPG
        export_path = '{}_{}_NB_EP_{}_BS_{}'.format(exp_dir, "DDPG",
                                                    args.nb_episodes,
                                                    args.batch_size)
        algo.save_weights(export_path)

        # Save DDQN
        export_path = '{}_{}_NB_EP_{}_BS_{}'.format(exp_dir, "DDQN",
                                                    args.nb_episodes,
                                                    args.batch_size)
        algo.ddqn_clustering.save_weights(export_path)

    elif args.step == "inference":
        print("Loading the DDPG networks (actor and critic) and the DDQN policy network ...")
        path_actor = '<add the path of the .h5 file of the DDPG actor>'
        path_critic = '<add the path of the .h5 file of the DDPG critic>'
        path_ddqn = '<add the path of the .h5 file of the DDQN actor>'
        algo.load_weights(path_actor, path_critic, path_ddqn)

        # Run a random policy during inference as an example
        s = np.random.rand(1, args.Nr)
        s_1 = np.zeros_like(s)
        s = np.vstack((s_1, s))
        while True:
            W = algo.policy_action(s)
            cluster_index = algo.ddqn_clustering.policy_action(s)
            a_and_c = {'a': W, 'c': cluster_index}
            new_state, r, done, _ = env.step(a_and_c)
            print("RL min rate = {}".format(r))
            print("RL state = {}".format(np.log(1 + new_state)))
            s = new_state
            input('Press Enter to continue ...')
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_session(get_session())

    summary_writer = tf.summary.FileWriter(args.type + "/tensorboard_" + args.env)

    # Environment Initialization
    if args.is_atari:
        # Atari Environment Wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
    elif args.type == "DDPG":
        # Continuous Environments Wrapper
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_space = gym.make(args.env).action_space
        action_dim = action_space.high.shape[0]
        act_range = action_space.high
    else:
        # Standard Environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n

    # Pick algorithm to train
    if args.type == "DDQN":
        algo = DDQN(action_dim, state_dim, args)
    elif args.type == "A2C":
        algo = A2C(action_dim, state_dim, args.consecutive_frames)
    elif args.type == "A3C":
        algo = A3C(action_dim, state_dim, args.consecutive_frames,
                   is_atari=args.is_atari)
    elif args.type == "DDPG":
        algo = DDPG(action_dim, state_dim, act_range, args.consecutive_frames)

    # Train
    stats = algo.train(env, args, summary_writer)

    # Export results to CSV
    if args.gather_stats:
        df = pd.DataFrame(np.array(stats))
        df.to_csv(args.type + "/logs.csv",
                  header=['Episode', 'Mean', 'Stddev'],
                  float_format='%10.5f')

    # Save weights and close environments
    exp_dir = '{}/models/'.format(args.type)
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    export_path = '{}{}_ENV_{}_NB_EP_{}_BS_{}'.format(exp_dir, args.type,
                                                      args.env,
                                                      args.nb_episodes,
                                                      args.batch_size)
    algo.save_weights(export_path)
    env.env.close()
import os
import sys
import argparse

import gym
import numpy as np
import pandas as pd
import tensorflow as tf

gym.logger.set_level(40)

from utils.continuous_environments import Environment

# Example setup for a continuous-control task. The environment id must be a
# string (the original passed a bare name, which raises a NameError), and the
# frame-stack depth below is illustrative; in the full scripts it comes from
# args.consecutive_frames.
consecutive_frames = 4
env = Environment(gym.make("LunarLanderContinuous-v2"), consecutive_frames)
env.reset()
state_dim = env.get_state_size()
action_space = gym.make("LunarLanderContinuous-v2").action_space
action_dim = action_space.high.shape[0]
act_range = action_space.high
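# The Environment wrapper from utils.continuous_environments is imported
# above but not shown. A minimal sketch of the frame-stacking behaviour these
# scripts assume (reset/step return a stack of the last k observations, and
# the raw gym env stays reachable as .env); this illustrates the idea under
# those assumptions and is not the original implementation. Relies on the
# numpy import above.
from collections import deque

class Environment:
    def __init__(self, gym_env, consecutive_frames=4):
        self.env = gym_env
        self.k = consecutive_frames
        self.frames = deque(maxlen=consecutive_frames)

    def get_state_size(self):
        # Stacked-state shape: (k, obs_dim...), e.g. (4, 6) in the
        # opticalTweezers placeholder above
        return (self.k,) + self.env.observation_space.shape

    def reset(self):
        # Fill the buffer with copies of the initial observation
        obs = self.env.reset()
        for _ in range(self.k):
            self.frames.append(obs)
        return np.array(self.frames)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.frames.append(obs)
        return np.array(self.frames), reward, done, info

    def render(self):
        return self.env.render()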
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_session(get_session())

    summary_writer = tf.summary.FileWriter(args.type + "/tensorboard_" + args.env)

    # Environment Initialization
    if args.is_atari:
        # Atari Environment Wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
    elif args.type == "DDPG":
        # Continuous Environments Wrapper
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_space = gym.make(args.env).action_space
        action_dim = action_space.high.shape[0]
        act_range = action_space.high
    elif args.env == 'cell':
        # Custom optical-tweezers environment
        env = Environment(opticalTweezers(), args.consecutive_frames)
        # env = opticalTweezers(consecutive_frames=args.consecutive_frames)
        env.reset()
        state_dim = (6,)
        # NOTE: the reshape code must be changed for a 2D agent
        action_dim = 4
    else:
        # Standard Environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        print(state_dim)
        action_dim = gym.make(args.env).action_space.n
        print(action_dim)

    # Pick algorithm to train
    if args.type == "DDQN":
        algo = DDQN(action_dim, state_dim, args)
    elif args.type == "A2C":
        algo = A2C(action_dim, state_dim, args.consecutive_frames)
    elif args.type == "A3C":
        algo = A3C(action_dim, state_dim, args.consecutive_frames,
                   is_atari=args.is_atari)
    elif args.type == "DDPG":
        algo = DDPG(action_dim, state_dim, act_range, args.consecutive_frames)

    # Train
    stats = algo.train(env, args, summary_writer)

    # Export results to CSV
    if args.gather_stats:
        df = pd.DataFrame(np.array(stats))
        df.to_csv(args.type + "/logs.csv",
                  header=['Episode', 'Mean', 'Stddev'],
                  float_format='%10.5f')

    # Display agent
    old_state, t = env.reset(), 0
    # all_old_states = [old_state for i in range(args.consecutive_frames)]
    while True:
        env.render()
        a = algo.policy_action(old_state)
        old_state, r, done, _ = env.step(a)
        t += 1
        if done:
            old_state = env.reset()
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # Environment Initialization
    if args.is_ai2thor:
        config_dict = {'max_episode_length': 500}
        env = AI2ThorEnv(config_dict=config_dict)
        state = env.reset()  # single reset; the original reset twice
        state_dim = state.shape
        action_dim = env.action_space.n
        args.env = 'ai2thor'
    elif args.is_atari:
        # Atari Environment Wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
        print(state_dim)
        print(action_dim)
    else:
        # Standard Environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n

    set_session(get_session())
    summary_writer = tf.summary.FileWriter(args.type + "/tensorboard_" + args.env)

    algo = A3C(action_dim, state_dim, args.consecutive_frames,
               is_atari=args.is_atari, is_ai2thor=args.is_ai2thor)

    # Train
    stats = algo.train(env, args, summary_writer)

    # Export results to CSV
    if args.gather_stats:
        df = pd.DataFrame(np.array(stats))
        df.to_csv(args.type + "/logs.csv",
                  header=['Episode', 'Mean', 'Stddev'],
                  float_format='%10.5f')

    # Save weights and close environments
    exp_dir = '{}/models/'.format(args.type)
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    export_path = '{}{}_ENV_{}_NB_EP_{}_BS_{}'.format(exp_dir, args.type,
                                                      args.env,
                                                      args.nb_episodes,
                                                      args.batch_size)
    algo.save_weights(export_path)
    env.close()
def fit(self, env, summary_writer, debug=False, num_cpus=4, is_market=False,
        env_args={}, test_env_args=None, env_version='v1'):
    stagnation = 1
    best_so_far = 0

    # Init test env
    test_env = None
    if env_version == 'v1':
        test_env = MarketEnvironmentV1(**test_env_args) if test_env_args else None
    if env_version == 'v2':
        test_env = MarketEnvironmentV2(**test_env_args) if test_env_args else None

    # Create environments for the whole population
    envs = []
    if is_market:
        if env_version == 'v1':
            envs = [MarketEnvironmentV1(**env_args)
                    for i in range(self.population_size)]
        if env_version == 'v2':
            envs = [MarketEnvironmentV2(**env_args)
                    for i in range(self.population_size)]
    else:
        envs = [Environment(**env_args) for i in range(self.population_size)]

    # Iterate over all generations
    tqdm_e = tqdm(total=self.generations, desc='Generation', leave=True,
                  unit=" gen")
    for gen_i in range(self.generations):
        # Evaluate every network in parallel
        args = [(self, self.networks[i], envs[i])
                for i in range(self.population_size)]
        with Pool(num_cpus) as p:
            rewards = np.array(p.map(_run_par_evaluate, args))

        # Track best score per generation
        self.fitness.append(np.max(rewards))

        # Select the best network.
        # NOTE: this index refers to the population evaluated this
        # generation; it becomes stale once self.networks is reassigned below.
        best_network = np.argmax(rewards)

        # Select the top-n networks
        n = int(self.survival_ratio * self.population_size)
        top_n_index = np.argsort(rewards)[-n:]

        # Create the child networks.
        # origin -> 0: crossover of two parents; 1: mutation of one parent;
        #           2: copy of a network from outside the survivors.
        new_networks = []
        for _ in range(self.population_size - n):
            origin = np.random.choice(
                [0, 1, 2],
                p=[self.both_parent_percentage,
                   self.one_parent_percentage,
                   1 - self.both_parent_percentage - self.one_parent_percentage])
            if origin == 0:
                # Both parents, drawn from the survivors (the original indexed
                # range(n) directly, which only matches the survivors after
                # the first generation)
                new_net = NeuralNet(parent1=self.networks[random.choice(top_n_index)],
                                    parent2=self.networks[random.choice(top_n_index)],
                                    var=self.mutation_variance)
            elif origin == 1:
                # One parent, drawn from the survivors
                new_net = NeuralNet(parent1=self.networks[random.choice(top_n_index)],
                                    parent2=None,
                                    var=self.mutation_variance)
            else:
                # Copy a network that is NOT among the survivors (the original
                # loop condition was inverted and never executed)
                index = random.randint(0, len(self.networks) - 1)
                while index in top_n_index:
                    index = random.randint(0, len(self.networks) - 1)
                new_net = self.networks[index]
            new_networks.append(new_net)

        # Set the new population, keeping the survivors
        maintain_best_n = [self.networks[i] for i in top_n_index]
        self.networks = maintain_best_n + new_networks

        # Export results for Tensorboard
        r_max = rewards.max()
        r_mean = rewards.mean()
        r_std = rewards.std()
        self.insert_info(r_max, r_mean, r_std)
        summary_writer.add_summary(tfSummary('Max rewards', r_max),
                                   global_step=gen_i)
        summary_writer.add_summary(tfSummary('Mean rewards', r_mean),
                                   global_step=gen_i)
        summary_writer.add_summary(tfSummary('STD rewards', r_std),
                                   global_step=gen_i)

        # Update stagnation
        if r_max > best_so_far:
            best_so_far = r_max
            stagnation = 1
        else:
            stagnation += 1

        # Update tqdm
        tqdm_e.set_description(
            'Generation: {} | Highest Reward: {} | Average Reward: {} | '
            'std Reward: {} | Stagnation: {} | Population size: {}'.format(
                gen_i + 1, r_max, r_mean, r_std, stagnation,
                len(self.networks)))

        # Save current weights
        self.best_network = self.networks[best_network]
        if debug:
            self._log_best_network_env_info(maintain_best_n[0],
                                            summary_writer, envs[0],
                                            test_env, gen_i)
        self.save_weights(gen_i, maintain_best_n[0], self.save_path)

        # Update logs
        summary_writer.flush()
        tqdm_e.update(1)
        tqdm_e.refresh()

        # Stop early if we have stagnated for too long
        if stagnation > 10 and self.stagnation_end:
            break

    # Close the environments
    [e.close() for e in envs]

    # Return the best network
    self.best_network = self.networks[best_network]
    return self.global_info
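# Two helpers used by fit() but not defined in this file. tfSummary() wraps a
# scalar into a TF 1.x Summary proto, the standard pattern for FileWriter
# logging; _run_par_evaluate() is sketched only from how it is called via
# Pool.map with (trainer, network, env) tuples, and its network.predict()
# call is an assumed NeuralNet API, not confirmed by the source.
def tfSummary(tag, val):
    # Scalar summary for TensorBoard (TF 1.x)
    return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)])

def _run_par_evaluate(packed):
    # Roll out one episode and return the cumulative reward as the fitness
    trainer, network, env = packed
    state, done, total = env.reset(), False, 0.0
    while not done:
        action = network.predict(state)  # assumed NeuralNet API
        state, reward, done, _ = env.step(action)
        total += reward
    return total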