def dqn(env, learning_rate, batch_size, random_step, log_dir, weight_dir):
    print('Env Name: ', env)
    env = gym.make(env)
    print('Action Space: ', env.action_space.n)
    print('State Shape:', env.render(mode='rgb_array').shape)
    agent = DQN(env,
                QNet(env.action_space.n),
                nn.MSELoss(),
                optim.RMSprop,
                lr=learning_rate,
                log_dir=log_dir,
                weight_dir=weight_dir)
    agent.train(batch_size=batch_size, random_step=random_step)
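# A minimal sketch of how the entry point above might be invoked. The
# environment name, hyperparameter values and directories are illustrative
# assumptions, not values taken from the original project.
if __name__ == '__main__':
    dqn(env='CartPole-v1',
        learning_rate=1e-3,
        batch_size=32,
        random_step=1000,
        log_dir='./logs',
        weight_dir='./weights')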
def dqn_two_agents(env_type, experiment_id, config_file):
    '''
    Deep Q-learning with two agents (predator and prey) sharing one Q-network.

    Args:
        env_type: Environment type
        experiment_id: Id for the experiment
        config_file: Path of the config file
    '''
    params = read_yaml(config_file)
    params['model_type'] = 'DQN'
    params['env_type'] = env_type
    params['experiment_id'] = experiment_id
    save_config(params, experiment_id)
    env = make_env(env_type, params)
    env.make_world(wall_prob=params.wall_prob, wall_seed=20, food_prob=0)
    q_net = create_nn(params)
    agent_predator = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    agent_prey = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    trainer = Trainer(params, env)
    trainer.train(agent_predator, agent_prey)
def dqn(env_type, experiment_id, config_file):
    params = read_yaml(config_file)
    params['model_type'] = 'DQN'
    params['env_type'] = env_type
    params['experiment_id'] = experiment_id
    save_config(params, experiment_id)
    env = make_env(env_type, params)
    env.make_world(wall_prob=params.wall_prob, food_prob=0)
    q_net = create_nn(params)
    agent = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    agent.train(params.episodes, params.episode_step, params.random_step,
                params.min_greedy, params.max_greedy, params.greedy_step,
                params.update_period)
def dqn(params, env_type, experiment_id, test_id):
    '''
    Deep Q-learning (evaluation of a saved model)

    Args:
        params: Dictionary of settings
        env_type: Environment type
        experiment_id: Id for the experiment
        test_id: Id for the test
    '''
    params['experiment_id'] = experiment_id
    params['test_id'] = test_id
    env = make_env(env_type, params)
    env.make_world(wall_prob=params.wall_prob, food_prob=0)
    # NOTE: relies on a module-level `args` for the checkpoint path.
    q_net = torch.load(args.model_file).cuda()
    agent = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    agent.test()
def __init__(self, hyperparams):
    config = copy.deepcopy(DQN_CONFIG)
    config.update(hyperparams)
    DQN.__init__(self, config)
    self.e_DQN = FCDuelingDQN(self.n_states,
                              self.n_actions,
                              n_hiddens=config['hidden_layers'],
                              usebn=config['use_batch_norm'],
                              nonlinear=config['act_func'])
    self.t_DQN = FCDuelingDQN(self.n_states,
                              self.n_actions,
                              n_hiddens=config['hidden_layers'],
                              usebn=config['use_batch_norm'],
                              nonlinear=config['act_func'])
    self.lossfunc = config['loss']()
    if self.mom == 0 or self.mom is None:
        self.optimizer = config['optimizer'](self.e_DQN.parameters(),
                                             lr=self.lr)
    else:
        self.optimizer = config['optimizer'](self.e_DQN.parameters(),
                                             lr=self.lr,
                                             momentum=self.mom)
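# Sketch of a hyperparameter override dict for the dueling-DQN constructor
# above. The key names come from the constructor; the values and the torch
# classes chosen here are illustrative assumptions, not project defaults.
import torch.nn as nn
import torch.optim as optim

example_hyperparams = {
    'hidden_layers': [128, 128],   # hidden layer sizes passed to FCDuelingDQN
    'use_batch_norm': False,       # whether FCDuelingDQN applies batch norm
    'act_func': 'relu',            # nonlinearity passed to FCDuelingDQN
    'loss': nn.MSELoss,            # class, instantiated as config['loss']()
    'optimizer': optim.RMSprop,    # class, called with parameters and lr
}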
def dqn(env_type, experiment_id, config_file):
    '''
    Deep Q-learning

    Args:
        env_type: Environment type
        experiment_id: Id for the experiment
        config_file: Path of the config file
    '''
    params = read_yaml(config_file)
    params['model_type'] = 'DQN'
    params['env_type'] = env_type
    params['experiment_id'] = experiment_id
    save_config(params, experiment_id)
    env = make_env(env_type, params)
    env.make_world(wall_prob=params.wall_prob, wall_seed=20, food_prob=0)
    q_net = create_nn(params)
    agent = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    agent.train(params.episodes, params.episode_step, params.random_step,
                params.min_greedy, params.max_greedy, params.greedy_step,
                params.update_period)
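# Hypothetical invocation of the config-driven entry point above; the
# environment name, experiment id and config path are illustrative
# assumptions. The YAML file is expected to supply at least: wall_prob,
# episodes, episode_step, random_step, min_greedy, max_greedy, greedy_step
# and update_period (the keys read by make_world and agent.train above).
dqn(env_type='gridworld',
    experiment_id='exp_001',
    config_file='configs/dqn.yaml')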
class DQN_runner(MixinCore):
    def __init__(
        self,
        env_cls: object,
        MV_res: bool,
        experiment_type: str,
        seed: int,
        episodes: int,
        N_train: int,
        len_series: Union[int, None],
        dt: int,
        start_train: int,
        save_freq: int,
        use_GPU: bool,
        outputDir: str = "_outputs",
        outputClass: str = "DQN",
        outputModel: str = "test",
        varying_pars: Union[list, None] = None,
        varying_type: str = "chunk",
        num_cores: int = None,
    ):
        self.logging.info("Starting model setup")
        self._setattrs()
        self.rng = np.random.RandomState(self.seed)

        if self.use_GPU:
            gpu_devices = tf.config.experimental.list_physical_devices("GPU")
            for device in gpu_devices:
                tf.config.experimental.set_memory_growth(device, True)
        else:
            my_devices = tf.config.experimental.list_physical_devices(
                device_type="CPU")
            tf.config.experimental.set_visible_devices(devices=my_devices,
                                                       device_type="CPU")

        if self.dt != 1.0:
            # self.len_series = self.len_series * (1/self.dt)
            self.N_train = int(self.N_train * (1 / self.dt))

        if self.episodes:
            self.N_train = self.episodes * self.len_series
            self.col_names_oos = [
                str(e) for e in np.arange(0, self.episodes + 1, save_freq)[1:]
            ]
        else:
            self.len_series = self.N_train
            self.save_freq_n = self.N_train // save_freq
            self.col_names_oos = [
                str(int(i))
                for i in np.arange(0, self.N_train + 1, self.save_freq_n)
            ][1:]

        self.savedpath = GeneratePathFolder(
            outputDir,
            outputClass,
            outputModel,
            varying_pars,
            varying_type,
            self.N_train,
        )
        if save_freq and not os.path.exists(
                os.path.join(self.savedpath, "ckpt")):
            os.makedirs(os.path.join(self.savedpath, "ckpt"))
        logging.info("Successfully generated path to save outputs...")

    def run(self):
        """Wrapper for keyboard interrupt."""
        try:
            self.set_up_training()
            # if self.episodes:
            #     self.training_episodic_agent()
            # else:
            self.training_agent()
        except (KeyboardInterrupt, SystemExit):
            self.logging.debug("Exit on KeyboardInterrupt or SystemExit")
            sys.exit()

    def set_up_training(self):
        self.logging.debug("Simulating Data")
        self.data_handler = DataHandler(N_train=self.N_train, rng=self.rng)
        if self.experiment_type == "GP":
            self.data_handler.generate_returns()
        else:
            self.data_handler.generate_returns()
            # TODO check if these methods really fit and change the
            # parameters in the gin file
            self.data_handler.estimate_parameters()

        self.logging.debug("Instantiating action space")
        if self.MV_res:
            self.action_space = ResActionSpace()
        else:
            action_range, ret_quantile, holding_quantile = get_action_boundaries(
                N_train=self.N_train,
                f_speed=self.data_handler.f_speed,
                returns=self.data_handler.returns,
                factors=self.data_handler.factors,
            )
            gin.query_parameter("%ACTION_RANGE")[0] = action_range
            self.action_space = ActionSpace()

        self.logging.debug("Instantiating market environment")
        self.env = self.env_cls(
            N_train=self.N_train,
            f_speed=self.data_handler.f_speed,
            returns=self.data_handler.returns,
            factors=self.data_handler.factors,
        )

        self.logging.debug("Instantiating DQN model")
        input_shape = self.env.get_state_dim()
        self.train_agent = DQN(
            input_shape=input_shape,
            action_space=self.action_space,
            rng=self.rng,
            N_train=self.N_train,
        )

        self.logging.debug("Set up length of training and instantiate test env")
        self.train_agent._get_exploration_length(self.N_train)

        self.logging.debug("Instantiating Out of sample tester")
        self.oos_test = Out_sample_vs_gp(
            savedpath=self.savedpath,
            tag="DQN",
            experiment_type=self.experiment_type,
            env_cls=self.env_cls,
            MV_res=self.MV_res,
        )
        self.oos_test.init_series_to_fill(iterations=self.col_names_oos)

    def training_agent(self):
        """Main routine to train and test the DQN agent.

        At each step the agent picks an epsilon-greedy action, the market
        environment returns the next state and reward, the transition is
        stored in the replay buffer and the Q-network is trained. Target
        weights are copied every `copy_step` iterations; every `save_freq_n`
        iterations the weights are checkpointed and the agent is tested
        out of sample against the GP benchmark.
        """
        self.logging.debug("Training...")
        CurrState, _ = self.env.reset()
        # CurrOptState = env.opt_reset()
        # OptRate, DiscFactorLoads = env.opt_trading_rate_disc_loads()
        for i in tqdm(iterable=range(self.N_train + 1),
                      desc="Training DQNetwork"):
            self.train_agent.update_epsilon()
            epsilon = self.train_agent.epsilon
            side_only = self.action_space.side_only
            copy_step = self.train_agent.copy_step

            action, qvalues = self.train_agent.eps_greedy_action(
                CurrState, epsilon, side_only=side_only)
            if not side_only:
                unscaled_action = action
            else:
                unscaled_action = get_bet_size(
                    qvalues,
                    action,
                    action_limit=self.action_space.action_range[0],
                    zero_action=self.action_space.zero_action,
                    rng=self.rng,
                )

            if self.MV_res:
                NextState, Result, _ = self.env.MV_res_step(
                    CurrState, unscaled_action, i)
            else:
                NextState, Result, _ = self.env.step(CurrState,
                                                     unscaled_action, i)
            self.env.store_results(Result, i)

            exp = {
                "s": CurrState,
                "a": action,
                "a_unsc": unscaled_action,
                "r": Result["Reward_DQN"],
                "s2": NextState,
            }
            self.train_agent.add_experience(exp)
            self.train_agent.train(i, side_only)

            if (i % copy_step == 0) and (i > self.train_agent.start_train):
                self.train_agent.copy_weights()
            CurrState = NextState

            if (i % self.save_freq_n == 0) and (i > self.train_agent.start_train):
                self.train_agent.model.save_weights(
                    os.path.join(self.savedpath, "ckpt",
                                 "DQN_{}_ep_weights".format(i)),
                    save_format="tf",
                )
                self.logging.debug("Testing...")
                self.oos_test.run_test(it=i, test_agent=self.train_agent)

            # if executeGP:
            #     NextOptState, OptResult = env.opt_step(
            #         CurrOptState, OptRate, DiscFactorLoads, i
            #     )
            #     env.store_results(OptResult, i)
            #     CurrOptState = NextOptState

        self.oos_test.save_series()
        save_gin(os.path.join(self.savedpath, "config.gin"))
        logging.info("Config file saved")
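# For reference: a minimal sketch of the epsilon-greedy selection that
# eps_greedy_action above is assumed to perform. The function name and
# signature are illustrative, not the DQN implementation used by DQN_runner.
import numpy as np

def eps_greedy_sketch(q_values: np.ndarray, epsilon: float,
                      rng: np.random.RandomState):
    """Return (action_index, q_values): random with prob. epsilon, greedy otherwise."""
    if rng.uniform() < epsilon:
        action = rng.randint(len(q_values))   # explore
    else:
        action = int(np.argmax(q_values))     # exploit
    return action, q_values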
def runplot_holding(p):
    query = gin.query_parameter
    outputClass = p["outputClass"]
    tag = p["algo"]
    seed = p["seed"]
    if 'DQN' in tag:
        hp = p["hyperparams_model_dqn"]
        outputModels = p["outputModels_dqn"]
    elif 'PPO' in tag:
        hp = p["hyperparams_model_ppo"]
        outputModels = p["outputModels_ppo"]

    if hp is not None:
        outputModel = [exp.format(*hp) for exp in outputModels]
    else:
        outputModel = outputModels

    fig = plt.figure(figsize=set_size(width=1000.0, subplots=(2, 2)))
    gs = gridspec.GridSpec(ncols=2, nrows=2, figure=fig)
    ax1 = fig.add_subplot(gs[0])
    ax2 = fig.add_subplot(gs[1])
    ax3 = fig.add_subplot(gs[2])
    ax4 = fig.add_subplot(gs[3])
    axes = [ax1, ax2, ax3, ax4]

    fig2 = plt.figure(figsize=set_size(width=1000.0, subplots=(2, 2)))
    gs2 = gridspec.GridSpec(ncols=2, nrows=2, figure=fig2)
    ax12 = fig2.add_subplot(gs2[0])
    ax22 = fig2.add_subplot(gs2[1])
    ax32 = fig2.add_subplot(gs2[2])
    ax42 = fig2.add_subplot(gs2[3])
    axes2 = [ax12, ax22, ax32, ax42]

    fig3 = plt.figure(figsize=set_size(width=1000.0, subplots=(2, 2)))
    gs3 = gridspec.GridSpec(ncols=2, nrows=2, figure=fig3)
    ax13 = fig3.add_subplot(gs3[0])
    ax23 = fig3.add_subplot(gs3[1])
    ax33 = fig3.add_subplot(gs3[2])
    ax43 = fig3.add_subplot(gs3[3])
    axes3 = [ax13, ax23, ax33, ax43]

    for i, model in enumerate(outputModel):
        modelpath = "outputs/{}/{}".format(outputClass, model)
        # get the latest created folder "length"
        all_subdirs = [
            os.path.join(modelpath, d) for d in os.listdir(modelpath)
            if os.path.isdir(os.path.join(modelpath, d))
        ]
        latest_subdir = max(all_subdirs, key=os.path.getmtime)
        length = os.path.split(latest_subdir)[-1]

        experiment = [
            exp for exp in os.listdir("outputs/{}/{}/{}".format(
                outputClass, model, length)) if seed in exp
        ][0]
        data_dir = "outputs/{}/{}/{}/{}".format(outputClass, model, length,
                                                experiment)
        gin.parse_config_file(os.path.join(data_dir, "config.gin"),
                              skip_unknown=True)

        rng = np.random.RandomState(query("%SEED"))

        if query("%MV_RES"):
            action_space = ResActionSpace()
        else:
            action_space = ActionSpace()

        if query("%INP_TYPE") == "f":
            input_shape = (len(query('%F_PARAM')) + 1, )
        else:
            input_shape = (2, )

        if "DQN" in tag:
            train_agent = DQN(input_shape=input_shape,
                              action_space=action_space,
                              rng=rng)
            if p['n_dqn']:
                train_agent.model = load_DQNmodel(data_dir,
                                                  p['n_dqn'],
                                                  model=train_agent.model)
            else:
                train_agent.model = load_DQNmodel(data_dir,
                                                  query("%N_TRAIN"),
                                                  model=train_agent.model)
        elif "PPO" in tag:
            train_agent = PPO(input_shape=input_shape,
                              action_space=action_space,
                              rng=rng)
            if p['ep_ppo']:
                train_agent.model = load_PPOmodel(data_dir,
                                                  p['ep_ppo'],
                                                  model=train_agent.model)
            else:
                train_agent.model = load_PPOmodel(
                    data_dir,
                    gin.query_parameter("%EPISODES"),
                    model=train_agent.model)
        else:
            print("Choose proper algorithm.")
            sys.exit()

        oos_test = Out_sample_vs_gp(savedpath=None,
                                    tag=tag[0],
                                    experiment_type=query("%EXPERIMENT_TYPE"),
                                    env_cls=MarketEnv,
                                    MV_res=query("%MV_RES"),
                                    N_test=p['N_test'])

        res_df = oos_test.run_test(train_agent, return_output=True)

        plot_portfolio(res_df, tag[0], axes[i])
        plot_action(res_df, tag[0], axes2[i])

        split = model.split("mv_res")
        if 'halflife' in model:
            axes[i].set_title(
                "_".join(["mv_res", split[-1]]).replace("_", " ") +
                'halflife: {}'.format(
                    model.split('halflife_')[1].split('_')[0]),
                fontsize=10)
            axes2[i].set_title(
                "_".join(["mv_res", split[-1]]).replace("_", " ") +
                'halflife: {}'.format(
                    model.split('halflife_')[1].split('_')[0]),
                fontsize=10)
        else:
            axes[i].set_title("_".join(["mv_res", split[-1]]).replace("_", " "),
                              fontsize=10)
            axes2[i].set_title("_".join(["mv_res", split[-1]]).replace("_", " "),
                               fontsize=10)

        # if '18' not in model.split('_')[0]:
        plot_action(res_df, tag[0], axes3[i], hist=True)
        if 'halflife' in model:
            axes3[i].set_title(
                "_".join(["mv_res", split[-1]]).replace("_", " ") +
                'halflife: {}'.format(
                    model.split('halflife_')[1].split('_')[0]),
                fontsize=10)
        else:
            axes3[i].set_title("_".join(["mv_res", split[-1]]).replace("_", " "),
                               fontsize=10)

    fig.suptitle('Holdings')
    fig2.suptitle('Actions')
    # if '18' not in model.split('_')[0]:
    fig3.suptitle('Res Actions')
def main():
    args = parse_args()

    if args.grad_loss:
        loss_weighting = 0.0
    else:
        loss_weighting = 1.0

    verbs = ['go', 'take', 'open', 'grab', 'run', 'walk', 'climb']
    vocabulary = load_list_from_file('./data/vocabulary.txt')
    basic_actions = [
        'open egg', 'go east', 'go west', 'go north', 'go south', 'go up',
        'go down', 'look', 'take egg'
    ]
    dictionary = [
        'pray', 'yellow', 'trapdoor', 'open', 'bell', 'touch', 'pile',
        'trunk', 'sack', 'inflate', 'southeast', 'of', 'move', 'match',
        'figurine', 'railing', 'with', 'map', 'mirror', 'wind', 'examine',
        'north', 'out', 'trident', 'turn', 'skull', 'throw', 'northwest',
        'case', 'bag', 'red', 'press', 'jewels', 'east', 'pump', 'bolt',
        'rusty', 'window', 'douse', 'boat', 'bracelet', 'matchbook', 'basket',
        'book', 'coffin', 'bar', 'rug', 'lid', 'drop', 'nasty', 'wrench',
        'light', 'sand', 'bauble', 'kill', 'tie', 'painting', 'sword', 'wave',
        'in', 'south', 'northeast', 'ring', 'canary', 'lower', 'egg', 'all',
        'to', 'candles', 'page', 'and', 'echo', 'emerald', 'tree', 'from',
        'rope', 'troll', 'screwdriver', 'torch', 'enter', 'coal', 'go',
        'look', 'shovel', 'knife', 'down', 'take', 'switch', 'prayer',
        'launch', 'diamond', 'read', 'up', 'get', 'scarab', 'west', 'land',
        'southwest', 'climb', 'thief', 'raise', 'wait', 'odysseus', 'button',
        'sceptre', 'lamp', 'chalice', 'garlic', 'buoy', 'pot', 'label', 'put',
        'dig', 'machine', 'close'
    ]
    actions = basic_actions
    optimize_memory = False
    sparse_reward = True
    actor_train_start = 0
    eps_start = 1.0
    test_params = {
        'nn=-1': {'number_of_neighbors': -1},
        'nn=1': {'number_of_neighbors': 1},
        'nn=3': {'number_of_neighbors': 3},
        'nn=11': {'number_of_neighbors': 11},
    }
    game_seed = 52

    if args.task == 0:
        buffer_size = 20000
        time_steps = 100000
        project_name = 'egg_quest_minimal_actions'
        task = 'egg'
    elif args.task == 1:
        buffer_size = 20000
        time_steps = 2000000
        project_name = 'egg_quest_extended_actions'
        actions = dictionary
        task = 'egg'
    elif args.task == -1:
        buffer_size = 20000
        time_steps = 100000
        project_name = 'egg_quest_baby_actions'
        actions = ['open', 'egg', 'north', 'climb', 'tree', 'take']
        task = 'egg'
    elif args.task == 2:
        buffer_size = 40000
        time_steps = 1000000
        project_name = 'troll_imitation'
        actions = dictionary
        task = 'troll'
        sparse_reward = False
        test_params = {
            'nn': {'number_of_neighbors': args.nn},
        }
        game_seed = 12
    elif args.task == 3:
        buffer_size = 40000
        time_steps = 1000000
        project_name = 'troll'
        actions = [
            'north', 'south', 'east', 'west', 'open window', 'take sword',
            'take lamp', 'move rug', 'open trapdoor', 'go down', 'light lamp',
            'kill troll with sword'
        ]
        task = 'troll'
        sparse_reward = False
    else:
        raise NotImplementedError

    words = list()
    words.append('')
    for action in actions:
        tokens = tokenizer(action)
        for token in tokens:
            if token not in words:
                words.append(token)

    sentences = list()
    for i, word1 in enumerate(words):
        for word2 in words[i + 1:]:
            if word1 in verbs:
                sentences.append(word1 + ' ' + word2)
            else:
                sentences.append(word2 + ' ' + word1)

    if args.pomdp:
        project_name = project_name + '_pomdp'

    seed = args.seed
    disable_cuda = False
    # random.seed(seed)
    # torch.manual_seed(seed)
    if torch.cuda.is_available() and not disable_cuda:
        # free_gpu = get_free_gpu()
        device = torch.device('cuda')  # + str(free_gpu))
        # torch.cuda.manual_seed(seed)
        torch.backends.cudnn.enabled = False
    else:
        device = torch.device('cpu')

    vocab_size = len(vocabulary)
    bow_parser = BagOfWords(
        vocabulary=vocabulary,
        type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(1))
    # word2vec_model_path = os.getcwd() + '/../ZorkGym/text_utils/GoogleNews-vectors-negative300.bin'
    # word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)
    word2vec_model = glove_api.load('glove-wiki-gigaword-50')
    embedding_size = word2vec_model.vector_size
    word2vec_parser = Word2Vec(
        type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(0),
        word2vec_model=word2vec_model,
        return_func=lambda x: word2vec_padding(x, 65, embedding_size))
    onehot_parser = OneHotParser(
        type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(1),
        vocabulary=actions)

    """
    Experiments from here and below
    """
    for simulation in args.simulations:
        tau = 0.0
        train_params = {'seed': seed}

        if simulation == 'dqn_mlp':
            test_params = None
            agent = DQN(actions=sentences,
                        model_type='MLP',
                        parser=bow_parser,
                        input_length=vocab_size + 1,
                        input_width=1,
                        history_size=1,
                        device=device,
                        pomdp_mode=args.pomdp,
                        task=task,
                        sparse_reward=sparse_reward)
        elif simulation == 'dqn_cnn':
            test_params = None
            word2vec_parser.return_func = lambda x: word2vec_padding(
                x, 65, embedding_size)
            agent = DQN(actions=sentences,
                        model_type='CNN',
                        parser=word2vec_parser,
                        input_length=embedding_size,
                        input_width=65,
                        history_size=1,
                        device=device,
                        pomdp_mode=args.pomdp,
                        task=task,
                        sparse_reward=sparse_reward)
        elif simulation == 'drrn_mlp':
            test_params = None
            agent = DRRN(actions=sentences,
                         model_type='MLP',
                         parser=bow_parser,
                         input_length=vocab_size + 1,
                         input_width=1,
                         history_size=1,
                         device=device,
                         pomdp_mode=args.pomdp,
                         task=task,
                         sparse_reward=sparse_reward)
            tau = 0.2
        elif simulation == 'drrn_cnn':
            test_params = None
            agent = DRRN(actions=sentences,
                         model_type='CNN',
                         parser=word2vec_parser,
                         input_length=embedding_size,
                         input_width=65,
                         history_size=1,
                         device=device,
                         pomdp_mode=args.pomdp,
                         task=task,
                         sparse_reward=sparse_reward)
            tau = 0.2
        elif simulation == 'dddpg_mlp':
            word2vec_parser.return_func = lambda x: word2vec_sum(
                x, embedding_size)
            action_vocab_list = []
            for action in sentences:
                tokens = tokenizer(action)
                for token in tokens:
                    if token not in action_vocab_list:
                        action_vocab_list.append(token)
            action_vocabulary = {}
            embedding_size = len(action_vocab_list)
            for idx, action in enumerate(action_vocab_list):
                action_vocabulary[action] = np.zeros(embedding_size)
                action_vocabulary[action][idx] = 1.0
            word2vec_parser.word2vec_model = action_vocabulary
            train_params['number_of_neighbors'] = args.nn
            agent = DDDPG(actions=sentences,
                          state_parser=bow_parser,
                          action_parser=word2vec_parser,
                          embedding_size=embedding_size,
                          input_length=vocab_size + 1,
                          input_width=1,
                          history_size=1,
                          loss_weighting=loss_weighting,
                          model_type='MLP',
                          device=device,
                          pomdp_mode=args.pomdp,
                          task=task,
                          sparse_reward=sparse_reward)
        elif simulation == 'dddpg_cnn':
            action_word2vec_parser = Word2Vec(
                type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(0),
                word2vec_model=word2vec_model,
                return_func=lambda x: word2vec_padding(x, 65, embedding_size))
            action_word2vec_parser.return_func = lambda x: word2vec_sum(
                x, embedding_size)
            train_params['number_of_neighbors'] = args.nn
            agent = DDDPG(actions=sentences,
                          state_parser=word2vec_parser,
                          action_parser=action_word2vec_parser,
                          embedding_size=embedding_size,
                          input_length=embedding_size,
                          input_width=65,
                          history_size=1,
                          loss_weighting=loss_weighting,
                          model_type='CNN',
                          device=device,
                          pomdp_mode=args.pomdp,
                          task=task,
                          sparse_reward=sparse_reward)
        elif simulation == 'ompddpg_mlp':
            words = set()
            for action in sentences:
                for word in tokenizer(action):
                    words.add(word)
            action_vocabulary = {}
            if args.action_w2v:
                for word in words:
                    action_vocabulary[word] = word2vec_model[word]
                action_vocabulary[''] = [
                    0 for _ in range(len(action_vocabulary['open']))
                ]
            else:
                words.add('')
                for idx, word in enumerate(words):
                    action_vocabulary[word] = np.zeros(len(words))
                    action_vocabulary[word][idx] = 1.0
            embedding_size = len(action_vocabulary['open'])
            train_params['number_of_neighbors'] = args.nn
            agent = OMPDDPG(actions=action_vocabulary,
                            state_parser=bow_parser,
                            embedding_size=embedding_size,
                            input_length=vocab_size + 1,
                            input_width=1,
                            history_size=1,
                            model_type='MLP',
                            device=device,
                            pomdp_mode=args.pomdp,
                            loss_weighting=loss_weighting,
                            linear=args.linear,
                            improved_omp=args.improved_omp,
                            model_path=args.model_path,
                            task=task,
                            sparse_reward=sparse_reward)
        elif simulation == 'ompddpg_cnn':
            words = set()
            for action in sentences:
                for word in tokenizer(action):
                    words.add(word)
            action_vocabulary = {}
            if args.action_w2v:
                for word in words:
                    action_vocabulary[word] = word2vec_model[word]
                action_vocabulary[''] = [
                    0 for _ in range(len(action_vocabulary['open']))
                ]
            else:
                words.add('')
                for idx, word in enumerate(words):
                    action_vocabulary[word] = np.zeros(len(words))
                    action_vocabulary[word][idx] = 1.0
            embedding_size = len(action_vocabulary['open'])
            train_params['number_of_neighbors'] = args.nn
            agent = OMPDDPG(actions=action_vocabulary,
                            state_parser=word2vec_parser,
                            embedding_size=embedding_size,
                            input_length=embedding_size,
                            input_width=65,
                            history_size=1,
                            model_type='CNN',
                            device=device,
                            pomdp_mode=args.pomdp,
                            loss_weighting=loss_weighting,
                            linear=args.linear,
                            improved_omp=args.improved_omp,
                            model_path=args.model_path,
                            task=task,
                            sparse_reward=sparse_reward)
        else:
            raise NotImplementedError

        model_name = simulation
        if not ('dqn' in simulation or 'drrn' in simulation):
            model_name += '/neighbors=' + str(args.nn)
            if args.action_w2v:
                model_name += '/w2v'
            if args.linear:
                model_name += '/linear'

        agent.learn(
            total_timesteps=time_steps,
            buffer_size=buffer_size,
            visualize=True,
            vis_name=project_name + '/' + model_name + '/' + str(seed),
            optimize_memory=optimize_memory,
            tau=tau,
            learn_start_steps=0,  # 20000,
            train_params=train_params,
            test_params=test_params,
            eps_start=eps_start,
            test_interval=args.test_interval,
            game_seed=game_seed)
def main(config_file):
    # Check TF version
    logging.info("Tensorflow version: {}".format(tf.version.VERSION))

    # Load main config file
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    result_path = config["result_dir"]
    agent_type = config["agent"]
    agent_config_file = os.path.join(config["agent_config_dir"],
                                     str(agent_type) + ".yml")
    mode = config["mode"]
    environment = config["environment"]
    environment_seed = config["environment_seed"]

    # Load config file for agent
    with open(agent_config_file, "r") as f:
        agent_config = yaml.safe_load(f)

    # Create output directory
    time_str = time.strftime("%Y%m%d_%H%M%S")
    result_path = os.path.join(result_path, agent_type, time_str)
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    agent_config["render_environment"] = config["render_environment"]
    agent_config["max_episode"] = config["max_episode"]
    agent_config["max_step"] = config["max_step"]
    agent_config["slide_window"] = config["slide_window"]
    agent_config["result_path"] = result_path

    # Save config files to output directory
    copyfile(config_file,
             os.path.join(result_path, os.path.basename(config_file)))
    copyfile(agent_config_file,
             os.path.join(result_path, os.path.basename(agent_config_file)))

    logging.info(mode + " with {} algorithm in environment {}".format(
        agent_type, environment))
    logging.info("Results will be saved at {}".format(result_path))

    # Initialize environment
    # NOTE: the environment name from the config is logged above, but the
    # environment itself is hard-coded to CartPole-v1 here.
    env = gym.make('CartPole-v1')
    env.seed(environment_seed)
    env = env.unwrapped

    # Build agent
    if agent_type == "DQN":
        agent = DQN(agent_config, env)
    elif agent_type == "DDQN":
        agent = DDQN(agent_config, env)
    elif agent_type == "DDQN_PER_Prop":
        agent = DDQN_PER_Prop(agent_config, env)
    elif agent_type == "A2C":
        agent = A2C(agent_config, env)
    elif agent_type == "REINFORCE":
        agent = REINFORCE(agent_config, env)
    else:
        raise KeyError("Agent type does not exist")

    # Train or play
    if mode == "train":
        agent.train()
    elif mode == "play":
        agent.play()
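# Sketch of the top-level config that main() above reads, expressed as the
# equivalent Python dict. The key names are the ones accessed in main(); the
# values shown here are illustrative assumptions, not project defaults.
example_main_config = {
    'result_dir': './results',        # root folder for outputs
    'agent': 'DQN',                   # DQN, DDQN, DDQN_PER_Prop, A2C or REINFORCE
    'agent_config_dir': './configs',  # folder containing <agent>.yml
    'mode': 'train',                  # 'train' or 'play'
    'environment': 'CartPole-v1',
    'environment_seed': 0,
    'render_environment': False,
    'max_episode': 500,
    'max_step': 200,
    'slide_window': 50,
}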
def model_lambda():
    model = Sequential()
    model.add(Dense(100, input_dim=domain.number_states()))
    model.add(Activation('relu'))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dense(domain.valid_actions()))
    model.add(Activation('linear'))
    model.compile(loss='mse', optimizer=Adam(0.001))
    return model


# agent
agent = Tabular(domain.valid_actions(),
                randomizer=lambda n: np.random.randn(n) * 0.1)
agent = DQN(domain.number_states(),
            domain.valid_actions(),
            model_lambda=model_lambda,
            batch_size=64,
            epochs=1,
            memory_size=3000)

# algorithm
trainer = ExpectedSarsa(discount=0.95, episode_length=200)
trainer = DeepQLearning(discount=0.95,
                        episode_length=200,
                        encoding=domain.default_encoding)

# run
perf, eps = trainer.train_many(
    agent,
    domain,
    # Fixed(0.5),
    # ExponentialDecay(0.5, 0.85),
    # VDBE(0.5, 1.0 / domain.valid_actions(), 0.01),
    # BMC(1.0, 1.0 / 0.499999 - 1.0, 100.0),
    BMCRobust(mu=0, tau=1, a=500, b=500, alpha=25, beta=25 + 0.01),
    FixedLearningRate(0.6),