Example #1
    def set_up_training(self):

        self.logging.debug("Simulating Data")

        self.data_handler = DataHandler(N_train=self.N_train, rng=self.rng)
        if self.experiment_type == "GP":
            self.data_handler.generate_returns()
        else:
            self.data_handler.generate_returns()
            # TODO: check whether these methods really fit and adjust the parameters in the gin file
            self.data_handler.estimate_parameters()

        self.logging.debug("Instantiating action space")
        if self.MV_res:
            self.action_space = ResActionSpace()
        else:
            action_range, ret_quantile, holding_quantile = get_action_boundaries(
                N_train=self.N_train,
                f_speed=self.data_handler.f_speed,
                returns=self.data_handler.returns,
                factors=self.data_handler.factors,
            )

            gin.query_parameter("%ACTION_RANGE")[0] = action_range
            self.action_space = ActionSpace()

        self.logging.debug("Instantiating market environment")
        self.env = self.env_cls(
            N_train=self.N_train,
            f_speed=self.data_handler.f_speed,
            returns=self.data_handler.returns,
            factors=self.data_handler.factors,
        )

        self.logging.debug("Instantiating DQN model")
        input_shape = self.env.get_state_dim()

        self.train_agent = DQN(
            input_shape=input_shape,
            action_space=self.action_space,
            rng=self.rng,
            N_train=self.N_train,
        )

        self.logging.debug(
            "Set up length of training and instantiate test env")
        self.train_agent._get_exploration_length(self.N_train)

        self.logging.debug("Instantiating Out of sample tester")
        self.oos_test = Out_sample_vs_gp(
            savedpath=self.savedpath,
            tag="DQN",
            experiment_type=self.experiment_type,
            env_cls=self.env_cls,
            MV_res=self.MV_res,
        )

        self.oos_test.init_series_to_fill(iterations=self.col_names_oos)
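
The %ACTION_RANGE override above works because a gin macro bound to a mutable list can be adjusted in place before the consuming class is instantiated. A minimal, self-contained sketch of that pattern (the ActionSpace below is a stand-in for illustration, not the project's class):

import gin


@gin.configurable
class ActionSpace:
    def __init__(self, action_range):
        self.action_range = action_range


gin.parse_config("""
ACTION_RANGE = [None, 11]
ActionSpace.action_range = %ACTION_RANGE
""")

# Mutating the list returned by query_parameter updates the macro in place,
# so the next instantiation picks up the value computed at runtime.
gin.query_parameter("%ACTION_RANGE")[0] = 0.02
space = ActionSpace()
print(space.action_range)  # [0.02, 11]
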
Example #2
def dqn(env, learning_rate, batch_size, random_step, log_dir, weight_dir):
    print('Env Name: ', env)
    env = gym.make(env)
    print('Action Space: ', env.action_space.n)
    print('State Shape:', env.render(mode='rgb_array').shape)
    agent = DQN(env,
                QNet(env.action_space.n),
                nn.MSELoss(),
                optim.RMSprop,
                lr=learning_rate,
                log_dir=log_dir,
                weight_dir=weight_dir)
    agent.train(batch_size=batch_size, random_step=random_step)
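
A hypothetical invocation of the entry point above (argument values are placeholders; DQN and QNet are the example's own classes):

if __name__ == '__main__':
    dqn(env='CartPole-v1',
        learning_rate=1e-3,
        batch_size=32,
        random_step=1000,
        log_dir='./logs',
        weight_dir='./weights')
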
Example #3
def dqn_two_agents(env_type, experiment_id, config_file):
    params = read_yaml(config_file)
    params['model_type'] = 'DQN'
    params['env_type'] = env_type
    params['experiment_id'] = experiment_id
    save_config(params, experiment_id)
    env = make_env(env_type, params)
    env.make_world(wall_prob=params.wall_prob, wall_seed=20, food_prob=0)
    q_net = create_nn(params)
    agent_predator = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    agent_prey = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)

    trainer = Trainer(params, env)
    trainer.train(agent_predator, agent_prey)
Example #4
def dqn(env_type, experiment_id, config_file):
    params = read_yaml(config_file)
    params['model_type'] = 'DQN'
    params['env_type'] = env_type
    params['experiment_id'] = experiment_id

    save_config(params, experiment_id)
    env = make_env(env_type, params)
    env.make_world(wall_prob=params.wall_prob, food_prob=0)
    q_net = create_nn(params)
    agent = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    agent.train(params.episodes, params.episode_step, params.random_step,
                params.min_greedy, params.max_greedy, params.greedy_step,
                params.update_period)
Example #5
def dqn(params, env_type, experiment_id, test_id):
    '''
    Deep Q-learning

    Args:
        params: Dictionary of settings
        env_type: Environment Type
        experiment_id: Id for the experiment
        test_id: Id for the test
    '''
    params['experiment_id'] = experiment_id
    params['test_id'] = test_id

    env = make_env(env_type, params)
    env.make_world(wall_prob=params.wall_prob, food_prob=0)
    q_net = torch.load(args.model_file).cuda()  # relies on a module-level `args` (e.g. an argparse namespace)
    agent = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    agent.test()
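
The function reads `args.model_file` from a module-level namespace; a hypothetical parser that would provide it (the flag name is the only attribute this snippet actually needs):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_file', type=str, required=True,
                    help='path to a q_net checkpoint saved with torch.save')
args = parser.parse_args()
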
Example #6
def __init__(self, hyperparams):
    config = copy.deepcopy(DQN_CONFIG)
    config.update(hyperparams)
    DQN.__init__(self, config)
    # online (evaluation) network
    self.e_DQN = FCDuelingDQN(self.n_states,
                              self.n_actions,
                              n_hiddens=config['hidden_layers'],
                              usebn=config['use_batch_norm'],
                              nonlinear=config['act_func'])
    # target network, periodically synchronised with the online one
    self.t_DQN = FCDuelingDQN(self.n_states,
                              self.n_actions,
                              n_hiddens=config['hidden_layers'],
                              usebn=config['use_batch_norm'],
                              nonlinear=config['act_func'])
    self.lossfunc = config['loss']()
    if self.mom == 0 or self.mom is None:
        self.optimizer = config['optimizer'](self.e_DQN.parameters(),
                                             lr=self.lr)
    else:
        self.optimizer = config['optimizer'](self.e_DQN.parameters(),
                                             lr=self.lr,
                                             momentum=self.mom)
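
The constructor above builds separate online (e_DQN) and target (t_DQN) dueling networks. FCDuelingDQN is project-specific; below is a minimal, generic dueling head in PyTorch showing the standard decomposition it presumably implements:

import torch
import torch.nn as nn


class DuelingHead(nn.Module):
    def __init__(self, n_states, n_actions, n_hidden=64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(n_states, n_hidden), nn.ReLU())
        self.value = nn.Linear(n_hidden, 1)              # state value V(s)
        self.advantage = nn.Linear(n_hidden, n_actions)  # advantages A(s, a)

    def forward(self, x):
        h = self.body(x)
        a = self.advantage(h)
        # Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
        return self.value(h) + a - a.mean(dim=1, keepdim=True)


q_values = DuelingHead(n_states=8, n_actions=4)(torch.randn(2, 8))  # shape (2, 4)
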
Example #7
def dqn(env_type, experiment_id, config_file):
    '''
    Deep Q-learning

    Args:
        env_type: Environment Type
        experiment_id: Id for the experiment
        config_file: Path of the config file
    '''

    params = read_yaml(config_file)
    params['model_type'] = 'DQN'
    params['env_type'] = env_type
    params['experiment_id'] = experiment_id

    save_config(params, experiment_id)
    env = make_env(env_type, params)
    env.make_world(wall_prob=params.wall_prob, wall_seed=20, food_prob=0)
    q_net = create_nn(params)
    agent = DQN(params, env, q_net, nn.MSELoss(), optim.RMSprop)
    agent.train(params.episodes, params.episode_step, params.random_step,
                params.min_greedy, params.max_greedy, params.greedy_step,
                params.update_period)
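
A hypothetical call of the trainer above (paths and ids are placeholders; read_yaml, make_env, create_nn and DQN are the example's own helpers):

dqn(env_type='maze', experiment_id='exp_001', config_file='configs/dqn.yaml')
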
Example #8
class DQN_runner(MixinCore):
    def __init__(
        self,
        env_cls: object,
        MV_res: bool,
        experiment_type: str,
        seed: int,
        episodes: int,
        N_train: int,
        len_series: Union[int, None],
        dt: int,
        start_train: int,
        save_freq: int,
        use_GPU: bool,
        outputDir: str = "_outputs",
        outputClass: str = "DQN",
        outputModel: str = "test",
        varying_pars: Union[list, None] = None,
        varying_type: str = "chunk",
        num_cores: int = None,
    ):

        self.logging.info("Starting model setup")
        self._setattrs()

        self.rng = np.random.RandomState(self.seed)

        if self.use_GPU:
            gpu_devices = tf.config.experimental.list_physical_devices("GPU")
            for device in gpu_devices:
                tf.config.experimental.set_memory_growth(device, True)
        else:
            my_devices = tf.config.experimental.list_physical_devices(
                device_type="CPU")
            tf.config.experimental.set_visible_devices(devices=my_devices,
                                                       device_type="CPU")

        if self.dt != 1.0:
            # self.len_series = self.len_series * (1/self.dt)
            self.N_train = int(self.N_train * (1 / self.dt))

        if self.episodes:
            self.N_train = self.episodes * self.len_series
            self.col_names_oos = [
                str(e) for e in np.arange(0, self.episodes + 1, save_freq)[1:]
            ]
        else:
            self.len_series = self.N_train
            self.save_freq_n = self.N_train // save_freq
            self.col_names_oos = [
                str(int(i))
                for i in np.arange(0, self.N_train + 1, self.save_freq_n)
            ][1:]

        self.savedpath = GeneratePathFolder(
            outputDir,
            outputClass,
            outputModel,
            varying_pars,
            varying_type,
            self.N_train,
        )
        if save_freq and not os.path.exists(
                os.path.join(self.savedpath, "ckpt")):
            os.makedirs(os.path.join(self.savedpath, "ckpt"))

        logging.info("Successfully generated path to save outputs...")

    def run(self):
        """Wrapper for keyboard interrupt."""
        try:
            self.set_up_training()
            # if self.episodes:
            #     self.training_episodic_agent()
            # else:
            self.training_agent()
        except (KeyboardInterrupt, SystemExit):
            self.logging.debug("Exit on KeyboardInterrupt or SystemExit")
            sys.exit()

    def set_up_training(self):

        self.logging.debug("Simulating Data")

        self.data_handler = DataHandler(N_train=self.N_train, rng=self.rng)
        if self.experiment_type == "GP":
            self.data_handler.generate_returns()
        else:
            self.data_handler.generate_returns()
            # TODO: check whether these methods really fit and adjust the parameters in the gin file
            self.data_handler.estimate_parameters()

        self.logging.debug("Instantiating action space")
        if self.MV_res:
            self.action_space = ResActionSpace()
        else:
            action_range, ret_quantile, holding_quantile = get_action_boundaries(
                N_train=self.N_train,
                f_speed=self.data_handler.f_speed,
                returns=self.data_handler.returns,
                factors=self.data_handler.factors,
            )

            gin.query_parameter("%ACTION_RANGE")[0] = action_range
            self.action_space = ActionSpace()

        self.logging.debug("Instantiating market environment")
        self.env = self.env_cls(
            N_train=self.N_train,
            f_speed=self.data_handler.f_speed,
            returns=self.data_handler.returns,
            factors=self.data_handler.factors,
        )

        self.logging.debug("Instantiating DQN model")
        input_shape = self.env.get_state_dim()

        self.train_agent = DQN(
            input_shape=input_shape,
            action_space=self.action_space,
            rng=self.rng,
            N_train=self.N_train,
        )

        self.logging.debug(
            "Set up length of training and instantiate test env")
        self.train_agent._get_exploration_length(self.N_train)

        self.logging.debug("Instantiating Out of sample tester")
        self.oos_test = Out_sample_vs_gp(
            savedpath=self.savedpath,
            tag="DQN",
            experiment_type=self.experiment_type,
            env_cls=self.env_cls,
            MV_res=self.MV_res,
        )

        self.oos_test.init_series_to_fill(iterations=self.col_names_oos)

    def training_agent(self):
        """
        Main routine to train and test the DRL algorithm. The steps are:

        1. Load the dataset, metadata, any model output and any pre-loaded
        data (cached_data).
        2. Start the Backtrader engine and initialize the broker object.
        3. Instantiate the environment.
        4. Instantiate the model for the agent.
        5. Train the model according to a chosen technique.
        6. Test the model out-of-sample.
        7. Log the performance data, plot, save configuration file and
            the runner logger output.

        Once this is done, the backtest is over and all of the artifacts
        are saved in `_exp/experiment_name/_backtests/`.
        """

        self.logging.debug("Training...")
        CurrState, _ = self.env.reset()

        # CurrOptState = env.opt_reset()
        # OptRate, DiscFactorLoads = env.opt_trading_rate_disc_loads()

        for i in tqdm(iterable=range(self.N_train + 1),
                      desc="Training DQNetwork"):

            self.train_agent.update_epsilon()
            epsilon = self.train_agent.epsilon
            side_only = self.action_space.side_only
            copy_step = self.train_agent.copy_step

            action, qvalues = self.train_agent.eps_greedy_action(
                CurrState, epsilon, side_only=side_only)
            if not side_only:
                unscaled_action = action
            else:
                unscaled_action = get_bet_size(
                    qvalues,
                    action,
                    action_limit=self.action_space.action_range[0],
                    zero_action=self.action_space.zero_action,
                    rng=self.rng,
                )
            if self.MV_res:
                NextState, Result, _ = self.env.MV_res_step(
                    CurrState, unscaled_action, i)
            else:
                NextState, Result, _ = self.env.step(CurrState,
                                                     unscaled_action, i)

            self.env.store_results(Result, i)

            exp = {
                "s": CurrState,
                "a": action,
                "a_unsc": unscaled_action,
                "r": Result["Reward_DQN"],
                "s2": NextState,
            }

            self.train_agent.add_experience(exp)

            self.train_agent.train(i, side_only)

            if (i % copy_step == 0) and (i > self.train_agent.start_train):
                self.train_agent.copy_weights()

            CurrState = NextState

            if (i % self.save_freq_n
                    == 0) and (i > self.train_agent.start_train):
                self.train_agent.model.save_weights(
                    os.path.join(self.savedpath, "ckpt",
                                 "DQN_{}_ep_weights".format(i)),
                    save_format="tf",
                )

                self.logging.debug("Testing...")
                self.oos_test.run_test(it=i, test_agent=self.train_agent)

            # if executeGP:
            #     NextOptState, OptResult = env.opt_step(
            #         CurrOptState, OptRate, DiscFactorLoads, i
            #     )
            #     env.store_results(OptResult, i)
            #     CurrOptState = NextOptState

        self.oos_test.save_series()

        save_gin(os.path.join(self.savedpath, "config.gin"))
        logging.info("Config file saved")
Example #9
def runplot_holding(p):

    query = gin.query_parameter

    outputClass = p["outputClass"]
    tag = p["algo"]
    seed = p["seed"]
    if 'DQN' in tag:
        hp = p["hyperparams_model_dqn"]
        outputModels = p["outputModels_dqn"]
    elif 'PPO' in tag:
        hp = p["hyperparams_model_ppo"]
        outputModels = p["outputModels_ppo"]

    if hp is not None:
        outputModel = [exp.format(*hp) for exp in outputModels]
    else:
        outputModel = outputModels

    fig = plt.figure(figsize=set_size(width=1000.0, subplots=(2, 2)))
    gs = gridspec.GridSpec(ncols=2, nrows=2, figure=fig)
    ax1 = fig.add_subplot(gs[0])
    ax2 = fig.add_subplot(gs[1])
    ax3 = fig.add_subplot(gs[2])
    ax4 = fig.add_subplot(gs[3])
    axes = [ax1, ax2, ax3, ax4]

    fig2 = plt.figure(figsize=set_size(width=1000.0, subplots=(2, 2)))
    gs2 = gridspec.GridSpec(ncols=2, nrows=2, figure=fig2)
    ax12 = fig2.add_subplot(gs2[0])
    ax22 = fig2.add_subplot(gs2[1])
    ax32 = fig2.add_subplot(gs2[2])
    ax42 = fig2.add_subplot(gs2[3])
    axes2 = [ax12, ax22, ax32, ax42]

    fig3 = plt.figure(figsize=set_size(width=1000.0, subplots=(2, 2)))
    gs3 = gridspec.GridSpec(ncols=2, nrows=2, figure=fig3)
    ax13 = fig3.add_subplot(gs3[0])
    ax23 = fig3.add_subplot(gs3[1])
    ax33 = fig3.add_subplot(gs3[2])
    ax43 = fig3.add_subplot(gs3[3])
    axes3 = [ax13, ax23, ax33, ax43]

    for i, model in enumerate(outputModel):
        modelpath = "outputs/{}/{}".format(outputClass, model)
        # get the latest created folder "length"
        all_subdirs = [
            os.path.join(modelpath, d) for d in os.listdir(modelpath)
            if os.path.isdir(os.path.join(modelpath, d))
        ]
        latest_subdir = max(all_subdirs, key=os.path.getmtime)
        length = os.path.split(latest_subdir)[-1]
        experiment = [
            exp for exp in os.listdir("outputs/{}/{}/{}".format(
                outputClass, model, length)) if seed in exp
        ][0]
        data_dir = "outputs/{}/{}/{}/{}".format(outputClass, model, length,
                                                experiment)

        gin.parse_config_file(os.path.join(data_dir, "config.gin"),
                              skip_unknown=True)

        rng = np.random.RandomState(query("%SEED"))

        if query("%MV_RES"):
            action_space = ResActionSpace()
        else:
            action_space = ActionSpace()

        if query("%INP_TYPE") == "f":
            input_shape = (len(query('%F_PARAM')) + 1, )
        else:
            input_shape = (2, )

        if "DQN" in tag:
            train_agent = DQN(input_shape=input_shape,
                              action_space=action_space,
                              rng=rng)
            if p['n_dqn']:
                train_agent.model = load_DQNmodel(data_dir,
                                                  p['n_dqn'],
                                                  model=train_agent.model)
            else:
                train_agent.model = load_DQNmodel(data_dir,
                                                  query("%N_TRAIN"),
                                                  model=train_agent.model)

        elif "PPO" in tag:
            train_agent = PPO(input_shape=input_shape,
                              action_space=action_space,
                              rng=rng)
            if p['ep_ppo']:
                train_agent.model = load_PPOmodel(data_dir,
                                                  p['ep_ppo'],
                                                  model=train_agent.model)
            else:
                train_agent.model = load_PPOmodel(
                    data_dir,
                    gin.query_parameter("%EPISODES"),
                    model=train_agent.model)
        else:
            print("Choose proper algorithm.")
            sys.exit()

        oos_test = Out_sample_vs_gp(savedpath=None,
                                    tag=tag[0],
                                    experiment_type=query("%EXPERIMENT_TYPE"),
                                    env_cls=MarketEnv,
                                    MV_res=query("%MV_RES"),
                                    N_test=p['N_test'])

        res_df = oos_test.run_test(train_agent, return_output=True)

        plot_portfolio(res_df, tag[0], axes[i])
        plot_action(res_df, tag[0], axes2[i])
        split = model.split("mv_res")
        if 'halflife' in model:
            axes[i].set_title(
                "_".join(["mv_res", split[-1]]).replace("_", " ") +
                'halflife: {}'.format(
                    model.split('halflife_')[1].split('_')[0]),
                fontsize=10)
            axes2[i].set_title(
                "_".join(["mv_res", split[-1]]).replace("_", " ") +
                'halflife: {}'.format(
                    model.split('halflife_')[1].split('_')[0]),
                fontsize=10)
        else:
            axes[i].set_title("_".join(["mv_res",
                                        split[-1]]).replace("_", " "),
                              fontsize=10)
            axes2[i].set_title("_".join(["mv_res",
                                         split[-1]]).replace("_", " "),
                               fontsize=10)

        # if '18' not in model.split('_')[0]:
        plot_action(res_df, tag[0], axes3[i], hist=True)
        if 'halflife' in model:
            axes3[i].set_title(
                "_".join(["mv_res", split[-1]]).replace("_", " ") +
                'halflife: {}'.format(
                    model.split('halflife_')[1].split('_')[0]),
                fontsize=10)
        else:
            axes3[i].set_title("_".join(["mv_res",
                                         split[-1]]).replace("_", " "),
                               fontsize=10)

    fig.suptitle('Holdings')
    fig2.suptitle('Actions')
    # if '18' not in model.split('_')[0]:
    fig3.suptitle('Res Actions')
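
A sketch of the `p` dictionary that runplot_holding expects (keys are taken from the function body; values are illustrative placeholders):

p = {
    "outputClass": "DQN",
    "algo": ["DQN"],                  # tag; tag[0] is used for plot labels
    "seed": "seed_14",                # substring matched against experiment folder names
    "hyperparams_model_dqn": None,    # or a tuple used to format outputModels_dqn
    "outputModels_dqn": ["my_experiment"],
    "n_dqn": None,                    # checkpoint iteration; falls back to %N_TRAIN
    "N_test": 10000,
}
runplot_holding(p)
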
Example #10
def main():
    args = parse_args()

    if args.grad_loss:
        loss_weighting = 0.0
    else:
        loss_weighting = 1.0

    verbs = ['go', 'take', 'open', 'grab', 'run', 'walk', 'climb']
    vocabulary = load_list_from_file('./data/vocabulary.txt')

    basic_actions = [
        'open egg', 'go east', 'go west', 'go north', 'go south', 'go up',
        'go down', 'look', 'take egg'
    ]

    dictionary = [
        'pray', 'yellow', 'trapdoor', 'open', 'bell', 'touch', 'pile', 'trunk',
        'sack', 'inflate', 'southeast', 'of', 'move', 'match', 'figurine',
        'railing', 'with', 'map', 'mirror', 'wind', 'examine', 'north', 'out',
        'trident', 'turn', 'skull', 'throw', 'northwest', 'case', 'bag', 'red',
        'press', 'jewels', 'east', 'pump', 'bolt', 'rusty', 'window', 'douse',
        'boat', 'bracelet', 'matchbook', 'basket', 'book', 'coffin', 'bar',
        'rug', 'lid', 'drop', 'nasty', 'wrench', 'light', 'sand', 'bauble',
        'kill', 'tie', 'painting', 'sword', 'wave', 'in', 'south', 'northeast',
        'ring', 'canary', 'lower', 'egg', 'all', 'to', 'candles', 'page',
        'and', 'echo', 'emerald', 'tree', 'from', 'rope', 'troll',
        'screwdriver', 'torch', 'enter', 'coal', 'go', 'look', 'shovel',
        'knife', 'down', 'take', 'switch', 'prayer', 'launch', 'diamond',
        'read', 'up', 'get', 'scarab', 'west', 'land', 'southwest', 'climb',
        'thief', 'raise', 'wait', 'odysseus', 'button', 'sceptre', 'lamp',
        'chalice', 'garlic', 'buoy', 'pot', 'label', 'put', 'dig', 'machine',
        'close'
    ]

    actions = basic_actions

    optimize_memory = False
    sparse_reward = True
    actor_train_start = 0
    eps_start = 1.0

    test_params = {
        'nn=-1': {
            'number_of_neighbors': -1
        },
        'nn=1': {
            'number_of_neighbors': 1
        },
        'nn=3': {
            'number_of_neighbors': 3
        },
        'nn=11': {
            'number_of_neighbors': 11
        },
    }

    game_seed = 52
    if args.task == 0:
        buffer_size = 20000
        time_steps = 100000
        project_name = 'egg_quest_minimal_actions'
        task = 'egg'
    elif args.task == 1:
        buffer_size = 20000
        time_steps = 2000000
        project_name = 'egg_quest_extended_actions'
        actions = dictionary
        task = 'egg'
    elif args.task == -1:
        buffer_size = 20000
        time_steps = 100000
        project_name = 'egg_quest_baby_actions'
        actions = ['open', 'egg', 'north', 'climb', 'tree', 'take']
        task = 'egg'
    elif args.task == 2:
        buffer_size = 40000
        time_steps = 1000000
        project_name = 'troll_imitation'
        actions = dictionary
        task = 'troll'
        sparse_reward = False

        test_params = {
            'nn': {
                'number_of_neighbors': args.nn
            },
        }
        game_seed = 12
    elif args.task == 3:
        buffer_size = 40000
        time_steps = 1000000
        project_name = 'troll'
        actions = [
            'north', 'south', 'east', 'west', 'open window', 'take sword',
            'take lamp', 'move rug', 'open trapdoor', 'go down', 'light lamp',
            'kill troll with sword'
        ]
        task = 'troll'
        sparse_reward = False
    else:
        raise NotImplementedError

    words = list()
    words.append('')
    for action in actions:
        tokens = tokenizer(action)
        for token in tokens:
            if token not in words:
                words.append(token)

    sentences = list()
    for i, word1 in enumerate(words):
        for word2 in words[i + 1:]:
            if word1 in verbs:
                sentences.append(word1 + ' ' + word2)
            else:
                sentences.append(word2 + ' ' + word1)

    if args.pomdp:
        project_name = project_name + '_pomdp'

    seed = args.seed
    disable_cuda = False

    #random.seed(seed)
    #torch.manual_seed(seed)
    if torch.cuda.is_available() and not disable_cuda:
        # free_gpu = get_free_gpu()
        device = torch.device('cuda')  # + str(free_gpu))
        #torch.cuda.manual_seed(seed)
        torch.backends.cudnn.enabled = False
    else:
        device = torch.device('cpu')

    vocab_size = len(vocabulary)
    bow_parser = BagOfWords(
        vocabulary=vocabulary,
        type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(1))

    # word2vec_model_path = os.getcwd() + '/../ZorkGym/text_utils/GoogleNews-vectors-negative300.bin'
    # word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)
    word2vec_model = glove_api.load('glove-wiki-gigaword-50')
    embedding_size = word2vec_model.vector_size
    word2vec_parser = Word2Vec(
        type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(0),
        word2vec_model=word2vec_model,
        return_func=lambda x: word2vec_padding(x, 65, embedding_size))
    onehot_parser = OneHotParser(
        type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(1),
        vocabulary=actions)
    """
        Experiments from here and below
    """
    for simulation in args.simulations:
        tau = 0.0
        train_params = {'seed': seed}
        if simulation == 'dqn_mlp':
            test_params = None
            agent = DQN(actions=sentences,
                        model_type='MLP',
                        parser=bow_parser,
                        input_length=vocab_size + 1,
                        input_width=1,
                        history_size=1,
                        device=device,
                        pomdp_mode=args.pomdp,
                        task=task,
                        sparse_reward=sparse_reward)

        elif simulation == 'dqn_cnn':
            test_params = None
            word2vec_parser.return_func = lambda x: word2vec_padding(
                x, 65, embedding_size)

            agent = DQN(actions=sentences,
                        model_type='CNN',
                        parser=word2vec_parser,
                        input_length=embedding_size,
                        input_width=65,
                        history_size=1,
                        device=device,
                        pomdp_mode=args.pomdp,
                        task=task,
                        sparse_reward=sparse_reward)

        elif simulation == 'drrn_mlp':
            test_params = None
            agent = DRRN(actions=sentences,
                         model_type='MLP',
                         parser=bow_parser,
                         input_length=vocab_size + 1,
                         input_width=1,
                         history_size=1,
                         device=device,
                         pomdp_mode=args.pomdp,
                         task=task,
                         sparse_reward=sparse_reward)
            tau = 0.2
        elif simulation == 'drrn_cnn':
            test_params = None
            agent = DRRN(actions=sentences,
                         model_type='CNN',
                         parser=word2vec_parser,
                         input_length=embedding_size,
                         input_width=65,
                         history_size=1,
                         device=device,
                         pomdp_mode=args.pomdp,
                         task=task,
                         sparse_reward=sparse_reward)
            tau = 0.2
        elif simulation == 'dddpg_mlp':
            word2vec_parser.return_func = lambda x: word2vec_sum(
                x, embedding_size)

            action_vocab_list = []
            for action in sentences:
                tokens = tokenizer(action)
                for token in tokens:
                    if token not in action_vocab_list:
                        action_vocab_list.append(token)
            action_vocabulary = {}
            embedding_size = len(action_vocab_list)
            for idx, action in enumerate(action_vocab_list):
                action_vocabulary[action] = np.zeros(embedding_size)
                action_vocabulary[action][idx] = 1.0
            word2vec_parser.word2vec_model = action_vocabulary

            train_params['number_of_neighbors'] = args.nn

            agent = DDDPG(actions=sentences,
                          state_parser=bow_parser,
                          action_parser=word2vec_parser,
                          embedding_size=embedding_size,
                          input_length=vocab_size + 1,
                          input_width=1,
                          history_size=1,
                          loss_weighting=loss_weighting,
                          model_type='MLP',
                          device=device,
                          pomdp_mode=args.pomdp,
                          task=task,
                          sparse_reward=sparse_reward)

        elif simulation == 'dddpg_cnn':
            action_word2vec_parser = Word2Vec(
                type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(
                    0),
                word2vec_model=word2vec_model,
                return_func=lambda x: word2vec_padding(x, 65, embedding_size))
            action_word2vec_parser.return_func = lambda x: word2vec_sum(
                x, embedding_size)

            train_params['number_of_neighbors'] = args.nn

            agent = DDDPG(actions=sentences,
                          state_parser=word2vec_parser,
                          action_parser=action_word2vec_parser,
                          embedding_size=embedding_size,
                          input_length=embedding_size,
                          input_width=65,
                          history_size=1,
                          loss_weighting=loss_weighting,
                          model_type='CNN',
                          device=device,
                          pomdp_mode=args.pomdp,
                          task=task,
                          sparse_reward=sparse_reward)

        elif simulation == 'ompddpg_mlp':
            words = set()
            for action in sentences:
                for word in tokenizer(action):
                    words.add(word)
            action_vocabulary = {}
            if args.action_w2v:
                for word in words:
                    action_vocabulary[word] = word2vec_model[word]
                action_vocabulary[''] = [
                    0 for _ in range(len(action_vocabulary['open']))
                ]
            else:
                words.add('')
                for idx, word in enumerate(words):
                    action_vocabulary[word] = np.zeros(len(words))
                    action_vocabulary[word][idx] = 1.0

            embedding_size = len(action_vocabulary['open'])

            train_params['number_of_neighbors'] = args.nn

            agent = OMPDDPG(actions=action_vocabulary,
                            state_parser=bow_parser,
                            embedding_size=embedding_size,
                            input_length=vocab_size + 1,
                            input_width=1,
                            history_size=1,
                            model_type='MLP',
                            device=device,
                            pomdp_mode=args.pomdp,
                            loss_weighting=loss_weighting,
                            linear=args.linear,
                            improved_omp=args.improved_omp,
                            model_path=args.model_path,
                            task=task,
                            sparse_reward=sparse_reward)

        elif simulation == 'ompddpg_cnn':
            words = set()
            for action in sentences:
                for word in tokenizer(action):
                    words.add(word)
            action_vocabulary = {}
            if args.action_w2v:
                for word in words:
                    action_vocabulary[word] = word2vec_model[word]
                action_vocabulary[''] = [
                    0 for _ in range(len(action_vocabulary['open']))
                ]
            else:
                words.add('')
                for idx, word in enumerate(words):
                    action_vocabulary[word] = np.zeros(len(words))
                    action_vocabulary[word][idx] = 1.0

            embedding_size = len(action_vocabulary['open'])

            train_params['number_of_neighbors'] = args.nn

            agent = OMPDDPG(actions=action_vocabulary,
                            state_parser=word2vec_parser,
                            embedding_size=embedding_size,
                            input_length=embedding_size,
                            input_width=65,
                            history_size=1,
                            model_type='CNN',
                            device=device,
                            pomdp_mode=args.pomdp,
                            loss_weighting=loss_weighting,
                            linear=args.linear,
                            improved_omp=args.improved_omp,
                            model_path=args.model_path,
                            task=task,
                            sparse_reward=sparse_reward)

        else:
            raise NotImplementedError

        model_name = simulation

        if not ('dqn' in simulation or 'drrn' in simulation):
            model_name += '/neighbors=' + str(args.nn)
        if args.action_w2v:
            model_name += '/w2v'
        if args.linear:
            model_name += '/linear'

        agent.learn(
            total_timesteps=time_steps,
            buffer_size=buffer_size,
            visualize=True,
            vis_name=project_name + '/' + model_name + '/' + str(seed),
            optimize_memory=optimize_memory,
            tau=tau,
            learn_start_steps=0,  #20000,
            train_params=train_params,
            test_params=test_params,
            eps_start=eps_start,
            test_interval=args.test_interval,
            game_seed=game_seed)
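
parse_args() is defined elsewhere in that project; a hypothetical stand-in providing every attribute main() reads (flag names follow the attributes used above, defaults are illustrative):

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=int, default=0)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--simulations', nargs='+', default=['dqn_mlp'])
    parser.add_argument('--nn', type=int, default=1)
    parser.add_argument('--test_interval', type=int, default=1000)
    parser.add_argument('--model_path', type=str, default=None)
    parser.add_argument('--grad_loss', action='store_true')
    parser.add_argument('--pomdp', action='store_true')
    parser.add_argument('--action_w2v', action='store_true')
    parser.add_argument('--linear', action='store_true')
    parser.add_argument('--improved_omp', action='store_true')
    return parser.parse_args()
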
Example #11
def main(config_file):
    # Check TF version
    logging.info("Tensorflow version: {}".format(tf.version.VERSION))

    # Load main config file
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)

    result_path = config["result_dir"]
    agent_type = config["agent"]
    agent_config_file = os.path.join(config["agent_config_dir"],
                                     str(agent_type) + ".yml")
    mode = config["mode"]
    environment = config["environment"]
    environment_seed = config["environment_seed"]

    # Load config file for agent
    with open(agent_config_file, "r") as f:
        agent_config = yaml.safe_load(f)

    # Create output directory
    time_str = time.strftime("%Y%m%d_%H%M%S")
    result_path = os.path.join(result_path, agent_type, time_str)
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    agent_config["render_environment"] = config["render_environment"]
    agent_config["max_episode"] = config["max_episode"]
    agent_config["max_step"] = config["max_step"]
    agent_config["slide_window"] = config["slide_window"]
    agent_config["result_path"] = result_path

    # Save config files to output directory
    copyfile(config_file,
             os.path.join(result_path, os.path.basename(config_file)))
    copyfile(agent_config_file,
             os.path.join(result_path, os.path.basename(agent_config_file)))

    logging.info(
        mode +
        " with {} algorithm in environment {}".format(agent_type, environment))
    logging.info("Results will be saved at {}".format(result_path))

    # Initialize environment
    env = gym.make('CartPole-v1')  # note: hard-coded; the `environment` value from the config is not used here
    env.seed(environment_seed)
    env = env.unwrapped

    # Build/load agent
    if agent_type == "DQN":
        agent = DQN(agent_config, env)
        agent.train()
    elif agent_type == "DDQN":
        agent = DDQN(agent_config, env)
        agent.train()
    elif agent_type == "DDQN_PER_Prop":
        agent = DDQN_PER_Prop(agent_config, env)
        agent.train()
    elif agent_type == "A2C":
        agent = A2C(agent_config, env)
        agent.train()
    elif agent_type == "REINFORCE":
        agent = REINFORCE(agent_config, env)
        agent.train()
    else:
        raise KeyError("Agent type does not exist")

    # Train or play
    if mode == "train":
        agent.train()
    elif mode == "play":
        agent.play()
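
For reference, a sketch of the top-level config that main() above reads (keys are taken from the code; values are illustrative placeholders):

config = {
    "result_dir": "results",
    "agent": "DQN",                    # or DDQN, DDQN_PER_Prop, A2C, REINFORCE
    "agent_config_dir": "configs/agents",
    "mode": "train",                   # or "play"
    "environment": "CartPole-v1",
    "environment_seed": 42,
    "render_environment": False,
    "max_episode": 500,
    "max_step": 200,
    "slide_window": 50,
}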
 
Example #12

def model_lambda():
    # Keras Q-network factory passed to the DQN agent below. The `def` line was
    # missing from the snippet and is reconstructed here; it assumes a
    # module-level `domain` object, as used in the rest of the example.
    model = Sequential()
    model.add(Dense(100, input_dim=domain.number_states()))
    model.add(Activation('relu'))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dense(domain.valid_actions()))
    model.add(Activation('linear'))
    model.compile(loss='mse', optimizer=Adam(0.001))
    return model


# agent
agent = Tabular(domain.valid_actions(),
                randomizer=lambda n: np.random.randn(n) * 0.1) 
agent = DQN(domain.number_states(), domain.valid_actions(), model_lambda=model_lambda,
            batch_size=64, epochs=1, memory_size=3000)

# algorithm
trainer = ExpectedSarsa(discount=0.95, episode_length=200)
trainer = DeepQLearning(discount=0.95, episode_length=200, encoding=domain.default_encoding)

# run
perf, eps = trainer.train_many(agent, domain,
                               # Fixed(0.5),
                               # ExponentialDecay(0.5, 0.85),
                               # VDBE(0.5, 1.0 / domain.valid_actions(), 0.01),
                               # BMC(1.0, 1.0 / 0.499999 - 1.0, 100.0),
                               BMCRobust(mu=0, tau=1,
                                         a=500, b=500,
                                         alpha=25, beta=25 + 0.01),
                               FixedLearningRate(0.6),