Example #1
def mujoco_gail():
    return dict(
        policy_network=mlp(num_hidden=100, num_layers=2),
        classifier_network=mlp(num_hidden=100, num_layers=2),
        max_iters=4001,
        timesteps_per_batch=1000,
        max_kl=0.01,
        cg_iters=10,
        gamma=0.995,
        lam=0.97,
        entcoeff=0.0,
        cg_damping=0.1,
        vf_stepsize=1e-3,
        vf_iters=5,
        expert_trajs_path='./expert_trajs',
        num_expert_trajs=25,
        g_step=1,
        d_step=5,
        classifier_entcoeff=1e-3,
        num_particles=1,
        d_stepsize=0.01,
        normalize_observations=True,
        observation_dependent_var=True,
        use_classifier_logsumexp=False,
        use_reward_logsumexp=False,
        use_svgd=False
    )
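Example #2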
def main():
    ishumanFirstPlayer = int(sys.argv[1])
    ishumanCut = int(sys.argv[2])
    iterNo = int(sys.argv[3])
    env = gym.make('shannon_switching-v0')
    env.configureEnvironment(computerType="minMax",
                             ishumanFirstPlayer=ishumanFirstPlayer,
                             ishumanCut=ishumanCut,
                             iterNo=iterNo)
    print("Computer Type: ", "minMax")
    print("ishumanFirstPlayer ", ishumanFirstPlayer)
    print("ishumanCut", ishumanCut)
    print("iterNo", iterNo)
    act = deepq.learn(
        env,
        network=models.mlp(num_hidden=20, num_layers=3),
        lr=5e-4,
        total_timesteps=50,
        buffer_size=5000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=1,
        param_noise=False,
        prioritized_replay=True,
        load_path='model/minMax/shannon_switching_{}_{}_{}.pkl'.format(
            ishumanFirstPlayer, ishumanCut, iterNo - 1) if iterNo > 0 else None)
    # Save the trained model before returning so the next iteration can load it.
    print("Saving model to model/minMax/shannon_switching_{}_{}_{}.pkl".format(
        ishumanFirstPlayer, ishumanCut, iterNo))
    act.save("model/minMax/shannon_switching_{}_{}_{}.pkl".format(
        ishumanFirstPlayer, ishumanCut, iterNo))
    act.save_act("model/minMax/shannon_switching_train_{}_{}_{}.pkl".format(
        ishumanFirstPlayer, ishumanCut, iterNo))
    return act
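Example #3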
def main():

    env = gym.make("mediator-v0")

    act = learn(env,
                network=models.mlp(num_layers=3,
                                   num_hidden=128,
                                   activation=tf.tanh,
                                   layer_norm=False),
                lr=1e-3,
                total_timesteps=10000,
                buffer_size=5000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=25,
                batch_size=128,
                print_freq=100,
                learning_starts=1000,
                gamma=0.1,
                target_network_update_freq=100,
                param_noise=True,
                callback=callback)

    print("Saving model to mediator_model.pkl")
    act.save("mediator-v0_model_00.pkl")
Example #4
def load_policy(model_path,
                input_dim,
                output_dim,
                num_hidden,
                num_layers,
                init_logstd=1.,
                discrete=False,
                beta=1.0):
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    if discrete:
        action_space = Discrete(n=output_dim)
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim, ))
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)
    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    policy_train = build_policy(observation_space,
                                action_space,
                                network,
                                trainable_variance=True,
                                state_dependent_variance=True,
                                beta=beta,
                                init_logstd=init_logstd)()
    U.initialize()
    policy_train.load(model_path)
    return policy_train
Example #5
def main(learning_rate):

    # Reset the default graph to avoid conflicts with existing parameters
    # (not recommended when parameters need to be reused).
    tf.reset_default_graph()
    graph = tf.get_default_graph()
    #print(graph.get_operations())
    env = PowerDynSimEnv(case_files_array, dyn_config_file, rl_config_file,
                         java_port)
    #model = deepq.models.mlp([128,128])

    act = deepq.learn(env,
                      network=models.mlp(num_layers=2,
                                         num_hidden=128,
                                         activation=tf.nn.relu),
                      lr=learning_rate,
                      total_timesteps=900000,
                      buffer_size=50000,
                      checkpoint_freq=1000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)
    print("Saving final model to power_model_multistep498_508_lr_%s_90w.pkl" %
          (str(learning_rate)))
    act.save(savedModel + "/" + model_name + "_lr_%s_90w.pkl" %
             (str(learning_rate)))
Example #6
def main():
    random.seed(10)

    env = gym.make('Scaling-v0')
    act = deepq.learn(env,
                      network=models.mlp(num_hidden=20, num_layers=1),
                      train_freq=4,
                      buffer_size=1000,
                      exploration_fraction=1.0,
                      exploration_final_eps=1e-5,
                      total_timesteps=200000,
                      prioritized_replay=True,
                      checkpoint_freq=None,
                      print_freq=1)

    # play model using shorter change rate
    env.change_rate = 100

    frames = 1000
    play(act, env, frames)

    # play sine curve
    env.change_rate = 1
    env.scaling_env_options['input'] = INPUTS['SINE_CURVE']

    play(act, env, frames)
Example #7
def sample_strategy_from_mixed(env, str_set, mix_str, identity):

    if not isinstance(mix_str, np.ndarray):
        raise ValueError("mix_str in sample func is not a numpy array.")

    if not len(str_set) == len(mix_str):
        raise ValueError("Length of mixed strategies does not match number of strategies.")

    picked_str = np.random.choice(str_set, p=mix_str)
    if not fp.isInName('.pkl', name=picked_str):
        raise ValueError('The strategy picked is not a pickle file.')

    if identity == 0:  # pick a defender's strategy
        path = DIR + 'defender_strategies/'
    elif identity == 1:  # pick an attacker's strategy
        path = DIR + 'attacker_strategies/'
    else:
        raise ValueError("identity is neither 0 nor 1!")

    if not fp.isExist(path + picked_str):
        raise ValueError('The strategy picked does not exist!')

    #TODO: assign nn info from game
    act = deepq.learn(
        env,
        network=models.mlp(num_hidden=256, num_layers=1),
        total_timesteps=0,
        load_path=path + picked_str
    )

    return act
Example #8
def main():
    env = gym.make("sparse-v0")
    act = deepq.learn(env,
                      network=models.mlp(num_layers=2,
                                         num_hidden=128,
                                         activation=tf.nn.relu),
                      total_timesteps=0,
                      load_path=dirs)

    while True:
        obs, screen_obs = env.reset_with_render()
        done = False
        episode_rew = 0
        converted = converter(screen_obs)
        my_plot = plt.imshow(converted)
        while not done:
            obs, rew, done, _, screen_obs = env.step_with_render(act(obs)[0])
            #obs, rew, done, _ , screen_obs = env.step_with_render(env.action_space.sample())
            converted = converter(screen_obs)
            plt.ion()
            my_plot.autoscale()
            my_plot.set_data(converted)
            plt.pause(.1)
            plt.draw()
            plt.show()
            print("action: ", act(obs)[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #9
def main(test_episodes=20, test_steps=50):
    env = env_search_control()
    print(env.observation_space)
    print(env.action_space)
    act = deepq.learn(env,
                      network=models.mlp(num_layers=1, num_hidden=64),
                      total_timesteps=0,
                      total_episodes=0,
                      total_steps=0,
                      load_path="assembly_model_fuzzy_final.pkl")
    episode_rewards = []
    episode_states = []
    for i in range(test_episodes):
        obs, done = env.reset()
        episode_rew = 0
        episode_obs = []
        logger.info(
            "================== The {} episode start !!! ===================".
            format(i))
        for j in range(test_steps):
            obs, rew, done, _ = env.step(act(obs[None])[0], j)
            episode_rew += rew
            episode_obs.append(obs)
        episode_rewards.append(cp.deepcopy(episode_rew))
        episode_states.append(cp.deepcopy(episode_obs))
        print("Episode reward", episode_rew)

    np.save('../data/test_episode_reward_fuzzy_final_new', episode_rewards)
    np.save('../data/test_episode_state_fuzzy_final_new', episode_states)
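Example #10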
def main():
    # setup environment
    ishumanFirstPlayer = int(sys.argv[1])
    ishumanCut = int(sys.argv[2])
    iterNo = int(sys.argv[3])
    env = gym.make('shannon_switching-v0')
    env.configureEnvironment(computerType="selfPlayZero",
                             ishumanFirstPlayer=ishumanFirstPlayer,
                             ishumanCut=ishumanCut,
                             iterNo=iterNo)
    print("Computer Type: ", "selfPlayZero")
    print("ishumanFirstPlayer ", ishumanFirstPlayer)
    print("ishumanCut", ishumanCut)
    print("iterNo", iterNo)
    # input("Press Enter to continue...")

    # train network
    act = deepq.learn(
        env,
        network=models.mlp(num_hidden=25, num_layers=8),
        lr=5e-4,
        total_timesteps=20,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=1,
        param_noise=False,
        prioritized_replay=True,
        load_path='model/selfPlayZero/shannon_switching_{}_{}_{}.pkl'.format(
            ishumanFirstPlayer, ishumanCut, iterNo - 1) if iterNo > 0 else None)
    print("Saving model to model/selfPlayZero/shannon_switching_{}_{}_{}.pkl".
          format(ishumanFirstPlayer, ishumanCut, iterNo))
    # Save the trained model so the next iteration can load it.
    act.save("model/selfPlayZero/shannon_switching_{}_{}_{}.pkl".format(
        ishumanFirstPlayer, ishumanCut, iterNo))
Example #11
def train(args, extra_args):
    env_type = 'steam'
    env_id = 'dota2'
    print('env_type: {}'.format(env_type))

    alg_kwargs = dict(network=models.mlp(num_hidden=128, num_layers=1),
                      lr=1e-3,
                      buffer_size=10000,
                      total_timesteps=500000,
                      exploration_fraction=1.0,
                      exploration_initial_eps=0.1,
                      exploration_final_eps=0.1,
                      train_freq=4,
                      target_network_update_freq=1000,
                      gamma=0.999,
                      batch_size=32,
                      prioritized_replay=True,
                      prioritized_replay_alpha=0.6,
                      experiment_name='test',
                      dueling=True)
    alg_kwargs.update(extra_args)
    env = DotaEnvironment()
    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    pool_size = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=pool_size) as pool:
        model = learn(env=env, **alg_kwargs)

    return model, env
Example #12
def mlp_sokoban(observation, output_dim=1):
    net = observation
    net = mlp(num_layers=2,
              num_hidden=64,
              activation=tf.tanh,
              layer_norm=False)(net)
    output = slim.fully_connected(net, output_dim, activation_fn=None)
    return output
Example #13
def mlp_bsuite(observation, output_dim=1, output_rescale=1.):
    """Similiar to bsuite default agent network."""
    net = observation
    net = mlp(num_layers=2,
              num_hidden=50,
              activation=tf.nn.relu,
              layer_norm=False)(net)
    output = slim.fully_connected(net, output_dim,
                                  activation_fn=None) * output_rescale
    return output
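Example #14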
def mujoco():
    return dict(network=mlp(num_hidden=32, num_layers=2),
                timesteps_per_batch=1024,
                max_kl=0.01,
                cg_iters=10,
                cg_damping=0.1,
                gamma=0.99,
                lam=0.98,
                vf_iters=5,
                vf_stepsize=1e-3,
                normalize_observations=True)
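Example #15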
def load_action(path, game):

    env = game.env
    num_layers = game.num_layers
    num_hidden = game.num_hidden
    act = deepq.learn(env,
                      network=models.mlp(num_layers=num_layers,
                                         num_hidden=num_hidden),
                      total_timesteps=0,
                      load_path=path)
    return act
Example #16
def training_hado_att(game, transfer=False):
    param = game.param
    mix_str_def = game.hado_str(identity=0, param=param)

    if len(mix_str_def) != len(game.def_str):
        raise ValueError(
            "The length of mix_str_def and def_str does not match while retraining"
        )

    env = game.env
    env.reset_everything()

    env.set_training_flag(1)

    env.defender.set_mix_strategy(mix_str_def)
    env.defender.set_str_set(game.def_str)

    param_path = os.getcwd() + '/network_parameters/param.json'
    param = jp.load_json_data(param_path)

    if transfer:
        lr = param['trans_lr']
        total_timesteps = param['trans_timesteps']
        ex_frac = param['trans_exploration_fraction']
        ex_final_eps = param['trans_exploration_final_eps']
    else:
        lr = param['lr']
        total_timesteps = param['total_timesteps']
        ex_frac = param['exploration_fraction']
        ex_final_eps = param['exploration_final_eps']

    learner = Learner(retrain=True, freq=param['retrain_freq'])
    #TODO: add epoch???
    with learner.graph.as_default():
        with learner.sess.as_default():
            act_att, _ = learner.learn_multi_nets(
                env,
                network=models.mlp(num_hidden=param['num_hidden'],
                                   num_layers=param['num_layers']),
                lr=lr,
                total_timesteps=total_timesteps,
                exploration_fraction=ex_frac,
                exploration_final_eps=ex_final_eps,
                print_freq=param['print_freq'],
                param_noise=param['param_noise'],
                gamma=param['gamma'],
                prioritized_replay=param['prioritized_replay'],
                checkpoint_freq=param['checkpoint_freq'],
                scope='att_str_retrain' + str(0) + '.pkl' + '/',
                load_path=os.getcwd() + '/retrain_att/' + 'att_str_retrain' +
                str(0) + '.pkl')
            # print("Saving attacker's model to pickle.")
            # act_att.save(os.getcwd() + '/retrain_att/' + 'att_str_retrain' + str(epoch) + ".pkl", 'att_str_epoch' + str(epoch) + '.pkl' + '/')
    learner.sess.close()
Example #17
def dr():
    return dict(
        network=mlp(num_hidden=128, num_layers=3),
        timesteps_per_batch=144 * 100,
        max_kl=0.01,
        gamma=0.995,
        lam=0.95,
        ent_coef=0.01,
        activation=tf.nn.relu,
        normalize_observations=True,
        value_network='copy',
    )
Example #18
def training_def(game, mix_str_att, epoch, retrain=False):
    if len(mix_str_att) != len(game.att_str):
        raise ValueError(
            "The length of mix_str_att and att_str does not match while retraining"
        )

    print("training_def mix_str_att is ", mix_str_att)

    # env = copy.deepcopy(game.env)
    env = game.env
    env.reset_everything()

    env.set_training_flag(0)

    env.attacker.set_mix_strategy(mix_str_att)
    env.attacker.set_str_set(game.att_str)

    param_path = os.getcwd() + '/network_parameters/param.json'
    param = jp.load_json_data(param_path)

    if retrain:
        scope = 'def_str_retrain' + str(0) + '.pkl' + '/'
    else:
        scope = 'def_str_epoch' + str(epoch) + '.pkl' + '/'

    learner = Learner()
    with learner.graph.as_default():
        with learner.sess.as_default():
            act_def, d_BD = learner.learn_multi_nets(
                env,
                network=models.mlp(num_hidden=param['num_hidden'],
                                   num_layers=param['num_layers']),
                lr=param['lr'],
                total_timesteps=param['total_timesteps_def'],
                exploration_fraction=param['exploration_fraction'],
                exploration_final_eps=param['exploration_final_eps'],
                print_freq=param['print_freq'],
                param_noise=param['param_noise'],
                gamma=param['gamma'],
                prioritized_replay=param['prioritized_replay'],
                checkpoint_freq=param['checkpoint_freq'],
                scope=scope,
                epoch=epoch)
            print("Saving defender's model to pickle.")
            if retrain:
                act_def.save(
                    os.getcwd() + '/retrain_def/' + 'def_str_retrain' +
                    str(0) + '.pkl', 'def_str_retrain' + str(0) + '.pkl' + '/')
            else:
                act_def.save(DIR_def + "def_str_epoch" + str(epoch) + ".pkl",
                             "def_str_epoch" + str(epoch) + '.pkl' + '/')
    learner.sess.close()
    return d_BD
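Example #19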
def sample_strategy_from_mixed(env, str_set, mix_str, identity, str_dict=None):

    if not isinstance(mix_str, np.ndarray):
        raise ValueError("mix_str in sample func is not a numpy array.")

    if not len(str_set) == len(mix_str):
        raise ValueError(
            "Length of mixed strategies does not match number of strategies.")

    # if np.sum(mix_str) != 1:
    #     mix_str = mix_str/np.sum(mix_str)

    picked_str = np.random.choice(str_set, p=mix_str)
    # print('current str:', picked_str)
    #TODO: modification for fast sampling.
    if str_dict is not None:
        return str_dict[picked_str]

    if not fp.isInName('.pkl', name=picked_str):
        raise ValueError('The strategy picked is not a pickle file.')

    if identity == 0:  # pick a defender's strategy
        path = DIR + 'defender_strategies/'
    elif identity == 1:
        path = DIR + 'attacker_strategies/'
    else:
        raise ValueError("identity is neither 0 or 1!")

    # print(path + picked_str)
    if not fp.isExist(path + picked_str):
        raise ValueError('The strategy picked does not exist!')

    if "epoch1.pkl" in picked_str:
        act = fp.load_pkl(path + picked_str)
        return act

    flag = env.training_flag
    env.set_training_flag(identity)

    param_path = os.getcwd() + '/network_parameters/param.json'
    param = jp.load_json_data(param_path)

    act = learn(env,
                network=models.mlp(num_hidden=param['num_hidden'],
                                   num_layers=param['num_layers']),
                total_timesteps=0,
                load_path=path + picked_str,
                scope=picked_str + '/')

    env.set_training_flag(flag)

    return act
Example #20
def airhockey():
    return dict(
        network=mlp(num_hidden=32, num_layers=2),
        timesteps_per_batch=1024,
        max_kl=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.5**(1. / 16),
        lam=0.5**(1. / 8),
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=False,
    )
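Example #21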
def classic_control():
    return dict(
        network=mlp(num_hidden=32, num_layers=2),
        timesteps_per_batch=1024,
        #epsilon=0.01,
        #cg_iters=10,
        #cg_damping=0.1,
        #gamma=0.99,
        #lam=0.98,
        #vf_iters=5,
        #vf_stepsize=1e-3,
        normalize_observations=True,
    )
Example #22
def lucia_env():
    return dict(
        network=mlp(num_hidden=64, num_layers=5),
        timesteps_per_batch=1024,
        max_kl=0.005,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.98,
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=False,
    )
Example #23
def ev():
    return dict(
        network=mlp(num_hidden=64, num_layers=3),
        timesteps_per_batch=7000,
        max_kl=0.01,
        max_sf=2.0,
        gamma=0.99,
        lam=0.95,
        ent_coef=0.1,
        activation=tf.nn.relu,
        normalize_observations=True,
        value_network='copy',
    )
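Example #24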
def roboschool():
    return dict(
        network=mlp(num_hidden=64, num_layers=2),
        timesteps_per_batch=512,
        max_kl=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.99,
        lam=0.98,
        vf_iters=10,
        vf_stepsize=1e-3,
        normalize_observations=True,
    )
Example #25
def mujoco():
    return dict(
        network=mlp(num_hidden=32, num_layers=2),
        timesteps_per_batch=1024,
        max_kl=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.99,
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=True,
    )
Example #26
def sample_both_strategies(env, att_str_set, att_mix_str, def_str_set, def_mix_str):

    if not len(att_str_set) == len(att_mix_str):
        raise ValueError("Length of mixed strategies does not match number of strategies for the attacker.")
    if not len(def_str_set) == len(def_mix_str):
        raise ValueError("Length of mixed strategies does not match number of strategies for the defender.")

    att_picked_str = np.random.choice(att_str_set, p=att_mix_str)
    def_picked_str = np.random.choice(def_str_set, p=def_mix_str)

    if not fp.isInName('.pkl', name=def_picked_str):
        raise ValueError('The strategy picked is not a pickle file for the defender.')
    if not fp.isInName('.pkl', name=att_picked_str):
        raise ValueError('The strategy picked is not a pickle file for the attacker.')

    path_def = DIR + 'defender_strategies/'
    path_att = DIR + 'attacker_strategies/'

    if not fp.isExist(path_def + def_picked_str):
        raise ValueError('The strategy picked does not exist for the defender!')
    if not fp.isExist(path_att + att_picked_str):
        raise ValueError('The strategy picked does not exist for the attacker!')

    act_att = deepq.learn(
        env,
        network=models.mlp(num_hidden=256, num_layers=1),
        total_timesteps=0,
        load_path=path_att + att_picked_str
    )

    act_def = deepq.learn(
        env,
        network=models.mlp(num_hidden=256, num_layers=1),
        total_timesteps=0,
        load_path=path_def + def_picked_str
    )

    return act_att, act_def
Example #27
def mujoco():
    return dict(
        network=mlp(num_hidden=256, num_layers=5),
        nsteps=2048,
        nminibatches=8,
        lam=0.95,
        gamma=0.99,
        noptepochs=10,
        log_interval=1,
        ent_coef=0.0,
        lr=lambda f: 5e-5 * f,
        cliprange=0.2,
        #value_network='copy'
    )
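Example #28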
def main():
    env = gym.make("MountainCar-v0")
    # Enabling layer_norm here is important for parameter space noise!
    act = deepq.learn(env,
                      network=models.mlp(num_hidden=64, num_layers=1),
                      lr=1e-3,
                      total_timesteps=100000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.1,
                      print_freq=10,
                      param_noise=True)
    print("Saving model to mountaincar_model.pkl")
    act.save("mountaincar_model.pkl")
def robotics():
    return dict(
        network=mlp(num_hidden=32, num_layers=2),
        timesteps_per_batch=1024,
        epsilon=0.01,
        cg_iters=10,
        cg_damping=0.1,
        gamma=0.99,
        lam=0.98,
        vf_iters=5,
        vf_stepsize=1e-3,
        normalize_observations=True,
        entcoeff=0.0,
    )
Example #30
def rand_str_generator(env, game):
    # Generate random nn for attacker.
    num_layers = game.num_layers
    num_hidden = game.num_hidden

    act_att = deepq.learn(
        env,
        network=models.mlp(num_hidden=num_hidden, num_layers=num_layers-3),
        total_timesteps=0
    )

    act_def = deepq.learn(
        env,
        network=models.mlp(num_hidden=num_hidden, num_layers=num_layers-3),
        total_timesteps=0
    )

    print("Saving attacker's model to pickle. Epoch name is equal to 1.")
    act_att.save(DIR_att + "att_str_epoch" + str(1) + ".pkl")
    game.att_str.append("att_str_epoch" + str(1) + ".pkl")

    print("Saving defender's model to pickle. Epoch in name is equal to 1.")
    act_def.save(DIR_def + "def_str_epoch" + str(1) + ".pkl")
    game.def_str.append("def_str_epoch" + str(1) + ".pkl")
Example #31
def main():
    env = gym.make("MountainCar-v0")
    act = deepq.learn(env,
                      network=models.mlp(num_layers=1, num_hidden=64),
                      total_timesteps=0,
                      load_path='mountaincar_model.pkl')

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def main():
    env = gym.make("MountainCar-v0")
    # Enabling layer_norm here is important for parameter space noise!
    act = deepq.learn(
        env,
        network=models.mlp(num_hidden=64, num_layers=1),
        lr=1e-3,
        total_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.1,
        print_freq=10,
        param_noise=True
    )
    print("Saving model to mountaincar_model.pkl")
    act.save("mountaincar_model.pkl")
def main():
    env = gym.make("MountainCar-v0")
    act = deepq.learn(
        env,
        network=models.mlp(num_layers=1, num_hidden=64),
        total_timesteps=0,
        load_path='mountaincar_model.pkl'
    )

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)