Example #1
def main(_):
  model_dir = get_model_dir(conf,
      ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])
  preprocess_conf(conf)

  with tf.Session() as sess:
    # environment
    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)

    assert isinstance(env.observation_space, gym.spaces.Box), \
      "observation space must be continuous"
    assert isinstance(env.action_space, gym.spaces.Box), \
      "action space must be continuous"

    # exploration strategy
    if conf.noise == 'ou':
      strategy = OUExploration(env, sigma=conf.noise_scale)
    elif conf.noise == 'brownian':
      strategy = BrownianExploration(env, conf.noise_scale)
    elif conf.noise == 'linear_decay':
      strategy = LinearDecayExploration(env)
    else:
      raise ValueError('Unknown exploration strategy: %s' % conf.noise)

    # networks
    shared_args = {
      'sess': sess,
      'input_shape': env.observation_space.shape,
      'action_size': env.action_space.shape[0],
      'hidden_dims': conf.hidden_dims,
      'use_batch_norm': conf.use_batch_norm,
      'use_seperate_networks': conf.use_seperate_networks,
      'hidden_w': conf.hidden_w, 'action_w': conf.action_w,
      'hidden_fn': conf.hidden_fn, 'action_fn': conf.action_fn,
      'w_reg': conf.w_reg,
    }

    logger.info("Creating prediction network...")
    pred_network = Network(
      scope='pred_network', **shared_args
    )

    logger.info("Creating target network...")
    target_network = Network(
      scope='target_network', **shared_args
    )
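    # Soft target update (Polyak averaging): target_network tracks pred_network at rate conf.tau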
    target_network.make_soft_update_from(pred_network, conf.tau)

    # statistic
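    # Statistic presumably handles training-progress logging and checkpointing of pred_network's variables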
    stat = Statistic(sess, conf.env_name, model_dir, pred_network.variables, conf.update_repeat)

    agent = NAF(sess, env, strategy, pred_network, target_network, stat,
                conf.discount, conf.batch_size, conf.learning_rate,
                conf.max_steps, conf.update_repeat, conf.max_episodes)

    #agent.run(conf.monitor, conf.display, conf.is_train)
    agent.run(conf.monitor, conf.display, True)
Example #2
class Simulation:
    graphic: Graphic
    world: World
    statistic: Statistic

    def __init__(self):
        self.world = World(*SimulationConfig.word_size)
        self.graphic = Graphic(self.world, *SimulationConfig.pane_size)

        if SimulationConfig.fixed_sick_cases:
            for i in range(SimulationConfig.population_size):
                if i < SimulationConfig.fixed_cases_count:
                    self.world.add_agent_on_free(Agent(self.world, True))
                else:
                    self.world.add_agent_on_free(Agent(self.world, False))
        else:
            for i in range(SimulationConfig.population_size):
                self.world.add_agent_on_free(
                    Agent(
                        self.world,
                        get_it_with_probability(
                            SimulationConfig.create_sick_agent_probability,
                            True, False)))
        self.statistic = Statistic(self.world)

    def run(self):
        while True:
            self.step()
            self.graphic.render()
            self.statistic.collect_statistics()

    def step(self):
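        # Shuffle agent order, drop dead agents, let every agent act, then apply world-level step effects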
        random.shuffle(self.world.agents)
        self.world.clear_death_agents()
        for agent in self.world.agents:
            agent.step()
        self.world.process_step_effects()
Example #4
def main(_):
    model_dir, data_dir = get_dirs(conf, ['exp_name'])
    # exp_start_time = datetime.datetime.now().strftime("%A_%b%d-%H%M%S")
    # data_dir = "logs/" + conf.exp_name + "_" + exp_start_time
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if type(env.action_space) is gym.spaces.Discrete:
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.allow_growth = True

    config = tf.ConfigProto(intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=8)
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)
        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        def var_print():
            for var in tf.global_variables():
                print(var)

        print("printing vars:------------------------------------------------")
        var_print()
        print(
            "printing vars::------------------------------------------------")

        start_steps = 1000  # warm-up: no gradient updates before this many environment steps
        time_begin = time.time()  # wall-clock reference for the convergence report below
        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        all_epi_rewards = []  # per-episode returns, needed for the convergence check below
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        # pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
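            # Scale the reward before storing it in the replay buffer; epi_rewards above accumulates the unscaled return for logging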
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size and global_step >= start_steps:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(
                        transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' %
                      ave_epi_rewards)

            if done:
                # save step
                all_epi_rewards.append(epi_rewards)
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                # pbar.update(local_step)

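                # Convergence check: minimum episode return over (up to) the last 20 episodes, despite the "5" in the names below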
                lenn = len(all_epi_rewards)
                fromm = max(lenn - 20, 0)
                to = lenn
                min_5_ep_ret = min(all_epi_rewards[fromm:to])

                # pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f avg_5_epi_rew %.1f' %
                #    (episode+1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss), sum(all_epi_rewards[fromm:to])/(to-fromm) ) )
                print(
                    'Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f \tmin_5_epi_rew %.1f'
                    % (episode + 1, epi_rewards, np.mean(pi_loss),
                       np.mean(Q_loss), min_5_ep_ret))
                threshold = -500.0
                if ((to - fromm) > 3 and min_5_ep_ret > threshold):
                    time_end = time.time()
                    print("SHI hyperParams have made algo converge (",
                          threshold, ") in ", (time_end - time_begin) / 1.0,
                          " s")
                    stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                                   np.mean(Q_loss), np.mean(pi_loss))
                    stat.save_model(global_step)
                    sys.exit()
                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
Example #5
def main(_):
    model_dir, data_dir = get_dirs(conf, ['env_name'])
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    # env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if type(env.action_space) is gym.spaces.Discrete:
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)
        # statistic
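        # Statistic here is used for metric logging (save_step) and checkpoint loading (load_model)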
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions([state], is_deterministic=False)[0] # [-inf, inf]
            next_state, reward, done, info = env.step(action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                stat.save_step(global_step, epi_rewards, np.mean(total_Q), np.mean(Q_loss), np.mean(pi_loss))
                pbar.update(local_step)
                pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f' %
                       (episode+1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss)))
                print()
                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
        pbar.close()
Example #6
def main(_):
    config.observation_dims = eval(config.observation_dims)

    # Scale some of the flags.
    for flag in [
            'memory_size', 't_target_q_update_freq', 't_ep_end', 't_train_max',
            'learning_rate_decay_step', 't_learn_start', 't_test', 't_save',
            'n_step', 'n_episode'
    ]:
        setattr(config, flag, getattr(config, flag) * config.scale)

    # Determine some more flags, clean up flags.
    if config.chtc:
        config.t_test /= 10
    config.max_to_keep = 0 if not config.chtc else 2
    config.run_dir = config.run_dir.replace('//', '/')
    config.save_dir = config.save_dir.replace('//', '/')

    # Print config.
    PrettyPrinter().pprint({
        key: config.__dict__['__wrapped'][key].value
        for key in config.__dict__['__wrapped'].__dir__()
    })

    with tf.Session() as sess:
        # Create environment, networks, statistics module, and agent.
        env = AtariEnvironment(
            env_name=config.env_name,
            n_action_repeat=config.n_action_repeat,
            max_random_start=config.max_random_start,
            observation_dims=config.observation_dims,
            display=config.display,
            use_cumulated_reward=config.use_cumulated_reward)
        pred_network = CNN(sess=sess,
                           history_length=config.history_length,
                           observation_dims=config.observation_dims,
                           output_size=env.env.action_space.n,
                           name='pred_network',
                           trainable=True)

        target_network = CNN(sess=sess,
                             history_length=config.history_length,
                             observation_dims=config.observation_dims,
                             output_size=env.env.action_space.n,
                             name='target_network',
                             trainable=False)

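        # Statistic manages the test/save schedules and persists pred_network's variables; termination_p_hat presumably drives an early-stopping criterion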
        stat = Statistic(sess=sess,
                         t_test=config.t_test,
                         t_save=config.t_save,
                         t_learn_start=config.t_learn_start,
                         run_dir=config.run_dir,
                         save_dir=config.save_dir,
                         variables=pred_network.var.values(),
                         load=config.load,
                         chtc=config.chtc,
                         window_length=config.window_length,
                         termination_p_hat=config.termination_p_hat,
                         max_to_keep=config.max_to_keep)

        agent = DeepQAgent(sess=sess,
                           pred_network=pred_network,
                           env=env,
                           stat=stat,
                           config=config,
                           target_network=target_network)

        # Begin training/playing.
        if config.is_train:
            agent.train(config.t_train_max)
        else:
            agent.play(test_ep=0.,
                       n_step=config.n_step,
                       n_episode=config.n_episode)

        agent.stat.save_model(agent.t, stat.saver)
        print(" [*] Cleanly closing!")
Example #7
def train():
    # create placeholders for the input image and ground truths
    input_ph, ground_truths_ph, ground_truths, pre_processed_input = dh.get_place_holders(
    )
    # Processing LabelId's
    one_hot_labels = utils.one_hot(
        ground_truths[0],
        is_color=False)  # TODO: add dictionary task-to-label-number
    # Getting the model
    autoencoder = utils.get_autoencoder(user_config.autoencoder,
                                        config.working_dataset, config.strided)

    logits = autoencoder.inference(pre_processed_input)

    processed_ground_truths = [
        one_hot_labels, ground_truths[1], ground_truths[2]
    ]
    loss_op, loss_list, multi_loss_class = lh.get_loss(
        logits, processed_ground_truths)

    optimizer = tf.train.AdamOptimizer(FLAGS.leaning_rate)
    train_step = optimizer.minimize(loss_op)

    saver = tf.train.Saver()
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=config.gpu_memory_fraction)
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    gpu_options=gpu_options)
    if FLAGS.use_summary:
        sh.define_summaries(logits, ground_truths, processed_ground_truths,
                            loss_op, loss_list, multi_loss_class)
    num_of_train_examples = FLAGS.num_of_train_imgs
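    # Statistic here presumably computes per-epoch metrics on the training and validation data (see handle_statistic in the epoch loop)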
    statistic = Statistic(logits, loss_op, loss_list, input_ph,
                          ground_truths_ph, multi_loss_class,
                          processed_ground_truths)
    val_input_img, val_gt = dh.init_data(FLAGS.num_of_val_imgs)

    for ind in range(FLAGS.num_of_val_imgs):
        val_input_img[ind], val_gt[ind] = dh.get_data(ind, 'val')
    with tf.Session(config=session_config) as sess:
        global_step = start_training(sess, autoencoder, saver)
        summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(results_dir + '/logs',
                                               sess.graph)
        input_img, gt = dh.init_data(num_of_train_examples)
        input_batch = None
        # training starts here
        step = 0
        for epoch in range(FLAGS.num_of_epchs):
            print("\nEpoch: " + str(epoch))
            sub_batche = 0
            for ind in tqdm(np.random.permutation(num_of_train_examples)):
                if input_img[ind] is None:
                    input_img[ind], gt[ind] = dh.get_data(ind, 'train')
                # ----- make the random batch ----
                if sub_batche == 0:
                    input_batch = input_img[ind]
                    gt_batch = gt[ind]
                else:
                    input_batch, gt_batch = add_to_batch(
                        input_batch, gt_batch, input_img[ind], gt[ind])
                if sub_batche < FLAGS.batch - 1:
                    sub_batche += 1
                    continue
                sub_batche = 0
                # ---- batch is ready ----
                feed_dict = get_feed_dict(input_ph, ground_truths_ph,
                                          input_batch, gt_batch)
                sess.run(train_step, feed_dict=feed_dict)
                if FLAGS.use_summary and step % FLAGS.calc_summary == 0:
                    sh.handle_summarys(sess, logits, summary, summary_writer,
                                       step, feed_dict)
                step += 1
            statistic.handle_statistic(epoch, logits, sess, input_img, gt,
                                       val_input_img, val_gt)
            if epoch % FLAGS.epoch_model_ckpts == 0:
                ckpt_dir = os.path.join(results_dir, 'global_ckpt')
                if not os.path.exists(ckpt_dir):
                    os.mkdir(ckpt_dir)
                saver.save(sess,
                           os.path.join(ckpt_dir, 'global_ckpt'),
                           global_step=global_step)
            if epoch % FLAGS.epoch_analysis_breakpoints == 0:
                analysis_ckpt_dir = os.path.join(results_dir, 'Analysis_ckpts')
                if not os.path.exists(analysis_ckpt_dir):
                    os.mkdir(analysis_ckpt_dir)
                saver.save(sess,
                           os.path.join(analysis_ckpt_dir,
                                        'epoch_' + str(epoch)),
                           global_step=global_step)