Example #1
def main(config, max_num_of_steps, max_num_of_episodes, load_model, save_model,
         load_memory, save_memory, log_path):
    agent = DQNAgent(config)

    with agent.graph.as_default():
        if load_model:
            step = agent.load_model(load_model)
            screen_log.info("Load model: {}".format(load_model))
            screen_log.info("Start from step {}".format(step))
        else:
            step = 0

        if load_memory:
            agent.load_memory(load_memory)
            n_frames = len(agent.memory)
            screen_log.info("Load memory: {}".format(load_memory))
            screen_log.info("Memory size: {}".format(n_frames))

        log_name = '{:02}{:02}{:02}{:02}{:02}'.format(*time.localtime()[1:6])
        summary_writer = tf.summary.FileWriter(
            logdir=os.path.join(log_path, log_name), graph=agent.graph)

        episode = 0
        rewards_per_episode = []
        sum_Qs = 0.0
        sum_losses = 0.0

        try:
            while step < max_num_of_steps and episode < max_num_of_episodes:
                episode += 1
                episode_done = False

                next_observation = reset_random_env()
                next_observation = preprocess_observation(next_observation)

                rewards_per_episode.append(0)

                while not episode_done:
                    observation = next_observation

                    if len(agent.memory) < config['replay_start_size']:
                        # init replay memory
                        action = env.action_space.sample()

                        next_observation, reward, episode_done, info = env.step(
                            action)
                        next_observation = preprocess_observation(
                            next_observation)
                        agent.memory.append(
                            MemoryItem(observation, action, reward,
                                       episode_done, info))

                        continue

                    state = agent.get_recent_state(observation)
                    Qs = agent.get_Q_values(state)
                    Qs = Qs[0]

                    # epsilon-greedy action selection
                    epsilon = get_epsilon(config, step)
                    if np.random.rand() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = agent.get_action_from_Q(Qs)

                    next_observation, reward, episode_done, info = env.step(
                        action)
                    next_observation = preprocess_observation(next_observation)
                    agent.memory.append(
                        MemoryItem(observation, action, reward, episode_done,
                                   info))

                    step += 1
                    rewards_per_episode[-1] += reward
                    sum_Qs += Qs[action]

                    # train step
                    loss, loss_summary_str = agent.optimize_Q()
                    summary_writer.add_summary(loss_summary_str, step)
                    sum_losses += loss

                    if step % 1000 == 0:
                        ave_loss = sum_losses / step
                        ave_reward = np.mean(rewards_per_episode)
                        ave_Q = sum_Qs / step

                        Q_summary_str, reward_summary_str = agent.evaluate(
                            ave_reward, ave_Q)

                        summary_writer.add_summary(Q_summary_str, step)
                        summary_writer.add_summary(reward_summary_str, step)

                        screen_log.info(
                            'step: {}, ave. loss: {:g}, '
                            'ave. reward: {:g}, ave. Q: {:g}'.format(
                                step,
                                ave_loss,
                                ave_reward,
                                ave_Q,
                            ))
                    if step % 10000 == 0:
                        agent.save_model(save_model, step)
                    if step % 1000000 == 0:
                        agent.save_memory(save_memory, step)

        except KeyboardInterrupt:
            print("\nUser interrupted training...")
        finally:
            summary_writer.close()

            agent.save_model(save_model, step)
            agent.save_memory(save_memory, step)

        screen_log.info('Finished: {} steps, {} episodes.'.format(
            step, episode))
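
Example #1 calls two helpers that are not shown here: get_epsilon(config, step) and preprocess_observation(observation). Below is a minimal sketch of what such helpers typically look like in DQN code, assuming a linearly annealed epsilon and standard grayscale 84x84 frame preprocessing; the config keys and the cv2 dependency are assumptions, not the project's actual implementation.

import numpy as np
import cv2  # assumed dependency, only for this sketch


def get_epsilon(config, step):
    # Hypothetical linear annealing schedule; these config keys are assumed.
    start = config['initial_epsilon']                # e.g. 1.0
    end = config['final_epsilon']                    # e.g. 0.1
    anneal_steps = config['final_exploration_step']  # e.g. 1000000
    fraction = min(float(step) / anneal_steps, 1.0)
    return start + fraction * (end - start)


def preprocess_observation(observation):
    # Common Atari-style preprocessing: grayscale, then resize to 84x84.
    gray = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA).astype(np.uint8)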
Example #2
File: main.py  Project: trigrass2/RainBow
def main():
    parser = argparse.ArgumentParser(
        description='Run DQN on Atari Space Invaders')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape',
                        default=(84, 84),
                        nargs=2,
                        type=int,
                        help='Input shape')
    parser.add_argument('--gamma',
                        default=0.99,
                        type=float,
                        help='Discount factor')
    parser.add_argument('--epsilon',
                        default=0.1,
                        type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate',
                        default=0.00025,
                        type=float,
                        help='Training learning rate.')
    parser.add_argument('--window_size',
                        default=4,
                        type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_process',
                        default=3,
                        type=int,
                        help='Number of parallel environments')
    parser.add_argument('--num_iteration',
                        default=20000000,
                        type=int,
                        help='number of iterations to train')
    parser.add_argument(
        '--eval_every',
        default=0.001,
        type=float,
        help='What fraction of num_iteration to run between evaluations.')
    parser.add_argument('--is_duel',
                        default=1,
                        type=int,
                        help='Whether to use dueling DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_double',
        default=1,
        type=int,
        help='Whether to use double DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_per',
        default=1,
        type=int,
        help='Whether to use PriorityExperienceReplay, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_distributional',
        default=1,
        type=int,
        help='Whether to use distributional DQN, 0 means no, 1 means yes.')
    parser.add_argument('--num_step',
                        default=1,
                        type=int,
                        help='Number of steps for multi-step DQN, 3 is recommended')
    parser.add_argument('--is_noisy',
                        default=1,
                        type=int,
                        help='Whether to use NoisyNet, 0 means no, 1 means yes.')

    args = parser.parse_args()
    args.input_shape = tuple(args.input_shape)
    print('Environment: %s.' % (args.env, ))
    env = gym.make(args.env)
    num_actions = env.action_space.n
    print('number_actions: %d.' % (num_actions, ))
    env.close()

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = BatchEnvironment(args.env, args.num_process,
                                         args.window_size, args.input_shape,
                                         NUM_FRAME_PER_ACTION,
                                         MAX_EPISODE_LENGTH)

    if args.is_per == 1:
        replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE,
                                                 args.window_size,
                                                 args.input_shape)
    else:
        replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                     args.input_shape)

    create_network_fn = create_deep_q_network if args.is_duel == 0 else create_duel_q_network
    create_model_fn = create_model if args.is_distributional == 0 else create_distributional_model
    noisy = args.is_noisy == 1
    online_model, online_params = create_model_fn(args.window_size,
                                                  args.input_shape,
                                                  num_actions,
                                                  'online_model',
                                                  create_network_fn,
                                                  trainable=True,
                                                  noisy=noisy)
    target_model, target_params = create_model_fn(args.window_size,
                                                  args.input_shape,
                                                  num_actions,
                                                  'target_model',
                                                  create_network_fn,
                                                  trainable=False,
                                                  noisy=noisy)
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]

    agent = DQNAgent(online_model, target_model, replay_memory, num_actions,
                     args.gamma, UPDATE_FREQUENCY, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size, args.is_double,
                     args.is_per, args.is_distributional, args.num_step,
                     args.is_noisy, args.learning_rate, RMSP_DECAY,
                     RMSP_MOMENTUM, RMSP_EPSILON)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, num_actions,
                                          NUM_FIXED_SAMPLES)

        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Begin to train:
        fit_iteration = int(args.num_iteration * args.eval_every)

        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var = agent.evaluate(sess, batch_environment,
                                                     NUM_EVALUATE_EPSIODE)
            mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)

    batch_environment.close()
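
Examples #2 and #3 both synchronize the target network by running a list of assign ops that copy each online parameter into its target counterpart. A toy sketch of that pattern in isolation, with two dummy variables standing in for real Q-networks (TF 1.x API, as in the examples):

import tensorflow as tf  # TF 1.x

with tf.variable_scope('online_model'):
    w_online = tf.get_variable('w', shape=[4, 2])
with tf.variable_scope('target_model'):
    w_target = tf.get_variable('w', shape=[4, 2], trainable=False)

online_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='online_model')
target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_model')

# One assign op per (online, target) pair, exactly as in the examples above.
update_target_params_ops = [
    t.assign(s) for s, t in zip(online_params, target_params)
]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_target_params_ops)  # target now mirrors online
    print(sess.run(tf.reduce_all(tf.equal(w_online, w_target))))  # True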
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='Train using Gazebo Simulations')
    parser.add_argument('--seed', default=10, type=int, help='Random seed')
    parser.add_argument('--input_shape',
                        default=(80, 100),
                        nargs=2,
                        type=int,
                        help='Input shape')
    parser.add_argument('--gamma',
                        default=0.99,
                        type=float,
                        help='Discount factor')
    parser.add_argument('--epsilon',
                        default=0.1,
                        type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate',
                        default=0.00001,
                        type=float,
                        help='Learning rate')
    parser.add_argument('--window_size',
                        default=4,
                        type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--num_time',
                        default=4,
                        type=int,
                        help='Number of steps in RNN')
    parser.add_argument('--num_actions',
                        default=7,
                        type=int,
                        help='Number of actions')
    parser.add_argument('--batch_size',
                        default=64,
                        type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_iteration',
                        default=500000,
                        type=int,
                        help='number of iterations to train')
    parser.add_argument(
        '--eval_every',
        default=0.01,
        type=float,
        help='What fraction of num_iteration to run between evaluations')

    args = parser.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = GazeboWorld()
    print('Environment initialized')

    replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                 args.input_shape)
    online_model, online_params = create_model(args.window_size,
                                               args.input_shape,
                                               args.num_actions,
                                               'online_model',
                                               create_duel_q_network,
                                               trainable=True)
    target_model, target_params = create_model(args.window_size,
                                               args.input_shape,
                                               args.num_actions,
                                               'target_model',
                                               create_duel_q_network,
                                               trainable=False)
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]

    agent = DQNAgent(online_model, target_model, replay_memory,
                     args.num_actions, args.gamma, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size,
                     args.learning_rate)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        # saving and loading networks
        trainables = tf.trainable_variables()
        trainable_saver = tf.train.Saver(trainables, max_to_keep=1)
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        print('checkpoint:', checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            trainable_saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, args.num_actions,
                                          NUM_FIXED_SAMPLES)

        # initialize replay buffer
        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # start training:
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # evaluate:
            reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
                sess, batch_environment)
            mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(
                sess, fixed_samples)
            print("%d, %f, %f, %f, %f, %f, %f" %
                  (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
                   reward_max, reward_min))
            # train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)
            trainable_saver.save(sess, 'saved_networks/', global_step=i)

        reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
            sess, batch_environment)
        mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
        print("%d, %f, %f, %f, %f, %f, %f" %
              (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
               reward_max, reward_min))
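
Both Rainbow-style examples monitor a mean max-Q value over a fixed set of states collected once by get_fixed_samples before training starts, in the spirit of the original DQN paper's evaluation protocol. A rough sketch of such a helper, assuming a plain gym-style environment with the old 4-tuple step API rather than the projects' BatchEnvironment/GazeboWorld wrappers:

import numpy as np


def get_fixed_samples(env, num_actions, num_samples):
    # Hypothetical helper: gather states under a uniform-random policy once,
    # then reuse them to track the mean of max_a Q(s, a) during training.
    samples = []
    state = env.reset()
    while len(samples) < num_samples:
        samples.append(state)
        action = np.random.randint(num_actions)
        state, _, done, _ = env.step(action)
        if done:
            state = env.reset()
    return np.stack(samples)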
Example #4
env = RewardNegativeDeath(env, death_factor=2)
env = ObservationReshape(env)
# create agent
model = CartpoleNetwork(learning_rate=LEARNING_RATE,
                        discount_factor=DISCOUNT_FACTOR,
                        input_shape=(env.observation_space.shape[0], ),
                        output_shape=env.action_space.n)
agent = DQNAgent(actions=env.action_space.n,
                 expl_max=EXPLORATION_MAX,
                 expl_min=EXPLORATION_MIN,
                 expl_decay=EXPLORATION_DECAY,
                 model=model,
                 memory_size=MEMORY_SIZE,
                 batch_size=BATCH_SIZE)
# get and parse user args
args = Parser.parseargs(defaultTrainIterations=10000,
                        defaultEvalIterations=10)
if args.load:
    agent.load(env, args.loadversion)
if args.train != 0:
    # agent.init_fill_memory(env, 50000)
    agent.train(env, args.train, train_s=1, save_i=MODEL_SAVE_EVERY)
if args.eval != 0:
    print("Evaluation results (higher scores are better):")
    agent.evaluate(env, args.eval)
if args.save:
    agent.save(env, args.saveversion)
if args.render:
    agent.render_episode(env, random_action=args.renderrandom)
# close env
env.close()
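
Example #4 wraps the environment in custom RewardNegativeDeath and ObservationReshape wrappers whose definitions are not shown. As an illustration only, a plausible death-penalty wrapper could look like the sketch below (old 4-tuple gym step API, hypothetical behaviour, not the project's code):

import gym


class RewardNegativeDeath(gym.Wrapper):
    # Hypothetical sketch: scale the reward on terminal ("death") transitions.

    def __init__(self, env, death_factor=2):
        super().__init__(env)
        self.death_factor = death_factor

    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        if done:
            reward = -abs(reward) * self.death_factor
        return observation, reward, done, info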