Code example #1
def main():
    # Initialization for TensorBoard
    session = tf.Session()
    tensorVar = tf.Variable(0)
    tf.summary.scalar("reward", tensorVar)
    sumWriterIntrinsic = tf.summary.FileWriter('./reward/intrinsic')
    sumWriterExternal = tf.summary.FileWriter('./reward/external')
    merged = tf.summary.merge_all()
    session.run(tf.initialize_all_variables())

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = [
        'no action', 'jump', 'up', 'right', 'left', 'down', 'jump right',
        'jump left'
    ]
    goalExplain = [
        'top left door', 'top right door', 'middle ladder',
        'lower left ladder', 'lower right ladder', 'key'
    ]
    stepCount = 0
    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=80)
    #parser.add_argument("--repeat_action_probability", default=0.25)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed")
    #parser.add_argument("--record_screen_path", default="./record")
    #parser.add_argument("--record_sound_filename")
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    args = parser.parse_args()
    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()
    agent = Agent(hdqn, range(8), range(6))
    # set goalNum to hardcoded subgoal
    goalNum = 0
    intrinsicRewardMonitor = 0
    externalRewardMonitor = 0
    env.act(12)
    # for i in range(100):
    #     env.act(0)
    #     print(env.isTerminal())
    print(env.isTerminal())
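Note: the --display_screen flag above passes type=str2bool to argparse, but the helper itself is not included in these snippets. A minimal sketch of such a converter (hypothetical; the project's own definition may differ) could look like this:

import argparse

def str2bool(value):
    # Interpret common textual truth values as a bool for argparse flags.
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got %r" % value)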
Code example #2
File: main.py Project: renattou/ALE-Robot
def main():
    # Process arguments
    args = utils.parse_args()

    # Use random seed from argument
    if args.random_seed:
        random.seed(args.random_seed)

    # Instantiate environment class
    if args.environment == "ale":
        env = ALEEnvironment(args.game, args)
    elif args.environment == "gym":
        env = GymEnvironment(args.game, args)
    elif args.environment == "robot":
        env = RobotEnvironment(args.game, args)
    else:
        assert False, "Unknown environment " + args.environment

    # Instantiate DQN
    action_dim = env.action_dim()
    state_dim = env.state_dim()
    net = DQN(state_dim, action_dim, args)

    # Load weights before starting training
    if args.load_weights:
        filepath = args.load_weights
        net.load(filepath)

    # Instantiate agent
    agent = Agent(env, net, args)

    # Start statistics
    stats = Statistics(agent, agent.net, agent.net.memory, env, args)

    # Play game with two players (user and agent)
    if args.two_player:
        player_b = PlayerTwo(args)
        env.set_mode('test')
        stats.reset()
        agent.play_two_players(player_b)
        stats.write(0, "2player")
        sys.exit()

    # Play agent
    if args.play_games > 0:
        env.set_mode('test')
        stats.reset()
        for _ in range(args.play_games):
            agent.play()
        stats.write(0, "play")
        sys.exit()

    # Populate replay memory with random steps
    if args.random_steps:
        env.set_mode('test')
        stats.reset()
        agent.play_random(args.random_steps)
        stats.write(0, "random")

    for epoch in range(args.start_epoch, args.epochs):
        # Train agent
        if args.train_steps:
            env.set_mode('train')
            stats.reset()
            agent.train(args.train_steps)
            stats.write(epoch + 1, "train")

            # Save weights after every epoch
            if args.save_weights_prefix:
                filepath = args.save_weights_prefix + "_%d.h5" % (epoch + 1)
                net.save(filepath)

        # Test agent
        if args.test_steps:
            env.set_mode('test')
            stats.reset()
            agent.test(args.test_steps)
            stats.write(epoch + 1, "test")

    # Stop statistics
    stats.close()
Code example #3
File: main.py Project: softwarecomvc/simple_dqn
                    help="Random seed for repeatable experiments.")
comarg.add_argument("--log_level",
                    choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                    default="INFO",
                    help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    # logger does not work with this line
    #logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
Code example #4
File: main.py Project: Deanout/simple_dqn
mainarg.add_argument("--csv_file", help="Write training progress to this file.")

comarg = parser.add_argument_group('Common')
comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.")
comarg.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
  random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
  env = ALEEnvironment(args.game, args)
  logger.info("Using ALE Environment")
elif args.environment == 'gym':
  logger.handlers.pop()
  env = GymEnvironment(args.game, args)
  logger.info("Using Gym Environment")
else:
  assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
  logger.info("Loading weights from %s" % args.load_weights)
Code example #5
                    help="Random seed for repeatable experiments.")
comarg.add_argument("--log_level",
                    choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                    default="INFO",
                    help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DQN(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
Code example #6
File: main.py Project: nikolaypavlov/simple_dqn
mainarg.add_argument("--csv_file", help="Write training progress to this file.")

comarg = parser.add_argument_group('Common')
comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.")
comarg.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
  random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
  env = ALEEnvironment(args.game, args)
  logger.info("Using ALE Environment")
elif args.environment == 'gym':
  logger.handlers.pop()
  env = GymEnvironment(args.game, args)
  logger.info("Using Gym Environment")
else:
  assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DQN(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
Code example #7
def run_agent(args):
    # Launch the graph

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        # Set up training variables
        training_iters = args.training_iters
        display_step = args.display_step
        test_step = args.test_step
        test_count = args.test_count
        tests_done = 0
        test_results = []

        # Stats for display
        ep_rewards = []
        ep_reward_last = 0
        qs = []
        q_last = 0
        avr_ep_reward = max_ep_reward = avr_q = 0.0

        # Set precision for printing numpy arrays, useful for debugging
        #np.set_printoptions(threshold='nan', precision=3, suppress=True)

        mode = args.model
        # Create environment
        if args.env_type == 'ALE':
            from environment import ALEEnvironment  # this comes from the original file; unused for now
            env = ALEEnvironment(args.rom)
            if mode is None: mode = 'DQN'
            args.num_actions = env.numActions()

        elif args.env_type == 'gym':
            import gym
            try:
                import gym_vgdl  #This can be found on my github if you want to use it.
            except:
                pass
            env = gym.make(args.env)
            if mode is None:
                shape = env.observation_space.shape
                if len(shape) == 3: mode = 'DQN'
                elif shape[0] is None: mode = 'object'
                else: mode = 'vanilla'
            args.num_actions = env.action_space.n  #only works with discrete action spaces

        # Set agent variables
        if mode == 'DQN':
            args.model = 'CNN'
            args.preprocessor = 'deepmind'
            args.obs_size = [84, 84]
            args.history_len = 4
        elif mode == 'image':
            args.model = 'CNN'
            args.preprocessor = 'grayscale'
            args.obs_size = list(env.observation_space.shape)[:2]
            args.history_len = 2
        elif mode == 'object':
            args.model = 'object'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0
        elif mode == 'vanilla':
            args.model = 'nn'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0

        # Create agent
        agent = GraphQAgent(sess, args)
        #agent = DQNAgent.DQNAgent(sess, args)

        # Initialize all tensorflow variables
        sess.run(tf.global_variables_initializer())

        # Keep training until reach max iterations

        # Start Agent
        state = env.reset()
        agent.Reset(state)
        rewards = []
        terminal = False
        aver = np.zeros(int(training_iters / display_step) + 50)  # a full-size array was too large and too slow to copy
        maxeq = np.zeros(int(training_iters / display_step) + 50)
        savename = args.save_path + 'results/GBIL_3_' + args.riqi + args.env

        print(savename)
        iterationa = 0
        for step in tqdm(range(training_iters), ncols=80):

            #env.render()
            #print("step",step)
            # Act, and add
            action, value = agent.GetAction_wq(step)
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)
            #print("len(agent.trajectory_embeddings)",len(agent.trajectory_embeddings),"len(trj_obs)",len(agent.trajectory_observations))
            # Bookkeeping
            rewards.append(reward)
            qs.append(value)

            if terminal:
                # Bookkeeping
                ep_rewards.append(np.sum(rewards))
                rewards = []
                # Reset agent and environment
                # should only be used once the memory is full
                #if agent.G.Graphisfull():
                # C_time_a = time.time()
                # #agent.G.GetKeyPointByDegree()
                if cluster_flag:

                    # for x in range(len(agent.keypoint.obss)):
                    #     trj_x = agent.keypoint.obss[x]
                    #     for y in range(len(trj_x)):

                    #         node_y = trj_x[y]
                    #print("nodey",node_y)
                    # plt.imshow(node_y)
                    # plt.savefig(args.save_path+'results/temp/'+str(step)+'_'+str(x)+'_'+str(y)+'.png')
                    # plt.close()
                    #agent.G.ReconstructGraph(agent.keypoint.trjs2set())  # rebuild once every display_step
                    keypoints, keyobss = agent.keypoint.get_keypoint()
                    agent.G.ReconstructGraph(keypoints)
                    show_obs(
                        keyobss, args.save_path + 'results/temp/', "GBIL_3_" +
                        args.riqi + args.env + "_" + str(step) + "_")

                    #     agent.G.GraphCluster(args.num_center)
                    cluster_flag = False
                # # These hyperparameters control how loose or tight the clustering is; both values are distances between feature vectors.
                # # The first must be larger than the second; pairs closer than the second parameter are merged into one cluster,
                # # while distances between the second and the first may split into several clusters. The farther apart the two values, the more clusters you get.
                # C_time_b = time.time()
                # print("cluster time using ",C_time_b-C_time_a)
                state = env.reset()
                agent.Reset(state)

            # Display Statistics
            if (step) % display_step == 0:
                cluster_flag = True
                num_eps = len(ep_rewards[ep_reward_last:])
                if num_eps != 0:
                    avr_ep_reward = np.mean(ep_rewards[ep_reward_last:])
                    max_ep_reward = np.max(ep_rewards[ep_reward_last:])
                    avr_q = np.mean(qs[q_last:])
                    q_last = len(qs)
                    ep_reward_last = len(ep_rewards)
                dict_entries = 0  #agent.DND.tot_capacity()
                aver[iterationa] = avr_ep_reward
                maxeq[iterationa] = max_ep_reward
                iterationa = iterationa + 1
                np.save(savename + 'aver.npy', aver)
                np.save(savename + 'maxeq.npy', maxeq)
                tqdm.write("{}, {:>7}/{}it | {:3n} episodes,"\
                    .format(time.strftime("%H:%M:%S"), step, training_iters, num_eps)
                    +"q: {:4.3f}, avr_ep_r: {:4.1f}, max_ep_r: {:4.1f}, epsilon: {:4.3f}, entries: {}"\
                    .format(avr_q, avr_ep_reward, max_ep_reward, agent.epsilon, dict_entries))

        # Continue until end of episode
        step = training_iters
        while not terminal:
            # Act, and add
            action, value = agent.GetAction_wq(step)
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)
            step += 1
Code example #8
def run_agent(args):
    # Launch the graph

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        # Set up training variables
        training_iters = args.training_iters
        display_step = args.display_step
        test_step = args.test_step
        test_count = args.test_count
        tests_done = 0
        test_results = []

        # Stats for display
        ep_rewards = []
        ep_reward_last = 0
        qs = []
        q_last = 0
        avr_ep_reward = max_ep_reward = avr_q = 0.0

        # Set precision for printing numpy arrays, useful for debugging
        #np.set_printoptions(threshold='nan', precision=3, suppress=True)

        mode = args.model
        # Create environment
        if args.env_type == 'ALE':
            from environment import ALEEnvironment  # this comes from the original file; unused for now
            env = ALEEnvironment(args.rom)
            if mode is None: mode = 'DQN'
            args.num_actions = env.numActions()

        elif args.env_type == 'gym':
            import gym
            try:
                import gym_vgdl  #This can be found on my github if you want to use it.
            except:
                pass
            env = gym.make(args.env)
            if mode is None:
                shape = env.observation_space.shape
                if len(shape) == 3: mode = 'DQN'
                elif shape[0] is None: mode = 'object'
                else: mode = 'vanilla'
            args.num_actions = env.action_space.n  #only works with discrete action spaces

        # Set agent variables
        if mode == 'DQN':
            args.model = 'CNN'
            args.preprocessor = 'deepmind'
            args.obs_size = [84, 84]
            args.history_len = 4
        elif mode == 'image':
            args.model = 'CNN'
            args.preprocessor = 'grayscale'
            args.obs_size = list(env.observation_space.shape)[:2]
            args.history_len = 2
        elif mode == 'object':
            args.model = 'object'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0
        elif mode == 'vanilla':
            args.model = 'nn'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0

        # Create agent
        agent = GraphQAgent.GraphQAgent(sess, args)
        #agent = DQNAgent.DQNAgent(sess, args)

        # Initialize all tensorflow variables
        sess.run(tf.global_variables_initializer())

        # Keep training until reach max iterations

        # Start Agent
        state = env.reset()
        agent.Reset(state)
        rewards = []
        terminal = False
        aver = np.zeros(training_iters)
        maxeq = np.zeros(training_iters)
        savename = args.save_path + 'results/GQ' + args.riqi + args.env

        print(savename)
        iterationa = 0
        for step in tqdm(range(training_iters), ncols=80):

            #env.render()

            # Act, and add
            action, value = agent.GetAction_wq()
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)

            # Bookkeeping
            rewards.append(reward)
            qs.append(value)

            if terminal:
                # Bookkeeping
                ep_rewards.append(np.sum(rewards))
                rewards = []
                # Reset agent and environment
                state = env.reset()
                agent.Reset(state)

            # Display Statistics
            if (step) % display_step == 0:
                num_eps = len(ep_rewards[ep_reward_last:])
                if num_eps != 0:
                    avr_ep_reward = np.mean(ep_rewards[ep_reward_last:])
                    max_ep_reward = np.max(ep_rewards[ep_reward_last:])
                    avr_q = np.mean(qs[q_last:])
                    q_last = len(qs)
                    ep_reward_last = len(ep_rewards)
                dict_entries = 0  #agent.DND.tot_capacity()
                aver[iterationa] = avr_ep_reward
                maxeq[iterationa] = max_ep_reward
                iterationa = iterationa + 1
                np.save(savename + 'aver.npy', aver)
                np.save(savename + 'maxeq.npy', maxeq)
                tqdm.write("{}, {:>7}/{}it | {:3n} episodes,"\
                    .format(time.strftime("%H:%M:%S"), step, training_iters, num_eps)
                    +"q: {:4.3f}, avr_ep_r: {:4.1f}, max_ep_r: {:4.1f}, epsilon: {:4.3f}, entries: {}"\
                    .format(avr_q, avr_ep_reward, max_ep_reward, agent.epsilon, dict_entries))

        # Continue until end of episode
        step = training_iters
        while not terminal:
            # Act, and add
            action, value = agent.GetAction_wq()
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)
            step += 1
Code example #9
File: main.py Project: mthrok/simple_dqn
mainarg.add_argument("--csv_file", help="Write training progress to this file.")

comarg = parser.add_argument_group('Common')
comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.")
comarg.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
  random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
  env = ALEEnvironment(args.game, args)
  logger.info("Using ALE Environment")
elif args.environment == 'gym':
  logger.handlers.pop()
  env = GymEnvironment(args.game, args)
  logger.info("Using Gym Environment")
else:
  assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
  logger.info("Loading weights from %s" % args.load_weights)
Code example #10
def run_agent(args):
    # Launch the graph
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        # Set up training variables
        training_iters = args.training_iters
        display_step = args.display_step
        test_step = args.test_step
        test_count = args.test_count
        tests_done = 0
        test_results = []

        # Stats for display
        ep_rewards = []
        ep_reward_last = 0
        qs = []
        q_last = 0
        avr_ep_reward = max_ep_reward = avr_q = 0.0

        # Set precision for printing numpy arrays, useful for debugging
        #np.set_printoptions(threshold='nan', precision=3, suppress=True)

        mode = args.model
        # Create environment
        if args.env_type == 'ALE':
            from environment import ALEEnvironment
            env = ALEEnvironment(args.rom)
            if mode is None: mode = 'DQN'
            args.num_actions = env.numActions()

        elif args.env_type == 'gym':
            import gym
            try:
                import gym_vgdl  #This can be found on my github if you want to use it.
            except:
                pass
            env = gym.make(args.env)
            if mode is None:
                shape = env.observation_space.shape
                if len(shape) == 3: mode = 'DQN'
                elif shape[0] is None: mode = 'object'
                else: mode = 'vanilla'
            args.num_actions = env.action_space.n  #only works with discrete action spaces

        # Set agent variables
        if mode == 'DQN':
            args.model = 'CNN'
            args.preprocessor = 'deepmind'
            args.obs_size = [84, 84]
            args.history_len = 4
        elif mode == 'image':
            args.model = 'CNN'
            args.preprocessor = 'grayscale'
            args.obs_size = list(env.observation_space.shape)[:2]
            args.history_len = 2
        elif mode == 'object':
            args.model = 'object'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0
        elif mode == 'vanilla':
            args.model = 'nn'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0

        # Create agent
        agent = NECAgent.NECAgent(sess, args)
        #agent = DQNAgent.DQNAgent(sess, args)

        # Initialize all tensorflow variables
        sess.run(tf.global_variables_initializer())

        # Keep training until reach max iterations

        # Start Agent
        state = env.reset()
        agent.Reset(state)
        rewards = []
        terminal = False

        for step in tqdm(range(training_iters), ncols=80):

            #env.render()

            # Act, and add
            action, value = agent.GetAction()
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)

            # Bookkeeping
            rewards.append(reward)
            qs.append(value)

            if terminal:
                # Bookkeeping
                ep_rewards.append(np.sum(rewards))
                rewards = []

                if step >= (tests_done) * test_step:
                    R_s = []
                    for i in tqdm(
                            range(test_count),
                            ncols=50,
                            bar_format='Testing: |{bar}| {n_fmt}/{total_fmt}'):
                        R = test_agent(agent, env)
                        R_s.append(R)
                    tqdm.write("Tests: {}".format(R_s))
                    tests_done += 1
                    test_results.append({
                        'step': step,
                        'scores': R_s,
                        'average': np.mean(R_s),
                        'max': np.max(R_s)
                    })

                    #Save to file
                    summary = {'params': vars(args), 'tests': test_results}
                    if args.save_file is not None:
                        np.save(args.save_file, summary)

                # Reset agent and environment
                state = env.reset()
                agent.Reset(state)

            # Display Statistics
            if (step) % display_step == 0:
                num_eps = len(ep_rewards[ep_reward_last:])
                if num_eps != 0:
                    avr_ep_reward = np.mean(ep_rewards[ep_reward_last:])
                    max_ep_reward = np.max(ep_rewards[ep_reward_last:])
                    avr_q = np.mean(qs[q_last:])
                    q_last = len(qs)
                    ep_reward_last = len(ep_rewards)
                dict_entries = 0  #agent.DND.tot_capacity()
                tqdm.write("{}, {:>7}/{}it | {:3n} episodes,"\
                    .format(time.strftime("%H:%M:%S"), step, training_iters, num_eps)
                    +"q: {:4.3f}, avr_ep_r: {:4.1f}, max_ep_r: {:4.1f}, epsilon: {:4.3f}, entries: {}"\
                    .format(avr_q, avr_ep_reward, max_ep_reward, agent.epsilon, dict_entries))

        # Continue until end of episode
        step = training_iters
        while not terminal:
            # Act, and add
            action, value = agent.GetAction()
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)
            step += 1

        # Final test
        R_s = []
        for i in tqdm(range(test_count),
                      ncols=50,
                      bar_format='Testing: |{bar}| {n_fmt}/{total_fmt}'):
            R = test_agent(agent, env)
            R_s.append(R)
        tqdm.write("Tests: {}".format(R_s))
        tests_done += 1
        test_results.append({
            'step': step,
            'scores': R_s,
            'average': np.mean(R_s),
            'max': np.max(R_s)
        })

        #Save to file
        summary = {'params': vars(args), 'tests': test_results}
        if args.save_file is not None:
            np.save(args.save_file, summary)
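The loop above calls test_agent(agent, env), which is not part of this snippet. Assuming the same interface used in the training loop (agent.Reset/GetAction/Update and env.reset/step), a minimal evaluation rollout might be sketched as follows; this is an illustration rather than the project's actual implementation, and the max_steps cap is an added safeguard:

def test_agent(agent, env, max_steps=10000):
    # Play one evaluation episode and return its cumulative reward.
    # Sketch only: mirrors the agent/env calls made in the loop above.
    state = env.reset()
    agent.Reset(state)
    total_reward = 0.0
    terminal = False
    steps = 0
    while not terminal and steps < max_steps:
        action, _ = agent.GetAction()
        state, reward, terminal, info = env.step(action)
        agent.Update(action, reward, state, terminal)
        total_reward += reward
        steps += 1
    return total_reward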
Code example #11
def main():
    # Initialization for TensorBoard
    session = tf.Session()
    tensorVar = tf.Variable(0)
    tensorVarLoss = tf.Variable(0, dtype="float32")
    tensorVarMiddle = tf.Variable(0, dtype="float32")
    tensorVarLowerRight = tf.Variable(0, dtype="float32")
    tensorVarLowerLeft = tf.Variable(0, dtype="float32")
    tensorVarKey = tf.Variable(0, dtype="float32")

    tf.summary.scalar("reward", tensorVar)
    tf.summary.scalar("loss", tensorVarLoss)
    tf.summary.scalar("middle ladder", tensorVarMiddle)
    tf.summary.scalar("lower right ladder", tensorVarLowerRight)
    tf.summary.scalar("lower left ladder", tensorVarLowerLeft)
    tf.summary.scalar("key", tensorVarKey)
    sumWriterIntrinsic = tf.summary.FileWriter('./reward/intrinsic')
    sumWriterLoss = tf.summary.FileWriter('./reward/loss')
    sumWriterExternal = tf.summary.FileWriter('./reward/external')
    sumWriterMiddle = tf.summary.FileWriter('./reward/middleLadder')
    sumWriterLowerRight = tf.summary.FileWriter('./reward/lowerRightLadder')
    sumWriterLowerLeft = tf.summary.FileWriter('./reward/lowerLeftLadder')
    sumWriterKey = tf.summary.FileWriter('./reward/key')
    merged = tf.summary.merge_all()
    session.run(tf.initialize_all_variables())

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = [
        'no action', 'jump', 'up', 'right', 'left', 'down', 'jump right',
        'jump left'
    ]
    goalExplain = ['lower right ladder', 'lower left ladder', 'key']
    stepCount = 0
    goalSuccessTrack = [
        deque(), deque(), deque(), deque()
    ]  # a deque in Python behaves like a linked list, while a list is an array
    goalSuccessCount = [0, 0, 0, 0]
    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed")
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    parser.add_argument("--load_weight", default=False)
    parser.add_argument("--use_sparse_reward", type=str2bool, default=False)
    args = parser.parse_args()
    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    annealComplete = False
    saveExternalRewardScreen = True
    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()

    # Initialize network and agent
    if (args.load_weight):
        defaultRandomPlaySteps = 200000
        print('loading weight')
        hdqn.loadWeight()
        print('loading weight complete')
        agent = Agent(hdqn, range(8), range(3))
    else:
        defaultRandomPlaySteps = 200000
        agent = Agent(hdqn, range(8), range(3))
    intrinsicRewardMonitor = 0
    externalRewardMonitor = 0
    for episode in range(80000):
        print("\n\n### EPISODE " + str(episode) + "###")
        print("\n\n### STEPS " + str(stepCount) + "###")
        # Restart the game
        env.restart()
        episodeSteps = 0
        # set goalNum to hardcoded subgoal
        lastGoal = -1
        while not env.isGameOver() and episodeSteps <= maxStepsPerEpisode:
            totalExternalRewards = 0  # NOT SURE IF IT SHOULD BE CLEARED HERE!
            stateLastGoal = env.getStackedState()
            # nextState = stateLastGoal
            goal = agent.selectGoal(stateLastGoal)
            if (len(goalSuccessTrack[goal]) > 100):
                firstElement = goalSuccessTrack[goal].popleft()
                goalSuccessCount[goal] -= firstElement
            print('predicted subgoal is: ' + goalExplain[goal])
            while not env.isTerminal() and not env.goalReached(
                    goal) and episodeSteps <= maxStepsPerEpisode:
                state = env.getStackedState()
                action = agent.selectMove(state, goal)
                externalRewards = env.act(actionMap[action])
                if (externalRewards != 0):
                    externalRewards = 1.0
                # Debugging
                if (saveExternalRewardScreen and externalRewards == 100):
                    im = Image.fromarray(np.squeeze(env.getState()))
                    im.save('keyGet.jpeg')
                    saveExternalRewardScreen = False
                stepCount += 1
                episodeSteps += 1
                # save the model every 50000 steps
                if (stepCount % 50000 == 0):
                    hdqn.saveWeight(stepCount)
                nextState = env.getStackedState()
                distanceReward = env.distanceReward(lastGoal, goal)
                # only assign intrinsic reward if the goal is reached and it has not been reached previously
                intrinsicRewards = agent.criticize(
                    env.goalNotReachedBefore(goal) & env.goalReached(goal),
                    actionMap[action], env.isTerminal(), distanceReward,
                    args.use_sparse_reward)
                # Store transition and update network params
                exp = ActorExperience(state, goal, action, intrinsicRewards,
                                      nextState, env.isTerminal())
                agent.store(exp, meta=False)

                # Do not update the network during random play
                if (stepCount >= defaultRandomPlaySteps):
                    if (stepCount == defaultRandomPlaySteps):
                        print('start training (random walk ends)')
                    if (stepCount % 4 == 0):
                        loss = agent.update(stepCount, meta=False)
                        agent.update(stepCount, meta=True)

                # Update external reward for D2
                totalExternalRewards += externalRewards + intrinsicRewards

                # Update data for visualization
                externalRewardMonitor += externalRewards
                intrinsicRewardMonitor += intrinsicRewards

            # Store meta controller's experience
            exp = MetaExperience(stateLastGoal, goal, totalExternalRewards,
                                 nextState, env.isTerminal())
            agent.store(exp, meta=True)

            # Update goal
            if episodeSteps > maxStepsPerEpisode:
                goalSuccessTrack[goal].append(0)
                break
            elif env.goalReached(goal):
                goalSuccessTrack[goal].append(1)
                goalSuccessCount[goal] += 1
                print('goal reached: ' + goalExplain[goal])
                # Training Visualization
                intrinsicPlot = session.run(
                    merged, feed_dict={tensorVar: intrinsicRewardMonitor})
                sumWriterIntrinsic.add_summary(intrinsicPlot, stepCount)
                sumWriterIntrinsic.flush()
                externalPlot = session.run(
                    merged, feed_dict={tensorVar: externalRewardMonitor})
                sumWriterExternal.add_summary(externalPlot, stepCount)
                sumWriterExternal.flush()
                lowerRightPlot = session.run(
                    merged,
                    feed_dict={
                        tensorVarLowerRight:
                        float(goalSuccessCount[0]) /
                        (0.1 + len(goalSuccessTrack[0]))
                    })
                sumWriterLowerRight.add_summary(lowerRightPlot, stepCount)
                sumWriterLowerRight.flush()
                lowerLeftPlot = session.run(
                    merged,
                    feed_dict={
                        tensorVarLowerLeft:
                        float(goalSuccessCount[1]) /
                        (0.1 + len(goalSuccessTrack[1]))
                    })
                sumWriterLowerLeft.add_summary(lowerLeftPlot, stepCount)
                sumWriterLowerLeft.flush()
                keyPlot = session.run(merged,
                                      feed_dict={
                                          tensorVarKey:
                                          float(goalSuccessCount[2]) /
                                          (0.1 + len(goalSuccessTrack[2]))
                                      })
                sumWriterKey.add_summary(keyPlot, stepCount)
                sumWriterKey.flush()
                lastGoal = goal
                # get key
                if goal == 2:
                    break
            else:
                goalSuccessTrack[goal].append(0)
                if not env.isGameOver():
                    lastGoal = -1
                    env.beginNextLife()

        if (not annealComplete):
            # Annealing
            agent.annealMetaEpsilon(stepCount)
            agent.annealControllerEpsilon(stepCount, goal)
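The goalSuccessTrack / goalSuccessCount bookkeeping above keeps, for each goal, a sliding window of roughly the last 100 attempts and reports the success rate as count / (0.1 + window length). The same pattern in isolation, as a small self-contained sketch (the class name and window size are illustrative, not taken from the project):

from collections import deque

class GoalSuccessWindow:
    # Track the success rate of one goal over (at most) the last `window` attempts.
    def __init__(self, window=100):
        self.window = window
        self.outcomes = deque()  # 1 for success, 0 for failure
        self.success_count = 0

    def record(self, success):
        if len(self.outcomes) >= self.window:
            self.success_count -= self.outcomes.popleft()
        outcome = 1 if success else 0
        self.outcomes.append(outcome)
        self.success_count += outcome

    def rate(self):
        # The 0.1 term mirrors the snippet above and avoids division by zero
        # before any attempt has been recorded.
        return float(self.success_count) / (0.1 + len(self.outcomes))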
Code example #12
mainarg.add_argument("--csv_file", help="Write training progress to this file.")

comarg = parser.add_argument_group('Common')
comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.")
comarg.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
  random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
  env = ALEEnvironment(args.game, args)
  logger.info("Using ALE Environment")
elif args.environment == 'gym':
  logger.handlers.pop()
  env = GymEnvironment(args.game, args)
  logger.info("Using Gym Environment")
else:
  assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
  logger.info("Loading weights from %s" % args.load_weights)
Code example #13
File: test.py Project: stevenliang16/Montezuma
def main():
    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    goalExplain = ['top left door', 'top right door', 'middle ladder', 'lower left ladder', 'lower right ladder', 'key']
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down', 'jump right', 'jump left']
    stepCount = 0
    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    #parser.add_argument("--repeat_action_probability", default=0.25)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed")
    #parser.add_argument("--record_screen_path", default="./record")
    #parser.add_argument("--record_sound_filename")
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    args = parser.parse_args()
    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()
    print('loading weights')
    hdqn.loadWeight()
    print('weight loaded')
    agent = Agent(hdqn, range(8), range(6))
    # Probability of making random action is 0.1
    agent.setControllerEpsilon([0.1]*6)
    agent.setMetaEpsilon(0.1)
    while True:
        env.restart()
        for i in range(10):
            env.act(0)
        goalNum = 0
        while not env.isGameOver():
            goal = agent.selectTrueGoal(goalNum)
            print('predicted subgoal is: ' + str(goal) + ' ' + goalExplain[goal])
            while not env.isTerminal() and not env.goalReached(goal):
                state = env.getState()
                action = agent.selectMove(state, goal)
                #print ('selected action is: ' + str(actionMap[action]) + ' ' + actionExplain[actionMap[action]])
                #print('selected action is :' + str(actionExplain[action]))
                externalRewards = env.act(actionMap[action])
            if not env.isTerminal():
                goalNum = goalNum + 1
            else:
                # Re-initialize game if not game over
                if not env.isGameOver():
                    goalNum = 0
                    env.resetLife()
                    for i in range(10):
                        env.act(0)
Code example #14
                    default="INFO",
                    help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)
# bug with double logging
if args.environment == 'gym':
    logger.handlers.pop()

if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
env = GymEnvironment(args.rom_file,
                     args) if args.environment == 'gym' else ALEEnvironment(
                         args.rom_file, args)
mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
    stats.reset()
    agent.play(args.play_games)
    stats.write(0, "play")
    if args.visualization_file:
Code example #15
def __init__(self, id, prediction_q, training_q, config):
    self.history = StateHistory(config)
    env = ALEEnvironment(config)
    super(DeepQAgent, self).__init__(id, prediction_q, training_q, config,
                                     env)
Code example #16
from environment import ALEEnvironment
from RHEA import RollingHorizonEvolutionaryAlgorithm

if __name__ == '__main__':

    ale = ALEEnvironment('./roms/qbert.bin')
    rollout_length = 50
    rhea = RollingHorizonEvolutionaryAlgorithm(rollout_length, ale, 0.2, 10)

    rhea.run()