def main():
    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    goalExplain = ['top left door', 'top right door', 'middle ladder',
                   'lower left ladder', 'lower right ladder', 'key']
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down',
                     'jump right', 'jump left']
    stepCount = 0

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    # parser.add_argument("--repeat_action_probability", default=0.25)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed")
    # parser.add_argument("--record_screen_path", default="./record")
    # parser.add_argument("--record_sound_filename")
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    args = parser.parse_args()

    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()
    print('loading weights')
    hdqn.loadWeight()
    print('weight loaded')
    agent = Agent(hdqn, range(8), range(6))
    # Probability of making random action is 0.1
    agent.setControllerEpsilon([0.1] * 6)
    agent.setMetaEpsilon(0.1)

    while True:
        env.restart()
        for i in range(10):
            env.act(0)
        goalNum = 0
        while not env.isGameOver():
            goal = agent.selectTrueGoal(goalNum)
            print('predicted subgoal is: ' + str(goal) + ' ' + goalExplain[goal])
            while not env.isTerminal() and not env.goalReached(goal):
                state = env.getState()
                action = agent.selectMove(state, goal)
                # print('selected action is: ' + str(actionMap[action]) + ' ' + actionExplain[actionMap[action]])
                # print('selected action is: ' + str(actionExplain[action]))
                externalRewards = env.act(actionMap[action])
            if env.isTerminal() is False:
                goalNum = goalNum + 1
            else:
                # Re-initialize game if not game over
                if not env.isGameOver():
                    goalNum = 0
                    env.resetLife()
                    for i in range(10):
                        env.act(0)
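# Both scripts pass type=str2bool to argparse for their boolean flags. If that
# helper is not already defined elsewhere in the repo, a minimal sketch could
# look like the following (the exact accepted spellings are an assumption):
def str2bool(value):
    # argparse hands over the raw command-line string; map common spellings to a bool.
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % value)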
def main():
    # Initialization for TensorBoard
    session = tf.Session()
    tensorVar = tf.Variable(0)
    tensorVarLoss = tf.Variable(0, dtype="float32")
    tensorVarMiddle = tf.Variable(0, dtype="float32")
    tensorVarLowerRight = tf.Variable(0, dtype="float32")
    tensorVarLowerLeft = tf.Variable(0, dtype="float32")
    tensorVarKey = tf.Variable(0, dtype="float32")
    tf.summary.scalar("reward", tensorVar)
    tf.summary.scalar("loss", tensorVarLoss)
    tf.summary.scalar("middle ladder", tensorVarMiddle)
    tf.summary.scalar("lower right ladder", tensorVarLowerRight)
    tf.summary.scalar("lower left ladder", tensorVarLowerLeft)
    tf.summary.scalar("key", tensorVarKey)
    sumWriterIntrinsic = tf.summary.FileWriter('./reward/intrinsic')
    sumWriterLoss = tf.summary.FileWriter('./reward/loss')
    sumWriterExternal = tf.summary.FileWriter('./reward/external')
    sumWriterMiddle = tf.summary.FileWriter('./reward/middleLadder')
    sumWriterLowerRight = tf.summary.FileWriter('./reward/lowerRightLadder')
    sumWriterLowerLeft = tf.summary.FileWriter('./reward/lowerLeftLadder')
    sumWriterKey = tf.summary.FileWriter('./reward/key')
    merged = tf.summary.merge_all()
    session.run(tf.initialize_all_variables())

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down',
                     'jump right', 'jump left']
    goalExplain = ['lower right ladder', 'lower left ladder', 'key']
    stepCount = 0
    # deque allows O(1) pops from the left; a plain list is backed by an array
    goalSuccessTrack = [deque(), deque(), deque(), deque()]
    goalSuccessCount = [0, 0, 0, 0]

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed")
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    parser.add_argument("--load_weight", default=False)
    parser.add_argument("--use_sparse_reward", type=str2bool, default=False)
    args = parser.parse_args()

    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience",
        ["state", "goal", "reward", "next_state", "done"])
    annealComplete = False
    saveExternalRewardScreen = True
    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()

    # Initialize the network and agent
    if args.load_weight:
        defaultRandomPlaySteps = 200000
        print('loading weight')
        hdqn.loadWeight()
        print('loading weight complete')
        agent = Agent(hdqn, range(8), range(3))
    else:
        defaultRandomPlaySteps = 200000
        agent = Agent(hdqn, range(8), range(3))

    intrinsicRewardMonitor = 0
    externalRewardMonitor = 0
    for episode in range(80000):
        print("\n\n### EPISODE " + str(episode) + "###")
        print("\n\n### STEPS " + str(stepCount) + "###")
        # Restart the game
        env.restart()
        episodeSteps = 0
        # no subgoal has been reached yet in this episode
        lastGoal = -1
        while not env.isGameOver() and episodeSteps <= maxStepsPerEpisode:
            totalExternalRewards = 0  # NOT SURE IF IT SHOULD BE CLEARED HERE!
            stateLastGoal = env.getStackedState()
            # nextState = stateLastGoal
            goal = agent.selectGoal(stateLastGoal)
            # Keep a sliding window of at most 100 outcomes per goal
            if len(goalSuccessTrack[goal]) > 100:
                firstElement = goalSuccessTrack[goal].popleft()
                goalSuccessCount[goal] -= firstElement
            print('predicted subgoal is: ' + goalExplain[goal])

            while (not env.isTerminal() and not env.goalReached(goal)
                   and episodeSteps <= maxStepsPerEpisode):
                state = env.getStackedState()
                action = agent.selectMove(state, goal)
                externalRewards = env.act(actionMap[action])

                # Debugging: dump a frame the first time the key reward (100) is
                # observed, before the reward is clipped below
                if saveExternalRewardScreen and externalRewards == 100:
                    im = Image.fromarray(np.squeeze(env.getState()))
                    im.save('keyGet.jpeg')
                    saveExternalRewardScreen = False
                # Clip the external reward to a binary signal
                if externalRewards != 0:
                    externalRewards = 1.0

                stepCount += 1
                episodeSteps += 1

                # Save the model every 50000 steps
                if stepCount % 50000 == 0:
                    hdqn.saveWeight(stepCount)

                nextState = env.getStackedState()
                distanceReward = env.distanceReward(lastGoal, goal)
                # Only assign intrinsic reward if the goal is reached and it has
                # not been reached previously
                intrinsicRewards = agent.criticize(
                    env.goalNotReachedBefore(goal) and env.goalReached(goal),
                    actionMap[action], env.isTerminal(), distanceReward,
                    args.use_sparse_reward)

                # Store the controller transition and update network parameters
                exp = ActorExperience(state, goal, action, intrinsicRewards,
                                      nextState, env.isTerminal())
                agent.store(exp, meta=False)

                # Do not update the networks during the random-play warm-up
                if stepCount >= defaultRandomPlaySteps:
                    if stepCount == defaultRandomPlaySteps:
                        print('start training (random walk ends)')
                    if stepCount % 4 == 0:
                        loss = agent.update(stepCount, meta=False)
                        agent.update(stepCount, meta=True)

                # Accumulate the reward returned to the meta controller
                totalExternalRewards += externalRewards + intrinsicRewards

                # Update data for visualization
                externalRewardMonitor += externalRewards
                intrinsicRewardMonitor += intrinsicRewards

            # Store the meta controller's experience
            exp = MetaExperience(stateLastGoal, goal, totalExternalRewards,
                                 nextState, env.isTerminal())
            agent.store(exp, meta=True)

            # Update goal bookkeeping
            if episodeSteps > maxStepsPerEpisode:
                goalSuccessTrack[goal].append(0)
                break
            elif env.goalReached(goal):
                goalSuccessTrack[goal].append(1)
                goalSuccessCount[goal] += 1
                print('goal reached: ' + goalExplain[goal])

                # Training visualization
                intrinsicPlot = session.run(
                    merged, feed_dict={tensorVar: intrinsicRewardMonitor})
                sumWriterIntrinsic.add_summary(intrinsicPlot, stepCount)
                sumWriterIntrinsic.flush()

                externalPlot = session.run(
                    merged, feed_dict={tensorVar: externalRewardMonitor})
                sumWriterExternal.add_summary(externalPlot, stepCount)
                sumWriterExternal.flush()

                lowerRightPlot = session.run(
                    merged,
                    feed_dict={
                        tensorVarLowerRight:
                        float(goalSuccessCount[0]) / (0.1 + len(goalSuccessTrack[0]))
                    })
                sumWriterLowerRight.add_summary(lowerRightPlot, stepCount)
                sumWriterLowerRight.flush()

                lowerLeftPlot = session.run(
                    merged,
                    feed_dict={
                        tensorVarLowerLeft:
                        float(goalSuccessCount[1]) / (0.1 + len(goalSuccessTrack[1]))
                    })
                sumWriterLowerLeft.add_summary(lowerLeftPlot, stepCount)
                sumWriterLowerLeft.flush()

                keyPlot = session.run(
                    merged,
                    feed_dict={
                        tensorVarKey:
                        float(goalSuccessCount[2]) / (0.1 + len(goalSuccessTrack[2]))
                    })
                sumWriterKey.add_summary(keyPlot, stepCount)
                sumWriterKey.flush()

                lastGoal = goal
                # End the episode once the key (goal 2) has been collected
                if goal == 2:
                    break
            else:
                goalSuccessTrack[goal].append(0)

            # Begin the next life if the agent died but the game is not over
            if not env.isGameOver():
                lastGoal = -1
                env.beginNextLife()

            # Anneal exploration rates until annealing is complete
            if not annealComplete:
                agent.annealMetaEpsilon(stepCount)
                agent.annealControllerEpsilon(stepCount, goal)
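# Assuming this module is meant to be run directly and that maxStepsPerEpisode
# is defined at module scope above, the usual entry-point guard would be:
if __name__ == "__main__":
    main()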