def savegame(config):
    # load the per-game (local) symbol mapping; pairs are reversed so keys are symbol ids
    fp = open('symbolMapping' + str(sys.argv[1]) + '.txt', 'r')
    data = fp.read().split('\n')
    spd = [data_.split(' ')[::-1] for data_ in data]
    dic_local = dict(spd[0:-1])
    dic_local['0'] = 'NULL'
    fp.close()

    # load the shared (global) symbol mapping, keyed by token
    fp = open('symbolMapping5.txt', 'r')
    data = fp.read().split('\n')
    spd = [data_.split(' ') for data_ in data]
    dic_global = dict(spd[0:-1])
    dic_global['NULL'] = '0'
    fp.close()

    # Step 1: init Game
    env = Environment(config.game_num)  # 1 is the main game, 2 is the evaluation game
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())

    brain = DQN(config)

    # checkStates = None
    # adding progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')

    episode_length = 0
    num_episodes = 0
    total_reward = 0

    MAX_STEPS = 105000
    totalSteps = 0
    MEM_STEPS = 10000
    memory = []

    while totalSteps < MAX_STEPS:
        totalSteps += 1
        if env.START_NEW_GAME:
            episode_length = 0
            env.START_NEW_GAME = False
            state, reward, terminal, availableObjects = env.newGame()
            brain.history.add(state)

        action_indicator = np.zeros(actions)
        object_indicator = np.zeros(objects)

        # predict
        action_index, object_index = brain.getAction(availableObjects, True)
        Qactions, Qobjects = brain.getQValues(availableObjects)
        action_indicator[action_index] = 1
        object_indicator[object_index] = 1

        # print state
        # record (converted state, action Q-values, object Q-values) for periodic dumping to disk
        memory.append((convert_state(state, dic_local, dic_global), Qactions, Qobjects))

        # act
        nextstate, reward, terminal, availableObjects = env.step(action_index, object_index)
        total_reward += reward
        episode_length += 1

        # observe
        brain.setPerception(state, reward, action_indicator, object_indicator,
                            nextstate, terminal, True)
        state = nextstate

        # periodically flush the collected memory to disk:
        # one space-separated line each for state, action Q-values, object Q-values
        if totalSteps % MEM_STEPS == 0:
            fileName = str(config.game_num) + "_mem.txt"
            with open(fileName, "a") as fp:
                for i in range(len(memory)):
                    for j in memory[i][0]:
                        print >> fp, j,
                    print >> fp
                    for j in memory[i][1]:
                        print >> fp, j,
                    print >> fp
                    for j in memory[i][2]:
                        print >> fp, j,
                    print >> fp
            memory = []

        if terminal or (episode_length % config.max_episode_length == 0):
            num_episodes += 1
            with open("saver_reward.txt", "a") as fp:
                print >> fp, (total_reward / (num_episodes * 1.0))
            env.START_NEW_GAME = True

        pbar.update(1)
        if brain.timeStep > config.MAX_FRAMES:
            brain.train_writer.close()
            break

    brain.session.close()
def playgame(config):
    # Step 1: init Game
    env = Environment(config.game_num)  # 1 is the main game, 2 is the evaluation game
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())

    brain = DQN(config)

    # checkStates = None
    # adding progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')

    episode_length = 0
    num_episodes = 0
    total_reward = 0

    while True:
        if env.START_NEW_GAME:
            episode_length = 0
            env.START_NEW_GAME = False
            state, reward, terminal, availableObjects = env.newGame()
            brain.history.add(state)

        action_indicator = np.zeros(actions)
        object_indicator = np.zeros(objects)

        # predict
        action_index, object_index = brain.getAction(availableObjects)
        action_indicator[action_index] = 1
        object_indicator[object_index] = 1

        # act
        nextstate, reward, terminal, availableObjects = env.step(action_index, object_index)
        total_reward += reward
        episode_length += 1

        # observe
        brain.setPerception(state, reward, action_indicator, object_indicator,
                            nextstate, terminal, False)
        state = nextstate

        if terminal or (episode_length % config.max_episode_length == 0):
            num_episodes += 1
            with open("train_reward.txt", "a") as fp:
                print >> fp, (total_reward / (num_episodes * 1.0))
            env.START_NEW_GAME = True

        #####################################################################
        # for evaluating qvalues
        if (brain.timeStep % config.EVAL == 0) and (brain.timeStep != 0):
            # on the first evaluation, fix a set of held-out states to track Q-values on
            if brain.timeStep / config.EVAL == 1:
                if not (os.path.exists("checkStates.txt") and (os.path.getsize("checkStates.txt") > 0)):
                    assert config.SAMPLE_STATES % config.BATCH_SIZE == 0
                    assert config.SAMPLE_STATES < brain.memory.count
                    checkStates, _1, _2, _3, _4, _5 = brain.memory.sample()
                    with open("checkStates.txt", "w") as fp:
                        cpickle.dump(checkStates, fp)
                else:
                    with open("checkStates.txt", 'r') as fp:
                        checkStates = cpickle.load(fp)

            # average max Q-value over the fixed states, action head
            evalQValues_a = brain.action_valueT.eval(
                feed_dict={brain.stateInputT: checkStates}, session=brain.session)
            maxEvalQValues_a = np.max(evalQValues_a, axis=1)
            avgEvalQValues_a = np.mean(maxEvalQValues_a)
            with open("evalQValue_a.txt", "a") as fp:
                print >> fp, avgEvalQValues_a

            # average max Q-value over the fixed states, object head
            evalQValues_o = brain.object_valueT.eval(
                feed_dict={brain.stateInputT: checkStates}, session=brain.session)
            maxEvalQValues_o = np.max(evalQValues_o, axis=1)
            avgEvalQValues_o = np.mean(maxEvalQValues_o)
            with open("evalQValue_o.txt", "a") as fp:
                print >> fp, avgEvalQValues_o

            #####################################################################
            # save current history before starting evaluation
            # temp_history_data = brain.history.copy()
            # now let us evaluate avg reward
            # create alternate environment for EVALUATION
            # env_eval = Environment(2)
            env_eval = env
            if config.TUTORIAL_WORLD:
                total_reward, nrewards, nepisodes, quest1_reward_cnt, quest2_reward_cnt, quest3_reward_cnt = evaluate(
                    brain, env_eval, config)
            else:
                total_reward, nrewards, nepisodes, quest1_reward_cnt = evaluate(
                    brain, env_eval, config)

            with open("test_reward.txt", "a") as fp:
                print >> fp, total_reward

            # setting the best network
            if len(env_eval.reward_history) == 0 or total_reward > max(env_eval.reward_history):
                # save best network
                if not os.path.exists(os.getcwd() + '/Savednetworks'):
                    os.makedirs(os.getcwd() + '/Savednetworks')
                brain.saver.save(brain.session,
                                 os.getcwd() + '/Savednetworks/' + 'network' + '-dqn',
                                 global_step=brain.timeStep)

            env_eval.reward_history.append(total_reward)  # doing this for keeping track of best network

            # go back to saved frame after evaluation completed
            # brain.history.add(temp_history_data)

            #####################################################################
            if config.TUTORIAL_WORLD:
                brain.inject_summary(
                    {
                        'average.q_a': avgEvalQValues_a,
                        'average.q_o': avgEvalQValues_o,
                        'average.q': (0.5 * avgEvalQValues_o + 0.5 * avgEvalQValues_a),
                        'average_reward': total_reward,
                        'average_num_pos_reward': nrewards,
                        'number_of_episodes': nepisodes,
                        'quest1_average_reward_cnt': quest1_reward_cnt,
                        'quest2_average_reward_cnt': quest2_reward_cnt,
                        'quest3_average_reward_cnt': quest3_reward_cnt
                    }, brain.timeStep)
            else:
                brain.inject_summary(
                    {
                        'average.q_a': avgEvalQValues_a,
                        'average.q_o': avgEvalQValues_o,
                        'average.q': (0.5 * avgEvalQValues_o + 0.5 * avgEvalQValues_a),
                        'average_reward': total_reward,
                        'average_numrewards': nrewards,
                        'number_of_episodes': nepisodes,
                        'quest1_average_reward_cnt': quest1_reward_cnt
                    }, brain.timeStep)

        #####################################################################
        pbar.update(1)
        if brain.timeStep > config.MAX_FRAMES:
            brain.train_writer.close()
            break

    brain.session.close()