Example no. 1
import os
import sys
import cPickle as cpickle  # Python 2 pickle module

import numpy as np
from tqdm import tqdm

# Environment, DQN, evaluate and convert_state are project-local modules;
# their import paths are not shown in this listing.


def savegame(config):
    # local symbol mapping: pairs from the file are reversed so the
    # dictionary maps index -> symbol (with '0' -> 'NULL')
    fp = open('symbolMapping' + str(sys.argv[1]) + '.txt', 'r')
    data = fp.read().split('\n')
    spd = [data_.split(' ')[::-1] for data_ in data]
    dic_local = dict(spd[0:-1])
    dic_local['0'] = 'NULL'
    fp.close()

    # global symbol mapping: pairs are kept in file order, so the
    # dictionary maps symbol -> index (with 'NULL' -> '0')
    fp = open('symbolMapping5.txt', 'r')
    data = fp.read().split('\n')
    spd = [data_.split(' ') for data_ in data]
    dic_global = dict(spd[0:-1])
    dic_global['NULL'] = '0'
    fp.close()

    # Step 1: init Game
    env = Environment(config.game_num)  # game 1 is the main game, game 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())
    brain = DQN(config)

    # checkStates = None
    # progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')
    episode_length = 0
    num_episodes = 0
    total_reward = 0
    MAX_STEPS = 105000
    totalSteps = 0
    MEM_STEPS = 10000
    memory = []
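    # main loop: step through the game with the agent's policy, buffering each
    # converted state together with its action/object Q-values for dumping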
    while totalSteps < MAX_STEPS:
        totalSteps += 1
        if env.START_NEW_GAME:
            episode_length = 0
            env.START_NEW_GAME = False
            state, reward, terminal, availableObjects = env.newGame()
            brain.history.add(state)
        action_indicator = np.zeros(actions)
        object_indicator = np.zeros(objects)
        # predict: choose an action/object pair and record the Q-values
        action_index, object_index = brain.getAction(availableObjects, True)
        Qactions, Qobjects = brain.getQValues(availableObjects)
        action_indicator[action_index] = 1
        object_indicator[object_index] = 1
        # remember the converted state together with its Q-values
        memory.append((convert_state(state, dic_local, dic_global), Qactions, Qobjects))
        # act
        nextstate, reward, terminal, availableObjects = env.step(action_index, object_index)
        total_reward += reward
        episode_length += 1
        # observe
        brain.setPerception(state, reward, action_indicator, object_indicator, nextstate, terminal, True)
        state = nextstate

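        # every MEM_STEPS steps, append the buffered states and Q-values to
        # "<game_num>_mem.txt" (one line each for the state, the action
        # Q-values and the object Q-values) and clear the buffer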
        if totalSteps % MEM_STEPS == 0:
            fileName = str(config.game_num) + "_mem.txt"
            with open(fileName, "a") as fp:
                for entry in memory:
                    for row in entry:
                        for j in row:
                            print >> fp, j,
                        print >> fp
            memory = []


        if ((terminal) or ((episode_length % config.max_episode_length) == 0)):
            num_episodes += 1
            with open("saver_reward.txt", "a") as fp:
                print >> fp, (total_reward / (num_episodes * 1.0))
            env.START_NEW_GAME = True

        pbar.update(1)


        if brain.timeStep > config.MAX_FRAMES:
            brain.train_writer.close()
            break

    brain.session.close()


def playgame(config):
    # Step 1: init Game
    env = Environment(config.game_num)  # game 1 is the main game, game 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())

    brain = DQN(config)

    # checkStates = None
    # progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')
    episode_length = 0
    num_episodes = 0
    total_reward = 0
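    # main training loop: interact with the game, feed transitions to the DQN,
    # log rewards, and periodically evaluate the agent and save the best network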
    while True:
        if env.START_NEW_GAME:
            episode_length = 0
            env.START_NEW_GAME = False
            state, reward, terminal, availableObjects = env.newGame()
            brain.history.add(state)
        action_indicator = np.zeros(actions)
        object_indicator = np.zeros(objects)
        #predict
        action_index, object_index = brain.getAction(availableObjects)
        action_indicator[action_index] = 1
        object_indicator[object_index] = 1
        #act
        nextstate, reward, terminal, availableObjects = env.step(
            action_index, object_index)
        total_reward += reward
        episode_length += 1
        #observe
        brain.setPerception(state, reward, action_indicator, object_indicator,
                            nextstate, terminal, False)
        state = nextstate

        if ((terminal) or ((episode_length % config.max_episode_length) == 0)):
            num_episodes += 1
            with open("train_reward.txt", "a") as fp:
                print >> fp, (total_reward / (num_episodes * 1.0))
            env.START_NEW_GAME = True
        #####################################################################
        # periodically evaluate Q-values on a fixed set of check states
        if (brain.timeStep % config.EVAL == 0) and (brain.timeStep != 0):
            # on the first evaluation, fix the set of states used for tracking
            if brain.timeStep / config.EVAL == 1:
                if not ((os.path.exists("checkStates.txt")) and
                        (os.path.getsize("checkStates.txt") > 0)):
                    assert config.SAMPLE_STATES % config.BATCH_SIZE == 0
                    assert config.SAMPLE_STATES < brain.memory.count
                    checkStates, _1, _2, _3, _4, _5 = brain.memory.sample()
                    with open("checkStates.txt", "w") as fp:
                        cpickle.dump(checkStates, fp)
                else:
                    with open("checkStates.txt", 'r') as fp:
                        checkStates = cpickle.load(fp)

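            # mean over the check states of the max action/object Q-value,
            # logged to evalQValue_a.txt and evalQValue_o.txt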
            evalQValues_a = brain.action_valueT.eval(
                feed_dict={brain.stateInputT: checkStates},
                session=brain.session)
            maxEvalQValues_a = np.max(evalQValues_a, axis=1)
            avgEvalQValues_a = np.mean(maxEvalQValues_a)

            with open("evalQValue_a.txt", "a") as fp:
                print >> fp, avgEvalQValues_a

            evalQValues_o = brain.object_valueT.eval(
                feed_dict={brain.stateInputT: checkStates},
                session=brain.session)
            maxEvalQValues_o = np.max(evalQValues_o, axis=1)
            avgEvalQValues_o = np.mean(maxEvalQValues_o)

            with open("evalQValue_o.txt", "a") as fp:
                print >> fp, avgEvalQValues_o
            #####################################################################
            # save current history before starting evaluation
            # temp_history_data = brain.history.copy()
            # evaluate the average reward; the same environment is reused here
            # rather than a separate evaluation environment
            # env_eval = Environment(2)
            env_eval = env
            if config.TUTORIAL_WORLD:
                total_reward, nrewards, nepisodes, quest1_reward_cnt, quest2_reward_cnt, quest3_reward_cnt = evaluate(
                    brain, env_eval, config)
            else:
                total_reward, nrewards, nepisodes, quest1_reward_cnt = evaluate(
                    brain, env_eval, config)

            with open("test_reward.txt", "a") as fp:
                print >> fp, total_reward

            #setting the best network
            if len(env_eval.reward_history) == 0 or total_reward > max(
                    env_eval.reward_history):
                # save best network
                if not os.path.exists(os.getcwd() + '/Savednetworks'):
                    os.makedirs(os.getcwd() + '/Savednetworks')
                brain.saver.save(brain.session,
                                 os.getcwd() + '/Savednetworks/' + 'network' +
                                 '-dqn',
                                 global_step=brain.timeStep)

            env_eval.reward_history.append(total_reward)  # keep track of the best network

            #go back to saved frame after evaluation completed
            # brain.history.add(temp_history_data)
            #####################################################################
            if config.TUTORIAL_WORLD:
                brain.inject_summary({
                    'average.q_a': avgEvalQValues_a,
                    'average.q_o': avgEvalQValues_o,
                    'average.q': 0.5 * avgEvalQValues_o + 0.5 * avgEvalQValues_a,
                    'average_reward': total_reward,
                    'average_num_pos_reward': nrewards,
                    'number_of_episodes': nepisodes,
                    'quest1_average_reward_cnt': quest1_reward_cnt,
                    'quest2_average_reward_cnt': quest2_reward_cnt,
                    'quest3_average_reward_cnt': quest3_reward_cnt
                }, brain.timeStep)
            else:
                brain.inject_summary({
                    'average.q_a': avgEvalQValues_a,
                    'average.q_o': avgEvalQValues_o,
                    'average.q': 0.5 * avgEvalQValues_o + 0.5 * avgEvalQValues_a,
                    'average_reward': total_reward,
                    'average_numrewards': nrewards,
                    'number_of_episodes': nepisodes,
                    'quest1_average_reward_cnt': quest1_reward_cnt
                }, brain.timeStep)


        #####################################################################
        pbar.update(1)

        if brain.timeStep > config.MAX_FRAMES:
            brain.train_writer.close()
            break

    brain.session.close()