def savegame(config):
    """Dump the network's per-symbol embeddings to a text file.

    Builds the environment and the DQN from ``config``, reads the
    ``symbolMapping<argv[1]>.txt`` table (one ``<symbol> <id>`` pair per
    line) and writes ``teacher<argv[1]>_embeddings.txt``.  For every id in
    ``range(config.vocab_size - 1)`` the output holds one line with the
    symbol followed by one line with the space-separated embedding values.

    Args:
        config: project configuration object.  This function calls its
            ``setnumactions``/``setnumobjects``/``setvocabsize`` setters and
            reads ``game_num``, ``vocab_size``, ``batch_size`` and
            ``seq_length``.
    """
    # Step 1: init Game
    env = Environment(config.game_num)  #1 is for main game 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())

    brain = DQN(config)

    run_id = str(sys.argv[1])

    # id -> symbol lookup built from the mapping file.
    dic = {}
    with open("symbolMapping" + run_id + ".txt", 'r') as fp:
        # The element after the final newline is skipped, as before.
        for line in fp.read().split('\n')[:-1]:
            splitdata = line.split(' ')
            dic[int(splitdata[1])] = splitdata[0]
    dic[0] = "NULL"

    try:
        # The context manager guarantees the output file is flushed and
        # closed; the previous version left the handle open.
        with open("teacher" + run_id + "_embeddings.txt", "w") as fp:
            for i in range(config.vocab_size - 1):
                # Put symbol id i in the first position of every sequence
                # and read back the embedding at [batch 0, position 0].
                state = np.zeros([config.batch_size, config.seq_length])
                state[:, 0] = i
                embedding = brain.output_embedT.eval(
                    feed_dict={brain.stateInputT: state},
                    session=brain.session)[0, 0, :]
                fp.write(str(dic[i]) + "\n")
                fp.write(" ".join(str(element) for element in embedding) +
                         "\n")
    finally:
        brain.session.close()
コード例 #2
0
 def setup_dqn(trainable):
     """Create a DQN configured from ``config.yaml``.

     Top-level keys read: ``learning rate``, ``replay min batch`` and
     ``replay memory size``.  Keys read from the ``dqn`` section:
     ``discount factor`` and ``layers`` (the latter is passed through
     ``ReplayMemoryModel.parse_layer_blueprints``).

     Args:
         trainable: forwarded unchanged as the first DQN argument.

     Returns:
         The constructed DQN instance.
     """
     with open("config.yaml", 'r') as config_file:
         settings = yaml.load(config_file, Loader=yaml.FullLoader)
         dqn_settings = settings["dqn"]

     learning_rate = settings["learning rate"]
     discount_factor = dqn_settings["discount factor"]
     layers = ReplayMemoryModel.parse_layer_blueprints(dqn_settings["layers"])
     replay_min_batch = settings["replay min batch"]
     replay_memory_size = settings["replay memory size"]

     return DQN(trainable, learning_rate, discount_factor, layers,
                replay_min_batch, replay_memory_size)
コード例 #3
0
def savegame(config):
    """Export word embeddings keyed by the ids of a shared symbol table.

    Two mapping files with one ``<symbol> <id>`` pair per line are read:
    ``symbolMapping<argv[1]>.txt`` (this game's table, used as id -> symbol)
    and ``symbolMapping1236.txt`` (used as symbol -> id).  Every id of the
    second table first receives a random 20-dimensional vector drawn
    uniformly from [-1, 1).  The entries for the symbols with local ids in
    ``range(config.vocab_size - 1)`` are then overwritten with the
    embedding read from the DQN.  The resulting dict is pickled to
    ``embedTeacher<argv[1]>.p``.

    Args:
        config: project configuration object.  This function calls its
            ``setnumactions``/``setnumobjects``/``setvocabsize`` setters and
            reads ``game_num``, ``vocab_size``, ``batch_size`` and
            ``seq_length``.
    """
    # Step 1: init Game
    env = Environment(config.game_num)  #1 is for main game 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())

    brain = DQN(config)

    run_id = str(sys.argv[1])

    # Local table: id -> symbol.
    dic = {}
    with open("symbolMapping" + run_id + ".txt", 'r') as fp:
        # The element after the final newline is skipped, as before.
        for line in fp.read().split('\n')[:-1]:
            splitdata = line.split(' ')
            dic[int(splitdata[1])] = splitdata[0]
    dic[0] = "NULL"

    # Shared table: symbol -> id.
    dic_trans = {}
    with open("symbolMapping1236.txt", 'r') as fp:
        for line in fp.read().split('\n')[:-1]:
            splitdata = line.split(' ')
            dic_trans[splitdata[0]] = int(splitdata[1])
    dic_trans["NULL"] = 0

    dic_embedding = {}
    #1st let us initialize it randomly
    sess = tf.InteractiveSession()
    stateInput = tf.placeholder(tf.int32, [len(dic_trans)])
    embed = tf.Variable(tf.random_uniform([len(dic_trans), 20], -1, 1),
                        name="embed")
    word_embeds = tf.nn.embedding_lookup(embed, stateInput)
    tf.initialize_all_variables().run()
    shared_ids = sorted(dic_trans.values())
    random_vectors = word_embeds.eval(feed_dict={stateInput: shared_ids})

    for shared_id, vector in zip(shared_ids, random_vectors):
        dic_embedding[shared_id] = vector
    sess.close()

    # Overwrite the random vectors with the network's embeddings for every
    # symbol this game knows about.
    for i in range(config.vocab_size - 1):
        state = np.zeros([config.batch_size, config.seq_length])
        state[:, 0] = i
        embedding = brain.word_embeds.eval(feed_dict={brain.stateInput: state},
                                           session=brain.session)[0, 0]
        dic_embedding[dic_trans[dic[i]]] = embedding
    brain.session.close()

    # The context manager closes the pickle file; the previous version
    # passed an anonymous open() handle that was never closed.
    with open("embedTeacher" + run_id + ".p", "wb") as fp:
        cpickle.dump(dic_embedding, fp)
コード例 #4
0
    def __init__(self, **kwargs):
        """Initializes the game window.

        Recognized keyword arguments (all optional):
            cell_size, length, height: stored on the instance under the
                same name.
            speed: stored as ``actions_per_second``.
            sb_dimensions, sb_lr: when BOTH are given, a
                ``SplitBrainNetwork`` is created as
                ``self.split_brain_network``.
            dqn_dimensions, dqn_lr, dqn_batch_size, dqn_sample_size: when
                ALL FOUR are given, a ``DQN`` is created as ``self.dqn``.

        Consumed options are popped from ``kwargs``.
        """
        pg.init()
        pg.display.set_caption('Snake')
        self.clock = pg.time.Clock()
        if 'cell_size' in kwargs:
            self.cell_size = kwargs.pop('cell_size')
        if 'length' in kwargs:
            self.length = kwargs.pop('length')
        if 'height' in kwargs:
            self.height = kwargs.pop('height')
        if 'speed' in kwargs:
            self.actions_per_second = kwargs.pop('speed')
        self._exit = False
        self.is_training = True
        # Attribute name (including its spelling) kept as-is: other code may
        # refer to it.
        self.trainig_pressed = False
        self.score = 0
        self.actions = 0
        self.scores = []
        self.averages = []

        # For split-brain network
        # `"a" and "b" in kwargs` only tests the LAST key, so the old check
        # built the network with dimensions=False when only sb_lr was given.
        if all(key in kwargs for key in ("sb_dimensions", "sb_lr")):
            dimensions = kwargs.pop("sb_dimensions")
            lr = kwargs.pop("sb_lr")
            self.split_brain_network = SplitBrainNetwork(dimensions=dimensions,
                                                         lr=lr)

        # DQN
        dqn_keys = ("dqn_dimensions", "dqn_lr", "dqn_batch_size",
                    "dqn_sample_size")
        if all(key in kwargs for key in dqn_keys):
            dimensions = kwargs.pop("dqn_dimensions")
            lr = kwargs.pop("dqn_lr")
            batch_size = kwargs.pop("dqn_batch_size")
            sample_size = kwargs.pop("dqn_sample_size")
            self.dqn = DQN(dimensions=dimensions,
                           lr=lr,
                           batch_size=batch_size,
                           sample_size=sample_size)
コード例 #5
0
def play_model(args):
    """Render up to 5000 greedy steps of a DQN on SuperMarioBros-1-1-v0.

    The network is built from the screen size returned by ``get_screen``
    and the environment's action count.  When ``args.targetNet`` is set,
    its weights are loaded from that path.  The loop stops early as soon as
    the environment reports ``done``.

    Args:
        args: options object; reads ``ngpu`` and ``targetNet`` and stores
            the action count in ``n_actions``.
    """
    # Use the GPU only when one is available and requested.
    want_cuda = torch.cuda.is_available() and args.ngpu > 0
    device = torch.device("cuda" if want_cuda else "cpu")

    # Build env (first level, right only)
    base_env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(base_env, SIMPLE_MOVEMENT)

    # The screen shape must have exactly four axes; the last two size the
    # network.
    first_frame = get_screen(env, device)
    _, _, screen_height, screen_width = first_frame.shape

    # Number of actions comes from the gym action space.
    args.n_actions = env.action_space.n

    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)

    if args.targetNet:
        weights = torch.load(args.targetNet, map_location=device)
        target_net.load_state_dict(weights)

    with torch.no_grad():
        env.reset()
        for _ in range(5000):
            env.render()
            frame = get_screen(env, device)
            # Index of the highest-valued action for the current frame.
            best_action = target_net(frame).max(1)[1].view(1, 1)
            _, _, done, _ = env.step(int(best_action))
            if done:
                break

    env.close()
コード例 #6
0
ファイル: train_DQN.py プロジェクト: jt2594838/exp2
    total_loss = 0.0
    total_reward = 0.0
    cnt = 0
    while not env.is_over():
        curr_state = env.get_curr_state().unsqueeze(0)
        reward, act = net.take_action(Variable(curr_state), use_greedy=True)
        true_reward = Variable(torch.FloatTensor([env.step(act)]),
                               requires_grad=False)
        if use_cuda:
            true_reward = true_reward.cuda()
        loss = criterion(reward, true_reward)
        total_loss += loss.data[0]
        total_reward += true_reward
        cnt = cnt + 1
    return cnt, total_loss, total_reward


# Script entry point: load the data, train the network, then report one
# evaluation pass.
if __name__ == '__main__':

    # initializations
    loader = PickleDataReader(data_file, input_size, output_size, data_stride)
    # NOTE(review): test_set is not referenced again in this block.
    train_set, test_set = loader.load(train_ratio, False)
    train_set = WindowDataSet(train_set)

    net = DQN(config)
    env = Environment(train_set)

    train_net(net, env, max_epoch, non_greedy_decay)
    # Evaluation reuses the environment that was built from train_set.
    cnt, loss, reward = eval_decision(net, env, criterion, use_cuda)
    print("step : %d, loss : %f, reward : %f" % (cnt, loss, reward))
コード例 #7
0
def savegame(config):
    """Run the DQN agent in the game and log visited states with Q-values.

    At every step the current state (translated by ``convert_state`` using
    the local and global symbol tables) is buffered together with the
    action and object Q-values returned by ``brain.getQValues``.  Every
    ``MEM_STEPS`` steps the buffer is appended to ``<game_num>_mem.txt`` as
    three space-separated lines per entry (state, action Q-values, object
    Q-values).  The running average reward per episode is appended to
    ``saver_reward.txt`` at the end of each episode.

    The loop stops after ``MAX_STEPS`` steps or once ``brain.timeStep``
    exceeds ``config.MAX_FRAMES``.

    Args:
        config: project configuration object.  This function calls its
            ``setnumactions``/``setnumobjects``/``setvocabsize`` setters and
            reads ``game_num``, ``MAX_FRAMES`` and ``max_episode_length``.
    """
    # Local table for this game, inverted to id -> symbol.  Both ids and
    # symbols stay strings.
    with open('symbolMapping' + str(sys.argv[1]) + '.txt', 'r') as fp:
        data = fp.read().split('\n')
    # The element after the final newline is skipped, as before.
    dic_local = dict(line.split(' ')[::-1] for line in data[:-1])
    dic_local['0'] = 'NULL'

    # Global table: symbol -> id.
    with open('symbolMapping5.txt', 'r') as fp:
        data = fp.read().split('\n')
    dic_global = dict(line.split(' ') for line in data[:-1])
    dic_global['NULL'] = '0'

    # Step 1: init Game
    env = Environment(config.game_num) #1 is for main game 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())
    brain = DQN(config)

    #adding progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')
    episode_length = 0
    num_episodes = 0
    total_reward = 0
    MAX_STEPS = 105000
    totalSteps = 0
    MEM_STEPS = 10000
    memory = []
    while totalSteps < MAX_STEPS:
        totalSteps += 1
        if env.START_NEW_GAME:
            episode_length = 0
            env.START_NEW_GAME = False
            state, reward, terminal, availableObjects = env.newGame()
            brain.history.add(state)
        action_indicator = np.zeros(actions)
        object_indicator = np.zeros(objects)
        #predict
        action_index, object_index = brain.getAction(availableObjects, True)
        Qactions, Qobjects = brain.getQValues(availableObjects)
        action_indicator[action_index] = 1
        object_indicator[object_index] = 1
        memory.append((convert_state(state, dic_local, dic_global),
                       Qactions, Qobjects))
        #act
        nextstate, reward, terminal, availableObjects = env.step(
            action_index, object_index)
        total_reward += reward
        episode_length += 1
        #observe
        brain.setPerception(state, reward, action_indicator, object_indicator,
                            nextstate, terminal, True)
        state = nextstate

        # NOTE(review): entries buffered after the last multiple of
        # MEM_STEPS are never written (MAX_STEPS is not a multiple of
        # MEM_STEPS) -- confirm this is intended.
        if totalSteps % MEM_STEPS == 0:
            fileName = str(config.game_num) + "_mem.txt"
            with open(fileName, "a") as fp:
                for entry in memory:
                    # One line each for state, action Q-values and object
                    # Q-values.
                    for row in entry[:3]:
                        fp.write(" ".join(str(j) for j in row) + "\n")
            memory = []

        if terminal or (episode_length % config.max_episode_length) == 0:
            num_episodes += 1
            with open("saver_reward.txt", "a") as fp:
                fp.write(str(total_reward / (num_episodes * 1.0)) + "\n")
            env.START_NEW_GAME = True

        pbar.update(1)

        if brain.timeStep > config.MAX_FRAMES:
            brain.train_writer.close()
            break

    pbar.close()
    brain.session.close()
コード例 #8
0
import pickle
import time
import gym

from models.DQN import DQN

# Script entry point: run a DQN agent on CartPole-v0 with rendering enabled.
if __name__ == '__main__':
    # .unwrapped accesses the environment underneath gym's wrappers.
    env = gym.make('CartPole-v0').unwrapped

    # presumably i/o are input/output sizes and h1/h2 hidden-layer widths --
    # TODO confirm against models.DQN
    RL = DQN(
        i=4,
        o=2,
        h1=512,
        h2=128,
        isinit=False,
        save_file='saves/CartPole/params/ql.pkl',
        hyperparam={
            'learn_rate': 1e-4,  # lr=1e-3 for training
            'weight_decay': 1e-5,
            'reward_decay': 0.1
        })
    # Result accumulators; not filled in the part of the script visible here.
    data1 = []
    data2 = []
    scores = []
    # Up to 50000 episodes of at most 10000 turns each.
    for ii in range(50000):
        state1 = env.reset()

        for turn in range(10000):
            env.render()

            action = RL.action(state1, e=0.1)  #  e=0.3 for training
コード例 #9
0
ファイル: train.py プロジェクト: apourchot/DOOMRL
    game.set_screen_resolution(ScreenResolution.RES_256X144)
    game.init()
    print("Doom initialized.")
    return game


# Create Doom instance
game = vizdoom_init(config_file_path)
n = game.get_available_buttons_size()
# Every on/off combination of the n buttons (2**n button vectors).
actions = [list(a) for a in it.product([0, 1], repeat=n)]

# Build the agent with loading=1 or loading=0 depending on load_model.
# NOTE(review): the loading branch does not pass parameter_exploration --
# confirm this is intended.
if load_model:
    model = DQN(game,
                actions,
                file_name,
                prioritized=use_prioritized,
                ddqn=use_ddqn,
                gpu=use_gpu,
                loading=1)
else:
    model = DQN(game,
                actions,
                file_name,
                prioritized=use_prioritized,
                ddqn=use_ddqn,
                gpu=use_gpu,
                parameter_exploration=use_parameter_exploration,
                loading=0)

print("Starting the training.")
# Start timestamp for the run.
time_start = time()
コード例 #10
0
ファイル: test.py プロジェクト: apourchot/DOOMRL
    game.set_screen_format(ScreenFormat.CRCGCB)
    game.set_screen_resolution(ScreenResolution.RES_640X360)
    game.init()
    print("Doom initialized.")
    return game


# Create Doom instance
game = vizdoom_init(config_file_path)
n = game.get_available_buttons_size()
# Every on/off combination of the n buttons (2**n button vectors).
actions = [list(a) for a in it.product([0, 1], repeat=n)]

# Agent constructed with loading=1.
model = DQN(game,
            actions,
            file_name,
            ddqn=use_ddqn,
            parameter_exploration=use_parameter_exploration,
            gpu=use_gpu,
            loading=1)

print("======================================")
print("Testing trained neural network.")

print("Testing...")
# Score accumulator; not filled in the part of the script visible here.
test_scores = []

# Step the model with training disabled and display enabled until each
# episode finishes.
for _ in range(episodes_to_watch):

    game.new_episode()
    while not game.is_episode_finished():
        model.step(training=False, showing=True)
コード例 #11
0
    cfg.TRAIN_DATA = "data/{}_2000-2019.csv".format(df)
    cfg.VAL_DATA = "data/{}_2020.csv".format(df)

    # Loading datasets
    train_data = pd.read_csv(cfg.TRAIN_DATA)
    val_data = pd.read_csv(cfg.VAL_DATA)

    # Creating environment
    env = Environment(train_data, val_data)
    env.reset()

    ######################## Setting up the algorithm and assigning to the agent ########################
    ## DQN
    if cfg._DQN_ == 1:
        from models.DQN import DQN
        dqn = DQN(env.obs.shape[0], len(env.actions()), device)
        if (cfg.LOAD_MODEL):
            dqn.load_checkpoint()
        agent = Agent(env, dqn, cfg.DQN)

    ## AC
    if cfg._AC_ == 1:
        from models.AC import AC
        ac = AC(env.obs.shape[0], len(env.actions()), device)
        if (cfg.LOAD_MODEL):
            ac.load_checkpoint()
        agent = Agent(env, ac, cfg.AC)

    # PPO
    if cfg._PPO_ == 1:
        from models.PPO import PPO
コード例 #12
0
import pickle
import time
import gym

from models.DQN import DQN

# Script entry point: run a DQN agent on Acrobot-v1 with rendering enabled.
if __name__ == '__main__':
    # .unwrapped accesses the environment underneath gym's wrappers.
    env = gym.make('Acrobot-v1').unwrapped

    # presumably i/o are input/output sizes and h1/h2 hidden-layer widths --
    # TODO confirm against models.DQN
    RL = DQN(
        i=6,
        o=3,
        h1=512,
        h2=128,
        isinit=False,
        save_file='saves/Acrobot/params/ql.pkl',
        hyperparam={
            'learn_rate': 1e-3,  # lr=1e-3 for training
            'weight_decay': 1e-5,
            'reward_decay': 0.01
        })

    # Score accumulator; not filled in the part of the script visible here.
    scores = []
    # Up to 1000000 episodes of at most 1000000 turns each.
    for ii in range(1000000):
        state1 = env.reset()

        for turn in range(1000000):
            env.render()

            action = RL.action(state1, e=0.1)  #  e=0.3 for training
            state2, reward, done, info = env.step(action)
コード例 #13
0
def playgame(config):
    """Train the DQN agent on the game, evaluating it periodically.

    Each iteration selects an action/object pair, steps the environment and
    passes the transition to ``brain.setPerception``.  At the end of every
    episode the running average training reward per episode is appended to
    ``train_reward.txt``.

    Whenever ``brain.timeStep`` is a non-zero multiple of ``config.EVAL``:
      * the mean of the max action and object Q-values over a fixed set of
        sampled states is appended to ``evalQValue_a.txt`` and
        ``evalQValue_o.txt``.  The states are sampled once and cached in
        ``checkStates.txt``;
      * ``evaluate`` is run and its reward is appended to
        ``test_reward.txt``;
      * the network is saved under ``./Savednetworks`` when that reward
        beats every earlier evaluation;
      * all metrics are passed to ``brain.inject_summary``.

    The loop ends once ``brain.timeStep`` exceeds ``config.MAX_FRAMES``.

    Args:
        config: project configuration object.  This function calls its
            ``setnumactions``/``setnumobjects``/``setvocabsize`` setters and
            reads ``game_num``, ``MAX_FRAMES``, ``max_episode_length``,
            ``EVAL``, ``SAMPLE_STATES``, ``BATCH_SIZE`` and
            ``TUTORIAL_WORLD``.
    """
    # Step 1: init Game
    env = Environment(config.game_num)  #1 is for main game 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())

    brain = DQN(config)

    #adding progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')
    episode_length = 0
    num_episodes = 0
    total_reward = 0
    while True:
        if env.START_NEW_GAME:
            episode_length = 0
            env.START_NEW_GAME = False
            state, reward, terminal, availableObjects = env.newGame()
            brain.history.add(state)
        action_indicator = np.zeros(actions)
        object_indicator = np.zeros(objects)
        #predict
        action_index, object_index = brain.getAction(availableObjects)
        action_indicator[action_index] = 1
        object_indicator[object_index] = 1
        #act
        nextstate, reward, terminal, availableObjects = env.step(
            action_index, object_index)
        total_reward += reward
        episode_length += 1
        #observe
        brain.setPerception(state, reward, action_indicator, object_indicator,
                            nextstate, terminal, False)
        state = nextstate

        if terminal or (episode_length % config.max_episode_length) == 0:
            num_episodes += 1
            with open("train_reward.txt", "a") as fp:
                fp.write(str(total_reward / (num_episodes * 1.0)) + "\n")
            env.START_NEW_GAME = True

        #####################################################################
        #for evaluating qvalues
        if (brain.timeStep % config.EVAL == 0) and (brain.timeStep != 0):
            # The fixed evaluation states are sampled (or loaded from an
            # earlier run) only at the very first evaluation point.
            if (brain.timeStep / config.EVAL == 1):
                if not ((os.path.exists("checkStates.txt")) and
                        (os.path.getsize("checkStates.txt") > 0)):
                    assert config.SAMPLE_STATES % config.BATCH_SIZE == 0
                    assert config.SAMPLE_STATES < brain.memory.count
                    checkStates, _1, _2, _3, _4, _5 = brain.memory.sample()
                    # NOTE(review): text-mode pickle files only work with
                    # Python 2 / protocol 0; the mode is kept so existing
                    # cache files stay readable.
                    with open("checkStates.txt", "w") as fp:
                        cpickle.dump(checkStates, fp)
                else:
                    with open("checkStates.txt", 'r') as fp:
                        checkStates = cpickle.load(fp)

            evalQValues_a = brain.action_valueT.eval(
                feed_dict={brain.stateInputT: checkStates},
                session=brain.session)
            maxEvalQValues_a = np.max(evalQValues_a, axis=1)
            avgEvalQValues_a = np.mean(maxEvalQValues_a)

            with open("evalQValue_a.txt", "a") as fp:
                fp.write(str(avgEvalQValues_a) + "\n")

            evalQValues_o = brain.object_valueT.eval(
                feed_dict={brain.stateInputT: checkStates},
                session=brain.session)
            maxEvalQValues_o = np.max(evalQValues_o, axis=1)
            avgEvalQValues_o = np.mean(maxEvalQValues_o)

            with open("evalQValue_o.txt", "a") as fp:
                fp.write(str(avgEvalQValues_o) + "\n")

            #####################################################################
            #now let us evaluate avg reward
            # Evaluation runs on the training environment itself.
            env_eval = env
            # The evaluation result gets its own name.  It used to be
            # unpacked into `total_reward`, which clobbered the running
            # training total and corrupted every later value written to
            # train_reward.txt.
            if config.TUTORIAL_WORLD:
                (eval_reward, nrewards, nepisodes, quest1_reward_cnt,
                 quest2_reward_cnt, quest3_reward_cnt) = evaluate(
                     brain, env_eval, config)
            else:
                eval_reward, nrewards, nepisodes, quest1_reward_cnt = evaluate(
                    brain, env_eval, config)

            with open("test_reward.txt", "a") as fp:
                fp.write(str(eval_reward) + "\n")

            #setting the best network
            if len(env_eval.reward_history) == 0 or eval_reward > max(
                    env_eval.reward_history):
                # save best network
                save_dir = os.getcwd() + '/Savednetworks'
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                brain.saver.save(brain.session,
                                 save_dir + '/' + 'network' + '-dqn',
                                 global_step=brain.timeStep)

            env_eval.reward_history.append(
                eval_reward)  #doing this for keeping track of best network

            #####################################################################
            if config.TUTORIAL_WORLD:
                brain.inject_summary(
                    {
                        'average.q_a':
                        avgEvalQValues_a,
                        'average.q_o':
                        avgEvalQValues_o,
                        'average.q':
                        (0.5 * avgEvalQValues_o + 0.5 * avgEvalQValues_a),
                        'average_reward':
                        eval_reward,
                        'average_num_pos_reward':
                        nrewards,
                        'number_of_episodes':
                        nepisodes,
                        'quest1_average_reward_cnt':
                        quest1_reward_cnt,
                        'quest2_average_reward_cnt':
                        quest2_reward_cnt,
                        'quest3_average_reward_cnt':
                        quest3_reward_cnt
                    }, brain.timeStep)
            else:
                brain.inject_summary(
                    {
                        'average.q_a':
                        avgEvalQValues_a,
                        'average.q_o':
                        avgEvalQValues_o,
                        'average.q':
                        (0.5 * avgEvalQValues_o + 0.5 * avgEvalQValues_a),
                        'average_reward':
                        eval_reward,
                        'average_numrewards':
                        nrewards,
                        'number_of_episodes':
                        nepisodes,
                        'quest1_average_reward_cnt':
                        quest1_reward_cnt
                    }, brain.timeStep)

        #####################################################################
        pbar.update(1)

        if brain.timeStep > config.MAX_FRAMES:
            brain.train_writer.close()
            break

    brain.session.close()
コード例 #14
0
def train_agent(args):
    """Train a DQN on SuperMarioBros-1-1-v0 and save both networks.

    A policy network and a target network are built from the screen size
    returned by ``get_screen`` and the environment's action count.
    Optional checkpoints are restored first: ``args.policyNet`` into the
    policy network and ``args.targetNet`` into the target network.  Without
    a target checkpoint the target network starts as a copy of the policy
    network.

    Args:
        args: options object.  Reads ``ngpu``, ``policyNet``, ``targetNet``,
            ``target_update``, ``output_policyNet`` and
            ``output_targetNet``; sets ``n_actions`` and ``steps_done``.
    """
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # setup networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    args.n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)

    # The policy checkpoint used to be loaded into target_net, and an
    # unconditional copy of the (still random) policy weights then
    # overwrote it, so neither checkpoint took effect.
    if args.policyNet:
        policy_net.load_state_dict(
            torch.load(args.policyNet, map_location=device))

    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))
    else:
        target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    args.steps_done = 0

    num_episodes = 1

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        # The state is the difference between two consecutive screens.
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, policy_net, args, device)
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                # A terminal transition is stored with no next state.
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model(optimizer, memory, policy_net, target_net, args,
                           device)
            if done:
                episode_durations.append(t + 1)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % args.target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), args.output_policyNet)
            torch.save(target_net.state_dict(), args.output_targetNet)

        if i_episode % 10 == 0:
            print(f'{i_episode+1}/{num_episodes}: Completed Episode.')

    print('Complete')
    env.close()

    torch.save(policy_net.state_dict(), args.output_policyNet)
    torch.save(target_net.state_dict(), args.output_targetNet)
コード例 #15
0
ファイル: experiment.py プロジェクト: Atrus619/DeckOfCards
def run_full_experiment(config):
    """Run a complete self-play training experiment.

    Existing experience in the database is archived and cleared first.  For
    each of ``config.num_cycles`` cycles the function:
      1. plays ``config.episodes_per_cycle`` games between the two agents;
      2. trains ``model_1`` through ``pu.train_model``;
      3. every ``config.player_2_update_freq`` cycles, stores the policy
         net history and replaces player 2's model with a copy of
         ``model_1``;
      4. every ``config.benchmark_freq`` cycles, records the win rate
         against player 2 and against a random bot, plus the average
         reward, via ``db.insert_metrics``;
      5. every ``config.checkpoint_freq`` cycles (when not ``None``), saves
         a checkpoint of ``model_1``.

    The final model is saved under ``config.run_id``.

    Args:
        config: experiment configuration object (run id, DQN parameters,
            epsilon schedule, bot names and the frequencies listed above).
    """
    # archiving old experience
    db.archive_exp(db.get_all_exp())
    db.delete_all_exp()

    util.setup_file_logger(name=config.run_id, filename=config.run_id)
    logger = logging.getLogger(config.run_id)
    start_time = time.time()

    # Define players
    model_1 = DQN(run_id=config.run_id, **config.DQN_params)
    model_2 = model_1.copy()
    epsilon = Epsilon(epsilon_func=config.epsilon_func,
                      max_epsilon=config.max_epsilon,
                      min_epsilon=config.min_epsilon,
                      eval_epsilon=config.eval_epsilon,
                      num_cycles=config.num_cycles,
                      decrement=config.epsilon_decrement)

    player_list = [
        Agent(name=config.bot_1_name, model=model_1, epsilon=epsilon),
        Agent(name=config.bot_2_name, model=model_2, epsilon=epsilon)
    ]

    winner_list = []
    previous_experience_id = 0

    util.save_config(config=config, path=config.run_id)

    # For each cycle
    logger.info('Beginning run titled: ' + config.run_id)
    logger.info(cs.DIVIDER)

    for i in range(1, config.num_cycles + 1):
        # For each episode, play through episode and insert each state/action pair into the database
        logger.info('Beginning cycle: ' + str(i) + ' / ' +
                    str(config.num_cycles) + '\tCumulative Time Elapsed: ' +
                    util.get_pretty_time(time.time() - start_time))
        logger.info(
            f'Current Epsilon: {epsilon.get_epsilon(current_cycle=i):.3f}')
        cycle_start_time = time.time()

        # Async parallelization. May want to consider doing cpu_count - 1 to allow user to do things while it runs. Sux cuz of memory copying I think.
        # with mp.Pool(mp.cpu_count() - 1) as pool:
        #     game_output = pool.starmap_async(parallel.play_game, [(config.game, player_list, config.run_id, i) for j in range(config.episodes_per_cycle)]).get()

        # Old serial method
        winner_list += pu.play_games(num_games=config.episodes_per_cycle,
                                     name=config.game,
                                     players=player_list,
                                     run_id=config.run_id,
                                     current_cycle=i,
                                     config=config)

        logger.info('Data collection complete.\tTotal Episode Time: ' +
                    util.get_pretty_time(time.time() - cycle_start_time))
        logger.info('Loading experience and training model...')
        training_start_time = time.time()

        # Import data from database based on experience replay buffer and train model
        pu.train_model(model=model_1, config=config)

        logger.info('Model training complete.\tTotal Training Time: ' +
                    util.get_pretty_time(time.time() - training_start_time))

        # Update model_2
        if i % config.player_2_update_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info(
                'Storing history and setting model 2 equal to model 1...')
            player_list[0].model.policy_net.store_history()
            player_list[1].set_model(model=model_1.copy())

        # Benchmark
        if i % config.benchmark_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info('Benchmarking...')

            # Player 1's win rate against player 2 since the last benchmark.
            # presumably winner_list holds 0 for a player-1 win and 1 for a
            # player-2 win -- verify against pu.play_games
            benchmark_cycle_win_rate = 1 - sum(winner_list) / len(winner_list)
            winner_list = []  # Reset winner list

            # Play against random bot and measure win rate
            random_win_rate = benchmark.benchmark_test(
                primary_model=model_1,
                benchmark_model=RandomBot(),
                benchmark_bot_name=config.random_bot_name,
                num_games=config.random_bot_cycles,
                run_id=config.run_id if config.log_random_benchmark else None)
            logger.info(
                f'Winrate vs. Random Bot: {random_win_rate * 100:.1f}%')

            # Play against expert policy bot and measure win rate
            # expert_policy_win_rate = benchmark.benchmark_test(primary_model=model_1, benchmark_model=ExpertPolicy(), benchmark_bot_name=config.expert_policy_bot_name,
            #                                                   num_games=config.random_bot_cycles, run_id=config.run_id if config.log_expert_policy_benchmark else None)
            # logger.info(f'Winrate vs. Expert Policy: {expert_policy_win_rate * 100:.1f}%')

            # Collect average reward from database
            average_reward = benchmark.get_average_reward(
                run_id=config.run_id,
                previous_experience_id=previous_experience_id,
                agent_id=config.bot_1_name,
                opponent_id=config.bot_2_name)
            # The expert-policy benchmark above is disabled, so its win rate
            # is recorded as 0.0.
            db.insert_metrics(run_id=config.run_id,
                              win_rate=benchmark_cycle_win_rate,
                              win_rate_random=random_win_rate,
                              win_rate_expert_policy=0.0,
                              average_reward=average_reward)

            previous_experience_id = db.get_max_id(config.run_id)

        # Checkpoint
        if config.checkpoint_freq is not None and i % config.checkpoint_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info('Model checkpoint reached. Saving checkpoint...')
            model_1.save(folder=os.path.join(config.checkpoint_folder,
                                             config.run_id),
                         title=util.get_checkpoint_model_name(cycle=i))

        logger.info('Cycle complete.\tTotal Cycle Time: ' +
                    util.get_pretty_time(time.time() - cycle_start_time))
        logger.info(cs.DIVIDER)

    # Use the run's own logger: the module-level logging.info() call used
    # before goes to the root logger instead of the one obtained for this
    # run.
    logger.info('Training complete.\tTotal Run Time: ' +
                util.get_pretty_time(time.time() - start_time) +
                '\tSaving model and exiting...')
    model_1.save(title=config.run_id)
コード例 #16
0
# Exploration schedule constants: select_action computes the threshold as
# EPS_END + (EPS_START - EPS_END) * exp(-steps_done / EPS_DECAY).
EPS_END = 0.05
EPS_DECAY = 200
# presumably the number of episodes between target-network syncs -- the use
# is not visible in this part of the file
TARGET_UPDATE = 10

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
env.reset()
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n
print(n_actions)

# Two identically shaped networks; the target network starts as a copy of
# the policy network and is put in eval mode.
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only the policy network's parameters are handed to the optimizer.
optimizer = optim.Adam(policy_net.parameters())
memory = ReplayMemory(10000)

# Global counter read (and declared global) by select_action.
steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)