    def train(self):
        tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
        summary_writer = tf.summary.FileWriter('{0}{1}'.format(self.summary_path, int(time.time())), self.sess.graph_def)

        # the agent plays against itself, making the best move for each player
        validation_interval = 10000
        episodes = 200000
        t = trange(episodes, desc='Training', leave=True)
        for episode in t:
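            # fresh self-play agents every episode; p is assumed to be an
            # exploration probability consumed by TDAgent (random move noise)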
            players = [TDAgent(Game.TOKENS[0], self, p=np.random.rand() / 3),
                       TDAgent(Game.TOKENS[1], self, p=np.random.rand() / 3)]
            if episode != 0 and episode % validation_interval == 0:
                self.test(episodes=200)
                np.random.seed()  # re-randomize after the seeded evaluation games
            t.refresh()
            game = Game.new()
            player_num = random.randint(0, 1)
            if player_num == 0:
                game.reverse()
            x = game.extract_features(players[player_num].player)

            game_step = 0
            while not game.is_over():
                game.next_step(players[player_num], player_num)
                player_num = (player_num + 1) % 2

                x_next = game.extract_features(players[player_num].player)
                V_next = self.get_output(x_next)
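                # TD update: train_op (assumed to implement the TD(lambda)
                # weight update) moves the value of the previous position x
                # toward V_next, the network's estimate for the successor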
                self.sess.run(self.train_op, feed_dict={ self.x: x, self.V_next: V_next })

                x = x_next
                game_step += 1

            winner = game.winner()

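            # terminal step: train toward the actual outcome (winner is 0 or 1)
            # and run reset_op, which presumably clears the eligibility traces
            # accumulated during the game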
            _, global_step, summaries, _ = self.sess.run([
                self.train_op,
                self.global_step,
                self.summaries_op,
                self.reset_op
            ], feed_dict={ self.x: x, self.V_next: np.array([[winner]], dtype='float') })
            summary_writer.add_summary(summaries, global_step=global_step)

            #tqdm.write("Game %d/%d (Winner: %s) in %d turns" % (episode, episodes, players[winner].player, game_step))
            self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)
        summary_writer.close()
Example #2
    def test(self, episodes=100, draw=False):
        players = [TDAgent(Game.TOKENS[0], self), RandomAgent(Game.TOKENS[1])]
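        # winners[i] counts games won by players[i]; a running win rate for
        # the TD agent is printed after every game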
        winners = [0, 0]
        for episode in range(episodes):
            game = Game.new()

            winner = game.play(players, draw=draw)
            winners[winner] += 1

            winners_total = sum(winners)
            print("[Episode %d] %s (%s) vs %s (%s) %d:%d of %d games (%.2f%%)" % (episode, \
                players[0].name, players[0].player, \
                players[1].name, players[1].player, \
                winners[0], winners[1], winners_total, \
                (winners[0] / winners_total) * 100.0))
Example #3
    def train(self):
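        # pin the graph to the first GPU; ops without a GPU kernel will fall
        # back to CPU only if the session was created with allow_soft_placement
        # (an assumption about the surrounding setup)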
        with tf.device('/gpu:0'):
            tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
            summary_writer = tf.summary.FileWriter('{0}{1}'.format(self.summary_path, int(time.time())), self.sess.graph_def)

            # the agent plays against itself, making the best move for each player
            players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]

            validation_interval = 1000
            episodes = 5000

            for episode in range(episodes):
                if episode != 0 and episode % validation_interval == 0:
                    self.test(episodes=100)

                game = Game.new()
                player_num = random.randint(0, 1)

                x = game.extract_features(players[player_num].player)

                game_step = 0
                while not game.is_over():
                    game.next_step(players[player_num], player_num)
                    player_num = (player_num + 1) % 2

                    x_next = game.extract_features(players[player_num].player)
                    V_next = self.get_output(x_next)
                    self.sess.run(self.train_op, feed_dict={ self.x: x, self.V_next: V_next })

                    x = x_next
                    game_step += 1

                winner = game.winner()

                _, global_step, summaries, _ = self.sess.run([
                    self.train_op,
                    self.global_step,
                    self.summaries_op,
                    self.reset_op
                ], feed_dict={ self.x: x, self.V_next: np.array([[winner]], dtype='float') })
                summary_writer.add_summary(summaries, global_step=global_step)

                print("Game %d/%d (Winner: %s) in %d turns" % (episode, episodes, players[winner].player, game_step))
                self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)

            summary_writer.close()

            self.test(episodes=1000)
Example #4
    def train(self):
        tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
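        # tf.train.SummaryWriter is the pre-1.0 TensorFlow name for what later
        # became tf.summary.FileWriter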
        summary_writer = tf.train.SummaryWriter('{0}{1}'.format(self.summary_path, int(time.time())), self.sess.graph_def)

        # the agent plays against itself, making the best move for each player
        players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]

        validation_interval = 1000
        episodes = 5000

        for episode in range(episodes):
            if episode != 0 and episode % validation_interval == 0:
                self.test(episodes=100)

            game = Game.new()
            player_num = random.randint(0, 1)

            x = game.extract_features(players[player_num].player)

            game_step = 0
            while not game.is_over():
                game.next_step(players[player_num], player_num)
                player_num = (player_num + 1) % 2

                x_next = game.extract_features(players[player_num].player)
                V_next = self.get_output(x_next)
                self.sess.run(self.train_op, feed_dict={ self.x: x, self.V_next: V_next })

                x = x_next
                game_step += 1

            winner = game.winner()

            _, global_step, summaries, _ = self.sess.run([
                self.train_op,
                self.global_step,
                self.summaries_op,
                self.reset_op
            ], feed_dict={ self.x: x, self.V_next: np.array([[winner]], dtype='float') })
            summary_writer.add_summary(summaries, global_step=global_step)

            print("Game %d/%d (Winner: %s) in %d turns" % (episode, episodes, players[winner].player, game_step))
            self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)

        summary_writer.close()

        self.test(episodes=1000)
Example #5
    def test(self, episodes=100, draw=False, save=None):
        players = [
            TDAgent(Game.TOKENS[0], self),
            TDAgent(Game.TOKENS[1], self)
        ]
        winners = [0, 0]
        for episode in range(episodes):
            game = Game.new()

            winner = game.play(players, draw=draw)
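            # optionally dump a record of each game via game.save_tmg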
            if save:
                game.save_tmg(os.path.join(save, str(episode) + '.tmg'))
            winners[winner] += 1

            winners_total = sum(winners)
            print("[Episode %d] %s (%s) vs %s (%s) %d:%d of %d games (%.2f%%)" % (episode, \
                players[0].name, players[0].player, \
                players[1].name, players[1].player, \
                winners[0], winners[1], winners_total, \
                (winners[0] / winners_total) * 100.0))
Example #6
    def test(self, episodes=100, draw=False, mode=0):
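        # mode 0: TD agent ('x') vs Today_bot; mode 1: sides swapped;
        # any other value: TD-agent self-play (reported as mode 3 below)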
        if mode == 0:
            players = [TDAgent(Game.TOKENS[0], self), Today_bot(Game.TOKENS[1])]
        elif mode == 1:
            players = [Today_bot(Game.TOKENS[0]), TDAgent(Game.TOKENS[1], self)]
        else:
            players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]
        winners = [0, 0]
        for episode in range(episodes):
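            # fixed per-episode seed so evaluation games are reproducible
            # across validation runs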
            np.random.seed(episode)
            game = Game.new()

            winner = game.play(players, draw=draw)
            winners[winner] += 1

            winners_total = sum(winners)
        if mode < 3:
            print("[Episode %d] %s (%s) vs %s (%s) %d:%d of %d games (%.2f%%)" % (episode, \
                players[0].name, players[0].player, \
                players[1].name, players[1].player, \
                winners[0], winners[1], winners_total, \
                (winners[mode] / winners_total) * 100.0))
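            # snapshot the raw layer weights whenever the agent sets a new
            # best win rate against the benchmark bot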
            if (winners[mode] / winners_total) * 100.0 > self.max_wr:
                self.max_wr = (winners[mode] / winners_total) * 100.0
                w1 = self.l1_W.eval()
                b1 = self.l1_b.eval()
                w2 = self.l2_W.eval()
                b2 = self.l2_b.eval()
                np.savetxt("w1.txt", w1)
                np.savetxt("w2.txt", w2)
                np.savetxt("b1.txt", b1)
                np.savetxt("b2.txt", b2)
                with open("max_wr.txt", "w") as text_file:
                    text_file.write(str(self.max_wr))
        else:
            print("[Episode %d] %s (%s) vs %s (%s) %d:%d of %d games (%.2f%%)" % (episode, \
                players[0].name, players[0].player, \
                players[1].name, players[1].player, \
                winners[0], winners[1], winners_total, \
                (winners[0] / winners_total) * 100.0))
Example #7
    def play(self):
        game = Game.new()
        game.play([TDAgent(Game.TOKENS[0], self), Today_bot(Game.TOKENS[1])], draw=True)
Example #8
    def play(self):
        game = Game.new()
        game.play([TDAgent(Game.TOKENS[0], self), HumanAgent(Game.TOKENS[1])], draw=True)
Example #9
    # (the listing begins mid-function; the loop header below is restored
    # from the identical copy of game_to_pos in Example #13)
    for i, tup in enumerate(game.grid):
        if len(tup) == 0:
            pos[i + 1] = 0
        elif tup[0] == 'x':
            pos[i + 1] = len(tup)
        else:
            pos[i + 1] = -len(tup)
    return pos
    

if __name__ == '__main__':

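    # pubeval(race, pos): Tesauro's public benchmark evaluator; the first
    # argument selects the race-phase weight table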
    print(pubeval(False, [0] +
                  [-2,  0,  0,  0,  0,  5] +
                  [ 0,  3,  0,  0,  0, -5] +
                  [ 5,  0,  0,  0, -3,  0] +
                  [-5,  0,  0,  0,  0,  2] +
                  [ 0] +
                  [ 0,  0]))

    from backgammon.game import Game

    g = Game.new()
    print(pubeval(False, game_to_pos(g)))

    actions = g.get_actions((5, 6), 'x', nodups=True)
    for a in sorted([str(foo) for foo in actions]):
        print(a)
    print()
    actions = g.get_actions((5, 6), 'o', nodups=True)
    for a in sorted([str(foo) for foo in actions]):
        print(a)
    
Example #10
    def random_selfplay(self):
        players = [RandomAgent(Game.TOKENS[0]), RandomAgent(Game.TOKENS[1])]
        game = Game.new()
        game.SLEEP = 0
        winner = game.play(players, draw=True)
Example #11
    def play(self, ts=False):
        game = Game.new()
        game.play([TDAgent(Game.TOKENS[0], self),
                   HumanAgent(Game.TOKENS[1])],
                  draw=True,
                  ts=ts)
Example #12
    def train(self, episodes=5000):
        tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
        summary_writer = tf.train.SummaryWriter('{0}{1}'.format(self.summary_path, int(time.time())),
                                                graph_def=self.sess.graph_def)

        # the agent plays against itself, making the best move for each player
        players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]

        validation_interval = 1000
        report_freq = 10

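        # running counters used to report training throughput every
        # report_freq games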
        prev_time = time.time()
        prev_step = self.sess.run(self.global_step)
        plies_per_batch = 0

        for episode in range(episodes):
            if episode != 0 and episode % validation_interval == 0:
                self.test(episodes=100)

            game = Game.new()
            player_num = random.randint(0, 1)

            x = game.extract_features(players[player_num].player)

            game_step = 0
            while not game.is_over():
                game.next_step(players[player_num], player_num)
                player_num = (player_num + 1) % 2

                x_next = game.extract_features(players[player_num].player)
                V_next = self.get_output(x_next)
                self.sess.run(self.train_op, feed_dict={ self.x: x, self.V_next: V_next })

                x = x_next
                game_step += 1

            winner = game.winner()

            _, global_step, summaries, _ = self.sess.run([
                self.train_op,
                self.global_step,
                self.summaries_op,
                self.reset_op
            ], feed_dict={ self.x: x, self.V_next: np.array([[winner]], dtype='float') })

            print("Game %d/%d (Winner: %s) in %d turns" % (episode, episodes, players[winner].player, game_step))
            plies_per_batch += game_step
            if episode != 0 and episode % report_freq == 0:
                now = time.time()
                elapsed_time = now - prev_time
                steps_per_sec = (global_step - prev_step) / elapsed_time
                games_per_sec = report_freq / elapsed_time
                plies_per_game = plies_per_batch / report_freq
                print('e=%.2f sps=%.2f gps=%.2f ppg=%.1f global=%d prev=%d' % (elapsed_time, steps_per_sec, games_per_sec, plies_per_game, global_step, prev_step))
                
                summary_writer.add_summary(summaries, global_step=global_step)
                
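                # hand-built Summary protos let us log scalar rates that are
                # computed in Python rather than inside the graph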
                s1 = tf.Summary(value=[tf.Summary.Value(tag='rate/global_steps_sec',
                                                        simple_value=steps_per_sec)])
                summary_writer.add_summary(s1, global_step)
                
                s2 = tf.Summary(value=[tf.Summary.Value(tag='rate/games_sec',
                                                        simple_value=games_per_sec)])
                summary_writer.add_summary(s2, global_step)
                
                s3 = tf.Summary(value=[tf.Summary.Value(tag='rate/plies_per_game',
                                                        simple_value=plies_per_game)])
                summary_writer.add_summary(s3, global_step)
                
                self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)
                prev_time = now
                prev_step = global_step
                plies_per_batch = 0

        summary_writer.close()

        self.test(episodes=1000)
Example #13
def game_to_pos(game):
    # (restored header; the listing begins mid-function) encode the board in
    # pubeval's 28-slot scheme: positive counts are 'x' checkers, negative
    # are 'o'; the pos[0] line below is an assumption (opponent bar count)
    pos = [0] * 28
    pos[0] = -len(game.bar_pieces['o'])
    pos[25] = len(game.bar_pieces['x'])
    pos[26] = len(game.off_pieces['x'])
    pos[27] = -len(game.off_pieces['o'])
    for i, tup in enumerate(game.grid):
        if len(tup) == 0:
            pos[i + 1] = 0
        elif tup[0] == 'x':
            pos[i + 1] = len(tup)
        else:
            pos[i + 1] = -len(tup)
    return pos


if __name__ == '__main__':

    print(pubeval(False, [0] + [-2, 0, 0, 0, 0, 5] + [0, 3, 0, 0, 0, -5] +
                  [5, 0, 0, 0, -3, 0] + [-5, 0, 0, 0, 0, 2] + [0] + [0, 0]))

    from backgammon.game import Game

    g = Game.new()
    print(pubeval(False, game_to_pos(g)))

    actions = g.get_actions((5, 6), 'x', nodups=True)
    for a in sorted([str(foo) for foo in actions]):
        print(a)
    print()
    actions = g.get_actions((5, 6), 'o', nodups=True)
    for a in sorted([str(foo) for foo in actions]):
        print(a)
Example #14
    def train(self):
        tf.train.write_graph(self.sess.graph_def,
                             self.model_path,
                             'td_gammon.pb',
                             as_text=False)
        summary_writer = tf.summary.FileWriter('{0}{1}'.format(self.summary_path, int(time.time())), self.sess.graph_def)

        # the agent plays against itself, making the best move for each player
        players = [
            TDAgent(Game.TOKENS[0], self),
            TDAgent(Game.TOKENS[1], self)
        ]

        validation_interval = 500
        episodes = 5000

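        # wall-clock anchor for the whole run; used by the disabled
        # average-time report further down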
        train_start_ts = time.time()
        for episode in range(episodes):
            start_ts = time.time()
            if episode != 0 and episode % validation_interval == 0:
                print('Episode:', episode)
                write('Episode: %d' % episode)
                self.test(episodes=100)
            game = Game.new()
            player_num = random.randint(0, 1)

            x = game.extract_features(players[player_num].player)

            game_step = 0
            while not game.is_over():
                game.next_step(players[player_num], player_num)
                player_num = (player_num + 1) % 2

                x_next = game.extract_features(players[player_num].player)
                V_next = self.get_output(x_next)
                self.sess.run(self.train_op, feed_dict={ self.x: x, self.V_next: V_next })

                x = x_next
                game_step += 1

            winner = game.winner()

            _, global_step, summaries, _ = self.sess.run([
                self.train_op,
                self.global_step,
                self.summaries_op,
                self.reset_op,
            ], feed_dict={ self.x: x, self.V_next: np.array([[winner]], dtype='float') })
            summary_writer.add_summary(summaries, global_step=global_step)

            end_ts = time.time()
            print("%.2f - Game %d/%d (Winner: %s) in %d turns (%.2f secs)" %
                  (self.k, episode, episodes, players[winner].player,
                   game_step, end_ts - start_ts))
            """if episode in [9, 99, 999, 9999, 99999]:
                print("%d games avg time: %.2f secs" % (episode+1, (end_ts - train_start_ts) / (episode+1)))
            """
            self.saver.save(self.sess,
                            self.checkpoint_path + 'checkpoint',
                            global_step=global_step)

        summary_writer.close()
        write('Episode: %d' % episodes)
        self.test(episodes=100)