Example no. 1
def _main(unused_argv):
    if len(unused_argv) > 1:
        start_epi = int(unused_argv[1])
    else:
        start_epi = 0
    if len(unused_argv) > 2:
        num_episodes = int(unused_argv[2])
    else:
        num_episodes = 100

    parent_proc = psutil.Process()
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess,
                          screen_size,
                          minimap_size,
                          output_size,
                          learning_rate,
                          name="main")
        targetDQN = dqn.DQN(sess,
                            screen_size,
                            minimap_size,
                            output_size,
                            learning_rate,
                            name="target")
        sess.run(init)

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        print("memory before starting the iteration : %s (kb)" %
              (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        for episode in range(start_epi, num_episodes):
            e = 1.0 / ((episode / 50) + 2.0)  # decaying exploration rate
            with sc2_env.SC2Env("Odyssey",
                                agent_race=myrace,
                                bot_race=botrace,
                                difficulty="1",
                                visualize=visualize) as env:

                agent = minerva_agent.MinervaAgent(mainDQN)
                run_result = run_loop([agent], env, sess, e, mainDQN,
                                      targetDQN, copy_ops, 5000)
                agent.close()
                reward = run_result[0].reward
                if reward > 0:
                    env.save_replay("victory/")
                #else:
                #    env.save_replay("defeat/")

            children = parent_proc.children(recursive=True)
            for child in children:
                print("remaining child proc :", child)
            print(
                "memory after exit %d'th sc2env : %s (kb)" %
                (episode, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

            mainDQN.saveWeight()
            print("networks were saved, %d'th game result :" % episode, reward)
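Note: most examples on this page call a get_copy_var_ops helper that the excerpts never show. A minimal sketch of what such a helper usually looks like, assuming TF1-style variable scopes named "main" and "target" as above (an illustration, not the authors' exact code):

import tensorflow as tf


def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    # Build assign ops that overwrite each trainable variable in the
    # destination scope with its counterpart from the source scope.
    op_holder = []
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder

Running sess.run(copy_ops) then performs a hard copy of the main network's weights into the target network.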
Example no. 2
    def onBeginTraining(self):
        ue.log("starting avoid agent training")
        self.INPUT_SIZE = 10
        self.OUTPUT_SIZE = 2
        self.DISCOUNT_RATE = 0.99
        self.REPLAY_MEMORY = 50000
        self.BATCH_SIZE = 64
        self.TARGET_UPDATE_FREQUENCY = 5
        self.MAX_EPISODES = 5000
        self.episode = 200
        self.state = np.zeros(10)
        self.next_state = np.zeros(10)
        self.action = 1
        self.reward = 0.0
        self.done = False
        self.step_count = 0

        self.replay_buffer = deque(maxlen=self.REPLAY_MEMORY)
        self.last_100_game_reward = deque(maxlen=100)
        self.sess = tf.compat.v1.Session()

        self.mainDQN = dqn.DQN(self.sess,
                               self.INPUT_SIZE,
                               self.OUTPUT_SIZE,
                               name="main")
        self.targetDQN = dqn.DQN(self.sess,
                                 self.INPUT_SIZE,
                                 self.OUTPUT_SIZE,
                                 name="target")
        self.sess.run(tf.compat.v1.global_variables_initializer())

        self.copy_ops = self.get_copy_var_ops(dest_scope_name="target",
                                              src_scope_name="main")
        self.sess.run(self.copy_ops)
        pass
Example no. 3
def main():
    max_episodes = 1000

    # store the previous observations in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:  # big penalty
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:  # Good enough
                    break

            print("Episode: {}\tsteps: {}".format(episode, step_count))
            if step_count > 10000:
                break

            if episode % 10 == 1:  # train every 10 episodes
                # Get a random batch of experiences.
                for _ in range(50):
                    # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss: ", loss)
                # copy q-net -> target_net
                sess.run(copy_ops)

        bot_play(mainDQN)
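Note: replay_train, used by Example no. 3 and the other two-network examples, is also not shown in these excerpts. A sketch under the usual assumptions (the DQN class exposes predict, update, input_size and output_size; DISCOUNT_RATE is an assumed hyperparameter):

import numpy as np

DISCOUNT_RATE = 0.99  # assumed discount factor


def replay_train(mainDQN, targetDQN, train_batch):
    # Fit the main network on a minibatch, bootstrapping each target
    # value from the (periodically synced) target network.
    x_stack = np.empty(0).reshape(0, mainDQN.input_size)
    y_stack = np.empty(0).reshape(0, mainDQN.output_size)

    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            Q[0, action] = reward + DISCOUNT_RATE * np.max(
                targetDQN.predict(next_state))

        x_stack = np.vstack([x_stack, state])
        y_stack = np.vstack([y_stack, Q])

    # update() is assumed to return (loss, train_op result).
    return mainDQN.update(x_stack, y_stack)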
Example no. 4
def main():
    global turn
    max_episodes = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name='main')
        targetDQN = dqn.DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            turn = 1
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            board = np.zeros([19, 19])
            while not done:
                state = np.reshape(board, [1, 19 * 19])
                if turn == 1:  # the DQN's turn
                    if np.random.rand(1) < e:
                        action_xpos, action_ypos = get_random_action_pos(board)
                        board[action_xpos][action_ypos] = 1
                    else:
                        max_q_reshaped = np.reshape(mainDQN.predict(board),
                                                    [19, 19])
                        max_q_action_xpos, max_q_action_ypos = np.unravel_index(
                            np.argmax(max_q_reshaped, axis=None),
                            max_q_reshaped.shape)
                        if (board[max_q_action_xpos][max_q_action_ypos] !=
                                0):  # if the max-Q position already has a stone,
                            max_q_action_xpos, max_q_action_ypos = get_random_action_pos(
                                board)  # fall back to a random action
                        board[max_q_action_xpos][max_q_action_ypos] = 1
                elif turn == 2:  # the rule-based AI's turn
                    RuleBasedAi.rulebased(board, turn)
                turn = game.finishcheck(board, turn)
                next_state = np.reshape(board, [1, 19 * 19])
                reward, done = get_reward_done(turn)
                replay_buffer.append((state, action_xpos * 19 + action_ypos,
                                      reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                step_count += 1
                if turn == 1:
                    turn = 2
                elif turn == 2:
                    turn = 1
            print("Episode: {} steps: {}".format(episode, step_count))

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("loss: ", loss)
                sess.run(copy_ops)
Example no. 5
def main():
    max_episodes = 2000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")

        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            eps = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < eps:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)

                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))

                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1

                if step_count > 10000:  # Good Enough
                    break

            print("Episode: {} steps: {}".format(episode, step_count))

            if step_count > 10000:
                break

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                print("Loss : ", loss)

                # copy q_net ==> target_net
                sess.run(copy_ops)

        bot_play(mainDQN)
def restore():
    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name='main')
        targetDQN = dqn.DQN(sess, input_size, output_size, name='target')
        saver = tf.train.Saver()
        saver.restore(sess, "./Backup/DQN_CartPole_2015.ckpt")

        bot_play(mainDQN)
Example no. 7
File: DQN02.py Project: sjsong08/RL
def main():
    max_episodes = 3000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")

        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -1000

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 200:
                    break

            print("Episode: {} step: {}".format(episode, step_count))
            if step_count > 200:
                pass

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = ddqn_replay_train(mainDQN, targetDQN, minibatch)

                print("Loss: ", loss)
                sess.run(copy_ops)

        env2 = wrappers.Monitor(env, 'gym-results', force=True)

        for i in range(200):
            bot_play(mainDQN, env=env2)

        env2.close()
        gym.upload("gym-results", api_key="sk_VT2wPcSS0ylnlPORltmQ")
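Note: Example no. 7 trains with ddqn_replay_train rather than replay_train. The Double DQN variant changes only how the bootstrap target is formed: the main network chooses the next action and the target network scores it. A sketch under the same assumptions as the replay_train sketch above:

import numpy as np

DISCOUNT_RATE = 0.99  # assumed discount factor


def ddqn_replay_train(mainDQN, targetDQN, train_batch):
    x_stack = np.empty(0).reshape(0, mainDQN.input_size)
    y_stack = np.empty(0).reshape(0, mainDQN.output_size)

    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            # Action selected by the main network, evaluated by the target network.
            next_action = np.argmax(mainDQN.predict(next_state))
            Q[0, action] = reward + DISCOUNT_RATE * targetDQN.predict(
                next_state)[0, next_action]

        x_stack = np.vstack([x_stack, state])
        y_stack = np.vstack([y_stack, Q])

    return mainDQN.update(x_stack, y_stack)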
Example no. 8
def main():
    # store the previous observations in replay memory
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        # separate networks
        mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="main")
        targetDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="target")
        sess.run(tf.global_variables_initializer())

        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")

        sess.run(copy_ops)

        step_list = []
        with open('long/log_ddqn', 'w') as f:
            for episode in range(MAX_EPISODES):
                e = 1. / ((episode / 10) + 1)
                done = False
                step_count = 0
                state = env.reset()

                while not done:
                    if np.random.rand() < e:
                        action = env.action_space.sample()
                    else:
                        # Choose an action by greedily from the Q-network
                        action = np.argmax(mainDQN.predict(state))

                    # Get new state and reward from environment
                    next_state, reward, done, _ = env.step(action)

                    if done:  # Penalty
                        reward = -1

                    # Save the experience to our buffer
                    replay_buffer.append(
                        (state, action, reward, next_state, done))

                    if len(replay_buffer) > BATCH_SIZE:
                        minibatch = random.sample(replay_buffer, BATCH_SIZE)
                        loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                    if step_count % TARGET_UPDATE_FREQUENCY == 0:
                        sess.run(copy_ops)

                    state = next_state
                    step_count += 1

                f.write("Episode\t{}\tSteps\t{}\n".format(episode, step_count))
                step_list.append(step_count)

    plt.bar(range(len(step_list)), step_list, color="blue")
    plt.show()
Example no. 9
def main(unused_argv):
    parent_proc = psutil.Process()
    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess,
                          FLAGS.screen_size,
                          FLAGS.minimap_size,
                          output_size,
                          FLAGS.learning_rate,
                          name="main")
        targetDQN = dqn.DQN(sess,
                            FLAGS.screen_size,
                            FLAGS.minimap_size,
                            output_size,
                            FLAGS.learning_rate,
                            name="target")

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        print("memory before starting the iteration : %s (kb)" %
              (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        for episode in range(FLAGS.start_episode, FLAGS.num_episodes):
            e = 1.0 / ((episode / 50) + 2.0)  # decaying exploration rate
            with sc2_env.SC2Env(FLAGS.map_name,
                                screen_size_px=(FLAGS.screen_size,
                                                FLAGS.screen_size),
                                minimap_size_px=(FLAGS.minimap_size,
                                                 FLAGS.minimap_size),
                                agent_race=FLAGS.agent_race,
                                bot_race=FLAGS.bot_race,
                                difficulty=FLAGS.difficulty,
                                visualize=FLAGS.visualize) as env:

                agent = minerva_agent.MinervaAgent(mainDQN)
                run_result = run_loop([agent], env, sess, e, mainDQN,
                                      targetDQN, copy_ops, 5000)
                agent.close()
                reward = run_result[0].reward
                if reward > 0:
                    env.save_replay("victory/")
                #else:
                #    env.save_replay("defeat/")

            children = parent_proc.children(recursive=True)
            for child in children:
                print("remaining child proc :", child)
            print(
                "memory after exit %d'th sc2env : %s (kb)" %
                (episode, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

            mainDQN.saveWeight()
            print("networks were saved, %d'th game result :" % episode, reward)
Example no. 10
def main():
    # Dict of all games for generalization purposes, values are:
    # 0: play_game func, 1: Which environment to use, 2: Subfolder for checkpoints, log and figures, 3: Plotting func
    games = {
        "tictactoe":
        [play_tictactoe, g.tictactoe, "tictactoe", log.plotTicTacToe]
    }

    # Here you can choose which of the games declared above you want to train, feel free to change!
    game = games["tictactoe"]

    environment = game[1]()
    state, gamma, copy_step, num_states, num_actions, hidden_units, max_experiences, min_experiences, batch_size, alpha, epsilon, min_epsilon, decay = environment.variables

    nn = dqn.DQN(num_states, num_actions, hidden_units, gamma, max_experiences,
                 min_experiences, batch_size, alpha)

    model_name = ""
    directory = "tictactoe/models/" + model_name + "/TrainNet/"
    nn.model = tf.saved_model.load(directory)

    won, tie = game[0](environment, nn)

    if tie:
        print("It's a tie!")
    elif won:
        print("You lost! The AI won!")
    else:
        print("You won!")
Example no. 11
def main(unused_argv):
    replay_list = []
    if FLAGS.replay:
        REPLAY_PATH = REPLAY_HOME + FLAGS.replay
    else:
        REPLAY_PATH = REPLAY_HOME

    for root, dirs, files in os.walk(REPLAY_PATH):
        for subdir in dirs:
            tmp = os.path.join(root, subdir)
            if tmp[-10:] == '.SC2Replay':
                replay_list.append(tmp)
        for file1 in files:
            tmp = os.path.join(root, file1)
            if tmp[-10:] == '.SC2Replay':
                replay_list.append(tmp)

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess,
                          FLAGS.screen_size,
                          FLAGS.minimap_size,
                          output_size,
                          FLAGS.learning_rate,
                          name="main")

        for _ in range(FLAGS.repeat):
            for replay in replay_list:
                start_time = time.time()
                run_loop(replay, 1, mainDQN)
                run_loop(replay, 2, mainDQN)
                mainDQN.saveWeight()
                print("networks were updated / replay :", replay)
                elapsed_time = time.time() - start_time
                print("Took %.3f seconds... " % (elapsed_time))
def main():
    max_episodes = 50000

    # store the previous observation in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, h_size=10,
                          l_rate=learning_rate, name='main')
        tf.global_variables_initializer().run()

        for episode in range(max_episodes):
            # reset environment and get first new observation

            e = 1. / ((episode / 10) + 1)  # exploration & exploitation rate
            step_count = 0
            explore_count = 0
            done = False
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()   # take a random action
                    explore_count += 1
                else:
                    action = np.argmax(mainDQN.predict(state))  # act on the highest Q-value

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                    print('memory full')
                #
                # if episode > 400:
                #     print ('step:',step_count, 'ene_cnt:',explore_count, 'action', action, reward, done)

                step_count += 1
                state = next_state
                if step_count > 10000:
                    break

            print("Episode: {} steps: {} e&e: {}".format(episode, step_count, explore_count))
            # if step_count > 10000:
            #     pass

            if episode % 10 == 1:
                print('buffer length:', len(replay_buffer))
                for j in range(50):
                    # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)

                    # print(j,loss)

                print ("Loss: ", loss)

        bot_play(mainDQN)
Example no. 13
    def __init__(self,
                 input_size=10,
                 TICKER='MSFT',
                 BATCH_SIZE=128,
                 GAMMA=0.999,
                 EPS_START=0.9,
                 EPS_END=0.05,
                 EPS_DECAY=200,
                 TARGET_UPDATE=10,
                 REPLAY_MEMORY_CAPACITY=10000,
                 NUM_EPISODES=1,
                 hidden_layer=120,
                 actions=3):

        self.TICKER = TICKER
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_DECAY = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE
        self.NUM_EPISODES = NUM_EPISODES
        self.fd = financial_data.financial_data(input_size)
        self.date = self.fd.norm_data_ls[self.fd.ticker_ls.index(TICKER)].date
        self.policy_net = dqn.DQN(input_size, hidden_layer, actions)
        self.target_net = dqn.DQN(input_size, hidden_layer, actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = replay_memory.ReplayMemory(REPLAY_MEMORY_CAPACITY)
        self.steps_done = 0
        self.episode_durations = []
        self.actions = actions
        self.input_size = input_size
        self.action_index = ['Buy', 'Sell', 'Hold']
        self.reward_list = []
        self.episode_list = []
        self.episode_len = 1200
        self.money = self.fd.norm_data_ls[self.fd.ticker_ls.index(
            TICKER)].Close.values[0] * 20
        self.money_list = []
        self.loss_list = []
        self.action_list = []
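Note: Example no. 13 only shows the constructor. EPS_START, EPS_END, EPS_DECAY and steps_done are the usual ingredients of an exponentially annealed epsilon-greedy policy; a hypothetical select_action method built from these attributes (not part of the excerpt) might look like:

import math
import random

import torch


def select_action(self, state):
    # Epsilon decays exponentially from EPS_START toward EPS_END as steps_done grows.
    eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * math.exp(
        -1.0 * self.steps_done / self.EPS_DECAY)
    self.steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Greedy: pick the action with the largest predicted Q-value.
            return self.policy_net(state).max(1)[1].view(1, 1)
    return torch.tensor([[random.randrange(self.actions)]], dtype=torch.long)

Here state is assumed to be a batch-of-one tensor matching policy_net's input.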
Example no. 14
    def __init__(self, N_ACTIONS, memory_path=None):
        if memory_path is None:
            ### MEMORY HYPERPARAMETERS
            # Number of experiences the Memory can keep
            self.memory = memory.Memory(1000000)
        else:
            self.memory = pkl.load(open(memory_path, 'rb'))

        self.model = dqn.DQN()
        # Set up tensorboard
        self.model.set_up_board()
Example no. 15
def main():
    max_episodes = 2000

    #store the previous observations in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        init = tf.global_variables_initializer()
        sess.run(init)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                #Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                #Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("Episode: {}        steps: {}".format(episode, step_count))
            if step_count > 10000:
                pass
                #break

            if episode % 10 == 1:
                # Get a random batch of experiences.
                for _ in range(50):
                    #Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                    print("Loss: ",loss)

        bot_play(mainDQN)
Example no. 16
    def game_play_start(self, type):
        self.replay_buffer = deque(maxlen=self.REPLAY_MEMORY)
        self.last_100_game_reward = deque(maxlen=100)
        self.sess = tf.compat.v1.Session()

        self.mainDQN = dqn.DQN(self.sess,
                               self.INPUT_SIZE,
                               self.OUTPUT_SIZE,
                               name="main")
        self.targetDQN = dqn.DQN(self.sess,
                                 self.INPUT_SIZE,
                                 self.OUTPUT_SIZE,
                                 name="target")
        self.sess.run(tf.compat.v1.global_variables_initializer())

        self.copy_ops = self.get_copy_var_ops(dest_scope_name="target",
                                              src_scope_name="main")
        self.sess.run(self.copy_ops)
        # from here on, the loop is controlled from the BP (Blueprint) side
        pass
Example no. 17
    def run(self):
        import dqn
        with tf.Session() as sess:
            self.sess = sess
            self.mainDQN = dqn.DQN(sess,
                                   self.input_size,
                                   self.output_size,
                                   name="main")
            self.targetDQN = dqn.DQN(sess,
                                     self.input_size,
                                     self.output_size,
                                     name="target")
            self.tempDQN = dqn.DQN(sess,
                                   self.input_size,
                                   self.output_size,
                                   name="temp")
            tf.global_variables_initializer().run()

            episode = 5100
            try:
                self.mainDQN.restore(episode)
                self.targetDQN.restore(episode)
                self.tempDQN.restore(episode)
            except NotFoundError:
                print("save file not found")

            self.copy_ops = self.get_copy_var_ops()
            self.copy_ops_temp = self.get_copy_var_ops(dest_scope_name="main",
                                                       src_scope_name="temp")
            self.copy_ops_temp2 = self.get_copy_var_ops(dest_scope_name="temp",
                                                        src_scope_name="main")
            sess.run(self.copy_ops)
            sess.run(self.copy_ops_temp2)

            predict_thread = threading.Thread(target=self.predict)
            train_thread = threading.Thread(target=self.train)
            predict_thread.start()
            train_thread.start()
            train_thread.join()
            predict_thread.join()
Example no. 18
def main():
    # store the previous observations in replay memory
    replay_buffer = deque(maxlen=REPLAY_MEMORY)
    last_100_game_reward = deque(maxlen=100)

    with tf.compat.v1.Session() as sess:
        mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE)
        init = tf.compat.v1.global_variables_initializer()
        sess.run(init)

        for episode in range(MAX_EPISODE):
            e = annealing_epsilon(episode, MIN_E, 1.0,
                                  EPSILON_DECAYING_EPISODE)
            done = False
            state = env.reset()

            step_count = 0
            while not done:

                if np.random.rand() < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)

                if done:
                    reward = -1

                sleep(0.01)
                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1

                if len(replay_buffer) > BATCH_SIZE:
                    minibatch = random.sample(replay_buffer, BATCH_SIZE)
                    train_minibatch(mainDQN, minibatch)

            print("[Episode {:>5}]  steps: {:>5} e: {:>5.2f}".format(
                episode, step_count, e))

            # CartPole-v0 Game Clear Logic
            last_100_game_reward.append(step_count)
            if len(last_100_game_reward) == last_100_game_reward.maxlen:
                avg_reward = np.mean(last_100_game_reward)
                if avg_reward > 199.0:
                    print("Game Cleared within {} episodes with avg reward {}".
                          format(episode, avg_reward))
                    break

        bot_play(mainDQN)
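Note: Example no. 18 computes epsilon with an annealing_epsilon helper that the excerpt omits. A plausible linear-annealing sketch, assuming MIN_E is the floor and EPSILON_DECAYING_EPISODE the number of episodes over which epsilon decays:

def annealing_epsilon(episode, min_e, max_e, target_episode):
    # Anneal epsilon linearly from max_e down to min_e over target_episode
    # episodes, then hold it at min_e.
    slope = (min_e - max_e) / target_episode
    return max(min_e, slope * episode + max_e)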
Example no. 19
def main(arglist):
    env = gym.make(arglist.scenario)
    writer = SummaryWriter(log_dir='./logs/')

    actor = agent.Actor(env.observation_space.shape[0], env.action_space.n,
                        arglist.lr, arglist.tau).to(device)
    actor.eval()
    target_actor = agent.Actor(env.observation_space.shape[0],
                               env.action_space.n, arglist.lr,
                               arglist.tau).to(device)
    target_actor.eval()

    dqn_algo = dqn.DQN(actor, target_actor, arglist.gamma, arglist.batch_size,
                       arglist.replay_buffer_size, arglist.eval,
                       arglist.update_time)
    dqn_algo.load('./saved/actor_' + str(arglist.load_episode_saved))

    t_step = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        done = False
        j = 0
        losses = 0
        total_reward = 0
        while not done:
            if not arglist.eval:
                env.render()

            action, value_action = dqn_algo.act(obs)

            obs2, reward, done, info = env.step(action)
            total_reward += reward

            if arglist.eval:
                losses += dqn_algo.train(t_step, value_action, [reward], obs,
                                         obs2, [done])
            obs = obs2
            j += 1
            t_step += 1

        dqn_algo.epislon_decay()

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            actor.save_model('./saved/actor_' + str(episode))

        print('reward: ', total_reward, 'episode:', episode)
        if arglist.eval:
            writer.add_scalar('Loss', losses / float(j), episode)
            writer.add_scalar('Reward', total_reward, episode)
            writer.add_scalar('Epsilon_decay', dqn_algo.epsilon, episode)

    env.close()
Example no. 20
    def __init__(self, parser):
        dqn_f = dqn.DQN()
        self.agent = agent.Agent(dqn_f, parser.atari_env)
        self.env = self.agent.makeEnvironment()

        tf_f = tf.Dnn(self.env.action_space.n, 32, parser.lr)
        dqn_f.set_params(tf_f, parser.C, parser.max_iter, parser.mem_size,
                         parser.exp_start, parser.exp_end, parser.last_fm,
                         parser.gamma)
        self.evaluation_freq = parser.eval_freq
        self.evaluation_number = parser.eval_num
        self.log = logger.Logger(parser.eval_num)
        self.init_number_in_replay_mem = parser.init_replay_size
def main():
    max_ep = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")

        cp_op = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")

        sess.run(cp_op)

        for episode in range(max_ep):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100
                replay_buffer.append((state, action, reward, next_state, done))

                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

                print "episode : {} step : {}".format(episode, step_count)
                if step_count > 1000
def main():
    max_episodes = 1000

    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size,
                          output_size)  #dqn.DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:

                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)

                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))

                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("episode: {}  step: {}".format(episode, step_count))

            if step_count > 10000:
                pass

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss: ", loss)

        bot_play(mainDQN)
def main():
    max_episodes = 5000

    # store the previous observations in replay memory so they can be sampled at random later
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episodes in range(max_episodes):
            e = 1. / ((episodes / 10) + 1)
            step_count = 0
            done = False

            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                # keep replay_buffer from growing too large: once it exceeds the limit, drop the oldest entry
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("Episodes : {} steps : {}".format(episodes, step_count))
            if step_count > 10000:
                pass
            # break

            # every 10 episodes, sample at random from the accumulated replay_buffer and train on it to update Q_pred
            if episodes % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss :", loss)

        bot_play(mainDQN)
Example no. 24
def main():
    max_episode = 5000

    replay_buff = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        sess.run(tf.global_variables_initializer())
        success_count = 0

        for episode in range(max_episode):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buff.append((state, action, reward, next_state, done))
                if len(replay_buff) > REPLAY_MEMORY:
                    replay_buff.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    success_count += 1
                    break

            print('Episode: {} steps: {}'.format(episode, step_count))
            if step_count > 10000 and success_count > 50:
                #pass
                break

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buff, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print('Loss: ', loss)

        bot_play(mainDQN)
def main():
    max_episode = 5000
    # store the previous obervations in the replay memory
    replay_buffer = deque()
    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()
        for episode in range(max_episode):
            e = 0.1/((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()
            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100 # big penalty

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:  # good enough (the pole stayed balanced this long)
                    break

            print ("Episode: {} steps {}".format(episode, step_count))
            if step_count > 10000:
                pass

            if episode % 10 == 1:  # train every 10 episodes
                # Get a random batch of experience
                for _ in range(50):
                    # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print ("Loss : ", loss)

        bot_play(mainDQN)
def main():
    max_episodes = 5000
    
    # store the previous observations in replay memory
    replay_buffer = deque()
    
    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()
        
        for episode in range(max_episodes):
            e = 1. / ((episode/10) + 1)
            done = False
            step_count = 0
            
            state = env.reset()
            
            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done: # big penalty
                    reward = -100
                
                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                
                state = next_state
                step_count += 1
                if step_count > 10000: # Good enough
                    break
            print("Episode: {} steps: {}".format(episode, step_count))
            if step_count > 10000:
                pass
                # break
Example no. 27
    def __init__(self):
        ps_hosts = FLAGS.ps_hosts.split(",")
        worker_hosts = FLAGS.worker_hosts.split(",")
        if FLAGS.job_name == "ps":
            server.join()
        elif FLAGS.job_name == "worker":
            self.no_of_steps = 1000000000000
            self.game_train_batch_size = 3
            self.env = environment.GymEnvironment('Pong-v0')
            self.sess = tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir="/tmp/train_logs",
                hooks=hooks)

            self.G = graph.Graph(self.env.actions(), self.sess)
            self.graph, self.graph_input, self.graph_action_value, self.graph_updated_action, self.loss = self.G.get_graph(
            )
            self.dqn = dqn.DQN(self.sess, gamma=0.8)
            self.sess.run(tf.global_variables_initializer())
            self.tf_merged_summary_op = tf.summary.merge_all()
            self.tf_writer = tf.summary.FileWriter('output', self.sess.graph)
Example no. 28
    def control_start(self):
        import dqn
        with tf.Session() as sess:
            mainDQN = dqn.DQN(sess,
                              self.input_size,
                              self.output_size,
                              name="main",
                              is_training=False)
            tf.global_variables_initializer().run()

            mainDQN.restore(100)

            for episode in range(self.max_episodes):
                done = False
                clear = False
                state = self.env.reset()

                while not done and not clear:
                    action = np.argmax(mainDQN.predict(state))
                    print(action)
                    next_state, reward, done, clear, max_x, _, _ = self.env.step(
                        action)
                    state = next_state
def main():
	a = Servo.servo()
    	b = degree_gyro_q_l.acc()
	global count
	global init_pwm_1
	global init_pwm_2
	global np_ML_data
	global start_time
	global memory_degree
	global memory_ang_vel    	
	global memory_acc_degree	
	global memory_semaphore	

	max_episodes = 2000
	## store the previous observations in replay memory
	replay_buffer = deque()

	que = []
	acc_que = []
	timecheck_list = []    	
	
	pwm_1 = init_pwm_1
	pwm_2 = init_pwm_2 
	
	## matplotlib data initialization ##
	#np_ML_data = np.array([[0, acc_gyro_pitch, b.pitch(), gyro_pitch_degree, init_pwm_1, init_pwm_2]])	

	
	with tf.Session() as sess:
		mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
		targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
		tf.global_variables_initializer().run()
		
		## initial copy q_net -> target_net
		copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
		sess.run(copy_ops)
	
		for episode in range(max_episodes):
			print "new episode initialization"
			e = 1. / ((episode / 10) + 1) 
			done = False
			step_count = 0
			pwm_left = init_pwm_1
			pwm_right = init_pwm_2			
			
			"""
			degree = memory_degree.read()
			acc_gyro_pitch = float(degree.rstrip('\x00'))
			ang_vel = memory_ang_vel.read()	
			p_ang_vel = float(ang_vel.rstrip('\x00'))
			"""
			"""
			timecheck_list.append(time.time())
                	loop_time = timecheck_list[1] - timecheck_list[0]
			timecheck_list.pop(0)
			
			acc_pitch_degree = b.pitch()			
			
			gyro_pitch_degree, _ = b.gyro_pitch(loop_time, gyro_pitch_degree)
			get_gyro_degree, p_ang_vel = b.gyro_pitch(loop_time, acc_gyro_pitch)
                	acc_gyro_pitch = np.sign(get_gyro_degree) * ((0.97 * abs(get_gyro_degree)) + (0.03 * abs(acc_pitch_degree)))	
			"""
			"""	
			state = np.array([acc_gyro_pitch, p_ang_vel, pwm_left, pwm_right])
			"""
			#state = np.array([acc_gyro_pitch, p_ang_vel])
			print "\n\n"	
			while not done:
				memory_semaphore.acquire(10)
				degree = memory_degree.read()
                        	acc_gyro_pitch = float(degree.rstrip('\x00'))
				ang_vel = memory_ang_vel.read()
                        	p_ang_vel = float(ang_vel.rstrip('\x00'))
				acc_degree = memory_acc_degree.read()
				acc_pitch = float(acc_degree.rstrip('\x00'))
				
				memory_semaphore.release()
				state = np.array([acc_gyro_pitch, p_ang_vel])

				print "\t\t\t<state> degree: %s, \tangular velocity: %s" %(state[0],  state[1])
				if np.random.rand(1) < e:
					action = np.random.randint(9)
				else:
					action = np.argmax(mainDQN.predict(state))
				print "Q: %s" % (mainDQN.predict(state))	
				pwm_left, pwm_right = step_action(action, pwm_left, pwm_right) 
				
				print "\t\t\t\t\t\t\t\t\t\t<action-motor> left: %s, right: %s <= %s" % (pwm_left, pwm_right, action_print(action))
			
				a.servo_1(pwm_left)
				a.servo_2(pwm_right)						
			
				time.sleep(0.01)
				
				## Get new state and reward from environment
				"""
				degree = memory_degree.read()
                        	acc_gyro_pitch = float(degree.rstrip('\x00'))
                        	ang_vel = memory_ang_vel.read()
                        	p_ang_vel = float(ang_vel.rstrip('\x00'))
				acc_degree = memory_acc_degree.read()
                                acc_pitch = float(acc_degree.rstrip('\x00'))
				"""
				memory_semaphore.acquire(10)
                                
				degree = memory_degree.read()
                                acc_gyro_pitch = float(degree.rstrip('\x00'))
  	                        ang_vel = memory_ang_vel.read()
                                p_ang_vel = float(ang_vel.rstrip('\x00'))
                                acc_degree = memory_acc_degree.read()
                                acc_pitch = float(acc_degree.rstrip('\x00'))
				
				memory_semaphore.release()
				
				"""
				timecheck_list.append(time.time())
                        	loop_time = timecheck_list[1] - timecheck_list[0]
				timecheck_list.pop(0)

                        	acc_pitch_degree = b.pitch()

                        	gyro_pitch_degree, _ = b.gyro_pitch(loop_time, gyro_pitch_degree)
                        	get_gyro_degree, p_ang_vel = b.gyro_pitch(loop_time, acc_gyro_pitch)
                        	acc_gyro_pitch = np.sign(get_gyro_degree) * ((0.97 * abs(get_gyro_degree)) + (0.03 * abs(acc_pitch_degree)))				
				"""
				next_state = np.array([acc_gyro_pitch, p_ang_vel])				
				
				"""
				next_state = np.array([acc_gyro_pitch, p_ang_vel, pwm_left, pwm_right])
				"""
				reward, done = reward_done_check(state, next_state)		
			
		
				## Save the experience to our buffer
				replay_buffer.append((state, action, reward, next_state, done))
				if len(replay_buffer) > REPLAY_MEMORY:
					replay_buffer.popleft()
				
				if done: 
                                    	
					"""
					if step_count < 10:
						print "\t\t\t<warm-up>"
						done = False
						pass
					"""	
				    	
                               		print "\t\t\t<finish state> degree: %s, \tangular velocity: %s" %(next_state[0], next_state[1])
					time.sleep(3)
					
					"""	
					degree = memory_degree.read()
                        		acc_gyro_pitch = float(degree.rstrip('\x00'))
                       			ang_vel = memory_ang_vel.read()
                       			p_ang_vel = float(ang_vel.rstrip('\x00'))
					"""
					"""	
					timecheck_list.append(time.time())
                               		loop_time = timecheck_list[1] - timecheck_list[0]
                               		timecheck_list.pop(0)

                           	  	acc_pitch_degree = b.pitch()

                                	gyro_pitch_degree, _ = b.gyro_pitch(loop_time, gyro_pitch_degree)
                                	get_gyro_degree, p_ang_vel = b.gyro_pitch(loop_time, acc_gyro_pitch)
                                	acc_gyro_pitch = np.sign(get_gyro_degree) * ((0.97 * abs(get_gyro_degree)) + (0.03 * abs(acc_pitch_degree)))
					"""
					#next_state = np.array([acc_gyro_pitch, p_ang_vel])
				
				#state = next_state
				step_count += 1
				if step_count > 10000:
					break
			
			print "Episode: {}  steps: {}".format(episode, step_count)	
			if step_count > 10000:
				pass
	
			if len(replay_buffer) > 10 and episode % 10 == 1: # train every 10 episode
				# Get a random batch of experiences.
				for _ in range(50):
					minibatch = random.sample(replay_buffer, 10)
					
					loss, _ = replay_train(mainDQN, targetDQN, minibatch)
					
				print "Loss: %s" % (loss)

				# copy q_net -> target_net
				sess.run(copy_ops)
Example no. 30
def main():
    max_episodes = 5000

    # store the previous observations in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        """
        Two networks are created here; both are built through sess.
        """
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        # initial copy q_net -> target_net
        """
        Start with identical weights (theta) in both networks. They must be the
        same at first; otherwise the later copy step would be meaningless.
        """
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        """Run the copy ops."""
        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:  #big penalty
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1

            print("Episode: {}  steps: {} ".format(episode, step_count))

            if episode % 10 == 1:  # train every 10 episodes
                # Get a random batch of experiences.
                for _ in range(50):
                    # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                    """
                    replay_train now takes mainDQN, targetDQN, minibatch
                    rather than just mainDQN, minibatch.
                    """

                print("Loss: ", loss)

                # copy q_net -> target_net
                sess.run(copy_ops)
                """Copy the trained main (Q) network weights into the target network."""

        bot_play(mainDQN)
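Note: many of the CartPole examples finish with bot_play(mainDQN), which is never defined on this page either. A sketch of a greedy evaluation rollout, assuming a Gym environment (CartPole-v0 is only an assumed default) and the same predict interface used above:

import gym
import numpy as np


def bot_play(mainDQN, env=None):
    # Play one episode greedily (no exploration) and report the total reward.
    if env is None:
        env = gym.make("CartPole-v0")  # assumed default environment
    state = env.reset()
    total_reward = 0
    while True:
        env.render()
        action = np.argmax(mainDQN.predict(state))
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            print("Total score: {}".format(total_reward))
            break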