Example 1
def replay():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            # Advance the game with the chosen action, and receive the reward and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            # Show the game at a speed a human can actually perceive. ^^;
            time.sleep(0.3)

        print('Games: %d Score: %d' % (episode + 1, total_reward))
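
All of the snippets in this collection rely on a handful of module-level imports and constants that the excerpts themselves never show. The header below is a plausible reconstruction, written against the TensorFlow 1.x API these examples use (tf.Session, tf.placeholder); the concrete constant values are illustrative assumptions, only the names are taken from the code above.

import random
import time

import numpy as np
import tensorflow as tf

# Illustrative values only; the examples merely require that these names exist.
SCREEN_WIDTH = 6
SCREEN_HEIGHT = 10
NUM_ACTION = 3                   # number of selectable actions
MAX_EPISODE = 10000              # episodes to play
OBSERVE = 100                    # episodes of pure exploration before epsilon decays
TRAIN_INTERVAL = 4               # train once every N frames
TARGET_UPDATE_INTERVAL = 1000    # refresh the target network every N frames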
Example 2
def replay():
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

        print('Games: %d Score: %d' % (episode + 1, total_reward))
Example 3
def replay():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            # Advance the game with the chosen action, and receive the reward and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            # Show the game at a speed a human can actually perceive. ^^;
            time.sleep(0.3)

        print('Games: %d Score: %d' % (episode + 1, total_reward))
Example 4
def replay(track, width, height, rand):
    sess = tf.Session()

    game = Game(track, width, height, show_game=True)
    brain = DQN(sess, width, height, CHANNEL, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if rand and np.random.rand() < 0.1:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.15)

        print('Games: %d Score: %d' % (episode + 1, total_reward))
Example 5
def replay():
	print('wake up the brain...')
	sess = tf.Session()

	game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
	brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

	saver = tf.train.Saver()
	ckpt = tf.train.get_checkpoint_state('model')
	saver.restore(sess, ckpt.model_checkpoint_path)

	for episode in range(MAX_EPISODE):
		terminal = False
		total_reward = 0

		state = game.reset()
		brain.init_state(state)

		while not terminal:
			action = brain.get_action()

			state, reward, terminal = game.step(action)
			total_reward += reward

			brain.remember(state, action, reward, terminal)

			time.sleep(0.3)

		print('episode: %d, score: %d' % (episode + 1, total_reward))
Example 6
def train():
    print("뇌세포 깨우는 중..")
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()
    epsilon = 1.0

    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                epsilon -= 1 / 1000

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
Example 7
def replay():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    # Set NUM_ACTION, the number of final outputs, i.e. the number of selectable actions.
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    # Load the saved model.
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward
            brain.remember(state, action, reward, terminal)
            time.sleep(0.3)

        print('Games: %d Score: %d' % (episode + 1, total_reward))
Example 8
def test():
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    total_succ = 0
    for episode in range(10000):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        step = 0
        while not terminal and step <= 200:
            action = brain.get_action()
            state, reward, terminal, succ = game.step(action)
            if terminal and succ:
                total_succ += 1
            step += 1

    print(total_succ)
Example 9
def train():
    with tf.Session() as sess:
        tf.set_random_seed(GLOBAL_SEED)
        brain = DQN(sess, observation_size, action_size)
        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()
        brain.update_target_network()
        time_step = 0
        total_reward_list = []

        for episode in range(MAX_EPISODE):
            done = False
            total_reward = 0
            epsilon = 1. / ((episode / 10) + 1)

            observation = env.reset()
            brain.init_state(observation)

            while not done:
                if np.random.rand() < epsilon:
                    action = random.randrange(action_size)
                else:
                    action = brain.get_action()

                observation, reward, done, _ = env.step(action)
                # print(observation, reward, done)
                total_reward += reward
                brain.remember(observation, action, reward, done)

                if time_step > 0:
                    if time_step % TRAIN_INTERVAL_FRAMES == 0:
                        _, loss = brain.train()
                    if time_step % TARGET_UPDATE_INTERVAL == 0:
                        brain.update_target_network()

                time_step += 1

            print('episode: %d total_reward: %d' % (episode, total_reward))

            total_reward_list.append(total_reward)

            if episode % 10 == 0:
                summary = sess.run(summary_merged,
                                   feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if episode % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
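
Example 9 drives env through the OpenAI Gym reset/step API (observation, reward, done, info) but never shows how env, observation_size, action_size, or the constants are created. The setup below is a hedged guess at that missing glue; CartPole is only a stand-in for whatever discrete-action environment the original used, and the constant values are assumptions.

import random

import gym
import numpy as np
import tensorflow as tf

GLOBAL_SEED = 42                 # assumed value
MAX_EPISODE = 2000               # assumed value
TRAIN_INTERVAL_FRAMES = 4        # assumed value
TARGET_UPDATE_INTERVAL = 1000    # assumed value

env = gym.make('CartPole-v1')    # any Gym environment with a discrete action space fits
env.seed(GLOBAL_SEED)
observation_size = env.observation_space.shape[0]
action_size = env.action_space.n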
Example 10
def replay():
    print('Loading..')
    sess = tf.Session()

    global_step = tf.Variable(0, trainable=False, name='global_step')
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION, global_step)
    #brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    saver.restore(sess, ckpt.model_checkpoint_path)

    server.accept()

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        #state = game.reset()
        id, _, _, _, state = server.readStatus()
        state = reshapeFromPacket(state)
        brain.init_state(state)

        while not terminal:

            action = brain.get_action()

            # Advance the game with the chosen action, and receive the reward and whether the game has ended.
            #state, reward, terminal = game.step(action)
            server.sendX(id, action)
            id, reward, totalScore, terminal, state = server.readStatus()
            state = reshapeFromPacket(state)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            # Show the game at a speed a human can actually perceive. ^^;
            #time.sleep(0.3)

        print(
            'Count of Play: %d total reward: %d' % (episode + 1, total_reward),
            "Action", action)
Example 11
def train():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    sess.run(tf.global_variables_initializer())

    # Initialize the target network.
    brain.update_target_network()

    time_step = 0
    epsilon = 1.0

    for episode in range(MAX_EPISODE):
        # Start the game.
        terminal = False

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        _, state, _, _ = game.first_step()
        brain.init_state(state)

        while not terminal:
            # Fetch the game record for this step.
            action, state, reward, terminal = game.step()

            # Store the current state in the brain.
            # The remembered states are used for training and for deciding the action to take next.
            brain.remember(state, action, reward, terminal)
            if (time_step > OBSERVE) and (time_step % TRAIN_INTERVAL) == 0:
                brain.train()
            # Update the target network.
            # if (time_step % TARGET_UPDATE_INTERVAL) == 0:
            #     brain.update_target_network()
            time_step += 1
        # if episode % 50 == 0:
        print(episode)
    save_model(sess)
Example 12
def replay():
    sess = tf.Session()
    brain = DQN(sess, observation_size, action_size)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        done = False
        total_reward = 0
        observation = env.reset()
        brain.init_state(observation)

        while not done:
            action = brain.get_action()
            observation, reward, done, _ = env.step(action)
            total_reward += reward
            brain.remember(observation, action, reward, done)
            time.sleep(0.3)

        print('episode: %d total_reward: %d' % (episode, total_reward))
Example 13
def train_rl(images, targets, folds, stochastic = False, test = False, base_rand = False): 
    print('start train rl')


    #print(images.shape)
    #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)

    #X_train = X_train.astype("float32", casting='unsafe')
    #X_val = X_val.astype("float32", casting='unsafe')
    #X_test = X_test.astype("float32", casting='unsafe')
    
    #print('check')
    #print(X_train.shape)
    with tf.Session() as sess:
        #onfig = get_config(FLAGS) or FLAGS
       
        model = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, n_act)
        
        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        
        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()
        
        print('total %s folds' % len(folds))
        
        #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)
#        X_train = X_train.astype("float32", casting='unsafe')
#        X_val = X_val.astype("float32", casting='unsafe')
#        X_test = X_test.astype("float32", casting='unsafe')

        ###

        # init target network
        model.update_target_network()
        
        # get next action from DQN
        epsilon = 1.0
        # def frame N
        t_step = 0
        tot_reward_list = []


        MAX_EPISODE = 10000
        n_img = len(targets)
        
        n_epi = n_img
        if stochastic: n_epi = MAX_EPISODE
        

        # call pred & loss 
        n_test = 3 
        if test:  #for debugging
            pred_all, loss_all = predict_all(images[0:n_test, :], targets[0:n_test, :])
            if not stochastic: n_epi = n_test
        else: pred_all, loss_all = predict_all(images, targets)
        
        #pred_all_train, loss_all_train = predict_all(X_train, y_train)

        #print(pred_all)

        # run simulation
        pred_rl = []
        for epi in range(n_epi):
            terminal = False
            tot_reward = 0

            #init game & get current state
            
            #state parsing
            state = np.expand_dims(images[epi], 0)
            #state = np.expand_dims(X_train[epi], 0)
            model.init_state(state)

            if np.random.rand() < epsilon:
                act = random.randrange(n_act)
            else:
                act = model.get_action()

            if epi > OBSERVE: epsilon -= 1/100
            if base_rand: act = random.randrange(n_act)
            
            #stochastic define
            if stochastic:
                ii = random.randrange(n_img)
                state = np.expand_dims(images[ii], 0)  
                #state = np.expand_dims(X_train[ii], 0)
                state_i = ii

            else:
                state = np.expand_dims(images[epi], 0)
                #state = np.expand_dims(X_train[epi], 0)
                state_i = epi
            
            # get model str by act
            choosen_model = model_list[act]
            
            # reward function
            if pred_all[choosen_model][state_i] == 1:
                reward = 1
                pred_rl.append(1)
            else:
                reward = -2
                pred_rl.append(0)


            tot_reward += reward

            model.remember(state, act, reward, terminal)

            if t_step > OBSERVE and t_step % TRAIN_INTERVAL == 0:
                # DQN train
                model.train()

            if t_step % TARGET_UPDATE_INTERVAL == 0:
                # target update
                model.update_target_network()

            t_step += 1

            print('epi: %d score: %d' % ((epi+1), tot_reward))

            tot_reward_list.append(tot_reward)

            if epi % 10 == 0:
                summary = sess.run(summary_merged, feed_dict={rewards: tot_reward_list})
                writer.add_summary(summary, t_step)
                tot_reward_list = []

            if epi % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=t_step)

        return tot_reward_list, pred_rl, pred_all
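
Example 13 indexes pred_all[choosen_model][state_i] and maps actions onto model_list, but neither helper is shown. The stub below is a purely hypothetical stand-in that makes the reward logic runnable end to end; the real project presumably evaluates actual pre-trained classifiers rather than returning random correctness.

import numpy as np

model_list = ['model_a', 'model_b', 'model_c']   # hypothetical candidate-model identifiers
n_act = len(model_list)                          # one action per candidate model

def predict_all(images, targets):
    # Hypothetical: return, for each candidate model, a 0/1 per-sample correctness
    # vector and a per-sample loss vector over the given data.
    n_samples = len(targets)
    pred_all = {name: np.random.randint(0, 2, size=n_samples) for name in model_list}
    loss_all = {name: np.random.rand(n_samples) for name in model_list}
    return pred_all, loss_all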
Example 14
def train():
    print('Waking up brain cells..')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    # Set NUM_ACTION, the number of final outputs, i.e. the number of selectable actions.
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    # Save and monitor the training results:
    # store and check the score obtained in each game.
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    # Save the model to a file.
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Sets when the DQN starts being used to select actions.
    # Before then actions are chosen at random, and epsilon decays as the game proceeds.
    epsilon = 1.0

    # Number of frames elapsed, used to pace training.
    time_step = 0
    # List of scores, kept to monitor training progress.
    total_reward_list = []

    # Start training.
    for episode in range(MAX_EPISODE):
        terminal = False  # game-over flag
        total_reward = 0  # total score earned in one game

        state = game.reset()  # reset the game
        brain.init_state(state)  # initialize the DQN with the starting state

        # Play until the green square collides with another square.
        while not terminal:

            # In the early phase of training (before episode 100), act at random.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # After 100 episodes, gradually reduce the fraction of random actions.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Receive the game state, the reward, and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the network object;
            # the remembered transitions are used to train the network.
            brain.remember(state, action, reward, terminal)

            # After 100 frames, train once every 4 frames.
            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()
            # Refresh the target network once every 1000 frames.
            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()
            time_step += 1

        # When the game ends, print and record the score.
        print('Games: %d Score: %d' % (episode + 1, total_reward))
        total_reward_list.append(total_reward)

        # Every 10 episodes log the scores; every 100 episodes save the model.
        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
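
The train() and replay() functions in these examples all lean on the same small DQN interface: init_state, get_action, remember, train, and update_target_network. None of the excerpts include that class, so the following is a minimal, self-contained sketch of what it could look like in TensorFlow 1.x. Layer sizes, replay-buffer length, discount factor, and learning rate are illustrative assumptions rather than the original implementation, and variants that pass extra constructor arguments (CHANNEL, a global_step variable) are not covered.

import random
from collections import deque

import numpy as np
import tensorflow as tf


class DQN:
    REPLAY_MEMORY = 10000   # assumed replay-buffer size
    BATCH_SIZE = 32         # assumed minibatch size
    GAMMA = 0.99            # assumed discount factor
    STATE_LEN = 4           # assumed number of stacked frames per state

    def __init__(self, session, width, height, n_action):
        self.session = session
        self.n_action = n_action
        self.width, self.height = width, height
        self.memory = deque(maxlen=self.REPLAY_MEMORY)
        self.state = None

        self.input_X = tf.placeholder(tf.float32, [None, width, height, self.STATE_LEN])
        self.input_A = tf.placeholder(tf.int64, [None])
        self.input_Y = tf.placeholder(tf.float32, [None])

        self.Q = self._build_network('main')            # online network
        self.target_Q = self._build_network('target')   # target network
        self.train_op = self._build_op()

        # ops that copy the online weights into the target network
        main_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='main')
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target')
        self.copy_ops = [t.assign(m) for m, t in zip(main_vars, target_vars)]

    def _build_network(self, name):
        with tf.variable_scope(name):
            h = tf.layers.conv2d(self.input_X, 32, [4, 4], padding='same', activation=tf.nn.relu)
            h = tf.layers.conv2d(h, 64, [2, 2], padding='same', activation=tf.nn.relu)
            h = tf.layers.dense(tf.layers.flatten(h), 512, activation=tf.nn.relu)
            return tf.layers.dense(h, self.n_action)

    def _build_op(self):
        # Q(s, a) for the actions actually taken, regressed toward the TD target input_Y
        one_hot = tf.one_hot(self.input_A, self.n_action)
        q_value = tf.reduce_sum(self.Q * one_hot, axis=1)
        cost = tf.reduce_mean(tf.square(self.input_Y - q_value))
        return tf.train.AdamOptimizer(1e-6).minimize(cost)

    def update_target_network(self):
        self.session.run(self.copy_ops)

    def init_state(self, state):
        # stack the first frame STATE_LEN times to build the initial state
        self.state = np.stack([state] * self.STATE_LEN, axis=2)

    def get_action(self):
        q = self.session.run(self.Q, feed_dict={self.input_X: [self.state]})
        return int(np.argmax(q[0]))

    def remember(self, state, action, reward, terminal):
        # slide the frame stack forward and store the transition
        next_state = np.append(self.state[:, :, 1:],
                               np.reshape(state, (self.width, self.height, 1)), axis=2)
        self.memory.append((self.state, next_state, action, reward, terminal))
        self.state = next_state

    def train(self):
        batch = random.sample(list(self.memory), min(len(self.memory), self.BATCH_SIZE))
        states, next_states, actions, rewards, terminals = map(np.array, zip(*batch))
        target_q = self.session.run(self.target_Q, feed_dict={self.input_X: next_states})
        targets = rewards + (1. - terminals.astype(np.float32)) * self.GAMMA * np.max(target_q, axis=1)
        self.session.run(self.train_op, feed_dict={self.input_X: states,
                                                   self.input_A: actions,
                                                   self.input_Y: targets})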
Example 15
            for i in range(4)[::-1]:
                print(i)
                time.sleep(1)
            PressKey(ENTER)
            time.sleep(0.05)
            ReleaseKey(ENTER)

            print("Learning Start !!!")
            for episode in range(args.max_episode):
                done = False
                total_reward = 0

                game.init_state()
                a = rgb2gray(game.state)
                dqn.init_state(rgb2gray(game.state))

                start = time.time()
                count = 0
                while not done:
                    if np.random.rand() < epsilon:
                        action = random.randrange(args.action_size)
                    else:
                        action = dqn.get_action()

                    if episode > args.observe:
                        epsilon -= 0.001

                    reward, done = game.step(action)
                    total_reward += reward
Example 16
def train():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start using the DQN to choose the next action.
    epsilon = 1.0
    # frame count
    time_step = 0
    total_reward_list = []


    # Start the game.
    for episode in range(MAX_EPISODE):

        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:

            if game.previous_price == 0 :
                now_price = driver.find_element_by_xpath(
                    '// *[ @ id = "cont_coin_info"] / div[1] / span[1]'
                ).text
                now_price = float(str(now_price).replace(",", ""))

                game.previous_price = now_price

                print("prepare..")

                time.sleep(0.5)

            # 1. Save the current price.

            now_price = driver.find_element_by_xpath(
                '// *[ @ id = "cont_coin_info"] / div[1] / span[1]'
            ).text
            now_price = float(str(now_price).replace(",", ""))

            game.now_price = now_price

            # 2. Total sell volume and total buy volume.

            total_sell = driver.find_element_by_xpath(
                '// *[ @ id = "txt_total_bid"]'
            ).text

            total_buy = driver.find_element_by_xpath(
                '//*[@id="txt_total_ask"]'
            ).text

            total_trade = float(str(total_sell).replace(",", "")) + float(str(total_buy).replace(",", ""))

            selling = [0 for _ in range(10)]
            buying = [0 for _ in range(10)]

            for num in range(1,11):
                _xpath = '//*[@id="contSellCoin"]/li['+ str(num) +']/div/p'
                bar = driver.find_element_by_xpath(
                    _xpath
                ).text

                percent = 100 * float(bar) / total_trade
                selling[num - 1] = percent

            for num in range(1,11):
                _xpath = '//*[@id="contBuyCoin"]/li[' + str(num) + ']/div/p'
                bar = driver.find_element_by_xpath(
                    _xpath
                ).text

                percent = 100 * float(bar) / total_trade
                buying[num - 1] = percent

            # If a random draw is below epsilon, choose a random action;
            # otherwise choose the action with the DQN.
            # Early on the network has learned very little, so at first
            # almost every action is random; the share shrinks over time
            # until random actions are hardly used at all.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)

            else:
                action = brain.get_action()

            # Start shrinking epsilon only after some time has passed,
            # because the network is completely untrained at the beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Advance the game with the chosen action, and receive the reward and whether the game has ended.
            state, reward, terminal = game.step(action, selling, buying)
            total_reward += reward

            # Store the current state in the brain.
            # The remembered states are used for training and for deciding the action to take next.
            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train with the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

            print('Games: %d Score: %d' % (episode + 1, total_reward), "({})".format(game.seq))

            total_reward_list.append(total_reward)

            if terminal:  # game over
                print("game over!")

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
Example 17
def train(cont):
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    if cont:
        sess.run(tf.global_variables_initializer())

        ckpt = str(tf.train.get_checkpoint_state('model'))
        i = ckpt.find("\"") + 1
        j = ckpt.find("\"", i)
        reader = pywrap_tensorflow.NewCheckpointReader(ckpt[i:j])
        var_to_shape_map = reader.get_variable_to_shape_map()
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        for key in var_to_shape_map:
            if "conv2d" in key and "Adam" not in key:
                for key_f in target_vars:
                    if key in key_f.name:
                        sess.run(key_f.assign(reader.get_tensor(key)))
                        break


#        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start using the DQN to choose the next action.
    epsilon = 1.0
    # frame count
    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        if episode > OBSERVE:
            epsilon = 0.01

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()
            epsilon += 0.00001

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        if episode % 10 == 0:
            print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 10000 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=episode)
Example 18
def train():
	print('wake up the brain...')
	sess = tf.Session()

	game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
	brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

	rewards = tf.placeholder(tf.float32, [None])
	tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

	saver = tf.train.Saver()
	sess.run(tf.global_variables_initializer())

	writer = tf.summary.FileWriter('logs', sess.graph)
	summary_merged = tf.summary.merge_all()

	brain.update_target_network()

	epsilon = 1.0
	time_step = 0
	total_reward_list = []

	for episode in range(MAX_EPISODE):
		terminal = False
		total_reward = 0

		state = game.reset()
		brain.init_state(state)

		while not terminal:
			if np.random.rand() < epsilon:
				action = random.randrange(NUM_ACTION)
			else:
				action = brain.get_action()

			if episode > OBSERVE:
				epsilon -= 1 / 1000.

			state, reward, terminal = game.step(action)
			total_reward += reward

			brain.remember(state, action, reward, terminal)

			if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
				brain.train()

			if time_step % TARGET_UPDATE_INTERVAL == 0:
				brain.update_target_network()

			time_step += 1

		print('episode: %d, score: %d' % (episode + 1, total_reward))

		total_reward_list.append(total_reward)

		if episode % 10 == 0:
			summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
			writer.add_summary(summary, time_step)
			total_reward_list = []

		if episode % 100 == 0:
			saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
Example 19
def test_simulation(data):
    print("Test mode")
    session = tf.Session()

    simulation = Simulation(data)
    network = DQN(session, data)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state('model')
    saver.restore(session, checkpoint.model_checkpoint_path)

    # Start testing.
    for episode in range(MAX_TEST):
        time = 0

        list_connection = [[] for i in range(data['NUM_AP'])]

        total_reward = 0

        before_reward = 0

        simulation.reset()
        simulation.make_state()

        network.init_state(simulation.state)
        start = timeit.default_timer()

        # Assign the UEs to APs in turn.
        for ue in range(data['NUM_UE']):

            action = network.get_action()
            list_connection[action].append(ue)

            fairness, error = simulation.step(ue, action)
            reward = fairness - before_reward
            before_reward = fairness

            total_reward += reward

            if error:
                network.remember(simulation.state, action, reward, True)
            else:
                network.remember(simulation.state, action, reward,
                                 (ue == (data['NUM_UE'] - 1)))

            if error:
                break

        time += (timeit.default_timer() - start)
        print()
        print("Fairness:", total_reward)
        print()
        print("== Before adjustment ==")
        for ap in range(data['NUM_AP']):
            print("AP %d Timeslot: %.2f" %
                  (ap, simulation.state[SUM_TIMESLOT][ap]))
            print("Connection:", end=" ")
            for ue in list_connection[ap]:
                print("UE %d(%dkbps)" % (ue, data['LIST_RATE'][int(
                    simulation.info[ue][ap][CONST_REQUEST])]),
                      end=" ")
            print()

            # If the timeslots assigned to this AP exceed the allowed timeslot budget.
            if simulation.state[SUM_TIMESLOT][ap] > data['VAL_TIMESLOT']:
                start = timeit.default_timer()
                simulation.adjust_bitrate(ap, list_connection[ap])
                time += (timeit.default_timer() - start)

        print()

        total_dqn_psnr = 0
        total_ideal_psnr = 0
        print("== After adjustment ==")
        for ap in range(data['NUM_AP']):
            print("AP %d Timeslot: %.2f" %
                  (ap, simulation.state[SUM_TIMESLOT][ap]))
            print("Connection:", end=" ")
            for ue in list_connection[ap]:
                support_index = int(simulation.info[ue][ap][CONST_SUPPORT])
                support_rate = data['LIST_RATE'][support_index]
                total_dqn_psnr += simulation.get_PSNR(support_rate)

                request_index = int(simulation.info[ue][ap][CONST_REQUEST])
                request_rate = data['LIST_RATE'][request_index]
                total_ideal_psnr += simulation.get_PSNR(request_rate)

                print("UE %d(%dkbps)" % (ue, support_rate), end=" ")
            print()
        print()
        list_dqn_psnr.append(total_dqn_psnr / data['NUM_UE'])
        list_dqn_time.append(time)
        print("%s\tPSNR: %.2f %.4f" %
              ("DQN".ljust(20), total_dqn_psnr / data['NUM_UE'], time))
        performance, time = simulation.solve_fract()
        print("%s\tPSNR: %.2f %.4f" %
              ("Fractional".ljust(20), performance / data['NUM_UE'], time))
        performance, time = simulation.solve_random()
        list_random_psnr.append(performance / data['NUM_UE'])
        list_random_time.append(time)
        print("%s\tPSNR: %.2f %.4f" %
              ("Random".ljust(20), performance / data['NUM_UE'], time))
        performance, time = simulation.solve_greedy()
        list_greedy_psnr.append(performance / data['NUM_UE'])
        list_greedy_time.append(time)
        print("%s\tPSNR: %.2f %.4f" %
              ("Greedy".ljust(20), performance / data['NUM_UE'], time))
        performance, time = simulation.solve_mthm()
        list_mthm_psnr.append(performance / data['NUM_UE'])
        list_mthm_time.append(time)
        print("%s\tPSNR: %.2f %.4f" %
              ("Knapsack(MTHM)".ljust(20), performance / data['NUM_UE'], time))
        #"""
        performance, time = simulation.solve_mtm()
        list_mtm_psnr.append(performance / data['NUM_UE'])
        list_mtm_time.append(time)
        print("%s\tPSNR: %.2f %.4f" %
              ("Knapsack(MTM)".ljust(20), performance / data['NUM_UE'], time))
        performance, time = simulation.solve_bb()
        list_bb_psnr.append(performance / data['NUM_UE'])
        list_bb_time.append(time)
        print(
            "%s\tPSNR: %.2f %.4f" %
            ("Branch and Bound".ljust(20), performance / data['NUM_UE'], time))
        #"""
        list_ideal_psnr.append(total_ideal_psnr / data['NUM_UE'])
        print("%s\tPSNR: %.2f" %
              ("Ideal".ljust(20), total_ideal_psnr / data['NUM_UE']))

    # End of testing.
    """
Example 20
def train(IS_IMPORT):
    print('Loading ...')
    sess = tf.Session()

    # Decides when to start using the DQN to choose the next action.
    epsilon = 1.0
    # frame count
    time_step = 0
    global_step = tf.Variable(0, trainable=False, name='global_step')

    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION, global_step)
    #brain = DQN(sess, 61, global_step)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    totalScores = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.totalScore/ep.', tf.reduce_mean(totalScores))

    total_reward_list = []
    total_score_list = []

    saver = tf.train.Saver(tf.global_variables())

    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    summary_merged = tf.summary.merge_all()

    if IS_IMPORT:
        fs = FileLoad(r'F:\work\cocos\dqnTest\Resources\scenario - Copy.sce')
    else:
        server.accept()

    brain.update_target_network()
    print('global_step:', sess.run(global_step))

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0
        weight = 0

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        #state = game.reset()
        if IS_IMPORT:
            id, _, _, _, state = fs.readState()
            if id == -1:
                sys.exit(1)
        else:
            id, _, _, _, state = server.readStatus()

        if id == -1:
            continue

        state = reshapeFromPacket(state)
        '''
        state.append(state[2])
        state.append(state[2])               
        '''

        brain.init_state(state)

        while not terminal:
            actionType = "Action:"

            if IS_IMPORT:
                action = fs.readAction()
                if action == -1: sys.exit(1)
                id, reward, totalScore, terminal, state = fs.readState()
                if id == -1: sys.exit(1)
            else:

                if np.random.rand() < epsilon:
                    action = random.randrange(NUM_ACTION)
                    print("Random action:", action)
                    #action = -1
                    #action = random.uniform(-1, 1)
                else:
                    action = brain.get_action()

                #action = brain.get_action()

                if episode > OBSERVE:
                    epsilon -= 1 / 1000

                server.sendX(id, action)

                if action == -1:
                    id2, action = server.readAction()
                    actionType = "Random Action:"

                    if id != id2:
                        print("Invalid Packet", id, id2)

                id, reward, totalScore, terminal, state = server.readStatus()

            reward = reward + (weight * 0.1)
            weight = weight + 1

            print(time.strftime("%H:%M:%S", time.localtime()), id, actionType,
                  action, "totalScore:", totalScore, "reward:", reward,
                  "terminal", terminal)

            if id == -1:
                break
            if terminal == True:
                total_score_list.append(totalScore)

            state = reshapeFromPacket(state)

            total_reward += reward

            # Store the current state in the brain.
            # The remembered states are used for training and for deciding the action to take next.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train with the DQN.

                brain.train()
                '''
                try:
                except:
                    print("Train Error!!")
                    time_step -= 1
                '''

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('\t Count of Play: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if (episode) % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={
                                   rewards: total_reward_list,
                                   totalScores: total_score_list
                               })
            writer.add_summary(summary, sess.run(global_step))
            total_reward_list = []
            total_score_list = []

        if (episode + 1) % 100 == 0:
            saver.save(sess, MODEL_PATH + '/dqn.ckpt', global_step=global_step)

    # After training finishes, export the model as a tflite file.
    converter = tf.lite.TFLiteConverter.from_session(sess, [brain.input_X],
                                                     [brain.Q])
    tflite_model = converter.convert()
    open(MODEL_PATH + "/dqn.tflite", "wb").write(tflite_model)
    sys.exit(1)
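
Example 20 ends by exporting the online Q-network to a .tflite file with tf.lite.TFLiteConverter.from_session. A rough sketch of how that exported file could then be queried for an action, assuming the interpreter's single input matches brain.input_X; the path and zero-filled state below are placeholders, not the real game state.

import numpy as np
import tensorflow as tf

MODEL_PATH = 'model'   # assumed, matching the directory used above

interpreter = tf.lite.Interpreter(model_path=MODEL_PATH + '/dqn.tflite')
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

state = np.zeros(input_details[0]['shape'], dtype=np.float32)  # placeholder state
interpreter.set_tensor(input_details[0]['index'], state)
interpreter.invoke()
q_values = interpreter.get_tensor(output_details[0]['index'])
action = int(np.argmax(q_values[0]))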
Example 21
def train_simulation(data):
    print("Training mode")
    session = tf.Session()

    simulation = Simulation(data)
    network = DQN(session, data)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('reward average / episode', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    session.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', session.graph)
    summary = tf.summary.merge_all()

    # Initialize the network.
    network.update_target_network()

    epsilon = 1.0
    time = 0

    # Start training.
    for episode in range(MAX_EPISODE):
        total_reward = 0
        list_reward = []

        before_reward = 0

        simulation.reset()
        simulation.make_state()

        network.init_state(simulation.state)

        # Assign the UEs to APs in turn.
        for ue in range(data['NUM_UE']):

            if np.random.rand() < epsilon:
                action = np.random.randint(data['NUM_AP'])
            else:
                action = network.get_action()

            epsilon -= 1 / DELTA_EPSILON

            fairness, error = simulation.step(ue, action)
            reward = fairness - before_reward
            before_reward = fairness

            total_reward += reward

            if error:
                network.remember(simulation.state, action, reward, True)
            else:
                network.remember(simulation.state, action, reward,
                                 (ue == (data['NUM_UE'] - 1)))

            if time > THRESH_OBSERVE and (time % INTERVAL_TRAINING == 0):
                network.train()

            if time % INTERVAL_UPDATE == 0:
                network.update_target_network()

            time += 1

            if error:
                break

        list_reward.append(total_reward)
        print(episode, total_reward)

        if episode % 10 == 0:
            result = session.run(summary, feed_dict={rewards: list_reward})
            writer.add_summary(result, time)
            list_reward = []

        if episode % 100 == 0:
            saver.save(session, 'model/dqn.ckpt', global_step=time)
Example 22
def train():
    print('Training... waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start using the DQN to choose the next action.
    epsilon = 1.0
    # frame count
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If a random draw is below epsilon, choose a random action;
            # otherwise choose the action with the DQN.
            # Early on the network has learned very little, so at first
            # almost every action is random; the share shrinks over time
            # until random actions are hardly used at all.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start shrinking epsilon only after some time has passed,
            # because the network is completely untrained at the beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Advance the game with the chosen action, and receive the reward and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the brain.
            # The remembered states are used for training and for deciding the action to take next.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train with the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
Example 23
def train(track, width, height, cont):
    sess = tf.Session()

    game = Game(track, width, height, show_game=False)
    brain = DQN(sess, width, height, CHANNEL, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    if cont:
        ckpt = tf.train.get_checkpoint_state('model')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    if cont:
        OBSERVE = 100
    else:
        OBSERVE = 5000

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        if episode > OBSERVE:
            epsilon = 2000 / episode

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if episode > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if episode > OBSERVE and time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        if episode % 10 == 0:
            print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode > OBSERVE and episode % 10000 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=episode)
Example 24
def train():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start using the DQN to choose the next action.
    epsilon = 1.0
    # frame count
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If a random draw is below epsilon, choose a random action;
            # otherwise choose the action with the DQN.
            # Early on the network has learned very little, so at first
            # almost every action is random; the share shrinks over time
            # until random actions are hardly used at all.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start shrinking epsilon only after some time has passed,
            # because the network is completely untrained at the beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Advance the game with the chosen action, and receive the reward and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the brain.
            # The remembered states are used for training and for deciding the action to take next.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train with the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)