Example #1
def train_dqn(episode):
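	# Train a DQN agent for `episode` episodes; each episode's score is appended to `loss` and returned.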
	loss = []
	agent = DQN(4, 10)
	init = tf.global_variables_initializer()
	with tf.Session() as sess:
		agent.set_session(sess)
		for e in range(episode):
			print("Episode {}".format(e))
			state = env.reset()
			state = np.reshape(state, (1, 10))
			score = 0
			max_steps = 1000
			for i in tqdm(range(max_steps)):
				action = agent.act(state)
				# repeat the chosen action for several environment steps (use `_` so the outer step index isn't shadowed)
				for _ in range(11):
					reward, next_state, done = env.step(action)
				score += reward
				next_state = np.reshape(next_state, (1, 10))
				agent.remember(state, action, reward, next_state, done)
				state = next_state
				agent.replay()
				if done:
					print("")
					print("episode: {}/{}, score: {}".format(e, episode, score))
					time.sleep(2)
					break
			loss.append(score)
	return loss
Example #2
def train_dqn(episode):
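    # Same training loop as above, but env.step() also threads a `full_msg` string through each call and the agent has 5 actions.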
    action_dict = {0: 'left', 1: 'down', 2: 'right', 3: 'up', 4: 'stay'}
    loss = []
    agent = DQN(5, 10)
    for e in range(episode):
        print("Episode {}".format(e))
        state = env.reset()
        state = np.reshape(state, (1, 10))
        score = 0
        max_steps = 1000
        full_msg = ''
        for i in tqdm(range(max_steps)):
            action = agent.act(state)
            # repeat the chosen action; `_` avoids shadowing the outer step index
            for _ in range(11):
                reward, next_state, done, full_msg = env.step(action, full_msg)
                #time.sleep(2)
            score += reward

            next_state = np.reshape(next_state, (1, 10))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("")
                print("episode: {}/{}, score: {}".format(e, episode, score))
                time.sleep(2)
                break
        loss.append(score)
    return loss
Example #3
def testing():
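    # Load a saved DQN session and drive the environment with random actions, resetting whenever an episode ends.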
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000
    total_t = 0
    episode_rewards = np.zeros(num_episodes)
    last_100_avgs = []
    full_msg = ''

    model = DQN(K=K, input_shape=2 + 2 * number_enemy, scope="model")

    with tf.Session() as sess:
        model.set_session(sess)
        sess.run(tf.global_variables_initializer())
        model.load()

        obs = env.reset()
        state = obs
        acc = [1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2]
        for i in tqdm(range(int(10e6))):
            action = np.random.choice(K)
            obs, reward, done, _, full_msg = env.step(action, full_msg)
            #time.sleep(0.8)
            #print(action)
            done = done == 1
            next_state = obs

            if done:
                obs = env.reset()
                state = obs
            else:
                state = next_state
Example #4
def run():
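    # Multi-agent loop: all 8 agents act each step, transitions are stored, and each agent learns every 5 steps after a 200-step warm-up; per-round step counts go to data.csv.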
    step_of_each_round = []

    for i in range(round):

        t1 = datetime.datetime.now()

        print("round :", i)

        observation = env.reset()

        # env.plt('start')
        step = 0
        while True:
            # choose action
            action = []
            for j in range(8):
                action.append(RL[j].choose_action(observation))

            # update environment
            observation_, reward, done = env.step(action)

            # refresh the display after training for a while
            # if step > 10000:
            #     env.plt('update')

            # restore memory
            for j in range(8):
                RL[j].store_transition(observation, action[j], reward[j],
                                       observation_)

            if (step > 200) and (step % 5 == 0):
                for j in range(8):
                    RL[j].learn()

            if done:
                break
            observation = observation_
            step = step + 1
        step_of_each_round.append(step)
        t2 = datetime.datetime.now()
        print(t2 - t1)
    end = datetime.datetime.now()
    print(end - start)

    # output data
    csvFile = open('./data.csv', "a", newline='')
    data = list(RL[0].layers)  # copy so the appended averages don't modify the agent's layer list
    data.append(sum(step_of_each_round) / round)
    data.append(sum(step_of_each_round[-51:-1]) / 50)
    writer = csv.writer(csvFile, dialect='excel')
    writer.writerow(data)
    csvFile.close()

    print('average step: ', sum(step_of_each_round) / round)
    print('average step of latest 50 rounds: ',
          sum(step_of_each_round[-51:-1]) / 50)
    plt.plot(step_of_each_round)
    plt.pause(0)
Example #5
def run():
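    # Variant of the loop above that feeds each agent the difference between neighbouring observations and maps discrete actions to velocities via `vel`.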
    step_of_each_round = []
    for i in range(round):
        print(i)
        observation = env.reset()
        env.plt('start')
        step = 0
        while True:
            observation_of_agent = []
            observation_of_agent_ = []
            for j in range(7):
                observation_of_agent.append(observation[j + 1] - observation[j])
            observation_of_agent.append(observation[0] - observation[7])

            action_list = np.array([[0., 0.], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]])
            # choose action
            action = []
            for j in range(8):
                action.append(RL[j].choose_action(observation_of_agent[j]))
                if np.linalg.norm(env.E[j]) > 0.1:
                    action_list[j] = vel[action[j]]

            # update environment
            observation_, reward, done = env.step(action_list)
            for j in range(7):
                observation_of_agent_.append(observation_[j + 1] - observation_[j])
            observation_of_agent_.append(observation_[0] - observation_[7])

            if i > 50:
                env.plt('update')

            # restore memory
            for j in range(8):
                RL[j].store_transition(observation_of_agent[j], action[j], reward[j], observation_of_agent_[j])

            if (step > 200) and (step % 5 == 0):
                for j in range(8):
                    RL[j].learn()

            if done:
                # env.plt('finish')
                # RL[1].plot_cost()
                env.plt('clean')
                break

            step = step + 1
        step_of_each_round.append(step)
    plt.ioff()
    for i in range(8):
        RL[i].plot_cost()
    plt.pause(5)
    print(sum(step_of_each_round) / round)
    plt.plot(step_of_each_round)
    plt.pause(0)
Example #6
def collecting_training_samples(config, mcts, env, temp):
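    # Self-play data collection: MCTS chooses each action, the step's context and action probabilities are recorded, and every record's reward is overwritten with the final episode reward.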
    env.reset4train()
    terminal = False
    reward = 0
    data = []
    while not terminal:
        act, acts, act_probs = mcts.get_action(cp.deepcopy(env),
                                               temp,
                                               training=True)
        # collect the training record in a new list so the MCTS temperature `temp` is not overwritten
        record = [
            env.uid, env.profile,
            cp.deepcopy(env.trajectory),
            cp.deepcopy(env.cat_trajectory), env.node_type
        ]
        candidate, node_type, reward, terminal = env.step(act)
        record.extend([act, acts, act_probs, reward])
        data.append(record)
    for item in data:
        item[-1] = reward
    print(reward)
    return data, reward
Example #7
def testing():
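    # Evaluate a restored DQN: sample actions with epsilon = 0.1, accumulate the episode reward, and print it when the episode ends.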
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000
    total_t = 0
    episode_rewards = np.zeros(num_episodes)
    last_100_avgs = []

    model = DQN(K=K, input_shape=4 + 3 * number_enemy, scope="model")

    with tf.Session() as sess:
        model.set_session(sess)
        sess.run(tf.compat.v1.global_variables_initializer())
        model.load()

        state = env.reset()

        acc = [1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2]
        sum_reward = 0
        for i in range(int(10e6)):
            #if i % 4 == 0:
            #    action = model.sample_action(state, 0.1)
            #else:
            action = model.sample_action(state, 0.1)
            #action = acc[i%len(acc)]
            next_state, reward, done, _ = env.step(action)
            time.sleep(0.8)
            #print(action)
            done = done == 1
            sum_reward += reward

            if done:
                obs = env.reset()
                print("Reward: {}".format(sum_reward))
                sum_reward = 0
                state = obs
            else:
                state = next_state
Example #8
optimizer = optim.Adam(Q.parameters(), lr=args.lr)

# TODO wrap data in dataset
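# Epsilon-greedy rollouts append transitions to a replay buffer; Q is periodically trained on data prepared by train() from the buffer.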

for i in range(args.iterations):

    done = False
    s = env.reset()  # TODO fold into rollout

    while not done:

        epsilon = decay_exploration(i, epsilon)

        a = epsilon_greedy(s, epsilon=epsilon)

        succ, r, done, _ = env.step(a)
        replay_buffer.append([s, a, r, succ, done])

        s = succ

        if i % args.batch_size == 0 and i > 0 and len(
                replay_buffer) >= args.batch_size:

            # TODO cuda and var
            state, val = train(replay_buffer, Q)
            y = Q(state)

            optimizer.zero_grad()
            loss = criterion(y, val)
            loss.backward()
            optimizer.step()  # apply the gradient update
            print(loss.item())
Example #9
     # nextState.shape # (1, 13, 13, 2)
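     # Fragment of an epsilon-greedy loop: pick an action, decay epsilon, step the environment, and stack the new frame onto the rolling state tensor.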
     gameOver = False
     while not gameOver:
          #Selecting an action to play
          if np.random.rand() <= epsilon:
               action = np.random.randint(0, 2)
          else:
               qvalues = model.predict(currentState)[0]
               action = np.argmax(qvalues)
          
          #Updating the epsilon and saving the model
          epsilon -= epsilonDecayRate
          epsilon = max(epsilon, minEpsilon)

          #Updating the Environment
          frame, reward, gameOver = env.step(action)
          
          if frame.shape[0] != 9:
              pass
          else:

              # increase the signal of pos reward
              if reward > 0:
                  reward = reward * 1000
              
              # frame.shape # (13,13)
              frame = np.reshape(frame, (1,env.wind, env.prices_indicators.shape[1], 1))
              # frame.shape # (1, 13, 13, 1)
              nextState = np.append(nextState, frame, axis = 3)
              # nextState.shape # (1, 13, 13, 3)
              nextState = np.delete(nextState, 0, axis = 3)
Example #10
def trainning():
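    # Full DQN training: pre-fill the replay buffer with random actions, then train with a target network, linear epsilon decay, and periodic checkpoints; the running 100-episode average is plotted at the end.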
    num_action_act = [0, 0, 0, 0, 0]
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000000
    total_t = 0
    experience_replay_buffer = []
    episode_rewards = np.zeros(num_episodes)
    last_100_avgs = []
    full_msg = ''

    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_change = (epsilon - epsilon_min) / 1000000  #500000

    model = DQN(K=K, input_shape=2 + 2 * number_enemy, scope="model")
    target_model = DQN(K=K,
                       input_shape=2 + 2 * number_enemy,
                       scope="target_model")

    with tf.Session() as sess:
        model.set_session(sess)
        target_model.set_session(sess)
        sess.run(tf.global_variables_initializer())
        model.load()

        print("Filling experience replay buffer...")
        obs = env.reset()
        state = obs
        #for i in range(MIN_EXPERIENCES):
        for i in tqdm(range(MIN_EXPERIENCES)):
            action = np.random.randint(0, K)
            num_action_act[action] += 1
            obs, reward, done, _, full_msg = env.step(action, full_msg)
            #time.sleep(0.5)
            #print(obs)
            done = done == 1
            next_state = obs
            experience_replay_buffer.append(
                (state, action, reward, next_state, done))

            if done:
                obs = env.reset()
                state = obs
            else:
                state = next_state

        print(num_action_act)

        for i in range(num_episodes):
            t0 = datetime.now()

            obs = env.reset()
            state = obs
            loss = None

            total_time_training = 0
            num_steps_in_episode = 0
            episode_reward = 0

            done = False
            while True:
                #for _ in range(0, MAX_STEP):
                if total_t % TARGET_UPDATE_PERIOD == 0:
                    target_model.copy_from(model)
                    print(
                        "Copied model parameters to target network, total_t = %s, period = %s"
                        % (total_t, TARGET_UPDATE_PERIOD))

                action = model.sample_action(state, epsilon)
                num_action_act[action] += 1
                time_act = datetime.now()
                obs, reward, done, _, full_msg = env.step(action, full_msg)
                time_act = datetime.now() - time_act
                done = done == 1

                next_state = obs

                episode_reward += reward

                if len(experience_replay_buffer) == MAX_EXPERIENCES:
                    experience_replay_buffer.pop(0)
                experience_replay_buffer.append(
                    (state, action, reward, next_state, done))

                t0_2 = datetime.now()
                loss = learn(model, target_model, experience_replay_buffer,
                             gamma, batch_sz)
                dt = datetime.now() - t0_2

                #Confirm
                '''
                if time_act > dt:
                    print("Java timeout")
                else:
                    print("Python timeout")
                '''
                total_time_training += dt.total_seconds()
                num_steps_in_episode += 1

                state = next_state
                total_t += 1

                epsilon = max(epsilon - epsilon_change, epsilon_min)
                if done:
                    break

            duration = datetime.now() - t0

            episode_rewards[i] = episode_reward
            time_per_step = total_time_training / num_steps_in_episode

            # include the current episode so the first average isn't taken over an empty slice
            last_100_avg = episode_rewards[max(0, i - 99):i + 1].mean()
            last_100_avgs.append(last_100_avg)
            #print(i)
            #print("last 100: ",last_100_avg)
            #print("reward ",episode_reward)
            #print("rewards ",episode_rewards)
            #print("")
            print("Episode:", i, "Duration:", duration, "Num steps:",
                  num_steps_in_episode, "Reward:", episode_reward,
                  "Training time per step:", "%.3f" % time_per_step,
                  "Avg Reward (last 100):", "%.3f" % last_100_avg, "Epsilon:",
                  "%.3f" % epsilon)

            if i % 50 == 0:
                model.save(i)
            sys.stdout.flush()
            if np.sum(num_action_act) > 5e6:
                break

        plt.plot(last_100_avgs)
        plt.xlabel('episodes')
        plt.ylabel('Average Rewards')
        #        plt.show()
        plt.savefig('result.png')
        print(num_action_act)
Example #11
                        #print(action)
                else:
                    action = getMaxAction(state_for_action)
            else:
                if not q_table[state_for_action]:
                    action = env.action_space.sample()  # Explore action space
                else:
                    action = getMaxAction(state_for_action)

        # print(action)

            return action

        action = makeAction(state)

        next_state_to_process, reward, done, info = env.step(action)  #STEP
        #print(next_state_to_process, reward, done, info)
        if done:  #DONE
            #print("DONE")
            break

        old_value = q_table[state][action]  #TODO
        next_state = makeState(next_state_to_process)
        next_action = makeAction(next_state, False)
        next_max = q_table[next_state][next_action]  #TODO

        # tabular Q-learning update: blend the old value with the bootstrapped target
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state][action] = new_value

        # if reward == -10:
Example #12
def trainning():
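    # DQN training variant that caps episodes at MAX_STEP, tracks the best running 100-episode average, and saves rewards to CSV plus a plot when the loop ends or is interrupted.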
    gamma = 0.99
    batch_sz = 32
    num_episodes = 1000000
    total_t = 0
    experience_replay_buffer = []
    episode_rewards = []
    last_100_avgs = []
    max_eps = [-sys.maxsize]

    epsilon = 1.0
    epsilon_min = 0.01
    epsilon_change = (epsilon - epsilon_min) / 1000000  #500000

    model = DQN(K=K, input_shape=4 + 3 * number_enemy, scope="model")
    target_model = DQN(K=K,
                       input_shape=4 + 3 * number_enemy,
                       scope="target_model")

    with tf.compat.v1.Session() as sess:
        model.set_session(sess)
        target_model.set_session(sess)
        sess.run(tf.compat.v1.global_variables_initializer())
        model.load()

        print("Filling experience replay buffer...")
        state = env.reset()

        #for i in range(MIN_EXPERIENCES):
        for i in tqdm(range(MIN_EXPERIENCES)):
            action = np.random.randint(0, K)
            state, reward, done, _ = env.step(action)

            done = done == 1
            #time.sleep(0.5)
            #print(obs)

            next_state = state
            experience_replay_buffer.append(
                (state, action, reward, next_state, done))

            if done:
                state = env.reset()
            else:
                state = next_state

        try:
            i = 0
            #for i in range(num_episodes):
            while True:
                i += 1
                state = env.reset()
                loss = None
                num_steps_in_episode = 0
                episode_reward = 0
                done = False

                for _ in range(MAX_STEP):
                    #while True:
                    if total_t % TARGET_UPDATE_PERIOD == 0:
                        target_model.copy_from(model)
                        print(
                            "Copied model parameters to target network, total_t = %s, period = %s"
                            % (total_t, TARGET_UPDATE_PERIOD))

                    action = model.sample_action(state, epsilon)
                    next_state, reward, done, _ = env.step(action)
                    done = done == 1
                    episode_reward += reward

                    if len(experience_replay_buffer) == MAX_EXPERIENCES:
                        experience_replay_buffer.pop(0)
                    experience_replay_buffer.append(
                        (state, action, reward, next_state, done))

                    loss = learn(model, target_model, experience_replay_buffer,
                                 gamma, batch_sz)

                    num_steps_in_episode += 1

                    state = next_state
                    total_t += 1

                    epsilon = max(epsilon - epsilon_change, epsilon_min)
                    if done:
                        break
                #if not done:
                #    episode_reward-=100

                episode_rewards.append(episode_reward)  #Reward every eps
                last_100_avg = np.array(episode_rewards[max(0, i -
                                                            100):i]).mean()
                last_100_avgs.append(last_100_avg)  #Avg reward every eps
                max_eps.append(max(max_eps[-1], last_100_avg))  #Max eps

                print(
                    "Episode: {:>6}, Num steps: {:>3}, Reward: {:>8.3f}, Avg reward: {:>5.3f}, Max: {:>5.3f} Eps: {:>5.3f}"
                    .format(i, num_steps_in_episode, episode_reward,
                            last_100_avg, max_eps[-1], epsilon))
                if i % 100 == 0:
                    model.save(i)
                sys.stdout.flush()
                if total_t > NUM_FRAME:
                    break
        except:
            print("Break")
        finally:
            max_eps.pop(0)
            data = pd.DataFrame({
                'Reward': episode_rewards,
                'Avg Reward': last_100_avgs,
                'Max': max_eps
            })
            data.to_csv("./data_result.csv")

            figure(num=None,
                   figsize=(15, 8),
                   dpi=80,
                   facecolor='w',
                   edgecolor='k')

            plt.plot('Reward',
                     '--',
                     color="#999999",
                     data=data,
                     label="Reward")
            plt.plot('Avg Reward', data=data, label="Avg Reward")
            plt.plot('Max', data=data, label="Max")
            plt.legend(loc="upper left")

            plt.xlabel('episodes')
            #plt.show()
            plt.savefig('result.png')
Example #13
    agent = Agent(num_nodes=env.graph_size)
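    # n-step Q-learning on a graph environment: once more than n_step + 1 steps have accrued, store the state from n_step steps back with the summed n-step reward, then learn.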

    scores = []

    for i in range(num_eposides):
        score = 0
        num_nodes, mu, edge_index, edge_w, state, done = env.reset()

        state_steps = [state]
        reward_steps = []
        action_steps = []
        steps_cntr = 0

        while not done[0]:
            action = agent.choose_action(mu, edge_index, edge_w, state)
            _, _, _, reward, new_state, done = env.step(action)

            state_steps.append(new_state)
            reward_steps.append(reward)
            action_steps.append(action)
            steps_cntr += 1

            if steps_cntr > n_step + 1:
                agent.remember(num_nodes, mu, edge_index, edge_w,
                               state_steps[-(n_step + 1)],
                               action_steps[-n_step],
                               [sum(reward_steps[-n_step:])], state_steps[-1],
                               done)
                agent.learn()

            state = new_state