Code example #1
def train_new_model():
    game_env = PongGame()  # init env
    agent = DqnAgent(game_env)  # init agent

    # Read the number of training iterations from the user
    print("Enter the number of training iterations:")
    while True:
        try:
            Dqn.total_num_of_training = int(input())
            break
        except ValueError:
            print("That's not an integer!")
            continue

    # create new file
    output_directory = JP.create_results_directory()
    file_name = output_directory + "/data_file.txt"

    checkpoint_dir = JP.create_checkpoints_directory(output_directory)
    print(checkpoint_dir)

    saver = tf.train.Saver()

    # call train model
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  # initialise all TensorFlow variables
        Dqn.run_dqn(sess, game_env, agent, output_directory, file_name, saver,
                    checkpoint_dir)
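
The JP helper module used above is not shown in this excerpt; a minimal sketch of what create_results_directory and create_checkpoints_directory could look like, assuming timestamped run folders under a results/ root (the layout and names here are guesses, not the project's actual helpers):

import os
import time

def create_results_directory():
    # one timestamped folder per training run, e.g. results/run_20240101_120000
    path = os.path.join("results", "run_" + time.strftime("%Y%m%d_%H%M%S"))
    os.makedirs(path, exist_ok=True)
    return path

def create_checkpoints_directory(output_directory):
    # checkpoints live in a subfolder of the run's results directory
    path = os.path.join(output_directory, "checkpoints")
    os.makedirs(path, exist_ok=True)
    return path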
Code example #2
    def __init__(self, args):

        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                                 time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.

        self.replay_mem = deque()
        self.last_scores = deque()
Code example #3
File: main.py Project: viktorvesely/Agent01
def main():
    env = gym.make('Acrobot-v1')
    gamma = 0.99
    copy_step = 25
    num_states = len(env.observation_space.sample())
    num_actions = env.action_space.n
    hidden_units = [64, 64]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    iter_per_episode = 300

    TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size)
    TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size)
    N = 50
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.08
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step, iter_per_episode)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
        if n % 5 == 0:
            print("Progress:", int(n/N*100), "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):", avg_rewards)
    print("avg reward for last 100 episodes:", avg_rewards)
    make_video(env, TrainNet, 300)
    env.close()
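
The play_game helper called above is not shown; a minimal sketch of what such an episode loop usually does, assuming the DQN class exposes get_action, add_experience, train and copy_weights methods (these names are assumptions, not confirmed from the project):

def play_game(env, TrainNet, TargetNet, epsilon, copy_step, iter_per_episode):
    observations = env.reset()
    total_reward = 0
    for step in range(iter_per_episode):
        # epsilon-greedy action from the online network
        action = TrainNet.get_action(observations, epsilon)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        total_reward += reward
        # store the transition and take one training step against the target network
        TrainNet.add_experience({'s': prev_observations, 'a': action,
                                 'r': reward, 's2': observations, 'done': done})
        TrainNet.train(TargetNet)
        # periodically sync the target network with the online network
        if step % copy_step == 0:
            TrainNet.copy_weights(TargetNet)
        if done:
            break
    return total_reward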
Code example #4
	def initialize_policies(self):
		self.Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward'))
		self.policy_net_agent = DQN(n_feature = self._state_dim)
		self.policy_net_agent.double()
		self.target_net_agent = DQN(n_feature = self._state_dim)
		self.target_net_agent.double()
		self.target_net_agent.load_state_dict(self.policy_net_agent.state_dict())
Code example #5
def main():

    h, s, v = rgb2hsv(201, 204, 214)
    print(h, s, v)
    r, g, b = hsv2rgb(h, s, v*0.7)
    print(r, g, b)

    dump_device_info()
    check_adb()
    n = 0
    while True:
        pull_screenshot()
        im = Image.open('./autojump.png')
        # find the positions of the piece and the target board
        piece_x, piece_y, board_x, board_y = find_piece_and_board(im)
        ts = int(time.time())
        print(ts, piece_x, piece_y, board_x, board_y)
        set_button_position(im)
        distance = math.sqrt((board_x - piece_x) ** 2 + (board_y - piece_y) ** 2)
        jump(distance)
        # save_debug_creenshot(ts, im, piece_x, piece_y, board_x, board_y)
        # backup_screenshot(ts)
        DQN.save_pic(np.asarray(im), str(n))

        with open('time.txt', 'a+') as f:
            f.write('%d,%f\n' % (n, distance))
        time.sleep(random.uniform(1.2, 1.4))  # wait a moment so the piece has settled before the next screenshot
        n += 1
Code example #6
File: test_DQN.py Project: noname72/MLFYP_Project
    def setUp(self):
        self.env = DQN.env
        (self.player_states, (self.community_infos,
                              self.community_cards)) = self.env.reset()
        (self.player_infos, self.player_hands) = zip(*self.player_states)
        self.current_state = ((self.player_infos, self.player_hands),
                              (self.community_infos, self.community_cards))
        self.state = DQN.create_np_array(self.player_infos, self.player_hands,
                                         self.community_cards,
                                         self.community_infos)
        self.state_set = utilities.convert_list_to_tupleA(
            self.player_states[self.env.learner_bot.get_seat()],
            self.current_state[1])
        self._round = utilities.which_round(self.community_cards)
        self.current_player = self.community_infos[-3]
        self.learner_bot, self.villain = self.env.learner_bot, self.env.villain
        Q = defaultdict(lambda: np.zeros(self.env.action_space.n))
        self.agent = DQN.DQNAgent(DQN.state_size,
                                  DQN.action_size)  # initialise agent

        self.policy = DQN.make_epsilon_greedy_policy(Q, self.agent.epsilon,
                                                     self.env.action_space.n)
        self.villain_action = DQN.get_action_policy(
            self.player_infos, self.community_infos, self.community_cards,
            self.env, self._round, self.env.n_seats, self.state_set,
            self.policy, self.villain)
        self.learner_action = self.agent.act(self.state, self.player_infos,
                                             self.community_infos,
                                             self.community_cards, self.env,
                                             self._round, self.env.n_seats,
                                             self.state_set, self.policy)
Code example #7
File: agent.py Project: fhtanaka/TCC2018
    def __init__(self, config, device, model=False):

        self.device = device

        self.board_size = config.board_size

        self.eps_start = config.eps_start
        self.eps_end = config.eps_end
        self.eps_decay = config.eps_decay
        self.gamma = config.gamma
        self.batch_size = config.batch_size

        # Load a previously trained policy network if a model path was given
        if model:
            self.policy_net = torch.load(model)
        else:
            self.policy_net = DQN(config).to(device)

        # Be aware that the config must be the exact same for the loaded model
        self.target_net = DQN(config).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                       momentum=config.momentum,
                                       lr=config.lr)
        self.criterion = torch.nn.SmoothL1Loss()
        self.memory = ReplayMemory(config.replay_memory)
        self.steps_done = 0
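
The constructor above only wires the networks, optimizer and replay memory together; below is a sketch of the optimization step these attributes are typically used for (terminal-state masking omitted for brevity; it assumes memory.sample returns Transition tuples whose fields are already tensors, which is an assumption rather than this project's actual code):

import torch

def optimize_step(self):
    if len(self.memory) < self.batch_size:
        return
    transitions = self.memory.sample(self.batch_size)
    states, actions, next_states, rewards = (torch.stack(t).to(self.device)
                                             for t in zip(*transitions))
    # Q(s, a) of the actions actually taken, from the policy network
    q_values = self.policy_net(states).gather(1, actions.long().view(-1, 1))
    # bootstrapped target from the frozen target network
    with torch.no_grad():
        next_q = self.target_net(next_states).max(1, keepdim=True)[0]
    target = rewards.view(-1, 1) + self.gamma * next_q
    loss = self.criterion(q_values, target)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()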
Code example #8
File: random_test.py Project: tomstream/orienteer
def f3(argv):
    state_dict, device, datas, data2s = argv
    if 'dqn' in args.model:
        if 'duel' in args.model:
            model = DQN.GraphNet(hidden_size=args.hidden_size,
                                 n_head=8,
                                 nlayers=4,
                                 duel_dqn=True)
        else:
            model = DQN.GraphNet(hidden_size=args.hidden_size,
                                 n_head=8,
                                 nlayers=4,
                                 duel_dqn=False)
    elif 'IL' in args.model:
        model = model_gnn.GraphNet()
    elif 'RL' in args.model:
        model = A2C.GraphNet(n_head=4, nlayers=2)

    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()
    ret = []
    # disable gradient tracking during evaluation (a bare torch.no_grad() call has no effect)
    with torch.no_grad():
        for data, data2 in zip(datas, data2s):
            ret.append(
                dqn_schedule(model, [data, data2],
                             device,
                             plan_limit=args.planlimit))
    return ret
Code example #9
File: Demo.py Project: Sriram-Ravula/RL-Music-Tuner
def run_demo(num_samples, model):
    samples = None

    if model == "Note_CNN":
        weights = "NOTE_CNN_WEIGHTS_400.pt"
        samples = nrnn.generate_samples_NoteCNN(weights, 32, 10, num_samples)

    elif model == "0.01":
        weights = "Q_400-500000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)

    elif model == "0.05":
        weights = "Q_500-100000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)

    elif model == "0.1":
        weights = "Q-500000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)

    elif model == "0.3":
        weights = "Q_300-500000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)

    elif model == "0.5":
        weights = "Q_200-500000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)
    else:
        print("Invalid model parameter! Try again")
        return None

    for i in range(num_samples):
        oh.one_hot_to_midi(samples[i],
                           midi_filename='demo_song-' + str(i) + '.mid')

    return None
Code example #10
 def DQN(observation_shape, action_shape, **params):
     if params.get('noisy', False):
         net = dqn.NoisyDQN(observation_shape, action_shape)
     else:
         net = dqn.DQN(observation_shape, action_shape)
     if params.get('target', False):
         net = dqn.DQNT(net, params['double'])
     return net.to(params.get('device', 'cpu'))
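
A short usage sketch of the factory above, under the assumption that the underlying dqn module's networks take plain integer observation/action sizes (the exact constructor signatures are not visible in this excerpt):

# a double DQN with a target network on CPU, for a 4-dimensional
# observation space and 2 discrete actions (hypothetical sizes)
net = DQN(observation_shape=4, action_shape=2,
          noisy=False, target=True, double=True, device='cpu')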
Code example #11
File: dqnAgents.py Project: davide97l/Pacman
    def __init__(self, width, height, numTraining=0):

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = width  # Maze width
        self.params['height'] = height  # Maze height
        self.params[
            'num_training'] = numTraining  # Number of games used for training

        # create saves and logs directory
        if not os.path.exists("saves/DQN/"):
            os.makedirs("saves/DQN/")
        if not os.path.exists("logs/"):
            os.makedirs("logs/")

        # get saves directory
        if params["load_file"] is not None and not params[
                "load_file"].startswith("saves/DQN/"):
            params["load_file"] = "saves/DQN/" + params["load_file"]

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)  # create DQN

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                                 time.localtime())
        self.Q_global = []  # max Q-values in the current game
        self.cost_disp = 0  # current loss

        self.cnt = self.qnet.sess.run(
            self.qnet.global_step
        )  # number of steps the model has been trained so far
        self.local_cnt = 0  # number of total steps the algorithm has run

        self.numeps = 0  # current episode
        if params["load_file"] is not None:
            self.numeps = int(params["load_file"].split("_")[-1])
        self.last_score = 0  # Score in the last step
        self.s = time.time()  # time elapsed since beginning of training
        self.last_reward = 0.  # Reward obtained in the last step

        self.replay_mem = deque()  # replay memory used for training

        self.terminal = False  # True if the game is in a terminal state

        self.last_score = 0  # Score obtained in the last state
        self.current_score = 0  # Score obtained in the current state
        self.last_reward = 0.  # Reward obtained in the last state
        self.ep_rew = 0  # Cumulative reward obtained in the current game

        self.last_state = None  # Last state
        self.current_state = None  # Current state
        self.last_action = None  # Last action
        self.won = True  # True if the game has been won
        self.delay = 0
        self.frame = 0
Code example #12
    def __init__(self, args):

        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)

        # Summary Writer
        self.summary = tf.Summary()
        self.wins = deque(maxlen=100)
        self.episodesSoFar = 0

        print(args)

        if (params['save_file']):
            self.writer = tf.summary.FileWriter('logs/model-' +
                                                params['save_file'],
                                                graph=tf.Session().graph)

        self.replay_mem = None

        if (params['load_file']):
            try:
                # the pickled replay memory must be opened in binary mode
                with open('memories/model-' + params['load_file'], 'rb') as f:
                    self.replay_mem = pickle.load(f)
            except Exception:
                # fall back to an empty replay memory below
                pass

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S",
                                                 time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.

        if not self.replay_mem:
            self.replay_mem = deque()
        self.last_scores = deque()
Code example #13
    def __init__(self, load_from_previous_model):
        self.policy_net = DQN(STATE_DIMENSION, NUM_ACTIONS).to(self.device)
        self.target_net = DQN(STATE_DIMENSION, NUM_ACTIONS).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                       lr=LEARNING_RATE)
        self.replayMemory = ReplayMemory(10000)

        if load_from_previous_model:
            self.load_model()
Code example #14
 def __init__(self):
     env = gym.envs.make("PongDeterministic-v4")
     self.Q_target = DQN.Mynet(env.observation_space, env.action_space).to(device)
     self.Q_policy = DQN.Mynet(env.observation_space, env.action_space).to(device)
     self.Q_target.load_state_dict(self.Q_policy.state_dict())
     self.Q_target.eval()
     self.env = env
     self.pool = DQN.ReplyMemory(15000)
     self.gramma = GRAMMA
     self.alpha = ALPHA
     self.epsilon = EPSILON
     self.ImageProcess = DQN.ImageProcess()
Code example #15
def simple_replay_train(DQN, train_batch):
    x_stack = np.empty(0).reshape(0, DQN.input_size)
    y_stack = np.empty(0).reshape(0, DQN.output_size)

    for state, action, reward, next_state, done in train_batch:
        Q = DQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            Q[0, action] = reward + dis * np.max(DQN.predict(next_state))
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
    return DQN.update(x_stack, y_stack)
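
The loop above implements the standard Q-learning target: the reward alone for terminal transitions, otherwise the reward plus the discounted maximum predicted value of the next state. A self-contained numpy illustration of that target rule (the numbers and the discount factor dis are chosen here only for demonstration):

import numpy as np

dis = 0.99                         # discount factor
reward, action, done = 1.0, 1, False
q = np.array([[0.1, 0.3]])         # predicted Q-values for the current state
q_next = np.array([[0.2, 0.7]])    # predicted Q-values for the next state

q[0, action] = reward if done else reward + dis * np.max(q_next)
print(q)  # [[0.1   1.693]]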
Code example #16
 def initialize_policies(self):
     self.Transition = namedtuple(
         'Transition', ('state', 'action', 'next_state', 'reward'))
     self.policy_net_agent = DQN(n_feature=self._state_dim)
     self.policy_net_agent.double()
     self.target_net_agent = DQN(n_feature=self._state_dim)
     self.target_net_agent.double()
     self.target_net_agent.load_state_dict(
         self.policy_net_agent.state_dict())
     self.optimizer_agent = optim.RMSprop(
         self.policy_net_agent.parameters(),
         lr=self._lr,
         weight_decay=self._weight_decay)
Code example #17
File: Atari.py Project: joshgreaves/CS478_DQN
def main():
    env = gym.make("Boxing-v0")
    height = 84
    width = 84
    channels = 4
    num_actions = 18
    dqn = DQN(AtariNetwork(height, width, channels),
              height * width,
              num_actions,
              epsilon=1.0,
              epsilon_decay=0.999,
              num_stacked=channels,
              learning_rate=0.1)
    memory = MemoryReplay(height * width,
                          num_actions,
                          max_saved=10000,
                          num_stacked=channels)

    for epoch in tqdm(range(1000)):

        # Gain experience
        for _ in range(1):
            s = env.reset()
            s = preprocess(s)
            s = np.array([s, s, s, s])
            for i in range(100):
                # if epoch % 5 == 0:
                #     env.render()
                a = dqn.select_action(np.reshape(s, [1, -1]))
                s_prime, r, t, _ = env.step(np.argmax(a))
                frame = preprocess(s_prime)
                s_prime = np.roll(s, 1, axis=0)
                # keep the new frame; taking the max with the previous frame removes flicker
                s_prime[0] = np.maximum(frame, s_prime[1])
                memory.add(s.reshape([-1]), a, r - 1, s_prime.reshape([-1]), t)
                s = s_prime

                if t:
                    break

        #print(epoch, ": ", total_reward)

        # Train on that experience
        # for i in range(min((epoch + 1) * 5, 250)):
        for i in range(25):
            dqn.train(*memory.get_batch())

        dqn.reassign_target_weights()

        if (epoch + 1) % 25 == 0:
            s = env.reset()
            s = preprocess(s)
            s = np.array([s, s, s, s])
            for i in range(100):
                a = dqn.select_greedy_action(np.reshape(s, [1, -1]))
                env.render()
                s_prime, _, t, _ = env.step(np.argmax(a))
                s = np.roll(s, 1, axis=0)
                s[0] = preprocess(s_prime)
                if t:
                    break
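
The preprocess function is not included in this excerpt; for 84x84 grayscale Atari frames a common choice (an assumption here, not necessarily this project's implementation) is a grayscale conversion followed by a resize:

import cv2
import numpy as np

def preprocess(frame):
    # convert the RGB frame to grayscale and shrink it to 84x84
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0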
Code example #18
    def build_net(self):
        print('Building QNet and targetnet...')
        self.qnet = DQN(self.params, 'qnet', self.params['TB_logpath'])
        self.targetnet = DQN(self.params, 'targetnet',
                             self.params['TB_logpath'])
        self.sess.run(tf.global_variables_initializer())
        saver_dict = {
            'qw1': self.qnet.w1,
            'qb1': self.qnet.b1,
            'qw2': self.qnet.w2,
            'qb2': self.qnet.b2,
            'qw3': self.qnet.w3,
            'qb3': self.qnet.b3,
            'qw4': self.qnet.w4,
            'qb4': self.qnet.b4,
            'qw5': self.qnet.w5,
            'qb5': self.qnet.b5,
            'tw1': self.targetnet.w1,
            'tb1': self.targetnet.b1,
            'tw2': self.targetnet.w2,
            'tb2': self.targetnet.b2,
            'tw3': self.targetnet.w3,
            'tb3': self.targetnet.b3,
            'tw4': self.targetnet.w4,
            'tb4': self.targetnet.b4,
            'tw5': self.targetnet.w5,
            'tb5': self.targetnet.b5,
            'step': self.qnet.global_step
        }
        self.saver = tf.train.Saver(saver_dict)
        self.cp_ops = [
            self.targetnet.w1.assign(self.qnet.w1),
            self.targetnet.b1.assign(self.qnet.b1),
            self.targetnet.w2.assign(self.qnet.w2),
            self.targetnet.b2.assign(self.qnet.b2),
            self.targetnet.w3.assign(self.qnet.w3),
            self.targetnet.b3.assign(self.qnet.b3),
            self.targetnet.w4.assign(self.qnet.w4),
            self.targetnet.b4.assign(self.qnet.b4),
            self.targetnet.w5.assign(self.qnet.w5),
            self.targetnet.b5.assign(self.qnet.b5)
        ]
        self.sess.run(self.cp_ops)

        if self.params['ckpt_file'] is not None:
            print('\x1b[1;30;41m RUN LOAD \x1b[0m')
            self.load()

        print('Networks have been built!')
        sys.stdout.flush()
Code example #19
def outCome():
	num = {}
	num["rawoutcome"] = request.args.get("outcome")
	num["outcome"] = json.loads(num["rawoutcome"])
	num["state"] = num["outcome"]["state"]
	num["next_state"] = num["outcome"]["next_state"]
	num["reward"] = num["outcome"]["reward"]
	num["done"] = num["outcome"]["done"]
	num["action"] = num["outcome"]["action"]
	DQN.remember(agent, num["state"], num["action"], num["reward"], num["next_state"], num["done"])
	response = jsonify(num)
	response.headers.add('Access-Control-Allow-Origin', '*')
	return response
Code example #20
File: main.py Project: flydsc/tensorflow_practice
def main():
    env = gym.make(ENV_NAME)
    agent = DQN.DQN(env)
    for episode in range(EPISODE):
        state = env.reset()
        # train
        for step in range(STEP):
            action = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(action)
            #Define reward
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                env.render()
                action = agent.action(state)
                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
            ave_reward = total_reward / TEST
            print('episode', episode, 'Evaluation Average Reward:', ave_reward)
            if ave_reward >= 200:
                break
Code example #21
File: main.py Project: antonstagge/onepong
def ai_play(swap_network, SAVE_NAME):
    if swap_network:
        print("Swapped")
        neural_net = deep_neural_network.network(N_IN,
                                                 HIDDEN,
                                                 N_OUT,
                                                 True,
                                                 saveName=(SAVE_NAME +
                                                           "_target"))
    else:
        neural_net = deep_neural_network.network(N_IN,
                                                 HIDDEN,
                                                 N_OUT,
                                                 True,
                                                 saveName=SAVE_NAME)
    # player False and draw True
    pong = PlayPong(False, True)
    done = False
    grow = True
    while not done:
        obs = pong.get_observation()
        action = DQN.act(neural_net, obs, training=False)
        draw_neural_net.draw(pong.screen, grow, obs, neural_net.hidden[0],
                             neural_net.outputs[0])
        grow = False
        done = pong.play_one_pong(action)
    print(" GAME OVER!!\AI scored %d points" % pong.state.points)
Code example #22
    def __init__(self, args):

        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
        self.qnet = DQN(self.params)

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0     

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.

        self.replay_mem = deque()
        self.last_scores = deque()
Code example #23
    def __init__(self, restore=False):
        sess_conf = DQN.tf.ConfigProto()
        sess_conf.gpu_options.allow_growth = True
        self.sess = DQN.tf.Session(config=sess_conf)

        self.Q_main = DQN.DQN(self.sess, name="main")
        self.Q_target = DQN.DQN(self.sess, name="target")

        self.sess.run(DQN.tf.global_variables_initializer())
        self.copy_ops = DQN.get_copy_var_ops(dest_scope_name="target",
                                             src_scope_name="main")
        self.copy()

        if restore:
            self.restore()
            self.copy()
Code example #24
def simpleReplayTrain(DQN, trainBatch):
    xStack = np.empty(0).reshape(0, 45)
    yStack = np.empty(0).reshape(0, 161)

    for state, action, reward, nextState, done in trainBatch:
        Q = DQN.predict(state)

        if done:
            Q[0, action - 10] = reward
        else:
            Q[0, action - 10] = reward + dis * np.max(DQN.predict(nextState))

        xStack = np.vstack([xStack, state])
        yStack = np.vstack([yStack, Q])

    return DQN.update(xStack, yStack)
Code example #25
 def step(self, a, t, context, env, val_model, targ_model):
     actions = torch.transpose(
         torch.Tensor(self.action_list(a, val_model, targ_model)), 0, 1)
     context_size = context.shape[0]
     if str(a) not in self.QLearning_Buffer.keys():
         self.QLearning_Buffer[str(a)] = DQN.Q_Learning(0.5,
                                                        0.99,
                                                        val_model,
                                                        targ_model,
                                                        actions,
                                                        context_size,
                                                        history_len=1)
     reward, self.q_learning_rewards = DQN.ql(env,
                                              self.QLearning_Buffer[str(a)],
                                              context[:, t], t,
                                              self.q_learning_rewards)
     return reward
Code example #26
    def __init__(self):
        self.graph = Graph()
        self.actionSpace = []
        self.requests = []
        self.max_request = 20000

        # get the action space: (cloudlet, operation) pairs
        for cl in self.graph.cloudlets:
            for operate in [0, 1, 2]:
                action = (cl, operate)
                self.actionSpace.append(action)
        self.action_size = len(self.actionSpace)

        # get the feature size (dimension of the state)
        self.feature_size = 5 + self.graph.cloudlet_number * self.graph.web_function_number * 2

        # the DQN needs action_size and feature_size, so create it last
        self.RL = DQN(self.action_size, self.feature_size, output_graph=True)
Code example #27
def simple_replay_train(DQN, train_batch):
    x_stack = np.empty(0).reshape(0, DQN.input_size)
    y_stack = np.empty(0).reshape(0, DQN.output_size)

    # Get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = DQN.predict(state)

        if done:
            Q[0, action] = reward
        else:
            Q[0, action] = reward + dis * np.max(DQN.predict(next_state))

        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])

    # Train our network using target and predicted Q values on each episode
    return DQN.update(x_stack, y_stack)
Code example #28
def act():
	num = {}
	num["rawState"] = request.args.get("state")
	num["state"] = json.loads(num["rawState"])
	print(num["state"])
	num["action"] = DQN.act(agent, num["state"])
	response = jsonify(num)
	response.headers.add('Access-Control-Allow-Origin', '*')
	return response
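
A hedged sketch of how a client could call this endpoint, assuming the view function is routed at /act on a local Flask server (the route decorator and port are not shown in the excerpt, so both are assumptions):

import json
import requests

state = [0.1, -0.2, 0.05, 0.3]  # example state vector
resp = requests.get("http://localhost:5000/act",
                    params={"state": json.dumps(state)})
print(resp.json()["action"])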
Code example #29
    def __init__(self, learning_rate=1e-2, restore=False, name="main"):
        self.sess = DQN.tf.Session()
        self.action_value = DQN.DQN(self.sess,
                                    learning_rate=learning_rate,
                                    name=name)
        self.sess.run(DQN.tf.global_variables_initializer())

        if restore:
            self.restore()
Code example #30
def main(weights_name, video_name=None, get_image=False):
    env = DQN.Environment(render=True, sigma=0.02, down=1.0, get_image=get_image)
    s_size = env.env.s_size

    agent = DQN.Agent(s_size=s_size)
    agent.network.model.load_weights("data/" + weights_name + ".h5", by_name=True)
    print("model loaded")

    for _ in range(3):
        s = time.time()

        if video_name:
            env.record("data/mov/" + video_name + ".mp4")

        step = env.replay(agent.policy)
        print("unicycle lasted {} steps and {:2f} seconds.".format(step, step/30))
        print("time = {}".format(time.time() - s))
    env.close()
Code example #31
def main():
    max_episodes = 1000
    replay_buffer = deque()
    with tf.compat.v1.Session() as sess:
        mainDQN = DQN.DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN.DQN(sess, input_size, output_size, name="target")
        tf.compat.v1.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)
        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                state = next_state
                step_count += 1
                if step_count > 10000:
                    break
            print("Episode: {} steps: {}".format(episode, step_count))
            if step_count > 10000:
                pass
            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss : ", loss)
                bot_play(mainDQN)
                sess.run(copy_ops)
        bot_play(mainDQN)
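
get_copy_var_ops is not shown above; a common TF1-style implementation, assuming both networks create their trainable variables inside variable scopes named "main" and "target", is sketched below:

def get_copy_var_ops(dest_scope_name, src_scope_name):
    # build assign ops that overwrite the target network's weights
    # with the main network's current weights
    op_holder = []
    src_vars = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder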
Code example #32
File: main.py Project: mrkulk/deepQN_tensorflow
 def build_nets(self):
     print "Building QNet and Targetnet..."
     self.qnet = DQN(self.params)
Code example #33
File: main.py Project: mrkulk/deepQN_tensorflow
class deep_atari:
    def __init__(self, params):
        print "Initializing Module..."
        self.params = params
        self.sess = tf.Session()
        self.DB = database(self.params["db_size"], self.params["input_dims_proc"])
        self.engine = emulator(rom_name="breakout.bin", vis=True)
        self.params["num_act"] = len(self.engine.legal_actions)
        self.build_nets()
        self.Q_global = 0
        self.cost_disp = 0

    def build_nets(self):
        print "Building QNet and Targetnet..."
        self.qnet = DQN(self.params)

    def start(self):
        print "Start training..."
        cnt = self.qnet.sess.run(self.qnet.global_step)
        print "Global step = " + str(cnt)
        local_cnt = 0
        s = time.time()
        for numeps in range(self.params["num_episodes"]):
            self.Q_global = 0
            state_proc = np.zeros((84, 84, 4))
            state_proc_old = None
            action = None
            terminal = None
            delay = 0
            state = self.engine.newGame()
            state_resized = cv2.resize(state, (84, 110))
            state_gray = cv2.cvtColor(state_resized, cv2.COLOR_BGR2GRAY)
            state_proc[:, :, 3] = state_gray[26:110, :] / 255.0
            total_reward_ep = 0
            for maxl in range(self.params["episode_max_length"]):
                if state_proc_old is not None:
                    self.DB.insert(state_proc_old[:, :, 3], reward, action, terminal)
                action = self.perceive(state_proc, terminal)
                if action is None:  # TODO - check [terminal condition]
                    break
                if local_cnt > self.params["train_start"] and local_cnt % self.params["learning_interval"] == 0:
                    bat_s, bat_a, bat_t, bat_n, bat_r = self.DB.get_batches(self.params["batch"])
                    bat_a = self.get_onehot(bat_a)
                    cnt, self.cost_disp = self.qnet.train(bat_s, bat_a, bat_t, bat_n, bat_r)
                if local_cnt > self.params["train_start"] and local_cnt % self.params["save_interval"] == 0:
                    self.qnet.save_ckpt("ckpt/model_" + str(cnt))
                    print "Model saved"

                state_proc_old = np.copy(state_proc)
                state, reward, terminal = self.engine.next(action)  # IMP: newstate contains terminal info
                state_resized = cv2.resize(state, (84, 110))
                state_gray = cv2.cvtColor(state_resized, cv2.COLOR_BGR2GRAY)
                state_proc[:, :, 0:3] = state_proc[:, :, 1:4]
                state_proc[:, :, 3] = state_gray[26:110, :] / 255.0
                total_reward_ep = total_reward_ep + reward
                local_cnt += 1
                # params['eps'] =0.05
                self.params["eps"] = max(0.1, 1.0 - float(cnt) / float(self.params["eps_step"]))
                # self.params['eps'] = 0.00001

            sys.stdout.write(
                "Epi: %d | frame: %d | train_step: %d | time: %f | reward: %f | eps: %f "
                % (numeps, local_cnt, cnt, time.time() - s, total_reward_ep, self.params["eps"])
            )
            sys.stdout.write("| max_Q: %f\n" % (self.Q_global))
            # sys.stdout.write("%f, %f, %f, %f, %f\n" % (self.t_e[0],self.t_e[1],self.t_e[2],self.t_e[3],self.t_e[4]))
            sys.stdout.flush()

    def select_action(self, state):
        if np.random.rand() > self.params["eps"]:
            # greedy with random tie-breaking
            Q_pred = self.qnet.sess.run(
                self.qnet.y,
                feed_dict={
                    self.qnet.x: np.reshape(state, (1, 84, 84, 4)),
                    self.qnet.q_t: np.zeros(1),
                    self.qnet.actions: np.zeros((1, self.params["num_act"])),
                    self.qnet.terminals: np.zeros(1),
                    self.qnet.rewards: np.zeros(1),
                },
            )[
                0
            ]  # TODO check
            self.Q_global = max(self.Q_global, np.amax(Q_pred))
            a_winner = np.argwhere(Q_pred == np.amax(Q_pred))
            if len(a_winner) > 1:
                return self.engine.legal_actions[a_winner[np.random.randint(0, len(a_winner))][0]]
            else:
                return self.engine.legal_actions[a_winner[0][0]]
        else:
            # random
            return self.engine.legal_actions[np.random.randint(0, len(self.engine.legal_actions))]

    def perceive(self, newstate, terminal):
        if not terminal:
            action = self.select_action(newstate)
            return action

    def get_onehot(self, actions):
        actions_onehot = np.zeros((self.params["batch"], self.params["num_act"]))
        for i in range(len(actions)):
            actions_onehot[i][self.engine.action_map[int(actions[i])]] = 1
        return actions_onehot
Code example #34
class PacmanDQN(game.Agent):
    def __init__(self, args):

        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
        self.qnet = DQN(self.params)

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())
        # Q and cost
        self.Q_global = []
        self.cost_disp = 0     

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0

        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.

        self.replay_mem = deque()
        self.last_scores = deque()


    def getMove(self, state):
        # Exploit / Explore
        if np.random.rand() > self.params['eps']:
            # Exploit action
            self.Q_pred = self.qnet.sess.run(
                self.qnet.y,
                feed_dict = {self.qnet.x: np.reshape(self.current_state,
                                                     (1, self.params['width'], self.params['height'], 6)), 
                             self.qnet.q_t: np.zeros(1),
                             self.qnet.actions: np.zeros((1, 4)),
                             self.qnet.terminals: np.zeros(1),
                             self.qnet.rewards: np.zeros(1)})[0]

            self.Q_global.append(max(self.Q_pred))
            a_winner = np.argwhere(self.Q_pred == np.amax(self.Q_pred))

            if len(a_winner) > 1:
                move = self.get_direction(
                    a_winner[np.random.randint(0, len(a_winner))][0])
            else:
                move = self.get_direction(
                    a_winner[0][0])
        else:
            # Random:
            move = self.get_direction(np.random.randint(0, 4))

        # Save last_action
        self.last_action = self.get_value(move)

        return move

    def get_value(self, direction):
        if direction == Directions.NORTH:
            return 0.
        elif direction == Directions.EAST:
            return 1.
        elif direction == Directions.SOUTH:
            return 2.
        else:
            return 3.

    def get_direction(self, value):
        if value == 0.:
            return Directions.NORTH
        elif value == 1.:
            return Directions.EAST
        elif value == 2.:
            return Directions.SOUTH
        else:
            return Directions.WEST
            
    def observation_step(self, state):
        if self.last_action is not None:
            # Process current experience state
            self.last_state = np.copy(self.current_state)
            self.current_state = self.getStateMatrices(state)

            # Process current experience reward
            self.current_score = state.getScore()
            reward = self.current_score - self.last_score
            self.last_score = self.current_score

            if reward > 20:
                self.last_reward = 50.    # Eat ghost   (Yum! Yum!)
            elif reward > 0:
                self.last_reward = 10.    # Eat food    (Yum!)
            elif reward < -10:
                self.last_reward = -500.  # Get eaten   (Ouch!) -500
                self.won = False
            elif reward < 0:
                self.last_reward = -1.    # Punish time (Pff..)

            
            if(self.terminal and self.won):
                self.last_reward = 100.
            self.ep_rew += self.last_reward

            # Store last experience into memory 
            experience = (self.last_state, float(self.last_reward), self.last_action, self.current_state, self.terminal)
            self.replay_mem.append(experience)
            if len(self.replay_mem) > self.params['mem_size']:
                self.replay_mem.popleft()

            # Save model
            if(params['save_file']):
                if self.local_cnt > self.params['train_start'] and self.local_cnt % self.params['save_interval'] == 0:
                    self.qnet.save_ckpt('saves/model-' + params['save_file'] + "_" + str(self.cnt) + '_' + str(self.numeps))
                    print('Model saved')

            # Train
            self.train()

        # Next
        self.local_cnt += 1
        self.frame += 1
        self.params['eps'] = max(self.params['eps_final'],
                                 1.00 - float(self.cnt)/ float(self.params['eps_step']))


    def observationFunction(self, state):
        # Do observation
        self.terminal = False
        self.observation_step(state)

        return state

    def final(self, state):
        # Next
        self.ep_rew += self.last_reward

        # Do observation
        self.terminal = True
        self.observation_step(state)

        # Print stats
        log_file = open('./logs/'+str(self.general_record_time)+'-l-'+str(self.params['width'])+'-m-'+str(self.params['height'])+'-x-'+str(self.params['num_training'])+'.log','a')
        log_file.write("# %4d | steps: %5d | steps_t: %5d | t: %4f | r: %12f | e: %10f " %
                         (self.numeps,self.local_cnt, self.cnt, time.time()-self.s, self.ep_rew, self.params['eps']))
        log_file.write("| Q: %10f | won: %r \n" % ((max(self.Q_global, default=float('nan')), self.won)))
        sys.stdout.write("# %4d | steps: %5d | steps_t: %5d | t: %4f | r: %12f | e: %10f " %
                         (self.numeps,self.local_cnt, self.cnt, time.time()-self.s, self.ep_rew, self.params['eps']))
        sys.stdout.write("| Q: %10f | won: %r \n" % ((max(self.Q_global, default=float('nan')), self.won)))
        sys.stdout.flush()

    def train(self):
        # Train
        if (self.local_cnt > self.params['train_start']):
            batch = random.sample(self.replay_mem, self.params['batch_size'])
            batch_s = [] # States (s)
            batch_r = [] # Rewards (r)
            batch_a = [] # Actions (a)
            batch_n = [] # Next states (s')
            batch_t = [] # Terminal state (t)

            for i in batch:
                batch_s.append(i[0])
                batch_r.append(i[1])
                batch_a.append(i[2])
                batch_n.append(i[3])
                batch_t.append(i[4])
            batch_s = np.array(batch_s)
            batch_r = np.array(batch_r)
            batch_a = self.get_onehot(np.array(batch_a))
            batch_n = np.array(batch_n)
            batch_t = np.array(batch_t)

            self.cnt, self.cost_disp = self.qnet.train(batch_s, batch_a, batch_t, batch_n, batch_r)


    def get_onehot(self, actions):
        """ Create list of vectors with 1 values at index of action in list """
        actions_onehot = np.zeros((self.params['batch_size'], 4))
        for i in range(len(actions)):                                           
            actions_onehot[i][int(actions[i])] = 1      
        return actions_onehot   

    def mergeStateMatrices(self, stateMatrices):
        """ Merge state matrices to one state tensor """
        stateMatrices = np.swapaxes(stateMatrices, 0, 2)
        total = np.zeros((7, 7))
        for i in range(len(stateMatrices)):
            total += (i + 1) * stateMatrices[i] / 6
        return total

    def getStateMatrices(self, state):
        """ Return wall, ghosts, food, capsules matrices """ 
        def getWallMatrix(state):
            """ Return matrix with wall coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            grid = state.data.layout.walls
            matrix = np.zeros((height, width), dtype=np.int8)
            for i in range(grid.height):
                for j in range(grid.width):
                    # Put cell vertically reversed in matrix
                    cell = 1 if grid[j][i] else 0
                    matrix[-1-i][j] = cell
            return matrix

        def getPacmanMatrix(state):
            """ Return matrix with pacman coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            matrix = np.zeros((height, width), dtype=np.int8)

            for agentState in state.data.agentStates:
                if agentState.isPacman:
                    pos = agentState.configuration.getPosition()
                    cell = 1
                    matrix[-1-int(pos[1])][int(pos[0])] = cell

            return matrix

        def getGhostMatrix(state):
            """ Return matrix with ghost coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            matrix = np.zeros((height, width), dtype=np.int8)

            for agentState in state.data.agentStates:
                if not agentState.isPacman:
                    if not agentState.scaredTimer > 0:
                        pos = agentState.configuration.getPosition()
                        cell = 1
                        matrix[-1-int(pos[1])][int(pos[0])] = cell

            return matrix

        def getScaredGhostMatrix(state):
            """ Return matrix with ghost coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            matrix = np.zeros((height, width), dtype=np.int8)

            for agentState in state.data.agentStates:
                if not agentState.isPacman:
                    if agentState.scaredTimer > 0:
                        pos = agentState.configuration.getPosition()
                        cell = 1
                        matrix[-1-int(pos[1])][int(pos[0])] = cell

            return matrix

        def getFoodMatrix(state):
            """ Return matrix with food coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            grid = state.data.food
            matrix = np.zeros((height, width), dtype=np.int8)

            for i in range(grid.height):
                for j in range(grid.width):
                    # Put cell vertically reversed in matrix
                    cell = 1 if grid[j][i] else 0
                    matrix[-1-i][j] = cell

            return matrix

        def getCapsulesMatrix(state):
            """ Return matrix with capsule coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            capsules = state.data.layout.capsules
            matrix = np.zeros((height, width), dtype=np.int8)

            for i in capsules:
                # Insert capsule cells vertically reversed into matrix
                matrix[-1-i[1], i[0]] = 1

            return matrix

        # Create observation matrix as a combination of
        # wall, pacman, ghost, food and capsule matrices
        # width, height = state.data.layout.width, state.data.layout.height 
        width, height = self.params['width'], self.params['height']
        observation = np.zeros((6, height, width))

        observation[0] = getWallMatrix(state)
        observation[1] = getPacmanMatrix(state)
        observation[2] = getGhostMatrix(state)
        observation[3] = getScaredGhostMatrix(state)
        observation[4] = getFoodMatrix(state)
        observation[5] = getCapsulesMatrix(state)

        observation = np.swapaxes(observation, 0, 2)

        return observation

    def registerInitialState(self, state): # inspects the starting state

        # Reset reward
        self.last_score = 0
        self.current_score = 0
        self.last_reward = 0.
        self.ep_rew = 0

        # Reset state
        self.last_state = None
        self.current_state = self.getStateMatrices(state)

        # Reset actions
        self.last_action = None

        # Reset vars
        self.terminal = None
        self.won = True
        self.Q_global = []
        self.delay = 0

        # Next
        self.frame = 0
        self.numeps += 1

    def getAction(self, state):
        move = self.getMove(state)

        # Stop moving when not legal
        legal = state.getLegalActions(0)
        if move not in legal:
            move = Directions.STOP

        return move