def train(self, inputs, action, sampled_q):
    inputs = copy.deepcopy(inputs)
    action = copy.deepcopy(action)
    sampled_q = copy.deepcopy(sampled_q)
    inputs = nd.array(inputs, ctx=CTX)
    action = nd.array(action, ctx=CTX)
    sampled_q = nd.array(sampled_q, ctx=CTX)
    sampled_q = sampled_q.reshape(shape=(sampled_q.shape[0],))
    with mx.autograd.record():
        loss_vec = []
        outputs = self.qnet(inputs, loss_vec)
        loss = 0.
        for element in loss_vec:
            loss = loss + element
        # print 'loss_dropout:', loss
        td_error = nd.sum(data=outputs * action, axis=1) - sampled_q
        for i in range(self.minibatch_size):
            if nd.abs(td_error[i]) < 1.0:
                loss = loss + 0.5 * nd.square(td_error[i])
            else:
                loss = loss + nd.abs(td_error[i]) - 0.5
        # print loss
    loss.backward()
    self.trainer.step(batch_size=self.minibatch_size, ignore_stale_grad=True)
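# Hedged sketch (not part of the original snippet): the per-element loop above computes a
# smooth-L1 (Huber) penalty on the TD error but forces a host/device round trip for every
# comparison. Assuming `td_error` is the same (batch,) NDArray as in the train() above,
# nd.where selects the quadratic branch for |td_error| < 1 and the linear branch otherwise.
abs_err = nd.abs(td_error)
huber = nd.where(abs_err < 1.0,
                 0.5 * nd.square(td_error),
                 abs_err - 0.5)
loss = loss + nd.sum(huber)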
def calc_loss_perceptual(hout, hcomp, hgt):
    for j in range(3):
        if j == 0:
            loss = nd.abs(hout[0] - hgt[0]).mean()
            loss = loss + nd.abs(hcomp[0] - hgt[0]).mean()
        else:
            loss = loss + nd.abs(hout[j] - hgt[j]).mean()
            loss = loss + nd.abs(hcomp[j] - hgt[j]).mean()
    return loss
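# Hedged sketch (assumption, not from the source): the loop above hard-codes three feature
# levels. The same perceptual term for any number of levels, assuming hout/hcomp/hgt are
# equal-length lists of feature maps; `calc_loss_perceptual_n` is a hypothetical name.
def calc_loss_perceptual_n(hout, hcomp, hgt):
    loss = 0.0
    for h_o, h_c, h_g in zip(hout, hcomp, hgt):
        loss = loss + nd.abs(h_o - h_g).mean()
        loss = loss + nd.abs(h_c - h_g).mean()
    return loss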
def train(self, inputs, action, sampled_q):
    inputs = copy.deepcopy(inputs)
    action = copy.deepcopy(action)
    sampled_q = copy.deepcopy(sampled_q)
    inputs = nd.array(inputs, ctx=CTX)
    action = nd.array(action, ctx=CTX)
    sampled_q = nd.array(sampled_q, ctx=CTX)
    sampled_q = sampled_q.reshape(shape=(sampled_q.shape[0],))
    with mx.autograd.record():
        outputs = self.qnet(inputs)
        td_error = nd.sum(data=outputs * action, axis=1) - sampled_q
        loss = 0
        for i in range(self.minibatch_size):
            if nd.abs(td_error[i]) < 1.0:
                loss = loss + 0.5 * nd.square(td_error[i])
            else:
                loss = loss + nd.abs(td_error[i]) - 0.5
    loss.backward()
    self.trainer.step(batch_size=self.minibatch_size)
def train(self, inputs, action, sampled_q):
    inputs = copy.deepcopy(inputs)
    action = copy.deepcopy(action)
    sampled_q = copy.deepcopy(sampled_q)
    inputs = nd.array(inputs, ctx=CTX)
    action = nd.array(action, ctx=CTX)
    sampled_q = nd.array(sampled_q, ctx=CTX)
    sampled_q = sampled_q.reshape(shape=(sampled_q.shape[0],))
    with mx.autograd.record():
        loss_vec = []
        outputs = self.qnet.forward(inputs, loss_vec)
        loss = 0.
        for element in loss_vec:
            loss = loss + element
        # print 'loss_dropout:', loss
        q_est = nd.sum(data=outputs * action, axis=1)
        if q_est.shape[0] != sampled_q.shape[0]:
            print(q_est.shape)
            print(sampled_q.shape)
            print(q_est)
            print(sampled_q)
        td_error = q_est - sampled_q
        for i in range(self.minibatch_size):
            if nd.abs(td_error[i]) < 1.0:
                loss = loss + 0.5 * nd.square(td_error[i])
            else:
                loss = loss + nd.abs(td_error[i]) - 0.5
        # print loss
    loss.backward()
    grads_list = []
    for name, value in self.qnet.collect_params().items():
        if name.find('batchnorm') < 0:
            # grads_list.append(mx.nd.array(value.grad().asnumpy()))
            grads_list.append(value.grad())
    return grads_list, self.minibatch_size
def style_loss(yhat, y):
    return nd.abs(gram(yhat) - gram(y)).mean()
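# `gram` is not defined in this snippet. A common Gram-matrix helper it could refer to
# (assumption, not the original implementation): flatten each channel map and take the
# normalized inner products between channels.
def gram(x):
    # x: (batch, channels, height, width)
    b, c, h, w = x.shape
    feats = x.reshape((b, c, h * w))
    # batched product (b, c, hw) x (b, hw, c) -> (b, c, c)
    return nd.batch_dot(feats, feats, transpose_b=True) / (c * h * w)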
import mxnet as mx
from mxnet import gluon
from mxnet import ndarray as nd

from .score_fun import *
from .. import *


def logsigmoid(val):
    max_elem = nd.maximum(0., -val)
    z = nd.exp(-max_elem) + nd.exp(-val - max_elem)
    return -(max_elem + nd.log(z))

none = lambda x: x
get_dev = lambda gpu: mx.gpu(gpu) if gpu >= 0 else mx.cpu()
get_device = lambda args: mx.gpu(args.gpu[0]) if args.gpu[0] >= 0 else mx.cpu()
norm_l1 = lambda x: nd.sum(nd.abs(x))
norm = lambda x, p: nd.sum(nd.abs(x) ** p)
get_scalar = lambda x: x.detach().asscalar()
reshape = lambda arr, x, y: arr.reshape(x, y)
cuda = lambda arr, gpu: arr.as_in_context(mx.gpu(gpu))


def l2_dist(x, y, pw=False):
    if pw is False:
        x = x.expand_dims(axis=1)
        y = y.expand_dims(axis=0)
    return -nd.norm(x - y, ord=2, axis=-1)
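# Illustration only (not part of the original module): logsigmoid above is the standard
# numerically stable form -max(0, -v) - log(exp(-max(0, -v)) + exp(-v - max(0, -v))).
# For strongly negative v the naive nd.log(nd.sigmoid(v)) underflows to -inf in float32,
# while logsigmoid(v) stays finite (approximately v).
v = nd.array([-100.0, 0.0, 100.0])
print(logsigmoid(v))   # -> roughly [-100, -0.693, 0]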
def main(): parser = argparse.ArgumentParser( description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.') parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer') parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer') parser.add_argument('--rms-decay', required=False, type=float, default=0.95, help='Decay rate of the RMSProp') parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer') parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN') parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer') parser.add_argument( '-c', '--ctx', required=False, type=str, default=None, help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.') parser.add_argument( '--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning') parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts') parser.add_argument( '--kvstore-update-period', required=False, type=int, default=16, help='The period that the worker updates the parameters from the sever' ) parser.add_argument( '--kv-type', required=False, type=str, default=None, help= 'type of kvstore, default will not use kvstore, could also be dist_async' ) parser.add_argument('--optimizer', required=False, type=str, default="adagrad", help='type of optimizer') parser.add_argument('--nactor', required=False, type=int, default=16, help='number of actor') parser.add_argument('--exploration-period', required=False, type=int, default=4000000, help='length of annealing of epsilon greedy policy') parser.add_argument('--replay-memory-size', required=False, type=int, default=100, help='size of replay memory') parser.add_argument('--single-batch-size', required=False, type=int, default=5, help='batch size for every actor') parser.add_argument('--symbol', required=False, type=str, default="nature", help='type of network, nature or nips') parser.add_argument('--sample-policy', required=False, type=str, default="recent", help='minibatch sampling policy, recent or random') parser.add_argument('--epoch-num', required=False, type=int, default=50, help='number of epochs') parser.add_argument('--param-update-period', required=False, type=int, default=5, help='Parameter update period') parser.add_argument('--resize-mode', required=False, type=str, default="scale", help='Resize mode, scale or crop') parser.add_argument('--eps-update-period', required=False, type=int, default=8000, help='eps greedy policy update period') parser.add_argument('--server-optimizer', required=False, type=str, default="easgd", help='type of server optimizer') parser.add_argument('--nworker', required=False, type=int, default=1, help='number of kv worker') parser.add_argument('--easgd-alpha', required=False, type=float, default=0.01, help='easgd alpha') args, unknown = parser.parse_known_args() logging.info(str(args)) if args.dir_path == '': 
rom_name = os.path.splitext(os.path.basename(args.rom))[0] time_str = time.strftime("%m%d_%H%M_%S", time.localtime()) args.dir_path = ('dqn-%s-%d_' % (rom_name,int(args.lr*10**5)))+time_str \ + "_" + os.environ.get('DMLC_TASK_ID') logging.info("saving to dir: " + args.dir_path) if args.ctx == None: args.ctx = os.environ.get('CTX') logging.info("Context: %s" % args.ctx) ctx = re.findall('([a-z]+)(\d*)', args.ctx) ctx = [(device, int(num)) if len(num) > 0 else (device, 0) for device, num in ctx] # Async verision nactor = args.nactor param_update_period = args.param_update_period replay_start_size = args.replay_start_size max_start_nullops = 30 replay_memory_size = args.replay_memory_size history_length = 4 rows = 84 cols = 84 q_ctx = mx.Context(*ctx[0]) games = [] for g in range(nactor): games.append( AtariGame(rom_path=args.rom, resize_mode=args.resize_mode, replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, display_screen=args.visualization, history_length=history_length)) ##RUN NATURE freeze_interval = 40000 / nactor freeze_interval /= param_update_period epoch_num = args.epoch_num steps_per_epoch = 4000000 / nactor discount = 0.99 save_screens = False eps_start = numpy.ones((3, )) * args.start_eps eps_min = numpy.array([0.1, 0.01, 0.5]) eps_decay = (eps_start - eps_min) / (args.exploration_period / nactor) eps_curr = eps_start eps_id = numpy.zeros((nactor, )) eps_update_period = args.eps_update_period eps_update_count = numpy.zeros((nactor, )) single_batch_size = args.single_batch_size minibatch_size = nactor * single_batch_size action_num = len(games[0].action_set) data_shapes = { 'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size, ), 'dqn_reward': (minibatch_size, ) } if args.symbol == "nature": dqn_sym = dqn_sym_nature(action_num) elif args.symbol == "nips": dqn_sym = dqn_sym_nips(action_num) else: raise NotImplementedError qnet = Base(data_shapes=data_shapes, sym=dqn_sym, name='QNet', initializer=DQNInitializer(factor_type="in"), ctx=q_ctx) target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx) if args.optimizer == "adagrad": optimizer = mx.optimizer.create(name=args.optimizer, learning_rate=args.lr, eps=args.eps, clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd) elif args.optimizer == "rmsprop" or args.optimizer == "rmspropnoncentered": optimizer = mx.optimizer.create(name=args.optimizer, learning_rate=args.lr, eps=args.eps, clip_gradient=args.clip_gradient, gamma1=args.rms_decay, gamma2=0, rescale_grad=1.0, wd=args.wd) lr_decay = (args.lr - 0) / (steps_per_epoch * epoch_num / param_update_period) # Create kvstore use_easgd = False if args.kv_type != None: kvType = args.kv_type kv = kvstore.create(kvType) #Initialize kvstore for idx, v in enumerate(qnet.params.values()): kv.init(idx, v) if args.server_optimizer == "easgd": use_easgd = True easgd_beta = 0.9 easgd_alpha = args.easgd_alpha server_optimizer = mx.optimizer.create(name="ServerEasgd", learning_rate=easgd_alpha) easgd_eta = 0.00025 central_weight = OrderedDict([(n, v.copyto(q_ctx)) for n, v in qnet.params.items()]) kv.set_optimizer(server_optimizer) updater = mx.optimizer.get_updater(optimizer) else: kv.set_optimizer(optimizer) kvstore_update_period = args.kvstore_update_period npy_rng = numpy.random.RandomState(123456 + kv.rank) else: updater = mx.optimizer.get_updater(optimizer) qnet.print_stat() target_qnet.print_stat() states_buffer_for_act = numpy.zeros( (nactor, history_length) 
+ (rows, cols), dtype='uint8') states_buffer_for_train = numpy.zeros( (minibatch_size, history_length + 1) + (rows, cols), dtype='uint8') next_states_buffer_for_train = numpy.zeros( (minibatch_size, history_length) + (rows, cols), dtype='uint8') actions_buffer_for_train = numpy.zeros((minibatch_size, ), dtype='uint8') rewards_buffer_for_train = numpy.zeros((minibatch_size, ), dtype='float32') terminate_flags_buffer_for_train = numpy.zeros((minibatch_size, ), dtype='bool') # Begin Playing Game training_steps = 0 total_steps = 0 ave_fps = 0 ave_loss = 0 time_for_info = time.time() parallel_executor = concurrent.futures.ThreadPoolExecutor(nactor) for epoch in xrange(epoch_num): # Run Epoch steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() # for g, game in enumerate(games): game.start() game.begin_episode() eps_rand = npy_rng.rand() if eps_rand < 0.4: eps_id[g] = 0 elif eps_rand < 0.7: eps_id[g] = 1 else: eps_id[g] = 2 episode_stats = [EpisodeStat() for i in range(len(games))] while steps_left > 0: for g, game in enumerate(games): if game.episode_terminate: episode += 1 epoch_reward += game.episode_reward if args.kv_type != None: info_str = "Node[%d]: " % kv.rank else: info_str = "" info_str += "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward, ave_fps, (eps_curr[eps_id[g]])) info_str += ", Avg Loss:%f" % ave_loss if episode_stats[g].episode_action_step > 0: info_str += ", Avg Q Value:%f/%d" % ( episode_stats[g].episode_q_value / episode_stats[g].episode_action_step, episode_stats[g].episode_action_step) if g == 0: logging.info(info_str) if eps_update_count[g] * eps_update_period < total_steps: eps_rand = npy_rng.rand() if eps_rand < 0.4: eps_id[g] = 0 elif eps_rand < 0.7: eps_id[g] = 1 else: eps_id[g] = 2 eps_update_count[g] += 1 game.begin_episode(steps_left) episode_stats[g] = EpisodeStat() if total_steps > history_length: for g, game in enumerate(games): current_state = game.current_state() states_buffer_for_act[g] = current_state states = nd.array(states_buffer_for_act, ctx=q_ctx) / float(255.0) qval_npy = qnet.forward(batch_size=nactor, data=states)[0].asnumpy() actions_that_max_q = numpy.argmax(qval_npy, axis=1) actions = [0] * nactor for g, game in enumerate(games): # 1. We need to choose a new action based on the current game status if games[g].state_enabled and games[ g].replay_memory.sample_enabled: do_exploration = (npy_rng.rand() < eps_curr[eps_id[g]]) if do_exploration: action = npy_rng.randint(action_num) else: # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each # We can simply stack the current_state() of gaming instances and give prediction for all of them # We need to wait after calling calc_score(.), which makes the program slow # TODO Profiling the speed of this part! action = actions_that_max_q[g] episode_stats[g].episode_q_value += qval_npy[g, action] episode_stats[g].episode_action_step += 1 else: action = npy_rng.randint(action_num) actions[g] = action # t0=time.time() for ret in parallel_executor.map(play_game, zip(games, actions)): pass # t1=time.time() # logging.info("play time: %f" % (t1-t0)) eps_curr = numpy.maximum(eps_curr - eps_decay, eps_min) total_steps += 1 steps_left -= 1 if total_steps % 100 == 0: this_time = time.time() ave_fps = (100 / (this_time - time_for_info)) time_for_info = this_time # 3. 
Update our Q network if we can start sampling from the replay memory # Also, we update every `update_interval` if total_steps > minibatch_size and \ total_steps % (param_update_period) == 0 and \ games[-1].replay_memory.sample_enabled: if use_easgd and training_steps % kvstore_update_period == 0: for paramIndex in range(len(qnet.params)): k = qnet.params.keys()[paramIndex] kv.pull(paramIndex, central_weight[k], priority=-paramIndex) qnet.params[k][:] -= easgd_alpha * (qnet.params[k] - central_weight[k]) kv.push(paramIndex, qnet.params[k], priority=-paramIndex) # 3.1 Draw sample from the replay_memory for g, game in enumerate(games): episode_stats[g].episode_update_step += 1 nsample = single_batch_size i0 = (g * nsample) i1 = (g + 1) * nsample if args.sample_policy == "recent": action, reward, terminate_flag=game.replay_memory.sample_last(batch_size=nsample,\ states=states_buffer_for_train,offset=i0) elif args.sample_policy == "random": action, reward, terminate_flag=game.replay_memory.sample_inplace(batch_size=nsample,\ states=states_buffer_for_train,offset=i0) actions_buffer_for_train[i0:i1] = action rewards_buffer_for_train[i0:i1] = reward terminate_flags_buffer_for_train[i0:i1] = terminate_flag states = nd.array(states_buffer_for_train[:, :-1], ctx=q_ctx) / float(255.0) next_states = nd.array(states_buffer_for_train[:, 1:], ctx=q_ctx) / float(255.0) actions = nd.array(actions_buffer_for_train, ctx=q_ctx) rewards = nd.array(rewards_buffer_for_train, ctx=q_ctx) terminate_flags = nd.array(terminate_flags_buffer_for_train, ctx=q_ctx) # 3.2 Use the target network to compute the scores and # get the corresponding target rewards if not args.double_q: target_qval = target_qnet.forward( batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval))\ * (1.0 - terminate_flags) * discount else: target_qval = target_qnet.forward( batch_size=minibatch_size, data=next_states)[0] qval = qnet.forward(batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval))\ * (1.0 - terminate_flags) * discount outputs = qnet.forward(batch_size=minibatch_size, is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards) qnet.backward(batch_size=minibatch_size) if args.kv_type is None or use_easgd: qnet.update(updater=updater) else: update_on_kvstore(kv, qnet.params, qnet.params_grad) # 3.3 Calculate Loss diff = nd.abs( nd.choose_element_0index(outputs[0], actions) - target_rewards) quadratic_part = nd.clip(diff, -1, 1) loss = (0.5 * nd.sum(nd.square(quadratic_part)) + nd.sum(diff - quadratic_part)).asscalar() if ave_loss == 0: ave_loss = loss else: ave_loss = 0.95 * ave_loss + 0.05 * loss # 3.3 Update the target network every freeze_interval # (We can do annealing instead of hard copy) if training_steps % freeze_interval == 0: qnet.copy_params_to(target_qnet) if args.optimizer == "rmsprop" or args.optimizer == "rmspropnoncentered": optimizer.lr -= lr_decay if save_screens and training_steps % ( 60 * 60 * 2 / param_update_period) == 0: logging.info("saving screenshots") for g in range(nactor): screen = states_buffer_for_train[( g * single_batch_size), -2, :, :].reshape( states_buffer_for_train.shape[2:]) cv2.imwrite("screen_" + str(g) + ".png", screen) training_steps += 1 end = time.time() fps = steps_per_epoch / (end - start) qnet.save_params(dir_path=args.dir_path, epoch=epoch) if args.kv_type != None: logging.info( "Node[%d]: Epoch:%d, FPS:%f, 
Avg Reward: %f/%d" % (kv.rank, epoch, fps, epoch_reward / float(episode), episode)) else: logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
def main(): parser = argparse.ArgumentParser( description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.') parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer') parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer') parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer') parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN') parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer') parser.add_argument( '-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.') parser.add_argument( '--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning') parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts') parser.add_argument( '--kvstore-update-period', required=False, type=int, default=1, help='The period that the worker updates the parameters from the sever' ) parser.add_argument( '--kv-type', required=False, type=str, default=None, help= 'type of kvstore, default will not use kvstore, could also be dist_async' ) parser.add_argument('--optimizer', required=False, type=str, default="adagrad", help='type of optimizer') args = parser.parse_args() ## custom args.start_eps = 0.2 args.replay_start_size = 1000 if args.dir_path == '': rom_name = os.path.splitext(os.path.basename(args.rom))[0] args.dir_path = 'dqn-%s-lr%g' % (rom_name, args.lr) replay_start_size = args.replay_start_size max_start_nullops = 30 replay_memory_size = 1000000 history_length = 4 rows = 84 cols = 84 ctx = parse_ctx(args.ctx) q_ctx = mx.Context(*ctx[0]) game = AtariGame(rom_path=args.rom, resize_mode='resize', replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, display_screen=args.visualization, history_length=history_length) ##RUN NATURE freeze_interval = 10000 epoch_num = 30 steps_per_epoch = 100000 update_interval = 4 discount = 0.99 eps_start = args.start_eps eps_min = 0.1 eps_decay = (eps_start - eps_min) / 1000000 eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 action_num = len(game.action_set) data_shapes = { 'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size, ), 'dqn_reward': (minibatch_size, ) } dqn_sym = dqn_sym_nature(action_num) qnet = Base(data_shapes=data_shapes, sym_gen=dqn_sym, name='QNet', initializer=DQNInitializer(factor_type="in"), ctx=q_ctx) target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx) use_easgd = False optimizer = mx.optimizer.create(name=args.optimizer, learning_rate=args.lr, eps=args.eps, clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd) updater = mx.optimizer.get_updater(optimizer) qnet.print_stat() target_qnet.print_stat() # Begin Playing Game training_steps = 0 total_steps = 0 for epoch in range(epoch_num): # Run 
Epoch steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() game.start() while steps_left > 0: # Running New Episode episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 time_episode_start = time.time() game.begin_episode(steps_left) while not game.episode_terminate: # 1. We need to choose a new action based on the current game status if game.state_enabled and game.replay_memory.sample_enabled: do_exploration = (npy_rng.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action = npy_rng.randint(action_num) else: current_state = game.current_state() state = nd.array( current_state.reshape((1, ) + current_state.shape), ctx=q_ctx) / float(255.0) qval_npy = qnet.forward(is_train=False, data=state)[0].asnumpy() action = numpy.argmax(qval_npy) episode_q_value += qval_npy[0, action] episode_action_step += 1 else: action = npy_rng.randint(action_num) # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times) game.play(action) total_steps += 1 # 3. Update our Q network if we can start sampling from the replay memory # Also, we update every `update_interval` if total_steps % update_interval == 0 and game.replay_memory.sample_enabled: # 3.1 Draw sample from the replay_memory training_steps += 1 episode_update_step += 1 states, actions, rewards, next_states, terminate_flags \ = game.replay_memory.sample(batch_size=minibatch_size) states = nd.array(states, ctx=q_ctx) / float(255.0) next_states = nd.array(next_states, ctx=q_ctx) / float(255.0) actions = nd.array(actions, ctx=q_ctx) rewards = nd.array(rewards, ctx=q_ctx) terminate_flags = nd.array(terminate_flags, ctx=q_ctx) # 3.2 Use the target network to compute the scores and # get the corresponding target rewards if not args.double_q: target_qval = target_qnet.forward(is_train=False, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval)) \ * (1.0 - terminate_flags) * discount else: target_qval = target_qnet.forward(is_train=False, data=next_states)[0] qval = qnet.forward(is_train=False, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval)) \ * (1.0 - terminate_flags) * discount outputs = qnet.forward(is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards) qnet.backward() qnet.update(updater=updater) # 3.3 Calculate Loss diff = nd.abs( nd.choose_element_0index(outputs[0], actions) - target_rewards) quadratic_part = nd.clip(diff, -1, 1) loss = 0.5 * nd.sum(nd.square(quadratic_part)).asnumpy()[0] + \ nd.sum(diff - quadratic_part).asnumpy()[0] episode_loss += loss # 3.3 Update the target network every freeze_interval if training_steps % freeze_interval == 0: qnet.copy_params_to(target_qnet) steps_left -= game.episode_step time_episode_end = time.time() # Update the statistics epoch_reward += game.episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward, game.episode_step / (time_episode_end - time_episode_start), eps_curr) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % ( episode_loss / episode_update_step, episode_update_step) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d" % ( episode_q_value / episode_action_step, episode_action_step) if episode % 100 == 0: logging.info(info_str) end = time.time() fps = steps_per_epoch / (end 
- start) # qnet.save_params(dir_path=args.dir_path, epoch=epoch) logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
from mxnet import gluon
from mxnet import ndarray as nd
from mxnet import autograd
from mxnet.gluon import data as gdata
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt
import mxnet

losser = gluon.loss.L2Loss()

X = nd.random_uniform(-100, 100, shape=(1000))
Y = nd.random_uniform(-100, 100, shape=(1000))
XY = nd.concat(X.reshape((-1, 1)), Y.reshape((-1, 1)), dim=1)
Z = nd.exp(-nd.abs(X - Y))

figure = plt.figure()
axes = plt.subplot(111, projection='3d')
axes.scatter(X.asnumpy(), Y.asnumpy(), Z.asnumpy(), 'r.')

net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(200, activation='relu'))
net.add(gluon.nn.Dense(200, activation='relu'))
net.add(gluon.nn.Dense(1))
net.collect_params().initialize(mxnet.initializer.Xavier())

batch_size = 1000
dataset = gdata.ArrayDataset(XY, Z)
dataiter = gdata.DataLoader(dataset, batch_size=batch_size)
trainer = gluon.Trainer(net.collect_params(), 'adam', {
    'beta1': .9,
    'beta2': .999
})
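# Hedged sketch (not in the original snippet): a plain gluon training loop that fits the
# network above to Z = exp(-|X - Y|) using the L2Loss, DataLoader and Trainer already
# defined. The epoch count and the final overlay plot are assumptions for illustration.
epochs = 100
for epoch in range(epochs):
    for data, label in dataiter:
        with autograd.record():
            output = net(data)
            loss = losser(output, label)
        loss.backward()
        trainer.step(batch_size)
    print('epoch %d, loss %.6f' % (epoch, loss.mean().asscalar()))

# Overlay the fitted surface on the scatter plot for a visual check.
Z_hat = net(XY)
axes.scatter(X.asnumpy(), Y.asnumpy(), Z_hat.asnumpy(), 'b.')
plt.show()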
def inference_g(self, observed_arr):
    '''
    Inference with generator.

    Args:
        observed_arr:   `mxnet.ndarray` of observed data points.

    Returns:
        Tuple data.
        - re-parametric data.
        - encoded data points.
        - re-encoded data points.
    '''
    generated_arr, encoded_arr, re_encoded_arr = super().inference_g(observed_arr)

    if autograd.is_recording():
        limit = self.long_term_seq_len
        seq_len = self.noise_sampler.seq_len
        self.noise_sampler.seq_len = limit
        long_term_observed_arr = self.noise_sampler.draw()

        observed_mean_arr = nd.expand_dims(nd.mean(long_term_observed_arr, axis=1), axis=1)
        sum_arr = None
        for seq in range(2, long_term_observed_arr.shape[1]):
            add_arr = nd.sum(long_term_observed_arr[:, :seq] - observed_mean_arr, axis=1)
            if sum_arr is None:
                sum_arr = nd.expand_dims(add_arr, axis=0)
            else:
                sum_arr = nd.concat(sum_arr, nd.expand_dims(add_arr, axis=0), dim=0)
        max_arr = nd.max(sum_arr, axis=0)
        min_arr = nd.min(sum_arr, axis=0)
        diff_arr = long_term_observed_arr - observed_mean_arr
        std_arr = nd.power(nd.mean(nd.square(diff_arr), axis=1), 1/2)
        R_S_arr = (max_arr - min_arr) / std_arr
        len_arr = nd.ones_like(R_S_arr, ctx=R_S_arr.context) * np.log(long_term_observed_arr.shape[1] / 2)
        observed_H_arr = nd.log(R_S_arr) / len_arr
        self.noise_sampler.seq_len = seq_len

        g_min_arr = nd.expand_dims(generated_arr.min(axis=1), axis=1)
        g_max_arr = nd.expand_dims(generated_arr.max(axis=1), axis=1)
        o_min_arr = nd.expand_dims(observed_arr.min(axis=1), axis=1)
        o_max_arr = nd.expand_dims(observed_arr.max(axis=1), axis=1)

        _observed_arr = generated_arr
        long_term_generated_arr = None
        for i in range(limit):
            generated_arr, _, _ = super().inference_g(_observed_arr)

            g_min_arr = nd.expand_dims(generated_arr.min(axis=1), axis=1)
            g_max_arr = nd.expand_dims(generated_arr.max(axis=1), axis=1)
            o_min_arr = nd.expand_dims(_observed_arr.min(axis=1), axis=1)
            o_max_arr = nd.expand_dims(_observed_arr.max(axis=1), axis=1)
            generated_arr = (generated_arr - g_min_arr) / (g_max_arr - g_min_arr)
            generated_arr = (o_max_arr - o_min_arr) * generated_arr
            generated_arr = o_min_arr + generated_arr

            if self.condition_sampler is not None:
                self.condition_sampler.output_shape = generated_arr.shape
                noise_arr = self.condition_sampler.generate()
                generated_arr += noise_arr

            if long_term_generated_arr is None:
                long_term_generated_arr = generated_arr
            else:
                long_term_generated_arr = nd.concat(long_term_generated_arr, generated_arr, dim=1)

            _observed_arr = generated_arr

        generated_mean_arr = nd.expand_dims(nd.mean(long_term_generated_arr, axis=1), axis=1)
        sum_arr = None
        for seq in range(2, long_term_generated_arr.shape[1]):
            add_arr = nd.sum(long_term_generated_arr[:, :seq] - generated_mean_arr, axis=1)
            if sum_arr is None:
                sum_arr = nd.expand_dims(add_arr, axis=0)
            else:
                sum_arr = nd.concat(sum_arr, nd.expand_dims(add_arr, axis=0), dim=0)
        max_arr = nd.max(sum_arr, axis=0)
        min_arr = nd.min(sum_arr, axis=0)
        diff_arr = long_term_generated_arr - generated_mean_arr
        std_arr = nd.power(nd.mean(nd.square(diff_arr), axis=1), 1/2)
        R_S_arr = (max_arr - min_arr) / std_arr
        len_arr = nd.ones_like(R_S_arr, ctx=R_S_arr.context) * np.log(long_term_generated_arr.shape[1] / 2)
        generated_H_arr = nd.log(R_S_arr) / len_arr

        multi_fractal_loss = nd.abs(generated_H_arr - observed_H_arr)
        multi_fractal_loss = nd.mean(multi_fractal_loss, axis=0, exclude=True)
        multi_fractal_loss = nd.expand_dims(multi_fractal_loss, axis=-1)
        multi_fractal_loss = nd.expand_dims(multi_fractal_loss, axis=-1)

        generated_arr = generated_arr + multi_fractal_loss

    return generated_arr, encoded_arr, re_encoded_arr
import mxnet as mx
from mxnet import gluon
from mxnet import ndarray as nd

from .score_fun import *
from .. import *


def logsigmoid(val):
    max_elem = nd.maximum(0., -val)
    z = nd.exp(-max_elem) + nd.exp(-val - max_elem)
    return -(max_elem + nd.log(z))

get_device = lambda args: mx.gpu(args.gpu[0]) if args.gpu[0] >= 0 else mx.cpu()
norm = lambda x, p: nd.sum(nd.abs(x) ** p)
get_scalar = lambda x: x.detach().asscalar()
reshape = lambda arr, x, y: arr.reshape(x, y)
cuda = lambda arr, gpu: arr.as_in_context(mx.gpu(gpu))


class ExternalEmbedding:
    """Sparse Embedding for Knowledge Graph

    It is used to store both entity embeddings and relation embeddings.

    Parameters
    ----------
    args :
    """
def main(): parser = argparse.ArgumentParser(description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-v', '--visualization', action='store_true', help='Visualize the runs.') parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer') parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer') parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer') parser.add_argument('--double-q', action='store_true', help='Use Double DQN only if specified') parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer') parser.add_argument('-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.') parser.add_argument('--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning') parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts') parser.add_argument('--kvstore-update-period', required=False, type=int, default=1, help='The period that the worker updates the parameters from the sever') parser.add_argument('--kv-type', required=False, type=str, default=None, help='type of kvstore, default will not use kvstore, could also be dist_async') parser.add_argument('--optimizer', required=False, type=str, default="adagrad", help='type of optimizer') args = parser.parse_args() if args.dir_path == '': rom_name = os.path.splitext(os.path.basename(args.rom))[0] args.dir_path = 'dqn-%s-lr%g' % (rom_name, args.lr) replay_start_size = args.replay_start_size max_start_nullops = 30 replay_memory_size = 1000000 history_length = 4 rows = 84 cols = 84 ctx = parse_ctx(args.ctx) q_ctx = mx.Context(*ctx[0]) game = AtariGame(rom_path=args.rom, resize_mode='scale', replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, display_screen=args.visualization, history_length=history_length) ##RUN NATURE freeze_interval = 10000 epoch_num = 200 steps_per_epoch = 250000 update_interval = 4 discount = 0.99 eps_start = args.start_eps eps_min = 0.1 eps_decay = (eps_start - eps_min) / 1000000 eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 action_num = len(game.action_set) data_shapes = {'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size,), 'dqn_reward': (minibatch_size,)} dqn_sym = dqn_sym_nature(action_num) qnet = Base(data_shapes=data_shapes, sym_gen=dqn_sym, name='QNet', initializer=DQNInitializer(factor_type="in"), ctx=q_ctx) target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx) use_easgd = False optimizer = mx.optimizer.create(name=args.optimizer, learning_rate=args.lr, eps=args.eps, clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd) updater = mx.optimizer.get_updater(optimizer) qnet.print_stat() target_qnet.print_stat() # Begin Playing Game training_steps = 0 total_steps = 0 for epoch in range(epoch_num): # Run Epoch steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() 
game.start() while steps_left > 0: # Running New Episode episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 time_episode_start = time.time() game.begin_episode(steps_left) while not game.episode_terminate: # 1. We need to choose a new action based on the current game status if game.state_enabled and game.replay_memory.sample_enabled: do_exploration = (npy_rng.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action = npy_rng.randint(action_num) else: # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each # We can simply stack the current_state() of gaming instances and give prediction for all of them # We need to wait after calling calc_score(.), which makes the program slow # TODO Profiling the speed of this part! current_state = game.current_state() state = nd.array(current_state.reshape((1,) + current_state.shape), ctx=q_ctx) / float(255.0) qval_npy = qnet.forward(is_train=False, data=state)[0].asnumpy() action = numpy.argmax(qval_npy) episode_q_value += qval_npy[0, action] episode_action_step += 1 else: action = npy_rng.randint(action_num) # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times) game.play(action) total_steps += 1 # 3. Update our Q network if we can start sampling from the replay memory # Also, we update every `update_interval` if total_steps % update_interval == 0 and game.replay_memory.sample_enabled: # 3.1 Draw sample from the replay_memory training_steps += 1 episode_update_step += 1 states, actions, rewards, next_states, terminate_flags \ = game.replay_memory.sample(batch_size=minibatch_size) states = nd.array(states, ctx=q_ctx) / float(255.0) next_states = nd.array(next_states, ctx=q_ctx) / float(255.0) actions = nd.array(actions, ctx=q_ctx) rewards = nd.array(rewards, ctx=q_ctx) terminate_flags = nd.array(terminate_flags, ctx=q_ctx) # 3.2 Use the target network to compute the scores and # get the corresponding target rewards if not args.double_q: target_qval = target_qnet.forward(is_train=False, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval))\ * (1.0 - terminate_flags) * discount else: target_qval = target_qnet.forward(is_train=False, data=next_states)[0] qval = qnet.forward(is_train=False, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval))\ * (1.0 - terminate_flags) * discount outputs = qnet.forward(is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards) qnet.backward() qnet.update(updater=updater) # 3.3 Calculate Loss diff = nd.abs(nd.choose_element_0index(outputs[0], actions) - target_rewards) quadratic_part = nd.clip(diff, -1, 1) loss = 0.5 * nd.sum(nd.square(quadratic_part)).asnumpy()[0] +\ nd.sum(diff - quadratic_part).asnumpy()[0] episode_loss += loss # 3.3 Update the target network every freeze_interval if training_steps % freeze_interval == 0: qnet.copy_params_to(target_qnet) steps_left -= game.episode_step time_episode_end = time.time() # Update the statistics epoch_reward += game.episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward, game.episode_step / (time_episode_end - time_episode_start), eps_curr) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step, 
episode_update_step) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d" % (episode_q_value / episode_action_step, episode_action_step) if episode % 100 == 0: logging.info(info_str) end = time.time() fps = steps_per_epoch / (end - start) qnet.save_params(dir_path=args.dir_path, epoch=epoch) logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
def GD(classifier, alpha, beta, gamma, max_iterations=100, learning_rate=50.0,
       learning_rate_decay=0.9, momentum=0.5):
    # Random initialization
    X = nd.abs(nd.random_normal(scale=1, shape=(1, *classifier.input_shape)))
    # audio_path_label_pairs = load_audio_path_label_pairs()
    # shuffle(audio_path_label_pairs)
    # audio_path, actual_label_id = audio_path_label_pairs[0]
    # mg = classifier.compute_melgram(audio_path)
    # X = nd.array(np.expand_dims(mg, axis=0), ctx=classifier.model_ctx)
    X = X.as_in_context(classifier.model_ctx)

    # GD with momentum
    eta = -1.0 * learning_rate
    prev_grad = nd.zeros(shape=X.shape)
    losses = []
    cls_losses = []
    sty_losses = []
    pct_losses = []
    l1s = []
    for t in range(max_iterations):
        # Projection
        X = nd.maximum(X, 0.0)
        X = nd.minimum(X, 1.0)

        # Save as .csv
        img = X[0, 0, :, :].asnumpy()
        np.savetxt('./temp/iter%d.csv' % t, img)

        # Calculate losses and gradients
        cls_loss = classifier_loss(X, classifier)
        sty_loss = style_loss(X)
        pct_loss = perceptual_loss(X)
        l1 = l1_regularization(X)

        # Weighting
        loss = cls_loss[0] + alpha * sty_loss[0] + beta * pct_loss[0] + gamma * l1[0]
        grad = cls_loss[1] + alpha * sty_loss[1] + beta * pct_loss[1] + gamma * l1[1]

        # Store losses
        print("Iteration %d: %.2f | (%.2f, %.2f, %.2f, %.2f)"
              % (t, loss, cls_loss[0], sty_loss[0], pct_loss[0], l1[0]))
        # print("Iteration %d: %.2f | (%.2f, %.2f, %.2f)" % (t, loss, cls_loss[0], sty_loss[0], pct_loss[0]))
        losses.append(loss)
        cls_losses.append(cls_loss[0])
        sty_losses.append(sty_loss[0])
        pct_losses.append(pct_loss[0])
        l1s.append(l1[0])

        # Update
        X = X - eta * (nd.array(grad) + momentum * prev_grad)
        eta = eta * learning_rate_decay
        prev_grad = grad
def abs(val):
    return nd.abs(val)
def poly_kernels(self, x: NDArray, y: NDArray):
    prod = nd.dot(x, y)
    return nd.sign(prod) * nd.abs(prod) ** 2
def calculate_overdrive(initial, final, tau):
    overdrive = 0
    for i, g in enumerate(initial):
        overdrive += (nd.abs(g - final[i]) > tau).sum().asscalar()
    return int(overdrive)
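# Usage illustration with assumed toy inputs: the function counts how many entries moved
# by more than tau between the initial and final parameter lists.
initial = [nd.array([0.1, 0.5]), nd.array([0.9])]
final = [nd.array([0.1, 0.9]), nd.array([0.2])]
print(calculate_overdrive(initial, final, tau=0.3))   # -> 2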
def replace_inf_with_zero(x):
    return nd.where(nd.abs(x) == np.inf, nd.zeros_like(x), x)
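# Usage illustration (assumes `nd` and `np` are imported as in the snippet above):
# both +inf and -inf are zeroed, finite values pass through unchanged.
x = nd.array([1.0, np.inf, -np.inf, -2.5])
print(replace_inf_with_zero(x))   # -> [1.0, 0.0, 0.0, -2.5]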
def main(): parser = argparse.ArgumentParser(description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.') parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer') parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer') parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer') parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN') parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer') parser.add_argument('-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.') parser.add_argument('--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning') parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts') parser.add_argument('--kvstore-update-period', required=False, type=int, default=1, help='The period that the worker updates the parameters from the sever') parser.add_argument('--kv-type', required=False, type=str, default=None, help='type of kvstore, default will not use kvstore, could also be dist_async') args, unknown = parser.parse_known_args() if args.dir_path == '': rom_name = os.path.splitext(os.path.basename(args.rom))[0] args.dir_path = 'dqn-%s' % rom_name ctx = re.findall('([a-z]+)(\d*)', args.ctx) ctx = [(device, int(num)) if len(num) >0 else (device, 0) for device, num in ctx] replay_start_size = args.replay_start_size max_start_nullops = 30 replay_memory_size = 1000000 history_length = 4 rows = 84 cols = 84 q_ctx = mx.Context(*ctx[0]) game = AtariGame(rom_path=args.rom, resize_mode='scale', replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, display_screen=args.visualization, history_length=history_length) ##RUN NATURE freeze_interval = 10000 epoch_num = 200 steps_per_epoch = 250000 update_interval = 4 discount = 0.99 eps_start = args.start_eps eps_min = 0.1 eps_decay = (eps_start - 0.1) / 1000000 eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 action_num = len(game.action_set) data_shapes = {'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size,), 'dqn_reward': (minibatch_size,)} #optimizer = mx.optimizer.create(name='sgd', learning_rate=args.lr,wd=args.wd) optimizer = mx.optimizer.Nop() dqn_output_op = DQNOutputNpyOp() dqn_sym = dqn_sym_nature(action_num, dqn_output_op) qnet = Base(data_shapes=data_shapes, sym=dqn_sym, name='QNet', initializer=DQNInitializer(factor_type="in"), ctx=q_ctx) target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx) # Create kvstore testShape = (1,1686180*100) testParam = nd.ones(testShape,ctx=q_ctx) testGrad = nd.zeros(testShape,ctx=q_ctx) # Create kvstore if args.kv_type != None: kvType = args.kv_type kvStore = kvstore.create(kvType) #Initialize kvstore for idx,v in 
enumerate(qnet.params.values()): kvStore.init(idx,v); # Set optimizer on kvstore kvStore.set_optimizer(optimizer) kvstore_update_period = args.kvstore_update_period else: updater = mx.optimizer.get_updater(optimizer) # if args.kv_type != None: # kvType = args.kv_type # kvStore = kvstore.create(kvType) # kvStore.init(0,testParam) # testOptimizer = mx.optimizer.Nop() # kvStore.set_optimizer(testOptimizer) # kvstore_update_period = args.kvstore_update_period qnet.print_stat() target_qnet.print_stat() # Begin Playing Game training_steps = 0 total_steps = 0 while(1): time_before_wait = time.time() # kvStore.push(0,testGrad,priority=0) # kvStore.pull(0,testParam,priority=0) # testParam.wait_to_read() for paramIndex in range(len(qnet.params)):#range(6):# k=qnet.params.keys()[paramIndex] kvStore.push(paramIndex,qnet.params_grad[k],priority=-paramIndex) kvStore.pull(paramIndex,qnet.params[k],priority=-paramIndex) for v in qnet.params.values(): v.wait_to_read() logging.info("wait time %f" %(time.time()-time_before_wait)) for epoch in xrange(epoch_num): # Run Epoch steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() game.start() while steps_left > 0: # Running New Episode episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 time_episode_start = time.time() game.begin_episode(steps_left) while not game.episode_terminate: # 1. We need to choose a new action based on the current game status if game.state_enabled and game.replay_memory.sample_enabled: do_exploration = (npy_rng.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action = npy_rng.randint(action_num) else: # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each # We can simply stack the current_state() of gaming instances and give prediction for all of them # We need to wait after calling calc_score(.), which makes the program slow # TODO Profiling the speed of this part! current_state = game.current_state() state = nd.array(current_state.reshape((1,) + current_state.shape), ctx=q_ctx) / float(255.0) qval_npy = qnet.forward(batch_size=1, data=state)[0].asnumpy() action = numpy.argmax(qval_npy) episode_q_value += qval_npy[0, action] episode_action_step += 1 else: action = npy_rng.randint(action_num) # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times) game.play(action) total_steps += 1 # 3. 
Update our Q network if we can start sampling from the replay memory # Also, we update every `update_interval` if total_steps % update_interval == 0 and game.replay_memory.sample_enabled: # 3.1 Draw sample from the replay_memory training_steps += 1 episode_update_step += 1 states, actions, rewards, next_states, terminate_flags \ = game.replay_memory.sample(batch_size=minibatch_size) states = nd.array(states, ctx=q_ctx) / float(255.0) next_states = nd.array(next_states, ctx=q_ctx) / float(255.0) actions = nd.array(actions, ctx=q_ctx) rewards = nd.array(rewards, ctx=q_ctx) terminate_flags = nd.array(terminate_flags, ctx=q_ctx) # 3.2 Use the target network to compute the scores and # get the corresponding target rewards if not args.double_q: target_qval = target_qnet.forward(batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval))\ * (1.0 - terminate_flags) * discount else: target_qval = target_qnet.forward(batch_size=minibatch_size, data=next_states)[0] qval = qnet.forward(batch_size=minibatch_size, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval))\ * (1.0 - terminate_flags) * discount outputs = qnet.forward(batch_size=minibatch_size,is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards) qnet.backward(batch_size=minibatch_size) nd.waitall() time_before_update = time.time() if args.kv_type != None: if total_steps % kvstore_update_period == 0: update_to_kvstore(kvStore,qnet.params,qnet.params_grad) else: qnet.update(updater=updater) logging.info("update time %f" %(time.time()-time_before_update)) time_before_wait = time.time() nd.waitall() logging.info("wait time %f" %(time.time()-time_before_wait)) '''nd.waitall() time_before_wait = time.time() kvStore.push(0,testGrad,priority=0) kvStore.pull(0,testParam,priority=0) nd.waitall() logging.info("wait time %f" %(time.time()-time_before_wait))''' # 3.3 Calculate Loss diff = nd.abs(nd.choose_element_0index(outputs[0], actions) - target_rewards) quadratic_part = nd.clip(diff, -1, 1) loss = (0.5 * nd.sum(nd.square(quadratic_part)) + nd.sum(diff - quadratic_part)).asscalar() episode_loss += loss # 3.3 Update the target network every freeze_interval # (We can do annealing instead of hard copy) if training_steps % freeze_interval == 0: qnet.copy_params_to(target_qnet) steps_left -= game.episode_step time_episode_end = time.time() # Update the statistics epoch_reward += game.episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward, game.episode_step / (time_episode_end - time_episode_start), eps_curr) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step, episode_update_step) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d" % (episode_q_value / episode_action_step, episode_action_step) logging.info(info_str) end = time.time() fps = steps_per_epoch / (end - start) qnet.save_params(dir_path=args.dir_path, epoch=epoch) logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
def main(): parser = argparse.ArgumentParser( description='Script to test the trained network on a game.') parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.') parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.') parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer') parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer') parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer') parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN') parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer') parser.add_argument( '-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`') parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.') parser.add_argument( '--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning') parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts') parser.add_argument( '--kvstore-update-period', required=False, type=int, default=1, help='The period that the worker updates the parameters from the sever' ) parser.add_argument( '--kv-type', required=False, type=str, default=None, help= 'type of kvstore, default will not use kvstore, could also be dist_async' ) parser.add_argument('--optimizer', required=False, type=str, default="adagrad", help='type of optimizer') args = parser.parse_args() if args.dir_path == '': rom_name = os.path.splitext(os.path.basename(args.rom))[0] args.dir_path = 'dqn-%s-lr%g' % (rom_name, args.lr) replay_start_size = args.replay_start_size max_start_nullops = 30 replay_memory_size = 1000000 history_length = 4 rows = 84 cols = 84 ctx = parse_ctx(args.ctx) q_ctx = mx.Context(*ctx[0]) game = AtariGame(rom_path=args.rom, resize_mode='scale', replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops, replay_memory_size=replay_memory_size, display_screen=args.visualization, history_length=history_length) ##RUN NATURE freeze_interval = 10000 epoch_num = 200 steps_per_epoch = 250000 update_interval = 4 discount = 0.99 eps_start = args.start_eps eps_min = 0.1 eps_decay = (eps_start - eps_min) / 1000000 eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 action_num = len(game.action_set) data_shapes = { 'data': (minibatch_size, history_length) + (rows, cols), 'dqn_action': (minibatch_size, ), 'dqn_reward': (minibatch_size, ) } dqn_sym = dqn_sym_nature(action_num) qnet = Base(data_shapes=data_shapes, sym_gen=dqn_sym, name='QNet', initializer=DQNInitializer(factor_type="in"), ctx=q_ctx) target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx) use_easgd = False if args.optimizer != "easgd": optimizer = mx.optimizer.create(name=args.optimizer, learning_rate=args.lr, eps=args.eps, clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd) else: use_easgd = True easgd_beta = 0.9 easgd_p = 4 easgd_alpha = easgd_beta / (args.kvstore_update_period * easgd_p) server_optimizer = mx.optimizer.create(name="ServerEASGD", 
learning_rate=easgd_alpha) easgd_eta = 0.00025 local_optimizer = mx.optimizer.create(name='adagrad', learning_rate=args.lr, eps=args.eps, clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd) central_weight = OrderedDict([(n, nd.zeros(v.shape, ctx=q_ctx)) for n, v in qnet.params.items()]) # Create KVStore if args.kv_type != None: kv = kvstore.create(args.kv_type) #Initialize KVStore for idx, v in enumerate(qnet.params.values()): kv.init(idx, v) # Set Server optimizer on KVStore if not use_easgd: kv.set_optimizer(optimizer) else: kv.set_optimizer(server_optimizer) local_updater = mx.optimizer.get_updater(local_optimizer) kvstore_update_period = args.kvstore_update_period args.dir_path = args.dir_path + "-" + str(kv.rank) else: updater = mx.optimizer.get_updater(optimizer) qnet.print_stat() target_qnet.print_stat() # Begin Playing Game training_steps = 0 total_steps = 0 for epoch in xrange(epoch_num): # Run Epoch steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() game.start() while steps_left > 0: # Running New Episode episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 time_episode_start = time.time() game.begin_episode(steps_left) while not game.episode_terminate: # 1. We need to choose a new action based on the current game status if game.state_enabled and game.replay_memory.sample_enabled: do_exploration = (npy_rng.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action = npy_rng.randint(action_num) else: # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each # We can simply stack the current_state() of gaming instances and give prediction for all of them # We need to wait after calling calc_score(.), which makes the program slow # TODO Profiling the speed of this part! current_state = game.current_state() state = nd.array( current_state.reshape((1, ) + current_state.shape), ctx=q_ctx) / float(255.0) qval_npy = qnet.forward(is_train=False, data=state)[0].asnumpy() action = numpy.argmax(qval_npy) episode_q_value += qval_npy[0, action] episode_action_step += 1 else: action = npy_rng.randint(action_num) # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times) game.play(action) total_steps += 1 # 3. 
Update our Q network if we can start sampling from the replay memory # Also, we update every `update_interval` if total_steps % update_interval == 0 and game.replay_memory.sample_enabled: # 3.1 Draw sample from the replay_memory training_steps += 1 episode_update_step += 1 states, actions, rewards, next_states, terminate_flags \ = game.replay_memory.sample(batch_size=minibatch_size) states = nd.array(states, ctx=q_ctx) / float(255.0) next_states = nd.array(next_states, ctx=q_ctx) / float(255.0) actions = nd.array(actions, ctx=q_ctx) rewards = nd.array(rewards, ctx=q_ctx) terminate_flags = nd.array(terminate_flags, ctx=q_ctx) # 3.2 Use the target network to compute the scores and # get the corresponding target rewards if not args.double_q: target_qval = target_qnet.forward(is_train=False, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval))\ * (1.0 - terminate_flags) * discount else: target_qval = target_qnet.forward(is_train=False, data=next_states)[0] qval = qnet.forward(is_train=False, data=next_states)[0] target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval))\ * (1.0 - terminate_flags) * discount outputs = qnet.forward(is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards) qnet.backward() if args.kv_type != None: if use_easgd: if total_steps % kvstore_update_period == 0: for ind, k in enumerate(qnet.params.keys()): kv.pull(ind, central_weight[k], priority=-ind) qnet.params[k][:] -= easgd_alpha * \ (qnet.params[k] - central_weight[k]) kv.push(ind, qnet.params[k], priority=-ind) qnet.update(updater=local_updater) else: update_on_kvstore(kv, qnet.params, qnet.params_grad) else: qnet.update(updater=updater) # 3.3 Calculate Loss diff = nd.abs( nd.choose_element_0index(outputs[0], actions) - target_rewards) quadratic_part = nd.clip(diff, -1, 1) loss = 0.5 * nd.sum(nd.square(quadratic_part)).asnumpy()[0] +\ nd.sum(diff - quadratic_part).asnumpy()[0] episode_loss += loss # 3.3 Update the target network every freeze_interval # (We can do annealing instead of hard copy) if training_steps % freeze_interval == 0: qnet.copy_params_to(target_qnet) steps_left -= game.episode_step time_episode_end = time.time() # Update the statistics epoch_reward += game.episode_reward if args.kv_type != None: info_str = "Node[%d]: " % kv.rank else: info_str = "" info_str += "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward, game.episode_step / (time_episode_end - time_episode_start), eps_curr) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % ( episode_loss / episode_update_step, episode_update_step) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d" % ( episode_q_value / episode_action_step, episode_action_step) logging.info(info_str) end = time.time() fps = steps_per_epoch / (end - start) qnet.save_params(dir_path=args.dir_path, epoch=epoch) if args.kv_type is not None: logging.info( "Node[%d]: Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (kv.rank, epoch, fps, epoch_reward / float(episode), episode)) else: logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
def abs(a):
    return nd.abs(a)