def savegame(config):
    # Step 1: init Game
    env = Environment(config.game_num)  # 1 is for the main game, 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())
    brain = DQN(config)
    # checkStates = None

    # Build the symbol-id -> word mapping for this game
    dic = {}
    with open("symbolMapping" + str(sys.argv[1]) + ".txt", 'r') as fp:
        data = fp.read().split('\n')
    for i in range(len(data) - 1):
        splitdata = data[i].split(' ')
        dic[int(splitdata[1])] = splitdata[0]
    dic[0] = "NULL"

    # Dump the target network's learned word embeddings, one word per block
    fp = open("teacher" + str(sys.argv[1]) + "_embeddings.txt", "w")
    for i in range(config.vocab_size - 1):
        state = np.zeros([config.batch_size, config.seq_length])
        state[:, 0] = i
        embedding = brain.output_embedT.eval(
            feed_dict={brain.stateInputT: state},
            session=brain.session)[0, 0, :]
        print >> fp, dic[i]
        for element in embedding:
            print >> fp, element,
        print >> fp
    fp.close()
    brain.session.close()
def setup_dqn(trainable):
    with open("config.yaml", 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)
    cfg = config["dqn"]
    dqn = DQN(trainable,
              config["learning rate"],
              cfg["discount factor"],
              ReplayMemoryModel.parse_layer_blueprints(cfg["layers"]),
              config["replay min batch"],
              config["replay memory size"])
    return dqn
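# A sketch of the config.yaml shape that setup_dqn() above expects. Only the key
# names ("learning rate", "replay min batch", "replay memory size", and the nested
# "dqn" block) are taken from the lookups in setup_dqn(); every value here, and the
# layer-blueprint format, is an illustrative assumption rather than the project's
# real settings.
import yaml

example_config = {
    "learning rate": 0.001,
    "replay min batch": 32,
    "replay memory size": 100000,
    "dqn": {
        "discount factor": 0.99,
        # blueprint strings consumed by ReplayMemoryModel.parse_layer_blueprints (format assumed)
        "layers": ["dense:128", "dense:64"],
    },
}

print(yaml.safe_dump(example_config, default_flow_style=False))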
def savegame(config):
    # Step 1: init Game
    env = Environment(config.game_num)  # 1 is for the main game, 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())
    brain = DQN(config)
    # checkStates = None

    # Local mapping: symbol id -> word for this game
    dic = {}
    with open("symbolMapping" + str(sys.argv[1]) + ".txt", 'r') as fp:
        data = fp.read().split('\n')
    for i in range(len(data) - 1):
        splitdata = data[i].split(' ')
        dic[int(splitdata[1])] = splitdata[0]
    dic[0] = "NULL"

    # Global mapping: word -> symbol id shared across games
    dic_trans = {}
    with open("symbolMapping1236.txt", 'r') as fp:
        data = fp.read().split('\n')
    for i in range(len(data) - 1):
        splitdata = data[i].split(' ')
        dic_trans[splitdata[0]] = int(splitdata[1])
    dic_trans["NULL"] = 0

    dic_embedding = {}
    # First, initialize every embedding randomly
    sess = tf.InteractiveSession()
    stateInput = tf.placeholder(tf.int32, [len(dic_trans.keys())])
    embed = tf.Variable(tf.random_uniform([len(dic_trans.keys()), 20], -1, 1), name="embed")
    word_embeds = tf.nn.embedding_lookup(embed, stateInput)
    tf.initialize_all_variables().run()
    state = sorted(dic_trans.values())
    state_map = word_embeds.eval(feed_dict={stateInput: state})
    for i in range(len(state)):
        dic_embedding[state[i]] = state_map[i]
    sess.close()

    # Overwrite with the embeddings the trained DQN has learned for this game's vocabulary
    for i in range(config.vocab_size - 1):
        state = np.zeros([config.batch_size, config.seq_length])
        state[:, 0] = i
        embedding = brain.word_embeds.eval(feed_dict={brain.stateInput: state},
                                           session=brain.session)[0, 0]
        dic_embedding[dic_trans[dic[i]]] = embedding
    brain.session.close()

    cpickle.dump(dic_embedding, open("embedTeacher" + str(sys.argv[1]) + ".p", "wb"))
def __init__(self, **kwargs):
    """Initializes the game window."""
    pg.init()
    pg.display.set_caption('Snake')
    self.clock = pg.time.Clock()

    if 'cell_size' in kwargs:
        self.cell_size = kwargs.pop('cell_size', False)
    if 'length' in kwargs:
        self.length = kwargs.pop('length', False)
    if 'height' in kwargs:
        self.height = kwargs.pop('height', False)
    if 'speed' in kwargs:
        self.actions_per_second = kwargs.pop('speed', False)

    self._exit = False
    self.is_training = True
    self.trainig_pressed = False
    self.score = 0
    self.actions = 0
    self.scores = []
    self.averages = []

    # For split-brain network (check each key explicitly; a bare string is always truthy)
    if "sb_dimensions" in kwargs and "sb_lr" in kwargs:
        dimensions = kwargs.pop("sb_dimensions", False)
        lr = kwargs.pop("sb_lr", False)
        self.split_brain_network = SplitBrainNetwork(dimensions=dimensions, lr=lr)

    # DQN
    if ("dqn_dimensions" in kwargs and "dqn_lr" in kwargs
            and "dqn_batch_size" in kwargs and "dqn_sample_size" in kwargs):
        dimensions = kwargs.pop("dqn_dimensions", False)
        lr = kwargs.pop("dqn_lr", False)
        batch_size = kwargs.pop("dqn_batch_size", False)
        sample_size = kwargs.pop("dqn_sample_size", False)
        self.dqn = DQN(dimensions=dimensions, lr=lr,
                       batch_size=batch_size, sample_size=sample_size)
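# Hypothetical usage of the constructor above. The class name SnakeGame and every
# value here are illustrative assumptions; only the keyword names come from the
# checks in __init__.
game = SnakeGame(cell_size=20, length=40, height=30, speed=10,
                 dqn_dimensions=[16, 64, 64, 4], dqn_lr=1e-3,
                 dqn_batch_size=64, dqn_sample_size=1000)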
def play_model(args):
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # Set up networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    args.n_actions = env.action_space.n

    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))

    with torch.no_grad():
        i = 0
        observation = env.reset()
        while i < 5000:
            env.render()
            state = get_screen(env, device)
            action = int(target_net(state).max(1)[1].view(1, 1))
            observation, reward, done, info = env.step(action)
            if done:
                break
            i += 1
    env.close()
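# get_screen() is referenced above but not defined in this excerpt. A minimal
# sketch under assumptions (rgb_array render, resize to 84x84, CHW float tensor
# with a batch dimension); the original preprocessing may crop or scale differently.
import numpy as np
import torch
import torchvision.transforms as T

_resize = T.Compose([T.ToPILImage(), T.Resize((84, 84)), T.ToTensor()])

def get_screen(env, device):
    # gym's rgb_array render returns HWC uint8; convert to CHW float in [0, 1]
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255.0
    screen = torch.from_numpy(screen)
    # resize and add a batch dimension: shape (1, C, H, W)
    return _resize(screen).unsqueeze(0).to(device)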
    # (tail of eval_decision: greedy roll-out over the environment, accumulating loss and reward)
    total_loss = 0.0
    total_reward = 0.0
    cnt = 0
    while not env.is_over():
        curr_state = env.get_curr_state().unsqueeze(0)
        reward, act = net.take_action(Variable(curr_state), use_greedy=True)
        true_reward = Variable(torch.FloatTensor([env.step(act)]), requires_grad=False)
        if use_cuda:
            true_reward = true_reward.cuda()
        loss = criterion(reward, true_reward)
        total_loss += loss.data[0]
        total_reward += true_reward
        cnt = cnt + 1
    return cnt, total_loss, total_reward


if __name__ == '__main__':
    # initializations
    loader = PickleDataReader(data_file, input_size, output_size, data_stride)
    train_set, test_set = loader.load(train_ratio, False)
    train_set = WindowDataSet(train_set)
    net = DQN(config)
    env = Environment(train_set)

    train_net(net, env, max_epoch, non_greedy_decay)
    cnt, loss, reward = eval_decision(net, env, criterion, use_cuda)
    print("step : %d, loss : %f, reward : %f" % (cnt, loss, reward))
def savegame(config):
    # Local mapping: symbol id (as string) -> word for this game
    fp = open('symbolMapping' + str(sys.argv[1]) + '.txt', 'r')
    data = fp.read().split('\n')
    spd = [data_.split(' ')[::-1] for data_ in data]
    dic_local = dict(spd[0:-1])
    dic_local['0'] = 'NULL'
    fp.close()

    # Global mapping: word -> symbol id (as string) shared across games
    fp = open('symbolMapping5.txt', 'r')
    data = fp.read().split('\n')
    spd = [data_.split(' ') for data_ in data]
    dic_global = dict(spd[0:-1])
    dic_global['NULL'] = '0'
    fp.close()

    # Step 1: init Game
    env = Environment(config.game_num)  # 1 is for the main game, 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())
    brain = DQN(config)
    # checkStates = None

    # adding progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')
    episode_length = 0
    num_episodes = 0
    total_reward = 0

    MAX_STEPS = 105000
    totalSteps = 0
    MEM_STEPS = 10000
    memory = []

    while totalSteps < MAX_STEPS:
        totalSteps += 1
        if env.START_NEW_GAME:
            episode_length = 0
            env.START_NEW_GAME = False
            state, reward, terminal, availableObjects = env.newGame()
            brain.history.add(state)
        action_indicator = np.zeros(actions)
        object_indicator = np.zeros(objects)
        # predict
        action_index, object_index = brain.getAction(availableObjects, True)
        Qactions, Qobjects = brain.getQValues(availableObjects)
        action_indicator[action_index] = 1
        object_indicator[object_index] = 1
        # print state
        # record the (converted state, Q-values) triple for the teacher memory
        memory.append((convert_state(state, dic_local, dic_global), Qactions, Qobjects))
        # act
        nextstate, reward, terminal, availableObjects = env.step(action_index, object_index)
        total_reward += reward
        episode_length += 1
        # observe
        brain.setPerception(state, reward, action_indicator, object_indicator,
                            nextstate, terminal, True)
        state = nextstate

        # periodically flush the collected memory to disk
        if (totalSteps % MEM_STEPS == 0):
            fileName = str(config.game_num) + "_mem.txt"
            with open(fileName, "a") as fp:
                for i in range(len(memory)):
                    for j in memory[i][0]:
                        print >> fp, j,
                    print >> fp
                    for j in memory[i][1]:
                        print >> fp, j,
                    print >> fp
                    for j in memory[i][2]:
                        print >> fp, j,
                    print >> fp
            memory = []

        if ((terminal) or ((episode_length % config.max_episode_length) == 0)):
            num_episodes += 1
            with open("saver_reward.txt", "a") as fp:
                print >> fp, (total_reward / (num_episodes * 1.0))
            env.START_NEW_GAME = True

        pbar.update(1)
        if (brain.timeStep) > config.MAX_FRAMES:
            brain.train_writer.close()
            break

    brain.session.close()
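# convert_state() is used above but not defined in this excerpt. A minimal sketch
# under assumptions: `state` is a flat sequence of local symbol ids, and the
# conversion goes local id -> word (dic_local) -> global id (dic_global). The real
# helper may handle batching, padding, or unknown words differently.
def convert_state(state, dic_local, dic_global):
    return [int(dic_global[dic_local[str(int(token))]]) for token in state]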
import pickle
import time

import gym

from models.DQN import DQN

if __name__ == '__main__':
    env = gym.make('CartPole-v0').unwrapped
    RL = DQN(
        i=4,
        o=2,
        h1=512,
        h2=128,
        isinit=False,
        save_file='saves/CartPole/params/ql.pkl',
        hyperparam={
            'learn_rate': 1e-4,  # lr=1e-3 for training
            'weight_decay': 1e-5,
            'reward_decay': 0.1
        })

    data1 = []
    data2 = []
    scores = []
    for ii in range(50000):
        state1 = env.reset()
        for turn in range(10000):
            env.render()
            action = RL.action(state1, e=0.1)  # e=0.3 for training
    game.set_screen_resolution(ScreenResolution.RES_256X144)
    game.init()
    print("Doom initialized.")
    return game


# Create Doom instance
game = vizdoom_init(config_file_path)

n = game.get_available_buttons_size()
actions = [list(a) for a in it.product([0, 1], repeat=n)]

if load_model:
    model = DQN(game, actions, file_name, prioritized=use_prioritized,
                ddqn=use_ddqn, gpu=use_gpu, loading=1)
else:
    model = DQN(game, actions, file_name, prioritized=use_prioritized,
                ddqn=use_ddqn, gpu=use_gpu,
                parameter_exploration=use_parameter_exploration, loading=0)

print("Starting the training.")
time_start = time()
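# Illustration of the action encoding built above (n = 3 is an example value):
# it.product([0, 1], repeat=n) enumerates every combination of button presses,
# and each inner list is passed to ViZDoom as a button-press vector.
import itertools as it

n = 3
actions = [list(a) for a in it.product([0, 1], repeat=n)]
# actions == [[0, 0, 0], [0, 0, 1], [0, 1, 0], ..., [1, 1, 1]]  (2**n = 8 entries)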
    game.set_screen_format(ScreenFormat.CRCGCB)
    game.set_screen_resolution(ScreenResolution.RES_640X360)
    game.init()
    print("Doom initialized.")
    return game


# Create Doom instance
game = vizdoom_init(config_file_path)

n = game.get_available_buttons_size()
actions = [list(a) for a in it.product([0, 1], repeat=n)]

model = DQN(game, actions, file_name, ddqn=use_ddqn,
            parameter_exploration=use_parameter_exploration,
            gpu=use_gpu, loading=1)

print("======================================")
print("Testing trained neural network.")
print("Testing...")

test_scores = []
for _ in range(episodes_to_watch):
    game.new_episode()
    while not game.is_episode_finished():
        model.step(training=False, showing=True)
cfg.TRAIN_DATA = "data/{}_2000-2019.csv".format(df)
cfg.VAL_DATA = "data/{}_2020.csv".format(df)

# Loading datasets
train_data = pd.read_csv(cfg.TRAIN_DATA)
val_data = pd.read_csv(cfg.VAL_DATA)

# Creating environment
env = Environment(train_data, val_data)
env.reset()

######################## Setting up the algorithm and assigning it to the agent ########################

## DQN
if cfg._DQN_ == 1:
    from models.DQN import DQN
    dqn = DQN(env.obs.shape[0], len(env.actions()), device)
    if (cfg.LOAD_MODEL):
        dqn.load_checkpoint()
    agent = Agent(env, dqn, cfg.DQN)

## AC
if cfg._AC_ == 1:
    from models.AC import AC
    ac = AC(env.obs.shape[0], len(env.actions()), device)
    if (cfg.LOAD_MODEL):
        ac.load_checkpoint()
    agent = Agent(env, ac, cfg.AC)

## PPO
if cfg._PPO_ == 1:
    from models.PPO import PPO
import pickle
import time

import gym

from models.DQN import DQN

if __name__ == '__main__':
    env = gym.make('Acrobot-v1').unwrapped
    RL = DQN(
        i=6,
        o=3,
        h1=512,
        h2=128,
        isinit=False,
        save_file='saves/Acrobot/params/ql.pkl',
        hyperparam={
            'learn_rate': 1e-3,  # lr=1e-3 for training
            'weight_decay': 1e-5,
            'reward_decay': 0.01
        })

    scores = []
    for ii in range(1000000):
        state1 = env.reset()
        for turn in range(1000000):
            env.render()
            action = RL.action(state1, e=0.1)  # e=0.3 for training
            state2, reward, done, info = env.step(action)
def playgame(config):
    # Step 1: init Game
    env = Environment(config.game_num)  # 1 is for the main game, 2 is for evaluation
    ###################
    # Step 2: init DQN
    actions = env.action_size()
    objects = env.object_size()
    config.setnumactions(actions)
    config.setnumobjects(objects)
    config.setvocabsize(env.vocab_size())
    brain = DQN(config)
    # checkStates = None

    # adding progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')
    episode_length = 0
    num_episodes = 0
    total_reward = 0

    while True:
        if env.START_NEW_GAME:
            episode_length = 0
            env.START_NEW_GAME = False
            state, reward, terminal, availableObjects = env.newGame()
            brain.history.add(state)
        action_indicator = np.zeros(actions)
        object_indicator = np.zeros(objects)
        # predict
        action_index, object_index = brain.getAction(availableObjects)
        action_indicator[action_index] = 1
        object_indicator[object_index] = 1
        # act
        nextstate, reward, terminal, availableObjects = env.step(action_index, object_index)
        total_reward += reward
        episode_length += 1
        # observe
        brain.setPerception(state, reward, action_indicator, object_indicator,
                            nextstate, terminal, False)
        state = nextstate

        if ((terminal) or ((episode_length % config.max_episode_length) == 0)):
            num_episodes += 1
            with open("train_reward.txt", "a") as fp:
                print >> fp, (total_reward / (num_episodes * 1.0))
            env.START_NEW_GAME = True

        #####################################################################
        # for evaluating qvalues
        if (brain.timeStep % config.EVAL == 0) and (brain.timeStep != 0):
            if (brain.timeStep / config.EVAL == 1):
                if not ((os.path.exists("checkStates.txt")) and
                        (os.path.getsize("checkStates.txt") > 0)):
                    assert config.SAMPLE_STATES % config.BATCH_SIZE == 0
                    assert config.SAMPLE_STATES < brain.memory.count
                    checkStates, _1, _2, _3, _4, _5 = brain.memory.sample()
                    with open("checkStates.txt", "w") as fp:
                        cpickle.dump(checkStates, fp)
                else:
                    with open("checkStates.txt", 'r') as fp:
                        checkStates = cpickle.load(fp)

            evalQValues_a = brain.action_valueT.eval(
                feed_dict={brain.stateInputT: checkStates},
                session=brain.session)
            maxEvalQValues_a = np.max(evalQValues_a, axis=1)
            avgEvalQValues_a = np.mean(maxEvalQValues_a)
            with open("evalQValue_a.txt", "a") as fp:
                print >> fp, avgEvalQValues_a

            evalQValues_o = brain.object_valueT.eval(
                feed_dict={brain.stateInputT: checkStates},
                session=brain.session)
            maxEvalQValues_o = np.max(evalQValues_o, axis=1)
            avgEvalQValues_o = np.mean(maxEvalQValues_o)
            with open("evalQValue_o.txt", "a") as fp:
                print >> fp, avgEvalQValues_o

            #####################################################################
            # save current history before starting evaluation
            # temp_history_data = brain.history.copy()
            # now let us evaluate avg reward
            # create alternate environment for EVALUATION
            # env_eval = Environment(2)
            env_eval = env

            if config.TUTORIAL_WORLD:
                total_reward, nrewards, nepisodes, quest1_reward_cnt, quest2_reward_cnt, quest3_reward_cnt = evaluate(
                    brain, env_eval, config)
            else:
                total_reward, nrewards, nepisodes, quest1_reward_cnt = evaluate(
                    brain, env_eval, config)

            with open("test_reward.txt", "a") as fp:
                print >> fp, total_reward

            # setting the best network
            if len(env_eval.reward_history) == 0 or total_reward > max(env_eval.reward_history):
                # save best network
                if not os.path.exists(os.getcwd() + '/Savednetworks'):
                    os.makedirs(os.getcwd() + '/Savednetworks')
                brain.saver.save(brain.session,
                                 os.getcwd() + '/Savednetworks/' + 'network' + '-dqn',
                                 global_step=brain.timeStep)

            env_eval.reward_history.append(total_reward)  # doing this for keeping track of best network

            # go back to saved frame after evaluation completed
            # brain.history.add(temp_history_data)

            #####################################################################
            if config.TUTORIAL_WORLD:
                brain.inject_summary(
                    {
                        'average.q_a': avgEvalQValues_a,
                        'average.q_o': avgEvalQValues_o,
                        'average.q': (0.5 * avgEvalQValues_o + 0.5 * avgEvalQValues_a),
                        'average_reward': total_reward,
                        'average_num_pos_reward': nrewards,
                        'number_of_episodes': nepisodes,
                        'quest1_average_reward_cnt': quest1_reward_cnt,
                        'quest2_average_reward_cnt': quest2_reward_cnt,
                        'quest3_average_reward_cnt': quest3_reward_cnt
                    }, brain.timeStep)
            else:
                brain.inject_summary(
                    {
                        'average.q_a': avgEvalQValues_a,
                        'average.q_o': avgEvalQValues_o,
                        'average.q': (0.5 * avgEvalQValues_o + 0.5 * avgEvalQValues_a),
                        'average_reward': total_reward,
                        'average_numrewards': nrewards,
                        'number_of_episodes': nepisodes,
                        'quest1_average_reward_cnt': quest1_reward_cnt
                    }, brain.timeStep)

        #####################################################################
        pbar.update(1)
        if (brain.timeStep) > config.MAX_FRAMES:
            brain.train_writer.close()
            break

    brain.session.close()
def train_agent(args):
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # Set up networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    args.n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))
    if args.policyNet:
        # load the policy checkpoint into the policy network (not the target network)
        policy_net.load_state_dict(
            torch.load(args.policyNet, map_location=device))
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)
    args.steps_done = 0

    num_episodes = 1
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, policy_net, args, device)
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model(optimizer, memory, policy_net, target_net, args, device)
            if done:
                episode_durations.append(t + 1)
                break

        # Update the target network, copying all weights and biases in DQN
        if i_episode % args.target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), args.output_policyNet)
            torch.save(target_net.state_dict(), args.output_targetNet)

        if i_episode % 10 == 0:
            print(f'{i_episode+1}/{num_episodes}: Completed Episode.')

    print('Complete')
    env.close()
    torch.save(policy_net.state_dict(), args.output_policyNet)
    torch.save(target_net.state_dict(), args.output_targetNet)
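# optimize_model() is called above but not defined in this excerpt. A minimal
# sketch of the standard DQN update, under assumptions: memory.sample(batch_size)
# returns Transition tuples with fields (state, action, next_state, reward)
# matching memory.push() above, and args carries batch_size and gamma. The
# project's actual implementation may differ.
from collections import namedtuple

import torch
import torch.nn.functional as F

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

def optimize_model(optimizer, memory, policy_net, target_net, args, device):
    if len(memory) < args.batch_size:
        return
    transitions = memory.sample(args.batch_size)
    batch = Transition(*zip(*transitions))

    # Mask of transitions whose episode did not end at this step
    non_final_mask = torch.tensor(tuple(s is not None for s in batch.next_state),
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) from the policy network for the actions that were taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a') for non-terminal next states, 0 for terminal ones
    next_state_values = torch.zeros(args.batch_size, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * args.gamma) + reward_batch

    # Huber loss between predicted and bootstrapped Q-values
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()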
def run_full_experiment(config):
    # archiving old experience
    db.archive_exp(db.get_all_exp())
    db.delete_all_exp()

    util.setup_file_logger(name=config.run_id, filename=config.run_id)
    logger = logging.getLogger(config.run_id)
    start_time = time.time()

    # Define players
    model_1 = DQN(run_id=config.run_id, **config.DQN_params)
    model_2 = model_1.copy()
    epsilon = Epsilon(epsilon_func=config.epsilon_func,
                      max_epsilon=config.max_epsilon,
                      min_epsilon=config.min_epsilon,
                      eval_epsilon=config.eval_epsilon,
                      num_cycles=config.num_cycles,
                      decrement=config.epsilon_decrement)
    player_list = [
        Agent(name=config.bot_1_name, model=model_1, epsilon=epsilon),
        Agent(name=config.bot_2_name, model=model_2, epsilon=epsilon)
    ]
    winner_list = []
    previous_experience_id = 0

    util.save_config(config=config, path=config.run_id)

    # For each cycle
    logger.info('Beginning run titled: ' + config.run_id)
    logger.info(cs.DIVIDER)
    for i in range(1, config.num_cycles + 1):
        # For each episode, play through the episode and insert each state/action pair into the database
        logger.info('Beginning cycle: ' + str(i) + ' / ' + str(config.num_cycles) +
                    '\tCumulative Time Elapsed: ' + util.get_pretty_time(time.time() - start_time))
        logger.info(f'Current Epsilon: {epsilon.get_epsilon(current_cycle=i):.3f}')
        cycle_start_time = time.time()

        # Async parallelization. May want to consider cpu_count - 1 so the user can still use the
        # machine while it runs; costly because of memory copying, I think.
        # with mp.Pool(mp.cpu_count() - 1) as pool:
        #     game_output = pool.starmap_async(parallel.play_game, [(config.game, player_list, config.run_id, i) for j in range(config.episodes_per_cycle)]).get()

        # Old serial method
        winner_list += pu.play_games(num_games=config.episodes_per_cycle,
                                     name=config.game,
                                     players=player_list,
                                     run_id=config.run_id,
                                     current_cycle=i,
                                     config=config)
        logger.info('Data collection complete.\tTotal Episode Time: ' +
                    util.get_pretty_time(time.time() - cycle_start_time))

        logger.info('Loading experience and training model...')
        training_start_time = time.time()

        # Import data from the database based on the experience replay buffer and train the model
        pu.train_model(model=model_1, config=config)
        logger.info('Model training complete.\tTotal Training Time: ' +
                    util.get_pretty_time(time.time() - training_start_time))

        # Update model_2
        if i % config.player_2_update_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info('Storing history and setting model 2 equal to model 1...')
            player_list[0].model.policy_net.store_history()
            player_list[1].set_model(model=model_1.copy())

        # Benchmark
        if i % config.benchmark_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info('Benchmarking...')

            # Player 1's win rate against player 2 over the cycles since the last benchmark
            benchmark_cycle_win_rate = 1 - sum(winner_list) / len(winner_list)
            winner_list = []  # Reset winner list

            # Play against a random bot and measure the win rate
            random_win_rate = benchmark.benchmark_test(
                primary_model=model_1,
                benchmark_model=RandomBot(),
                benchmark_bot_name=config.random_bot_name,
                num_games=config.random_bot_cycles,
                run_id=config.run_id if config.log_random_benchmark else None)
            logger.info(f'Winrate vs. Random Bot: {random_win_rate * 100:.1f}%')

            # Play against an expert-policy bot and measure the win rate
            # expert_policy_win_rate = benchmark.benchmark_test(primary_model=model_1, benchmark_model=ExpertPolicy(), benchmark_bot_name=config.expert_policy_bot_name,
            #                                                   num_games=config.random_bot_cycles, run_id=config.run_id if config.log_expert_policy_benchmark else None)
            # logger.info(f'Winrate vs. Expert Policy: {expert_policy_win_rate * 100:.1f}%')

            # Collect the average reward from the database
            average_reward = benchmark.get_average_reward(
                run_id=config.run_id,
                previous_experience_id=previous_experience_id,
                agent_id=config.bot_1_name,
                opponent_id=config.bot_2_name)

            db.insert_metrics(run_id=config.run_id,
                              win_rate=benchmark_cycle_win_rate,
                              win_rate_random=random_win_rate,
                              win_rate_expert_policy=0.0,
                              average_reward=average_reward)
            previous_experience_id = db.get_max_id(config.run_id)

        # Checkpoint
        if config.checkpoint_freq is not None and i % config.checkpoint_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info('Model checkpoint reached. Saving checkpoint...')
            model_1.save(folder=os.path.join(config.checkpoint_folder, config.run_id),
                         title=util.get_checkpoint_model_name(cycle=i))

        logger.info('Cycle complete.\tTotal Cycle Time: ' +
                    util.get_pretty_time(time.time() - cycle_start_time))
        logger.info(cs.DIVIDER)

    logging.info('Training complete.\tTotal Run Time: ' +
                 util.get_pretty_time(time.time() - start_time) +
                 '\tSaving model and exiting...')
    model_1.save(title=config.run_id)
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90,
# which is the result of a clamped and down-scaled render buffer in get_screen()
env.reset()
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n
print(n_actions)

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
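    # select_action() is cut off in this excerpt. A sketch of the usual
    # epsilon-greedy continuation, assuming the threshold computed above:
    # exploit the policy network with probability 1 - eps_threshold, otherwise
    # sample a uniformly random action.
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # pick the action with the largest predicted Q-value
            return policy_net(state).max(1)[1].view(1, 1)
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)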