def learn(self):
    for i in range(self.epochs):
        print(f"Epoch {i}/{self.epochs}")
        pbar = tqdm(range(self.rollout_batch_size))

        for b in pbar:
            # state = self.buffer_env.sample(batch_size=1)[0][0]
            state = self.env_model.reset()
            state = State.get_vec_observation(state)

            for h in range(self.rollout):
                pbar.set_description(f"batch: {b} rollout: {h}")
                board_cfg = State.get_board_config_from_vec(state,
                                                            n_regions=self.n_regions,
                                                            n_products=self.n_products)
                feasible_actions = AllocationEnv.get_feasible_actions(board_cfg)
                # feasible_actions = AllocationEnv.get_feasible_actions(state["board_config"])
                action_mask = AllocationEnv.get_action_mask(feasible_actions, self.n_actions)

                # sample action a_j ~ pi(s_j) (epsilon-greedy)
                alpha = random.random()
                if alpha < self.eps:
                    action = self.env_model.action_space.sample()
                else:
                    action, _states = self.policy.predict(state.reshape(1, -1), mask=action_mask)

                # compute dynamics from env model
                new_state, r_hat, dones, info = self.env_model.step(action)
                new_state = State.get_vec_observation(new_state)
                reward = self.get_penalized_reward(r_hat, self.lmbda)

                # add (s, a, r, s') to buffer
                self.buffer_model.add(obs_t=state, action=action, reward=reward,
                                      obs_tp1=new_state, done=float(dones))
                state = new_state

            # update policy with samples from D_env and D_model
            self.policy.update_weights(self.buffer_model)

    self.save_buffer()
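# --- Illustrative sketch (not part of the original module) ----------------------
# The rollout above picks actions epsilon-greedily under a feasibility mask. Below
# is a minimal, self-contained version of that selection rule, assuming mask[i] == 1
# marks a feasible action; `q_values`, `mask`, and `eps` are illustrative names.
# Note the original explores over the full action space, while this sketch also
# restricts exploration to the mask.
import numpy as np

def masked_eps_greedy(q_values, mask, eps, rng=None):
    """With prob. eps take a random feasible action, else the best feasible one."""
    rng = rng or np.random.default_rng()
    feasible = np.flatnonzero(mask)
    if rng.random() < eps:
        return int(rng.choice(feasible))                               # explore
    return int(np.argmax(np.where(mask == 1, q_values, -np.inf)))      # exploit

# Example: four actions, action 2 infeasible -> action 1 is the masked argmax.
print(masked_eps_greedy(np.array([0.1, 0.5, 0.9, 0.2]), np.array([1, 1, 0, 1]), eps=0.1))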
def get_state_and_reward(chunk, date, product_set, board_cfg, weights, prod_to_idx):
    # product_set to vector
    day = datetime.strptime(date, "%Y-%m-%d")
    day_vec = State.get_day_vec(day.weekday())

    chunk = chunk[chunk['DATE'] == date]
    chunk = chunk[chunk["UPC"].isin(product_set)]

    curr_sales = np.zeros_like(board_cfg)
    reward = 0

    for idx, row in chunk.iterrows():
        product = row["UPC"]
        total_sales = row["SALES"]
        p_idx = prod_to_idx[product]

        w = weights[:, p_idx]
        placement = board_cfg[:, p_idx]
        w = normalize(w * placement)

        est_sales = w * total_sales
        curr_sales[:, p_idx] = est_sales
        reward += est_sales.sum()

    state = {
        "day_vec": day_vec,
        "board_config": board_cfg,
        "curr_sales": curr_sales,
    }

    return state, reward
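# --- Illustrative sketch (not part of the original module) ----------------------
# Worked example of the per-product split above: a product's daily total sales are
# allocated across regions in proportion to (spatial weight x current placement),
# assuming `normalize` simply rescales a non-negative vector to sum to one.
import numpy as np

def normalize(v):
    s = v.sum()
    return v / s if s > 0 else v

weights_p = np.array([0.5, 0.3, 0.2])      # spatial weights for one product
placement = np.array([1.0, 0.0, 1.0])      # product placed in regions 0 and 2 only
total_sales = 100.0

w = normalize(weights_p * placement)       # -> [0.714..., 0.0, 0.286...]
est_sales = w * total_sales                # -> [71.4..., 0.0, 28.6...]
print(est_sales, est_sales.sum())          # sums back to the observed total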
def predict(self, observation, state=None, mask=None, deterministic=True):
    if isinstance(observation, dict):
        observation = State.get_vec_observation(observation)[None]

    with self.sess.as_default():
        actions = self.select_action(observation, mask=mask)

    return actions[0], None
def main(fpath):
    train_data = pd.read_csv(fpath)
    n_products = train_data['product'].max() + 1
    n_regions = train_data['region'].max() + 1
    buffer = ReplayBuffer(size=100000)

    grouped = train_data.groupby(by='date')
    prev_state = None

    for date, chunk in grouped:
        board_config = np.zeros([n_regions, n_products])
        prev_sales = np.zeros([n_regions, n_products])
        day = chunk.iloc[0, 8]

        prev_sales_product = {}
        prev_placement_cnts = {}

        for idx, row in chunk.iterrows():
            region = row['region']
            product = row['product']
            prev_sales_product[product] = row['prev_sales']

            if row['quantity'] > 0:
                board_config[region, product] = 1.0
                if product not in prev_placement_cnts:
                    prev_placement_cnts[product] = 0
                prev_placement_cnts[product] += 1

        for p in range(n_products):
            if p not in prev_placement_cnts:
                continue
            sales = prev_sales_product[p]
            cnt = prev_placement_cnts[p]
            avg_spatial_sales = sales / cnt
            regions = board_config[:, p]
            prev_sales[:, p] = regions * avg_spatial_sales

        day_vec = State.get_day_vec(day)
        state = {
            "day_vec": day_vec,
            "prev_sales": prev_sales,
            "board_config": board_config
        }

        if prev_state is not None:
            action = state['board_config'] - prev_state['board_config']

        prev_state = state
def evaluate(self):
    gamma = .8
    rewards = []

    for i in range(self.n_episodes):
        self.queue = self.build_queue(self.buffer)
        r_i = 0

        state, _, _, _, _ = self.buffer.sample(batch_size=1)
        state = state[0]

        iter = 0
        cntr = 0
        while True:
            board_cfg = State.get_board_config_from_vec(state,
                                                        n_regions=self.n_regions,
                                                        n_products=self.n_products)
            feasible_actions = AllocationEnv.get_feasible_actions(board_cfg)
            action_mask = AllocationEnv.get_action_mask(feasible_actions, self.n_actions)

            # M = self.get_m(state, action_mask)
            M = 1

            try:
                _, a, r, s_prime = self.queue[state].pop()
                # _, a, r, s_prime = self.queue[state][-1]
            except IndexError:
                break

            alpha = random.random()
            prob_policy = self.policy.proba_step(state.reshape(1, -1), mask=action_mask)[0][a]
            prob_env = self.env_policy.predict_proba(state)[a]
            rejection_tol = (1 / M) * prob_policy / prob_env

            iter += 1
            print(f"eps: {i+1} - iter:{iter} - success: {cntr}")

            if alpha > rejection_tol:
                continue
            else:
                # self.queue[state].pop()
                r_i += gamma**cntr * r
                state = s_prime
                cntr += 1

        if r_i > 0:
            rewards.append(r_i)

    return rewards
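# --- Illustrative sketch (not part of the original module) ----------------------
# The while-loop above is a rejection-sampling style off-policy evaluation: a logged
# transition is kept with probability (1/M) * pi(a|s) / mu(a|s), where mu is the
# behavior (logging) policy. A minimal version of just the acceptance test; M is
# assumed to upper-bound the density ratio so the acceptance probability stays <= 1.
import random

def accept_transition(prob_policy, prob_env, M=1.0, rng=random.Random(0)):
    """Return True if the logged (s, a, r, s') should be kept for this episode."""
    rejection_tol = (1.0 / M) * prob_policy / prob_env
    return rng.random() <= rejection_tol

# The target policy takes `a` twice as often as the logging policy did:
print(accept_transition(prob_policy=0.4, prob_env=0.2, M=2.0))  # tol = 1.0 -> always kept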
def _predict(self, features=None, n_samples=500):
    self.__check_model()
    self.__update_features(features)

    with self.env_model:
        posterior_pred = pm.sample_posterior_predictive(self.trace,
                                                        samples=n_samples,
                                                        progressbar=False)

    sales = self.__get_sales(posterior_pred['quantity_ij'], prices=features.prices)
    # clip estimated sales
    sales_hat = State.clip_val(sales, self.state.sales_bound)

    return sales_hat
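# --- Illustrative sketch (not part of the original module) ----------------------
# A standalone toy example of the pm.sample_posterior_predictive call used above,
# on a small linear-regression model rather than the repo's environment model
# (PyMC3 3.8+ keyword names assumed).
import numpy as np
import pymc3 as pm

x = np.random.randn(100)
y = 2.0 * x + 0.5 * np.random.randn(100)

with pm.Model() as toy_model:
    beta = pm.Normal("beta", mu=0.0, sigma=1.0)
    sigma = pm.HalfNormal("sigma", sigma=1.0)
    pm.Normal("y_obs", mu=beta * x, sigma=sigma, observed=y)
    trace = pm.sample(500, tune=500, progressbar=False)
    # draws from p(y_new | y); returns a dict keyed by observed variable name
    ppc = pm.sample_posterior_predictive(trace, samples=200, progressbar=False)

print(ppc["y_obs"].shape)  # (200, 100): posterior-predictive draws x observations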
def get_state(board_cfg, date, prev_sales):
    day = datetime.strptime(date, "%Y-%m-%d")
    day_vec = State.get_day_vec(day.weekday())

    if prev_sales is not None:
        prev_sales = prev_sales.sum()

    state = {
        "day_vec": day_vec,
        "board_config": board_cfg,
        "prev_sales": prev_sales,
    }

    return state
def predict(self, observation, state=None, mask=None, deterministic=True):
    if isinstance(observation, dict):
        observation = State.get_vec_observation(observation)[None]

    vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

    with self.sess.as_default():
        actions, _, _ = self.step_model.step(observation, deterministic=deterministic, mask=mask)

    if not vectorized_env:
        actions = actions[0]

    return actions, None
def _load_data(self, model_path, train_data_path, load_model):
    train_data = pd.read_csv(train_data_path)
    # Remove zero-quantity samples from training data
    train_data = train_data[train_data['quantity'] > 0]
    train_features = Features.feature_extraction(train_data, y_col='quantity')

    self.X_region = theano.shared(train_features.region)
    self.X_product = theano.shared(train_features.product)
    self.X_temporal = theano.shared(train_features.temporal)
    self.X_lagged = theano.shared(train_features.lagged)
    self.time_stamps = theano.shared(train_features.time_stamps)
    self.product_idx = theano.shared(train_features.product_idx)
    self.y = theano.shared(train_features.y)
    self.prices = train_features.prices

    self.init_state = State.init_state(cfg.vals)
    init_features = Features.featurize_state(self.init_state).toarray()
    self.init_state_len = init_features.shape[1]
    self.feature_shape = init_features.shape
    self.init_state_dimension = len(self.feature_shape)
    self.env_model = self._build_env_model()

    if load_model:
        if not os.path.exists(model_path):
            raise Exception("Model file {} does not exist. Run train.py and try again".format(model_path))

        with open(model_path, 'rb') as f:
            self.trace = pickle.load(f)
        # with self.env_model:
        #     self.trace = pm.load_trace(model_path)

        ts = datetime.datetime.now()
        print("Environment model read from disk: {}".format(ts))
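# --- Illustrative sketch (not part of the original module) ----------------------
# Wrapping the design matrices in theano.shared (as above) lets new feature values
# be swapped in at prediction time without rebuilding the model graph, presumably
# what __update_features does. Minimal sketch with illustrative names and shapes:
import numpy as np
import theano

X_region = theano.shared(np.zeros((4, 3)), name="X_region")  # training-time features
# ... build the PyMC3 model referencing X_region ...
X_region.set_value(np.ones((8, 3)))                          # swap in new features
print(X_region.get_value().shape)                            # (8, 3)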
for j in range(100):
    obs = env.reset()

    for i in range(TEST_T):
        feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions, n_actions)

        action, _states = model.predict(obs, mask=action_mask)
        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])

        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        buffer.add(obs_t=State.get_vec_observation(obs),
                   action=action,
                   reward=r[0],
                   obs_tp1=State.get_vec_observation(new_obs),
                   done=float(dones))
        obs = new_obs

with open("output/rl-test-{}.json".format(cfg.vals['prj_name']), 'w') as f:
    json.dump(results, f)

with open(f"../data/{store_id}-buffer-d.p", 'wb') as f:
    pickle.dump(buffer, f)
    :param state:
    :return: a numpy array [ [1*7], [4*4], [4*4] ]
    '''
    sales_bin = np.zeros(6, dtype=np.int8)
    prev_sales = state.prev_sales.sum()

    for idx, bin in quantiles.items():
        if prev_sales < bin:
            sales_bin[idx] = 1
            break

    # catch-all bin for values beyond the largest quantile edge
    if sales_bin.sum() == 0:
        sales_bin[5] = 1
    assert sales_bin.sum() == 1

    return {"day_vec": state.day_vec,
            "board_config": state.board_config,
            "prev_sales": sales_bin}


if __name__ == "__main__":
    init_state = State.init_state(config=cfg.vals)
    state_features = Features.featurize_state(init_state)
    stop = 0
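# --- Illustrative sketch (not part of the original module) ----------------------
# Worked example of the previous-sales binning above: `quantiles` is assumed to map
# bin index -> upper edge, and the first edge exceeding prev_sales gets the 1, with
# bin 5 acting as the catch-all. The edge values here are hypothetical.
import numpy as np

quantiles = {0: 10.0, 1: 25.0, 2: 50.0, 3: 100.0, 4: 250.0}   # illustrative edges
prev_sales = 72.0

sales_bin = np.zeros(6, dtype=np.int8)
for idx, edge in quantiles.items():
    if prev_sales < edge:
        sales_bin[idx] = 1
        break
if sales_bin.sum() == 0:
    sales_bin[5] = 1

print(sales_bin)   # [0 0 0 1 0 0] -> falls in the 50-100 bin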
def main():
    N_PRODUCTS = 15

    # Import store, sales data
    stores = pd.read_csv("../data/store-level-data-17-19.csv")
    # stores['DATE'] = pd.to_datetime(stores['DATE'])
    # stores['day_of_week'] = stores['DATE'].dt.dayofweek
    stores = stores[stores['SALES'] > 0.0]

    # recompute sales as quantity x price
    stores['SALES'] = stores['QUANTITY'] * stores['PRICE']

    ## Standardize sales data ([x - mu] / sd)
    # stores['SALES'] = (stores['SALES'] - stores['SALES'].mean()) / np.std(stores['SALES'])
    # stores['SALES_2'] = np.power(stores["SALES"], 2)
    # stores['day_of_week'] = stores['DATE'].dt.dayofweek

    ## Store consideration set
    # 600055785 --> Fort Union (Midvale) - Large Store (by sales)
    # 600055679 --> Draper - Small Store (by sales)
    STORE_SET = [600055785, 600055679]

    # Get product set: top 15 products by total sales
    top_prods = stores[['UPC', 'SALES']].groupby('UPC').sum().sort_values(
        "SALES", ascending=False).reset_index()[:N_PRODUCTS]
    PROD_SET = top_prods["UPC"].values
    prod_to_idx = dict(zip(PROD_SET, range(N_PRODUCTS)))
    idx_to_prod = dict(zip(range(N_PRODUCTS), PROD_SET))

    stores_keep = stores[stores["UPC"].isin(PROD_SET)]
    store1 = stores_keep[stores_keep['CUSTOMER'] == STORE_SET[0]]
    store2 = stores_keep[stores_keep['CUSTOMER'] == STORE_SET[1]]

    # Regions per store
    n_regions = dict(zip(STORE_SET, [18, 12]))

    # Adjacency matrices
    adj = {
        STORE_SET[0]: get_adj_mtx("../data/store-1-adj-mtx.json"),
        STORE_SET[1]: get_adj_mtx("../data/store-2-adj-mtx.json")
    }
    for k, a in adj.items():
        is_symmetric = np.allclose(a, a.transpose())
        assert is_symmetric

    # Heterogeneous spatial weights
    priors = init_prior(STORE_SET, adj, n_regions)
    # (regions x products)
    weights = gen_weights(STORE_SET, priors, N_PRODUCTS)

    store_cntr = 0
    for store in [store1, store2]:
        prod_dist = get_product_distributions(store)
        buffer = ReplayBuffer(size=50000)

        store_id = STORE_SET[store_cntr]
        r = n_regions[store_id]
        actions = ActionSpace(r, N_PRODUCTS)

        date_to_idx, idx_to_date = get_time_stamp(store['DATE'])
        products_map = get_dicts(store)

        product_t = products_map[idx_to_date[0]]
        board_cfg = init_state(r, N_PRODUCTS, product_t, prod_to_idx)
        prev_sales = None

        for dt, dt_idx in date_to_idx.items():
            product_t = products_map[dt]
            t_p_1 = idx_to_date.get(dt_idx + 1, None)
            if t_p_1 is None:
                break

            state = get_state(board_cfg, idx_to_date[0], prev_sales)
            _, sales_t = get_reward(store, dt, prod_dist, state["board_config"],
                                    weights[store_id], prod_to_idx)

            product_next = products_map[t_p_1]

            # select action
            a, a_idx = actions.sample(state["board_config"], product_next, idx_to_prod)
            new_board_cfg = state["board_config"] + a

            new_state = get_state(new_board_cfg, t_p_1, prev_sales=sales_t)
            reward, _ = get_reward(store, t_p_1, prod_dist, new_board_cfg,
                                   weights[store_id], prod_to_idx)
            # state = new_state

            print(state["board_config"], reward)
            buffer.add(obs_t=State.get_vec_observation(state),
                       action=a_idx,
                       reward=reward,
                       obs_tp1=State.get_vec_observation(new_state),
                       done=False)

        with open(f"../data/store-{store_cntr+1}-buffer.p", 'wb') as f:
            pickle.dump(buffer, f)

        store_cntr += 1
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN",
          reset_num_timesteps=True, replay_wrapper=None, learning_curve=False, test_t=None):

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log,
                                                       tb_log_name, new_tb_log) as writer:
        self._setup_learn(seed)

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size,
                                                         alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_replay_beta0,
                                                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                          initial_p=1.0,
                                          final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        self.cumul_reward = [0.0]
        episode_successes = []
        obs = self.env.reset()
        reset = True
        self.episode_reward = np.zeros((1,))

        # variables for test evaluation (guard against test_t=None when no learning curve is requested)
        test_step = test_t * 3 if test_t is not None else None
        test_results = {'sum': []}
        test_ts = []

        for _ in range(total_timesteps):
            # test evaluation period
            if learning_curve and _ % test_step == 0 and _ > 0:
                print("--> Simulating test period")
                self.env.reset()
                test_r = 0.0

                for i in range(test_t):
                    feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
                    action_mask = AllocationEnv.get_action_mask(feasible_actions, self.env.action_space.n)
                    action, _states = self.predict(obs, mask=action_mask)
                    action = AllocationEnv.check_action(obs['board_config'], action)
                    obs, rewards, dones, info = self.env.step(action)
                    test_r += rewards

                test_results["sum"].append(test_r)
                test_ts.append(_)
                self.env.reset()

                # plot test eval progress
                plt.plot(test_ts, test_results["sum"])
                # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                plt.xlabel("Iteration count")
                plt.ylabel("Total (sum) test reward")
                plt.savefig("figs/rl-learning-curve-{}.pdf".format(cfg.vals['prj_name']))
                plt.clf()
                plt.close()

                # write test eval progress
                write_results = {}
                for k, v in test_results.items():
                    write_results[k] = serialize_floats(v)
                with open("output/rl-learning-curve-{}.json".format(cfg.vals['prj_name']), 'w') as f:
                    json.dump(write_results, f)

            if callback is not None:
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                if callback(locals(), globals()) is False:
                    break

            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            feasible_actions = AllocationEnv.get_feasible_actions(obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(feasible_actions, self.action_space.n)

            with self.sess.as_default():
                action = self.act(State.get_vec_observation(obs)[None],
                                  update_eps=update_eps, **kwargs, mask=action_mask)[0]
            reset = False

            # check that the selected action is feasible
            action = AllocationEnv.check_action(obs['board_config'], action)
            env_action = action
            new_obs, rew, done, info = self.env.step(env_action)

            print("action: {} - reward: {} - eps: {:.4}".format(action, rew, update_eps))
            print(new_obs['day_vec'])
            print(new_obs['board_config'])

            # Store transition in the replay buffer.
            self.replay_buffer.add(State.get_vec_observation(obs), action, rew,
                                   State.get_vec_observation(new_obs), float(done))
            obs = new_obs

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
                                                                  writer, self.num_timesteps)

            episode_rewards[-1] += rew
            self.cumul_reward.append(self.cumul_reward[-1] + rew)
            if done:
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append(0.0)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(self.batch_size,
                                                           beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + self.num_timesteps) % 100 == 0:
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess,
                                                              options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                    else:
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess)
                    writer.add_summary(summary, self.num_timesteps)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                    dones, weights, sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and \
                    len(episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

            print('timestamp: {}'.format(self.num_timesteps), end='\r\n')
            self.num_timesteps += 1

    return self
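# --- Illustrative sketch (not part of the original module) ----------------------
# Quick illustration of the exploration schedule built in learn() above:
# stable-baselines' LinearSchedule interpolates from initial_p to final_p over the
# first `schedule_timesteps` steps, then stays flat. The numbers and the 2.x import
# path are assumptions for the example.
from stable_baselines.common.schedules import LinearSchedule

exploration = LinearSchedule(schedule_timesteps=int(0.1 * 10000),   # 10% of training
                             initial_p=1.0,
                             final_p=0.02)

for t in (0, 500, 1000, 5000):
    print(t, round(exploration.value(t), 3))   # 1.0, 0.51, 0.02, 0.02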