Example #1
    def learn(self):
        for i in range(self.epochs):
            print(f"Epoch {i}/{self.epochs}")
            pbar = tqdm(range(self.rollout_batch_size))
            for b in pbar:
                #state = self.buffer_env.sample(batch_size=1)[0][0]
                state = self.env_model.reset()
                state = State.get_vec_observation(state)

                for h in range(self.rollout):
                    pbar.set_description(f"batch: {b} rollout: {h}")
                    board_cfg = State.get_board_config_from_vec(state,
                                                                n_regions=self.n_regions,
                                                                n_products=self.n_products
                                                                )

                    feasible_actions = AllocationEnv.get_feasible_actions(board_cfg)
                    #feasible_actions = AllocationEnv.get_feasible_actions(state["board_config"])
                    action_mask = AllocationEnv.get_action_mask(feasible_actions, self.n_actions)

                    # sample action a_j ~ pi(s_j)
                    alpha = random.random()

                    if alpha < self.eps:
                        action = self.env_model.action_space.sample()
                    else:
                        action, _states = self.policy.predict(state.reshape(1, -1), mask=action_mask)

                    # compute dynamics from env model
                    new_state, r_hat, dones, info = self.env_model.step(action)
                    new_state = State.get_vec_observation(new_state)

                    reward = self.get_penalized_reward(r_hat, self.lmbda)

                    # add (s, a, r, s') to buffer
                    self.buffer_model.add(obs_t=state,
                                          action=action,
                                          reward=reward,
                                          obs_tp1=new_state,
                                          done=float(dones))

                    state = new_state

                # update policy with samples from D_env and D_model
                self.policy.update_weights(self.buffer_model)
        self.save_buffer()
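The rollout above mixes epsilon-greedy exploration with an action mask so that only feasible shelf actions are sampled or scored. Below is a minimal, self-contained sketch of that selection pattern; the mask convention (1 = feasible, 0 = infeasible) is an assumption, and masked_epsilon_greedy is a hypothetical helper rather than part of the project.

import random
import numpy as np

def masked_epsilon_greedy(q_values, action_mask, eps):
    """Pick an action epsilon-greedily, restricted to feasible actions.

    q_values:    1-D array of scores, one per action
    action_mask: 1-D array, 1 for feasible actions, 0 for infeasible
    eps:         exploration probability
    """
    feasible = np.flatnonzero(action_mask)
    if random.random() < eps:
        # explore: uniform over feasible actions only
        return int(np.random.choice(feasible))
    # exploit: arg-max of the masked scores
    masked_q = np.where(action_mask > 0, q_values, -np.inf)
    return int(np.argmax(masked_q))

# example usage
q = np.array([0.1, 0.7, 0.3, 0.9])
mask = np.array([1, 1, 0, 0])  # last two actions infeasible
print(masked_epsilon_greedy(q, mask, eps=0.1))  # prints 0 or 1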
Example #2
def get_state_and_reward(chunk, date, product_set, board_cfg, weights,
                         prod_to_idx):
    # encode the day of week as a vector
    day = datetime.strptime(date, "%Y-%m-%d")
    day_vec = State.get_day_vec(day.weekday())

    # keep only this date and the product consideration set
    chunk = chunk[chunk['DATE'] == date]
    chunk = chunk[chunk["UPC"].isin(product_set)]

    curr_sales = np.zeros_like(board_cfg)
    reward = 0
    for idx, row in chunk.iterrows():
        product = row["UPC"]
        total_sales = row["SALES"]
        p_idx = prod_to_idx[product]
        w = weights[:, p_idx]
        placement = board_cfg[:, p_idx]
        w = normalize(w * placement)
        est_sales = w * total_sales
        curr_sales[:, p_idx] = est_sales

        reward += est_sales.sum()

    state = {
        "day_vec": day_vec,
        "board_config": board_cfg,
        "curr_sales": curr_sales,
    }

    return state, reward
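The reward above spreads a product's total daily sales across the regions where it is placed, in proportion to the spatial weights. A small sketch of that allocation step, assuming normalize simply rescales the masked weights to sum to one (split_sales is a hypothetical helper):

import numpy as np

def split_sales(total_sales, weights_col, placement_col):
    """Allocate a product's total sales across the regions where it is placed.

    weights_col:   spatial weights for the product, one entry per region
    placement_col: 1 where the product is on the shelf in that region, else 0
    """
    w = weights_col * placement_col
    if w.sum() == 0:
        return np.zeros_like(w, dtype=float)
    w = w / w.sum()                # normalize over placed regions
    return w * total_sales         # estimated per-region sales

weights = np.array([0.5, 0.3, 0.2])
placement = np.array([1, 0, 1])
print(split_sales(100.0, weights, placement))  # approx. [71.4, 0.0, 28.6]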
Example #3
    def predict(self, observation, state=None, mask=None, deterministic=True):
        if isinstance(observation, dict):
            observation = State.get_vec_observation(observation)[None]

        with self.sess.as_default():
            actions = self.select_action(observation, mask=mask)

        return actions[0], None
Example #4
def main(fpath):
    train_data = pd.read_csv(fpath)
    n_products = train_data['product'].max() + 1
    n_regions = train_data['region'].max() + 1

    buffer = ReplayBuffer(size=100000)
    grouped = train_data.groupby(by='date')

    prev_state = None

    for date, chunk in grouped:
        board_config = np.zeros([n_regions, n_products])
        prev_sales = np.zeros([n_regions, n_products])

        day = chunk.iloc[0, 8]  # day of week (column index 8)

        prev_sales_product = {}
        prev_placement_cnts = {}
        for idx, row in chunk.iterrows():

            region = row['region']
            product = row['product']

            prev_sales_product[product] = row['prev_sales']

            if row['quantity'] > 0:
                board_config[region, product] = 1.0

                if product not in prev_placement_cnts:
                    prev_placement_cnts[product] = 0

                prev_placement_cnts[product] += 1

        for p in range(n_products):

            if p not in prev_placement_cnts:
                continue

            sales = prev_sales_product[p]
            cnt = prev_placement_cnts[p]
            avg_spatial_sales = sales / cnt
            regions = board_config[:, p]

            prev_sales[:, p] = regions * avg_spatial_sales

        day_vec = State.get_day_vec(day)

        state = {
            "day_vec": day_vec,
            "prev_sales": prev_sales,
            "board_config": board_config
        }

        if prev_state is not None:
            action = state['board_config'] - prev_state['board_config']

        prev_state = state
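The action here is the element-wise difference of consecutive board configurations, so +1 marks a product added to a region and -1 marks a removal. A tiny illustration with made-up values:

import numpy as np

prev_board = np.array([[1, 0],
                       [0, 1]])
curr_board = np.array([[1, 1],
                       [0, 0]])

action = curr_board - prev_board
print(action)
# [[ 0  1]    -> product 1 added to region 0
#  [ 0 -1]]   -> product 1 removed from region 1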
Example #5
    def evaluate(self):
        gamma = 0.8
        rewards = []
        for i in range(self.n_episodes):

            self.queue = self.build_queue(self.buffer)

            r_i = 0
            state, _, _, _, _ = self.buffer.sample(batch_size=1)
            state = state[0]

            n_iter = 0
            cntr = 0

            while True:
                board_cfg = State.get_board_config_from_vec(
                    state,
                    n_regions=self.n_regions,
                    n_products=self.n_products)
                feasible_actions = AllocationEnv.get_feasible_actions(
                    board_cfg)
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.n_actions)

                #M = self.get_m(state, action_mask)
                M = 1
                try:
                    _, a, r, s_prime = self.queue[state].pop()
                    #_, a, r, s_prime = self.queue[state][-1]

                except IndexError:
                    break

                alpha = random.random()

                prob_policy = self.policy.proba_step(state.reshape(1, -1),
                                                     mask=action_mask)[0][a]
                prob_env = self.env_policy.predict_proba(state)[a]

                rejection_tol = (1 / M) * prob_policy / prob_env

                n_iter += 1
                print(f"eps: {i+1} - iter: {n_iter} - success: {cntr}")

                if alpha > rejection_tol:
                    continue
                else:
                    #self.queue[state].pop()

                    r_i += (gamma)**cntr * r
                    state = s_prime
                    cntr += 1
            if r_i > 0:
                rewards.append(r_i)

        return rewards
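evaluate() is a rejection-sampling style off-policy estimator: each logged transition is accepted with probability (1/M) * pi(a|s) / b(a|s), and accepted rewards are discounted by gamma**cntr. A minimal sketch of just that acceptance test; the probabilities below are stand-ins for self.policy.proba_step and self.env_policy.predict_proba:

import random

def accept_transition(prob_policy, prob_behavior, M=1.0):
    """Accept a logged transition with probability (1/M) * pi(a|s) / b(a|s)."""
    rejection_tol = (1.0 / M) * prob_policy / prob_behavior
    return random.random() <= rejection_tol

# discounted return over accepted transitions only
gamma = 0.8
logged = [(0.9, 0.5, 2.0), (0.1, 0.6, 5.0), (0.4, 0.4, 1.0)]  # (pi(a|s), b(a|s), reward)
r_i, cntr = 0.0, 0
for pi_a, b_a, r in logged:
    if accept_transition(pi_a, b_a):
        r_i += gamma ** cntr * r
        cntr += 1
print(r_i)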
Example #6
    def _predict(self, features=None, n_samples=500):
        self.__check_model()

        self.__update_features(features)
        with self.env_model:
            posterior_pred = pm.sample_posterior_predictive(self.trace,
                                                            samples=n_samples,
                                                            progressbar=False)
        sales = self.__get_sales(posterior_pred['quantity_ij'],
                                 prices=features.prices)
        # clip estimated sales
        sales_hat = State.clip_val(sales, self.state.sales_bound)
        return sales_hat
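_predict draws from the posterior predictive of the fitted PyMC3 environment model rather than returning a point estimate. A self-contained toy with the same call pattern (PyMC3 3.x API); the Poisson model here is an assumption for illustration, not the project's demand model:

import numpy as np
import pymc3 as pm

y_obs = np.random.poisson(lam=4.0, size=200)

with pm.Model() as toy_model:
    lam = pm.Gamma("lam", alpha=2.0, beta=0.5)
    pm.Poisson("quantity_ij", mu=lam, observed=y_obs)
    trace = pm.sample(500, tune=500, progressbar=False)
    # one simulated data set per posterior draw
    ppc = pm.sample_posterior_predictive(trace, samples=300, progressbar=False)

print(ppc["quantity_ij"].shape)  # (300, 200)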
Example #7
def get_state(board_cfg, date, prev_sales):

    day = datetime.strptime(date, "%Y-%m-%d")
    day_vec = State.get_day_vec(day.weekday())

    if prev_sales is not None:
        prev_sales = prev_sales.sum()

    state = {
        "day_vec": day_vec,
        "board_config": board_cfg,
        "prev_sales": prev_sales,
    }

    return state
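Elsewhere in these examples the day vector is described as a length-7 array, so State.get_day_vec(day.weekday()) presumably one-hot encodes the weekday. A minimal equivalent under that assumption (day_vec_from_date is a hypothetical helper):

from datetime import datetime
import numpy as np

def day_vec_from_date(date_str):
    """One-hot encode the weekday (Monday=0 ... Sunday=6) of a YYYY-MM-DD date."""
    weekday = datetime.strptime(date_str, "%Y-%m-%d").weekday()
    vec = np.zeros(7, dtype=np.int8)
    vec[weekday] = 1
    return vec

print(day_vec_from_date("2019-07-01"))  # a Monday -> [1 0 0 0 0 0 0]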
Example #8
    def predict(self, observation, state=None, mask=None, deterministic=True):
        if isinstance(observation, dict):
            observation = State.get_vec_observation(observation)[None]
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        with self.sess.as_default():
            actions, _, _ = self.step_model.step(observation,
                                                 deterministic=deterministic,
                                                 mask=mask)

        if not vectorized_env:
            actions = actions[0]

        return actions, None
Example #9
    def _load_data(self, model_path, train_data_path, load_model):
        train_data = pd.read_csv(train_data_path)
        # Remove zero quantity samples from training data
        train_data = train_data[train_data['quantity'] > 0]
        train_features = Features.feature_extraction(train_data,
                                                     y_col='quantity')

        self.X_region = theano.shared(train_features.region)
        self.X_product = theano.shared(train_features.product)
        self.X_temporal = theano.shared(train_features.temporal)
        self.X_lagged = theano.shared(train_features.lagged)
        self.time_stamps = theano.shared(train_features.time_stamps)
        self.product_idx = theano.shared(train_features.product_idx)
        self.y = theano.shared(train_features.y)
        self.prices = train_features.prices

        self.init_state = State.init_state(cfg.vals)
        init_features = Features.featurize_state(self.init_state).toarray()
        self.init_state_len = init_features.shape[1]
        self.feature_shape = init_features.shape

        self.init_state_dimension = len(self.feature_shape)
        self.env_model = self._build_env_model()

        if load_model:

            if not os.path.exists(model_path):
                raise Exception(
                    "Model file {} does not exist. Run train.py and try again".
                    format(model_path))

            with open(model_path, 'rb') as f:
                self.trace = pickle.load(f)

            #with self.env_model:
            #self.trace = pm.load_trace(model_path)
            ts = datetime.datetime.now()
            print("Environment model read from disk: {}".format(ts))
Example #10
for j in range(100):

    obs = env.reset()

    for i in range(TEST_T):
        feasible_actions = AllocationEnv.get_feasible_actions(
            obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions,
                                                    n_actions)
        action, _states = model.predict(obs, mask=action_mask)

        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])

        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        buffer.add(obs_t=State.get_vec_observation(obs),
                   action=action,
                   reward=r[0],
                   obs_tp1=State.get_vec_observation(new_obs),
                   done=float(dones))

        obs = new_obs

with open("output/rl-test-{}.json".format(cfg.vals['prj_name']), 'w') as f:
    json.dump(results, f)

with open(f"../data/{store_id}-buffer-d.p", 'wb') as f:
    pickle.dump(buffer, f)
Example #11
        :param state:
        :return: a numpy array
        [ [1*7],
          [4*4],
          [4*4] ]
        '''

        sales_bin = np.zeros(6, dtype=np.int8)
        prev_sales = state.prev_sales.sum()

        for idx, upper in quantiles.items():

            if prev_sales < upper:
                sales_bin[idx] = 1
                break

        if sales_bin.sum() == 0:
            sales_bin[5] = 1

        assert sales_bin.sum() == 1

        return {"day_vec": state.day_vec, "board_config": state.board_config, "prev_sales": sales_bin}


if __name__ == "__main__":

    init_state = State.init_state(config=cfg.vals)
    state_features = Features.featurize_state(init_state)

    stop = 0  # placeholder for setting a debugger breakpoint
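The prev_sales feature above is bucketed into a one-hot vector over six quantile bins, with the last bin catching anything above the highest threshold. An equivalent sketch using np.digitize; the quantiles thresholds here are made-up values:

import numpy as np

# upper edges of the first five bins; anything larger falls into bin 5
quantiles = {0: 10.0, 1: 25.0, 2: 50.0, 3: 100.0, 4: 250.0}

def sales_to_bin(prev_sales_total):
    edges = [quantiles[i] for i in sorted(quantiles)]
    idx = int(np.digitize(prev_sales_total, edges))  # 0..5
    sales_bin = np.zeros(6, dtype=np.int8)
    sales_bin[idx] = 1
    return sales_bin

print(sales_to_bin(30.0))    # [0 0 1 0 0 0]
print(sales_to_bin(1000.0))  # [0 0 0 0 0 1]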
Example #12
def main():
    N_PRODUCTS = 15
    # Import store, sales data
    stores = pd.read_csv("../data/store-level-data-17-19.csv")
    #stores['DATE'] = pd.to_datetime(stores['DATE'])
    # stores['day_of_week'] = stores['DATE'].dt.dayofweek
    stores = stores[stores['SALES'] > 0.0]
    # compute sales revenue (quantity x price)
    stores['SALES'] = stores['QUANTITY'] * stores['PRICE']
    ## Standardize sales data ([x - mu] / sd)
    # stores['SALES'] = (stores['SALES'] - stores['SALES'].mean()) / np.std(stores['SALES'])
    # stores['SALES_2'] = np.power(stores["SALES"], 2)
    #stores['day_of_week'] = stores['DATE'].dt.dayofweek

    ## Store consideration set
    # 600055785 --> Fort Union (Midvale) - Large Store (by sales)
    # 600055679 --> Draper - Small Store (by sales)
    STORE_SET = [600055785, 600055679]

    # Get Product set
    # Top 15 products
    top_prods = stores[['UPC', 'SALES']].groupby('UPC').sum().sort_values(
        "SALES", ascending=False).reset_index()[:N_PRODUCTS]

    PROD_SET = top_prods["UPC"].values
    prod_to_idx = dict(zip(PROD_SET, range(N_PRODUCTS)))
    idx_to_prod = dict(zip(range(N_PRODUCTS), PROD_SET))

    stores_keep = stores[stores["UPC"].isin(PROD_SET)]

    store1 = stores_keep[stores_keep['CUSTOMER'] == STORE_SET[0]]
    store2 = stores_keep[stores_keep['CUSTOMER'] == STORE_SET[1]]

    # Regions per store
    n_regions = dict(zip(STORE_SET, [18, 12]))
    # Adjacency Matrices

    adj = {
        STORE_SET[0]: get_adj_mtx("../data/store-1-adj-mtx.json"),
        STORE_SET[1]: get_adj_mtx("../data/store-2-adj-mtx.json")
    }

    for k, a in adj.items():
        is_symmetric = np.allclose(a, a.transpose())
        assert is_symmetric

    # Heterogeneous spatial weights
    priors = init_prior(STORE_SET, adj, n_regions)

    # (regions x products)
    weights = gen_weights(STORE_SET, priors, N_PRODUCTS)

    store_cntr = 0
    for store in [store1, store2]:
        prod_dist = get_product_distributions(store)
        buffer = ReplayBuffer(size=50000)

        store_id = STORE_SET[store_cntr]
        r = n_regions[store_id]

        actions = ActionSpace(r, N_PRODUCTS)

        date_to_idx, idx_to_date = get_time_stamp(store['DATE'])
        products_map = get_dicts(store)

        product_t = products_map[idx_to_date[0]]
        board_cfg = init_state(r, N_PRODUCTS, product_t, prod_to_idx)

        prev_sales = None

        for dt, dt_idx in date_to_idx.items():

            product_t = products_map[dt]
            t_p_1 = idx_to_date.get(dt_idx + 1, None)

            if t_p_1 is None:
                break

            state = get_state(board_cfg, dt, prev_sales)
            _, sales_t = get_reward(store, dt, prod_dist,
                                    state["board_config"], weights[store_id],
                                    prod_to_idx)

            product_next = products_map[t_p_1]

            # select action
            a, a_idx = actions.sample(state["board_config"], product_next,
                                      idx_to_prod)
            new_board_cfg = state["board_config"] + a

            new_state = get_state(new_board_cfg, t_p_1, prev_sales=sales_t)

            reward, _ = get_reward(store, t_p_1, prod_dist, new_board_cfg,
                                   weights[store_id], prod_to_idx)

            #state = new_state
            print(state["board_config"], reward)

            buffer.add(obs_t=State.get_vec_observation(state),
                       action=a_idx,
                       reward=reward,
                       obs_tp1=State.get_vec_observation(new_state),
                       done=False)

        with open(f"../data/store-{store_cntr+1}-buffer.p", 'wb') as f:
            pickle.dump(buffer, f)
        store_cntr += 1
Example #13
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None,
              learning_curve=False,
              test_t=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            self.cumul_reward = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            # variables for test eval ##
            test_step = test_t * 3 if test_t is not None else None
            test_results = {'sum': []}
            test_ts = []

            for step in range(total_timesteps):

                ## Test eval period ##
                if learning_curve and step % test_step == 0 and step > 0:
                    print("--> Simulating test period")
                    obs = self.env.reset()
                    test_r = 0.0
                    for i in range(test_t):
                        feasible_actions = AllocationEnv.get_feasible_actions(
                            obs["board_config"])
                        action_mask = AllocationEnv.get_action_mask(
                            feasible_actions, self.env.action_space.n)
                        action, _states = self.predict(obs, mask=action_mask)
                        action = AllocationEnv.check_action(
                            obs['board_config'], action)
                        obs, rewards, dones, info = self.env.step(action)
                        test_r += rewards

                    test_results["sum"].append(test_r)
                    test_ts.append(step)
                    obs = self.env.reset()

                    # plot test eval progress
                    plt.plot(test_ts, test_results["sum"])
                    # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                    plt.xlabel("Iteration count")
                    plt.ylabel("Total (sum) test reward")
                    plt.savefig("figs/rl-learning-curve-{}.pdf".format(
                        cfg.vals['prj_name']))
                    plt.clf()
                    plt.close()

                    # write test eval progress
                    write_results = {}
                    for k, v in test_results.items():
                        write_results[k] = serialize_floats(v)

                    with open(
                            "output/rl-learning-curve-{}.json".format(
                                cfg.vals['prj_name']), 'w') as f:
                        json.dump(write_results, f)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True

                feasible_actions = AllocationEnv.get_feasible_actions(
                    obs["board_config"])
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.action_space.n)
                with self.sess.as_default():
                    action = self.act(State.get_vec_observation(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs,
                                      mask=action_mask)[0]
                reset = False
                # CHECK IF ACTIONS IS FEASIBLE
                action = AllocationEnv.check_action(obs['board_config'],
                                                    action)
                env_action = action
                new_obs, rew, done, info = self.env.step(env_action)
                print("action: {} - reward: {} - eps: {:.4}".format(
                    action, rew, update_eps))
                print(new_obs['day_vec'])
                print(new_obs['board_config'])
                # Store transition in the replay buffer.
                self.replay_buffer.add(State.get_vec_observation(obs), action,
                                       rew, State.get_vec_observation(new_obs),
                                       float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                self.cumul_reward.append(self.cumul_reward[-1] + rew)
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()
                print('timestep: {}'.format(self.num_timesteps), end='\r\n')
                self.num_timesteps += 1

        return self
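The exploration rate above is annealed by a LinearSchedule from 1.0 down to exploration_final_eps over the first exploration_fraction * total_timesteps steps. A minimal stand-alone version of such a schedule (a sketch, not the stable-baselines class itself):

class SimpleLinearSchedule:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps,
    then stay at final_p."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        frac = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + frac * (self.final_p - self.initial_p)

# anneal epsilon over the first 10% of 100,000 timesteps
exploration = SimpleLinearSchedule(schedule_timesteps=int(0.1 * 100000),
                                   initial_p=1.0, final_p=0.02)
print(exploration.value(0))      # 1.0
print(exploration.value(5000))   # 0.51
print(exploration.value(50000))  # 0.02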