def get_reordered(self, order, click_id):
    """Gather item-level features following `order`.

    order: (b, order_len) per-list item positions
    click_id: (b, order_len) click labels aligned with `order`
    `last_click_id` is dropped from the result; `click_id` is replaced
    by the given labels.
    """
    AssertEqual(len(order), self.batch_size())
    AssertEqual(len(click_id), self.batch_size())

    # Translate per-list positions into indices over the flattened batch.
    flat_indice = np.concatenate(
        [np.array(sub) + off for sub, off in zip(order, self.offset())],
        axis=0)

    reordered = BatchData(self.conf, self.tensor_dict)
    seq_lens = [len(sub) for sub in order]
    for slot in reordered.conf.item_slot_names:
        if slot == 'last_click_id':
            continue
        gathered = reordered.tensor_dict[slot].values[flat_indice]
        reordered.tensor_dict[slot] = FakeTensor(gathered, seq_lens)
    reordered.tensor_dict['click_id'] = FakeTensor(
        click_id.reshape([-1, 1]), seq_lens)
    return reordered
def expand_candidates(self, other_batch_data, lens):
    """Use `other_batch_data` as a candidate pool and expand each list.

    Only item-level values are expanded:
      1. concat item values of self and other_batch_data
      2. build a gather index mixing the old items with sampled candidates
    lens: (batch_size,), number of candidate items appended per list
    `last_click_id` is left untouched.
    """
    AssertEqual(len(lens), self.batch_size())

    n_self = self.total_item_num()
    n_cand = other_batch_data.total_item_num()
    # Candidate rows sit after self's rows in the concatenated values.
    cand_indice = np.arange(n_self, n_self + n_cand)

    lod = self.lod()[0]
    gather_index = []
    for i in range(len(lod) - 1):
        # Keep the original items of list i ...
        gather_index.append(np.arange(lod[i], lod[i + 1]))
        # ... then draw lens[i] distinct candidate rows (one RNG draw per
        # list, same call order as before).
        gather_index.append(
            np.random.choice(cand_indice, size=lens[i], replace=False))
    gather_index = np.concatenate(gather_index, axis=0)

    # Compute the new lengths before mutating tensor_dict.
    seq_lens = [s + l for s, l in zip(self.seq_lens(), lens)]

    # Update tensor_dict in place.
    for name in self.conf.item_slot_names:
        if name == 'last_click_id':
            continue
        merged = np.concatenate(
            [self.tensor_dict[name].values,
             other_batch_data.tensor_dict[name].values], 0)
        self.tensor_dict[name] = FakeTensor(merged[gather_index], seq_lens)
def __init__(self, values, seq_lens=None):
    """Lightweight tensor stand-in: flat values plus an optional LoD.

    values: flattened data; first dim must equal sum(seq_lens) when
        seq_lens is given
    seq_lens: per-list lengths; None means a plain (non-sequence) tensor
    """
    self.values = values
    self.seq_lens = seq_lens
    if seq_lens is not None:
        AssertEqual(len(values), np.sum(seq_lens))
        self.lod = [seq_len_2_lod(seq_lens)]
    else:
        self.lod = []
def click_prob_2_score(click_prob):
    """Collapse a click-probability distribution into a scalar score.

    args:
        click_prob: (n, dim) probabilities over click levels
    return:
        click_score: (n,), expectation-style score sum_k k * p_k
    """
    AssertEqual(len(click_prob.shape), 2)
    n_levels = click_prob.shape[1]
    # Weight level k by the value k, then reduce over the level axis.
    level_weights = np.arange(n_levels).reshape([1, -1])
    return np.sum(click_prob * level_weights, 1)
def sequence_gather(input_sequence, lens, index):
    """Pick one element from each sub-sequence; a None index picks nothing.

    input_sequence: (sum(lens), *)
    lens: (batch_size,)
    index: len() = batch_size
    e.g.
        input_sequence = [1,2, 3,4,5,6]
        lens = [2, 0, 4]
        index = [1, None, 2]
        return [2, [], 5]
    """
    AssertEqual(len(input_sequence), np.sum(lens))
    AssertEqual(len(lens), len(index))
    res = []
    for sub_seq, idx in zip(sequence_unconcat(input_sequence, lens), index):
        res.append([] if idx is None else sub_seq[idx])
    return res
def sequence_unconcat(input_sequence, lens):
    """Split a flat sequence back into per-list chunks.

    input_sequence: (sum(lens), *)
    e.g.
        input_sequence = [1,2,3,4,5,6]
        lens = [2, 0, 4]
        return [[1,2], [], [3,4,5,6]]
    """
    AssertEqual(len(input_sequence), np.sum(lens))
    # Prefix sums give each chunk's [start, end) boundary in one pass.
    bounds = np.cumsum([0] + list(lens))
    return [input_sequence[s:e] for s, e in zip(bounds[:-1], bounds[1:])]
def get_candidates(self, pre_items, stop_flags=None):
    """List the not-yet-selected items for each list in the batch.

    pre_items: len() = batch_size, items already selected per list
    stop_flags: (batch_size,), truthy entries yield an empty candidate set;
        defaults to all zeros (no list stopped)
    return:
        candidate_items: len() = batch_size, e.g. [[2,3,5], [3,4], ...]
    """
    if stop_flags is None:
        stop_flags = np.zeros([len(pre_items)])
    AssertEqual(len(pre_items), len(stop_flags))
    candidates = []
    for chosen, seq_len, stopped in zip(pre_items, self.seq_lens(), stop_flags):
        candidates.append(
            [] if stopped else np.setdiff1d(np.arange(seq_len), chosen))
    return candidates
def sequence_expand(input, lens):
    """Repeat each per-list entry lens[i] times, then concatenate.

    input: len() = batch_size, e.g. [(dim), [], (dim), ...]
    lens: (batch_size,)
    e.g.
        input_sequence = [(dim), [], (dim)] lens = [1,0,2]
        return (3, dim)
    e.g.
        input_sequence = [(dim), [dim], (dim)] lens = [1,0,2]
        return (3, dim)
    """
    AssertEqual(len(input), len(lens))
    # NOTE(review): np.concatenate raises when every len is 0 — presumably
    # callers guarantee at least one positive length; confirm.
    expanded = [np.array([entry] * l)  # (l, dim)
                for entry, l in zip(input, lens) if l > 0]
    return np.concatenate(expanded, axis=0)
def get_reordered(self, order):
    """Gather item-level features following `order`; `click_id` is removed.

    order: len() = batch_size, per-list item positions
    """
    AssertEqual(len(order), self.batch_size())

    # Translate per-list positions into indices over the flattened batch.
    flat_indice = np.concatenate(
        [np.array(sub) + off for sub, off in zip(order, self.offset())],
        axis=0)

    reordered = BatchData(self.conf, self.tensor_dict)
    seq_lens = [len(sub) for sub in order]
    for slot in reordered.conf.item_slot_names:
        slot_values = reordered.tensor_dict[slot].values
        reordered.tensor_dict[slot] = FakeTensor(
            slot_values[flat_indice], seq_lens)
    # NOTE(review): if BatchData keeps a reference to the dict passed in,
    # this `del` also drops 'click_id' from self.tensor_dict — confirm
    # BatchData copies the dict.
    del reordered.tensor_dict['click_id']
    return reordered
def apply_masks(batch_data, list_item_masks, conf=None):
    """
    list_item_masks: (n_masks, seq_len), a list of 1d item_mask
    Apply mask on item_level_slot_names except last_click_id

    Builds a feed_dict that replicates every recent-level and item-level
    slot n_masks times along the batch axis; each replica of an item-level
    slot is multiplied by the corresponding mask. The returned tensors are
    LoD tensors placed on CPU.
    """
    if conf is None:
        conf = batch_data.conf
    batch_size = batch_data.batch_size()
    # assumes every list shares one seq_len — only the first list and the
    # first mask are length-checked; TODO confirm uniform lengths upstream
    seq_len = batch_data.seq_lens()[0]
    AssertEqual(len(list_item_masks[0]), seq_len)
    batch_data.add_last_click_id()
    n_masks = len(list_item_masks)
    # Flatten masks to (n_masks * seq_len,) then tile per batch element, so
    # the layout is batch-major: (batch, mask, seq) flattened.
    batch_item_masks = np.tile(
        np.array(list_item_masks).flatten(),
        [batch_size])  # (batch_size * n_masks * seq_len)
    place = fluid.CPUPlace()
    feed_dict = {}
    for name in conf.recent_slot_names + \
            conf.item_slot_names:
        ft = batch_data.tensor_dict[name]
        v = ft.values
        # Trailing feature dims are kept verbatim through every reshape.
        extra_shape = list(v.shape[1:])
        v = v.reshape([batch_size, -1] +
                      extra_shape)  # (batch_size, seq_len/recent_len, ...)
        # np.repeat on axis 0 keeps batch-major order, matching the tiled
        # mask layout above.
        v = np.repeat(
            v, n_masks,
            axis=0)  # (batch_size * n_masks, seq_len/recent_len, ...)
        seq_lens = [v.shape[1]] * (batch_size * n_masks)
        v = v.reshape([-1] + extra_shape
                      )  # (batch_size * n_masks * seq_len/recent_len, ...)
        if name in conf.item_slot_names and name != 'last_click_id':
            # Broadcast the 1d mask over the trailing feature dims.
            v = v * batch_item_masks.reshape([-1] + [1] * (len(v.shape) - 1))
        feed_dict[name] = create_tensor(
            v, lod=[seq_len_2_lod(seq_lens)], place=place)
    return feed_dict
def sequence_sampling(scores, lens, sampling_type):
    """Select one index per sub-sequence according to `sampling_type`.

    scores: (sum(lens),)
    lens: (batch_size,)
    sampling_type: only 'greedy' (argmax) is currently supported
    return: (batch_size,), per-list selected index; None for empty lists
    e.g.
        scores = [0.4,0.3, 0.4,0.9,0.8]
        lens = [2, 0, 3]
        return [0, None, 1]
    raises:
        NotImplementedError: for an unsupported sampling_type. Previously a
        non-empty list with an unknown sampling_type appended nothing,
        silently misaligning the result with the batch.
    """
    AssertEqual(len(scores), np.sum(lens))
    res_index = []
    for sub_score in sequence_unconcat(scores, lens):
        if len(sub_score) == 0:
            res_index.append(None)
        elif sampling_type == 'greedy':
            res_index.append(np.argmax(sub_score))
        else:
            # Fail loudly instead of dropping the entry.
            raise NotImplementedError(
                'unsupported sampling_type: %s' % sampling_type)
    return res_index
def replace_following_items(self, pos, ref_batch_data):
    """Replace items from position `pos` onward with `ref_batch_data`'s.

    Replaces click_id as well. Used for credit variance calculation.
    `last_click_id` is ignored.
    """
    batch_size = self.batch_size()
    AssertEqual(batch_size, ref_batch_data.batch_size())
    new_batch_data = BatchData(self.conf, self.tensor_dict)
    slot_names = (new_batch_data.conf.item_slot_names +
                  new_batch_data.conf.label_slot_names)
    for name in slot_names:
        if name == 'last_click_id':
            continue
        own = new_batch_data.tensor_dict[name].values   # (b*seq_len, *)
        ref = ref_batch_data.tensor_dict[name].values   # (b*seq_len, *)
        tail = list(own.shape[1:])
        head_part = own.reshape([batch_size, -1] + tail)[:, :pos]
        tail_part = ref.reshape([batch_size, -1] + tail)[:, pos:]
        stitched = np.concatenate([head_part, tail_part], 1)  # (b, seq_len, *)
        # NOTE(review): assigning .values mutates a FakeTensor that may be
        # shared with self.tensor_dict — confirm BatchData deep-copies.
        new_batch_data.tensor_dict[name].values = stitched.reshape(own.shape)
    return new_batch_data