def _build_session_graph(self, bat_items):
    A_in, A_out, alias_inputs = [], [], []
    all_mask = [[1] * len(items) for items in bat_items]
    bat_items = pad_sequences(bat_items, value=self.num_item)

    unique_nodes = [np.unique(items).tolist() for items in bat_items]
    max_n_node = np.max([len(nodes) for nodes in unique_nodes])
    for u_seq, u_node, mask in zip(bat_items, unique_nodes, all_mask):
        adj_mat = np.zeros((max_n_node, max_n_node))
        id_map = {node: idx for idx, node in enumerate(u_node)}
        if len(u_seq) > 1:
            # Add a directed edge for every consecutive pair in the session.
            alias_previous = [id_map[i] for i in u_seq[:len(mask) - 1]]
            alias_next = [id_map[i] for i in u_seq[1:len(mask)]]
            adj_mat[alias_previous, alias_next] = 1

        # Incoming adjacency, column-normalized by in-degree.
        u_sum_in = np.sum(adj_mat, axis=0)
        u_sum_in[np.where(u_sum_in == 0)] = 1
        u_A_in = np.divide(adj_mat, u_sum_in)

        # Outgoing adjacency, normalized by out-degree.
        u_sum_out = np.sum(adj_mat, axis=1)
        u_sum_out[np.where(u_sum_out == 0)] = 1
        u_A_out = np.divide(adj_mat.transpose(), u_sum_out)

        A_in.append(u_A_in)
        A_out.append(u_A_out)
        alias_inputs.append([id_map[i] for i in u_seq])

    items = pad_sequences(unique_nodes, value=self.num_item)
    all_mask = pad_sequences(all_mask, value=0)
    return A_in, A_out, alias_inputs, items, all_mask

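# The snippets in this file call several pad_sequences variants (a NeuRec-style
# pad_sequences(seqs, value=, max_len=, padding=, truncating=), a
# util.pad_sequences(x, dynamic_padding=, pad_location=), and others). Below is
# a minimal sketch of the NeuRec-style behavior, inferred from the call sites
# here and named pad_sequences_sketch to avoid clashing with the real helpers;
# defaults and dtype handling may differ from any actual implementation.
import numpy as np

def pad_sequences_sketch(sequences, value=0, max_len=None,
                         padding='post', truncating='post', dtype=np.int32):
    """Pad a batch of variable-length sequences into a (batch, max_len) array."""
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)
    result = np.full((len(sequences), max_len), value, dtype=dtype)
    for i, seq in enumerate(sequences):
        seq = list(seq)
        if len(seq) > max_len:
            # 'pre' truncation keeps the most recent items, 'post' the oldest.
            seq = seq[-max_len:] if truncating == 'pre' else seq[:max_len]
        if padding == 'pre':
            result[i, max_len - len(seq):] = seq   # pad on the left
        else:
            result[i, :len(seq)] = seq             # pad on the right
    return result

# pad_sequences_sketch([[3, 1], [7]], value=9, max_len=3, padding='pre')
# -> array([[9, 3, 1],
#           [9, 9, 7]], dtype=int32)
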
def create_batches(self):
    self.train_df = self.shuffle_data(self.train_df)  # Randomise data

    # Train set:
    self.train_x = np.array([d[0] for d in self.train_df])
    self.train_size = np.array([len(seq) for seq in self.train_x])
    self.train_y = np.array([d[-1] for d in self.train_df])
    self.train_left_x = np.array([d[1] for d in self.train_df])
    self.train_left_size = np.array([len(seq) for seq in self.train_left_x])
    self.train_right_x = np.array([d[2] for d in self.train_df])
    self.train_right_size = np.array([len(seq) for seq in self.train_right_x])
    self.train_target_x = np.array([d[3] for d in self.train_df])
    # Padding
    self.train_x = util.pad_sequences(self.train_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.train_left_x = util.pad_sequences(self.train_left_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.train_right_x = util.pad_sequences(self.train_right_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.train_x = np.array(self.train_x)
    self.train_left_x = np.array(self.train_left_x)
    self.train_right_x = np.array(self.train_right_x)

    # Dev set:
    self.dev_x = np.array([d[0] for d in self.dev_df])
    self.dev_size = np.array([len(seq) for seq in self.dev_x])
    self.dev_y = np.array([d[-1] for d in self.dev_df])
    self.dev_left_x = np.array([d[1] for d in self.dev_df])
    self.dev_left_size = np.array([len(seq) for seq in self.dev_left_x])
    self.dev_right_x = np.array([d[2] for d in self.dev_df])
    self.dev_right_size = np.array([len(seq) for seq in self.dev_right_x])
    self.dev_target_x = np.array([d[3] for d in self.dev_df])
    # Padding
    self.dev_x = util.pad_sequences(self.dev_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.dev_left_x = util.pad_sequences(self.dev_left_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.dev_right_x = util.pad_sequences(self.dev_right_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.dev_x = np.array(self.dev_x)
    self.dev_left_x = np.array(self.dev_left_x)
    self.dev_right_x = np.array(self.dev_right_x)

    # Test set:
    self.test_x = np.array([d[0] for d in self.test_df])
    self.test_size = np.array([len(seq) for seq in self.test_x])
    self.test_y = np.array([d[-1] for d in self.test_df])
    self.test_left_x = np.array([d[1] for d in self.test_df])
    self.test_left_size = np.array([len(seq) for seq in self.test_left_x])
    self.test_right_x = np.array([d[2] for d in self.test_df])
    self.test_right_size = np.array([len(seq) for seq in self.test_right_x])
    self.test_target_x = np.array([d[3] for d in self.test_df])
    # Padding
    self.test_x = util.pad_sequences(self.test_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.test_left_x = util.pad_sequences(self.test_left_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.test_right_x = util.pad_sequences(self.test_right_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
    self.test_x = np.array(self.test_x)
    self.test_left_x = np.array(self.test_left_x)
    self.test_right_x = np.array(self.test_right_x)

    # Vectorizing labels
    # self.train_y = pd.get_dummies(self.train_y).values.astype(np.int32)
    # self.dev_y = pd.get_dummies(self.dev_y).values.astype(np.int32)
    self.test_y = pd.get_dummies(self.test_y).values.astype(np.int32)

    # Creating training batches
    self.num_batches = len(self.train_x) // self.batch_size
    if self.num_batches == 0:
        assert False, "Not enough data for the batch size."
    # Splitting train set into batches based on num_batches
    self.batch_df = np.array_split(self.train_df, self.num_batches)
    assert np.array([d[-1] for d in self.batch_df[-1]]).shape[1] == 3, \
        "Watch out! All batches must contain 3 labels!"

def packed_rnn(self, x, rnn):
    """Runs the provided rnn on the input x. Takes care of packing/unpacking.

    x: list of unpadded input sequences
    Returns a tensor of size: len(x) x hidden_dim
    """
    lengths = torch.tensor([len(n) for n in x], dtype=torch.long, device=device)
    # Sort this batch in descending order by seq length
    lengths, idx_sort = torch.sort(lengths, dim=0, descending=True)
    _, idx_unsort = torch.sort(idx_sort, dim=0)
    padded_x = pad_sequences(x)
    x_tt = torch.from_numpy(padded_x).type(torch.long).to(device)
    x_tt = x_tt.index_select(0, idx_sort)
    # Run the embedding layer
    embed = self.embedding(x_tt).permute(1, 0, 2)  # Time x Batch x EncDim
    # Pack padded batch of sequences for RNN module
    # (pack_padded_sequence expects the lengths tensor on the CPU)
    packed = nn.utils.rnn.pack_padded_sequence(embed, lengths.cpu())
    # Run the RNN
    out, _ = rnn(packed)
    # Unpack
    out, _ = nn.utils.rnn.pad_packed_sequence(out)
    # Get the last step of each sequence
    idx = (lengths - 1).view(-1, 1).expand(len(lengths), out.size(2)).unsqueeze(0)
    out = out.gather(0, idx).squeeze(0)
    # Unsort
    out = out.index_select(0, idx_unsort)
    return out

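# A self-contained toy run of the pack -> RNN -> unpack -> gather-last-step
# pattern used by packed_rnn above. The vocabulary size, embedding dim, hidden
# dim and the GRU here are placeholders for illustration, not the model's
# actual configuration.
import torch
import torch.nn as nn

emb = nn.Embedding(10, 4)                    # toy vocab of 10, embedding dim 4
gru = nn.GRU(input_size=4, hidden_size=6)    # time-major (batch_first=False)

lengths = torch.tensor([3, 2, 1])            # already sorted, descending
padded = torch.tensor([[1, 2, 3],            # batch of 3 padded sequences
                       [4, 5, 0],
                       [6, 0, 0]])

embedded = emb(padded).permute(1, 0, 2)      # Time x Batch x EmbDim
packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
out, _ = gru(packed)
out, _ = nn.utils.rnn.pad_packed_sequence(out)   # Time x Batch x Hidden

# Gather each sequence's final valid timestep, exactly as packed_rnn does.
idx = (lengths - 1).view(-1, 1).expand(len(lengths), out.size(2)).unsqueeze(0)
last = out.gather(0, idx).squeeze(0)             # Batch x Hidden
print(last.shape)                                # torch.Size([3, 6])
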
def _generate_sequences(self):
    self.user_test_seq = {}
    users_list, item_seq_list, item_pos_list = [], [], []
    seq_len = self.seq_L + self.seq_T
    uni_users = np.unique(list(self.user_pos_train.keys()))
    for user in uni_users:
        seq_items = self.user_pos_train[user]
        if len(seq_items) - seq_len >= 0:
            for i in range(len(seq_items), 0, -1):
                if i - seq_len >= 0:
                    seq_i = seq_items[i - seq_len:i]
                    if user not in self.user_test_seq:
                        self.user_test_seq[user] = seq_i[-self.seq_L:]
                    users_list.append(user)
                    item_seq_list.append(seq_i[:self.seq_L])
                    item_pos_list.append(seq_i[-self.seq_T:])
                else:
                    break
        else:
            seq_items = np.reshape(seq_items, newshape=[1, -1]).astype(np.int32)
            seq_items = pad_sequences(seq_items, value=self.items_num, max_len=seq_len,
                                      padding='pre', truncating='pre')
            seq_i = np.reshape(seq_items, newshape=[-1])
            if user not in self.user_test_seq:
                self.user_test_seq[user] = seq_i[-self.seq_L:]
            users_list.append(user)
            item_seq_list.append(seq_i[:self.seq_L])
            item_pos_list.append(seq_i[-self.seq_T:])
    return users_list, item_seq_list, item_pos_list

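# A worked example of the sliding-window split in _generate_sequences, with
# hypothetical seq_L = 3 and seq_T = 1: each window of length seq_L + seq_T
# yields an input prefix of length seq_L and a target suffix of length seq_T,
# walking backwards from the most recent interaction.
seq_L, seq_T = 3, 1
seq_items = [11, 12, 13, 14, 15]
seq_len = seq_L + seq_T
for i in range(len(seq_items), 0, -1):
    if i - seq_len < 0:
        break
    window = seq_items[i - seq_len:i]
    print(window[:seq_L], '->', window[-seq_T:])
# [12, 13, 14] -> [15]
# [11, 12, 13] -> [14]
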
def _Q_oneshot(self, batch):
    new_observations_ids = []
    new_admissible_actions_ids = []
    observation_padding_size = 0
    idx_mapping = {}
    current_idx = 0
    for idx, sample in enumerate(batch):
        if sample.done:
            continue
        new_observations_ids.append(sample.new_observation_ids)
        new_admissible_actions_ids.append(sample.new_admissible_actions_ids)
        observation_padding_size = max(observation_padding_size,
                                       len(sample.new_observation_ids))
        idx_mapping[idx] = (current_idx,
                            current_idx + len(sample.new_admissible_actions_ids))
        current_idx += len(sample.new_admissible_actions_ids)
    new_observations_ids = pad_sequences(new_observations_ids,
                                         max_len=observation_padding_size)
    new_tiled_observations = []
    for idx, observation in enumerate(new_observations_ids):
        num_actions = len(new_admissible_actions_ids[idx])
        new_tiled_observations.append(np.tile(observation, (num_actions, 1)))
    q_values, _ = self.model.predict(
        np.concatenate(new_tiled_observations),
        np.concatenate(new_admissible_actions_ids))
    return q_values, idx_mapping

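# A small illustration of the observation tiling in _Q_oneshot: each padded
# observation is repeated once per admissible action, so all (observation,
# action) pairs can be scored in a single batched predict call.
import numpy as np

observation = np.array([4, 8, 15, 0])        # one padded observation
num_actions = 3                              # admissible actions for this state
tiled = np.tile(observation, (num_actions, 1))
print(tiled.shape)                           # (3, 4): one row per action
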
def train_model(self):
    self.logger.info(self.evaluator.metrics_info())
    for epoch in range(1, self.num_epochs + 1):
        user_input, num_idx, item_input, labels = \
            data_generator._get_pointwise_all_likefism_data(self.dataset, self.num_negatives, self.train_dict)
        data_iter = DataIterator(user_input, num_idx, item_input, labels,
                                 batch_size=self.batch_size, shuffle=True)
        num_training_instances = len(user_input)
        total_loss = 0.0
        training_start_time = time()
        for bat_users, bat_idx, bat_items, bat_labels in data_iter:
            bat_users = pad_sequences(bat_users, value=self.num_items)
            feed_dict = {
                self.user_input: bat_users,
                self.num_idx: bat_idx,
                self.item_input: bat_items,
                self.labels: bat_labels,
                self.is_train_phase: True
            }
            loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
            total_loss += loss
        self.logger.info("[iter %d : loss : %f, time: %f]" %
                         (epoch, total_loss / num_training_instances, time() - training_start_time))
        if epoch % self.verbose == 0:
            self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate()))

def evaluate(self, model, test_users=None):
    """Evaluate `model`.

    Args:
        model: The model to be evaluated. It must have a method
            `predict(users, candidate_items)`, where `users` is a list of
            users and the return value is a 2-D array containing the users'
            rating/ranking scores on all items.

    Returns:
        str: A single-line string consisting of all results, such as
            `"0.18663847 0.11239596 0.35824192 0.21479650"`.
    """
    # B: batch size
    # N: the number of items
    test_users = test_users if test_users is not None else list(self.user_pos_test.keys())
    if not isinstance(test_users, (list, tuple, set, np.ndarray)):
        raise TypeError("'test_users' must be a list, tuple, set or numpy array!")

    test_users = DataIterator(test_users, batch_size=self.batch_size,
                              shuffle=False, drop_last=False)
    batch_result = []
    for batch_users in test_users:
        if self.user_neg_test is not None:
            candidate_items = [list(self.user_pos_test[u]) + self.user_neg_test[u]
                               for u in batch_users]
            test_items = [list(range(len(self.user_pos_test[u]))) for u in batch_users]
            ranking_score = model.predict(batch_users, candidate_items)  # (B,N)
            ranking_score = pad_sequences(ranking_score, value=-np.inf, dtype=np.float32)
            ranking_score = np.array(ranking_score)
        else:
            test_items = [self.user_pos_test[u] for u in batch_users]
            ranking_score = model.predict(batch_users, None)  # (B,N)
            ranking_score = np.array(ranking_score)

            # Set the ranking scores of training items to -inf, so the
            # training items are sorted to the end of the ranking list.
            for idx, user in enumerate(batch_users):
                if user in self.user_pos_train and len(self.user_pos_train[user]) > 0:
                    train_items = self.user_pos_train[user]
                    ranking_score[idx][train_items] = -np.inf

        result = self.eval_score_matrix(ranking_score, test_items, self.metrics,
                                        top_k=self.max_top, thread_num=self.num_thread)  # (B, k*metric_num)
        batch_result.append(result)

    # Concatenate the batch results into a matrix
    all_user_result = np.concatenate(batch_result, axis=0)  # (num_users, metrics_num*max_top)
    final_result = np.mean(all_user_result, axis=0)  # (metrics_num*max_top,)
    final_result = np.reshape(final_result, newshape=[self.metrics_num, self.max_top])  # (metrics_num, max_top)
    final_result = final_result[:, self.top_show - 1]
    final_result = np.reshape(final_result, newshape=[-1])
    buf = '\t'.join([("%.8f" % x).ljust(12) for x in final_result])
    return buf

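# A tiny numeric illustration of the -inf masking used in evaluate above:
# items the user already interacted with during training are pushed to the
# very end of the ranking.
import numpy as np

ranking_score = np.array([0.9, 0.1, 0.5, 0.7], dtype=np.float32)
train_items = [0, 3]                         # items seen during training
ranking_score[train_items] = -np.inf
print(np.argsort(-ranking_score))            # [2 1 0 3]: items 0 and 3 rank last
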
def train(self):
    batch = self.replay_buffer.sample(self.config['training_batch_size'])
    if not batch:
        return
    if self.config['batch_oneshot']:
        observations_ids, rewards, actions_ids = self._preprocess_batch_oneshot(batch)
    else:
        observations_ids, rewards, actions_ids = self._preprocess_batch(batch)
    self.model.train(
        np.stack(pad_sequences(observations_ids,
                               max_len=self.get_observation_padding_size())),
        np.stack(rewards),
        np.stack(pad_sequences(actions_ids,
                               max_len=self.get_actions_padding_size())))

def pandasToTensor(data, globalVocab):
    data = shuffle(data)

    # Preprocessing data:
    # retain only text that contains fewer than 70 tokens to avoid too much padding
    data["token_size"] = data["text"].apply(lambda x: len(x.split(' ')))
    data = data.loc[data['token_size'] < 70].copy()

    # sampling
    # data = data.sample(n=50000)

    # construct vocab and indexing
    # inputs = construct.ConstructVocab(data["text"].values.tolist())
    # print(globalVocab.vocab[0:10])

    input_tensor = [[globalVocab.word2idx[s] for s in es.split(' ')]
                    for es in data["text"].values.tolist()]
    # examples of what is in the input tensors
    # print(input_tensor[0:2])

    # calculate the max_length of input tensor
    max_length_inp = util.max_length(input_tensor)
    # print(max_length_inp)

    # inplace padding
    input_tensor = [util.pad_sequences(x, max_length_inp) for x in input_tensor]
    # print(input_tensor[0:2])

    ### Binarization
    emotions = list(emotion_dict.values())
    num_emotions = len(emotion_dict)
    # print(emotions)

    # binarizer
    mlb = preprocessing.MultiLabelBinarizer(classes=emotions)
    data_labels = [emos for emos in data[['emotions']].values]
    # print(data_labels)
    bin_emotions = mlb.fit_transform(data_labels)
    target_tensor = np.array(bin_emotions.tolist())
    # print(target_tensor[0:2])
    # print(data[0:2])

    # Sanity check (result unused): emotion_dict[np.argmax(target_tensor[0])]
    return input_tensor, target_tensor

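# A toy run of the MultiLabelBinarizer step used in pandasToTensor, with a
# hypothetical three-emotion inventory standing in for emotion_dict: labels
# become fixed-width binary indicator rows whose column order follows `classes`.
from sklearn import preprocessing
import numpy as np

emotions = ['anger', 'joy', 'sadness']       # stand-in for emotion_dict values
mlb = preprocessing.MultiLabelBinarizer(classes=emotions)
bin_emotions = mlb.fit_transform([['joy'], ['anger', 'sadness']])
print(bin_emotions)
# [[0 1 0]
#  [1 0 1]]
target_tensor = np.array(bin_emotions.tolist())
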
def predict_script(model, users, items=None):
    users = DataIterator(users, batch_size=512, shuffle=False, drop_last=False)
    all_ratings = []
    for bat_user in users:
        bat_seq = [model.user_pos_train[u] for u in bat_user]
        bat_seq = pad_sequences(bat_seq, value=model.items_num, max_len=model.max_len,
                                padding='pre', truncating='pre')
        bat_pos = [model.user_pos_train[u][1:] for u in bat_user]
        n_neg_items = [len(pos) for pos in bat_pos]
        exclusion = [model.user_pos_train[u] for u in bat_user]
        bat_neg = batch_randint_choice(model.items_num, n_neg_items,
                                       replace=True, exclusion=exclusion)
        bat_pos = pad_sequences(bat_pos, value=model.items_num, max_len=model.max_len,
                                padding='pre', truncating='pre')
        bat_neg = pad_sequences(bat_neg, value=model.items_num, max_len=model.max_len,
                                padding='pre', truncating='pre')
        _, _x, bat_ratings = model(bat_seq, bat_pos, bat_neg)
        all_ratings.extend(bat_ratings)
    all_ratings = [t.detach().cpu().numpy() for t in all_ratings]
    # all_ratings = np.array(all_ratings, dtype=np.float32)
    if items is not None:
        all_ratings = [all_ratings[idx][item] for idx, item in enumerate(items)]
    return all_ratings

def _build_admissible_actions_ids(self, info, shuffle):
    admissible_actions_ids = [
        memoized_string_to_ids(admissible_action, self.word_ids, tokenizer=self.nlp)
        for admissible_action in info['admissible_commands']
    ]
    result = np.array(pad_sequences(admissible_actions_ids,
                                    max_len=self.get_actions_padding_size()))
    if shuffle:
        np.random.shuffle(result)
    return result

def get_train_data(self):
    item_seq_list, item_pos_list, item_neg_list = [], [], []
    all_users = DataIterator(list(self.user_pos_train.keys()), batch_size=1024, shuffle=False)
    for bat_users in all_users:
        bat_seq = [self.user_pos_train[u][:-1] for u in bat_users]
        bat_pos = [self.user_pos_train[u][1:] for u in bat_users]
        n_neg_items = [len(pos) for pos in bat_pos]
        exclusion = [self.user_pos_train[u] for u in bat_users]
        bat_neg = batch_randint_choice(self.items_num, n_neg_items,
                                       replace=True, exclusion=exclusion)
        # padding
        bat_seq = pad_sequences(bat_seq, value=self.items_num, max_len=self.max_len,
                                padding='pre', truncating='pre')
        bat_pos = pad_sequences(bat_pos, value=self.items_num, max_len=self.max_len,
                                padding='pre', truncating='pre')
        bat_neg = pad_sequences(bat_neg, value=self.items_num, max_len=self.max_len,
                                padding='pre', truncating='pre')
        item_seq_list.extend(bat_seq)
        item_pos_list.extend(bat_pos)
        item_neg_list.extend(bat_neg)
    return item_seq_list, item_pos_list, item_neg_list  # , user_list

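# A toy walk-through of the (sequence, positive, negative) construction in
# get_train_data, assuming items_num = 10 and max_len = 5, with a plain numpy
# sampler standing in for batch_randint_choice.
import numpy as np

items_num, max_len = 10, 5
user_items = [2, 7, 4, 9]

seq = user_items[:-1]                        # [2, 7, 4] -> model input
pos = user_items[1:]                         # [7, 4, 9] -> next-item targets
# one negative per target, excluding everything the user interacted with
rng = np.random.default_rng(0)
candidates = np.setdiff1d(np.arange(items_num), user_items)
neg = rng.choice(candidates, size=len(pos), replace=True).tolist()

def pad_pre(s):                              # 'pre' padding with pad id items_num
    return [items_num] * (max_len - len(s)) + list(s)

print(pad_pre(seq))                          # [10, 10, 2, 7, 4]
print(pad_pre(pos))                          # [10, 10, 7, 4, 9]
print(pad_pre(neg))                          # three sampled negatives, left-padded
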
def pandasToTensor(data):
    # retain only text that contains fewer than 70 tokens to avoid too much padding
    data["token_size"] = data["text"].apply(lambda x: len(x.split(' ')))
    data = data.loc[data['token_size'] < 70].copy()

    # load globalVocab word2idx
    if os.path.exists(vocabPath):
        globalVocab.loadFile(vocabPath)
    else:
        print("Vocabulary doesn't exist")

    input_tensor = [[globalVocab.word2idx[s] for s in es.split(' ')]
                    for es in data["text"].values.tolist()]
    # examples of what is in the input tensors
    # print(input_tensor[0:2])

    # calculate the max_length of input tensor
    max_length_inp = util.max_length(input_tensor)
    # print(max_length_inp)

    # inplace padding
    input_tensor = [util.pad_sequences(x, max_length_inp) for x in input_tensor]
    # print(input_tensor[0:2])

    ### Binarization
    emotions = list(emotion_dict.values())
    num_emotions = len(emotion_dict)
    # print(emotions)

    # binarizer
    mlb = preprocessing.MultiLabelBinarizer(classes=emotions)
    data_labels = [emos for emos in data[['emotion']].values]
    # print(data_labels)
    bin_emotions = mlb.fit_transform(data_labels)
    target_tensor = np.array(bin_emotions.tolist())
    # print(target_tensor[0:2])
    # print(data[0:2])

    # Sanity check (result unused): emotion_dict[np.argmax(target_tensor[0])]
    return input_tensor, target_tensor

def evaluate(self, model):
    # B: batch size
    # N: the number of items
    test_users = DataIterator(list(self.user_pos_test.keys()),
                              batch_size=self.batch_size, shuffle=False, drop_last=False)
    batch_result = []
    for batch_users in test_users:
        if self.user_neg_test is not None:
            candidate_items = []
            test_items = []
            for user in batch_users:
                pos = self.user_pos_test[user]
                neg = self.user_neg_test[user]
                candidate_items.append(pos + neg)
                test_items.append(list(range(len(pos))))
            ranking_score = model.predict(batch_users, candidate_items)  # (B,N)
            ranking_score = pad_sequences(ranking_score, value=-np.inf)
            ranking_score = np.array(ranking_score)
        else:
            test_items = []
            for user in batch_users:
                test_items.append(self.user_pos_test[user])
            ranking_score = model.predict(batch_users, None)  # (B,N)
            ranking_score = np.array(ranking_score)

            # Set the ranking scores of training items to -inf, so the
            # training items are sorted to the end of the ranking list.
            for idx, user in enumerate(batch_users):
                train_items = self.user_pos_train[user]
                ranking_score[idx][train_items] = -np.inf

        result = eval_score_matrix_foldout(ranking_score, test_items,
                                           top_k=self.max_top, thread_num=None)  # (B, k*metric_num)
        batch_result.append(result)

    # Concatenate the batch results into a matrix
    all_user_result = np.concatenate(batch_result, axis=0)
    final_result = np.mean(all_user_result, axis=0)  # mean over users
    final_result = np.reshape(final_result, newshape=[self.metrics_num, self.max_top])
    final_result = final_result[:, self.top_show - 1]
    final_result = np.reshape(final_result, newshape=[-1])
    buf = '\t'.join([("%.8f" % x).ljust(12) for x in final_result])
    return buf

def predict(self, users, items=None):
    users = DataIterator(users, batch_size=512, shuffle=False, drop_last=False)
    all_ratings = []
    for bat_user in users:
        bat_seq = [self.user_pos_train[u] for u in bat_user]
        bat_seq = pad_sequences(bat_seq, value=self.items_num, max_len=self.max_len,
                                padding='pre', truncating='pre')
        feed = {self.item_seq_ph: bat_seq, self.is_training: False}
        bat_ratings = self.sess.run(self.all_logits, feed_dict=feed)
        all_ratings.extend(bat_ratings)
    all_ratings = np.array(all_ratings, dtype=np.float32)
    if items is not None:
        all_ratings = [all_ratings[idx][item] for idx, item in enumerate(items)]
    return all_ratings

def train_model(self):
    self.logger.info(self.evaluator.metrics_info())
    for epoch in range(1, self.num_epochs + 1):
        if self.is_pairwise is True:
            user_input, user_input_neg, num_idx_pos, num_idx_neg, item_input_pos, item_input_neg = \
                data_generator._get_pairwise_all_likefism_data(self.dataset)
            data_iter = DataIterator(user_input, user_input_neg, num_idx_pos, num_idx_neg,
                                     item_input_pos, item_input_neg,
                                     batch_size=self.batch_size, shuffle=True)
        else:
            user_input, num_idx, item_input, labels = \
                data_generator._get_pointwise_all_likefism_data_debug(self.dataset, self.num_negatives, self.train_dict)
            data_iter = DataIterator(user_input, num_idx, item_input, labels,
                                     batch_size=self.batch_size, shuffle=True)
        total_loss = 0.0
        training_start_time = time()
        if self.is_pairwise is True:
            for bat_users_pos, bat_users_neg, bat_idx_pos, bat_idx_neg, bat_items_pos, bat_items_neg in data_iter:
                bat_users_pos = pad_sequences(bat_users_pos, value=self.num_items)
                bat_users_neg = pad_sequences(bat_users_neg, value=self.num_items)
                feed_dict = {
                    self.user_input: bat_users_pos,
                    self.user_input_neg: bat_users_neg,
                    self.num_idx: bat_idx_pos,
                    self.num_idx_neg: bat_idx_neg,
                    self.item_input: bat_items_pos,
                    self.item_input_neg: bat_items_neg
                }
                loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
                total_loss += loss
        else:
            for index, (bat_users, bat_idx, bat_items, bat_labels) in enumerate(data_iter):
                bat_users = pad_sequences(bat_users, value=self.num_items)
                feed_dict = {
                    self.user_input: bat_users,
                    self.num_idx: bat_idx,
                    self.item_input: bat_items,
                    self.labels: bat_labels
                }
                loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
                total_loss += loss
        self.logger.info("[iter %d : loss : %f, time: %f]" %
                         (epoch, total_loss / len(user_input), time() - training_start_time))
        if epoch % self.verbose == 0:
            params = self.sess.run([self.c1, self.embedding_Q, self.bias])
            # with open("/gdata/yujr/pretrained/epoch=%d_gamma=%f_lambda=%f_fism.pkl"
            #           % (epoch, self.gamma_bilinear, self.lambda_bilinear), "wb") as fout:
            #     pickle.dump(params, fout)
            self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate_val()))
            self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate()))

def __init__(self, batch_size, dynamic_padding=False, preprocessing=False,
             embedding=True, saved=False, max_length=None):
    train = ElectionData.read_data('../data/election-data/training/')
    test = ElectionData.read_data('../data/election-data/testing/')
    self.batch_size = batch_size
    self.dynamic_padding = dynamic_padding

    self.train_tweets, self.train_targets, self.train_y = zip(*train)
    self.test_tweets, self.test_targets, self.test_y = zip(*test)
    self.train_left_tweets = [ElectionData.split_tweet(self.train_tweets[i], self.train_targets[i])[0]
                              for i in range(len(self.train_tweets))]
    self.train_right_tweets = [ElectionData.split_tweet(self.train_tweets[i], self.train_targets[i])[1]
                               for i in range(len(self.train_tweets))]
    self.test_left_tweets = [ElectionData.split_tweet(self.test_tweets[i], self.test_targets[i])[0]
                             for i in range(len(self.test_tweets))]
    self.test_right_tweets = [ElectionData.split_tweet(self.test_tweets[i], self.test_targets[i])[1]
                              for i in range(len(self.test_tweets))]
    self.train_tweets = [ElectionData.replace_target(self.train_tweets[i], self.train_targets[i])
                         for i in range(len(self.train_tweets))]
    self.test_tweets = [ElectionData.replace_target(self.test_tweets[i], self.test_targets[i])
                        for i in range(len(self.test_tweets))]
    self.train_targets = [train_target.split('_') for train_target in self.train_targets]
    self.test_targets = [test_target.split('_') for test_target in self.test_targets]

    # Padding tweets (manually adding '<PAD>' tokens)
    if not self.dynamic_padding:
        self.train_tweets = util.pad_sequences(self.train_tweets, pad_location='RIGHT')
        self.test_tweets = util.pad_sequences(self.test_tweets, pad_location='RIGHT')

    # Building vocabulary
    self.vocab, self.vocab_inv = util.build_vocabulary(self.train_tweets + self.test_tweets)

    if embedding:
        # Vectorizing tweets - GloVe embedding
        start = time.clock()
        print(' - Loading embedding..')
        glove, self.glove_vec, self.glove_shape, glove_vocab = util.gensim_load_vec(
            '../resources/wordemb/glove.twitter.word2vec.27B.100d.txt')
        glove_vocab = [token.encode('utf-8') for token in glove_vocab]
        self.glove_vocab_dict = {j: i for i, j in enumerate(glove_vocab)}
        # Append an all-zero vector to the embedding matrix; out-of-vocabulary
        # tokens are mapped to its index (1193514 for this GloVe vocabulary).
        self.glove_vec = np.append(self.glove_vec, [[0] * self.glove_shape[1]], axis=0)
        self.glove_shape = [self.glove_shape[0] + 1, self.glove_shape[1]]
        print(' - DONE')
        print("time taken: %f mins" % ((time.clock() - start) / 60))

        if not saved:
            start = time.clock()
            print(' - Matching words-indices')
            self.train_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514
                                      for token in tweet] for tweet in self.train_tweets])
            self.train_left_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514
                                           for token in tweet] for tweet in self.train_left_tweets])
            self.train_right_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514
                                            for token in tweet] for tweet in self.train_right_tweets])
            self.test_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514
                                     for token in tweet] for tweet in self.test_tweets])
            self.test_left_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514
                                          for token in tweet] for tweet in self.test_left_tweets])
            self.test_right_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514
                                           for token in tweet] for tweet in self.test_right_tweets])
            self.train_target_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514
                                             for token in target] for target in self.train_targets])
            self.test_target_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514
                                            for token in target] for target in self.test_targets])

            self.train_y = pd.get_dummies(self.train_y).values.astype(np.int32)
            self.train_df = [(self.train_x[i], self.train_left_x[i], self.train_right_x[i],
                              self.train_target_x[i], self.train_y[i]) for i in range(len(self.train_x))]
            self.test_df = [(self.test_x[i], self.test_left_x[i], self.test_right_x[i],
                             self.test_target_x[i], self.test_y[i]) for i in range(len(self.test_x))]
            train_y = np.array([d[-1] for d in self.train_df])
            # Dividing into train and dev sets
            self.train_df, self.dev_df = self.build_train_dev(train_y)
            print(' - DONE')
            print("time taken: %f mins" % ((time.clock() - start) / 60))

            print(" - Saving data")
            np.save('../data/election-data/train_df.npy', self.train_df)
            np.save('../data/election-data/dev_df.npy', self.dev_df)
            np.save('../data/election-data/test_df.npy', self.test_df)
            print(' - DONE')
        else:
            print(" - Loading data")
            self.train_df = np.load('../data/election-data/train_df.npy')
            self.dev_df = np.load('../data/election-data/dev_df.npy')
            self.test_df = np.load('../data/election-data/test_df.npy')
            print(' - DONE')
    else:
        # Vectorizing tweets - one-hot-vector
        self.train_x = np.array([[self.vocab[token] for token in tweet] for tweet in self.train_tweets])
        self.test_x = np.array([[self.vocab[token] for token in tweet] for tweet in self.test_tweets])

    self.create_batches()
    self.reset_batch_pointer()

def pad_minibatches(self, x, pad_location):
    x = util.pad_sequences(x, dynamic_padding=self.dynamic_padding, pad_location=pad_location)
    return x

# examples of what is in the vocab
print(inputs.vocab[0:10])

# vectorize to tensor
input_tensor = [[inputs.word2idx[s] for s in es.split(' ')]
                for es in data["text"].values.tolist()]
# examples of what is in the input tensors
print(input_tensor[0:2])

# calculate the max_length of input tensor
max_length_inp = util.max_length(input_tensor)
print(max_length_inp)

# inplace padding
input_tensor = [util.pad_sequences(x, max_length_inp) for x in input_tensor]
print(input_tensor[0:2])

### Binarization
emotions = list(set(data.emotions.unique()))
num_emotions = len(emotions)

# binarizer
mlb = preprocessing.MultiLabelBinarizer()
data_labels = [set(emos) & set(emotions) for emos in data[['emotions']].values]
bin_emotions = mlb.fit_transform(data_labels)
target_tensor = np.array(bin_emotions.tolist())
print(target_tensor[0:2])
print(data[0:2])

get_emotion = lambda t: np.argmax(t)

def train(self):
    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=self.config)
    with self.sess:
        if self.load_model():
            print(' [*] Load SUCCESS!\n')
        else:
            print(' [!] Load Failed...\n')
            self.sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter("./logs", self.sess.graph)
        merged = tf.summary.merge_all()
        self.counter = 1
        word2int, int2word, vocab_size, self.training_data = get_hack_data()
        self.go = word2int["<GO>"]
        self.end = word2int["<EOS>"]
        self.pad = word2int["<PAD>"]
        # print(self.training_data.shape)
        k = len(self.training_data) // self.batch_sizer
        self.start_time = time.time()
        loss_g_val, loss_d_val = 0, 0
        # Drop the tail so every batch is full-sized
        self.training_data = self.training_data[0:(self.batch_sizer * k)]
        test_counter = 0
        print('Starting the Training....')
        print(self.end)
        for e in range(0, self.epoch):
            epoch_loss = 0.
            self.training_data = shuffle_data(self.training_data)
            mean_epoch_loss = []
            for i in range(0, k):
                print(i)
                batch = self.training_data[i * self.batch_sizer:(i + 1) * self.batch_sizer]
                length = len(max(batch, key=len))
                batched_data, l = pad_sequences(batch, word2int)
                batched_data = np.asarray(batched_data, dtype="int32")
                _, loss_val, loss_histo = self.sess.run(
                    [self.optim, self.loss, self.summary_loss],
                    feed_dict={
                        self.input: batched_data,
                        self.targets: batched_data,
                        self.max_seq_len: length,
                        self.seq_length: l,
                        self.batch_si: self.batch_sizer,
                        self.go_index: self.go
                    })
                train_writer.add_summary(loss_histo, self.counter)
                self.counter = self.counter + 1
                epoch_loss += loss_val
                mean_epoch_loss.append(loss_val)
            mean = np.mean(mean_epoch_loss)
            std = np.std(mean_epoch_loss)
            epoch_loss /= k
            print('Validation loss mean: ', mean)
            print('Validation loss std: ', std)
            print("Loss of Seq2Seq Model: %f" % epoch_loss)
            print("Epoch %d" % e)
            if e % 1 == 0:
                save_path = self.saver.save(
                    self.sess,
                    "C:/Users/Andreas/Desktop/seq2seq - continous/checkpoint/model.ckpt",
                    global_step=self.save_epoch)
                print("model saved: %s" % save_path)
                data = get_requests_from_file(
                    "C:/Users/Andreas/Desktop/seq2seq - continous/data/anomaly.txt")
                random_number = np.random.randint(0, len(data))
                data = generate_sentence_int([data[random_number]], word2int)
                batched_test_data, l = pad_sequences(data, word2int)
                batched_test_data = np.asarray(batched_test_data, dtype="int32")
                ba_si = 1
                size = l[0]
                print(batched_test_data)
                w, test, loss_eval = self.sess.run(
                    [self.probs, self.decoder_output, self.loss],
                    feed_dict={
                        self.input: batched_test_data,
                        self.max_seq_len: size,
                        self.seq_length: l,
                        self.batch_si: ba_si,
                        self.go_index: self.go,
                        self.eos_index: self.end,
                        self.targets: batched_test_data
                    })
                coefs = np.array([w[j][batched_test_data[0][j]]
                                  for j in range(len(batched_test_data))])
                print(coefs)
                coefs = coefs / coefs.max()
                print(coefs)
                print(coefs.shape)
                intsent = np.argmax(test, axis=2)
                tester = getsentencce(intsent[0], int2word)
                print(tester)
                self.save_epoch += 1
                print("Loss of test_data: %f" % loss_eval)
        print("training finished")

def train_model(self):
    # self.logger.info("epoch %d:\t%s" % (0, self.evaluate()))
    for epoch in range(1, self.num_epochs + 1):
        if self.is_pairwise is True:
            user_input, user_input_neg, num_idx_pos, num_idx_neg, item_input_pos, item_input_neg = \
                data_generator._get_pairwise_all_likefism_data(self.dataset)
            data_iter = DataIterator(user_input, user_input_neg, num_idx_pos, num_idx_neg,
                                     item_input_pos, item_input_neg,
                                     batch_size=self.batch_size, shuffle=True)
        else:
            user_input, num_idx, item_input, labels = \
                data_generator._get_pointwise_all_likefism_data(self.dataset, self.num_negatives, self.train_dict)
            data_iter = DataIterator(user_input, num_idx, item_input, labels,
                                     batch_size=1, shuffle=True)
        num_training_instances = len(user_input)
        total_loss = 0.0
        training_start_time = time()
        if self.is_pairwise is True:
            for bat_users_pos, bat_users_neg, bat_idx_pos, bat_idx_neg, bat_items_pos, bat_items_neg in data_iter:
                bat_users_pos = pad_sequences(bat_users_pos, value=self.num_items)
                bat_users_neg = pad_sequences(bat_users_neg, value=self.num_items)
                feed_dict = {
                    self.user_input: bat_users_pos,
                    self.user_input_neg: bat_users_neg,
                    self.num_idx: bat_idx_pos,
                    self.num_idx_neg: bat_idx_neg,
                    self.item_input: bat_items_pos,
                    self.item_input_neg: bat_items_neg
                }
                loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
                total_loss += loss
        else:
            """
            for index in range(len(batch_length) - 1):
                temp = pad_sequences(user_input[batch_length[index]:batch_length[index + 1]],
                                     value=self.num_items)
                feed_dict = {self.user_input: temp,
                             self.num_idx: num_idx[batch_length[index]:batch_length[index + 1]],
                             self.item_input: item_input[batch_length[index]:batch_length[index + 1]],
                             self.labels: labels[batch_length[index]:batch_length[index + 1]]}
                loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
                print(loss)
                total_loss += loss
            """
            for index, (user_input, num_idx, item_input, labels) in enumerate(data_iter):
                feed_dict = {
                    self.user_input: user_input,
                    self.num_idx: num_idx,
                    self.item_input: item_input,
                    self.labels: labels
                }
                loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
                if index % 10000 == 0:
                    print(index)
                total_loss += loss
        self.logger.info("[iter %d : loss : %f, time: %f]" %
                         (epoch, total_loss / num_training_instances, time() - training_start_time))
        if epoch % self.verbose == 0:
            self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate()))
            self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate_val()))

def train_model(self):
    self.logger.info(self.evaluator.metrics_info())
    self.evaluate()
    for epoch in range(1, self.num_epochs + 1):
        if self.is_pairwise is True:
            user_input_id, user_input, user_input_neg, num_idx_pos, \
                num_idx_neg, item_input_pos, item_input_neg, item_input_recent = \
                data_generator._get_pairwise_all_likefossil_data(self.dataset, self.high_order, self.train_dict)
            data_iter = DataIterator(user_input_id, user_input, user_input_neg, num_idx_pos,
                                     num_idx_neg, item_input_pos, item_input_neg, item_input_recent,
                                     batch_size=self.batch_size, shuffle=True)
        else:
            user_input_id, user_input, num_idx, item_input, item_input_recent, labels = \
                data_generator._get_pointwise_all_likefossil_data(self.dataset, self.high_order,
                                                                  self.num_negatives, self.train_dict)
            data_iter = DataIterator(user_input_id, user_input, num_idx, item_input,
                                     item_input_recent, labels,
                                     batch_size=self.batch_size, shuffle=True)
        num_training_instances = len(user_input)
        total_loss = 0.0
        training_start_time = time()
        if self.is_pairwise is True:
            for bat_user_input_id, bat_users_pos, bat_users_neg, bat_idx_pos, bat_idx_neg, \
                    bat_items_pos, bat_items_neg, bat_item_input_recent in data_iter:
                bat_users_pos = pad_sequences(bat_users_pos, value=self.num_items)
                bat_users_neg = pad_sequences(bat_users_neg, value=self.num_items)
                feed_dict = {
                    self.user_input_id: bat_user_input_id,
                    self.user_input: bat_users_pos,
                    self.user_input_neg: bat_users_neg,
                    self.num_idx: bat_idx_pos,
                    self.num_idx_neg: bat_idx_neg,
                    self.item_input: bat_items_pos,
                    self.item_input_neg: bat_items_neg,
                    self.item_input_recent: bat_item_input_recent
                }
                loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
                total_loss += loss
        else:
            for bat_user_input_id, bat_users, bat_idx, bat_items, bat_item_input_recent, bat_labels in data_iter:
                bat_users = pad_sequences(bat_users, value=self.num_items)
                feed_dict = {
                    self.user_input_id: bat_user_input_id,
                    self.user_input: bat_users,
                    self.num_idx: bat_idx,
                    self.item_input: bat_items,
                    self.item_input_recent: bat_item_input_recent,
                    self.labels: bat_labels
                }
                loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
                total_loss += loss
        self.logger.info("[iter %d : loss : %f, time: %f]" %
                         (epoch, total_loss / num_training_instances, time() - training_start_time))
        if epoch % self.verbose == 0:
            self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate()))

def evaluate(self):
    # self.cur_embedding_Q_, self.cur_embedding_Q, self.cur_bias, self.cur_W, self.cur_b, self.cur_h \
    #     = self.sess.run([self.embedding_Q_, self.embedding_Q, self.bias, self.W, self.b, self.h])
    item_batch_size = 1000
    self.ratings = np.empty((self.num_users, self.num_items))
    eval_items = np.arange(self.num_items)
    indices = []
    for i in range(self.num_users):
        indices.append(len(self.train_dict[i]))
    indices = np.argsort(np.array(indices))
    """
    u_ids = []
    for u in indices:
        u_ids.extend([u] * self.num_items)
    u_ids = np.array(u_ids)
    i_ids = np.tile(eval_items, self.num_users)
    user_inputs = []
    for u in indices:
        item_by_user = self.train_dict[u]
        for i in i_ids:
            user_inputs.append(item_by_user)
    """
    # Group users (sorted by history length) into batches whose padded
    # workload stays below the threshold.
    batch_length = [0]
    last_point = 0
    cnt = 0
    threshold = 80
    global_user_cnt = 0
    for _i_, u in enumerate(indices):
        if (cnt - last_point + 1) * len(self.train_dict[u]) > threshold:
            batch_length.append(cnt)
            last_point = cnt
        cnt += 1
    batch_length.append(self.num_users)
    for index in range(len(batch_length) - 1):
        if batch_length[index + 1] - batch_length[index] > 1:
            user_input = []
            item_idx = np.array([])
            for u in indices[batch_length[index]:batch_length[index + 1]]:
                cand_items_by_u = self.train_dict[u]
                num_idx = len(cand_items_by_u)
                item_idx = np.append(item_idx, np.full(self.num_items, num_idx, dtype=np.int32))
                user_input.extend([cand_items_by_u] * self.num_items)
            user_input = pad_sequences(user_input, value=self.num_items)
            feed_dict = {
                self.user_input: user_input,
                self.num_idx: item_idx,
                self.item_input: np.tile(eval_items, batch_length[index + 1] - batch_length[index])
            }
            temp_data = np.reshape(self.sess.run(self.output, feed_dict=feed_dict),
                                   (-1, self.num_items))
            for i, u in enumerate(indices[batch_length[index]:batch_length[index + 1]]):
                self.ratings[u] = temp_data[i]
        else:
            u = indices[batch_length[index]]
            ratings_row = []
            cand_items_by_u = self.train_dict[u]
            num_idx = len(cand_items_by_u)
            # item_batch_size = 2000000 // num_idx
            item_batch_size = 4000000 // num_idx
            item_batch = self.num_items // item_batch_size + 1
            for item in range(item_batch):
                start = item * item_batch_size
                end = min((item + 1) * item_batch_size, self.num_items)
                user_input = []
                item_idx = np.full(end - start, num_idx, dtype=np.int32)
                user_input.extend([cand_items_by_u] * (end - start))
                feed_dict = {
                    self.user_input: user_input,
                    self.num_idx: item_idx,
                    self.item_input: eval_items[start:end]
                }
                ratings_row.extend(self.sess.run(self.output, feed_dict=feed_dict))
            self.ratings[u] = np.array(ratings_row)
    return self.evaluator.evaluate(self)