Example #1
    def _build_session_graph(self, bat_items):
        A_in, A_out, alias_inputs = [], [], []
        all_mask = [[1] * len(items) for items in bat_items]
        bat_items = pad_sequences(bat_items, value=self.num_item)

        unique_nodes = [np.unique(items).tolist() for items in bat_items]
        max_n_node = np.max([len(nodes) for nodes in unique_nodes])
        for u_seq, u_node, mask in zip(bat_items, unique_nodes, all_mask):
            adj_mat = np.zeros((max_n_node, max_n_node))
            id_map = {node: idx for idx, node in enumerate(u_node)}
            if len(u_seq) > 1:
                alias_previous = [id_map[i] for i in u_seq[:len(mask) - 1]]
                alias_next = [id_map[i] for i in u_seq[1:len(mask)]]
                adj_mat[alias_previous, alias_next] = 1

            u_sum_in = np.sum(adj_mat, axis=0)
            u_sum_in[np.where(u_sum_in == 0)] = 1
            u_A_in = np.divide(adj_mat, u_sum_in)

            u_sum_out = np.sum(adj_mat, 1)
            u_sum_out[np.where(u_sum_out == 0)] = 1
            u_A_out = np.divide(adj_mat.transpose(), u_sum_out)

            A_in.append(u_A_in)
            A_out.append(u_A_out)
            alias_inputs.append([id_map[i] for i in u_seq])

        items = pad_sequences(unique_nodes, value=self.num_item)
        all_mask = pad_sequences(all_mask, value=0)
        return A_in, A_out, alias_inputs, items, all_mask
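Most of the examples on this page call a NumPy-style pad_sequences (NeuRec ships one with this shape). For reference, here is a minimal sketch of such a helper, inferred only from the call sites in these examples; the real implementations may differ in defaults and edge-case handling:

import numpy as np

def pad_sequences_sketch(sequences, value=0, max_len=None,
                         padding='post', truncating='post', dtype=np.int64):
    # Hypothetical reference implementation, inferred from usage on this page.
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)
    result = np.full((len(sequences), max_len), value, dtype=dtype)
    for i, seq in enumerate(sequences):
        seq = list(seq)
        # 'pre' truncating keeps the tail (most recent items), 'post' the head.
        seq = seq[-max_len:] if truncating == 'pre' else seq[:max_len]
        if not seq:
            continue
        if padding == 'pre':
            result[i, -len(seq):] = seq  # left-pad
        else:
            result[i, :len(seq)] = seq   # right-pad
    return result

# e.g. pad_sequences_sketch([[1, 2], [3]], value=9) -> rows [1, 2] and [3, 9]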
Example #2
	def create_batches(self):
		self.train_df = self.shuffle_data(self.train_df) # Randomise data
		#train set:
		self.train_x = np.array([d[0] for d in self.train_df])
		self.train_size = np.array([len(seq) for seq in self.train_x])
		self.train_y = np.array([d[-1] for d in self.train_df])
		self.train_left_x = np.array([d[1] for d in self.train_df])
		self.train_left_size = np.array([len(seq) for seq in self.train_left_x])
		self.train_right_x = np.array([d[2] for d in self.train_df])
		self.train_right_size = np.array([len(seq) for seq in self.train_right_x])
		self.train_target_x = np.array([d[3] for d in self.train_df])
		self.train_x = util.pad_sequences(self.train_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT') # Padding
		self.train_left_x = util.pad_sequences(self.train_left_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
		self.train_right_x = util.pad_sequences(self.train_right_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
		self.train_x = np.array(self.train_x)
		self.train_left_x = np.array(self.train_left_x)
		self.train_right_x = np.array(self.train_right_x)
		#dev set:
		self.dev_x = np.array([d[0] for d in self.dev_df])
		self.dev_size = np.array([len(seq) for seq in self.dev_x])
		self.dev_y = np.array([d[-1] for d in self.dev_df])
		self.dev_left_x = np.array([d[1] for d in self.dev_df])
		self.dev_left_size = np.array([len(seq) for seq in self.dev_left_x])
		self.dev_right_x = np.array([d[2] for d in self.dev_df])
		self.dev_right_size = np.array([len(seq) for seq in self.dev_right_x])
		self.dev_target_x = np.array([d[3] for d in self.dev_df])
		self.dev_x = util.pad_sequences(self.dev_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT') # Padding
		self.dev_left_x = util.pad_sequences(self.dev_left_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
		self.dev_right_x = util.pad_sequences(self.dev_right_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
		self.dev_x = np.array(self.dev_x)
		self.dev_left_x = np.array(self.dev_left_x)
		self.dev_right_x = np.array(self.dev_right_x)
		#test set:
		self.test_x = np.array([d[0] for d in self.test_df])
		self.test_size = np.array([len(seq) for seq in self.test_x])
		self.test_y = np.array([d[-1] for d in self.test_df])
		self.test_left_x = np.array([d[1] for d in self.test_df])
		self.test_left_size = np.array([len(seq) for seq in self.test_left_x])
		self.test_right_x = np.array([d[2] for d in self.test_df])
		self.test_right_size = np.array([len(seq) for seq in self.test_right_x])
		self.test_target_x = np.array([d[3] for d in self.test_df])
		self.test_x = util.pad_sequences(self.test_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT') # Padding
		self.test_left_x = util.pad_sequences(self.test_left_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
		self.test_right_x = util.pad_sequences(self.test_right_x, dynamic_padding=self.dynamic_padding, pad_location='RIGHT')
		self.test_x = np.array(self.test_x)
		self.test_left_x = np.array(self.test_left_x)
		self.test_right_x = np.array(self.test_right_x)

		# Vectorizing labels
		# self.train_y = pd.get_dummies(self.train_y).values.astype(np.int32)
		# self.dev_y = pd.get_dummies(self.dev_y).values.astype(np.int32)
		self.test_y = pd.get_dummies(self.test_y).values.astype(np.int32)

		# Creating training batches
		self.num_batches = len(self.train_x)//self.batch_size
		if self.num_batches == 0:
			raise ValueError("Not enough data for the batch size.")
		self.batch_df = np.array_split(self.train_df, self.num_batches) # Splitting train set into batches based on num_batches
		
		assert np.array([d[-1] for d in self.batch_df[-1]]).shape[1] == 3, "Watch out! All batches must contain 3 labels!"
Example #3
    def packed_rnn(self, x, rnn):
        """ Runs the provided rnn on the input x. Takes care of packing/unpacking.

            x: list of unpadded input sequences
            Returns a tensor of size: len(x) x hidden_dim
        """
        lengths = torch.tensor([len(n) for n in x], dtype=torch.long, device=device)
        # Sort this batch in descending order by seq length
        lengths, idx_sort = torch.sort(lengths, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        idx_sort = torch.autograd.Variable(idx_sort)
        idx_unsort = torch.autograd.Variable(idx_unsort)
        padded_x = pad_sequences(x)
        x_tt = torch.from_numpy(padded_x).type(torch.long).to(device)
        x_tt = x_tt.index_select(0, idx_sort)
        # Run the embedding layer
        embed = self.embedding(x_tt).permute(1,0,2) # Time x Batch x EncDim
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embed, lengths)
        # Run the RNN
        out, _ = rnn(packed)
        # Unpack
        out, _ = nn.utils.rnn.pad_packed_sequence(out)
        # Get the last step of each sequence
        idx = (lengths-1).view(-1,1).expand(len(lengths), out.size(2)).unsqueeze(0)
        out = out.gather(0, idx).squeeze(0)
        # Unsort
        out = out.index_select(0, idx_unsort)
        return out
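On PyTorch 1.1 and later, the manual sort/unsort bookkeeping above can be delegated to pack_padded_sequence via enforce_sorted=False. A condensed sketch under the same assumptions as Example #3 (torch, nn, device, a module-level pad_sequences, and self.embedding):

    def packed_rnn_simple(self, x, rnn):
        # pack_padded_sequence expects the lengths tensor on the CPU.
        lengths = torch.tensor([len(n) for n in x], dtype=torch.long)
        x_tt = torch.from_numpy(pad_sequences(x)).long().to(device)
        embed = self.embedding(x_tt).permute(1, 0, 2)  # Time x Batch x EncDim
        packed = nn.utils.rnn.pack_padded_sequence(embed, lengths,
                                                   enforce_sorted=False)
        out, _ = rnn(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(out)  # original order restored
        # Gather the last valid timestep of each sequence.
        idx = (lengths - 1).to(device).view(1, -1, 1).expand(1, out.size(1), out.size(2))
        return out.gather(0, idx).squeeze(0)  # len(x) x hidden_dim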
Example #4
 def _generate_sequences(self):
     self.user_test_seq = {}
     users_list, item_seq_list, item_pos_list = [], [], []
     seq_len = self.seq_L + self.seq_T
     uni_users = np.unique(list(self.user_pos_train.keys()))
     for user in uni_users:
         seq_items = self.user_pos_train[user]
         if len(seq_items) - seq_len >= 0:
             for i in range(len(seq_items), 0, -1):
                 if i - seq_len >= 0:
                     seq_i = seq_items[i - seq_len:i]
                     if user not in self.user_test_seq:
                         self.user_test_seq[user] = seq_i[-self.seq_L:]
                     users_list.append(user)
                     item_seq_list.append(seq_i[:self.seq_L])
                     item_pos_list.append(seq_i[-self.seq_T:])
                 else:
                     break
         else:
             seq_items = np.reshape(seq_items,
                                    newshape=[1, -1]).astype(np.int32)
             seq_items = pad_sequences(seq_items,
                                       value=self.items_num,
                                       max_len=seq_len,
                                       padding='pre',
                                       truncating='pre')
             seq_i = np.reshape(seq_items, newshape=[-1])
             if user not in self.user_test_seq:
                 self.user_test_seq[user] = seq_i[-self.seq_L:]
             users_list.append(user)
             item_seq_list.append(seq_i[:self.seq_L])
             item_pos_list.append(seq_i[-self.seq_T:])
     return users_list, item_seq_list, item_pos_list
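The loop above slides a window of seq_L input items plus seq_T target items backwards over each sufficiently long history. A toy trace of the windowing logic (hypothetical values, seq_L=3, seq_T=1):

seq_L, seq_T = 3, 1
seq_len = seq_L + seq_T
seq_items = [10, 11, 12, 13, 14]  # toy interaction history

windows = []
for i in range(len(seq_items), 0, -1):
    if i - seq_len < 0:
        break
    seq_i = seq_items[i - seq_len:i]
    windows.append((seq_i[:seq_L], seq_i[-seq_T:]))

print(windows)  # [([11, 12, 13], [14]), ([10, 11, 12], [13])]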
Example #5
    def _Q_oneshot(self, batch):
        new_observations_ids = []
        new_admissible_actions_ids = []
        observation_padding_size = 0
        idx_mapping = {}
        current_idx = 0
        for idx, sample in enumerate(batch):
            if sample.done:
                continue
            new_observations_ids.append(sample.new_observation_ids)
            new_admissible_actions_ids.append(
                sample.new_admissible_actions_ids)
            observation_padding_size = max(observation_padding_size,
                                           len(sample.new_observation_ids))
            idx_mapping[idx] = (current_idx, current_idx +
                                len(sample.new_admissible_actions_ids))
            current_idx += len(sample.new_admissible_actions_ids)

        new_observations_ids = pad_sequences(new_observations_ids,
                                             max_len=observation_padding_size)
        new_tiled_observations = []
        for idx, observation in enumerate(new_observations_ids):
            num_actions = len(new_admissible_actions_ids[idx])
            new_tiled_observations.append(
                np.tile(observation, (num_actions, 1)))
        q_values, _ = self.model.predict(
            np.concatenate(new_tiled_observations),
            np.concatenate(new_admissible_actions_ids))
        return q_values, idx_mapping
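np.tile is what pairs every padded observation with each of its admissible actions, so one model.predict call can score all state-action pairs at once. A toy illustration:

import numpy as np

observation = np.array([5, 6, 7])  # one padded observation
num_actions = 3                    # admissible actions for this sample
print(np.tile(observation, (num_actions, 1)))
# [[5 6 7]
#  [5 6 7]
#  [5 6 7]]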
Example #6
    def train_model(self):
        self.logger.info(self.evaluator.metrics_info())
        for epoch in range(1, self.num_epochs + 1):
            user_input, num_idx, item_input, labels = \
                data_generator._get_pointwise_all_likefism_data(self.dataset, self.num_negatives, self.train_dict)
            data_iter = DataIterator(user_input,
                                     num_idx,
                                     item_input,
                                     labels,
                                     batch_size=self.batch_size,
                                     shuffle=True)

            num_training_instances = len(user_input)
            total_loss = 0.0
            training_start_time = time()
            for bat_users, bat_idx, bat_items, bat_labels in data_iter:
                bat_users = pad_sequences(bat_users, value=self.num_items)
                feed_dict = {
                    self.user_input: bat_users,
                    self.num_idx: bat_idx,
                    self.item_input: bat_items,
                    self.labels: bat_labels,
                    self.is_train_phase: True
                }
                loss, _ = self.sess.run((self.loss, self.optimizer),
                                        feed_dict=feed_dict)
                total_loss += loss
            self.logger.info("[iter %d : loss : %f, time: %f]" %
                             (epoch, total_loss / num_training_instances,
                              time() - training_start_time))
            if epoch % self.verbose == 0:
                self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate()))
Example #7
    def evaluate(self, model, test_users=None):
        """Evaluate `model`.

        Args:
            model: The model to be evaluated. It must provide a method
                `predict_for_eval(self, users)`, where the argument
                `users` is a list of users and the return value is a 2-D
                array of the `users`' rating/ranking scores on all items.

        Returns:
            str: A single-line string consisting of all results, such as
                `"0.18663847    0.11239596    0.35824192    0.21479650"`.
        """
        # B: batch size
        # N: the number of items
        test_users = test_users if test_users is not None else list(self.user_pos_test.keys())
        if not isinstance(test_users, (list, tuple, set, np.ndarray)):
            raise TypeError("'test_users' must be a list, tuple, set or numpy array!")

        test_users = DataIterator(test_users, batch_size=self.batch_size,
                                  shuffle=False, drop_last=False)
        batch_result = []
        for batch_users in test_users:
            if self.user_neg_test is not None:
                candidate_items = [list(self.user_pos_test[u]) + self.user_neg_test[u] for u in batch_users]
                test_items = [set(range(len(self.user_pos_test[u]))) for u in batch_users]

                ranking_score = model.predict(batch_users, candidate_items)  # (B,N)
                ranking_score = pad_sequences(ranking_score, value=-np.inf, dtype=np.float32)

                ranking_score = np.array(ranking_score)
            else:
                test_items = [self.user_pos_test[u] for u in batch_users]
                ranking_score = model.predict(batch_users, None)  # (B,N)
                ranking_score = np.array(ranking_score)

                # set the ranking scores of training items to -inf,
                # then the training items will be sorted at the end of the ranking list.
                for idx, user in enumerate(batch_users):
                    if user in self.user_pos_train and len(self.user_pos_train[user]) > 0:
                        train_items = self.user_pos_train[user]
                        ranking_score[idx][train_items] = -np.inf

            result = self.eval_score_matrix(ranking_score, test_items, self.metrics,
                                            top_k=self.max_top, thread_num=self.num_thread)  # (B,k*metric_num)
            batch_result.append(result)

        # concatenate the batch results to a matrix
        all_user_result = np.concatenate(batch_result, axis=0)  # (num_users, metrics_num*max_top)
        final_result = np.mean(all_user_result, axis=0)  # (1, metrics_num*max_top)

        final_result = np.reshape(final_result, newshape=[self.metrics_num, self.max_top])  # (metrics_num, max_top)
        final_result = final_result[:, self.top_show - 1]
        final_result = np.reshape(final_result, newshape=[-1])
        buf = '\t'.join([("%.8f" % x).ljust(12) for x in final_result])
        return buf
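Padding the ragged score rows with value=-np.inf (the user_neg_test branch above) guarantees that padded slots sort to the very end of every ranking, so they can never displace a real candidate. A toy illustration, reusing the hypothetical pad_sequences_sketch from Example #1:

import numpy as np

scores = [[0.9, 0.1, 0.5], [0.3, 0.7]]  # ragged per-user candidate scores
padded = pad_sequences_sketch(scores, value=-np.inf, dtype=np.float32)
print(np.argsort(-padded, axis=1))
# [[0 2 1]
#  [1 0 2]]  <- the padded slot (index 2) always ranks last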
Example #8
 def train(self):
     batch = self.replay_buffer.sample(self.config['training_batch_size'])
     if not batch:
         return
     if self.config['batch_oneshot']:
         observations_ids, rewards, actions_ids = self._preprocess_batch_oneshot(
             batch)
     else:
         observations_ids, rewards, actions_ids = self._preprocess_batch(
             batch)
     self.model.train(
         np.stack(
             pad_sequences(observations_ids,
                           max_len=self.get_observation_padding_size())),
         np.stack(rewards),
         np.stack(
             pad_sequences(actions_ids,
                           max_len=self.get_actions_padding_size())))
     return
Example #9
def pandasToTensor(data, globalVocab):

    data = shuffle(data)

    # # Preprocessing data
    # # retain only text that contain less that 70 tokens to avoid too much padding
    data["token_size"] = data["text"].apply(lambda x: len(x.split(' ')))
    data = data.loc[data['token_size'] < 70].copy()

    # # sampling
    # data = data.sample(n=50000);

    # # construct vocab and indexing
    # inputs = construct.ConstructVocab(data["text"].values.tolist())

    # print(globalVocab.vocab[0:10])

    input_tensor = [[globalVocab.word2idx[s] for s in es.split(' ')]
                    for es in data["text"].values.tolist()]

    # examples of what is in the input tensors
    # print(input_tensor[0:2])

    # calculate the max_length of input tensor
    max_length_inp = util.max_length(input_tensor)
    # print(max_length_inp)

    # inplace padding
    input_tensor = [
        util.pad_sequences(x, max_length_inp) for x in input_tensor
    ]
    # print(input_tensor[0:2])

    ###Binarization
    emotions = list(emotion_dict.values())
    num_emotions = len(emotion_dict)
    # print(emotions)
    # binarizer
    mlb = preprocessing.MultiLabelBinarizer(classes=emotions)
    data_labels = [emos for emos in data[['emotions']].values]
    # print(data_labels)
    bin_emotions = mlb.fit_transform(data_labels)
    target_tensor = np.array(bin_emotions.tolist())

    # print(target_tensor[0:2])
    # print(data[0:2])

    get_emotion = lambda t: np.argmax(t)

    get_emotion(target_tensor[0])
    emotion_dict[get_emotion(target_tensor[0])]

    return input_tensor, target_tensor
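Note that util.pad_sequences is applied here to one sequence at a time with an explicit target length, unlike the batch-level helpers in the other examples. A minimal sketch of such a single-sequence variant (hypothetical, inferred from the call site):

def pad_one(seq, max_length, value=0):
    # Right-pad a single token-id list to max_length.
    return seq + [value] * (max_length - len(seq))

print(pad_one([4, 8, 2], 5))  # [4, 8, 2, 0, 0]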
Example #10
def predict_script(model, users, items=None):
    users = DataIterator(users, batch_size=512, shuffle=False, drop_last=False)
    all_ratings = []
    for bat_user in users:
        bat_seq = [model.user_pos_train[u] for u in bat_user]
        bat_seq = pad_sequences(bat_seq,
                                value=model.items_num,
                                max_len=model.max_len,
                                padding='pre',
                                truncating='pre')
        bat_pos = [model.user_pos_train[u][1:] for u in bat_user]
        n_neg_items = [len(pos) for pos in bat_pos]
        exclusion = [model.user_pos_train[u] for u in bat_user]
        bat_neg = batch_randint_choice(model.items_num,
                                       n_neg_items,
                                       replace=True,
                                       exclusion=exclusion)

        bat_pos = pad_sequences(bat_pos,
                                value=model.items_num,
                                max_len=model.max_len,
                                padding='pre',
                                truncating='pre')
        bat_neg = pad_sequences(bat_neg,
                                value=model.items_num,
                                max_len=model.max_len,
                                padding='pre',
                                truncating='pre')

        _, _x, bat_ratings = model(bat_seq, bat_pos, bat_neg)
        all_ratings.extend(bat_ratings)
    all_ratings = [t.detach().cpu().numpy() for t in all_ratings]
    # all_ratings = np.array(all_ratings, dtype=np.float32)
    if items is not None:
        all_ratings = [
            all_ratings[idx][item] for idx, item in enumerate(items)
        ]
    return all_ratings
Example #11
    def _build_admissible_actions_ids(self, info, shuffle):
        admissible_actions_ids = [
            memoized_string_to_ids(admissible_action,
                                   self.word_ids,
                                   tokenizer=self.nlp)
            for admissible_action in info['admissible_commands']
        ]

        result = np.array(
            pad_sequences(admissible_actions_ids,
                          max_len=self.get_actions_padding_size()))
        if shuffle:
            np.random.shuffle(result)
        return result
Example #12
    def get_train_data(self):
        item_seq_list, item_pos_list, item_neg_list = [], [], []
        all_users = DataIterator(list(self.user_pos_train.keys()),
                                 batch_size=1024,
                                 shuffle=False)
        for bat_users in all_users:
            bat_seq = [self.user_pos_train[u][:-1] for u in bat_users]
            bat_pos = [self.user_pos_train[u][1:] for u in bat_users]
            n_neg_items = [len(pos) for pos in bat_pos]
            exclusion = [self.user_pos_train[u] for u in bat_users]
            bat_neg = batch_randint_choice(self.items_num,
                                           n_neg_items,
                                           replace=True,
                                           exclusion=exclusion)

            # padding
            bat_seq = pad_sequences(bat_seq,
                                    value=self.items_num,
                                    max_len=self.max_len,
                                    padding='pre',
                                    truncating='pre')
            bat_pos = pad_sequences(bat_pos,
                                    value=self.items_num,
                                    max_len=self.max_len,
                                    padding='pre',
                                    truncating='pre')
            bat_neg = pad_sequences(bat_neg,
                                    value=self.items_num,
                                    max_len=self.max_len,
                                    padding='pre',
                                    truncating='pre')

            item_seq_list.extend(bat_seq)
            item_pos_list.extend(bat_pos)
            item_neg_list.extend(bat_neg)

        return item_seq_list, item_pos_list, item_neg_list  # , user_list
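padding='pre' with truncating='pre' is the usual choice for sequential recommenders: truncation drops the oldest interactions and padding aligns the most recent ones at the right edge of every row. A toy illustration with the hypothetical pad_sequences_sketch from Example #1 (value 0 stands in for self.items_num):

recent = pad_sequences_sketch([[1, 2, 3, 4, 5], [6, 7]],
                              value=0, max_len=4,
                              padding='pre', truncating='pre')
print(recent)
# [[2 3 4 5]
#  [0 0 6 7]]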
Example #13
def pandasToTensor(data):

    data["token_size"] = data["text"].apply(lambda x: len(x.split(' ')))
    data = data.loc[data['token_size'] < 70].copy()

    # load globalVocab word2idx

    if os.path.exists(vocabPath):
        globalVocab.loadFile(vocabPath)
    else:
        print("Vocabulary doesn't exist")

    input_tensor = [[globalVocab.word2idx[s] for s in es.split(' ')]
                    for es in data["text"].values.tolist()]

    # examples of what is in the input tensors
    # print(input_tensor[0:2])

    # calculate the max_length of input tensor
    max_length_inp = util.max_length(input_tensor)
    # print(max_length_inp)

    # inplace padding
    input_tensor = [
        util.pad_sequences(x, max_length_inp) for x in input_tensor
    ]
    # print(input_tensor[0:2])

    ###Binarization
    emotions = list(emotion_dict.values())
    num_emotions = len(emotion_dict)
    # print(emotions)
    # binarizer
    mlb = preprocessing.MultiLabelBinarizer(classes=emotions)
    data_labels = [emos for emos in data[['emotion']].values]
    # print(data_labels)
    bin_emotions = mlb.fit_transform(data_labels)
    target_tensor = np.array(bin_emotions.tolist())

    # print(target_tensor[0:2])
    # print(data[0:2])

    get_emotion = lambda t: np.argmax(t)

    get_emotion(target_tensor[0])
    emotion_dict[get_emotion(target_tensor[0])]

    return input_tensor, target_tensor
Example #14
    def evaluate(self, model):
        # B: batch size
        # N: the number of items
        test_users = DataIterator(list(self.user_pos_test.keys()), batch_size=self.batch_size, shuffle=False, drop_last=False)
        batch_result = []
        for batch_users in test_users:
            if self.user_neg_test is not None:
                candidate_items = []
                test_items = []
                for user in batch_users:
                    pos = self.user_pos_test[user]
                    neg = self.user_neg_test[user]
                    candidate_items.append(pos+neg)
                    test_items.append(list(range(len(pos))))
                ranking_score = model.predict(batch_users, candidate_items)  # (B,N)
                ranking_score = pad_sequences(ranking_score, value=-np.inf)

                ranking_score = np.array(ranking_score)
            else:
                test_items = []
                for user in batch_users:
                    test_items.append(self.user_pos_test[user])
                ranking_score = model.predict(batch_users, None)  # (B,N)
                ranking_score = np.array(ranking_score)

                # set the ranking scores of training items to -inf,
                # then the training items will be sorted at the end of the ranking list.
                for idx, user in enumerate(batch_users):
                    train_items = self.user_pos_train[user]
                    ranking_score[idx][train_items] = -np.inf

            result = eval_score_matrix_foldout(ranking_score, test_items, top_k=self.max_top, thread_num=None)  # (B,k*metric_num)
            batch_result.append(result)

        # concatenate the batch results to a matrix
        all_user_result = np.concatenate(batch_result, axis=0)
        final_result = np.mean(all_user_result, axis=0)  # mean

        final_result = np.reshape(final_result, newshape=[self.metrics_num, self.max_top])
        final_result = final_result[:, self.top_show-1]
        final_result = np.reshape(final_result, newshape=[-1])
        buf = '\t'.join([("%.8f" % x).ljust(12) for x in final_result])
        return buf
Example #15
 def predict(self, users, items=None):
     users = DataIterator(users,
                          batch_size=512,
                          shuffle=False,
                          drop_last=False)
     all_ratings = []
     for bat_user in users:
         bat_seq = [self.user_pos_train[u] for u in bat_user]
         bat_seq = pad_sequences(bat_seq,
                                 value=self.items_num,
                                 max_len=self.max_len,
                                 padding='pre',
                                 truncating='pre')
         feed = {self.item_seq_ph: bat_seq, self.is_training: False}
         bat_ratings = self.sess.run(self.all_logits, feed_dict=feed)
         all_ratings.extend(bat_ratings)
     all_ratings = np.array(all_ratings, dtype=np.float32)
     if items is not None:
         all_ratings = [
             all_ratings[idx][item] for idx, item in enumerate(items)
         ]
     return all_ratings
Example #16
    def train_model(self):
        self.logger.info(self.evaluator.metrics_info())
        for epoch in range(1, self.num_epochs + 1):
            if self.is_pairwise is True:
                user_input, user_input_neg, num_idx_pos, num_idx_neg, item_input_pos, item_input_neg = \
                    data_generator._get_pairwise_all_likefism_data(self.dataset)
                data_iter = DataIterator(user_input,
                                         user_input_neg,
                                         num_idx_pos,
                                         num_idx_neg,
                                         item_input_pos,
                                         item_input_neg,
                                         batch_size=self.batch_size,
                                         shuffle=True)
            else:
                user_input, num_idx, item_input, labels = \
                    data_generator._get_pointwise_all_likefism_data_debug(self.dataset, self.num_negatives, self.train_dict)
                data_iter = DataIterator(user_input,
                                         num_idx,
                                         item_input,
                                         labels,
                                         batch_size=self.batch_size,
                                         shuffle=True)

            total_loss = 0.0
            training_start_time = time()

            if self.is_pairwise is True:
                for bat_users_pos, bat_users_neg, bat_idx_pos, bat_idx_neg, bat_items_pos, bat_items_neg in data_iter:
                    bat_users_pos = pad_sequences(bat_users_pos,
                                                  value=self.num_items)
                    bat_users_neg = pad_sequences(bat_users_neg,
                                                  value=self.num_items)
                    feed_dict = {
                        self.user_input: bat_users_pos,
                        self.user_input_neg: bat_users_neg,
                        self.num_idx: bat_idx_pos,
                        self.num_idx_neg: bat_idx_neg,
                        self.item_input: bat_items_pos,
                        self.item_input_neg: bat_items_neg
                    }

                    loss, _ = self.sess.run((self.loss, self.optimizer),
                                            feed_dict=feed_dict)
                    total_loss += loss
            else:
                for index, (bat_users, bat_idx, bat_items,
                            bat_labels) in enumerate(data_iter):
                    bat_users = pad_sequences(bat_users, value=self.num_items)
                    feed_dict = {
                        self.user_input: bat_users,
                        self.num_idx: bat_idx,
                        self.item_input: bat_items,
                        self.labels: bat_labels
                    }
                    loss, _ = self.sess.run((self.loss, self.optimizer),
                                            feed_dict=feed_dict)
                    total_loss += loss

            self.logger.info("[iter %d : loss : %f, time: %f]" %
                             (epoch, total_loss / len(user_input),
                              time() - training_start_time))
            if epoch % self.verbose == 0:
                params = self.sess.run([self.c1, self.embedding_Q, self.bias])
                #with open("/gdata/yujr/pretrained/epoch=%d_gamma=%f_lambda=%f_fism.pkl" % (epoch, self.gamma_bilinear, self.lambda_bilinear), "wb") as fout:
                #    pickle.dump(params, fout)
                self.logger.info("epoch %d:\t%s" %
                                 (epoch, self.evaluate_val()))
                self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate()))
Example #17
	def __init__(self, batch_size, dynamic_padding=False, preprocessing=False, embedding=True, saved=False, max_length=None):
		train = ElectionData.read_data('../data/election-data/training/')
		test = ElectionData.read_data('../data/election-data/testing/')
		self.batch_size = batch_size
		self.dynamic_padding = dynamic_padding
		self.train_tweets, self.train_targets, self.train_y = zip(*train)
		self.test_tweets, self.test_targets, self.test_y = zip(*test)

		self.train_left_tweets = [ElectionData.split_tweet(self.train_tweets[i], self.train_targets[i])[0] for i in range(len(self.train_tweets))]
		self.train_right_tweets = [ElectionData.split_tweet(self.train_tweets[i], self.train_targets[i])[1] for i in range(len(self.train_tweets))]
		self.test_left_tweets = [ElectionData.split_tweet(self.test_tweets[i], self.test_targets[i])[0] for i in range(len(self.test_tweets))]
		self.test_right_tweets = [ElectionData.split_tweet(self.test_tweets[i], self.test_targets[i])[1] for i in range(len(self.test_tweets))]

		self.train_tweets = [ElectionData.replace_target(self.train_tweets[i], self.train_targets[i]) for i in range(len(self.train_tweets))]
		self.test_tweets = [ElectionData.replace_target(self.test_tweets[i], self.test_targets[i]) for i in range(len(self.test_tweets))]
		self.train_targets = [train_target.split('_') for train_target in self.train_targets]
		self.test_targets = [test_target.split('_') for test_target in self.test_targets]

		# Padding tweets (manually adding '<PAD> tokens')
		if not self.dynamic_padding:
			self.train_tweets = util.pad_sequences(self.train_tweets, pad_location='RIGHT')
			self.test_tweets = util.pad_sequences(self.test_tweets, pad_location='RIGHT')

		# Building vocabulary
		self.vocab, self.vocab_inv = util.build_vocabulary(self.train_tweets + self.test_tweets)

		if embedding:
			# Vectorizing tweets - Glove embedding
			start = time.clock()
			print(' - Loading embedding..')
			glove, self.glove_vec, self.glove_shape, glove_vocab = util.gensim_load_vec('../resources/wordemb/glove.twitter.word2vec.27B.100d.txt')
			glove_vocab = [token.encode('utf-8') for token in glove_vocab]
			self.glove_vocab_dict = {j:i for i, j in enumerate(glove_vocab)}
			self.glove_vec = np.append(self.glove_vec, [[0]*self.glove_shape[1]], axis=0)
			self.glove_shape = [self.glove_shape[0]+1, self.glove_shape[1]]
			print(' - DONE')
			print("time taken: %f mins"%((time.clock() - start)/60))

			if saved==False:
				start = time.clock()
				print(' - Matching words-indices')
				self.train_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.train_tweets])
				self.train_left_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.train_left_tweets])
				self.train_right_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.train_right_tweets])
				self.test_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.test_tweets])
				self.test_left_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.test_left_tweets])
				self.test_right_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.test_right_tweets])
				self.train_target_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in target] for target in self.train_targets])
				self.test_target_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in target] for target in self.test_targets])
				self.train_y = pd.get_dummies(self.train_y).values.astype(np.int32)

				self.train_df = [(self.train_x[i], self.train_left_x[i], self.train_right_x[i], self.train_target_x[i], self.train_y[i]) 
								for i in range(len(self.train_x))]
				self.test_df = [(self.test_x[i], self.test_left_x[i], self.test_right_x[i], self.test_target_x[i], self.test_y[i]) 
								for i in range(len(self.test_x))]

				train_y = np.array([d[-1] for d in self.train_df])
				self.train_df, self.dev_df = self.build_train_dev(train_y) # Dividing to train and dev set
				print(' - DONE')
				print("time taken: %f mins"%((time.clock() - start)/60))
				print(" - Saving data")
				np.save('../data/election-data/train_df.npy', self.train_df)
				np.save('../data/election-data/dev_df.npy', self.dev_df)
				np.save('../data/election-data/test_df.npy', self.test_df)
				print(' - DONE')
			else:
				print(" - Loading data")
				self.train_df = np.load('../data/election-data/train_df.npy')
				self.dev_df = np.load('../data/election-data/dev_df.npy')
				self.test_df = np.load('../data/election-data/test_df.npy')
				print(' - DONE')

		else:
			# Vectorizing tweets - one-hot-vector
			self.train_x = np.array([[self.vocab[token] for token in tweet] for tweet in self.train_tweets])
			self.test_x = np.array([[self.vocab[token] for token in tweet] for tweet in self.test_tweets])

		self.create_batches()
		self.reset_batch_pointer()
Example #18
	def pad_minibatches(self, x, pad_location):
		x = util.pad_sequences(x, dynamic_padding=self.dynamic_padding, pad_location=pad_location)
		return x
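Examples #2, #17 and #18 use a util.pad_sequences variant that works on token lists and takes dynamic_padding and pad_location arguments. A hedged sketch of what such a helper could look like (the argument names and the '<PAD>' token come from the examples; everything else is an assumption):

def pad_token_sequences(sequences, dynamic_padding=True, pad_location='RIGHT',
                        max_length=None, pad_token='<PAD>'):
    # dynamic_padding=True pads to the longest sequence in this batch.
    if dynamic_padding or max_length is None:
        max_length = max(len(seq) for seq in sequences)
    padded = []
    for seq in sequences:
        pad = [pad_token] * (max_length - len(seq))
        padded.append(seq + pad if pad_location == 'RIGHT' else pad + seq)
    return padded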
Example #19
# examples of what is in the vocab
print(inputs.vocab[0:10])

# vectorize to tensor
input_tensor = [[inputs.word2idx[s] for s in es.split(' ')]
                for es in data["text"].values.tolist()]

# examples of what is in the input tensors
print(input_tensor[0:2])

# calculate the max_length of input tensor
max_length_inp = util.max_length(input_tensor)
print(max_length_inp)

# inplace padding
input_tensor = [util.pad_sequences(x, max_length_inp) for x in input_tensor]
print(input_tensor[0:2])

###Binarization
emotions = list(set(data.emotions.unique()))
num_emotions = len(emotions)
# binarizer
mlb = preprocessing.MultiLabelBinarizer()
data_labels = [set(emos) & set(emotions) for emos in data[['emotions']].values]
bin_emotions = mlb.fit_transform(data_labels)
target_tensor = np.array(bin_emotions.tolist())

print(target_tensor[0:2])
print(data[0:2])

get_emotion = lambda t: np.argmax(t)
Example #20
    def train(self):
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.config)
        with self.sess:
            if self.load_model():
                print(' [*] Load SUCCESS!\n')
            else:
                print(' [!] Load Failed...\n')
                self.sess.run(tf.global_variables_initializer())
            train_writer = tf.summary.FileWriter("./logs", self.sess.graph)
            merged = tf.summary.merge_all()
            self.counter = 1
            word2int, int2word, vocab_size, self.training_data = get_hack_data()
            self.go = word2int["<GO>"]
            self.end = word2int["<EOS>"]
            self.pad = word2int["<PAD>"]
            #print(self.training_data.shape)
            k = (len(self.training_data) // self.batch_sizer)
            self.start_time = time.time()
            loss_g_val, loss_d_val = 0, 0
            self.training_data = self.training_data[0:(self.batch_sizer * k)]
            test_counter = 0
            print('Starting the Training....')
            print(self.end)

            for e in range(0, self.epoch):
                epoch_loss = 0.
                self.training_data = shuffle_data(self.training_data)
                mean_epoch_loss = []
                for i in range(0, k):
                    print(i)
                    batch = self.training_data[i * self.batch_sizer:(i + 1) *
                                               self.batch_sizer]
                    length = len(max(batch, key=len))
                    batched_data, l = pad_sequences(batch, word2int)
                    batched_data = np.asarray(batched_data, dtype="int32")
                    _, loss_val, loss_histo = self.sess.run(
                        [self.optim, self.loss, self.summary_loss],
                        feed_dict={
                            self.input: batched_data,
                            self.targets: batched_data,
                            self.max_seq_len: length,
                            self.seq_length: l,
                            self.batch_si: self.batch_sizer,
                            self.go_index: self.go
                        })
                    train_writer.add_summary(loss_histo, self.counter)
                    self.counter = self.counter + 1
                    epoch_loss += loss_val
                    mean_epoch_loss.append(loss_val)
                mean = np.mean(mean_epoch_loss)
                std = np.std(mean_epoch_loss)
                epoch_loss /= k
                print('Validation loss mean: ', mean)
                print('Validation loss std: ', std)
                print("Loss of Seq2Seq Model: %f" % epoch_loss)
                print("Epoch%d" % (e))

                if e % 1 == 0:
                    save_path = self.saver.save(
                        self.sess,
                        "C:/Users/Andreas/Desktop/seq2seq - continous/checkpoint/model.ckpt",
                        global_step=self.save_epoch)
                    print("model saved: %s" % save_path)

                    data = get_requests_from_file(
                        "C:/Users/Andreas/Desktop/seq2seq - continous/data/anomaly.txt"
                    )
                    random_number = np.random.randint(0, len(data))

                    data = generate_sentence_int([data[random_number]],
                                                 word2int)
                    batched_test_data, l = pad_sequences(data, word2int)
                    batched_test_data = np.asarray(batched_test_data,
                                                   dtype="int32")
                    ba_si = 1
                    size = l[0]
                    print(batched_test_data)
                    w, test, loss_eval = self.sess.run(
                        [self.probs, self.decoder_output, self.loss],
                        feed_dict={
                            self.input: batched_test_data,
                            self.max_seq_len: size,
                            self.seq_length: l,
                            self.batch_si: ba_si,
                            self.go_index: self.go,
                            self.eos_index: self.end,
                            self.targets: batched_test_data
                        })

                    coefs = np.array([
                        w[j][batched_test_data[0][j]]
                        for j in range(len(batched_test_data))
                    ])
                    print(coefs)
                    coefs = coefs / coefs.max()
                    print(coefs)
                    print(coefs.shape)
                    intsent = np.argmax(test, axis=2)
                    tester = getsentencce(intsent[0], int2word)
                    print(tester)
                    self.save_epoch += 1
                    print("Loss of test_data: %f" % loss_eval)

            print("training finished")
Example #21
    def train_model(self):
        #self.logger.info("epoch %d:\t%s" % (0, self.evaluate()))
        for epoch in range(1, self.num_epochs + 1):
            if self.is_pairwise is True:
                user_input, user_input_neg, num_idx_pos, num_idx_neg, item_input_pos, item_input_neg = \
                    data_generator._get_pairwise_all_likefism_data(self.dataset)
                data_iter = DataIterator(user_input,
                                         user_input_neg,
                                         num_idx_pos,
                                         num_idx_neg,
                                         item_input_pos,
                                         item_input_neg,
                                         batch_size=self.batch_size,
                                         shuffle=True)
            else:
                user_input, num_idx, item_input, labels = \
                    data_generator._get_pointwise_all_likefism_data(self.dataset, self.num_negatives, self.train_dict)
                data_iter = DataIterator(user_input,
                                         num_idx,
                                         item_input,
                                         labels,
                                         batch_size=1,
                                         shuffle=True)
            num_training_instances = len(user_input)
            total_loss = 0.0
            training_start_time = time()
            if self.is_pairwise is True:
                for bat_users_pos, bat_users_neg, bat_idx_pos, bat_idx_neg, bat_items_pos, bat_items_neg in data_iter:
                    bat_users_pos = pad_sequences(bat_users_pos,
                                                  value=self.num_items)
                    bat_users_neg = pad_sequences(bat_users_neg,
                                                  value=self.num_items)
                    feed_dict = {
                        self.user_input: bat_users_pos,
                        self.user_input_neg: bat_users_neg,
                        self.num_idx: bat_idx_pos,
                        self.num_idx_neg: bat_idx_neg,
                        self.item_input: bat_items_pos,
                        self.item_input_neg: bat_items_neg
                    }

                    loss, _ = self.sess.run((self.loss, self.optimizer),
                                            feed_dict=feed_dict)
                    total_loss += loss
            else:
                """
                for index in range(len(batch_length)-1):
                    temp = pad_sequences(user_input[batch_length[index]:batch_length[index+1]], value=self.num_items)
                    feed_dict = {self.user_input: temp,
                                 self.num_idx: num_idx[batch_length[index]:batch_length[index+1]],
                                 self.item_input: item_input[batch_length[index]:batch_length[index+1]],
                                 self.labels: labels[batch_length[index]:batch_length[index+1]]}
                    loss, _ = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
                    print(loss)
                    total_loss += loss
                """
                for index, (user_input, num_idx, item_input,
                            labels) in enumerate(data_iter):
                    feed_dict = {
                        self.user_input: user_input,
                        self.num_idx: num_idx,
                        self.item_input: item_input,
                        self.labels: labels
                    }
                    loss, _ = self.sess.run((self.loss, self.optimizer),
                                            feed_dict=feed_dict)
                    if index % 10000 == 0:
                        print(index)
                    total_loss += loss

            self.logger.info("[iter %d : loss : %f, time: %f]" %
                             (epoch, total_loss / num_training_instances,
                              time() - training_start_time))
            if epoch % self.verbose == 0:
                self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate()))
                self.logger.info("epoch %d:\t%s" %
                                 (epoch, self.evaluate_val()))
Example #22
    def train_model(self):
        self.logger.info(self.evaluator.metrics_info())
        self.evaluate()
        for epoch in range(1, self.num_epochs + 1):
            if self.is_pairwise is True:
                user_input_id, user_input, user_input_neg, num_idx_pos,\
                    num_idx_neg, item_input_pos, item_input_neg, item_input_recent = \
                    data_generator._get_pairwise_all_likefossil_data(self.dataset, self.high_order, self.train_dict)

                data_iter = DataIterator(user_input_id,
                                         user_input,
                                         user_input_neg,
                                         num_idx_pos,
                                         num_idx_neg,
                                         item_input_pos,
                                         item_input_neg,
                                         item_input_recent,
                                         batch_size=self.batch_size,
                                         shuffle=True)
            else:
                user_input_id, user_input, num_idx, item_input, item_input_recent, labels = \
                    data_generator._get_pointwise_all_likefossil_data(self.dataset, self.high_order,
                                                                      self.num_negatives, self.train_dict)

                data_iter = DataIterator(user_input_id,
                                         user_input,
                                         num_idx,
                                         item_input,
                                         item_input_recent,
                                         labels,
                                         batch_size=self.batch_size,
                                         shuffle=True)

            num_training_instances = len(user_input)
            total_loss = 0.0
            training_start_time = time()

            if self.is_pairwise is True:
                for bat_user_input_id, bat_users_pos, bat_users_neg, bat_idx_pos, bat_idx_neg, \
                        bat_items_pos, bat_items_neg, bat_item_input_recent in data_iter:
                    bat_users_pos = pad_sequences(bat_users_pos,
                                                  value=self.num_items)
                    bat_users_neg = pad_sequences(bat_users_neg,
                                                  value=self.num_items)
                    feed_dict = {
                        self.user_input_id: bat_user_input_id,
                        self.user_input: bat_users_pos,
                        self.user_input_neg: bat_users_neg,
                        self.num_idx: bat_idx_pos,
                        self.num_idx_neg: bat_idx_neg,
                        self.item_input: bat_items_pos,
                        self.item_input_neg: bat_items_neg,
                        self.item_input_recent: bat_item_input_recent
                    }

                    loss, _ = self.sess.run((self.loss, self.optimizer),
                                            feed_dict=feed_dict)
                    total_loss += loss
            else:
                for bat_user_input_id, bat_users, bat_idx, bat_items, bat_item_input_recent, bat_labels in data_iter:
                    bat_users = pad_sequences(bat_users, value=self.num_items)
                    feed_dict = {
                        self.user_input_id: bat_user_input_id,
                        self.user_input: bat_users,
                        self.num_idx: bat_idx,
                        self.item_input: bat_items,
                        self.item_input_recent: bat_item_input_recent,
                        self.labels: bat_labels
                    }

                    loss, _ = self.sess.run((self.loss, self.optimizer),
                                            feed_dict=feed_dict)
                    total_loss += loss

            self.logger.info("[iter %d : loss : %f, time: %f]" %
                             (epoch, total_loss / num_training_instances,
                              time() - training_start_time))

            if epoch % self.verbose == 0:
                self.logger.info("epoch %d:\t%s" % (epoch, self.evaluate()))
Example #23
 def evaluate(self):
     #self.cur_embedding_Q_, self.cur_embedding_Q, self.cur_bias, self.cur_W, self.cur_b, self.cur_h \
     #    = self.sess.run([self.embedding_Q_, self.embedding_Q, self.bias, self.W, self.b, self.h])
     item_batch_size = 1000
     self.ratings = np.empty((self.num_users, self.num_items))
     eval_items = np.arange(self.num_items)
     indices = []
     for i in range(self.num_users):
         indices.append(len(self.train_dict[i]))
     indices = np.argsort(np.array(indices))
     """
     u_ids = []
     for u in indices:
         u_ids.extend([u]*self.num_items)
     u_ids = np.array(u_ids)
     i_ids = np.tile(eval_items, self.num_users)
     user_inputs = []
     for u in indices:
         item_by_user = self.train_dict[u]
         for i in i_ids:
             user_inputs.append(item_by_user)
     """
     batch_length = [0]
     last_point = 0
     cnt = 0
     threshold = 80
     global_user_cnt = 0
     for _i_, u in enumerate(indices):
         if (cnt - last_point + 1) * len(self.train_dict[u]) > threshold:
             batch_length.append(cnt)
             last_point = cnt
         cnt += 1
     batch_length.append(self.num_users)
     for index in range(len(batch_length) - 1):
         if batch_length[index + 1] - batch_length[index] > 1:
             user_input = []
             item_idx = np.array([])
             for u in indices[batch_length[index]:batch_length[index + 1]]:
                 cand_items_by_u = self.train_dict[u]
                 num_idx = len(cand_items_by_u)
                 item_idx = np.append(
                     item_idx,
                     np.full(self.num_items, num_idx, dtype=np.int32))
                 user_input.extend([cand_items_by_u] * self.num_items)
             user_input = pad_sequences(user_input, value=self.num_items)
             feed_dict = {
                 self.user_input:
                 user_input,
                 self.num_idx:
                 item_idx,
                 self.item_input:
                 np.tile(eval_items,
                         batch_length[index + 1] - batch_length[index])
             }
             temp_data = np.reshape(
                 self.sess.run(self.output, feed_dict=feed_dict),
                 (-1, self.num_items))
             for i, u in enumerate(
                     indices[batch_length[index]:batch_length[index + 1]]):
                 self.ratings[u] = temp_data[i]
         else:
             u = indices[batch_length[index]]
             ratings_row = []
             cand_items_by_u = self.train_dict[u]
             num_idx = len(cand_items_by_u)
             #item_batch_size = 2000000 // num_idx
             item_batch_size = 4000000 // num_idx
             item_batch = self.num_items // item_batch_size + 1
             for item in range(item_batch):
                 start = item * item_batch_size
                 end = min((item + 1) * item_batch_size, self.num_items)
                 user_input = []
                 item_idx = np.full(end - start, num_idx, dtype=np.int32)
                 user_input.extend([cand_items_by_u] * (end - start))
                 feed_dict = {
                     self.user_input: user_input,
                     self.num_idx: item_idx,
                     self.item_input: eval_items[start:end]
                 }
                 ratings_row.extend(
                     self.sess.run(self.output, feed_dict=feed_dict))
             self.ratings[u] = np.array(ratings_row)
     return self.evaluator.evaluate(self)
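The batch_length bookkeeping above groups users, sorted by history length, so that each batch's padded workload (rows times the longest history in the batch) stays under a fixed budget before pad_sequences is applied. A compact sketch of the same idea with toy numbers:

import numpy as np

history_len = {0: 3, 1: 50, 2: 4, 3: 2}  # toy train_dict sizes
order = np.argsort([history_len[u] for u in range(4)]).tolist()
budget, batches, current = 80, [], []
for u in order:
    # adding u would cost (len(current) + 1) * history_len[u] padded cells,
    # since users are visited in order of increasing history length
    if current and (len(current) + 1) * history_len[u] > budget:
        batches.append(current)
        current = []
    current.append(u)
if current:
    batches.append(current)
print(batches)  # [[3, 0, 2], [1]]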