def _preprocess_data(self, X, Y=None, idxs=None, train=False):
    """Preprocess the data.

    1. Convert the sparse feature matrix to a dense matrix for pytorch
       operations.
    2. Turn the sentence containing each mention into sequence data for
       the LSTM.
    3. Select the subset of the input given by ``idxs``, if provided.

    :param X: The input data of the model: a (candidates, features) pair.
    :param Y: The labels of the input data (optional).
    :param idxs: The selected indexes of the input data (optional).
    :param train: If True, extend the word dictionary with unseen words;
        otherwise look words up without extending it.
    :return: A list of (sequence, feature-row) pairs; when ``Y`` is given,
        a (data, labels) pair where labels are sub-selected by ``idxs``.
    """
    C, F = X

    # Convert sparse feature matrix to dense matrix.
    # TODO: the pytorch implementation takes a dense vector as input,
    # should optimize later
    if issparse(F):
        F = F.todense()

    # Create the word dictionary for the LSTM on first use.
    if not hasattr(self, "word_dict"):
        self.word_dict = SymbolTable()
        arity = len(C[0])
        # Register one pair of mention-boundary padding tokens per slot.
        for i in range(arity):
            for token in ("~~[[" + str(i), str(i) + "]]~~"):
                self.word_dict.get(token)

    # Lookup function depends only on `train`; hoist it out of the loops.
    # `get` extends the dictionary with unseen words, `lookup` does not.
    f = self.word_dict.get if train else self.word_dict.lookup

    # Make sequence input for LSTM from candidates.
    seq_data = []
    for candidate in C:
        cand_idx = []
        for i in range(len(candidate)):
            # Add mark for each mention in the original sentence.
            args = [(
                candidate[i].span.get_word_start(),
                candidate[i].span.get_word_end(),
                i,
            )]
            s = mark_sentence(mention_to_tokens(candidate[i]), args)
            cand_idx.append(list(map(f, s)))
        seq_data.append(cand_idx)

    # Pair each sequence with its feature row; sub-select when requested.
    if idxs is None:
        data = [(seq_data[i], F[i]) for i in range(len(seq_data))]
        return (data, Y) if Y is not None else data
    data = [(seq_data[i], F[i]) for i in idxs]
    return (data, Y[idxs]) if Y is not None else data
def _preprocess_data(self, X, Y=None, idxs=None, train=False):
    """Preprocess the data.

    1. Make sentence with mention into sequence data for LSTM.
    2. Select subset of the input if idxs exists.

    :param X: The input data of the model.
    :type X: pair with candidates and corresponding features
    :param Y: The labels of input data (optional).
    :type Y: list of floats if num_classes = 2
        otherwise num_classes-length numpy array
    :param idxs: The selected indexs of input data.
    :type idxs: list or numpy.array
    :param train: An indicator for word dictionary to extend new words.
    :type train: bool
    :return: Preprocessed data.
    :rtype: list of (candidate, features) pairs
    """
    C, F = X

    # Create the word dictionary for the LSTM on first use.
    if not hasattr(self, "word_dict"):
        self.word_dict = SymbolTable()
        arity = len(C[0])
        # Register one pair of mention-boundary padding tokens per slot.
        for i in range(arity):
            for token in ("~~[[" + str(i), str(i) + "]]~~"):
                self.word_dict.get(token)

    # Lookup function depends only on `train`; hoist it out of the loops.
    # `get` extends the dictionary with unseen words, `lookup` does not.
    f = self.word_dict.get if train else self.word_dict.lookup

    # Make sequence input for LSTM from candidates.
    seq_data = []
    for candidate in C:
        cand_idx = []
        for i in range(len(candidate)):
            # Add mark for each mention in the original sentence.
            args = [(
                candidate[i].span.get_word_start_index(),
                candidate[i].span.get_word_end_index(),
                i,
            )]
            s = mark_sentence(mention_to_tokens(candidate[i]), args)
            cand_idx.append(list(map(f, s)))
        seq_data.append(cand_idx)

    def _row(i):
        # One example: the token sequences plus the i-th feature row of
        # the CSR matrix as (column indices, values), without densifying.
        # Assumes F is in CSR format (has indptr/indices/data) — the
        # original code made the same assumption.
        lo, hi = F.indptr[i], F.indptr[i + 1]
        return (seq_data[i], F.indices[lo:hi], F.data[lo:hi])

    # Assemble the output; sub-select rows and labels when requested.
    if idxs is None:
        data = [_row(i) for i in range(len(C))]
        return (data, Y) if Y is not None else data
    data = [_row(i) for i in idxs]
    return (data, Y[idxs]) if Y is not None else data
def _preprocess_data(self, X, Y=None, idxs=None, train=False):
    """Preprocess the data.

    1. Convert sparse feature matrix to dense matrix for pytorch operation.
    2. Make sentence with mention into sequence data for LSTM.
    3. Select subset of the input if idxs exists.

    :param X: The input data of the model.
    :type X: pair with candidates and corresponding features
    :param Y: The labels of input data (optional).
    :type Y: list or numpy.array
    :param idxs: The selected indexs of input data.
    :type idxs: list or numpy.array
    :param train: An indicator for word dictionary to extend new words.
    :type train: bool
    :return: Preprocessed data.
    :rtype: list of (candidate, features) pairs
    """
    candidates, feats = X

    # Densify sparse features (pytorch consumes dense float32 arrays).
    if issparse(feats):
        feats = np.array(feats.todense(), dtype=np.float32)
    if Y is not None:
        Y = np.array(Y).astype(np.float32)

    # Lazily build the word dictionary, seeding it with the boundary
    # marker tokens for every mention slot of the candidate arity.
    if not hasattr(self, "word_dict"):
        self.word_dict = SymbolTable()
        for slot in range(len(candidates[0])):
            list(map(self.word_dict.get, ["~~[[" + str(slot), str(slot) + "]]~~"]))

    # During training unseen words extend the dictionary; otherwise they
    # are only looked up.
    to_index = self.word_dict.get if train else self.word_dict.lookup

    # Build the LSTM token-index sequences, one list per mention.
    seq_data = []
    for candidate in candidates:
        mention_seqs = []
        for slot, mention in enumerate(candidate):
            # Mark this mention's word span inside its sentence.
            marks = [(
                mention.context.get_word_start_index(),
                mention.context.get_word_end_index(),
                slot,
            )]
            tokens = mark_sentence(mention_to_tokens(mention), marks)
            mention_seqs.append([to_index(w) for w in tokens])
        seq_data.append(mention_seqs)

    # Pair sequences with feature rows, restricted to `idxs` if given.
    selected = range(len(seq_data)) if idxs is None else idxs
    data = [[seq_data[i], feats[i]] for i in selected]
    if Y is None:
        return data
    return (data, Y) if idxs is None else (data, Y[idxs])