def encode_ner_y(y_ner_list_train, y_ner_list_test, CLASS_COUNT_DICT):
    y_ner_encoder = LabelEncoder(sample=CLASS_COUNT_DICT.keys())

    y_ner_encoded_train = [[y_ner_encoder.encode(label) for label in label_list]
                           for label_list in y_ner_list_train]
    y_ner_encoded_train = [torch.stack(tens) for tens in y_ner_encoded_train]
    y_ner_padded_train = torch.LongTensor(
        pad_sequence(y_ner_encoded_train, MAX_SENTENCE_LEN + 1))

    y_ner_encoded_test = [[y_ner_encoder.encode(label) for label in label_list]
                          for label_list in y_ner_list_test]
    y_ner_encoded_test = [torch.stack(tens) for tens in y_ner_encoded_test]
    y_ner_padded_test = torch.LongTensor(
        pad_sequence(y_ner_encoded_test, MAX_SENTENCE_LEN + 1))

    if y_ner_padded_train.shape[1] > y_ner_padded_test.shape[1]:
        y_ner_padded_test = torch.cat(
            (
                y_ner_padded_test,
                torch.zeros(
                    y_ner_padded_test.shape[0],
                    y_ner_padded_train.shape[1] - y_ner_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    return y_ner_padded_train, y_ner_padded_test
def test_label_encoder_no_reserved():
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    label_encoder = LabelEncoder(sample, reserved_labels=[], unknown_index=None)

    label_encoder.encode('people/deceased_person/place_of_death')

    # No ``unknown_index`` defined causes a ``TypeError`` if an unknown label is used.
    with pytest.raises(TypeError):
        label_encoder.encode('symbols/namesake/named_after')
def test_label_encoder_known(label_encoder):
    input_ = 'symbols/namesake/named_after'
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    sample.append(input_)
    label_encoder = LabelEncoder(sample)

    output = label_encoder.encode(input_)

    assert label_encoder.decode(output) == input_
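# A minimal sketch (not part of the test suite above) of the LabelEncoder
# behaviour these tests rely on: encode/decode round-trips, and the default
# reserved 'unknown' label that unseen inputs fall back to.
from torchnlp.encoders import LabelEncoder

sample = ['people/deceased_person/place_of_death', 'symbols/name_source/namesakes']
encoder = LabelEncoder(sample)  # reserves 'unknown' at index 0 by default

token = encoder.encode(sample[0])           # scalar LongTensor with the label id
assert encoder.decode(token) == sample[0]   # decoding recovers the original label

# With the default unknown_index=0, unseen labels map to 'unknown' instead of raising.
assert encoder.decode(encoder.encode('symbols/namesake/named_after')) == 'unknown'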
def encode_ner_y(y_ner_list_train, y_ner_list_test, class_count_dict, max_sent_len):
    """
    Encode and pad the NER label sequences.

    :param y_ner_list_train: list of label lists for the training sentences
    :param y_ner_list_test: list of label lists for the test sentences
    :param class_count_dict: mapping from label to its count; the keys serve as the encoder sample
    :param max_sent_len: maximum sentence length used for padding
    :return: the fitted label encoder and the padded train and test label tensors
    """
    y_ner_encoder = LabelEncoder(sample=class_count_dict.keys())

    y_ner_encoded_train = [
        [y_ner_encoder.encode(label) for label in label_list]
        for label_list in y_ner_list_train
    ]
    y_ner_encoded_train = [torch.stack(tens) for tens in y_ner_encoded_train]
    y_ner_padded_train = torch.LongTensor(
        pad_sequence(y_ner_encoded_train, max_sent_len + 1)
    )

    y_ner_encoded_test = [
        [y_ner_encoder.encode(label) for label in label_list]
        for label_list in y_ner_list_test
    ]
    y_ner_encoded_test = [torch.stack(tens) for tens in y_ner_encoded_test]
    y_ner_padded_test = torch.LongTensor(
        pad_sequence(y_ner_encoded_test, max_sent_len + 1)
    )

    if y_ner_padded_train.shape[1] > y_ner_padded_test.shape[1]:
        y_ner_padded_test = torch.cat(
            (
                y_ner_padded_test,
                torch.zeros(
                    y_ner_padded_test.shape[0],
                    y_ner_padded_train.shape[1] - y_ner_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    return y_ner_encoder, y_ner_padded_train, y_ner_padded_test
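# A hypothetical call sketch for encode_ner_y above. The project's pad_sequence
# helper is not part of this snippet, so a stand-in with assumed semantics
# (clip or right-pad each encoded sentence to a fixed length) is defined here
# purely for illustration; the imports are the ones the function itself relies on.
import torch
from torchnlp.encoders import LabelEncoder


def pad_sequence(encoded_sentences, length, padding_index=0):
    # Assumed behaviour of the missing helper: clip or right-pad each 1-D
    # tensor of label ids to `length`, returning nested lists for LongTensor.
    return [t.tolist()[:length] + [padding_index] * max(0, length - len(t))
            for t in encoded_sentences]


class_count_dict = {'O': 120, 'B-PER': 8, 'I-PER': 5}
encoder, y_train, y_test = encode_ner_y(
    [['O', 'B-PER', 'I-PER'], ['O']],  # toy train label sequences
    [['O', 'B-PER']],                  # toy test label sequences
    class_count_dict,
    max_sent_len=5)
assert y_train.shape == (2, 6) and y_test.shape == (1, 6)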
# Make Encoders
sentence_corpus = [row['premise'] for row in itertools.chain(train, dev, test)]
sentence_corpus += [
    row['hypothesis'] for row in itertools.chain(train, dev, test)
]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in itertools.chain(train, dev, test)]
label_encoder = LabelEncoder(label_corpus)

# Encode
for row in itertools.chain(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
    row['label'] = label_encoder.encode(row['label'])

config = args
config.n_embed = sentence_encoder.vocab_size
config.d_out = label_encoder.vocab_size
config.n_cells = config.n_layers

# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2

if args.resume_snapshot:
    model = torch.load(
        args.resume_snapshot,
        map_location=lambda storage, location: storage.cuda(args.gpu))
else:
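# A self-contained sketch (toy rows instead of the SNLI splits used above)
# showing what the "Make Encoders" and "Encode" steps produce for one example.
from torchnlp.encoders import LabelEncoder
from torchnlp.encoders.text import WhitespaceEncoder

rows = [
    {'premise': 'a man is sleeping', 'hypothesis': 'a man is awake', 'label': 'contradiction'},
    {'premise': 'a dog runs outside', 'hypothesis': 'an animal moves', 'label': 'entailment'},
]
sentence_encoder = WhitespaceEncoder(
    [r['premise'] for r in rows] + [r['hypothesis'] for r in rows])
label_encoder = LabelEncoder([r['label'] for r in rows])

premise_ids = sentence_encoder.encode(rows[0]['premise'])  # LongTensor of token ids
label_id = label_encoder.encode(rows[0]['label'])          # scalar LongTensor
print(premise_ids.tolist(), int(label_id),
      sentence_encoder.vocab_size, label_encoder.vocab_size)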
class ActionSequenceEncoder:
    def __init__(self, samples: Samples, token_threshold: int):
        reserved_labels: List[Union[Unknown, CloseVariadicFieldRule]] = [Unknown()]
        reserved_labels.append(CloseVariadicFieldRule())
        self._rule_encoder = LabelEncoder(samples.rules,
                                          reserved_labels=reserved_labels,
                                          unknown_index=0)
        self._node_type_encoder = LabelEncoder(samples.node_types)
        reserved_labels = [Unknown()]
        self._token_encoder = LabelEncoder(samples.tokens,
                                           min_occurrences=token_threshold,
                                           reserved_labels=reserved_labels,
                                           unknown_index=0)
        self.value_to_idx: Dict[str, List[int]] = {}
        for kind, value in self._token_encoder.vocab[len(reserved_labels):]:
            idx = self._token_encoder.encode((kind, value))
            if value not in self.value_to_idx:
                self.value_to_idx[value] = []
            self.value_to_idx[value].append(idx)

    def decode(self, tensor: torch.LongTensor, reference: List[Token]) \
            -> Optional[ActionSequence]:
        """
        Return the action sequence corresponding to the tensor

        Parameters
        ----------
        tensor: torch.LongTensor
            The encoded tensor with the shape of (len(action_sequence), 3).
            Each action will be encoded by the tuple of
            (ID of the applied rule, ID of the inserted token,
            the index of the word copied from the reference).
            The padding value should be -1.
        reference

        Returns
        -------
        Optional[ActionSequence]
            The action sequence corresponding to the tensor.
            None if the action sequence cannot be generated.
        """
        retval = ActionSequence()
        for i in range(tensor.shape[0]):
            if tensor[i, 0] > 0:
                # ApplyRule
                rule = self._rule_encoder.decode(tensor[i, 0])
                retval.eval(ApplyRule(rule))
            elif tensor[i, 1] > 0:
                # GenerateToken
                kind, value = self._token_encoder.decode(tensor[i, 1])
                retval.eval(GenerateToken(kind, value))
            elif tensor[i, 2] >= 0:
                # GenerateToken (Copy)
                index = int(tensor[i, 2].numpy())
                if index >= len(reference):
                    logger.debug("reference index is out-of-bounds")
                    return None
                token = reference[index]
                retval.eval(GenerateToken(token.kind, token.raw_value))
            else:
                logger.debug("invalid actions")
                return None

        return retval

    def encode_action(self, action_sequence: ActionSequence,
                      reference: List[Token]) \
            -> Optional[torch.Tensor]:
        """
        Return the tensor encoded the action sequence

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded
        reference

        Returns
        -------
        Optional[torch.Tensor]
            The encoded tensor. The shape of tensor is
            (len(action_sequence) + 1, 4). Each action will be encoded by
            the tuple of (ID of the node types, ID of the applied rule,
            ID of the inserted token, the index of the word copied from
            the reference). The padding value should be -1.
            None if the action sequence cannot be encoded.
""" reference_value = [token.raw_value for token in reference] action = \ torch.ones(len(action_sequence.action_sequence) + 1, 4).long() \ * -1 for i in range(len(action_sequence.action_sequence)): a = action_sequence.action_sequence[i] parent = action_sequence.parent(i) if parent is not None: parent_action = \ cast(ApplyRule, action_sequence.action_sequence[parent.action]) parent_rule = cast(ExpandTreeRule, parent_action.rule) action[i, 0] = self._node_type_encoder.encode( parent_rule.children[parent.field][1]) if isinstance(a, ApplyRule): rule = a.rule action[i, 1] = self._rule_encoder.encode(rule) else: encoded_token = \ int(self._token_encoder.encode((a.kind, a.value)).numpy()) if encoded_token != 0: action[i, 2] = encoded_token # Unknown token if a.value in reference_value: # TODO use kind in reference action[i, 3] = \ reference_value.index(cast(str, a.value)) if encoded_token == 0 and \ a.value not in reference_value: logger.debug("cannot encode token") return None head = action_sequence.head length = len(action_sequence.action_sequence) if head is not None: head_action = \ cast(ApplyRule, action_sequence.action_sequence[head.action]) head_rule = cast(ExpandTreeRule, head_action.rule) action[length, 0] = self._node_type_encoder.encode( head_rule.children[head.field][1]) return action def encode_raw_value(self, text: str) -> List[int]: if text in self.value_to_idx: return self.value_to_idx[text] else: return [self._token_encoder.encode(Unknown()).item()] def batch_encode_raw_value(self, texts: List[str]) -> List[List[int]]: return [self.encode_raw_value(text) for text in texts] def encode_parent(self, action_sequence) -> torch.Tensor: """ Return the tensor encoded the action sequence Parameters ---------- action_sequence: action_sequence The action_sequence containing action sequence to be encoded Returns ------- torch.Tensor The encoded tensor. The shape of `action` tensor is (len(action_sequence) + 1, 4). Each action will be encoded by the tuple of (ID of the parent node types, ID of the parent-action's rule, the index of the parent action, the index of the field). The padding value should be -1. """ parent_tensor = \ torch.ones(len(action_sequence.action_sequence) + 1, 4).long() \ * -1 for i in range(len(action_sequence.action_sequence)): parent = action_sequence.parent(i) if parent is not None: parent_action = \ cast(ApplyRule, action_sequence.action_sequence[parent.action]) parent_rule = cast(ExpandTreeRule, parent_action.rule) parent_tensor[i, 0] = \ self._node_type_encoder.encode(parent_rule.parent) parent_tensor[i, 1] = self._rule_encoder.encode(parent_rule) parent_tensor[i, 2] = parent.action parent_tensor[i, 3] = parent.field head = action_sequence.head length = len(action_sequence.action_sequence) if head is not None: head_action = \ cast(ApplyRule, action_sequence.action_sequence[head.action]) head_rule = cast(ExpandTreeRule, head_action.rule) parent_tensor[length, 0] = \ self._node_type_encoder.encode(head_rule.parent) parent_tensor[length, 1] = self._rule_encoder.encode(head_rule) parent_tensor[length, 2] = head.action parent_tensor[length, 3] = head.field return parent_tensor def encode_tree(self, action_sequence: ActionSequence) \ -> Union[torch.Tensor, torch.Tensor]: """ Return the tensor adjacency matrix of the action sequence Parameters ---------- action_sequence: action_sequence The action_sequence containing action sequence to be encoded Returns ------- depth: torch.Tensor The depth of each action. The shape is (len(action_sequence),). 
        adjacency_matrix: torch.Tensor
            The encoded tensor. The shape of tensor is
            (len(action_sequence), len(action_sequence)). If the i-th action
            is a parent of the j-th action, the (i, j) element will be 1;
            the element will be 0 otherwise.
        """
        L = len(action_sequence.action_sequence)
        depth = torch.zeros(L)
        m = torch.zeros(L, L)
        for i in range(L):
            p = action_sequence.parent(i)
            if p is not None:
                depth[i] = depth[p.action] + 1
                m[p.action, i] = 1
        return depth, m

    def encode_each_action(self, action_sequence: ActionSequence,
                           reference: List[Token], max_arity: int) \
            -> torch.Tensor:
        """
        Return the tensor encoding each action

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded
        reference
        max_arity: int

        Returns
        -------
        torch.Tensor
            The encoded tensor. The shape of tensor is
            (len(action_sequence), max_arity + 1, 3).
            [:, 0, 0] encodes the parent node type. [:, i, 0] encodes
            the node type of the (i - 1)-th child node. [:, i, 1] encodes
            the token of the (i - 1)-th child node. [:, i, 2] encodes the
            reference index of the (i - 1)-th child node.
            The padding value is -1.
        """
        L = len(action_sequence.action_sequence)
        reference_value = [token.raw_value for token in reference]

        retval = torch.ones(L, max_arity + 1, 3).long() * -1
        for i, action in enumerate(action_sequence.action_sequence):
            if isinstance(action, ApplyRule):
                if isinstance(action.rule, ExpandTreeRule):
                    # Encode parent
                    retval[i, 0, 0] = \
                        self._node_type_encoder.encode(action.rule.parent)
                    # Encode children
                    for j, (_, child) in enumerate(
                            action.rule.children[:max_arity]):
                        retval[i, j + 1, 0] = \
                            self._node_type_encoder.encode(child)
            else:
                gentoken: GenerateToken = action
                kind = gentoken.kind
                value = gentoken.value
                encoded_token = \
                    int(self._token_encoder.encode((kind, value)).numpy())

                if encoded_token != 0:
                    retval[i, 1, 1] = encoded_token

                if value in reference_value:
                    # TODO use kind in reference
                    retval[i, 1, 2] = \
                        reference_value.index(cast(str, value))

        return retval

    def encode_path(self, action_sequence: ActionSequence, max_depth: int) \
            -> torch.Tensor:
        """
        Return the tensor encoding each action

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded
        max_depth: int

        Returns
        -------
        torch.Tensor
            The encoded tensor. The shape of tensor is
            (len(action_sequence), max_depth).
            [i, :] encodes the path from the root node to the i-th node.
            Each node is represented by its rule id.
            The padding value is -1.
        """
        L = len(action_sequence.action_sequence)
        retval = torch.ones(L, max_depth).long() * -1
        for i in range(L):
            parent_opt = action_sequence.parent(i)
            if parent_opt is not None:
                p = action_sequence.action_sequence[parent_opt.action]
                if isinstance(p, ApplyRule):
                    retval[i, 0] = self._rule_encoder.encode(p.rule)
                retval[i, 1:] = retval[parent_opt.action, :max_depth - 1]
        return retval
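# Standalone illustration (not part of the class above) of the depth/adjacency
# encoding that encode_tree computes, using a plain list of parent indices in
# place of an ActionSequence. Names here are illustrative only.
import torch


def encode_tree_from_parents(parents):
    """parents[i] is the index of the i-th action's parent, or None for roots."""
    L = len(parents)
    depth = torch.zeros(L)
    matrix = torch.zeros(L, L)
    for i, p in enumerate(parents):
        if p is not None:
            depth[i] = depth[p] + 1   # a child sits one level below its parent
            matrix[p, i] = 1          # mark "p is the parent of i"
    return depth, matrix


depth, matrix = encode_tree_from_parents([None, 0, 0, 2])
# depth  -> tensor([0., 1., 1., 2.])
# matrix[0, 1] == matrix[0, 2] == matrix[2, 3] == 1, all other entries are 0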
class WikiDataset(Dataset):
    '''
    A custom dataset object that encodes a tokenized text and its labels
    according to the corresponding encoders
    '''

    def __init__(self, json, text_encoder=None, label_encoder=None,
                 vocab=None, mode='train'):
        '''
        Initialization

        Arguments:
            json: Json file containing the data.
                Structure of json file, e.g.:
                {'data': [{'id': filename,
                           'title': title of page,
                           'toc': [list of items in table of contents section of wiki page],
                           'intro': introduction of wiki page,
                           'label': 'positive'/'negative' flag}]}
                Labels are required only when mode = 'train'
            text_encoder: encoder object that encodes tokens to their unique integer ids
            label_encoder: encoder object that encodes labels to their unique integer ids
            vocab: external vocabulary used to initialize the text encoder.
                If vocab = None, it is generated from the tokens of the provided dataset
            mode: 'train' or 'inference'; when mode == 'inference', the dataset object skips the labels
        '''
        self.data = json
        assert 'data' in self.data

        # Define the mode in which the dataset object is to be used
        self.mode = mode

        # Define text encoder and vocabulary
        if text_encoder:
            self._text_encoder = text_encoder
            self._vocab = self._text_encoder.vocab
        elif vocab:
            self._vocab = vocab
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)
        else:
            self._vocab = self.create_vocab()
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)

        self._vocab_size = self._text_encoder.vocab_size

        # Define label encoder
        if self.mode == 'train':
            if label_encoder:
                self._label_encoder = label_encoder
            else:
                self._label_encoder = LabelEncoder(
                    [sample['label'] for sample in self.data['data']])
            self._label_size = self._label_encoder.vocab_size
        else:
            self._label_encoder = None
            self._label_size = None

    def __len__(self):
        '''
        Size of dataset
        '''
        return len(self.data['data'])

    def __getitem__(self, idx):
        '''
        Extract item corresponding to idx'th index in data
        '''
        item = self.data['data'][idx]
        intro_enc = self._text_encoder.encode(item['intro'])

        toc = item['toc']
        if toc == []:
            toc_enc = self._text_encoder.encode('.')
        else:
            toc = ' '.join(toc)
            toc_enc = self._text_encoder.encode(toc)

        title_enc = self._text_encoder.encode(item['title'])

        if self.mode == 'train':
            return title_enc, toc_enc, intro_enc, self._label_encoder.encode(
                item['label']).view(-1)
        else:
            return title_enc, toc_enc, intro_enc

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def label_size(self):
        return self._label_size

    @property
    def text_encoder(self):
        return self._text_encoder

    @property
    def label_encoder(self):
        return self._label_encoder

    @property
    def vocab(self):
        return self._vocab

    def create_vocab(self, remove_less_freq_words=True, threshold=1):
        '''
        Creates vocabulary from the dataset tokens

        Returns:
            List of unique tokens in dataset
        '''
        temp_vocab = []
        for sample in self.data['data']:
            temp_vocab.extend(sample['title'].split())
            temp_vocab.extend(' '.join(sample['toc']).split())
            temp_vocab.extend(sample['intro'].split())

        vocab = []
        if remove_less_freq_words:
            count_dict = collections.Counter(temp_vocab)
            for word in count_dict.keys():
                if count_dict[word] > threshold:
                    vocab.append(word)
        else:
            vocab = sorted(list(set(temp_vocab)))
        return vocab

    def split(self, x):
        '''
        Splits the text into tokens
        '''
        return x.split()

    def collate_fn(self, batch, padding=True):
        """
        Collate function needs to be passed to the pytorch dataloader

        Returns:
            (title, title_lengths): tuple containing padded sequence tensor for title and sequence
                lengths
            (toc, toc_lengths): tuple containing padded sequence tensor for table of contents and sequence lengths
            (intro, intro_lengths): tuple containing padded sequence tensor for introduction and sequence lengths
            labels: tensor containing labels for the batch
        """
        if self.mode == 'train':
            title, toc, intro, labels = zip(*batch)
            labels = torch.cat(labels)
        else:
            title, toc, intro = zip(*batch)

        if isinstance(intro, collections.abc.Sequence):
            if padding:
                title, title_lengths = stack_and_pad_tensors(title)
                toc, toc_lengths = stack_and_pad_tensors(toc)
                intro, intro_lengths = stack_and_pad_tensors(intro)

            if self.mode == 'train':
                return (title, title_lengths), (toc, toc_lengths), \
                       (intro, intro_lengths), labels
            else:
                return (title, title_lengths), (toc, toc_lengths), \
                       (intro, intro_lengths)
        else:
            return batch

    @classmethod
    def fromJsonFile(cls, json_file, text_encoder=None, label_encoder=None,
                     vocab=None, mode='train'):
        '''
        Read data from json file

        Arguments:
            json_file: string specifying location to json_file
        '''
        with open(json_file, 'r') as f:
            json_data = json.load(f)
        return cls(json_data, text_encoder, label_encoder, vocab, mode)
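# A hedged usage sketch for WikiDataset (toy in-memory data; the fields follow
# the structure described in the __init__ docstring and are illustrative only).
from torch.utils.data import DataLoader

toy_json = {'data': [
    {'id': 'page_1', 'title': 'Example Page', 'toc': ['History', 'Usage'],
     'intro': 'a short example introduction', 'label': 'positive'},
    {'id': 'page_2', 'title': 'Another Page', 'toc': [],
     'intro': 'another short example introduction', 'label': 'negative'},
]}

dataset = WikiDataset(toy_json, mode='train')
loader = DataLoader(dataset, batch_size=2, collate_fn=dataset.collate_fn)
(title, title_lens), (toc, toc_lens), (intro, intro_lens), labels = next(iter(loader))
print(intro.shape, labels)  # padded intro token ids and the encoded labels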