def build_vocabs(self, dataset, logger=None, transformer=False):
    rel_vocab = self.vocabs.get('rel', None)
    if rel_vocab is None:
        rel_vocab = Vocab(unk_token=None, pad_token=self.config.get('pad_rel', None))
        self.vocabs.put(rel=rel_vocab)

    timer = CountdownTimer(len(dataset))
    if transformer:
        token_vocab = None
    else:
        self.vocabs.token = token_vocab = VocabCounter(unk_token=self.config.get('unk', UNK))
    # Iterating the dataset runs its transforms, which count tokens into the vocabs as a side effect.
    for i, sample in enumerate(dataset):
        timer.log('Building vocab [blink][yellow]...[/yellow][/blink]', ratio_percentage=True)
    min_freq = self.config.get('min_freq', None)
    if token_vocab and min_freq:
        token_vocab.trim(min_freq)
    rel_vocab.set_unk_as_safe_unk()  # Some relation in dev set is OOV
    self.vocabs.lock()
    self.vocabs.summary(logger=logger)
    if token_vocab:
        self.config.n_words = len(self.vocabs['token'])
    self.config.n_rels = len(self.vocabs['rel'])
    if token_vocab:
        self.config.pad_index = self.vocabs['token'].pad_idx
        self.config.unk_index = self.vocabs['token'].unk_idx
def build_vocabs(self, dataset, logger):
    self.vocabs.tag = Vocab(unk_token=None, pad_token=None)
    self.vocabs[self.config.token_key] = Vocab()
    # Iterating the dataset runs its transforms, which populate both vocabs as a side effect.
    for each in dataset:
        pass
    self.vocabs.lock()
    self.vocabs.summary(logger)
def build_vocabs(self, dataset, logger=None, transformer=None):
    self.vocabs['rel_2nd'] = rel_2nd = Vocab(pad_token=self.config.pad_rel, unk_token=self.config.pad_rel)
    if self.config.joint:
        self.vocabs['rel'] = rel_2nd
    super().build_vocabs(dataset, logger, transformer)
    self.config.n_rels_2nd = len(rel_2nd)
def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
    if isinstance(self.embed, Embedding):
        self.embed.transform(vocabs=vocabs)
    vocab_name = self.vocab_name
    if vocab_name not in vocabs:
        vocabs[vocab_name] = Vocab()
    return ToChar(self.field, vocab_name, max_word_length=self.max_word_length)
def transform(self, vocabs: VocabDict = None, **kwargs) -> Optional[Callable]:
    assert vocabs is not None
    if self.field not in vocabs:
        vocabs[self.field] = Vocab(pad_token=self.pad, unk_token=self.unk)
    return super().transform(**kwargs)
def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger):
    additional_tokens = set()
    self.collect_additional_tokens(additional_tokens, trn)
    additional_tokens = sorted(additional_tokens)
    self.build_tokenizer(additional_tokens)
    self.vocabs['additional_tokens'] = Vocab(idx_to_token=list(additional_tokens))
def build_vocabs(self, dataset, logger, vocabs, lock=True, label_vocab_name='label', **kwargs):
    vocabs[label_vocab_name] = label_vocab = Vocab(pad_token=None, unk_token=None)
    # Use null to indicate no relationship
    label_vocab.add('<null>')
    timer = CountdownTimer(len(dataset))
    for each in dataset:
        timer.log('Building NER vocab [blink][yellow]...[/yellow][/blink]')
    label_vocab.set_unk_as_safe_unk()
    if lock:
        vocabs.lock()
    vocabs.summary(logger)
def build_vocabs(self, dataset, logger=None, transformer=None):
    rel_vocab = self.vocabs.get('rel', None)
    if rel_vocab is None:
        rel_vocab = Vocab(unk_token=None, pad_token=self.config.get('pad_rel', None))
        self.vocabs.put(rel=rel_vocab)
    if self.config.get('feat', None) == 'pos' or self.config.get('use_pos', False):
        self.vocabs['pos'] = Vocab(unk_token=None, pad_token=None)

    timer = CountdownTimer(len(dataset))
    if transformer:
        token_vocab = None
    else:
        token_vocab = Vocab()
        self.vocabs.token = token_vocab
        unk = self.config.get('unk', None)
        if unk is not None:
            token_vocab.unk_token = unk
    if token_vocab and self.config.get('min_freq', None):
        counter = Counter()
        for sample in dataset:
            for form in sample['token']:
                counter[form] += 1
        reserved_token = [token_vocab.pad_token, token_vocab.unk_token]
        if ROOT in token_vocab:
            reserved_token.append(ROOT)
        freq_words = reserved_token + [token for token, freq in counter.items()
                                       if freq >= self.config.min_freq]
        token_vocab.token_to_idx.clear()
        for word in freq_words:
            token_vocab(word)
    else:
        for i, sample in enumerate(dataset):
            timer.log('vocab building [blink][yellow]...[/yellow][/blink]', ratio_percentage=True)
    rel_vocab.set_unk_as_safe_unk()  # Some relation in dev set is OOV
    self.vocabs.lock()
    self.vocabs.summary(logger=logger)
    if token_vocab:
        self.config.n_words = len(self.vocabs['token'])
    if 'pos' in self.vocabs:
        self.config.n_feats = len(self.vocabs['pos'])
        self.vocabs['pos'].set_unk_as_safe_unk()
    self.config.n_rels = len(self.vocabs['rel'])
    if token_vocab:
        self.config.pad_index = self.vocabs['token'].pad_idx
        self.config.unk_index = self.vocabs['token'].unk_idx
def build_vocabs(self, trn, logger, **kwargs):
    self.vocabs.tag = Vocab(pad_token=None, unk_token=None)
    timer = CountdownTimer(len(trn))
    max_seq_len = 0
    token_key = self.config.token_key
    for each in trn:
        max_seq_len = max(max_seq_len, len(each[token_key]))
        timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
    self.vocabs.tag.set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)
def build_vocabs(self, dataset: SentenceBoundaryDetectionDataset, logger, **kwargs):
    char_min_freq = self.config.char_min_freq
    if char_min_freq:
        has_cache = dataset.cache is not None
        char_counter = Counter()
        for each in dataset:
            for c in each['char']:
                char_counter[c] += 1
        self.vocabs.char = vocab = Vocab()
        for c, f in char_counter.items():
            if f >= char_min_freq:
                vocab.add(c)
        if has_cache:
            # Rebuild the cache so cached samples are indexed with the trimmed char vocab.
            dataset.purge_cache()
            for each in dataset:
                pass
    else:
        self.vocabs.char = Vocab()
        for each in dataset:
            pass
    self.config.eos_chars = dataset.eos_chars
    self.vocabs.lock()
    self.vocabs.summary(logger)
def build_vocabs(self, dataset, logger, **kwargs):
    self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
    # Use null to indicate no relationship
    self.vocabs.srl_label.add('<null>')
    timer = CountdownTimer(len(dataset))
    max_seq_len = 0
    for each in dataset:
        max_seq_len = max(max_seq_len, len(each['token_input_ids']))
        timer.log(f'Building vocabs (max sequence length {max_seq_len}) [blink][yellow]...[/yellow][/blink]')
    timer.stop()
    timer.erase()
    self.vocabs['srl_label'].set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)
def __init__(self, data: str, batch_size, seq_len, tokenizer='char', eos='\n', strip=True, vocab=None,
             cache=False, transform: Union[Callable, List] = None) -> None:
    self.cache = cache
    self.eos = eos
    self.strip = strip
    super().__init__(transform)
    if isinstance(tokenizer, str):
        available_tokenizers = {
            'char': ToChar('text', 'token'),
            'whitespace': WhitespaceTokenizer('text', 'token')
        }
        assert tokenizer in available_tokenizers, \
            f'{tokenizer} not supported, available options: {available_tokenizers.keys()}'
        self.append_transform(available_tokenizers[tokenizer])
    if vocab is None:
        vocab = Vocab()
        self.training = True
    else:
        self.training = vocab.mutable
    self.append_transform(AppendEOS('token', eos=eos))
    self.append_transform(FieldToIndex('token', vocab))
    self.batch_size = batch_size
    data = get_resource(data)
    self.data = data
    self.num_tokens = None
    self.load_file(data)
    self._fp = None
    if isinstance(seq_len, int):
        self.seq_len = lambda: seq_len
    else:
        self.seq_len = seq_len
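# A minimal usage sketch. The class name LanguageModelDataset and the corpus path are
# assumptions made for illustration; only the parameters of the __init__ above are used.
dataset = LanguageModelDataset('data/wiki.train.txt',  # hypothetical corpus, one sentence per line
                               batch_size=32,
                               seq_len=64,              # an int, or a callable returning an int
                               tokenizer='char',        # or 'whitespace'
                               vocab=None)              # None -> a fresh, mutable Vocab is built from the data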
def index_word2vec_with_vocab(filepath: str,
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              init='uniform',
                              normalize=None) -> torch.Tensor:
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    if unk and unk in pret_vocab:
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
    vocab.lock()
    ids = []
    unk_id_offset = 0
    for word, idx in vocab.token_to_idx.items():
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        if word_id is None:
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    embedding = pret_matrix.index_select(0, ids)
    if normalize == 'norm':
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
    elif normalize == 'std':
        embedding /= torch.std(embedding)
    return embedding
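# A minimal usage sketch of index_word2vec_with_vocab (the embedding file path is
# hypothetical; only the signature above is relied on): extend a small task vocab
# with the pretrained tokens and return an L2-normalized embedding matrix aligned
# with the vocab's indices.
vocab = Vocab()
for token in ['the', 'quick', 'brown', 'fox']:
    vocab.add(token)
embed_matrix = index_word2vec_with_vocab('data/glove.6B.100d.txt',  # hypothetical path
                                         vocab,
                                         extend_vocab=True,
                                         lowercase=True,
                                         normalize='norm')
assert embed_matrix.size(0) == len(vocab)  # one row per vocab entry, pretrained or not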
def __init__(self, vocab: Vocab = None) -> None:
    super().__init__()
    if vocab is None:
        vocab = Vocab()
    self.vocab = vocab
def __init__(self, *args, **kwargs) -> None:
    vocabs = dict(kwargs)
    for each in args:
        vocabs[each] = Vocab()
    super().__init__(vocabs)
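# A minimal usage sketch, assuming the __init__ above is the constructor of the
# VocabDict type used by the transform() methods earlier: positional names become
# fresh Vocabs, while keyword arguments let callers pass preconfigured ones.
vocabs = VocabDict('token', 'char', rel=Vocab(pad_token=None, unk_token=None))
vocabs['token'].add('hello')   # vocabs stay mutable until locked
vocabs.rel.add('nsubj')        # attribute access mirrors the dict keys, as used above
vocabs.lock()                  # freeze every vocab once building is done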
def transform(self, **kwargs) -> Callable:
    vocab = Vocab()
    vocab.load(os.path.join(get_resource(self.path), 'vocab.json'))
    return TransformList(ContextualStringEmbeddingTransform(self.field),
                         FieldToIndex(f'{self.field}_f_char', vocab),
                         FieldToIndex(f'{self.field}_b_char', vocab))
    self.input_dim = input_dim
    self.layers = nn.ModuleList([nn.Linear(input_dim, input_dim * 2) for _ in range(layers)])
    self.reset_parameters()

def reset_parameters(self):
    for layer in self.layers:
        nn.init.normal_(layer.weight, std=0.02)
        # Bias the gate (second half of the output) towards carrying the input through.
        nn.init.constant_(layer.bias[self.input_dim:], 1)
        nn.init.constant_(layer.bias[:self.input_dim], 0)

def forward(self, x):
    for layer in self.layers:
        new_x = layer(x)
        new_x, gate = new_x.chunk(2, dim=-1)
        new_x = F.relu(new_x)
        gate = torch.sigmoid(gate)
        # Highway-style gated residual: mix the input with the transformed features.
        x = gate * x + (1 - gate) * new_x
    return x


if __name__ == "__main__":
    from data import Vocab, CLS, DUM, END

    vocab = Vocab('../data/AMR/amr_1.0_reca/lem_vocab', 3, [CLS])
    embed = AMREmbedding(vocab, 300, pretrained_file='../data/glove.840B.300d.txt',
                         dump_file='../data/AMR/amr_1.0_reca/glove_lem_embed')

    vocab = Vocab('../data/AMR/amr_1.0_reca/concept_vocab', 3, [DUM, END])
    embed = AMREmbedding(vocab, 300, pretrained_file='../data/glove.840B.300d.txt', amr=True,
                         dump_file='../data/AMR/amr_1.0_reca/glove_concept_embed')
def AMREmbedding(vocab: Vocab, embedding_dim, pretrained_file=None, amr=False, dump_file=None):
    if pretrained_file is None:
        return Embedding(vocab.size, embedding_dim, vocab.padding_idx)

    tokens_to_keep = set()
    for idx in range(vocab.size):
        token = vocab.idx2token(idx)
        # TODO: Is there a better way to do this? Currently we have a very specific 'amr' param.
        if amr:
            token = re.sub(r'-\d\d$', '', token)
        tokens_to_keep.add(token)

    embeddings = {}
    if dump_file is not None:
        fo = open(dump_file, 'w', encoding='utf8')

    with open(pretrained_file, encoding='utf8') as embeddings_file:
        for line in embeddings_file.readlines():
            fields = line.rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                continue
            token = fields[0]
            if token in tokens_to_keep:
                if dump_file is not None:
                    fo.write(line)
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if dump_file is not None:
        fo.close()

    all_embeddings = np.asarray(list(embeddings.values()))
    print('pretrained', all_embeddings.shape)
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    # Standardize and rescale the pretrained vectors before copying them in.
    all_embeddings -= embeddings_mean
    all_embeddings /= embeddings_std
    all_embeddings *= 0.02
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    print(embeddings_mean, embeddings_std)

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    embedding_matrix = torch.FloatTensor(vocab.size, embedding_dim).normal_(embeddings_mean, embeddings_std)

    for i in range(vocab.size):
        token = vocab.idx2token(i)
        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
        elif amr:
            normalized_token = re.sub(r'-\d\d$', '', token)
            if normalized_token in embeddings:
                embedding_matrix[i] = torch.FloatTensor(embeddings[normalized_token])
    embedding_matrix[vocab.padding_idx].fill_(0.)
    return nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
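# A minimal sketch of the fallback path above (paths are hypothetical, reusing the same
# Vocab constructor as the __main__ block): without a pretrained_file, AMREmbedding
# returns a randomly initialized embedding table sized to the vocab, with the padding
# index passed through.
from data import Vocab, CLS

vocab = Vocab('../data/AMR/amr_1.0_reca/lem_vocab', 3, [CLS])
random_embed = AMREmbedding(vocab, 300)  # no pretrained_file -> random init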
def build_vocabs(self, trn, logger, **kwargs):
    self.vocabs.label = Vocab(pad_token=None, unk_token=None)
    for each in trn:
        pass
    self.vocabs.lock()
    self.vocabs.summary(logger)