def test_crashes_with_empty_feature_value_and_no_default(self):
    tokenizer = SpacyTokenizer(parse=True)
    tokens = tokenizer.tokenize("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
    vocab.add_token_to_namespace("NONE", namespace="dep_labels")
    indexer = SingleIdTokenIndexer(namespace="dep_labels", feature_name="dep_")
    with pytest.raises(ValueError):
        indexer.tokens_to_indices([tokens[-1]], vocab)
def test_tokens_to_indices_with_non_default_feature_name(self):
    tokenizer = SpacyTokenizer(parse=True)
    tokens = tokenizer.tokenize("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
    none_index = vocab.add_token_to_namespace("NONE", namespace="dep_labels")
    indexer = SingleIdTokenIndexer(
        namespace="dep_labels", feature_name="dep_", default_value="NONE")
    # "is" is the root of the spaCy parse, so its dep_ feature maps to ROOT.
    assert indexer.tokens_to_indices([tokens[1]], vocab) == {
        "tokens": [root_index]
    }
    # The appended "</S>" token has no dep_ value, so the default is used.
    assert indexer.tokens_to_indices([tokens[-1]], vocab) == {
        "tokens": [none_index]
    }
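# A minimal end-to-end sketch (not part of the test suite) of the behaviour the
# two tests above exercise: feature_name selects which Token attribute is
# indexed, and default_value covers tokens that lack it. The sentence and the
# "NONE" entry below are illustrative assumptions.
from allennlp.data import Token, Vocabulary
from allennlp.data.tokenizers import SpacyTokenizer
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokenizer = SpacyTokenizer(parse=True)
tokens = tokenizer.tokenize("This is a sentence.") + [Token("</S>")]

vocab = Vocabulary()
for token in tokens:
    if token.dep_:
        vocab.add_token_to_namespace(token.dep_, namespace="dep_labels")
vocab.add_token_to_namespace("NONE", namespace="dep_labels")

indexer = SingleIdTokenIndexer(
    namespace="dep_labels", feature_name="dep_", default_value="NONE")
# Real tokens are indexed by their dependency label; "</S>" falls back to NONE.
print(indexer.tokens_to_indices(tokens, vocab))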
class FasttextTokenIndexer(TokenIndexer[int]):
    def __init__(self,
                 model_path,
                 namespace: str = 'tokens',
                 lowercase_tokens: bool = False,
                 model_params_path=None):
        self.model_path = model_path
        self.model_params_path = model_params_path or self.get_params_path(model_path)
        self.hash_params = {}
        self.vocab = {}
        self.num_vectors = 0
        self.single_id_indexer = SingleIdTokenIndexer(namespace, lowercase_tokens)
        # TODO: Add start and end tokens params
        if os.path.exists(self.model_params_path):
            # Assume weights will be loaded later
            self.load_saved_params(self.model_params_path)
        else:
            self.load_ft_model(model_path)

    @classmethod
    def get_params_path(cls, model_path):
        return model_path + '.params'

    def load_saved_params(self, model_param_path):
        with open(model_param_path, encoding="utf-8") as fd:
            ft_params = json.load(fd)
        self.hash_params = ft_params['hash_params']
        self.vocab = ft_params['vocab']

    def load_ft_model(self, model_path):
        # Extract the subword-hashing parameters and the word vocabulary from
        # the fastText model, and cache them as JSON for later runs.
        self.model_params_path = self.get_params_path(model_path)
        ft = load_fasttext_model(model_path)
        self.hash_params = {
            "minn": ft.min_n,
            "maxn": ft.max_n,
            "num_buckets": ft.bucket,
            "fb_compatible": ft.compatible_hash,
        }
        self.vocab = dict(
            (word, entry.index) for word, entry in ft.vocab.items())
        with open(self.model_params_path, 'w', encoding="utf-8") as out:
            json.dump(
                {
                    'dimensions': ft.vector_size,
                    'hash_params': self.hash_params,
                    'vocab': self.vocab,
                },
                out,
                ensure_ascii=False,
                indent=2)

    def words_to_indexes(self, words):
        words_ngram_ids = []
        word_lengths = []
        mask = []
        for word in words:
            ngram_ids = self.get_ngram_ids(word)
            words_ngram_ids += ngram_ids
            mask += [1] * len(ngram_ids)
            word_lengths.append(len(ngram_ids))
        return words_ngram_ids, word_lengths, mask

    def get_ngram_ids(self, word):
        # In-vocabulary words map to a single id; out-of-vocabulary words are
        # decomposed into character n-gram hashes, offset past the vocabulary.
        if word in self.vocab:
            return [self.vocab[word]]
        res = []
        for ngram_id in ft_ngram_hashes(word, **self.hash_params):
            res.append(ngram_id + len(self.vocab))
        return res

    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        return self.single_id_indexer.count_vocab_items(token, counter)

    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[TokenType]]:
        words = [token.text for token in tokens]
        word_ngram_ids, word_lengths, mask = self.words_to_indexes(words)
        return {
            f"{index_name}-ngram": word_ngram_ids,
            f"{index_name}-ngram-lengths": word_lengths,
            f"{index_name}-ngram-mask": mask,
            **self.single_id_indexer.tokens_to_indices(tokens, vocabulary, index_name)
        }

    def get_padding_token(self) -> TokenType:
        return 0

    def get_padding_lengths(self, token: TokenType) -> Dict[str, int]:
        return {}

    def as_padded_tensor(self,
                         tokens: Dict[str, List[TokenType]],
                         desired_num_tokens: Dict[str, int],
                         padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        padded = {
            key: pad_sequence_to_length(val, desired_num_tokens[key])
            for key, val in tokens.items()
        }
        return {key: torch.LongTensor(array) for key, array in padded.items()}
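# A hedged usage sketch for FasttextTokenIndexer, assuming the AllenNLP 0.9-style
# three-argument tokens_to_indices API used above. "wiki.en.bin" is a hypothetical
# fastText model path, and the helpers load_fasttext_model / ft_ngram_hashes are
# expected to come from the surrounding project (gensim-based).
from allennlp.data import Token, Vocabulary

indexer = FasttextTokenIndexer(model_path="wiki.en.bin", lowercase_tokens=True)
vocab = Vocabulary()  # word-level ids come from the wrapped SingleIdTokenIndexer
tokens = [Token("hello"), Token("untokenizable")]
arrays = indexer.tokens_to_indices(tokens, vocab, index_name="tokens")
# arrays["tokens-ngram"]:         flat list of subword ids for every token,
# arrays["tokens-ngram-lengths"]: how many subword ids belong to each token,
# arrays["tokens-ngram-mask"]:    1 per real subword id, used for padding.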
def test_no_namespace_means_no_indexing(self):
    indexer = SingleIdTokenIndexer(namespace=None, feature_name="text_id")
    assert indexer.tokens_to_indices([Token(text_id=23)], None) == {
        "tokens": [23]
    }
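# Sketch of what the test above checks: with namespace=None the indexer does no
# vocabulary lookup at all and simply passes through the requested feature, which
# is how pre-assigned ids (e.g. from an external tokenizer) can be carried along.
# The id values below are made up.
from allennlp.data import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexer = SingleIdTokenIndexer(namespace=None, feature_name="text_id")
tokens = [Token(text="hello", text_id=101), Token(text="world", text_id=2088)]
print(indexer.tokens_to_indices(tokens, None))  # {"tokens": [101, 2088]}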
class UnbiasedTopicalExtractorGloVe(nn.Module):
    def __init__(self, device, vocab_dir):
        super(UnbiasedTopicalExtractorGloVe, self).__init__()
        self.device = device
        self.vocab = Vocabulary.from_files(vocab_dir)
        self.embedding = Embedding(
            embedding_dim=300,
            trainable=True,
            num_embeddings=self.vocab.get_vocab_size("tokens"),
            pretrained_file="https://allennlp.s3.amazonaws.com/datasets/glove/glove.840B.300d.txt.gz")
        self.tokenizer = WordTokenizer()
        self.token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.rnn_input_dropout = InputVariationalDropout(0.5)
        self.encoder = _Seq2SeqWrapper(torch.nn.LSTM)(input_size=300,
                                                      hidden_size=300,
                                                      num_layers=1,
                                                      bidirectional=True)
        self._feature_feedforward_layer = torch.nn.Linear(600, 300)
        self._feature_feedforward_dropout = torch.nn.Dropout(0.5)
        self._feature_feedforward_activation = torch.nn.ReLU()
        self._class_classification_layer = torch.nn.Linear(300, 2)
        self._group_classification_layer = torch.nn.Linear(300, 2)
        init.xavier_uniform_(self._feature_feedforward_layer.weight)
        init.zeros_(self._feature_feedforward_layer.bias)
        init.xavier_uniform_(self._class_classification_layer.weight)
        init.zeros_(self._class_classification_layer.bias)
        init.xavier_uniform_(self._group_classification_layer.weight)
        init.zeros_(self._group_classification_layer.bias)
        self._class_loss = torch.nn.CrossEntropyLoss()
        self._group_loss = torch.nn.CrossEntropyLoss()

    def forward(self, alpha, tweets, class_labels=None, group_labels=None):
        batch_indices = []
        batch_masks = []
        max_len = 0
        for tweet in tweets:
            tokens = self.tokenizer.tokenize(tweet)
            indices = self.token_indexer.tokens_to_indices(
                tokens, self.vocab, index_name="tokens")["tokens"]
            max_len = max(len(indices), max_len)
            # The mask has 1 for real tokens and 0 for padding tokens.
            input_mask = [1] * len(indices)
            batch_indices.append(indices)
            batch_masks.append(input_mask)
        for indices, input_mask in zip(batch_indices, batch_masks):
            # Zero-pad up to the max sequence length within the batch.
            padding = [0] * (max_len - len(indices))
            indices += padding
            input_mask += padding
            assert len(indices) == max_len
            assert len(input_mask) == max_len
        batch_indices = torch.tensor(batch_indices).to(self.device)
        mask = torch.tensor(batch_masks).float().to(self.device)

        embedded_tweet = self.embedding(batch_indices)
        embedded_tweet = self.rnn_input_dropout(embedded_tweet)
        # Encode the tweet, (batch_size, tweet_length, hidden_dim).
        encoded_tweet = self.encoder(embedded_tweet, mask)
        # The pooling layer -- max pooling.
        # (batch_size, model_dim)
        encode_max, _ = replace_masked_values(encoded_tweet, mask.unsqueeze(-1),
                                              -1e7).max(dim=1)
        feature = self._feature_feedforward_dropout(
            self._feature_feedforward_activation(
                self._feature_feedforward_layer(encode_max)))
        # Reverse gradients (scaled by alpha) before the group classifier so the
        # shared features are trained adversarially against the group signal.
        reverse_feature = ReverseLayerF.apply(feature, alpha)
        class_logits = self._class_classification_layer(feature)
        group_logits = self._group_classification_layer(reverse_feature)
        class_probs = torch.nn.functional.softmax(class_logits, dim=-1)
        group_probs = torch.nn.functional.softmax(group_logits, dim=-1)

        output_dict = {
            "class_logits": class_logits,
            "group_logits": group_logits,
            "class_probs": class_probs,
            "group_probs": group_probs
        }
        if class_labels is not None:
            class_loss = self._class_loss(class_logits,
                                          class_labels.long().view(-1))
            output_dict["class_loss"] = class_loss
        if group_labels is not None:
            group_loss = self._group_loss(group_logits,
                                          group_labels.long().view(-1))
            output_dict["group_loss"] = group_loss
        return output_dict
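# A hedged sketch of driving UnbiasedTopicalExtractorGloVe for one training step.
# The vocabulary directory, tweets, and labels are placeholders, and alpha is the
# gradient-reversal strength for the adversarial group objective.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UnbiasedTopicalExtractorGloVe(device, vocab_dir="vocabulary/").to(device)

tweets = ["an example tweet", "another example tweet"]
class_labels = torch.tensor([0, 1]).to(device)
group_labels = torch.tensor([1, 0]).to(device)

output = model(alpha=0.5, tweets=tweets,
               class_labels=class_labels, group_labels=group_labels)
# A typical adversarial objective sums the class loss and the (gradient-reversed)
# group loss before backpropagating.
loss = output["class_loss"] + output["group_loss"]
loss.backward()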