def test_read_embedding_file_inside_archive(self):
    """Check reading an embeddings file that lives inside a multi-file archive.

    Passing the bare archive path as ``pretrained_file`` must raise
    ``ValueError``.  Passing a URI built by ``format_embeddings_file_uri``
    that names ``folder/fake_embeddings.5d.txt`` must yield exactly the
    vectors in ``token2vec``, for both the ``.zip`` and ``.tar.gz`` archives.
    """
    token2vec = {
        "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0]),
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
        'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
        'embedding_dim': 5,
    })
    # ``pytest.raises(..., message=...)`` was deprecated in pytest 4.1 and
    # removed in 5.0.  Calling ``pytest.fail`` after the statement under test
    # keeps the custom failure message and works on every pytest version.
    with pytest.raises(ValueError):
        Embedding.from_params(vocab, params)
        pytest.fail("No ValueError when pretrained_file is a multi-file archive")

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({'pretrained_file': file_uri, 'embedding_dim': 5})
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def setUp(self):
    """Create the vocabulary plus eager and lazy instance collections."""
    super(IteratorTest, self).setUp()
    self.token_indexers = {"tokens": SingleIdTokenIndexer()}
    self.vocab = Vocabulary()

    # Tokens are registered in a fixed order; each returned index is kept
    # on the test case under ``<word>_index``.
    register = self.vocab.add_token_to_namespace
    self.this_index = register('this')
    self.is_index = register('is')
    self.a_index = register('a')
    self.sentence_index = register('sentence')
    self.another_index = register('another')
    self.yet_index = register('yet')
    self.very_index = register('very')
    self.long_index = register('long')

    sentences = [
        ["this", "is", "a", "sentence"],
        ["this", "is", "another", "sentence"],
        ["yet", "another", "sentence"],
        ["this", "is", "a", "very", "very", "very", "very", "long", "sentence"],
        ["sentence"],
    ]
    instances = [self.create_instance(words) for words in sentences]

    class LazyIterable:
        """Iterable that hands out a fresh generator over ``instances`` each time."""

        def __iter__(self):
            return (item for item in instances)

    self.instances = instances
    self.lazy_instances = LazyIterable()
def setUp(self):
    """Build an encoder plus standalone embedding / inner encoder, all constant-initialised."""
    super(TestTokenCharactersEncoder, self).setUp()

    self.vocab = Vocabulary()
    for character in ("1", "2", "3", "4"):
        self.vocab.add_token_to_namespace(character, "token_characters")

    embedding_config = {
        "embedding_dim": 2,
        "vocab_namespace": "token_characters"
    }
    encoder_config = {
        "type": "cnn",
        "embedding_dim": 2,
        "num_filters": 4,
        "ngram_filter_sizes": [1, 2],
        "output_dim": 3
    }
    params = Params({"embedding": embedding_config, "encoder": encoder_config})

    # The combined encoder is built from a deep copy so that ``params`` can
    # still be used for the two standalone pieces below.
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab,
                                                      params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])

    def constant_init(tensor):
        return torch.nn.init.constant_(tensor, 1.)

    initializer = InitializerApplicator([(".*", constant_init)])
    for module in (self.encoder, self.embedding, self.inner_encoder):
        initializer(module)
def setUp(self):
    """Create a three-embedder ``BasicTextFieldEmbedder`` and matching index inputs."""
    super(TestBasicTextFieldEmbedder, self).setUp()

    self.vocab = Vocabulary()
    for token in ("1", "2", "3", "4"):
        self.vocab.add_token_to_namespace(token)

    # One plain embedding per key, differing only in output dimension.
    embedder_dims = (("words1", 2), ("words2", 5), ("words3", 3))
    params = Params({
        name: {"type": "embedding", "embedding_dim": dim}
        for name, dim in embedder_dims
    })
    self.token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                             params=params)

    index_rows = {
        "words1": [[0, 2, 3, 5]],
        "words2": [[1, 4, 3, 2]],
        "words3": [[1, 5, 1, 2]],
    }
    self.inputs = {name: torch.LongTensor(rows) for name, rows in index_rows.items()}
def setUp(self):
    """Populate the vocabulary and instances, then run the base-class set-up."""
    self.vocab = Vocabulary()
    for word in ("this", "is", "a", "sentence", "."):
        self.vocab.add_token_to_namespace(word)
    self.token_indexer = {"tokens": SingleIdTokenIndexer()}
    self.instances = self.get_instances()
    super(TestDataset, self).setUp()
def setUp(self):
    """Fill the ``words`` and ``characters`` namespaces, then run the base-class set-up."""
    self.vocab = Vocabulary()
    # (token, namespace) pairs, in the order they are added.
    entries = (
        ("sentence", 'words'),
        ("A", 'words'),
        ("A", 'characters'),
        ("s", 'characters'),
        ("e", 'characters'),
        ("n", 'characters'),
        ("t", 'characters'),
        ("c", 'characters'),
    )
    for token, namespace in entries:
        self.vocab.add_token_to_namespace(token, namespace=namespace)
    super(TestTextField, self).setUp()
def test_get_embedding_layer_uses_correct_embedding_dim(self):
    """Reading a 3-d gzip file works with dim 3 and raises with dim 4."""
    vocab = Vocabulary()
    for word in ('word1', 'word2'):
        vocab.add_token_to_namespace(word)

    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    file_lines = ("word1 1.0 2.3 -1.0\n", "word2 0.1 0.4 -4.0\n")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        for text in file_lines:
            embeddings_file.write(text.encode('utf-8'))

    weights = _read_pretrained_embeddings_file(embeddings_filename, 3, vocab)
    # 4 rows because of padding and OOV.
    assert tuple(weights.size()) == (4, 3)

    with pytest.raises(ConfigurationError):
        _read_pretrained_embeddings_file(embeddings_filename, 4, vocab)
def test_make_vocab_without_extension(self):
    """With ``extend`` set to False the saved ``tokens`` vocabulary is written out unchanged."""
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    for weird_token in ('some_weird_token_1', 'some_weird_token_2'):
        vocab.add_token_to_namespace(weird_token, namespace='tokens')
    # If extend is False, it is the user's responsibility to make sure that
    # dataset instances are indexable by the provided vocabulary. At least
    # @@UNKNOWN@@ should be present in any namespace that may see OOV entries
    # during indexing. The `tokens` namespace will see new words but has an
    # @@UNKNOWN@@ token; the 'labels' namespace has none, so 'N' and 'V'
    # must be added up front.
    for label in ('N', 'V'):
        vocab.add_token_to_namespace(label, namespace='labels')

    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    make_vocab_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as vocab_file:
        tokens = [entry.strip() for entry in vocab_file]

    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
def test_index_converts_field_correctly(self):
    """Indexing a ``TextField`` yields the expected word and character ids."""
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
    # Lower-case characters are registered in the order s, e, n, t, c.
    char_index = {
        char: vocab.add_token_to_namespace(char, namespace='characters')
        for char in ("s", "e", "n", "t", "c")
    }

    expected_words = [capital_a_index, sentence_index]
    expected_characters = [[capital_a_char_index],
                           [char_index[char] for char in "sentence"]]

    def make_tokens():
        return [Token(text) for text in ["A", "sentence"]]

    # pylint: disable=protected-access
    words_only = TextField(make_tokens(),
                           {"words": SingleIdTokenIndexer(namespace="words")})
    words_only.index(vocab)
    assert words_only._indexed_tokens["words"] == expected_words

    characters_only = TextField(
            make_tokens(),
            {"characters": TokenCharactersIndexer(namespace="characters")})
    characters_only.index(vocab)
    assert characters_only._indexed_tokens["characters"] == expected_characters

    combined = TextField(
            make_tokens(),
            token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                            "characters": TokenCharactersIndexer(namespace="characters")})
    combined.index(vocab)
    assert combined._indexed_tokens["words"] == expected_words
    assert combined._indexed_tokens["characters"] == expected_characters
class TestDataset(AllenNlpTestCase):
    """Tests for ``Batch`` built from instances holding two text fields."""

    def setUp(self):
        """Populate the vocabulary and instances, then run the base-class set-up."""
        self.vocab = Vocabulary()
        for word in ("this", "is", "a", "sentence", "."):
            self.vocab.add_token_to_namespace(word)
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        """Instances with different field names cannot be batched together."""
        tagged = Instance({"tag": (LabelField(1, skip_indexing=True))})
        worded = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([tagged, worded])

    def test_padding_lengths_uses_max_instance_lengths(self):
        """Padding lengths are the per-field maxima across the batch."""
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        expected_lengths = {"text1": {"num_tokens": 5}, "text2": {"num_tokens": 6}}
        assert batch.get_padding_lengths() == expected_lengths

    def test_as_tensor_dict(self):
        """Tensor conversion produces the expected padded index matrices."""
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        tensors = batch.as_tensor_dict(batch.get_padding_lengths())

        expected = {
            "text1": numpy.array([[2, 3, 4, 5, 6],
                                  [1, 3, 4, 5, 6]]),
            "text2": numpy.array([[2, 3, 4, 1, 5, 6],
                                  [2, 3, 1, 0, 0, 0]]),
        }
        actual_text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        actual_text2 = tensors["text2"]["tokens"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(actual_text1, expected["text1"])
        numpy.testing.assert_array_almost_equal(actual_text2, expected["text2"])

    def get_instances(self):
        """Return two instances, each with a ``text1`` and a ``text2`` field."""
        def text_field(words):
            return TextField([Token(word) for word in words], self.token_indexer)

        first = Instance({
            "text1": text_field(["this", "is", "a", "sentence", "."]),
            "text2": text_field(["this", "is", "a", "different", "sentence", "."]),
        })
        second = Instance({
            "text1": text_field(["here", "is", "a", "sentence", "."]),
            "text2": text_field(["this", "is", "short"]),
        })
        return [first, second]
def test_tokens_to_indices_uses_pos_tags(self):
    """``DepLabelIndexer`` maps tokens to ids from the ``dep_labels`` namespace."""
    tokens = list(self.tokenizer.split_words("This is a sentence."))
    tokens.append(Token("</S>"))

    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
    none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
    indexer = DepLabelIndexer()

    second_token_ids = indexer.tokens_to_indices([tokens[1]], vocab, "tokens1")
    assert second_token_ids == {"tokens1": [root_index]}

    last_token_ids = indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1")
    assert last_token_ids == {"tokens-1": [none_index]}
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             attend_feedforward: FeedForward,
             similarity_function: SimilarityFunction,
             compare_feedforward: FeedForward,
             aggregate_feedforward: FeedForward,
             premise_encoder: Optional[Seq2SeqEncoder] = None,
             hypothesis_encoder: Optional[Seq2SeqEncoder] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    """
    Wire up the sub-modules of the decomposable attention model.

    Parameters
    ----------
    vocab : ``Vocabulary``
        Passed to the base-class constructor; the size of its ``labels``
        namespace is stored as the number of output labels.
    text_field_embedder : ``TextFieldEmbedder``
        Stored as-is. Its output dim is checked against the input dim of
        ``attend_feedforward``.
    attend_feedforward : ``FeedForward``
        Stored wrapped in ``TimeDistributed``.
    similarity_function : ``SimilarityFunction``
        Used to construct the ``LegacyMatrixAttention`` module.
    compare_feedforward : ``FeedForward``
        Stored wrapped in ``TimeDistributed``.
    aggregate_feedforward : ``FeedForward``
        Stored as-is. Its output dim is checked against the number of labels.
    premise_encoder : ``Seq2SeqEncoder``, optional (default=``None``)
        Stored as-is.
    hypothesis_encoder : ``Seq2SeqEncoder``, optional (default=``None``)
        When falsy, ``premise_encoder`` is stored in its place.
    initializer : ``InitializerApplicator``, optional
        Called on this model as the final construction step.
    regularizer : ``RegularizerApplicator``, optional (default=``None``)
        Passed to the base-class constructor.
    """
    super(DecomposableAttention, self).__init__(vocab, regularizer)

    self._text_field_embedder = text_field_embedder
    self._attend_feedforward = TimeDistributed(attend_feedforward)
    self._matrix_attention = LegacyMatrixAttention(similarity_function)
    self._compare_feedforward = TimeDistributed(compare_feedforward)
    self._aggregate_feedforward = aggregate_feedforward
    self._premise_encoder = premise_encoder
    # The premise encoder doubles as the hypothesis encoder when none is given.
    self._hypothesis_encoder = hypothesis_encoder or premise_encoder

    self._num_labels = vocab.get_vocab_size(namespace="labels")

    # NOTE(review): presumably these raise a configuration error on a
    # mismatch -- confirm against ``check_dimensions_match``.
    check_dimensions_match(text_field_embedder.get_output_dim(), attend_feedforward.get_input_dim(),
                           "text field embedding dim", "attend feedforward input dim")
    check_dimensions_match(aggregate_feedforward.get_output_dim(), self._num_labels,
                           "final output dimension", "number of labels")

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()

    # Run last so that every sub-module assigned above already exists.
    initializer(self)
def test_batch_predictions_are_consistent(self):
    """Run the batch-consistency check on a model with the character CNN removed.

    ``self.model`` and ``self.instances`` are temporarily replaced for the
    duration of the check and are restored afterwards, even when the check
    fails.
    """
    # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
    # changing the amount of padding in the batch will result in small differences in the
    # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
    # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
    # from the model for this test.  If/when we fix the CNN encoder to work correctly with
    # masking, we can change this back to how the other models run this test, with just a
    # single line.
    # pylint: disable=protected-access,attribute-defined-outside-init

    # Save some state.
    saved_model = self.model
    saved_instances = self.instances

    try:
        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        # Keep only the word-level indexer; the character-level one is dropped.
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read(self.FIXTURES_ROOT / 'data' / 'squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_embedders']['token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab=vocab, params=params['model'])

        self.ensure_batch_predictions_are_consistent()
    finally:
        # Restore the state, whether or not the check above raised.
        self.model = saved_model
        self.instances = saved_instances
def setUp(self):
    """Load the tagging fixture and build model, optimizer and iterator for trainer tests."""
    super(TestTrainer, self).setUp()

    data_path = self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'
    self.instances = SequenceTaggingDatasetReader().read(data_path)
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab

    embedder_config = {
        "tokens": {
            "type": "embedding",
            "embedding_dim": 5
        }
    }
    encoder_config = {
        "type": "lstm",
        "input_size": 5,
        "hidden_size": 7,
        "num_layers": 2
    }
    self.model_params = Params({
        "text_field_embedder": embedder_config,
        "encoder": encoder_config
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)

    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)
def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    """Padding two indexed tokens to three yields two character rows and one all-zero row."""
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token('Second'), Token('.')]
    indices = indexer.tokens_to_indices(tokens, Vocabulary(), "test-elmo")["test-elmo"]
    padded_tokens = indexer.pad_token_sequence({'test-elmo': indices},
                                               desired_num_tokens={'test-elmo': 3},
                                               padding_lengths={})

    # Every expected row is 50 ids wide; token rows are filled out with 261
    # after their leading ids, and the padding token is a row of zeros.
    row_width = 50

    def token_row(leading_ids):
        return leading_ids + [261] * (row_width - len(leading_ids))

    expected_padded_tokens = [
        token_row([259, 84, 102, 100, 112, 111, 101, 260]),
        token_row([259, 47, 260]),
        [0] * row_width,
    ]

    assert padded_tokens['test-elmo'] == expected_padded_tokens
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
    # pylint: disable=arguments-differ
    """
    Construct the embedder from ``params``.

    ``options_file`` and ``weight_file`` are required and are registered with
    ``params.add_file_to_archive``. When ``namespace_to_cache`` is given, the
    tokens of that vocabulary namespace are passed on as ``vocab_to_cache``.
    """
    for file_key in ('options_file', 'weight_file'):
        params.add_file_to_archive(file_key)

    options_file = params.pop('options_file')
    weight_file = params.pop('weight_file')
    requires_grad = params.pop('requires_grad', False)
    do_layer_norm = params.pop_bool('do_layer_norm', False)
    dropout = params.pop_float("dropout", 0.5)

    namespace_to_cache = params.pop("namespace_to_cache", None)
    vocab_to_cache = None
    if namespace_to_cache is not None:
        token_to_index = vocab.get_token_to_index_vocabulary(namespace_to_cache)
        vocab_to_cache = list(token_to_index.keys())

    projection_dim = params.pop_int("projection_dim", None)
    params.assert_empty(cls.__name__)

    return cls(options_file=options_file,
               weight_file=weight_file,
               do_layer_norm=do_layer_norm,
               dropout=dropout,
               requires_grad=requires_grad,
               projection_dim=projection_dim,
               vocab_to_cache=vocab_to_cache)
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Turn tokenized sentences into a padded tensor of ELMo character ids,
    shaped ``(len(batch), max sentence length, max word length)``.

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    indexer = ELMoTokenCharactersIndexer()
    # One single-field instance per sentence; all share the same indexer.
    instances = [
        Instance({
            "elmo": TextField([Token(word) for word in sentence],
                              {'character_ids': indexer})
        })
        for sentence in batch
    ]

    dataset = Batch(instances)
    dataset.index_instances(Vocabulary())
    return dataset.as_tensor_dict()['elmo']['character_ids']
def make_vocab_from_params(params: Params, serialization_dir: str):
    """
    Build a vocabulary from the datasets described by ``params`` and write it
    to ``<serialization_dir>/vocabulary``.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration. The ``vocabulary`` and
        ``datasets_for_vocab_creation`` keys are popped from it here.
    serialization_dir : ``str``
        Output directory; created when it does not exist.

    Raises
    ------
    ConfigurationError
        If the target ``vocabulary`` directory already exists and contains
        files, or if ``datasets_for_vocab_creation`` names a dataset that was
        not loaded.
    """
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # ``os.listdir`` always returns a list, never ``None``, so an identity
    # test against ``None`` would reject an existing but empty directory.
    # Truthiness of the listing is the actual "non-empty" check.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    # Filter on the dataset key before iterating, so datasets that were not
    # selected are never read.
    instances = [
        instance
        for key, dataset in all_datasets.items()
        if key in datasets_for_vocab_creation
        for instance in dataset
    ]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info("writing the vocabulary to %s.", vocab_dir)
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model from the experiment configuration,
    the vocabulary saved under ``serialization_dir`` and a weights file
    (``_DEFAULT_WEIGHTS`` inside ``serialization_dir`` unless overridden).
    """
    if not weights_file:
        weights_file = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab = Vocabulary.from_files(os.path.join(serialization_dir, 'vocabulary'))

    model_params = config.get('model')

    # The experiment config describes how to _train_ a model, including where
    # pre-trained embeddings come from.  When _loading_, those embeddings are
    # already part of the stored weights, so the pretrained-file settings are
    # stripped to stop the code from looking for the files.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    device_map = util.device_mapping(cuda_device)
    model_state = torch.load(weights_file, map_location=device_map)
    model.load_state_dict(model_state)

    # Force the model onto cpu or gpu, as appropriate, so that the embeddings
    # are in sync with the weights.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    """
    Pair each ``tokens`` index of ``self.vocab`` with the index of the same
    token in ``archived_vocab``, skipping tokens the archived vocabulary
    does not contain.
    """
    namespace = 'tokens'
    index_pairs: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace=namespace)):
        token = self.vocab.get_token_from_index(index=index, namespace=namespace)
        archived_token_index = archived_vocab.get_token_index(token, namespace=namespace)
        # A token unknown to the archived vocabulary resolves to the UNK
        # index.  New tokens must not all inherit UNK's representation, so
        # the pair is kept only when the archived index maps back to the
        # same token.
        archived_token = archived_vocab.get_token_from_index(archived_token_index,
                                                             namespace="tokens")
        if archived_token != token:
            continue
        index_pairs.append((index, archived_token_index))
    return index_pairs
class TestTokenCharactersEncoder(AllenNlpTestCase):
    """Checks ``TokenCharactersEncoder`` against a manual embed-then-encode pipeline."""

    def setUp(self):
        """Build an encoder plus standalone embedding / inner encoder, all constant-initialised."""
        super(TestTokenCharactersEncoder, self).setUp()

        self.vocab = Vocabulary()
        for character in ("1", "2", "3", "4"):
            self.vocab.add_token_to_namespace(character, "token_characters")

        embedding_config = {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        }
        encoder_config = {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
        params = Params({"embedding": embedding_config, "encoder": encoder_config})

        # The combined encoder is built from a deep copy so that ``params``
        # can still be used for the two standalone pieces below.
        self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab,
                                                          params=deepcopy(params))
        self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
        self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])

        def constant_init(tensor):
            return torch.nn.init.constant_(tensor, 1.)

        initializer = InitializerApplicator([(".*", constant_init)])
        for module in (self.encoder, self.embedding, self.inner_encoder):
            initializer(module)

    def test_get_output_dim_uses_encoder_output_dim(self):
        """The reported output dim is the configured encoder ``output_dim``."""
        output_dim = self.encoder.get_output_dim()
        assert output_dim == 3

    def test_forward_applies_embedding_then_encoder(self):
        """The combined encoder matches embedding then encoding a flattened batch."""
        batch_size, num_tokens, num_characters = 3, 4, 7
        flat_size = batch_size * num_tokens

        random_ids = numpy.random.randint(6, size=(batch_size, num_tokens, num_characters))
        inputs = torch.from_numpy(random_ids)
        encoder_output = self.encoder(inputs)

        # Manual path: flatten the tokens, embed, mask out zeros, encode, restore shape.
        flat_inputs = inputs.view(flat_size, num_characters)
        embedded = self.embedding(flat_inputs)
        mask = (inputs != 0).long().view(flat_size, num_characters)
        flat_output = self.inner_encoder(embedded, mask)
        manual_output = flat_output.view(batch_size, num_tokens, 3)

        assert_almost_equal(encoder_output.data.numpy(), manual_output.data.numpy())
def test_read_hdf5_raises_on_invalid_shape(self):
    """An hdf5 file holding 10-d vectors is rejected when ``embedding_dim`` is 5."""
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")

    embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
    stored_vectors = numpy.random.rand(vocab.get_vocab_size(), 10)
    with h5py.File(embeddings_filename, 'w') as hdf5_file:
        _ = hdf5_file.create_dataset('embedding',
                                     stored_vectors.shape,
                                     dtype='float32',
                                     data=stored_vectors)

    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 5,
    })
    with pytest.raises(ConfigurationError):
        _ = Embedding.from_params(vocab, params)
def test_label_field_can_index_with_vocab(self):
    """A string label is indexed through the ``labels`` namespace; here it becomes 0."""
    vocab = Vocabulary()
    for label_name in ("entailment", "contradiction", "neutral"):
        vocab.add_token_to_namespace(label_name, namespace="labels")

    label = LabelField("entailment")
    label.index(vocab)
    padding_lengths = label.get_padding_lengths()
    tensor = label.as_tensor(padding_lengths)
    assert tensor.item() == 0
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    """
    Return the ``words``-namespace ids of ``tokens`` framed by the fixed ids
    ``10, 15`` and ``25`` under ``"token_ids"``, plus a constant
    ``"additional_key"`` entry.  ``index_name`` is not used.
    """
    word_ids = [vocabulary.get_token_index(token.text, 'words') for token in tokens]
    return {
            "token_ids": [10, 15] + word_ids + [25],
            "additional_key": [22, 29]
    }
def test_tokens_to_indices_produces_correct_characters(self):
    """Each character of a token is looked up in the ``characters`` namespace."""
    vocab = Vocabulary()
    for character in ("A", "s", "e", "n", "t", "c"):
        vocab.add_token_to_namespace(character, namespace='characters')

    indexer = TokenCharactersIndexer("characters")
    indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")

    # The last three characters ("i", "a", "l") are not in the vocabulary
    # and are expected to come back as id 1.
    expected_ids = [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]
    assert indices == {"char": [expected_ids]}
def setUp(self):
    """Create tokenizer, table graph, vocabulary indices and the field under test."""
    self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
    self.utterance = self.tokenizer.tokenize("where is mersin?")
    self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

    table_json = {
        'question': self.utterance,
        'columns': ['Name in English', 'Location in English'],
        'cells': [['Paradeniz', 'Mersin'],
                  ['Lake Gala', 'Edirne']]
    }
    self.graph = TableQuestionKnowledgeGraph.read_from_json(table_json)

    self.vocab = Vocabulary()

    def add_token(text):
        return self.vocab.add_token_to_namespace(text, namespace='tokens')

    self.name_index = add_token("name")
    self.in_index = add_token("in")
    self.english_index = add_token("english")
    self.location_index = add_token("location")
    self.paradeniz_index = add_token("paradeniz")
    self.mersin_index = add_token("mersin")
    self.lake_index = add_token("lake")
    self.gala_index = add_token("gala")
    self.negative_one_index = add_token("-1")
    self.zero_index = add_token("0")
    self.one_index = add_token("1")
    # "edirne" is never added above, so its index is the same as the one
    # looked up for an arbitrary unseen string.
    self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
    self.edirne_index = self.oov_index

    self.field = KnowledgeGraphField(self.graph, self.utterance,
                                     self.token_indexers, self.tokenizer)

    super(KnowledgeGraphFieldTest, self).setUp()
def _read_embeddings_from_hdf5(embeddings_filename: str,
                               embedding_dim: int,
                               vocab: Vocabulary,
                               namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads an embedding matrix from an hdf5 file.

    The matrix is read from the ``'embedding'`` key and must have shape
    ``(vocab size of namespace, embedding_dim)``; any other shape raises
    ``ConfigurationError``.
    """
    with h5py.File(embeddings_filename, 'r') as fin:
        embeddings = fin['embedding'][...]

    actual_shape = list(embeddings.shape)
    expected_shape = [vocab.get_vocab_size(namespace), embedding_dim]
    if actual_shape == expected_shape:
        return torch.FloatTensor(embeddings)

    raise ConfigurationError(
            "Read shape {0} embeddings from the file, but expected {1}".format(
                    actual_shape, expected_shape))
def test_unicode_to_char_ids(self):
    """A token starting with a non-ASCII character is indexed to the expected ids."""
    indexer = ELMoTokenCharactersIndexer()
    token = Token(chr(256) + 't')
    indices = indexer.tokens_to_indices([token], Vocabulary(), "test-unicode")

    # The expected row is 50 ids wide: the leading ids, then 261 to the end.
    leading_ids = [259, 197, 129, 117, 260]
    expected_indices = leading_ids + [261] * (50 - len(leading_ids))
    assert indices == {"test-unicode": [expected_indices]}
def test_bos_to_char_ids(self):
    """The ``<S>`` token is indexed to the expected id row."""
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token('<S>')], Vocabulary(), "test-elmo")

    # The expected row is 50 ids wide: the leading ids, then 261 to the end.
    leading_ids = [259, 257, 260]
    expected_indices = leading_ids + [261] * (50 - len(leading_ids))
    assert indices == {"test-elmo": [expected_indices]}
def setUp(self):
    """Create the vocabulary, indexers and assorted fields used by the list-field tests."""
    self.vocab = Vocabulary()
    # Tokens per namespace, in insertion order.
    namespace_tokens = (
        ("words", ("this", "is", "a", "sentence")),
        ("characters", ("s", "e", "n", "t", "c")),
        ("labels", ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k')),
    )
    for namespace, tokens in namespace_tokens:
        for token in tokens:
            self.vocab.add_token_to_namespace(token, namespace)

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters")
    }

    def word_field(words):
        return TextField([Token(word) for word in words], self.word_indexer)

    self.field1 = word_field(["this", "is", "a", "sentence"])
    self.field2 = word_field(["this", "is", "a", "different", "sentence"])
    self.field3 = word_field(["this", "is", "another", "sentence"])

    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    super(TestListField, self).setUp()