def test_set_from_file_reads_padded_files(self):
    # pylint: disable=protected-access
    vocab_filename = self.TEST_DIR / 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('<S>\n')
        vocab_file.write('</S>\n')
        vocab_file.write('<UNK>\n')
        vocab_file.write('a\n')
        vocab_file.write('tricky\x0bchar\n')
        vocab_file.write('word\n')
        vocab_file.write('another\n')

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=True, oov_token="<UNK>")

    assert vocab._oov_token == DEFAULT_OOV_TOKEN
    assert vocab.get_token_index("random string") == 3
    assert vocab.get_token_index("<S>") == 1
    assert vocab.get_token_index("</S>") == 2
    assert vocab.get_token_index(DEFAULT_OOV_TOKEN) == 3
    assert vocab.get_token_index("a") == 4
    assert vocab.get_token_index("tricky\x0bchar") == 5
    assert vocab.get_token_index("word") == 6
    assert vocab.get_token_index("another") == 7
    assert vocab.get_token_from_index(0) == vocab._padding_token
    assert vocab.get_token_from_index(1) == "<S>"
    assert vocab.get_token_from_index(2) == "</S>"
    assert vocab.get_token_from_index(3) == DEFAULT_OOV_TOKEN
    assert vocab.get_token_from_index(4) == "a"
    assert vocab.get_token_from_index(5) == "tricky\x0bchar"
    assert vocab.get_token_from_index(6) == "word"
    assert vocab.get_token_from_index(7) == "another"
def test_add_word_to_index_gives_consistent_results(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1

    # Now add it again, and make sure nothing changes.
    vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1
def test_namespaces(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word", namespace='1')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
    assert vocab.get_token_index("word", namespace='1') == word_index
    assert vocab.get_token_from_index(word_index, namespace='1') == "word"
    assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

    # Now add it again, in a different namespace, along with a different word,
    # and make sure the new namespace starts fresh.
    word2_index = vocab.add_token_to_namespace("word2", namespace='2')
    word_index = vocab.add_token_to_namespace("word", namespace='2')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert vocab.get_token_index("word", namespace='2') == word_index
    assert vocab.get_token_index("word2", namespace='2') == word2_index
    assert vocab.get_token_from_index(word_index, namespace='2') == "word"
    assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
    assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
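# A minimal sketch (not from the original code) of how the (new_index,
# archived_index) pairs returned by _get_vocab_index_mapping are typically
# consumed: copying rows of an archived embedding matrix into a freshly
# initialized one, so that tokens shared with the archived vocabulary keep
# their learned representations. The function and tensor names here are
# illustrative assumptions.
import torch

def copy_shared_embedding_rows(new_weight: torch.Tensor,
                               archived_weight: torch.Tensor,
                               vocab_index_mapping) -> torch.Tensor:
    for new_index, archived_index in vocab_index_mapping:
        # Only rows whose token also exists in the archived vocabulary are
        # overwritten; every other row keeps its fresh initialization.
        new_weight[new_index] = archived_weight[archived_index]
    return new_weight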
def test_set_from_file_reads_non_padded_files(self):
    # pylint: disable=protected-access
    vocab_filename = self.TEST_DIR / 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('B-PERS\n')
        vocab_file.write('I-PERS\n')
        vocab_file.write('O\n')
        vocab_file.write('B-ORG\n')
        vocab_file.write('I-ORG\n')

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')

    assert vocab.get_token_index("B-PERS", namespace='tags') == 0
    assert vocab.get_token_index("I-PERS", namespace='tags') == 1
    assert vocab.get_token_index("O", namespace='tags') == 2
    assert vocab.get_token_index("B-ORG", namespace='tags') == 3
    assert vocab.get_token_index("I-ORG", namespace='tags') == 4
    assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
    assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
    assert vocab.get_token_from_index(2, namespace='tags') == "O"
    assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
    assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
def indices_to_tokens(self, indexed_tokens: IndexedTokenList,
                      vocabulary: Vocabulary) -> List[Token]:
    self._add_encoding_to_vocabulary_if_needed(vocabulary)

    token_ids = indexed_tokens["token_ids"]
    type_ids = indexed_tokens.get("type_ids")

    return [
        Token(
            text=vocabulary.get_token_from_index(token_ids[i], self._namespace),
            text_id=token_ids[i],
            type_id=type_ids[i] if type_ids is not None else None,
        )
        for i in range(len(token_ids))
    ]
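# Hedged round-trip sketch for indices_to_tokens above. The method matches the
# signature of AllenNLP's PretrainedTransformerIndexer, which is assumed here;
# the model name is illustrative.
from allennlp.data import Vocabulary
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
indexer = PretrainedTransformerIndexer("bert-base-uncased")
vocab = Vocabulary()

tokens = tokenizer.tokenize("a gallon of milk")
indexed = indexer.tokens_to_indices(tokens, vocab)
recovered = indexer.indices_to_tokens(indexed, vocab)
# The recovered wordpieces (including [CLS]/[SEP]) match the tokenized input.
print([t.text for t in tokens])
print([t.text for t in recovered])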
def get_question_tensors_for_clause_tensors_batched(
        batch_size: int,
        vocab: Vocabulary,
        all_slots: Dict[str, torch.LongTensor],
        all_probs: torch.FloatTensor):
    clause_slots = {
        k[len("clause-"):]: v
        for k, v in all_slots.items()
        if k.startswith("clause-")
    }
    question_slot_names = ["wh", "aux", "subj", "verb", "obj", "prep", "obj2"]
    clause_slot_names = ["subj", "aux", "verb", "obj", "prep1", "prep1-obj",
                         "prep2", "prep2-obj", "misc", "qarg"]
    stringy_clause_slots = [
        {k: vocab.get_token_from_index(
            v[i].item(),
            namespace=get_slot_label_namespace("clause-%s" % k))
         for k, v in clause_slots.items()}
        for i in range(batch_size)
    ]
    filtered_stringy_clause_slots = []
    stringy_question_slots = []
    question_probs = []
    for i in range(len(stringy_clause_slots)):
        try:
            stringy_question_slots.append(get_question_for_clause(stringy_clause_slots[i], vocab))
            filtered_stringy_clause_slots.append(stringy_clause_slots[i])
            question_probs.append(all_probs[i].item())
        except ValueError as e:
            print(str(e))

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda:%s" % torch.cuda.current_device())

    # Re-index only the clauses that produced a valid question; iterating over
    # the unfiltered list here would misalign the clause, question, and
    # probability tensors whenever a clause is skipped.
    filtered_clause_slots = {
        ("clause-%s" % slot_name): torch.tensor(
            [vocab.get_token_index(slots[slot_name],
                                   namespace=get_slot_label_namespace("clause-%s" % slot_name))
             for slots in filtered_stringy_clause_slots],
            device=device
        ).long()
        for slot_name in clause_slot_names
    }
    question_slots = {
        slot_name: torch.tensor(
            [vocab.get_token_index(slots[slot_name],
                                   namespace=get_slot_label_namespace(slot_name))
             for slots in stringy_question_slots],
            device=device
        ).long()
        for slot_name in question_slot_names
    }
    question_probs_tensor = torch.tensor(question_probs, device=device)
    return filtered_clause_slots, question_slots, question_probs_tensor
def __init__(self,
             experiment_path: str,
             vocab: Vocabulary,
             embedder: SyllableEmbedder,
             embedding_dim: int = 300) -> None:
    summary_writer = SummaryWriter(f'{experiment_path}/log/visualization')
    data_reader = KoWikiReader()

    words = [vocab.get_token_from_index(i, namespace='words')
             for i in range(vocab.get_vocab_size('words'))]
    embeddings = torch.zeros(vocab.get_vocab_size('words'), embedding_dim)
    for i, c in enumerate(words):
        word = data_reader.text_to_instance(source=Token(c))['source']
        word.index(vocab)
        word_tensor = word.as_tensor(word.get_padding_lengths())['syllables']
        embeddings[i] = embedder(word_tensor)

    summary_writer.add_embedding(embeddings, metadata=words, tag='syllable_embeddings')
    summary_writer.close()
def index(self, vocab: Vocabulary):
    if self._index_label_id is None:
        self._index_label_id = vocab.get_token_index(
            self.index_label, self._index_label_namespace)  # type: ignore
    if self._set_label_id is None:
        self._set_label_id = vocab.get_token_index(
            self.set_label, self._set_label_namespace)  # type: ignore
    if len(self.target_label_set) == 0:
        set_set = vocab.get_token_from_index(self.index_label, self._set_index_map)
        flat_set = []
        for tgt_set in set_set:
            # get_token_index returns a single id, so append it; the original
            # `flat_set += ...` would fail when concatenating an int to a list.
            flat_set.append(vocab.get_token_index(
                tgt_set, self._set_target_map))  # type: ignore
        self.target_label_set = set(flat_set)
    target_ind_set = [
        vocab.get_token_index(lb, self._target_label_namespace)
        for lb in self.target_label_set
    ]
    self.target_ind_set = sorted(target_ind_set)
def decode_tokens(batch_predicted_indices: torch.Tensor,
                  vocab: Vocabulary,
                  end_index: int,
                  start_index: int = None,
                  vocab_namespace: str = 'tokens',
                  truncate: bool = False):
    if not isinstance(batch_predicted_indices, numpy.ndarray):
        batch_predicted_indices = batch_predicted_indices.detach().cpu().numpy()

    all_predicted_tokens = []
    for predicted_indices in batch_predicted_indices:
        # Beam search gives us the top k results for each source sentence in
        # the batch, so normalize a single prediction to a beam of size 1.
        if len(predicted_indices.shape) == 1:
            predicted_indices = numpy.expand_dims(predicted_indices, axis=0)
        instance_predicted_tokens = []
        for indices in predicted_indices:
            # We add the start token to the predictions.
            # In case it is present at position 0, remove it.
            if start_index is not None and start_index == indices[0]:
                indices = indices[1:]
            indices = list(indices)
            # Collect indices up to the first end symbol.
            if truncate and end_index in indices:
                indices = indices[:indices.index(end_index)]
            predicted_tokens = [
                vocab.get_token_from_index(x, namespace=vocab_namespace)
                for x in indices
            ]
            instance_predicted_tokens.append(predicted_tokens)
        all_predicted_tokens.append(instance_predicted_tokens)
    return all_predicted_tokens
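# A small, hypothetical demonstration of decode_tokens above; the vocabulary
# entries and the @start@/@end@ markers are made up for illustration.
import numpy
from allennlp.data import Vocabulary

vocab = Vocabulary()
start = vocab.add_token_to_namespace("@start@")
end = vocab.add_token_to_namespace("@end@")
hello = vocab.add_token_to_namespace("hello")
world = vocab.add_token_to_namespace("world")

# One instance in the batch, a single beam: [@start@, hello, world, @end@, world]
predictions = numpy.array([[start, hello, world, end, world]])
decoded = decode_tokens(predictions, vocab, end_index=end,
                        start_index=start, truncate=True)
print(decoded)  # [[['hello', 'world']]] -- start stripped, truncated at @end@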
# for article in data:
#     story = article['story']

a = "the man went to the store and bought a gallon of milk"
b = tokenizer.tokenize(a)
print(b)

bert_vocab = Vocabulary()
c = token_indexer.tokens_to_indices(b, bert_vocab, 'bert')
print(c)

# The indexer returns a flat list of wordpiece ids, so each id maps straight
# back to a wordpiece (the original nested loop treated each id as a sequence
# and would have failed on an int).
input_ids = c['bert']
tokens = [
    bert_vocab.get_token_from_index(index=idx, namespace='bert')
    for idx in input_ids
]
print(tokens)

d = token_embedder(torch.LongTensor(c['bert']))
print(d.size())
e = token_embedder(torch.LongTensor(c['bert']), torch.LongTensor(c['bert-offsets']))
print(e.size())

# d = TextField(b, {'bert': token_indexer})
# print(b)

# sentence1 = a
def train_lstm(train_dataset, batch_size, num_layers, use_elmo=False, epochs=15,
               bidirectional=True, learning_rate=3e-4, hidden_size=64,
               num_classes=2, use_gpu=False):
    """
    Trains an LSTM and its variants (vanilla, bidirectional, stacked BiLSTM) on
    train_dataset. Initialises word embeddings with pre-trained GloVe OR uses a
    pre-trained ELMo model to dynamically compute embeddings.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_layers: int
        number of BiLSTM layers: 2 or higher for stacked BiLSTMs
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train for (default=15)
    bidirectional: bool
        True for a bidirectional LSTM
    learning_rate: float
        learning rate for the Adam optimizer
    hidden_size: int
        size of the hidden layer in the encoder
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained model, vocabulary, number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'], namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.LSTM(word_embeddings.get_output_dim(), hidden_size,
                num_layers=num_layers, bidirectional=bidirectional,
                batch_first=True))
    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)

    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)
    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)
    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
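# Hedged usage sketch for train_lstm above. `read_instances` is a hypothetical
# stand-in for whatever dataset reader produces the List[Instance]; it is not
# part of the original code. train_cnn below is called the same way.
def run_lstm_training(train_path: str):
    train_dataset = read_instances(train_path)  # hypothetical helper
    model, vocab, n_epochs = train_lstm(
        train_dataset,
        batch_size=32,
        num_layers=2,       # 2+ layers gives the stacked BiLSTM variant
        use_elmo=False,     # False -> GloVe embeddings; True -> ELMo
        use_gpu=False,
    )
    return model, vocab, n_epochs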
def train_cnn(train_dataset, batch_size, num_filters, filter_sizes, use_elmo=False,
              epochs=15, learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains a CNN on train_dataset. Initialises word embeddings with pre-trained
    GloVe OR uses a pre-trained ELMo model to dynamically compute embeddings.
    The CNN has one convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of
        'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train for (default=15)
    learning_rate: float
        learning rate for the Adam optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained model, vocabulary, number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'], namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = CnnEncoder(
        embedding_dim=word_embeddings.get_output_dim(),
        num_filters=num_filters,
        ngram_filter_sizes=filter_sizes)
    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)

    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)
    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)
    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
# Assumed guard: the original snippet starts inside this `if`, and the
# `else: cuda_device = -1` branch implies an availability check like this one.
if torch.cuda.is_available():
    token_embedding = token_embedding.cuda(cuda_device)
    token_ids = token_ids.cuda(cuda_device)
else:
    cuda_device = -1

token_embedding.eval()
with torch.no_grad():
    embeddings = token_embedding(token_ids).cpu().numpy()
print(embeddings.shape)

tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=300)
tsne_results = tsne.fit_transform(embeddings)
plt.figure()
for i, (x, y) in enumerate(tsne_results):
    # Offset by 2, presumably to skip the padding and OOV entries.
    label = vocab.get_token_from_index(i + 2)
    color, marker = label_to_color_and_marker(label)
    plt.scatter(x, y, s=1, c=color, marker=marker)
    plt.text(x, y, label, fontsize=2, color=color)
plt.savefig("figures/tsne_results.pdf")

print(embeddings.shape)
pca = PCA(n_components=2)
pca_results = pca.fit_transform(embeddings)
plt.figure()
for i, (x, y) in enumerate(pca_results):
    label = vocab.get_token_from_index(i + 2)
    color, marker = label_to_color_and_marker(label)
    plt.scatter(x, y, s=1, c=color, marker=marker)
    plt.text(x, y, label, fontsize=2, color=color)
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests valid vocab extension thoroughly: vocab extension is valid when
    overlapping namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
    original_vocab namespaces
        tokens0     padded
        tokens1     non-padded
        tokens2     padded
        tokens3     non-padded
    instances namespaces
        tokens0     padded
        tokens1     non-padded
        tokens4     padded
        tokens5     non-padded
    Typical extension example (of the tokens1 namespace):
    -> original_vocab index2token
        apple   #0->apple
        bat     #1->bat
        cat     #2->cat
    -> Tokens to extend with: cat, an, apple, banana, atom, bat
    -> extended_vocab index2token
        apple   #0->apple
        bat     #1->bat
        cat     #2->cat
        an      #3->an
        atom    #4->atom
        banana  #5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")  # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")  # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")  # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")  # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:0
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:1
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:2
    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # namespaces tokens0 and tokens1 are common; tokens2 and tokens3 are only
    # in the vocab, and tokens4 and tokens5 are only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, the "tokens1" namespace of the
    # instances has 5 tokens, and 2 of them overlap.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 2 extra (padding, OOV) because padded

    # namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # namespaces tokens4 and tokens5 were only in the instances
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l,m,n,o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

    # The token-to-index mapping of every word in every namespace of
    # original_vocab should be preserved in extended_vocab
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-token mapping
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token
def test_from_params_valid_vocab_extension_thoroughly(self):
    """
    Tests valid vocab extension thoroughly: vocab extension is valid when
    overlapping namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
    original_vocab namespaces
        tokens0     padded
        tokens1     non-padded
        tokens2     padded
        tokens3     non-padded
    instances namespaces
        tokens0     padded
        tokens1     non-padded
        tokens4     padded
        tokens5     non-padded
    Typical extension example (of the tokens1 namespace):
    -> original_vocab index2token
        apple   #0->apple
        bat     #1->bat
        cat     #2->cat
    -> Tokens to extend with: cat, an, apple, banana, atom, bat
    -> extended_vocab index2token
        apple   #0->apple
        bat     #1->bat
        cat     #2->cat
        an      #3->an
        atom    #4->atom
        banana  #5->banana
    """
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")  # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")  # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")  # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")  # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:0
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:1
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:2
    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField(
        [Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
        {"tokens0": SingleIdTokenIndexer("tokens0")},
    )
    text_field1 = TextField(
        [Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
        {"tokens1": SingleIdTokenIndexer("tokens1")},
    )
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([
        Instance({
            "text0": text_field0,
            "text1": text_field1,
            "text4": text_field4,
            "text5": text_field5,
        })
    ])

    params = Params({
        "type": "extend",
        "directory": vocab_dir,
        "non_padded_namespaces": ["tokens1", "tokens5"],
    })
    extended_vocab = Vocabulary.from_params(params, instances=instances)

    # namespaces tokens0 and tokens1 are common; tokens2 and tokens3 are only
    # in the vocab, and tokens4 and tokens5 are only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, the "tokens1" namespace of the
    # instances has 5 tokens, and 2 of them overlap.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 2 extra (padding, OOV) because padded

    # namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # namespaces tokens4 and tokens5 were only in the instances
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l,m,n,o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

    # The token-to-index mapping of every word in every namespace of
    # original_vocab should be preserved in extended_vocab
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-token mapping
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token
class KNNPredictor(Predictor):
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 vocab_path: str = 'resources/vocab',
                 df_path: str = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/spotify_with_genius.csv',
                 annoy_index_path: str = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/index.tree'
                 ) -> None:
        super().__init__(model.eval(), dataset_reader)

        # from_files is a classmethod; no throwaway Vocabulary() instance needed.
        self.vocab = Vocabulary.from_files(vocab_path)
        self.df = pd.read_csv(df_path).set_index("track_id")

        self.index = None
        if annoy_index_path:
            self.build_index(annoy_index_path)

    def build_index(self, path: str, tracks: List[Tuple[str, np.ndarray]] = None):
        features = self._model.classifier_feedforward.get_output_dim()
        if tracks is None:
            # Load a prebuilt index, downloading it first if the path is a URL.
            if not os.path.exists(path):
                path = urlretrieve(path)[0]
            self.index = AnnoyIndex(features, metric='angular')
            self.index.load(path)
            return

        index = AnnoyIndex(features, metric='angular')
        for track, vector in tqdm(tracks):
            i = self.vocab.get_token_to_index_vocabulary("labels")[track]
            index.add_item(i, vector)

        index.build(-1)
        index.save(path)
        self.index = index

    def neighbors_to_tracks(self, nns):
        tracks = [self.vocab.get_token_from_index(i, "labels") for i in nns]
        return self.df.loc[tracks].reset_index(drop=True).to_dict(orient='records')

    def predict_json(self, inputs: JsonDict) -> JsonDict:
        n = inputs.pop('n', 10)
        if 'track_id' in inputs:
            if self.index is None:
                raise AttributeError("Please build an index before searching by track.")
            idx = self.vocab.get_token_to_index_vocabulary("labels")[inputs['track_id']]
            nns = self.index.get_nns_by_item(idx, n + 1)[1:]  # drop the query track itself
            # scores = self.index.get_item_vector(idx)
            tracks = self.neighbors_to_tracks(nns)
            return tracks
            # return {'tracks': tracks, 'scores': scores}

        instance = self._json_to_instance(inputs)
        output_dict = self.predict_instance(instance)
        output_dict['inputs'] = inputs

        if self.index:
            logits = output_dict.get('logits')
            nns = self.index.get_nns_by_vector(logits, n)
            return self.neighbors_to_tracks(nns)
            # output_dict['tracks'] = self.neighbors_to_tracks(nns)

        return output_dict

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(text=json_dict['query'])
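# Hedged usage sketch for KNNPredictor above; the archive path and track id are
# placeholders, and loading the model from an archive this way is an
# assumption about how the project is packaged.
#
# from allennlp.models.archival import load_archive
# archive = load_archive("model.tar.gz")
# predictor = KNNPredictor(archive.model, dataset_reader)
# by_track = predictor.predict_json({"track_id": "<track-id>", "n": 5})
# by_text = predictor.predict_json({"query": "dancing in the moonlight"})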