def test_load_pretrained_embeddings(self):
    """Loading the raw embedding file populates vocab, stoi, and vectors.

    Checks the vocabulary list, the string-to-index map, and the shape of
    the embedding matrix against the known fixture constants.
    """
    emb = embeddings_utils.PretrainedEmbedding(RAW_EMBEDDING_PATH)

    # Vocabulary list: expected size and two known entries.
    self.assertEqual(len(emb.embed_vocab), VOCAB_SIZE)
    self.assertEqual(emb.embed_vocab[0], "</s>")
    self.assertEqual(emb.embed_vocab[2], "to")

    # String-to-index map mirrors the vocabulary list.
    self.assertEqual(len(emb.stoi), VOCAB_SIZE)
    self.assertEqual(emb.stoi["</s>"], 0)
    self.assertEqual(emb.stoi["to"], 2)

    # Embedding matrix is VOCAB_SIZE rows by EMB_SIZE columns.
    vectors = emb.embedding_vectors
    self.assertEqual(vectors.size()[0], VOCAB_SIZE)
    self.assertEqual(vectors.size()[1], EMB_SIZE)
def test_initialize_embeddings_weights(self):
    """Initializing weights from a field vocab yields a correctly shaped tensor.

    Builds a text field, loads the raw embeddings with the field's lowercase
    setting, and verifies both the resulting weight tensor's shape and the
    field vocabulary's itos/stoi bookkeeping.
    """
    field = get_text_field()
    emb = embeddings_utils.PretrainedEmbedding(
        RAW_EMBEDDING_PATH, field.lower)

    weight_tensor = emb.initialize_embeddings_weights(
        field.vocab.stoi,
        VocabMeta.UNK_TOKEN,
        EMB_SIZE,
        EmbedInitStrategy.ZERO,
    )

    # One row per vocab entry (specials + "good" + "boy"), EMB_SIZE columns.
    self.assertEqual(weight_tensor.size()[0], 4)
    self.assertEqual(weight_tensor.size()[1], EMB_SIZE)

    # itos and stoi agree on the two real tokens.
    self.assertEqual(field.vocab.itos[2], "good")
    self.assertEqual(field.vocab.itos[3], "boy")
    self.assertEqual(field.vocab.stoi["good"], 2)
    self.assertEqual(field.vocab.stoi["boy"], 3)
def init_feature_metadata(
    self,
    train_data: textdata.Dataset,
    eval_data: textdata.Dataset,
    test_data: textdata.Dataset,
):
    """Build vocabularies and metadata for every configured feature field.

    For each feature in ``self.features``: optionally load pretrained
    embeddings (when the feature declares a ``pretrained_embeddings_path``),
    build the feature's vocab from the datasets unless one already exists,
    initialize pretrained embedding weights against that vocab, and store
    the resulting metadata in ``self.metadata.features[name]``.

    Args:
        train_data: training dataset used for vocab construction.
        eval_data: evaluation dataset used for vocab construction.
        test_data: test dataset used for vocab construction.

    Side effects:
        Replaces ``self.metadata.features`` with a fresh dict and prints
        progress messages.
    """
    # field metadata
    self.metadata.features = {}
    # build vocabs for features
    for name, feat in self.features.items():
        # Stays None for features without a vocab or without pretrained
        # embeddings; stored on the meta either way.
        weights = None
        if feat.use_vocab:
            pretrained_embeddings = None
            # getattr with default: not every feature type declares
            # pretrained_embeddings_path.
            pretrained_embeddings_path = getattr(
                feat, "pretrained_embeddings_path", None)
            if pretrained_embeddings_path:
                print("load pretrained embeddings from {}".format(
                    pretrained_embeddings_path))
                # NOTE(review): embeddings are loaded even when the vocab
                # already exists below — presumably still needed for weight
                # initialization; confirm loading cost is acceptable.
                pretrained_embeddings = embeddings_utils.PretrainedEmbedding(
                    pretrained_embeddings_path, feat.lower)
            if hasattr(feat, "vocab"):
                # Don't rebuild vocab
                print(
                    f"Vocab for feature {name} has been built. Not rebuilding."
                )
            else:
                print(f"Building vocab for feature {name}.")
                # Pretrained embeddings are passed in so the vocab source can
                # take the embedding vocabulary into account.
                vocab_data = self._get_data_to_build_vocab(
                    feat, train_data, eval_data, test_data,
                    pretrained_embeddings)
                feat.build_vocab(*vocab_data, min_freq=feat.min_freq)
            print("{} field's vocabulary size is {}".format(
                name, len(feat.vocab)))
            # Initialize pretrained embedding weights.
            if pretrained_embeddings:
                weights = pretrained_embeddings.initialize_embeddings_weights(
                    feat.vocab.stoi,
                    VocabMeta.UNK_TOKEN,
                    feat.embed_dim,
                    feat.embedding_init_strategy,
                )  # this is of type torch.Tensor
        meta = feat.get_meta()
        meta.pretrained_embeds_weight = weights
        self.metadata.features[name] = meta
def init_target_metadata( self, train_data: textdata.Dataset, eval_data: textdata.Dataset, test_data: textdata.Dataset, ): self.metadata.target = [] # build vocabs for label fields for name, label in self.labels.items(): if name in [Target.TARGET_PROB_FIELD, Target.TARGET_LOGITS_FIELD]: continue # Need test data to make sure we cover all of the labels in it # It is particularly important when BIO is enabled as a B-[Label] can # appear in train and eval but test can have B-[Label] and I-[Label] weights = None if label.use_vocab: if not hasattr(label, "vocab"): # Don't rebuild vocab print("Building vocab for label {}".format(name)) label.build_vocab(train_data, eval_data, test_data) else: print( f"Vocab for label {name} has been built. Not rebuilding." ) print("{} field's vocabulary size is {}".format( name, len(label.vocab.itos))) pretrained_embeddings = None pretrained_embeddings_path = getattr( label, "pretrained_embeddings_path", None) if pretrained_embeddings_path: pretrained_embeddings = embeddings_utils.PretrainedEmbedding( pretrained_embeddings_path) if pretrained_embeddings: weights = pretrained_embeddings.initialize_embeddings_weights( label.vocab.stoi, label.unk_token, label.embed_dim, label.embedding_init_strategy, ) # this is of type torch.Tensor meta = label.get_meta() meta.pretrained_embeds_weight = weights self.metadata.target.append(meta) if len(self.metadata.target) == 1: [self.metadata.target] = self.metadata.target