def init_target_metadata(
    self,
    train_data: textdata.Dataset,
    eval_data: textdata.Dataset,
    test_data: textdata.Dataset,
):
    self.metadata.target = []
    # build vocabs for label fields
    for name, label in self.labels.items():
        if name in [Target.TARGET_PROB_FIELD, Target.TARGET_LOGITS_FIELD]:
            continue
        # We need test data to make sure the vocab covers all of its labels.
        # This is particularly important when BIO tagging is enabled: B-[Label]
        # can appear in train and eval while test contains both B-[Label] and
        # I-[Label].
        weights = None
        if label.use_vocab:
            if not hasattr(label, "vocab"):  # Don't rebuild an existing vocab
                print(f"Building vocab for label {name}")
                label.build_vocab(
                    train_data,
                    eval_data,
                    test_data,
                    min_freq=getattr(label, "min_freq", 1),
                )
            else:
                print(f"Vocab for label {name} has been built. Not rebuilding.")
            print(
                "{} field's vocabulary size is {}".format(
                    name, len(label.vocab.itos)
                )
            )
            pretrained_embeddings = None
            pretrained_embeddings_path = getattr(
                label, "pretrained_embeddings_path", None
            )
            if pretrained_embeddings_path:
                pretrained_embeddings = embeddings_utils.PretrainedEmbedding(
                    pretrained_embeddings_path
                )
            if pretrained_embeddings:
                weights = pretrained_embeddings.initialize_embeddings_weights(
                    label.vocab.stoi,
                    label.unk_token,
                    label.embed_dim,
                    label.embedding_init_strategy,
                )  # this is of type torch.Tensor
        meta = label.get_meta()
        meta.pretrained_embeds_weight = weights
        self.metadata.target.append(meta)
    # Unwrap the list for the common single-target case.
    if len(self.metadata.target) == 1:
        [self.metadata.target] = self.metadata.target
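# A minimal, self-contained sketch of the build-once vocab pattern used above,
# and of why test data is passed to build_vocab. `ToyLabelField` is
# hypothetical and stands in for the real label field class; the only behavior
# assumed from the code above is that build_vocab sets a `vocab` attribute.
#
#     from collections import Counter
#
#     class ToyLabelField:
#         use_vocab = True
#
#         def build_vocab(self, *datasets, min_freq=1):
#             # Count labels across all splits so a tag seen only in test
#             # (e.g. I-PER under BIO tagging) still gets an index.
#             counts = Counter(tok for data in datasets for tok in data)
#             itos = [tok for tok, c in counts.items() if c >= min_freq]
#             self.vocab = {
#                 "itos": itos,
#                 "stoi": {tok: i for i, tok in enumerate(itos)},
#             }
#
#     label = ToyLabelField()
#     if not hasattr(label, "vocab"):  # same guard as init_target_metadata
#         label.build_vocab(
#             ["B-PER", "O"], ["B-PER", "O"], ["B-PER", "I-PER", "O"]
#         )
#     # Covered only because the test split was included in build_vocab.
#     assert "I-PER" in label.vocab["stoi"]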
def init_feature_metadata(
    self,
    train_data: textdata.Dataset,
    eval_data: textdata.Dataset,
    test_data: textdata.Dataset,
):
    # field metadata
    self.metadata.features = {}
    # build vocabs for features
    for name, feat in self.features.items():
        weights = None
        if feat.use_vocab:
            pretrained_embeddings = None
            pretrained_embeddings_path = getattr(
                feat, "pretrained_embeddings_path", None
            )
            if pretrained_embeddings_path:
                print(
                    "load pretrained embeddings from {}".format(
                        pretrained_embeddings_path
                    )
                )
                pretrained_embeddings = embeddings_utils.PretrainedEmbedding(
                    pretrained_embeddings_path, feat.lower
                )
            if hasattr(feat, "vocab"):  # Don't rebuild an existing vocab
                print(f"Vocab for feature {name} has been built. Not rebuilding.")
            else:
                print(f"Building vocab for feature {name}.")
                vocab_data = self._get_data_to_build_vocab(
                    feat, train_data, eval_data, test_data, pretrained_embeddings
                )
                feat.build_vocab(*vocab_data, min_freq=feat.min_freq)
            print(
                "{} field's vocabulary size is {}".format(name, len(feat.vocab))
            )

            # Initialize pretrained embedding weights.
            if pretrained_embeddings:
                weights = pretrained_embeddings.initialize_embeddings_weights(
                    feat.vocab.stoi,
                    VocabMeta.UNK_TOKEN,
                    feat.embed_dim,
                    feat.embedding_init_strategy,
                )  # this is of type torch.Tensor
        meta = feat.get_meta()
        meta.pretrained_embeds_weight = weights
        self.metadata.features[name] = meta
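# A hedged sketch of what initialize_embeddings_weights plausibly does, based
# only on its call sites above (stoi, unk token, embed_dim, init strategy in;
# torch.Tensor out). The real logic lives in embeddings_utils; everything
# below other than those parameters is an illustrative assumption.
#
#     import torch
#
#     def toy_initialize_embeddings_weights(
#         pretrained, stoi, unk_token, embed_dim, init_strategy="random"
#     ):
#         # One row per vocab entry, filled by the init strategy, then
#         # overwritten with pretrained vectors where a token is known.
#         weights = torch.empty(len(stoi), embed_dim)
#         if init_strategy == "random":
#             torch.nn.init.uniform_(weights, -0.25, 0.25)
#         else:  # assumed "zero" strategy
#             weights.zero_()
#         for token, idx in stoi.items():
#             if token in pretrained and token != unk_token:
#                 weights[idx] = torch.as_tensor(pretrained[token])
#         return weights
#
#     pretrained = {"hello": [0.1, 0.2], "world": [0.3, 0.4]}
#     stoi = {"<unk>": 0, "hello": 1, "oov": 2}
#     w = toy_initialize_embeddings_weights(pretrained, stoi, "<unk>", embed_dim=2)
#     assert torch.equal(w[1], torch.tensor([0.1, 0.2]))  # known token's vector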