def __init__(
    self,
    model: RNNModel,
    output_layer: Seq2SeqOutputLayer,
    sequence_generator: ScriptedSequenceGenerator,
    src_vocab: Vocabulary,
    trg_vocab: Vocabulary,
    dictfeat_vocab: Vocabulary,
):
    """Bundle a seq2seq model with its output layer, generator and vocabularies.

    Args:
        model: model whose ``encoder`` and ``decoder`` are re-exposed on self.
        output_layer: output layer stored as ``self.output_layer``.
        sequence_generator: generator stored as ``self.sequence_generator``.
        src_vocab: source vocabulary, stored as ``self.src_dict``.
        trg_vocab: target vocabulary, stored as ``self.trg_dict``; its EOS and
            PAD indices are cached.
        dictfeat_vocab: dict-feature vocabulary, stored as ``self.dictfeat_dict``.
    """
    BaseModel.__init__(self)

    # Model pieces; encoder/decoder are aliases into the wrapped model.
    self.model = model
    self.encoder = self.model.encoder
    self.decoder = self.model.decoder
    self.output_layer = output_layer
    self.sequence_generator = sequence_generator

    # EOS tells generation when to stop; PAD is used when shifting
    # source/target prior to decoding.
    self.trg_eos_index = trg_vocab.get_eos_index()
    self.trg_pad_index = trg_vocab.get_pad_index()

    # All three vocabularies are kept so export can handle string input.
    self.src_dict, self.trg_dict, self.dictfeat_dict = (
        src_vocab,
        trg_vocab,
        dictfeat_vocab,
    )

    self.force_eval_predictions = False
def from_config(cls, config: Config, **kwargs):
    """Create the tokenizer and vocabulary from ``config`` and build ``cls``.

    The vocabulary comes from the tokenizer's own vocab when the tokenizer is
    a ``WordPieceTokenizer``; otherwise it is loaded from ``config.vocab_file``
    through ``BertDictionary``. Extra ``kwargs`` are forwarded to ``cls``.
    """
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)

    # Configured special-token strings -> the special tokens used internally.
    replacements = dict(
        [
            (config.unk_token, UNK),
            (config.pad_token, PAD),
            (config.bos_token, BOS),
            (config.eos_token, EOS),
            (config.mask_token, MASK),
        ]
    )

    if not isinstance(tokenizer, WordPieceTokenizer):
        dictionary = BertDictionary.load(config.vocab_file)
        vocab = Vocabulary(
            dictionary.symbols, dictionary.count, replacements=replacements
        )
    else:
        wordpieces = [piece for piece, _ in tokenizer.vocab.items()]
        vocab = Vocabulary(wordpieces, replacements=replacements)

    return cls(
        columns=config.columns,
        tokenizer=tokenizer,
        add_bos_token=config.add_bos_token,
        add_eos_token=config.add_eos_token,
        use_eos_token_for_bos=config.use_eos_token_for_bos,
        max_seq_len=config.max_seq_len,
        vocab=vocab,
        **kwargs,
    )
def __init__(self, vocab: Vocabulary):
    """Wrap ``vocab`` in a ``ScriptVocabulary`` stored as ``self.vocab``."""
    super().__init__()
    # -1 is handed to every index getter (presumably the value returned when
    # that special token is absent -- confirm against Vocabulary).
    special_indices = {
        "pad_idx": vocab.get_pad_index(-1),
        "bos_idx": vocab.get_bos_index(-1),
        "eos_idx": vocab.get_eos_index(-1),
        "unk_idx": vocab.get_unk_index(-1),
    }
    self.vocab = ScriptVocabulary(list(vocab), **special_indices)
def __init__(self, poss_slots: List[str], tokenizer: nn.Module = None):
    """Build the slot-label vocabulary from ``poss_slots``.

    A copy of ``poss_slots`` is taken; the "NoLabel" token is inserted at
    position 0 and the PAD token at position 1 when they are not already
    present. ``tokenizer`` is accepted but not used here.
    """
    super().__init__()
    self.NO_LABEL = Token("NoLabel")

    slot_names = list(poss_slots)
    for position, required in enumerate((self.NO_LABEL, SpecialTokens.PAD)):
        if required not in slot_names:
            slot_names.insert(position, required)

    self.vocab = Vocabulary(slot_names)
def test_torchscript_intent_slot_output_layer(
    self, num_doc_labels, num_word_labels, seq_lens
):
    """Compare TorchScript intent-slot predictions with ``get_pred``.

    The comparison runs twice: first on word-tagging inputs, then on
    BPE-tagging inputs, which additionally carry token indices.
    """
    batch_size = len(seq_lens)

    doc_label_names = [
        OutputLayerTest._generate_random_string() for _ in range(num_doc_labels)
    ]
    doc_vocab = Vocabulary(doc_label_names)
    word_label_names = [
        OutputLayerTest._generate_random_string() for _ in range(num_word_labels)
    ]
    word_vocab = Vocabulary(word_label_names)

    eager_layer = IntentSlotOutputLayer.from_config(
        config=IntentSlotOutputLayer.Config(),
        doc_labels=doc_vocab,
        word_labels=word_vocab,
    )
    doc_logits = OutputLayerTest._generate_doc_classification_inputs(
        batch_size, num_doc_labels
    )
    word_logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
        batch_size, num_word_labels, seq_lens
    )
    scripted_layer = eager_layer.torchscript_predictions()

    # Pass 1: word-level logits.
    word_inputs = (doc_logits, word_logits)
    pt_output = eager_layer.get_pred(
        word_inputs, None, {"seq_lens": seq_lens_tensor}
    )[1]
    ts_output = scripted_layer(word_inputs, seq_lens_tensor)
    self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
    self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)

    # Pass 2: BPE-level logits with token indices.
    bpe_batch = OutputLayerTest._generate_bpe_tagging_inputs(
        batch_size, num_word_labels, seq_lens
    )
    word_bpe_logits, seq_lens_tensor, token_indices_tensor = bpe_batch
    bpe_inputs = (doc_logits, word_bpe_logits)
    bpe_context = {
        "seq_lens": seq_lens_tensor,
        "token_indices": token_indices_tensor,
    }
    pt_output = eager_layer.get_pred(bpe_inputs, None, bpe_context)[1]
    ts_output = scripted_layer(bpe_inputs, seq_lens_tensor, token_indices_tensor)
    self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
    self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)
def _prepare_dec_target(
    self, dec_source: List[int], clean_input_tokens: List[int], vocab: Vocabulary
) -> List[int]:
    """Build the decoder target aligned with ``dec_source``.

    Positions where ``dec_source`` holds the mask index receive the
    corresponding token from ``clean_input_tokens``; every other position
    receives the pad index. The two inputs are paired with ``zip``, so the
    result is as long as the shorter of them.
    """
    dec_target: List[int] = []
    for source_token, real_target_token in zip(dec_source, clean_input_tokens):
        if source_token != vocab.get_mask_index():
            # Unmasked position: nothing to predict here.
            dec_target.append(vocab.get_pad_index())
        else:
            dec_target.append(real_target_token)
    return dec_target
def test_torchscript_intent_slot_output_layer(
    self, num_doc_labels, num_word_labels, seq_lens
):
    """Compare TorchScript intent-slot predictions with ``get_pred``.

    The scripted layer takes the same context dict as ``get_pred``. The first
    scripted call is also checked not to print the log_softmax
    implicit-dimension deprecation message.
    """
    batch_size = len(seq_lens)

    doc_label_names = [
        OutputLayerTest._generate_random_string() for _ in range(num_doc_labels)
    ]
    doc_vocab = Vocabulary(doc_label_names)
    word_label_names = [
        OutputLayerTest._generate_random_string() for _ in range(num_word_labels)
    ]
    word_vocab = Vocabulary(word_label_names)

    eager_layer = IntentSlotOutputLayer.from_config(
        config=IntentSlotOutputLayer.Config(),
        doc_labels=doc_vocab,
        word_labels=word_vocab,
    )
    doc_logits = OutputLayerTest._generate_doc_classification_inputs(
        batch_size, num_doc_labels
    )
    word_logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
        batch_size, num_word_labels, seq_lens
    )
    context = {"seq_lens": seq_lens_tensor}
    scripted_layer = eager_layer.torchscript_predictions()

    # Pass 1: word-level logits.
    word_inputs = (doc_logits, word_logits)
    pt_output = eager_layer.get_pred(word_inputs, None, context)[1]
    # NOTE(review): redirect_stdout is called without a target; this only works
    # if it is a project helper rather than contextlib.redirect_stdout -- confirm.
    with redirect_stdout() as redirected_stdout:
        ts_output = scripted_layer(word_inputs, context)
        captured = redirected_stdout.getvalue()
        assert (
            "Implicit dimension choice for log_softmax has been deprecated"
            not in captured
        )
    self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
    self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)

    # Pass 2: BPE-level logits with token indices in the context.
    bpe_batch = OutputLayerTest._generate_bpe_tagging_inputs(
        batch_size, num_word_labels, seq_lens
    )
    word_bpe_logits, seq_lens_tensor, token_indices_tensor = bpe_batch
    context = {"seq_lens": seq_lens_tensor, "token_indices": token_indices_tensor}
    bpe_inputs = (doc_logits, word_bpe_logits)
    pt_output = eager_layer.get_pred(bpe_inputs, None, context)[1]
    ts_output = scripted_layer(bpe_inputs, context)
    self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
    self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)
def __init__(self, tokenizer: Tokenizer, vocab: Vocabulary, max_seq_len: int):
    """Store the tokenizer, a scriptable vocabulary and the length limit.

    PAD and UNK indices are requested without an argument, while BOS and EOS
    are requested with -1 (presumably the value used when the token is
    missing -- confirm against Vocabulary).
    """
    super().__init__()
    self.tokenizer = tokenizer

    special_indices = {
        "pad_idx": vocab.get_pad_index(),
        "bos_idx": vocab.get_bos_index(-1),
        "eos_idx": vocab.get_eos_index(-1),
        "unk_idx": vocab.get_unk_index(),
    }
    self.vocab = ScriptVocabulary(list(vocab), **special_indices)
    self.vocab_lookup = VocabLookup(self.vocab)
    self.max_seq_len = max_seq_len
def test_wordblstm_export_to_caffe2(
    self, export_num_words, num_word_classes, test_num_words, num_predictions
):
    """Export each word-tagging config to Caffe2 and compare scores.

    For every config in ``WORD_CONFIGS`` the PyTorch model is exported, the
    prediction net is run ``num_predictions`` times on random inputs, and the
    Caffe2 word scores are compared with the scores from ``get_pred``.
    """
    for word_config in WORD_CONFIGS:
        config = self._get_config(WordTaggingTask.Config, word_config)
        tensorizers, _ = _NewTask._init_tensorizers(config)

        word_labels = [SpecialTokens.PAD, SpecialTokens.UNK, "NoLabel", "person"]
        tensorizers["labels"].vocab = Vocabulary(word_labels)
        tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
        py_model = _NewTask._init_model(config.model, tensorizers)

        dummy_test_input = self._get_rand_input_intent_slot(
            BATCH_SIZE, W_VOCAB_SIZE, test_num_words
        )
        exporter = ModelExporter(
            ModelExporter.Config(),
            py_model.get_export_input_names(tensorizers),
            dummy_test_input,
            py_model.vocab_to_export(tensorizers),
            py_model.get_export_output_names(tensorizers),
        )

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".{}".format(".predictor")
        ) as pred_file:
            exporter.export_to_caffe2(py_model, pred_file.name)
            workspace.ResetWorkspace()

        pred_net = pe.prepare_prediction_net(pred_file.name, CAFFE2_DB_TYPE)

        for _ in range(num_predictions):
            test_inputs = self._get_rand_input_intent_slot(
                BATCH_SIZE, W_VOCAB_SIZE, test_num_words
            )
            self._feed_c2_input(
                workspace, test_inputs, exporter.input_names, exporter.vocab_map
            )
            workspace.RunNetOnce(pred_net)
            # One output blob per word label.
            word_output_names = [
                "{}:{}".format("word_scores", class_name)
                for class_name in word_labels
            ]

            py_model.eval()
            py_outs = py_model(*test_inputs)
            context = {"seq_lens": test_inputs[-1]}
            target = None
            pred, score = py_model.get_pred(py_outs, target, context)

            c2_word_out = [
                value
                for blob_name in word_output_names
                for value in list(workspace.FetchBlob(blob_name))
            ]

            expected_scores = (
                torch.transpose(score, 1, 2).contiguous().view(-1).detach().numpy()
            )
            np.testing.assert_array_almost_equal(
                expected_scores,
                np.array(c2_word_out).flatten(),
            )
def test_seq_nn_export_to_caffe2(
    self,
    export_num_words,
    num_doc_classes,
    test_num_words,
    num_predictions,
    test_num_seq,
):
    """Export the SeqNN model to Caffe2 and compare outputs with PyTorch.

    The prediction net is run ``num_predictions`` times on random inputs and
    its outputs are compared with the log-softmax of the PyTorch outputs.
    """
    config = self._get_config(SeqNNTask.Config, SEQ_NN_CONFIG)
    tensorizers, _ = _NewTask._init_tensorizers(config)

    doc_labels = [SpecialTokens.UNK, "cu:other", "cu:address_Person"]
    tensorizers["labels"].vocab = Vocabulary(doc_labels)
    tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
    py_model = _NewTask._init_model(config.model, tensorizers)

    dummy_test_input = self._get_seq_nn_rand_input(
        BATCH_SIZE, W_VOCAB_SIZE, test_num_words, test_num_seq
    )
    exporter = ModelExporter(
        ModelExporter.Config(),
        py_model.get_export_input_names(tensorizers),
        dummy_test_input,
        py_model.vocab_to_export(tensorizers),
        py_model.get_export_output_names(tensorizers),
    )

    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".{}".format(".predictor")
    ) as pred_file:
        output_names = exporter.export_to_caffe2(py_model, pred_file.name)
        workspace.ResetWorkspace()

    pred_net = pe.prepare_prediction_net(pred_file.name, CAFFE2_DB_TYPE)

    for _ in range(num_predictions):
        test_inputs = self._get_seq_nn_rand_input(
            BATCH_SIZE, W_VOCAB_SIZE, test_num_words, test_num_seq
        )
        self._feed_c2_input(
            workspace, test_inputs, exporter.input_names, exporter.vocab_map
        )
        workspace.RunNetOnce(pred_net)
        c2_out = [list(workspace.FetchBlob(blob_name)) for blob_name in output_names]

        py_model.eval()
        py_outs = py_model(*test_inputs)
        # Do log_softmax since we do that before exporting predictor nets
        py_outs = F.log_softmax(py_outs, 1)

        expected = py_outs.view(-1).detach().numpy()
        actual = np.array(c2_out).flatten()
        np.testing.assert_array_almost_equal(expected, actual)
def __init__(
    self,
    pretrained_embeddings_path: str,
    vocab: Vocabulary,
    embedding_dim: int,
    mlp_layer_dims: Optional[Sequence[int]] = None,
    lowercase_tokens: bool = False,
    skip_header: bool = True,
    delimiter: str = " ",
) -> None:
    """Create an embedding initialised from pretrained vectors plus an MLP.

    Args:
        pretrained_embeddings_path: location handed to ``PretrainedEmbedding``.
        vocab: vocabulary whose ``idx`` map sizes and indexes the embedding.
        embedding_dim: width of each embedding vector.
        mlp_layer_dims: output sizes of the Linear+ReLU layers stacked on top
            of the embedding; ``None`` means no layers.
        lowercase_tokens: forwarded to ``PretrainedEmbedding``.
        skip_header: forwarded to ``PretrainedEmbedding``.
        delimiter: forwarded to ``PretrainedEmbedding``.
    """
    super().__init__()

    pretrained = PretrainedEmbedding(
        pretrained_embeddings_path,
        lowercase_tokens=lowercase_tokens,
        skip_header=skip_header,
        delimiter=delimiter,
    )
    initial_weight = pretrained.initialize_embeddings_weights(
        vocab.idx,
        vocab.unk_token,
        embedding_dim,
        EmbedInitStrategy.RANDOM,
    )
    self.embedding = nn.Embedding(
        len(vocab.idx),
        embedding_dim,
        _weight=initial_weight,
        padding_idx=vocab.get_pad_index(),
    )

    # Initialize unk embedding with zeros to guard the model against
    # randomized decisions based on unknown words.
    unk_index = vocab.get_unk_index()
    if unk_index >= 0:
        self.embedding.weight.data[unk_index].fill_(0.0)

    # Each MLP layer maps the previous width to the next configured width,
    # starting from the embedding width.
    if mlp_layer_dims is None:
        mlp_layer_dims = []
    input_widths = [embedding_dim] + list(mlp_layer_dims)
    mlp_layers = []
    for in_width, out_width in zip(input_widths, mlp_layer_dims):
        mlp_layers.append(nn.Sequential(nn.Linear(in_width, out_width), nn.ReLU()))
    self.mlp = nn.Sequential(*mlp_layers)

    if mlp_layer_dims:
        self.output_dim = mlp_layer_dims[-1]
    else:
        self.output_dim = embedding_dim
def gen_masked_source_target(self, tokens: List[int], vocab: Vocabulary):
    """Produce masked decoder source/target ids from a tree annotation.

    The cleaned tokens are joined, upper-cased and parsed as an ``Annotation``;
    the single child of its root is masked by ``gen_masked_tree``. If parsing
    fails, the exception is printed and an all-mask source with an all-pad
    target (both ``len(tokens)`` long) is returned.

    Returns:
        A ``(dec_source, dec_target)`` pair of index lists.
    """
    cleaned_tokens = self.clean_eos_bos(tokens)
    original_target_string = " ".join(
        [vocab[idx] for idx in cleaned_tokens]
    ).upper()

    try:
        annotation = Annotation(
            original_target_string,
            accept_flat_intents_slots=self.accept_flat_intents_slots,
        )
    except Exception as e:
        # This should never happen other than when testing
        print(e, original_target_string)
        all_masked = [vocab.idx[vocab.mask_token] for _ in tokens]
        all_padded = [vocab.idx[vocab.pad_token] for _ in tokens]
        return all_masked, all_padded

    assert len(annotation.root.children) == 1
    mask_tree_str = self.gen_masked_tree(
        annotation.root.children[0], vocab.mask_token
    )

    # We call .split() instead of the tensorizer's tokenize() because the
    # string contains the special MASK token; tokenize() may lowercase it or
    # split it in unexpected ways. As a temporary workaround we split on " "
    # and lowercase every token that is not a special token.
    masked_tokens: List[str] = [
        SPECIAL_TOKENS.get(piece, piece.lower())
        for piece in mask_tree_str.split(" ")
    ]

    dec_source = [vocab.idx.get(piece) for piece in masked_tokens]
    dec_target = self._prepare_dec_target(dec_source, cleaned_tokens, vocab)

    if self.use_bos:
        # Either the BOS itself is masked (and becomes the target), or it is
        # visible in the source and the target is padding.
        if self.should_mask():
            bos_source, bos_target = vocab.get_mask_index(), vocab.get_bos_index()
        else:
            bos_source, bos_target = vocab.get_bos_index(), vocab.get_pad_index()
        dec_source.insert(0, bos_source)
        dec_target.insert(0, bos_target)

    if self.use_eos:
        if self.should_mask():
            eos_source, eos_target = vocab.get_mask_index(), vocab.get_eos_index()
        else:
            eos_source, eos_target = vocab.get_eos_index(), vocab.get_pad_index()
        dec_source.append(eos_source)
        dec_target.append(eos_target)

    return dec_source, dec_target
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, SpecialToken] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
):
    """
    Build a PyText vocabulary for models pre-trained using Fairseq modules.

    Args:
        vocab_file: path opened through ``PathManager`` and handed to
            ``dictionary_class.load``.
        dictionary_class: any Fairseq Dictionary class; used to load the file.
        special_token_replacements: mapping from the file's special-token
            strings to special tokens; when empty or ``None`` the Fairseq
            defaults (``<pad>``, ``<s>``, ``</s>``, ``<unk>``, ``<mask>``)
            are used.
        max_vocab: passed to ``finalize`` as ``nwords``.
        min_count: passed to ``finalize`` as ``threshold``.
        tokens_to_add: extra symbols appended to the dictionary.

    Returns:
        A ``Vocabulary`` built from the dictionary's symbols and counts.
    """
    replacements = special_token_replacements
    if not replacements:
        replacements = {
            "<pad>": SpecialTokens.PAD,
            "<s>": SpecialTokens.BOS,
            "</s>": SpecialTokens.EOS,
            "<unk>": SpecialTokens.UNK,
            "<mask>": SpecialTokens.MASK,
        }

    with PathManager.open(vocab_file) as f:
        dictionary = dictionary_class.load(f)

    # finalize will sort the dict based on frequency so only do this if
    # a min_count or max_vocab size is specified
    needs_finalize = min_count > 0 or max_vocab > 0
    if needs_finalize:
        dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)

    for extra_token in tokens_to_add or []:
        dictionary.add_symbol(extra_token)

    return Vocabulary(
        dictionary.symbols,
        dictionary.count,
        replacements=replacements,
    )
def from_config(cls, config: Config, tensorizers):
    """Build the model's encoder, decoders and output layer from ``config``.

    Side effect: the ``has_answer`` tensorizer's vocabulary is replaced with
    the fixed labels ``["False", "True"]``.

    Args:
        config: model config supplying the encoder, position decoder,
            has-answer decoder and output layer sub-configs, plus ``is_kd``.
        tensorizers: mapping that must contain ``"has_answer"`` and
            ``"squad_input"`` entries.

    Returns:
        A ``cls`` instance wired with the created modules.
    """
    has_answer_labels = ["False", "True"]
    tensorizers["has_answer"].vocab = Vocabulary(has_answer_labels)
    vocab = tensorizers["squad_input"].vocab

    encoder = create_module(
        config.encoder,
        output_encoded_layers=True,
        padding_idx=vocab.get_pad_index(),
        # len() is the idiomatic spelling of the former vocab.__len__() call.
        vocab_size=len(vocab),
    )
    # Two outputs per position from the position decoder.
    pos_decoder = create_module(
        config.pos_decoder, in_dim=encoder.representation_dim, out_dim=2
    )
    has_ans_decoder = create_module(
        config.has_ans_decoder,
        in_dim=encoder.representation_dim,
        out_dim=len(has_answer_labels),
    )
    output_layer = create_module(
        config.output_layer, labels=has_answer_labels, is_kd=config.is_kd
    )
    return cls(
        encoder, pos_decoder, has_ans_decoder, output_layer, is_kd=config.is_kd
    )
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, Token] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
) -> Vocabulary:
    """
    Build a PyText vocabulary for models pre-trained using Fairseq modules.

    Args:
        vocab_file: passed directly to ``dictionary_class.load``.
        dictionary_class: any Fairseq Dictionary class; used to load the file.
        special_token_replacements: forwarded to ``Vocabulary`` as
            ``replacements``.
        max_vocab: passed to ``finalize`` as ``nwords``.
        min_count: passed to ``finalize`` as ``threshold``.
        tokens_to_add: extra symbols appended to the dictionary.

    Returns:
        A ``Vocabulary`` built from the dictionary's symbols and counts.
    """
    dictionary = dictionary_class.load(vocab_file)

    # finalize will sort the dict based on frequency so only do this if
    # a min_count or max_vocab size is specified
    needs_finalize = min_count > 0 or max_vocab > 0
    if needs_finalize:
        dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)

    for extra_token in tokens_to_add or []:
        dictionary.add_symbol(extra_token)

    return Vocabulary(
        dictionary.symbols,
        dictionary.count,
        replacements=special_token_replacements,
    )
def test_doc_classification_output_layer(self):
    """The loss ``ignore_index`` is 0 with a leading PAD label, else -1."""
    tensorizer = LabelTensorizer()
    cases = (
        ([SpecialTokens.PAD, "foo", "bar"], 0),
        # use default pad
        (["foo", "bar"], -1),
    )
    for label_names, expected_ignore_index in cases:
        tensorizer.vocab = Vocabulary(label_names)
        layer = ClassificationOutputLayer.from_config(
            config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
            labels=tensorizer.vocab,
        )
        self.assertEqual(layer.loss_fn.ignore_index, expected_ignore_index)
def __init__(self, num_tags, labels: Vocabulary, *args) -> None:
    """Initialise the base layer with the label list and create the CRF.

    The CRF's ``ignore_index`` is the label vocabulary's pad index, with
    ``Padding.DEFAULT_LABEL_PAD_IDX`` passed as the argument to
    ``get_pad_index``.
    """
    super().__init__(list(labels), *args)
    default_pad = Padding.DEFAULT_LABEL_PAD_IDX
    pad_index = labels.get_pad_index(default_pad)
    self.crf = CRF(
        num_tags=num_tags,
        ignore_index=pad_index,
        default_label_pad_index=Padding.DEFAULT_LABEL_PAD_IDX,
    )
def test_lookup_tokens(self):
    """``lookup_tokens`` returns ids and character spans, with optional BOS/EOS."""
    text = "let's tokenize this"
    tokenizer = Tokenizer()
    vocab = Vocabulary(text.split() + [BOS, EOS])

    # Without BOS/EOS: only the three text tokens.
    plain = lookup_tokens(
        text,
        tokenizer=tokenizer,
        vocab=vocab,
        add_bos_token=False,
        add_eos_token=False,
    )
    tokens, start_idx, end_idx = plain
    self.assertEqual(tokens, [0, 1, 2])
    self.assertEqual(start_idx, (0, 6, 15))
    self.assertEqual(end_idx, (5, 14, 19))

    # With BOS/EOS: the added tokens carry -1 for both span ends.
    wrapped = lookup_tokens(
        text,
        tokenizer=tokenizer,
        vocab=vocab,
        add_bos_token=True,
        add_eos_token=True,
    )
    tokens, start_idx, end_idx = wrapped
    self.assertEqual(tokens, [3, 0, 1, 2, 4])
    self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
    self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
def from_config(cls, config: Config, **kwargs):
    """
    Parse the config associated with the tensorizer and create both the
    tokenizer and the Vocabulary object. The extra arguments passed as kwargs
    allow this function to be reused with a variable number of arguments
    (e.g. by classes which derive from this class).
    """
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)

    # BERT-style special-token strings -> the special tokens used internally.
    special_token_replacements = dict(
        [
            ("[UNK]", UNK),
            ("[PAD]", PAD),
            ("[CLS]", BOS),
            ("[MASK]", MASK),
            ("[SEP]", EOS),
        ]
    )

    if not isinstance(tokenizer, WordPieceTokenizer):
        # The opened handle (not the path) is what gets passed as vocab_file.
        with PathManager.open(config.vocab_file) as vocab_handle:
            vocab = build_fairseq_vocab(
                dictionary_class=BertDictionary,
                vocab_file=vocab_handle,
                special_token_replacements=special_token_replacements,
            )
    else:
        wordpieces = [piece for piece, _ in tokenizer.vocab.items()]
        vocab = Vocabulary(wordpieces, replacements=special_token_replacements)

    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        **kwargs,
    )
def test_torchscript_word_tagging_output_layer(self, num_labels, seq_lens):
    """Compare TorchScript predictions with ``get_pred`` for both tagging layers."""
    batch_size = len(seq_lens)
    label_names = [
        OutputLayerTest._generate_random_string() for _ in range(num_labels)
    ]
    vocab = Vocabulary(label_names)

    word_layer = WordTaggingOutputLayer.from_config(
        config=WordTaggingOutputLayer.Config(), labels=vocab
    )
    crf_layer = CRFOutputLayer.from_config(
        config=CRFOutputLayer.Config(), labels=vocab
    )

    logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
        batch_size, num_labels, seq_lens
    )
    context = {"seq_lens": seq_lens_tensor}

    scripted_word_layer = word_layer.torchscript_predictions()
    scripted_crf_layer = crf_layer.torchscript_predictions()

    # Each eager layer is checked against its own scripted counterpart.
    for eager_layer, scripted_layer in (
        (word_layer, scripted_word_layer),
        (crf_layer, scripted_crf_layer),
    ):
        self._validate_word_tagging_result(
            eager_layer.get_pred(logits, None, context)[1],
            scripted_layer(logits, context),
            vocab,
        )
class SlotLabelTransform(Transform):
    """Convert a slot annotation string into one label id per token."""

    def __init__(self, poss_slots: List[str], tokenizer: nn.Module = None):
        """Build the slot-label vocabulary from ``poss_slots``.

        A copy of ``poss_slots`` is taken; "NoLabel", PAD and UNK are inserted
        at positions 0, 1 and 2 respectively when not already present.
        ``tokenizer`` is accepted but not used here.
        """
        super().__init__()
        self.NO_LABEL = Token("NoLabel")

        slot_names = list(poss_slots)
        required = (self.NO_LABEL, SpecialTokens.PAD, SpecialTokens.UNK)
        for position, special in enumerate(required):
            if special not in slot_names:
                slot_names.insert(position, special)

        self.vocab = Vocabulary(slot_names)

    def process_slots(self, slots_list: str) -> List[Slot]:
        """Parse a comma-separated ``start:end:name`` string into ``Slot`` objects.

        An empty string yields an empty list. Everything after the second
        colon of an entry is taken as the slot name.
        """
        if slots_list == "":
            return []

        parsed: List[Slot] = []
        for entry in slots_list.split(","):
            first_colon = entry.find(":")
            second_colon = entry.find(":", first_colon + 1)
            parsed.append(
                Slot(
                    entry[second_colon + 1:],
                    int(entry[:first_colon]),
                    int(entry[first_colon + 1:second_colon]),
                )
            )
        return parsed

    def forward(self, text_and_slots):
        """
        Turn slot labels and text into a list of token labels with the same
        length as the number of tokens in the text.
        """
        # The first element's values are unpacked in order as
        # (tokens, start offsets, end offsets).
        tokens, start, end = text_and_slots[0].values()
        slots = self.process_slots(text_and_slots[1])

        token_i = 0
        slot_i = 0
        labels: List[str] = []
        while token_i < len(tokens) and slot_i < len(slots):
            slot = slots[slot_i]
            if int(start[token_i]) > slot.end:
                # Token starts after this slot ends: move on to the next slot.
                slot_i += 1
                continue
            overlaps = int(end[token_i]) > slot.start
            labels.append(slot.label if overlaps else self.NO_LABEL)
            token_i += 1

        # Tokens left once the slots are exhausted carry no label.
        labels.extend([self.NO_LABEL] * (len(tokens) - token_i))
        return {"slot_labels": torch.tensor(self.vocab.lookup_all(labels))}

    @property
    def is_jitable(self) -> bool:
        return False
def setUp(self):
    """Provide five single-field text rows and a vocabulary of their words."""
    texts = (
        "hello world",
        "feeling lucky today",
        "hello",
        "lucky world",
        "today world",
    )
    self.input_iterator = [{"text": text} for text in texts]
    self.vocab = Vocabulary(["hello", "world", "feeling", "lucky", "today"])
def __init__(
    self,
    model: RNNModel,
    output_layer: Seq2SeqOutputLayer,
    src_vocab: Vocabulary,
    trg_vocab: Vocabulary,
    dictfeat_vocab: Vocabulary,
    generator_config=None,
):
    """Bundle a seq2seq model with its output layer and vocabularies.

    Args:
        model: model whose ``encoder`` and ``decoder`` are re-exposed on self.
        output_layer: output layer stored as ``self.output_layer``.
        src_vocab: source vocabulary, stored as ``self.src_dict``.
        trg_vocab: target vocabulary, stored as ``self.trg_dict``; its EOS and
            PAD indices are cached.
        dictfeat_vocab: dict-feature vocabulary, stored as ``self.dictfeat_dict``.
        generator_config: when not ``None``, ``self.sequence_generator_builder``
            is set to a callable that creates the generator from a list of
            models; otherwise that attribute is not set at all.
    """
    BaseModel.__init__(self)
    self.model = model
    self.encoder = self.model.encoder
    self.decoder = self.model.decoder
    self.output_layer = output_layer

    # Sequence generation is expected to be used only for inference, and to
    # take the trained model(s) as input. Creating the sequence generator
    # may apply Torchscript JIT compilation and quantization, which modify
    # the input model. Therefore, we want to create the sequence generator
    # after training.
    if generator_config is not None:

        def _build_generator(models):
            return create_module(
                generator_config, models, trg_vocab.get_eos_index()
            )

        self.sequence_generator_builder = _build_generator
    self.sequence_generator = None

    # Disable predictions until testing (see above comment about sequence
    # generator). If this functionality is needed, a new sequence generator
    # with a copy of the model should be used for each epoch during the
    # EVAL stage.
    self.force_eval_predictions = False

    # EOS tells generation when to stop; PAD is used when shifting
    # source/target prior to decoding.
    self.trg_eos_index = trg_vocab.get_eos_index()
    self.trg_pad_index = trg_vocab.get_pad_index()

    # All three vocabularies are kept so export can handle string input.
    self.src_dict, self.trg_dict, self.dictfeat_dict = (
        src_vocab,
        trg_vocab,
        dictfeat_vocab,
    )
    log_class_usage(__class__)
def __init__(self, bpe, dictionary: Dictionary):
    """Keep the BPE object and build a ``Vocabulary`` from ``dictionary``.

    The pad, bos and eos token strings are read from the dictionary at the
    positions it reports via ``pad()``, ``bos()`` and ``eos()``.
    """
    self.bpe = bpe

    pad_token = str(dictionary[dictionary.pad()])
    bos_token = str(dictionary[dictionary.bos()])
    eos_token = str(dictionary[dictionary.eos()])
    self.vocab = Vocabulary(
        dictionary.symbols,
        pad_token=pad_token,
        bos_token=bos_token,
        eos_token=eos_token,
    )

    # Shortcuts to the vocabulary's own BOS/EOS tokens.
    self.bos = self.vocab.bos_token
    self.eos = self.vocab.eos_token
def test_create_word_tagging_output_layer(self):
    """Configured label weights end up in the loss weight tensor."""
    tensorizer = LabelTensorizer()
    tensorizer.vocab = Vocabulary(["foo", "bar"])
    tensorizer.pad_idx = 0

    layer_config = WordTaggingOutputLayer.Config(label_weights={"foo": 2.2})
    layer = WordTaggingOutputLayer.from_config(
        config=layer_config,
        labels=tensorizer.vocab,
    )

    # "foo" carries the configured weight; "bar" is expected to be 1.
    expected_weights = np.array([2.2, 1])
    actual_weights = layer.loss_fn.weight.detach().numpy()
    np.testing.assert_array_almost_equal(expected_weights, actual_weights)
class LabelTransform(Transform):
    """Map label names to ids using a vocabulary built from ``label_names``."""

    def __init__(self, label_names: List[str]):
        super().__init__()
        self.vocab = Vocabulary(label_names)

    def forward(self, label: str) -> Dict[str, torch.Tensor]:
        """Look ``label`` up in the vocabulary and return it as a long tensor."""
        label_ids = torch.tensor(self.vocab.lookup_all(label), dtype=torch.long)
        return {"label_ids": label_ids}

    @property
    def is_jitable(self) -> bool:
        return False
def build_dumb_slot_labelling_model():
    """Return a slot-labelling model built from fixed placeholder arguments.

    All arguments are passed positionally to ``build_slot_labelling_model``;
    the last one is a four-entry vocabulary (UNK, PAD, "the", "cat").
    """
    hundred_tens = [10] * 100
    tiny_vocab = Vocabulary([SpecialTokens.UNK, SpecialTokens.PAD, "the", "cat"])
    return build_slot_labelling_model(
        None,
        5,
        100,
        hundred_tens,
        0.4,
        False,
        None,
        None,
        5,
        tiny_vocab,
    )
def _build_vocab(
    self, vocab_file: str, max_vocab: int, min_count: int
) -> Vocabulary:
    """
    Build Vocab for XLM by calling the vocab reader associated with the model
    source: ``read_fairseq_vocab`` when ``self.is_fairseq`` is set, otherwise
    ``read_vocab``.
    """
    reader = read_fairseq_vocab if self.is_fairseq else read_vocab
    vocab_list, counts, replacements = reader(vocab_file, max_vocab, min_count)
    return Vocabulary(vocab_list, counts, replacements=replacements)
def gen_masked_source_target(self, tokens: List[int], vocab: Vocabulary):
    """Mask a random subset of positions and derive the decoder target.

    The number of masks is drawn with ``self.random.randint`` from
    ``self.minimum_masks`` and ``len(tokens)``; the masked positions are then
    drawn without replacement.

    Returns:
        A ``(dec_source, dec_target)`` pair of index lists.
    """
    num_masks = self.random.randint(self.minimum_masks, len(tokens))
    masked_positions: Set[int] = set(
        self.random.choice(len(tokens), size=num_masks, replace=False)
    )

    dec_source: List[int] = []
    for position, token in enumerate(tokens):
        if position in masked_positions:
            dec_source.append(vocab.get_mask_index())
        else:
            dec_source.append(token)

    dec_target = self._prepare_dec_target(dec_source, tokens, vocab)
    return dec_source, dec_target
def build_legacy_pytext_vocab_pipeline(vocab_file):
    """Build a tokenize-then-lookup pipeline backed by a PyText ``Vocabulary``.

    The vocabulary is ordered by descending frequency of the symbols counted
    in ``vocab_file``, with ``"<unk>"`` inserted at index 0.

    Args:
        vocab_file: path of the text file to count symbols from.

    Returns:
        A ``(pipeline, None, None)`` tuple.
    """
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")

    # The context manager closes the handle even if reading fails; the file
    # was previously opened and never closed.
    with open(vocab_file, "r") as f:
        # NOTE(review): iterating over line.rstrip() yields single characters,
        # so this counts characters rather than whitespace-separated tokens.
        # Behaviour is kept as-is -- confirm this is intended.
        vocab_counter = Counter(token for line in f for token in line.rstrip())

    # most_common() orders by descending count, ties in first-seen order.
    vocab_list = [symbol for symbol, _ in vocab_counter.most_common()]
    vocab_list.insert(0, "<unk>")

    pipeline = sequential_transforms(
        tokenizer_func(tokenizer),
        PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")),
    )
    return pipeline, None, None