def __init__(self, args):
    super().__init__()
    self.cache_dir = FLAGS.cache_dir
    utils.maybe_make_dir(self.cache_dir)

    self.output_mode = args.transformers_output_mode
    self.input_module = args.input_module
    self.tokenizer_required = input_module_tokenizer_name(args.input_module)

    # If set, treat these special tokens as part of input segments other than A/B.
    self._SEG_ID_CLS = None
    self._SEG_ID_SEP = None
    # self.model = transformers.RobertaModel.from_pretrained(
    #     args.input_module, cache_dir=self.cache_dir, output_hidden_states=True
    # )
    if FLAGS.saved_pretrained_model_path:
        self.model = load_pretrained_model_for_SG()
    else:
        self.model = MODEL_MAPPING[FLAGS.model](finetune_stage=True)
    self.max_pos = None

    self.tokenizer = get_my_tokenizer()
    self._sep_id = self.tokenizer.sep_token_id
    self._cls_id = self.tokenizer.cls_token_id
    self._pad_id = self.tokenizer.pad_token_id
    self._unk_id = self.tokenizer.unk_token_id

    self.parameter_setup(args)

def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    text_tokens = [t.text for t in tokens]
    indices = get_my_tokenizer().convert_tokens_to_ids(text_tokens)
    return {index_name: indices}

def forward(self, target_ids, input_ids, padding_mask) -> Dict[str, torch.Tensor]:
    tokenizer = get_my_tokenizer()
    float_input_ids = input_ids.to(torch.float).clone()
    # Replace mask ids with random floats to make sure they are not the most common element
    maskless_input_ids = torch.where(input_ids == tokenizer.mask_token_id,
                                     torch.rand_like(float_input_ids),
                                     float_input_ids)
    # Pick the most common element in each sample
    most_common_ids = maskless_input_ids.mode()[0].to(torch.long)
    vocab_scores = torch.zeros(target_ids.shape[0], target_ids.shape[1],
                               tokenizer.vocab_size).cuda()
    mask_idxs = (input_ids == tokenizer.mask_token_id).nonzero()
    for batch_idx, common_id in enumerate(
            most_common_ids.tolist()):  # TODO maybe change loss calculation to only consider masked positions
        # Sequence positions that are masked in this sample: keep only column 1 (the position index),
        # so the membership check below cannot accidentally match the batch index in column 0
        single_sample_mask_idxs = mask_idxs[(mask_idxs[:, 0] == batch_idx).nonzero().squeeze(1)][:, 1]
        for sequence_idx in range(input_ids.shape[1]):
            if sequence_idx in single_sample_mask_idxs:
                # Predict the most common non-mask id at masked positions
                vocab_scores[batch_idx, sequence_idx, common_id] = 1  # TODO check if this works
            else:
                # Copy the input id at unmasked positions
                existing_id = input_ids[batch_idx, sequence_idx]
                vocab_scores[batch_idx, sequence_idx, existing_id] = 1

    result_dict = {}
    if target_ids is not None:
        result_dict['loss'] = nn.CrossEntropyLoss()(
            vocab_scores.contiguous().view(-1, tokenizer.vocab_size),
            target_ids.contiguous().view(-1)) \
            + self.dummy_param - self.dummy_param  # No-op parameter term so the trainer sees a gradient path
    result_dict['vocab_scores'] = vocab_scores
    return result_dict

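# Illustrative sketch (not part of the original code): how the mode() trick above picks the most
# frequent non-mask id per sample. The values, including the stand-in mask id 103, are made up.
def _example_most_common_non_mask_id():
    import torch
    mask_id = 103  # hypothetical mask token id
    input_ids = torch.tensor([[5, 5, 103, 7],
                              [9, 103, 9, 2]])
    float_ids = input_ids.to(torch.float)
    # Mask positions become random floats in [0, 1), so they can never win the mode
    maskless = torch.where(input_ids == mask_id, torch.rand_like(float_ids), float_ids)
    return maskless.mode(dim=-1)[0].to(torch.long)  # tensor([5, 9])
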
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("dirt"):
        tokenizer = get_my_tokenizer()
    elif tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer

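# Usage sketch (illustrative, not from the original code): get_tokenizer falls through to None for
# unrecognised names, so callers should check the result. "roberta-base" is only an example name,
# chosen because the transformers-backed branches expose tokenize/convert_tokens_to_ids.
def _example_get_tokenizer_usage():
    tokenizer = get_tokenizer("roberta-base")
    if tokenizer is None:
        raise ValueError("unsupported tokenizer name")
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize("A sample sentence."))
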
def __init__(self, corpus_name, split_name):
    self.id_tensor_path = Path(FLAGS.blob_folder,
                               f'{corpus_name}_{split_name}_ids_tensor').as_posix()
    self.text_path = corpus_to_data[corpus_name]
    self.split_name = split_name
    self.corpus = corpus_name
    self.token_indexer = get_my_tokenizer()

def __init__(self):
    super().__init__()
    self.dense = nn.Linear(FLAGS.d_hidden, FLAGS.d_emb)
    self.LayerNorm = nn.LayerNorm(FLAGS.d_emb)
    self.decoder = nn.Linear(
        FLAGS.d_emb, get_my_tokenizer().vocab_size
    )  # TODO add activation here for consistency with ALBERT
    self.activation = get_activation()

def __init__(self, model, finetune_stage=False):
    super().__init__(Vocabulary())
    self.finetune_stage = finetune_stage
    self.model = model(finetune_stage)
    self.objective = OBJECTIVE_MAPPING[FLAGS.objective]
    self.token_indexer = get_my_tokenizer()
    if FLAGS.selfpretrained_weights_path:
        self.load_selfpretrained_weights()

def __init__(self, split):
    super().__init__()
    self.token_indexer = get_my_tokenizer()
    self.split_name = split
    self.split_chunks_folder = Path(FLAGS.blob_folder, f'{split}')
    self.chunk_paths = None
    self.pop_indices = None
    self.row_index = None
    self.current_permuted_indices = None
    self.current_chunk_path = None

def __init__(self):
    super().__init__()
    self.idx_to_embedding = nn.Embedding(get_my_tokenizer().vocab_size, FLAGS.d_emb)
    self.token_type_embeddings = nn.Embedding(TYPE_VOCAB_SIZE, FLAGS.d_emb)
    if FLAGS.pos_embeddings == 'absolute':
        self.position_embeddings = nn.Embedding(FLAGS.max_seq_length, FLAGS.d_emb)
    self.embedding_to_hidden = nn.Linear(FLAGS.d_emb, FLAGS.d_hidden)
    self.LayerNorm = InternalLayerNorm(FLAGS.d_emb)
    self.dropout = MyDropout()

def forward(self, target_ids, input_ids, padding_mask) -> Dict[str, torch.Tensor]:
    tokenizer = get_my_tokenizer()
    vocab_scores = torch.rand(target_ids.shape[0], target_ids.shape[1],
                              tokenizer.vocab_size).cuda()
    result_dict = {}
    if target_ids is not None:
        result_dict['loss'] = nn.CrossEntropyLoss()(
            vocab_scores.contiguous().view(-1, tokenizer.vocab_size),
            target_ids.contiguous().view(-1)) \
            + self.dummy_param - self.dummy_param  # To trick the trainer ;)
    result_dict['vocab_scores'] = vocab_scores
    return result_dict

def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but it doesn't do anything
    special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    if tokenizer_name.startswith("dirt"):
        tokenizer = get_my_tokenizer()
    elif tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (
        tokenizer_name.startswith("openai-gpt")
        or tokenizer_name.startswith("gpt2")
        or tokenizer_name.startswith("transfo-xl-")  # fixed typo: was "transo-xl-"
    ):
        tokenizer.add_special_tokens(
            {"bos_token": "<start>", "sep_token": "<delim>", "cls_token": "<extract>"}
        )
    # TODO: this is another place that can be simplified by the "model-before-preprocess"
    # reorganization; we can pass the tokenizer created in the model here, see issue <TBD>

    vocab_size = len(tokenizer)  # do not use tokenizer.vocab_size, it does not include newly added tokens
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))

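# Usage sketch (illustrative, not from the original code): filling an AllenNLP Vocabulary from a
# transformers tokenizer. "bert-base-uncased" is only an example name; the namespace is whatever
# input_module_tokenizer_name maps that name to.
def _example_add_transformers_vocab():
    from allennlp.data import Vocabulary
    vocab = Vocabulary()
    add_transformers_vocab(vocab, "bert-base-uncased")
    return vocab.get_vocab_size(input_module_tokenizer_name("bert-base-uncased"))
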
def forward(self, input_ids, padding_mask, masked_lm_labels=None, token_type_ids=None):
    # ENCODING
    # "Clean" means running the plain encoder stack, without the DIR predictors attached.
    clean = (FLAGS.DIR != 'combo') or (not self.training) or (
        self.finetune_stage and not FLAGS.replace_self_predictions)
    if FLAGS.DIR == 'combo':
        normalizer = FLAGS.nb_encoder_layers - FLAGS.top_down_distance
        if FLAGS.replace_self_predictions == 'alternate':
            self.learn_phase = not self.learn_phase
        elif FLAGS.replace_self_predictions == 'always':
            self.learn_phase = False
    else:
        normalizer = FLAGS.nb_encoder_layers
    if clean:
        encoder = MySequential(*[
            self.shared_encoder_block for _ in range(FLAGS.nb_encoder_layers)
        ], clean=clean)
    else:
        encoder = MySequential(*[
            self.shared_encoder_block for _ in range(FLAGS.nb_encoder_layers)
        ],
                               top_down=self.shared_top_down_predictor,
                               from_left=self.shared_from_left_predictor,
                               from_right=self.shared_from_right_predictor,
                               combiner=self.combiner,
                               clean=clean,
                               learn_phase=self.learn_phase)
    embedded_inputs = self.embedder(input_ids, token_type_ids)
    encoded, _, cum_layer_loss, layer_loss_list = encoder(embedded_inputs, padding_mask)
    # Normalize the layer loss by the number of times it is calculated
    cum_layer_loss = cum_layer_loss / normalizer

    result_dict = {}
    result_dict['encoded_activations'] = encoded

    vocab_scores = self.lm_head(encoded)

    if masked_lm_labels is not None:
        targets = process_targets_for_loss(masked_lm_labels)
        vocab_scores_contiguous = vocab_scores.contiguous().view(-1, get_my_tokenizer().vocab_size)
        MLM_loss = nn.CrossEntropyLoss()(vocab_scores_contiguous, targets)
        # Total loss mixes the DIR layer loss and the MLM loss; without DIR, it is just the MLM loss
        result_dict['loss'] = FLAGS.DIR_loss_fraction * cum_layer_loss + (
            1 - FLAGS.DIR_loss_fraction) * MLM_loss if FLAGS.DIR else MLM_loss

        self.metrics_dict['crossentropy_loss'] = MLM_loss.item()
        self.metrics_dict['perplexity'] = torch.exp(MLM_loss).item()
        if FLAGS.DIR:
            self.metrics_dict['DIR_loss'] = cum_layer_loss.item() if isinstance(
                cum_layer_loss, torch.Tensor) else cum_layer_loss
            for layer, loss in enumerate(layer_loss_list):
                self.metrics_dict[f'DIR_loss_layer_{layer}'] = loss.item() if isinstance(
                    loss, torch.Tensor) else loss

    result_dict['vocab_scores'] = vocab_scores

    return result_dict  # Dictionary format for AllenNLP trainer loop

def get_padding_token(self) -> int:
    return get_my_tokenizer().pad_token_id

def __init__(self, text_data_path, blob_path):
    super().__init__()
    self.token_indexer = get_my_tokenizer()
    self.text_data_path = text_data_path
    self.blob_path = blob_path
    self.data = self.get_data()