Example 1
    def __init__(self, args):
        super().__init__()
        self.cache_dir = FLAGS.cache_dir
        utils.maybe_make_dir(self.cache_dir)

        self.output_mode = args.transformers_output_mode
        self.input_module = args.input_module
        self.tokenizer_required = input_module_tokenizer_name(
            args.input_module)

        # If set, treat these special tokens as part of input segments other than A/B.
        self._SEG_ID_CLS = None
        self._SEG_ID_SEP = None
        # self.model = transformers.RobertaModel.from_pretrained(
        #     args.input_module, cache_dir=self.cache_dir, output_hidden_states=True
        # )
        if FLAGS.saved_pretrained_model_path:
            self.model = load_pretrained_model_for_SG()
        else:
            self.model = MODEL_MAPPING[FLAGS.model](finetune_stage=True)
        self.max_pos = None

        self.tokenizer = get_my_tokenizer()
        self._sep_id = self.tokenizer.sep_token_id
        self._cls_id = self.tokenizer.cls_token_id
        self._pad_id = self.tokenizer.pad_token_id
        self._unk_id = self.tokenizer.unk_token_id

        self.parameter_setup(args)
Example 2
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        text_tokens = [t.text for t in tokens]
        indices = get_my_tokenizer().convert_tokens_to_ids(text_tokens)
        return {index_name: indices}
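
The indexer above simply defers to the shared tokenizer's `convert_tokens_to_ids`. A minimal usage sketch, assuming an AllenNLP-style setup in which the class containing this method (called `MyTokenIndexer` here, a hypothetical name) is a registered `TokenIndexer`:

from allennlp.data import Token, Vocabulary

indexer = MyTokenIndexer()  # hypothetical name for the class shown above
tokens = [Token(t) for t in "the cat sat".split()]
ids = indexer.tokens_to_indices(tokens, Vocabulary(), "input_ids")
# ids == {"input_ids": [...]}; the actual values depend on the tokenizer's vocabulary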
Example 3
    def forward(self, target_ids, input_ids,
                padding_mask) -> Dict[str, torch.Tensor]:
        tokenizer = get_my_tokenizer()
        float_input_ids = input_ids.to(torch.float).clone()
        # Replace mask-ids with random floats to make sure they are not the most common element
        maskless_input_ids = torch.where(input_ids == tokenizer.mask_token_id,
                                         torch.rand_like(float_input_ids),
                                         float_input_ids)
        # Pick the most common element in each sample
        most_common_ids = maskless_input_ids.mode()[0].to(torch.long)
        # Use input_ids for the shape: target_ids may be None (see the check below).
        vocab_scores = torch.zeros(input_ids.shape[0], input_ids.shape[1],
                                   tokenizer.vocab_size).cuda()

        mask_idxs = (input_ids == tokenizer.mask_token_id).nonzero()
        for batch_idx, common_id in enumerate(
                most_common_ids.tolist()
        ):  #TODO maybe change loss calculation to only consider masked positions
            single_sample_mask_idxs = mask_idxs[(
                mask_idxs[:, 0] == batch_idx).nonzero().squeeze(1)]
            for sequence_idx in range(input_ids.shape[1]):
                if sequence_idx in single_sample_mask_idxs:
                    vocab_scores[batch_idx, sequence_idx,
                                 common_id] = 1  #TODO check if this works
                else:
                    existing_id = input_ids[batch_idx, sequence_idx]
                    vocab_scores[batch_idx, sequence_idx, existing_id] = 1
        result_dict = {}
        if target_ids is not None:
            result_dict['loss'] = nn.CrossEntropyLoss()(vocab_scores.contiguous().view(-1, tokenizer.vocab_size),
                                                        target_ids.contiguous().view(-1))\
                                  + self.dummy_param - self.dummy_param
        result_dict['vocab_scores'] = vocab_scores
        return result_dict
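
The mask-replacement step above is easy to check in isolation. A small illustration with made-up ids (103 stands in for the mask id): masked positions receive a random float so they cannot win the mode, and `mode()` then returns the most frequent remaining id per sample.

import torch

input_ids = torch.tensor([[5, 7, 7, 103, 7, 2]])
float_ids = input_ids.to(torch.float)
maskless = torch.where(input_ids == 103, torch.rand_like(float_ids), float_ids)
most_common = maskless.mode()[0].to(torch.long)  # tensor([7])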
Example 4
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("dirt"):
        tokenizer = get_my_tokenizer()
    elif tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
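
Dispatch is purely prefix-based, so the same helper covers both the project tokenizer and the stock transformers tokenizers. An illustrative call, assuming the corresponding pretrained files can be downloaded:

tokenizer = get_tokenizer("roberta-base")
print(tokenizer.tokenize("Subword tokenization splits rare words."))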
Example 5
    def __init__(self, corpus_name, split_name):
        self.id_tensor_path = Path(
            FLAGS.blob_folder,
            f'{corpus_name}_{split_name}_ids_tensor').as_posix()
        self.text_path = corpus_to_data[corpus_name]
        self.split_name = split_name
        self.corpus = corpus_name
        self.token_indexer = get_my_tokenizer()
Example 6
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(FLAGS.d_hidden, FLAGS.d_emb)
        self.LayerNorm = nn.LayerNorm(FLAGS.d_emb)
        self.decoder = nn.Linear(
            FLAGS.d_emb,
            get_my_tokenizer().vocab_size
        )  # TODO add activation here for consistency with ALBERT
        self.activation = get_activation()
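
These layers match a BERT/ALBERT-style LM head: a hidden-to-embedding projection, a non-linearity, LayerNorm, and a decoder over the vocabulary. One plausible composition is sketched below; this is only an assumption, since the snippet does not include the module's actual forward.

    # Sketch of a BERT/ALBERT-style forward for such a head (not shown in the snippet).
    def forward(self, hidden_states):
        x = self.activation(self.dense(hidden_states))  # [batch, seq, d_emb]
        x = self.LayerNorm(x)
        return self.decoder(x)                          # [batch, seq, vocab_size]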
Example 7
    def __init__(self, model, finetune_stage=False):
        super().__init__(Vocabulary())
        self.finetune_stage = finetune_stage
        self.model = model(finetune_stage)
        self.objective = OBJECTIVE_MAPPING[FLAGS.objective]
        self.token_indexer = get_my_tokenizer()

        if FLAGS.selfpretrained_weights_path:
            self.load_selfpretrained_weights()
Example 8
    def __init__(self, split):
        super().__init__()
        self.token_indexer = get_my_tokenizer()
        self.split_name = split
        self.split_chunks_folder = Path(FLAGS.blob_folder, f'{split}')
        self.chunk_paths = None
        self.pop_indices = None
        self.row_index = None
        self.current_permuted_indices = None
        self.current_chunk_path = None
Example 9
    def __init__(self):
        super().__init__()
        self.idx_to_embedding = nn.Embedding(get_my_tokenizer().vocab_size,
                                             FLAGS.d_emb)
        self.token_type_embeddings = nn.Embedding(TYPE_VOCAB_SIZE, FLAGS.d_emb)

        if FLAGS.pos_embeddings == 'absolute':
            self.position_embeddings = nn.Embedding(FLAGS.max_seq_length,
                                                    FLAGS.d_emb)
        self.embedding_to_hidden = nn.Linear(FLAGS.d_emb, FLAGS.d_hidden)
        self.LayerNorm = InternalLayerNorm(FLAGS.d_emb)
        self.dropout = MyDropout()
Example 10
    def forward(self, target_ids, input_ids,
                padding_mask) -> Dict[str, torch.Tensor]:
        tokenizer = get_my_tokenizer()
        # Use input_ids for the shape: target_ids may be None (see the check below).
        vocab_scores = torch.rand(input_ids.shape[0], input_ids.shape[1],
                                  tokenizer.vocab_size).cuda()
        result_dict = {}
        if target_ids is not None:
            result_dict['loss'] = nn.CrossEntropyLoss()(vocab_scores.contiguous().view(-1, tokenizer.vocab_size),
                                                        target_ids.contiguous().view(-1)) \
                                  + self.dummy_param - self.dummy_param  # To trick the trainer ;)
        result_dict['vocab_scores'] = vocab_scores
        return result_dict
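
Both baseline heads (this one and the mode-based one in Example 3) add and subtract `self.dummy_param` so that the otherwise constant loss is attached to the computation graph and the trainer's backward pass does not fail. A minimal sketch of the trick, assuming `dummy_param` is a registered `nn.Parameter`:

import torch
from torch import nn

dummy_param = nn.Parameter(torch.zeros(1))
constant_loss = torch.tensor(2.3)                 # has no grad_fn by itself
loss = constant_loss + dummy_param - dummy_param  # same value, but differentiable
loss.backward()                                   # succeeds; the gradient is zero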
Example 11
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("dirt"):
        tokenizer = get_my_tokenizer()
    elif tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (
        tokenizer_name.startswith("openai-gpt")
        or tokenizer_name.startswith("gpt2")
        or tokenizer_name.startswith("transo-xl-")
    ):
        tokenizer.add_special_tokens(
            {"bos_token": "<start>", "sep_token": "<delim>", "cls_token": "<extract>"}
        )
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))
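
A hypothetical call, assuming an AllenNLP `Vocabulary` and that `input_module_tokenizer_name` maps the model name to the namespace used above:

from allennlp.data import Vocabulary

vocab = Vocabulary()
add_transformers_vocab(vocab, "roberta-base")
print(vocab.get_vocab_size(input_module_tokenizer_name("roberta-base")))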
Example 12
    def forward(self,
                input_ids,
                padding_mask,
                masked_lm_labels=None,
                token_type_ids=None):

        # ENCODING
        clean = (FLAGS.DIR != 'combo') or (not self.training) or (
            self.finetune_stage and not FLAGS.replace_self_predictions)
        if FLAGS.DIR == 'combo':
            normalizer = FLAGS.nb_encoder_layers - FLAGS.top_down_distance

            if FLAGS.replace_self_predictions == 'alternate':
                self.learn_phase = not self.learn_phase
            elif FLAGS.replace_self_predictions == 'always':
                self.learn_phase = False
        else:
            normalizer = FLAGS.nb_encoder_layers
        shared_blocks = [
            self.shared_encoder_block for _ in range(FLAGS.nb_encoder_layers)
        ]
        if clean:
            encoder = MySequential(*shared_blocks, clean=clean)
        else:
            encoder = MySequential(*shared_blocks,
                                   top_down=self.shared_top_down_predictor,
                                   from_left=self.shared_from_left_predictor,
                                   from_right=self.shared_from_right_predictor,
                                   combiner=self.combiner,
                                   clean=clean,
                                   learn_phase=self.learn_phase)
        embedded_inputs = self.embedder(input_ids, token_type_ids)
        encoded, _, cum_layer_loss, layer_loss_list = encoder(
            embedded_inputs, padding_mask)

        cum_layer_loss = cum_layer_loss / normalizer  # Normalize layer loss by number of times it is calculated
        result_dict = {}
        result_dict['encoded_activations'] = encoded

        vocab_scores = self.lm_head(encoded)

        if masked_lm_labels is not None:
            targets = process_targets_for_loss(masked_lm_labels)
            vocab_scores_contiguous = vocab_scores.contiguous().view(
                -1,
                get_my_tokenizer().vocab_size)
            MLM_loss = nn.CrossEntropyLoss()(vocab_scores_contiguous, targets)
            result_dict['loss'] = (
                FLAGS.DIR_loss_fraction * cum_layer_loss +
                (1 - FLAGS.DIR_loss_fraction) * MLM_loss
            ) if FLAGS.DIR else MLM_loss

            self.metrics_dict['crossentropy_loss'] = MLM_loss.item()
            self.metrics_dict['perplexity'] = torch.exp(MLM_loss).item()

            if FLAGS.DIR:
                self.metrics_dict['DIR_loss'] = (
                    cum_layer_loss.item()
                    if isinstance(cum_layer_loss, torch.Tensor)
                    else cum_layer_loss)
                for layer, loss in enumerate(layer_loss_list):
                    self.metrics_dict[f'DIR_loss_layer_{layer}'] = (
                        loss.item() if isinstance(loss, torch.Tensor) else loss)

        result_dict['vocab_scores'] = vocab_scores

        return result_dict  # Dictionary format for AllenNLP trainer loop
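
For reference, the loss above is a plain convex combination of the layer loss and the MLM loss: with an illustrative `DIR_loss_fraction` of 0.25, a normalized layer loss of 4.0 and an MLM loss of 2.0 yield 0.25 · 4.0 + 0.75 · 2.0 = 2.5 (the numbers are made up; only the weighting matters).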
Example 13
    def get_padding_token(self) -> int:
        return get_my_tokenizer().pad_token_id
Example 14
    def __init__(self, text_data_path, blob_path):
        super().__init__()
        self.token_indexer = get_my_tokenizer()
        self.text_data_path = text_data_path
        self.blob_path = blob_path
        self.data = self.get_data()