def __init__(self, tokenizer: PreTrainedTokenizer, filter_callable: Callable[[str], bool] = default_filter,
                 prefix: str = '', suffix: str = ' .', pmin: int = 0, pmax: int = 0, smin: int = 0, smax: int = 0,
                 seed: int = 1234, eval_sentence: str = None, **_):
        self.tokenizer = tokenizer

        if isinstance(tokenizer, BertTokenizer):
            vocab = tokenizer.vocab.keys()
        elif isinstance(tokenizer, GPT2Tokenizer):
            vocab = tokenizer.encoder.keys()
        else:
            raise ValueError('Access to vocab is currently only implemented for BertTokenizer and GPT2Tokenizer')

        self.words = [x for x in vocab if not filter_callable or filter_callable(x)]
        self.prefix = tokenizer.tokenize(prefix)
        self.suffix = tokenizer.tokenize(suffix)
        self.pmin = pmin
        self.pmax = pmax
        self.smin = smin
        self.smax = smax
        self.eval_sentence = eval_sentence

        if seed:
            random.seed(seed)
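A minimal sketch of a vocabulary filter that could be passed as `filter_callable` above; the name `keep_alpha_tokens` is hypothetical, and it only handles the WordPiece (`##`) and GPT-2 (`Ġ`) subword markers that occur in the two supported vocabularies.

def keep_alpha_tokens(token: str) -> bool:
    # Strip the subword marker, then keep only entries whose remaining
    # surface form is purely alphabetic.
    surface = token[2:] if token.startswith("##") else token.lstrip("Ġ")
    return bool(surface) and surface.isalpha()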
Example 2
def get_word_to_id_map(tokenizer: PreTrainedTokenizer,
                       word_counts=None,
                       max_words: int = -1):
    """
    Return a mapping from all tokens to their internal ids for a given tokenizer
    :param tokenizer: the tokenizer
    :param word_counts: a dictionary mapping words to their number of occurrences
    :param max_words: if set to a value >0, only the `max_words` most frequent words according to `word_counts` are kept
    :return: a dictionary mapping each word (with its leading subword marker stripped) to its token id
    """
    if not isinstance(tokenizer, RobertaTokenizer):
        raise ValueError(
            "this function currently only supports instances of 'RobertaTokenizer'"
        )

    words = filter_words(tokenizer.encoder.keys(), word_counts, max_words)
    word2id = {
        word[1:]: tokenizer.convert_tokens_to_ids(word)
        for word in words
    }
    logger.info(
        f"There are {len(word2id)} words left after filtering non-word tokens")
    return word2id
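A possible call site, assuming a RoBERTa checkpoint from `transformers`; the checkpoint name and counts are illustrative, and the function also relies on the module's `filter_words` helper, which is not shown here.

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
word_counts = {"movie": 120, "film": 95, "the": 4000}  # illustrative counts
word2id = get_word_to_id_map(tokenizer, word_counts=word_counts, max_words=10000)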
    def __init__(self, tokenizer: PreTrainedTokenizer, sample_path: List[str], block_size: int, overwrite_cache=False,
                 num_processes=8, cached_directory="/workdir/Code/bma_transformer_model/data/cached_data"):
        # assert os.path.isfile(file_path)
        # For Loop MultiFile
        self.examples = []
        self.sample_path = sample_path
#         print(f"THIS IS SAMPLE PATH {sample_path}")
        self.tokenizer = tokenizer
        
        # Set block size to be the blocksize-special tokens
        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
        
        self.overwrite_cache = overwrite_cache
        self.cached_directory = cached_directory
        if not os.path.exists(cached_directory):
            os.makedirs(cached_directory)
        
        # Multiprocess for getting examples
        with Pool(processes=num_processes) as p:
            self.examples = list(tqdm.tqdm(p.imap(self.load_data_tokenized, self.sample_path), total=len(self.sample_path)))
        # Convert from 3d list to 2d 
        # self.examples from [[[3], [4]], [[5], [6]], [[7], [8]]] => [[3], [4], [5], [6], [7], [8]]
        self.examples = [each_batch for each_file in self.examples for each_batch in each_file]
Example 4
def get_verbalization_ids(word: str, tokenizer: PreTrainedTokenizer,
                          force_single_token: bool) -> Union[int, List[int]]:
    """
    Get the token ids corresponding to a verbalization
    :param word: the verbalization
    :param tokenizer: the tokenizer to use
    :param force_single_token: whether it should be enforced that the verbalization corresponds to a single token.
           If set to true, this method returns a single int instead of a list and throws an error if the word
           corresponds to multiple tokens.
    :return: either the list of token ids or the single token id corresponding to this word
    """
    kwargs = {
        'add_prefix_space': True
    } if isinstance(tokenizer, GPT2Tokenizer) else {}
    ids = tokenizer.encode(word, add_special_tokens=False, **kwargs)
    if not force_single_token:
        return ids
    assert len(ids) == 1, \
        f'Verbalization "{word}" does not correspond to a single token, got {tokenizer.convert_ids_to_tokens(ids)}'
    verbalization_id = ids[0]
    assert verbalization_id not in tokenizer.all_special_ids, \
        f'Verbalization {word} is mapped to a special token {tokenizer.convert_ids_to_tokens(verbalization_id)}'
    return verbalization_id
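A short usage sketch, assuming a standard GPT-2 checkpoint; "great" is a single GPT-2 token once the prefix space is added, so a single id is returned.

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
verbalizer_id = get_verbalization_ids("great", tokenizer, force_single_token=True)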
Example 5
def collate_twitter(instances: List[Tuple[str, str]],
                    tokenizer: PreTrainedTokenizer,
                    return_attention_masks: bool = True,
                    pad_to_max_length: bool = False,
                    device='cuda') -> List[torch.Tensor]:
    token_ids = [tokenizer.encode(_x[0], max_length=509) for _x in instances]
    if pad_to_max_length:
        batch_max_len = 512
    else:
        batch_max_len = max([len(_s) for _s in token_ids])
    padded_ids_tensor = torch.tensor([
        _s + [tokenizer.pad_token_id] * (batch_max_len - len(_s))
        for _s in token_ids
    ])
    labels = torch.tensor([_twitter_label[_x[1]] for _x in instances],
                          dtype=torch.long)

    output_tensors = [padded_ids_tensor]
    if return_attention_masks:
        output_tensors.append(padded_ids_tensor > 0)
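        # Note: building the mask as `ids > 0` implicitly assumes that
        # tokenizer.pad_token_id == 0 (true for BERT-style vocabularies).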
    output_tensors.append(labels)

    return list(_t.to(device) for _t in output_tensors)
Example 6
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 file_path: str,
                 block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [
                line for line in f.read().splitlines()
                if (len(line) > 0 and not line.isspace())
            ]

        self.examples = tokenizer.batch_encode_plus(
            lines,
            add_special_tokens=True,
            max_length=block_size,
            truncation=True,
            pad_to_max_length=True)["input_ids"]
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 file_path: str,
                 block_size=512):
        assert os.path.isfile(file_path + "-x.csv")
        logger.info("Creating features from dataset file at %s", file_path)

        X_lines = list()
        with open(file_path + "-x.csv") as f:
            reader = csv.reader(f)

            for row in reader:
                X_lines.append(row[1])
        X_lines = X_lines[1:]

        Y_lines = list()
        with open(file_path + "-y.csv") as f:
            reader = csv.reader(f)

            for row in reader:
                Y_lines.append(row[1:])

        lines = list()
        for x_line, y_line in zip(X_lines, Y_lines):
            for i in range(3):
                if len(y_line[i].strip()) > 0:
                    lines.append(x_line.strip() + " <|continue|> " +
                                 y_line[i].strip())
        random.shuffle(lines)

        #self.examples = tokenizer.batch_encode_plus(lines, max_length=block_size)["input_ids"]
        self.examples = []
        for i in lines:
            #pdb.set_trace()
            self.examples.append(
                tokenizer.encode_plus(i, max_length=block_size)["input_ids"])
def text_to_batch_transformer(claims: List, tokenizer: PreTrainedTokenizer,
                              evidence: List) -> Tuple[List, List]:
    """Turn a piece of text into a batch for transformer model

    :param text: The text to tokenize and encode
    :param tokenizer: The tokenizer to use
    :param: text_pair: An optional second string (for multiple sentence sequences)
    :return: A list of IDs and a mask
    """
    # Create the input string; first get a target word
    cands = [[
        w for w in word_tokenize(c) if w not in stopwords_en and w not in punc
    ] for c in claims]
    #targets = [','.join([w.lower() for w in set(random.sample(cand, min(1,len(cand))) + random.sample(eng_words, 4))]) for cand in cands]
    # Using only candidates
    targets = [
        ','.join(
            [w.lower() for w in set(random.sample(cand, min(5, len(cand))))])
        for cand in cands
    ]
    # # First get 5 possibel real candidates and 25 noise candidates
    # potential_words = [[w.lower() for w in set(random.sample(cand, min(5,len(cand))) + random.sample(eng_words, 25))] for cand in cands]
    # # Now randomly select 5 words from this list; we add more possible noise to give the model a better chance at generating good claims
    # # we want the model to just add words when they make sense, not force it to always pick 1-2 words
    # targets = [','.join(random.sample(pw, 5)) for pw in potential_words]
    texts = [
        f"{target}||{evid}||{claim}"
        for target, evid, claim in zip(targets, evidence, claims)
    ]
    input_ids = [
        tokenizer.encode(t, max_length=tokenizer.max_len - 1) +
        [tokenizer.eos_token_id] for t in texts
    ]

    masks = [[1] * len(i) for i in input_ids]

    return input_ids, masks
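A hedged usage sketch, assuming the module-level `word_tokenize`, `stopwords_en` and `punc` objects are available and an older `transformers` release in which `tokenizer.max_len` is still defined; the claim and evidence strings are illustrative.

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
claims = ["The film was released in 1999."]
evidence = ["The movie premiered at the end of the 1990s."]
input_ids, masks = text_to_batch_transformer(claims, tokenizer, evidence)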
Example 9
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 file_path: str,
                 block_size: int,
                 local_rank=-1):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        self.tokenizer = tokenizer
        logger.info("Creating features from dataset file at %s", file_path)

        csv_data = self.read_csv(file_path)

        # with open(file_path, encoding="utf-8") as f:
        #     lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        logger.info('Finished reading the csv file.')

        lines = [' '.join(_['text1']) for _ in csv_data]
        batch_encoding = tokenizer.batch_encode_plus(lines,
                                                     add_special_tokens=True,
                                                     max_length=block_size)
        self.examples = batch_encoding["input_ids"]
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 file_path: str,
                 block_size: int,
                 sep_token="<sep>"):
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = []
            i = 0
            for line in f:
                line = json.loads(line)
                question_text = line.get("question_text")
                answer_text = line.get("answer_text")
                example_text = question_text + " " + sep_token + " " + answer_text
                # if i < 10:
                #     logger.info(f"{example_text}")
                lines.append(example_text)
                i += 1

        batch_encoding = tokenizer.batch_encode_plus(lines,
                                                     add_special_tokens=True,
                                                     max_length=block_size)
        self.examples = batch_encoding["input_ids"]
    def __init__(self,
                 domain_list: List[str],
                 data_dir: Path,
                 tokenizer: PreTrainedTokenizer,
                 is_train: bool = True):
        ''' Initialize the Fudan review dataset

        Parameters
        ---
        domain_list:
        list of domains to be included

        data_dir: 
        path to the data directory

        tokenizer:
        PreTrainedTokenizer from one of the transformer models

        is_train:
        whether this loads the train split (True) or the test split (False)
        '''
        super().__init__()
        self.is_train = is_train
        self.data_dir = data_dir
        self.domain_list = domain_list

        df = self._prepare_df()
        batch = tokenizer.batch_encode_plus(df['text'].tolist(),
                                            max_length=tokenizer.max_len,
                                            pad_to_max_length=True,
                                            return_tensors='pt',
                                            return_attention_masks=True)
        self.x = batch['input_ids']
        self.attn_mask = batch['attention_mask']
        self.domains = torch.tensor(df['domain'].tolist())
        self.y = torch.tensor(df['label'].tolist())
Example 12
    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Generate inputs to provide to the ONNX exporter for the specific framework

        Args:
            tokenizer: The tokenizer associated with this model configuration
            batch_size: The batch size (int) to export the model for (-1 means dynamic axis)
            seq_length: The sequence length (int) to export the model for (-1 means dynamic axis)
            is_pair: Indicate if the input is a pair (sentence 1, sentence 2)
            framework: The framework (optional) the tokenizer will generate tensor for

        Returns:
            Mapping[str, Tensor] holding the kwargs to provide to the model's forward function
        """

        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
        batch_size = compute_effective_axis_dimension(
            batch_size, fixed_dimension=OnnxConfig.DEFAULT_FIXED_BATCH, num_token_to_add=0
        )

        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
        seq_length = compute_effective_axis_dimension(
            seq_length, fixed_dimension=OnnxConfig.DEFAULT_FIXED_SEQUENCE, num_token_to_add=token_to_add
        )

        # Generate dummy inputs according to compute batch and sequence
        # Build batch_size copies of a string consisting of seq_length unknown tokens
        dummy_input = [" ".join([tokenizer.unk_token] * seq_length)] * batch_size
        return dict(tokenizer(dummy_input, return_tensors=framework))
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 file_path: str,
                 block_size=512):
        assert os.path.isfile(file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, "bert_cached_lm_" + str(block_size) + "_" + filename)
        if args.overwrite_cache:
            print(args.overwrite_cache)
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from datasets file at %s",
                        directory)

            with open(file_path, encoding="utf-8") as f:
                lines = [
                    line for line in f.read().splitlines()
                    if (len(line) > 0 and not line.isspace())
                ]
            self.examples = tokenizer.batch_encode_plus(
                lines, add_special_tokens=True,
                max_length=block_size)["input_ids"]

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
Example 14
def extract_features(data_file_path: str, split: str, max_seq_length: int,
                     tokenizer: PreTrainedTokenizer,
                     processor: DataProcessor) -> List[NLIExample]:
    """
    Extract features for the given dataset file, using `processor`.
    Returns a list of NLIExample.
    """
    nli_base_dir, _ = os.path.split(data_file_path)

    logger.info(f'About to extract examples in {nli_base_dir} '
                f'with {type(processor)}')

    examples = (processor.get_train_examples(nli_base_dir) if split == 'train'
                else processor.get_dev_examples(nli_base_dir))

    features = []
    available_labels = processor.get_labels()

    for example in tqdm(examples, desc='tokenizing examples'):
        # Optionally translate non-train splits before encoding, e.g.:
        #     if split != 'train':
        #         example.text_a = translate(example.text_a)
        #         example.text_b = translate(example.text_b)
        encoded = tokenizer.encode_plus(example.text_a,
                                        example.text_b,
                                        max_length=max_seq_length,
                                        pad_to_max_length=True,
                                        truncation=True)

        encoded['label'] = available_labels.index(example.label)
        encoded['pairID'] = example.guid

        features.append(NLIExample(**encoded))

    return features
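One way this might be driven, assuming `MnliProcessor` as the `DataProcessor` and a BERT tokenizer; the data path is hypothetical, and only its directory is actually used by the function.

from transformers import AutoTokenizer
from transformers.data.processors.glue import MnliProcessor

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
features = extract_features(
    data_file_path="/path/to/multinli/train.tsv",  # hypothetical location
    split="train",
    max_seq_length=128,
    tokenizer=tokenizer,
    processor=MnliProcessor(),
)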
Example 15
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 file_path: str,
                 split_token='<EOD>',
                 block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [
                line + split_token for line in f.read().split(split_token)
                if (len(line) > 0 and not line.isspace())
            ]
        # add special tokens which shouldn't be split
        special_tokens_dict = {
            'cls_token': '<TLDR>',
            'eos_token': '<EOD>'
        }  #, 'additional_special_tokens': ['<EOT>']}
        tokenizer.add_special_tokens(special_tokens_dict)

        self.examples = tokenizer.batch_encode_plus(
            lines, add_special_tokens=True, max_length=block_size)["input_ids"]
        self.examples = [
            ex for ex in self.examples if tokenizer.encode('<TLDR>')[0] in ex
        ]

        self.labels = []
        max_block = torch.arange(block_size)
        for ex in self.examples:
            # note that .index() raises a ValueError if the <TLDR> token is not in the example
            try:
                idx = ex.index(tokenizer.encode('<TLDR>')[0])
            except ValueError as e:
                print("Example does not contain <TLDR> token.")
                print(tokenizer.decode(ex))
                exit()
            mask = (max_block <= idx)[:len(ex)]
            masked_labels = torch.tensor(ex) * ~mask - mask.type(
                torch.int) * 100  # ignore context when computing loss
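            # e.g. for ex = [a, b, <TLDR>, c, d] with idx == 2, the labels become
            # [-100, -100, -100, c, d]: only tokens after <TLDR> contribute to the loss.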
            self.labels.append(masked_labels)
Example 16
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
    ):
        assert os.path.isfile(file_path)

        block_size = block_size - tokenizer.num_special_tokens_to_add(
            pair=False)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            "cached_lm_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]",
                    time.time() - start)

            else:
                logger.info(
                    f"Creating features from dataset file at {directory}")

                self.examples = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(text))

                for i in range(0,
                               len(tokenized_text) - block_size + 1,
                               block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens(
                            tokenized_text[i:i + block_size]))
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
Example 17
    def convert_examples_to_features(
        self,
        examples: List[InputExample],
        label_list: List[str],
        max_seq_length: int,
        tokenizer: PreTrainedTokenizer,
        cls_token_at_end=False,
        cls_token="[CLS]",
        cls_token_segment_id=1,
        sep_token="[SEP]",
        sep_token_extra=False,
        pad_on_left=False,
        pad_token=0,
        pad_token_segment_id=0,
        pad_token_label_id=-100,
        sequence_a_segment_id=0,
        mask_padding_with_zero=True,
    ) -> List[InputFeatures]:
        """ Loads a data file into a list of `InputFeatures`
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
        """
        # TODO clean up all this to leverage built-in features of tokenizers

        label_map = {label: i for i, label in enumerate(label_list)}

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10_000 == 0:
                logger.info("Writing example %d of %d", ex_index, len(examples))

            tokens = []
            label_ids = []
            for word, label in zip(example.words, example.labels):
                word_tokens = tokenizer.tokenize(word)

                # bert-base-multilingual-cased sometimes outputs nothing ([]) when tokenizing just a space.
                if len(word_tokens) > 0:
                    tokens.extend(word_tokens)
                    # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                    label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
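                    # e.g. a word split into ["Johan", "##son"] with label "B-PER" yields
                    # [label_map["B-PER"], pad_token_label_id], so only the first subword is scored.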

            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
            special_tokens_count = tokenizer.num_special_tokens_to_add()
            if len(tokens) > max_seq_length - special_tokens_count:
                tokens = tokens[: (max_seq_length - special_tokens_count)]
                label_ids = label_ids[: (max_seq_length - special_tokens_count)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids:   0   0   0   0  0     0   0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
            if sep_token_extra:
                # roberta uses an extra separator b/w pairs of sentences
                tokens += [sep_token]
                label_ids += [pad_token_label_id]
            segment_ids = [sequence_a_segment_id] * len(tokens)

            if cls_token_at_end:
                tokens += [cls_token]
                label_ids += [pad_token_label_id]
                segment_ids += [cls_token_segment_id]
            else:
                tokens = [cls_token] + tokens
                label_ids = [pad_token_label_id] + label_ids
                segment_ids = [cls_token_segment_id] + segment_ids

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
                segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
                label_ids = ([pad_token_label_id] * padding_length) + label_ids
            else:
                input_ids += [pad_token] * padding_length
                input_mask += [0 if mask_padding_with_zero else 1] * padding_length
                segment_ids += [pad_token_segment_id] * padding_length
                label_ids += [pad_token_label_id] * padding_length

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(label_ids) == max_seq_length

            if ex_index < 5:
                logger.info("*** Example ***")
                logger.info("guid: %s", example.guid)
                logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
                logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
                logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

            if "token_type_ids" not in tokenizer.model_input_names:
                segment_ids = None

            features.append(
                InputFeatures(
                    input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids
                )
            )

        return features
Example 18
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 file_path: str,
                 block_size=512):

        block_size = block_size - (tokenizer.max_len -
                                   tokenizer.max_len_single_sentence)
        is_folder = args.data_mapfile is not None

        if not is_folder:
            assert os.path.isfile(file_path)
            directory, filename = os.path.split(file_path)
            cached_features_file = os.path.join(
                directory, args.model_type + "_cached_lm_" + str(block_size) +
                "_" + filename)
        else:
            directory = file_path
            cached_features_file = os.path.join(
                directory, args.model_type + "_cached_lm_" + str(block_size))

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
                self.client_mapping = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            self.client_mapping = {}
            sample_id = -1
            user_id = -1

            if not is_folder:
                files = [file_path]
            else:
                files = [
                    os.path.join(file_path, entry.name)
                    for entry in os.scandir(file_path)
                    if '_cached_lm_' not in entry.name
                ]

            # make sure files are ordered
            files = sorted(files)

            for file in files:
                with open(file, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(text))

                if len(tokenized_text) > 0:
                    user_id += 1
                    self.client_mapping[user_id] = []

                for i in range(0,
                               len(tokenized_text) - block_size + 1,
                               block_size):  # Truncate in block of block_size
                    sample_id += 1
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens(
                            tokenized_text[i:i + block_size]))
                    self.client_mapping[user_id].append(sample_id)

            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=-1)
                pickle.dump(self.client_mapping, handle, protocol=-1)

        self.data = self.examples
        self.targets = [0 for i in range(len(self.data))]
Example 19
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)
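    # pad_sequence right-pads every 1-D tensor in the batch to the longest example;
    # when the tokenizer has no pad token (e.g. GPT-2), the default padding value of 0
    # is used instead of tokenizer.pad_token_id.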

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 dir_path: str,
                 block_size=1024):
        self.examples = []
        tokenizer_class = tokenizer.__class__.__name__
        cached_features_file = os.path.join(
            dir_path, args.model_type + "_cached2_maskedsents3_" +
            str(block_size) + "_" + tokenizer_class)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", dir_path)
            good_docs = bad_docs = 0
            for filename in os.listdir(dir_path):
                try:
                    if not filename.endswith(".json"):
                        continue

                    path = os.path.join(dir_path, filename)
                    with open(path) as json_file:
                        data = json.load(json_file)
                        facts_doc = FactsDoc.Schema().load(data)

                    splitter = SentenceSplitter(language='en')
                    full_text_sentence_split = splitter.split(
                        text=facts_doc.text)
                    sent_one = full_text_sentence_split[START_SENT]
                    sent_two = full_text_sentence_split[END_SENT]
                    inbetween_text = " ".join(
                        full_text_sentence_split[START_SENT + 1:END_SENT])
                    tokenized_sent_one = tokenizer.encode(
                        sent_one,
                        add_special_tokens=False,
                        return_tensors="pt").squeeze(0)
                    tokenized_sent_two = tokenizer.encode(
                        sent_two,
                        add_special_tokens=False,
                        return_tensors="pt").squeeze(0)
                    tokenized_inbetween_text = tokenizer.encode(
                        inbetween_text,
                        add_special_tokens=False,
                        return_tensors="pt").squeeze(0)
                    full_text_tensor = torch.cat([
                        tokenized_sent_one, tokenized_inbetween_text,
                        tokenized_sent_two
                    ],
                                                 dim=0)
                    mask = torch.cat([
                        torch.ones(tokenized_sent_one.size()),
                        torch.zeros(tokenized_inbetween_text.size()),
                        torch.ones(tokenized_sent_two.size())
                    ])
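                    # mask is 1 over the two boundary sentences and 0 over the
                    # text between them.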
                    self.examples.append((full_text_tensor, mask))
                    good_docs += 1
                except Exception:
                    bad_docs += 1

            logger.info("finished creating examples for " + dir_path)
            logger.info(
                f"docs with exceptions = {bad_docs} fro total {bad_docs+good_docs}"
            )
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
Example 21
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        choices_features = []
        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
            text_a = context
            if example.question.find("_") != -1:
                # this is for cloze question
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length, return_token_type_ids=True,)
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! You are cropping tokens (this is fine for the SWAG task). "
                    "If you are training ARC or RACE and you are popping question + options, "
                    "you need to try a bigger max seq length!"
                )

            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            assert len(input_ids) == max_length
            assert len(attention_mask) == max_length
            assert len(token_type_ids) == max_length
            choices_features.append((input_ids, attention_mask, token_type_ids))

        label = label_map[example.label]

        if ex_index < 2:
            logger.info("*** Example ***")
            logger.info("race_id: {}".format(example.example_id))
            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
                logger.info("label: {}".format(label))

        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))

    return features
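A hedged usage sketch for the function above, assuming the module's multiple-choice `InputExample` (with `example_id`, `question`, `contexts`, `endings` and `label` fields) and `InputFeatures`; the ids and texts are illustrative only.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
example = InputExample(
    example_id="race-0001",                     # hypothetical id
    question="What did the author do next?",
    contexts=["She finished her coffee."] * 4,  # one context per answer option
    endings=["She left.", "She slept.", "She sang.", "She cooked."],
    label="0",
)
features = convert_examples_to_features(
    [example], label_list=["0", "1", "2", "3"], max_length=64, tokenizer=tokenizer
)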
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples),
                                         desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        choices_inputs = []
        for ending_idx, (context, ending) in enumerate(
                zip(example.contexts, example.endings)):
            text_a = context
            if example.question.find("_") != -1:
                # this is for cloze question
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=max_length,
                pad_to_max_length=True,
            )
            if "num_truncated_tokens" in inputs and inputs[
                    "num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! You are cropping tokens (this is fine for the SWAG task). "
                    "If you are training ARC or RACE and you are popping question + options, "
                    "you need to try a bigger max seq length!")

            choices_inputs.append(inputs)

        label = label_map[example.label]

        input_ids = [x["input_ids"] for x in choices_inputs]
        attention_mask = ([x["attention_mask"] for x in choices_inputs]
                          if "attention_mask" in choices_inputs[0] else None)
        token_type_ids = ([x["token_type_ids"] for x in choices_inputs]
                          if "token_type_ids" in choices_inputs[0] else None)

        features.append(
            InputFeatures(
                example_id=example.example_id,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            ))

    for f in features[:2]:
        logger.info("*** Example ***")
        logger.info("feature: %s" % f)

    return features
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 file_path: str,
                 mode,
                 block_size=512):
        assert os.path.isfile(file_path)

        block_size = block_size - (tokenizer.max_len -
                                   tokenizer.max_len_single_sentence)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            args["cache_dir"], args["model_type"] + "_cached_lm_" +
            str(block_size) + "_" + filename)

        if os.path.exists(cached_features_file) and (
            (not args["reprocess_input_data"] and not args["no_cache"]) or
            (mode == "dev" and args["use_cached_eval_features"]
             and not args["no_cache"])):
            logger.info(" Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info(" Creating features from dataset file at %s",
                        args["cache_dir"])

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            # tokenizer = ByteLevelBPETokenizer(
            #     "outputs/vocab.json",
            #     "outputs/merges.txt",
            # )
            # tokenizer._tokenizer.post_processor = BertProcessing(
            #     ("</s>", tokenizer.token_to_id("</s>")),
            #     ("<s>", tokenizer.token_to_id("<s>")),
            # )

            # logger.info(" Encoding")
            # tokenized_text = tokenizer.encode(text).ids
            # logger.info(" Encoded")
            # self.examples = [tokenized_text[i : i + block_size] for i in tqdm(range(0, len(tokenized_text) - block_size + 1, block_size))] # noqa

            tokenized_text = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(text))
            tokenized_text_split = [
                tokenized_text[i:i + block_size] for i in tqdm(
                    range(0,
                          len(tokenized_text) - block_size + 1, block_size))
            ]

            with Pool(args["process_count"]) as p:
                self.examples = list(
                    tqdm(
                        p.imap(tokenizer.build_inputs_with_special_tokens,
                               tokenized_text_split,
                               chunksize=500),
                        total=len(tokenized_text_split),
                        # disable=silent,
                    ))

            # for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
            #     self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            logger.info(" Saving features into cached file %s",
                        cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
Example 24
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    non_multi_model = model
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(non_multi_model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    best_perplexity = float('inf')
    for i, epoch in enumerate(train_iterator):
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])

        if args.local_rank != -1:
            train_sampler.set_epoch(epoch)

        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

        if args.do_eval:
            file_path = Path(args.data_dir, args.eval_data_file)

            out_file_path = Path(args.data_dir,
                                 "output_" + args.eval_data_file)
            id_to_json_map = {}
            with open(file_path, encoding="utf-8") as f:
                eval_loss = 0.0
                nb_eval_steps = 0
                for line in tqdm(f, desc="Evaluating"):
                    out_json = {}
                    line = json.loads(line)
                    example_id = line.get("example_id")
                    question_text = line.get("question_text")

                    prompt_text = question_text + " " + args.sep_token + " "
                    encoded_prompt = tokenizer.encode(prompt_text,
                                                      add_special_tokens=False,
                                                      return_tensors="pt")
                    encoded_prompt = encoded_prompt.to(args.device)

                    output_sequences = non_multi_model.generate(
                        input_ids=encoded_prompt,
                        max_length=args.length + len(encoded_prompt[0]),
                        temperature=args.temperature,
                        top_k=args.k,
                        top_p=args.p,
                        repetition_penalty=args.repetition_penalty,
                        do_sample=True,
                        num_return_sequences=args.num_return_sequences,
                    )
                    if len(output_sequences.shape) > 2:
                        output_sequences.squeeze_()

                    generated_sequences = []

                    for generated_sequence_idx, generated_sequence in enumerate(
                            output_sequences):
                        # print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
                        # generated_sequence = output_sequences[0]

                        generated_sequence = generated_sequence.tolist()

                        # Decode text
                        text = tokenizer.decode(
                            generated_sequence,
                            clean_up_tokenization_spaces=True)

                        # Remove all text after the stop token
                        if args.stop_token:
                            text = text[:text.find(args.stop_token)]

                        # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
                        total_sequence = (prompt_text + text[len(
                            tokenizer.decode(encoded_prompt[0],
                                             clean_up_tokenization_spaces=True)
                        ):])

                        # print(total_sequence)

                        out_json["journaling_input"], out_json[
                            "reflection_output"] = total_sequence.split(
                                args.sep_token)[:2]

                        sample_dataset = GenerateTextDataset(
                            tokenizer, total_sequence, args.block_size)

                        def collate(examples: List[torch.Tensor]):
                            if tokenizer._pad_token is None:
                                return pad_sequence(examples, batch_first=True)
                            return pad_sequence(
                                examples,
                                batch_first=True,
                                padding_value=tokenizer.pad_token_id)

                        eval_sampler = SequentialSampler(sample_dataset)
                        eval_dataloader = DataLoader(sample_dataset,
                                                     sampler=eval_sampler,
                                                     batch_size=1,
                                                     collate_fn=collate)

                        # `model` is already DataParallel-wrapped above when n_gpu > 1,
                        # so reuse it directly instead of wrapping it a second time.
                        model_lm = model

                        model_lm.eval()

                        for batch in eval_dataloader:
                            inputs, labels = mask_tokens(
                                batch, tokenizer,
                                args) if args.mlm else (batch, batch)
                            inputs = inputs.to(args.device)
                            labels = labels.to(args.device)

                            with torch.no_grad():
                                outputs = model_lm(inputs,
                                                   masked_lm_labels=labels
                                                   ) if args.mlm else model_lm(
                                                       inputs, labels=labels)
                                lm_loss = outputs[0]
                                example_loss = lm_loss.mean().item()
                                eval_loss += example_loss
                            nb_eval_steps += 1

                        perplexity = torch.exp(
                            torch.tensor(example_loss)).item()
                        # print(perplexity)
                        out_json["perplexity"] = perplexity

                        # Key each generated sequence separately without mutating
                        # example_id across iterations of this loop.
                        keyed_example_id = example_id + "-" + str(
                            generated_sequence_idx)
                        id_to_json_map[keyed_example_id] = json.dumps(
                            out_json, ensure_ascii=False)

                    # result = {"perplexity": perplexity}

                eval_loss = eval_loss / nb_eval_steps
                total_perplexity = torch.exp(torch.tensor(eval_loss))
                logger.info(f"total_loss:: {eval_loss}")
                logger.info(
                    f"total_perplexity:: {torch.exp(torch.tensor(eval_loss))}")
                if total_perplexity < best_perplexity:
                    logger.info(
                        f"Current best epoch::: {i}, with perplexity:: {total_perplexity}"
                    )
                    best_perplexity = total_perplexity

                    with open(out_file_path, "w+",
                              encoding="utf-8") as out_file:
                        for _, out_json in id_to_json_map.items():
                            out_file.write(out_json + "\n")

                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self

                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir,
                                                      CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

    return global_step, tr_loss / global_step
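
The evaluation branch above reports perplexity as the exponential of the mean language-modeling loss. A minimal standalone sketch of that relationship, using hypothetical per-batch losses rather than real data:

import torch

batch_losses = [2.31, 2.05, 2.47]  # hypothetical mean cross-entropy loss per batch
eval_loss = sum(batch_losses) / len(batch_losses)
perplexity = torch.exp(torch.tensor(eval_loss)).item()
print(f"eval_loss={eval_loss:.3f}  perplexity={perplexity:.2f}")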
Esempio n. 25
0
    def build(
        cls,
        dump_db: DumpDB,
        tokenizer: PreTrainedTokenizer,
        sentence_tokenizer: SentenceTokenizer,
        entity_vocab: EntityVocab,
        output_dir: str,
        max_seq_length: int,
        max_entity_length: int,
        max_mention_length: int,
        min_sentence_length: int,
        include_sentences_without_entities: bool,
        include_unk_entities: bool,
        pool_size: int,
        chunk_size: int,
        max_num_documents: int,
    ):

        target_titles = [
            title for title in dump_db.titles()
            if not (":" in title and title.lower().split(":")[0] in
                    ("image", "file", "category"))
        ]
        random.shuffle(target_titles)

        if max_num_documents is not None:
            target_titles = target_titles[:max_num_documents]

        max_num_tokens = max_seq_length - 2  # 2 for [CLS] and [SEP]

        tokenizer.save_pretrained(output_dir)

        entity_vocab.save(os.path.join(output_dir, ENTITY_VOCAB_FILE))
        number_of_items = 0
        tf_file = os.path.join(output_dir, DATASET_FILE)
        options = tf.io.TFRecordOptions(
            tf.compat.v1.io.TFRecordCompressionType.GZIP)
        with TFRecordWriter(tf_file, options=options) as writer:
            with tqdm(total=len(target_titles)) as pbar:
                initargs = (
                    dump_db,
                    tokenizer,
                    sentence_tokenizer,
                    entity_vocab,
                    max_num_tokens,
                    max_entity_length,
                    max_mention_length,
                    min_sentence_length,
                    include_sentences_without_entities,
                    include_unk_entities,
                )
                with closing(
                        Pool(pool_size,
                             initializer=WikipediaPretrainingDataset.
                             _initialize_worker,
                             initargs=initargs)) as pool:
                    for ret in pool.imap(
                            WikipediaPretrainingDataset._process_page,
                            target_titles,
                            chunksize=chunk_size):
                        for data in ret:
                            writer.write(data)
                            number_of_items += 1
                        pbar.update()

        with open(os.path.join(output_dir, METADATA_FILE),
                  "w") as metadata_file:
            json.dump(
                dict(
                    number_of_items=number_of_items,
                    max_seq_length=max_seq_length,
                    max_entity_length=max_entity_length,
                    max_mention_length=max_mention_length,
                    min_sentence_length=min_sentence_length,
                    tokenizer_class=tokenizer.__class__.__name__,
                    language=dump_db.language,
                ),
                metadata_file,
                indent=2,
            )
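
Since the build method writes a GZIP-compressed TFRecord plus a JSON metadata file, the output can be sanity-checked by counting raw records. A hedged sketch, using illustrative file names in place of DATASET_FILE and METADATA_FILE (the real constants are defined elsewhere) and without parsing the per-record feature schema:

import json
import os
import tensorflow as tf

output_dir = "wikipedia_pretraining_data"  # hypothetical path
raw_dataset = tf.data.TFRecordDataset(
    os.path.join(output_dir, "dataset.tfrecord"),  # placeholder for DATASET_FILE
    compression_type="GZIP")
num_records = sum(1 for _ in raw_dataset)

with open(os.path.join(output_dir, "metadata.json")) as f:  # placeholder for METADATA_FILE
    metadata = json.load(f)
assert num_records == metadata["number_of_items"]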
Esempio n. 26
0
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 file_path: str,
                 block_size: int,
                 datasets_cache_dir: str = None,
                 chunk_size: int = 2500,
                 overwrite_cache: bool = False,
                 progress: bool = True):
        assert os.path.isfile(
            file_path), f"Input file path {file_path} not found"
        if datasets_cache_dir is None:
            datasets_cache_dir = tempfile.mkdtemp()
            found_cache = False
        else:
            found_cache = (
                not overwrite_cache
                and os.path.exists(os.path.join(datasets_cache_dir, 'arr.dat'))
                and os.path.exists(
                    os.path.join(datasets_cache_dir, 'idx_arr.dat')))
            os.makedirs(datasets_cache_dir, exist_ok=True)
        self.memmap_index_dataset = MemmapIndexDataset(
            os.path.join(datasets_cache_dir, 'arr.dat'),
            os.path.join(datasets_cache_dir, 'idx_arr.dat'))
        if found_cache:
            logger.info("Found cached features at %s", datasets_cache_dir)
            self.memmap_index_dataset.load()
            return
        else:
            # Handle overwrite_cache case
            self.memmap_index_dataset.clear()
        logger.info("Creating features from dataset file at %s", file_path)
        eos_token_id = tokenizer.eos_token_id
        bos_token_id = tokenizer.bos_token_id
        tokenizer_vocab = tokenizer.get_vocab()
        if '▁' in tokenizer_vocab:
            newline_token_id = tokenizer_vocab['▁']
        elif '\n' in tokenizer_vocab:
            newline_token_id = tokenizer_vocab['\n']
        else:
            raise ValueError(
                "Tokenizer vocab contains neither '▁' nor '\\n'; "
                "cannot determine a newline token id")
        usable_block_size = block_size - 2

        def add_to_block(ids, block, blocks):
            """
            Add indices to block, if the combined size of indices and block + 2 (bos, eos)
            exceed block_size, add the block to blocks if block is not empty
            then try to add indices again.
            """
            size = len(block) + len(ids)
            if block:
                size += 1
                if size > usable_block_size:
                    blocks.append([bos_token_id] + block + [eos_token_id])
                    return add_to_block(ids, [], blocks)
                else:
                    block.append(newline_token_id)
                    block.extend(ids)
                    return block
            else:
                if size > usable_block_size:
                    return []
                else:
                    return ids

        skipped_n = 0
        lines = []
        block = []
        with open(file_path, encoding="utf-8") as f:
            file_size = get_file_size(f)
            for line in _readline_clean_and_strip(f):
                lines.append(line)
                if len(lines) >= chunk_size:
                    batch_encoding = tokenizer(lines,
                                               add_special_tokens=False,
                                               truncation=True,
                                               max_length=usable_block_size +
                                               1)
                    input_ids = batch_encoding["input_ids"]
                    blocks = []
                    for ids in input_ids:
                        block = add_to_block(ids, block, blocks)
                        if not block:
                            skipped_n += 1
                    lines = []
                    self.memmap_index_dataset.add(blocks)
                    if progress:
                        print(f'\rProcessed {f.tell() / file_size * 100:.2f}%',
                              flush=True,
                              end=' ')
            if len(lines) > 0:
                batch_encoding = tokenizer(lines,
                                           add_special_tokens=False,
                                           truncation=True,
                                           max_length=usable_block_size + 1)
                input_ids = batch_encoding["input_ids"]
                blocks = []
                for ids in input_ids:
                    block = add_to_block(ids, block, blocks)
                    if not block:
                        skipped_n += 1
            else:
                blocks = []
            # Flush the final partial block, wrapped with bos/eos like every other block.
            if block:
                blocks.append([bos_token_id] + block + [eos_token_id])
            if blocks:
                self.memmap_index_dataset.add(blocks)
            if progress:
                print(f'\rProcessed {f.tell() / file_size * 100:.2f}%',
                      flush=True,
                      end=' ')
        print()
        logger.info(f'Skipped {skipped_n}')
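
A toy walk-through of the block-packing rule documented in add_to_block above, with hypothetical token ids (bos=1, eos=2, newline=3) and a small block budget; the helper is restated standalone here because the original is a closure over the tokenizer's ids:

def pack_ids(ids, block, blocks,
             bos_token_id=1, eos_token_id=2, newline_token_id=3,
             usable_block_size=6):
    # Same rule as add_to_block: flush the running block (with bos/eos) when
    # appending `ids` plus a newline separator would exceed the budget.
    size = len(block) + len(ids)
    if block:
        size += 1
        if size > usable_block_size:
            blocks.append([bos_token_id] + block + [eos_token_id])
            return pack_ids(ids, [], blocks)
        return block + [newline_token_id] + list(ids)
    return [] if size > usable_block_size else list(ids)

blocks, block = [], []
for ids in ([10, 11], [12, 13], [14, 15, 16]):
    block = pack_ids(ids, block, blocks)
if block:
    blocks.append([1] + block + [2])
print(blocks)  # [[1, 10, 11, 3, 12, 13, 2], [1, 14, 15, 16, 2]]
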
def get_tfds(
    train_file: str,
    eval_file: str,
    test_file: str,
    tokenizer: PreTrainedTokenizer,
    label_column_id: int,
    max_seq_length: Optional[int] = None,
):
    files = {}

    if train_file is not None:
        files[datasets.Split.TRAIN] = [train_file]
    if eval_file is not None:
        files[datasets.Split.VALIDATION] = [eval_file]
    if test_file is not None:
        files[datasets.Split.TEST] = [test_file]

    ds = datasets.load_dataset("csv", data_files=files)
    features_name = list(ds[list(files.keys())[0]].features.keys())
    label_name = features_name.pop(label_column_id)
    label_list = list(set(ds[list(files.keys())[0]][label_name]))
    label2id = {label: i for i, label in enumerate(label_list)}
    input_names = tokenizer.model_input_names
    transformed_ds = {}

    if len(features_name) == 1:
        for k in files.keys():
            transformed_ds[k] = ds[k].map(
                lambda example: tokenizer.batch_encode_plus(
                    example[features_name[0]],
                    truncation=True,
                    max_length=max_seq_length,
                    padding="max_length"),
                batched=True,
            )
    elif len(features_name) == 2:
        for k in files.keys():
            transformed_ds[k] = ds[k].map(
                lambda example: tokenizer.batch_encode_plus(
                    (example[features_name[0]], example[features_name[1]]),
                    truncation=True,
                    max_length=max_seq_length,
                    padding="max_length",
                ),
                batched=True,
            )

    def gen_train():
        for ex in transformed_ds[datasets.Split.TRAIN]:
            d = {k: v for k, v in ex.items() if k in input_names}
            label = label2id[ex[label_name]]
            yield (d, label)

    def gen_val():
        for ex in transformed_ds[datasets.Split.VALIDATION]:
            d = {k: v for k, v in ex.items() if k in input_names}
            label = label2id[ex[label_name]]
            yield (d, label)

    def gen_test():
        for ex in transformed_ds[datasets.Split.TEST]:
            d = {k: v for k, v in ex.items() if k in input_names}
            label = label2id[ex[label_name]]
            yield (d, label)

    train_ds = (tf.data.Dataset.from_generator(
        gen_train,
        ({k: tf.int32
          for k in input_names}, tf.int64),
        ({k: tf.TensorShape([None])
          for k in input_names}, tf.TensorShape([])),
    ) if datasets.Split.TRAIN in transformed_ds else None)

    if train_ds is not None:
        train_ds = train_ds.apply(
            tf.data.experimental.assert_cardinality(
                len(ds[datasets.Split.TRAIN])))

    val_ds = (tf.data.Dataset.from_generator(
        gen_val,
        ({k: tf.int32
          for k in input_names}, tf.int64),
        ({k: tf.TensorShape([None])
          for k in input_names}, tf.TensorShape([])),
    ) if datasets.Split.VALIDATION in transformed_ds else None)

    if val_ds is not None:
        val_ds = val_ds.apply(
            tf.data.experimental.assert_cardinality(
                len(ds[datasets.Split.VALIDATION])))

    test_ds = (tf.data.Dataset.from_generator(
        gen_test,
        ({k: tf.int32
          for k in input_names}, tf.int64),
        ({k: tf.TensorShape([None])
          for k in input_names}, tf.TensorShape([])),
    ) if datasets.Split.TEST in transformed_ds else None)

    if test_ds is not None:
        test_ds = test_ds.apply(
            tf.data.experimental.assert_cardinality(
                len(ds[datasets.Split.TEST])))

    return train_ds, val_ds, test_ds, label2id
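
A hedged usage sketch for get_tfds, assuming CSV files with a single text column and the label in column 1 (file names and tokenizer checkpoint are illustrative, not from the original code):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_ds, val_ds, test_ds, label2id = get_tfds(
    train_file="train.csv",
    eval_file="dev.csv",
    test_file="test.csv",
    tokenizer=tokenizer,
    label_column_id=1,
    max_seq_length=128,
)
if train_ds is not None:
    train_ds = train_ds.shuffle(1024).batch(16)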
Esempio n. 28
0
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 split: str,
                 file_path: str,
                 block_size: int,
                 overwrite_cache=False,
                 local_rank=-1):
        block_size = 4096
        self.block_size = 4096

        directory = './processed_files'
        cached_features_file = os.path.join(
            directory,
            "cached_lm_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                split,
            ),
        )

        with torch_distributed_zero_first(local_rank):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                self.examples = [
                    ex for ex in self.examples
                    if len(ex[0]) == self.block_size
                    and len(ex[1]) == self.block_size
                ]
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]",
                    time.time() - start)

            else:
                input_path = './multinews/train.txt.src'
                logger.info(
                    f"Creating features from dataset file at {input_path}")
                encoder_func = select_words_to_mask_special_tokens_only_multiple_docs
                self.examples = []
                self.masking_samples = []
                corpus = read_in_train_set(input_path)
                ln = []
                for i in range(len(corpus)):
                    sample = corpus[i].strip()
                    articles = sample.split("story_separator_special_tag")
                    ln.append(articles)
                tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
                tokenizer.add_tokens(['</doc-s>'], special_tokens=True)
                stats = []
                while len(stats) < 64 * 25 * 1000:
                    for topic in ln:
                        if len(topic) > 2:
                            s = random.sample(topic, len(topic))
                            examp, st = encoder_func(s, tokenizer, block_size)
                            self.examples.append(examp)
                            stats.append(st)
                # Uncomment for creating data for the random baseline
                # while len(self.examples) < 64*25*1000:
                #     s = random.sample(ln, 10)
                #     curr_false_topic = []
                #     for topic in s:
                #         curr_false_topic.append(random.sample(topic, 1)[0])
                #     examp, st = encoder_func(curr_false_topic, tokenizer, block_size)
                #     self.examples.append(examp)
                #     stats.append(st)
                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
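
Because the dataset above registers '<doc-s>' and '</doc-s>' as new special tokens, any model trained on it needs its embedding matrix resized to the enlarged vocabulary. A minimal hedged sketch of that step (model and tokenizer checkpoints are illustrative):

from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForMaskedLM.from_pretrained("roberta-base")

num_added = tokenizer.add_tokens(['<doc-s>', '</doc-s>'], special_tokens=True)
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))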
Esempio n. 29
0
def _has_initial_cls_token(tokenizer: PreTrainedTokenizer) -> bool:
    # most models have CLS token as last token (GPT-1, GPT-2, TransfoXL, XLNet, XLM), but BERT is initial
    tokens = tokenizer.encode('a')
    initial_cls_token: bool = False
    if tokens[0] == tokenizer.cls_token_id:
        initial_cls_token = True
    return initial_cls_token
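
A quick hedged check of the helper above (the tokenizer checkpoints are just common examples):

from transformers import BertTokenizer, GPT2Tokenizer

print(_has_initial_cls_token(BertTokenizer.from_pretrained("bert-base-uncased")))  # True
print(_has_initial_cls_token(GPT2Tokenizer.from_pretrained("gpt2")))  # False: GPT-2 has no CLS token
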
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        langs_to_id: dict,
        file_path: str,
        block_size: int,
        fix_ahalf: bool,
        overwrite_cache=False,
        cache_dir: Optional[str] = None,
    ):
        assert os.path.isfile(
            file_path), f"Input file path {file_path} not found"
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else directory,
            "cached_transfer_lm_{}_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
                'fix_ahalf' if fix_ahalf else 'fix_bhalf',
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not overwrite_cache:

                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    cache_data = pickle.load(handle)
                self.src = cache_data['src']
                self.src_langids = cache_data['src_langids']
                self.tgt = cache_data['tgt']
                self.tgt_langids = cache_data['tgt_langids']
                self.align = cache_data['align']
                self.for_mlm = cache_data['for_mlm']
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]",
                    time.time() - start)

            else:
                logger.info(
                    f"Creating features from dataset file at {directory}")

                self.src = []
                self.src_langids = []
                self.tgt = []
                self.tgt_langids = []
                self.align = []
                self.for_mlm = []

                cls_token_id = tokenizer.cls_token_id
                sep_token_id = tokenizer.sep_token_id

                with open(file_path, encoding="utf-8") as fin:
                    for line in fin:
                        line = line.strip()
                        if len(line) > 0:
                            line = json.loads(line)

                            src_ids = tokenizer.convert_tokens_to_ids(
                                line['src'])
                            src_langid = langs_to_id[line['src_lang']]

                            assert len(src_ids) <= block_size and src_ids[
                                0] == cls_token_id and src_ids[
                                    -2] == sep_token_id and src_ids[
                                        -1] == tokenizer.pad_token_id

                            tgt_ids = tokenizer.convert_tokens_to_ids(
                                line['tgt'])
                            tgt_langid = langs_to_id[line['tgt_lang']]

                            assert len(tgt_ids) <= block_size and tgt_ids[
                                0] == cls_token_id and tgt_ids[
                                    -1] == sep_token_id

                            align_index = line['align_index']

                            assert len(tgt_ids) == len(align_index)

                            pad_align_index = [len(src_ids) - 1] * block_size
                            pad_align_index[:len(tgt_ids)] = align_index

                            pad_src_ids = [tokenizer.pad_token_id] * block_size
                            pad_src_ids[:len(src_ids)] = src_ids

                            pad_tgt_ids = [tokenizer.pad_token_id] * block_size
                            pad_tgt_ids[:len(tgt_ids)] = tgt_ids

                            self.src.append(pad_src_ids)
                            self.src_langids.append(src_langid)
                            self.tgt.append(pad_tgt_ids)
                            self.tgt_langids.append(tgt_langid)
                            self.align.append(pad_align_index)
                            self.for_mlm.append(False)

                            # for mlm
                            if fix_ahalf:
                                self.src.append(pad_src_ids)
                                self.src_langids.append(src_langid)
                                self.tgt.append(pad_src_ids)
                                self.tgt_langids.append(src_langid)
                                self.align.append(list(range(block_size)))
                                self.for_mlm.append(True)
                            else:
                                self.src.append(pad_tgt_ids)
                                self.src_langids.append(tgt_langid)
                                self.tgt.append(pad_tgt_ids)
                                self.tgt_langids.append(tgt_langid)
                                self.align.append(list(range(block_size)))
                                self.for_mlm.append(True)

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    cache_data = {
                        'src': self.src,
                        'src_langids': self.src_langids,
                        'tgt': self.tgt,
                        'tgt_langids': self.tgt_langids,
                        'align': self.align,
                        'for_mlm': self.for_mlm
                    }
                    pickle.dump(cache_data,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
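
A hedged sketch, not taken from the original class, of how the cached parallel lists above could be exposed as tensors in a __getitem__ method:

import torch

def example_getitem(self, idx):
    # Illustrative only: field names mirror the lists built in __init__ above.
    return {
        "src": torch.tensor(self.src[idx], dtype=torch.long),
        "src_langid": torch.tensor(self.src_langids[idx], dtype=torch.long),
        "tgt": torch.tensor(self.tgt[idx], dtype=torch.long),
        "tgt_langid": torch.tensor(self.tgt_langids[idx], dtype=torch.long),
        "align": torch.tensor(self.align[idx], dtype=torch.long),
        "for_mlm": torch.tensor(self.for_mlm[idx], dtype=torch.bool),
    }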