def __init__(self, tokenizer: PreTrainedTokenizer, filter_callable: Callable[[str], bool] = default_filter,
             prefix: str = '', suffix: str = ' .', pmin: int = 0, pmax: int = 0, smin: int = 0, smax: int = 0,
             seed: int = 1234, eval_sentence: str = None, **_):
    self.tokenizer = tokenizer
    if isinstance(tokenizer, BertTokenizer):
        vocab = tokenizer.vocab.keys()
    elif isinstance(tokenizer, GPT2Tokenizer):
        vocab = tokenizer.encoder.keys()
    else:
        raise ValueError('Access to vocab is currently only implemented for BertTokenizer and GPT2Tokenizer')
    self.words = [x for x in vocab if not filter_callable or filter_callable(x)]
    self.prefix = tokenizer.tokenize(prefix)
    self.suffix = tokenizer.tokenize(suffix)
    self.pmin = pmin
    self.pmax = pmax
    self.smin = smin
    self.smax = smax
    self.eval_sentence = eval_sentence
    if seed:
        random.seed(seed)
def get_word_to_id_map(tokenizer: PreTrainedTokenizer, word_counts=None, max_words: int = -1):
    """
    Return a mapping from all tokens to their internal ids for a given tokenizer

    :param tokenizer: the tokenizer
    :param word_counts: a dictionary mapping words to their number of occurrences
    :param max_words: if set to a value >0, only the `max_words` most frequent words according to `word_counts` are kept
    :return: a dictionary mapping each remaining word (without its leading subword marker) to its token id
    """
    if not isinstance(tokenizer, RobertaTokenizer):
        raise ValueError("this function currently only supports instances of 'RobertaTokenizer'")

    words = filter_words(tokenizer.encoder.keys(), word_counts, max_words)
    word2id = {word[1:]: tokenizer.convert_tokens_to_ids(word) for word in words}
    logger.info(f"There are {len(word2id)} words left after filtering non-word tokens")
    return word2id
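# Hedged usage sketch (not part of the original source): builds the word-to-id map for a
# RoBERTa tokenizer. The checkpoint name and the word_counts dictionary are illustrative only.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
word_counts = {"house": 120, "dog": 75}  # hypothetical corpus statistics
word2id = get_word_to_id_map(tokenizer, word_counts=word_counts, max_words=10000)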
def __init__(self, tokenizer: PreTrainedTokenizer, sample_path: list, block_size: int, overwrite_cache=False,
             num_processes=8, cached_directory="/workdir/Code/bma_transformer_model/data/cached_data"):
    # assert os.path.isfile(file_path)
    # For Loop MultiFile
    self.examples = []
    self.sample_path = sample_path
    # print(f"THIS IS SAMPLE PATH {sample_path}")
    self.tokenizer = tokenizer
    # Set the block size to the requested block size minus the number of special tokens
    self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
    self.overwrite_cache = overwrite_cache
    self.cached_directory = cached_directory
    if not os.path.exists(cached_directory):
        os.makedirs(cached_directory)

    # Tokenize the files in parallel
    with Pool(processes=num_processes) as p:
        self.examples = list(tqdm.tqdm(p.imap(self.load_data_tokenized, self.sample_path),
                                       total=len(self.sample_path)))

    # Flatten the per-file lists of examples into a single list, e.g.
    # [[[3], [4]], [[5], [6]], [[7], [8]]] => [[3], [4], [5], [6], [7], [8]]
    self.examples = [each_batch for each_file in self.examples for each_batch in each_file]
def get_verbalization_ids(word: str, tokenizer: PreTrainedTokenizer, force_single_token: bool) -> Union[int, List[int]]:
    """
    Get the token ids corresponding to a verbalization

    :param word: the verbalization
    :param tokenizer: the tokenizer to use
    :param force_single_token: whether it should be enforced that the verbalization corresponds to a single token.
           If set to true, this method returns a single int instead of a list and throws an error if the word
           corresponds to multiple tokens.
    :return: either the list of token ids or the single token id corresponding to this word
    """
    kwargs = {'add_prefix_space': True} if isinstance(tokenizer, GPT2Tokenizer) else {}
    ids = tokenizer.encode(word, add_special_tokens=False, **kwargs)
    if not force_single_token:
        return ids
    assert len(ids) == 1, \
        f'Verbalization "{word}" does not correspond to a single token, got {tokenizer.convert_ids_to_tokens(ids)}'
    verbalization_id = ids[0]
    assert verbalization_id not in tokenizer.all_special_ids, \
        f'Verbalization {word} is mapped to a special token {tokenizer.convert_ids_to_tokens(verbalization_id)}'
    return verbalization_id
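# Hedged usage sketch (not part of the original source): resolves a one-word verbalizer to a
# single token id. The GPT-2 checkpoint name and the word "great" are only examples.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
label_token_id = get_verbalization_ids("great", tokenizer, force_single_token=True)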
def collate_twitter(instances: List[Dict], tokenizer: PreTrainedTokenizer, return_attention_masks: bool = True,
                    pad_to_max_length: bool = False, device='cuda') -> List[torch.Tensor]:
    token_ids = [tokenizer.encode(_x[0], max_length=509) for _x in instances]
    if pad_to_max_length:
        batch_max_len = 512
    else:
        batch_max_len = max([len(_s) for _s in token_ids])
    padded_ids_tensor = torch.tensor([_s + [tokenizer.pad_token_id] * (batch_max_len - len(_s)) for _s in token_ids])
    labels = torch.tensor([_twitter_label[_x[1]] for _x in instances], dtype=torch.long)
    output_tensors = [padded_ids_tensor]
    if return_attention_masks:
        output_tensors.append(padded_ids_tensor > 0)
    output_tensors.append(labels)
    return list(_t.to(device) for _t in output_tensors)
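# Hedged usage sketch (not part of the original source): the instances, label strings and
# checkpoint are illustrative and must match the module-level `_twitter_label` map. Note the
# attention mask is derived as `ids > 0`, which assumes the pad token id is 0 (true for BERT vocabularies).
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
batch = [("good morning twitter", "positive"), ("worst day ever", "negative")]  # hypothetical labels
input_ids, attention_mask, labels = collate_twitter(batch, tokenizer, device="cpu")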
def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
    assert os.path.isfile(file_path)
    # Here, we do not cache the features, operating under the assumption
    # that we will soon use fast multithreaded tokenizers from the
    # `tokenizers` repo everywhere =)
    logger.info("Creating features from dataset file at %s", file_path)

    with open(file_path, encoding="utf-8") as f:
        lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

    self.examples = tokenizer.batch_encode_plus(
        lines, add_special_tokens=True, max_length=block_size, truncation=True, pad_to_max_length=True)["input_ids"]
def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
    assert os.path.isfile(file_path + "-x.csv")
    logger.info("Creating features from dataset file at %s", file_path)

    X_lines = list()
    with open(file_path + "-x.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            X_lines.append(row[1])
    X_lines = X_lines[1:]

    Y_lines = list()
    with open(file_path + "-y.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            Y_lines.append(row[1:])

    lines = list()
    for x_line, y_line in zip(X_lines, Y_lines):
        for i in range(3):
            if len(y_line[i].strip()) > 0:
                lines.append(x_line.strip() + " <|continue|> " + y_line[i].strip())
    random.shuffle(lines)

    # self.examples = tokenizer.batch_encode_plus(lines, max_length=block_size)["input_ids"]
    self.examples = []
    for i in lines:
        # pdb.set_trace()
        self.examples.append(tokenizer.encode_plus(i, max_length=block_size)["input_ids"])
def text_to_batch_transformer(claims: List, tokenizer: PreTrainedTokenizer, evidence: List) -> Tuple[List, List]:
    """Turn a piece of text into a batch for a transformer model

    :param claims: the claims to tokenize and encode
    :param tokenizer: the tokenizer to use
    :param evidence: the evidence passages paired with the claims
    :return: a list of IDs and a mask
    """
    # Create the input string; first get target words from each claim
    cands = [[w for w in word_tokenize(c) if w not in stopwords_en and w not in punc] for c in claims]
    # targets = [','.join([w.lower() for w in set(random.sample(cand, min(1, len(cand))) + random.sample(eng_words, 4))]) for cand in cands]
    # Using only candidates
    targets = [','.join([w.lower() for w in set(random.sample(cand, min(5, len(cand))))]) for cand in cands]
    # # First get 5 possible real candidates and 25 noise candidates
    # potential_words = [[w.lower() for w in set(random.sample(cand, min(5, len(cand))) + random.sample(eng_words, 25))] for cand in cands]
    # # Now randomly select 5 words from this list; we add more possible noise to give the model a better chance at generating good claims
    # # we want the model to just add words when they make sense, not force it to always pick 1-2 words
    # targets = [','.join(random.sample(pw, 5)) for pw in potential_words]
    texts = [f"{target}||{evid}||{claim}" for target, evid, claim in zip(targets, evidence, claims)]
    input_ids = [tokenizer.encode(t, max_length=tokenizer.max_len - 1) + [tokenizer.eos_token_id] for t in texts]
    masks = [[1] * len(i) for i in input_ids]
    return input_ids, masks
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
    assert os.path.isfile(file_path)
    # Here, we do not cache the features, operating under the assumption
    # that we will soon use fast multithreaded tokenizers from the
    # `tokenizers` repo everywhere =)
    self.tokenizer = tokenizer
    logger.info("Creating features from dataset file at %s", file_path)

    csv_data = self.read_csv(file_path)
    # with open(file_path, encoding="utf-8") as f:
    #     lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
    logger.info('Finished reading the csv file.')

    lines = [' '.join(_['text1']) for _ in csv_data]
    batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
    self.examples = batch_encoding["input_ids"]
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, sep_token="<sep>"):
    logger.info("Creating features from dataset file at %s", file_path)

    with open(file_path, encoding="utf-8") as f:
        lines = []
        i = 0
        for line in f:
            line = json.loads(line)
            question_text = line.get("question_text")
            answer_text = line.get("answer_text")
            example_text = question_text + " " + sep_token + " " + answer_text
            # if i < 10:
            #     logger.info(f"{example_text}")
            lines.append(example_text)
            i += 1

    batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
    self.examples = batch_encoding["input_ids"]
def __init__(self, domain_list: List[str], data_dir: Path, tokenizer: PreTrainedTokenizer, is_train: bool = True):
    '''
    Initialize the fudan review dataset

    Parameters
    ---
    domain_list: list of domains to be included
    data_dir: path to the data directory
    tokenizer: PreTrainedTokenizer from one of the transformer models
    is_train: is this for the train dataset or the test dataset?
    '''
    super().__init__()
    self.is_train = is_train
    self.data_dir = data_dir
    self.domain_list = domain_list

    df = self._prepare_df()
    batch = tokenizer.batch_encode_plus(df['text'].tolist(),
                                        max_length=tokenizer.max_len,
                                        pad_to_max_length=True,
                                        return_tensors='pt',
                                        return_attention_masks=True)
    self.x = batch['input_ids']
    self.attn_mask = batch['attention_mask']
    self.domains = torch.tensor(df['domain'].tolist())
    self.y = torch.tensor(df['label'].tolist())
def generate_dummy_inputs(
    self,
    tokenizer: PreTrainedTokenizer,
    batch_size: int = -1,
    seq_length: int = -1,
    is_pair: bool = False,
    framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
    """
    Generate inputs to provide to the ONNX exporter for the specific framework

    Args:
        tokenizer: The tokenizer associated with this model configuration
        batch_size: The batch size (int) to export the model for (-1 means dynamic axis)
        seq_length: The sequence length (int) to export the model for (-1 means dynamic axis)
        is_pair: Indicate if the input is a pair (sentence 1, sentence 2)
        framework: The framework (optional) the tokenizer will generate tensor for

    Returns:
        Mapping[str, Tensor] holding the kwargs to provide to the model's forward function
    """
    # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
    batch_size = compute_effective_axis_dimension(
        batch_size, fixed_dimension=OnnxConfig.DEFAULT_FIXED_BATCH, num_token_to_add=0
    )
    # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
    token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
    seq_length = compute_effective_axis_dimension(
        seq_length, fixed_dimension=OnnxConfig.DEFAULT_FIXED_SEQUENCE, num_token_to_add=token_to_add
    )
    # Generate dummy inputs according to compute batch and sequence
    dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
    return dict(tokenizer(dummy_input, return_tensors=framework))
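# Hedged usage sketch (not part of the original source): generate_dummy_inputs belongs to an
# OnnxConfig subclass; the concrete config class (MyModelOnnxConfig) and the `model` object
# are hypothetical stand-ins here.
from transformers import AutoTokenizer, TensorType

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
onnx_config = MyModelOnnxConfig(model.config)  # hypothetical subclass wrapping the model config
dummy_inputs = onnx_config.generate_dummy_inputs(tokenizer, framework=TensorType.PYTORCH)
# dummy_inputs maps input names (e.g. "input_ids") to fixed-shape tensors fed to the ONNX exporter.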
def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(directory, "bert_cached_lm_" + str(block_size) + "_" + filename)

    if args.overwrite_cache:
        print(args.overwrite_cache)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Creating features from dataset file at %s", directory)

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]

        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def extract_features(data_file_path: str, split: str, max_seq_length: int, tokenizer: PreTrainedTokenizer,
                     processor: DataProcessor) -> List[NLIExample]:
    """
    Extract features for the given dataset file, using `processor`.
    Returns a list of NLIExample.
    """
    nli_base_dir, _ = os.path.split(data_file_path)
    logger.info(f'About to extract examples in {nli_base_dir} '
                f'with {type(processor)}')
    examples = (processor.get_train_examples(nli_base_dir)
                if split == 'train' else processor.get_dev_examples(nli_base_dir))

    features = []
    available_labels = processor.get_labels()
    for example in tqdm(examples, desc='tokenizing examples'):
        # Optionally translate non-train examples before tokenizing:
        # if split != 'train':
        #     example.text_a = translate(example.text_a)
        #     example.text_b = translate(example.text_b)
        encoded = tokenizer.encode_plus(example.text_a,
                                        example.text_b,
                                        max_length=max_seq_length,
                                        pad_to_max_length=True,
                                        truncation=True)
        encoded['label'] = available_labels.index(example.label)
        encoded['pairID'] = example.guid
        features.append(NLIExample(**encoded))
    return features
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, split_token='<EOD>', block_size=512):
    assert os.path.isfile(file_path)
    # Here, we do not cache the features, operating under the assumption
    # that we will soon use fast multithreaded tokenizers from the
    # `tokenizers` repo everywhere =)
    logger.info("Creating features from dataset file at %s", file_path)

    with open(file_path, encoding="utf-8") as f:
        lines = [line + split_token for line in f.read().split(split_token)
                 if (len(line) > 0 and not line.isspace())]

    # add special tokens which shouldn't be split
    special_tokens_dict = {'cls_token': '<TLDR>', 'eos_token': '<EOD>'}  # , 'additional_special_tokens': ['<EOT>']}
    tokenizer.add_special_tokens(special_tokens_dict)

    self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
    self.examples = [ex for ex in self.examples if tokenizer.encode('<TLDR>')[0] in ex]

    self.labels = []
    max_block = torch.arange(block_size)
    for ex in self.examples:
        # note that this will throw an exception if the token is not in the training example.
        try:
            idx = ex.index(tokenizer.encode('<TLDR>')[0])
        except ValueError as e:
            print("Example does not contain <TLDR> token.")
            print(tokenizer.decode(ex))
            exit()
        mask = (max_block <= idx)[:len(ex)]
        masked_labels = torch.tensor(ex) * ~mask - mask.type(torch.int) * 100  # ignore context when computing loss
        self.labels.append(masked_labels)
def __init__(
    self,
    tokenizer: PreTrainedTokenizer,
    file_path: str,
    block_size: int,
    overwrite_cache=False,
):
    assert os.path.isfile(file_path)

    block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)

    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory,
        "cached_lm_{}_{}_{}".format(
            tokenizer.__class__.__name__,
            str(block_size),
            filename,
        ),
    )

    # Make sure only the first process in distributed training processes the dataset,
    # and the others will use the cache.
    lock_path = cached_features_file + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            logger.info(
                f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start)
        else:
            logger.info(f"Creating features from dataset file at {directory}")

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in blocks of block_size
                self.examples.append(
                    tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            start = time.time()
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start)
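# Hedged usage sketch (not part of the original source): assumes the __init__ above belongs to a
# torch Dataset subclass, here hypothetically named BlockTextDataset, whose __getitem__ returns a
# list or tensor of token ids for one block.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any checkpoint works here
dataset = BlockTextDataset(tokenizer, "train.txt", block_size=512)  # hypothetical class name

def collate(examples):
    # Pad variable-length blocks; fall back to 0 if the tokenizer has no pad token.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    return pad_sequence([torch.tensor(e) for e in examples], batch_first=True, padding_value=pad_id)

loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate)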
def convert_examples_to_features( self, examples: List[InputExample], label_list: List[str], max_seq_length: int, tokenizer: PreTrainedTokenizer, cls_token_at_end=False, cls_token="[CLS]", cls_token_segment_id=1, sep_token="[SEP]", sep_token_extra=False, pad_on_left=False, pad_token=0, pad_token_segment_id=0, pad_token_label_id=-100, sequence_a_segment_id=0, mask_padding_with_zero=True, ) -> List[InputFeatures]: """ Loads a data file into a list of `InputFeatures` `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) """ # TODO clean up all this to leverage built-in features of tokenizers label_map = {label: i for i, label in enumerate(label_list)} features = [] for (ex_index, example) in enumerate(examples): if ex_index % 10_000 == 0: logger.info("Writing example %d of %d", ex_index, len(examples)) tokens = [] label_ids = [] for word, label in zip(example.words, example.labels): word_tokens = tokenizer.tokenize(word) # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space. if len(word_tokens) > 0: tokens.extend(word_tokens) # Use the real label id for the first token of the word, and padding ids for the remaining tokens label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. special_tokens_count = tokenizer.num_special_tokens_to_add() if len(tokens) > max_seq_length - special_tokens_count: tokens = tokens[: (max_seq_length - special_tokens_count)] label_ids = label_ids[: (max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens += [sep_token] label_ids += [pad_token_label_id] if sep_token_extra: # roberta uses an extra separator b/w pairs of sentences tokens += [sep_token] label_ids += [pad_token_label_id] segment_ids = [sequence_a_segment_id] * len(tokens) if cls_token_at_end: tokens += [cls_token] label_ids += [pad_token_label_id] segment_ids += [cls_token_segment_id] else: tokens = [cls_token] + tokens label_ids = [pad_token_label_id] + label_ids segment_ids = [cls_token_segment_id] + segment_ids input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # Zero-pad up to the sequence length. 
padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids label_ids = ([pad_token_label_id] * padding_length) + label_ids else: input_ids += [pad_token] * padding_length input_mask += [0 if mask_padding_with_zero else 1] * padding_length segment_ids += [pad_token_segment_id] * padding_length label_ids += [pad_token_label_id] * padding_length assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length assert len(label_ids) == max_seq_length if ex_index < 5: logger.info("*** Example ***") logger.info("guid: %s", example.guid) logger.info("tokens: %s", " ".join([str(x) for x in tokens])) logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) if "token_type_ids" not in tokenizer.model_input_names: segment_ids = None features.append( InputFeatures( input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids ) )
def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
    block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)

    is_folder = args.data_mapfile is not None

    if not is_folder:
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename)
    else:
        directory = file_path
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size))

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
            self.client_mapping = pickle.load(handle)
    else:
        logger.info("Creating features from dataset file at %s", directory)

        self.examples = []
        self.client_mapping = {}
        sample_id = -1
        user_id = -1

        if not is_folder:
            files = [file_path]
        else:
            files = [os.path.join(file_path, entry.name) for entry in os.scandir(file_path)
                     if '_cached_lm_' not in entry.name]
        # make sure files are ordered
        files = sorted(files)

        for file in files:
            with open(file, encoding="utf-8") as f:
                text = f.read()
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            if len(tokenized_text) > 0:
                user_id += 1
                self.client_mapping[user_id] = []

            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in blocks of block_size
                sample_id += 1
                self.examples.append(
                    tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))
                self.client_mapping[user_id].append(sample_id)
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=-1)
            pickle.dump(self.client_mapping, handle, protocol=-1)

    self.data = self.examples
    self.targets = [0 for i in range(len(self.data))]
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] 
and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def __init__(self, tokenizer: PreTrainedTokenizer, args, dir_path: str, block_size=1024):
    self.examples = []
    tokenizer_class = tokenizer.__class__.__name__
    cached_features_file = os.path.join(
        dir_path, args.model_type + "_cached2_maskedsents3_" + str(block_size) + "_" + tokenizer_class)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Creating features from dataset file at %s", dir_path)
        good_docs = bad_docs = 0
        for filename in os.listdir(dir_path):
            try:
                if not filename.endswith(".json"):
                    continue
                path = os.path.join(dir_path, filename)
                with open(path) as json_file:
                    data = json.load(json_file)
                facts_doc = FactsDoc.Schema().load(data)
                splitter = SentenceSplitter(language='en')
                full_text_sentence_split = splitter.split(text=facts_doc.text)
                sent_one = full_text_sentence_split[START_SENT]
                sent_two = full_text_sentence_split[END_SENT]
                inbetween_text = " ".join(full_text_sentence_split[START_SENT + 1:END_SENT])
                tokenized_sent_one = tokenizer.encode(
                    sent_one, add_special_tokens=False, return_tensors="pt").squeeze(0)
                tokenized_sent_two = tokenizer.encode(
                    sent_two, add_special_tokens=False, return_tensors="pt").squeeze(0)
                tokenized_inbetween_text = tokenizer.encode(
                    inbetween_text, add_special_tokens=False, return_tensors="pt").squeeze(0)
                full_text_tensor = torch.cat(
                    [tokenized_sent_one, tokenized_inbetween_text, tokenized_sent_two], dim=0)
                mask = torch.cat([
                    torch.ones(tokenized_sent_one.size()),
                    torch.zeros(tokenized_inbetween_text.size()),
                    torch.ones(tokenized_sent_two.size())
                ])
                self.examples.append((full_text_tensor, mask))
                good_docs += 1
            except Exception:
                bad_docs += 1
        logger.info("finished creating examples for " + dir_path)
        logger.info(f"docs with exceptions = {bad_docs} from a total of {bad_docs + good_docs}")
        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`
    """
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        choices_features = []
        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
            text_a = context
            if example.question.find("_") != -1:
                # this is for cloze questions
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=max_length,
                return_token_type_ids=True,
            )
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
                    "If you are training ARC and RACE and you are popping question + options, "
                    "you need to try to use a bigger max seq length!"
                )

            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            assert len(input_ids) == max_length
            assert len(attention_mask) == max_length
            assert len(token_type_ids) == max_length
            choices_features.append((input_ids, attention_mask, token_type_ids))

        label = label_map[example.label]

        if ex_index < 2:
            logger.info("*** Example ***")
            logger.info("race_id: {}".format(example.example_id))
            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
                logger.info("label: {}".format(label))

        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))

    return features
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`
    """
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        choices_inputs = []
        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
            text_a = context
            if example.question.find("_") != -1:
                # this is for cloze questions
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=max_length,
                pad_to_max_length=True,
            )
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
                    "If you are training ARC and RACE and you are popping question + options, "
                    "you need to try to use a bigger max seq length!")
            choices_inputs.append(inputs)

        label = label_map[example.label]

        input_ids = [x["input_ids"] for x in choices_inputs]
        attention_mask = ([x["attention_mask"] for x in choices_inputs]
                          if "attention_mask" in choices_inputs[0] else None)
        token_type_ids = ([x["token_type_ids"] for x in choices_inputs]
                          if "token_type_ids" in choices_inputs[0] else None)

        features.append(
            InputFeatures(
                example_id=example.example_id,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            ))

    for f in features[:2]:
        logger.info("*** Example ***")
        logger.info("feature: %s" % f)

    return features
def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, mode, block_size=512):
    assert os.path.isfile(file_path)
    block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        args["cache_dir"], args["model_type"] + "_cached_lm_" + str(block_size) + "_" + filename)

    if os.path.exists(cached_features_file) and (
            (not args["reprocess_input_data"] and not args["no_cache"])
            or (mode == "dev" and args["use_cached_eval_features"] and not args["no_cache"])):
        logger.info(" Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info(" Creating features from dataset file at %s", args["cache_dir"])

        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            text = f.read()

        # tokenizer = ByteLevelBPETokenizer(
        #     "outputs/vocab.json",
        #     "outputs/merges.txt",
        # )
        # tokenizer._tokenizer.post_processor = BertProcessing(
        #     ("</s>", tokenizer.token_to_id("</s>")),
        #     ("<s>", tokenizer.token_to_id("<s>")),
        # )
        # logger.info(" Encoding")
        # tokenized_text = tokenizer.encode(text).ids
        # logger.info(" Encoded")
        # self.examples = [tokenized_text[i : i + block_size] for i in tqdm(range(0, len(tokenized_text) - block_size + 1, block_size))]  # noqa

        tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        tokenized_text_split = [
            tokenized_text[i:i + block_size]
            for i in tqdm(range(0, len(tokenized_text) - block_size + 1, block_size))
        ]

        with Pool(args["process_count"]) as p:
            self.examples = list(
                tqdm(
                    p.imap(tokenizer.build_inputs_with_special_tokens, tokenized_text_split, chunksize=500),
                    total=len(tokenized_text_split),
                    # disable=silent,
                ))

        # for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in blocks of block_size
        #     self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
        # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
        # If your dataset is small, first you should look for a bigger one :-) and second you
        # can change this behavior by adding (model specific) padding.

        logger.info(" Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs model = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model.resize_token_embeddings(len(tokenizer)) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) non_multi_model = model if args.n_gpu > 1: model = torch.nn.DataParallel(non_multi_model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility best_perplexity = float('inf') for i, epoch in enumerate(train_iterator): epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) if args.local_rank != -1: train_sampler.set_epoch(epoch) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.do_eval: file_path = Path(args.data_dir, args.eval_data_file) out_file_path = Path(args.data_dir, "output_" + args.eval_data_file) id_to_json_map = {} with open(file_path, encoding="utf-8") as f: lines = [] i = 0 eval_loss = 0.0 nb_eval_steps = 0 for line in tqdm(f, desc="Evaluating"): out_json = {} line = json.loads(line) example_id = line.get("example_id") question_text = line.get("question_text") prompt_text = question_text + " " + args.sep_token + " " encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt") encoded_prompt = encoded_prompt.to(args.device) output_sequences = non_multi_model.generate( 
input_ids=encoded_prompt, max_length=args.length + len(encoded_prompt[0]), temperature=args.temperature, top_k=args.k, top_p=args.p, repetition_penalty=args.repetition_penalty, do_sample=True, num_return_sequences=args.num_return_sequences, ) if len(output_sequences.shape) > 2: output_sequences.squeeze_() generated_sequences = [] for generated_sequence_idx, generated_sequence in enumerate( output_sequences): # print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1)) # generated_sequence = output_sequences[0] generated_sequence = generated_sequence.tolist() # Decode text text = tokenizer.decode( generated_sequence, clean_up_tokenization_spaces=True) # Remove all text after the stop token if args.stop_token: text = text[:text.find(args.stop_token)] # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing total_sequence = (prompt_text + text[len( tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True) ):]) # print(total_sequence) out_json["journaling_input"], out_json[ "reflection_output"] = total_sequence.split( args.sep_token)[:2] sample_dataset = GenerateTextDataset( tokenizer, total_sequence, args.block_size) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence( examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(sample_dataset) eval_dataloader = DataLoader(sample_dataset, sampler=eval_sampler, batch_size=1, collate_fn=collate) model_lm = model if args.n_gpu > 1: model_lm = torch.nn.DataParallel(model_lm) model_lm.eval() for batch in eval_dataloader: inputs, labels = mask_tokens( batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) with torch.no_grad(): outputs = model_lm(inputs, masked_lm_labels=labels ) if args.mlm else model_lm( inputs, labels=labels) lm_loss = outputs[0] example_loss = lm_loss.mean().item() eval_loss += example_loss nb_eval_steps += 1 perplexity = torch.exp( torch.tensor(example_loss)).item() # print(perplexity) out_json["perplexity"] = perplexity example_id += "-" + str(generated_sequence_idx) id_to_json_map[example_id] = json.dumps( out_json, ensure_ascii=False) # result = {"perplexity": perplexity} eval_loss = eval_loss / nb_eval_steps total_perplexity = torch.exp(torch.tensor(eval_loss)) logger.info(f"total_loss:: {eval_loss}") logger.info( f"total_perplexity:: {torch.exp(torch.tensor(eval_loss))}") if total_perplexity < best_perplexity: logger.info( f"Current best epoch::: {i}, with perplexity:: {total_perplexity}" ) best_perplexity = total_perplexity with open(out_file_path, "w+", encoding="utf-8") as out_file: for _, out_json in id_to_json_map.items(): out_file.write(out_json + "\n") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) return global_step, tr_loss / global_step
def build( cls, dump_db: DumpDB, tokenizer: PreTrainedTokenizer, sentence_tokenizer: SentenceTokenizer, entity_vocab: EntityVocab, output_dir: str, max_seq_length: int, max_entity_length: int, max_mention_length: int, min_sentence_length: int, include_sentences_without_entities: bool, include_unk_entities: bool, pool_size: int, chunk_size: int, max_num_documents: int, ): target_titles = [ title for title in dump_db.titles() if not (":" in title and title.lower().split(":")[0] in ("image", "file", "category")) ] random.shuffle(target_titles) if max_num_documents is not None: target_titles = target_titles[:max_num_documents] max_num_tokens = max_seq_length - 2 # 2 for [CLS] and [SEP] tokenizer.save_pretrained(output_dir) entity_vocab.save(os.path.join(output_dir, ENTITY_VOCAB_FILE)) number_of_items = 0 tf_file = os.path.join(output_dir, DATASET_FILE) options = tf.io.TFRecordOptions( tf.compat.v1.io.TFRecordCompressionType.GZIP) with TFRecordWriter(tf_file, options=options) as writer: with tqdm(total=len(target_titles)) as pbar: initargs = ( dump_db, tokenizer, sentence_tokenizer, entity_vocab, max_num_tokens, max_entity_length, max_mention_length, min_sentence_length, include_sentences_without_entities, include_unk_entities, ) with closing( Pool(pool_size, initializer=WikipediaPretrainingDataset. _initialize_worker, initargs=initargs)) as pool: for ret in pool.imap( WikipediaPretrainingDataset._process_page, target_titles, chunksize=chunk_size): for data in ret: writer.write(data) number_of_items += 1 pbar.update() with open(os.path.join(output_dir, METADATA_FILE), "w") as metadata_file: json.dump( dict( number_of_items=number_of_items, max_seq_length=max_seq_length, max_entity_length=max_entity_length, max_mention_length=max_mention_length, min_sentence_length=min_sentence_length, tokenizer_class=tokenizer.__class__.__name__, language=dump_db.language, ), metadata_file, indent=2, )
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, datasets_cache_dir: str = None, chunk_size: int = 2500, overwrite_cache: bool = False, progress: bool = True): assert os.path.isfile( file_path), f"Input file path {file_path} not found" if datasets_cache_dir is None: datasets_cache_dir = tempfile.mkdtemp() else: found_cache = ( not overwrite_cache and os.path.exists(os.path.join(datasets_cache_dir, 'arr.dat')) and os.path.exists( os.path.join(datasets_cache_dir, 'idx_arr.dat'))) os.makedirs(datasets_cache_dir, exist_ok=True) self.memmap_index_dataset = MemmapIndexDataset( os.path.join(datasets_cache_dir, 'arr.dat'), os.path.join(datasets_cache_dir, 'idx_arr.dat')) if found_cache: logger.info("Found cached features at %s", datasets_cache_dir) self.memmap_index_dataset.load() return else: # Handle overwrite_cache case self.memmap_index_dataset.clear() logger.info("Creating features from dataset file at %s", file_path) eos_token_id = tokenizer.eos_token_id bos_token_id = tokenizer.bos_token_id tokenizer_vocab = tokenizer.get_vocab() if '▁' in tokenizer_vocab: newline_token_id = tokenizer_vocab['▁'] elif '\n' in tokenizer_vocab: newline_token_id = tokenizer_vocab['\n'] usable_block_size = block_size - 2 def add_to_block(ids, block, blocks): """ Add indices to block, if the combined size of indices and block + 2 (bos, eos) exceed block_size, add the block to blocks if block is not empty then try to add indices again. """ size = len(block) + len(ids) if block: size += 1 if size > usable_block_size: blocks.append([bos_token_id] + block + [eos_token_id]) return add_to_block(ids, [], blocks) else: block.append(newline_token_id) block.extend(ids) return block else: if size > usable_block_size: return [] else: return ids skipped_n = 0 lines = [] block = [] with open(file_path, encoding="utf-8") as f: file_size = get_file_size(f) for line in _readline_clean_and_strip(f): lines.append(line) if len(lines) >= chunk_size: batch_encoding = tokenizer(lines, add_special_tokens=False, truncation=True, max_length=usable_block_size + 1) input_ids = batch_encoding["input_ids"] blocks = [] for ids in input_ids: block = add_to_block(ids, block, blocks) if not block: skipped_n += 1 lines = [] self.memmap_index_dataset.add(blocks) if progress: print(f'\rProcessed {f.tell() / file_size * 100:.2f}%', flush=True, end=' ') if len(lines) > 0: batch_encoding = tokenizer(lines, add_special_tokens=False, truncation=True, max_length=usable_block_size + 1) input_ids = batch_encoding["input_ids"] blocks = [] for ids in input_ids: block = add_to_block(ids, block, blocks) if not block: skipped_n += 1 if block: blocks.append(block) self.memmap_index_dataset.add(blocks) if progress: print(f'\rProcessed {f.tell() / file_size * 100:.2f}%', flush=True, end=' ') print() logger.info(f'Skipped {skipped_n}')
def get_tfds( train_file: str, eval_file: str, test_file: str, tokenizer: PreTrainedTokenizer, label_column_id: int, max_seq_length: Optional[int] = None, ): files = {} if train_file is not None: files[datasets.Split.TRAIN] = [train_file] if eval_file is not None: files[datasets.Split.VALIDATION] = [eval_file] if test_file is not None: files[datasets.Split.TEST] = [test_file] ds = datasets.load_dataset("csv", data_files=files) features_name = list(ds[list(files.keys())[0]].features.keys()) label_name = features_name.pop(label_column_id) label_list = list(set(ds[list(files.keys())[0]][label_name])) label2id = {label: i for i, label in enumerate(label_list)} input_names = tokenizer.model_input_names transformed_ds = {} if len(features_name) == 1: for k in files.keys(): transformed_ds[k] = ds[k].map( lambda example: tokenizer.batch_encode_plus( example[features_name[0]], truncation=True, max_length=max_seq_length, padding="max_length"), batched=True, ) elif len(features_name) == 2: for k in files.keys(): transformed_ds[k] = ds[k].map( lambda example: tokenizer.batch_encode_plus( (example[features_name[0]], example[features_name[1]]), truncation=True, max_length=max_seq_length, padding="max_length", ), batched=True, ) def gen_train(): for ex in transformed_ds[datasets.Split.TRAIN]: d = {k: v for k, v in ex.items() if k in input_names} label = label2id[ex[label_name]] yield (d, label) def gen_val(): for ex in transformed_ds[datasets.Split.VALIDATION]: d = {k: v for k, v in ex.items() if k in input_names} label = label2id[ex[label_name]] yield (d, label) def gen_test(): for ex in transformed_ds[datasets.Split.TEST]: d = {k: v for k, v in ex.items() if k in input_names} label = label2id[ex[label_name]] yield (d, label) train_ds = (tf.data.Dataset.from_generator( gen_train, ({k: tf.int32 for k in input_names}, tf.int64), ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])), ) if datasets.Split.TRAIN in transformed_ds else None) if train_ds is not None: train_ds = train_ds.apply( tf.data.experimental.assert_cardinality( len(ds[datasets.Split.TRAIN]))) val_ds = (tf.data.Dataset.from_generator( gen_val, ({k: tf.int32 for k in input_names}, tf.int64), ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])), ) if datasets.Split.VALIDATION in transformed_ds else None) if val_ds is not None: val_ds = val_ds.apply( tf.data.experimental.assert_cardinality( len(ds[datasets.Split.VALIDATION]))) test_ds = (tf.data.Dataset.from_generator( gen_test, ({k: tf.int32 for k in input_names}, tf.int64), ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])), ) if datasets.Split.TEST in transformed_ds else None) if test_ds is not None: test_ds = test_ds.apply( tf.data.experimental.assert_cardinality( len(ds[datasets.Split.TEST]))) return train_ds, val_ds, test_ds, label2id
def __init__(self, tokenizer: PreTrainedTokenizer, split: str, file_path: str,
             block_size: int, overwrite_cache=False, local_rank=-1):
    # The block size is hard-coded to 4096 and overrides the block_size argument.
    block_size = 4096
    self.block_size = 4096
    directory = './processed_files'
    os.makedirs(directory, exist_ok=True)
    cached_features_file = os.path.join(
        directory,
        "cached_lm_{}_{}_{}".format(
            tokenizer.__class__.__name__,
            str(block_size),
            split,
        ),
    )

    with torch_distributed_zero_first(local_rank):
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            # Keep only examples whose two sides both match the block size exactly.
            self.examples = [
                ex for ex in self.examples
                if len(ex[0]) == self.block_size and len(ex[1]) == self.block_size
            ]
            logger.info(
                "Loading features from cached file %s [took %.3f s]",
                cached_features_file, time.time() - start)
        else:
            input_path = './multinews/train.txt.src'
            logger.info("Creating features from dataset file at %s", input_path)
            encoder_func = select_words_to_mask_special_tokens_only_multiple_docs
            self.examples = []
            self.masking_samples = []
            corpus = read_in_train_set(input_path)

            # Each MultiNews sample groups several articles about the same topic.
            ln = []
            for i in range(len(corpus)):
                sample = corpus[i].strip()
                articles = sample.split("story_separator_special_tag")
                ln.append(articles)

            tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
            tokenizer.add_tokens(['</doc-s>'], special_tokens=True)

            stats = []
            while len(stats) < 64 * 25 * 1000:
                for topic in ln:
                    if len(topic) > 2:
                        s = random.sample(topic, len(topic))
                        examp, st = encoder_func(s, tokenizer, block_size)
                        self.examples.append(examp)
                        stats.append(st)

            # Uncomment for creating data for the random baseline
            # while len(self.examples) < 64 * 25 * 1000:
            #     s = random.sample(ln, 10)
            #     curr_false_topic = []
            #     for topic in s:
            #         curr_false_topic.append(random.sample(topic, 1)[0])
            #     examp, st = encoder_func(curr_false_topic, tokenizer, block_size)
            #     self.examples.append(examp)
            #     stats.append(st)

            start = time.time()
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]",
                cached_features_file, time.time() - start)
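# The constructor above only builds self.examples; to use it for training, the class
# presumably also exposes __len__/__getitem__. Since those methods are not shown,
# here is a minimal, hypothetical stand-in that turns a list of id-sequence pairs of
# equal length into tensors, matching the length check performed on the cached
# examples above.
import torch
from torch.utils.data import DataLoader, Dataset


class ExampleListDataset(Dataset):
    """Thin wrapper turning a list of id-sequence pairs into LongTensors."""

    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        first, second = self.examples[i]
        return torch.tensor(first, dtype=torch.long), torch.tensor(second, dtype=torch.long)


# Toy pairs standing in for the cached MultiNews features (block size 4096).
toy = [([1] * 4096, [2] * 4096), ([3] * 4096, [4] * 4096)]
loader = DataLoader(ExampleListDataset(toy), batch_size=2)
first, second = next(iter(loader))
print(first.shape, second.shape)  # torch.Size([2, 4096]) torch.Size([2, 4096])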
def _has_initial_cls_token(tokenizer: PreTrainedTokenizer) -> bool:
    # Some models place the classification token at the end of the sequence (e.g. XLNet)
    # or do not use one at all (e.g. GPT-2, TransfoXL); BERT-style models place [CLS] first.
    tokens = tokenizer.encode('a')
    return tokens[0] == tokenizer.cls_token_id
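# A quick, hedged check of the helper (requires downloading the two tokenizers;
# the commented results reflect how these tokenizers currently build sequences).
from transformers import AutoTokenizer

bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
xlnet_tok = AutoTokenizer.from_pretrained("xlnet-base-cased")

print(_has_initial_cls_token(bert_tok))   # True:  [CLS] a [SEP]
print(_has_initial_cls_token(xlnet_tok))  # False: ▁a <sep> <cls>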
def __init__(
    self,
    tokenizer: PreTrainedTokenizer,
    langs_to_id: dict,
    file_path: str,
    block_size: int,
    fix_ahalf: bool,
    overwrite_cache=False,
    cache_dir: Optional[str] = None,
):
    assert os.path.isfile(file_path), f"Input file path {file_path} not found"

    logger.info("Creating features from dataset file at %s", file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        cache_dir if cache_dir is not None else directory,
        "cached_transfer_lm_{}_{}_{}_{}".format(
            tokenizer.__class__.__name__,
            str(block_size),
            filename,
            'fix_ahalf' if fix_ahalf else 'fix_bhalf',
        ),
    )

    # Make sure only the first process in distributed training processes the dataset,
    # and the others will use the cache.
    lock_path = cached_features_file + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                cache_data = pickle.load(handle)
            self.src = cache_data['src']
            self.src_langids = cache_data['src_langids']
            self.tgt = cache_data['tgt']
            self.tgt_langids = cache_data['tgt_langids']
            self.align = cache_data['align']
            self.for_mlm = cache_data['for_mlm']
            logger.info(
                "Loading features from cached file %s [took %.3f s]",
                cached_features_file, time.time() - start)
        else:
            logger.info("Creating features from dataset file at %s", directory)
            self.src = []
            self.src_langids = []
            self.tgt = []
            self.tgt_langids = []
            self.align = []
            self.for_mlm = []
            cls_token_id = tokenizer.cls_token_id
            sep_token_id = tokenizer.sep_token_id
            with open(file_path, encoding="utf-8") as fin:
                for line in fin:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    line = json.loads(line)

                    src_ids = tokenizer.convert_tokens_to_ids(line['src'])
                    src_langid = langs_to_id[line['src_lang']]
                    assert (len(src_ids) <= block_size
                            and src_ids[0] == cls_token_id
                            and src_ids[-2] == sep_token_id
                            and src_ids[-1] == tokenizer.pad_token_id)

                    tgt_ids = tokenizer.convert_tokens_to_ids(line['tgt'])
                    tgt_langid = langs_to_id[line['tgt_lang']]
                    assert (len(tgt_ids) <= block_size
                            and tgt_ids[0] == cls_token_id
                            and tgt_ids[-1] == sep_token_id)

                    align_index = line['align_index']
                    assert len(tgt_ids) == len(align_index)

                    # Pad source, target, and alignment indices out to block_size;
                    # unused alignment slots point at the final source position.
                    pad_align_index = [len(src_ids) - 1] * block_size
                    pad_align_index[:len(tgt_ids)] = align_index
                    pad_src_ids = [tokenizer.pad_token_id] * block_size
                    pad_src_ids[:len(src_ids)] = src_ids
                    pad_tgt_ids = [tokenizer.pad_token_id] * block_size
                    pad_tgt_ids[:len(tgt_ids)] = tgt_ids

                    self.src.append(pad_src_ids)
                    self.src_langids.append(src_langid)
                    self.tgt.append(pad_tgt_ids)
                    self.tgt_langids.append(tgt_langid)
                    self.align.append(pad_align_index)
                    self.for_mlm.append(False)

                    # Additional monolingual example for MLM, taken from whichever
                    # side is kept fixed.
                    if fix_ahalf:
                        self.src.append(pad_src_ids)
                        self.src_langids.append(src_langid)
                        self.tgt.append(pad_src_ids)
                        self.tgt_langids.append(src_langid)
                        self.align.append(list(range(block_size)))
                        self.for_mlm.append(True)
                    else:
                        self.src.append(pad_tgt_ids)
                        self.src_langids.append(tgt_langid)
                        self.tgt.append(pad_tgt_ids)
                        self.tgt_langids.append(tgt_langid)
                        self.align.append(list(range(block_size)))
                        self.for_mlm.append(True)

            start = time.time()
            with open(cached_features_file, "wb") as handle:
                cache_data = {
                    'src': self.src,
                    'src_langids': self.src_langids,
                    'tgt': self.tgt,
                    'tgt_langids': self.tgt_langids,
                    'align': self.align,
                    'for_mlm': self.for_mlm,
                }
                pickle.dump(cache_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]",
                cached_features_file, time.time() - start)