def __iter__(self):  # iterator to load data
    batch = []
    count = 0
    for line in itertools.islice(self.lines, 1, None):  # skip the first (header) line
        is_next = int(line[0])
        tokens_a = self.tokenize(line[1])
        tokens_b = self.tokenize(line[2])
        truncate_tokens_pair(tokens_a, tokens_b, self.max_len)

        instance = (is_next, tokens_a, tokens_b)
        for proc in self.pipeline:
            instance = proc(instance)

        batch.append(instance)
        count += 1
        if count == self.batch_size:
            # transpose the batch so each field becomes one LongTensor
            batch_tensors = [torch.tensor(x, dtype=torch.long)
                             for x in zip(*batch)]
            yield batch_tensors
            count = 0
            batch = []
    self.f_pos.seek(0)  # rewind the file for the next epoch
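# A minimal, self-contained sketch of the batching pattern in __iter__ above
# (the dataset, tokenizer, and pipeline there are assumed class attributes;
# this toy version only demonstrates the zip(*batch) transpose-and-tensorize
# step, with made-up instances):
import torch

def batched(instances, batch_size):
    batch = []
    for instance in instances:
        batch.append(instance)
        if len(batch) == batch_size:
            # one tuple of per-field lists, then one LongTensor per field
            yield [torch.tensor(x, dtype=torch.long) for x in zip(*batch)]
            batch = []

# toy (is_next, input_ids) instances; the real pipeline emits seven fields
toy = [(1, [101, 7, 102]), (0, [101, 9, 102]),
       (1, [101, 4, 102]), (0, [101, 2, 102])]
for is_next, input_ids in batched(toy, batch_size=2):
    print(is_next.shape, input_ids.shape)  # torch.Size([2]) torch.Size([2, 3])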
def __call__(self, instance):
    is_next, tokens_a, tokens_b = instance

    # -3 for special tokens [CLS], [SEP], [SEP]
    truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

    # Add Special Tokens
    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_mask = [1] * len(tokens)

    # the number of predictions is sometimes less than max_pred when the sequence is short
    n_pred = min(self.max_pred, max(1, int(round(len(tokens) * self.mask_prob))))

    # For masked Language Models
    masked_tokens, masked_pos, tokens = _sample_mask(tokens,
                                                     self.mask_alpha,
                                                     self.mask_beta,
                                                     self.max_gram,
                                                     goal_num_predict=n_pred)
    # masked_tokens -> original values of the masked tokens
    # masked_pos    -> indices of the masked tokens
    # tokens        -> full masked sequence ([CLS] + masked_sentence_A + [SEP] + masked_sentence_B + [SEP])
    masked_weights = [1] * len(masked_tokens)

    # Token Indexing
    input_ids = self.indexer(tokens)
    masked_ids = self.indexer(masked_tokens)
    # For reference, the indexer is equivalent to:
    # def convert_tokens_to_ids(vocab, tokens):
    #     """Converts a sequence of tokens into ids using the vocab."""
    #     return [vocab[token] for token in tokens]

    # Zero Padding
    n_pad = self.max_len - len(input_ids)
    input_ids.extend([0] * n_pad)
    segment_ids.extend([0] * n_pad)
    input_mask.extend([0] * n_pad)

    # Zero Padding for masked target
    if self.max_pred > len(masked_ids):
        masked_ids.extend([0] * (self.max_pred - len(masked_ids)))
    if self.max_pred > len(masked_pos):
        masked_pos.extend([0] * (self.max_pred - len(masked_pos)))
    if self.max_pred > len(masked_weights):
        masked_weights.extend([0] * (self.max_pred - len(masked_weights)))

    return (input_ids, segment_ids, input_mask,
            masked_ids, masked_pos, masked_weights, is_next)
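# Worked example of the n_pred formula above: the number of masked positions
# scales with sequence length but is clamped to [1, max_pred]. Values here
# are illustrative, not from the source config:
max_pred, mask_prob = 20, 0.15
for seq_len in (8, 64, 512):
    n_pred = min(max_pred, max(1, int(round(seq_len * mask_prob))))
    print(seq_len, '->', n_pred)  # 8 -> 1, 64 -> 10, 512 -> 20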
def __call__(self, instance):
    label, tokens_a, tokens_b = instance

    # -3 special tokens for [CLS] text_a [SEP] text_b [SEP]
    # -2 special tokens for [CLS] text_a [SEP]
    _max_len = self.max_len - 3 if tokens_b else self.max_len - 2
    truncate_tokens_pair(tokens_a, tokens_b, _max_len)

    # Add Special Tokens
    tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
    tokens_b = tokens_b + ['[SEP]'] if tokens_b else []

    return (label, tokens_a, tokens_b)
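# truncate_tokens_pair is used throughout these pipelines but is not defined
# in this section. A sketch of the usual pytorchic-bert-style behavior (an
# assumption, not the verbatim source): pop from the longer list, in place,
# until the pair fits within max_len.
def truncate_tokens_pair(tokens_a, tokens_b, max_len):
    while len(tokens_a) + len(tokens_b) > max_len:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()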
def __call__(self, instance):
    is_next, tokens_a, tokens_b = instance

    # -3 for special tokens [CLS], [SEP], [SEP]
    truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

    # Add Special Tokens
    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_mask = [1] * len(tokens)

    # For masked Language Models
    masked_tokens, masked_pos = [], []
    # the number of predictions is sometimes less than max_pred when the sequence is short
    n_pred = min(self.max_pred, max(1, int(round(len(tokens) * self.mask_prob))))
    # candidate positions of masked tokens
    cand_pos = [i for i, token in enumerate(tokens)
                if token != '[CLS]' and token != '[SEP]']
    shuffle(cand_pos)
    for pos in cand_pos[:n_pred]:
        masked_tokens.append(tokens[pos])
        masked_pos.append(pos)
        if rand() < 0.8:  # 80%: replace with [MASK]
            tokens[pos] = '[MASK]'
        elif rand() < 0.5:  # 10% (0.2 * 0.5): replace with a random word
            tokens[pos] = get_random_word(self.vocab_words)
        # remaining 10%: keep the original token
    # when n_pred < max_pred, we only calculate loss within n_pred
    masked_weights = [1] * len(masked_tokens)

    # Token Indexing
    input_ids = self.indexer(tokens)
    masked_ids = self.indexer(masked_tokens)

    # Zero Padding
    n_pad = self.max_len - len(input_ids)
    input_ids.extend([0] * n_pad)
    segment_ids.extend([0] * n_pad)
    input_mask.extend([0] * n_pad)

    # Zero Padding for masked target
    if self.max_pred > n_pred:
        n_pad = self.max_pred - n_pred
        masked_ids.extend([0] * n_pad)
        masked_pos.extend([0] * n_pad)
        masked_weights.extend([0] * n_pad)

    return (input_ids, segment_ids, input_mask,
            masked_ids, masked_pos, masked_weights, is_next)
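# The nested rand() checks above implement BERT's 80/10/10 rule:
# P([MASK]) = 0.8, P(random word) = 0.2 * 0.5 = 0.1, P(keep) = 0.1.
# A quick empirical check of that arithmetic:
from random import random as rand
from collections import Counter

counts = Counter()
for _ in range(100_000):
    if rand() < 0.8:
        counts['[MASK]'] += 1
    elif rand() < 0.5:
        counts['random'] += 1
    else:
        counts['keep'] += 1
print(counts)  # roughly 80% / 10% / 10%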
def __call__(self, instance):
    label, tokens_a, tokens_b = instance

    # -3 special tokens for [CLS] text_a [SEP] text_b [SEP]
    # -2 special tokens for [CLS] text_a [SEP]
    _max_len = self.max_len - 3 if tokens_b else self.max_len - 2
    truncate_tokens_pair(tokens_a, tokens_b, _max_len)

    # No special tokens are added here; tokens_a is passed through unchanged
    # and tokens_b is dropped.
    return (label, tokens_a)
def __call__(self, instance):
    is_next, tokens_a, tokens_b = instance

    # -3 for special tokens [CLS], [SEP], [SEP]
    truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

    # Add Special Tokens
    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_mask = [1] * len(tokens)

    # the number of predictions is sometimes less than max_pred when the sequence is short
    n_pred = min(self.max_pred, max(1, int(round(len(tokens) * self.mask_prob))))

    # keep the un-masked ids as the reconstruction target
    original_ids = self.indexer(tokens)

    # For masked Language Models
    masked_tokens, masked_pos, tokens = _sample_mask(tokens,
                                                     self.mask_alpha,
                                                     self.mask_beta,
                                                     self.max_gram,
                                                     goal_num_predict=n_pred)
    masked_weights = [1] * len(masked_tokens)

    # Token Indexing
    input_ids = self.indexer(tokens)
    masked_ids = self.indexer(masked_tokens)

    # Zero Padding
    n_pad = self.max_len - len(input_ids)
    original_ids.extend([0] * n_pad)
    input_ids.extend([0] * n_pad)
    segment_ids.extend([0] * n_pad)
    input_mask.extend([0] * n_pad)

    # Zero Padding for masked target
    if self.max_pred > len(masked_ids):
        masked_ids.extend([0] * (self.max_pred - len(masked_ids)))
    if self.max_pred > len(masked_pos):
        masked_pos.extend([0] * (self.max_pred - len(masked_pos)))
    if self.max_pred > len(masked_weights):
        masked_weights.extend([0] * (self.max_pred - len(masked_weights)))

    # Note: the author's implementation is not exactly the same as the
    # original BERT model, since masked_ids contains only the un-masked tokens.
    return (input_ids, segment_ids, input_mask, masked_ids,
            masked_pos, masked_weights, is_next, original_ids)
def __call__(self, data):
    is_next, tokens_a, tokens_b = data

    # -3 for special tokens [CLS], [SEP], [SEP]
    truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

    # Add Special Tokens
    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_mask = [1] * len(tokens)

    # For masked Language Models
    masked_tokens, masked_pos = [], []
    n_pred = min(self.max_pred, max(1, int(round(len(tokens) * self.mask_prob))))
    cand_pos = [i for i, token in enumerate(tokens)
                if token != '[CLS]' and token != '[SEP]']
    shuffle(cand_pos)
    for pos in cand_pos[:n_pred]:
        masked_tokens.append(tokens[pos])
        masked_pos.append(pos)
        if rand() < 0.8:  # 80%: replace with [MASK]
            tokens[pos] = '[MASK]'
        elif rand() < 0.5:  # 10%: replace with a random word
            tokens[pos] = get_random_word(self.indexer.vocab)
    masked_weights = [1] * len(masked_tokens)

    # Token Indexing
    input_ids = self.indexer.convert_tokens_to_ids(tokens)
    masked_ids = self.indexer.convert_tokens_to_ids(masked_tokens)

    # Zero Padding
    n_pad = self.max_len - len(input_ids)
    input_ids.extend([0] * n_pad)
    segment_ids.extend([0] * n_pad)
    input_mask.extend([0] * n_pad)

    # Zero Padding for masked target
    if self.max_pred > n_pred:
        n_pad = self.max_pred - n_pred
        masked_ids.extend([0] * n_pad)
        masked_pos.extend([0] * n_pad)
        masked_weights.extend([0] * n_pad)

    return (input_ids, segment_ids, input_mask,
            masked_ids, masked_pos, masked_weights, is_next)
def __call__(self, tokens_a):
    # -2 for special tokens [CLS], [SEP]
    truncate_tokens_pair(tokens_a, [], self.max_len - 2)

    # Add Special Tokens
    tokens = ['[CLS]'] + tokens_a + ['[SEP]']
    token_type_ids = [0] * self.max_len
    attention_mask = [1] * len(tokens)
    original_attention_mask = attention_mask.copy()

    # ElectraGenerator labels: -100 marks an unmasked position; any other
    # value is the original id of the masked token at that position.
    g_label = [-100] * self.max_len

    # Keep the original input ids as the ElectraDiscriminator labels.
    original_input_ids = self.indexer(tokens)

    # For masked Language Models
    # the number of predictions is sometimes less than max_pred when the sequence is short
    n_pred = min(self.max_pred, max(1, int(round(len(tokens) * self.mask_prob))))
    # candidate positions of masked tokens
    cand_pos = [i for i, token in enumerate(tokens)
                if token != '[CLS]' and token != '[SEP]']
    shuffle(cand_pos)
    for pos in cand_pos[:n_pred]:
        attention_mask[pos] = 0
        g_label[pos] = self.indexer(tokens[pos])[0]  # indexer returns a one-element list
        tokens[pos] = '[MASK]'

    # Token Indexing
    input_ids = self.indexer(tokens)

    # Zero Padding
    n_pad = self.max_len - len(input_ids)
    input_ids.extend([0] * n_pad)
    attention_mask.extend([0] * n_pad)

    return (input_ids, attention_mask, token_type_ids,
            g_label, original_input_ids, original_attention_mask)
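# Why -100 for g_label: PyTorch's CrossEntropyLoss ignores targets equal to
# ignore_index (default -100), so unmasked positions contribute no generator
# loss. A minimal check with illustrative shapes:
import torch
import torch.nn as nn

logits = torch.randn(4, 10)                  # 4 positions, vocab size 10
g_label = torch.tensor([-100, 3, -100, 7])   # only positions 1 and 3 are masked
loss = nn.CrossEntropyLoss()(logits, g_label)
print(loss)  # averaged over the two masked positions only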
def __call__(self, instance):
    (input_tokens, input_pos, input_dep,
     target_tokens, target_pos, target_dep) = instance

    # -3 for special tokens [CLS], [SEP], [SEP]
    truncate_tokens_pair(input_tokens, target_tokens, self.max_len - 3)
    truncate_tokens_pair(input_pos, target_pos, self.max_len - 3)
    truncate_tokens_pair(input_dep, target_dep, self.max_len - 3)
    target_tokens = truncate_tokens(target_tokens, self.max_len)
    target_pos = truncate_tokens(target_pos, self.max_len)
    target_dep = truncate_tokens(target_dep, self.max_len)

    # Add Special Tokens
    origin_word_tokens = (['[CLS]'] + input_tokens + ['[SEP]']
                          + target_tokens + ['[SEP]'])
    # With probability 0.5, replace the entire target side with [MASK] tokens.
    if rand() < 0.5:
        word_tokens = origin_word_tokens
    else:
        word_tokens = (['[CLS]'] + input_tokens + ['[SEP]']
                       + ['[MASK]'] * len(target_tokens) + ['[SEP]'])
    pos_tokens = ['[CLS]'] + input_pos + ['[SEP]'] + target_pos + ['[SEP]']
    dep_tokens = ['[CLS]'] + input_dep + ['[SEP]'] + target_dep + ['[SEP]']

    input_segment_ids = ([0] * (len(input_tokens) + 2)
                         + [1] * (len(target_tokens) + 1))
    input_mask = [1] * len(word_tokens)
    target_mask = [1] * (len(target_tokens) + 1)
    input_len = len(input_tokens) + 2
    target_len = len(target_tokens) + 1

    input_word_ids, input_pos_ids, input_dep_ids = self.indexer(
        word_tokens, pos_tokens, dep_tokens)
    origin_input_word_ids, _, _ = self.indexer(origin_word_tokens, [], [])
    target_word_ids, target_pos_ids, target_dep_ids = self.indexer(
        target_tokens + ['[SEP]'], target_pos + ['[SEP]'], target_dep + ['[SEP]'])

    # Zero Padding
    input_n_pad = self.max_len - len(input_word_ids)
    origin_input_word_ids.extend([0] * input_n_pad)
    input_word_ids.extend([0] * input_n_pad)
    input_pos_ids.extend([0] * input_n_pad)
    input_dep_ids.extend([0] * input_n_pad)
    input_segment_ids.extend([0] * input_n_pad)
    input_mask.extend([0] * input_n_pad)

    target_n_pad = self.max_len - len(target_word_ids)
    target_word_ids.extend([0] * target_n_pad)
    target_pos_ids.extend([0] * target_n_pad)
    target_dep_ids.extend([0] * target_n_pad)
    target_mask.extend([0] * target_n_pad)

    return (origin_input_word_ids, input_word_ids, input_pos_ids,
            input_dep_ids, input_segment_ids, input_mask,
            target_word_ids, target_pos_ids, target_dep_ids,
            target_mask, input_len, target_len)
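# The rand() < 0.5 branch above produces one of two encoder inputs: the real
# target tokens, or an all-[MASK] target side that forces the model to
# reconstruct the summary rather than copy it. An illustration with toy
# tokens (not from the source data):
input_tokens, target_tokens = ['the', 'cat', 'sat'], ['cat', 'sat']
teacher_forced = ['[CLS]'] + input_tokens + ['[SEP]'] + target_tokens + ['[SEP]']
fully_masked = ['[CLS]'] + input_tokens + ['[SEP]'] + ['[MASK]'] * len(target_tokens) + ['[SEP]']
print(teacher_forced)  # ['[CLS]', 'the', 'cat', 'sat', '[SEP]', 'cat', 'sat', '[SEP]']
print(fully_masked)    # ['[CLS]', 'the', 'cat', 'sat', '[SEP]', '[MASK]', '[MASK]', '[SEP]']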
def __call__(self, instance):
    (input_tokens, input_pos, input_dep,
     target_tokens, target_pos, target_dep) = instance

    # -3 for special tokens [CLS], [SEP], [SEP]
    truncate_tokens_pair(input_tokens, target_tokens, self.max_len - 3)
    truncate_tokens_pair(input_pos, target_pos, self.max_len - 3)
    truncate_tokens_pair(input_dep, target_dep, self.max_len - 3)
    target_tokens = truncate_tokens(target_tokens, self.max_len)
    target_pos = truncate_tokens(target_pos, self.max_len)
    target_dep = truncate_tokens(target_dep, self.max_len)

    # Add Special Tokens
    word_tokens = ['[CLS]'] + input_tokens + ['[SEP]'] + target_tokens + ['[SEP]']
    pos_tokens = ['[CLS]'] + input_pos + ['[SEP]'] + target_pos + ['[SEP]']
    dep_tokens = ['[CLS]'] + input_dep + ['[SEP]'] + target_dep + ['[SEP]']

    input_segment_ids = ([0] * (len(input_tokens) + 2)
                         + [1] * (len(target_tokens) + 1))
    input_mask = [1] * len(word_tokens)
    target_mask = [1] * len(target_tokens)

    # For masked Language Models
    masked_word_tokens, masked_pos_tokens, masked_dep_tokens, masked_pos = [], [], [], []
    # the number of predictions is sometimes less than max_pred when the sequence is short
    n_pred = min(self.max_pred, max(1, int(round(len(word_tokens) * self.mask_prob))))
    # candidate positions of masked tokens; unlike the pipelines above, [SEP]
    # stays maskable so the model learns to detect the summary boundary
    cand_pos = [i for i, token in enumerate(word_tokens) if token != '[CLS]']
    shuffle(cand_pos)
    for pos in cand_pos[:n_pred]:
        masked_word_tokens.append(word_tokens[pos])
        masked_pos_tokens.append(pos_tokens[pos])
        masked_dep_tokens.append(dep_tokens[pos])
        masked_pos.append(pos)
        if rand() < 0.8:  # 80%
            word_tokens[pos] = '[MASK]'
            pos_tokens[pos] = '[MASK]'
            dep_tokens[pos] = '[MASK]'
        # random-word replacement is disabled here:
        # elif rand() < 0.5:  # 10%
        #     word_tokens[pos] = get_random_word(self.vocab_words)
        #     pos_tokens[pos] = get_random_word(self.vocab_pos)
        #     dep_tokens[pos] = get_random_word(self.vocab_dep)
    # when n_pred < max_pred, we only calculate loss within n_pred
    masked_weights = [1] * len(masked_pos_tokens)
    # (a disabled experiment also replaced the whole right-hand/summary side
    # with [MASK] at p=0.1)

    input_word_ids, input_pos_ids, input_dep_ids = self.indexer(
        word_tokens, pos_tokens, dep_tokens)
    masked_word_ids, masked_pos_ids, masked_dep_ids = self.indexer(
        masked_word_tokens, masked_pos_tokens, masked_dep_tokens)
    target_word_ids, target_pos_ids, target_dep_ids = self.indexer(
        target_tokens, target_pos, target_dep)

    # Zero Padding
    input_n_pad = self.max_len - len(input_word_ids)
    input_word_ids.extend([0] * input_n_pad)
    input_pos_ids.extend([0] * input_n_pad)
    input_dep_ids.extend([0] * input_n_pad)
    input_segment_ids.extend([0] * input_n_pad)
    input_mask.extend([0] * input_n_pad)

    target_n_pad = self.max_len - len(target_word_ids)
    target_word_ids.extend([0] * target_n_pad)
    target_pos_ids.extend([0] * target_n_pad)
    target_dep_ids.extend([0] * target_n_pad)
    target_mask.extend([0] * target_n_pad)

    # Zero Padding for masked target
    if self.max_pred > n_pred:
        n_pad = self.max_pred - n_pred
        masked_word_ids.extend([0] * n_pad)
        masked_pos_ids.extend([0] * n_pad)
        masked_dep_ids.extend([0] * n_pad)
        masked_pos.extend([0] * n_pad)
        masked_weights.extend([0] * n_pad)

    return (input_word_ids, input_pos_ids, input_dep_ids, input_segment_ids,
            input_mask, masked_word_ids, masked_pos_ids, masked_dep_ids,
            masked_pos, masked_weights, target_word_ids, target_pos_ids,
            target_dep_ids, target_mask)
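# How masked_pos and masked_weights are typically consumed downstream (the
# usual pytorchic-bert pretraining pattern; shapes and values here are
# illustrative assumptions): gather hidden states at the masked positions,
# then weight the per-position loss so padded prediction slots contribute
# nothing.
import torch

h = torch.randn(2, 8, 16)                          # (batch, seq_len, hidden)
masked_pos = torch.tensor([[1, 4, 0], [2, 5, 0]])  # zero-padded, max_pred = 3
masked_weights = torch.tensor([[1., 1., 0.], [1., 1., 0.]])

pos = masked_pos.unsqueeze(-1).expand(-1, -1, h.size(-1))
h_masked = torch.gather(h, 1, pos)                 # (batch, max_pred, hidden)
per_pos_loss = torch.randn(2, 3).abs()             # stand-in for per-position CE
loss = (per_pos_loss * masked_weights).sum() / masked_weights.sum()
print(h_masked.shape, loss.item())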