def get_feature_from_data(tokenizer, maxlen, input, previous, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='noop', **kwargs):
    feature_dict_list = []
    pred_len = len(tokenizer.convert_tokens_to_ids(target)) if target is not None else len(previous)
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 3 - pred_len - reserved_len, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        t_input.extend(previous)
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        target_start = len(t_input_id) - 1
        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen
        if target is not None:
            t_input_id.extend(tokenizer.convert_tokens_to_ids(target))
            tokenized_target_id = [-1] * target_start
            tokenized_target_id.extend(tokenizer.convert_tokens_to_ids(target + [tok.tok_sep(tokenizer)]))
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
            tokenized_ntarget = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ntarget))
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenized_ntarget)
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id
        mask_id = [1] * len(t_input_id)
        t_input_id.extend(tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) * (maxlen - len(t_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = t_input_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        feature_dict_list.append(row_dict)
    return feature_dict_list
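# Usage sketch for the function above: a minimal training call, assuming a Hugging
# Face BertTokenizer, that the `tok` helpers used above are importable here, and an
# illustrative demo name (`_example_clm_usage` is not part of the original code).
def _example_clm_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    feats = get_feature_from_data(tokenizer, maxlen=32, input='how are you',
                                  previous=[], target=tokenizer.tokenize('fine thanks'))
    row = feats[0]
    # 'input' holds the padded context ids, 'target' is -1 everywhere except the
    # positions being predicted, and 'start' marks where prediction begins.
    return row['input'], row['target'], row['start']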
from sklearn.preprocessing import MultiLabelBinarizer


def get_feature_from_data(tokenizer, maxlen, tasks, task, input, target=None, handle_exceed='slide', **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        row_dict['task'] = task
        input_token = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        tokenized_input_id = tokenizer.convert_tokens_to_ids(input_token)
        mask_id = [1] * len(tokenized_input_id)
        tokenized_input_id.extend([tokenizer.pad_token_id] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['mask'] = mask_id
        row_dict['target'] = [-1]
        if target is not None:
            if 'multi_label' in task:
                mlb = MultiLabelBinarizer(classes=tasks[task])
                tokenize_label = mlb.fit_transform([target])
            else:
                tokenize_label = [tasks[task].index(target[0])]
            row_dict['target'] = tokenize_label
        feature_dict_list.append(row_dict)
    return feature_dict_list
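# Usage sketch for the classification variant above, assuming a BertTokenizer and the
# `tok` helpers are available; the task table and demo name are illustrative only.
def _example_clas_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tasks = {'sentiment': ['neg', 'pos'], 'topic_multi_label': ['biz', 'sport', 'tech']}
    # single-label: target is a list whose first element is the label string
    single = get_feature_from_data(tokenizer, 64, tasks, 'sentiment', 'good movie', target=['pos'])
    # multi-label: the task name contains 'multi_label' and target is the label list
    multi = get_feature_from_data(tokenizer, 64, tasks, 'topic_multi_label',
                                  'stocks and football', target=['biz', 'sport'])
    return single[0]['target'], multi[0]['target']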
def get_feature_from_data(tokenizer, maxlen, input, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='start_slice', add_end_tok=True, **kwargs):
    feature_dict_list = []
    tokenized_target = tokenizer.tokenize(target) if target is not None else []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 3 - len(tokenized_target), handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep and prediction end sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input[:maxlen - reserved_len - 3] + [tok.tok_sep(tokenizer)]
        row_dict['target'] = [-1] * maxlen
        row_dict['target_once'] = [-1] * maxlen
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        target_start = len(tokenized_input_id)
        target_end = maxlen
        target_length = target_end - target_start
        if target is not None:
            t_target = list(tokenized_target)  # copy so repeated slices do not accumulate sep tokens
            if add_end_tok:
                t_target += [tok.tok_sep(tokenizer)]
            tokenized_target_id = []
            tokenized_target_once_id = [-1] * len(tokenized_input)
            target_ids = tokenizer.convert_tokens_to_ids(t_target)
            target_length = len(target_ids)
            tokenized_target_id.extend(target_ids)
            tokenized_target_once_id.extend(target_ids)
            target_end = len(tokenized_target_id) - 1
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            tokenized_target_once_id.extend([-1] * (maxlen - len(tokenized_target_once_id)))
            row_dict['target'] = tokenized_target_id
            row_dict['target_once'] = tokenized_target_once_id
        input_length = min(maxlen, target_start * 3)
        tokenized_input_id.extend([tokenizer.mask_token_id] * (maxlen - len(tokenized_input_id)))
        mask_id = [1] * input_length
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        row_dict['end'] = target_end
        row_dict['input_length'] = input_length
        row_dict['target_length'] = target_length
        feature_dict_list.append(row_dict)
    return feature_dict_list
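# Usage sketch for the one-shot generation variant above (with the extra 'target_once'
# and length fields), assuming a BertTokenizer and the `tok` helpers; the demo name is
# illustrative and not part of the original code.
def _example_once_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    row = get_feature_from_data(tokenizer, maxlen=32, input='how are you',
                                target='fine thanks')[0]
    # 'start'/'end' bound the target span, 'input_length' caps the visible prefix, and
    # the rest of 'input' is filled with [MASK] ids to be generated in one pass.
    return row['start'], row['end'], row['input_length'], row['target_length']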
def get_feature_from_data(tokenizer, maxlen, input, previous, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='noop', **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2 - len(previous) - 1, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + \
                  t_input[:maxlen - reserved_len - 2] + \
                  [tok.tok_sep(tokenizer)]
        t_input.extend(previous)
        t_input.append(tok.tok_mask(tokenizer))
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        mask_id = [1] * len(t_input)
        target_start = len(t_input_id) - 1
        target_end = maxlen
        t_input_id.extend([0] * (maxlen - len(t_input_id)))
        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen
        if target is not None:
            tokenized_target_id = [-1] * target_start
            tokenized_target_id.append(tokenizer.convert_tokens_to_ids(target)[-1])
            target_end = len(tokenized_target_id) - 1
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
            tokenized_ntarget = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ntarget))
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenized_ntarget)
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id = [0] * len(t_input)
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = t_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        row_dict['end'] = target_end
        feature_dict_list.append(row_dict)
    return feature_dict_list
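# Usage sketch for the masked one-step variant above: only the trailing [MASK] token
# is predicted at each step. Assumes a BertTokenizer, that `tok.tok_mask` returns the
# tokenizer's mask token, and an illustrative demo name.
def _example_mask_step_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    row = get_feature_from_data(tokenizer, maxlen=32, input='how are you',
                                previous=tokenizer.tokenize('fine'))[0]
    # row['start'] is the index of the [MASK] token whose distribution gives the next
    # token; 'type' switches from 0 to 1 where padding begins.
    return row['start'], row['input'][row['start']] == tokenizer.mask_token_id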
def get_feature_from_data(tokenizer, maxlen, input, target=None, handle_exceed='start_slice', **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        row_dict['target'] = [-1] * maxlen
        if target is not None:
            tokenized_target = []
            targets_pointer = 0
            for tok_pos, text in enumerate(tokenized_input):
                if text == tok.tok_mask(tokenizer):
                    # 1 for the chosen option, 0 for the other mask positions
                    tok_target = 1 if targets_pointer == int(target) else 0
                    tokenized_target.append(tok_target)
                    targets_pointer += 1
                else:
                    tokenized_target.append(-1)
            tokenized_target.extend([-1] * (maxlen - len(tokenized_target)))
            row_dict['target'] = tokenized_target
        target_pos_list = []
        for tok_pos, text in enumerate(tokenized_input):
            if text == tok.tok_mask(tokenizer):
                target_pos_list.append(tok_pos)
        target_pos_list.extend([0] * (4 - len(target_pos_list)))
        if len(target_pos_list) != 4:  # skip samples with more than four options
            continue
        row_dict['target_pos'] = target_pos_list
        mask_id = [1] * len(tokenized_input)
        type_id = [0] * len(tokenized_input)
        tokenized_input_id.extend(
            [tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)])[0]] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        feature_dict_list.append(row_dict)
    return feature_dict_list
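# Usage sketch for the multiple-choice variant above. The input format (a [MASK] token
# marking each option) and the demo name are assumptions for illustration; a
# BertTokenizer and the `tok` helpers are assumed available.
def _example_mcq_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    text = 'capital of france [SEP] [MASK] paris [MASK] rome [MASK] berlin [MASK] tokyo'
    row = get_feature_from_data(tokenizer, maxlen=64, input=text, target='0')[0]
    # 'target_pos' lists the four [MASK] positions; 'target' is 1 at the chosen option,
    # 0 at the other [MASK] positions, and -1 elsewhere.
    return row['target_pos'], [row['target'][p] for p in row['target_pos']]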
def get_feature_from_data(tokenizer, maxlen, input, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='start_slice', add_end_tok=True, **kwargs):
    feature_dict_list = []
    tokenized_target = tokenizer.tokenize(target) if target is not None else []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 3 - len(tokenized_target), handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep and prediction end sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input[:maxlen - reserved_len - 3] + [tok.tok_sep(tokenizer)]
        mask_id = [1] * len(tokenized_input)
        type_id = [0] * len(tokenized_input)
        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        target_start = len(tokenized_input_id)
        if target is not None:
            t_target = list(tokenized_target)  # copy so repeated slices do not accumulate sep tokens
            if add_end_tok:
                t_target += [tok.tok_sep(tokenizer)]
            tokenized_target_id = [-1] * len(tokenized_input)
            tokenized_target_id.extend(tokenizer.convert_tokens_to_ids(t_target))
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None:
            tokenized_ntarget = tokenizer.tokenize(ntarget)
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenizer.convert_tokens_to_ids(tokenized_ntarget))
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            row_dict['ntarget'] = tokenized_ntarget_id
        tokenized_input_id.extend([tokenizer.mask_token_id] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        feature_dict_list.append(row_dict)
    return feature_dict_list
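# Usage sketch for the variant above with a negative target: 'ntarget' carries the
# token ids of an undesired output for likelihood penalties. Assumes a BertTokenizer
# and the `tok` helpers; the demo name is illustrative.
def _example_once_neg_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    row = get_feature_from_data(tokenizer, maxlen=32, input='how are you',
                                target='fine thanks', ntarget='go away')[0]
    # Both label rows are -1 outside the answer span, which starts at row['start'].
    return row['target'][row['start']], row['ntarget'][row['start']]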
def preprocessing_data(item, tokenizer, maxlen=512, handle_exceed='start_slice',
                       likelihood=['none', 'pos', 'neg', 'both'], reserved_len=0, **kwargs):
    likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
    tasks, task, input, targets = item
    p_target, n_target = targets
    input = input.strip()
    tokenized_target = tokenizer.tokenize(" ".join(p_target))
    param_dict = {
        'tokenizer': tokenizer,
        'maxlen': maxlen,
        'handle_exceed': handle_exceed,
        'reserved_len': reserved_len
    }

    if "neg" in likelihood or 'both' in likelihood:
        # formatting neg data in csv
        if n_target is None:
            ntext_arr = [tokenizer.convert_tokens_to_string([tok.tok_begin(tokenizer)] + tokenized_target)]
        elif "[SEP]" in n_target:
            ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
        else:
            ntext_arr = [n_target.strip()]
        for neg_text in ntext_arr:
            yield get_feature_from_data, {**{'input': input, 'previous': [], 'target': tokenized_target,
                                             'ntarget': neg_text}, **param_dict}
    else:
        yield get_feature_from_data, {**{'input': input, 'previous': [], 'target': tokenized_target,
                                         'ntarget': None}, **param_dict}

    # whole sentence masking
    if 'pos' in likelihood:
        yield once.get_feature_from_data, {**{'input': input, 'target': " ".join(p_target)}, **param_dict}
    elif 'both' in likelihood:
        # formatting neg data in csv
        if n_target is None:
            ntext_arr = []
        elif "[SEP]" in n_target:
            ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
        else:
            ntext_arr = [n_target.strip()]
        for neg_text in ntext_arr:
            yield once.get_feature_from_data, {**{'input': input, 'target': " ".join(p_target),
                                                  'ntarget': neg_text}, **param_dict}
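# Usage sketch for the generator above: it yields (feature_fn, kwargs) pairs so a
# dataset builder can materialise features lazily. Assumes a BertTokenizer, that the
# onebyone-style get_feature_from_data and the `once` module referenced here are
# importable, and an illustrative item layout and demo name.
def _example_preprocessing_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    item = (None, None, 'how are you', (['fine', 'thanks'], None))  # (tasks, task, input, (pos_target, neg_target))
    rows = []
    for feature_fn, kwargs in preprocessing_data(item, tokenizer, maxlen=32, likelihood='none'):
        rows.extend(feature_fn(**kwargs))
    return rows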
def get_feature_from_data(tokenizer, maxlen, input, previous, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='noop', **kwargs):
    feature_dict_list = []
    pred_len = len(tokenizer.convert_tokens_to_ids(target)) + 1 if target is not None else len(previous) - 1
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2 - pred_len, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + \
                  t_input[:maxlen - reserved_len - 2] + \
                  [tok.tok_sep(tokenizer)]
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        encoder_mask_id = [1] * len(t_input)
        encoder_mask_id.extend([0] * (maxlen - len(encoder_mask_id)))
        t_input_id.extend(tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) * (maxlen - len(t_input_id)))
        if target is not None:
            tokenized_target_id = []
            tokenized_prev_id = []
            tokenized_prev_id.extend(tokenizer.convert_tokens_to_ids([tok.tok_begin(tokenizer)] + target))
            tokenized_target_id.extend(tokenizer.convert_tokens_to_ids(target + [tok.tok_sep(tokenizer)]))
            decoder_mask_id = [1] * len(tokenized_prev_id)
            decoder_mask_id.extend([0] * (maxlen - len(decoder_mask_id)))
            tokenized_prev_id.extend(
                tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) * (maxlen - len(tokenized_prev_id)))
            tokenized_target_id.extend([-100] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
            row_dict['prev'] = tokenized_prev_id
            if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
                tokenized_ntarget = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ntarget))
                tokenized_ntarget_id = tokenized_ntarget
                tokenized_ntarget_id.extend([-100] * (maxlen - len(tokenized_ntarget_id)))
                if len(tokenized_ntarget_id) <= maxlen:
                    row_dict['ntarget'] = tokenized_ntarget_id
        else:
            tokenized_prev_id = [tokenizer.convert_tokens_to_ids(tok.tok_begin(tokenizer))]
            tokenized_prev_id.extend(tokenizer.convert_tokens_to_ids(previous))
            target_start = len(tokenized_prev_id) - 1
            row_dict['start'] = target_start
            decoder_mask_id = [1] * len(tokenized_prev_id)
        row_dict['prev'] = tokenized_prev_id
        row_dict['input'] = t_input_id
        row_dict['encoder_mask'] = encoder_mask_id
        row_dict['decoder_mask'] = decoder_mask_id
        feature_dict_list.append(row_dict)
    return feature_dict_list
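# Usage sketch for the seq2seq variant above: 'prev' is the decoder input (shifted
# right behind a begin token) and 'target' holds the decoder labels padded with -100,
# the conventional ignore index for PyTorch cross entropy. Assumes a BertTokenizer and
# the `tok` helpers; the demo name is illustrative.
def _example_seq2seq_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    row = get_feature_from_data(tokenizer, maxlen=32, input='how are you',
                                previous=[], target=tokenizer.tokenize('fine thanks'))[0]
    # prev starts with [CLS] fine thanks ...; target starts with fine thanks [SEP] ...
    return row['prev'][:3], row['target'][:3]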
def get_feature_from_data(tokenizer, labels, input, target=None, maxlen=512, separator=" ",
                          handle_exceed='slide'):
    feature_dict_list = []
    mapping_index = []
    pos = 1  # cls as start 0
    for i in input.split(" "):
        for _ in range(len(tokenizer.tokenize(i))):
            if _ < 1:
                mapping_index.append({'char': i, 'pos': pos})
            pos += 1
    if target is not None:
        target = target.split(separator)
    t_input_list, t_pos_list = tok.handle_exceed(tokenizer, input, maxlen - 2, mode=handle_exceed,
                                                 keep_after_sep=False)
    for t_input, t_pos in zip(t_input_list, t_pos_list):  # -2 for cls and sep
        # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        if target is not None:
            target_token = []
            pev = 0
            for tok_map, target_label in zip(mapping_index, target):
                if t_pos[0] < tok_map['pos'] <= t_pos[1]:
                    for _ in range(tok_map['pos'] - pev):
                        target_token += [labels.index(target_label)]
                    pev = tok_map['pos']
            if "O" in labels:
                target_id = [labels.index("O")] + target_token + [labels.index("O")]
            else:
                target_id = [target_token[0]] + target_token + [target_token[-1]]
            if len(input_id) != len(target_id):
                print("input target len not equal ", len(input_id), len(target_id))
            target_id.extend([0] * (maxlen - len(target_id)))
            row_dict['target'] = target_id
        map_start = 0
        map_end = len(mapping_index)
        for pos, tok_map in enumerate(mapping_index):
            if t_pos[0] == tok_map['pos']:
                map_start = pos
            elif t_pos[1] == tok_map['pos']:
                map_end = pos
        row_dict['mapping'] = mapping_index[map_start:map_end + 1]
        mask_id = [1] * len(input_id)
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['mask'] = mask_id
        row_dict['end'] = len(input_id)
        input_id.extend([0] * (maxlen - len(input_id)))
        row_dict['input'] = input_id
        row_dict['pos'] = [map_start, map_end]
        feature_dict_list.append(row_dict)
    return feature_dict_list
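# Usage sketch for the tagging variant above: labels are given one per word and
# expanded to token positions, while 'mapping' and 'pos' relate the window back to the
# source words. Assumes a BertTokenizer and the `tok` helpers; the label set, input,
# and demo name are illustrative.
def _example_tagging_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    labels = ['O', 'B-PER', 'I-PER']
    row = get_feature_from_data(tokenizer, labels, 'john smith called', target='B-PER I-PER O')[0]
    # target ids line up with the [CLS] ... [SEP] token sequence for this window.
    return row['target'][:5], row['mapping']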
def get_feature_from_data(tokenizer, labels, input, target=None, maxlen=512, separator=" ",
                          handle_exceed='slide'):
    feature_dict_list = []
    word_token_mapping = []
    token_word_mapping = []
    pos = 0
    for word_i, word in enumerate(input.split(separator)):
        tokenize_word = tokenizer.tokenize(word)
        for _ in range(len(tokenize_word)):
            if _ < 1:  # only record the first token (one word, one record)
                word_token_mapping.append({'char': word, 'pos': pos, 'len': len(tokenize_word)})
            token_word_mapping.append({'tok': tokenize_word[_], 'word': word,
                                       'pos': len(word_token_mapping) - 1})
            pos += 1

    t_input_list, t_pos_list = tok.handle_exceed(tokenizer, input, maxlen - 1, mode=handle_exceed,
                                                 keep_after_sep=False)
    for t_input, t_pos in zip(t_input_list, t_pos_list):  # -1 for cls
        # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input
        input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        if target is not None:
            target_token = []
            for input_word, target_label in zip(word_token_mapping, target.split(separator)):
                if t_pos[0] <= input_word['pos'] < t_pos[1]:
                    for _ in range(input_word['len']):
                        target_token += [labels.index(target_label)]
            if "O" in labels:
                target_id = [labels.index("O")] + target_token
            else:
                target_id = [target_token[0]] + target_token
            if len(input_id) != len(target_id):
                print(list(zip(input.split(separator), target.split(separator))))
                print(tokenizer.decode(input_id))
                print(input_id)
                print(target_id)
                print("input target len not equal ", len(input_id), len(target_id))
                continue
            target_id.extend([0] * (maxlen - len(target_id)))
            row_dict['target'] = target_id
        row_dict['word_token_mapping'] = word_token_mapping
        row_dict['token_word_mapping'] = token_word_mapping
        mask_id = [1] * len(input_id)
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['mask'] = mask_id
        row_dict['end'] = len(input_id)
        row_dict['pos'] = t_pos
        input_id.extend([0] * (maxlen - len(input_id)))
        row_dict['input'] = input_id
        feature_dict_list.append(row_dict)
    return feature_dict_list
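# Usage sketch for the tagging variant above: token_word_mapping lets token-level
# predictions be folded back into one tag per word. Assumes a BertTokenizer and the
# `tok` helpers; the label set, input, and demo name are illustrative.
def _example_token_word_mapping_usage():
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    labels = ['O', 'B-LOC']
    row = get_feature_from_data(tokenizer, labels, 'visit new york', target='O B-LOC B-LOC')[0]
    # Map each token after [CLS] back to its source word via token_word_mapping.
    words = [m['word'] for m in row['token_word_mapping']]
    return words, row['target'][:1 + len(words)]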