# Imports shared by the conversion helpers below; FLAGS, logger, tokenization,
# InputExample and the InputFeatures/InputFeature classes are assumed to be
# defined elsewhere in the respective modules.
import codecs
import os
import pickle

import tensorflow as tf


def conver_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
    """
    Convert one training example into an InputFeatures object: tokenize the
    text, map tokens to ids, and map the label to its id.
    :param ex_index: index of the example (used only for logging)
    :param example: the example to convert
    :param label_list: list of all labels
    :param max_seq_length: maximum sequence length
    :param tokenizer: a FullTokenizer instance
    :param mode: 'train', 'dev' or 'test'; controls example logging
    :return: an InputFeatures object
    """
    # 1. Build (or load) the label -> id mapping.
    label_map = {}
    if os.path.exists(os.path.join(FLAGS.output_dir, 'label2id.pkl')):
        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as fd:
            label_map = pickle.load(fd)
    else:
        for i, label in enumerate(label_list):
            label_map[label] = i
        with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'wb') as fd:
            pickle.dump(label_map, fd)

    # Sequence-pair classification is not handled here.
    tokens_a = tokenizer.tokenize(example.text_a)
    # Truncate: room is needed for the start and end markers.
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append('[CLS]')
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append('[SEP]')
    segment_ids.append(0)

    # Convert the tokens to ids.
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad up to max_seq_length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        segment_ids.append(0)
        input_mask.append(0)

    if example.label is None:
        label_id = -1
    else:
        label_id = label_map[example.label]

    if ex_index < 2 and mode in ['train', 'dev']:
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logger.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id)
    return feature
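
# Usage sketch (added, not from the original file): a hedged example of driving
# conver_single_example. The vocab path is hypothetical, FLAGS.output_dir must
# already be configured, and InputExample is the usual BERT container with
# guid/text_a/label attributes. tokenization.FullTokenizer is the standard
# tokenizer shipped with the BERT repo.
def _demo_conver_single_example():
    tokenizer = tokenization.FullTokenizer(vocab_file='vocab.txt',  # hypothetical path
                                           do_lower_case=True)
    example = InputExample(guid='train-0', text_a='the movie was great', label='1')
    feature = conver_single_example(0, example, ['0', '1'], 128, tokenizer, 'train')
    assert len(feature.input_ids) == 128
    assert len(feature.segment_ids) == 128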
def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        if ex_index < 5:
            tf.logging.info("*** Example ***")
            tf.logging.info("unique_id: %s" % (example.unique_id))
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info(
                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    return features
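
# Added helper sketch: the feature list returned by convert_examples_to_features
# is often stacked into dense int arrays before being fed to the model (e.g.
# through a feed_dict or a tf.data pipeline). numpy is the only extra
# dependency; the helper name is illustrative, not from the original file.
def _features_to_arrays(features):
    import numpy as np
    input_ids = np.array([f.input_ids for f in features], dtype=np.int32)
    input_mask = np.array([f.input_mask for f in features], dtype=np.int32)
    input_type_ids = np.array([f.input_type_ids for f in features], dtype=np.int32)
    return input_ids, input_mask, input_type_ids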
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode):
    """
    Analyze one example: convert each character to its id and each label to its
    id, then pack the result into an InputFeatures object.
    :param ex_index: index of the example
    :param example: one example
    :param label_list: list of labels
    :param max_seq_length: maximum sequence length
    :param tokenizer: a FullTokenizer instance
    :param output_dir: directory where label2id.pkl is saved
    :param mode: dataset split; write_tokens only acts when mode == 'test'
    :return: an InputFeatures object
    """
    label_map = {}
    # Start label indexing at 1 (0 is reserved for padding).
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    # Save the label -> index map.
    if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
        with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
            pickle.dump(label_map, w)

    # Read the example's text and labels.
    textlist = example.text.split(' ')
    labellist = example.label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        # Tokenize; for Chinese this splits into characters, but characters not
        # in BERT's vocab.txt (e.g. Chinese quotation marks) get WordPiece
        # treatment. You could replace all tokenization here with list(input).
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                # The else branch rarely triggers: extra WordPieces get "X".
                labels.append("X")
    # tokens = tokenizer.tokenize(example.text)
    # Truncate the sequence; -2 leaves room for the start and end markers.
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append("[CLS]")  # Mark the start of the sentence with [CLS].
    segment_ids.append(0)
    # append("O") or append("[CLS]")? Not sure! Using O vs [CLS] makes no real
    # difference; O would keep the label set smaller, but marking the sentence
    # start and end with dedicated [CLS]/[SEP] labels works just as well.
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    ntokens.append("[SEP]")  # Mark the end of the sentence with [SEP].
    segment_ids.append(0)
    # append("O") or append("[SEP]")? Not sure!
    label_ids.append(label_map["[SEP]"])
    # Convert the tokens (ntokens) in the sequence to ids.
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    # label_mask = [1] * len(input_ids)
    # Pad up to max_seq_length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        # We don't care about the padding positions.
        segment_ids.append(0)
        label_ids.append(0)
        ntokens.append("**NULL**")
        # label_mask.append(0)
    # print(len(input_ids))
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    # assert len(label_mask) == max_seq_length

    # Log a few sample examples.
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
        # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))

    # Pack everything into an InputFeatures object.
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        # label_mask = label_mask
    )
    # Only has an effect when mode == 'test'.
    write_tokens(ntokens, output_dir, mode)
    return feature
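
# Self-contained sketch (added) of the word -> WordPiece label alignment used
# above: the first sub-token keeps the word's label, every later sub-token gets
# "X". toy_tokenize stands in for tokenizer.tokenize.
def _align_labels_demo():
    def toy_tokenize(word):
        # Pretend 'Johanson' splits into two WordPieces.
        return ['johan', '##son'] if word == 'Johanson' else [word.lower()]

    words = ['Johanson', 'lives', 'here']
    word_labels = ['B-PER', 'O', 'O']
    tokens, labels = [], []
    for word, label in zip(words, word_labels):
        pieces = toy_tokenize(word)
        tokens.extend(pieces)
        labels.extend([label] + ['X'] * (len(pieces) - 1))
    assert tokens == ['johan', '##son', 'lives', 'here']
    assert labels == ['B-PER', 'X', 'O', 'O']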
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        return InputFeatures(input_ids=[0] * max_seq_length,
                             input_mask=[0] * max_seq_length,
                             segment_ids=[0] * max_seq_length,
                             label_id=0,
                             is_real_example=False)

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    # Save the label -> index map.
    if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
        with open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
            pickle.dump(label_map, w)
    print("label map: {}".format(label_map))

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature
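
# Hedged sketch (added): how PaddingInputExample is typically used together
# with the classifier conversion above. On TPU, the eval/predict set is padded
# to a multiple of the batch size with fake examples; the is_real_example flag
# lets downstream metrics mask them out. The helper name is illustrative.
def _pad_to_batch_size(examples, batch_size):
    examples = list(examples)
    while len(examples) % batch_size != 0:
        examples.append(PaddingInputExample())
    return examples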
def convert_single_examples(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode):
    """
    Analyze one example: convert each character to its id and each label to its
    id, then pack the result into an InputFeature object.
    :param ex_index: index of the example
    :param example: one example
    :param label_list: list of labels
    :param max_seq_length: maximum sequence length
    :param tokenizer: a FullTokenizer instance
    :param output_dir: directory where label2id.pkl is saved
    :param mode: dataset split; write_tokens only acts when mode == 'test'
    :return: an InputFeature object
    """
    label_map = {}
    # Start label indexing at 1 (0 is reserved for padding).
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    # Save the label -> index map.
    if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
        with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as f:
            pickle.dump(label_map, f)

    text_list = example.text.split(' ')
    # Use a distinct name so the label_list parameter is not shadowed.
    example_labels = example.label.split(' ')
    tokens = []
    labels = []
    for i, word in enumerate(text_list):
        # Tokenize; for Chinese this splits into characters, but characters not
        # in BERT's vocab.txt (e.g. Chinese quotation marks) get WordPiece
        # treatment. You could replace all tokenization here with list(input).
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_tmp = example_labels[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_tmp)
            else:
                # The else branch rarely triggers: extra WordPieces get 'X'.
                labels.append('X')
    # Truncate the sequence; -2 leaves room for the start and end markers.
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    n_tokens = []
    segment_ids = []
    label_ids = []
    # ----- Prepend the [CLS] start marker -----
    n_tokens.append('[CLS]')
    segment_ids.append(0)
    label_ids.append(label_map['[CLS]'])
    for token_i, token in enumerate(tokens):
        n_tokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[token_i]])
    # ----- Append the [SEP] end marker -----
    n_tokens.append('[SEP]')
    segment_ids.append(0)
    label_ids.append(label_map['[SEP]'])
    # Convert the tokens in the sequence to ids.
    input_ids = tokenizer.convert_tokens_to_ids(n_tokens)
    input_mask = [1] * len(input_ids)
    # Padding.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        n_tokens.append('**NULL**')
    # Length checks.
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    # Log a few sample examples.
    if ex_index < 5:
        logger.info('*** Example ***')
        logger.info('guid: %s' % example.guid)
        logger.info(
            'tokens: %s' % ' '.join([tokenization.printable_text(x) for x in tokens]))
        logger.info('input_ids: %s' % ' '.join([str(x) for x in input_ids]))
        logger.info('input_mask: %s' % ' '.join([str(x) for x in input_mask]))
        logger.info('segment_ids: %s' % ' '.join([str(x) for x in segment_ids]))
        logger.info('label_ids: %s' % ' '.join([str(x) for x in label_ids]))

    # Pack everything into an InputFeature object.
    feature = InputFeature(input_ids=input_ids,
                           input_mask=input_mask,
                           segment_ids=segment_ids,
                           label_ids=label_ids)
    # Only has an effect when mode == 'test'.
    write_tokens(n_tokens, output_dir, mode)
    return feature
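
# Note added for clarity: with the 1-based indexing above, id 0 stays reserved
# for padding, and the label list must also contain the bookkeeping labels 'X',
# '[CLS]' and '[SEP]', otherwise the label_map lookups above raise KeyError.
# An illustrative label list for a person/location tagger:
EXAMPLE_NER_LABEL_LIST = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'X', '[CLS]', '[SEP]']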
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode):
    """
    Analyze one example: convert each character to its id and each label to its
    id, then pack the result into an InputFeatures object.
    :param ex_index: index of the example
    :param example: one example
    :param label_list: list of labels
    :param max_seq_length: maximum sequence length
    :param tokenizer: a FullTokenizer instance
    :param output_dir: directory where label2id.pkl is saved
    :param mode: dataset split
    :return: an InputFeatures object
    """
    label_map = {}
    # Start label indexing at 1 (0 is reserved for padding).
    for (i, label) in enumerate(label_list, 1):
        label_map[label] = i
    # Save the label -> index map.
    if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
        with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
            pickle.dump(label_map, w)

    textlist = example.text.split(" ")
    labellist = example.label.split(" ")
    tokens = []
    labels = []
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
        label_1 = labellist[i]
        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                labels.append('X')
    # tokens = tokenizer.tokenize(example.text)
    # Truncate the sequence.
    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]
    ntokens = []
    segment_ids = []
    label_ids = []
    ntokens.append('[CLS]')
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        ntokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    # Append the [SEP] end-of-sentence marker.
    ntokens.append('[SEP]')
    segment_ids.append(0)
    label_ids.append(label_map['[SEP]'])
    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        # We don't care about the padding positions.
        segment_ids.append(0)
        label_ids.append(0)
        ntokens.append("**NULL**")
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info(
            "tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))

    # Pack everything into an InputFeatures object.
    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_ids=label_ids)
    write_tokens(ntokens, output_dir, mode)
    return feature
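
# write_tokens is called by the NER converters above but is not defined in this
# section; the following is a hedged reconstruction of its usual behavior (an
# assumption, not the original code): in test mode, append each sequence's
# tokens to a file so predictions can later be aligned back to the text.
def write_tokens(tokens, output_dir, mode):
    if mode == 'test':
        path = os.path.join(output_dir, 'token_' + mode + '.txt')
        with codecs.open(path, 'a', encoding='utf-8') as wf:
            for token in tokens:
                if token != '**NULL**':  # skip padding placeholders
                    wf.write(token + '\n')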
def convert_lst_to_features(lst_str, seq_length, tokenizer, logger,
                            is_tokenized=False, mask_cls_sep=False):
    """Loads a data file into a list of `InputBatch`s."""

    examples = read_tokenized_examples(lst_str) if is_tokenized else read_line_examples(lst_str)

    _tokenize = lambda x: tokenizer.mark_unk_tokens(x) if is_tokenized else tokenizer.tokenize(x)

    for (ex_index, example) in enumerate(examples):
        tokens_a = _tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = _tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ['[CLS]'] + tokens_a + ['[SEP]']
        input_type_ids = [0] * len(tokens)
        input_mask = [int(not mask_cls_sep)] + [1] * len(tokens_a) + [int(not mask_cls_sep)]

        if tokens_b:
            tokens += tokens_b + ['[SEP]']
            input_type_ids += [1] * (len(tokens_b) + 1)
            input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)]

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Zero-pad up to the sequence length (the more pythonic way).
        pad_len = seq_length - len(input_ids)
        input_ids += [0] * pad_len
        input_mask += [0] * pad_len
        input_type_ids += [0] * pad_len

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        logger.debug('tokens: %s' % ' '.join([tokenization.printable_text(x) for x in tokens]))
        logger.debug('input_ids: %s' % ' '.join([str(x) for x in input_ids]))
        logger.debug('input_mask: %s' % ' '.join([str(x) for x in input_mask]))
        logger.debug('input_type_ids: %s' % ' '.join([str(x) for x in input_type_ids]))

        yield InputFeatures(unique_id=example.unique_id,
                            tokens=tokens,
                            input_ids=input_ids,
                            input_mask=input_mask,
                            input_type_ids=input_type_ids)
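
# Usage sketch (added): convert_lst_to_features is a generator, so it is
# typically materialized with list(). The logger setup here is an illustrative
# assumption, and `tokenizer` must provide tokenize/convert_tokens_to_ids.
def _demo_convert_lst_to_features(tokenizer):
    import logging
    logger = logging.getLogger('demo')
    feats = list(convert_lst_to_features(['hello world', 'a longer sentence'],
                                         seq_length=16,
                                         tokenizer=tokenizer,
                                         logger=logger))
    assert all(len(f.input_ids) == 16 for f in feats)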