Python FullTokenizerの例、utils.bert.tokenization.FullTokenizer Pythonの例

コード例 #1

0

ファイルを表示

ファイル: bert_data_utils.py プロジェクト: yphacker/text_classification_tf

def get_dataset(data):
    """Encode a table of reviews into BERT input arrays plus optional label ids.

    Args:
        data: column-indexable table (accessed via ``data['review']`` and
            ``data.columns.tolist()``; presumably a pandas DataFrame). It
            must have a 'review' column and may have a 'sentiment' column.

    Returns:
        A pair ``((input_ids, input_masks, segment_ids), labels)``. The three
        feature arrays are int32 NumPy arrays built from one converted entry
        per review. ``labels`` is a NumPy array of ids looked up in the
        label vocabulary, or None when there is no 'sentiment' column.
    """
    _, label2id = get_vocab()
    tokenizer = tokenization.FullTokenizer(vocab_file=model_config_bert.bert_vocab_path)
    max_seq_len = config.max_seq_len

    # Collect the three per-example feature sequences column by column.
    ids_rows, mask_rows, segment_rows = [], [], []
    for review in data['review']:
        ids, mask, segments = convert_single_example(max_seq_len, tokenizer, review)
        ids_rows.append(ids)
        mask_rows.append(mask)
        segment_rows.append(segments)

    features = (
        np.asarray(ids_rows, dtype=np.int32),
        np.asarray(mask_rows, dtype=np.int32),
        np.asarray(segment_rows, dtype=np.int32),
    )

    # Unlabeled data (e.g. a prediction set) carries no 'sentiment' column.
    if 'sentiment' not in data.columns.tolist():
        return features, None

    # Labels are looked up by their string form in the label vocabulary.
    labels = np.array([label2id[str(label)] for label in data['sentiment']])
    return features, labels

コード例 #2

0

ファイルを表示

ファイル: data_process.py プロジェクト: colabnlp/nlp_research

def create_tfrecorf_file():
    """Write the train/dev/test TFRecord files that do not exist yet.

    For each split, the examples are loaded through the sentiment processor
    and converted with ``file_based_convert_examples_to_features`` only when
    the target TFRecord path is missing on disk; existing files are left
    untouched.
    """
    processor = AiSentimentProcess()
    label_list = processor.get_labels()
    # Tokenizer; per the original author's note it does not perform
    # Chinese word segmentation.
    tokenizer = tokenization.FullTokenizer(
        vocab_file=settings.bert_model_vocab_path, do_lower_case=True)

    def write_records(examples, record_path):
        # Shared conversion step for every split.
        file_based_convert_examples_to_features(examples, label_list,
                                                model_params.max_seq_length,
                                                tokenizer,
                                                record_path)

    if not os.path.exists(settings.train_tfrecord_path):
        write_records(processor.get_train_examples(settings.train_data_path),
                      settings.train_tfrecord_path)

    if not os.path.exists(settings.dev_tfrecord_path):
        write_records(processor.get_dev_examples(settings.dev_data_path),
                      settings.dev_tfrecord_path)

    if not os.path.exists(settings.test_tfrecord_path):
        write_records(processor.get_test_examples(settings.test_data_path),
                      settings.test_tfrecord_path)

コード例 #3

0

ファイルを表示

def get_bert_param_lists(texts):
    """Convert an iterable of texts into three int32 arrays of BERT inputs.

    Args:
        texts: iterable of input texts, each passed unchanged to
            ``convert_single_example_simple``.

    Returns:
        Tuple ``(input_ids, input_masks, segment_ids)`` of int32 NumPy
        arrays, each holding one converted entry per input text.
    """
    # Token processor: per the original note, it splits text into characters
    # and maps them to ids; `vocab_file` is the path of the vocabulary file.
    tokenizer = tokenization.FullTokenizer(vocab_file=config.bert_vocab_path)

    # One accumulator per output, in order: ids, masks, segments.
    columns = ([], [], [])
    for sentence in texts:
        ids, mask, segments = convert_single_example_simple(
            config.max_seq_length, tokenizer, sentence)
        for column, value in zip(columns, (ids, mask, segments)):
            column.append(value)

    input_ids, input_masks, segment_ids = [
        np.asarray(column, dtype=np.int32) for column in columns
    ]
    return input_ids, input_masks, segment_ids

コード例 #4

0

ファイルを表示

ファイル: data_process.py プロジェクト: colabnlp/nlp_research

def make_aspect_array():
    """Build an int64 array of input ids for every word of every subject.

    Each entry of ``settings.subjects`` is split on single spaces; every
    resulting piece is wrapped in an ``InputExample`` and converted with
    ``convert_single_example`` using ``max_seq_length=10``.

    Returns:
        ``np.ndarray`` with dtype int64 built from a nested list laid out as
        [subject][word] -> ``feature.input_ids``.
    """
    tokenizer = tokenization.FullTokenizer(
        vocab_file=settings.bert_model_vocab_path, do_lower_case=True)

    def word_to_ids(position, word):
        # The word's position inside its subject doubles as example guid
        # and as the example index handed to the converter.
        example = InputExample(guid=position, text_a=word, label="0")
        feature = convert_single_example(position,
                                         example,
                                         settings.label_list,
                                         max_seq_length=10,
                                         tokenizer=tokenizer)
        return feature.input_ids

    per_subject_ids = [
        [word_to_ids(position, word)
         for position, word in enumerate(subject.split(" "))]
        for subject in settings.subjects
    ]
    return np.array(per_subject_ids, dtype=np.int64)

コード例 #5

0

ファイルを表示

def get_bert_param_lists(texts):
    """Convert texts into the input format that BERT consumes.

    Notes carried over (translated) from the original author:

    * input_ids: every character of the text is replaced by its index in the
      vocab.txt of the BERT-Base-Chinese checkpoint, with the leading CLS and
      trailing SEP markers added.
    * input_masks: 1 wherever there is a token (CLS and SEP included).
    * segment_ids: only meaningful when two related sentences are fed
      together; all 0 when a single sentence is passed.
    * All three lists are padded with 0 up to max_seq_length.

    Args:
        texts: iterable of input texts.

    Returns:
        Tuple ``(input_ids, input_masks, segment_ids)`` of int32 NumPy arrays.
    """
    # Token processor: splits text into characters and maps them to ids;
    # `vocab_file` is the path of the vocabulary file.
    tokenizer = tokenization.FullTokenizer(
        vocab_file=bert_model_config.bert_vocab_path)

    # Gather one (ids, mask, segments) row per text, then split by column.
    rows = []
    for sentence in texts:
        ids, mask, segments = convert_single_example_simple(
            config.max_seq_length, tokenizer, sentence)
        rows.append((ids, mask, segments))

    input_ids = np.asarray([row[0] for row in rows], dtype=np.int32)
    input_masks = np.asarray([row[1] for row in rows], dtype=np.int32)
    segment_ids = np.asarray([row[2] for row in rows], dtype=np.int32)
    return input_ids, input_masks, segment_ids

コード例 #6

0

ファイルを表示

ファイル: CNN_BERT.py プロジェクト: xxxxxxxxy/MQNLP

def main(_):
  """Run training, evaluation and/or prediction as selected by FLAGS.

  Builds the BERT config, task processor, tokenizer and a TPUEstimator, then
  executes each phase enabled through `FLAGS.do_train`, `FLAGS.do_eval` and
  `FLAGS.do_predict`. Every phase first converts its examples to a TFRecord
  file under `FLAGS.output_dir` and reads them back through
  `file_based_input_fn_builder`.

  Args:
    _: unused positional argument (presumably the argv list handed over by
      the app runner -- confirm against the call site).

  Raises:
    ValueError: if none of the do_* flags is set, if `FLAGS.max_seq_length`
      exceeds the model's `max_position_embeddings`, if the task name has no
      registered processor, or if prediction is requested together with
      `FLAGS.use_tpu`.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  # Registry mapping a lower-cased task name to its processor class.
  processors = {
      "cola": ColaProcessor,
      "mnli": MnliProcessor,
      "mrpc": MrpcProcessor,
      "xnli": XnliProcessor,
  }

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  # The requested sequence length must fit the model's position embeddings.
  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))
  # Instantiate the processor class registered for this task.
  processor = processors[task_name]()

  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # A cluster resolver is only created when a TPU is requested and named.
  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  # These stay None unless training is requested; model_fn_builder receives
  # them either way.
  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    # Total steps = examples / batch size * epochs, truncated to an int.
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  # ---- Training phase ----
  if FLAGS.do_train:
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  # ---- Evaluation phase ----
  if FLAGS.do_eval:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d", len(eval_examples))
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
      # Eval will be slightly WRONG on the TPU because it will truncate
      # the last batch.
      eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    # Metrics are both logged and written, one "key = value" line each, in
    # sorted key order.
    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  # ---- Prediction phase ----
  if FLAGS.do_predict:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d", len(predict_examples))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    if FLAGS.use_tpu:
      # Warning: According to tpu_estimator.py Prediction on TPU is an
      # experimental feature and hence not supported here
      raise ValueError("Prediction in TPU not supported")

    # NOTE: FLAGS.use_tpu is falsy here (the branch above raised otherwise),
    # so this always evaluates to False.
    predict_drop_remainder = True if FLAGS.use_tpu else False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result = estimator.predict(input_fn=predict_input_fn)

    # One output line per prediction: its values joined by tabs.
    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer:
      tf.logging.info("***** Predict results *****")
      for prediction in result:
        output_line = "\t".join(
            str(class_probability) for class_probability in prediction) + "\n"
        writer.write(output_line)

コード例 #7

0

ファイルを表示

# create by fanfan on 2020/3/17 0017
import tensorflow as tf
import third_models.albert_zh.modeling_google as modeling
import tensorflow_estimator as tf_estimator
import third_models.albert_zh.optimization as optimization
from Competition.ai_challenger_2018_sentiment_analysis import settings
from tensorflow.contrib.layers import fully_connected, conv1d
from utils.bert import tokenization

# Module-level state, created once when this module is imported.
params = settings.ParamsModel()
# Tokenizer built from the configured BERT vocabulary file, with
# lower-casing enabled.
tokenizer = tokenization.FullTokenizer(
    vocab_file=settings.bert_model_vocab_path, do_lower_case=True)
# Expose the tokenizer's vocabulary on the params object as `char2id`
# (presumably a token -> id mapping; confirm against FullTokenizer.vocab).
params.char2id = tokenizer.vocab


def get_setence_length(data, name):
    """Count the non-zero entries along the last axis of ``data``.

    Args:
        data: numeric tensor (presumably padded token ids with 0 as the
            padding value -- confirm against the callers).
        name: name given to the resulting cast op.

    Returns:
        An int32 tensor with the last axis of ``data`` reduced away, holding
        the number of non-zero entries along that axis.
    """
    # sign(abs(x)) maps every non-zero entry to 1 and every zero entry to 0.
    used = tf.sign(tf.abs(data))
    # `axis` is the supported spelling of the deprecated `reduction_indices`
    # keyword; the reduction itself is unchanged.
    length = tf.reduce_sum(used, axis=-1)
    return tf.cast(length, tf.int32, name=name)


def create_model(albert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, aspechts_char):
    model = modeling.AlbertModel(config=albert_config,
                                 is_training=is_training,
                                 input_ids=input_ids,
                                 input_mask=input_mask,
                                 token_type_ids=segment_ids,
                                 use_one_hot_embeddings=False)
    context_embedding = model.get_sequence_output()