Python FullTokenizerの例、bert_model.tokenization.FullTokenizer Pythonの例

コード例 #1

0

ファイルを表示

    def _make_data_processor(self):
        """Create the task processor, load every data split and derive step counts.

        Reads ``task_name``, ``data_dir``, ``vocab_dir``, ``do_lower_case``,
        ``train_batch_size`` and ``num_epochs`` from ``self.hparams`` and
        stores the processor, the example lists, the label list, the
        tokenizer and the training/warm-up step counts on ``self``.
        """
        hparams = self.hparams
        processor_by_task = {"ubuntu": UbuntuProcessor}

        data_dir = hparams.data_dir
        self.processor = processor_by_task[hparams.task_name](hparams)

        # Each loader hands back three lists: the examples themselves plus
        # their "knowledge" and "similar" counterparts.
        (self.train_examples,
         self.train_knowledge_examples,
         self.train_similar_examples) = self.processor.get_train_examples(data_dir)
        (self.valid_examples,
         self.valid_knowledge_examples,
         self.valid_similar_examples) = self.processor.get_dev_examples(data_dir)
        (self.test_examples,
         self.test_knowledge_examples,
         self.test_similar_examples) = self.processor.get_test_examples(data_dir)
        self.label_list = self.processor.get_labels()

        self.tokenizer = tokenization.FullTokenizer(
            hparams.vocab_dir, hparams.do_lower_case)
        self.processor.data_process_feature(hparams, self.tokenizer)

        # Total optimisation steps = batches per epoch * number of epochs;
        # the first 10% of them are warm-up steps.
        batches_per_epoch = len(self.train_examples) / hparams.train_batch_size
        self.num_train_steps = int(batches_per_epoch * hparams.num_epochs)
        self.warmup_proportion = 0.1
        self.num_warmup_steps = int(
            self.num_train_steps * self.warmup_proportion)

コード例 #2

0

ファイルを表示

ファイル: data_utils.py プロジェクト: tpoljak/BERT_RESSEL

  def __init__(self):
    """Load the training pickle, collect length statistics and print a summary.

    The data is loaded before the tokenizer is built, and the statistic
    fields are initialised before ``get_sentence_statistics()`` runs.
    """
    self.data_dir = (
        "/mnt/raid5/taesun/data/ubuntu_corpus_v1/ubuntu_data/bert_multi_turn_negative_1/bert_%s.pickle"
        % "train")
    self.bert_vocab_file = "/mnt/raid5/shared/bert/uncased_L-12_H-768_A-12/vocab.txt"
    self.data_l = []

    self.load_data_dir()
    self.tokenizer = tokenization.FullTokenizer(self.bert_vocab_file, True)

    # Running maxima and averages start at 0; minima start at a large value.
    self.max_dialog_context = self.max_response = self.max_utterance = 0
    self.avg_dialog_context = self.avg_response = self.avg_utterance = 0
    self.min_dialog_context = self.min_response = self.min_utterance = 10000

    self.get_sentence_statistics()

    print("=" * 200)
    print("Final Stat Info")
    # Only the dialog-context and response statistics are reported.
    for stat in ("avg", "max", "min"):
      for field in ("dialog_context", "response"):
        attr_name = "%s_%s" % (stat, field)
        print(attr_name, getattr(self, attr_name))

コード例 #3

0

ファイルを表示

def main(_):
  """Turn the input text files into training instances and write them out.

  All settings come from ``FLAGS``; the positional argument is ignored.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # --input_file is a comma-separated list of glob patterns.
  input_files = [
      matched_path
      for pattern in FLAGS.input_file.split(",")
      for matched_path in tf.gfile.Glob(pattern)
  ]

  tf.logging.info("*** Reading from input files ***")
  for path in input_files:
    tf.logging.info("  %s", path)

  # Seeded generator, passed on to instance creation.
  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files,
      tokenizer,
      FLAGS.max_seq_length,
      FLAGS.dupe_factor,
      FLAGS.short_seq_prob,
      FLAGS.masked_lm_prob,
      FLAGS.max_predictions_per_seq,
      rng)

  # --output_file is a comma-separated list of destination paths.
  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for path in output_files:
    tf.logging.info("  %s", path)

  write_instance_to_example_files(
      instances,
      tokenizer,
      FLAGS.max_seq_length,
      FLAGS.max_predictions_per_seq,
      output_files)

コード例 #4

0

ファイルを表示

ファイル: pretrained_knowledge.py プロジェクト: tpoljak/BERT_RESSEL

    def __init__(self, hparams):
        """Store the hyper-parameters, load BERT config and tokenizer, then prepare data.

        Args:
            hparams: object providing ``bert_config_dir``, ``vocab_dir`` and
                ``do_lower_case``; it is also stored as ``self.hparams``.
        """
        self.hparams = hparams

        config_path = self.hparams.bert_config_dir
        self.bert_config = modeling_base.BertConfig.from_json_file(config_path)

        vocab_path = self.hparams.vocab_dir
        lower_case = self.hparams.do_lower_case
        self.tokenizer = tokenization.FullTokenizer(vocab_path, lower_case)

        self._make_data_processor()

コード例 #5

0

ファイルを表示

ファイル: run_similarity.py プロジェクト: lixuanhng/NLP_related_projects

    def __init__(self, batch_size=cf.batch_size):
        """Set up tokenizer, data processor and bookkeeping fields.

        Args:
            batch_size: batch size stored on the instance; defaults to the
                value configured in ``cf``.
        """
        # Not known yet; both fields start out unset.
        self.mode = None
        self.estimator = None

        self.batch_size = batch_size
        self.max_seq_length = cf.max_seq_length
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=cf.vocab_file, do_lower_case=True)
        # Class that loads the training and test data.
        self.processor = SimProcessor()

        # Alternative spelling kept for reference:
        # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
        tf.logging.set_verbosity(tf.logging.INFO)

コード例 #6

0

ファイルを表示

    def _make_data_processor(self):
        """Create the tokenizer and task processor, load data and derive step counts.

        Reads ``task_name``, ``data_dir``, ``vocab_dir``, ``do_lower_case``,
        ``train_batch_size`` and ``num_epochs`` from ``self.hparams``.
        """
        hparams = self.hparams
        processor_by_task = {"fnc": FNCProcessor}

        # The tokenizer is built first because the processor takes it as an
        # argument.
        self.tokenizer = tokenization.FullTokenizer(
            hparams.vocab_dir, hparams.do_lower_case)

        data_dir = hparams.data_dir
        processor_cls = processor_by_task[hparams.task_name]
        self.processor = processor_cls(hparams, self.tokenizer)

        # The training loader also returns a label dictionary.
        train_result = self.processor.get_train_examples(data_dir)
        self.train_examples, self.label_dict = train_result
        self.test_examples = self.processor.get_test_examples(data_dir)
        self.label_list = self.processor.get_labels()

        # Total optimisation steps = batches per epoch * number of epochs;
        # the first 10% of them are warm-up steps.
        batches_per_epoch = len(self.train_examples) / hparams.train_batch_size
        self.num_train_steps = int(batches_per_epoch * hparams.num_epochs)
        self.warmup_proportion = 0.1
        self.num_warmup_steps = int(
            self.num_train_steps * self.warmup_proportion)

コード例 #7

0

ファイルを表示

 def __init__(self, args):
     """Copy the run configuration from ``args`` and build session config, logger and tokenizer.

     Args:
         args: object exposing ``max_len``, ``bert_config_path``,
             ``bert_path``, ``log_path``, ``export_model_path``,
             ``batch_size``, ``epochs``, ``is_train``, ``vocab_path``,
             ``num_train_steps``, ``num_warmup_steps``, ``init_lr``,
             ``restore_on_train``, ``isload``, ``keep_rate`` and
             ``version``.
     """
     # Model and data dimensions.
     self.max_len = args.max_len
     self.bert_config_path = args.bert_config_path
     self.bert_path = args.bert_path
     self.loginfo = get_logger(args.log_path)
     self.export_model_path = args.export_model_path
     self.batch_size = args.batch_size
     self.epochs = args.epochs
     self.is_train = args.is_train

     # TensorFlow session configuration.
     self.config = tf.ConfigProto(allow_soft_placement=True)
     self.config.gpu_options.per_process_gpu_memory_fraction = 0.4  # use 40% of GPU memory

     # NOTE: the attribute is named ``tokenize`` but holds the tokenizer object.
     self.tokenize = tokenization.FullTokenizer(args.vocab_path,
                                                do_lower_case=True)

     # Optimisation schedule and checkpoint handling.
     self.num_train_steps = args.num_train_steps
     self.num_warmup_steps = args.num_warmup_steps
     self.init_lr = args.init_lr
     self.restore_on_train = args.restore_on_train
     self.isload = args.isload
     # ``bert_config_path``, ``bert_path`` and ``epochs`` are set once above;
     # the original repeated those three assignments here.
     self.rate = args.keep_rate
     self.version = args.version

コード例 #8

0

ファイルを表示

    # Smoke test for batch_yield: load the three TSV splits, build a tokenizer
    # and print the shapes of the arrays produced for one batch.
    # NOTE(review): this excerpt starts inside an enclosing block whose header
    # is not visible, and the last loop below is cut off by the excerpt.
    from bert_model import tokenization
    abs_path=os.path.abspath('.')
    # print(abs_path)
    file_path = os.path.dirname(abs_path)
    # Make the parent of the current working directory importable.
    # NOTE(review): this runs after the bert_model import above, so it cannot
    # help resolve that import - confirm the intended order.
    os.sys.path.append(file_path)
    # Tab-separated files with a header row.
    train=pd.read_csv('../data/train/classify/train.csv',sep='\t',encoding='utf-8',header=0)
    valid=pd.read_csv('../data/train/classify/valid.csv',sep='\t',encoding='utf-8',header=0)
    test=pd.read_csv('../data/test/test.csv',sep='\t',encoding='utf-8',header=0)
    # q=train['query']
    # r=train.reply
    # e=
    # Replace each DataFrame with a list of (label, query, reply, seq_id) tuples.
    train=list(zip(train.label,train['query'],train.reply,train.seq_id))
    valid=list(zip(valid.label,valid['query'],valid.reply,valid.seq_id))
    test=list(zip(test.label,test['query'],test.reply,test.seq_id))

    tokenize = tokenization.FullTokenizer('./bert_model/chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True)
    # Inspect only the first training batch (note the ``break``).
    for input_ids, mask_ids, type_ids, labels,seq_ids in batch_yield(train,batch_size=64,tokenize=tokenize):
        input_ids=np.array(input_ids)
        labels=np.array(labels)
        seq_ids=np.array(seq_ids)
        print(input_ids.shape)
        print(labels.shape)
        print(seq_ids.shape)
        print('=' * 10)
        break
    # print(train.head())
    # Same inspection for the validation split; as far as this excerpt shows,
    # there is no ``break`` here, so every batch is visited.
    for input_ids, mask_ids, type_ids, labels,seq_ids in batch_yield(valid, batch_size=64, tokenize=tokenize):
        input_ids = np.array(input_ids)
        labels = np.array(labels)
        seq_ids = np.array(seq_ids)
        print(input_ids.shape)

コード例 #9

0

ファイルを表示

#-----------------------------------------------------------------------------------------------------

def sentence_heatmap(score_sent_mat, dialog, response):
  """Print ``response`` and the softmax of the scaled axis-0 maxima of the score matrix.

  ``dialog`` is accepted but not used. Nothing is returned.
  """
  # Reduce along axis 0, then scale by 25 before normalising.
  peak_scores = np.max(score_sent_mat, axis=0)
  hm_sent_mat = softmax(peak_scores * 25, dim=-1)

  print(response)
  print(list(hm_sent_mat))

def softmax(x, dim=-1):
  """Compute softmax values for each set of scores in x along axis ``dim``.

  The per-axis maximum is subtracted before exponentiating. This leaves the
  result mathematically unchanged but keeps ``np.exp`` from overflowing to
  ``inf`` (and the division from producing ``nan``) for large scores.

  Args:
    x: array-like of scores.
    dim: axis along which the values are normalised to sum to 1.

  Returns:
    numpy array with the same shape as ``x``.
  """
  scores = np.asarray(x)
  if scores.size == 0:
    # No maximum exists for an empty array; return the empty result the
    # plain formula would give.
    return np.exp(scores)

  shifted = scores - np.max(scores, axis=dim, keepdims=True)
  exp_x = np.exp(shifted)
  sf = exp_x / np.sum(exp_x, axis=dim, keepdims=True)

  return sf
# Script entry: load one pickled attention dump and merge sub-token embeddings
# for the dialog and the response.
# NOTE(review): this excerpt appears to be cut off after the list
# initialisations at the bottom.
if __name__ == '__main__':
  tokenizer = tokenization.FullTokenizer("/mnt/raid5/shared/bert/tensorflow/uncased_L-12_H-768_A-12/vocab.txt", True)

  # NOTE(review): ``idx`` is not defined anywhere in the visible code -
  # confirm it is set elsewhere before this line runs.
  # pickle.load executes arbitrary code from the file; only open trusted dumps.
  with open("./attention_score_%s.pickle" % idx, "rb") as frb_handle:
    dialog, response, raw_dialog, raw_response, sequence_rep = pickle.load(frb_handle)  # dialog_len, response_len

  dialog_len = len(dialog)
  response_len = len(response)
  # The first ``dialog_len`` rows of ``sequence_rep`` are used for the dialog.
  dialog_rep = np.array(sequence_rep[0:dialog_len])  # 24, 768
  # Response rows are read starting at the fixed offset 280.
  # NOTE(review): presumably 280 is the padded dialog length - confirm.
  response_rep = np.array(sequence_rep[280:280 + response_len])  # 40, 768

  # A leading batch axis is added before calling merge_subtokens; only the
  # dialog call passes is_cls=True.
  dialog_merged_embeddings = merge_subtokens([" ".join(raw_dialog)], tokenizer, np.expand_dims(dialog_rep,0), is_cls=True)
  response_merged_embeddings = merge_subtokens([" ".join(raw_response)], tokenizer, np.expand_dims(response_rep,0))

  # 24, 40
  dialog_sentence, response_sentence = [], []
  dialog_sent_vec, response_sent_vec = [], []

コード例 #10

0

ファイルを表示

def make_bert_multi_turn_data_pickle(num_negative_samples=5):
    """Write (dialog_context, response, label) records for 10-utterance dialogs.

    For every split in ``data_type`` (currently only ``"test"``) the dialogs
    are read with ``get_dialog_dataset``. Dialogs labelled ``"0"`` and dialogs
    that do not have exactly ``dialog_turn_num`` utterances are skipped. Each
    remaining dialog is appended to the split's pickle file as one
    ``pickle.dump`` call, and the token count of its concatenated context is
    added to a running total that is printed at the end.

    Args:
        num_negative_samples: for splits other than ``"test"``/``"valid"``,
            ``1`` dumps every kept record as is; any other value additionally
            enables the negative-sampling branch (see the note in the body).
    """
    from bert_model import tokenization
    tokenizer = tokenization.FullTokenizer(
        "/mnt/raid5/shared/bert/tensorflow/uncased_L-12_H-768_A-12/vocab.txt",
        True)

    dialog_turn_num = 10
    orig_path = "/mnt/raid5/taesun/data/ubuntu_corpus_v1/ubuntu_data/%s.txt"
    data_path = "bert_%s_eot_none.pickle"
    file_dir = "/mnt/raid5/taesun/data/ubuntu_corpus_v1/ubuntu_data/bert_dialog_turn_stat_len"
    data_type = ["test"]

    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    for t in data_type:
        print(t + " data is loading now...")
        curr_idx = 0
        stat_utt_len = 0

        with open(os.path.join(file_dir, data_path % t), "wb") as fw_handle:
            dialog_data_l, candidates_pool = get_dialog_dataset(orig_path % t,
                                                                is_eot=True)
            print(len(dialog_data_l))
            print("candidates_pool", len(candidates_pool))
            current_ground_truth = ""
            # Show a sample record; an empty dataset has none to show.
            if dialog_data_l:
                print(dialog_data_l[0])

            for idx, dialog_data in enumerate(dialog_data_l):
                utterances = dialog_data[0]
                response = dialog_data[1][0]
                label = str(dialog_data[2])

                # Negative records are dropped up front.
                if label == "0":
                    continue

                # Keep only dialogs with exactly ``dialog_turn_num`` turns.
                if len(utterances) != dialog_turn_num:
                    continue

                dialog_context = "".join(utterances)
                stat_utt_len += len(tokenizer.tokenize(dialog_context))

                if t in ["test", "valid"]:
                    pickle.dump([dialog_context, response, label], fw_handle)
                    curr_idx += 1
                    continue

                # pos : neg ==> 1 : 1
                if num_negative_samples == 1:
                    pickle.dump([dialog_context, response, label], fw_handle)
                    curr_idx += 1

                else:
                    if label == "1":
                        current_ground_truth = response
                        pickle.dump([dialog_context, response, label],
                                    fw_handle)
                        curr_idx += 1

                    # negative sample
                    # NOTE(review): unreachable while label "0" records are
                    # skipped at the top of the loop - confirm which of the
                    # two is intended.
                    if label == "0":
                        for post_idx in range(1, num_negative_samples + 1):
                            try:
                                neg_sample = dialog_data_l[idx +
                                                           2 * post_idx][1][0]
                            except IndexError:
                                print(idx, ":", idx + 2 * post_idx,
                                      "index Error")
                                # Exclude the two responses as whole strings;
                                # passing them bare to difference() would
                                # remove single characters instead.
                                remaining = candidates_pool.difference(
                                    {response, current_ground_truth})
                                # random.sample needs a sequence; sets are
                                # rejected on Python 3.11+.
                                neg_sample = random.sample(
                                    list(remaining), 1)[0]
                            # Dump only once neg_sample is bound; a
                            # ``finally`` here would hide any other error
                            # behind a NameError.
                            pickle.dump(
                                [dialog_context, neg_sample, label],
                                fw_handle)
                            curr_idx += 1

                if curr_idx % 10000 == 0:
                    print(str(curr_idx) + " data has been saved now...")
                    print(dialog_context)

            print(str(t), stat_utt_len, curr_idx)
            print(t + " data pickle save complete")