def get_test_example():
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           spm_model_file=spm_model_file)
    # save data to tf_record
    # test_examples = processor.get_test_examples("")  # test data directory
    test_examples = processor.get_test_examples(data_dir)
    features = get_test_features(test_examples, label_list, max_seq_length, tokenizer)
    return features
def create_tokenizer_from_hub_module(albert_hub_module_handle):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        albert_module = hub.Module(albert_hub_module_handle)
        tokenization_info = albert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ])
    return tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case,
        spm_model_file=FLAGS.spm_model_file)
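# A minimal sketch of calling the helper above (assumptions: tensorflow_hub is
# installed, FLAGS.spm_model_file is defined by the surrounding script, and the
# hub handle shown is only a placeholder for a real ALBERT module that exposes
# the "tokenization_info" signature).
hub_handle = "https://tfhub.dev/google/albert_base/3"  # assumed handle
tokenizer = create_tokenizer_from_hub_module(hub_handle)
print(tokenizer.tokenize("An example sentence."))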
def transformer(path, s3_path, class_name, model='xlnet', **kwargs):
    check_file(path[model], s3_path[model], **kwargs)
    g = load_graph(path[model]['model'], **kwargs)

    try:
        with open(path[model]['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception:
        raise Exception(
            "model corrupted due to some reasons, please run "
            f"malaya.clear_cache('{class_name}/{model}') and try again"
        )

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(
                path[model]['tokenizer'], path[model]['vocab']
            )
        if model in ['albert', 'tiny-albert']:
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])

        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
def load(model: str = 'albert', **kwargs):
    """
    Load albert model.

    Parameters
    ----------
    model : str, optional (default='albert')
        Model architecture supported. Allowed values:

        * ``'albert'`` - base albert-bahasa released by Malaya.
        * ``'albert-tiny'`` - tiny albert-bahasa released by Malaya.

    Returns
    -------
    result : malaya.transformers.albert.Model class
    """
    from malaya.path import PATH_ALBERT, S3_PATH_ALBERT
    from malaya.function import check_file

    model = model.lower()
    check_file(PATH_ALBERT[model]['model'], S3_PATH_ALBERT[model], **kwargs)

    if not os.path.exists(PATH_ALBERT[model]['directory'] + 'model.ckpt'):
        import tarfile

        with tarfile.open(PATH_ALBERT[model]['model']['model']) as tar:
            tar.extractall(path=PATH_ALBERT[model]['path'])

    from albert import tokenization

    bert_checkpoint = PATH_ALBERT[model]['directory'] + 'model.ckpt'
    vocab_model = PATH_ALBERT[model]['directory'] + 'sp10m.cased.v10.model'
    vocab = PATH_ALBERT[model]['directory'] + 'sp10m.cased.v10.vocab'
    bert_config = PATH_ALBERT[model]['directory'] + 'config.json'

    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab, do_lower_case=False, spm_model_file=vocab_model
    )
    bert_config = modeling.AlbertConfig.from_json_file(bert_config)
    model = Model(bert_config, tokenizer)
    model._saver.restore(model._sess, bert_checkpoint)

    return model
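# A minimal usage sketch based on the docstring above (assumption: this load()
# is exposed as malaya.transformers.albert.load and the released checkpoints
# can be downloaded on first use).
from malaya.transformers import albert

albert_model = albert.load(model='albert')
# the returned Model wraps the ALBERT graph together with its SentencePiece tokenizer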
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file,
    )

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files,
        tokenizer,
        FLAGS.max_seq_length,
        FLAGS.dupe_factor,
        FLAGS.short_seq_prob,
        FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq,
        rng,
    )

    tf.logging.info("number of instances: %i", len(instances))

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(
        instances,
        tokenizer,
        FLAGS.max_seq_length,
        FLAGS.max_predictions_per_seq,
        output_files,
    )
def sentence_to_idx(self, text):
    """
    Convert a tokenized sentence into its id representation.
    :return:
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)

    text = tokenization.convert_to_unicode(text)
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    segment_id = [0] * len(input_id)

    input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)

    return [input_id], [input_mask], [segment_id]
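# A minimal sketch of calling the method above at prediction time. The class
# name Predictor and its constructor argument are hypothetical; only
# sentence_to_idx itself comes from the snippet above.
predictor = Predictor(vocab_path="albert_model/vocab_chinese.txt")  # hypothetical class
input_ids, input_masks, segment_ids = predictor.sentence_to_idx("an example sentence")
# each returned value is a batch of size 1, ready to feed into the model graph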
def trans_to_index(self, inputs):
    """
    Convert the inputs into index representations.
    :param inputs: input texts
    :return:
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.__vocab_path, do_lower_case=True)
    input_ids = []
    input_masks = []
    segment_ids = []
    for text in inputs:
        text = tokenization.convert_to_unicode(text)
        tokens = tokenizer.tokenize(text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_ids.append(input_id)
        input_masks.append([1] * len(input_id))
        segment_ids.append([0] * len(input_id))

    return input_ids, input_masks, segment_ids
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            contents = "".join([six.ensure_str(x) + "\n" for x in vocab_tokens])
            vocab_writer.write(six.ensure_binary(contents, "utf-8"))

        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
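# By contrast with the WordPiece path exercised in the test above, FullTokenizer
# can also be backed by a SentencePiece model, as the other snippets in this
# collection do. A minimal sketch (the .vocab/.model paths are assumptions; any
# ALBERT SentencePiece export should work):
from albert import tokenization

sp_tokenizer = tokenization.FullTokenizer(
    vocab_file="30k-clean.vocab",        # assumed path
    do_lower_case=True,
    spm_model_file="30k-clean.model",    # assumed path
)
pieces = sp_tokenizer.tokenize("unwanted, running")
ids = sp_tokenizer.convert_tokens_to_ids(pieces)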
def make_tf_record(output_dir, data_dir, vocab_file, spm_model_file):
    tf.gfile.MakeDirs(output_dir)  # "model/bert"
    processor = processors[task_name]()  # "atec"
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           spm_model_file=spm_model_file)
    train_file = os.path.join(output_dir, "train.tf_record")
    eval_file = os.path.join(output_dir, "eval.tf_record")

    # save data to tf_record
    if not os.path.isfile(train_file):
        train_examples = processor.get_train_examples(data_dir)
        file_based_convert_examples_to_features(
            train_examples, label_list, max_seq_length, tokenizer, train_file, task_name)
        del train_examples

    # eval data
    if not os.path.isfile(eval_file):
        eval_examples = processor.get_dev_examples(data_dir)
        file_based_convert_examples_to_features(
            eval_examples, label_list, max_seq_length, tokenizer, eval_file, task_name)
        del eval_examples
def main(_): tf.logging.set_verbosity(tf.logging.INFO) albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) validate_flags_or_throw(albert_config) tf.gfile.MakeDirs(FLAGS.output_dir) tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int(min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None train_examples = squad_utils.read_squad_examples( input_file=FLAGS.train_file, is_training=True) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) if FLAGS.do_train: num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) # Pre-shuffle the input to avoid having to make a very large shuffle # buffer in in the `input_fn`. rng = random.Random(12345) rng.shuffle(train_examples) model_fn = squad_utils.v1_model_fn_builder( albert_config=albert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: # We write to a temporary file to avoid storing very large constant tensors # in memory. 
if not tf.gfile.Exists(FLAGS.train_feature_file): train_writer = squad_utils.FeatureWriter( filename=os.path.join(FLAGS.train_feature_file), is_training=True) squad_utils.convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=True, output_fn=train_writer.process_feature, do_lower_case=FLAGS.do_lower_case) train_writer.close() tf.logging.info("***** Running training *****") tf.logging.info(" Num orig examples = %d", len(train_examples)) # tf.logging.info(" Num split examples = %d", train_writer.num_features) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) del train_examples train_input_fn = squad_utils.input_fn_builder( input_file=FLAGS.train_feature_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size, is_v2=False) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_predict: with tf.gfile.Open(FLAGS.predict_file) as predict_file: prediction_json = json.load(predict_file)["data"] eval_examples = squad_utils.read_squad_examples( input_file=FLAGS.predict_file, is_training=False) if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists( FLAGS.predict_feature_left_file)): tf.logging.info("Loading eval features from {}".format( FLAGS.predict_feature_left_file)) with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin: eval_features = pickle.load(fin) else: eval_writer = squad_utils.FeatureWriter( filename=FLAGS.predict_feature_file, is_training=False) eval_features = [] def append_feature(feature): eval_features.append(feature) eval_writer.process_feature(feature) squad_utils.convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=False, output_fn=append_feature, do_lower_case=FLAGS.do_lower_case) eval_writer.close() with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout: pickle.dump(eval_features, fout) tf.logging.info("***** Running predictions *****") tf.logging.info(" Num orig examples = %d", len(eval_examples)) tf.logging.info(" Num split examples = %d", len(eval_features)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_input_fn = squad_utils.input_fn_builder( input_file=FLAGS.predict_feature_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size, is_v2=False) def get_result(checkpoint): """Evaluate the checkpoint on SQuAD 1.0.""" # If running eval on the TPU, you will need to specify the number of # steps. 
reader = tf.train.NewCheckpointReader(checkpoint) global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP) all_results = [] for result in estimator.predict( predict_input_fn, yield_single_examples=True, checkpoint_path=checkpoint): if len(all_results) % 1000 == 0: tf.logging.info("Processing example: %d" % (len(all_results))) unique_id = int(result["unique_ids"]) start_log_prob = [float(x) for x in result["start_log_prob"].flat] end_log_prob = [float(x) for x in result["end_log_prob"].flat] all_results.append( squad_utils.RawResult( unique_id=unique_id, start_log_prob=start_log_prob, end_log_prob=end_log_prob)) output_prediction_file = os.path.join( FLAGS.output_dir, "predictions.json") output_nbest_file = os.path.join( FLAGS.output_dir, "nbest_predictions.json") result_dict = {} squad_utils.accumulate_predictions_v1( result_dict, eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length) predictions = squad_utils.write_predictions_v1( result_dict, eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, output_prediction_file, output_nbest_file) return squad_utils.evaluate_v1( prediction_json, predictions), int(global_step) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") key_name = "f1" writer = tf.gfile.GFile(output_eval_file, "w") if tf.gfile.Exists(checkpoint_path + ".index"): result = get_result(checkpoint_path) best_perf = result[0][key_name] global_step = result[1] else: global_step = -1 best_perf = -1 checkpoint_path = None while global_step < num_train_steps: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info("Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info("found 0 file, global step: {}. Sleeping." 
.format(global_step)) time.sleep(1) else: for ele in sorted(steps_and_files.items()): step, checkpoint_path = ele if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result, global_step = get_result(checkpoint_path) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt)) writer.write("best {} = {}\n".format(key_name, best_perf)) tf.logging.info(" best {} = {}\n".format(key_name, best_perf)) if len(_find_valid_cands(global_step)) > 2: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result, global_step = get_result(checkpoint_path) tf.logging.info("***** Final Eval results *****\n") writer.write("===== Hyperparameters =====\n") writer.write("Training batch size: {}\n".format(FLAGS.train_batch_size)) writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length)) writer.write("Learning rate: {}\n".format(FLAGS.learning_rate)) if num_train_steps and num_warmup_steps: writer.write("Training steps: {}\n".format(num_train_steps)) writer.write("Warmup steps: {}\n".format(num_warmup_steps)) writer.write("===== Evuations =====\n") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best perf happened at step: {}".format(global_step))
# Author: dgm
# Description: data preprocessing
# Date: 2020-08-14
import math
import codecs
import random

from albert import tokenization
from utils import create_dico, create_mapping, zero_digits

tokenizer = tokenization.FullTokenizer(
    vocab_file='albert_model/albert_base/vocab_chinese.txt', do_lower_case=True)


def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    num = 0
    for line in codecs.open(path, 'r', 'utf8'):
        num += 1
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
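# A minimal sketch of applying the module-level tokenizer above to the loaded
# sentences (assumptions: the training-file path is hypothetical, and each item
# in a sentence is a word/tag pair with the word at index 0, as the DOCSTART
# check in load_sentences suggests).
sentences = load_sentences("data/train.txt", lower=True, zeros=True)  # assumed path
for sentence in sentences[:1]:
    words = [token[0] for token in sentence]
    pieces = tokenizer.tokenize(" ".join(words))
    ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])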
def trans_to_features(self, examples, is_training):
    """
    Convert the inputs into index representations.
    :param examples: input examples
    :param is_training:
    :return:
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.__vocab_path, do_lower_case=True)
    features = []
    unique_id = 1000000000
    for (example_index, example) in enumerate(examples):
        # tokenize the query with the wordpiece tokenizer
        query_tokens = tokenizer.tokenize(example['question'])
        # cap the query at a maximum length
        if len(query_tokens) > self.__query_length:
            query_tokens = query_tokens[:self.__query_length]

        # Build indices for the context. Earlier, Chinese characters, punctuation,
        # whitespace, digit runs and English words were split and stored in doc_tokens,
        # but the bert tokenizer further splits digit runs, Chinese and English text
        # into subwords, so the tokens obtained after bert tokenization differ from the
        # doc_tokens obtained before. We therefore still need to map the start and end
        # positions from their places in doc_tokens onto the current tokens.
        tok_to_orig_index = []  # indices of the original (unsplit) tokens; same length as all_doc_tokens below
        orig_to_tok_index = []  # indices of the subword tokens; not contiguous, gaps can appear
        all_doc_tokens = []  # the subword tokens; in theory longer than all_tokens
        for (i, token) in enumerate(example['doc_tokens']):
            sub_tokens = tokenizer.tokenize(token)
            # orig_to_tok_index has the same length as doc_tokens; each entry stores the start and
            # end indices of that doc_tokens token within all_doc_tokens, and is used to move the
            # start/end positions from all_token onto all_doc_tokens
            orig_to_tok_index.append([len(all_doc_tokens)])
            for sub_token in sub_tokens:
                # tok_to_orig_index has the same length as all_doc_tokens and contains repeated values
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)
            orig_to_tok_index[-1].append(len(all_doc_tokens) - 1)

        tok_start_position = -1
        tok_end_position = -1
        if is_training:
            # mapping from the original tokens to the new tokens; this is the new start position
            tok_start_position = orig_to_tok_index[
                example['start_position']][0]
            tok_end_position = orig_to_tok_index[
                example['end_position']][1]
            tok_start_position, tok_end_position = self._improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example['orig_answer_text'])

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = self.__max_length - len(query_tokens) - 3

        doc_spans = []
        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
        # With bert the maximum sequence length is usually capped at 512, so a context that is
        # longer than the maximum length must be split into several spans. A sliding window is
        # used; the window is smaller than the maximum length, so neighbouring spans share
        # overlapping sub-segments.
        start_offset = 0  # start position of the extracted span
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            # when the remaining length exceeds the budget, the sliding window is needed
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):  # holds when length < max_len
                break
            start_offset += min(length, self.__doc_stride)

        # combine the query and a context span into one sequence fed into bert
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            # Because spans share overlapping sub-segments, the same token can matter more in one
            # span than in another. Its importance is decided by the amount of surrounding
            # context, and at prediction time a token that appears in two spans only keeps the
            # score from the span where its importance is higher.
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                # map the index inside the current span's sentence pair back to the original token index
                token_to_orig_map[len(tokens)] = tok_to_orig_index[
                    split_token_index]
                # When the sliding window produces several spans, some words appear in more than
                # one span, but only one span can be counted in the end. The authors score each
                # occurrence by the number of surrounding context words and keep the span with
                # the highest score.
                is_max_context = self._check_is_max_context(
                    doc_spans, doc_span_index, split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < self.__max_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == self.__max_length
            assert len(input_mask) == self.__max_length
            assert len(segment_ids) == self.__max_length

            start_position = -1
            end_position = -1
            if is_training:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                if tok_start_position == -1 and tok_end_position == -1:
                    start_position = 0  # the question has no answer; 0 is the position of [CLS]
                    end_position = 0
                else:  # if the question originally has an answer, drop the features that miss it
                    out_of_span = False
                    doc_start = doc_span.start  # start and end mapped back to the original text
                    doc_end = doc_span.start + doc_span.length - 1
                    if not (tok_start_position >= doc_start and
                            tok_end_position <= doc_end):
                        # this window misses the answer and serves as no-answer augmentation
                        out_of_span = True
                    if out_of_span:
                        start_position = 0
                        end_position = 0
                    else:
                        doc_offset = len(query_tokens) + 2
                        start_position = tok_start_position - doc_start + doc_offset
                        end_position = tok_end_position - doc_start + doc_offset

            features.append({
                'unique_id': unique_id,
                'example_index': example_index,
                'doc_span_index': doc_span_index,
                'tokens': tokens,
                'token_to_orig_map': token_to_orig_map,
                'token_is_max_context': token_is_max_context,
                'input_ids': input_ids,
                'input_mask': input_mask,
                'segment_ids': segment_ids,
                'start_position': start_position,
                'end_position': end_position
            })
            unique_id += 1

    return features
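# A small self-contained illustration of the sliding-window logic used above:
# with a 10-token document, a 6-token budget, and a stride of 4, the document is
# split into overlapping spans. The numbers are toy values, not the defaults of
# this project; only the span-building rule itself mirrors the loop above.
import collections

_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_tokens, max_tokens_for_doc, doc_stride):
    doc_spans = []
    start_offset = 0
    while start_offset < num_tokens:
        length = min(num_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == num_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

print(make_doc_spans(10, 6, 4))  # [DocSpan(start=0, length=6), DocSpan(start=4, length=6)]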
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, sp_cdc_file=FLAGS.cdc_spm_model_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, keep_checkpoint_max=8, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: if FLAGS.data_examples: tf.gfile.MakeDirs(FLAGS.data_examples) train_file = os.path.join(FLAGS.data_examples, "train.tf_record") else: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) tf.logging.set_verbosity(tf.logging.INFO) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) if FLAGS.data_examples: tf.gfile.MakeDirs(FLAGS.data_examples) eval_file = os.path.join(FLAGS.data_examples, "eval.tf_record") else: eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) if FLAGS.data_examples: tf.gfile.MakeDirs(FLAGS.data_examples) predict_file = os.path.join(FLAGS.data_examples, "predict.tf_record") else: predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ tf.gfile.GFile(output_submit_file, "w") as sub_writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, (example, prediction)) in\ enumerate(zip(predict_examples, result)): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" pred_writer.write(output_line) actual_label = label_list[int(prediction["predictions"])] sub_writer.write( six.ensure_str(example.guid) + "\t" + actual_label + "\n") num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = CommonsenseQAProcessor if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True.") if not FLAGS.albert_config_file: raise ValueError("At least one of `--albert_config_file`must be set") if FLAGS.albert_config_file: albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) else: albert_config = None # Get the config from TF-Hub. tf.gfile.MakeDirs(FLAGS.output_dir) processor = processors( use_spm=True if FLAGS.spm_model_file else False, do_lower_case=FLAGS.do_lower_case) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int(min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=int(FLAGS.save_checkpoints_steps), keep_checkpoint_max=0, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder( albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps= num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, optimizer=FLAGS.optimizer) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir train_file = os.path.join(cached_dir, "train.tf_record") if not tf.gfile.Exists(train_file): file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(classifier_utils.PaddingInputExample()) cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir eval_file = os.path.join(cached_dir, "eval.tf_record") if not tf.gfile.Exists(eval_file): file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) best_trial_info_file = os.path.join(FLAGS.output_dir, "best_trial.txt") def _best_trial_info(): """Returns information about which checkpoints have been evaled so far.""" if tf.gfile.Exists(best_trial_info_file): with tf.gfile.GFile(best_trial_info_file, "r") as best_info: global_step, best_metric_global_step, metric_value = ( best_info.read().split(":")) global_step = int(global_step) best_metric_global_step = int(best_metric_global_step) metric_value = float(metric_value) else: metric_value = -1 best_metric_global_step = -1 global_step = -1 tf.logging.info( "Best trial info: Step: %s, Best Value Step: %s, " "Best Value: %s", global_step, best_metric_global_step, metric_value) return global_step, best_metric_global_step, metric_value def _remove_checkpoint(checkpoint_path): for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") global_step, best_perf_global_step, best_perf = _best_trial_info() writer = tf.gfile.GFile(output_eval_file, "w") while global_step < num_train_steps: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info("Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info("found 0 file, global step: {}. Sleeping." 
.format(global_step)) time.sleep(60) else: for checkpoint in sorted(steps_and_files.items()): step, checkpoint_path = checkpoint if global_step >= step: if (best_perf_global_step != step and len(_find_valid_cands(step)) > 1): _remove_checkpoint(checkpoint_path) continue result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if result["eval_accuracy"] > best_perf: best_perf = result["eval_accuracy"] best_perf_global_step = global_step elif len(_find_valid_cands(global_step)) > 1: _remove_checkpoint(checkpoint_path) writer.write("=" * 50 + "\n") writer.flush() with tf.gfile.GFile(best_trial_info_file, "w") as best_info: best_info.write("{}:{}:{}".format( global_step, best_perf_global_step, best_perf)) writer.close() for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = "model.ckpt-{}.{}".format(best_perf_global_step, ext) tgt_ckpt = "model.ckpt-best.{}".format(ext) tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) tf.io.gfile.rename( os.path.join(FLAGS.output_dir, src_ckpt), os.path.join(FLAGS.output_dir, tgt_ckpt), overwrite=True) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(classifier_utils.PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result = estimator.predict( input_fn=predict_input_fn, checkpoint_path=checkpoint_path) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.csv") output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.csv") with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ tf.gfile.GFile(output_submit_file, "w") as sub_writer: sub_writer.write("index" + "\t" + "prediction\n") num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, (example, prediction)) in\ enumerate(zip(predict_examples, result)): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" pred_writer.write(output_line) actual_label = label_list[int(prediction["predictions"])] sub_writer.write(example.guid + "\t" + actual_label + "\n") num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs): """ Load Transformer toxicity model. Parameters ---------- model : str, optional (default='bert') Model architecture supported. Allowed values: * ``'bert'`` - Google BERT BASE parameters. * ``'tiny-bert'`` - Google BERT TINY parameters. * ``'albert'`` - Google ALBERT BASE parameters. * ``'tiny-albert'`` - Google ALBERT TINY parameters. * ``'xlnet'`` - Google XLNET BASE parameters. * ``'alxlnet'`` - Malaya ALXLNET BASE parameters. quantized : bool, optional (default=False) if True, will load 8-bit quantized model. Quantized model not necessary faster, totally depends on the machine. Returns ------- result : malaya.model.bert.SIGMOID_BERT class """ model = model.lower() if model not in _transformer_availability: raise Exception( 'model not supported, please check supported models from `malaya.toxicity.available_transformer()`.' ) check_file( PATH_TOXIC[model], S3_PATH_TOXIC[model], quantized = quantized, **kwargs ) if quantized: model_path = 'quantized' else: model_path = 'model' g = load_graph(PATH_TOXIC[model][model_path], **kwargs) path = PATH_TOXIC if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: if model in ['bert', 'tiny-bert']: from malaya.transformers.bert import ( _extract_attention_weights_import, ) from malaya.transformers.bert import bert_num_layers tokenizer = sentencepiece_tokenizer_bert( path[model]['tokenizer'], path[model]['vocab'] ) if model in ['albert', 'tiny-albert']: from malaya.transformers.albert import ( _extract_attention_weights_import, ) from malaya.transformers.albert import bert_num_layers from albert import tokenization tokenizer = tokenization.FullTokenizer( vocab_file = path[model]['vocab'], do_lower_case = False, spm_model_file = path[model]['tokenizer'], ) return SIGMOID_BERT( X = g.get_tensor_by_name('import/Placeholder:0'), segment_ids = None, input_masks = g.get_tensor_by_name('import/Placeholder_1:0'), logits = g.get_tensor_by_name('import/logits:0'), logits_seq = g.get_tensor_by_name('import/logits_seq:0'), vectorizer = g.get_tensor_by_name('import/dense/BiasAdd:0'), sess = generate_session(graph = g, **kwargs), tokenizer = tokenizer, label = label, attns = _extract_attention_weights_import( bert_num_layers[model], g ), class_name = 'toxic', ) if model in ['xlnet', 'alxlnet']: if model in ['xlnet']: from malaya.transformers.xlnet import ( _extract_attention_weights_import, ) if model in ['alxlnet']: from malaya.transformers.alxlnet import ( _extract_attention_weights_import, ) tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer']) return SIGMOID_XLNET( X = g.get_tensor_by_name('import/Placeholder:0'), segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'), input_masks = g.get_tensor_by_name('import/Placeholder_2:0'), logits = g.get_tensor_by_name('import/logits:0'), logits_seq = g.get_tensor_by_name('import/logits_seq:0'), vectorizer = g.get_tensor_by_name('import/transpose_3:0'), sess = generate_session(graph = g, **kwargs), tokenizer = tokenizer, label = label, attns = _extract_attention_weights_import(g), class_name = 'toxic', )
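# A minimal usage sketch based on the docstring above (assumptions: this loader
# is exposed as malaya.toxicity.transformer, as its error message suggests, and
# the model files can be downloaded on first call).
import malaya

toxicity_model = malaya.toxicity.transformer(model='albert', quantized=False)
# the returned SIGMOID_BERT / SIGMOID_XLNET object can then score raw strings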
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": classifier_utils.ColaProcessor, "mnli": classifier_utils.MnliProcessor, "mismnli": classifier_utils.MisMnliProcessor, "mrpc": classifier_utils.MrpcProcessor, "rte": classifier_utils.RteProcessor, "sst-2": classifier_utils.Sst2Processor, "sts-b": classifier_utils.StsbProcessor, "qqp": classifier_utils.QqpProcessor, "qnli": classifier_utils.QnliProcessor, "wnli": classifier_utils.WnliProcessor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]( use_spm=True if FLAGS.spm_model_file else False, do_lower_case=FLAGS.do_lower_case) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int( min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=int(FLAGS.save_checkpoints_steps), keep_checkpoint_max=0, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) model_fn = classifier_utils.model_fn_builder( albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.train_step, num_warmup_steps=FLAGS.warmup_step, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, task_name=task_name, optimizer=FLAGS.optimizer) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir train_file = os.path.join(cached_dir, task_name + "_train.tf_record") if not tf.gfile.Exists(train_file): classifier_utils.file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, task_name) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", FLAGS.train_step) train_input_fn = classifier_utils.file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(classifier_utils.PaddingInputExample()) cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record") if not tf.gfile.Exists(eval_file): classifier_utils.file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, task_name) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = classifier_utils.file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.eval_batch_size) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") if task_name == "sts-b": key_name = "pearson" elif task_name == "cola": key_name = "matthew_corr" else: key_name = "eval_accuracy" if tf.gfile.Exists(checkpoint_path + ".index"): result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) best_perf = result[key_name] global_step = result["global_step"] else: global_step = -1 best_perf = -1 checkpoint_path = None writer = tf.gfile.GFile(output_eval_file, "w") while global_step < FLAGS.train_step: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. 
Sleeping.".format( global_step)) time.sleep(1) else: for checkpoint in sorted(steps_and_files.items()): step, checkpoint_path = checkpoint if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in [ "meta", "data-00000-of-00001", "index" ]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format( src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format( src_ckpt, tgt_ckpt)) if len(_find_valid_cands(global_step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") writer.close() if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(classifier_utils.PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") classifier_utils.file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, task_name) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = classifier_utils.file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size) checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result = estimator.predict(input_fn=predict_input_fn, checkpoint_path=checkpoint_path) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ tf.gfile.GFile(output_submit_file, "w") as sub_writer: sub_writer.write("index" + "\t" + "prediction\n") num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, (example, prediction)) in\ enumerate(zip(predict_examples, result)): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" pred_writer.write(output_line) if task_name != "sts-b": actual_label = label_list[int(prediction["predictions"])] else: actual_label = str(prediction["predictions"]) sub_writer.write(example.guid + "\t" + actual_label + "\n") num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = {"race": race_utils.RaceProcessor} tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) albert_config.hidden_dropout_prob = FLAGS.albert_dropout_prob albert_config.attention_probs_dropout_prob = FLAGS.albert_dropout_prob if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]( use_spm=True if FLAGS.spm_model_file else False, do_lower_case=FLAGS.do_lower_case, high_only=FLAGS.high_only, middle_only=FLAGS.middle_only) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) # multiple gpus NUM_GPUS = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1 using_customized_optimizer = None if NUM_GPUS > 1 and FLAGS.strategy_type == "mirror": os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(i) for i in list(range(NUM_GPUS))]) # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=NUM_GPUS, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=NUM_GPUS), ) using_customized_optimizer = True tf.logging.info('Use MirroredStrategy with %d devices.', strategy.num_replicas_in_sync) else: strategy = tf.distribute.OneDeviceStrategy("GPU:0") using_customized_optimizer = False tf.logging.info('Single device mode.') tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int( min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=int(FLAGS.save_checkpoints_steps), keep_checkpoint_max=0, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), train_distribute=strategy, eval_distribute=strategy, #get error during evaluation ) train_examples = None total_time = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) model_fn = race_utils.model_fn_builder( albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.train_step, num_warmup_steps=FLAGS.warmup_step, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, max_seq_length=FLAGS.max_seq_length, dropout_prob=FLAGS.dropout_prob, customized=using_customized_optimizer) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
if FLAGS.use_tpu and FLAGS.tpu_name: tf.logging.info("Use TPUEstimator") estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) else: tf.logging.info("Use normal Estimator") estimator = Estimator( model_fn=model_fn, params={}, config=run_config, ) if FLAGS.do_train: if not tf.gfile.Exists(FLAGS.train_feature_file): race_utils.file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.train_feature_file, FLAGS.max_qa_length) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", FLAGS.train_step) train_input_fn = classifier_utils.file_based_input_fn_builder( input_file=FLAGS.train_feature_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size, multiple=len(label_list)) time_hist = TimeHistory() estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step) total_time = sum(time_hist.times) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(classifier_utils.PaddingInputExample()) if not tf.gfile.Exists(FLAGS.eval_feature_file): race_utils.file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.eval_feature_file, FLAGS.max_qa_length) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = classifier_utils.file_based_input_fn_builder( input_file=FLAGS.eval_feature_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.eval_batch_size, multiple=len(label_list)) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") key_name = "eval_accuracy" if tf.gfile.Exists(checkpoint_path + ".index"): result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) best_perf = result[key_name] global_step = result["global_step"] else: global_step = -1 best_perf = -1 checkpoint_path = None writer = tf.gfile.GFile(output_eval_file, "w") avg_time_per_batch = np.mean(time_hist.times) writer.write("===== Hyperparameters =====\n") writer.write("Training batch size: {}\n".format( FLAGS.train_batch_size)) writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length)) writer.write("Max qa length: {}\n".format(FLAGS.max_qa_length)) writer.write("Learning rate: {}\n".format(FLAGS.learning_rate)) writer.write("Num of GPU cores: {}\n".format(NUM_GPUS)) if FLAGS.do_train: avg_time_per_batch = np.mean(time_hist.times) writer.write("Total time: {}\n".format(total_time)) writer.write("Speed: {}\n".format(FLAGS.train_batch_size * NUM_GPUS / avg_time_per_batch)) if FLAGS.train_step and FLAGS.warmup_step: writer.write("Training steps: {}\n".format(FLAGS.train_step)) writer.write("Warmup steps: {}\n".format(FLAGS.warmup_step)) while global_step < FLAGS.train_step: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) # steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. 
Sleeping.".format( global_step)) time.sleep(1) else: for ele in sorted(steps_and_files.items()): step, checkpoint_path = ele if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in [ "meta", "data-00000-of-00001", "index" ]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format( src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format( src_ckpt, tgt_ckpt)) if len(_find_valid_cands(global_step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") writer.close() if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(classifier_utils.PaddingInputExample()) assert len(predict_examples) % FLAGS.predict_batch_size == 0 predict_steps = int( len(predict_examples) // FLAGS.predict_batch_size) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") race_utils.file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, FLAGS.max_qa_length) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = classifier_utils.file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size, multiple=len(label_list)) checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result = estimator.evaluate(input_fn=predict_input_fn, steps=predict_steps, checkpoint_path=checkpoint_path) output_predict_file = os.path.join(FLAGS.output_dir, "predict_results.txt") with tf.gfile.GFile(output_predict_file, "w") as pred_writer: # num_written_lines = 0 tf.logging.info("***** Predict results *****") pred_writer.write("***** Predict results *****\n") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) pred_writer.write("%s = %s\n" % (key, str(result[key]))) pred_writer.write("best = {}\n".format(best_perf))
def _transformer(model, bert_class, xlnet_class, **kwargs): model = model.lower() if model not in _availability: raise Exception( 'model not supported, please check supported models from malaya.similarity.available_transformer()' ) check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], **kwargs) g = load_graph(PATH_SIMILARITY[model]['model'], **kwargs) path = PATH_SIMILARITY if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: if model in ['bert', 'tiny-bert']: from malaya.transformers.bert import ( _extract_attention_weights_import, ) from malaya.transformers.bert import bert_num_layers tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab']) if model in ['albert', 'tiny-albert']: from malaya.transformers.albert import ( _extract_attention_weights_import, ) from malaya.transformers.albert import bert_num_layers from albert import tokenization tokenizer = tokenization.FullTokenizer( vocab_file=path[model]['vocab'], do_lower_case=False, spm_model_file=path[model]['tokenizer'], ) return bert_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, label=['not similar', 'similar'], ) if model in ['xlnet', 'alxlnet']: if model in ['xlnet']: from malaya.transformers.xlnet import ( _extract_attention_weights_import, ) if model in ['alxlnet']: from malaya.transformers.alxlnet import ( _extract_attention_weights_import, ) tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer']) return xlnet_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, label=['not similar', 'similar'], )
def main(_): tf.logging.set_verbosity(tf.logging.INFO) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) validate_flags_or_throw(albert_config) tf.gfile.MakeDirs(FLAGS.output_dir) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) # multiple gpus NUM_GPUS = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1 using_customized_optimizer = None if NUM_GPUS > 1 and FLAGS.strategy_type == "mirror": os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(i) for i in list(range(NUM_GPUS))]) # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=NUM_GPUS, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=NUM_GPUS), ) using_customized_optimizer = True tf.logging.info('Use MirroredStrategy with %d devices.', strategy.num_replicas_in_sync) else: strategy = tf.distribute.OneDeviceStrategy("GPU:0") using_customized_optimizer = False tf.logging.info('Single device mode.') tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int( min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), train_distribute=strategy, eval_distribute=strategy, #get error during evaluation ) train_examples = None num_train_steps = None num_warmup_steps = None train_examples = squad_utils.read_squad_examples( input_file=FLAGS.train_file, is_training=True) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) if FLAGS.do_train: num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) # Pre-shuffle the input to avoid having to make a very large shuffle # buffer in in the `input_fn`. rng = random.Random(12345) rng.shuffle(train_examples) model_fn = squad_utils.v2_model_fn_builder( albert_config=albert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, max_seq_length=FLAGS.max_seq_length, start_n_top=FLAGS.start_n_top, end_n_top=FLAGS.end_n_top, dropout_prob=FLAGS.dropout_prob, customized=using_customized_optimizer, optimizer=FLAGS.optimizer) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_tpu and FLAGS.tpu_name: tf.logging.info("Use TPUEstimator") estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) else: tf.logging.info("Use normal Estimator") estimator = Estimator( model_fn=model_fn, params={}, config=run_config, ) if FLAGS.do_train: # We write to a temporary file to avoid storing very large constant tensors # in memory. 
if not tf.gfile.Exists(FLAGS.train_feature_file): train_writer = squad_utils.FeatureWriter(filename=os.path.join( FLAGS.train_feature_file), is_training=True) squad_utils.convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=True, output_fn=train_writer.process_feature, do_lower_case=FLAGS.do_lower_case) train_writer.close() tf.logging.info("***** Running training *****") tf.logging.info(" Num orig examples = %d", len(train_examples)) # tf.logging.info(" Num split examples = %d", train_writer.num_features) tf.logging.info( f" Batch size = {FLAGS.train_batch_size} * {NUM_GPUS}") tf.logging.info(" Num steps = %d", num_train_steps) del train_examples train_input_fn = squad_utils.input_fn_builder( input_file=FLAGS.train_feature_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size, is_v2=True) time_hist = TimeHistory() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) total_time = sum(time_hist.times) if FLAGS.do_predict: with tf.gfile.Open(FLAGS.predict_file) as predict_file: prediction_json = json.load(predict_file)["data"] eval_examples = squad_utils.read_squad_examples( input_file=FLAGS.predict_file, is_training=False) if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists(FLAGS.predict_feature_left_file)): tf.logging.info("Loading eval features from {}".format( FLAGS.predict_feature_left_file)) with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin: eval_features = pickle.load(fin) else: eval_writer = squad_utils.FeatureWriter( filename=FLAGS.predict_feature_file, is_training=False) eval_features = [] def append_feature(feature): eval_features.append(feature) eval_writer.process_feature(feature) squad_utils.convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=False, output_fn=append_feature, do_lower_case=FLAGS.do_lower_case) eval_writer.close() with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout: pickle.dump(eval_features, fout) tf.logging.info("***** Running predictions *****") tf.logging.info(" Num orig examples = %d", len(eval_examples)) tf.logging.info(" Num split examples = %d", len(eval_features)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_input_fn = squad_utils.input_fn_builder( input_file=FLAGS.predict_feature_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size, is_v2=True) def get_result(checkpoint): """Evaluate the checkpoint on SQuAD v2.0.""" # If running eval on the TPU, you will need to specify the number of # steps. 
reader = tf.train.NewCheckpointReader(checkpoint) global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP) all_results = [] for result in estimator.predict(predict_input_fn, yield_single_examples=True, checkpoint_path=checkpoint): if len(all_results) % 1000 == 0: tf.logging.info("Processing example: %d" % (len(all_results))) unique_id = int(result["unique_ids"]) start_top_log_probs = ([ float(x) for x in result["start_top_log_probs"].flat ]) start_top_index = [ int(x) for x in result["start_top_index"].flat ] end_top_log_probs = ([ float(x) for x in result["end_top_log_probs"].flat ]) end_top_index = [int(x) for x in result["end_top_index"].flat] cls_logits = float(result["cls_logits"].flat[0]) all_results.append( squad_utils.RawResultV2( unique_id=unique_id, start_top_log_probs=start_top_log_probs, start_top_index=start_top_index, end_top_log_probs=end_top_log_probs, end_top_index=end_top_index, cls_logits=cls_logits)) output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json") result_dict = {} cls_dict = {} squad_utils.accumulate_predictions_v2( result_dict, cls_dict, eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.start_n_top, FLAGS.end_n_top) return squad_utils.evaluate_v2( result_dict, cls_dict, prediction_json, eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file), int(global_step) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") key_name = "f1" writer = tf.gfile.GFile(output_eval_file, "w") avg_time_per_batch = np.mean(time_hist.times) writer.write("===== Hyperparameters =====\n") writer.write("Training batch size: {}\n".format( FLAGS.train_batch_size)) writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length)) writer.write("Learning rate: {}\n".format(FLAGS.learning_rate)) writer.write("Num of GPU cores: {}\n".format(NUM_GPUS)) if FLAGS.do_train: avg_time_per_batch = np.mean(time_hist.times) writer.write("Total time: {}\n".format(total_time)) writer.write("Speed: {}\n".format(FLAGS.train_batch_size * NUM_GPUS / avg_time_per_batch)) if num_train_steps and num_warmup_steps: writer.write("Training steps: {}\n".format(num_train_steps)) writer.write("Warmup steps: {}\n".format(num_warmup_steps)) if tf.gfile.Exists(checkpoint_path + ".index"): result = get_result(checkpoint_path) best_perf = result[0][key_name] global_step = result[1] else: global_step = -1 best_perf = -1 checkpoint_path = None while global_step < num_train_steps: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename 
tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. Sleeping.".format( global_step)) time.sleep(1) else: for ele in sorted(steps_and_files.items()): step, checkpoint_path = ele if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in [ "meta", "data-00000-of-00001", "index" ]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result, global_step = get_result(checkpoint_path) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format( src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format( src_ckpt, tgt_ckpt)) writer.write("best {} = {}\n".format(key_name, best_perf)) tf.logging.info(" best {} = {}\n".format( key_name, best_perf)) if len(_find_valid_cands(global_step)) > 2: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result, global_step = get_result(checkpoint_path) tf.logging.info("***** Final Eval results *****") tf.logging.info(f"num_gpu_cores = {NUM_GPUS}") writer.write("===== Evuations =====\n") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best perf happened at step: {}".format(global_step))
def save_as_record(path, data, vocab_path, max_length, fields): path_tag_size = [40, 10, 100] max_ner_size = 64 events = { 'EquityFreeze': 0, 'EquityRepurchase': 1, 'EquityUnderweight': 2, 'EquityOverweight': 3, 'EquityPledge': 4 } events_fields = { 'EquityFreeze': [ 'EquityHolder', 'FrozeShares', 'LegalInstitution', 'TotalHoldingShares', 'TotalHoldingRatio', 'StartDate', 'EndDate', 'UnfrozeDate' ], 'EquityRepurchase': [ 'CompanyName', 'HighestTradingPrice', 'LowestTradingPrice', 'RepurchasedShares', 'ClosingDate', 'RepurchaseAmount' ], 'EquityUnderweight': [ 'EquityHolder', 'TradedShares', 'StartDate', 'EndDate', 'LaterHoldingShares', 'AveragePrice' ], 'EquityOverweight': [ 'EquityHolder', 'TradedShares', 'StartDate', 'EndDate', 'LaterHoldingShares', 'AveragePrice' ], 'EquityPledge': [ 'Pledger', 'PledgedShares', 'Pledgee', 'TotalHoldingShares', 'TotalHoldingRatio', 'TotalPledgedShares', 'StartDate', 'EndDate', 'ReleasedDate' ] } train_writer = tf.python_io.TFRecordWriter(path) vocab = tokenization.FullTokenizer(vocab_path) for x in tqdm(data): # process the raw text sentences, sentences_mask = process(x['sentences'], vocab, max_length) # build the NER tags # start from an empty ner_tag ner_tag = np.zeros(max_length, dtype=np.int32) flag = 0 for w in x['ann_mspan2dranges'].keys(): field = w tag = x['ann_mspan2guess_field'][field] indexs = x['ann_mspan2dranges'][field] for index in indexs: if index[2] > max_length[1] - 1: flag = 1 break ner_tag[index[0]][index[1] + 1] = fields['%s_B' % str(tag).upper()] for i in range(index[1] + 1, index[2]): ner_tag[index[0]][i + 1] = fields['%s_I' % str(tag).upper()] if flag == 1: continue # generate the path tags # dimensions: number of paths, number of fields, number of candidate values path_tag = np.zeros(path_tag_size, dtype=np.int32) + (-1) # stores the complete paths path_entity_list = np.zeros(path_tag_size[:2], dtype=np.int32) + (-1) path_event_type = np.zeros([path_tag_size[0]], dtype=np.int32) + (-1) event_tag = [0 for i in range(5)] # auxiliary data: entity indices ners = [k for k in x['ann_mspan2dranges'].keys()] ner_index = [] ner_list_index = [0] for k in ners: ner_index.extend(x['ann_mspan2dranges'][k]) # ner_index[-1][1] += 1 # ner_index[-1][2] += 1 ner_list_index.append(ner_list_index[-1] + len(x['ann_mspan2dranges'][k])) for k in ner_index: k[1] += 1 k[2] += 1 for i in range(max_ner_size - len(ner_index)): ner_index.append([-1, -1, -1]) for i in range(max_ner_size - len(ner_list_index)): ner_list_index.append(-1) event_tree = {} for e in x['recguid_eventname_eventdict_list']: # build the event tag event_tag[events[e[1]]] = 1 if events[e[1]] not in event_tree.keys(): event_tree[events[e[1]]] = {} et = event_tree[events[e[1]]] # merge records that share the same prefix for f in events_fields[e[1]]: value = e[2][f] if value == None: value = 'NA' if value not in et.keys(): et[value] = {} et = et[value] # create the path tags path_type = np.zeros([10], dtype=np.int32) + (-1) paths = [] for index, k in enumerate(event_tree.keys()): start = len(paths) paths.extend(get_path(event_tree[k])) path_type[start:len(paths)] = k for index2, path in enumerate(paths[start:]): cache = event_tree[k][path[0]] # step past the first node if path[0] != 'NA': path_entity_list[start + index2, 0] = ners.index(path[0]) + 1 else: path_entity_list[start + index2, 0] = 0 for i, p in enumerate(path[1:]): tag = np.array( [0 if f not in cache.keys() else 1 for f in ners], dtype=np.int32) tag = np.concatenate([ tag, np.zeros([path_tag_size[-1] - tag.size], dtype=np.int32) ], axis=0) path_tag[start + index2, i + 1, :] = tag if p != 'NA': path_entity_list[start + index2, i + 1] = ners.index(p) + 1 else: path_entity_list[start + index2, i + 1] = 0 cache = cache[p] path_event_type[start + index2] = k
tag = np.array( [0 if f not in [c[0] for c in paths] else 1 for f in ners], dtype=np.int32) tag = np.concatenate([ tag, np.zeros([path_tag_size[-1] - tag.size], dtype=np.int32) ], axis=0) path_tag[start:len(paths), 0, :] = tag if len(ner_list_index) != max_ner_size: continue if len(ner_index) != max_ner_size: continue # # test # def select_path(path_tag, path_num, path_event_type, path_entity_list): # path_index = np.random.randint(0, path_num[0], size=1, dtype=np.int32)[0] # return path_tag[path_index], path_index, path_event_type[path_index], path_entity_list[path_index] # # path_tag, path_index, path_event_type, path_entity_list = select_path(path_tag, [len(paths)], path_event_type, # path_entity_list) # # # remove the padded entries from ner_index # def select_nert_index(path_entity_list): # size2 = path_entity_list.argmin(axis=0) # return path_entity_list[:size2] # # path_entity_list = select_nert_index(path_entity_list) features = tf.train.Features( feature={ 'sentences': get_byte_feature(sentences), # raw text 'sentences_mask': get_byte_feature(sentences_mask), # raw text length 'event_tag': get_byte_feature(event_tag), # event labels 'ner_tag': get_byte_feature(ner_tag), # entity labels 'path_tag': get_byte_feature(path_tag), # path labels 'ner_list_index': get_byte_feature(ner_list_index), # 'ner_index': get_byte_feature(ner_index), # 'path_event_type': get_int_feature(path_event_type), 'path_num': get_int_feature([len(paths)]), 'path_entity_list': get_byte_feature(path_entity_list) # 'mask1': tf.train.Feature(int64_list=tf.train.Int64List(value=mask1)), }) example = tf.train.Example(features=features) train_writer.write(example.SerializeToString()) train_writer.close()
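save_as_record serializes every example through get_byte_feature and get_int_feature, which are not included in this excerpt. A minimal sketch of what such helpers might look like, assuming the byte variant packs a whole numpy array into one serialized bytes entry and the int variant stores a flat int64 list (both of these are assumptions about helpers that are not shown):

import numpy as np
import tensorflow as tf

def get_byte_feature(value):
    # Hypothetical helper: store the entire array as a single bytes feature.
    arr = np.asarray(value, dtype=np.int32)
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[arr.tobytes()]))

def get_int_feature(value):
    # Hypothetical helper: store a flat sequence of integers directly.
    arr = np.asarray(value, dtype=np.int64).reshape(-1)
    return tf.train.Feature(int64_list=tf.train.Int64List(value=arr.tolist()))

Whatever the real helpers do, the reading side of the pipeline has to mirror it exactly (for example by parsing the bytes back with np.frombuffer and reshaping), which is why the array dtypes and shapes are fixed before writing.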
def transformer(model: str = 'bert', **kwargs): """ Load Transformer similarity model. Parameters ---------- model : str, optional (default='bert') Model architecture supported. Allowed values: * ``'bert'`` - BERT architecture from google. * ``'tiny-bert'`` - BERT architecture from google with smaller parameters. * ``'albert'`` - ALBERT architecture from google. * ``'tiny-albert'`` - ALBERT architecture from google with smaller parameters. * ``'xlnet'`` - XLNET architecture from google. * ``'alxlnet'`` - XLNET architecture from google + Malaya. Returns ------- result : SIAMESE_BERT class for BERT-family models, SIAMESE_XLNET class for XLNET-family models """ model = model.lower() if model not in _availability: raise Exception( 'model not supported, please check supported models from malaya.similarity.available_transformer_model()' ) check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], **kwargs) g = load_graph(PATH_SIMILARITY[model]['model']) path = PATH_SIMILARITY if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: if model in ['bert', 'tiny-bert']: from malaya.transformers.bert import ( _extract_attention_weights_import, ) from malaya.transformers.bert import bert_num_layers tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab']) if model in ['albert', 'tiny-albert']: from malaya.transformers.albert import ( _extract_attention_weights_import, ) from malaya.transformers.albert import bert_num_layers from albert import tokenization tokenizer = tokenization.FullTokenizer( vocab_file=path[model]['vocab'], do_lower_case=False, spm_model_file=path[model]['tokenizer'], ) return SIAMESE_BERT( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g), tokenizer=tokenizer, label=['not similar', 'similar'], ) if model in ['xlnet', 'alxlnet']: if model in ['xlnet']: from malaya.transformers.xlnet import ( _extract_attention_weights_import, ) if model in ['alxlnet']: from malaya.transformers.alxlnet import ( _extract_attention_weights_import, ) tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer']) return SIAMESE_XLNET( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g), tokenizer=tokenizer, label=['not similar', 'similar'], )
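For context, loading one of these similarity models and scoring a sentence pair might look like the sketch below. The predict_proba call and its argument format are assumptions about the SIAMESE_* wrapper, which is not defined in this excerpt, and the sentences are made up:

# Load the ALBERT-based Bahasa similarity model defined above.
model = transformer(model='albert')

# Assumed wrapper API: score, pairwise, how similar the left strings are to the right strings.
scores = model.predict_proba(
    ['Husein suka makan ayam'],
    ['Husein gemar makan ayam'],
)
print(scores)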
def main(_): tf.logging.set_verbosity(tf.logging.INFO) layer_indexes = [int(x) for x in FLAGS.layers.split(",")] bert_config = modeling.AlbertConfig.from_json_file(FLAGS.bert_config_file) tokenizer = tokenization.FullTokenizer( spm_model_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( master=FLAGS.master, tpu_config=tf.contrib.tpu.TPUConfig( num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) # examples = read_examples(FLAGS.input_file) json_examples = [] for x in ['test', 'train', 'dev']: with open(os.path.join(FLAGS.input_file, x + '.english.jsonlines')) as f: json_examples.extend((json.loads(jsonline) for jsonline in f.readlines())) orig_examples = [] bert_examples = [] for i, json_e in enumerate(json_examples): e = process_example(json_e, i, should_filter_embedded_mentions=True) orig_examples.append(e) bert_examples.append(e.bertify(tokenizer)) model_fn = model_fn_builder( bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, layer_indexes=layer_indexes, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, predict_batch_size=FLAGS.batch_size) input_fn = input_fn_builder( examples=bert_examples, window_size=FLAGS.window_size, stride=FLAGS.stride, tokenizer=tokenizer) writer = h5py.File(FLAGS.output_file, 'w') with tqdm(total=sum(len(e.tokens) for e in orig_examples)) as t: for result in estimator.predict(input_fn, yield_single_examples=True): document_index = int(result["unique_ids"]) bert_example = bert_examples[document_index] orig_example = orig_examples[document_index] file_key = bert_example.doc_key.replace('/', ':') t.update(n=(result['extract_indices'] >= 0).sum()) for output_index, bert_token_index in enumerate(result['extract_indices']): if bert_token_index < 0: continue token_index = bert_example.bert_to_orig_map[bert_token_index] sentence_index, token_index = orig_example.unravel_token_index(token_index) dataset_key ="{}/{}".format(file_key, sentence_index) if dataset_key not in writer: writer.create_dataset(dataset_key, (len(orig_example.sentence_tokens[sentence_index]), bert_config.hidden_size, len(layer_indexes)), dtype=np.float32) dset = writer[dataset_key] for j, layer_index in enumerate(layer_indexes): layer_output = result["layer_output_%d" % j] dset[token_index, :, j] = layer_output[output_index] writer.close()
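The prediction loop above writes one HDF5 dataset per sentence, keyed by "<doc_key with '/' replaced by ':'>/<sentence index>" and shaped [num_tokens, hidden_size, num_layers]. Reading the extracted features back could look like the following sketch; the file name and the key are illustrative only, since the real doc_key comes from the input jsonlines:

import h5py

with h5py.File('bert_features.hdf5', 'r') as reader:
    key = 'bn:some_doc_0/0'  # illustrative: "<doc_key with / replaced by :>/<sentence index>"
    sentence_features = reader[key][...]  # ndarray of shape [num_tokens, hidden_size, num_layers]
    # Embedding of the first token taken from the last extracted layer.
    first_token_last_layer = sentence_features[0, :, -1]
    print(sentence_features.shape, first_token_last_layer.shape)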
def transformer( path, s3_path, class_name, label, model='bert', quantized=False, **kwargs, ): check_file(path[model], s3_path[model], quantized=quantized, **kwargs) if quantized: model_path = 'quantized' else: model_path = 'model' g = load_graph(path[model][model_path], **kwargs) if len(label) > 2 or class_name == 'relevancy': if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: selected_class = MULTICLASS_BERT selected_node = 'import/dense/BiasAdd:0' if model in ['xlnet', 'alxlnet']: selected_class = MULTICLASS_XLNET selected_node = 'import/transpose_3:0' else: if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: selected_class = BINARY_BERT selected_node = 'import/dense/BiasAdd:0' if model in ['xlnet', 'alxlnet']: selected_class = BINARY_XLNET selected_node = 'import/transpose_3:0' if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: if model in ['bert', 'tiny-bert']: from malaya.transformers.bert import ( _extract_attention_weights_import, ) from malaya.transformers.bert import bert_num_layers tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab']) if model in ['albert', 'tiny-albert']: from malaya.transformers.albert import ( _extract_attention_weights_import, ) from malaya.transformers.albert import bert_num_layers from albert import tokenization tokenizer = tokenization.FullTokenizer( vocab_file=path[model]['vocab'], do_lower_case=False, spm_model_file=path[model]['tokenizer'], ) return selected_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=None, input_masks=g.get_tensor_by_name('import/Placeholder_1:0'), logits=g.get_tensor_by_name('import/logits:0'), logits_seq=g.get_tensor_by_name('import/logits_seq:0'), vectorizer=g.get_tensor_by_name(selected_node), sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, label=label, attns=_extract_attention_weights_import(bert_num_layers[model], g), class_name=class_name, ) if model in ['xlnet', 'alxlnet']: if model in ['xlnet']: from malaya.transformers.xlnet import ( _extract_attention_weights_import, ) if model in ['alxlnet']: from malaya.transformers.alxlnet import ( _extract_attention_weights_import, ) tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer']) return selected_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), logits_seq=g.get_tensor_by_name('import/logits_seq:0'), vectorizer=g.get_tensor_by_name(selected_node), sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, label=label, attns=_extract_attention_weights_import(g), class_name=class_name, )
def trans_to_features(self, example): """ Convert the input into index representations. :param example: the input example :return: """ tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True) features = [] unique_id = 1000000000 query_tokens = tokenizer.tokenize(example['question']) # cap the query at a maximum length if len(query_tokens) > self.query_length: query_tokens = query_tokens[: self.query_length] # Build indices mainly for the context. Earlier, Chinese characters, punctuation, whitespace, digit runs and English words were split and stored in doc_tokens, # but BERT's tokenizer further splits digit runs, Chinese and English text into subwords, so the tokens obtained after BERT tokenization differ from the # doc_tokens obtained before; we therefore still need to map the start and end positions from their positions in doc_tokens to positions in the current tokens tok_to_orig_index = [] # stores, for each subword, the index of the original token; its length equals that of all_doc_tokens below orig_to_tok_index = [] # stores, for each original token, its indices among the subwords; the indices are not contiguous and may jump all_doc_tokens = [] # stores the subword tokens; in principle longer than all_tokens for (i, token) in enumerate(example['doc_tokens']): sub_tokens = tokenizer.tokenize(token) # orig_to_tok_index has the same length as doc_tokens; each entry stores the start and end index of that doc_tokens token inside all_doc_tokens, # used to map start and end positions from all_token into all_doc_tokens orig_to_tok_index.append([len(all_doc_tokens)]) for sub_token in sub_tokens: # tok_to_orig_index has the same length as all_doc_tokens and contains repeated values tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) orig_to_tok_index[-1].append(len(all_doc_tokens) - 1) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = self.max_length - len(query_tokens) - 3 doc_spans = [] _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) # With BERT the maximum sequence length is usually capped at 512, so a context longer than the maximum has to be split into several spans # using a sliding window; the window stride is smaller than the maximum length, so neighbouring spans share overlapping sub-spans. start_offset = 0 # start position of the span being extracted while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset # when the remaining length exceeds the limit, the sliding window is needed if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): # holds once length < max_len, i.e. this is the last span break start_offset += min(length, self.doc_stride) # combine the query and each context span into a single sequence fed into BERT for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} # Because spans share overlapping sub-spans, the same token can carry different importance in different spans; # importance is decided here by the amount of surrounding context, and at prediction time, for a token that appears in two spans, # only the score from the span where it is more important is taken as that token's score token_is_max_context = {} segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in query_tokens: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[ split_token_index] # map the index within the sentence pair built from the current span back to the original token index # With the sliding window a word may appear in two spans, but only one span can be counted in the end, so # a score based on the number of context words around the token is built and the span with the highest score is taken is_max_context = self._check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. 
while len(input_ids) < self.max_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == self.max_length assert len(input_mask) == self.max_length assert len(segment_ids) == self.max_length features.append({'unique_id': unique_id, 'doc_span_index': doc_span_index, 'tokens': tokens, 'token_to_orig_map': token_to_orig_map, 'token_is_max_context': token_is_max_context, 'input_ids': input_ids, 'input_mask': input_mask, 'segment_ids': segment_ids, 'start_position': -1, 'end_position': -1}) unique_id += 1 return features
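The sliding-window behaviour described in the comments of trans_to_features, spans of at most max_tokens_for_doc subwords advanced by doc_stride and overlapping until the end of the document, can be illustrated in isolation. This is a self-contained sketch with made-up numbers, not part of the class above:

import collections

_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    """Split a document of num_doc_tokens subwords into overlapping spans."""
    doc_spans = []
    start_offset = 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break  # the last span reaches the end of the document
        start_offset += min(length, doc_stride)
    return doc_spans

# Example: a 1,000-token context, at most 384 tokens per span, stride 128.
print(make_doc_spans(1000, 384, 128))
# [DocSpan(start=0, length=384), DocSpan(start=128, length=384), ...]

Tokens that fall into several overlapping spans are later disambiguated by the _check_is_max_context scoring referred to in the comments, so each token's prediction is taken from the span in which it has the most surrounding context.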