def _convert_single_example(self, example, seq_length, tokenizer):
    """Convert one example into a fixed-length ``InputFeatures`` record.

    The text is tokenized, clipped to ``seq_length - 2`` tokens, wrapped in
    ``[CLS]``/``[SEP]`` markers, mapped to ids and right-padded with zeros.
    A label that is not in ``self.label_map`` falls back to the ``'NA'`` entry.
    """
    text_tokens = tokenizer.tokenize(example.text_a)

    # Todo: optimize here if you want char and word concat input
    if self.params['chinese_seg'] == 'mixed':
        # The word- and char-level tokenizations are computed but not used yet.
        word_tokenizer = tokenization.BasicTokenizer(chinese_seg='word', params=self.params)
        char_tokenizer = tokenization.BasicTokenizer(chinese_seg='char', params=self.params)
        tokens_a_word = word_tokenizer.tokenize(example.text_a)
        tokens_a_char = char_tokenizer.tokenize(example.text_a)

    # Leave room for the two marker tokens.
    budget = seq_length - 2
    if len(text_tokens) > budget:
        text_tokens = text_tokens[0:budget]

    tokens = ["[CLS]"]
    tokens.extend(text_tokens)
    tokens.append("[SEP]")

    input_ids = tokenizer.convert_tokens_to_ids(tokens=tokens)
    missing = seq_length - len(input_ids)
    if missing > 0:
        input_ids.extend([0] * missing)
    assert len(input_ids) == seq_length

    label_key = example.label if example.label in self.label_map else 'NA'
    label_id = self.label_map[label_key]

    return InputFeatures(input_ids=input_ids, label_ids=label_id)
def main():
    """Perturb decomposed HOTPOT predictions according to ``--perturb``.

    Reads ``data/hotpot-all/<split>.json`` and
    ``<out_name>/<split>_predictions.json``, then hands both to
    ``remove_queries`` (``--perturb remove``) or ``invert``
    (``--perturb invert``) together with the output path
    ``<out_name>/<split>_<perturb>_perturbed_predictions.json``.
    ``--perturb no`` returns immediately; any other value loads the inputs
    and does nothing further.

    ``--data_type`` must have the form ``<dev|train>_<b|i>``. A malformed
    value now ends with a normal argparse usage error instead of an
    unpacking ``ValueError`` or an ``AssertionError``.
    """
    BERT_DIR = "/home1/s/shahkr/Penn/krunal/Courses/DecompRC/DecompRC/model/uncased_L-12_H-768_A-12/"
    parser = argparse.ArgumentParser("Postprocess decomposed HOTPOT questions")
    parser.add_argument("--vocab_file", default=BERT_DIR + "vocab.txt", type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    # NOTE(review): default=True with store_true means this flag can never be
    # switched off from the command line; left unchanged to keep the CLI stable.
    parser.add_argument("--do_lower_case", default=True, action='store_true')
    parser.add_argument("--perturb", type=str, default="remove")
    parser.add_argument("--data_type", type=str, default="dev")
    parser.add_argument("--out_name", default="out/onehop")
    args = parser.parse_args()

    if args.perturb == "no":
        return

    out_name = args.out_name

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(os.path.join('data', 'decomposed-predictions'), exist_ok=True)

    parts = args.data_type.split('_')
    if len(parts) != 2 or parts[0] not in ('dev', 'train') or parts[1] not in ('b', 'i'):
        parser.error(
            "--data_type must look like '<dev|train>_<b|i>', got %r" % args.data_type)
    data_type, reasoning_type = parts

    with open(os.path.join('data', 'hotpot-all', '{}.json'.format(data_type)), 'r') as f:
        orig_data = json.load(f)['data']

    with open(os.path.join(out_name, '{}_predictions.json'.format(data_type)), 'r') as f:
        result = json.load(f)

    output_path = os.path.join(
        out_name, '{}_{}_perturbed_predictions.json'.format(data_type, args.perturb))

    os.makedirs(os.path.join('data', 'decomposed'), exist_ok=True)

    if args.perturb == "remove":
        tokenizer = tokenization.BasicTokenizer(
            do_lower_case=args.do_lower_case, split_punct=False, ignore_ans=True)
        remove_queries(orig_data, result, output_path, tokenizer)
    elif args.perturb == "invert":
        tokenizer = tokenization.BasicTokenizer(
            do_lower_case=args.do_lower_case, split_punct=True, ignore_ans=True)
        invert(orig_data, result, output_path, tokenizer)
def test_basic_tokenizer_lower(self):
    """Lower-casing mode folds case, splits punctuation and strips accents."""
    lowercasing = tokenization.BasicTokenizer(do_lower_case=True)

    sentence_tokens = lowercasing.tokenize(u" \tHeLLo!how \n Are yoU? ")
    self.assertAllEqual(
        sentence_tokens, ["hello", "!", "how", "are", "you", "?"])

    accented_tokens = lowercasing.tokenize(u"H\u00E9llo")
    self.assertAllEqual(accented_tokens, ["hello"])
def transform(sentences, start_index, end_index, make_fake=False):
    """Tokenize ``sentences[start_index:end_index]`` into space-joined lines.

    Each output entry ends with a newline; a blank input line becomes a bare
    ``"\\n"``. With ``make_fake`` every character of every token is shifted
    up by ``UNICODE_OFFSET`` code points, after asserting that all characters
    are below that offset.
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    output = []

    # Only the call that starts at index 0 shows a progress bar.
    if start_index == 0:
        from tqdm import tqdm as progress
    else:
        def progress(iterable):
            return iterable

    for line_index in progress(range(start_index, end_index)):
        line = sentences[line_index].strip()
        if not line:
            output.append("\n")
            continue

        words = tokenizer.tokenize(line)
        if make_fake:
            assert all(
                ord(c) < UNICODE_OFFSET for word in words for c in word
            ), "Character unicode >= UNICODE_OFFSET"
            words = [
                "".join(chr(ord(c) + UNICODE_OFFSET) for c in word)
                for word in words
            ]
        output.append(" ".join(words) + "\n")

    return output
def test_basic_tokenizer_no_lower(self):
    """Case must be preserved when lower-casing is disabled."""
    case_preserving = tokenization.BasicTokenizer(do_lower_case=False)
    produced = case_preserving.tokenize(" \tHeLLo!how \n Are yoU? ")
    expected = ["HeLLo", "!", "how", "Are", "yoU", "?"]
    self.assertAllEqual(produced, expected)
def convert_pred2bauerinput(input_file, output_file, for_training, predictions=None, index_num_answers=None):
    """Rewrite a JSON-lines file of examples into the "bauer" record layout.

    Each input example yields one record with keys ``commonsense``,
    ``summary``, ``ques``, ``answer1``, ``answer2`` and ``doc_num``.
    The question and the first two ``final_answers`` are tokenized.

    When ``for_training`` is true the summary is ``final_answers[2:]``, or
    ``example['context'][0]`` if that slice is empty. The number of such
    fallbacks is printed at the end. Otherwise the summary is
    ``predictions[qid][index_num_answers]``.
    """
    tokenizer = tokenization.BasicTokenizer()
    empty_summaries = 0

    with jsonlines.open(output_file, mode="w") as writer, \
            jsonlines.open(input_file, mode="r") as reader:
        for example in reader:
            qid = example['id']
            question = tokenizer.tokenize(example['question'])

            raw_first, raw_second = example['final_answers'][:2]
            answer1 = tokenizer.tokenize(raw_first)
            answer2 = tokenizer.tokenize(raw_second)

            if not for_training:
                context = predictions[qid][index_num_answers]
            else:
                context = example['final_answers'][2:]
                if len(context) == 0:
                    empty_summaries += 1
                    context = example['context'][0]

            record = {
                "commonsense": [],
                "summary": context,
                "ques": question,
                "answer1": answer1,
                "answer2": answer2,
                "doc_num": qid,
            }
            writer.write(record)

    print(empty_summaries)
def test_chinese(self):
    """CJK characters are emitted as single-character tokens."""
    default_tokenizer = tokenization.BasicTokenizer()
    produced = default_tokenizer.tokenize("ah\u535A\u63A8zz")
    expected = ["ah", "\u535A", "\u63A8", "zz"]
    self.assertAllEqual(produced, expected)
def test_basic_tokenizer_no_lower(self):
    """Case and diacritics must survive when lower-casing is disabled."""
    case_preserving = tokenization.BasicTokenizer(do_lower_case=False)

    cases = (
        (u" \tHeLLo!how \n Are yoU? ",
         ["HeLLo", "!", "how", "Are", "yoU", "?"]),
        (u" \tSveIks!kā \n Tev ieT? ",
         ["SveIks", "!", "kā", "Tev", "ieT", "?"]),
    )
    for raw_text, expected in cases:
        self.assertAllEqual(case_preserving.tokenize(raw_text), expected)
def main(argv):
    """Initialise the module-level ``tokenizer``, ``estimator`` and ``basic_tokenizer``.

    The BERT config is loaded from ``FLAGS.bert_config_file`` and validated
    first. The estimator is built with no training or warm-up steps and a
    session config that enables GPU memory growth.
    """
    global tokenizer, estimator, basic_tokenizer

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        use_moran=True)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        num_train_steps=None,
        num_warmup_steps=None)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    run_config = tf.estimator.RunConfig(session_config=session_config)
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    basic_tokenizer = tokenization.BasicTokenizer(use_moran=False)
def convert(dataset, input_file, output_file, bauer, n=0):
    """Write one predicted answer per matching "bauer" example to ``output_file``.

    For every example in the JSON-lines file ``bauer`` whose ``doc_num`` is a
    key of ``dataset``, the example question is compared with each of that
    document's queries. Both sides are tokenized and concatenated without
    spaces; a Levenshtein distance below 5 counts as a match. For the first
    match, element ``n - 1`` of ``pred[query_id]`` is written. When no query
    matches, ``"NO PREDICTION"`` is written.

    Args:
        dataset: mapping ``doc_num -> {'queries': {query_id: {'query': str}}}``.
        input_file: path to a JSON file mapping query ids to answer lists.
        output_file: path of the text file to write, one answer per line.
        bauer: path to the JSON-lines file with ``doc_num`` and ``ques`` keys.
        n: 1-based rank of the answer to emit; with the default ``0`` the
            index ``n - 1`` selects the last answer.
    """
    import Levenshtein

    tokenizer = tokenization.BasicTokenizer()
    with open(input_file, "r") as pred_file:
        pred = json.load(pred_file)

    # The fallback must hold at least one element: with n == 0 the original
    # ["NO PREDICTION"] * n was empty and [n - 1] raised IndexError.
    fallback = ["NO PREDICTION"] * max(n, 1)

    with open(output_file, "w") as writer, jsonlines.open(bauer, "r") as bauer_file:
        for example in bauer_file:
            if example['doc_num'] not in dataset:
                continue

            # The question is the same for every query, so normalise it once.
            question_key = "".join(tokenizer.tokenize(" ".join(example['ques'])))
            writen = False
            for query_key, query_value in dataset[example['doc_num']]['queries'].items():
                levenshtein = Levenshtein.distance(
                    question_key,
                    "".join(tokenizer.tokenize(query_value['query'])))
                # TODO rechange
                if levenshtein < 5:
                    generated_answer = pred.get(query_key, fallback)[n - 1]
                    writer.write(generated_answer + "\n")
                    writen = True
                    break
            if not writen:
                print("f**k")
                writer.write("NO PREDICTION\n")
def customize_tokenizer(text, do_lower_case=False):
    """Split ``text`` into a list of tokens on whitespace.

    Every Chinese character, punctuation mark, whitespace character and
    control character is first surrounded by spaces, so each becomes its own
    token (or disappears, for whitespace). The padded text is lower-cased
    when ``do_lower_case`` is true.

    Returns:
        list of str: the whitespace-separated pieces.
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    text = tokenization.convert_to_unicode(text)

    # Collect pieces and join once; repeated ``+=`` on a string is quadratic.
    pieces = []
    for c in text:
        if (tokenizer._is_chinese_char(ord(c))
                or tokenization._is_punctuation(c)
                or tokenization._is_whitespace(c)
                or tokenization._is_control(c)):
            pieces.append(" " + c + " ")
        else:
            pieces.append(c)
    temp_x = "".join(pieces)

    if do_lower_case:
        temp_x = temp_x.lower()

    # So what we get back here is a list.
    return temp_x.split()
def train(prefix=PREFIX, vocab_size=VOCABSIZE, ctl_symbols=CTLSYMBOLS, tokenized=args.tokenized):
    """Train a SentencePiece unigram model, pre-tokenizing the corpus if needed.

    When ``tokenized`` is false, every file named by ``_get_text_file()``
    (a comma-separated list) is run line by line through a lower-casing
    ``BasicTokenizer`` and written to ``<file>.tokenized``. Those files become
    the trainer input. Otherwise the input list comes from
    ``_get_tokenized_file()``.

    Args:
        prefix: value for ``--model_prefix``.
        vocab_size: value for ``--vocab_size``.
        ctl_symbols: value for ``--control_symbols``.
        tokenized: true when the corpus files are already tokenized.
    """
    if not tokenized:
        files = _get_text_file()
        print("files: {}".format(files))

        # pre-tokenization
        tokenizer = tokenization.BasicTokenizer(do_lower_case=True)  # False?
        tokenized_names = []
        for fs in files.split(","):
            filename = fs + ".tokenized"
            with open(filename, 'w', encoding='utf-8') as fw:
                print("fs: {}".format(fs))
                # Read with the same encoding the output is written in.
                with open(fs, 'r', encoding='utf-8') as f:
                    for line in f:
                        tokens = tokenizer.tokenize(line)
                        fw.write(" ".join(str(tok) for tok in tokens))
                        fw.write('\n')
            tokenized_names.append(filename)
        # Comma-separated list; join() avoids the leading comma (an empty
        # first entry) that ``+= "," + filename`` produced.
        files_tokenized = ",".join(tokenized_names)
    else:
        # files are tokenized
        files_tokenized = _get_tokenized_file()

    # References:
    # https://github.com/allenai/scibert/blob/5d72d0ec50e2d3ebe971122f8b282278c210eccd/scripts/cheatsheet.txt
    # https://github.com/google/sentencepiece/blob/d4dd947fe71c4fa4ee24ad8297beee32887d8828/python/sentencepiece_python_module_example.ipynb
    # (see the sentencepiece "Normalization" section)
    # Parameters:
    # https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
    #
    # Alternatives recorded by the original author:
    # - the same command with --model_type=bpe;
    # - --model_type=word --hard_vocab_limit=false, and --hard_vocab_limit=false
    #   alone, both noted as failing with "RuntimeError: Internal:
    #   /sentencepiece/src/trainer_interface.cc(498)
    #   [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())]".

    # UNIGRAM
    command = f'--input={files_tokenized} --model_prefix={prefix} --vocab_size={vocab_size} --control_symbols={ctl_symbols} --character_coverage=1.0 --model_type=unigram --input_sentence_size=100000000 --shuffle_input_sentence=true --normalization_rule_name=identity'
    sp.SentencePieceTrainer.Train(command)
def __init__(self, params, seq_length, chinese_seg, generate_label_map=False):
    """Store the configuration and prepare the tokenizer and label map.

    With ``generate_label_map`` set, the instance starts with an empty
    ``label_map`` and a ``labels`` set holding only ``'NA'``. Otherwise the
    label map is the second value returned by ``self.load_label_dict()``.
    """
    self.seq_length = seq_length
    # pass parameters by reference in python
    self.params = params
    self.tokenizer = tokenization.BasicTokenizer(
        chinese_seg=chinese_seg, params=params)
    self.generate_label_map = generate_label_map

    if not self.generate_label_map:
        _, self.label_map = self.load_label_dict()
    else:
        self.labels = {'NA'}
        self.label_map = {}
def test_chinese(self):
    """Print the tokens of a long Chinese passage, then check CJK splitting."""
    default_tokenizer = tokenization.BasicTokenizer()

    clinical_note = (
        u'患者于4年前出现活动性心悸、胸痛,多在重体力活动时发作,胸痛位于剑突下和心前区,手掌大小,'
        u'呈闷压样疼痛不适,每次持续10分钟左右,休息数分钟可缓解,发作时伴明显心悸、呼吸困难,无咳嗽、'
        u'咳痰,无恶心、呕吐,无出汗,头晕、头痛。曾于2011年来我院就诊,诊断为“冠心病 不稳定心绞痛 '
        u'房颤 心功能3级”,后正规服用药物,症状仍间断发作。3月来上述症状明显加重,表现为明显不能耐受体力活动,'
        u'稍活动即有明显的胸痛发作,长舒气后症状有所缓解,伴四肢乏力,以双下肢为甚,'
        u'伴夜间阵发性呼吸困难及端坐呼吸,上述症状间断出现,进行性加重,后出现双下肢水肿,晨轻暮重,'
        u'今为进一步明确诊治,特来我院,门诊以“冠心病 心律失常 心功能不全”收入我科'
    )
    # The passage is only printed for inspection; nothing is asserted on it.
    for piece in default_tokenizer.tokenize(clinical_note):
        print(piece)

    produced = default_tokenizer.tokenize(u"ah\u535A\u63A8zz")
    expected = [u"ah", u"\u535A", u"\u63A8", u"zz"]
    self.assertAllEqual(produced, expected)
def create_predict_examples(input_file):
    """Build prediction ``InputExample`` objects from an Excel sheet.

    The ``Content`` column is read, single and double quotes are removed from
    every entry, and each entry is tokenized without lower-casing. Examples
    get consecutive ids starting at 10000 and an all-``'O'`` label list of
    the same length as the token list.
    """
    frame = pd.read_excel(input_file)
    basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

    contents = [
        text.replace('"', '').replace("'", '')
        for text in list(frame['Content'])
    ]

    examples = []
    for example_id, req in enumerate(contents, start=10000):
        tok_list = basic_tokenizer.tokenize(req)
        examples.append(InputExample(example_id, tok_list, ['O'] * len(tok_list)))
    return examples
def build_char_vocabs(data_dir, char_embedding_table, char_dict, PairIndexDict):
    """Build a token vocabulary with max-pooled character embeddings.

    Every sentence returned by ``get_all_data`` is tokenized with a
    lower-casing ``BasicTokenizer``. Each previously unseen token gets the
    next free id, and its embedding is the element-wise maximum over the
    embeddings of its characters. Characters missing from ``char_dict`` use
    the last row of ``char_embedding_table``.

    Args:
        data_dir: forwarded to ``get_all_data``.
        char_embedding_table: 2-D array; its second dimension is the
            embedding width.
        char_dict: mapping from character to row index in the table.
        PairIndexDict: forwarded to ``get_all_data``.

    Returns:
        tuple: ``(chars2id, id2chars, token_char_embedding_table)``. Id 0 is
        ``"[PAD]"`` with an all-zero embedding row.
    """
    all_sentences, _max_len_p, _max_len_h = get_all_data(data_dir, PairIndexDict)
    print(len(all_sentences))
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    _, char_embedding_size = char_embedding_table.shape

    chars2id = {"[PAD]": 0}
    id2chars = {0: "[PAD]"}
    # The PAD row must be as wide as the embeddings; the previous hard-coded
    # width of 30 broke the final table for any other embedding size.
    tokenC_embeddings = [np.zeros(char_embedding_size)]

    unknown_char_row = char_embedding_table[-1, :]
    for sen in all_sentences:
        for token in tokenizer.tokenize(sen):
            if token in chars2id:
                continue
            token_id = len(chars2id)
            chars2id[token] = token_id
            id2chars[token_id] = token

            char_rows = [
                char_embedding_table[char_dict[char], :] if char in char_dict
                else unknown_char_row
                for char in token
            ]
            word_chars_embedding = np.asarray(char_rows, dtype=np.float64)
            tokenC_embeddings.append(np.amax(word_chars_embedding, axis=0))

    token_char_embedding_table = np.array(tokenC_embeddings, dtype=np.float64)
    return (chars2id, id2chars, token_char_embedding_table)
def convert_pred2bauerpred(dataset, input_file, output_file, bauer, pred_or_wl=True, n=1, levenshtein_threshold=5):
    """Write predictions and ground-truth answers aligned with a "bauer" file.

    For every example in the JSON-lines file ``bauer`` whose ``doc_num`` is a
    key of ``dataset``, the example question is compared with each of that
    document's queries. Both sides are tokenized and concatenated without
    spaces; the first query with a Levenshtein distance below
    ``levenshtein_threshold`` is the match. For a match, the query's
    ``answer1`` and ``answer2`` go to ``<output_file>_gt1.txt`` and
    ``<output_file>_gt2.txt`` and the generated answer goes to
    ``output_file``. Without a match only ``"NO PREDICTION"`` is written to
    ``output_file``.

    Args:
        dataset: mapping ``doc_num -> {'queries': {query_id: {...}}}``.
        input_file: JSON file mapping query ids to answers.
        output_file: path of the prediction file; also the prefix of the two
            ground-truth files.
        bauer: path to the JSON-lines file with ``doc_num`` and ``ques`` keys.
        pred_or_wl: true to emit element ``n - 1`` of the stored predictions;
            false to take the last stored entry and emit its element ``n``
            (or ``"unknown"`` if it is too short).
        n: answer index, interpreted as described for ``pred_or_wl``.
        levenshtein_threshold: exclusive upper bound on the edit distance.
    """
    tokenizer = tokenization.BasicTokenizer()

    with open(input_file, "r") as pred_file:
        pred = json.load(pred_file)

    # All four handles live in one ``with`` so they are closed even when an
    # exception interrupts the conversion.
    with open(output_file + "_gt1.txt", "w") as file_gt_a1, \
            open(output_file + "_gt2.txt", "w") as file_gt_a2, \
            open(output_file, "w") as writer, \
            jsonlines.open(bauer, "r") as bauer_file:
        for example in bauer_file:
            if example['doc_num'] not in dataset:
                continue

            # The question is the same for every query, so normalise it once.
            question_key = "".join(tokenizer.tokenize(" ".join(example['ques'])))
            writen = False
            for query_key, query_value in dataset[example['doc_num']]['queries'].items():
                levenshtein = Levenshtein.distance(
                    question_key,
                    "".join(tokenizer.tokenize(query_value['query'])))
                if levenshtein >= levenshtein_threshold:
                    continue

                query_id = query_key
                file_gt_a1.write(query_value['answer1'] + "\n")
                file_gt_a2.write(query_value['answer2'] + "\n")
                if pred_or_wl:
                    # is true when want predictions
                    generated_answer = pred.get(
                        query_id, ["NO PREDICTION"] * (n))[n - 1]
                else:
                    # is false when want the first weak label weak labels
                    generated_answer = pred.get(
                        query_id, [["NO PREDICTION"] * 3])[-1]
                    # n depends on if gt in
                    if len(generated_answer) > n:
                        generated_answer = generated_answer[n]
                    else:
                        generated_answer = "unknown"
                writer.write(generated_answer + "\n")
                writen = True
                break
            if not writen:
                writer.write("NO PREDICTION\n")
def get_final_text(pred_text, orig_text, do_lower_case, logger, verbose_logging):
    """Project the tokenized prediction back to the original text."""

    def _strip_spaces(text):
        # Returns the text without spaces plus a map from each kept
        # character's index to its index in the original text.
        ns_to_s_map = collections.OrderedDict()
        kept = []
        for position, char in enumerate(text):
            if char != " ":
                ns_to_s_map[len(kept)] = position
                kept.append(char)
        return ("".join(kept), ns_to_s_map)

    # Tokenize `orig_text` and locate `pred_text` inside the result. Then
    # compare the space-free forms of `orig_text` and the tokenized text: if
    # their lengths differ the heuristic has failed, otherwise the characters
    # are assumed to be aligned one-to-one.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    orig_ns_text, orig_ns_to_s_map = _strip_spaces(orig_text)
    tok_ns_text, tok_ns_to_s_map = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # Invert the tokenized-text map so positions in `tok_text` can be carried
    # over to `orig_text` through the shared space-free index.
    tok_s_to_ns_map = {
        tok_index: ns_index
        for ns_index, tok_index in six.iteritems(tok_ns_to_s_map)
    }

    def _to_orig_position(tok_position):
        ns_position = tok_s_to_ns_map.get(tok_position)
        if ns_position is None:
            return None
        return orig_ns_to_s_map.get(ns_position)

    orig_start_position = _to_orig_position(start_position)
    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = _to_orig_position(end_position)
    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    return orig_text[orig_start_position:(orig_end_position + 1)]
parser.add_argument("--edit_ids", type=str, default=None, help="path to edit ids to be applied on input_tokens") parser.add_argument("--output_tokens", type=str, default=None, help="path to edited (hopefully corrected) file") parser.add_argument("--infer_mode", type=str, default="conll", help="post processing mode bea or conll") parser.add_argument("--path_common_inserts",type=str,default=None,help="path of common unigram inserts") parser.add_argument("--path_common_multitoken_inserts",type=str,default=None,help="path of common bigram inserts") parser.add_argument("--path_common_deletes",type=str,default=None,help="path to common deletions observed in train data") parser = argparse.ArgumentParser() add_arguments(parser) FLAGS, unparsed = parser.parse_known_args() DO_PARALLEL = False INFER_MODE=FLAGS.infer_mode vocab = tokenization.load_vocab(FLAGS.vocab_path) basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=False,vocab=vocab) vocab_words = set(x for x in vocab) common_deletes = pickle.load(open(FLAGS.path_common_deletes,"rb")) path_common_inserts = FLAGS.path_common_inserts path_common_multitoken_inserts = FLAGS.path_common_multitoken_inserts opcodes = opcodes.Opcodes(path_common_inserts, path_common_multitoken_inserts) if __name__ == '__main__': class config: INPUT_UNCORRECTED_WORDS = FLAGS.input_tokens INPUT_EDITS = FLAGS.edit_ids OUTPUT_CORRECTED_WORDS = FLAGS.output_tokens def fix_apos_break(word, p_word, pp_word):
def main(argv):
    """Run the reading-comprehension demo over the examples in ``FLAGS.example_file``.

    Builds the tokenizers and a ``tf.estimator.Estimator``, loads and shuffles
    the examples with a fixed seed, then prints the answer, score and
    probability returned by ``write_predictions`` for each prediction batch.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source -- confirm it against the original file.
    NOTE(review): ``eval_features`` and ``eval_examples`` are never assigned
    in this function; presumably they are module-level lists filled by the
    input pipeline -- verify.
    """
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, use_moran=True)
    # Inference only: no training or warm-up steps are configured.
    num_train_steps = None
    num_warmup_steps = None
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=tf.estimator.RunConfig(session_config=config))
    basic_tokenizer = tokenization.BasicTokenizer(use_moran=False)
    examples = read_korquad_examples(input_file=FLAGS.example_file)
    # Fixed seed keeps the example order reproducible between runs.
    rnd = random.Random(12345)
    rnd.shuffle(examples)
    print(f"\n=========== 기계독해 예문 {len(examples)}건 Loadind Done ... by TBai ==============\n")
    all_results = list()
    output_results = dict()
    for result in estimator.predict(
            input_fn_builder(
                input_data=examples,
                tokenizer=tokenizer,
                seq_length=FLAGS.max_seq_length,
                drop_remainder=False),
            yield_single_examples=False) :
        print('########################### next1')
        #print('########################### result =', result)
        print('########################### len(eval_features[0]) =', len(eval_features[0]))
        # Unpack the batched result into one RawResult per feature.
        for idx in range(len(eval_features[0])) :
            unique_id = int(result["unique_ids"][idx])
            start_logits = [float(x) for x in result["start_logits"][idx].flat]
            end_logits = [float(x) for x in result["end_logits"][idx].flat]
            print('########################################################### ', unique_id);
            print('########################################################### ', start_logits);
            print('########################################################### ', end_logits);
            all_results.append(RawResult(
                unique_id=unique_id,
                start_logits=start_logits,
                end_logits=end_logits))
        if len(eval_examples) != 0 and len(eval_features) != 0 :
            answer, score, prob = write_predictions(eval_examples[0],
                                                    eval_features[0], all_results,
                                                    FLAGS.n_best_size, FLAGS.max_answer_length,
                                                    FLAGS.do_lower_case, FLAGS.analyze_kor_morph, basic_tokenizer)
            print('####### answer =', answer)
            print('####### score =', score)
            print('####### prob =', prob)
            # Keep only the first five characters of each number for display.
            tscore = str(score)[:5]
            tprob = str(prob*100)[:5]
            if answer != 'N/A' :
                print(f"Done ... 답변 신뢰도 : ({tscore}, {tprob}%)")
            else :
                print(f"Done ...")
            res = "독해결과 : " + answer + "\n\n"
            print(f"{res}")
        # Reset the per-batch state before the next prediction batch.
        all_results.clear()
        eval_features.clear()
        eval_examples.clear()
    print("\n\n\t투블럭에이아이에서 제공하여 드렸습니다. https://twoblockai.com/\n\n")
def __init__(self, converted_filename, dataset, ranking_dic=None):
    """Remember the conversion inputs and create a default ``BasicTokenizer``."""
    self.converted_filename, self.dataset, self.ranking_dic = (
        converted_filename, dataset, ranking_dic)
    self.tokenizer = tokenization.BasicTokenizer()
def main(argv):
    """Train and/or run prediction with a TPUEstimator, driven by ``FLAGS``.

    Optional positional arguments override flags: ``argv[1]`` replaces
    ``FLAGS.predict_file`` and ``argv[2]`` names the prediction output file
    inside ``FLAGS.output_dir``.

    With ``FLAGS.do_train`` the training examples are shuffled with a fixed
    seed, converted to ``train.tf_record`` and used to train the estimator.
    With ``FLAGS.do_predict`` the evaluation examples are converted to
    ``eval.tf_record``, predictions are collected as ``RawResult`` items and
    passed to ``write_predictions``.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source -- confirm it against the original file.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if len(argv) > 1:
        FLAGS.predict_file = argv[1]

    validate_flags_or_throw(bert_config)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    moran_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, use_moran=True)
    basic_tokenizer = tokenization.BasicTokenizer(use_moran=False)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = read_squad_examples(input_file=FLAGS.train_file, is_training=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(42)
        rng.shuffle(train_examples)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_record_exists = False
        train_writer = FeatureWriter(
            filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
            is_training=True,
            record_file_exists=train_record_exists)
        convert_examples_to_features(
            examples=train_examples,
            tokenizer=moran_tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=True,
            output_fn=train_writer.process_feature)
        train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num orig examples = %d", len(train_examples))
        tf.logging.info(" Num split examples = %d", train_writer.num_features)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        # The examples are no longer needed once the record file is written.
        del train_examples

        train_input_fn = input_fn_builder(
            input_file=train_writer.filename,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_predict :
        output_prediction_file_name = "predictions.json"
        output_nbest_file_name = "nbest_predictions.json"
        output_null_log_odds_file_name = "null_odds.json"
        if FLAGS.korquad_refine_answer_by_pos:
            output_prediction_file_name = "predictions_pos.json"

    if FLAGS.do_predict:
        eval_examples = read_squad_examples(input_file=FLAGS.predict_file, is_training=False)

        eval_record_exists = os.path.exists(os.path.join(FLAGS.output_dir, "eval.tf_record"))
        # The existence check is deliberately overridden: the eval record is
        # always rewritten.
        eval_record_exists = False
        if eval_record_exists:
            tf.logging.info("eval.tf_record exists. Do not write tf example file.")

        eval_writer = FeatureWriter(
            filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
            is_training=False,
            record_file_exists=eval_record_exists)
        eval_features = []

        def append_feature(feature):
            # Keep features in memory for write_predictions and also persist them.
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=moran_tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=False,
            output_fn=append_feature)
        eval_writer.close()

        tf.logging.info("***** Running predictions *****")
        tf.logging.info(" Num orig examples = %d", len(eval_examples))
        tf.logging.info(" Num split examples = %d", len(eval_features))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        all_results = []

        predict_input_fn = input_fn_builder(
            input_file=eval_writer.filename,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        if FLAGS.do_train:
            # Just trained: let the estimator pick its own latest checkpoint.
            init_checkpoint = None
        else:
            if FLAGS.init_checkpoint is not None and tf.gfile.IsDirectory(FLAGS.init_checkpoint):
                from tensorflow.python.training import checkpoint_management
                init_checkpoint = checkpoint_management.latest_checkpoint(FLAGS.init_checkpoint)
            else:
                init_checkpoint = FLAGS.init_checkpoint

        all_results = []
        for result in estimator.predict(
                predict_input_fn, yield_single_examples=True, checkpoint_path=init_checkpoint):
            unique_id = int(result["unique_ids"])
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                RawResult(
                    unique_id=unique_id,
                    start_logits=start_logits,
                    end_logits=end_logits))

        # argv[2] exists only when at least three entries were passed. The old
        # ``len(argv) > 1`` guard raised IndexError here, after the whole
        # prediction run, when only the predict file was given.
        if len(argv) > 2:
            output_prediction_file = os.path.join(FLAGS.output_dir, argv[2])
        else:
            output_prediction_file = os.path.join(FLAGS.output_dir, output_prediction_file_name)
        output_nbest_file = os.path.join(FLAGS.output_dir, output_nbest_file_name)
        output_null_log_odds_file = os.path.join(FLAGS.output_dir, output_null_log_odds_file_name)

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, basic_tokenizer)
best_span = (i, j) if f1 >= 1.0: return best_span, best_f1 return best_span, best_f1 if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--data_file', '-d', type=str, required=True) parser.add_argument('--output_file', '-o', type=str, required=True) parser.add_argument('--bert', '-b', action='store_true', default=False, help='Use bert tokenization') args = parser.parse_args() if args.bert: tokenizer = bt.BasicTokenizer(do_lower_case=False) _process = lambda x: process_bert(x, tokenizer) else: _process = process with open(args.data_file, 'r') as f: dataset = json.load(f) f1_scores = [] data = [] start_time = time.time() diag_rows = [] print(','.join(['gold_answer', 'gold_rationale', 'span_reconstructed_answer', 'f1'])) for i, datum in enumerate(dataset['data']): if i % 10 == 0: print('processing %d / %d (used_time = %.2fs)...' %
def _read_data(cls, input_file, mode='train'):
    """Read the Baidu competition JSON file and build tagging data.

    Data source: http://lic2019.ccf.org.cn/kg

    The training text is first run through BERT's ``BasicTokenizer`` to drop
    unusual characters. Without that step the later input ids and label ids
    would not line up, because the labels would still cover characters that
    BERT's processing removes::

        tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
        text = tokenizer.tokenize(text)

    Args:
        input_file: path to a UTF-8 JSON file holding a list of records with
            a ``text`` key and, for training, an ``spo_list`` key whose items
            carry the subject at index 0 and the object at index 2.
        mode: ``'train'`` or ``'test'``.

    Returns:
        For ``'train'``: a list of ``(chars, labels)`` string pairs, both
        space-separated, with subjects tagged ``B1/I1/E1`` and objects tagged
        ``B2/I2/E2``. For ``'test'``: a list of ``(list_of_chars, labels)``
        pairs with every label ``'O'``. Any other mode returns ``None``.
    """
    import json

    def _find_spans(text, entity):
        # All non-overlapping (start, end) occurrences of entity in text.
        # None and empty entities are skipped: find(None) raises TypeError and
        # find('') never advances, which made the original loop spin forever.
        spans = []
        if not entity:
            return spans
        cursor = 0
        while cursor < len(text):
            start = text.find(entity, cursor)
            if start == -1:
                break
            end = start + len(entity)
            spans.append((start, end))
            cursor = end
        return spans

    def _tag_spans(label, spans, suffix):
        # Later spans overwrite earlier ones at the same position, exactly as
        # in the original position-by-position loop.
        # NOTE(review): ``start == end`` only holds for an empty span, so the
        # 'S' tag is never produced; a single-character entity gets 'B'.
        # Possibly ``end - start == 1`` was intended -- confirm before changing
        # the label scheme.
        for i in range(len(label)):
            for start, end in spans:
                if start == i:
                    label[i] = ('S' if start == end else 'B') + suffix
                elif end - 1 == i:
                    label[i] = 'E' + suffix
                elif start < i < end - 1:
                    label[i] = 'I' + suffix

    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    with open(input_file, encoding='utf-8') as handle:
        trn_data = json.load(handle)

    if mode == 'train':
        train_data = []
        for line in trn_data:
            text = ''.join(tokenizer.tokenize(line['text'].strip()))
            label = ['O'] * len(text)

            spo_list = line['spo_list']
            subjects = [spo[0] for spo in spo_list]
            objects = [spo[2] for spo in spo_list]

            ent_spans = [span for sub in subjects for span in _find_spans(text, sub)]
            ent_obj = [span for obj in objects for span in _find_spans(text, obj)]

            # Objects are tagged after subjects, so they win on overlap.
            _tag_spans(label, ent_spans, '1')
            _tag_spans(label, ent_obj, '2')

            train_data.append((' '.join(text), ' '.join(label)))
        return train_data

    elif mode == 'test':
        test_data = []
        for line in trn_data:
            text = line['text'].strip()
            label = ['O'] * len(text)
            test_data.append((list(text), label))
        return test_data
class BertMRC :
    """Reading-comprehension predictor wrapping a BERT ``tf.estimator.Estimator``.

    All setup below runs once, in the class body, when the class statement is
    executed; the resulting objects are shared class attributes.

    NOTE(review): the block nesting in ``predict`` was reconstructed from a
    whitespace-collapsed source -- confirm it against the original file.
    """
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, use_moran=True)
    # Inference only: no training or warm-up steps are configured.
    num_train_steps = None
    num_warmup_steps = None
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=tf.estimator.RunConfig(session_config=config))
    basic_tokenizer = tokenization.BasicTokenizer(use_moran=False)
    #examples = read_korquad_examples()
    #rnd = random.Random(12345)
    #rnd.shuffle(examples)
    #print(f"\n=========== 기계독해 예문 {len(examples)}건 Loadind Done ... by TBai ==============\n")

    def predict(self, input_context, input_question):
        """Answer ``input_question`` from ``input_context``.

        Returns a list of dicts with keys ``answer``, ``score`` and ``prob``;
        the score and probability are strings cut to five characters.

        NOTE(review): ``eval_features`` and ``eval_examples`` are not assigned
        here; presumably they are module-level lists filled by the input
        pipeline -- verify.
        """
        print('input_context =', input_context)
        print('input_question =', input_question)
        all_results = list()
        output_results = dict()
        response = []
        response_detail = {}
        for result in self.estimator.predict(
                input_fn_builder(
                    input_context,
                    input_question,
                    tokenizer=self.tokenizer,
                    seq_length=FLAGS.max_seq_length,
                    drop_remainder=False),
                yield_single_examples=False):
            # Unpack the batched result into one RawResult per feature.
            for idx in range(len(eval_features[0])):
                unique_id = int(result["unique_ids"][idx])
                start_logits = [float(x) for x in result["start_logits"][idx].flat]
                end_logits = [float(x) for x in result["end_logits"][idx].flat]
                all_results.append(RawResult(
                    unique_id=unique_id,
                    start_logits=start_logits,
                    end_logits=end_logits))
            if len(eval_examples) != 0 and len(eval_features) != 0:
                answer, score, prob = write_predictions(eval_examples[0], eval_features[0], all_results,
                                                        FLAGS.n_best_size, FLAGS.max_answer_length,
                                                        FLAGS.do_lower_case, FLAGS.analyze_kor_morph,
                                                        self.basic_tokenizer)
                #print('####### answer = %s, score = %f, prob = %f', answer, score, prob)
                # Keep only the first five characters of each number.
                tscore = str(score)[:5]
                tprob = str(prob * 100)[:5]
                if answer != 'N/A':
                    print(f"Done ... 답변 신뢰도 : ({tscore}, {tprob}%)")
                else:
                    print(f"Done ...")
                res = "독해결과 : " + answer + "\n\n"
                print(f"{res}")
                response_detail = {'answer': answer, 'score': tscore, 'prob': tprob}
                response.append(response_detail)
            # Reset the per-batch state before the next prediction batch.
            all_results.clear()
            eval_features.clear()
            eval_examples.clear()
        return response
import os
import sys
import tokenization

# Language code of the non-English side of the Europarl pair, e.g. "de".
LANG = sys.argv[1]
pair = f'{LANG}-en'
src_path = os.path.join("data", "europarl", pair, f"europarl-v7.{pair}.en")
tgt_path = os.path.join("data", "europarl", pair, f"europarl-v7.{pair}.{LANG}")
out_path = os.path.join("data", "europarl", pair, f"fastalign-europarl.{pair}")

tokenizer = tokenization.BasicTokenizer(do_lower_case=False)


def tokenize(x):
    """Return ``x`` as space-joined BasicTokenizer tokens."""
    return " ".join(tokenizer.tokenize(x))


# All three files share one ``with`` so they are closed even if tokenization
# or writing fails part-way through.
with open(src_path, 'r') as src_fp, \
        open(tgt_path, 'r') as tgt_fp, \
        open(out_path, 'w') as out_fp:
    for src_sent, tgt_sent in zip(src_fp, tgt_fp):
        if src_sent and tgt_sent:
            src_tokens = tokenize(src_sent)
            tgt_tokens = tokenize(tgt_sent)
            # Pairs where either side tokenizes to nothing are skipped.
            if src_tokens and tgt_tokens:
                out_fp.write(f'{src_tokens} ||| {tgt_tokens}\n')
# Set-up for fine-tuning a Chinese BERT question-answering model: imports,
# tokenizers, device selection, model loading and training constants.
import tokenization
import torch
import numpy as np
import os
import json
import re
import random
import pickle
import time
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

DATA_PATH = 'data/small_train_data.json'

# Two tokenizers: a basic one with default settings and the pretrained
# 'bert-base-chinese' one, cached under ./cache.
tokenizer = tokenization.BasicTokenizer()
full_tokenizer = tokenization.BertTokenizer.from_pretrained(
    'bert-base-chinese',
    cache_dir='./cache',
)

# Use CUDA when available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('using device:', device)

# NOTE(review): `pytorch_pretrained_bert` is not imported in the visible part
# of this file -- confirm it is brought into scope elsewhere.
model = pytorch_pretrained_bert.modeling.BertForQuestionAnswering.from_pretrained(
    'bert-base-chinese')

# Wrap the model in DataParallel when more than one GPU is visible.
MULTI_GPU = torch.cuda.device_count() > 1
if MULTI_GPU:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = torch.nn.DataParallel(model)

model.to(device)

# Training constants.
EPOCHS, BATCH_SIZE = 5, 6
import codecs import json from tqdm import tqdm import re import tokenization # tokenizer = FullTokenizer(vocab_file='/opt/hanyaopeng/souhu/data/chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True) tokenizer = tokenization.BasicTokenizer(do_lower_case=True) input_file = '/opt/hanyaopeng/souhu/data/data_v2/coreEntityEmotion_test_stage1.txt' with open(input_file, encoding='utf-8') as f: test_data = [] for l in tqdm(f): data = json.loads(l.strip()) news_id = data['newsId'] title = data['title'] title = tokenizer.tokenize(title) title = ''.join([l for l in title]) content = data['content'] sentences = [] ans = '' + title for seq in re.split(r'[\n。]', content): seq = tokenizer.tokenize(seq) seq = ''.join([l for l in seq]) if len(seq) > 0: if len(seq) + len(ans) <= 254: if len(ans) == 0: ans = ans + seq else: ans = ans + '。' + seq elif len(seq) + len(ans) > 254 and len(seq) + len(
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text.

    `orig_text` is the span of the original (whitespace-tokenized) text that
    corresponds to the predicted span; it may carry extra characters that do
    not belong in the answer. `pred_text` is the prediction after tokenizer
    normalization. For example:

        pred_text = steve smith
        orig_text = Steve Smith's

    Returning `orig_text` would keep the unwanted "'s"; returning `pred_text`
    would keep the normalization (lower-casing, accent stripping, ...). The
    desired result is "Steve Smith".

    To get there, `orig_text` is tokenized, `pred_text` is located inside the
    tokenized text, and the match is mapped back character by character using
    the space-stripped forms of both strings. The alignment is a heuristic;
    whenever it fails, `orig_text` is returned unchanged.

    Args:
        pred_text: normalized predicted answer text.
        orig_text: original text span covering the prediction.
        do_lower_case: forwarded to `tokenization.BasicTokenizer`.

    Returns:
        The slice of `orig_text` aligned with `pred_text`, or `orig_text`
        itself if the alignment could not be established.
    """

    def _strip_spaces(text):
        # Returns the text without " " characters, plus an ordered map from
        # each index in the stripped text to its index in `text`.
        kept = [(pos, ch) for (pos, ch) in enumerate(text) if ch != " "]
        index_map = collections.OrderedDict(
            (ns_pos, pos) for (ns_pos, (pos, _)) in enumerate(kept))
        return ("".join(ch for (_, ch) in kept), index_map)

    basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(basic_tokenizer.tokenize(orig_text))

    tok_start = tok_text.find(pred_text)
    if tok_start == -1:
        if FLAGS.verbose_logging:
            tf.logging.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    tok_end = tok_start + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    # Equal stripped lengths are taken to mean the characters line up
    # one-to-one; anything else means the heuristic has failed.
    if len(orig_ns_text) != len(tok_ns_text):
        if FLAGS.verbose_logging:
            tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'",
                            orig_ns_text, tok_ns_text)
        return orig_text

    # Invert the tokenized-text map: index in `tok_text` -> stripped index.
    tok_s_to_ns_map = {
        tok_index: ns_index
        for (ns_index, tok_index) in six.iteritems(tok_ns_to_s_map)
    }

    def _to_orig_index(tok_index):
        # tok_text index -> stripped index -> orig_text index (None if unmapped).
        ns_index = tok_s_to_ns_map.get(tok_index)
        if ns_index is None:
            return None
        return orig_ns_to_s_map.get(ns_index)

    orig_start = _to_orig_index(tok_start)
    if orig_start is None:
        if FLAGS.verbose_logging:
            tf.logging.info("Couldn't map start position")
        return orig_text

    orig_end = _to_orig_index(tok_end)
    if orig_end is None:
        if FLAGS.verbose_logging:
            tf.logging.info("Couldn't map end position")
        return orig_text

    return orig_text[orig_start:(orig_end + 1)]
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text.

    `orig_text` is tokenized with `tokenization.BasicTokenizer`, `pred_text`
    is searched for in the space-joined tokens, and the match is mapped back
    onto `orig_text` through the space-stripped forms of both strings.

    Args:
        pred_text: predicted answer text, as it appears after tokenization.
        orig_text: original text span the prediction was taken from.
        do_lower_case: forwarded to `tokenization.BasicTokenizer`.

    Returns:
        The slice of `orig_text` aligned with `pred_text`. `orig_text` is
        returned unchanged when `pred_text` is not found, when the stripped
        lengths differ, or when either end of the match cannot be mapped.
    """

    def _strip_spaces(text):
        # Indices of the non-space characters, in order; enumerating them
        # yields the stripped-index -> original-index map.
        positions = [i for (i, c) in enumerate(text) if c != " "]
        ns_to_s_map = collections.OrderedDict(enumerate(positions))
        ns_text = "".join(text[i] for i in positions)
        return (ns_text, ns_to_s_map)

    tok_text = " ".join(
        tokenization.BasicTokenizer(do_lower_case=do_lower_case).tokenize(orig_text))

    first = tok_text.find(pred_text)
    if first == -1:
        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    last = first + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    # The character-level alignment is only attempted when both stripped
    # strings have the same length.
    if len(orig_ns_text) != len(tok_ns_text):
        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info(
                "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
        return orig_text

    # Reverse lookup: index in `tok_text` -> index in the stripped text.
    tok_s_to_ns_map = {}
    for (ns_index, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = ns_index

    # Map the first matched character back into `orig_text`.
    ns_first = tok_s_to_ns_map.get(first)
    orig_first = None if ns_first is None else orig_ns_to_s_map.get(ns_first)
    if orig_first is None:
        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info("Couldn't map start position")
        return orig_text

    # Map the last matched character back into `orig_text`.
    ns_last = tok_s_to_ns_map.get(last)
    orig_last = None if ns_last is None else orig_ns_to_s_map.get(ns_last)
    if orig_last is None:
        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info("Couldn't map end position")
        return orig_text

    return orig_text[orig_first:(orig_last + 1)]