def _convert_single_example(self, example, seq_length, tokenizer):
        tokens_a = tokenizer.tokenize(
            example.text_a
        )  # Todo: optimize here if you want char and word concat input
        if self.params['chinese_seg'] == 'mixed':
            tokenizer_word = tokenization.BasicTokenizer(chinese_seg='word',
                                                         params=self.params)
            tokenizer_char = tokenization.BasicTokenizer(chinese_seg='char',
                                                         params=self.params)
            tokens_a_word = tokenizer_word.tokenize(example.text_a)
            tokens_a_char = tokenizer_char.tokenize(example.text_a)

        if len(tokens_a) > seq_length - 2:
            tokens_a = tokens_a[0:(seq_length - 2)]

        tokens = []
        tokens.append("[CLS]")
        for token in tokens_a:
            tokens.append(token)
        tokens.append("[SEP]")

        input_ids = tokenizer.convert_tokens_to_ids(tokens=tokens)
        while len(input_ids) < seq_length:
            input_ids.append(0)
        assert len(input_ids) == seq_length

        if example.label in self.label_map.keys():
            label_id = self.label_map[example.label]
        else:
            label_id = self.label_map['NA']
        feature = InputFeatures(input_ids=input_ids, label_ids=label_id)
        #print('ids',example.label,'tokens',tokens)
        return feature
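For reference, a minimal sketch of the framing and padding that _convert_single_example performs above; the vocabulary and ids below are invented for illustration, and only the [CLS]/[SEP] framing, zero padding and the 'NA' fallback mirror the method.

# Toy illustration of the feature layout built by _convert_single_example above.
# Vocabulary and ids are invented; only the framing/padding logic is the same.
seq_length = 8
vocab = {"[CLS]": 101, "[SEP]": 102, "hello": 7592, "world": 2088}
tokens = ["[CLS]", "hello", "world", "[SEP]"]
input_ids = [vocab[t] for t in tokens]
while len(input_ids) < seq_length:
    input_ids.append(0)                 # pad with zeros up to seq_length
assert len(input_ids) == seq_length
print(input_ids)                        # [101, 7592, 2088, 102, 0, 0, 0, 0]

label_map = {"NA": 0, "positive": 1}
label_id = label_map.get("unseen-label", label_map["NA"])  # unknown labels fall back to 'NA'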
Example #2
def main():
    BERT_DIR = "/home1/s/shahkr/Penn/krunal/Courses/DecompRC/DecompRC/model/uncased_L-12_H-768_A-12/"
    parser = argparse.ArgumentParser("Postprocess decomposed HOTPOT questions")
    parser.add_argument("--vocab_file", default=BERT_DIR+"vocab.txt", type=str, \
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--do_lower_case", default=True, action='store_true')
    parser.add_argument("--perturb", type=str, default="remove")
    parser.add_argument("--data_type", type=str, default="dev")
    parser.add_argument("--out_name", default="out/onehop")
    args = parser.parse_args()

    if args.perturb == "no":
        return

    out_name = args.out_name
    data_type = args.data_type

    if not os.path.isdir(os.path.join('data', 'decomposed-predictions')):
        os.makedirs(os.path.join('data', 'decomposed-predictions'))

    data_type, reasoning_type = data_type.split('_')
    assert data_type in ['dev', 'train'] and reasoning_type in ['b', 'i']

    with open(os.path.join('data', 'hotpot-all', '{}.json'.format(data_type)),
              'r') as f:
        orig_data = json.load(f)['data']

    with open(os.path.join(out_name, '{}_predictions.json'.format(data_type)),
              'r') as f:
        result = json.load(f)

    output_path = os.path.join(
        out_name,
        '{}_{}_perturbed_predictions.json'.format(data_type, args.perturb))

    if not os.path.isdir(os.path.join('data', 'decomposed')):
        os.makedirs(os.path.join('data', 'decomposed'))

    if args.perturb == "remove":
        tokenizer = tokenization.BasicTokenizer(
            do_lower_case=args.do_lower_case,
            split_punct=False,
            ignore_ans=True)
        remove_queries(orig_data, result, output_path, tokenizer)
    elif args.perturb == "invert":
        tokenizer = tokenization.BasicTokenizer(
            do_lower_case=args.do_lower_case,
            split_punct=True,
            ignore_ans=True)
        invert(orig_data, result, output_path, tokenizer)
Example #3
    def test_basic_tokenizer_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

        self.assertAllEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["hello", "!", "how", "are", "you", "?"])
        self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
Example #4
def transform(sentences, start_index, end_index, make_fake=False):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    output = []
    if start_index != 0:
        tqdm = lambda x: x
    else:
        from tqdm import tqdm
    for line_index in tqdm(range(start_index, end_index)):
        line = sentences[line_index]
        line = line.strip()
        if len(line) == 0:
            output.append("\n")
        else:
            words = tokenizer.tokenize(line)
            if make_fake:
                assert all(
                    all(ord(c) < UNICODE_OFFSET for c in word)
                    for word in words), "Character unicode >= UNICODE_OFFSET"

                words = [
                    "".join([chr(ord(c) + UNICODE_OFFSET) for c in word])
                    for word in words
                ]
            output.append(" ".join(words) + "\n")
    return output
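The make_fake branch above builds "fake" words by shifting every character's code point up by UNICODE_OFFSET, a module-level constant that is not shown in this snippet. A minimal sketch of that shift, with a placeholder offset chosen purely for illustration:

# Placeholder value; the real UNICODE_OFFSET is defined elsewhere in the original module.
UNICODE_OFFSET = 0x2500

word = "cat"
fake = "".join(chr(ord(c) + UNICODE_OFFSET) for c in word)      # shift each character up
restored = "".join(chr(ord(c) - UNICODE_OFFSET) for c in fake)  # shifting back recovers the word
assert restored == word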
Example #5
    def test_basic_tokenizer_no_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

        self.assertAllEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "),
            ["HeLLo", "!", "how", "Are", "yoU", "?"],
        )
Example #6
def convert_pred2bauerinput(input_file,
                            output_file,
                            for_training,
                            predictions=None,
                            index_num_answers=None):
    tokenizer = tokenization.BasicTokenizer()
    sum_empty = 0
    with jsonlines.open(output_file, mode="w") as writer:
        with jsonlines.open(input_file, mode="r") as reader:
            for example in reader:
                qid = example['id']
                #context = [subelmt for elmt in example["context"] for subelmt in elmt]
                question = tokenizer.tokenize(example['question'])
                answer1, answer2 = example['final_answers'][:2]
                answer1, answer2 = tokenizer.tokenize(
                    answer1), tokenizer.tokenize(answer2)
                if for_training:
                    context = example['final_answers'][2:]
                    if len(context) == 0:
                        sum_empty += 1
                        context = example['context'][0]
                else:
                    context = predictions[qid][index_num_answers]
                writer.write({
                    "commonsense": [],
                    "summary": context,
                    "ques": question,
                    "answer1": answer1,
                    "answer2": answer2,
                    "doc_num": qid
                })
    print(sum_empty)
Example #7
    def test_chinese(self):
        tokenizer = tokenization.BasicTokenizer()

        self.assertAllEqual(
            tokenizer.tokenize("ah\u535A\u63A8zz"),
            ["ah", "\u535A", "\u63A8", "zz"],
        )
Example #8
    def test_basic_tokenizer_no_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

        self.assertAllEqual(tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
                            ["HeLLo", "!", "how", "Are", "yoU", "?"])
        self.assertAllEqual(tokenizer.tokenize(u" \tSveIks!kā  \n Tev ieT?  "),
                            ["SveIks", "!", "kā", "Tev", "ieT", "?"])
Example #9
def main(argv):
    global tokenizer
    global estimator
    global basic_tokenizer
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case,
                                           use_moran=True)

    num_train_steps = None
    num_warmup_steps = None

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=tf.estimator.RunConfig(session_config=config))

    basic_tokenizer = tokenization.BasicTokenizer(use_moran=False)
Example #10
def convert(dataset, input_file, output_file, bauer, n=0):
    import nltk
    import Levenshtein
    tokenizer = tokenization.BasicTokenizer()
    with open(input_file, "r") as pred_file:
        pred = json.load(pred_file)
    with open(output_file, "w") as writer:
        with jsonlines.open(bauer, "r") as bauer_file:
            for example in bauer_file:
                if example['doc_num'] in dataset.keys():
                    writen = False
                    for query_key, query_value in dataset[
                            example['doc_num']]['queries'].items():
                        levenshtein = Levenshtein.distance(
                            "".join(
                                tokenizer.tokenize(" ".join(example['ques']))),
                            "".join(tokenizer.tokenize(
                                query_value['query'])))  #TODO rechange
                        if levenshtein < 5:
                            query_id = query_key
                            generated_answer = pred.get(
                                query_id, ["NO PREDICTION"] * (n))[n - 1]
                            writer.write(generated_answer + "\n")
                            writen = True
                            break
                    if not writen:
                        print("f**k")
                        writer.write("NO PREDICTION\n")
Example #11
def customize_tokenizer(text, do_lower_case=False):
  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
  temp_x = ""
  text = tokenization.convert_to_unicode(text)
  for c in text:
    if tokenizer._is_chinese_char(ord(c)) or tokenization._is_punctuation(c) or tokenization._is_whitespace(c) or tokenization._is_control(c):
      temp_x += " " + c + " "
    else:
      temp_x += c
  if do_lower_case:
    temp_x = temp_x.lower()
  return temp_x.split() # so here we end up with a list of tokens
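In effect, customize_tokenizer pads spaces around CJK characters, punctuation, whitespace and control characters so that a plain split() yields one token per Chinese character or punctuation mark. A hedged usage sketch; the expected output follows from the padding logic above:

# Illustration only; relies on the same project-local `tokenization` module as the function above.
tokens = customize_tokenizer(u"ah博推zz, ok!", do_lower_case=True)
print(tokens)  # expected: ['ah', '博', '推', 'zz', ',', 'ok', '!']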
Example #12
def train(prefix=PREFIX,
          vocab_size=VOCABSIZE,
          ctl_symbols=CTLSYMBOLS,
          tokenized=args.tokenized):

    # if files are tokenized
    if not tokenized:
        files = _get_text_file()
        print("files: {}".format(files))
        # pre-tokenization
        tokenizer = tokenization.BasicTokenizer(do_lower_case=True)  #False?

        tokenak = []
        files_tokenized = ""  # comma separated files
        for fs in files.split(","):
            filename = fs + ".tokenized"
            with open(filename, 'w', encoding='utf-8') as fw:
                print("fs: {}".format(fs))
                with open(fs, 'r') as f:
                    for line in f:
                        tokenak = tokenizer.tokenize(line)
                        fw.write(" ".join([str(x) for x in tokenak]))
                        fw.write('\n')
            files_tokenized += "," + filename
    else:
        # files are tokenized
        files_tokenized = _get_tokenized_file()

    # https://github.com/allenai/scibert/blob/5d72d0ec50e2d3ebe971122f8b282278c210eccd/scripts/cheatsheet.txt

    # https://github.com/google/sentencepiece/blob/d4dd947fe71c4fa4ee24ad8297beee32887d8828/python/sentencepiece_python_module_example.ipynb
    # see the sentencepiece Normalization section

    # parameters
    # https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto

    # spm.SentencePieceTrainer.Train('--input=combined.out --model_prefix=100B_9999_cased --vocab_size=31000 --character_coverage=0.9999 --model_type=bpe --input_sentence_size=100000000 --shuffle_input_sentence=true')

    # BPE
    #command = f'--input={files_tokenized} --model_prefix={prefix} --vocab_size={vocab_size} --control_symbols={ctl_symbols} --character_coverage=1.0 --model_type=bpe --input_sentence_size=100000000 --shuffle_input_sentence=true --normalization_rule_name=identity'

    # UNIGRAM
    command = f'--input={files_tokenized} --model_prefix={prefix} --vocab_size={vocab_size} --control_symbols={ctl_symbols} --character_coverage=1.0 --model_type=unigram --input_sentence_size=100000000 --shuffle_input_sentence=true --normalization_rule_name=identity'

    # --model_type=word
    #command = f'--input={files_tokenized} --model_prefix={prefix} --vocab_size={vocab_size} --control_symbols={ctl_symbols} --model_type=word --hard_vocab_limit=false'  # RuntimeError: Internal: /sentencepiece/src/trainer_interface.cc(498) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())]

    #
    #command = f'--input={files_tokenized} --model_prefix={prefix} --vocab_size={vocab_size} --control_symbols={ctl_symbols} --hard_vocab_limit=false'  # RuntimeError: Internal: /sentencepiece/src/trainer_interface.cc(498) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())]
    sp.SentencePieceTrainer.Train(command)
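Once training finishes, the resulting "{prefix}.model" file can be loaded and applied like any SentencePiece model. A minimal follow-up sketch using the standard sentencepiece API (not part of the snippet above):

# Minimal sketch of loading and using the model trained above.
import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load(PREFIX + ".model")                     # model file written by the Train() call above
pieces = sp_model.EncodeAsPieces("this is a test")   # subword pieces
ids = sp_model.EncodeAsIds("this is a test")         # corresponding vocabulary ids
print(pieces, ids)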
Example #13
 def __init__(self,
              params,
              seq_length,
              chinese_seg,
              generate_label_map=False):
     self.seq_length = seq_length
     self.params = params  # pass parameters by reference in python
     self.tokenizer = tokenization.BasicTokenizer(chinese_seg=chinese_seg,
                                                  params=params)
     self.generate_label_map = generate_label_map
     if self.generate_label_map:
         self.labels = set(['NA'])
         self.label_map = {}
     else:
         _, self.label_map = self.load_label_dict()
Example #14
    def test_chinese(self):
        tokenizer = tokenization.BasicTokenizer()

        tokens = tokenizer.tokenize(
            u'患者于4年前出现活动性心悸、胸痛,多在重体力活动时发作,胸痛位于剑突下和心前区,手掌大小,'
            u'呈闷压样疼痛不适,每次持续10分钟左右,休息数分钟可缓解,发作时伴明显心悸、呼吸困难,无咳嗽、'
            u'咳痰,无恶心、呕吐,无出汗,头晕、头痛。曾于2011年来我院就诊,诊断为“冠心病 不稳定心绞痛 '
            u'房颤 心功能3级”,后正规服用药物,症状仍间断发作。3月来上述症状明显加重,表现为明显不能耐受体力活动,'
            u'稍活动即有明显的胸痛发作,长舒气后症状有所缓解,伴四肢乏力,以双下肢为甚,'
            u'伴夜间阵发性呼吸困难及端坐呼吸,上述症状间断出现,进行性加重,后出现双下肢水肿,晨轻暮重,'
            u'今为进一步明确诊治,特来我院,门诊以“冠心病 心律失常 心功能不全”收入我科')
        for token in tokens:
            print(token)

        self.assertAllEqual(tokenizer.tokenize(u"ah\u535A\u63A8zz"),
                            [u"ah", u"\u535A", u"\u63A8", u"zz"])
Example #15
def create_predict_examples(input_file):

    examples = []
    id = 10000
    df = pd.read_excel(input_file)
    basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    str_list = list(df['Content'])
    for i in range(len(str_list)):
        str_list[i] = str_list[i].replace('"', '')
        str_list[i] = str_list[i].replace("'", '')

    for req in str_list:
        tok_list = basic_tokenizer.tokenize(req)
        #tok_list.append('EOS')
        examples.append(InputExample(id, tok_list, ['O'] * len(tok_list)))
        id += 1
    return examples
Example #16
def build_char_vocabs(data_dir, char_embedding_table, char_dict,
                      PairIndexDict):

    all_sentences, max_len_p, max_len_h = get_all_data(data_dir, PairIndexDict)
    print(len(all_sentences))
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    _, char_embedding_size = char_embedding_table.shape

    chars2id = {}
    id2chars = {}
    tokenC_embeddings = []

    chars2id["[PAD]"] = 0
    id2chars[0] = "[PAD]"
    tokenC_embeddings.append(np.zeros([30]))
    token_id = 1

    for sen in all_sentences:

        tokens = tokenizer.tokenize(sen)
        chars_embedding = np.ndarray((len(tokens), char_embedding_size))
        for token in tokens:
            if token not in chars2id:
                chars2id[token] = token_id
                id2chars[token_id] = token
                token_id += 1

                chars = list(token)
                word_chars_embedding = np.ndarray(
                    (len(chars), char_embedding_size))
                for i, char in enumerate(chars):
                    if char not in char_dict:
                        word_chars_embedding[i, :] = char_embedding_table[
                            -1, :]
                    else:
                        word_chars_embedding[i, :] = char_embedding_table[
                            char_dict[char], :]
                word_char_embedding = np.amax(word_chars_embedding, axis=0)
                tokenC_embeddings.append(word_char_embedding)

    token_char_embedding_table = np.ndarray(
        (len(tokenC_embeddings), char_embedding_size))
    for i in range(len(tokenC_embeddings)):
        token_char_embedding_table[i, :] = tokenC_embeddings[i]

    return (chars2id, id2chars, token_char_embedding_table)
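Each token's character-level embedding above is the element-wise maximum over its characters' rows in char_embedding_table, with unknown characters mapped to the last row. A toy sketch of that pooling step with invented values:

# Toy illustration of the per-token max-pooling used above; values are invented.
import numpy as np

char_embedding_table = np.array([[0.1, 0.9],   # embedding for 'a'
                                 [0.5, 0.2],   # embedding for 'b'
                                 [0.0, 0.0]])  # last row: unknown characters
char_dict = {"a": 0, "b": 1}

token = "ab?"
rows = [char_embedding_table[char_dict.get(c, -1)] for c in token]
token_embedding = np.amax(np.stack(rows), axis=0)
print(token_embedding)  # [0.5 0.9]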
Example #17
def convert_pred2bauerpred(dataset,
                           input_file,
                           output_file,
                           bauer,
                           pred_or_wl=True,
                           n=1,
                           levenshtein_threshold=5):
    tokenizer = tokenization.BasicTokenizer()
    file_gt_a1 = open(output_file + "_gt1.txt", "w")
    file_gt_a2 = open(output_file + "_gt2.txt", "w")
    with open(input_file, "r") as pred_file:
        pred = json.load(pred_file)
    with open(output_file, "w") as writer:
        with jsonlines.open(bauer, "r") as bauer_file:
            for example in bauer_file:
                if example['doc_num'] in dataset.keys():
                    writen = False
                    for query_key, query_value in dataset[
                            example['doc_num']]['queries'].items():
                        levenshtein = Levenshtein.distance(
                            "".join(
                                tokenizer.tokenize(" ".join(example['ques']))),
                            "".join(tokenizer.tokenize(query_value['query'])))
                        if levenshtein < levenshtein_threshold:
                            query_id = query_key
                            file_gt_a1.write(query_value['answer1'] + "\n")
                            file_gt_a2.write(query_value['answer2'] + "\n")
                            if pred_or_wl:  # True when we want predictions
                                generated_answer = pred.get(
                                    query_id, ["NO PREDICTION"] * (n))[n - 1]
                            else:  # False when we want the first weak label
                                generated_answer = pred.get(
                                    query_id, [["NO PREDICTION"] * 3])[-1]
                                if len(generated_answer
                                       ) > n:  # n depends on whether the ground truth is included
                                    generated_answer = generated_answer[n]
                                else:
                                    generated_answer = "unknown"
                            writer.write(generated_answer + "\n")
                            writen = True
                            break
                    if not writen:
                        writer.write("NO PREDICTION\n")
    file_gt_a1.close()
    file_gt_a2.close()
Example #18
def get_final_text(pred_text, orig_text, do_lower_case, logger,
                   verbose_logging):
    """Project the tokenized prediction back to the original text."""
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
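The heuristic above is easiest to see on the classic SQuAD example: a lower-cased, punctuation-split prediction is projected back onto the original surface string. A hedged usage sketch of the function defined above (the logger is just a placeholder):

# Usage sketch only; get_final_text is the function above and relies on the same
# project-local `tokenization` module.
import logging

logger = logging.getLogger(__name__)
final = get_final_text(pred_text="steve smith",
                       orig_text="Steve Smith's",
                       do_lower_case=True,
                       logger=logger,
                       verbose_logging=False)
print(final)  # expected: "Steve Smith"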
Example #19
  parser.add_argument("--edit_ids", type=str, default=None, help="path to edit ids to be applied on input_tokens")
  parser.add_argument("--output_tokens", type=str, default=None, help="path to edited (hopefully corrected) file")
  parser.add_argument("--infer_mode", type=str, default="conll", help="post processing mode bea or conll")
  parser.add_argument("--path_common_inserts",type=str,default=None,help="path of common unigram inserts")
  parser.add_argument("--path_common_multitoken_inserts",type=str,default=None,help="path of common bigram inserts")
  parser.add_argument("--path_common_deletes",type=str,default=None,help="path to common deletions observed in train data")

parser = argparse.ArgumentParser()
add_arguments(parser)
FLAGS, unparsed = parser.parse_known_args()

DO_PARALLEL = False
INFER_MODE=FLAGS.infer_mode

vocab = tokenization.load_vocab(FLAGS.vocab_path)
basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=False,vocab=vocab)
vocab_words = set(x for x in vocab)
common_deletes = pickle.load(open(FLAGS.path_common_deletes,"rb"))
path_common_inserts = FLAGS.path_common_inserts
path_common_multitoken_inserts = FLAGS.path_common_multitoken_inserts
opcodes = opcodes.Opcodes(path_common_inserts, path_common_multitoken_inserts)

if __name__ == '__main__':
    class config:
        INPUT_UNCORRECTED_WORDS = FLAGS.input_tokens
        INPUT_EDITS = FLAGS.edit_ids
        OUTPUT_CORRECTED_WORDS = FLAGS.output_tokens



def fix_apos_break(word, p_word, pp_word):
Example #20
def main(argv):
  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  validate_flags_or_throw(bert_config)

  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, use_moran=True)

  num_train_steps = None
  num_warmup_steps = None

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps)

  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=tf.estimator.RunConfig(session_config=config))

  basic_tokenizer = tokenization.BasicTokenizer(use_moran=False)

  examples = read_korquad_examples(input_file=FLAGS.example_file)
  rnd = random.Random(12345)
  rnd.shuffle(examples)

  print(f"\n=========== 기계독해 예문 {len(examples)}건 Loadind Done ...  by TBai ==============\n")
  all_results = list()
  output_results = dict()
  for result in estimator.predict(
          input_fn_builder(
              input_data=examples,
              tokenizer=tokenizer,
              seq_length=FLAGS.max_seq_length,
              drop_remainder=False),
          yield_single_examples=False) :

    print('########################### next1')
    #print('########################### result =', result)
    print('########################### len(eval_features[0]) =', len(eval_features[0]))
    for idx in range(len(eval_features[0])) :
      unique_id = int(result["unique_ids"][idx])
      start_logits = [float(x) for x in result["start_logits"][idx].flat]
      end_logits = [float(x) for x in result["end_logits"][idx].flat]

      print('########################################################### ', unique_id);
      print('########################################################### ', start_logits);
      print('########################################################### ', end_logits);

      all_results.append(RawResult(
          unique_id=unique_id,
          start_logits=start_logits,
          end_logits=end_logits))

    if len(eval_examples) != 0 and len(eval_features) != 0 :
      answer, score, prob = write_predictions(eval_examples[0], eval_features[0], all_results,
                    FLAGS.n_best_size, FLAGS.max_answer_length,
                    FLAGS.do_lower_case, FLAGS.analyze_kor_morph, basic_tokenizer)

      print('####### answer =', answer)
      print('####### score =', score)
      print('####### prob =', prob)

      tscore = str(score)[:5]
      tprob  = str(prob*100)[:5]
      if answer != 'N/A' :
        print(f"Done ... 답변 신뢰도 :  ({tscore}, {tprob}%)")
      else :
        print(f"Done ...")

      res = "독해결과  : "  + answer + "\n\n"
      print(f"{res}")

    all_results.clear()
    eval_features.clear()
    eval_examples.clear()

  print("\n\n\t투블럭에이아이에서 제공하여 드렸습니다. https://twoblockai.com/\n\n")
Example #21
 def __init__(self, converted_filename, dataset, ranking_dic=None):
     self.converted_filename = converted_filename
     self.dataset = dataset
     self.ranking_dic = ranking_dic
     self.tokenizer = tokenization.BasicTokenizer()
Example #22
def main(argv):
  tf.logging.set_verbosity(tf.logging.INFO)

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)


  if len(argv) > 1:
    FLAGS.predict_file = argv[1]

  validate_flags_or_throw(bert_config)
  tf.gfile.MakeDirs(FLAGS.output_dir)
  moran_tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, use_moran=True)
  basic_tokenizer = tokenization.BasicTokenizer(use_moran=False)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train :
    train_examples = read_squad_examples(input_file=FLAGS.train_file, is_training=True)
    num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(42)
    rng.shuffle(train_examples)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)


  if FLAGS.do_train:

    train_record_exists = False
    train_writer = FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
        is_training=True,
        record_file_exists=train_record_exists)
    convert_examples_to_features(
        examples=train_examples,
        tokenizer=moran_tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num orig examples = %d", len(train_examples))
    tf.logging.info("  Num split examples = %d", train_writer.num_features)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    del train_examples

    train_input_fn = input_fn_builder(
        input_file=train_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_predict :
    output_prediction_file_name = "predictions.json"
    output_nbest_file_name = "nbest_predictions.json"
    output_null_log_odds_file_name = "null_odds.json"
    if FLAGS.korquad_refine_answer_by_pos:
        output_prediction_file_name = "predictions_pos.json"

    if FLAGS.do_predict:
      eval_examples = read_squad_examples(input_file=FLAGS.predict_file, is_training=False)

    eval_record_exists = os.path.exists(os.path.join(FLAGS.output_dir, "eval.tf_record"))
    eval_record_exists = False
    if eval_record_exists:
      tf.logging.info("eval.tf_record exists. Do not write tf example file.")
    eval_writer = FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
        is_training=False,
        record_file_exists=eval_record_exists)
    eval_features = []

    def append_feature(feature):
      eval_features.append(feature)
      eval_writer.process_feature(feature)

    convert_examples_to_features(
        examples=eval_examples,
        tokenizer=moran_tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    all_results = []

    predict_input_fn = input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    if FLAGS.do_train:
      init_checkpoint = None
    else:
      if FLAGS.init_checkpoint is not None and tf.gfile.IsDirectory(FLAGS.init_checkpoint):
        from tensorflow.python.training import checkpoint_management
        init_checkpoint = checkpoint_management.latest_checkpoint(FLAGS.init_checkpoint)
      else:
        init_checkpoint = FLAGS.init_checkpoint

    all_results = []
    for result in estimator.predict(
        predict_input_fn, yield_single_examples=True, checkpoint_path=init_checkpoint):
      unique_id = int(result["unique_ids"])
      start_logits = [float(x) for x in result["start_logits"].flat]
      end_logits = [float(x) for x in result["end_logits"].flat]
      all_results.append(
          RawResult(
              unique_id=unique_id,
              start_logits=start_logits,
              end_logits=end_logits))

    if len(argv) > 1:
      output_prediction_file = os.path.join(FLAGS.output_dir, argv[2])
    else:
      output_prediction_file = os.path.join(FLAGS.output_dir, output_prediction_file_name)
    output_nbest_file = os.path.join(FLAGS.output_dir, output_nbest_file_name)
    output_null_log_odds_file = os.path.join(FLAGS.output_dir, output_null_log_odds_file_name)

    write_predictions(eval_examples, eval_features, all_results,
                      FLAGS.n_best_size, FLAGS.max_answer_length,
                      FLAGS.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, basic_tokenizer)
Example #23
                best_span = (i, j)
                if f1 >= 1.0:
                    return best_span, best_f1
    return best_span, best_f1


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', '-d', type=str, required=True)
    parser.add_argument('--output_file', '-o', type=str, required=True)
    parser.add_argument('--bert', '-b', action='store_true', default=False,
                        help='Use bert tokenization')
    args = parser.parse_args()

    if args.bert:
        tokenizer = bt.BasicTokenizer(do_lower_case=False)
        _process = lambda x: process_bert(x, tokenizer)
    else:
        _process = process

    with open(args.data_file, 'r') as f:
        dataset = json.load(f)

    f1_scores = []
    data = []
    start_time = time.time()
    diag_rows = []
    print(','.join(['gold_answer', 'gold_rationale', 'span_reconstructed_answer', 'f1']))
    for i, datum in enumerate(dataset['data']):
        if i % 10 == 0:
            print('processing %d / %d (used_time = %.2fs)...' %
Example #24
    def _read_data(cls, input_file, mode='train'):
        """
            对应下文的百度比赛的数据处理。
            http://lic2019.ccf.org.cn/kg
            其中有两个地方注意下。 这里先将数据用bert的BasicTokenizer处理一下把一些奇奇怪怪的字符处理掉。如果不处理的话,
            后文的inputs_id和label_id 对应不上,因为你的label_id是带有那些奇怪的字符的。而bert处理过后不带。
                    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
                    text = tokenizer.tokenize(text)
        """
        import json
        tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
        trn_data = json.load(open(input_file, encoding='utf-8'))
        if mode == 'train':
            train_data = []
            for line in trn_data:
                text = line['text'].strip()
                text = tokenizer.tokenize(text)
                text = ''.join([l for l in text])
                label = ['O'] * len(text)
                spo_list = line['spo_list']
                sub_ = []
                obj_ = []
                for i in spo_list:
                    sub_.append(i[0])
                    obj_.append(i[2])
                ent_spans = []
                for sub in sub_:
                    if sub is not None:
                        last_idx = 0
                        while True:
                            if last_idx >= len(text):
                                break
                            start = text[last_idx:].find(sub)
                            if start == -1:
                                break
                            end = start + len(sub)
                            ent_spans.append((start + last_idx, end + last_idx))
                            last_idx = end + last_idx
                ent_obj = []
                for obj in obj_:
                    last_idx = 0
                    while True:
                        if last_idx >= len(text):
                            break
                        start = text[last_idx:].find(obj)
                        if start == -1:
                            break
                        end = start + len(obj)
                        ent_obj.append((start + last_idx, end + last_idx))
                        last_idx = end + last_idx

                for i, c in enumerate(text):
                    for sp in ent_spans:
                        if sp[0] == i:
                            if sp[0] == sp[1]:
                                label[i] = 'S1'
                            else:
                                label[i] = 'B1'
                        elif sp[1] - 1 == i:
                            label[i] = 'E1'

                        elif sp[0] < i < sp[1] - 1:
                            label[i] = 'I1'

                for i, c in enumerate(text):
                    for sp in ent_obj:
                        if sp[0] == i:
                            if sp[0] == sp[1]:
                                label[i] = 'S2'
                            else:
                                label[i] = 'B2'
                        elif sp[1] - 1 == i:
                            label[i] = 'E2'

                        elif sp[0] < i < sp[1] - 1:
                            label[i] = 'I2'

                l = ' '.join([la for la in label])
                w = ' '.join([word for word in text])
                train_data.append((w, l))
            return train_data
        elif mode == 'test':
            test_data = []
            for line in trn_data:
                text = line['text'].strip()
                label = ['O'] * len(text)
                test_data.append((list(text), label))
            return test_data
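The loops above tag subject spans with B1/I1/E1/S1 and object spans with B2/I2/E2/S2, character by character, leaving 'O' elsewhere. A hypothetical mini-example, assuming spo_list entries are [subject, predicate, object] triples as the indexing above suggests:

# Hypothetical illustration of the character-level tags produced by _read_data above.
text = "周杰伦出生于台湾"
spo_list = [["周杰伦", "出生地", "台湾"]]
# subject "周杰伦" covers characters 0-2, object "台湾" covers characters 6-7, so the labels are:
#   周  杰  伦  出  生  于  台  湾
#   B1  I1  E1  O   O   O   B2  E2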
Example #25
class BertMRC :
  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  validate_flags_or_throw(bert_config)

  tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, use_moran=True)

  num_train_steps = None
  num_warmup_steps = None

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps)

  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=tf.estimator.RunConfig(session_config=config))

  basic_tokenizer = tokenization.BasicTokenizer(use_moran=False)

  #examples = read_korquad_examples()
  #rnd = random.Random(12345)
  #rnd.shuffle(examples)
  #print(f"\n=========== 기계독해 예문 {len(examples)}건 Loadind Done ...  by TBai ==============\n")

  def predict(self, input_context, input_question):
    print('input_context =', input_context)
    print('input_question =', input_question)

    all_results = list()
    output_results = dict()

    response = []
    response_detail  = {}
    for result in self.estimator.predict(
          input_fn_builder(
              input_context, input_question,
              tokenizer=self.tokenizer,
              seq_length=FLAGS.max_seq_length,
              drop_remainder=False),
          yield_single_examples=False):

      for idx in range(len(eval_features[0])):
          unique_id = int(result["unique_ids"][idx])
          start_logits = [float(x) for x in result["start_logits"][idx].flat]
          end_logits = [float(x) for x in result["end_logits"][idx].flat]

          all_results.append(RawResult(
              unique_id=unique_id,
              start_logits=start_logits,
              end_logits=end_logits))

      if len(eval_examples) != 0 and len(eval_features) != 0:
          answer, score, prob = write_predictions(eval_examples[0], eval_features[0], all_results,
                                                  FLAGS.n_best_size, FLAGS.max_answer_length,
                                                  FLAGS.do_lower_case, FLAGS.analyze_kor_morph, self.basic_tokenizer)

          #print('####### answer = %s, score = %f, prob = %f', answer, score, prob)

          tscore = str(score)[:5]
          tprob = str(prob * 100)[:5]
          if answer != 'N/A':
              print(f"Done ... 답변 신뢰도 :  ({tscore}, {tprob}%)")
          else:
              print(f"Done ...")

          res = "독해결과  : " + answer + "\n\n"
          print(f"{res}")
          response_detail = {'answer': answer, 'score': tscore, 'prob': tprob}
          response.append(response_detail)

      all_results.clear()
      eval_features.clear()
      eval_examples.clear()
      return response
Example #26
import os
import sys
import tokenization

LANG = sys.argv[1]
pair = f'{LANG}-en'

src_path = os.path.join("data", "europarl", pair, f"europarl-v7.{pair}.en")
tgt_path = os.path.join("data", "europarl", pair, f"europarl-v7.{pair}.{LANG}")
out_path = os.path.join("data", "europarl", pair, f"fastalign-europarl.{pair}")

src_fp = open(src_path, 'r')
tgt_fp = open(tgt_path, 'r')

tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

with open(out_path, 'w') as out_fp:
	tokenize = lambda x: " ".join(tokenizer.tokenize(x))

	for src_sent, tgt_sent in zip(src_fp, tgt_fp):
		if src_sent and tgt_sent:
			src_tokens = tokenize(src_sent)
			tgt_tokens = tokenize(tgt_sent)
			
			if src_tokens and tgt_tokens:
				out_fp.write(f'{src_tokens} ||| {tgt_tokens}\n')

src_fp.close()
tgt_fp.close()
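The file written above uses the "source ||| target" line format that fast_align expects. A quick sanity check of one output line, reusing out_path from the script above:

# Quick sanity check of the "source ||| target" format written above.
with open(out_path) as fp:
    first_line = fp.readline().rstrip("\n")
src_tokens, tgt_tokens = first_line.split(" ||| ")
print(len(src_tokens.split()), "source tokens |", len(tgt_tokens.split()), "target tokens")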
Example #27
import tokenization
import torch
import numpy as np
import os
import json
import re
import random
import pickle
import pytorch_pretrained_bert  # needed for BertForQuestionAnswering used below
import time
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
DATA_PATH = 'data/small_train_data.json'
tokenizer = tokenization.BasicTokenizer()
full_tokenizer = tokenization.BertTokenizer.from_pretrained(
    'bert-base-chinese', cache_dir='./cache')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device:', device)
model = pytorch_pretrained_bert.modeling.BertForQuestionAnswering.from_pretrained(
    'bert-base-chinese')
MULTI_GPU = False
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = torch.nn.DataParallel(model)
    MULTI_GPU = True
model.to(device)

EPOCHS = 5
BATCH_SIZE = 6
Example #28
import codecs
import json
from tqdm import tqdm
import re
import tokenization

# tokenizer = FullTokenizer(vocab_file='/opt/hanyaopeng/souhu/data/chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True)
tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
input_file = '/opt/hanyaopeng/souhu/data/data_v2/coreEntityEmotion_test_stage1.txt'

with open(input_file, encoding='utf-8') as f:
    test_data = []
    for l in tqdm(f):
        data = json.loads(l.strip())
        news_id = data['newsId']
        title = data['title']
        title = tokenizer.tokenize(title)
        title = ''.join([l for l in title])
        content = data['content']
        sentences = []
        ans = '' + title
        for seq in re.split(r'[\n。]', content):
            seq = tokenizer.tokenize(seq)
            seq = ''.join([l for l in seq])
            if len(seq) > 0:
                if len(seq) + len(ans) <= 254:
                    if len(ans) == 0:
                        ans = ans + seq
                    else:
                        ans = ans + '。' + seq
                elif len(seq) + len(ans) > 254 and len(seq) + len(
Example #29
def get_final_text(pred_text, orig_text, do_lower_case):
  """Project the tokenized prediction back to the original text."""

  # When we created the data, we kept track of the alignment between original
  # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
  # now `orig_text` contains the span of our original text corresponding to the
  # span that we predicted.
  #
  # However, `orig_text` may contain extra characters that we don't want in
  # our prediction.
  #
  # For example, let's say:
  #   pred_text = steve smith
  #   orig_text = Steve Smith's
  #
  # We don't want to return `orig_text` because it contains the extra "'s".
  #
  # We don't want to return `pred_text` because it's already been normalized
  # (the SQuAD eval script also does punctuation stripping/lower casing but
  # our tokenizer does additional normalization like stripping accent
  # characters).
  #
  # What we really want to return is "Steve Smith".
  #
  # Therefore, we have to apply a semi-complicated alignment heuristic between
  # `pred_text` and `orig_text` to get a character-to-character alignment. This
  # can fail in certain cases in which case we just return `orig_text`.

  def _strip_spaces(text):
    ns_chars = []
    ns_to_s_map = collections.OrderedDict()
    for (i, c) in enumerate(text):
      if c == " ":
        continue
      ns_to_s_map[len(ns_chars)] = i
      ns_chars.append(c)
    ns_text = "".join(ns_chars)
    return (ns_text, ns_to_s_map)

  # We first tokenize `orig_text`, strip whitespace from the result
  # and `pred_text`, and check if they are the same length. If they are
  # NOT the same length, the heuristic has failed. If they are the same
  # length, we assume the characters are one-to-one aligned.
  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)

  tok_text = " ".join(tokenizer.tokenize(orig_text))

  start_position = tok_text.find(pred_text)
  if start_position == -1:
    if FLAGS.verbose_logging:
      tf.logging.info(
          "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
    return orig_text
  end_position = start_position + len(pred_text) - 1

  (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
  (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

  if len(orig_ns_text) != len(tok_ns_text):
    if FLAGS.verbose_logging:
      tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'",
                      orig_ns_text, tok_ns_text)
    return orig_text

  # We then project the characters in `pred_text` back to `orig_text` using
  # the character-to-character alignment.
  tok_s_to_ns_map = {}
  for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
    tok_s_to_ns_map[tok_index] = i

  orig_start_position = None
  if start_position in tok_s_to_ns_map:
    ns_start_position = tok_s_to_ns_map[start_position]
    if ns_start_position in orig_ns_to_s_map:
      orig_start_position = orig_ns_to_s_map[ns_start_position]

  if orig_start_position is None:
    if FLAGS.verbose_logging:
      tf.logging.info("Couldn't map start position")
    return orig_text

  orig_end_position = None
  if end_position in tok_s_to_ns_map:
    ns_end_position = tok_s_to_ns_map[end_position]
    if ns_end_position in orig_ns_to_s_map:
      orig_end_position = orig_ns_to_s_map[ns_end_position]

  if orig_end_position is None:
    if FLAGS.verbose_logging:
      tf.logging.info("Couldn't map end position")
    return orig_text

  output_text = orig_text[orig_start_position:(orig_end_position + 1)]
  return output_text
Example #30
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text."""
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info("Unable to find text: '%s' in '%s'" %
                                      (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text