def _create_eval_examples(self, lines):
    """Build MRC eval examples from parsed JSON records.

    Each record needs "query_id", "query", "alternatives" (pipe-separated,
    exactly 3 distinct choices after cleaning) and "passage". Malformed
    records are skipped. `choice` is fixed to 0 because gold answers are
    unavailable at eval time.

    Fixes vs. original: narrowed the bare `except:` to `except Exception`
    (no longer traps KeyboardInterrupt/SystemExit), removed the unused
    `max_length` accumulator and loop index, and dropped dead
    commented-out shuffle code.
    """
    examples = []
    for line in lines:
        try:
            qas_id = int(line["query_id"])
            query = clean(tokenization.convert_to_unicode(line["query"]))
            answer_choice = tokenization.convert_to_unicode(
                line["alternatives"]).split("|")
            # Deduplicate cleaned alternatives; exactly 3 must remain.
            answer_choice = list({clean(ans) for ans in answer_choice})
            assert len(answer_choice) == 3
            context = clean(tokenization.convert_to_unicode(line["passage"]))
        except Exception:
            # Skip records with missing keys, non-integer ids, or a
            # choice count != 3.
            continue
        examples.append(data_feature_mrc.InputExample(
            qas_id=qas_id,
            question_text=query,
            doc_tokens=context,
            answer_choice=answer_choice,
            choice=0))
    return examples
def _create_unsupervised_distillation_examples(self, lines, distillation_prob):
    """Pair unlabeled sentence-pair records with teacher probabilities.

    Records whose sentences are not both strings are ignored. Each kept
    record consumes the next entry of `distillation_prob`; the trailing
    assert guarantees the probability list is fully consumed, i.e. the
    two inputs stay aligned. `label_ratio=0.0` marks the gold label as
    unused (pure distillation).
    """
    examples = []
    consumed = 0
    for record in lines:
        guid = int(record["ID"])
        sent_a = record["sentence1"]
        sent_b = record["sentence2"]
        gold = record["gold_label"]
        if isinstance(sent_a, str) and isinstance(sent_b, str):
            examples.append(
                data_distillation_feature_classifier.InputExample(
                    guid=guid,
                    text_a=tokenization.convert_to_unicode(sent_a),
                    text_b=tokenization.convert_to_unicode(sent_b),
                    label=[gold],
                    label_probs=distillation_prob[consumed],
                    label_ratio=0.0,
                    distillation_ratio=1.0))
            consumed += 1
    assert consumed == len(distillation_prob)
    return examples
def _create_examples(self, lines, LABEL_SPLITTER="__label__"):
    """Parse fasttext-style lines ("__label__<digits> <token> <text>") into
    classifier examples.

    The tail segment after the label marker drops its first whitespace
    token; the label segment keeps only characters present in
    `self.label2id`. Malformed lines are printed and skipped.

    Fixes vs. original: raw string for the `\\d+` fragment, `except
    Exception` instead of a bare except, and dict membership instead of
    `in list(dict.keys())`.
    """
    re_pattern = u"({}{})".format(LABEL_SPLITTER, r"\d+")
    examples = []
    for i, line in enumerate(lines):
        try:
            guid = i
            element_list = re.split(re_pattern, line)
            # Drop the first token of the tail segment, join the rest.
            text_a = clean("".join(element_list[-1].split()[1:]))
            label_str = clean(element_list[1]).split(LABEL_SPLITTER)[-1]
            text_a = tokenization.convert_to_unicode(text_a)
            # NOTE(review): iterating a *string* yields characters, so only
            # single-character label ids survive — assumes labels are
            # single digits; confirm against label2id contents.
            input_labels = [
                ch.strip() for ch in label_str
                if ch.strip() in self.label2id
            ]
            examples.append(
                data_feature_classifier.InputExample(
                    guid=guid, text_a=text_a, text_b=None,
                    label=input_labels))
        except Exception:
            # Report the offending line but keep processing the rest.
            print(line, i)
    return examples
def clean(text):
    """Normalize raw text for downstream tokenization.

    Pipeline: strip surrounding whitespace, coerce to unicode, convert
    traditional Hanzi to simplified, fold full-width characters to
    half-width, delete #...# / |...| / [...] spans, then remove every
    remaining whitespace character.
    """
    normalized = full2half(
        HanziConv.toSimplified(
            tokenization.convert_to_unicode(text.strip())))
    # Strip hashtag-, pipe- and bracket-delimited spans.
    normalized = re.sub(r"\#.*?#|\|.*?\||\[.*?]", "", normalized)
    # Drop all whitespace.
    return re.sub(r"\s*", "", normalized)
def _create_examples(self, lines):
    """Build MRC training examples with the gold choice index.

    Alternatives are cleaned, deduplicated (must leave exactly 3) and
    shuffled; `choice` is the index of the gold answer among them.
    `choice_cnt` tallies the post-shuffle label distribution and is
    printed as a balance check.

    BUG FIX: the original left `choice` unset when the cleaned answer was
    missing from the alternatives, so the value *leaked from the previous
    iteration* and silently mislabeled the example (first occurrence was
    a NameError swallowed by a bare except). Such records are now skipped
    explicitly. Also removed the unused `max_length` and narrowed the
    bare except.
    """
    examples = []
    choice_cnt = {}
    for line in lines:
        try:
            qas_id = int(line["query_id"])
            query = clean(tokenization.convert_to_unicode(line["query"]))
            answer = clean(tokenization.convert_to_unicode(line["answer"]))
            answer_choice = tokenization.convert_to_unicode(
                line["alternatives"]).split("|")
            answer_choice = list({clean(ans) for ans in answer_choice})
            random.shuffle(answer_choice)
            assert len(answer_choice) == 3
            context = clean(tokenization.convert_to_unicode(line["passage"]))
        except Exception:
            # Malformed record (missing keys / bad id / != 3 choices).
            continue
        try:
            choice = answer_choice.index(answer)
        except ValueError:
            # Gold answer absent from the alternatives — skip instead of
            # reusing a stale index.
            continue
        choice_cnt[choice] = choice_cnt.get(choice, 0) + 1
        examples.append(data_feature_mrc.InputExample(
            qas_id=qas_id,
            question_text=query,
            doc_tokens=context,
            answer_choice=answer_choice,
            choice=choice))
    print(choice_cnt)
    return examples
def _create_supervised_distillation_examples(self, lines, distillation_dict_lst, LABEL_SPLITTER="__label__"):
    """Build supervised distillation examples with teacher prob + feature.

    `distillation_dict_lst[k]` (keys: "label_id", "prob", "feature") must
    align with the k-th successfully parsed line; the trailing assert
    enforces full consumption.

    Fixes vs. original: the bare `except:` wrapped the whole body and
    silently swallowed the single-label sanity `assert`, hiding
    teacher/label misalignment until (at best) the final count assert.
    The try now covers only the parsing steps, so the consistency check
    can actually fire. Regex fragments use raw strings and membership
    tests the dict directly.
    """
    re_pattern = u"({}{})".format(LABEL_SPLITTER, r"\d+")
    label_pattern = r"(?<={})(\d+)".format(LABEL_SPLITTER)
    examples = []
    cnt = 0
    for i, line in enumerate(lines):
        try:
            element_list = re.split(re_pattern, line)
            text_a = tokenization.convert_to_unicode(clean(element_list[-1]))
            found_labels = [m.group()
                            for m in re.finditer(label_pattern, line)]
        except Exception:
            # Malformed line: report and keep going. Note the final
            # count assert will still flag any resulting misalignment.
            print(line, i)
            continue
        input_labels = [
            label.strip() for label in found_labels
            if label.strip() in self.label2id
        ]
        if len(input_labels) == 1:
            # Teacher/label consistency check — now allowed to raise.
            assert int(input_labels[0]) == \
                distillation_dict_lst[cnt]["label_id"]
        examples.append(
            data_distillation_feature_classifier.InputExample(
                guid=i,
                text_a=text_a,
                text_b=None,
                label=input_labels,
                label_probs=distillation_dict_lst[cnt]["prob"],
                label_ratio=1.0,
                distillation_ratio=1.0,
                feature=distillation_dict_lst[cnt]["feature"]))
        cnt += 1
    assert cnt == len(distillation_dict_lst)
    return examples
def _create_examples(self, lines, LABEL_SPLITTER="__label__"):
    """Split each line on the label marker: segment 0 is the text, every
    later segment is a candidate label id kept only if present in
    `self.label2id`."""
    examples = []
    for guid, line in enumerate(lines):
        pieces = line.split(LABEL_SPLITTER)
        text_a = clean(tokenization.convert_to_unicode(pieces[0].strip()))
        labels = [
            piece.strip() for piece in pieces[1:]
            if piece.strip() in self.label2id
        ]
        examples.append(data_feature_classifier.InputExample(
            guid=guid,
            text_a=text_a,
            text_b=None,
            label=labels))
    return examples
def _create_examples(self, frequent_phrases):
    """Wrap each frequent phrase (first element of every entry) as a
    classifier example with the fixed label "0" — kept only when "0"
    exists in `self.label2id`."""
    examples = []
    for guid, entry in enumerate(frequent_phrases):
        text_a = tokenization.convert_to_unicode(clean(entry[0]))
        labels = ["0"] if "0" in self.label2id else []
        examples.append(
            data_feature_classifier.InputExample(
                guid=guid, text_a=text_a, text_b=None, label=labels))
    return examples
def _create_supervised_distillation_examples(self, lines, distillation_prob, LABEL_SPLITTER="__label__"):
    """Build supervised distillation examples with teacher probabilities.

    `distillation_prob` must provide exactly one entry per successfully
    parsed line (enforced by the trailing assert); malformed lines are
    printed and skipped.

    Fixes vs. original: raw string for the `\\d+` fragment, `except
    Exception` instead of a bare except, dead commented-out assert
    removed, and dict membership instead of `in list(dict.keys())`.
    """
    re_pattern = u"({}{})".format(LABEL_SPLITTER, r"\d+")
    examples = []
    cnt = 0
    for i, line in enumerate(lines):
        try:
            element_list = re.split(re_pattern, line)
            text_a = clean(element_list[-1])
            label_str = clean(element_list[1]).split(LABEL_SPLITTER)[-1]
        except Exception:
            # Malformed line: report and skip; alignment with
            # distillation_prob is re-checked by the final assert.
            print(line, i)
            continue
        text_a = tokenization.convert_to_unicode(text_a)
        # NOTE(review): iterating a *string* yields characters, so only
        # single-character label ids survive — assumes labels are single
        # digits; confirm against label2id contents.
        input_labels = [
            ch.strip() for ch in label_str
            if ch.strip() in self.label2id
        ]
        examples.append(
            data_distillation_feature_classifier.InputExample(
                guid=i,
                text_a=text_a,
                text_b=None,
                label=input_labels,
                label_probs=distillation_prob[cnt],
                label_ratio=1.0,
                distillation_ratio=1.0))
        cnt += 1
    assert cnt == len(distillation_prob)
    return examples
def _create_examples(self, lines, LABEL_SPLITTER="__label__"):
    """Parse fasttext-style lines into structure-distillation examples
    with a uniform teacher distribution and distillation disabled
    (`distillation_ratio=0.0`, `feature=None`).

    Fixes vs. original: raw strings for the regex fragments, `except
    Exception` instead of a bare except (and the try narrowed to the
    steps that can fail on bad input), dict membership instead of
    `in list(dict.keys())`.
    """
    re_pattern = u"({}{})".format(LABEL_SPLITTER, r"\d+")
    label_pattern = r"(?<={})(\d+)".format(LABEL_SPLITTER)
    examples = []
    for i, line in enumerate(lines):
        try:
            element_list = re.split(re_pattern, line)
            text_a = tokenization.convert_to_unicode(clean(element_list[-1]))
            found_labels = [m.group()
                            for m in re.finditer(label_pattern, line)]
        except Exception:
            # Malformed line: report and keep going.
            print(line, i)
            continue
        input_labels = [
            label.strip() for label in found_labels
            if label.strip() in self.label2id
        ]
        examples.append(
            data_structure_distillation.InputExample(
                guid=i,
                text_a=text_a,
                text_b=None,
                label=input_labels,
                # Fresh list per example: uniform prior over all labels.
                label_probs=[1.0 / len(self.label2id)] * len(self.label2id),
                label_ratio=1.0,
                distillation_ratio=0.0,
                feature=None))
    return examples
print("End of dataset") break return pred_label, qas_id print("===========begin to eval============") [pred_label, qas_id] = eval_fn(result) result = dict(zip(qas_id, pred_label)) print(len(result), "=====valid result======") with tf.gfile.Open(FLAGS.eval_data_file, "r") as frobj: qas_answer = {} for line in frobj: content = json.loads(line.strip()) qas_answer[int( content["query_id"])] = tokenization.convert_to_unicode( content["alternatives"]).split("|") with tf.gfile.Open(FLAGS.result_file, "w") as fwobj: cnt = 0 for index, key in enumerate(qas_answer): if key in result: cnt += 1 if index == 10: print("==index=={}".format(index)) pred_ans = qas_answer[key][result[key]] fwobj.write("\t".join([str(key), pred_ans]) + "\n") else: pred_ans = qas_answer[key][0] fwobj.write("\t".join([str(key), pred_ans]) + "\n") print(len(result), cnt, len(qas_answer),