Esempio n. 1
0
	def _create_eval_examples(self, lines):
		examples = []
		max_length = 0
		for (i, line) in enumerate(lines):
			try:
				qas_id = int(line["query_id"])
				query = tokenization.convert_to_unicode(line["query"])
				query = clean(query)
				answer_choice = tokenization.convert_to_unicode(line["alternatives"]).split("|")
				answer_choice = [clean(ans) for ans in answer_choice]
				answer_choice = list(set(answer_choice))
				assert len(answer_choice) == 3
				# random.shuffle(answer_choice)
				context = tokenization.convert_to_unicode(line["passage"])
				context = clean(context)
				examples.append(data_feature_mrc.InputExample(
						qas_id=qas_id,
						question_text=query,
						doc_tokens=context,
						answer_choice=answer_choice,
						choice=0
					))
			except:
				continue
		return examples
Esempio n. 2
0
    def _create_unsupervised_distillation_examples(self, lines,
                                                   distillation_prob):

        examples = []
        cnt = 0

        for (i, line) in enumerate(lines):
            content = line
            guid = int(content["ID"])
            text_a = content["sentence1"]
            text_b = content["sentence2"]
            label = content["gold_label"]
            if isinstance(text_a, str) and isinstance(text_b, str):

                text_a = tokenization.convert_to_unicode(text_a)
                text_b = tokenization.convert_to_unicode(text_b)
                input_labels = [label]

                examples.append(
                    data_distillation_feature_classifier.InputExample(
                        guid=guid,
                        text_a=text_a,
                        text_b=text_b,
                        label=input_labels,
                        label_probs=distillation_prob[cnt],
                        label_ratio=0.0,
                        distillation_ratio=1.0))
            cnt += 1
        assert cnt == len(distillation_prob)
        return examples
Esempio n. 3
0
    def _create_examples(self, lines, LABEL_SPLITTER="__label__"):
        re_pattern = u"({}{})".format(LABEL_SPLITTER, "\d+")

        examples = []
        for (i, line) in enumerate(lines):
            try:
                guid = i
                element_list = re.split(re_pattern, line)
                text_a = clean("".join(element_list[-1].split()[1:]))
                input_labels = clean(element_list[1]).split(LABEL_SPLITTER)[-1]

                text_a = tokenization.convert_to_unicode(text_a)
                input_labels = [
                    label.strip() for label in input_labels
                    if label.strip() in list(self.label2id.keys())
                ]

                examples.append(
                    data_feature_classifier.InputExample(guid=guid,
                                                         text_a=text_a,
                                                         text_b=None,
                                                         label=input_labels))
            except:
                print(line, i)
        return examples
Esempio n. 4
0
def clean(text):
    """Normalize a piece of text and remove markup spans and all whitespace.

    The input is stripped, passed through ``tokenization.convert_to_unicode``,
    ``HanziConv.toSimplified`` and ``full2half`` (presumably full-width to
    half-width conversion -- confirm against its definition), and then has
    every ``#...#``, ``|...|`` and ``[...]`` span and every whitespace
    character removed.
    """
    stripped = text.strip()
    normalized = full2half(
        HanziConv.toSimplified(tokenization.convert_to_unicode(stripped)))
    # Non-greedy matches, so each delimited span is removed on its own.
    without_spans = re.sub(u"\\#.*?#|\\|.*?\\||\\[.*?]", "", normalized)
    return re.sub(u"\\s*", "", without_spans)
Esempio n. 5
0
	def _create_examples(self, lines):
		examples = []
		choice_cnt = {}
		max_length = 0
		for (i, line) in enumerate(lines):
			try:
				qas_id = int(line["query_id"])
				query = tokenization.convert_to_unicode(line["query"])
				query = clean(query)
				answer = tokenization.convert_to_unicode(line["answer"])
				answer = clean(answer)
				answer_choice = tokenization.convert_to_unicode(line["alternatives"]).split("|")
				answer_choice = [clean(ans) for ans in answer_choice]
				answer_choice = list(set(answer_choice))
				random.shuffle(answer_choice)
				assert len(answer_choice) == 3
				context = tokenization.convert_to_unicode(line["passage"])
				context = clean(context)
				for index, ans in enumerate(answer_choice):
					if ans == answer:
						choice = index
						break
				if choice in choice_cnt:
					choice_cnt[choice] += 1
				else:
					choice_cnt[choice] = 1
				examples.append(data_feature_mrc.InputExample(
						qas_id=qas_id,
						question_text=query,
						doc_tokens=context,
						answer_choice=answer_choice,
						choice=choice
					))
			except:
				continue
		print(choice_cnt)
		return examples
Esempio n. 6
0
    def _create_supervised_distillation_examples(self,
                                                 lines,
                                                 distillation_dict_lst,
                                                 LABEL_SPLITTER="__label__"):
        re_pattern = u"({}{})".format(LABEL_SPLITTER, "\d+")
        label_pattern = "(?<={})(\d+)".format(LABEL_SPLITTER)

        examples = []
        cnt = 0
        for (i, line) in enumerate(lines):
            try:
                guid = i
                element_list = re.split(re_pattern, line)
                text_a = clean(element_list[-1])

                input_labels = []
                for l in re.finditer(label_pattern, line):
                    input_labels.append(l.group())
                text_a = tokenization.convert_to_unicode(text_a)
                input_labels = [
                    label.strip() for label in input_labels
                    if label.strip() in list(self.label2id.keys())
                ]

                if len(input_labels) == 1:
                    assert int(input_labels[0]
                               ) == distillation_dict_lst[cnt]["label_id"]

                examples.append(
                    data_distillation_feature_classifier.InputExample(
                        guid=guid,
                        text_a=text_a,
                        text_b=None,
                        label=input_labels,
                        label_probs=distillation_dict_lst[cnt]["prob"],
                        label_ratio=1.0,
                        distillation_ratio=1.0,
                        feature=distillation_dict_lst[cnt]["feature"]))
                cnt += 1
            except:
                print(line, i)
                continue

        assert cnt == len(distillation_dict_lst)
        return examples
Esempio n. 7
0
	def _create_examples(self, lines,
									LABEL_SPLITTER="__label__"):
		examples = []
		for (i, line) in enumerate(lines):
			guid = i
			element_list = line.split(LABEL_SPLITTER)
			text_a = tokenization.convert_to_unicode(element_list[0].strip())
			text_a = clean(text_a)
			input_labels = element_list[1:]
			input_labels = [label.strip() for label in input_labels if label.strip() in list(self.label2id.keys())]
			
			examples.append(data_feature_classifier.InputExample(
					guid=guid,
					text_a=text_a,
					text_b=None,
					label=input_labels
				))
		return examples
Esempio n. 8
0
    def _create_examples(self, frequent_phrases):

        examples = []
        for (i, line) in enumerate(frequent_phrases):
            guid = i
            text_a = clean(line[0])
            input_labels = ["0"]

            text_a = tokenization.convert_to_unicode(text_a)
            input_labels = [
                label.strip() for label in input_labels
                if label.strip() in list(self.label2id.keys())
            ]

            examples.append(
                data_feature_classifier.InputExample(guid=guid,
                                                     text_a=text_a,
                                                     text_b=None,
                                                     label=input_labels))
        return examples
Esempio n. 9
0
    def _create_supervised_distillation_examples(self,
                                                 lines,
                                                 distillation_prob,
                                                 LABEL_SPLITTER="__label__"):
        re_pattern = u"({}{})".format(LABEL_SPLITTER, "\d+")

        examples = []

        # assert len(lines) == len(distillation_prob)
        cnt = 0
        for (i, line) in enumerate(lines):
            try:
                guid = i
                element_list = re.split(re_pattern, line)
                text_a = clean(element_list[-1])
                input_labels = clean(element_list[1]).split(LABEL_SPLITTER)[-1]
            except:
                print(line, i)
                continue

            text_a = tokenization.convert_to_unicode(text_a)
            input_labels = [
                label.strip() for label in input_labels
                if label.strip() in list(self.label2id.keys())
            ]

            examples.append(
                data_distillation_feature_classifier.InputExample(
                    guid=guid,
                    text_a=text_a,
                    text_b=None,
                    label=input_labels,
                    label_probs=distillation_prob[cnt],
                    label_ratio=1.0,
                    distillation_ratio=1.0))
            cnt += 1
        assert cnt == len(distillation_prob)
        return examples
Esempio n. 10
0
    def _create_examples(self, lines, LABEL_SPLITTER="__label__"):
        re_pattern = u"({}{})".format(LABEL_SPLITTER, "\d+")
        label_pattern = "(?<={})(\d+)".format(LABEL_SPLITTER)

        examples = []
        for (i, line) in enumerate(lines):
            try:
                guid = i

                element_list = re.split(re_pattern, line)
                text_a = clean(element_list[-1])

                input_labels = []
                for l in re.finditer(label_pattern, line):
                    input_labels.append(l.group())

                text_a = tokenization.convert_to_unicode(text_a)
                input_labels = [
                    label.strip() for label in input_labels
                    if label.strip() in list(self.label2id.keys())
                ]

                examples.append(
                    data_structure_distillation.InputExample(
                        guid=guid,
                        text_a=text_a,
                        text_b=None,
                        label=input_labels,
                        label_probs=[1.0 / len(self.label2id)] *
                        len(self.label2id),
                        label_ratio=1.0,
                        distillation_ratio=0.0,
                        feature=None))
            except:
                print(line, i)
        return examples
Esempio n. 11
0
                print("End of dataset")
                break
        return pred_label, qas_id

    print("===========begin to eval============")
    [pred_label, qas_id] = eval_fn(result)
    result = dict(zip(qas_id, pred_label))

    print(len(result), "=====valid result======")

    with tf.gfile.Open(FLAGS.eval_data_file, "r") as frobj:
        qas_answer = {}
        for line in frobj:
            content = json.loads(line.strip())
            qas_answer[int(
                content["query_id"])] = tokenization.convert_to_unicode(
                    content["alternatives"]).split("|")

    with tf.gfile.Open(FLAGS.result_file, "w") as fwobj:
        cnt = 0
        for index, key in enumerate(qas_answer):
            if key in result:
                cnt += 1
                if index == 10:
                    print("==index=={}".format(index))
                pred_ans = qas_answer[key][result[key]]
                fwobj.write("\t".join([str(key), pred_ans]) + "\n")
            else:
                pred_ans = qas_answer[key][0]
                fwobj.write("\t".join([str(key), pred_ans]) + "\n")

    print(len(result), cnt, len(qas_answer),