Example #1
0
    def _create_unsupervised_distillation_examples(self, lines,
                                                   distillation_prob):

        examples = []
        cnt = 0

        for (i, line) in enumerate(lines):
            content = line
            guid = int(content["ID"])
            text_a = content["sentence1"]
            text_b = content["sentence2"]
            label = content["gold_label"]
            if isinstance(text_a, str) and isinstance(text_b, str):

                text_a = tokenization.convert_to_unicode(text_a)
                text_b = tokenization.convert_to_unicode(text_b)
                input_labels = [label]

                examples.append(
                    data_distillation_feature_classifier.InputExample(
                        guid=guid,
                        text_a=text_a,
                        text_b=text_b,
                        label=input_labels,
                        label_probs=distillation_prob[cnt],
                        label_ratio=0.0,
                        distillation_ratio=1.0))
            cnt += 1
        assert cnt == len(distillation_prob)
        return examples
Example #2
0
    def _create_examples(self, lines, LABEL_SPLITTER="__label__"):
        re_pattern = u"({}{})".format(LABEL_SPLITTER, "\d+")

        examples = []
        for (i, line) in enumerate(lines):
            try:
                guid = i
                element_list = re.split(re_pattern, line)
                text_a = clean(element_list[-1])
                input_labels = clean(element_list[1]).split(LABEL_SPLITTER)[-1]

                text_a = tokenization.convert_to_unicode(text_a)
                input_labels = [
                    label.strip() for label in input_labels
                    if label.strip() in list(self.label2id.keys())
                ]

                examples.append(
                    data_distillation_feature_classifier.InputExample(
                        guid=guid,
                        text_a=text_a,
                        text_b=None,
                        label=input_labels,
                        label_probs=[1.0 / len(self.label2id)] *
                        len(self.label2id),
                        label_ratio=1.0,
                        distillation_ratio=0.0))
            except:
                print(line, i)
        return examples
Example #3
0
    def _create_supervised_distillation_examples(self,
                                                 lines,
                                                 distillation_dict_lst,
                                                 LABEL_SPLITTER="__label__"):
        re_pattern = u"({}{})".format(LABEL_SPLITTER, "\d+")
        label_pattern = "(?<={})(\d+)".format(LABEL_SPLITTER)

        examples = []
        cnt = 0
        for (i, line) in enumerate(lines):
            try:
                guid = i
                element_list = re.split(re_pattern, line)
                text_a = clean(element_list[-1])

                input_labels = []
                for l in re.finditer(label_pattern, line):
                    input_labels.append(l.group())
                text_a = tokenization.convert_to_unicode(text_a)
                input_labels = [
                    label.strip() for label in input_labels
                    if label.strip() in list(self.label2id.keys())
                ]

                if len(input_labels) == 1:
                    assert int(input_labels[0]
                               ) == distillation_dict_lst[cnt]["label_id"]

                examples.append(
                    data_distillation_feature_classifier.InputExample(
                        guid=guid,
                        text_a=text_a,
                        text_b=None,
                        label=input_labels,
                        label_probs=distillation_dict_lst[cnt]["prob"],
                        label_ratio=1.0,
                        distillation_ratio=1.0,
                        feature=distillation_dict_lst[cnt]["feature"]))
                cnt += 1
            except:
                print(line, i)
                continue

        assert cnt == len(distillation_dict_lst)
        return examples
Example #4
0
 def _create_examples(self, data, lang="zh"):
     examples = []
     for index in range(len(data)):
         content = data[index]
         guid = int(content["ID"])
         text_a = content["sentence1"]
         text_b = content["sentence2"]
         label = content["gold_label"]
         if isinstance(text_a, str) and isinstance(text_b, str):
             examples.append(
                 data_distillation_feature_classifier.InputExample(
                     guid=guid,
                     text_a=clean(text_a),
                     text_b=clean(text_b),
                     label=[label],
                     label_probs=[1.0 / len(self.label2id)] *
                     len(self.label2id),
                     label_ratio=1.0,
                     distillation_ratio=0.0))
     return examples