def _create_examples(self, lines, LABEL_SPLITTER="__label__"):
    """Parse fastText-style lines ("__label__<digits> ... text") into InputExamples.

    Args:
        lines: iterable of raw text lines; each is expected to start with
            one or more "__label__<digits>" tags followed by the text.
        LABEL_SPLITTER: prefix that marks every label tag.

    Returns:
        A list of ``data_feature_classifier.InputExample`` with
        ``text_b=None`` and ``guid`` set to the line index.
    """
    # Compile the tag pattern once instead of re-parsing it on every line.
    label_re = re.compile(r"({}{})".format(LABEL_SPLITTER, r"\d+"))
    examples = []
    for (i, line) in enumerate(lines):
        try:
            guid = i
            element_list = label_re.split(line)
            # The last split element carries the text; drop its first token.
            text_a = clean("".join(element_list[-1].split()[1:]))
            input_labels = clean(element_list[1]).split(LABEL_SPLITTER)[-1]
            text_a = tokenization.convert_to_unicode(text_a)
            # NOTE(review): `input_labels` is a string here, so this loop
            # iterates it character by character; it only keeps labels when
            # label2id keys are single characters (e.g. digit labels) —
            # confirm against the label2id mapping.
            input_labels = [
                label.strip() for label in input_labels
                if label.strip() in self.label2id
            ]
            examples.append(
                data_feature_classifier.InputExample(
                    guid=guid, text_a=text_a, text_b=None,
                    label=input_labels))
        except Exception:
            # Was a bare `except:` — that also swallowed SystemExit and
            # KeyboardInterrupt. Keep the original best-effort fallback of
            # reporting the offending line and continuing.
            print(line, i)
    return examples
def _create_test_examples(self, data, lang="zh"):
    """Build unlabeled InputExamples from indexed sentence-pair records.

    Each record must expose "id", "sentence1" and "sentence2"; records
    where either sentence is not a string are skipped. Every example gets
    the placeholder label ["0"].
    """
    examples = []
    for row_idx in range(data.shape[0]):
        record = data[row_idx]
        guid = int(record["id"])
        first = record["sentence1"]
        second = record["sentence2"]
        # Guard clause: drop rows with missing / non-string sentences.
        if not (isinstance(first, str) and isinstance(second, str)):
            continue
        examples.append(data_feature_classifier.InputExample(
            guid=guid,
            text_a=clean(first),
            text_b=clean(second),
            label=["0"],
        ))
    return examples
def _create_examples(self, data, lang="zh"):
    """Build labeled InputExamples from sentence-pair records.

    Each record must expose "ID", "sentence1", "sentence2" and
    "gold_label"; records where either sentence is not a string are
    skipped.
    """
    examples = []
    for pos in range(len(data)):
        record = data[pos]
        guid = int(record["ID"])
        sent_a = record["sentence1"]
        sent_b = record["sentence2"]
        gold = record["gold_label"]
        # Guard clause: drop rows with missing / non-string sentences.
        if not (isinstance(sent_a, str) and isinstance(sent_b, str)):
            continue
        examples.append(data_feature_classifier.InputExample(
            guid=guid,
            text_a=clean(sent_a),
            text_b=clean(sent_b),
            label=[gold],
        ))
    return examples
def _create_examples(self, lines, LABEL_SPLITTER="__label__"):
    """Parse lines of the form "<text>__label__a__label__b..." into InputExamples.

    The segment before the first LABEL_SPLITTER is the text; the remaining
    segments are candidate labels, kept only if present in self.label2id.
    """
    examples = []
    for guid, raw_line in enumerate(lines):
        pieces = raw_line.split(LABEL_SPLITTER)
        # Same transform order as before: unicode conversion, then clean().
        body = clean(tokenization.convert_to_unicode(pieces[0].strip()))
        kept = []
        for candidate in pieces[1:]:
            candidate = candidate.strip()
            if candidate in self.label2id:
                kept.append(candidate)
        examples.append(data_feature_classifier.InputExample(
            guid=guid,
            text_a=body,
            text_b=None,
            label=kept,
        ))
    return examples
def _create_test_examples(self, df, lang="zh"):
    """Build unlabeled InputExamples from a title-pair dataframe.

    Args:
        df: dataframe with an "id" column plus "title1_zh"/"title2_zh"
            (or "title1_en"/"title2_en"), indexed 0..shape[0]-1.
        lang: which title columns to read, "zh" or "en".

    Returns:
        A list of ``data_feature_classifier.InputExample`` labeled
        ["unrelated"]; rows whose titles are not both strings are skipped.

    Raises:
        ValueError: if ``lang`` is neither "zh" nor "en" (previously an
        unsupported lang left text_a/text_b unbound and crashed with a
        NameError inside the loop).
    """
    # Resolve the column names once, outside the loop, and fail fast on
    # an unsupported language instead of raising NameError mid-loop.
    if lang == "zh":
        col_a, col_b = "title1_zh", "title2_zh"
    elif lang == "en":
        col_a, col_b = "title1_en", "title2_en"
    else:
        raise ValueError(
            "unsupported lang: {!r} (expected 'zh' or 'en')".format(lang))
    examples = []
    for index in range(df.shape[0]):
        content = df.loc[index]
        guid = int(content["id"])
        text_a = content[col_a]
        text_b = content[col_b]
        if isinstance(text_a, str) and isinstance(text_b, str):
            examples.append(
                data_feature_classifier.InputExample(
                    guid=guid,
                    text_a=clean(text_a),
                    text_b=clean(text_b),
                    label=["unrelated"]))
    return examples
def _create_examples(self, frequent_phrases):
    """Wrap each frequent-phrase entry as an InputExample with dummy label "0".

    Only the first element of each entry is used as the text; the "0"
    label is kept only when it exists in self.label2id.
    """
    examples = []
    for idx, entry in enumerate(frequent_phrases):
        # Same transform order as before: clean(), then unicode conversion.
        phrase = tokenization.convert_to_unicode(clean(entry[0]))
        default_labels = ["0"]
        kept = [lab.strip() for lab in default_labels
                if lab.strip() in self.label2id]
        examples.append(data_feature_classifier.InputExample(
            guid=idx,
            text_a=phrase,
            text_b=None,
            label=kept,
        ))
    return examples