def get_sentence_examples(self, questions):
    """Yield one InputExample per question pair, with a dummy label '0'.

    :param questions: iterable of 2-item sequences (text_a, text_b)
    :return: generator of InputExample with guids 'test-0', 'test-1', ...
    """
    for idx, pair in enumerate(questions):
        yield InputExample(
            guid='test-%d' % idx,
            text_a=tokenization.convert_to_unicode(str(pair[0])),
            text_b=tokenization.convert_to_unicode(str(pair[1])),
            # Label is unused at inference time; a constant placeholder.
            label=str(0))
def _to_example(sentences):
    """Convert raw sentence strings into InputExample objects.

    Lines of the form "A ||| B" become two-sentence examples; any other
    non-empty line becomes a single-sentence example. Falsy (empty) lines
    are skipped and do not consume a unique_id.

    :param sentences: list of strings
    :return: generator of InputExample
    """
    # NOTE: the original placed `import re` before the docstring, which
    # demoted the docstring to a plain statement; keep the local import
    # but restore the docstring to its proper position.
    import re

    # Compile once instead of re-matching with a raw pattern per line.
    pair_pattern = re.compile(r"^(.*) \|\|\| (.*)$")
    unique_id = 0
    for ss in sentences:
        line = tokenization.convert_to_unicode(ss)
        if not line:
            continue
        line = line.strip()
        m = pair_pattern.match(line)
        if m is None:
            text_a = line
            text_b = None
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
        unique_id += 1
def get_dev_examples(self, data_dir):
    """Read 'dev.txt' (tab-separated, no header) and build dev InputExamples.

    Columns used per row: col 1 = text_a, col 2 = text_b, col 3 = label.

    :param data_dir: directory containing 'dev.txt'
    :return: list of InputExample
    """
    file_path = os.path.join(data_dir, 'dev.txt')
    dev_df = pd.read_csv(file_path, encoding='utf-8', sep='\t', header=None)
    dev_data = []
    for index, dev in enumerate(dev_df.values):
        # Fix: use a 'dev-' guid prefix (was 'test-') — these are dev-set
        # examples, matching the sibling dev loader's convention.
        guid = 'dev-%d' % index
        text_a = tokenization.convert_to_unicode(str(dev[1]))
        text_b = tokenization.convert_to_unicode(str(dev[2]))
        label = str(dev[3])
        dev_data.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return dev_data
def get_dev_examples(self, data_dir):
    """Read 'dev.txt' line by line and build dev InputExamples.

    Each line is expected to be '<text_a>\\t<text_b>\\t<label>'.

    :param data_dir: directory containing 'dev.txt'
    :return: list of InputExample with guids 'dev-0', 'dev-1', ...
    """
    file_path = os.path.join(data_dir, 'dev.txt')
    dev_data = []
    with open(file_path, 'r', encoding='utf-8') as dev_f:
        # Iterate the file directly with enumerate instead of
        # readlines() + range(len(...)) index arithmetic.
        for i, raw_line in enumerate(dev_f):
            row_data = raw_line.strip().split('\t')
            guid = 'dev-%d' % i
            text_a = tokenization.convert_to_unicode(row_data[0])
            text_b = tokenization.convert_to_unicode(row_data[1])
            label = str(row_data[2])
            dev_data.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b,
                             label=label))
    return dev_data
def get_train_examples(self, data_dir):
    """Read 'train.txt' line by line and build training InputExamples.

    Each line is expected to be '<text_a>\\t<text_b>\\t<label>'.

    :param data_dir: directory containing 'train.txt'
    :return: list of InputExample with guids 'train-0', 'train-1', ...
    """
    file_path = os.path.join(data_dir, 'train.txt')
    train_data = []
    with open(file_path, 'r', encoding='utf-8') as train_f:
        # Iterate the file directly with enumerate instead of
        # readlines() + range(len(...)); dead commented-out pandas
        # loader removed.
        for i, raw_line in enumerate(train_f):
            row_data = raw_line.strip().split('\t')
            guid = 'train-%d' % i
            text_a = tokenization.convert_to_unicode(row_data[0])
            text_b = tokenization.convert_to_unicode(row_data[1])
            label = str(row_data[2])
            train_data.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b,
                             label=label))
    return train_data
def get_test_examples(self, data_dir):
    """Load test examples from 'sentiment_test.txt', preserving file order.

    Each valid line is '<label>\\t<text>'. The test set is deliberately
    NOT shuffled so predictions can be compared against the file
    row-by-row (per the original author's note).

    :param data_dir: directory containing 'sentiment_test.txt'
    :return: list of InputExample (text_b is always None)
    """
    file_path = os.path.join(data_dir, 'sentiment_test.txt')
    with open(file_path, 'r', encoding="utf-8") as f:
        reader = f.readlines()
    # No random.shuffle here on purpose — keep file order for comparison.
    examples = []
    for index, line in enumerate(reader):
        guid = 'test-%d' % index
        split_line = line.strip().split("\t")
        # Fix: guard against malformed rows (no tab separator), matching
        # the len(split_line) == 2 check used by get_train_examples;
        # previously such a row raised IndexError.
        if len(split_line) != 2:
            continue
        text_a = tokenization.convert_to_unicode(split_line[1])
        text_b = None
        label = split_line[0]
        examples.append(InputExample(guid=guid, text_a=text_a,
                                     text_b=text_b, label=label))
    return examples
def get_train_examples(self, data_dir):
    """Load shuffled training examples from 'sentiment_train.txt'.

    Each valid line is '<label>\\t<text>'; lines that do not split into
    exactly two tab-separated fields are skipped. Shuffling uses a fixed
    seed so the order is reproducible. Labels are also recorded on
    self.labels as a side effect.

    :param data_dir: directory containing 'sentiment_train.txt'
    :return: list of InputExample (text_b is always None)
    """
    file_path = os.path.join(data_dir, 'sentiment_train.txt')
    with open(file_path, 'r', encoding="utf-8") as f:
        reader = f.readlines()
    random.seed(0)
    random.shuffle(reader)  # training data must be shuffled
    examples, self.labels = [], []
    for index, line in enumerate(reader):
        parts = line.strip().split("\t")
        if len(parts) != 2:
            # Malformed row: skip it (the guid index still advances).
            continue
        label, raw_text = parts[0], parts[1]
        examples.append(InputExample(
            guid='train-%d' % index,
            text_a=tokenization.convert_to_unicode(raw_text),
            text_b=None,
            label=label))
        self.labels.append(label)
    return examples