def prepare_dataset(self, sentences, max_seq_length, lower=None, train=True):
    """
    Prepare the dataset. Return a list of lists, one per sentence, containing:
        - the original token strings
        - segment ids
        - word-piece input ids
        - the input mask
        - tag (label) ids
    """
    data = []
    for s in sentences:
        if lower:
            string = [w[0].strip().lower() for w in s]
        else:
            string = [w[0].strip() for w in s]
        char_line = ' '.join(string)
        text = tokenization.convert_to_unicode(char_line)

        if train:
            tags = [w[-1] for w in s]
        else:
            tags = ['O' for _ in string]
        labels = ' '.join(tags)
        labels = tokenization.convert_to_unicode(labels)

        ids, mask, segment_ids, label_ids = self.convert_single_example(
            char_line=text,
            max_seq_length=max_seq_length,
            tokenizer=self.tokenizer,
            label_line=labels)
        data.append([string, segment_ids, ids, mask, label_ids])

    return data
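# Minimal usage sketch (hypothetical data and caller; assumes an instance
# `processor` exposing prepare_dataset above, with its tokenizer already loaded).
# Each sentence is a list of (word, tag) pairs, since the code reads w[0] for the
# token and w[-1] for the tag.
sentences = [
    [("Barack", "B-PER"), ("Obama", "I-PER"), ("visited", "O"), ("Paris", "B-LOC")],
]
train_data = processor.prepare_dataset(sentences, max_seq_length=128, lower=True, train=True)
# Each entry: [tokens, segment_ids, input_ids, input_mask, label_ids]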
def get_train_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'train.csv')
    reader = pd.read_csv(file_path, encoding='utf-8', error_bad_lines=False)
    # If the data is not already in random order, remember to shuffle it.
    # The full dataset is large, so only a subset is used for a quick run.
    reader = reader.head(50000)
    # Shuffling a temporary list (reader.values.tolist()) has no effect on
    # `reader`; shuffle the DataFrame itself instead.
    reader = reader.sample(frac=1).reset_index(drop=True)
    print("train length:", len(reader))
    examples = []
    for _, row in reader.iterrows():
        line = row[0]
        split_line = line.strip().split("\t")
        if len(split_line) != 4:
            continue
        guid = split_line[0]
        text_a = tokenization.convert_to_unicode(split_line[1])
        text_b = tokenization.convert_to_unicode(split_line[2])
        label = split_line[3]
        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
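# Hypothetical illustration of the row layout get_train_examples expects: each
# CSV row's first column is a single string with four tab-separated fields
# (guid, text_a, text_b, label). The same layout applies to get_test_examples
# further down.
sample_row = "pair-0001\thow do I repay the bill\thow to repay the bill\t1"
guid, text_a, text_b, label = sample_row.strip().split("\t")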
def _create_examples(self, lines, set_type):
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
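# Hypothetical illustration of the two-column rows this _create_examples
# variant expects: label in column 0, text in column 1.
sample_line = ["1", "this product works exactly as described"]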
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # Only the test set has a header
        if set_type == "test" and i == 0:
            continue
        guid = "%s-%s" % (set_type, i)
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(line[1])
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(line[3])
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
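# A sketch of how _create_examples is typically wired into a DataProcessor
# subclass; assumes the standard BERT run_classifier helpers (_read_tsv on the
# DataProcessor base class) are available here.
def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")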
def get_train_examples(self, data_dir):
    import random
    import json
    file_path = os.path.join(data_dir, 'data_1017.txt')
    with open(file_path, 'r', encoding='utf8') as f:
        data = f.readlines()
    random.shuffle(data)
    print("train length:", len(data))
    examples = []
    for i, row in enumerate(data):
        line = json.loads(row)
        guid = i
        text_a = tokenization.convert_to_unicode(line['question'])
        text_b = tokenization.convert_to_unicode(line['similar'])
        label = line['label']
        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
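# Hypothetical example of one line in data_1017.txt as this loader expects it:
# one JSON object per line with 'question', 'similar', and 'label' keys.
import json
sample_line = '{"question": "how do I reset my password", "similar": "steps to reset a password", "label": "1"}'
record = json.loads(sample_line)
assert set(record) == {"question", "similar", "label"}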
def get_test_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'test.csv')
    reader = pd.read_csv(file_path, encoding='utf-8', error_bad_lines=False)
    # The full dataset is large, so only a subset is used for a quick run;
    # keep it distinct from the dev-set portion.
    reader = reader.head(10000)
    examples = []
    for _, row in reader.iterrows():
        line = row[0]
        split_line = line.strip().split("\t")
        if len(split_line) != 4:
            continue
        guid = split_line[0]
        text_a = tokenization.convert_to_unicode(split_line[1])
        text_b = tokenization.convert_to_unicode(split_line[2])
        label = split_line[3]
        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples