def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
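# Nearly all of the processor snippets in this section assume the
# `InputExample` container and the `tokenization` module from Google's BERT
# repo. A minimal sketch of the assumed class (the real one lives in
# run_classifier.py), for reference:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid      # unique id for the example
        self.text_a = text_a  # untokenized text of the first sequence
        self.text_b = text_b  # optional second sequence (sentence pairs)
        self.label = label    # label string; typically None/placeholder for test data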
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) text_a = tokenization.convert_to_unicode(line[8]) text_b = tokenization.convert_to_unicode(line[9]) label = tokenization.convert_to_unicode(line[-1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def get_dev_examples(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "dev-%d" % (i) language = tokenization.convert_to_unicode(line[0]) if language != tokenization.convert_to_unicode(self.language): continue text_a = tokenization.convert_to_unicode(line[6]) text_b = tokenization.convert_to_unicode(line[7]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if set_type == "test" and i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "test": text_a = tokenization.convert_to_unicode(line[1]) label = "0" else: text_a = tokenization.convert_to_unicode(line[3]) label = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def get_train_examples(self, data_dir): """See base class.""" lines = self._read_tsv( os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" % self.language)) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "train-%d" % (i) text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[2]) if label == tokenization.convert_to_unicode("contradictory"): label = tokenization.convert_to_unicode("contradiction") examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] labels = [] labels_test = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) # tokenization is based on vocab file text_a = tokenization.convert_to_unicode(line[0]) label = tokenization.convert_to_unicode(line[1]) labels.append(label) if set_type == "test": label = "0" labels_test.append(label) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples, labels, labels_test
def create_training_instances(input_files, tokenizer, max_seq_length, dupe_factor, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng): """Create `TrainingInstance`s from raw text.""" all_documents = [[]] # Input file format: # (1) One sentence per line. These should ideally be actual sentences, not # entire paragraphs or arbitrary spans of text. (Because we use the # sentence boundaries for the "next sentence prediction" task). # (2) Blank lines between documents. Document boundaries are needed so # that the "next sentence prediction" task doesn't span between documents. for input_file in input_files: with tf.gfile.GFile(input_file, "r") as reader: # 类似python里面的open操作 while True: line = tokenization.convert_to_unicode(reader.readline()) if not line: break line = line.strip() # Empty lines are used as document delimiters(文档分隔符) if not line: all_documents.append([]) tokens = tokenizer.tokenize(line) # 相当于把句子按照vocab表中的词进行划分表示 if tokens: all_documents[-1].append(tokens) # Remove empty documents all_documents = [x for x in all_documents if x] rng.shuffle(all_documents) # 把之前的句子[[['the','fountain','of','classic',...]],[['this','text','is',...]]]转换为 # 标签['[PAD]','[unused1]','[unused2],...'] vocab_words = list(tokenizer.vocab.keys()) # 这是label词表list['[PAD]','[unused1]','[unused2],...'] instances = [] for _ in range(dupe_factor): # dupe_factor是重复因子 for document_index in range(len(all_documents)): instances.extend( create_instances_from_document( all_documents, document_index, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) rng.shuffle(instances) return instances
def create_training_instances(input_files, tokenizer, max_seq_length, dupe_factor, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng): """Create `TrainingInstance`s from raw text.""" all_documents = [[]] # Input file format: # (1) One sentence per line. These should ideally be actual sentences, not # entire paragraphs or arbitrary spans of text. (Because we use the # sentence boundaries for the "next sentence prediction" task). # (2) Blank lines between documents. Document boundaries are needed so # that the "next sentence prediction" task doesn't span between documents. for input_file in input_files: with tf.gfile.GFile(input_file, "r") as reader: while True: line = tokenization.convert_to_unicode(reader.readline()) if not line: break line = line.strip() # Empty lines are used as document delimiters if not line: all_documents.append([]) tokens = tokenizer.tokenize(line) if tokens: all_documents[-1].append(tokens) # Remove empty documents all_documents = [x for x in all_documents if x] rng.shuffle(all_documents) vocab_words = list(tokenizer.vocab.keys()) instances = [] for _ in range(dupe_factor): for document_index in range(len(all_documents)): instances.extend( create_instances_from_document(all_documents, document_index, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) rng.shuffle(instances) return instances
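# A minimal sketch of how create_training_instances is typically driven
# (mirroring BERT's create_pretraining_data.py); the vocab path, corpus file,
# and hyperparameter values below are placeholders:
import random

tokenizer = tokenization.FullTokenizer(
    vocab_file="vocab.txt", do_lower_case=True)  # hypothetical vocab path
rng = random.Random(12345)                       # fixed seed for reproducibility
instances = create_training_instances(
    input_files=["corpus.txt"],  # one sentence per line, blank line between docs
    tokenizer=tokenizer,
    max_seq_length=128,
    dupe_factor=10,         # each document is reused this many times with different masks
    short_seq_prob=0.1,     # probability of producing shorter-than-max sequences
    masked_lm_prob=0.15,
    max_predictions_per_seq=20,
    rng=rng)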
def write_to_tf_record(writer, tokenizer, query, docs, labels,
                       ids_file=None, query_id=None, doc_ids=None):
    # query = tokenization.convert_to_unicode(query)
    query_token_ids = tokenization.convert_to_bert_input(
        text=query, max_seq_length=FLAGS.max_query_length,
        tokenizer=tokenizer, add_cls=True)

    query_token_ids_tf = tf.train.Feature(
        int64_list=tf.train.Int64List(value=query_token_ids))

    for i, (doc_text, label) in enumerate(zip(docs, labels)):
        doc_token_id = tokenization.convert_to_bert_input(
            text=tokenization.convert_to_unicode(doc_text),
            max_seq_length=FLAGS.max_seq_length - len(query_token_ids),
            tokenizer=tokenizer, add_cls=False)

        doc_ids_tf = tf.train.Feature(
            int64_list=tf.train.Int64List(value=doc_token_id))

        labels_tf = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[label]))

        features = tf.train.Features(feature={
            'query_ids': query_token_ids_tf,
            'doc_ids': doc_ids_tf,
            'label': labels_tf,
        })
        example = tf.train.Example(features=features)
        writer.write(example.SerializeToString())

        if ids_file:
            ids_file.write('\t'.join([query_id, doc_ids[i]]) + '\n')
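# A hedged usage sketch for write_to_tf_record. Note that
# `convert_to_bert_input` is a helper defined in this repo's own tokenization
# module, not stock BERT; the file names, query, and documents below are
# illustrative only:
writer = tf.python_io.TFRecordWriter("dataset.tf")  # TF 1.x record writer
with open("query_doc_ids.txt", "w") as ids_file:    # hypothetical ids file
    write_to_tf_record(
        writer=writer,
        tokenizer=tokenizer,
        query="what causes tides",
        docs=["Tides are caused by ...", "An unrelated passage ..."],
        labels=[1, 0],  # 1 = relevant, 0 = not relevant
        ids_file=ids_file,
        query_id="q1",
        doc_ids=["d100", "d101"])
writer.close()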
def get_train_examples(self, data_dir): examples = [] train_files = ["mytrain.term_recall.json"] for file_name in train_files: train_file = open(os.path.join(data_dir, file_name)) for i, line in enumerate(train_file): q_json_dict = json.loads(line) qid = q_json_dict["qid"] q_text = tokenization.convert_to_unicode(q_json_dict["query"]) term_recall_dict = q_json_dict["term_recall"][self.recall_field] guid = "train-%s" % qid examples.append( InputExample(guid=guid, text=q_text, term_recall_dict=term_recall_dict) ) train_file.close() random.shuffle(examples) return examples
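# The expected shape of one line of mytrain.term_recall.json, reconstructed
# from the field accesses above; the recall_field name ("title") and the
# recall weights are hypothetical:
import json

line = json.dumps({
    "qid": "101",
    "query": "hotels in new york",
    "term_recall": {"title": {"hotels": 0.8, "york": 0.5}},
})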
def get_test_examples(self, data_dir): """Gets a collection of `InputExample`s for prediction.""" lines = self._read_tsv(os.path.join(data_dir, "weibo_senti_10k.csv")) examples = [] lines = lines[10040:10230] for (i, line) in enumerate(lines): if i == 0: continue guid = "test-%d" % (i) strs = line[0].split(",") text_a = tokenization.convert_to_unicode(strs[1]) label = "0" examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def get_test_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'test.tsv')
    with open(file_path, 'r', encoding="utf-8") as f:
        reader = f.readlines()
    examples = []
    for index, line in enumerate(reader):
        guid = 'test-%d' % index
        split_line = line.strip().split("\t")
        text_a = tokenization.convert_to_unicode(split_line[1])
        text_b = None
        label = split_line[0]
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def read_array_examples(array_example):
    examples = []
    unique_id = 0
    for l in array_example:
        line = tokenization.convert_to_unicode(l)
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
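# The `text_a ||| text_b` convention above comes from BERT's
# extract_features.py: a line containing ' ||| ' is treated as a sentence
# pair, anything else as a single sentence. A small illustration with
# made-up inputs:
pairs = read_array_examples([
    "the dog is hairy .",                       # single sequence -> text_b is None
    "is this jacksonville ? ||| no it is not",  # pair -> text_a and text_b
])
assert pairs[0].text_b is None
assert pairs[1].text_a == "is this jacksonville ?"
assert pairs[1].text_b == "no it is not"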
def get_train_examples(self, data_dir): # 读取训练数据路径 print("hi, commentProcessor") file_path = os.path.join(data_dir, 'comments_train.csv') # 使用 Pandas 读取数据 df = pd.read_csv(file_path) # 将训练数据切分为 80% 训练集和 20% 验证集 # df_train, self.df_dev = train_test_split(df, test_size=0.2) df_train = df examples = [] # 按 BERT 推荐格式处理数据 for index, row in df_train.iterrows(): guid = 'train-%d' % row[0] # 索引 text_a = tokenization.convert_to_unicode(str(row[1])) # 文本 label = row[2] # 文本标签 print(guid, "text_a: ", text_a, "label: ", label) examples.append(InputExample(guid=guid, text_a=text_a, label=label)) return examples
def get_dev_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'test.csv')
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = f.readlines()
    examples = []
    labels_dev = []
    for index, line in enumerate(reader):
        guid = 'dev-%d' % index
        split_line = line.strip().split(',')
        text_a = tokenization.convert_to_unicode(split_line[1])
        # text_b = tokenization.convert_to_unicode(split_line[2])
        label = split_line[0]
        labels_dev.append(label)
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples, labels_dev
def read_examples(text): """return example format from string""" examples = [] unique_id = 0 text = text.replace('\n', ' ') #remove line breaks line = tokenization.convert_to_unicode(text) line = line.strip() text_a = None text_b = None m = re.match(r"^(.*) \|\|\| (.*)$", line) if m is None: text_a = line else: text_a = m.group(1) text_b = m.group(2) examples.append( InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) unique_id += 1 return examples
def get_train_examples(self, data_dir):
    examples = []
    train_files = [data_dir]
    for file_name in train_files:
        train_file = open(file_name)
        for i, line in enumerate(train_file):
            json_dict = json.loads(line)
            docid = json_dict["doc"]["id"]
            doc_text = tokenization.convert_to_unicode(json_dict["doc"]["title"])
            term_recall_dict = json_dict["term_recall"]
            guid = "train-%s" % docid
            examples.append(
                InputExample(guid=guid, text=doc_text,
                             term_recall_dict=term_recall_dict))
        train_file.close()
    random.shuffle(examples)
    return examples
def create_training_instances(input_files, tokenizer, max_seq_length, dupe_factor, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng): """Create `TrainingInstance`s from raw text.""" all_documents = [[]] # Input file format: # (1) One sentence per line. These should ideally be actual sentences, not # entire paragraphs or arbitrary spans of text. (Because we use the # sentence boundaries for the "next sentence prediction" task). # (2) Blank lines between documents. Document boundaries are needed so # that the "next sentence prediction" task doesn't span between documents. for input_file in input_files: with tf.gfile.GFile(input_file, "r") as reader: while True: line = tokenization.convert_to_unicode(reader.readline()) if not line: break line = line.strip() # Empty lines are used as document delimiters if not line: all_documents.append([]) tokens = tokenizer.tokenize(line) if tokens: all_documents[-1].append(tokens) # Remove empty documents all_documents = [x for x in all_documents if x] rng.shuffle(all_documents) vocab_words = list(tokenizer.vocab.keys()) instances = [] for _ in range(dupe_factor): for document_index in range(len(all_documents)): instances.extend( create_instances_from_document( all_documents, document_index, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) rng.shuffle(instances) return instances
def _create_examples(self, lines, set_type="test"): """Creates examples for the training and dev sets. :param lines: all input lines from input file :type lines: list :return: a list of InputExample element :rtype: list """ examples = [] for (i, line) in enumerate(lines): line = line.split("\t") guid = "%s-%s" % (set_type, i) label = eval(line[1]) textA = tokenization.convert_to_unicode(line[0]) examples.append( InputExample(guid=guid, textA=textA, textB=None, label=label)) # examples 包含了所有数据的列表, 其中每个数据类型为 InputExample # 对于训练数据进行随机打乱 if set_type == "train": random.shuffle(examples) return examples
def convert_single_example(query):
    global max_seq_length
    text = tokenization.convert_to_unicode(query)
    raw_tokens = tokenizer.tokenize(text)
    tokens = raw_tokens[0:(max_seq_length - 2)]
    tokens.insert(0, "[CLS]")  # mark the start of the sentence with [CLS]
    tokens.append("[SEP]")     # append the [SEP] marker at the end
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * max_seq_length
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
    return input_ids, input_mask, segment_ids, raw_tokens
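# A quick sanity check for convert_single_example: every returned id list is
# padded to max_seq_length, so the outputs can be fed straight into a BERT
# input placeholder. `max_seq_length` and `tokenizer` are the module-level
# globals the function relies on; the query string is made up:
input_ids, input_mask, segment_ids, raw_tokens = convert_single_example(
    "where can i buy fresh coffee beans")
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
# Real tokens plus [CLS] and [SEP] are the only positions with mask == 1.
assert sum(input_mask) == len(raw_tokens[:max_seq_length - 2]) + 2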
def read_examples(str_io): """Read a list of `InputExample`s from an input file.""" examples = [] unique_id = 0 while True: line = tokenization.convert_to_unicode(str_io.readline()) if not line: break line = line.strip() text_a = None text_b = None m = re.match(r"^(.*) \|\|\| (.*)$", line) if m is None: text_a = line else: text_a = m.group(1) text_b = m.group(2) examples.append(InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) unique_id += 1 return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, i) if set_type == "train": text_a = tokenization.convert_to_unicode(line[1]) text_b = tokenization.convert_to_unicode(line[2]) label = tokenization.convert_to_unicode(line[3]) if set_type == "dev": text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[3]) if set_type == "test": label = "NOT ENOUGH INFO" text_a = tokenization.convert_to_unicode(line[0]) text_b = tokenization.convert_to_unicode(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_test_examples(self, dataset_file, set_type):
    mag_dataset = dataset_file
    examples = []
    # Skip the first 120000 lines of the dataset.
    for _ in range(120000):
        next(mag_dataset)
    index = 0
    MAX_LINE_COUNT = 120000
    for line in mag_dataset:
        guid = "%s-%s" % (set_type, index)
        pos = line.find('\t')
        text_a = tokenization.convert_to_unicode(line[pos + 1:])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=None))
        index += 1
        if index >= MAX_LINE_COUNT:
            break
    mag_dataset.close()
    return examples
def _create_samples(self, set_type): examples = [] if set_type == "train": files = sea.DataFiles._train_file_names elif set_type == "dev": files = sea.DataFiles._validation_file_names else: files = sea.DataFiles._test_file_names for file in files: file = os.path.join("../", file) lines = pd.read_csv(file, delimiter=",") for i in range(len(lines)): offset = 0 max_length = 512 doc_stride = FLAGS.doc_stride sentence = lines.iloc[i, 1].strip("\n").strip("\"") sentence = tokenization.convert_to_unicode(sentence) for _ in range(6): text_a = sentence[offset:offset + max_length] text_a = " ".join(text_a) guid = "{}-{}".format(i, set_type) label = [] for j in range(2, 2 + self._labels_num, 1): code = lines.iloc[i, j] code = code + 2 label.append(code) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) offset += max_length offset = offset - doc_stride return examples
def get_test_examples(self, data_dir):
    test_files = [data_dir]
    examples = []
    for file_name in test_files:
        test_file = open(file_name)
        for i, line in enumerate(test_file):
            jdict = json.loads(line)
            docid = jdict["id"]
            doc_text = jdict["content"]
            doc_text = tokenization.convert_to_unicode(doc_text)
            term_recall_dict = {}
            if not doc_text.strip():
                doc_text = '.'
            guid = "test-%s" % docid
            examples.append(
                InputExample(guid=guid, text=doc_text,
                             term_recall_dict=term_recall_dict))
        test_file.close()
    return examples
def read_tokenized_examples(lst_strs): """ :param lst_strs: [[]] 每个子元素为一个序列,子元素的每一个元素为这个序列的一个index :return: """ unique_id = 0 # 对lst_list中的数据进行转化为ID lst_strs = [[tokenization.convert_to_unicode(w) for w in s] for s in lst_strs] for ss in lst_strs: text_a = ss text_b = None try: # 这里使用|||对输入的句子进行切分如果存在这个符号,表示输入的是两个句子,即text_a 和text_b, 否则index出错,只会存在test_a j = ss.index('|||') text_a = ss[:j] text_b = ss[(j + 1):] except ValueError: pass yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) unique_id += 1
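# read_tokenized_examples is a generator over pre-tokenized input, where the
# '|||' separator is itself a token. A small illustration with made-up input:
examples = list(read_tokenized_examples([
    ['the', 'dog', 'is', 'hairy'],                                   # single sequence
    ['is', 'this', 'jacksonville', '|||', 'no', 'it', 'is', 'not'],  # pair
]))
assert examples[0].text_b is None
assert examples[1].text_a == ['is', 'this', 'jacksonville']
assert examples[1].text_b == ['no', 'it', 'is', 'not']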
def get_train_examples(self, data_dir): examples = [] train_files = ["train.fold0.docterm_recall", "train.fold1.docterm_recall", "train.fold2.docterm_recall", "train.fold3.docterm_recall"] for file_name in train_files: train_file = open(os.path.join(data_dir, file_name)) for i, line in enumerate(train_file): json_dict = json.loads(line) docid = json_dict["doc"]["id"] doc_text = tokenization.convert_to_unicode(json_dict["doc"]["title"]) term_recall_dict = json_dict["term_recall"] if not term_recall_dict or not doc_text.strip(): continue guid = "train-%s" % docid examples.append( InputExample(guid=guid, text=doc_text, term_recall_dict=term_recall_dict) ) train_file.close() random.shuffle(examples) return examples
def get_dev_examples_(self, data_dir): """See base class.""" lines = self._read_tsv(os.path.join(data_dir, "test.csv")) examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "dev-%s" % (i) line_ = line[0].split(',') #if len(line_) != 3: # bad sample num=150000 # continue text_a = tokenization.convert_to_unicode(line_[1]) #label = tokenization.convert_to_unicode(line_[2]) label = '0' examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def convert_lines_to_examples(lines): """Read a list of `InputExample`s from an input file.""" examples = [] unique_id = 0 for line in lines: line = tokenization.convert_to_unicode(line) if not line: continue line = line.strip() text_a = None text_b = None m = re.match(r"^(.*) \|\|\| (.*)$", line) if m is None: text_a = line else: text_a = m.group(1) text_b = m.group(2) examples.append( InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) unique_id += 1 return examples
def _create_examples(self, lines, set_type): """Creates examples for the training sets.""" examples = [] for (i, line) in enumerate(lines): #if i == 0: # continue guid = "%s-%s" % (set_type, i) text_a = tokenization.convert_to_unicode(line[0]) text_b = None if set_type == "test": label = "pos" else: label = line[1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def get_train_examples(self, data_dir):
    # Read the training set.
    file_path = os.path.join(data_dir, 'train.csv')
    df = pd.read_csv(file_path)
    # Load the held-out test set.
    file_path_test = os.path.join(data_dir, 'test.csv')
    df_test = pd.read_csv(file_path_test)
    df_train = df
    self.df_test = df_test
    # Split off part of the training set as a dev set.
    df_train, self.df_dev = train_test_split(df_train, test_size=0.2)
    examples = []
    for index, row in df_train.iterrows():
        guid = 'train-%d' % index  # a unique guid per example
        text_a = tokenization.convert_to_unicode(str(row[0]))  # title1_zh
        # text_b = tokenization.convert_to_unicode(str(row[1]))  # title2_zh
        label = row[1]  # label
        examples.append(InputExample(guid=guid, text_a=text_a, label=label))
    return examples
def _create_examples(self, lines): """See base class.""" examples = [] for (i, line) in enumerate(lines): guid = "%s" % (i) if 'id' not in line else line['id'] text_a = tokenization.convert_to_unicode(line['text']) label = ['O'] * len(text_a) if 'label' in line: for l, words in line['label'].items(): for word, indices in words.items(): for index in indices: if index[0] == index[1]: label[index[0]] = 'S-' + l else: label[index[0]] = 'B-' + l label[index[1]] = 'E-' + l for i in range(index[0] + 1, index[1]): label[i] = 'M-' + l examples.append(InputExample(guid=guid, text_a=text_a, label=label)) return examples
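# The labeling scheme above is character-level S/B/M/E tagging (Single, Begin,
# Middle, End), driven by inclusive [start, end] index pairs. A worked example
# with made-up data in the same input format:
line = {
    "text": "张三住在北京",
    "label": {"name": {"张三": [[0, 1]]}, "address": {"北京": [[4, 5]]}},
}
# After _create_examples processes this line, the per-character labels are:
# ['B-name', 'E-name', 'O', 'O', 'B-address', 'E-address']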
def read_examples(input_file): """Read a list of `InputExample`s from an input file.""" examples = [] unique_id = 0 with tf.gfile.GFile(input_file, "r") as reader: while True: line = tokenization.convert_to_unicode(reader.readline()) if not line: break line = line.strip() text_a = None text_b = None m = re.match(r"^(.*) \|\|\| (.*)$", line) if m is None: text_a = line else: text_a = m.group(1) text_b = m.group(2) examples.append( InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) unique_id += 1 return examples
def get_dev_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'cnews.val.txt')
    with open(file_path, 'r', encoding="utf-8") as f:
        reader = f.readlines()
    random.shuffle(reader)  # note: the lines must be shuffled
    reader = reader[0:200]
    examples = []
    labels = []
    for index, line in enumerate(reader):
        guid = 'dev-%d' % index
        split_line = line.strip().split("\t")
        text_a = tokenization.convert_to_unicode(split_line[1])
        text_b = None
        label = split_line[0]
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        labels.append(label)
    return examples, labels
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): # Only the test set has a header if i == 0: continue guid = "%s-%s" % (set_type, i) # title = line[1] text_a = tokenization.convert_to_unicode(line[2]) # text_a = tokenization.convert_to_unicode(line[0]) # sdp = line[3] if set_type == "test": label = "false" else: label = line[4] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" def process_text(s): for token in retain: s = s.replace(token, sep_replace[token][FLAGS.entity_sep]) return s examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, i) if type(line[0]) != str: text_a = ' ' else: text_a = tokenization.convert_to_unicode(process_text(line[0])) if set_type == "test": label = 0 else: label = int(line[1]) examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples
def get_dev_examples(self, data_dir): dev_files = ["{}.json".format(self.dev_fold)] examples = [] data_dirs = data_dir.split(',') for file_name in dev_files: for data_dir in data_dirs: dev_file = open(os.path.join(data_dir, file_name)) for i, line in enumerate(dev_file): q_json_dict = json.loads(line) qid = q_json_dict["qid"] q_text = tokenization.convert_to_unicode(q_json_dict["query"]) for field in self.recall_fields: if field not in q_json_dict["term_recall"]: continue term_recall_dict = q_json_dict["term_recall"][field] guid = "dev-%s" % qid examples.append( InputExample(guid=guid, text=q_text, term_recall_dict=term_recall_dict) ) dev_file.close() return examples
def read_examples(self, sentence_list): """Read a list of `InputExample`s from an input file.""" examples = [] unique_id = 0 for line in sentence_list: line = tokenization.convert_to_unicode(line) line = line.strip() text_a = None text_b = None m = re.match(r"^(.*) \|\|\| (.*)$", line) if m is None: text_a = line else: text_a = m.group(1) text_b = m.group(2) examples.append( self.InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) unique_id += 1 return examples
def __call__(self, line): """Perform transformation for sequence pairs or single sequences. The transformation is processed in the following steps: - tokenize the input sequences - insert [CLS], [SEP] as necessary - generate type ids to indicate whether a token belongs to the first sequence or the second sequence. - generate valid length For sequence pairs, the input is a tuple of 3 strings: text_a, text_b and label. Inputs: text_a: 'is this jacksonville ?' text_b: 'no it is not' label: '0' Tokenization: text_a: 'is this jack ##son ##ville ?' text_b: 'no it is not .' Processed: tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]' type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 valid_length: 14 label: 0 For single sequences, the input is a tuple of 2 strings: text_a and label. Inputs: text_a: 'the dog is hairy .' label: '1' Tokenization: text_a: 'the dog is hairy .' Processed: text_a: '[CLS] the dog is hairy . [SEP]' type_ids: 0 0 0 0 0 0 0 valid_length: 7 label: 1 Parameters ---------- line: tuple of str Input strings. For sequence pairs, the input is a tuple of 3 strings: (text_a, text_b, label). For single sequences, the input is a tuple of 2 strings: (text_a, label). Returns ------- np.array: input token ids in 'int32', shape (batch_size, seq_length) np.array: valid length in 'int32', shape (batch_size,) np.array: input token type ids in 'int32', shape (batch_size, seq_length) np.array: label id in 'int32', shape (batch_size, 1) """ # convert to unicode text_a = line[0] label = line[-1] text_a = convert_to_unicode(text_a) label = convert_to_unicode(label) if self._pair: assert len(line) == 3 text_b = line[1] text_b = convert_to_unicode(text_b) tokens_a = self._tokenizer.tokenize(text_a) tokens_b = None if self._pair: tokens_b = self._tokenizer.tokenize(text_b) if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, self._max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > self._max_seq_length - 2: tokens_a = tokens_a[0:(self._max_seq_length - 2)] # The embedding vectors for `type=0` and `type=1` were learned during # pre-training and are added to the wordpiece embedding vector # (and position vector). This is not *strictly* necessary since # the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens = [] segment_ids = [] tokens.append('[CLS]') segment_ids.append(0) for token in tokens_a: tokens.append(token) segment_ids.append(0) tokens.append('[SEP]') segment_ids.append(0) if tokens_b: for token in tokens_b: tokens.append(token) segment_ids.append(1) tokens.append('[SEP]') segment_ids.append(1) input_ids = self._tokenizer.convert_tokens_to_ids(tokens) label_id = self._label_map[label] # The valid length of sentences. Only real tokens are attended to. valid_length = len(input_ids) if self._pad: # Zero-pad up to the sequence length. padding_length = self._max_seq_length - valid_length input_ids.extend([0] * padding_length) segment_ids.extend([0] * padding_length) return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\ np.array(segment_ids, dtype='int32'), np.array([label_id], dtype='int32')
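# A hedged usage sketch: the __call__ above belongs to a dataset-transform
# class whose constructor sets self._tokenizer, self._label_map,
# self._max_seq_length, self._pair, and self._pad. Assuming `transform` is
# such an instance configured for sentence pairs with max_seq_length=128:
input_ids, valid_length, segment_ids, label_id = transform(
    ('is this jacksonville ?', 'no it is not', '0'))
# input_ids and segment_ids are zero-padded to 128 when self._pad is True;
# valid_length counts only the real tokens, matching the docstring example.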
def raw_preprocess(iterator):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=_vocab_file, do_lower_case=_do_lower_case)
    while True:
        try:
            line_arr = next(iterator).strip().split("\001")
            # _id, source_str = line_arr
            _id = line_arr[0]
            source_str = line_arr[2]
            if not source_str:
                continue
            source = tokenization.convert_to_unicode(source_str)
            if not source:
                continue
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", source.strip())
            if m is None:
                text_a = source.strip()
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            tokens_a = tokenizer.tokenize(text_a)
            tokens_b = None
            if text_b:
                tokens_b = tokenizer.tokenize(text_b)
            if tokens_b:
                _truncate_seq_pair(tokens_a, tokens_b, _seq_length - 3)
            else:
                if len(tokens_a) > _seq_length - 2:
                    tokens_a = tokens_a[0:(_seq_length - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids: 0     0   0   0  0     0 0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for
            # `type=0` and `type=1` were learned during pre-training and are
            # added to the wordpiece embedding vector (and position vector).
            # This is not *strictly* necessary since the [SEP] token
            # unambiguously separates the sequences, but it makes it easier
            # for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to
            # [CLS]) is used as the "sentence vector". Note that this only
            # makes sense because the entire model is fine-tuned.
            tokens, input_type_ids = _encode_tokens(tokens_a, tokens_b)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only
            # real tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < _seq_length:
                input_ids.append(0)
                input_mask.append(0)
                input_type_ids.append(0)
                tokens.append("##NULL##")

            assert len(input_ids) == _seq_length
            assert len(input_mask) == _seq_length
            assert len(input_type_ids) == _seq_length
            assert len(tokens) == _seq_length

            encode_dict = {}
            encode_dict["_id"] = _id
            encode_dict["tokens"] = tokens
            encode_dict["input_ids"] = input_ids
            encode_dict["input_mask"] = input_mask
            encode_dict["input_type_ids"] = input_type_ids
            yield encode_dict
        except StopIteration:
            print("stop")
            break
        except Exception:
            err = traceback.format_exc()
            print(err, file=sys.stderr)
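# A minimal driver sketch for raw_preprocess: input lines are
# '\001'-separated records where field 0 is an id and field 2 is the text.
# The record contents below are illustrative only:
lines = iter([
    "doc1\001ignored\001the dog is hairy .",
    "doc2\001ignored\001is this jacksonville ? ||| no it is not",
])
for encoded in raw_preprocess(lines):
    # Every record is padded to _seq_length before being yielded.
    print(encoded["_id"], len(encoded["input_ids"]))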