def _create_examples(lines, set_type):
    r"""Creates examples for the training, dev, and test sets."""
    examples = []
    if set_type in ('train', 'dev'):
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = f"{set_type}-{i}"
            text_a = tokenization.convert_to_unicode(line[0])
            # Single sentence classification, text_b doesn't exist
            text_b = None
            label = tokenization.convert_to_unicode(line[1])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    if set_type == 'test':
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = f"{set_type}-{i}"
            text_a = tokenization.convert_to_unicode(line[1])
            # Single sentence classification, text_b doesn't exist
            text_b = None
            label = '0'  # arbitrarily set to '0'; test labels are unused
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
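# --- Usage sketch (illustrative only) ---
# A minimal harness for the processor above. `InputExample` and the
# `tokenization` stub are hypothetical stand-ins for the real BERT helpers,
# defined here only so the snippet runs on its own.
from dataclasses import dataclass
from typing import Optional

@dataclass
class InputExample:
    guid: str
    text_a: str
    text_b: Optional[str] = None
    label: Optional[str] = None

class tokenization:
    @staticmethod
    def convert_to_unicode(text):
        return text.decode("utf-8") if isinstance(text, bytes) else str(text)

# Rows as a TSV reader would yield them: a header row, then (sentence, label).
lines = [["sentence", "label"], ["the dog barks", "1"], ["a quiet night", "0"]]
examples = _create_examples(lines, "train")
print(examples[0].guid, examples[0].text_a, examples[0].label)
# -> train-1 the dog barks 1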
def _create_example(self, lines, set_type):
    r"""Creates `InputExample`s from (labels, text) rows."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text = tokenization.convert_to_unicode(line[1])
        labels = tokenization.convert_to_unicode(line[0])
        examples.append(InputExample(guid=guid, text=text, labels=labels))
    return examples
def _create_examples(self, lines, set_type): """create examples for training and test""" examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, i) if set_type != 'test': text_a = tokenization.convert_to_unicode(line[1]) label = tokenization.convert_to_unicode(line[0]) else: text_a = tokenization.convert_to_unicode(line[1]) label = "health" examples.append(InputExample(guid=guid, text_a=text_a, label=label)) return examples
def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, tokenization.convert_to_unicode("0000")) text_a = tokenization.convert_to_unicode(line[1]) text_b = tokenization.convert_to_unicode(line[2]) if set_type == "test": label = "0" else: label = tokenization.convert_to_unicode(line[0]) examples.append( InputExample( guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples
def _create_examples(lines, set_type):
    r"""Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = f"{set_type}-{i}"
        text_a = tokenization.convert_to_unicode(line[3])
        text_b = tokenization.convert_to_unicode(line[4])
        if set_type == "test":
            label = "0"
        else:
            label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def _create_examples(lines, set_type):
    r"""Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # Only the test set has a header
        if set_type == "test" and i == 0:
            continue
        guid = f"{set_type}-{i}"
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(line[1])
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(line[3])
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def get_dev_examples(self, data_dir):
    r"""See base class."""
    lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = f"dev-{i}"
        language = tokenization.convert_to_unicode(line[0])
        if language != tokenization.convert_to_unicode(self.language):
            continue
        text_a = tokenization.convert_to_unicode(line[6])
        text_b = tokenization.convert_to_unicode(line[7])
        label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def get_train_examples(self, data_dir):
    r"""See base class."""
    lines = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     f"multinli.train.{self.language}.tsv"))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = f"train-{i}"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
        # Some translated MultiNLI rows use "contradictory"; map it to the
        # XNLI label "contradiction".
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def _read_numerical_file(self, fp, delimiter=";"):
    for i, line in enumerate(fp):
        cols = tokenization.convert_to_unicode(line).strip().split(delimiter)
        # Each column is a space-separated list of integer ids.
        cols = [list(map(int, col.split(" "))) for col in cols]
        if len(cols) > self.num_numerical_fields:
            cols = cols[:self.num_numerical_fields]
        # The target side starts at the first bos_id found after position 0.
        tgt_start_idx = cols[0].index(self.bos_id, 1)
        record = self.Record(*cols, tgt_start_idx=tgt_start_idx, data_id=i)
        yield record
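# --- Format sketch (illustrative only) ---
# How one line of such a numerical file is parsed. The ids and the choice of
# bos_id = 1 below are made-up values, not ones from the source.
line = "1 7 8 2 1 9 2;0 0 0 0 1 1 1"  # token ids, then type ids, ";"-delimited
cols = [list(map(int, col.split(" "))) for col in line.strip().split(";")]
bos_id = 1  # assumed BOS id
tgt_start_idx = cols[0].index(bos_id, 1)
print(cols[0])        # [1, 7, 8, 2, 1, 9, 2]
print(tgt_start_idx)  # 4 -> the target side starts at the second BOS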
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    for sent in input_file:
        sent = ' '.join(w for w in sent)
        line = tokenization.convert_to_unicode(sent).strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples
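# --- Usage sketch (illustrative only) ---
# read_examples takes an iterable of token lists; the literal " ||| "
# separates the two sides of a sentence pair. The namedtuple below is a
# hypothetical stand-in with the unique_id/text_a/text_b fields this function
# expects; `re` and `tokenization.convert_to_unicode` (or the stub from the
# first sketch) must be in scope.
from collections import namedtuple

InputExample = namedtuple("InputExample", ["unique_id", "text_a", "text_b"])

sents = [
    ["hello", "world"],                              # single sentence
    ["how", "are", "you", "?", "|||", "fine", "."],  # sentence pair
]
examples = read_examples(sents)
print(examples[0].text_a, "|", examples[0].text_b)  # hello world | None
print(examples[1].text_a, "|", examples[1].text_b)  # how are you ? | fine .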
default="", metavar='STRING', help='input file path') parser.add_argument('--vocab_file', type=str, default="", metavar='STRING', help='vocab file path') args = parser.parse_args() input_file = args.input_file max_seq_length = args.max_seq_length tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True) with tf.gfile.Open(input_file, "r") as f: reader = csv.reader(f, delimiter="\t") for line in reader: text_a = tokenization.convert_to_unicode(line[3]) text_b = tokenization.convert_to_unicode(line[4]) a_input_ids, a_input_mask, a_segment_ids = convert_single_example( text_a, None, max_seq_length=max_seq_length, tokenizer=tokenizer) b_input_ids, b_input_mask, b_segment_ids = convert_single_example( text_b, None, max_seq_length=max_seq_length, tokenizer=tokenizer)
def convert_single_example_to_unicode(guid, single_example):
    text_a = tokenization.convert_to_unicode(single_example[0])
    text_b = tokenization.convert_to_unicode(single_example[1])
    label = tokenization.convert_to_unicode(single_example[2])
    return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
def convert_examples_to_features(examples, label_list, seq_length, tokenizer,
                                 trunc_keep_right, data_stats=None,
                                 aug_ops=None):
    """Convert examples to features."""
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tf.logging.info("number of examples to process: {}".format(len(examples)))

    features = []

    if aug_ops:
        tf.logging.info("building vocab")
        word_vocab = build_vocab(examples)
        examples = word_level_augment.word_level_augment(
            examples, aug_ops, word_vocab, data_stats)

    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("processing {:d}".format(ex_index))
        tokens_a = tokenizer.tokenize_to_wordpiece(example.word_list_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize_to_wordpiece(example.word_list_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            if trunc_keep_right:
                _truncate_seq_pair_keep_right(tokens_a, tokens_b, seq_length - 3)
            else:
                _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                if trunc_keep_right:
                    tokens_a = tokens_a[-(seq_length - 2):]
                else:
                    tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0`
        # and `type=1` were learned during pre-training and are added to the
        # wordpiece embedding vector (and position vector). This is not
        # *strictly* necessary since the [SEP] token unambiguously separates
        # the sequences, but it makes it easier for the model to learn the
        # concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS])
        # is used as the "sentence vector". Note that this only makes sense
        # because the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        label_id = label_map[example.label]
        if ex_index < 1:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            st = "".join(tokenization.convert_to_unicode(x) for x in tokens)
            tf.logging.info("tokens: %s" % st)
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info("input_type_ids: %s" %
                            " ".join([str(x) for x in input_type_ids]))
            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          input_type_ids=input_type_ids,
                          label_id=label_id))
    return features
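# --- Layout sketch (illustrative only) ---
# A toy trace of the token / type-id / padding layout the loop above builds.
# The ids are fake; a real tokenizer assigns vocabulary indices.
tokens_a = ["is", "this", "jack", "##son", "##ville", "?"]
tokens_b = ["no", "it", "is", "not", "."]
tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
input_type_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

seq_length = 16
input_ids = list(range(1, len(tokens) + 1))  # fake ids
input_mask = [1] * len(input_ids)
while len(input_ids) < seq_length:
    input_ids.append(0)
    input_mask.append(0)
    input_type_ids.append(0)

print(len(input_ids), len(input_mask), len(input_type_ids))  # 16 16 16
print(input_mask)      # 1s for the 14 real tokens, then 0s for padding
print(input_type_ids)  # 0s for [CLS] + segment A, 1s for segment B, 0 padding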
def _convert_example_to_record(self, example, is_infer):
    # process src
    src_token_ids = []
    src_pos_ids = []
    if self.use_role:
        src_role_ids = []
        role_id_list = []

    # tokenize src
    s_token_ids_list = []
    for s in example.src.split("[SEP]"):
        s = tokenization.convert_to_unicode(s).strip()
        if self.use_role:
            s, role_id = s.split("\1")
            role_id = int(role_id)
            role_id_list.append(role_id)

        if self.data_format == "tokenized":
            s_tokens = s.split(" ")
        else:
            s_tokens = self.tokenizer.tokenize(s)

        s_token_ids = self.tokenizer.convert_tokens_to_ids(s_tokens) + [self.eos_id]
        s_token_ids_list.append(s_token_ids)

    # trim src
    idx = len(s_token_ids_list) - 1
    total_token_num = 1
    while idx >= 0:
        total_token_num += len(s_token_ids_list[idx])
        if total_token_num > self.max_src_len:
            if self.truncate_first_turn and idx == 0:
                truncated_ids = s_token_ids_list[idx][:self.max_src_len - total_token_num]
                if len(truncated_ids) > 1:
                    s_token_ids_list[idx] = truncated_ids[:-1] + [self.eos_id]
                    idx -= 1
            break
        idx -= 1

    for i, s_token_ids in enumerate(s_token_ids_list[idx + 1:], idx + 1):
        src_token_ids += s_token_ids
        src_pos_ids += list(range(1, len(s_token_ids) + 1))
        if self.use_role:
            src_role_ids += [role_id_list[i]] * len(s_token_ids)

    src_token_ids = [self.bos_id] + src_token_ids
    src_type_ids = [0] * len(src_token_ids)
    src_pos_ids = [0] + src_pos_ids
    if self.use_role:
        src_role_ids = [0] + src_role_ids
    assert len(src_token_ids) == len(src_type_ids) == len(src_pos_ids), \
        "not len(src_token_ids) == len(src_type_ids) == len(src_pos_ids)"

    token_ids = src_token_ids
    type_ids = src_type_ids
    pos_ids = src_pos_ids
    if self.use_role:
        role_ids = src_role_ids
    tgt_start_idx = len(token_ids)

    if not is_infer:
        # process tgt: tokenize
        tgt = tokenization.convert_to_unicode(example.tgt).strip()
        if self.data_format == "tokenized":
            tgt_tokens = tgt.split(" ")
        else:
            tgt_tokens = self.tokenizer.tokenize(tgt)

        tgt_token_ids = self.tokenizer.convert_tokens_to_ids(tgt_tokens)
        tgt_token_ids.append(self.eos_id)

        # trim tgt
        if len(tgt_token_ids) > self.max_tgt_len - 1:
            tgt_token_ids = tgt_token_ids[:self.max_tgt_len - 1]

        tgt_token_ids = [self.bos_id] + tgt_token_ids
        tgt_type_ids = [1] * len(tgt_token_ids)
        tgt_pos_ids = list(range(1, len(tgt_token_ids) + 1))
        if self.use_role:
            tgt_role_ids = [0] * len(tgt_token_ids)
        assert len(tgt_token_ids) == len(tgt_type_ids) == len(tgt_pos_ids), \
            "not len(tgt_token_ids) == len(tgt_type_ids) == len(tgt_pos_ids)"

        token_ids += tgt_token_ids
        type_ids += tgt_type_ids
        pos_ids += tgt_pos_ids
        if self.use_role:
            role_ids += tgt_role_ids

    assert len(token_ids) == len(type_ids) == len(pos_ids), \
        "not len(token_ids) == len(type_ids) == len(pos_ids)"

    if self.continuous_position:
        src_pos_ids = list(range(len(src_token_ids)))
        if not is_infer:
            tgt_pos_ids = list(range(len(tgt_token_ids)))
        pos_ids = list(range(len(token_ids)))

    # Store the full (src + tgt) sequence; `tgt_start_idx` marks where the
    # target side begins.
    field_values = {
        "token_ids": token_ids,
        "type_ids": type_ids,
        "pos_ids": pos_ids
    }
    if self.use_role:
        field_values["role_ids"] = role_ids
    field_values["tgt_start_idx"] = tgt_start_idx
    field_values["data_id"] = example.data_id

    record = self.Record(**field_values)
    return record
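# --- Input format sketch (illustrative only) ---
# The reader above expects multi-turn context packed into one string: turns
# joined with the literal "[SEP]", and, when use_role is set, "\1<role_id>"
# appended to each turn. The text and role ids below are made up.
src = "hi there\x011[SEP]how are you ?\x010"
for turn in src.split("[SEP]"):
    text, role_id = turn.strip().split("\x01")
    print(repr(text), "-> role", int(role_id))
# 'hi there' -> role 1
# 'how are you ?' -> role 0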
def _convert_example_to_record(self, example, max_seq_length, max_ent_cnt, tokenizer):
    input_tokens = []
    tok_to_sent = []
    tok_to_word = []
    for sent_idx, sent in enumerate(example.sents):
        for word_idx, word in enumerate(sent):
            word = tokenization.convert_to_unicode(word)
            tokens_tmp = tokenizer.tokenize(word)
            input_tokens += tokens_tmp
            tok_to_sent += [sent_idx] * len(tokens_tmp)
            tok_to_word += [word_idx] * len(tokens_tmp)

    if len(input_tokens) <= max_seq_length - 2:
        input_tokens = ['[CLS]'] + input_tokens + ['[SEP]']
        tok_to_sent = [None] + tok_to_sent + [None]
        tok_to_word = [None] + tok_to_word + [None]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_mask = [1] * len(input_ids)
        text_type_ids = [0] * len(input_ids)
        position_ids = list(range(len(input_ids)))
        # padding
        padding = [None] * (max_seq_length - len(input_ids))
        tok_to_sent += padding
        tok_to_word += padding
        padding = [0] * (max_seq_length - len(input_ids))
        input_mask += padding
        text_type_ids += padding
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        position_ids += padding
    else:
        input_tokens = input_tokens[:max_seq_length - 2]
        tok_to_sent = tok_to_sent[:max_seq_length - 2]
        tok_to_word = tok_to_word[:max_seq_length - 2]
        input_tokens = ['[CLS]'] + input_tokens + ['[SEP]']
        tok_to_sent = [None] + tok_to_sent + [None]
        tok_to_word = [None] + tok_to_word + [None]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_mask = [1] * len(input_ids)
        text_type_ids = [0] * len(input_ids)
        position_ids = list(range(len(input_ids)))

    # ent_mask & ner / coreference feature
    ent_mask = np.zeros((max_ent_cnt, max_seq_length), dtype='int64')
    ent_ner = [0] * max_seq_length
    ent_pos = [0] * max_seq_length
    tok_to_ent = [-1] * max_seq_length
    ents = example.vertexSet
    for ent_idx, ent in enumerate(ents):
        for mention in ent:
            for tok_idx in range(len(input_ids)):
                if tok_to_sent[tok_idx] == mention['sent_id'] \
                        and mention['pos'][0] <= tok_to_word[tok_idx] < mention['pos'][1]:
                    ent_mask[ent_idx][tok_idx] = 1
                    ent_ner[tok_idx] = self.ner_map[ent[0]['type']]
                    ent_pos[tok_idx] = ent_idx + 1
                    tok_to_ent[tok_idx] = ent_idx

    # distance feature
    ent_first_appearance = [0] * max_ent_cnt
    ent_distance = np.zeros((max_ent_cnt, max_ent_cnt), dtype='int64')  # padding id is 10
    for i in range(len(ents)):
        if not np.all(ent_mask[i] == 0):
            ent_first_appearance[i] = np.where(ent_mask[i] == 1)[0][0]
    for i in range(len(ents)):
        for j in range(len(ents)):
            if ent_first_appearance[i] != 0 and ent_first_appearance[j] != 0:
                if ent_first_appearance[i] >= ent_first_appearance[j]:
                    ent_distance[i][j] = self.distance_buckets[
                        ent_first_appearance[i] - ent_first_appearance[j]]
                else:
                    ent_distance[i][j] = -self.distance_buckets[
                        -ent_first_appearance[i] + ent_first_appearance[j]]
    ent_distance += 10  # norm from [-9, 9] to [1, 19]

    # Structure prior for attentive bias:
    #   PRIOR DEFINITION  | share ent context | diff ent context | no ent
    #   share sem context | intra-coref       | intra-relate     | intra-NA
    #   diff sem context  | inter-coref       | inter-relate     |
    structure_mask = np.zeros((5, max_seq_length, max_seq_length), dtype='float')
    for i in range(max_seq_length):
        if input_mask[i] == 0:
            break
        elif tok_to_ent[i] != -1:
            for j in range(max_seq_length):
                if tok_to_sent[j] is None:
                    continue
                # intra
                if tok_to_sent[j] == tok_to_sent[i]:
                    # intra-coref
                    if tok_to_ent[j] == tok_to_ent[i]:
                        structure_mask[0][i][j] = 1
                    # intra-relate
                    elif tok_to_ent[j] != -1:
                        structure_mask[1][i][j] = 1
                    # intra-NA
                    else:
                        structure_mask[2][i][j] = 1
                else:
                    # inter-coref
                    if tok_to_ent[j] == tok_to_ent[i]:
                        structure_mask[3][i][j] = 1
                    # inter-relate
                    elif tok_to_ent[j] != -1:
                        structure_mask[4][i][j] = 1

    # label
    label_ids = np.zeros((max_ent_cnt, max_ent_cnt, len(self.label_map.keys())), dtype='int64')
    # test file does not have "labels"
    if example.labels is not None:
        labels = example.labels
        for label in labels:
            label_ids[label['h']][label['t']][self.label_map[label['r']]] = 1
    for h in range(len(ents)):
        for t in range(len(ents)):
            if np.all(label_ids[h][t] == 0):
                label_ids[h][t][0] = 1

    label_mask = np.zeros((max_ent_cnt, max_ent_cnt), dtype='int64')
    label_mask[:len(ents), :len(ents)] = 1
    for ent in range(len(ents)):
        label_mask[ent][ent] = 0
    for ent in range(len(ents)):
        if np.all(ent_mask[ent] == 0):
            label_mask[ent, :] = 0
            label_mask[:, ent] = 0

    ent_mask = self.norm_mask(ent_mask)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(text_type_ids) == max_seq_length
    assert len(position_ids) == max_seq_length
    assert ent_mask.shape == (max_ent_cnt, max_seq_length)
    assert label_ids.shape == (max_ent_cnt, max_ent_cnt, len(self.label_map.keys()))
    assert label_mask.shape == (max_ent_cnt, max_ent_cnt)
    assert len(ent_ner) == max_seq_length
    assert len(ent_pos) == max_seq_length
    assert ent_distance.shape == (max_ent_cnt, max_ent_cnt)
    assert structure_mask.shape == (5, max_seq_length, max_seq_length)

    input_ids = np.expand_dims(input_ids, axis=-1).astype('int64')
    input_mask = np.expand_dims(input_mask, axis=-1).astype('int64')
    text_type_ids = np.expand_dims(text_type_ids, axis=-1).astype('int64')
    position_ids = np.expand_dims(position_ids, axis=-1).astype('int64')
    ent_ner = np.expand_dims(ent_ner, axis=-1).astype('int64')
    ent_pos = np.expand_dims(ent_pos, axis=-1).astype('int64')
    ent_distance = np.expand_dims(ent_distance, axis=-1).astype('int64')

    Record = namedtuple('Record', [
        'token_ids', 'input_mask', 'text_type_ids', 'position_ids', 'ent_mask',
        'label_ids', 'label_mask', 'ent_ner', 'ent_pos', 'ent_distance',
        'structure_mask'
    ])
    record = Record(
        token_ids=input_ids,
        input_mask=input_mask,
        text_type_ids=text_type_ids,
        position_ids=position_ids,
        ent_mask=ent_mask,
        label_ids=label_ids,
        label_mask=label_mask,
        ent_ner=ent_ner,
        ent_pos=ent_pos,
        ent_distance=ent_distance,
        structure_mask=structure_mask)
    return record
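# --- Channel sketch (illustrative only) ---
# How the five structure channels partition token pairs, using toy
# tok_to_sent / tok_to_ent maps (not DocRED data). Token i=0 belongs to
# entity 0 in sentence 0; each token j is classified relative to it.
tok_to_sent = [0, 0, 0, 1, 1]
tok_to_ent = [0, -1, 1, 0, -1]
i = 0
channels = ["intra-coref", "intra-relate", "intra-NA",
            "inter-coref", "inter-relate"]
for j in range(len(tok_to_sent)):
    if tok_to_sent[j] == tok_to_sent[i]:
        k = 0 if tok_to_ent[j] == tok_to_ent[i] else (
            1 if tok_to_ent[j] != -1 else 2)
    else:
        k = 3 if tok_to_ent[j] == tok_to_ent[i] else (
            4 if tok_to_ent[j] != -1 else None)
    print(j, channels[k] if k is not None else "no prior")
# 0 intra-coref, 1 intra-NA, 2 intra-relate, 3 inter-coref, 4 no prior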
def process_text(self, text):
    if self.use_spm:
        return tokenization.preprocess_text(text, lower=self.do_lower_case)
    else:
        return tokenization.convert_to_unicode(text)
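# --- Behavior sketch (illustrative only) ---
# A rough standalone approximation of the two branches above: with a
# SentencePiece model the text is unicode-normalized (and optionally
# lowercased); the real tokenization.preprocess_text does more than this.
import unicodedata

def process_text_standalone(text, use_spm, do_lower_case):
    if use_spm:
        text = unicodedata.normalize("NFKD", text)
        if do_lower_case:
            text = text.lower()
        return text
    return text.decode("utf-8") if isinstance(text, bytes) else str(text)

print(process_text_standalone("Héllo World", use_spm=True, do_lower_case=True))
# -> héllo world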