def get_bert_pair_single_features(FLAGS, tokenizer, query, candidate, max_seq_length):
    tokens_a = tokenizer.tokenize(full2half(query))
    tokens_b = tokenizer.tokenize(full2half(candidate))

    tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

    def get_input(input_tokens_a, input_tokens_b):
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in input_tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        for token in input_tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        return [tokens, input_ids, input_mask, segment_ids]

    [tokens_a_, input_ids_a, input_mask_a, segment_ids_a] = get_input(tokens_a, tokens_b)
    [tokens_b_, input_ids_b, input_mask_b, segment_ids_b] = get_input(tokens_b, tokens_a)

    feature_dict = {"input_ids_a": input_ids_a,
                    "input_mask_a": input_mask_a,
                    "segment_ids_a": segment_ids_a,
                    "input_ids_b": input_ids_b,
                    "input_mask_b": input_mask_b,
                    "segment_ids_b": segment_ids_b,
                    "label_ids": [0]}
    return feature_dict

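# A minimal usage sketch (not part of this module): it assumes the local `tokenization`
# module matches Google's BERT tokenization (exposing `FullTokenizer`) and uses a
# hypothetical vocab path. It only shows how the returned pair features could be packed
# into a `tf.train.Example` for TFRecord serialization.
def _example_pair_single_features(FLAGS=None, vocab_file="vocab.txt", max_seq_length=128):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=True)
    feature_dict = get_bert_pair_single_features(
        FLAGS, tokenizer, "query text", "candidate text", max_seq_length)
    # Every value in feature_dict is a flat list of ints, so int64_list covers all keys.
    tf_example = tf.train.Example(features=tf.train.Features(feature={
        name: tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
        for name, values in feature_dict.items()
    }))
    return tf_example.SerializeToString()
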
def get_training_isntance(document, max_seq_length):
    if not document:
        return []

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    instances = []
    tokens_a = document['title']
    tokens_b = document['comment']
    label = document['label']

    tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        if token == '[UNK]':
            continue
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    for token in tokens_b:
        if token == '[UNK]':
            continue
        tokens.append(token)
        segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

    if label == "0":
        is_random_next = 0
    else:
        is_random_next = 1

    instance = TrainingInstance(
        tokens=tokens,
        segment_ids=segment_ids,
        label_ids=is_random_next)
    instances = [instance]
    return instances

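# A minimal sketch of the `document` layout this function expects: `title` and `comment`
# are already tokenized lists and `label` is a string ("0" keeps label_ids == 0, anything
# else sets it to 1). The tokens below are illustrative placeholders only.
def _example_training_instance(max_seq_length=32):
    document = {
        "title": ["how", "to", "pad", "sequences"],
        "comment": ["use", "zero", "padding"],
        "label": "1",
    }
    # One TrainingInstance is returned per document; an empty document yields [].
    return get_training_isntance(document, max_seq_length)
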
def convert_classifier_examples_with_rule_to_features(examples, label_dict,
                                                      max_seq_length, tokenizer,
                                                      rule_detector, output_file):
    feature_writer = ClassifierRuleFeatureWriter(output_file, is_training=False)

    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_b = None
        if example.text_b:
            try:
                tokens_b = tokenizer.tokenize(example.text_b)
            except:
                print("==token b error==", example.text_b, ex_index)
                break

        if tokens_b:
            tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        rule_id_lst = rule_detector.infer(tokens_a)

        tokens = []
        segment_ids = []
        rule_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        rule_ids.append(0)

        for index, token in enumerate(tokens_a):
            tokens.append(token)
            segment_ids.append(0)
            rule_ids.append(rule_id_lst[index])
        tokens.append("[SEP]")
        segment_ids.append(0)
        rule_ids.append(0)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            rule_ids.append(0)

        try:
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(rule_ids) == max_seq_length
        except:
            print(len(input_ids), max_seq_length, ex_index, "length error")
            break

        if len(example.label) == 1:
            label_id = label_dict[example.label[0]]
        else:
            label_id = [0] * len(label_dict)
            for item in example.label:
                label_id[label_dict[item]] = 1

        if ex_index < 5:
            print(tokens)
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            tf.logging.info("rule_ids: %s" % " ".join([str(x) for x in rule_ids]))
            tf.logging.info("label: {} (id = {})".format(example.label, label_id))

        feature = extra_mask_feature_classifier.InputFeatures(
            guid=example.guid,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            rule_ids=rule_ids,
            label_ids=label_id)
        feature_writer.process_feature(feature)
    feature_writer.close()

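# A sketch of the interface `rule_detector` must provide: `infer(tokens)` returns one
# integer rule id per input token, which the loop above aligns with `tokens_a`. The
# keyword-lookup detector below is a hypothetical stand-in, not the project's detector.
class _KeywordRuleDetector(object):
    def __init__(self, keyword_to_rule_id):
        self.keyword_to_rule_id = keyword_to_rule_id

    def infer(self, tokens):
        # Tokens without a rule get id 0, matching the id used for [CLS]/[SEP]/padding.
        return [self.keyword_to_rule_id.get(token, 0) for token in tokens]
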
def convert_pair_order_classifier_examples_to_features(examples, label_dict,
                                                       max_seq_length, tokenizer,
                                                       output_file):
    feature_writer = PairClassifierFeatureWriter(output_file, is_training=False)

    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_b = tokenizer.tokenize(example.text_b)

        tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

        def get_input(input_tokens_a, input_tokens_b):
            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in input_tokens_a:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for token in input_tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            return [tokens, input_ids, input_mask, segment_ids]

        [tokens_a_, input_ids_a, input_mask_a, segment_ids_a] = get_input(tokens_a, tokens_b)
        [tokens_b_, input_ids_b, input_mask_b, segment_ids_b] = get_input(tokens_b, tokens_a)

        try:
            assert len(input_ids_a) == max_seq_length
            assert len(input_mask_a) == max_seq_length
            assert len(segment_ids_a) == max_seq_length

            assert len(input_ids_b) == max_seq_length
            assert len(input_mask_b) == max_seq_length
            assert len(segment_ids_b) == max_seq_length
        except:
            print(len(input_ids_a), input_ids_a, max_seq_length, ex_index, "length error")
            break

        if len(example.label) == 1:
            label_id = label_dict[example.label[0]]
        else:
            label_id = [0] * len(label_dict)
            for item in example.label:
                label_id[label_dict[item]] = 1

        if ex_index < 5:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info("tokens_a: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens_a_]))
            tf.logging.info("input_ids_a: %s" % " ".join([str(x) for x in input_ids_a]))
            tf.logging.info("input_mask_a: %s" % " ".join([str(x) for x in input_mask_a]))
            tf.logging.info("segment_ids_a: %s" % " ".join([str(x) for x in segment_ids_a]))
            tf.logging.info("tokens_b: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens_b_]))
            tf.logging.info("input_ids_b: %s" % " ".join([str(x) for x in input_ids_b]))
            tf.logging.info("input_mask_b: %s" % " ".join([str(x) for x in input_mask_b]))
            tf.logging.info("segment_ids_b: %s" % " ".join([str(x) for x in segment_ids_b]))
            tf.logging.info("label: {} (id = {})".format(example.label, label_id))

        feature = pair_data_feature_classifier.InputFeatures(
            guid=example.guid,
            input_ids_a=input_ids_a,
            input_mask_a=input_mask_a,
            segment_ids_a=segment_ids_a,
            input_ids_b=input_ids_b,
            input_mask_b=input_mask_b,
            segment_ids_b=segment_ids_b,
            label_ids=label_id)
        feature_writer.process_feature(feature)
    feature_writer.close()

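# A small illustration (with a toy label_dict, not project data) of how `label_id` is
# built in the converters above: a single label maps to its integer id, while multiple
# labels become a multi-hot vector over the whole label vocabulary.
def _example_label_ids():
    label_dict = {"negative": 0, "neutral": 1, "positive": 2}
    single = label_dict["positive"]           # -> 2
    multi = [0] * len(label_dict)
    for item in ["negative", "positive"]:
        multi[label_dict[item]] = 1           # -> [1, 0, 1]
    return single, multi
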
def convert_multichoice_examples_to_features(examples, label_dict, max_seq_length,
                                             tokenizer, output_file):
    feature_writer = MultiChoiceFeatureWriter(output_file, is_training=False)

    for (ex_index, example) in enumerate(examples):
        question_text = tokenizer.tokenize(example.question_text)
        context_text = tokenizer.tokenize(example.doc_tokens)

        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

        question_context = question_text + context_text

        choice_token_ids = []
        choice_segment_ids = []
        choice_mask = []
        choice_tokens = []

        for answer in example.answer_choice:
            answer_text = tokenizer.tokenize(answer)
            tf_data_utils._truncate_seq_pair(question_context, answer_text,
                                             max_seq_length - 3)

            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in question_context:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for token in answer_text:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            choice_token_ids.extend(input_ids)
            choice_segment_ids.extend(segment_ids)
            choice_mask.extend(input_mask)
            choice_tokens.extend(tokens)

        assert len(choice_token_ids) == max_seq_length * len(example.answer_choice)

        if ex_index < 5:
            tf.logging.info("*** Example ***")
            tf.logging.info("tokens: {}".format(choice_token_ids))
            tf.logging.info("choice: {} answer {}".format(
                example.choice, example.answer_choice))
            # tf.logging.info("*** Example ***")
            # tf.logging.info("qas_id: %s" % (example.qas_id))
            # tf.logging.info("tokens: %s" % " ".join(
            #     [tokenization.printable_text(x) for x in tokens]))
            # tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            # tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            # tf.logging.info(
            #     "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            # tf.logging.info("choice: {} answer {}".format(example.choice, example.answer_choice))

        feature = data_feature_mrc.InputFeatures(
            unique_id=example.qas_id,
            input_ids=choice_token_ids,
            input_mask=choice_mask,
            segment_ids=choice_segment_ids,
            choice=example.choice)
        feature_writer.process_feature(feature)
    feature_writer.close()

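# A sketch of how the flat multi-choice features produced above can be regrouped per
# choice downstream (the model side is assumed, not shown here): the writer stores
# `num_choices * max_seq_length` ids in one flat list.
def _split_choice_ids(choice_token_ids, max_seq_length):
    assert len(choice_token_ids) % max_seq_length == 0
    return [choice_token_ids[i:i + max_seq_length]
            for i in range(0, len(choice_token_ids), max_seq_length)]
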
def create_cls_problem_generator(task_type,
                                 examples,
                                 label_dict,
                                 multi_task_config,
                                 tokenizer,
                                 mode):
    max_seq_length = multi_task_config[task_type]["max_length"]
    lm_augumentation = multi_task_config[task_type]["lm_augumentation"]

    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_b = None
        if example.text_b:
            try:
                tokens_b = tokenizer.tokenize(example.text_b)
            except:
                print("==token b error==", example.text_b, ex_index)
                break

        if tokens_b:
            tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        if lm_augumentation and mode == 'train':
            rng = random.Random()
            (mask_lm_tokens, masked_lm_positions,
             masked_lm_labels) = create_masked_lm_predictions(
                tokens,
                multi_task_config[task_type]["masked_lm_prob"],
                multi_task_config[task_type]["max_predictions_per_seq"],
                list(tokenizer.vocab.keys()), rng)

            _, mask_lm_tokens, _ = create_mask_and_padding(
                mask_lm_tokens, copy(segment_ids), max_seq_length)
            masked_lm_weights, masked_lm_labels, masked_lm_positions = create_mask_and_padding(
                masked_lm_labels, masked_lm_positions,
                multi_task_config[task_type]["max_predictions_per_seq"])
            mask_lm_input_ids = tokenizer.convert_tokens_to_ids(mask_lm_tokens)
            masked_lm_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)
            assert len(mask_lm_tokens) == max_seq_length

        input_mask, tokens, segment_ids = create_mask_and_padding(
            tokens, segment_ids, max_seq_length)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        if len(example.label) == 1:
            label_id = label_dict[example.label[0]]
        else:
            label_id = [0] * len(label_dict)
            for item in example.label:
                label_id[label_dict[item]] = 1

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if ex_index < 5:
            tf.logging.debug("*** Example ***")
            tf.logging.debug("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.logging.debug("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.debug("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.debug("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            tf.logging.debug("%s_label_ids: %s" % (task_type, str(label_id)))
            tf.logging.debug("%s_label: %s" % (task_type, str(example.label)))
            if lm_augumentation and mode == 'train':
                tf.logging.debug("mask lm tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in mask_lm_tokens]))
                tf.logging.debug("mask lm input_ids: %s" % " ".join(
                    [str(x) for x in mask_lm_input_ids]))
                tf.logging.debug("mask lm label ids: %s" % " ".join(
                    [str(x) for x in masked_lm_ids]))
                tf.logging.debug("mask lm position: %s" % " ".join(
                    [str(x) for x in masked_lm_positions]))

        if not lm_augumentation:
            return_dict = {
                'input_ids': input_ids,
                'input_mask': input_mask,
                'segment_ids': segment_ids,
                '%s_label_ids' % task_type: label_id
            }
        else:
            if mode == 'train':
                return_dict = {
                    'input_ids': mask_lm_input_ids,
                    'input_mask': input_mask,
                    'segment_ids': segment_ids,
                    '%s_label_ids' % task_type: label_id,
                    "masked_lm_positions": masked_lm_positions,
                    "masked_lm_ids": masked_lm_ids,
                    "masked_lm_weights": masked_lm_weights,
                }
            else:
                return_dict = {
                    'input_ids': input_ids,
                    'input_mask': input_mask,
                    'segment_ids': segment_ids,
                    '%s_label_ids' % task_type: label_id,
                    "masked_lm_positions": [0] * multi_task_config[task_type]["max_predictions_per_seq"],
                    "masked_lm_ids": [0] * multi_task_config[task_type]["max_predictions_per_seq"],
                    "masked_lm_weights": [0] * multi_task_config[task_type]["max_predictions_per_seq"],
                }

        yield return_dict

def create_instances_from_document(all_documents, document_index, vocab_words,
                                   max_seq_length, short_seq_prob,
                                   masked_lm_prob, max_predictions_per_seq,
                                   rng, num_of_documents):
    """Creates `TrainingInstance`s for a single document."""
    document = get_document(all_documents, es_api, document_index)

    if not document:
        return []

    # index_range = list(range(num_of_documents))
    # index_range.remove(document_index)
    # random_document_lst = random.sample(index_range, len(index_range))

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    instances = []

    # We *usually* want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pre-training and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `max_seq_length` is a hard limit.
    target_seq_length = max_num_tokens

    tokens_a = document['title']
    tokens_b = document['comment']
    label = document['label']

    tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        if token == '[UNK]':
            continue
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    for token in tokens_b:
        if token == '[UNK]':
            continue
        tokens.append(token)
        segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

    if label == "0":
        is_random_next = False
    else:
        is_random_next = True

    (output_tokens, masked_lm_positions,
     masked_lm_labels) = create_masked_lm_predictions(
        tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)

    instance = TrainingInstance(
        original_tokens=tokens,
        tokens=output_tokens,
        segment_ids=segment_ids,
        is_random_next=is_random_next,
        masked_lm_positions=masked_lm_positions,
        masked_lm_labels=masked_lm_labels)
    instances = [instance]
    return instances

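# A usage sketch for the pre-training path above. It assumes `all_documents` is whatever
# store `get_document` (together with the module-level `es_api` handle) knows how to read,
# and only illustrates the rng/vocab_words plumbing; all numeric values are placeholders.
def _example_instances_from_document(all_documents, tokenizer, document_index=0):
    rng = random.Random(12345)
    vocab_words = list(tokenizer.vocab.keys())
    return create_instances_from_document(
        all_documents, document_index, vocab_words,
        max_seq_length=128, short_seq_prob=0.1,
        masked_lm_prob=0.15, max_predictions_per_seq=20,
        rng=rng, num_of_documents=len(all_documents))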