import numpy as np


def convert_examples_to_features(example,
                                 tokenizer=None,
                                 truncate_length=512,
                                 cls_token=None,
                                 sep_token=None,
                                 class_labels=None,
                                 label_alias=None,
                                 vocab=None,
                                 is_test=False):
    """Convert GLUE examples into the necessary model features."""
    if not is_test:
        label_dtype = 'int32' if class_labels else 'float32'
        # get the label
        label = example[-1]
        example = example[:-1]
        # create label maps if this is a classification task
        if class_labels:
            label_map = {}
            for (i, l) in enumerate(class_labels):
                label_map[l] = i
            if label_alias:
                for key in label_alias:
                    label_map[key] = label_map[label_alias[key]]
            label = label_map[label]
        label = np.array([label], dtype=label_dtype)
    # tokenize the raw text
    tokens_raw = [tokenizer(l) for l in example]
    # truncate to truncate_length
    tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
    # concatenate the sequences with special tokens
    tokens_trun[0] = [cls_token] + tokens_trun[0]
    tokens, segment_ids, _ = concat_sequences(tokens_trun,
                                              [[sep_token]] * len(tokens_trun))
    # convert the tokens to ids
    input_ids = vocab[tokens]
    valid_length = len(input_ids)
    if not is_test:
        return input_ids, segment_ids, valid_length, label
    else:
        return input_ids, segment_ids, valid_length
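
# The two helpers used above, truncate_seqs_equal and concat_sequences, live in
# the surrounding preprocessing utilities. The sketches below only illustrate
# the behaviour the converter relies on, as inferred from the call sites; they
# are not the repo's implementations and are not used by the code above.
def _truncate_seqs_equal_sketch(seqs, max_len):
    """Trim the currently longest sequence one token at a time until the
    combined length fits within max_len, so the sequences end up roughly
    equal in length."""
    seqs = [list(s) for s in seqs]
    while sum(len(s) for s in seqs) > max_len:
        max(seqs, key=len).pop()
    return seqs


def _concat_sequences_sketch(seqs, separators):
    """Concatenate each sequence with its separator, tracking per-token
    segment ids and a mask that is 1 on the special separator tokens."""
    tokens, segment_ids, p_mask = [], [], []
    for seg_id, (seq, sep) in enumerate(zip(seqs, separators)):
        tokens.extend(seq)
        segment_ids.extend([seg_id] * len(seq))
        p_mask.extend([0] * len(seq))
        tokens.extend(sep)
        segment_ids.extend([seg_id] * len(sep))
        p_mask.extend([1] * len(sep))
    return tokens, segment_ids, p_mask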
def convert_examples_to_features(example,
                                 tokenizer=None,
                                 truncate_length=512,
                                 cls_token=None,
                                 sep_token=None,
                                 class_labels=None,
                                 label_alias=None,
                                 vocab=None,
                                 is_test=False):
    """Convert GLUE/SuperGLUE classification and regression examples into
    the necessary features."""
    if not is_test:
        label_dtype = 'int32' if class_labels else 'float32'
        example, label = example[:-1], example[-1]
        # create label maps if this is a classification task
        if class_labels:
            label_map = {}
            for (i, l) in enumerate(class_labels):
                label_map[l] = i
            if label_alias:
                for key in label_alias:
                    label_map[key] = label_map[label_alias[key]]
            # Fix for BoolQ, WSC and MultiRC: JSON values are loaded as
            # booleans rather than strings.
            if isinstance(label, bool):
                label = 'true' if label else 'false'
            # Fix for COPA: labels are loaded as integers.
            if isinstance(label, int):
                label = '0' if label == 0 else '1'
            label = label_map[label]
        label = np.array([label], dtype=label_dtype)
    # tokenize, truncate and concatenate with special tokens, then map to ids
    tokens_raw = [tokenizer(l) for l in example]
    tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
    tokens_trun[0] = [cls_token] + tokens_trun[0]
    tokens, segment_ids, _ = concat_sequences(
        tokens_trun, [[sep_token]] * len(tokens_trun))
    input_ids = vocab[tokens]
    valid_length = len(input_ids)
    if not is_test:
        return input_ids, segment_ids, valid_length, label
    else:
        return input_ids, segment_ids, valid_length
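
# Usage sketch for the classification converter defined above; it is not
# called anywhere in this module. It assumes a GluonNLP pretrained BERT
# vocabulary and its BERTTokenizer; the model/dataset names, the class labels
# and the BoolQ-style example triple are illustrative placeholders only.
def _demo_superglue_transform():
    import functools
    import gluonnlp as nlp

    _, vocabulary = nlp.model.get_model('bert_12_768_12',
                                        dataset_name='book_corpus_wiki_en_uncased',
                                        pretrained=True,
                                        use_classifier=False,
                                        use_decoder=False)
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    transform = functools.partial(convert_examples_to_features,
                                  tokenizer=bert_tokenizer,
                                  truncate_length=128,
                                  cls_token=vocabulary.cls_token,
                                  sep_token=vocabulary.sep_token,
                                  class_labels=['false', 'true'],
                                  vocab=vocabulary)
    # A BoolQ-style (passage, question, boolean label) triple; the boolean
    # label is normalized to 'true'/'false' inside the converter.
    example = ('The sky appears blue because of Rayleigh scattering.',
               'is the sky blue', True)
    input_ids, segment_ids, valid_length, label = transform(example)
    return input_ids, segment_ids, valid_length, label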
def convert_examples_to_features(example,
                                 tokenizer=None,
                                 cls_token=None,
                                 sep_token=None,
                                 vocab=None,
                                 max_seq_length=384,
                                 doc_stride=128,
                                 max_query_length=64,
                                 cls_index=0):
    """Convert a SQuAD example into BERT features, one per document span."""
    query_tokenized = [cls_token] + tokenizer(
        example.question_text)[:max_query_length]
    # tokenize the paragraph and get the start/end position of the answer in
    # the tokenized paragraph
    tok_start_position, tok_end_position, all_doc_tokens, _, tok_to_orig_index = \
        tokenize_and_align_positions(example.doc_tokens,
                                     example.start_position,
                                     example.end_position,
                                     tokenizer)
    # get doc spans using a sliding window
    doc_spans, doc_spans_indices = get_doc_spans(
        all_doc_tokens, max_seq_length - len(query_tokenized) - 2, doc_stride)

    if not example.is_impossible:
        (tok_start_position, tok_end_position) = improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)
        # get the new start/end positions relative to each doc span
        positions = [
            align_position2doc_spans([tok_start_position, tok_end_position],
                                     doc_idx,
                                     offset=len(query_tokenized) + 1,
                                     default_value=0)
            for doc_idx in doc_spans_indices
        ]
    else:
        # if the question is impossible to answer, set the start/end position
        # to the cls index
        positions = [[cls_index, cls_index] for _ in doc_spans_indices]

    # record whether each token in a doc span has maximum context
    token_is_max_context = [{
        len(query_tokenized) + p:
        check_is_max_context(doc_spans_indices, i, p + doc_spans_indices[i][0])
        for p in range(len(doc_span))
    } for (i, doc_span) in enumerate(doc_spans)]

    token_to_orig_map = [{
        len(query_tokenized) + p + 1:
        tok_to_orig_index[p + doc_spans_indices[i][0]]
        for p in range(len(doc_span))
    } for (i, doc_span) in enumerate(doc_spans)]

    # get sequence features: tokens, segment_ids, p_masks
    seq_features = [
        concat_sequences([query_tokenized, doc_span], [[sep_token]] * 2)
        for doc_span in doc_spans
    ]

    features = [
        SquadBERTFeautre(example_id=example.example_id,
                         qas_id=example.qas_id,
                         doc_tokens=example.doc_tokens,
                         valid_length=len(tokens),
                         tokens=tokens,
                         token_to_orig_map=t2o,
                         token_is_max_context=is_max,
                         input_ids=vocab[tokens],
                         p_mask=p_mask,
                         segment_ids=segment_ids,
                         start_position=start,
                         end_position=end,
                         is_impossible=example.is_impossible)
        for (tokens, segment_ids, p_mask), (start, end), is_max, t2o in zip(
            seq_features, positions, token_is_max_context, token_to_orig_map)
    ]
    return features
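
# Usage sketch for the SQuAD converter defined above; it is not called
# anywhere in this module. SquadExampleStub is a hypothetical stand-in for the
# repo's SQuAD example record (same attribute names as accessed above), and
# the sketch assumes the repo's SQuAD preprocessing helpers
# (tokenize_and_align_positions, get_doc_spans, improve_answer_span,
# align_position2doc_spans, check_is_max_context, SquadBERTFeautre) are
# available in scope, as they are for the converter itself.
def _demo_squad_features():
    import collections
    import gluonnlp as nlp

    SquadExampleStub = collections.namedtuple(
        'SquadExampleStub',
        ['example_id', 'qas_id', 'question_text', 'doc_tokens',
         'start_position', 'end_position', 'orig_answer_text', 'is_impossible'])

    _, vocabulary = nlp.model.get_model('bert_12_768_12',
                                        dataset_name='book_corpus_wiki_en_uncased',
                                        pretrained=True,
                                        use_classifier=False,
                                        use_decoder=False)
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)

    # A toy, hand-made example; start/end positions index into doc_tokens.
    example = SquadExampleStub(
        example_id=0,
        qas_id='demo-0',
        question_text='What scatters sunlight in the atmosphere?',
        doc_tokens=['Rayleigh', 'scattering', 'makes', 'the', 'sky', 'blue', '.'],
        start_position=0,
        end_position=1,
        orig_answer_text='Rayleigh scattering',
        is_impossible=False)

    # Returns one SquadBERTFeautre per document span produced by the
    # sliding window.
    return convert_examples_to_features(example,
                                        tokenizer=bert_tokenizer,
                                        cls_token=vocabulary.cls_token,
                                        sep_token=vocabulary.sep_token,
                                        vocab=vocabulary,
                                        max_seq_length=384,
                                        doc_stride=128,
                                        max_query_length=64)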