import numpy as np

# `truncate_seqs_equal` and `concat_sequences` are GluonNLP data helpers used
# by its BERT fine-tuning script; they are assumed to be in scope here.


def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None,
                                 sep_token=None, class_labels=None, label_alias=None, vocab=None,
                                 is_test=False):
    """Convert GLUE examples into the necessary features."""
    if not is_test:
        label_dtype = 'int32' if class_labels else 'float32'
        # Separate the label from the raw text fields
        label = example[-1]
        example = example[:-1]
        # Create a label map if this is a classification task
        if class_labels:
            label_map = {}
            for (i, l) in enumerate(class_labels):
                label_map[l] = i
            if label_alias:
                for key in label_alias:
                    label_map[key] = label_map[label_alias[key]]
            label = label_map[label]
        label = np.array([label], dtype=label_dtype)
    # Tokenize the raw text
    tokens_raw = [tokenizer(l) for l in example]
    # Truncate the sequences so their total length fits within truncate_length
    tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
    # Concatenate the sequences with special tokens
    tokens_trun[0] = [cls_token] + tokens_trun[0]
    tokens, segment_ids, _ = concat_sequences(tokens_trun, [[sep_token]] * len(tokens_trun))
    # Convert the tokens to ids
    input_ids = vocab[tokens]
    valid_length = len(input_ids)
    if not is_test:
        return input_ids, segment_ids, valid_length, label
    else:
        return input_ids, segment_ids, valid_length
For SuperGLUE, the same function needs small fixes for tasks whose labels are loaded from JSON as booleans (BoolQ, WSC, MultiRC) or integers (COPA) rather than strings:

def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None,
                                 sep_token=None, class_labels=None, label_alias=None, vocab=None,
                                 is_test=False):
    """Convert GLUE/SuperGLUE classification and regression examples into the necessary features."""
    if not is_test:
        label_dtype = 'int32' if class_labels else 'float32'
        # Separate the label from the raw text fields
        example, label = example[:-1], example[-1]
        # Create a label map if this is a classification task
        if class_labels:
            label_map = {}
            for (i, l) in enumerate(class_labels):
                label_map[l] = i
            if label_alias:
                for key in label_alias:
                    label_map[key] = label_map[label_alias[key]]
            # Fix for BoolQ, WSC, and MultiRC: JSON values are loaded as
            # booleans rather than strings. Check bool before int, since
            # bool is a subclass of int in Python.
            if isinstance(label, bool):
                label = 'true' if label else 'false'
            # Fix for COPA: JSON values are loaded as integers
            elif isinstance(label, int):
                label = '0' if label == 0 else '1'
            label = label_map[label]
        label = np.array([label], dtype=label_dtype)
    # Tokenize the raw text
    tokens_raw = [tokenizer(l) for l in example]
    # Truncate the sequences so their total length fits within truncate_length
    tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
    # Concatenate the sequences with special tokens
    tokens_trun[0] = [cls_token] + tokens_trun[0]
    tokens, segment_ids, _ = concat_sequences(tokens_trun,
                                              [[sep_token]] * len(tokens_trun))
    # Convert the tokens to ids
    input_ids = vocab[tokens]
    valid_length = len(input_ids)
    if not is_test:
        return input_ids, segment_ids, valid_length, label
    else:
        return input_ids, segment_ids, valid_length
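To make the data flow concrete, here is a minimal, hypothetical usage sketch. The whitespace tokenizer (`str.split`), the `ToyVocab` class, and the BoolQ-style example row are illustrative stand-ins for a real BERT tokenizer, its matching vocabulary, and a record from the SuperGLUE loaders; it also assumes `truncate_seqs_equal` and `concat_sequences` are in scope as above.

# Hypothetical usage sketch; not the real pipeline. Assumes the function above
# plus GluonNLP's truncate_seqs_equal / concat_sequences helpers are in scope.

class ToyVocab:
    """Toy stand-in for a BERT vocabulary: maps a token or a list of tokens to ids."""
    def __init__(self, tokens):
        self._idx = {tok: i for i, tok in enumerate(tokens)}

    def __getitem__(self, tokens):
        if isinstance(tokens, (list, tuple)):
            return [self._idx[t] for t in tokens]
        return self._idx[tokens]

vocab = ToyVocab(['[CLS]', '[SEP]', 'is', 'the', 'sky', 'blue', 'yes', 'it'])

# A BoolQ-style row: (question, passage, label). The boolean label exercises
# the string-conversion fix above.
example = ('is the sky blue', 'yes it is blue', True)

input_ids, segment_ids, valid_length, label = convert_examples_to_features(
    example,
    tokenizer=str.split,        # whitespace tokenizer as a stand-in
    truncate_length=16,
    cls_token='[CLS]',
    sep_token='[SEP]',
    class_labels=['false', 'true'],
    vocab=vocab)

# input_ids covers [CLS] question [SEP] passage [SEP]; label is array([1], dtype=int32)
print(input_ids, segment_ids, valid_length, label)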