def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
    assert os.path.isfile(file_path)
    logger.info("Creating features from dataset file at %s", file_path)
    self.examples = []
    self.keywords = label_mapping["keyword"]
    self.label_eos_id = self.keywords.index(label_mapping["label_eos_token"])
    self.label_bos_id = self.keywords.index(label_mapping["label_bos_token"])
    total, valid = 0, 0
    with open(file_path, encoding="utf-8") as f:
        for line in tqdm(f):
            total += 1
            example = json.loads(line)
            text = example["question"]
            columns = example["columns"] + example["tables"] + example["extra"] + example["negative"][:15]
            columns = [column.lower() for column in columns]
            # column_to_text = example["column_to_text"]
            column_to_text = {}
            for column in columns:
                column_text = column.replace(".", " ").replace("_", " ")
                column_to_text[column] = column_text.lower()
            sql = example["processed_sql"]

            # Concatenate the question and all schema items, separated by sep tokens,
            # and record the sub-token span of each schema item.
            text_tokens = [tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]
            column_spans = []
            start_idx = len(text_tokens)
            for column in columns:
                column_tokens = tokenizer.tokenize(column_to_text[column])
                text_tokens.extend(column_tokens)
                text_tokens.append(tokenizer.sep_token)
                end_idx = start_idx + len(column_tokens)
                column_spans.append((start_idx, end_idx))
                start_idx = end_idx + 1
            input_ids = tokenizer.convert_tokens_to_ids(text_tokens)
            if len(input_ids) > 600:
                continue

            label_ids = []
            try:
                for token in sql.split():
                    token = token.lower()
                    if token in columns:
                        label_ids.append(columns.index(token) + len(self.keywords))
                    else:
                        label_ids.append(self.keywords.index(token))
            except ValueError:
                # Skip examples whose SQL contains tokens that are neither keywords nor schema items.
                continue
            if len(label_ids) > 300:
                continue
            label_ids = [self.label_bos_id] + label_ids + [self.label_eos_id]

            self.examples.append({
                "idx": example["sql_id"],
                "input_ids": input_ids,
                "column_spans": column_spans,
                "label_ids": label_ids,
            })
            valid += 1
    print("Valid Example {}; Invalid Example {}".format(valid, total - valid))
def __init__(
    self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False,
):
    assert os.path.isfile(file_path)
    # MODIFICATION: 2 is the number of special tokens added by the xlm-roberta tokenizer
    block_size = block_size - 2  # tokenizer.num_special_tokens_to_add(pair=False)

    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename),
    )

    # Make sure only the first process in distributed training processes the dataset,
    # and the others will use the cache.
    lock_path = cached_features_file + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            logger.info(
                "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
            )
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                self.examples.append(
                    tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                )
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            start = time.time()
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
            )
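# Usage sketch for the cached LM dataset above. The class name CachedLmTextDataset is a
# hypothetical stand-in (the class declaration is not shown in this file); the xlm-roberta
# tokenizer matches the hard-coded "- 2" special-token adjustment.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
dataset = CachedLmTextDataset(tokenizer, file_path="train.txt", block_size=512)
# Each example holds block_size - 2 content tokens plus the special tokens added by
# build_inputs_with_special_tokens; a second construction reuses the pickled cache.
print(len(dataset.examples))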
def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    # batch_encoding = tokenizer(
    #     [(example.text_a, example.text_b) for example in examples],
    #     max_length=max_length,
    #     padding="max_length",
    #     truncation=True,
    # )

    features = []
    for i in range(len(examples)):
        # inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        inputs = tokenizer.encode_plus(
            text=examples[i].text_a.split(" "),
            text_pair=examples[i].text_b.split(" ") if examples[i].text_b else None,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    labels = [example.label for example in examples]

    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        pad_to_max_length=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:3]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
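# Minimal usage sketch for the sequence-pair converter above, using the standard
# transformers InputExample and a BERT tokenizer. The guids, texts, and integer label ids
# are illustrative assumptions; the function passes example.label through unchanged.
from transformers import AutoTokenizer, InputExample

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
examples = [
    InputExample(guid="train-0", text_a="A man is playing a guitar.",
                 text_b="A person plays an instrument.", label=0),
    InputExample(guid="train-1", text_a="A dog runs in the park.",
                 text_b="The animal is asleep.", label=2),
]
features = convert_examples_to_features(examples, tokenizer, max_length=128)
print(len(features[0].input_ids))  # 128: every example is padded to max_length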
def get_space_aware_token(cls, token: str, tokenizer: PreTrainedTokenizer, **kwargs) -> str:
    """
    Convert the given string to a single word/sentence-piece token.

    We might want to reverse-engineer the tokenizer to analyze the behaviour
    because every single implementation differs dramatically.
    """
    if token in ["[MASK]", "<mask>"]:
        return tokenizer.mask_token

    # This trick is developed by professionals in the laboratory conditions
    # !! DO NOT TRY THIS AT HOME !!
    z, q, w = tokenizer.tokenize("zq w", **kwargs)

    # We expect that `q` should always be joined to the `z`
    join_seq, *tail_q = q.rpartition("q")
    assert tail_q == ["q", ""]

    # Similarly, `w` token should always start with whitespace
    space_seq, *tail_w = w.partition("w")
    assert tail_w == ["w", ""]

    # Only if the token starts with space then we treat it as a separate token
    if not token.startswith(" "):
        return join_seq + token

    *head, token = token.partition(" ")
    assert head == ["", " "]
    return space_seq + token

    # Here we also check whether the first token contains a whitespace
    prefix_space = z == q
def snli_multi_task_convert_examples_to_features(
    examples_1: List[InputExample],
    examples_2: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
):
    if max_length is None:
        max_length = tokenizer.max_len

    labels_1 = [example.label for example in examples_1]
    labels_2 = [example.label for example in examples_2]

    batch_encoding_1 = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples_1],
        max_length=max_length,
        pad_to_max_length=True,
    )
    batch_encoding_2 = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples_2],
        max_length=max_length,
        pad_to_max_length=True,
    )

    features = []
    for i in range(len(examples_1)):
        inputs_1 = {k: batch_encoding_1[k][i] for k in batch_encoding_1}
        inputs_2 = {k: batch_encoding_2[k][i] for k in batch_encoding_2}
        feature = InputFeatures(
            input_ids=inputs_1["input_ids"],
            attention_mask=inputs_1["attention_mask"],
            token_type_ids=inputs_1["token_type_ids"],
            label=labels_1[i],
            input_ids_2=inputs_2["input_ids"],
            attention_mask_2=inputs_2["attention_mask"],
            token_type_ids_2=inputs_2["token_type_ids"],
            label_2=labels_2[i],
        )
        features.append(feature)

    for i, example in enumerate(examples_1[:3]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, col_token: str):
    self.examples = []
    total = 0
    valid = 0
    with open(file_path, encoding="utf-8") as f:
        for line in tqdm(f):
            total += 1
            example = json.loads(line)
            text = example["question"]
            schema = example["table_info"]["header"]

            # Build: [CLS] question [COL] col_1 [COL] col_2 ... [SEP], recording each column's span.
            tokens = [tokenizer.cls_token] + tokenizer.tokenize(text, add_prefix_space=True) + [col_token]
            column_spans = []
            start_idx = len(tokens)
            for column in schema:
                column_tokens = tokenizer.tokenize(column.lower(), add_prefix_space=True)
                tokens.extend(column_tokens)
                column_spans.append((start_idx, start_idx + len(column_tokens)))
                tokens.append(col_token)
                start_idx += len(column_tokens) + 1
            # Change the last col token to the sep token
            tokens[-1] = tokenizer.sep_token
            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            entities = example["entities"]
            column_labels = [0] * len(schema)
            for entity in entities:
                if entity != "limit" and entity != "*":
                    column_labels[schema.index(entity)] = 1

            if len(input_ids) > 600:
                continue
            self.examples.append({
                "input_ids": input_ids,
                "column_spans": column_spans,
                "column_labels": column_labels,
            })
            valid += 1
    print("Total {} and Valid {}".format(total, valid))
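# Sketch of one JSONL record this constructor expects. Field names are taken directly
# from the keys read above; the values are made up for illustration.
example_line = {
    "question": "how many singers are from france ?",
    "table_info": {"header": ["Singer", "Country", "Age"]},
    # Columns referenced by the question; "limit" and "*" entries are skipped above.
    "entities": ["Country"],
}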
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
    assert os.path.isfile(file_path)
    # Here, we do not cache the features, operating under the assumption
    # that we will soon use fast multithreaded tokenizers from the
    # `tokenizers` repo everywhere =)
    logger.info("Creating features from dataset file at %s", file_path)

    with open(file_path, encoding="utf-8") as f:
        lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

    batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
    self.examples = batch_encoding["input_ids"]
def __init__(self, tokenizer: PreTrainedTokenizer, file_path, block_size, local_rank=-1):
    assert os.path.isfile(file_path)
    logger.info("Creating features from dataset file at {}".format(file_path))
    self.examples = []
    total, valid = 0, 0
    add_prefix_space = isinstance(tokenizer, BartTokenizer) or isinstance(tokenizer, RobertaTokenizer)
    with open(file_path, encoding="utf-8") as f:
        for line in tqdm(f):
            total += 1
            example = json.loads(line)
            sql = " ".join(example["sql"].split()).lower()
            text = example["question"].strip().lower()
            text_tokens = (
                [tokenizer.cls_token]
                + tokenizer.tokenize(text, add_prefix_space=add_prefix_space)
                + [tokenizer.sep_token]
            )
            sql_tokens = (
                [tokenizer.cls_token]
                + tokenizer.tokenize(sql, add_prefix_space=add_prefix_space)
                + [tokenizer.sep_token]
            )
            text_token_ids = tokenizer.convert_tokens_to_ids(text_tokens)
            sql_token_ids = tokenizer.convert_tokens_to_ids(sql_tokens)
            if len(text_token_ids) > 800 or len(sql_token_ids) > 800:
                continue
            self.examples.append({
                "text_token_ids": text_token_ids,
                "sql_token_ids": sql_token_ids,
            })
            valid += 1
    logger.info("Total {} examples.".format(total))
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
    assert os.path.isfile(file_path), f"Input file path {file_path} not found"

    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory,
        "cached_lbl_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename),
    )

    if os.path.exists(cached_features_file):
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
    else:
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        max_len = block_size - tokenizer.num_special_tokens_to_add(pair=False)
        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=max_len)
        self.examples = batch_encoding["input_ids"]
        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]

        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def cnlp_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
    token_classify=False,
):
    event_start_ind = tokenizer.convert_tokens_to_ids('<e>')
    event_end_ind = tokenizer.convert_tokens_to_ids('</e>')

    if max_length is None:
        max_length = tokenizer.max_len

    if task is not None:
        processor = cnlp_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = cnlp_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == classification:
            try:
                return label_map[example.label]
            except KeyError:
                logger.error('Error with example %s' % (example.guid))
                raise Exception()
        elif output_mode == "regression":
            return float(example.label)
        elif output_mode == tagging:
            return [label_map[label] for label in example.label]
        elif output_mode == relex:
            return [
                (int(start_token), int(end_token), label_map.get(category, 0))
                for (start_token, end_token, category) in example.label
            ]
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    if examples[0].text_b is None:
        sentences = [example.text_a.split(' ') for example in examples]
    else:
        sentences = [(example.text_a, example.text_b) for example in examples]

    batch_encoding = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        is_split_into_words=True,
    )

    # This code has to solve the problem of properly setting labels for word pieces
    # that do not actually need to be tagged.
    encoded_labels = []
    if output_mode == tagging:
        for sent_ind, sent in enumerate(sentences):
            sent_labels = []

            ## FIXME -- this is stupid and won't work outside the roberta encoding
            label_ind = 0
            for wp_ind, wp in enumerate(batch_encoding[sent_ind].tokens):
                if wp.startswith('Ġ') or wp in special_tokens:
                    sent_labels.append(labels[sent_ind].pop(0))
                else:
                    sent_labels.append(-100)
                # if wp_ind in word_inds:
                #     sent_labels.append(labels[sent_ind][label_ind])
                #     label_ind += 1
                # else:
                #     sent_labels.append(-100)

            encoded_labels.append(np.array(sent_labels))

        labels = encoded_labels
    elif output_mode == relex:
        # start by building a matrix that's N' x N' (word-piece length) with "None" as the default
        # for word pairs, and -100 (mask) as the default if one of word pair is a suffix token
        out_of_bounds = 0
        num_relations = 0
        for sent_ind, sent in enumerate(sentences):
            num_relations += len(labels[sent_ind])
            wpi_to_tokeni = {}
            tokeni_to_wpi = {}
            sent_labels = np.zeros((max_length, max_length)) - 100
            wps = batch_encoding[sent_ind].tokens
            sent_len = len(wps)
            ## FIXME -- this is stupid and won't work outside the roberta encoding
            for wp_ind, wp in enumerate(wps):
                if wp.startswith('Ġ') or wp in special_tokens:
                    key = wp_ind
                    val = len(wpi_to_tokeni)
                    wpi_to_tokeni[key] = val
                    tokeni_to_wpi[val] = key

            # make every label between pairs a 0 to start:
            for wpi in wpi_to_tokeni.keys():
                for wpi2 in wpi_to_tokeni.keys():
                    # leave the diagonals at -100 because you can't have a relation with itself and we
                    # don't want to consider it because it may screw up the learning to have 2 such similar
                    # tokens not involved in a relation.
                    if wpi != wpi2:
                        sent_labels[wpi, wpi2] = 0.0

            for label in labels[sent_ind]:
                if not label[0] in tokeni_to_wpi or not label[1] in tokeni_to_wpi:
                    out_of_bounds += 1
                    continue

                wpi1 = tokeni_to_wpi[label[0]]
                wpi2 = tokeni_to_wpi[label[1]]
                sent_labels[wpi1][wpi2] = label[2]

            encoded_labels.append(sent_labels)

        labels = encoded_labels
        if out_of_bounds > 0:
            logging.warn(
                'During relation processing, there were %d relations (out of %d total relations) where at '
                'least one argument was truncated so the relation could not be trained/predicted.'
                % (out_of_bounds, num_relations))

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        try:
            event_start = inputs['input_ids'].index(event_start_ind)
        except ValueError:
            event_start = -1

        try:
            event_end = inputs['input_ids'].index(event_end_ind)
        except ValueError:
            event_end = len(inputs['input_ids']) - 1

        inputs['event_tokens'] = [0] * len(inputs['input_ids'])
        if event_start >= 0:
            inputs['event_tokens'] = (
                [0] * event_start
                + [1] * (event_end - event_start + 1)
                + [0] * (len(inputs['input_ids']) - event_end - 1)
            )
        else:
            inputs['event_tokens'] = [1] * len(inputs['input_ids'])

        feature = InputFeatures(**inputs, label=[labels[i]])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
def __init__(
    self,
    tokenizer: PreTrainedTokenizer,
    file_path: str,
    block_size: int,
    overwrite_cache=False,
    short_seq_probability=0.1,
    nsp_probability=0.5,
):
    assert os.path.isfile(file_path), f"Input file path {file_path} not found"

    self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
    self.short_seq_probability = short_seq_probability
    self.nsp_probability = nsp_probability

    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory,
        "cached_nsp_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename),
    )

    self.tokenizer = tokenizer

    lock_path = cached_features_file + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            logger.info(
                "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
            )
        else:
            logger.info(f"Creating features from dataset file at {directory}")

            # Each document is a list of tokenized sentences; blank lines separate documents.
            self.documents = [[]]
            with open(file_path, encoding="utf-8") as f:
                while True:
                    line = f.readline()
                    if not line:
                        break
                    line = line.strip()

                    # A blank line marks the end of the current document.
                    if not line and len(self.documents[-1]) != 0:
                        self.documents.append([])
                    tokens = tokenizer.tokenize(line)
                    tokens = tokenizer.convert_tokens_to_ids(tokens)
                    if tokens:
                        self.documents[-1].append(tokens)

            logger.info(f"Creating examples from {len(self.documents)} documents.")
            self.examples = []
            for doc_index, document in enumerate(self.documents):
                self.create_examples_from_document(document, doc_index)

            start = time.time()
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
            )
def cnlp_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
    token_classify=False,
    inference=False,
    hierarchical=False,
    chunk_len: int = -1,
    num_chunks: int = -1,
    cls_id: int = -1,
    sep_id: int = -1,
    pad_id: int = -1,
    insert_empty_chunk_at_beginning: bool = False,
    truncate_examples: bool = False,
) -> Union[List[InputFeatures], List[HierarchicalInputFeatures]]:
    event_start_ind = tokenizer.convert_tokens_to_ids('<e>')
    event_end_ind = tokenizer.convert_tokens_to_ids('</e>')

    if max_length is None:
        max_length = tokenizer.max_len

    if task is not None:
        processor = cnlp_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = cnlp_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            # give it a random label, if we didn't specify a label with the data we won't be comparing it.
            # return list(label_map.values())[0]
            return None
        if output_mode == classification:
            try:
                return label_map[example.label]
            except KeyError:
                logger.error('Error with example %s' % (example.guid))
                raise Exception()
        elif output_mode == "regression":
            return float(example.label)
        elif output_mode == tagging:
            return [label_map[label] for label in example.label]
        elif output_mode == relex:
            return [
                (int(start_token), int(end_token), label_map.get(category, 0))
                for (start_token, end_token, category) in example.label
            ]
        elif output_mode == mtl:
            return [label_map[x] for x in example.label]
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    if examples[0].text_b is None:
        sentences = [example.text_a.split(' ') for example in examples]
    else:
        sentences = [(example.text_a, example.text_b) for example in examples]

    batch_encoding = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        is_split_into_words=True,
    )

    roberta_based = tokenizer.cls_token == '<s>'
    if not roberta_based:
        assert tokenizer.cls_token == '[CLS]', (
            'This tokenizer does not seem to be based on BERT or Roberta -- '
            'this will cause errors with the dataset encoding.'
        )

    # This code has to solve the problem of properly setting labels for word pieces
    # that do not actually need to be tagged.
    if not inference:
        encoded_labels = []
        if output_mode == tagging:
            for sent_ind, sent in enumerate(sentences):
                sent_labels = []

                ## align word-piece tokens to the tokenization we got as input and only assign labels to input tokens
                word_ids = batch_encoding.word_ids(batch_index=sent_ind)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                    # ignored in the loss function.
                    if word_idx is None:
                        label_ids.append(-100)
                    # We set the label for the first token of each word.
                    elif word_idx != previous_word_idx:
                        label_ids.append(labels[sent_ind][word_idx])
                    # For the other tokens in a word, we set the label to either the current label or -100,
                    # depending on the label_all_tokens flag.
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx

                encoded_labels.append(np.array(label_ids))

            labels = encoded_labels
        elif output_mode == relex:
            # start by building a matrix that's N' x N' (word-piece length) with "None" as the default
            # for word pairs, and -100 (mask) as the default if one of word pair is a suffix token
            out_of_bounds = 0
            num_relations = 0
            for sent_ind, sent in enumerate(sentences):
                word_ids = batch_encoding.word_ids(batch_index=sent_ind)
                num_relations += len(labels[sent_ind])
                wpi_to_tokeni = {}
                tokeni_to_wpi = {}
                sent_labels = np.zeros((max_length, max_length)) - 100

                ## align word-piece tokens to the tokenization we got as input and only assign labels to input tokens
                previous_word_idx = None
                for word_pos_idx, word_idx in enumerate(word_ids):
                    if word_idx != previous_word_idx and word_idx is not None:
                        key = word_pos_idx
                        val = len(wpi_to_tokeni)
                        wpi_to_tokeni[key] = val
                        tokeni_to_wpi[val] = key
                    previous_word_idx = word_idx

                # make every label between pairs a 0 to start:
                for wpi in wpi_to_tokeni.keys():
                    for wpi2 in wpi_to_tokeni.keys():
                        # leave the diagonals at -100 because you can't have a relation with itself and we
                        # don't want to consider it because it may screw up the learning to have 2 such similar
                        # tokens not involved in a relation.
                        if wpi != wpi2:
                            sent_labels[wpi, wpi2] = 0.0

                for label in labels[sent_ind]:
                    if not label[0] in tokeni_to_wpi or not label[1] in tokeni_to_wpi:
                        out_of_bounds += 1
                        continue

                    wpi1 = tokeni_to_wpi[label[0]]
                    wpi2 = tokeni_to_wpi[label[1]]
                    sent_labels[wpi1][wpi2] = label[2]

                encoded_labels.append(sent_labels)

            labels = encoded_labels
            if out_of_bounds > 0:
                logging.warn(
                    'During relation processing, there were %d relations (out of %d total relations) where at '
                    'least one argument was truncated so the relation could not be trained/predicted.'
                    % (out_of_bounds, num_relations))

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        try:
            event_start = inputs['input_ids'].index(event_start_ind)
        except ValueError:
            event_start = -1

        try:
            event_end = inputs['input_ids'].index(event_end_ind)
        except ValueError:
            event_end = len(inputs['input_ids']) - 1

        inputs['event_tokens'] = [0] * len(inputs['input_ids'])
        if event_start >= 0:
            inputs['event_tokens'] = (
                [0] * event_start
                + [1] * (event_end - event_start + 1)
                + [0] * (len(inputs['input_ids']) - event_end - 1)
            )
        else:
            inputs['event_tokens'] = [1] * len(inputs['input_ids'])

        if inference:
            label = None
        else:
            label = [labels[i]]

        feature = InputFeatures(**inputs, label=label)
        if hierarchical:
            feature = cnlp_convert_features_to_hierarchical(
                feature,
                chunk_len=chunk_len,
                num_chunks=num_chunks,
                cls_id=cls_id,
                sep_id=sep_id,
                pad_id=pad_id,
                insert_empty_chunk_at_beginning=insert_empty_chunk_at_beginning,
            )
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % (truncate_features(features[i]) if truncate_examples else features[i]))

    return features
def prepare_prompts(
    cls,
    prompts: PromptTemplateConfig,
    tokenizer: PreTrainedTokenizer,
    default_init: Union[str, bool] = None,
) -> PromptTemplate:
    if not isinstance(default_init, str):
        default_init = None

    prompt_tokens: List[Optional[Token]] = []
    prompt_to_id: Dict[str, int] = dict()
    prompt_to_init: Dict[str, int] = dict()

    for prompt in prompts:
        if prompt is None:
            prompt_tokens.append(None)
            continue

        # We initialize with the default initializer unless stated otherwise
        init_with = default_init

        # If a tuple/list is given, the second element is the custom initializer
        if isinstance(prompt, (tuple, list)):
            prompt, init_with = prompt

        # If an integer is given, we need to convert it into a token
        if isinstance(prompt, int):
            # A negative index counts back from the end of the vocabulary
            if prompt < 0:
                # assert tokenizer.mask_token_id is not None
                # prompt = tokenizer.mask_token_id
                prompt += tokenizer.vocab_size

            assert prompt not in tokenizer.all_special_ids, "Do not hardcode special IDs"

            # Then we convert it into a wordpiece
            prompt = tokenizer.convert_ids_to_tokens(prompt)

        assert isinstance(prompt, str)

        if prompt in ["[MASK]", "<mask>"]:
            prompt = tokenizer.mask_token

        prompt = cls.get_space_aware_token(prompt, tokenizer)
        # if prompt.startswith(' '):
        #     prompt = self.metaspace + prompt[1:]

        # TODO let's make sure t
        prompt_id: int = tokenizer.convert_tokens_to_ids(prompt)
        assert prompt_id != tokenizer.unk_token_id, "Using UNK not implemented yet, may be tricky"

        prompt_tokens.append(
            Token(
                text=prompt,
                text_id=prompt_id,
                type_id=0,
                idx=None,
                idx_end=None,
            )
        )
        prompt_to_id[prompt] = prompt_id

        if init_with is not None:
            # if init_with in ['[MASK]', '<mask>']:
            #     init_with = tokenizer.mask_token
            init_with = cls.get_space_aware_token(init_with, tokenizer)
            prompt_to_init[prompt] = tokenizer.convert_tokens_to_ids(init_with)

    return PromptTemplate(prompt_tokens, prompt_to_id, prompt_to_init)
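# Hypothetical configuration sketch for prepare_prompts, illustrating the entry types the
# loop above accepts. The factory class name (PromptFactory), the tokenizer choice, and the
# concrete values are assumptions for illustration, not part of the original code.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
prompts = [
    "<mask>",          # normalized to tokenizer.mask_token
    (" is", " was"),   # prompt token " is", initialized from the embedding of " was"
    1000,              # a raw, non-special vocabulary id, converted back to a wordpiece
    None,              # a gap: no prompt token at this position
]
template = PromptFactory.prepare_prompts(prompts, tokenizer, default_init=" the")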
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_seq_length: int,
    tokenizer: PreTrainedTokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """Loads a data file into a list of `InputFeatures`

    `cls_token_at_end` defines the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """
    # TODO clean up all this to leverage built-in features of tokenizers
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)

            # bert-base-multilingual-cased sometimes outputs nothing ([]) when calling tokenize with just a space.
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = tokenizer.num_special_tokens_to_add()
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

        if "token_type_ids" not in tokenizer.model_input_names:
            segment_ids = None

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids,
                label_ids=label_ids,
            )
        )

    return features
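# Minimal usage sketch for the token-classification converter above, with a BERT tokenizer.
# NerExample is a hypothetical stand-in for the NER-style example class assumed by the code
# (it only needs guid, words, and labels fields); all values are illustrative.
from dataclasses import dataclass
from typing import List

from transformers import AutoTokenizer


@dataclass
class NerExample:
    guid: str
    words: List[str]
    labels: List[str]


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
examples = [NerExample(guid="dev-0", words=["Paris", "is", "in", "France"],
                       labels=["B-LOC", "O", "O", "B-LOC"])]
features = convert_examples_to_features(
    examples,
    label_list=["O", "B-LOC", "I-LOC"],
    max_seq_length=32,
    tokenizer=tokenizer,
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=0,  # 0 for BERT; 2 would be used for XLNet
    sep_token=tokenizer.sep_token,
    pad_token=tokenizer.pad_token_id,
)
# Sub-word pieces after the first piece of each word, plus special and padding positions,
# all receive the ignore index -100.
print(features[0].label_ids[:8])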
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
    self.examples = []
    self.keywords = label_mapping["keyword"]
    self.label_eos_id = self.keywords.index(label_mapping["label_eos_token"])
    self.label_bos_id = self.keywords.index(label_mapping["label_bos_token"])
    add_prefix_space = isinstance(tokenizer, BartTokenizer) or isinstance(tokenizer, RobertaTokenizer)
    total, valid = 0, 0
    with open(file_path, encoding="utf-8") as f:
        for line in tqdm(f):
            total += 1
            example = json.loads(line)
            text = example["normalized_question"]
            columns = example["columns"]
            tables = example["tables"]
            columns_text = example["column_text"]
            tables_text = example["table_text"]
            sql = example["sql"]

            # We need the adjusted token index info: map each question-token index to the
            # start index of its sub-tokens. This is for adjusting the sc_link and cv_link.
            token_idx_to_sub_token_start_idx = {}
            text_tokens = [tokenizer.cls_token]
            start_idx = 0
            for idx, token in enumerate(text.split()):
                sub_tokens = tokenizer.tokenize(token, add_prefix_space=add_prefix_space)
                token_idx_to_sub_token_start_idx[idx] = start_idx
                text_tokens.extend(sub_tokens)
                start_idx += len(sub_tokens)
            text_tokens.append(tokenizer.sep_token)
            question_start, question_end = 1, len(text_tokens) - 1  # exclusive

            column_spans = []
            start_idx = len(text_tokens)
            for column_tokens in columns_text:
                column_str = " ".join(column_tokens)
                column_tokens = tokenizer.tokenize(column_str, add_prefix_space=add_prefix_space)
                text_tokens.extend(column_tokens)
                text_tokens.append(tokenizer.sep_token)
                end_idx = start_idx + len(column_tokens)
                column_spans.append((start_idx, end_idx))
                start_idx = end_idx + 1
            column_start = [column_span[0] for column_span in column_spans]
            column_end = [column_span[1] for column_span in column_spans]

            table_spans = []
            start_idx = len(text_tokens)
            for table_tokens in tables_text:
                table_str = " ".join(table_tokens)
                table_tokens = tokenizer.tokenize(table_str, add_prefix_space=add_prefix_space)
                text_tokens.extend(table_tokens)
                text_tokens.append(tokenizer.sep_token)
                end_idx = start_idx + len(table_tokens)
                table_spans.append((start_idx, end_idx))
                start_idx = end_idx + 1
            table_start = [table_span[0] for table_span in table_spans]
            table_end = [table_span[1] for table_span in table_spans]

            input_ids = tokenizer.convert_tokens_to_ids(text_tokens)
            if len(input_ids) > block_size:
                continue

            label_ids = []
            try:
                for token in sql.split():
                    if token in columns:
                        label_ids.append(columns.index(token) + len(self.keywords))
                    else:
                        label_ids.append(self.keywords.index(token))
            except ValueError:
                # Skip examples whose SQL contains tokens that are neither keywords nor schema items.
                continue
            label_ids = [self.label_bos_id] + label_ids + [self.label_eos_id]

            primary_key = [int(x) for x in example["sc_struct"]["primary_key"]]
            foreign_key = {x.split(",")[0]: int(x.split(",")[1]) for x in example["sc_struct"]["foreign_key"]}
            column_to_table = {"0": None}

            # Re-key the schema-linking and cell-value-linking dicts so that question-token
            # indices refer to sub-token start positions.
            sc_link = {"q_col_match": {}, "q_tab_match": {}}
            for k, v in example["sc_link"]["q_col_match"].items():
                new_k = str(token_idx_to_sub_token_start_idx[int(k.split(",")[0])]) + "," + k.split(",")[1]
                sc_link["q_col_match"][new_k] = v
            for k, v in example["sc_link"]["q_tab_match"].items():
                new_k = str(token_idx_to_sub_token_start_idx[int(k.split(",")[0])]) + "," + k.split(",")[1]
                sc_link["q_tab_match"][new_k] = v

            cv_link = {"num_date_match": {}, "cell_match": {}}
            for k, v in example["cv_link"]["num_date_match"].items():
                new_k = str(token_idx_to_sub_token_start_idx[int(k.split(",")[0])]) + "," + k.split(",")[1]
                cv_link["num_date_match"][new_k] = v
            for k, v in example["cv_link"]["cell_match"].items():
                new_k = str(token_idx_to_sub_token_start_idx[int(k.split(",")[0])]) + "," + k.split(",")[1]
                cv_link["cell_match"][new_k] = v

            for idx, column in enumerate(columns):
                if column == "*":
                    continue
                t = column.split(".")[0]
                column_to_table[str(idx)] = tables.index(t)

            foreign_keys_tables = {}
            for k, v in foreign_key.items():
                t_k = str(column_to_table[str(k)])
                t_v = str(column_to_table[str(v)])
                if t_k not in foreign_keys_tables:
                    foreign_keys_tables[t_k] = []
                if int(t_v) not in foreign_keys_tables[t_k]:
                    foreign_keys_tables[t_k].append(int(t_v))

            self.examples.append({
                "input_ids": input_ids,
                "example_info": {
                    "normalized_question": text,
                    "columns": columns,
                    "tables": tables,
                    "tokens": text_tokens,
                    "question_start": question_start,
                    "question_end": question_end,
                    "column_start": torch.LongTensor(column_start),
                    "column_end": torch.LongTensor(column_end),
                    "table_start": torch.LongTensor(table_start),
                    "table_end": torch.LongTensor(table_end),
                    "sc_link": sc_link,
                    "cv_link": cv_link,
                    "primary_keys": primary_key,
                    "foreign_keys": foreign_key,
                    "column_to_table": column_to_table,
                    "foreign_keys_tables": foreign_keys_tables,
                },
                "column_spans": column_spans,
                "label_ids": label_ids,
            })
            valid += 1
    print("Valid Example {}; Invalid Example {}".format(valid, total - valid))