Example #1
  def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
    assert os.path.isfile(file_path)
    logger.info("Creating features from dataset file at %s", file_path)

    self.examples = []
    self.keywords = label_mapping["keyword"]
    self.label_eos_id = self.keywords.index(label_mapping["label_eos_token"])
    self.label_bos_id = self.keywords.index(label_mapping["label_bos_token"])
    total, valid = 0, 0
    with open(file_path, encoding="utf-8") as f:
      for line in tqdm(f):
        total += 1
        example = json.loads(line)
        text = example["question"]
        columns = example["columns"] + example["tables"] + example["extra"] + example["negative"][:15]
        columns = [column.lower() for column in columns]

        # column_to_text = example["column_to_text"]
        column_to_text = {}
        for column in columns:
          column_text = column.replace(".", " ").replace("_", " ")
          column_to_text[column] = column_text.lower()
        sql = example["processed_sql"]
        text_tokens = [tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]
        column_spans = []
        start_idx = len(text_tokens)
        for column in columns:
          column_tokens = tokenizer.tokenize(column_to_text[column])
          text_tokens.extend(column_tokens)
          text_tokens.append(tokenizer.sep_token)
          end_idx = start_idx + len(column_tokens)
          column_spans.append((start_idx, end_idx))
          start_idx = end_idx + 1
        input_ids = tokenizer.convert_tokens_to_ids(text_tokens)

        if len(input_ids) > 600:
          continue

        label_ids = []
        try:
          for token in sql.split():
            token = token.lower()
            if token in columns:
              label_ids.append(columns.index(token) + len(self.keywords))
            else:
              label_ids.append(self.keywords.index(token))
        except ValueError:  # SQL token not in keywords/columns; skip this example
          continue
        if len(label_ids) > 300:
          continue
        label_ids = [self.label_bos_id] + label_ids + [self.label_eos_id]

        self.examples.append({
          "idx": example["sql_id"],
          "input_ids": input_ids,
          "column_spans": column_spans,
          "label_ids": label_ids})
        valid += 1
    print("Valid Example {}; Invalid Example {}".format(valid, total-valid))
Example #2
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 file_path,
                 block_size,
                 local_rank=-1):
        assert os.path.isfile(file_path)
        logger.info(
            "Creating features from dataset file at {}".format(file_path))

        self.examples = []
        total, valid = 0, 0
        add_prefix_space = isinstance(tokenizer, (BartTokenizer, RobertaTokenizer))
        with open(file_path, encoding="utf-8") as f:
            for line in tqdm(f):
                total += 1
                example = json.loads(line)

                sql = " ".join(example["sql"].split()).lower()
                text = example["question"].strip().lower()

                text_tokens = [tokenizer.cls_token] + tokenizer.tokenize(
                    text,
                    add_prefix_space=add_prefix_space) + [tokenizer.sep_token]
                sql_tokens = [tokenizer.cls_token] + tokenizer.tokenize(
                    sql,
                    add_prefix_space=add_prefix_space) + [tokenizer.sep_token]

                text_token_ids = tokenizer.convert_tokens_to_ids(text_tokens)
                sql_token_ids = tokenizer.convert_tokens_to_ids(sql_tokens)
                if len(text_token_ids) > 800 or len(sql_token_ids) > 800:
                    continue

                self.examples.append({
                    "text_token_ids": text_token_ids,
                    "sql_token_ids": sql_token_ids
                })
        logger.info("Total {} examples.".format(total))
Example #3
    def __init__(
        self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False,
    ):
        assert os.path.isfile(file_path)

        # MODIFICATION: 2 is the value for the xlm-roberta tokenizer
        block_size = block_size - 2  # tokenizer.num_special_tokens_to_add(pair=False)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

            else:
                logger.info(f"Creating features from dataset file at {directory}")

                self.examples = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                    )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
Example #4
 def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str,
              col_token: str):
     self.examples = []
     total = 0
     valid = 0
     with open(file_path, encoding="utf-8") as f:
         for line in tqdm(f):
             total += 1
             example = json.loads(line)
             text = example["question"]
             schema = example["table_info"]["header"]
             tokens = [tokenizer.cls_token] + tokenizer.tokenize(
                 text, add_prefix_space=True) + [col_token]
             column_spans = []
             start_idx = len(tokens)
             for column in schema:
                 column_tokens = tokenizer.tokenize(column.lower(),
                                                    add_prefix_space=True)
                 tokens.extend(column_tokens)
                 column_spans.append(
                     (start_idx, start_idx + len(column_tokens)))
                 tokens.append(col_token)
                 start_idx += len(column_tokens) + 1
             # Change last col token to sep token
             tokens[-1] = tokenizer.sep_token
             input_ids = tokenizer.convert_tokens_to_ids(tokens)
             entities = example["entities"]
             column_labels = [0] * len(schema)
             for entity in entities:
                 if entity != "limit" and entity != "*":
                     column_labels[schema.index(entity)] = 1
             if len(input_ids) > 600:
                 continue
             self.examples.append({
                 "input_ids": input_ids,
                 "column_spans": column_spans,
                 "column_labels": column_labels
             })
             valid += 1
             # Create input
     print("Total {} and Valid {}".format(total, valid))
Example #5
def cnlp_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
    token_classify=False,
):
    event_start_ind = tokenizer.convert_tokens_to_ids('<e>')
    event_end_ind = tokenizer.convert_tokens_to_ids('</e>')

    if max_length is None:
        max_length = tokenizer.max_len

    if task is not None:
        processor = cnlp_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = cnlp_output_modes[task]
            logger.info("Using output mode %s for task %s" %
                        (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == classification:
            try:
                return label_map[example.label]
            except KeyError:  # label missing from label_map
                logger.error('Error with example %s' % (example.guid))
                raise Exception()

        elif output_mode == "regression":
            return float(example.label)
        elif output_mode == tagging:
            return [label_map[label] for label in example.label]
        elif output_mode == relex:
            return [(int(start_token), int(end_token),
                     label_map.get(category, 0))
                    for (start_token, end_token, category) in example.label]

        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    if examples[0].text_b is None:
        sentences = [example.text_a.split(' ') for example in examples]
    else:
        sentences = [(example.text_a, example.text_b) for example in examples]

    batch_encoding = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        is_split_into_words=True,
    )

    # This code has to solve the problem of properly setting labels for word pieces that do not actually need to be tagged.
    encoded_labels = []
    if output_mode == tagging:
        for sent_ind, sent in enumerate(sentences):
            sent_labels = []

            ## FIXME -- this is stupid and won't work outside the roberta encoding
            label_ind = 0
            for wp_ind, wp in enumerate(batch_encoding[sent_ind].tokens):
                if wp.startswith('Ġ') or wp in special_tokens:
                    sent_labels.append(labels[sent_ind].pop(0))
                else:
                    sent_labels.append(-100)
                # if wp_ind in word_inds:
                #     sent_labels.append(labels[sent_ind][label_ind])
                #     label_ind += 1
                # else:
                #     sent_labels.append(-100)

            encoded_labels.append(np.array(sent_labels))

        labels = encoded_labels
    elif output_mode == relex:
        # start by building a matrix that's N' x N' (word-piece length) with "None" as the default
        # for word pairs, and -100 (mask) as the default if one of word pair is a suffix token
        out_of_bounds = 0
        num_relations = 0
        for sent_ind, sent in enumerate(sentences):
            num_relations += len(labels[sent_ind])
            wpi_to_tokeni = {}
            tokeni_to_wpi = {}
            sent_labels = np.zeros((max_length, max_length)) - 100
            wps = batch_encoding[sent_ind].tokens
            sent_len = len(wps)
            ## FIXME -- this is stupid and won't work outside the roberta encoding
            for wp_ind, wp in enumerate(wps):
                if wp.startswith('Ġ') or wp in special_tokens:
                    key = wp_ind
                    val = len(wpi_to_tokeni)

                    wpi_to_tokeni[key] = val
                    tokeni_to_wpi[val] = key

            # make every label between pairs a 0 to start:
            for wpi in wpi_to_tokeni.keys():
                for wpi2 in wpi_to_tokeni.keys():
                    # leave the diagonals at -100 because you can't have a relation with itself and we
                    # don't want to consider it because it may screw up the learning to have 2 such similar
                    # tokens not involved in a relation.
                    if wpi != wpi2:
                        sent_labels[wpi, wpi2] = 0.0

            for label in labels[sent_ind]:
                if label[0] not in tokeni_to_wpi or label[1] not in tokeni_to_wpi:
                    out_of_bounds += 1
                    continue

                wpi1 = tokeni_to_wpi[label[0]]
                wpi2 = tokeni_to_wpi[label[1]]

                sent_labels[wpi1][wpi2] = label[2]

            encoded_labels.append(sent_labels)
        labels = encoded_labels
        if out_of_bounds > 0:
            logger.warning(
                'During relation processing, there were %d relations (out of %d total relations) where at least one argument was truncated so the relation could not be trained/predicted.'
                % (out_of_bounds, num_relations))

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        try:
            event_start = inputs['input_ids'].index(event_start_ind)
        except ValueError:  # no <e> marker in this input
            event_start = -1

        try:
            event_end = inputs['input_ids'].index(event_end_ind)
        except ValueError:  # no </e> marker; fall back to the last token
            event_end = len(inputs['input_ids']) - 1

        inputs['event_tokens'] = [0] * len(inputs['input_ids'])
        if event_start >= 0:
            inputs['event_tokens'] = ([0] * event_start + [1] * (event_end - event_start + 1) +
                                      [0] * (len(inputs['input_ids']) - event_end - 1))
        else:
            inputs['event_tokens'] = [1] * len(inputs['input_ids'])

        feature = InputFeatures(**inputs, label=[labels[i]])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features
Example #6
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        short_seq_probability=0.1,
        nsp_probability=0.5,
    ):

        assert os.path.isfile(
            file_path), f"Input file path {file_path} not found"

        self.block_size = block_size - tokenizer.num_special_tokens_to_add(
            pair=True)
        self.short_seq_probability = short_seq_probability
        self.nsp_probability = nsp_probability

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            "cached_nsp_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        self.tokenizer = tokenizer

        lock_path = cached_features_file + ".lock"

        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    "Loading features from cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
            else:
                logger.info(
                    f"Creating features from dataset file at {directory}")
                self.documents = [[]]
                with open(file_path, encoding="utf-8") as f:
                    while True:
                        line = f.readline()
                        if not line:
                            break
                        line = line.strip()
                        if not line and len(self.documents[-1]) != 0:
                            self.documents.append([])
                        tokens = tokenizer.tokenize(line)
                        tokens = tokenizer.convert_tokens_to_ids(tokens)
                        if tokens:
                            self.documents[-1].append(tokens)

                logger.info(
                    f"Creating examples from {len(self.documents)} documents.")
                self.examples = []
                for doc_index, document in enumerate(self.documents):
                    self.create_examples_from_document(document, doc_index)
                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
Example #7
def cnlp_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
    token_classify=False,
    inference=False,
    hierarchical=False,
    chunk_len: int = -1,
    num_chunks: int = -1,
    cls_id: int = -1,
    sep_id: int = -1,
    pad_id: int = -1,
    insert_empty_chunk_at_beginning: bool = False,
    truncate_examples: bool = False,
) -> Union[List[InputFeatures], List[HierarchicalInputFeatures]]:
    event_start_ind = tokenizer.convert_tokens_to_ids('<e>')
    event_end_ind = tokenizer.convert_tokens_to_ids('</e>')

    if max_length is None:
        max_length = tokenizer.max_len

    if task is not None:
        processor = cnlp_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = cnlp_output_modes[task]
            logger.info("Using output mode %s for task %s" %
                        (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            # give it a random label, if we didn't specify a label with the data we won't be comparing it.
            # return list(label_map.values())[0]
            return None
        if output_mode == classification:
            try:
                return label_map[example.label]
            except KeyError:  # label missing from label_map
                logger.error('Error with example %s' % (example.guid))
                raise Exception()

        elif output_mode == "regression":
            return float(example.label)
        elif output_mode == tagging:
            return [label_map[label] for label in example.label]
        elif output_mode == relex:
            return [(int(start_token), int(end_token),
                     label_map.get(category, 0))
                    for (start_token, end_token, category) in example.label]
        elif output_mode == mtl:
            return [label_map[x] for x in example.label]

        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    if examples[0].text_b is None:
        sentences = [example.text_a.split(' ') for example in examples]
    else:
        sentences = [(example.text_a, example.text_b) for example in examples]

    batch_encoding = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        is_split_into_words=True,
    )

    roberta_based = tokenizer.cls_token == '<s>'
    if not roberta_based:
        assert tokenizer.cls_token == '[CLS]', 'This tokenizer does not seem to be based on BERT or Roberta -- this will cause errors with the dataset encoding.'

    # This code has to solve the problem of properly setting labels for word pieces that do not actually need to be tagged.
    if not inference:
        encoded_labels = []
        if output_mode == tagging:
            for sent_ind, sent in enumerate(sentences):
                sent_labels = []

                ## align word-piece tokens to the tokenization we got as input and only assign labels to input tokens
                word_ids = batch_encoding.word_ids(batch_index=sent_ind)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                    # ignored in the loss function.
                    if word_idx is None:
                        label_ids.append(-100)
                    # We set the label for the first token of each word.
                    elif word_idx != previous_word_idx:
                        label_ids.append(labels[sent_ind][word_idx])
                    # For the other tokens in a word, we set the label to either the current label or -100, depending on
                    # the label_all_tokens flag.
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx

                encoded_labels.append(np.array(label_ids))

            labels = encoded_labels
        elif output_mode == relex:
            # start by building a matrix that's N' x N' (word-piece length) with "None" as the default
            # for word pairs, and -100 (mask) as the default if one of word pair is a suffix token
            out_of_bounds = 0
            num_relations = 0
            for sent_ind, sent in enumerate(sentences):
                word_ids = batch_encoding.word_ids(batch_index=sent_ind)
                num_relations += len(labels[sent_ind])
                wpi_to_tokeni = {}
                tokeni_to_wpi = {}
                sent_labels = np.zeros((max_length, max_length)) - 100

                ## align word-piece tokens to the tokenization we got as input and only assign labels to input tokens
                previous_word_idx = None
                for word_pos_idx, word_idx in enumerate(word_ids):
                    if word_idx != previous_word_idx and word_idx is not None:
                        key = word_pos_idx
                        val = len(wpi_to_tokeni)

                        wpi_to_tokeni[key] = val
                        tokeni_to_wpi[val] = key
                    previous_word_idx = word_idx
                # make every label between pairs a 0 to start:
                for wpi in wpi_to_tokeni.keys():
                    for wpi2 in wpi_to_tokeni.keys():
                        # leave the diagonals at -100 because you can't have a relation with itself and we
                        # don't want to consider it because it may screw up the learning to have 2 such similar
                        # tokens not involved in a relation.
                        if wpi != wpi2:
                            sent_labels[wpi, wpi2] = 0.0

                for label in labels[sent_ind]:
                    if label[0] not in tokeni_to_wpi or label[1] not in tokeni_to_wpi:
                        out_of_bounds += 1
                        continue

                    wpi1 = tokeni_to_wpi[label[0]]
                    wpi2 = tokeni_to_wpi[label[1]]

                    sent_labels[wpi1][wpi2] = label[2]

                encoded_labels.append(sent_labels)
            labels = encoded_labels
            if out_of_bounds > 0:
                logger.warning(
                    'During relation processing, there were %d relations (out of %d total relations) where at least one argument was truncated so the relation could not be trained/predicted.'
                    % (out_of_bounds, num_relations))

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        try:
            event_start = inputs['input_ids'].index(event_start_ind)
        except ValueError:  # no <e> marker in this input
            event_start = -1

        try:
            event_end = inputs['input_ids'].index(event_end_ind)
        except ValueError:  # no </e> marker; fall back to the last token
            event_end = len(inputs['input_ids']) - 1

        inputs['event_tokens'] = [0] * len(inputs['input_ids'])
        if event_start >= 0:
            inputs['event_tokens'] = ([0] * event_start + [1] * (event_end - event_start + 1) +
                                      [0] * (len(inputs['input_ids']) - event_end - 1))
        else:
            inputs['event_tokens'] = [1] * len(inputs['input_ids'])

        if inference:
            label = None
        else:
            label = [labels[i]]
        feature = InputFeatures(**inputs, label=label)
        if hierarchical:
            feature = cnlp_convert_features_to_hierarchical(
                feature,
                chunk_len=chunk_len,
                num_chunks=num_chunks,
                cls_id=cls_id,
                sep_id=sep_id,
                pad_id=pad_id,
                insert_empty_chunk_at_beginning=insert_empty_chunk_at_beginning,
            )
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % truncate_features(features[i])
                    if truncate_examples else features[i])

    return features
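The tagging branch above relies on BatchEncoding.word_ids() (available with fast tokenizers) to give only the first word piece of each input word a real label. A minimal sketch of that alignment in isolation, with illustrative inputs:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")  # illustrative checkpoint
words = ["Severe", "headache", "since", "yesterday"]       # illustrative pre-split sentence
word_labels = [1, 2, 0, 0]                                 # illustrative tag ids, one per word

encoding = tokenizer([words], is_split_into_words=True,
                     padding="max_length", truncation=True, max_length=16)

aligned = []
previous_word_idx = None
for word_idx in encoding.word_ids(batch_index=0):
    if word_idx is None:                 # special or padding token
        aligned.append(-100)
    elif word_idx != previous_word_idx:  # first piece of a word keeps the label
        aligned.append(word_labels[word_idx])
    else:                                # continuation pieces are masked out
        aligned.append(-100)
    previous_word_idx = word_idx
# len(aligned) == max_length, with -100 wherever the loss should ignore the position.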
Example #8
    def prepare_prompts(
        cls,
        prompts: PromptTemplateConfig,
        tokenizer: PreTrainedTokenizer,
        default_init: Union[str, bool] = None,
    ) -> PromptTemplate:
        if not isinstance(default_init, str):
            default_init = None

        prompt_tokens: List[Optional[Token]] = []
        prompt_to_id: Dict[str, int] = dict()
        prompt_to_init: Dict[str, int] = dict()
        for prompt in prompts:
            if prompt is None:
                prompt_tokens.append(None)
                continue
            # We initialize with the default initializer unless stated otherwise
            init_with = default_init

            # If a tuple/list is given, the second argument is the custom initializer
            if isinstance(prompt, (tuple, list)):
                prompt, init_with = prompt

            # If an integer is given, we need to convert it into a token
            if isinstance(prompt, int):
                # If the index is -1 we treat it as `[MASK]` token
                if prompt < 0:
                    # assert tokenizer.mask_token_id is not None
                    # prompt = tokenizer.mask_token_id
                    prompt += tokenizer.vocab_size

                assert (prompt not in tokenizer.all_special_ids
                        ), "Do not hardcode special IDs"
                # Then we convert it into a wordpiece
                prompt = tokenizer.convert_ids_to_tokens(prompt)

            assert isinstance(prompt, str)

            if prompt in ["[MASK]", "<mask>"]:
                prompt = tokenizer.mask_token

            prompt = cls.get_space_aware_token(prompt, tokenizer)
            # if prompt.startswith(' '):
            #     prompt = self.metaspace + prompt[1:]

            # TODO let's make sure t
            prompt_id: int = tokenizer.convert_tokens_to_ids(prompt)
            assert (prompt_id != tokenizer.unk_token_id
                    ), "Using UNK not implemented yet, may be tricky"

            prompt_tokens.append(
                Token(
                    text=prompt,
                    text_id=prompt_id,
                    type_id=0,
                    idx=None,
                    idx_end=None,
                ))
            prompt_to_id[prompt] = prompt_id
            if init_with is not None:
                # if init_with in ['[MASK]', '<mask>']:
                #     init_with = tokenizer.mask_token
                init_with = cls.get_space_aware_token(init_with, tokenizer)
                prompt_to_init[prompt] = tokenizer.convert_tokens_to_ids(
                    init_with)

        return PromptTemplate(prompt_tokens, prompt_to_id, prompt_to_init)
Example #9
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_seq_length: int,
    tokenizer: PreTrainedTokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """Loads a data file into a list of `InputFeatures`
    `cls_token_at_end` define the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """
    # TODO clean up all this to leverage built-in features of tokenizers

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)

            # bert-base-multilingual-cased sometimes outputs nothing ([]) when calling tokenize with just a space.
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = tokenizer.num_special_tokens_to_add()
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

        if "token_type_ids" not in tokenizer.model_input_names:
            segment_ids = None

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids
            )
        )
    return features
Example #10
  def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
    self.examples = []
    self.keywords = label_mapping["keyword"]
    self.label_eos_id = self.keywords.index(label_mapping["label_eos_token"])
    self.label_bos_id = self.keywords.index(label_mapping["label_bos_token"])
    add_prefix_space = isinstance(tokenizer, BartTokenizer) or isinstance(tokenizer, RobertaTokenizer)
    total, valid = 0, 0
    with open(file_path, encoding="utf-8") as f:
      for line in tqdm(f):
        total += 1
        example = json.loads(line)
        text = example["normalized_question"]
        columns = example["columns"]
        tables = example["tables"]
        columns_text = example["column_text"]
        tables_text = example["table_text"]
        sql = example["sql"]
        # we need the adjusted token index info.
        token_idx_to_sub_token_start_idx = {}
        text_tokens = [tokenizer.cls_token]
        start_idx = 0 # This is for adjusting the sc_link and cv_link
        for idx, token in enumerate(text.split()):
          sub_tokens = tokenizer.tokenize(token, add_prefix_space=add_prefix_space)
          token_idx_to_sub_token_start_idx[idx] = start_idx
          text_tokens.extend(sub_tokens)
          start_idx += len(sub_tokens)
        text_tokens.append(tokenizer.sep_token)
        question_start, question_end = 1, len(text_tokens) - 1 # exclusive

        column_spans = []
        start_idx = len(text_tokens)
        for column_tokens in columns_text:
          column_str = " ".join(column_tokens)
          column_tokens = tokenizer.tokenize(column_str, add_prefix_space=add_prefix_space)
          text_tokens.extend(column_tokens)
          text_tokens.append(tokenizer.sep_token)
          end_idx = start_idx + len(column_tokens)
          column_spans.append((start_idx, end_idx))
          start_idx = end_idx + 1

        column_start = [column_span[0] for column_span in column_spans]
        column_end = [column_span[1] for column_span in column_spans]

        table_spans = []
        start_idx = len(text_tokens)
        for table_tokens in tables_text:
          table_str = " ".join(table_tokens)
          table_tokens = tokenizer.tokenize(table_str, add_prefix_space=add_prefix_space)
          text_tokens.extend(table_tokens)
          text_tokens.append(tokenizer.sep_token)
          end_idx = start_idx + len(table_tokens)
          table_spans.append((start_idx, end_idx))
          start_idx = end_idx + 1

        table_start = [table_span[0] for table_span in table_spans]
        table_end = [table_span[1] for table_span in table_spans]

        input_ids = tokenizer.convert_tokens_to_ids(text_tokens)

        if len(input_ids) > block_size:
          continue

        label_ids = []
        try:
          for token in sql.split():
            if token in columns:
              label_ids.append(columns.index(token) + len(self.keywords))
            else:
              label_ids.append(self.keywords.index(token))
        except ValueError:  # SQL token not in keywords/columns; skip this example
          continue

        label_ids = [self.label_bos_id] + label_ids + [self.label_eos_id]

        primary_key = [int(x) for x in example["sc_struct"]["primary_key"]]
        foreign_key = {x.split(",")[0]: int(x.split(",")[1]) for x in example["sc_struct"]["foreign_key"]}
        column_to_table = {"0": None}

        sc_link = {"q_col_match": {}, "q_tab_match": {}}
        for k, v in example["sc_link"]["q_col_match"].items():
          new_k = str(token_idx_to_sub_token_start_idx[int(k.split(",")[0])]) + "," + k.split(",")[1]
          sc_link["q_col_match"][new_k] = v

        for k, v in example["sc_link"]["q_tab_match"].items():
          new_k = str(token_idx_to_sub_token_start_idx[int(k.split(",")[0])]) + "," + k.split(",")[1]
          sc_link["q_tab_match"][new_k] = v

        cv_link = {"num_date_match": {}, "cell_match": {}}
        for k, v in example["cv_link"]["num_date_match"].items():
          new_k = str(token_idx_to_sub_token_start_idx[int(k.split(",")[0])]) + "," + k.split(",")[1]
          cv_link["num_date_match"][new_k] = v
        for k, v in example["cv_link"]["cell_match"].items():
          new_k = str(token_idx_to_sub_token_start_idx[int(k.split(",")[0])]) + "," + k.split(",")[1]
          cv_link["cell_match"][new_k] = v


        for idx, column in enumerate(columns):
          if column == "*":
            continue
          t = column.split(".")[0]
          column_to_table[str(idx)] = tables.index(t)

        foreign_keys_tables = {}
        for k, v in foreign_key.items():
          t_k = str(column_to_table[str(k)])
          t_v = str(column_to_table[str(v)])
          if t_k not in foreign_keys_tables:
            foreign_keys_tables[t_k] = []
          if int(t_v) not in foreign_keys_tables[t_k]:
            foreign_keys_tables[t_k].append(int(t_v))

        self.examples.append({
          "input_ids": input_ids,
          "example_info": {
            "normalized_question": text,
            "columns": columns,
            "tables": tables,
            "tokens": text_tokens,
            "question_start": question_start,
            "question_end": question_end,
            "column_start": torch.LongTensor(column_start),
            "column_end": torch.LongTensor(column_end),
            "table_start": torch.LongTensor(table_start),
            "table_end": torch.LongTensor(table_end),
            "sc_link": sc_link,
            "cv_link": cv_link,
            "primary_keys": primary_key,
            "foreign_keys": foreign_key,
            "column_to_table": column_to_table,
            "foreign_keys_tables": foreign_keys_tables
          },
          "column_spans": column_spans,
          "label_ids": label_ids})
        valid += 1
    print("Valid Example {}; Invalid Example {}".format(valid, total - valid))