Code Example #1
File: input_features.py  Project: voxlogic/FARM
def samples_to_features_ner(
    sample,
    tasks,
    max_seq_len,
    tokenizer,
    non_initial_token="X",
    **kwargs
):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    tokens = sample.tokenized["tokens"]

    if tokenizer.is_fast:
        text = sample.clear_text["text"]
        # Here, we tokenize the sample for the second time to get all relevant ids
        # This should change once we get rid of FARM's tokenize_with_metadata()
        inputs = tokenizer(text,
                           return_token_type_ids=True,
                           truncation=True,
                           truncation_strategy="longest_first",
                           max_length=max_seq_len,
                           return_special_tokens_mask=True)

        if (len(inputs["input_ids"]) - inputs["special_tokens_mask"].count(1)) != len(sample.tokenized["tokens"]):
            logger.error(f"FastTokenizer encoded sample {sample.clear_text['text']} to "
                         f"{len(inputs['input_ids']) - inputs['special_tokens_mask'].count(1)} tokens, which differs "
                         f"from number of tokens produced in tokenize_with_metadata().\n"
                         f"Further processing is likely to be wrong!")
    else:
        inputs = tokenizer.encode_plus(text=tokens,
                                       text_pair=None,
                                       add_special_tokens=True,
                                       truncation=False,
                                       return_special_tokens_mask=True,
                                       return_token_type_ids=True,
                                       is_pretokenized=False
                                       )

    input_ids, segment_ids, special_tokens_mask = inputs["input_ids"], inputs["token_type_ids"], inputs["special_tokens_mask"]

    # We construct a mask to identify the first token of each word. We will later use only these tokens for predicting entities.
    # Special tokens don't count as initial tokens => we add 0 at the positions of special tokens
    # For BERT we add a 0 in the start and end (for CLS and SEP)
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]
    initial_mask = insert_at_special_tokens_pos(initial_mask, special_tokens_mask, insert_element=0)
    assert len(initial_mask) == len(input_ids)

    for task_name, task in tasks.items():
        try:
            label_list = task["label_list"]
            label_name = task["label_name"]
            label_tensor_name = task["label_tensor_name"]
            labels_word = sample.clear_text[label_name]
            labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
            # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
            label_ids = [label_list.index(lt) for lt in labels_token]
        except ValueError:
            label_ids = None
            problematic_labels = set(labels_token).difference(set(label_list))
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           f"\nWe found a problem with labels {str(problematic_labels)}")
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
            logger.warning(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                           "\nIf you are running in *inference* mode: Don't worry!"
                           "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")

        # This mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        padding_mask = [1] * len(input_ids)

        # Padding up to the sequence length.
        # Normal case: adding multiple 0 to the right
        # Special cases:
        # a) xlnet pads on the left and uses  "4" for padding token_type_ids
        if tokenizer.__class__.__name__ == "XLNetTokenizer":
            pad_on_left = True
            segment_ids = pad(segment_ids, max_seq_len, 4, pad_on_left=pad_on_left)
        else:
            pad_on_left = False
            segment_ids = pad(segment_ids, max_seq_len, 0, pad_on_left=pad_on_left)

        input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
        padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
        initial_mask = pad(initial_mask, max_seq_len, 0, pad_on_left=pad_on_left)
        if label_ids:
            label_ids = pad(label_ids, max_seq_len, 0, pad_on_left=pad_on_left)

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
        }

        if label_ids:
            feature_dict[label_tensor_name] = label_ids

    return [feature_dict]
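Example #1 leans on FARM helpers such as pad() and insert_at_special_tokens_pos() that are not shown in the snippet. Below is a minimal sketch of what such helpers could look like, inferred only from how they are called above; these are hypothetical reimplementations, not FARM's actual code.

def pad(seq, max_seq_len, pad_value, pad_on_left=False):
    # Extend seq with pad_value until it reaches max_seq_len entries.
    padding = [pad_value] * (max_seq_len - len(seq))
    return padding + seq if pad_on_left else seq + padding


def insert_at_special_tokens_pos(seq, special_tokens_mask, insert_element):
    # Insert insert_element wherever the tokenizer marked a special token
    # ([CLS], [SEP], ...) so that seq stays aligned with input_ids.
    new_seq = list(seq)
    for pos, is_special in enumerate(special_tokens_mask):
        if is_special:
            new_seq.insert(pos, insert_element)
    return new_seq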
Code Example #2
File: input_features.py  Project: leiframming/FARM
def samples_to_features_ner(sample,
                            label_list,
                            max_seq_len,
                            tokenizer,
                            cls_token="[CLS]",
                            pad_token="[PAD]",
                            sep_token="[SEP]",
                            non_initial_token="X",
                            **kwargs):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.

    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param label_list: A list of all unique labels
    :type label_list: list
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param cls_token: Token used to represent the beginning of the sequence
    :type cls_token: str
    :param pad_token: Token used to represent sequence padding
    :type pad_token: str
    :param sep_token: Token used to represent the border between two sequences
    :type sep_token: str
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """

    # Tokenize words and extend the labels so they are aligned with the tokens
    # words = sample.clear_text["text"].split(" ")
    # tokens, initial_mask = words_to_tokens(words, tokenizer, max_seq_len)

    tokens = sample.tokenized["tokens"]
    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]

    # initial_mask =
    # Add CLS and SEP tokens
    tokens = add_cls_sep(tokens, cls_token, sep_token)
    initial_mask = [0] + initial_mask + [0]  # CLS and SEP don't count as initial tokens
    padding_mask = [1] * len(tokens)

    # Convert input tokens and labels to ids, generate masks
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    if "label" in sample.clear_text:
        labels_word = sample.clear_text["label"]
        labels_token = expand_labels(labels_word, initial_mask,
                                     non_initial_token)
        # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
        label_ids = [label_list.index(lt) for lt in labels_token]
    # Inference mode
    else:
        label_ids = None
    segment_ids = [0] * max_seq_len

    # Pad
    input_ids = pad(input_ids, max_seq_len, 0)
    if label_ids:
        label_ids = pad(label_ids, max_seq_len, 0)
    initial_mask = pad(initial_mask, max_seq_len, 0)
    padding_mask = pad(padding_mask, max_seq_len, 0)

    feature_dict = {
        "input_ids": input_ids,
        "padding_mask": padding_mask,
        "segment_ids": segment_ids,
        "initial_mask": initial_mask,
    }

    if label_ids:
        feature_dict["label_ids"] = label_ids

    return [feature_dict]
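Both variants rely on expand_labels() to align word-level NER labels with sub-word tokens. Below is a minimal sketch of that alignment, assuming the behaviour implied by the calls above (word-initial tokens keep the word's label, every other position receives non_initial_token); the real FARM implementation may differ in its details.

def expand_labels(labels_word, initial_mask, non_initial_token):
    # Walk over the token-level initial_mask and consume one word-level label
    # per word-initial token; every other position gets the placeholder label.
    labels_token = []
    word_idx = 0
    for is_initial in initial_mask:
        if is_initial:
            labels_token.append(labels_word[word_idx])
            word_idx += 1
        else:
            labels_token.append(non_initial_token)
    return labels_token


# Example: words ["John", "runs"] with labels ["B-PER", "O"] tokenized to
# [CLS] "jo" "##hn" "runs" [SEP] give initial_mask [0, 1, 0, 1, 0], so
# expand_labels(["B-PER", "O"], [0, 1, 0, 1, 0], "X") == ["X", "B-PER", "X", "O", "X"]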
Code Example #3
File: input_features.py  Project: echan00/FARM
def samples_to_features_ner(sample,
                            tasks,
                            max_seq_len,
                            tokenizer,
                            cls_token="[CLS]",
                            sep_token="[SEP]",
                            non_initial_token="X",
                            **kwargs):
    """
    Generates a dictionary of features for a given input sample that is to be consumed by an NER model.
    :param sample: Sample object that contains human readable text and label fields from a single NER data sample
    :type sample: Sample
    :param tasks: A dictionary where the keys are the names of the tasks and the values are the details of the task (e.g. label_list, metric, tensor name)
    :type tasks: dict
    :param max_seq_len: Sequences are truncated after this many tokens
    :type max_seq_len: int
    :param tokenizer: A tokenizer object that can turn string sentences into a list of tokens
    :param cls_token: Token used to represent the beginning of the sequence
    :type cls_token: str
    :param sep_token: Token used to represent the border between two sequences
    :type sep_token: str
    :param non_initial_token: Token that is inserted into the label sequence in positions where there is a
                              non-word-initial token. This is done since the default NER performs prediction
                              only on word initial tokens
    :return: A list with one dictionary containing the keys "input_ids", "padding_mask", "segment_ids", "initial_mask"
             (also "label_ids" if not in inference mode). The values are lists containing those features.
    :rtype: list
    """
    # Tokenize words and extend the labels so they are aligned with the tokens
    # words = sample.clear_text["text"].split(" ")
    # tokens, initial_mask = words_to_tokens(words, tokenizer, max_seq_len)
    tokens = sample.tokenized["tokens"]
    custom_data = sample.tokenized["custom_data"]

    initial_mask = [int(x) for x in sample.tokenized["start_of_word"]]
    # initial_mask =
    # Add CLS and SEP tokens
    tokens = add_cls_sep(tokens, cls_token, sep_token)
    custom_data = [5] + custom_data + [4]
    initial_mask = [0] + initial_mask + [0]  # CLS and SEP don't count as initial tokens
    padding_mask = [1] * len(tokens)
    # Convert input tokens and labels to ids, generate masks
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    for task_name, task in tasks.items():
        try:
            label_list = task["label_list"]
            label_name = task["label_name"]
            label_tensor_name = task["label_tensor_name"]
            labels_word = sample.clear_text[label_name]
            labels_token = expand_labels(labels_word, initial_mask,
                                         non_initial_token)
            # labels_token = add_cls_sep(labels_token, cls_token, sep_token)
            #label_ids = [label_list.index(lt) for lt in labels_token]
            label_ids = [
                label_list.index(lt) for lt in sample.tokenized['ner_label']
            ]
        except ValueError:
            label_ids = None
            problematic_labels = set(labels_token).difference(set(label_list))
            logger.warning(
                f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                f"\nWe found a problem with labels {str(problematic_labels)}")
        except KeyError:
            # For inference mode we don't expect labels
            label_ids = None
            logger.warning(
                f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                "\nIf you are running in *inference* mode: Don't worry!"
                "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct."
            )
        if label_ids:  # skip in inference mode, where label_ids is None
            label_ids = [5] + label_ids + [4]
        segment_ids = []
        next_sent = False
        for x in input_ids:
            if x == 102:  # 102 is the [SEP] token id in standard BERT vocabularies
                segment_ids.append(0)
                next_sent = True
            elif next_sent:
                segment_ids.append(1)
            else:
                segment_ids.append(0)
        segment_ids[-1] = 1  # the trailing [SEP] is assigned to the second segment
        # Pad
        input_ids = pad(input_ids, max_seq_len, 0)
        if label_ids:
            label_ids = pad(label_ids, max_seq_len, 0)
        initial_mask = pad(initial_mask, max_seq_len, 0)
        padding_mask = pad(padding_mask, max_seq_len, 0)
        custom_data = pad(custom_data, max_seq_len, 0)
        segment_ids = pad(segment_ids, max_seq_len, 0)
        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
            "custom_data": custom_data,
        }
        if label_ids:
            feature_dict[label_tensor_name] = label_ids
    return [feature_dict]
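The segment-id loop in this variant hard-codes 102, the [SEP] id in standard BERT vocabularies. The same logic, pulled out into a small standalone function with made-up input ids for illustration (not taken from the project):

def build_segment_ids(input_ids, sep_id=102):
    # Tokens up to and including the first [SEP] go to segment 0, everything
    # after it to segment 1 (mirroring the loop above).
    segment_ids = []
    next_sent = False
    for x in input_ids:
        if x == sep_id:
            segment_ids.append(0)
            next_sent = True
        elif next_sent:
            segment_ids.append(1)
        else:
            segment_ids.append(0)
    segment_ids[-1] = 1  # the trailing [SEP] is assigned to segment 1
    return segment_ids


print(build_segment_ids([101, 7592, 2088, 102, 2129, 2024, 2017, 102]))
# -> [0, 0, 0, 0, 1, 1, 1, 1]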
Code Example #4
File: multitask_learning.py  Project: imdiptanu/FARM
    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
      self.baskets = []
      self.pre_tokenizer = WhitespaceSplit()

      texts = [x["text"] for x in dicts]
      words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
      words = [[x[0] for x in y] for y in words_and_spans]

      word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

      tokenized_batch = self.tokenizer.batch_encode_plus(
          words,
          return_offsets_mapping=True,
          return_special_tokens_mask=True,
          return_token_type_ids=True,
          return_attention_mask=True,
          truncation=True,
          max_length=self.max_seq_len,
          padding="max_length",
          is_split_into_words=True,
      )

      for i in range(len(dicts)):
          tokenized = tokenized_batch[i]
          d = dicts[i]
          id_external = self._id_from_dict(d)
          if indices:
              id_internal = indices[i]
          else:
              id_internal = i

          input_ids = tokenized.ids
          segment_ids = tokenized.type_ids
          initial_mask = self._get_start_of_word(tokenized.words)
          assert len(initial_mask) == len(input_ids)

          padding_mask = tokenized.attention_mask

          if return_baskets:
              token_to_word_map = tokenized.words
              word_spans = word_spans_batch[i]
              tokenized_dict = {
                  "tokens": tokenized.tokens,
                  "word_spans": word_spans,
                  "token_to_word_map": token_to_word_map,
                  "start_of_word": initial_mask
              }
          else:
              tokenized_dict = {}

          feature_dict = {
              "input_ids": input_ids,
              "padding_mask": padding_mask,
              "segment_ids": segment_ids,
              "initial_mask": initial_mask,
          }

          for task_name, task in self.tasks.items():
              try:
                  label_name = task["label_name"]
                  labels_word = d[label_name]
                  label_list = task["label_list"]
                  label_tensor_name = task["label_tensor_name"]

                  if task["task_type"] == "classification":
                      label_ids = [label_list.index(labels_word)]
                  elif task["task_type"] == "ner":
                      labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                      label_ids = [label_list.index(lt) for lt in labels_token]
              except ValueError:
                  label_ids = None
                  problematic_labels = set(labels_token).difference(set(label_list))
                  print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                                  f"\nWe found a problem with labels {str(problematic_labels)}")
              except KeyError:
                  label_ids = None
                  # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                  #                 "\nIf your are running in *inference* mode: Don't worry!"
                  #                 "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
              if label_ids:
                  feature_dict[label_tensor_name] = label_ids

          curr_sample = Sample(id=None,
                                  clear_text=d,
                                  tokenized=tokenized_dict,
                                  features=[feature_dict])
          curr_basket = SampleBasket(id_internal=id_internal,
                                      raw=d,
                                      id_external=id_external,
                                      samples=[curr_sample])
          self.baskets.append(curr_basket)

      if not indices or 0 in indices:
          self._log_samples(1)

      dataset, tensor_names = self._create_dataset()
      ret = [dataset, tensor_names, self.problematic_sample_ids]
      if return_baskets:
          ret.append(self.baskets)
      return tuple(ret)
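This processor calls a private helper self._get_start_of_word() on the fast tokenizer's token-to-word map (tokenized.words). A plausible sketch of such a helper, inferred only from its call site; the project's actual implementation may differ.

def get_start_of_word(word_ids):
    # word_ids maps each token to the index of the word it came from, or None
    # for special and padding tokens; only the first token of a word gets a 1.
    start_of_word = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            start_of_word.append(0)   # [CLS], [SEP], padding
        elif word_id != previous_word:
            start_of_word.append(1)   # first sub-word token of a new word
        else:
            start_of_word.append(0)   # continuation sub-word token
        previous_word = word_id
    return start_of_word


# Example: word_ids [None, 0, 0, 1, None] -> [0, 1, 0, 1, 0]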