Example #1
 def _init_baskets_from_file(self, file):
     dicts = self._file_to_dicts(file)
     dataset_name = os.path.splitext(os.path.basename(file))[0]
     baskets = [
         SampleBasket(raw=tr, id=f"{dataset_name}-{i}") for i, tr in enumerate(dicts)
     ]
     return baskets
Example #2
 def _init_baskets_from_file(self, file):
     dicts = self.file_to_dicts(file)
     dataset_name = file.stem
     baskets = [
         SampleBasket(raw=tr, id=f"{dataset_name}-{i}") for i, tr in enumerate(dicts)
     ]
     return baskets
Example #3
    def dataset_from_dicts(self,
                           dicts,
                           index=0,
                           rest_api_schema=False,
                           return_baskets=False):
        """
        Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a
        list of tensor names. This can be used for inference mode.

        :param dicts: List of dictionaries where each contains the data of one input sample.
        :type dicts: list of dicts
        :return: a PyTorch dataset and a list of tensor names.
        """
        if rest_api_schema:
            id_prefix = "infer"
        else:
            id_prefix = "train"
        # We need to add the index (coming from multiprocessing chunks) to have a unique basket ID
        self.baskets = [
            SampleBasket(raw=tr, id=f"{id_prefix}-{i + index}")
            for i, tr in enumerate(dicts)
        ]
        self._init_samples_in_baskets()
        self._featurize_samples()
        if index == 0:
            self._log_samples(3)
        if return_baskets:
            dataset, tensor_names = self._create_dataset(keep_baskets=True)
            return dataset, tensor_names, self.baskets
        else:
            dataset, tensor_names = self._create_dataset()
            return dataset, tensor_names
Example #4
    def dataset_from_dicts(self,
                           dicts,
                           indices=None,
                           rest_api_schema=False,
                           return_baskets=False,
                           fewer_samples=True):
        """
        Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a
        list of tensor names. This can be used for inference mode.

        :param dicts: List of dictionaries where each contains the data of one input sample.
        :type dicts: list of dicts
        :return: a PyTorch dataset and a list of tensor names.
        """
        if rest_api_schema:
            id_prefix = "infer"
        else:
            id_prefix = "train"
        # We need to add the index (coming from multiprocessing chunks) to have a unique basket ID
        if indices:
            self.baskets = [
                SampleBasket(raw=tr, id_internal=f"{id_prefix}-{index}")
                for (tr, index) in zip(dicts, indices)
            ]
        else:
            self.baskets = [
                SampleBasket(raw=tr, id_internal=f"{id_prefix}-{i}")
                for (i, tr) in enumerate(dicts)
            ]
        self._init_samples_in_baskets(fewer_samples=fewer_samples)
        self._featurize_samples()

        if indices:
            logger.info(f"Currently working on indices: {indices}")

            if 0 in indices:
                self._log_samples(2)
            if 50 in indices:
                self._print_samples(30)
        else:
            self._log_samples(2)
        if return_baskets:
            dataset, tensor_names = self._create_dataset(keep_baskets=True)
            return dataset, tensor_names, self.baskets
        else:
            dataset, tensor_names = self._create_dataset()
            return dataset, tensor_names
Example #5
 def dataset_from_dicts(self, dicts):
     self.baskets = [
         SampleBasket(raw=tr, id="infer - {}".format(i))
         for i, tr in enumerate(dicts)
     ]
     self._init_samples_in_baskets()
     self._featurize_samples()
     dataset, tensor_names = self._create_dataset()
     return dataset, tensor_names
Example #6
    def _dicts_to_baskets(self, dicts, indices):
        # Perform tokenization on documents and questions resulting in a nested list of doc-question pairs
        dicts_tokenized = [self.apply_tokenization(d) for d in dicts]

        baskets = []
        for index, document in zip(indices, dicts_tokenized):
            for q_idx, raw in enumerate(document):
                basket = SampleBasket(raw=raw, id=f"{index}-{q_idx}")
                baskets.append(basket)
        return baskets
Example #7
 def dataset_from_dicts(self, dicts, index=None, from_inference=False):
     if from_inference:
         dicts = [self._convert_inference(x) for x in dicts]
     self.baskets = [
         SampleBasket(raw=tr, id="infer - {}".format(i))
         for i, tr in enumerate(dicts)
     ]
     self._init_samples_in_baskets()
     self._featurize_samples()
     if index == 0:
         self._log_samples(3)
     dataset, tensor_names = self._create_dataset()
     return dataset, tensor_names
Example #8
    def _dicts_to_baskets(self, dicts, index=None):
        # Perform tokenization on documents and questions resulting in a nested list of doc-question pairs
        dicts_tokenized = [self.apply_tokenization(d) for d in dicts]

        baskets = []
        for d_idx, document in enumerate(dicts_tokenized):
            for q_idx, raw in enumerate(document):
                squad_id_hex = dicts[d_idx]["qas"][q_idx]["id"]
                if squad_id_hex is None:
                    id_1 = d_idx + (index or 0)  # index defaults to None when not chunked via multiprocessing
                    id_2 = q_idx
                else:
                    id_1, id_2 = encode_squad_id(squad_id_hex)
                basket = SampleBasket(raw=raw, id=f"{id_1}-{id_2}")
                baskets.append(basket)
        return baskets
Example #9
 def dataset_from_dicts(self, dicts, index=None, rest_api_schema=False):
     if rest_api_schema:
         dicts = [self._convert_rest_api_dict(x) for x in dicts]
         id_prefix = "infer"
     else:
         id_prefix = "train"
     self.baskets = [
         SampleBasket(raw=tr, id=f"{id_prefix}-{i}")
         for i, tr in enumerate(dicts)
     ]
     self._init_samples_in_baskets()
     self._featurize_samples()
     if index == 0:
         self._log_samples(3)
     dataset, tensor_names = self._create_dataset()
     return dataset, tensor_names
Example #10
    def dataset_from_dicts(self, dicts):
        """
        Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a
        list of tensor names. This is used for inference mode.

        :param dicts: List of dictionaries where each contains the data of one input sample.
        :type dicts: list of dicts
        :return: a PyTorch dataset and a list of tensor names.
        """
        self.baskets = [
            SampleBasket(raw=tr, id="infer - {}".format(i))
            for i, tr in enumerate(dicts)
        ]
        self._init_samples_in_baskets()
        self._featurize_samples()
        dataset, tensor_names = self._create_dataset()
        return dataset, tensor_names
Example #11
    def dataset_from_dicts(self,
                           dicts,
                           index=0,
                           rest_api_schema=False,
                           return_baskets=False):
        if rest_api_schema:
            dicts = [self._convert_rest_api_dict(x) for x in dicts]
        # We need to add the index (coming from multiprocessing chunks) to have a unique numerical basket ID
        self.baskets = [
            SampleBasket(raw=tr, id=(i + index) * 10000)
            for i, tr in enumerate(dicts)
        ]
        self._init_samples_in_baskets_squad()
        self._featurize_samples()
        if index == 0:
            self._log_samples(3)

        if return_baskets:
            dataset, tensor_names = self._create_dataset(keep_baskets=True)
            return dataset, tensor_names, self.baskets
        else:
            dataset, tensor_names = self._create_dataset(keep_baskets=False)
            return dataset, tensor_names
Example #12
def tokenize_batch_question_answering(pre_baskets, tokenizer, indices):
    """
    Tokenizes text data for question answering tasks. Tokenization means splitting words into subwords, depending on the
    tokenizer's vocabulary.

    - We first tokenize all documents in batch mode. (When using FastTokenizers, Rust multithreading can be enabled by TODO add how to enable rust mt)
    - Then we tokenize each question individually
    - We construct dicts with question and corresponding document text + tokens + offsets + ids

    :param pre_baskets: input dicts with QA info #todo change to input objects
    :param tokenizer: tokenizer to be used
    :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets are unique
    :return: baskets, list containing question and corresponding document information
    """
    assert len(indices) == len(pre_baskets)
    assert tokenizer.is_fast, "Processing QA data is only supported with fast tokenizers for now.\n" \
                              "Please load Tokenizers with 'use_fast=True' option."
    baskets = []
    # Tokenize texts in batch mode
    texts = [d["context"] for d in pre_baskets]
    tokenized_docs_batch = tokenizer.batch_encode_plus(
        texts,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        add_special_tokens=False,
        verbose=False)

    # Extract relevant data
    tokenids_batch = tokenized_docs_batch["input_ids"]
    offsets_batch = []
    for o in tokenized_docs_batch["offset_mapping"]:
        offsets_batch.append(np.array([x[0] for x in o]))
    start_of_words_batch = []
    for e in tokenized_docs_batch.encodings:
        start_of_words_batch.append(_get_start_of_word_QA(e.words))

    for i_doc, d in enumerate(pre_baskets):
        document_text = d["context"]
        # Tokenize questions one by one
        for i_q, q in enumerate(d["qas"]):
            question_text = q["question"]
            tokenized_q = tokenizer.encode_plus(
                question_text,
                return_offsets_mapping=True,
                return_special_tokens_mask=True,
                add_special_tokens=False)

            # Extract relevant data
            question_tokenids = tokenized_q["input_ids"]
            question_offsets = [x[0] for x in tokenized_q["offset_mapping"]]
            question_sow = _get_start_of_word_QA(
                tokenized_q.encodings[0].words)

            external_id = q["id"]
            # The internal_id depends on unique ids created for each process before forking
            internal_id = f"{indices[i_doc]}-{i_q}"
            raw = {
                "document_text": document_text,
                "document_tokens": tokenids_batch[i_doc],
                "document_offsets": offsets_batch[i_doc],
                "document_start_of_word": start_of_words_batch[i_doc],
                "question_text": question_text,
                "question_tokens": question_tokenids,
                "question_offsets": question_offsets,
                "question_start_of_word": question_sow,
                "answers": q["answers"],
            }
            # TODO add only during debug mode (need to create debug mode)
            raw["document_tokens_strings"] = tokenized_docs_batch.encodings[
                i_doc].tokens
            raw["question_tokens_strings"] = tokenized_q.encodings[0].tokens

            baskets.append(
                SampleBasket(raw=raw,
                             id_internal=internal_id,
                             id_external=external_id,
                             samples=None))
    return baskets
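The loop in Example #12 spells out the input contract: each pre-basket dict needs a "context" string and a "qas" list whose entries carry "question", "id", and "answers", while indices must supply one unique id per document. A small sketch of a call, assuming a fast Hugging Face tokenizer and that SampleBasket plus the helper functions used above are importable from the surrounding module (the model name and texts are illustrative):

    # Illustrative call; the dict keys mirror what tokenize_batch_question_answering() reads above.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2", use_fast=True)
    pre_baskets = [
        {
            "context": "FARM stands for Framework for Adapting Representation Models.",
            "qas": [
                {"question": "What does FARM stand for?", "id": "q-0", "answers": []},
            ],
        }
    ]
    # One unique index per document; in training these come from multiprocessing chunks.
    baskets = tokenize_batch_question_answering(pre_baskets, tokenizer, indices=[0])
    print(baskets[0].id_internal)  # "0-0"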
Example #13
    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
      self.baskets = []
      self.pre_tokenizer = WhitespaceSplit()

      texts = [x["text"] for x in dicts]
      words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
      words = [[x[0] for x in y] for y in words_and_spans]

      word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

      tokenized_batch = self.tokenizer.batch_encode_plus(
          words,
          return_offsets_mapping=True,
          return_special_tokens_mask=True,
          return_token_type_ids=True,
          return_attention_mask=True,
          truncation=True,
          max_length=self.max_seq_len,
          padding="max_length",
          is_split_into_words=True,
      )

      for i in range(len(dicts)):
          tokenized = tokenized_batch[i]
          d = dicts[i]
          id_external = self._id_from_dict(d)
          if indices:
              id_internal = indices[i]
          else:
              id_internal = i

          input_ids = tokenized.ids
          segment_ids = tokenized.type_ids
          initial_mask = self._get_start_of_word(tokenized.words)
          assert len(initial_mask) == len(input_ids)

          padding_mask = tokenized.attention_mask

          if return_baskets:
              token_to_word_map = tokenized.words
              word_spans = word_spans_batch[i]
              tokenized_dict = {
                  "tokens": tokenized.tokens,
                  "word_spans": word_spans,
                  "token_to_word_map": token_to_word_map,
                  "start_of_word": initial_mask
              }
          else:
              tokenized_dict = {}

          feature_dict = {
              "input_ids": input_ids,
              "padding_mask": padding_mask,
              "segment_ids": segment_ids,
              "initial_mask": initial_mask,
          }

          for task_name, task in self.tasks.items():
              try:
                  label_name = task["label_name"]
                  labels_word = d[label_name]
                  label_list = task["label_list"]
                  label_tensor_name = task["label_tensor_name"]

                  if task["task_type"] == "classification":
                      label_ids = [label_list.index(labels_word)]
                  elif task["task_type"] == "ner":
                      labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                      label_ids = [label_list.index(lt) for lt in labels_token]
              except ValueError:
                  label_ids = None
                  # For NER the offending entries sit in labels_token; for classification the raw label itself failed.
                  if task["task_type"] == "ner":
                      problematic_labels = set(labels_token).difference(set(label_list))
                  else:
                      problematic_labels = {labels_word}
                  print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                        f"\nWe found a problem with labels {str(problematic_labels)}")
              except KeyError:
                  label_ids = None
                  # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                  #                 "\nIf your are running in *inference* mode: Don't worry!"
                  #                 "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
              if label_ids:
                  feature_dict[label_tensor_name] = label_ids

          curr_sample = Sample(id=None,
                                  clear_text=d,
                                  tokenized=tokenized_dict,
                                  features=[feature_dict])
          curr_basket = SampleBasket(id_internal=id_internal,
                                      raw=d,
                                      id_external=id_external,
                                      samples=[curr_sample])
          self.baskets.append(curr_basket)

      # Only log samples when no indices are given or when this chunk contains index 0.
      if not indices or 0 in indices:
          self._log_samples(1)

      dataset, tensor_names = self._create_dataset()
      ret = [dataset, tensor_names, self.problematic_sample_ids]
      if return_baskets:
          ret.append(self.baskets)
      return tuple(ret)
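The method in Example #13 reads a "text" key from every input dict (plus one label key per registered task during training) and always returns the dataset, the tensor names, and self.problematic_sample_ids, appending the baskets only when return_baskets=True. A minimal inference-style sketch, assuming processor is an already configured instance (tokenizer, max_seq_len, and tasks set up) of the class this method belongs to:

    # Hypothetical call against the method in Example #13; processor is assumed to be configured already.
    dicts = [{"text": "Berlin is the capital of Germany."}]

    # Default call: three return values.
    dataset, tensor_names, problematic_ids = processor.dataset_from_dicts(dicts)

    # With return_baskets=True the baskets (holding tokens, word spans and the token-to-word map) come back as well.
    dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
        dicts, return_baskets=True
    )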