Example #1
    def _dict_to_samples(self, dictionary, all_dicts=None):
        assert len(
            all_dicts
        ) > 1, "Need at least 2 documents to sample random sentences from"
        doc = dictionary["doc"]
        samples = []

        # create one sample for each sentence in the doc (except for the very last -> "nextSentence" is impossible)
        for idx in range(len(doc) - 1):
            tokenized = {}
            if self.next_sent_pred:
                text_a, text_b, is_next_label = get_sentence_pair(
                    doc, all_dicts, idx)
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": text_b,
                    "nextsentence_label": is_next_label,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer)
                tokenized["text_b"] = tokenize_with_metadata(
                    text_b, self.tokenizer)
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], tokenized["text_b"][
                        seq_name], _ = truncate_sequences(
                            seq_a=tokenized["text_a"][seq_name],
                            seq_b=tokenized["text_b"][seq_name],
                            tokenizer=self.tokenizer,
                            max_seq_len=self.max_seq_len)
                samples.append(
                    Sample(id=None,
                           clear_text=sample_in_clear_text,
                           tokenized=tokenized))
            # if we don't do next sentence prediction, we should feed in a single sentence
            else:
                text_a = doc[idx]
                sample_in_clear_text = {
                    "text_a": text_a,
                    "text_b": None,
                    "nextsentence_label": None,
                }
                # tokenize
                tokenized["text_a"] = tokenize_with_metadata(
                    text_a, self.tokenizer)
                # truncate to max_seq_len
                for seq_name in ["tokens", "offsets", "start_of_word"]:
                    tokenized["text_a"][seq_name], _, _ = truncate_sequences(
                        seq_a=tokenized["text_a"][seq_name],
                        seq_b=None,
                        tokenizer=self.tokenizer,
                        max_seq_len=self.max_seq_len)
                samples.append(
                    Sample(id=None,
                           clear_text=sample_in_clear_text,
                           tokenized=tokenized))
        return samples
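The `len(all_dicts) > 1` assertion only makes sense in light of how `get_sentence_pair` draws its negative pairs. A minimal sketch of that sampling step, assuming the standard BERT next-sentence recipe (the real FARM helper may differ in details such as how the random document is chosen):

import random

def get_sentence_pair_sketch(doc, all_dicts, idx):
    # With probability 0.5 keep the true next sentence (positive pair) ...
    if random.random() < 0.5:
        return doc[idx], doc[idx + 1], True
    # ... otherwise sample a sentence from a *different* document (negative pair),
    # which is why at least two documents are required.
    other_docs = [d["doc"] for d in all_dicts if d["doc"] is not doc]
    return doc[idx], random.choice(random.choice(other_docs)), False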
Example #2
 def _dict_to_samples(self, dictionary: dict, **kwargs) -> List[Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
     if len(tokenized["tokens"]) == 0:
         text = dictionary["text"]
         logger.warning(
             f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}"
         )
         return []
     # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
     for seq_name in tokenized.keys():
         tokenized[seq_name], _, _ = truncate_sequences(
             seq_a=tokenized[seq_name],
             seq_b=None,
             tokenizer=self.tokenizer,
             max_seq_len=self.max_seq_len)
     # Samples don't have labels during Inference mode
     for task_name, task in self.tasks.items():
         if task_name in dictionary:
             label = float(dictionary[task_name])
             scaled_label = (label -
                             task["label_list"][0]) / task["label_list"][1]
             dictionary[task_name] = scaled_label
     if self.features:
         feats_embed = dictionary.pop("features")
         return [
             FeaturesEmbeddingSample(id=None,
                                     clear_text=dictionary,
                                     tokenized=tokenized,
                                     feat_embeds=feats_embed)
         ]
     return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
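The scaling above suggests that, for regression tasks, `task["label_list"]` holds two statistics: an offset at index 0 and a scale at index 1. A quick sanity check of the transform and its inverse with hypothetical values (treating them as mean/scale-style statistics is my assumption, not something stated in the snippet):

mean, scale = 2.5, 1.2            # hypothetical label_list contents
label = 4.0
scaled = (label - mean) / scale   # what _dict_to_samples stores on the sample
restored = scaled * scale + mean  # what a prediction head would presumably apply in reverse
assert abs(restored - label) < 1e-9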
Example #3
 def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
     # Samples don't have labels during Inference mode
     if "label" in dict:
         dict["label"] = float(dict["label"])
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
Example #4
def create_samples_sentence_pairs_using_placeholder(baskets, tokenizer,
                                                    max_seq_len):
    """A modified version of create_samples_sentence_pairs from farm/data_handlers/samples.py which calls a modified version of get_sentence_pair which just fetches a placeholder for the second sentence."""
    # TODO why not just use create_char_mlm_prediction_samples_sentence_pairs? Check if it makes a difference.
    for basket in tqdm(baskets):
        doc = basket.raw["doc"]
        basket.samples = []
        for idx in range(len(doc) - 1):
            id = "%s-%s" % (basket.id, idx)
            text_a, text_b, is_next_label = get_sentence_pair_with_placeholder(
                doc, idx)
            sample_in_clear_text = {
                "text_a": text_a,
                "text_b": text_b,
                "is_next_label": is_next_label,
            }
            tokenized = {}
            tokenized["text_a"] = tokenize_with_metadata(
                text_a, tokenizer, max_seq_len)
            tokenized["text_b"] = tokenize_with_metadata(
                text_b, tokenizer, max_seq_len)
            basket.samples.append(
                Sample(id=id,
                       clear_text=sample_in_clear_text,
                       tokenized=tokenized))
    return baskets
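`get_sentence_pair_with_placeholder` is not shown in this listing; going by the docstring it could be as small as the following guess (the placeholder string and the constant label are assumptions):

def get_sentence_pair_with_placeholder(doc, idx, placeholder="[UNK]"):
    # Hypothetical: return the sentence at idx as text_a, a fixed placeholder
    # as text_b, and a dummy "is next" label, since no real pair is sampled.
    return doc[idx], placeholder, 1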
Example #5
def test_sample_to_features_qa(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    sample_types = ["span", "no_answer"]

    for sample_type in sample_types:
        clear_text = json.load(
            open(f"samples/qa/{sample_type}/clear_text.json"))
        tokenized = json.load(open(f"samples/qa/{sample_type}/tokenized.json"))
        features_gold = json.load(
            open(f"samples/qa/{sample_type}/features.json"))
        max_seq_len = len(features_gold["input_ids"])

        tokenizer = Tokenizer.load(pretrained_model_name_or_path=MODEL,
                                   do_lower_case=False)
        curr_id = "-".join([str(x) for x in features_gold["id"]])

        s = Sample(id=curr_id, clear_text=clear_text, tokenized=tokenized)
        features = sample_to_features_qa(s, tokenizer, max_seq_len,
                                         SP_TOKENS_START, SP_TOKENS_MID,
                                         SP_TOKENS_END)[0]
        features = to_list(features)

        keys = features_gold.keys()
        for k in keys:
            value_gold = features_gold[k]
            value = to_list(features[k])
            assert value == value_gold, f"Mismatch between the {k} features in the {sample_type} test sample."
Example #6
 def _dict_to_samples(self, dictionary: dict, **kwargs) -> List[Sample]:
     # this tokenization also stores offsets, which helps to map our entity tags back to original positions
     tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
     if len(tokenized["tokens"]) == 0:
         text = dictionary["text"]
         logger.warning(
             f"The following text could not be tokenized, likely because it contains a character that the tokenizer does not recognize: {text}"
         )
         return []
     # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
     for seq_name in tokenized.keys():
         tokenized[seq_name], _, _ = truncate_sequences(
             seq_a=tokenized[seq_name],
             seq_b=None,
             tokenizer=self.tokenizer,
             max_seq_len=self.max_seq_len)
     # Samples don't have labels during Inference mode
     for task_name, task in self.tasks.items():
         if task_name in dictionary:
             scaled_dict_labels = []
             for label in dictionary[task_name]:
                 label = float(label)
                 scaled_label = (
                     label - task["label_list"][0]) / task["label_list"][1]
                 scaled_dict_labels.append(scaled_label)
             dictionary[task_name] = scaled_dict_labels
     return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
Example #7
 def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets, which helps to map our entity tags back to original positions
     tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
     # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
     for seq_name in tokenized.keys():
         tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name],
                                                        seq_b=None,
                                                        tokenizer=self.tokenizer,
                                                        max_seq_len=self.max_seq_len)
     return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
Example #8
 def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len)
     # Samples don't have labels during Inference mode
     if "label" in dict:
         label = float(dict["label"])
         scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
         dict["label"] = scaled_label
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
Example #9
 def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
     # truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
     for seq_name in tokenized.keys():
         tokenized[seq_name], _, _ = truncate_sequences(seq_a=tokenized[seq_name], seq_b=None,
                                                        tokenizer=self.tokenizer,
                                                        max_seq_len=self.max_seq_len)
     # Samples don't have labels during Inference mode
     if "label" in dictionary:
         label = float(dictionary["label"])
         scaled_label = (label - self.tasks["regression"]["label_list"][0]) / self.tasks["regression"]["label_list"][1]
         dictionary["label"] = scaled_label
     return [Sample(id=None, clear_text=dictionary, tokenized=tokenized)]
Example #10
 def _dict_to_samples(cls, dict, all_dicts=None):
     """
     Converts a dict with a document to a sample (which will subsequently be featurized). It is used during prediction.
     
     This is a modified version of BertStyleLMProcessor._dict_to_samples from farm/data_handler/processor.py. It has been modified to create samples with just a single text, rather than two, as is the case for a normal BERT model.
     """
     doc = dict["doc"]
     samples = []
     for idx in range(len(doc) - 1):
         tokenized = {}
         tokenized["text_a"] = tokenize_with_metadata(
             doc[idx], cls.tokenizer, cls.max_seq_len)
         samples.append(
             Sample(id=None,
                    clear_text={"doc": doc[idx]},
                    tokenized=tokenized))
     return samples
Example #11
    def parts_to_sample(self, admission_part, discharge_part, label) -> Sample:
        tokenized = {"text_a": admission_part, "text_b": discharge_part}
        sample_in_clear_text = {
            "text_a": admission_part["clear_text"],
            "text_b": discharge_part["clear_text"],
            "nextsentence_label": label,
        }

        # truncate to max_seq_len
        for seq_name in ["tokens", "offsets", "start_of_word"]:
            tokenized["text_a"][seq_name], tokenized["text_b"][
                seq_name], _ = truncate_sequences(
                    seq_a=tokenized["text_a"][seq_name],
                    seq_b=tokenized["text_b"][seq_name],
                    tokenizer=self.tokenizer,
                    max_seq_len=self.max_seq_len)

        return Sample(id=None,
                      clear_text=sample_in_clear_text,
                      tokenized=tokenized)
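For sentence pairs, `truncate_sequences` has to split the token budget between `text_a` and `text_b`. A minimal sketch of the usual longest-first strategy, assuming FARM follows the common convention (the real helper additionally reserves room for special tokens via the tokenizer):

def truncate_pair_longest_first(seq_a, seq_b, max_len):
    # Repeatedly drop the last element of whichever sequence is currently longer
    # until the combined length fits into max_len.
    seq_a, seq_b = list(seq_a), list(seq_b)
    while len(seq_a) + len(seq_b) > max_len:
        if len(seq_a) >= len(seq_b):
            seq_a.pop()
        else:
            seq_b.pop()
    return seq_a, seq_b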
Example #12
 def _dict_to_samples(cls, dict, all_dicts=None):
     doc = dict["doc"]
     samples = []
     for idx in range(len(doc) - 1):
         text_a, text_b, is_next_label = get_sentence_pair(doc, all_dicts, idx)
         sample_in_clear_text = {
             "text_a": text_a,
             "text_b": text_b,
             "is_next_label": is_next_label,
         }
         tokenized = {}
         tokenized["text_a"] = tokenize_with_metadata(
             text_a, cls.tokenizer, cls.max_seq_len
         )
         tokenized["text_b"] = tokenize_with_metadata(
             text_b, cls.tokenizer, cls.max_seq_len
         )
         samples.append(
             Sample(id=None, clear_text=sample_in_clear_text, tokenized=tokenized)
         )
     return samples
Example #13
def create_char_mlm_prediction_samples_sentence_pairs(baskets, tokenizer,
                                                      max_seq_len):
    """A modified version of create_samples_sentence_pairs from farm/data_handlers/samples.py which simply assigns the first text as text_a and the second text as text_b. This only works becauses the docs contain a sentence to be predicted and a placeholder as the second text."""
    for basket in tqdm(baskets):
        doc = basket.raw["doc"]
        basket.samples = []
        id = "%s" % (basket.id)
        text_a = doc[0]
        text_b = doc[1]
        is_next_label = 1
        sample_in_clear_text = {
            "text_a": text_a,
            "text_b": text_b,
            "is_next_label": is_next_label,
        }
        tokenized = {}
        tokenized["text_a"] = tokenize_with_metadata(text_a, tokenizer,
                                                     max_seq_len)
        tokenized["text_b"] = tokenize_with_metadata(text_b, tokenizer,
                                                     max_seq_len)
        basket.samples.append(
            Sample(id=id, clear_text=sample_in_clear_text,
                   tokenized=tokenized))
    return baskets
Example #14
    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
      self.baskets = []
      self.pre_tokenizer = WhitespaceSplit()

      texts = [x["text"] for x in dicts]
      words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
      words = [[x[0] for x in y] for y in words_and_spans]

      word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

      tokenized_batch = self.tokenizer.batch_encode_plus(
          words,
          return_offsets_mapping=True,
          return_special_tokens_mask=True,
          return_token_type_ids=True,
          return_attention_mask=True,
          truncation=True,
          max_length=self.max_seq_len,
          padding="max_length",
          is_split_into_words=True,
      )

      for i in range(len(dicts)):
          tokenized = tokenized_batch[i]
          d = dicts[i]
          id_external = self._id_from_dict(d)
          if indices:
              id_internal = indices[i]
          else:
              id_internal = i

          input_ids = tokenized.ids
          segment_ids = tokenized.type_ids
          initial_mask = self._get_start_of_word(tokenized.words)
          assert len(initial_mask) == len(input_ids)

          padding_mask = tokenized.attention_mask

          if return_baskets:
              token_to_word_map = tokenized.words
              word_spans = word_spans_batch[i]
              tokenized_dict = {
                  "tokens": tokenized.tokens,
                  "word_spans": word_spans,
                  "token_to_word_map": token_to_word_map,
                  "start_of_word": initial_mask
              }
          else:
              tokenized_dict = {}

          feature_dict = {
              "input_ids": input_ids,
              "padding_mask": padding_mask,
              "segment_ids": segment_ids,
              "initial_mask": initial_mask,
          }

          for task_name, task in self.tasks.items():
              try:
                  label_name = task["label_name"]
                  labels_word = d[label_name]
                  label_list = task["label_list"]
                  label_tensor_name = task["label_tensor_name"]

                  if task["task_type"] == "classification":
                      label_ids = [label_list.index(labels_word)]
                  elif task["task_type"] == "ner":
                      labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                      label_ids = [label_list.index(lt) for lt in labels_token]
              except ValueError:
                  label_ids = None
                  problematic_labels = set(labels_token).difference(set(label_list))
                  print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                                  f"\nWe found a problem with labels {str(problematic_labels)}")
              except KeyError:
                  label_ids = None
                  # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                  #                 "\nIf your are running in *inference* mode: Don't worry!"
                  #                 "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
              if label_ids:
                  feature_dict[label_tensor_name] = label_ids

          curr_sample = Sample(id=None,
                               clear_text=d,
                               tokenized=tokenized_dict,
                               features=[feature_dict])
          curr_basket = SampleBasket(id_internal=id_internal,
                                     raw=d,
                                     id_external=id_external,
                                     samples=[curr_sample])
          self.baskets.append(curr_basket)

      if indices and 0 not in indices:
          pass
      else:
          self._log_samples(1)

      dataset, tensor_names = self._create_dataset()
      ret = [dataset, tensor_names, self.problematic_sample_ids]
      if return_baskets:
          ret.append(self.baskets)
      return tuple(ret)
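A hypothetical call of this method for an NER task (the processor setup, the `ner_label` field name and the label list are assumptions and not part of the snippet):

dicts = [{"text": "Angela Merkel visited Paris",
          "ner_label": ["B-PER", "I-PER", "O", "B-LOC"]}]
dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
    dicts, indices=[0], return_baskets=True)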
Example #15
 def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets, which helps to map our entity tags back to original positions
     tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
Example #16
 def _dict_to_samples(cls, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets
     tokenized = tokenize_with_metadata(dict["text"], cls.tokenizer, cls.max_seq_len)
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
Example #17
def create_samples_qa_Natural_Question(dictionary, max_query_len, max_seq_len,
                                       doc_stride, n_special_tokens):
    """
    This method will split question-document pairs from the SampleBasket into question-passage pairs which will
    each form one sample. The "t" and "c" in variables stand for token and character respectively.
    """

    # Initialize some basic variables
    # is_training = check_if_training(dictionary)
    question_tokens = dictionary["question_tokens"][:max_query_len]
    question_len_t = len(question_tokens)
    question_offsets = dictionary["question_offsets"]
    doc_tokens = dictionary["document_tokens"]
    doc_offsets = dictionary["document_offsets"]
    doc_text = dictionary["document_text"]
    doc_start_of_word = dictionary["document_start_of_word"]
    samples = []

    # Calculate the number of tokens that can be reserved for the passage. This is calculated by considering
    # the max_seq_len, the number of tokens in the question and the number of special tokens that will be added
    # when the question and passage are joined (e.g. [CLS] and [SEP])
    passage_len_t = max_seq_len - question_len_t - n_special_tokens

    # Perform chunking of document into passages. The sliding window moves in steps of doc_stride.
    # passage_spans is a list of dictionaries where each defines the start and end of each passage
    # on both token and character level
    passage_spans = chunk_into_passages(doc_offsets, doc_stride, passage_len_t,
                                        doc_text)
    for passage_span in passage_spans:
        # Unpack each variable in the dictionary. The "_t" and "_c" indicate
        # whether the index is on the token or character level
        passage_start_t = passage_span["passage_start_t"]
        passage_end_t = passage_span["passage_end_t"]
        passage_start_c = passage_span["passage_start_c"]
        passage_end_c = passage_span["passage_end_c"]
        passage_id = passage_span["passage_id"]

        # passage_offsets will be relative to the start of the passage (i.e. they will start at 0)
        # TODO: Is passage offsets actually needed? At this point, maybe we only care about token level
        passage_offsets = doc_offsets[passage_start_t:passage_end_t]
        passage_start_of_word = doc_start_of_word[
            passage_start_t:passage_end_t]
        passage_offsets = [x - passage_offsets[0] for x in passage_offsets]
        passage_tokens = doc_tokens[passage_start_t:passage_end_t]
        passage_text = dictionary["document_text"][
            passage_start_c:passage_end_c]

        # Deal with the potentially many answers (e.g. Squad or NQ dev set)
        answers_clear, answers_tokenized = process_answers(
            dictionary["answers"], doc_offsets, passage_start_c,
            passage_start_t)

        clear_text = {
            "passage_text": passage_text,
            "question_text": dictionary["question_text"],
            "passage_id": passage_id,
            "answers": answers_clear
        }
        tokenized = {
            "passage_start_t": passage_start_t,
            "passage_tokens": passage_tokens,
            "passage_offsets": passage_offsets,
            "passage_start_of_word": passage_start_of_word,
            "question_tokens": question_tokens,
            "question_offsets": question_offsets,
            "question_start_of_word": dictionary["question_start_of_word"][:max_query_len],
            "answers": answers_tokenized,
            "document_offsets": doc_offsets,
        }  # So that to_doc_preds can access them
        samples.append(
            Sample(id=passage_id, clear_text=clear_text, tokenized=tokenized))
    return samples
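A minimal sketch of the sliding-window split that `chunk_into_passages` performs, reduced to token indices only (assumption: the real helper also maps each span back to character positions via `doc_offsets`):

def chunk_token_spans(num_doc_tokens, passage_len_t, doc_stride):
    # Slide a window of passage_len_t tokens over the document in steps of doc_stride.
    spans = []
    for passage_id, start_t in enumerate(range(0, num_doc_tokens, doc_stride)):
        end_t = min(start_t + passage_len_t, num_doc_tokens)
        spans.append({"passage_id": passage_id,
                      "passage_start_t": start_t,
                      "passage_end_t": end_t})
        if end_t == num_doc_tokens:
            break
    return spans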
Example #18
 def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:
     # this tokenization also stores offsets, which helps to map our entity tags back to original positions
     words = re.findall(r"<t>(.*?)</t>", dict["text"], flags=0)
     word_one = words[0]
     term_one_idx = -1
     term_two_idx = -1
     term_one_idxs = [m.start() for m in re.finditer(re.escape(word_one), dict["text"])]
     for idx, k in enumerate(term_one_idxs):
         try:
             if dict["text"][k - 3:k] == '<t>':
                 term_one_idx = idx
         except:
             pass
     if len(words) > 1:
         word_two = words[1]
         word_two_tokenized = tokenize_with_metadata(word_two, self.tokenizer, self.max_seq_len)['tokens']
         term_two_idxs = [m.start() for m in re.finditer(re.escape(word_two), dict["text"])]
         for idx, k in enumerate(term_two_idxs):
             try:
                 if dict["text"][k - 3:k] == '<t>':
                     term_two_idx = idx
             except:
                 pass
     dict["text"] = re.sub(r'<t>', '', dict["text"])
     dict["text"] = re.sub(r'</t>', '', dict["text"])
     tokenized = tokenize_with_metadata(dict["text"], self.tokenizer, self.max_seq_len)
     word_one_tokenized = tokenize_with_metadata(word_one, self.tokenizer, self.max_seq_len)['tokens']
     x1, y = [], []
     for token in tokenized['tokens']:
         if token == '[CLS]':
             x1.append(5)
             y.append('[CLS]')
         elif token == '[SEP]':
             x1.append(4)
             y.append('[SEP]')
         else:
             x1.append(0)
             y.append('N')
     idx = find_overlap(word_one_tokenized, tokenized['tokens'], term_one_idx)
     if idx > -1:
         for x in range(0, len(word_one_tokenized)):
             x1[idx + x] = 1
             y[idx + x] = 'Y'
     else:
         print("-1--")
         print(word_one_tokenized)
         print(tokenized['tokens'])
         x1, y = [], []
         for token in tokenized['tokens']:
             if token == '[CLS]':
                 x1.append(5)
                 y.append('[CLS]')
             elif token == '[SEP]':
                 x1.append(4)
                 y.append('[SEP]')
             else:
                 x1.append(0)
                 y.append('N')
     if len(words) > 1:
         idx = find_overlap(word_two_tokenized, tokenized['tokens'], term_two_idx)
         if idx > -1:
             for x in range(0, len(word_two_tokenized)):
                 y[idx + x] = 'Y'
                 x1[idx + x] = 1
         else:
             print("-2--")
             print(word_two_tokenized)
             print(tokenized['tokens'])
             x1, y = [], []
             for token in tokenized['tokens']:
                 if token == '[CLS]':
                     x1.append(5)
                     y.append('[CLS]')
                 elif token == '[SEP]':
                     x1.append(4)
                     y.append('[SEP]')
                 else:
                     x1.append(0)
                     y.append('N')
     tokenized['custom_data'] = x1
     tokenized['ner_label'] = y
     dict['custom_data'] = x1
     dict['ner_label'] = y
     return [Sample(id=None, clear_text=dict, tokenized=tokenized)]
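`find_overlap` is referenced above but not part of the listing; a hypothetical stand-in that matches how it is called (term tokens, the full token list, and which occurrence to match) might look like this:

def find_overlap(term_tokens, all_tokens, occurrence):
    # Return the start index of the `occurrence`-th match of term_tokens
    # inside all_tokens, or -1 if there is no such match.
    n, hits = len(term_tokens), 0
    for i in range(len(all_tokens) - n + 1):
        if all_tokens[i:i + n] == term_tokens:
            if hits == occurrence:
                return i
            hits += 1
    return -1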