Example #1
0
        def _make_instance(sentence_tokens, question_tokens, answer_span, idx):
            """Build one AllenNLP ``Instance`` for a span-prediction example.

            Stores human-readable copies of the sentence and question (with
            the first/last boundary tokens stripped), the indexed model
            inputs, the answer-span endpoints, and the example index.
            """
            fields = {
                # Drop boundary tokens ([1:-1]) for the readable versions.
                "raw_sentence": MetadataField(" ".join(sentence_tokens[1:-1])),
                "raw_question": MetadataField(" ".join(question_tokens[1:-1])),
            }

            if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
                # Pair-embedding models consume a single joint sequence.
                joint = model_preprocessing_interface.boundary_token_fn(
                    sentence_tokens, question_tokens
                )
                fields["inputs"] = sentence_to_text_field(joint, indexers)
            else:
                # Otherwise each segment is encoded separately.
                fields["sentence"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(sentence_tokens),
                    indexers,
                )
                fields["question"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(question_tokens),
                    indexers,
                )

            fields["span_start"] = NumericField(
                answer_span[0], label_namespace="span_start_labels"
            )
            fields["span_end"] = NumericField(
                answer_span[1], label_namespace="span_end_labels"
            )
            fields["idx"] = LabelField(idx, label_namespace="idxs", skip_indexing=True)
            return Instance(fields)
Example #2
0
        def _make_instance(example):
            """Convert one preprocessed example dict into an ``Instance``.

            When the model uses pair embeddings, the passage and question are
            joined into one sequence and the answer-span indices are shifted
            by the offset the boundary-token function reports; otherwise the
            two segments are encoded separately and the offset is zero.
            """
            fields = {}

            # Keep raw text around for human inspection of predictions.
            fields["raw_passage"] = MetadataField(" ".join(example["passage"]))
            fields["raw_question"] = MetadataField(" ".join(example["question"]))

            if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
                joint, start_offset, _ = model_preprocessing_interface.boundary_token_fn(
                    example["passage"], example["question"], get_offset=True
                )
                fields["inputs"] = sentence_to_text_field(joint, indexers)
            else:
                # Separate encoders: nothing is prepended to the passage.
                start_offset = 0
                fields["passage"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(example["passage"]),
                    indexers,
                )
                fields["question"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(example["question"]),
                    indexers,
                )

            fields["span_start"] = NumericField(
                example["answer_span"][0] + start_offset,
                label_namespace="span_start_labels",
            )
            fields["span_end"] = NumericField(
                example["answer_span"][1] + start_offset,
                label_namespace="span_end_labels",
            )
            fields["start_offset"] = MetadataField(start_offset)
            fields["passage_str"] = MetadataField(example["passage_str"])
            fields["answer_str"] = MetadataField(example["answer_str"])
            fields["space_processed_token_map"] = MetadataField(
                example["space_processed_token_map"]
            )
            return Instance(fields)
Example #3
0
 def setUp(self):
     """Build a minimal WiC task fixture with one indexed validation instance.

     Creates a temp working dir, a WiC task, a single hand-built ``Instance``
     in ``val_data``, a vocabulary built from that data, and a mocked args
     object with the attributes the trainer reads.
     """
     self.temp_dir = tempfile.mkdtemp()
     self.path = os.path.join(self.temp_dir, "temp_dataset.tsv")
     self.wic = tasks.WiCTask(self.temp_dir, 100, "wic", tokenizer_name="MosesTokenizer")
     indexers = {"bert_cased": SingleIdTokenIndexer("bert-xe-cased")}
     self.wic.val_data = [
         Instance(
             {
                 "sent1_str": MetadataField("Room and board yo."),
                 "sent2_str": MetadataField("He nailed boards"),
                 "idx": LabelField(1, skip_indexing=True),
                 "idx2": NumericField(2),
                 "idx1": NumericField(3),
                 "inputs": self.sentence_to_text_field(
                     ["[CLS]", "Room", "and", "board", "yo", "[SEP]", "He", "nailed", "boards"],
                     indexers,
                 ),
                 # skip_indexing takes a bool; was passed the int 1 (truthy,
                 # so behavior is unchanged).
                 "labels": LabelField(0, skip_indexing=True),
             }
         )
     ]
     self.vocab = vocabulary.Vocabulary.from_instances(self.wic.val_data)
     self.vocab.add_token_to_namespace("True", "wic_tags")
     for data in self.wic.val_data:
         data.index_fields(self.vocab)
     # Mock only the args attributes the code under test reads.
     self.args = mock.Mock()
     self.args.batch_size = 4
     self.args.cuda = -1
     self.args.run_dir = self.temp_dir
     self.args.exp_dir = ""
 def setUp(self):
     """
     Since we're testing write_preds, we need to mock model predictions and the parts
     of the model, arguments, and trainer needed to write to predictions.
     Unlike in update_metrics tests, the actual contents of the examples in val_data
     is not the most important as long as it adheres to the API necessary for examples
     of that task.
     """
     self.temp_dir = tempfile.mkdtemp()
     self.path = os.path.join(self.temp_dir, "temp_dataset.tsv")
     self.stsb = tasks.STSBTask(self.temp_dir, 100, "sts-b", tokenizer_name="MosesTokenizer")
     self.wic = tasks.WiCTask(self.temp_dir, 100, "wic", tokenizer_name="MosesTokenizer")
     # Mocked per-task validation predictions in the shape write_preds expects.
     stsb_val_preds = pd.DataFrame(
         data=[
             {
                 "idx": 0,
                 "labels": 1.00,
                 "preds": 1.00,
                 "sent1_str": "A man with a hard hat is dancing.",
                 "sent2_str": "A man wearing a hard hat is dancing",
             },
             {
                 "idx": 1,
                 "labels": 0.950,
                 "preds": 0.34,
                 "sent1_str": "A young child is riding a horse.",
                 "sent2_str": "A child is riding a horse.",
             },
         ]
     )
     wic_val_preds = pd.DataFrame(
         data=[
             {
                 "idx": 0,
                 "sent1": "Room and board. ",
                 "sent2": "He nailed boards across the windows.",
                 "labels": 0,
                 "preds": 0,
             },
             {
                 "idx": 1,
                 "sent1": "Hook a fish",
                 "sent2": "He hooked a snake accidentally.",
                 "labels": 1,
                 "preds": 1,
             },
         ]
     )
     indexers = {"bert_cased": SingleIdTokenIndexer("bert-xe-cased")}
     # Four hand-built WiC examples; skip_indexing takes a bool — the original
     # passed the int 1 (truthy, so behavior is unchanged).
     self.wic.val_data = [
         Instance(
             {
                 "sent1_str": MetadataField("Room and board."),
                 "sent2_str": MetadataField("He nailed boards"),
                 "idx": LabelField(0, skip_indexing=True),
                 "idx2": NumericField(2),
                 "idx1": NumericField(3),
                 "inputs": self.sentence_to_text_field(
                     [
                         "[CLS]",
                         "Room",
                         "and",
                         "Board",
                         ".",
                         "[SEP]",
                         "He",
                         "nailed",
                         "boards",
                         "[SEP]",
                     ],
                     indexers,
                 ),
                 "labels": LabelField(0, skip_indexing=True),
             }
         ),
         Instance(
             {
                 "sent1_str": MetadataField("C ##ir ##culate a rumor ."),
                 "sent2_str": MetadataField("This letter is being circulated"),
                 "idx": LabelField(1, skip_indexing=True),
                 "idx2": NumericField(2),
                 "idx1": NumericField(3),
                 "inputs": self.sentence_to_text_field(
                     [
                         "[CLS]",
                         "C",
                         "##ir",
                         "##culate",
                         "a",
                         "rumor",
                         "[SEP]",
                         "This",
                         "##let",
                         "##ter",
                         "is",
                         "being",
                         "c",
                         "##ir",
                         "##culated",
                         "[SEP]",
                     ],
                     indexers,
                 ),
                 "labels": LabelField(0, skip_indexing=True),
             }
         ),
         Instance(
             {
                 "sent1_str": MetadataField("Hook a fish'"),
                 "sent2_str": MetadataField("He hooked a snake accidentally"),
                 "idx": LabelField(2, skip_indexing=True),
                 "idx2": NumericField(2),
                 "idx1": NumericField(3),
                 "inputs": self.sentence_to_text_field(
                     [
                         "[CLS]",
                         "Hook",
                         "a",
                         "fish",
                         "[SEP]",
                         "He",
                         "hooked",
                         "a",
                         "snake",
                         "accidentally",
                         "[SEP]",
                     ],
                     indexers,
                 ),
                 "labels": LabelField(1, skip_indexing=True),
             }
         ),
         Instance(
             {
                 "sent1_str": MetadataField("For recreation he wrote poetry."),
                 "sent2_str": MetadataField("Drug abuse is often regarded as recreation ."),
                 "idx": LabelField(3, skip_indexing=True),
                 "idx2": NumericField(2),
                 "idx1": NumericField(3),
                 "inputs": self.sentence_to_text_field(
                     [
                         "[CLS]",
                         "For",
                         "re",
                         "##creation",
                         "he",
                         "wrote",
                         "poetry",
                         "[SEP]",
                         "Drug",
                         "abuse",
                         "is",
                         "often",
                         "re",
                         "##garded",
                         "as",
                         "re",
                         "##creation",
                         "[SEP]",
                     ],
                     indexers,
                 ),
                 "labels": LabelField(1, skip_indexing=True),
             }
         ),
     ]
     self.val_preds = {"sts-b": stsb_val_preds, "wic": wic_val_preds}
     self.vocab = vocabulary.Vocabulary.from_instances(self.wic.val_data)
     self.vocab.add_token_to_namespace("True", "wic_tags")
     for data in self.wic.val_data:
         data.index_fields(self.vocab)
     self.glue_tasks = [self.stsb, self.wic]
     # Mock only the args attributes the code under test reads.
     self.args = mock.Mock()
     self.args.batch_size = 4
     self.args.cuda = -1
     self.args.run_dir = self.temp_dir
     self.args.exp_dir = ""