def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self): tokenizer = cached_transformers.get_tokenizer("bert-base-cased") allennlp_tokenizer = PretrainedTransformerTokenizer( "bert-base-cased", add_special_tokens=False) indexer = PretrainedTransformerIndexer(model_name="bert-base-cased") default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]" tokens = tokenizer.tokenize(default_format) expected_ids = tokenizer.convert_tokens_to_ids(tokens) allennlp_tokens = allennlp_tokenizer.add_special_tokens( allennlp_tokenizer.tokenize("AllenNLP is great!"), allennlp_tokenizer.tokenize("Really it is!"), ) vocab = Vocabulary() indexed = indexer.tokens_to_indices(allennlp_tokens, vocab) assert indexed["token_ids"] == expected_ids
def test_type_ids_when_folding(self): allennlp_tokenizer = PretrainedTransformerTokenizer( "bert-base-uncased", add_special_tokens=False) indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=6) first_string = "How do trees get online?" second_string = "They log in!" tokens = allennlp_tokenizer.add_special_tokens( allennlp_tokenizer.tokenize(first_string), allennlp_tokenizer.tokenize(second_string)) vocab = Vocabulary() indexed = indexer.tokens_to_indices(tokens, vocab) assert min(indexed["type_ids"]) == 0 assert max(indexed["type_ids"]) == 1
def test_as_array_produces_token_sequence_roberta_sentence_pair(self): tokenizer = AutoTokenizer.from_pretrained("roberta-base") allennlp_tokenizer = PretrainedTransformerTokenizer( "roberta-base", add_special_tokens=False) indexer = PretrainedTransformerIndexer(model_name="roberta-base") default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>" tokens = tokenizer.tokenize(default_format) expected_ids = tokenizer.convert_tokens_to_ids(tokens) allennlp_tokens = allennlp_tokenizer.add_special_tokens( allennlp_tokenizer.tokenize("AllenNLP is great!"), allennlp_tokenizer.tokenize("Really it is!"), ) vocab = Vocabulary() indexed = indexer.tokens_to_indices(allennlp_tokens, vocab) assert indexed["token_ids"] == expected_ids
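# The three tests above share one pattern: tokenize each sentence with
# add_special_tokens=False, join the pair with add_special_tokens(), and
# index the result. A minimal, self-contained sketch of that pattern
# (the helper name, model name, and sentences are illustrative):
from allennlp.data import Vocabulary
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

def index_sentence_pair(first: str, second: str, model_name: str = "bert-base-cased"):
    tokenizer = PretrainedTransformerTokenizer(model_name, add_special_tokens=False)
    indexer = PretrainedTransformerIndexer(model_name=model_name)
    # add_special_tokens() wraps the pair in the model's special tokens,
    # e.g. [CLS] first [SEP] second [SEP] for BERT.
    tokens = tokenizer.add_special_tokens(
        tokenizer.tokenize(first), tokenizer.tokenize(second))
    return indexer.tokens_to_indices(tokens, Vocabulary())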
class TransformerSuperGlueRteReader(DatasetReader):
    """
    Dataset reader for the SuperGLUE Recognizing Textual Entailment task, to be used with a
    transformer model such as RoBERTa. The dataset is in the JSON Lines format.

    It will generate `Instances` with the following fields:

     * `tokens`, a `TextField` that contains the concatenation of premise and hypothesis,
     * `label`, a `LabelField` containing the label, if one exists.
     * `metadata`, a `MetadataField` that stores the instance's index in the file, the original
       premise, the original hypothesis, both of these in tokenized form, and the gold label,
       accessible as `metadata['index']`, `metadata['premise']`, `metadata['premise_tokens']`,
       `metadata['hypothesis']`, `metadata['hypothesis_tokens']`, and `metadata['label']`.

    # Parameters

    transformer_model_name : `str`, optional (default=`'roberta-base'`)
        This reader chooses the tokenizer and token indexer according to this setting.
    """

    def __init__(
        self,
        transformer_model_name: str = "roberta-base",
        tokenizer_kwargs: Dict[str, Any] = None,
        **kwargs
    ) -> None:
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(
                transformer_model_name, tokenizer_kwargs=tokenizer_kwargs, max_length=512
            )
        }

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path, extract_archive=True)

        logger.info("Reading file at %s", file_path)
        yielded_relation_count = 0
        from allennlp.common.file_utils import json_lines_from_file

        for relation in self.shard_iterable(json_lines_from_file(file_path)):
            premise = relation["premise"]
            hypothesis = relation["hypothesis"]
            if "label" in relation:
                label = relation["label"]
            else:
                label = None
            index = relation["idx"]

            # todo: see if we even need this to be in a separate method
            instance = self.text_to_instance(index, label, premise, hypothesis)

            yield instance
            yielded_relation_count += 1

    @overrides
    def text_to_instance(
        self,
        index: int,
        label: Optional[str],
        premise: str,
        hypothesis: str,
    ) -> Instance:
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)

        fields = {}

        premise_and_hypothesis = TextField(
            self._tokenizer.add_special_tokens(tokenized_premise, tokenized_hypothesis),
        )
        # `premise_and_hypothesis` is already a TextField; wrapping it in another
        # TextField (as an earlier version of this code did) would crash at
        # indexing time.
        fields["tokens"] = premise_and_hypothesis

        # make the metadata
        metadata = {
            "premise": premise,
            "premise_tokens": tokenized_premise,
            "hypothesis": hypothesis,
            "hypothesis_tokens": tokenized_hypothesis,
            "index": index,
        }
        if label:
            fields["label"] = LabelField(label)
            metadata["label"] = label

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        instance["tokens"].token_indexers = self._token_indexers
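# A minimal usage sketch for TransformerSuperGlueRteReader, bypassing file
# I/O by calling text_to_instance directly (the index, label, and sentences
# below are illustrative):
reader = TransformerSuperGlueRteReader("roberta-base")
instance = reader.text_to_instance(0, "entailment", "AllenNLP is great!", "Really it is!")
reader.apply_token_indexers(instance)
print(instance.fields["metadata"]["premise_tokens"])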
class TransformerMCReader(DatasetReader):
    """
    Read input data for the TransformerMC model. This is the base class for all readers
    that produce data for TransformerMC.

    Instances have three fields:
     * `alternatives`, a `ListField` of `TextField`
     * `correct_alternative`, an `IndexField` with the correct answer among `alternatives`
     * `qid`, a `MetadataField` containing question ids

    Parameters
    ----------
    transformer_model_name : `str`, optional (default=`"roberta-large"`)
        This reader chooses tokenizer and token indexer according to this setting.
    length_limit : `int`, optional (default=`512`)
        We will make sure that the length of an alternative never exceeds this many word pieces.
    """

    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 length_limit: int = 512,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        from allennlp.data.tokenizers import PretrainedTransformerTokenizer

        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)
        from allennlp.data.token_indexers import PretrainedTransformerIndexer

        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit

    def text_to_instance(
        self,  # type: ignore
        qid: str,
        start: str,
        alternatives: List[str],
        label: Optional[int] = None,
    ) -> Instance:
        # tokenize
        start = self._tokenizer.tokenize(start)

        sequences = []
        for alternative in alternatives:
            alternative = self._tokenizer.tokenize(alternative)
            length_for_start = (self.length_limit - len(alternative) -
                                self._tokenizer.num_special_tokens_for_pair())
            if length_for_start < 0:
                # If the alternative is too long by itself, we keep its beginning
                # and add no tokens from the start.
                alternative = alternative[:length_for_start]
                length_for_start = 0
            sequences.append(
                self._tokenizer.add_special_tokens(start[:length_for_start], alternative))

        # make fields
        from allennlp.data.fields import TextField

        sequences = [
            TextField(sequence, self._token_indexers) for sequence in sequences
        ]
        from allennlp.data.fields import ListField

        sequences = ListField(sequences)
        from allennlp.data.fields import MetadataField

        fields = {
            "alternatives": sequences,
            "qid": MetadataField(qid),
        }

        if label is not None:
            if label < 0 or label >= len(alternatives):
                raise ValueError(f"Alternative {label} does not exist")
            from allennlp.data.fields import IndexField

            fields["correct_alternative"] = IndexField(label, sequences)

        return Instance(fields)
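# Hedged sketch: building a single multiple-choice instance with
# TransformerMCReader (the qid, question, and alternatives are made up):
reader = TransformerMCReader("roberta-large")
instance = reader.text_to_instance(
    qid="q1",
    start="How do trees get online?",
    alternatives=["They log in!", "They bark."],
    label=0,
)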
class TweetCandidateSpanDatasetReader(DatasetReader): def __init__( self, lazy: bool = False, cache_directory: Optional[str] = None, max_instances: Optional[int] = None, min_num_candidate: int = 3, max_num_candidate: int = 5, transformer_model_name_or_archive_path: str = "bert-base-uncased", ) -> None: super().__init__(lazy=lazy, cache_directory=cache_directory, max_instances=max_instances) if "tar.gz" in transformer_model_name_or_archive_path: config = extract_config_from_archive( transformer_model_name_or_archive_path) model_name = config.as_dict( )["dataset_reader"]["tokenizer"]["model_name"] else: model_name = transformer_model_name_or_archive_path self._tokenizer = PretrainedTransformerTokenizer( model_name=model_name, add_special_tokens=False) self._tokenindexer = PretrainedTransformerIndexer( model_name=model_name) self._min_num_candidate = min_num_candidate self._max_num_candidate = max_num_candidate def _read(self, file_path: str) -> Iterable[Instance]: file_path = cached_path(file_path) df = pd.read_json(file_path, lines=True) for record in df.to_dict("records"): if record["selected_text"]: text = record["text"] if not isinstance(text, str): continue elif text.strip() == "": continue elif len(record["candidate_spans"]) < self._min_num_candidate: continue else: yield self.text_to_instance( " " + text.strip(), record["sentiment"], record["candidate_spans"], record["textID"], record.get("selected_text"), record.get("selected_text_span"), ) def text_to_instance( self, text: str, sentiment: str, candidate_spans: list, text_id: Optional[str] = None, selected_text: Optional[str] = None, selected_text_span: Optional[tuple] = None, ) -> Instance: fields = {} text_tokens = self._tokenizer.tokenize(text) sentiment_tokens = self._tokenizer.tokenize(sentiment) text_with_sentiment_tokens = self._tokenizer.add_special_tokens( text_tokens, sentiment_tokens) fields["text_with_sentiment"] = TextField( text_with_sentiment_tokens, {"tokens": self._tokenindexer}) candidate_spans = [ tuple(i) for i in candidate_spans[:self._max_num_candidate] ] additional_metadata = {} if selected_text_span is not None: selected_text_span = tuple(selected_text_span) additional_metadata["selected_text_span"] = selected_text_span if selected_text_span not in candidate_spans: candidate_spans.append(selected_text_span) fields["label"] = LabelField(len(candidate_spans) - 1, skip_indexing=True) have_truth = False else: fields["label"] = LabelField( candidate_spans.index(selected_text_span), skip_indexing=True) have_truth = True additional_metadata["have_truth"] = have_truth additional_metadata["candidate_num"] = len(candidate_spans) fields["candidate_span_pairs"] = SpanPairsField( candidate_spans, fields["text_with_sentiment"]) metadata = { "text": text, "sentiment": sentiment, "selected_text": selected_text, "text_with_sentiment_tokens": text_with_sentiment_tokens } if text_id is not None: metadata["text_id"] = text_id if additional_metadata: metadata.update(additional_metadata) fields["metadata"] = MetadataField(metadata) return Instance(fields) def span_to_str(self, text, span_start, span_end): text_tokens = self._tokenizer.tokenize(text) text_tokens = self._tokenizer.add_special_tokens(text_tokens) return span_tokens_to_text(text, text_tokens, span_start, span_end)
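# Hedged sketch for TweetCandidateSpanDatasetReader; SpanPairsField,
# extract_config_from_archive, and span_tokens_to_text come from this
# project's own modules, and the text, sentiment, and token spans below are
# illustrative:
reader = TweetCandidateSpanDatasetReader()
instance = reader.text_to_instance(
    " great day", "positive",
    candidate_spans=[[1, 2], [1, 1], [2, 2]],
    text_id="t0",
)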
class WorldTreeSupportReader(DatasetReader):
    """
    Reads WorldTree multiple-choice questions together with their retrieved
    supporting facts, and prepends the top-k facts to each question.
    """

    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 topk: int = 5,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        # Keep the topk supporting facts
        self.topk = topk

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading questions from file at: %s", file_path)
            questions = json.load(data_file)
        for question in questions:
            qid = question["id"]
            question_text = question["question"]
            supporting_facts = question["supports"]
            choices = question["choices"]
            answer = question["answer"]
            yield self.text_to_instance(qid, question_text, supporting_facts,
                                        choices, answer)

    @overrides
    def text_to_instance(
        self,  # type: ignore
        qid: str,
        question: str,
        supporting_facts: List[str],
        choices: List[str],
        answer_idx: Optional[int] = None,
    ) -> Instance:
        # **A hack**
        # We need to make each question have exactly four choices to process them in batches.
        # Either drop a choice or reuse a non-answer one as filler.
        if len(choices) == 5:
            if answer_idx != 4:
                # Just drop the last choice
                choices = choices[:-1]
            else:
                # Answer is the last, so drop the first
                choices = choices[1:]
                answer_idx -= 1
        elif len(choices) == 3:
            if answer_idx != 2:
                # Use the last to fill
                choices.append(choices[-1])
            else:
                # Use the first to fill
                choices.append(choices[0])

        # Base checks
        assert len(choices) == 4
        if answer_idx < 0 or answer_idx >= len(choices):
            raise ValueError(f"Choice {answer_idx} does not exist")

        # Combine supporting facts with the question.
        # Here we just concatenate the top-k supporting facts to the end of the question.
        supporting_facts = supporting_facts[:self.topk]
        question = " ".join([question] + supporting_facts)

        # Tokenize the question
        question_tokens = self._tokenizer.tokenize(question)

        # Tokenize the choices and concatenate each with the question into question-choice pairs
        qc_pairs = []
        for choice in choices:
            choice_tokens = self._tokenizer.tokenize(choice)
            qc_pair = self._tokenizer.add_special_tokens(
                question_tokens, choice_tokens)
            qc_pairs.append(qc_pair)

        # Wrap them into AllenNLP fields
        qc_pairs = [TextField(pair, self._token_indexers) for pair in qc_pairs]
        qc_pairs = ListField(qc_pairs)
        answer_idx = IndexField(answer_idx, qc_pairs)
        metadata = MetadataField({
            "id": qid,
            "question": question,
            "choices": choices
        })

        return Instance({
            "qc_pairs": qc_pairs,
            "answer_idx": answer_idx,
            "metadata": metadata,
        })
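# Worked sketch of the four-choice normalization above, extracted into a
# standalone helper so the branch logic is easy to check (the helper name is
# illustrative, not part of the reader):
def normalize_to_four_choices(choices, answer_idx):
    if len(choices) == 5:
        if answer_idx != 4:
            choices = choices[:-1]  # drop the last, non-answer choice
        else:
            choices = choices[1:]   # answer is last, so drop the first
            answer_idx -= 1
    elif len(choices) == 3:
        # pad with a non-answer choice
        choices = choices + [choices[-1] if answer_idx != 2 else choices[0]]
    return choices, answer_idx

assert normalize_to_four_choices(list("abcde"), 4) == (list("bcde"), 3)
assert normalize_to_four_choices(list("abc"), 2) == (["a", "b", "c", "a"], 2)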
class TransformerSquadReader(DatasetReader):
    """
    Dataset reader suitable for JSON-formatted SQuAD-like datasets to be used with a transformer-based
    QA model, such as [`TransformerQA`](../../models/transformer_qa#TransformerQA).

    It will generate `Instances` with the following fields:

     * `question_with_context`, a `TextField` that contains the concatenation of question and context,
     * `answer_span`, a `SpanField` into the `question_with_context` `TextField` denoting the answer.
     * `context_span`, a `SpanField` into the `question_with_context` `TextField` denoting the context,
       i.e., the part of the text that potential answers can come from.
     * `cls_index` (optional), an `IndexField` that holds the index of the `[CLS]` token within the
       `question_with_context` field. This is needed because the `[CLS]` token is used to indicate
       an impossible question. Since most tokenizers/models have the `[CLS]` token as the first token,
       this will only be included in the instance if the `[CLS]` token is NOT the first token.
     * `metadata`, a `MetadataField` that stores the instance's ID, the original question, the original
       passage text, both of these in tokenized form, and the gold answer strings, accessible as
       `metadata['id']`, `metadata['question']`, `metadata['context']`, `metadata['question_tokens']`,
       `metadata['context_tokens']`, and `metadata['answers']`. This is so that we can more easily use
       the official SQuAD evaluation script to get metrics.

    For SQuAD v2.0-style datasets that contain impossible questions, we set the gold answer span to the
    span of the `[CLS]` token when there are no answers.

    We also support limiting the maximum length for the question. When the context+question is too long,
    we run a sliding window over the context and emit multiple instances for a single question.
    If `skip_impossible_questions` is `True`, then we only emit instances that contain a gold answer.
    As a result, the per-instance metrics you get during training and evaluation might not correspond
    100% to the SQuAD task. To get a final number for SQuAD v1.1, you have to run

    ```
    python -m allennlp_models.rc.tools.transformer_qa_eval
    ```

    # Parameters

    transformer_model_name : `str`, optional (default=`'bert-base-cased'`)
        This reader chooses tokenizer and token indexer according to this setting.

    length_limit : `int`, optional (default=`384`)
        We will make sure that the length of context+question never exceeds this many word pieces.

    stride : `int`, optional (default=`128`)
        When context+question are too long for the length limit, we emit multiple instances for one
        question, where the context is shifted. This parameter specifies the overlap between the
        shifted context window. It is called "stride" instead of "overlap" because that's what it's
        called in the original huggingface implementation.

    skip_impossible_questions : `bool`, optional (default=`False`)
        If this is true, we will skip examples that don't have an answer. This could happen if the
        question is marked impossible in the dataset, or if the question+context is truncated
        according to `length_limit` such that the context no longer contains a gold answer.

        For SQuAD v1.1-style datasets, you should set this to `True` during training, and `False`
        any other time.

        For SQuAD v2.0-style datasets you should leave this as `False`.

    max_query_length : `int`, optional (default=`64`)
        The maximum number of wordpieces dedicated to the question. If the question is longer than
        this, it will be truncated.
""" def __init__( self, transformer_model_name: str = "bert-base-cased", length_limit: int = 384, stride: int = 128, skip_impossible_questions: bool = False, max_query_length: int = 64, tokenizer_kwargs: Dict[str, Any] = None, **kwargs ) -> None: if "skip_invalid_examples" in kwargs: import warnings warnings.warn( "'skip_invalid_examples' is deprecated, please use 'skip_impossible_questions' instead", DeprecationWarning, ) skip_impossible_questions = kwargs.pop("skip_invalid_examples") super().__init__(**kwargs) self._tokenizer = PretrainedTransformerTokenizer( transformer_model_name, add_special_tokens=False, tokenizer_kwargs=tokenizer_kwargs, ) self._token_indexers = { "tokens": PretrainedTransformerIndexer( transformer_model_name, tokenizer_kwargs=tokenizer_kwargs ) } self.length_limit = length_limit self.stride = stride self.skip_impossible_questions = skip_impossible_questions self.max_query_length = max_query_length self._cls_token = self._tokenizer.tokenizer.cls_token # We'll include the `cls_index` IndexField in instances if the CLS token is # not always the first token. self._include_cls_index = ( self._find_cls_index( self._tokenizer.add_special_tokens( self._tokenizer.tokenize("a"), self._tokenizer.tokenize("a") ) ) != 0 ) @overrides def _read(self, file_path: str): # if `file_path` is a URL, redirect to the cache file_path = cached_path(file_path) logger.info("Reading file at %s", file_path) with open_compressed(file_path) as dataset_file: dataset_json = json.load(dataset_file) dataset = dataset_json["data"] logger.info("Reading the dataset") yielded_question_count = 0 questions_with_more_than_one_instance = 0 for article in dataset: for paragraph_json in article["paragraphs"]: context = paragraph_json["context"] for question_answer in paragraph_json["qas"]: answers = [answer_json["text"] for answer_json in question_answer["answers"]] # Just like huggingface, we only use the first answer for training. if len(answers) > 0: first_answer_offset = int(question_answer["answers"][0]["answer_start"]) else: first_answer_offset = None instances = self.make_instances( question_answer.get("id", None), question_answer["question"], answers, context, first_answer_offset=first_answer_offset, always_add_answer_span=True, ) instances_yielded = 0 for instance in instances: yield instance instances_yielded += 1 if instances_yielded > 1: questions_with_more_than_one_instance += 1 yielded_question_count += 1 if questions_with_more_than_one_instance > 0: logger.info( "%d (%.2f%%) questions have more than one instance", questions_with_more_than_one_instance, 100 * questions_with_more_than_one_instance / yielded_question_count, ) def make_instances( self, qid: str, question: str, answers: List[str], context: str, first_answer_offset: Optional[int], always_add_answer_span: bool = False, ) -> Iterable[Instance]: """ Create training instances from a SQuAD example. """ # tokenize context by spaces first, and then with the wordpiece tokenizer # For RoBERTa, this produces a bug where every token is marked as beginning-of-sentence. To fix it, we # detect whether a space comes before a word, and if so, add "a " in front of the word. 
def tokenize_slice(start: int, end: int) -> Iterable[Token]: text_to_tokenize = context[start:end] if start - 1 >= 0 and context[start - 1].isspace(): prefix = "a " # must end in a space, and be short so we can be sure it becomes only one token wordpieces = self._tokenizer.tokenize(prefix + text_to_tokenize) for wordpiece in wordpieces: if wordpiece.idx is not None: wordpiece.idx -= len(prefix) return wordpieces[1:] else: return self._tokenizer.tokenize(text_to_tokenize) tokenized_context = [] token_start = 0 for i, c in enumerate(context): if c.isspace(): for wordpiece in tokenize_slice(token_start, i): if wordpiece.idx is not None: wordpiece.idx += token_start tokenized_context.append(wordpiece) token_start = i + 1 for wordpiece in tokenize_slice(token_start, len(context)): if wordpiece.idx is not None: wordpiece.idx += token_start tokenized_context.append(wordpiece) if first_answer_offset is None: (token_answer_span_start, token_answer_span_end) = (-1, -1) else: (token_answer_span_start, token_answer_span_end), _ = char_span_to_token_span( [ (t.idx, t.idx + len(sanitize_wordpiece(t.text))) if t.idx is not None else None for t in tokenized_context ], (first_answer_offset, first_answer_offset + len(answers[0])), ) # Tokenize the question. tokenized_question = self._tokenizer.tokenize(question) tokenized_question = tokenized_question[: self.max_query_length] # Stride over the context, making instances. space_for_context = ( self.length_limit - len(tokenized_question) - len(self._tokenizer.sequence_pair_start_tokens) - len(self._tokenizer.sequence_pair_mid_tokens) - len(self._tokenizer.sequence_pair_end_tokens) ) stride_start = 0 while True: tokenized_context_window = tokenized_context[stride_start:] tokenized_context_window = tokenized_context_window[:space_for_context] window_token_answer_span = ( token_answer_span_start - stride_start, token_answer_span_end - stride_start, ) if any(i < 0 or i >= len(tokenized_context_window) for i in window_token_answer_span): # The answer is not contained in the window. 
window_token_answer_span = None if not self.skip_impossible_questions or window_token_answer_span is not None: additional_metadata = {"id": qid} instance = self.text_to_instance( question, tokenized_question, context, tokenized_context_window, answers=answers, token_answer_span=window_token_answer_span, additional_metadata=additional_metadata, always_add_answer_span=always_add_answer_span, ) yield instance stride_start += space_for_context if stride_start >= len(tokenized_context): break stride_start -= self.stride @overrides def text_to_instance( self, # type: ignore question: str, tokenized_question: List[Token], context: str, tokenized_context: List[Token], answers: List[str] = None, token_answer_span: Optional[Tuple[int, int]] = None, additional_metadata: Dict[str, Any] = None, always_add_answer_span: bool = False, ) -> Instance: fields = {} # make the question field question_field = TextField( self._tokenizer.add_special_tokens(tokenized_question, tokenized_context), self._token_indexers, ) fields["question_with_context"] = question_field cls_index = self._find_cls_index(question_field.tokens) if self._include_cls_index: fields["cls_index"] = IndexField(cls_index, question_field) start_of_context = ( len(self._tokenizer.sequence_pair_start_tokens) + len(tokenized_question) + len(self._tokenizer.sequence_pair_mid_tokens) ) # make the answer span if token_answer_span is not None: assert all(i >= 0 for i in token_answer_span) assert token_answer_span[0] <= token_answer_span[1] fields["answer_span"] = SpanField( token_answer_span[0] + start_of_context, token_answer_span[1] + start_of_context, question_field, ) elif always_add_answer_span: fields["answer_span"] = SpanField(cls_index, cls_index, question_field) # make the context span, i.e., the span of text from which possible answers should be drawn fields["context_span"] = SpanField( start_of_context, start_of_context + len(tokenized_context) - 1, question_field ) # make the metadata metadata = { "question": question, "question_tokens": tokenized_question, "context": context, "context_tokens": tokenized_context, "answers": answers or [], } if additional_metadata is not None: metadata.update(additional_metadata) fields["metadata"] = MetadataField(metadata) return Instance(fields) def _find_cls_index(self, tokens: List[Token]) -> int: return next(i for i, t in enumerate(tokens) if t.text == self._cls_token)
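# Hedged sketch for the SQuAD reader above: make_instances() slides a window
# over a long context and yields one Instance per window (the model name,
# limits, and strings below are illustrative):
reader = TransformerSquadReader("bert-base-cased", length_limit=64, stride=16)
instances = list(reader.make_instances(
    qid="q0",
    question="Who wrote the book?",
    answers=["Jane"],
    context="Jane wrote the book. " * 30,
    first_answer_offset=0,
))
print(len(instances))  # a long context produces more than one window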
class BertyTSVReader(DatasetReader):
    def __init__(
        self,
        sent1_col: str,
        sent2_col: str = None,
        label_col: str = 'label',
        bert_model: str = 'bert-base-uncased',
        max_sequence_length: int = 500,
        skip_label_indexing: bool = False,
        lower: bool = True,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._sent1_col = sent1_col
        self._sent2_col = sent2_col
        self._label_col = label_col
        self._tokenizer = PretrainedTransformerTokenizer(
            bert_model,
            add_special_tokens=False,
            max_length=max_sequence_length
        )  # type: PretrainedTransformerTokenizer
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._lower = lower
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(model_name=bert_model)
        }

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            # Without quoting=csv.QUOTE_NONE, lines containing quote characters
            # (" or ') would raise parsing errors.
            df = pandas.read_csv(data_file, sep='\t', quoting=csv.QUOTE_NONE)
            has_label = self._label_col in df.columns
            for rid in range(df.shape[0]):
                sent1 = df.iloc[rid][self._sent1_col]
                if self._lower:
                    sent1 = sent1.lower()
                if self._sent2_col:
                    sent2 = df.iloc[rid][self._sent2_col]
                    if self._lower:
                        sent2 = sent2.lower()
                else:
                    sent2 = None
                if has_label:
                    label = df.iloc[rid][self._label_col]
                    if self._skip_label_indexing:
                        label = int(label)
                else:
                    label = None
                instance = self.text_to_instance(sent1=sent1, sent2=sent2, label=label)
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(
            self,
            sent1: str,
            sent2: Optional[str] = None,
            label: Optional[str] = None) -> Instance:  # type: ignore
        fields: Dict[str, Field] = {}

        if sent2:
            tokens1 = self._tokenizer.tokenize(sent1)
            tokens2 = self._tokenizer.tokenize(sent2)
            tokens = self._tokenizer.add_special_tokens(tokens1, tokens2)
        else:
            tokens = self._tokenizer.tokenize(sent1)
            tokens = self._tokenizer.add_special_tokens(tokens)

        fields['sent'] = TextField(tokens, self._token_indexers)

        if label is not None:
            fields['label'] = LabelField(
                label, skip_indexing=self._skip_label_indexing)

        return Instance(fields)

    def instance_to_text(self, instance: Instance):
        return allenutil.bert_instance_as_json(instance)
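# Minimal sketch for BertyTSVReader via text_to_instance (column names and
# sentences are illustrative; assumes an allennlp version whose
# DatasetReader still accepts the `lazy` argument):
reader = BertyTSVReader(sent1_col="sentence1", sent2_col="sentence2")
instance = reader.text_to_instance(
    "the cat sat on the mat.", "a cat was sitting.", label="entailment")
print(instance.fields["sent"].tokens)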
class TransformerSquadReader(DatasetReader):
    """
    Reads a JSON-formatted SQuAD file and returns a ``Dataset`` where the ``Instances`` have four
    fields:

     * ``question_with_context``, a ``TextField`` that contains the concatenation of question and
       context,
     * ``answer_span``, a ``SpanField`` into the ``question_with_context`` ``TextField`` denoting the
       answer.
     * ``context_span``, a ``SpanField`` into the ``question_with_context`` ``TextField`` denoting the
       context, i.e., the part of the text that potential answers can come from.
     * A ``MetadataField`` that stores the instance's ID, the original question, the original passage
       text, both of these in tokenized form, and the gold answer strings, accessible as
       ``metadata['id']``, ``metadata['question']``, ``metadata['context']``,
       ``metadata['question_tokens']``, ``metadata['context_tokens']``, and ``metadata['answers']``.
       This is so that we can more easily use the official SQuAD evaluation script to get metrics.

    We also support limiting the maximum length for the question. When the context+question is too
    long, we run a sliding window over the context and emit multiple instances for a single question.
    At training time, we only emit instances that contain a gold answer. At test time, we emit all
    instances. As a result, the per-instance metrics you get during training and evaluation don't
    correspond 100% to the SQuAD task. To get a final number, you have to run the script in
    scripts/transformer_qa_eval.py.

    # Parameters

    transformer_model_name : `str`, optional (default=`'bert-base-cased'`)
        This reader chooses tokenizer and token indexer according to this setting.

    length_limit : `int`, optional (default=`384`)
        We will make sure that the length of context+question never exceeds this many word pieces.

    stride : `int`, optional (default=`128`)
        When context+question are too long for the length limit, we emit multiple instances for one
        question, where the context is shifted. This parameter specifies the overlap between the
        shifted context window. It is called "stride" instead of "overlap" because that's what it's
        called in the original huggingface implementation.

    skip_invalid_examples : `bool`, optional (default=`False`)
        If this is true, we will skip examples that don't have a gold answer. You should set this to
        `True` during training, and `False` any other time.

    max_query_length : `int`, optional (default=`64`)
        The maximum number of wordpieces dedicated to the question. If the question is longer than
        this, it will be truncated.
""" def __init__(self, transformer_model_name: str = "bert-base-cased", length_limit: int = 384, stride: int = 128, skip_invalid_examples: bool = False, max_query_length: int = 64, **kwargs) -> None: super().__init__(**kwargs) self._tokenizer = PretrainedTransformerTokenizer( transformer_model_name, add_special_tokens=False) self._token_indexers = { "tokens": PretrainedTransformerIndexer(transformer_model_name) } self.length_limit = length_limit self.stride = stride self.skip_invalid_examples = skip_invalid_examples self.max_query_length = max_query_length @overrides def _read(self, file_path: str): # if `file_path` is a URL, redirect to the cache file_path = cached_path(file_path) logger.info("Reading file at %s", file_path) with open_compressed(file_path) as dataset_file: dataset_json = json.load(dataset_file) dataset = dataset_json["data"] logger.info("Reading the dataset") yielded_question_count = 0 questions_with_more_than_one_instance = 0 for article in dataset: for paragraph_json in article["paragraphs"]: context = paragraph_json["context"] for question_answer in paragraph_json["qas"]: answers = [ answer_json["text"] for answer_json in question_answer["answers"] ] # Just like huggingface, we only use the first answer for training. if len(answers) > 0: first_answer_offset = int( question_answer["answers"][0]["answer_start"]) else: first_answer_offset = None instances = self.make_instances( question_answer.get("id", None), question_answer["question"], answers, context, first_answer_offset, ) instances_yielded = 0 for instance in instances: yield instance instances_yielded += 1 if instances_yielded > 1: questions_with_more_than_one_instance += 1 yielded_question_count += 1 if questions_with_more_than_one_instance > 0: logger.info( "%d (%.2f%%) questions have more than one instance", questions_with_more_than_one_instance, 100 * questions_with_more_than_one_instance / yielded_question_count, ) def make_instances( self, qid: str, question: str, answers: List[str], context: str, first_answer_offset: Optional[int], ) -> Iterable[Instance]: # tokenize context by spaces first, and then with the wordpiece tokenizer # For RoBERTa, this produces a bug where every token is marked as beginning-of-sentence. To fix it, we # detect whether a space comes before a word, and if so, add "a " in front of the word. 
        def tokenize_slice(start: int, end: int) -> Iterable[Token]:
            text_to_tokenize = context[start:end]
            if start - 1 >= 0 and context[start - 1].isspace():
                prefix = "a "  # must end in a space, and be short so we can be sure it becomes only one token
                wordpieces = self._tokenizer.tokenize(prefix + text_to_tokenize)
                for wordpiece in wordpieces:
                    if wordpiece.idx is not None:
                        wordpiece.idx -= len(prefix)
                return wordpieces[1:]
            else:
                return self._tokenizer.tokenize(text_to_tokenize)

        tokenized_context = []
        token_start = 0
        for i, c in enumerate(context):
            if c.isspace():
                for wordpiece in tokenize_slice(token_start, i):
                    if wordpiece.idx is not None:
                        wordpiece.idx += token_start
                    tokenized_context.append(wordpiece)
                token_start = i + 1
        for wordpiece in tokenize_slice(token_start, len(context)):
            if wordpiece.idx is not None:
                wordpiece.idx += token_start
            tokenized_context.append(wordpiece)

        if first_answer_offset is None:
            (token_answer_span_start, token_answer_span_end) = (-1, -1)
        else:
            (token_answer_span_start, token_answer_span_end), _ = char_span_to_token_span(
                [(t.idx, t.idx + len(sanitize_wordpiece(t.text)))
                 if t.idx is not None else None
                 for t in tokenized_context],
                (first_answer_offset, first_answer_offset + len(answers[0])),
            )

        # Tokenize the question
        tokenized_question = self._tokenizer.tokenize(question)
        tokenized_question = tokenized_question[:self.max_query_length]

        # Stride over the context, making instances.
        # We subtract the exact number of special tokens the tokenizer adds around
        # and between the two sequences (e.g., <s> question </s> </s> context </s>
        # for RoBERTa), rather than hard-coding a count.
        space_for_context = (self.length_limit - len(tokenized_question) -
                             len(self._tokenizer.sequence_pair_start_tokens) -
                             len(self._tokenizer.sequence_pair_mid_tokens) -
                             len(self._tokenizer.sequence_pair_end_tokens))
        stride_start = 0
        while True:
            tokenized_context_window = tokenized_context[stride_start:]
            tokenized_context_window = tokenized_context_window[:space_for_context]

            window_token_answer_span = (
                token_answer_span_start - stride_start,
                token_answer_span_end - stride_start,
            )
            if any(i < 0 or i >= len(tokenized_context_window)
                   for i in window_token_answer_span):
                # The answer is not contained in the window.
window_token_answer_span = None if not self.skip_invalid_examples or window_token_answer_span is not None: additional_metadata = {"id": qid} instance = self.text_to_instance( question, tokenized_question, context, tokenized_context_window, answers, window_token_answer_span, additional_metadata, ) yield instance stride_start += space_for_context if stride_start >= len(tokenized_context): break stride_start -= self.stride @overrides def text_to_instance( self, # type: ignore question: str, tokenized_question: List[Token], context: str, tokenized_context: List[Token], answers: List[str], token_answer_span: Optional[Tuple[int, int]], additional_metadata: Dict[str, Any] = None, ) -> Instance: fields = {} # make the question field question_field = TextField( self._tokenizer.add_special_tokens(tokenized_question, tokenized_context), self._token_indexers, ) fields["question_with_context"] = question_field start_of_context = (len(self._tokenizer.sequence_pair_start_tokens) + len(tokenized_question) + len(self._tokenizer.sequence_pair_mid_tokens)) # make the answer span if token_answer_span is not None: assert all(i >= 0 for i in token_answer_span) assert token_answer_span[0] <= token_answer_span[1] fields["answer_span"] = SpanField( token_answer_span[0] + start_of_context, token_answer_span[1] + start_of_context, question_field, ) else: # We have to put in something even when we don't have an answer, so that this instance can be batched # together with other instances that have answers. fields["answer_span"] = SpanField(-1, -1, question_field) # make the context span, i.e., the span of text from which possible answers should be drawn fields["context_span"] = SpanField( start_of_context, start_of_context + len(tokenized_context) - 1, question_field) # make the metadata metadata = { "question": question, "question_tokens": tokenized_question, "context": context, "context_tokens": tokenized_context, "answers": answers, } if additional_metadata is not None: metadata.update(additional_metadata) fields["metadata"] = MetadataField(metadata) return Instance(fields)
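# Worked sketch of the window arithmetic above (the numbers are
# illustrative): with length_limit=384, a 20-token question, and
# 1 + 2 + 1 = 4 special tokens ([CLS] q [SEP] [SEP] c [SEP]),
# space_for_context = 384 - 20 - 4 = 360. Each iteration advances
# stride_start by space_for_context - stride = 360 - 128 = 232 tokens,
# so consecutive windows overlap by the 128-token stride.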
class RecordTaskReader(DatasetReader):
    """
    Reader for the Reading Comprehension with Commonsense Reasoning (ReCoRD) task from SuperGLUE.
    The task is detailed in the paper "ReCoRD: Bridging the Gap between Human and Machine
    Commonsense Reading Comprehension" (arxiv.org/pdf/1810.12885.pdf) by Zhang et al. Leaderboards
    and the official evaluation script for the ReCoRD task can be found at
    sheng-z.github.io/ReCoRD-explorer/.

    The reader reads a JSON file in the format from sheng-z.github.io/ReCoRD-explorer/dataset-readme.txt

    # Parameters

    transformer_model_name : `str`, optional (default=`'bert-base-cased'`)
        This reader chooses tokenizer and token indexer according to this setting.
    length_limit : `int`, optional (default=`384`)
        We will make sure that the length of a query+context window never exceeds this many
        word pieces.
    question_length_limit : `int`, optional (default=`64`)
        We will cut the question if its length exceeds this limit.
    stride : `int`, optional (default=`128`)
        The overlap between consecutive context windows when the passage does not fit into a
        single window.
    raise_errors : `bool`, optional (default=`False`)
        If the reader should raise errors or just continue.
    tokenizer_kwargs : `Dict[str, Any]`, optional
        Extra keyword arguments passed to the tokenizer and token indexer.
    one_instance_per_query : `bool`, optional (default=`False`)
        If `True`, yield at most one instance (the first answer-bearing window) per query.
    kwargs : `Dict`
        Keyword arguments to be passed to the DatasetReader parent class constructor.
    """

    def __init__(
        self,
        transformer_model_name: str = "bert-base-cased",
        length_limit: int = 384,
        question_length_limit: int = 64,
        stride: int = 128,
        raise_errors: bool = False,
        tokenizer_kwargs: Dict[str, Any] = None,
        one_instance_per_query: bool = False,
        max_instances: int = None,
        **kwargs,
    ) -> None:
        """
        Initialize the RecordTaskReader.
        """
        super(RecordTaskReader, self).__init__(manual_distributed_sharding=True,
                                               max_instances=max_instances,
                                               **kwargs)
        self._kwargs = kwargs

        self._model_name = transformer_model_name
        self._tokenizer_kwargs = tokenizer_kwargs or {}
        # Save the values passed to __init__ to protected attributes
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name,
                                                   tokenizer_kwargs=tokenizer_kwargs)
        }
        self._length_limit = length_limit
        self._query_len_limit = question_length_limit
        self._stride = stride
        self._raise_errors = raise_errors
        self._cls_token = "@placeholder"
        self._one_instance_per_query = one_instance_per_query

    def _to_params(self) -> Dict[str, Any]:
        """
        Get the configuration dictionary for this class.

        # Returns

        `Dict[str, Any]` The config dict.
        """
        return {
            "type": "superglue_record",
            "transformer_model_name": self._model_name,
            "length_limit": self._length_limit,
            "question_length_limit": self._query_len_limit,
            "stride": self._stride,
            "raise_errors": self._raise_errors,
            "tokenizer_kwargs": self._tokenizer_kwargs,
            "one_instance_per_query": self._one_instance_per_query,
            "max_instances": self.max_instances,
            **self._kwargs,
        }

    def _read(self, file_path: Union[Path, str]) -> Iterable[Instance]:
        # If `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        # Read the 'data' key from the dataset
        logger.info(f"Reading '{file_path}'")
        with open(file_path) as fp:
            dataset = json.load(fp)["data"]
        logger.info(f"Found {len(dataset)} examples from '{file_path}'")

        # Keep track of certain stats while reading the file
        # examples_multiple_instance_count: The number of questions with more than
        #   one instance. Can happen because there are multiple queries for a
        #   single passage.
        # passages_yielded: The total number of instances found/yielded.
        examples_multiple_instance_count = 0
        examples_no_instance_count = 0
        passages_yielded = 0

        # Iterate through every example from the ReCoRD data file.
        for example in dataset:

            # Get the list of instances for the current example
            instances_for_example = self.get_instances_from_example(example)

            # Keep track of the number of instances yielded for this specific
            # example. Since instances_for_example is a generator, we do not
            # know its length up front, so we count as we go.
            instance_count = 0

            # Iterate through the instances and yield them.
            for instance in instances_for_example:
                yield instance
                instance_count += 1

            if instance_count == 0:
                logger.warning(f"Example '{example['id']}' had no instances.")
                examples_no_instance_count += 1

            # If this example produced more than one instance, count it.
            examples_multiple_instance_count += 1 if instance_count > 1 else 0

            passages_yielded += instance_count

            # Check to see if we are over the max_instances to yield.
            if self.max_instances and passages_yielded > self.max_instances:
                logger.info("Passed max instances")
                break

        # Log pertinent information.
        if passages_yielded:
            logger.info(
                f"{examples_multiple_instance_count}/{passages_yielded} "
                f"({examples_multiple_instance_count / passages_yielded * 100:.2f}%) "
                f"examples had more than one instance")
            logger.info(
                f"{examples_no_instance_count}/{passages_yielded} "
                f"({examples_no_instance_count / passages_yielded * 100:.2f}%) "
                f"examples had no instances")
        else:
            logger.warning(f"Could not find any instances in '{file_path}'")

    def get_instances_from_example(
            self,
            example: Dict,
            always_add_answer_span: bool = False) -> Iterable[Instance]:
        """
        Helper function to get instances from an example.

        Much of this comes from `transformer_squad.make_instances`

        # Parameters

        example: `Dict[str,Any]`
            The example dict.

        # Returns

        `Iterable[Instance]` The instances for each example
        """
        # Get the passage dict from the example; it has text and entities.
        example_id: str = example["id"]
        passage_dict: Dict = example["passage"]
        passage_text: str = passage_dict["text"]

        # Tokenize the passage
        tokenized_passage: List[Token] = self.tokenize_str(passage_text)

        # TODO: Determine what to do with entities. SuperGLUE marks them
        #  explicitly as input (https://arxiv.org/pdf/1905.00537.pdf)

        # Get the queries from the example dict
        queries: List = example["qas"]
        logger.debug(f"{len(queries)} queries for example {example_id}")

        # Tokenize and get the context windows for each query
        for query in queries:

            # Create the additional metadata dict that will be passed w/ extra
            # data for each query. We store the question & query ids, all
            # answers, and other data following `transformer_qa`.
            additional_metadata = {
                "id": query["id"],
                "example_id": example_id,
            }
            instances_yielded = 0

            # Tokenize, and truncate, the query based on the max set in
            # `__init__`
            tokenized_query = self.tokenize_str(
                query["query"])[:self._query_len_limit]

            # Calculate where the context needs to start and how many tokens we have
            # for it. This is due to the limit on the number of tokens that a
            # transformer can use because they have quadratic memory usage. But if
            # you are reading this code, you probably know that.
            space_for_context = (
                self._length_limit
                - len(list(tokenized_query))
                # Used getattr so I can test without having to load a
                # transformer model.
                - len(getattr(self._tokenizer, "sequence_pair_start_tokens", []))
                - len(getattr(self._tokenizer, "sequence_pair_mid_tokens", []))
                - len(getattr(self._tokenizer, "sequence_pair_end_tokens", [])))

            # Get the answers for this query; if there are none, skip it.
            answers = query.get("answers", [])
            if not answers:
                logger.warning(f"Skipping {query['id']}, no answers")
                continue

            # Create the arguments needed for `char_span_to_token_span`
            token_offsets = [(t.idx, t.idx + len(sanitize_wordpiece(t.text)))
                             if t.idx is not None else None
                             for t in tokenized_passage]

            # Get the token offsets for the answers in this passage. Start with
            # (-1, -1), which means the answer was not found.
            answer_token_start, answer_token_end = (-1, -1)
            for answer in answers:

                # Try to find the offsets.
                offsets, _ = char_span_to_token_span(
                    token_offsets, (answer["start"], answer["end"]))

                # If offsets for an answer were found, it means the answer is in
                # the passage, and thus we can stop looking.
                if offsets != (-1, -1):
                    answer_token_start, answer_token_end = offsets
                    break

            # Go through the context and find the window that has the answer in it.
            stride_start = 0

            while True:
                tokenized_context_window = tokenized_passage[stride_start:]
                tokenized_context_window = tokenized_context_window[:space_for_context]

                # Get the token offsets w.r.t the current window.
                window_token_answer_span = (
                    answer_token_start - stride_start,
                    answer_token_end - stride_start,
                )
                if any(i < 0 or i >= len(tokenized_context_window)
                       for i in window_token_answer_span):
                    # The answer is not contained in the window.
                    window_token_answer_span = None

                if (
                        # not self.skip_impossible_questions
                        window_token_answer_span is not None):
                    # The answer WAS found in the context window, and thus we
                    # can make an instance for the answer.
                    instance = self.text_to_instance(
                        query["query"],
                        tokenized_query,
                        passage_text,
                        tokenized_context_window,
                        answers=[answer["text"] for answer in answers],
                        token_answer_span=window_token_answer_span,
                        additional_metadata=additional_metadata,
                        always_add_answer_span=always_add_answer_span,
                    )
                    yield instance
                    instances_yielded += 1

                if instances_yielded == 1 and self._one_instance_per_query:
                    break

                stride_start += space_for_context

                # If we have reached the end of the passage, stop.
                if stride_start >= len(tokenized_passage):
                    break

                # Step back by the stride so that consecutive windows overlap.
                stride_start -= self._stride

    def tokenize_slice(self,
                       text: str,
                       start: int = None,
                       end: int = None) -> Iterable[Token]:
        """
        Get + tokenize a span from a source text.

        *Originally from the `transformer_squad.py`*

        # Parameters

        text: `str`
            The text to draw from.
        start: `int`
            The start index for the span.
        end: `int`
            The end index for the span. Exclusive, like a normal Python slice.

        # Returns

        `Iterable[Token]` List of tokens for the retrieved span.
        """
        start = start or 0
        end = end or len(text)
        text_to_tokenize = text[start:end]

        # Check for a preceding space. If it exists, then we need to tokenize a
        # special way because of a bug with the RoBERTa tokenizer.
        if start - 1 >= 0 and text[start - 1].isspace():

            # Per the original tokenize_slice function, you need to add a
            # garbage token before the actual text you want to tokenize so that
            # the tokenizer does not add a beginning of sentence token.
            prefix = "a "

            # Tokenize the combined prefix and text
            wordpieces = self._tokenizer.tokenize(prefix + text_to_tokenize)

            # Go through each wordpiece in the tokenized wordpieces.
            for wordpiece in wordpieces:
                # Because we added the garbage prefix before tokenizing, we need
                # to adjust the idx such that it accounts for this. Therefore we
                # subtract the length of the prefix from each token's idx.
                if wordpiece.idx is not None:
                    wordpiece.idx -= len(prefix)

            # We do not want the garbage token, so we return all but the first
            # token.
            return wordpieces[1:]
        else:
            # Do not need any sort of prefix, so just return all of the tokens.
            return self._tokenizer.tokenize(text_to_tokenize)

    def tokenize_str(self, text: str) -> List[Token]:
        """
        Helper method to tokenize a string.

        Adapted from the `transformer_squad.make_instances`

        # Parameters

        text: `str`
            The string to tokenize.

        # Returns

        `List[Token]` The resulting tokens.
        """
        # We need to keep track of the current token index so that we can update
        # the results from self.tokenize_slice such that they reflect their
        # actual position in the string rather than their position in the slice
        # passed to tokenize_slice. Also used to construct the slice.
        token_index = 0

        # Create the output list that will store the tokens we found.
        tokenized_str = []

        # Helper function to update the `idx` and add every wordpiece in the
        # `tokenized_slice` to the `tokenized_str`.
        def add_wordpieces(tokenized_slice: Iterable[Token]) -> None:
            for wordpiece in tokenized_slice:
                if wordpiece.idx is not None:
                    wordpiece.idx += token_index
                tokenized_str.append(wordpiece)

        # Iterate through every character and their respective index in the text
        # to create the slices to tokenize.
        for i, c in enumerate(text):

            # Check if the current character is a space. If it is, we tokenize
            # the slice of `text` from `token_index` to `i`.
            if c.isspace():
                add_wordpieces(self.tokenize_slice(text, token_index, i))
                token_index = i + 1

        # Add the end slice that is not collected by the for loop.
        add_wordpieces(self.tokenize_slice(text, token_index, len(text)))

        return tokenized_str

    @staticmethod
    def get_spans_from_text(text: str, spans: List[Tuple[int, int]]) -> List[str]:
        """
        Helper function to get spans from a string.

        # Parameters

        text: `str`
            The source string
        spans: `List[Tuple[int,int]]`
            List of start and end indices for spans. Assumes that the end index
            is inclusive. Therefore, for start index `i` and end index `j`,
            retrieves the span at `text[i:j+1]`.

        # Returns

        `List[str]` The extracted strings from `text`.
        """
        return [text[start:end + 1] for start, end in spans]

    def text_to_instance(
        self,
        query: str,
        tokenized_query: List[Token],
        passage: str,
        tokenized_passage: List[Token],
        answers: List[str],
        token_answer_span: Optional[Tuple[int, int]] = None,
        additional_metadata: Optional[Dict[str, Any]] = None,
        always_add_answer_span: Optional[bool] = False,
    ) -> Instance:
        """
        A lot of this comes directly from the `transformer_squad.text_to_instance`
        """
        fields = {}

        # Create the query field from the tokenized question and context. Use
        # the `self._tokenizer.add_special_tokens` function to add the necessary
        # special tokens to the query.
        query_field = TextField(
            self._tokenizer.add_special_tokens(
                # The `add_special_tokens` function automatically adds in the
                # separation token to mark the separation between the two lists of
                # tokens. Therefore, we can create the query field WITH context
                # through passing them both as arguments.
                tokenized_query,
                tokenized_passage,
            ),
            self._token_indexers,
        )

        # Add the query field to the fields dict that will be output as an
        # instance.
        # Do it here rather than assigning above so that we can use attributes
        # from `query_field` rather than continuously indexing `fields`.
        fields["question_with_context"] = query_field

        # Calculate the index that marks the start of the context.
        start_of_context = (
            # Used getattr so I can test without having to load a
            # transformer model.
            len(getattr(self._tokenizer, "sequence_pair_start_tokens", []))
            + len(tokenized_query)
            + len(getattr(self._tokenizer, "sequence_pair_mid_tokens", [])))

        # make the answer span
        if token_answer_span is not None:
            assert all(i >= 0 for i in token_answer_span)
            assert token_answer_span[0] <= token_answer_span[1]

            fields["answer_span"] = SpanField(
                token_answer_span[0] + start_of_context,
                token_answer_span[1] + start_of_context,
                query_field,
            )

        # make the context span, i.e., the span of text from which possible
        # answers should be drawn
        fields["context_span"] = SpanField(
            start_of_context,
            start_of_context + len(tokenized_passage) - 1,
            query_field)

        # make the metadata
        metadata = {
            "question": query,
            "question_tokens": tokenized_query,
            "context": passage,
            "context_tokens": tokenized_passage,
            "answers": answers or [],
        }
        if additional_metadata is not None:
            metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    def _find_cls_index(self, tokens: List[Token]) -> int:
        """
        Returns the index of the CLS token (here `@placeholder`) in `tokens`.
        From `transformer_squad`.
        """
        return next(i for i, t in enumerate(tokens) if t.text == self._cls_token)
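# Hedged sketch for RecordTaskReader's span helper above (the text and spans
# are illustrative); note that end indices are inclusive:
spans = RecordTaskReader.get_spans_from_text("hello world", [(0, 4), (6, 10)])
assert spans == ["hello", "world"]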
class FakeReader(DatasetReader):
    """
    Creates fake multiple-choice input. If your model doesn't get 99% on this data, it is broken.

    Instances have two fields:
     * `alternatives`, a `ListField` of `TextField`
     * `correct_alternative`, an `IndexField` with the correct answer among `alternatives`

    Parameters
    ----------
    transformer_model_name : `str`, optional (default=`"roberta-large"`)
        This reader chooses tokenizer and token indexer according to this setting.
    length_limit : `int`, optional (default=`512`)
        We will make sure that the length of the alternatives never exceeds this many word pieces.
    """

    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 length_limit: int = 512,
                 **kwargs) -> None:
        super().__init__(**kwargs)

        if self.max_instances is None:
            raise ValueError("FakeReader requires max_instances to be set.")

        from allennlp.data.tokenizers import PretrainedTransformerTokenizer

        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)

        from allennlp.data.token_indexers import PretrainedTransformerIndexer

        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit

    def _read(self, file_path: str):
        logger.info("Ignoring file at %s", file_path)

        for i in range(self.max_instances):
            label = i % 2
            texts = [f"This is the false choice {i}."] * 2
            texts[label] = f"This is the true choice {i}."
            yield self.text_to_instance(texts, label)

    def text_to_instance(
        self,  # type: ignore
        alternatives: List[str],
        correct_alternative: int,
    ) -> Instance:
        # tokenize
        alternatives = [
            self._tokenizer.tokenize(alternative) for alternative in alternatives
        ]

        # add special tokens
        alternatives = [
            self._tokenizer.add_special_tokens(alternative)
            for alternative in alternatives
        ]

        # make fields
        from allennlp.data.fields import TextField

        alternatives = [
            TextField(alternative, self._token_indexers)
            for alternative in alternatives
        ]

        if correct_alternative < 0 or correct_alternative >= len(alternatives):
            raise ValueError(f"Alternative {correct_alternative} does not exist.")

        from allennlp.data.fields import ListField

        alternatives = ListField(alternatives)

        from allennlp.data.fields import IndexField

        return Instance({
            "alternatives": alternatives,
            "correct_alternative": IndexField(correct_alternative, alternatives),
        })
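# Usage sketch for FakeReader: the file path is ignored, so any placeholder
# works (the max_instances value and path below are illustrative):
reader = FakeReader(max_instances=4)
for instance in reader.read("ignored.txt"):
    print(instance.fields["correct_alternative"].sequence_index)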
class WorldTreeReader(DatasetReader):
    """
    Reads WorldTree multiple-choice science questions from a TSV file with
    `QuestionID`, `question`, and `AnswerKey` columns.
    """

    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading questions from file at: %s", file_path)
            # Read from the opened (cached) file, not the original path, so that
            # URLs redirected to the cache are handled correctly.
            df = pd.read_csv(data_file, delimiter="\t")
        for _, row in df.iterrows():
            qid = row["QuestionID"]
            raw_question = row["question"]
            question, choices = parse_raw_question(raw_question)
            answer = row["AnswerKey"]
            answer_idx = answser_to_index(answer)
            yield self.text_to_instance(qid, question, choices, answer_idx)

    @overrides
    def text_to_instance(
        self,  # type: ignore
        qid: str,
        question: str,
        choices: List[str],
        answer_idx: Optional[int] = None,
    ) -> Instance:
        # **A hack**
        # We need to make each question have exactly four choices to process them in batches.
        # Either drop a choice or reuse a non-answer one as filler.
        if len(choices) == 5:
            if answer_idx != 4:
                # Just drop the last choice
                choices = choices[:-1]
            else:
                # Answer is the last, so drop the first
                choices = choices[1:]
                answer_idx -= 1
        elif len(choices) == 3:
            if answer_idx != 2:
                # Use the last to fill
                choices.append(choices[-1])
            else:
                # Use the first to fill
                choices.append(choices[0])

        # Base checks
        assert len(choices) == 4
        if answer_idx < 0 or answer_idx >= len(choices):
            raise ValueError(f"Choice {answer_idx} does not exist")

        # Tokenize the question
        question_tokens = self._tokenizer.tokenize(question)

        # Tokenize the choices and concatenate each with the question into question-choice pairs
        qc_pairs = []
        for choice in choices:
            choice_tokens = self._tokenizer.tokenize(choice)
            qc_pair = self._tokenizer.add_special_tokens(
                question_tokens, choice_tokens)
            qc_pairs.append(qc_pair)

        # Wrap them into AllenNLP fields
        qc_pairs = [TextField(pair, self._token_indexers) for pair in qc_pairs]
        qc_pairs = ListField(qc_pairs)
        answer_idx = IndexField(answer_idx, qc_pairs)
        metadata = MetadataField({
            "id": qid,
            "question": question,
            "choices": choices
        })

        return Instance({
            "qc_pairs": qc_pairs,
            "answer_idx": answer_idx,
            "metadata": metadata,
        })
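# Minimal sketch calling WorldTreeReader.text_to_instance directly (the qid,
# question, and choices below are illustrative):
reader = WorldTreeReader("roberta-large")
instance = reader.text_to_instance(
    "q1", "Which material conducts electricity?",
    ["copper", "wood", "glass", "cloth"], answer_idx=0)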
class BertDatasetReader(DatasetReader):
    """
    Reads a file from the ProPara state change dataset. This data is formatted as TSV, one
    instance per line. Format: "Query \t\t\t step \t\t\t state_change_types"

    state_change_types: string label applicable to this datapoint

    We convert these columns into fields named "tokens" and "state_change_type_labels".

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Currently unused; the reader always builds a ``PretrainedTransformerIndexer``.
        IMPORTANT NOTE: All components like tokenizer, token indexer, token embedder and
        Seq2VecEncoder should be of BERT type.
    """

    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self.transformer_model = "bert-base-uncased"
        self.tokenizer = PretrainedTransformerTokenizer(
            model_name=self.transformer_model, add_special_tokens=False, max_length=512)
        self.token_indexer = PretrainedTransformerIndexer(
            model_name=self.transformer_model, max_length=512)

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as state_change_file:
            logger.info("Reading state change instances from TSV dataset at: %s", file_path)
            for line in tqdm.tqdm(state_change_file):
                parts: List[str] = line.split('\t\t\t')

                query_text = parts[0].lower()
                query_tokens = self.tokenizer.tokenize(query_text)

                step_text = parts[1].lower()
                step_tokens = self.tokenizer.tokenize(step_text)

                combined_tokens = self.tokenizer.add_special_tokens(query_tokens, step_tokens)

                # parse labels
                state_change_types = parts[2].strip()

                # create instance
                yield self.text_to_instance(combined_tokens=combined_tokens,
                                            state_change_types=state_change_types)

    @overrides
    def text_to_instance(self,  # type: ignore
                         combined_tokens: List[Token],
                         state_change_types: Optional[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        # encode inputs
        token_field = TextField(combined_tokens, {'tokens': self.token_indexer})
        fields['tokens'] = token_field

        # encode outputs
        if state_change_types:
            fields['state_change_type_labels'] = LabelField(state_change_types,
                                                            'state_change_type_labels')

        return Instance(fields)

    @classmethod
    def from_params(cls, params: Params, constructor_to_call=None,
                    constructor_to_inspect=None) -> 'BertDatasetReader':
        return BertDatasetReader()
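# Sketch of the "\t\t\t"-separated line format the reader above expects
# (the content is illustrative): query, step text, and state-change label.
line = "where is the water\t\t\tthe water evaporates\t\t\tMOVE"
query_text, step_text, state_change_types = line.split("\t\t\t")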