def _make_instance(example):
    d = dict()
    # For human-readability
    d["raw_passage"] = MetadataField(" ".join(example["passage"]))
    d["raw_question"] = MetadataField(" ".join(example["question"]))
    if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
        inp, start_offset, _ = model_preprocessing_interface.boundary_token_fn(
            example["passage"], example["question"], get_offset=True
        )
        d["inputs"] = sentence_to_text_field(inp, indexers)
    else:
        d["passage"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(example["passage"]), indexers
        )
        d["question"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(example["question"]), indexers
        )
        start_offset = 0
    d["span_start"] = NumericField(
        example["answer_span"][0] + start_offset, label_namespace="span_start_labels"
    )
    d["span_end"] = NumericField(
        example["answer_span"][1] + start_offset, label_namespace="span_end_labels"
    )
    d["start_offset"] = MetadataField(start_offset)
    d["passage_str"] = MetadataField(example["passage_str"])
    d["answer_str"] = MetadataField(example["answer_str"])
    d["space_processed_token_map"] = MetadataField(example["space_processed_token_map"])
    return Instance(d)

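# Hypothetical illustration of the start_offset handling above (not from the
# source): assuming the pair encoding packs the inputs BERT-style as
# ["[CLS]", *passage, "[SEP]", *question, "[SEP]"], one token precedes the
# passage, so inclusive answer-span indices must shift by 1.
passage = ["the", "cat", "sat"]
question = ["who", "sat", "?"]
packed = ["[CLS]"] + passage + ["[SEP]"] + question + ["[SEP]"]
start_offset = 1                  # number of tokens prepended before the passage
answer_span = (1, 2)              # "cat sat", inclusive indices in the raw passage
shifted = (answer_span[0] + start_offset, answer_span[1] + start_offset)
assert packed[shifted[0]:shifted[1] + 1] == ["cat", "sat"]
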
def _make_instance(sentence_tokens, question_tokens, answer_span, idx):
    d = dict()
    # For human-readability
    d["raw_sentence"] = MetadataField(" ".join(sentence_tokens[1:-1]))
    d["raw_question"] = MetadataField(" ".join(question_tokens[1:-1]))
    if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
        inp = model_preprocessing_interface.boundary_token_fn(sentence_tokens, question_tokens)
        d["inputs"] = sentence_to_text_field(inp, indexers)
    else:
        d["sentence"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(sentence_tokens), indexers
        )
        d["question"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(question_tokens), indexers
        )
    d["span_start"] = NumericField(answer_span[0], label_namespace="span_start_labels")
    d["span_end"] = NumericField(answer_span[1], label_namespace="span_end_labels")
    d["idx"] = LabelField(idx, label_namespace="idxs", skip_indexing=True)
    return Instance(d)

def _make_instance(psg, qst, ans_str, label, psg_idx, qst_idx, ans_idx):
    """ pq_id: passage-question ID """
    d = {}
    d["psg_str"] = MetadataField(" ".join(psg))
    d["qst_str"] = MetadataField(" ".join(qst))
    d["ans_str"] = MetadataField(ans_str)
    d["psg_idx"] = MetadataField(psg_idx)
    d["qst_idx"] = MetadataField(qst_idx)
    d["ans_idx"] = MetadataField(ans_idx)
    d["idx"] = MetadataField(ans_idx)  # required by evaluate()
    if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
        inp = model_preprocessing_interface.boundary_token_fn(psg, qst)
        d["psg_qst_ans"] = sentence_to_text_field(inp, indexers)
    else:
        d["psg"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(psg), indexers
        )
        d["qst"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(qst), indexers
        )
    d["label"] = LabelField(label, label_namespace="labels", skip_indexing=True)
    return Instance(d)

def _make_instance(passage, question, answer, label, par_idx, qst_idx, ans_idx):
    """ pq_id: passage-question ID """
    d = {}
    d["psg_str"] = MetadataField(" ".join(passage))
    d["qst_str"] = MetadataField(" ".join(question))
    d["ans_str"] = MetadataField(" ".join(answer))
    d["psg_idx"] = MetadataField(par_idx)
    d["qst_idx"] = MetadataField(qst_idx)
    d["ans_idx"] = MetadataField(ans_idx)
    d["idx"] = MetadataField(ans_idx)  # required by evaluate()
    if is_using_pytorch_transformers:
        inp = boundary_token_fn(passage, question + answer)
        d["psg_qst_ans"] = sentence_to_text_field(inp, indexers)
    else:
        d["psg"] = sentence_to_text_field(boundary_token_fn(passage), indexers)
        d["qst"] = sentence_to_text_field(boundary_token_fn(question), indexers)
        d["ans"] = sentence_to_text_field(boundary_token_fn(answer), indexers)
    d["label"] = LabelField(label, label_namespace="labels", skip_indexing=True)
    return Instance(d)

def _make_instance(sent):
    """ Forward targs adds <s> as a target for input </s>
    and bwd targs adds </s> as a target for input <s>
    to avoid issues with needing to strip extra tokens
    in the input for each direction """
    d = {}
    d["input"] = sentence_to_text_field(sent[:-1], indexers)
    d["targs"] = sentence_to_text_field(sent[1:], self.target_indexer)
    d["targs_b"] = sentence_to_text_field([sent[-1]] + sent[:-2], self.target_indexer)
    return Instance(d)

def make_instance(self, record, idx, indexers, model_preprocessing_interface) -> Instance:
    """Convert a single record to an AllenNLP Instance."""
    tokens = record["text"].split()  # already space-tokenized by Moses
    tokens = model_preprocessing_interface.boundary_token_fn(
        tokens
    )  # apply model-appropriate variants of [cls] and [sep].
    text_field = sentence_to_text_field(tokens, indexers)

    d = {}
    d["idx"] = MetadataField(idx)
    d["input1"] = text_field

    d["span1s"] = ListField(
        [self._make_span_field(t["span1"], text_field, 1) for t in record["targets"]]
    )
    if not self.single_sided:
        d["span2s"] = ListField(
            [self._make_span_field(t["span2"], text_field, 1) for t in record["targets"]]
        )

    # Always use multilabel targets, so be sure each label is a list.
    labels = [utils.wrap_singleton_string(t["label"]) for t in record["targets"]]
    d["labels"] = ListField(
        [
            MultiLabelField(
                label_set, label_namespace=self._label_namespace, skip_indexing=False
            )
            for label_set in labels
        ]
    )
    return Instance(d)

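# _make_span_field is defined elsewhere on the task class; a minimal sketch,
# assuming it wraps AllenNLP's SpanField and shifts the record's
# [inclusive, exclusive) span by `offset` to account for the boundary token
# prepended above (the exact implementation may differ in the source):
from allennlp.data.fields import SpanField

def _make_span_field(self, span, text_field, offset=1):
    # SpanField expects inclusive start and end indices, hence the -1 on the end.
    return SpanField(span[0] + offset, span[1] - 1 + offset, text_field)
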
def _make_instance(sent_):
    """ Forward targs adds <s> as a target for input </s>
    and bwd targs adds </s> as a target for input <s>
    to avoid issues with needing to strip extra tokens
    in the input for each direction """
    d = {
        "input": sentence_to_text_field(sent_, indexers),
        "targs": sentence_to_text_field(sent_[1:] + [sent_[0]], self.target_indexer),
        "targs_b": sentence_to_text_field([sent_[-1]] + sent_[:-1], self.target_indexer),
    }
    return Instance(d)

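# Hypothetical illustration (not from the source) of the circular shift used
# above for a boundary-tokenized sentence:
sent_ = ["<s>", "the", "cat", "</s>"]

# Forward targets: each input token predicts the next one; the final </s>
# wraps around to predict <s>, so nothing has to be stripped from the input.
fwd_targs = sent_[1:] + [sent_[0]]    # ["the", "cat", "</s>", "<s>"]

# Backward targets: each input token predicts the previous one; the initial
# <s> wraps around to predict </s>.
bwd_targs = [sent_[-1]] + sent_[:-1]  # ["</s>", "<s>", "the", "cat"]
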
def run_repl(model, vocab, indexers, task, args):
    """ Run REPL """
    print("Input CTRL-C or enter 'QUIT' to terminate.")
    while True:
        try:
            print()
            input_string = input(" INPUT: ")
            if input_string == "QUIT":
                break
            tokens = process_sentence(
                tokenizer_name=task.tokenizer_name, sent=input_string, max_seq_len=args.max_seq_len
            )
            print("TOKENS:", " ".join("[{}]".format(tok) for tok in tokens))
            field = sentence_to_text_field(tokens, indexers)
            field.index(vocab)
            batch = Batch([Instance({"input1": field})]).as_tensor_dict()
            batch = move_to_device(batch, args.cuda)
            with torch.no_grad():
                out = model.forward(task, batch, predict=True)
            assert out["logits"].shape[1] == 2
            s = " PRED: "
            s += "TRUE " if out["preds"][0].item() else "FALSE"
            s += " ({:.1f}%, logits: {:.3f} vs {:.3f})".format(
                torch.softmax(out["logits"][0], dim=0)[1].item() * 100,
                out["logits"][0][0].item(),
                out["logits"][0][1].item(),
            )
            print(s)
        except KeyboardInterrupt:
            print("\nTerminating.")
            break

def _make_instance(question, choices, label, id_str):
    d = {}
    d["question_str"] = MetadataField(" ".join(question))
    if not model_preprocessing_interface.model_flags["uses_pair_embedding"]:
        d["question"] = sentence_to_text_field(
            model_preprocessing_interface.boundary_token_fn(question), indexers
        )
    for choice_idx, choice in enumerate(choices):
        inp = (
            model_preprocessing_interface.boundary_token_fn(question, choice)
            if model_preprocessing_interface.model_flags["uses_pair_embedding"]
            else model_preprocessing_interface.boundary_token_fn(choice)
        )
        d["choice%d" % choice_idx] = sentence_to_text_field(inp, indexers)
        d["choice%d_str" % choice_idx] = MetadataField(" ".join(choice))
    d["label"] = LabelField(label, label_namespace="labels", skip_indexing=True)
    d["id_str"] = MetadataField(id_str)
    return Instance(d)

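# Hypothetical call (example data, not from the source), assuming the closure
# variables model_preprocessing_interface and indexers are bound and that the
# tokens have not yet had boundary tokens added (boundary_token_fn adds them
# inside _make_instance):
instance = _make_instance(
    question=["what", "color", "is", "the", "sky", "?"],
    choices=[["blue"], ["green"], ["red"]],
    label=0,                # index of the correct choice
    id_str="example-0",
)
# The resulting Instance holds question_str, choice0..choice2,
# choice0_str..choice2_str, label, and id_str (plus "question" when the
# encoder does not use pair embeddings).
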
def _make_instance(sent_):
    sent_ = model_preprocessing_interface.boundary_token_fn(sent_)
    input_sent = sentence_to_text_field(sent_, indexers)
    d = {
        "input": input_sent,
        "targs": SequenceLabelField(sent_, input_sent, label_namespace=self._label_namespace),
    }
    return Instance(d)

def prepare_batch(tokens_batch, vocab, indexers, args):
    """ Do preprocessing for batch """
    instance_ls = []
    token_ls = []
    for tokens in tokens_batch:
        field = sentence_to_text_field(tokens, indexers)
        field.index(vocab)
        instance_ls.append(Instance({"input1": field}))
        token_ls.append(tokens)
    batch = Batch(instance_ls).as_tensor_dict()
    batch = move_to_device(batch, args.cuda)
    return batch, token_ls

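# Hypothetical usage sketch (not from the source), assuming `model`, `task`,
# `vocab`, `indexers`, and `args` are in scope as in run_repl above, and that
# inputs go through the same process_sentence preprocessing:
sentences = ["The first input sentence.", "The second input sentence."]
tokens_batch = [
    process_sentence(tokenizer_name=task.tokenizer_name, sent=s, max_seq_len=args.max_seq_len)
    for s in sentences
]
batch, token_ls = prepare_batch(tokens_batch, vocab, indexers, args)
with torch.no_grad():
    out = model.forward(task, batch, predict=True)
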
def _make_instance(psg, qst, ans_str, label, psg_idx, qst_idx, ans_idx):
    """ pq_id: passage-question ID """
    d = {}
    d["psg_str"] = MetadataField(" ".join(psg))
    d["qst_str"] = MetadataField(" ".join(qst))
    d["ans_str"] = MetadataField(ans_str)
    d["psg_idx"] = MetadataField(psg_idx)
    d["qst_idx"] = MetadataField(qst_idx)
    d["ans_idx"] = MetadataField(ans_idx)
    d["idx"] = MetadataField(ans_idx)  # required by evaluate()
    if is_using_bert:
        inp = psg + qst[1:]
        d["psg_qst_ans"] = sentence_to_text_field(inp, indexers)
    else:
        d["psg"] = sentence_to_text_field(psg, indexers)
        d["qst"] = sentence_to_text_field(qst, indexers)
    d["label"] = LabelField(label, label_namespace="labels", skip_indexing=True)
    return Instance(d)