def create_batch(self, annotations: List[XQAAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Turn a list of pre-embedded XQA annotations into a feed dictionary.

    Args:
        annotations: annotations whose attributes (tokens, embeddings, lengths,
            word-in-question features, token offsets, answer spans) are read here.
        is_eval: when true, ``keep_prob`` is 1.0 and no start targets are emitted;
            otherwise ``keep_prob`` is ``1 - self.dropout``.
        with_answers: when true, answer span ports are added to the batch.

    Returns:
        The result of ``numpify`` applied to the assembled port dictionary.
    """
    embedded_supports = [ann.support_embeddings for ann in annotations]
    embedded_questions = [ann.question_embeddings for ann in annotations]
    question_tokens = [ann.question_tokens for ann in annotations]
    support_tokens = [ann.support_tokens for ann in annotations]

    uniq_chars, uniq_lengths, q2uniq, s2uniq = preprocessing.unique_words_with_chars(
        question_tokens, support_tokens, self.char_vocab)

    # Dropout is only consulted during training.
    if is_eval:
        keep_prob = 1.0
    else:
        keep_prob = 1 - self.dropout

    feed = {
        XQAPorts.unique_word_chars: uniq_chars,
        XQAPorts.unique_word_char_length: uniq_lengths,
        XQAPorts.question_words2unique: q2uniq,
        XQAPorts.support_words2unique: s2uniq,
        XQAPorts.emb_support: preprocessing.stack_and_pad(embedded_supports),
        XQAPorts.support_length: [ann.support_length for ann in annotations],
        XQAPorts.emb_question: preprocessing.stack_and_pad(embedded_questions),
        XQAPorts.question_length: [ann.question_length for ann in annotations],
        XQAPorts.word_in_question: [ann.word_in_question for ann in annotations],
        XQAPorts.keep_prob: keep_prob,
        XQAPorts.is_eval: is_eval,
        XQAPorts.token_char_offsets: [ann.token_offsets for ann in annotations],
    }

    if with_answers:
        # Flatten the per-question span lists and remember each span's question index.
        flat_spans = []
        span_owner = []
        for q_idx, ann in enumerate(annotations):
            for span in ann.answer_spans:
                flat_spans.append(span)
                span_owner.append(q_idx)

        if is_eval:
            start_targets = []
        else:
            start_targets = [span[0] for span in flat_spans]

        feed[XQAPorts.answer_span] = flat_spans
        feed[XQAPorts.correct_start_training] = start_targets
        feed[XQAPorts.answer2question_training] = span_owner

    # we can only numpify in here, because bucketing is not possible prior
    padded_keys = [
        XQAPorts.unique_word_chars,
        XQAPorts.question_words2unique,
        XQAPorts.support_words2unique,
        XQAPorts.word_in_question,
        XQAPorts.token_char_offsets,
    ]
    return numpify(feed, keys=padded_keys)
def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Build a multiple-choice batch from annotations.

    Question and support tokens are mapped to batch-unique word ids together
    with their character representation. Depending on the
    ``vocab_from_embeddings`` config flag, either dense embedding tensors or
    the raw id sequences are placed in the batch.

    Args:
        annotations: annotations providing tokens, ids, lengths, id and answer.
        is_eval: forwarded unchanged under ``Ports.is_eval``.
        with_answers: when true, ``Ports.Target.target_index`` is added.

    Returns:
        The result of ``numpify`` applied to the assembled port dictionary.
    """
    n_samples = len(annotations)

    # Questions first, supports second, so one call covers both and the
    # result can be split again by position.
    token_seqs = [ann.question_tokens for ann in annotations]
    token_seqs = token_seqs + [ann.support_tokens for ann in annotations]
    word_chars, word_lengths, word_ids, _vocab, _rev_vocab = \
        preprocessing.unique_words_with_chars(token_seqs, self.shared_resources.char_vocab)
    question_words = word_ids[:n_samples]
    support_words = word_ids[n_samples:]

    question_lens = [ann.question_length for ann in annotations]
    support_lens = [ann.support_length for ann in annotations]

    batch = {
        Ports.Input.question_length: question_lens,
        Ports.Input.support_length: support_lens,
        Ports.Input.sample_id: [ann.id for ann in annotations],
        Ports.Input.word_chars: word_chars,
        Ports.Input.word_char_length: word_lengths,
        Ports.Input.question_words: question_words,
        Ports.Input.support_words: support_words,
        Ports.is_eval: is_eval,
    }

    if not self.shared_resources.config.get("vocab_from_embeddings", False):
        batch[Ports.Input.support] = [ann.support_ids for ann in annotations]
        batch[Ports.Input.question] = [ann.question_ids for ann in annotations]
    else:
        emb_dim = self.emb_matrix.shape[1]
        support_emb = np.zeros([n_samples, max(support_lens), emb_dim])
        question_emb = np.zeros([n_samples, max(question_lens), emb_dim])
        for row, ann in enumerate(annotations):
            for col, word_id in enumerate(ann.support_ids):
                support_emb[row, col] = self._get_emb(word_id)
            for col, word_id in enumerate(ann.question_ids):
                question_emb[row, col] = self._get_emb(word_id)
        batch[Ports.Input.emb_support] = support_emb
        batch[Ports.Input.emb_question] = question_emb

    if with_answers:
        batch[Ports.Target.target_index] = [ann.answer for ann in annotations]

    return numpify(batch)
def create_batch(self, annotations: List[Mapping[str, any]], is_eval: bool, with_answers: bool):
    """Build a batch that pairs each question/support with retrieved assertions.

    For every annotation, assertions connecting the question and support lemmas
    are fetched from ``self._knowledge_store``, sorted by descending score,
    de-duplicated on their lemmatised text and capped at ``self._limit``.
    Assertion tokens extend the batch-local vocabulary, character table and
    word-to-lemma map.

    Args:
        annotations: dict-style annotations; the keys read here are
            ``question_tokens``, ``support_tokens``, ``question_lemmas``,
            ``support_lemmas``, ``question_lengths``, ``support_lengths`` and,
            optionally, ``answers``.
        is_eval: forwarded unchanged under ``AssertionMRPorts.is_eval``.
        with_answers: not consulted by this implementation; targets are emitted
            whenever the first annotation carries an ``"answers"`` entry.

    Returns:
        The result of ``numpify`` restricted to
        ``self.output_ports + self.training_ports``.
    """
    batch_size = len(annotations)

    support_lengths = []
    question_lengths = []
    ass_lengths = []
    ass2question = []
    ass2unique = []
    lemma2idx = dict()
    question_arg_span = []
    support_arg_span = []
    assertions2question_arg_span = []
    assertions2support_arg_span = []
    question_arg_span_idx = dict()
    support_arg_span_idx = dict()

    word_chars, word_lengths, tokens, vocab, rev_vocab = \
        preprocessing.unique_words_with_chars(
            [a["question_tokens"] for a in annotations] + [a["support_tokens"] for a in annotations],
            self.char_vocab)
    question, support = tokens[:batch_size], tokens[batch_size:]

    # One slot per batch-unique word; filled below from question/support lemmas
    # and appended to as assertion words extend the vocabulary.
    word2lemma = [None] * len(rev_vocab)

    # we have to create batches here and cannot precompute them because of the
    # batch-specific wiq feature
    for i, annot in enumerate(annotations):
        support_lengths.append(annot['support_lengths'])
        question_lengths.append(annot['question_lengths'])

        # collect unique lemmas
        for k, lemma in enumerate(annot['question_lemmas']):
            if lemma not in lemma2idx:
                lemma2idx[lemma] = len(lemma2idx)
            word2lemma[question[i][k]] = lemma2idx[lemma]
        for k, lemma in enumerate(annot['support_lemmas']):
            if lemma not in lemma2idx:
                lemma2idx[lemma] = len(lemma2idx)
            word2lemma[support[i][k]] = lemma2idx[lemma]

        assertions, assertion_args = self._knowledge_store.get_connecting_assertion_keys(
            annot['question_lemmas'], annot['support_lemmas'], self._sources)
        # Highest score first.
        sorted_assertions = sorted(assertions.items(), key=lambda x: -x[1])
        added_assertions = set()
        for key, _ in sorted_assertions:
            if len(added_assertions) == self._limit:
                break
            a = self.__nlp(self._knowledge_store.get_assertion(key))
            a_lemma = " ".join(t.lemma_ for t in a)
            # Skip assertions whose lemmatised text was already added for this question.
            if a_lemma in added_assertions:
                continue
            added_assertions.add(a_lemma)

            ass2question.append(i)
            ass_lengths.append(len(a))

            # Argument spans are de-duplicated per question via (question index, start, end).
            q_arg_span = assertion_args[key][0]
            q_arg_span = (i, q_arg_span[0], q_arg_span[1])
            s_arg_span = assertion_args[key][1]
            s_arg_span = (i, s_arg_span[0], s_arg_span[1])
            if q_arg_span not in question_arg_span_idx:
                question_arg_span_idx[q_arg_span] = len(question_arg_span)
                question_arg_span.append(assertion_args[key][0])
            if s_arg_span not in support_arg_span_idx:
                support_arg_span_idx[s_arg_span] = len(support_arg_span)
                support_arg_span.append(assertion_args[key][1])
            assertions2question_arg_span.append(question_arg_span_idx[q_arg_span])
            assertions2support_arg_span.append(support_arg_span_idx[s_arg_span])

            u_ass = []
            for t in a:
                w = t.orth_
                if w not in vocab:
                    # New word: extend vocabulary, char table and lemma map in lockstep.
                    vocab[w] = len(vocab)
                    word_lengths.append(min(len(w), 20))
                    word_chars.append([self.char_vocab.get(c, 0) for c in w[:20]])
                    rev_vocab.append(w)
                    if t.lemma_ not in lemma2idx:
                        lemma2idx[t.lemma_] = len(lemma2idx)
                    word2lemma.append(lemma2idx[t.lemma_])
                u_ass.append(vocab[w])
            ass2unique.append(u_ass)

    word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
    for i, w in enumerate(rev_vocab):
        word_embeddings[i] = self._get_emb(self.shared_resources.vocab(w))

    if not ass2unique:
        # No assertion found for the whole batch: emit a single empty assertion
        # and empty [0, 2] span arrays.
        ass2unique.append([])
        question_arg_span = support_arg_span = np.zeros([0, 2], dtype=np.int32)

    output = {
        AssertionMRPorts.word_chars: word_chars,
        AssertionMRPorts.word_char_length: word_lengths,
        AssertionMRPorts.question: question,
        AssertionMRPorts.support: support,
        AssertionMRPorts.support_length: support_lengths,
        AssertionMRPorts.question_length: question_lengths,
        AssertionMRPorts.is_eval: is_eval,
        AssertionMRPorts.word_embeddings: word_embeddings,
        AssertionMRPorts.assertion_lengths: ass_lengths,
        AssertionMRPorts.assertion2question: ass2question,
        AssertionMRPorts.assertions: ass2unique,
        AssertionMRPorts.word2lemma: word2lemma,
        AssertionMRPorts.question_arg_span: question_arg_span,
        AssertionMRPorts.support_arg_span: support_arg_span,
        AssertionMRPorts.assertion2question_arg_span: assertions2question_arg_span,
        AssertionMRPorts.assertion2support_arg_span: assertions2support_arg_span,
        '__vocab': vocab,
        '__rev_vocab': rev_vocab,
        '__lemma_vocab': lemma2idx,
    }

    # Guard against an empty batch before peeking at the first annotation.
    if annotations and "answers" in annotations[0]:
        output[Ports.Target.target_index] = [a["answers"] for a in annotations]

    return numpify(output, keys=self.output_ports + self.training_ports)
def create_batch(self, annotations: List[XQAAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Build an extractive-QA batch with several supports per question.

    During training, when a question has more supports than the
    ``max_training_support`` config value (default 2), a subset is sampled with
    ``self._rng``; otherwise every support is used. Each selected support
    becomes its own row and ``support2question`` maps it back to its question.

    Args:
        annotations: annotations providing question/support tokens, ids,
            lengths, word-in-question features, offsets and answer spans.
        is_eval: disables support sampling and start targets when true.
        with_answers: when true, answer span ports are added.

    Returns:
        The result of ``numpify`` applied to the assembled port dictionary.
    """
    question_tokens = [ann.question_tokens for ann in annotations]
    question_lens = [ann.question_length for ann in annotations]
    support_cap = self.config.get('max_training_support', 2)

    support_tokens = []
    support_lens = []
    wiq_features = []
    token_offsets = []
    support2question = []
    support_id_seqs = []
    # aligns with support2question, used in output module to get correct index
    # to original set of supports
    selected_support = []
    spans_per_question = []

    for q_idx, ann in enumerate(annotations):
        question_spans = []
        spans_per_question.append(question_spans)

        n_supports = len(ann.support_tokens)
        if n_supports > support_cap > 0 and not is_eval:
            # sample only 2 paragraphs and take first with double probability
            # (the best) to speed things up.
            # Following https://arxiv.org/pdf/1710.10723.pdf
            has_answer = any(ann.answer_spans)
            # sample until there is at least one possible answer (if any)
            while True:
                drawn = self._rng.sample(range(0, n_supports + 1), support_cap + 1)
                if 0 in drawn and 1 in drawn:
                    # Slots 0 and 1 both stand for the first paragraph.
                    chosen = [d - 1 for d in drawn if d > 0]
                else:
                    chosen = [max(0, d - 1) for d in drawn[:support_cap]]
                if not has_answer or any(ann.answer_spans[c] for c in chosen):
                    break
        else:
            chosen = set(range(n_supports))

        for s_idx in chosen:
            support_tokens.append(ann.support_tokens[s_idx])
            support_lens.append(ann.support_length[s_idx])
            wiq_features.append(ann.word_in_question[s_idx])
            token_offsets.append(ann.token_offsets[s_idx])
            selected_support.append(ann.selected_supports[s_idx])
            support_id_seqs.append(ann.support_ids[s_idx])
            support2question.append(q_idx)
            if with_answers:
                question_spans.append(ann.answer_spans[s_idx])

    n_questions = len(question_tokens)
    word_chars, word_lengths, word_ids, vocab, rev_vocab = \
        preprocessing.unique_words_with_chars(question_tokens + support_tokens, self.char_vocab)

    emb_dim = self.vocab.emb_length
    emb_support = np.zeros([len(support_lens), max(support_lens), emb_dim])
    emb_question = np.zeros([len(question_lens), max(question_lens), emb_dim])
    for row, ann in enumerate(annotations):
        for col, word_id in enumerate(ann.question_ids):
            emb_question[row, col] = self._get_emb(word_id)
    for row, id_seq in enumerate(support_id_seqs):
        for col, word_id in enumerate(id_seq):
            emb_support[row, col] = self._get_emb(word_id)

    feed = {
        XQAPorts.word_chars: word_chars,
        XQAPorts.word_char_length: word_lengths,
        XQAPorts.question_words: word_ids[:n_questions],
        XQAPorts.support_words: word_ids[n_questions:],
        XQAPorts.emb_support: emb_support,
        XQAPorts.support_length: support_lens,
        XQAPorts.emb_question: emb_question,
        XQAPorts.question_length: question_lens,
        XQAPorts.word_in_question: wiq_features,
        XQAPorts.support2question: support2question,
        XQAPorts.is_eval: is_eval,
        XQAPorts.token_offsets: token_offsets,
        XQAPorts.selected_support: selected_support,
        '__vocab': vocab,
        '__rev_vocab': rev_vocab,
    }

    if with_answers:
        # One entry per selected support, in batch order.
        spans_by_support = [sp for q_spans in spans_per_question for sp in q_spans]
        flat_spans = [span for sp in spans_by_support for span in sp]
        span2support = [s_idx for s_idx, sp in enumerate(spans_by_support) for _ in sp]

        if is_eval:
            start_targets = []
        else:
            start_targets = [span[0] for span in flat_spans]

        feed[XQAPorts.answer_span] = list(flat_spans)
        feed[XQAPorts.correct_start] = start_targets
        feed[XQAPorts.answer2support_training] = span2support

    # we can only numpify in here, because bucketing is not possible prior
    padded_keys = [
        XQAPorts.word_chars,
        XQAPorts.question_words,
        XQAPorts.support_words,
        XQAPorts.word_in_question,
        XQAPorts.token_offsets,
    ]
    return numpify(feed, keys=padded_keys)
def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Build a multiple-choice batch, optionally with dependency and embedding ports.

    With the ``use_dep_sa`` config flag set, only lengths, dependency arcs and
    ``is_eval`` are emitted; otherwise the word/character ports and id
    sequences are emitted. If ``self.embeddings`` is set, dense token
    embeddings are looked up per token, falling back to ``self.__default_vec``.

    Args:
        annotations: annotations providing tokens, ids, lengths, id, answer and
            (for ``use_dep_sa``) dependency attributes.
        is_eval: forwarded unchanged under ``Ports.is_eval``.
        with_answers: when true, ``Ports.Target.target_index`` is added.

    Returns:
        The result of ``numpify`` applied to the assembled port dictionary.
    """
    n_samples = len(annotations)

    # also add character information
    token_seqs = [ann.question_tokens for ann in annotations]
    token_seqs = token_seqs + [ann.support_tokens for ann in annotations]
    word_chars, word_lengths, word_ids, _vocab, _rev_vocab = \
        preprocessing.unique_words_with_chars(token_seqs, self.shared_resources.char_vocab)
    question_words = word_ids[:n_samples]
    support_words = word_ids[n_samples:]

    question_lens = [ann.question_length for ann in annotations]
    support_lens = [ann.support_length for ann in annotations]

    if not self.shared_resources.config.get('use_dep_sa', False):
        batch = {
            Ports.Input.question_length: question_lens,
            Ports.Input.support_length: support_lens,
            Ports.Input.sample_id: [ann.id for ann in annotations],
            Ports.Input.word_chars: word_chars,
            Ports.Input.word_char_length: word_lengths,
            Ports.Input.question_batch_words: question_words,
            Ports.Input.support_batch_words: support_words,
            Ports.is_eval: is_eval,
            Ports.Input.support: [ann.support_ids for ann in annotations],
            Ports.Input.question: [ann.question_ids for ann in annotations],
        }
    else:
        batch = {
            Ports.Input.support_length: support_lens,
            Ports.Input.support_dep_i: [ann.support_dep_i for ann in annotations],
            Ports.Input.support_dep_j: [ann.support_dep_j for ann in annotations],
            Ports.Input.support_dep_type: [ann.support_dep_type for ann in annotations],
            Ports.Input.question_length: question_lens,
            Ports.Input.question_dep_i: [ann.question_dep_i for ann in annotations],
            Ports.Input.question_dep_j: [ann.question_dep_j for ann in annotations],
            Ports.Input.question_dep_type: [ann.question_dep_type for ann in annotations],
            Ports.is_eval: is_eval,
        }

    if self.embeddings is not None:
        emb_dim = self.embeddings.shape[-1]
        support_emb = np.zeros([n_samples, max(support_lens), emb_dim])
        question_emb = np.zeros([n_samples, max(question_lens), emb_dim])
        for row, ann in enumerate(annotations):
            for col, token in enumerate(ann.support_tokens):
                support_emb[row, col] = self.embeddings.get(token, self.__default_vec)
            for col, token in enumerate(ann.question_tokens):
                question_emb[row, col] = self.embeddings.get(token, self.__default_vec)
        batch[Ports.Input.emb_support] = support_emb
        batch[Ports.Input.emb_question] = question_emb

    if with_answers:
        batch[Ports.Target.target_index] = [ann.answer for ann in annotations]

    return numpify(batch)
def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Build a multiple-choice batch with ELMo embeddings for question and support.

    With the ``use_dep_sa`` config flag set, only lengths, dependency arcs and
    ``is_eval`` are emitted; otherwise the word/character ports and id
    sequences are emitted. The port dictionary is passed through ``numpify``
    first; the ELMo representations are added afterwards, so they are stored
    exactly as returned by ``self.elmo`` (detached, on the GPU when CUDA is
    available).

    Args:
        annotations: annotations providing tokens, ids, lengths, id, answer and
            (for ``use_dep_sa``) dependency attributes.
        is_eval: forwarded unchanged under ``Ports.is_eval``.
        with_answers: when true, ``Ports.Target.target_index`` is added.

    Returns:
        The numpified port dictionary extended with ``Ports.Input.emb_support``
        and ``Ports.Input.emb_question``.
    """
    word_chars, word_lengths, tokens, vocab, rev_vocab = \
        preprocessing.unique_words_with_chars(
            [a.question_tokens for a in annotations] + [a.support_tokens for a in annotations],
            self.shared_resources.char_vocab)
    question_words, support_words = tokens[:len(annotations)], tokens[len(annotations):]

    q_lengths = [a.question_length for a in annotations]
    s_lengths = [a.support_length for a in annotations]

    if self.shared_resources.config.get('use_dep_sa', False):
        xy_dict = {
            Ports.Input.support_length: s_lengths,
            Ports.Input.support_dep_i: [a.support_dep_i for a in annotations],
            Ports.Input.support_dep_j: [a.support_dep_j for a in annotations],
            Ports.Input.support_dep_type: [a.support_dep_type for a in annotations],
            Ports.Input.question_length: q_lengths,
            Ports.Input.question_dep_i: [a.question_dep_i for a in annotations],
            Ports.Input.question_dep_j: [a.question_dep_j for a in annotations],
            Ports.Input.question_dep_type: [a.question_dep_type for a in annotations],
            Ports.is_eval: is_eval,
        }
    else:
        xy_dict = {
            Ports.Input.question_length: q_lengths,
            Ports.Input.support_length: s_lengths,
            Ports.Input.sample_id: [a.id for a in annotations],
            Ports.Input.word_chars: word_chars,
            Ports.Input.word_char_length: word_lengths,
            Ports.Input.question_batch_words: question_words,
            Ports.Input.support_batch_words: support_words,
            Ports.is_eval: is_eval,
            Ports.Input.support: [a.support_ids for a in annotations],
            Ports.Input.question: [a.question_ids for a in annotations],
        }

    if with_answers:
        xy_dict[Ports.Target.target_index] = [a.answer for a in annotations]

    xy_dict = numpify(xy_dict)

    # Elmo embeddings: computed from the raw token strings, after numpify so
    # the resulting tensors are not converted.
    tokens_support = [a.support_tokens for a in annotations]
    tokens_question = [a.question_tokens for a in annotations]

    chars_support = batch_to_ids(tokens_support)
    chars_question = batch_to_ids(tokens_question)
    if torch.cuda.is_available():
        chars_support = chars_support.cuda()
        chars_question = chars_question.cuda()

    # Embeddings are used as fixed features here; no gradients are tracked.
    with torch.no_grad():
        emb_support = self.elmo(chars_support)['elmo_representations'][0].detach()
        emb_question = self.elmo(chars_question)['elmo_representations'][0].detach()

    xy_dict[Ports.Input.emb_support] = emb_support
    xy_dict[Ports.Input.emb_question] = emb_question
    return xy_dict
def create_batch(self, annotations, is_eval: bool, with_answers: bool):
    """Build an extractive-QA batch with sampled supports and retrieved assertions.

    Supports are selected per question (sampled during training when there are
    more than the ``max_training_support`` config value, default 2; all of them
    otherwise). Assertions are then retrieved from ``self._knowledge_store``
    using the ``heuristic`` config value (``'pair'`` by default, or
    ``'tfidf'``), de-duplicated on their lemmatised text and capped at
    ``self._limit`` per question.

    Args:
        annotations: annotations providing question/support tokens, lemmas,
            lengths, word-in-question features, offsets and answer spans.
        is_eval: disables support sampling and start targets when true.
        with_answers: when true, answer span ports are added.

    Returns:
        The result of ``numpify`` restricted to
        ``self.output_ports + self.training_ports``.

    Raises:
        ValueError: if ``self._limit`` is non-zero and the configured heuristic
            is neither ``'pair'`` nor ``'tfidf'``.
    """
    q_tokenized = [a.question_tokens for a in annotations]
    question_lengths = [a.question_length for a in annotations]

    max_training_support = self.config.get('max_training_support', 2)
    s_tokenized = []
    s_lemmas = []
    support_lengths = []
    wiq = []
    offsets = []
    support2question = []
    # aligns with support2question, used in output module to get correct index
    # to original set of supports
    selected_support = []
    all_spans = []
    for i, a in enumerate(annotations):
        s_lemmas.append([])
        all_spans.append([])
        if len(a.support_tokens) > max_training_support > 0 and not is_eval:
            # sample only 2 paragraphs and take first with double probability
            # (the best) to speed things up.
            # Following https://arxiv.org/pdf/1710.10723.pdf
            is_done = False
            any_answer = any(a.answer_spans)
            # sample until there is at least one possible answer (if any)
            while not is_done:
                selected = self._rng.sample(
                    range(0, len(a.support_tokens) + 1), max_training_support + 1)
                if 0 in selected and 1 in selected:
                    selected = [s - 1 for s in selected if s > 0]
                else:
                    selected = [max(0, s - 1) for s in selected[:max_training_support]]
                is_done = not any_answer or any(a.answer_spans[s] for s in selected)
            # The indices were already mapped to paragraph positions inside the
            # loop. Shifting them by -1 a second time would select different
            # paragraphs than the ones just checked for answers.
            selected = set(selected)
        else:
            selected = set(range(len(a.support_tokens)))
        for s in selected:
            s_tokenized.append(a.support_tokens[s])
            s_lemmas[-1].append(a.support_lemmas[s])
            support_lengths.append(a.support_length[s])
            wiq.append(a.word_in_question[s])
            offsets.append(a.token_offsets[s])
            selected_support.append(a.selected_supports[s])
            support2question.append(i)
            if with_answers:
                all_spans[-1].append(a.answer_spans[s])

    word_chars, word_lengths, word_ids, vocab, rev_vocab = \
        preprocessing.unique_words_with_chars(q_tokenized + s_tokenized, self.char_vocab)

    question = word_ids[:len(q_tokenized)]
    support = word_ids[len(q_tokenized):]

    ass_lengths = []
    ass2question = []
    ass2unique = []
    lemma2idx = dict()
    question_arg_span = []
    support_arg_span = []
    assertion2question_arg_span = []
    assertion2support_arg_span = []
    question_arg_span_idx = dict()
    support_arg_span_idx = dict()

    # One slot per batch-unique word; appended to as assertion words extend the vocabulary.
    word2lemma = [None] * len(rev_vocab)

    heuristic = self.config.get('heuristic', 'pair')
    # Row offset of the current question's first support within `support`.
    s_offset = 0
    for i, annot in enumerate(annotations):
        # collect unique lemmas
        for k, l in enumerate(annot.question_lemmas):
            if l not in lemma2idx:
                lemma2idx[l] = len(lemma2idx)
            word2lemma[question[i][k]] = lemma2idx[l]
        for k, ls in enumerate(s_lemmas[i]):
            for k2, l in enumerate(ls):
                if l not in lemma2idx:
                    lemma2idx[l] = len(lemma2idx)
                word2lemma[support[s_offset + k][k2]] = lemma2idx[l]

        if self._limit == 0:
            s_offset += len(s_lemmas[i])
            continue

        if heuristic == 'pair':
            assertions, assertion_args = self._knowledge_store.get_connecting_assertion_keys(
                annot.question_lemmas, [l for ls in s_lemmas[i] for l in ls], self._sources)
        elif heuristic == 'tfidf':
            assertions, assertion_args = self._knowledge_store.get_assertion_keys(
                [l for ls in s_lemmas[i] for l in ls], self._sources)
            assertions = list(assertions.keys())
            assertion_strings = [self._knowledge_store.get_assertion(key) for key in assertions]
            scores = sort_by_tfidf(' '.join(annot.question_tokens), assertion_strings)
            assertions = {assertions[idx]: score for idx, score in scores}
        else:
            # Without this, `assertions` would be undefined on the first question
            # and stale (from the previous question) afterwards.
            raise ValueError("Unknown assertion retrieval heuristic: %r" % (heuristic,))

        # Highest score first.
        sorted_assertions = sorted(assertions.items(), key=lambda x: -x[1])
        added_assertions = set()
        for key, _ in sorted_assertions:
            if len(added_assertions) == self._limit:
                break
            a = self._nlp(self._knowledge_store.get_assertion(key, cache=True))
            a_lemma = " ".join(t.lemma_ for t in a)
            # Skip assertions whose lemmatised text was already added for this question.
            if a_lemma in added_assertions:
                continue
            added_assertions.add(a_lemma)

            ass2question.append(i)
            ass_lengths.append(len(a))
            if heuristic == 'pair':
                q_arg_span = assertion_args[key][0]
                q_arg_span = (i, q_arg_span[0], q_arg_span[1])
                # The support span is given over the concatenated support lemmas;
                # walk the documents to find the one containing its start.
                s_arg_start, s_arg_end = assertion_args[key][1]
                doc_idx = 0
                for ls in s_lemmas[i]:
                    if s_arg_start < len(ls):
                        break
                    doc_idx += 1
                    s_arg_start -= len(ls)
                    s_arg_end -= len(ls)
                s_arg_span = (s_offset + doc_idx, s_arg_start, s_arg_end)
                if q_arg_span not in question_arg_span_idx:
                    question_arg_span_idx[q_arg_span] = len(question_arg_span)
                    question_arg_span.append(assertion_args[key][0])
                if s_arg_span not in support_arg_span_idx:
                    support_arg_span_idx[s_arg_span] = len(support_arg_span)
                    # NOTE(review): the stored span is the original one over the
                    # concatenated lemmas, not the document-relative span used as
                    # the de-duplication key above - confirm this is intended.
                    support_arg_span.append(assertion_args[key][1])
                assertion2question_arg_span.append(question_arg_span_idx[q_arg_span])
                assertion2support_arg_span.append(support_arg_span_idx[s_arg_span])

            u_ass = []
            for t in a:
                w = t.orth_
                if w not in vocab:
                    # New word: extend vocabulary, char table and lemma map in lockstep.
                    vocab[w] = len(vocab)
                    word_lengths.append(min(len(w), 20))
                    word_chars.append([self.char_vocab.get(c, 0) for c in w[:20]])
                    rev_vocab.append(w)
                    if t.lemma_ not in lemma2idx:
                        lemma2idx[t.lemma_] = len(lemma2idx)
                    word2lemma.append(lemma2idx[t.lemma_])
                u_ass.append(vocab[w])
            ass2unique.append(u_ass)

        s_offset += len(s_lemmas[i])

    word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
    for i, w in enumerate(rev_vocab):
        word_embeddings[i] = self._get_emb(self.vocab(w))

    if not ass2unique:
        # No assertion found for the whole batch: emit a single empty assertion
        # and empty [0, 2] span arrays.
        ass2unique.append([])
        question_arg_span = support_arg_span = np.zeros([0, 2], dtype=np.int32)

    output = {
        AssertionMRPorts.word_chars: word_chars,
        AssertionMRPorts.word_char_length: word_lengths,
        AssertionMRPorts.question: question,
        AssertionMRPorts.support: support,
        AssertionMRPorts.support_length: support_lengths,
        AssertionMRPorts.question_length: question_lengths,
        AssertionMRPorts.is_eval: is_eval,
        AssertionMRPorts.word_embeddings: word_embeddings,
        AssertionMRPorts.assertion_lengths: ass_lengths,
        AssertionMRPorts.assertion2question: ass2question,
        AssertionMRPorts.assertions: ass2unique,
        AssertionMRPorts.word2lemma: word2lemma,
        AssertionMRPorts.question_arg_span: question_arg_span,
        AssertionMRPorts.support_arg_span: support_arg_span,
        AssertionMRPorts.assertion2question_arg_span: assertion2question_arg_span,
        AssertionMRPorts.assertion2support_arg_span: assertion2support_arg_span,
        XQAPorts.word_in_question: wiq,
        XQAPorts.support2question: support2question,
        XQAPorts.token_offsets: offsets,
        XQAPorts.selected_support: selected_support,
        '__vocab': vocab,
        '__rev_vocab': rev_vocab,
        '__lemma_vocab': lemma2idx,
    }

    if with_answers:
        spans = [s for a in all_spans for spans_per_support in a for s in spans_per_support]
        # Map every span to the batch row of the support it belongs to.
        span2support = []
        support_idx = 0
        for a in all_spans:
            for spans_per_support in a:
                span2support.extend([support_idx] * len(spans_per_support))
                support_idx += 1
        output.update({
            XQAPorts.answer_span_target: list(spans) if spans else np.zeros([0, 2], np.int32),
            XQAPorts.correct_start: [] if is_eval else [span[0] for span in spans],
            XQAPorts.answer2support_training: span2support,
        })

    # we can only numpify in here, because bucketing is not possible prior
    batch = numpify(output, keys=self.output_ports + self.training_ports)
    return batch