def create_batch(self, triples: List[List[int]], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Turn a list of (subject, predicate, object) triples into a batch.

    When `with_answers` is set, every input triple is labelled 1 and, for each
    of them, `num_negative` (config, default 1) pairs of corrupted triples
    labelled 0 are appended: one with a random subject and one with a random
    object.

    Args:
        triples: index triples `[s, p, o]`.
        is_eval: not used by this implementation.
        with_answers: whether to add targets and negative samples.

    Returns:
        The result of `numpify` on a dict with `Ports.Input.question` and,
        if `with_answers`, `Ports.Target.target_index`.
    """
    positives = list(triples)
    batch_triples = list(positives)
    xy_dict = {Ports.Input.question: batch_triples}

    if with_answers:
        nb_entities = self.shared_resources.nb_entities
        num_negative = self.shared_resources.config.get('num_negative', 1)
        target = [1] * len(positives)

        for s, p, o in positives:
            for _ in range(num_negative):
                # Both corrupted slots hold entities, so both indices are
                # drawn from the entity range (not the predicate range).
                # NOTE(review): if `_kbp_rng` is a `random.Random`, `randint`
                # includes the upper bound -- confirm `nb_entities` itself is
                # a valid entity index.
                random_subject_index = self._kbp_rng.randint(0, nb_entities)
                random_object_index = self._kbp_rng.randint(0, nb_entities)

                batch_triples.append([random_subject_index, p, o])
                batch_triples.append([s, p, random_object_index])
                target.extend((0, 0))

        xy_dict[Ports.Target.target_index] = target

    return numpify(xy_dict)
def create_batch(self, annotations: List[XQAAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Assemble the port -> value mapping for a batch of XQA annotations.

    Args:
        annotations: pre-processed instances providing tokens, embeddings,
            lengths, word-in-question features, offsets and answer spans.
        is_eval: selects a keep probability of 1.0 and leaves the
            `correct_start_training` port empty.
        with_answers: whether to add the answer-span related ports.

    Returns:
        The result of `numpify`, applied to a selected subset of the ports.
    """
    question_tokens = [ann.question_tokens for ann in annotations]
    support_tokens = [ann.support_tokens for ann in annotations]
    unique_words, unique_word_lengths, question2unique, support2unique = \
        unique_words_with_chars(question_tokens, support_tokens, self.char_vocab)

    output = {
        XQAPorts.unique_word_chars: unique_words,
        XQAPorts.unique_word_char_length: unique_word_lengths,
        XQAPorts.question_words2unique: question2unique,
        XQAPorts.support_words2unique: support2unique,
        XQAPorts.emb_support: stack_and_pad(
            [ann.support_embeddings for ann in annotations]),
        XQAPorts.support_length: [ann.support_length for ann in annotations],
        XQAPorts.emb_question: stack_and_pad(
            [ann.question_embeddings for ann in annotations]),
        XQAPorts.question_length: [ann.question_length for ann in annotations],
        XQAPorts.word_in_question: [ann.word_in_question for ann in annotations],
        XQAPorts.keep_prob: 1.0 if is_eval else 1 - self.dropout,
        XQAPorts.is_eval: is_eval,
        XQAPorts.token_char_offsets: [ann.token_offsets for ann in annotations],
    }

    if with_answers:
        spans_per_question = [ann.answer_spans for ann in annotations]
        flat_spans = [span for span_list in spans_per_question for span in span_list]
        # one entry per span, holding the index of the question it belongs to
        span2question = [
            q_idx
            for q_idx, span_list in enumerate(spans_per_question)
            for _ in span_list
        ]
        output[XQAPorts.answer_span] = flat_spans
        output[XQAPorts.correct_start_training] = \
            [] if is_eval else [span[0] for span in flat_spans]
        output[XQAPorts.answer2question_training] = span2question

    # we can only numpify in here, because bucketing is not possible prior
    return numpify(output, keys=[
        XQAPorts.unique_word_chars,
        XQAPorts.question_words2unique,
        XQAPorts.support_words2unique,
        XQAPorts.word_in_question,
        XQAPorts.token_char_offsets,
    ])
def create_batch(self, triples: List[List[int]], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Wrap the given triples into a batch with placeholder support/candidates.

    `is_eval` and `with_answers` are not used; the support and candidate ports
    are filled with one zero per triple.

    Returns:
        The result of `numpify` on the assembled dict.
    """
    placeholder = [0] * len(triples)
    return numpify({
        Ports.Input.multiple_support: placeholder,
        Ports.Input.question: triples,
        Ports.Input.atomic_candidates: list(placeholder),
    })
def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Build a batch from MC annotations, including character information.

    If the config flag `vocab_from_embeddings` is set, question and support
    are provided as zero-padded embedding tensors looked up via `_get_emb`;
    otherwise the raw id sequences are passed on.

    Args:
        annotations: the instances of this batch.
        is_eval: forwarded unchanged on `Ports.is_eval`.
        with_answers: whether to add `Ports.Target.target_index`.

    Returns:
        The result of `numpify` on the assembled dict.
    """
    batch_size = len(annotations)

    # character information over the unique words of questions + supports
    all_tokens = [ann.question_tokens for ann in annotations]
    all_tokens += [ann.support_tokens for ann in annotations]
    word_chars, word_lengths, tokens, _vocab, _rev_vocab = \
        preprocessing.unique_words_with_chars(
            all_tokens, self.shared_resources.char_vocab)
    question_words = tokens[:batch_size]
    support_words = tokens[batch_size:]

    q_lengths = [ann.question_length for ann in annotations]
    s_lengths = [ann.support_length for ann in annotations]

    xy_dict = {
        Ports.Input.question_length: q_lengths,
        Ports.Input.support_length: s_lengths,
        Ports.Input.sample_id: [ann.id for ann in annotations],
        Ports.Input.word_chars: word_chars,
        Ports.Input.word_char_length: word_lengths,
        Ports.Input.question_words: question_words,
        Ports.Input.support_words: support_words,
        Ports.is_eval: is_eval,
    }

    if not self.shared_resources.config.get("vocab_from_embeddings", False):
        xy_dict[Ports.Input.support] = [ann.support_ids for ann in annotations]
        xy_dict[Ports.Input.question] = [ann.question_ids for ann in annotations]
    else:
        emb_dim = self.emb_matrix.shape[1]
        emb_support = np.zeros([batch_size, max(s_lengths), emb_dim])
        emb_question = np.zeros([batch_size, max(q_lengths), emb_dim])
        for row, ann in enumerate(annotations):
            for col, token_id in enumerate(ann.support_ids):
                emb_support[row, col] = self._get_emb(token_id)
            for col, token_id in enumerate(ann.question_ids):
                emb_question[row, col] = self._get_emb(token_id)
        xy_dict[Ports.Input.emb_support] = emb_support
        xy_dict[Ports.Input.emb_question] = emb_question

    if with_answers:
        xy_dict[Ports.Target.target_index] = [ann.answer for ann in annotations]

    return numpify(xy_dict)
def create_batch(self, annotations: List[Mapping[str, Any]], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Collect questions and candidates (and first answers) into a batch.

    Args:
        annotations: dicts with the keys "question" and "candidates", plus
            "answers" when `with_answers` is set.
        is_eval: not used by this implementation.
        with_answers: whether to add the first answer of each annotation as
            `Ports.Target.target_index`.

    Returns:
        The result of `numpify` on the assembled dict.
    """
    questions = []
    candidates = []
    for annotation in annotations:
        questions.append(annotation["question"])
        candidates.append(annotation["candidates"])

    output = {
        Ports.Input.question: questions,
        Ports.Input.atomic_candidates: candidates,
    }
    if with_answers:
        output[Ports.Target.target_index] = [
            annotation["answers"][0] for annotation in annotations
        ]
    return numpify(output)
def create_batch(self, annotations: List[Mapping[str, any]], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Gather the per-annotation fields into one batch dict.

    Targets are added whenever the first annotation carries an "answers"
    key; `is_eval` and `with_answers` are not consulted.

    Returns:
        The result of `numpify` on the assembled dict.
    """
    def column(field):
        # the values of one annotation field, in batch order
        return [annotation[field] for annotation in annotations]

    xy_dict = {
        Ports.Input.support: column("supports"),
        Ports.Input.question: column("question"),
        Ports.Input.question_length: column("question_lengths"),
        Ports.Input.support_length: column('support_lengths'),
        Ports.Input.sample_id: column('ids'),
    }

    if "answers" in annotations[0]:
        xy_dict[Ports.Target.target_index] = column("answers")

    return numpify(xy_dict)
def create_batch(self, annotations: List[Mapping[str, any]], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Gather dict annotations into a batch, optionally pre-embedded.

    With the config flag `vocab_from_embeddings` the question and support ids
    are replaced by zero-padded embedding tensors obtained from `_get_emb`;
    otherwise the id sequences themselves are passed on. Targets are added
    whenever the first annotation carries an "answers" key.

    Returns:
        The result of `numpify`, applied to the support/question ports and
        their length ports.
    """
    q_lengths = [annotation["question_lengths"] for annotation in annotations]
    s_lengths = [annotation["support_lengths"] for annotation in annotations]
    sample_ids = [annotation['ids'] for annotation in annotations]

    if not self.shared_resources.config.get("vocab_from_embeddings", False):
        xy_dict = {
            Ports.Input.support: [annotation["supports"] for annotation in annotations],
            Ports.Input.question: [annotation["question"] for annotation in annotations],
            Ports.Input.question_length: q_lengths,
            Ports.Input.support_length: s_lengths,
            Ports.Input.sample_id: sample_ids,
        }
    else:
        batch_size = len(annotations)
        emb_dim = self.emb_matrix.shape[1]
        emb_support = np.zeros([batch_size, max(s_lengths), emb_dim])
        emb_question = np.zeros([batch_size, max(q_lengths), emb_dim])
        for row, annotation in enumerate(annotations):
            for col, token_id in enumerate(annotation["supports"]):
                emb_support[row, col] = self._get_emb(token_id)
            for col, token_id in enumerate(annotation["question"]):
                emb_question[row, col] = self._get_emb(token_id)
        xy_dict = {
            Ports.Input.embedded_support: emb_support,
            Ports.Input.embedded_question: emb_question,
            Ports.Input.question_length: q_lengths,
            Ports.Input.support_length: s_lengths,
            Ports.Input.sample_id: sample_ids,
        }

    if "answers" in annotations[0]:
        xy_dict[Ports.Target.target_index] = [
            annotation["answers"] for annotation in annotations
        ]

    numpify_keys = [
        Ports.Input.support,
        Ports.Input.question,
        Ports.Input.question_length,
        Ports.Input.support_length,
    ]
    return numpify(xy_dict, keys=numpify_keys)
def create_batch(self, annotations: List[Mapping[str, any]], is_eval: bool, with_answers: bool):
    """Build a batch that pairs each instance with assertions from the knowledge store.

    For every annotation the assertions connecting its question and support
    lemmas are fetched, ranked by descending score, de-duplicated by their
    lemmatised text and capped at `self._limit`. Assertion words are added to
    the batch-local vocabulary, and a per-word embedding matrix is produced.

    Targets are added whenever the first annotation carries an "answers" key;
    `with_answers` is not consulted.

    Returns:
        The result of `numpify` restricted to the output and training ports.
    """
    batch_size = len(annotations)

    support_lengths = []
    question_lengths = []
    ass_lengths = []
    ass2question = []
    ass2unique = []
    lemma2idx = {}
    answer_labels = []
    question_arg_span = []
    support_arg_span = []
    assertions2question_arg_span = []
    assertions2support_arg_span = []
    question_arg_span_idx = {}
    support_arg_span_idx = {}

    all_tokens = [annot["question_tokens"] for annot in annotations]
    all_tokens += [annot["support_tokens"] for annot in annotations]
    word_chars, word_lengths, tokens, vocab, rev_vocab = \
        preprocessing.unique_words_with_chars(all_tokens, self.char_vocab)
    question = tokens[:batch_size]
    support = tokens[batch_size:]

    word2lemma = [None] * len(rev_vocab)

    # we have to create batches here and cannot precompute them because of the
    # batch-specific wiq feature
    for idx, annot in enumerate(annotations):
        support_lengths.append(annot['support_lengths'])
        question_lengths.append(annot['question_lengths'])

        if "answers" in annot:
            answer_labels.append(annot["answers"])

        # collect unique lemmas and map every batch word id to its lemma id
        for pos, lemma in enumerate(annot['question_lemmas']):
            word2lemma[question[idx][pos]] = lemma2idx.setdefault(lemma, len(lemma2idx))
        for pos, lemma in enumerate(annot['support_lemmas']):
            word2lemma[support[idx][pos]] = lemma2idx.setdefault(lemma, len(lemma2idx))

        assertions, assertion_args = self._knowledge_store.get_connecting_assertion_keys(
            annot['question_lemmas'], annot['support_lemmas'], self._sources)
        ranked_assertions = sorted(assertions.items(), key=lambda item: -item[1])

        seen_assertion_lemmas = set()
        for key, _ in ranked_assertions:
            if len(seen_assertion_lemmas) == self._limit:
                break
            parsed = self.__nlp(self._knowledge_store.get_assertion(key))
            parsed_lemma = " ".join(tok.lemma_ for tok in parsed)
            if parsed_lemma in seen_assertion_lemmas:
                continue
            seen_assertion_lemmas.add(parsed_lemma)

            ass2question.append(idx)
            ass_lengths.append(len(parsed))

            q_span, s_span = assertion_args[key][0], assertion_args[key][1]
            q_span_key = (idx, q_span[0], q_span[1])
            s_span_key = (idx, s_span[0], s_span[1])
            if q_span_key not in question_arg_span_idx:
                question_arg_span_idx[q_span_key] = len(question_arg_span)
                question_arg_span.append(assertion_args[key][0])
            if s_span_key not in support_arg_span_idx:
                support_arg_span_idx[s_span_key] = len(support_arg_span)
                support_arg_span.append(assertion_args[key][1])
            assertions2question_arg_span.append(question_arg_span_idx[q_span_key])
            assertions2support_arg_span.append(support_arg_span_idx[s_span_key])

            assertion_word_ids = []
            for tok in parsed:
                word = tok.orth_
                if word not in vocab:
                    # extend the batch-local vocabulary with unseen assertion words
                    vocab[word] = len(vocab)
                    word_lengths.append(min(len(word), 20))
                    word_chars.append([self.char_vocab.get(c, 0) for c in word[:20]])
                    rev_vocab.append(word)
                    word2lemma.append(lemma2idx.setdefault(tok.lemma_, len(lemma2idx)))
                assertion_word_ids.append(vocab[word])
            ass2unique.append(assertion_word_ids)

    word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
    for word_id, word in enumerate(rev_vocab):
        word_embeddings[word_id] = self._get_emb(self.shared_resources.vocab(word))

    if not ass2unique:
        # no assertion at all: provide one empty assertion and empty span arrays
        ass2unique.append([])
        question_arg_span = support_arg_span = np.zeros([0, 2], dtype=np.int32)

    output = {
        AssertionMRPorts.word_chars: word_chars,
        AssertionMRPorts.word_char_length: word_lengths,
        AssertionMRPorts.question: question,
        AssertionMRPorts.support: support,
        AssertionMRPorts.support_length: support_lengths,
        AssertionMRPorts.question_length: question_lengths,
        AssertionMRPorts.is_eval: is_eval,
        AssertionMRPorts.word_embeddings: word_embeddings,
        AssertionMRPorts.assertion_lengths: ass_lengths,
        AssertionMRPorts.assertion2question: ass2question,
        AssertionMRPorts.assertions: ass2unique,
        AssertionMRPorts.word2lemma: word2lemma,
        AssertionMRPorts.question_arg_span: question_arg_span,
        AssertionMRPorts.support_arg_span: support_arg_span,
        AssertionMRPorts.assertion2question_arg_span: assertions2question_arg_span,
        AssertionMRPorts.assertion2support_arg_span: assertions2support_arg_span,
        '__vocab': vocab,
        '__rev_vocab': rev_vocab,
        '__lemma_vocab': lemma2idx,
    }

    if "answers" in annotations[0]:
        output[Ports.Target.target_index] = [annot["answers"] for annot in annotations]

    return numpify(output, keys=self.output_ports + self.training_ports)
def create_batch(self, annotations: List[XQAAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Build a multi-support XQA batch, sub-sampling supports during training.

    When an annotation has more supports than the config value
    `max_training_support` (default 2) and `is_eval` is false, only a random
    subset of supports is used; otherwise all supports are used.

    Args:
        annotations: the instances of this batch.
        is_eval: disables support sampling and empties `correct_start`.
        with_answers: whether to add the answer-span related ports.

    Returns:
        The result of `numpify`, applied to a selected subset of the ports.
    """
    max_training_support = self.config.get('max_training_support', 2)

    q_tokenized = [ann.question_tokens for ann in annotations]
    question_lengths = [ann.question_length for ann in annotations]

    s_tokenized, support_lengths, wiq, offsets = [], [], [], []
    support2question, support_ids = [], []
    # aligns with support2question, used in output module to get correct index
    # to original set of supports
    selected_support = []
    all_spans = []

    for q_idx, ann in enumerate(annotations):
        spans_of_question = []
        all_spans.append(spans_of_question)
        num_supports = len(ann.support_tokens)

        if num_supports > max_training_support > 0 and not is_eval:
            # sample only 2 paragraphs and take first with double probability
            # (the best) to speed things up.
            # Following https://arxiv.org/pdf/1710.10723.pdf
            any_answer = any(ann.answer_spans)
            # sample until there is at least one possible answer (if any)
            while True:
                drawn = self._rng.sample(range(0, num_supports + 1), max_training_support + 1)
                if 0 in drawn and 1 in drawn:
                    selected = [d - 1 for d in drawn if d > 0]
                else:
                    selected = [max(0, d - 1) for d in drawn[:max_training_support]]
                if not any_answer or any(ann.answer_spans[s] for s in selected):
                    break
        else:
            selected = set(range(num_supports))

        for s in selected:
            s_tokenized.append(ann.support_tokens[s])
            support_lengths.append(ann.support_length[s])
            wiq.append(ann.word_in_question[s])
            offsets.append(ann.token_offsets[s])
            selected_support.append(ann.selected_supports[s])
            support_ids.append(ann.support_ids[s])
            support2question.append(q_idx)
            if with_answers:
                spans_of_question.append(ann.answer_spans[s])

    word_chars, word_lengths, word_ids, vocab, rev_vocab = \
        preprocessing.unique_words_with_chars(q_tokenized + s_tokenized, self.char_vocab)

    emb_support = np.zeros([len(support_lengths), max(support_lengths), self.vocab.emb_length])
    emb_question = np.zeros([len(question_lengths), max(question_lengths), self.vocab.emb_length])

    for row, ann in enumerate(annotations):
        for col, q_id in enumerate(ann.question_ids):
            emb_question[row, col] = self._get_emb(q_id)
    for row, s_ids in enumerate(support_ids):
        for col, s_id in enumerate(s_ids):
            emb_support[row, col] = self._get_emb(s_id)

    num_questions = len(q_tokenized)
    output = {
        XQAPorts.word_chars: word_chars,
        XQAPorts.word_char_length: word_lengths,
        XQAPorts.question_words: word_ids[:num_questions],
        XQAPorts.support_words: word_ids[num_questions:],
        XQAPorts.emb_support: emb_support,
        XQAPorts.support_length: support_lengths,
        XQAPorts.emb_question: emb_question,
        XQAPorts.question_length: question_lengths,
        XQAPorts.word_in_question: wiq,
        XQAPorts.support2question: support2question,
        XQAPorts.is_eval: is_eval,
        XQAPorts.token_offsets: offsets,
        XQAPorts.selected_support: selected_support,
        '__vocab': vocab,
        '__rev_vocab': rev_vocab,
    }

    if with_answers:
        # flatten question -> support -> span, numbering supports batch-wide
        spans_by_support = [
            spans_per_support
            for spans_of_question in all_spans
            for spans_per_support in spans_of_question
        ]
        spans = [span for spans_per_support in spans_by_support for span in spans_per_support]
        span2support = [
            support_idx
            for support_idx, spans_per_support in enumerate(spans_by_support)
            for _ in spans_per_support
        ]
        output[XQAPorts.answer_span] = list(spans)
        output[XQAPorts.correct_start] = [] if is_eval else [span[0] for span in spans]
        output[XQAPorts.answer2support_training] = span2support

    # we can only numpify in here, because bucketing is not possible prior
    return numpify(output, keys=[
        XQAPorts.word_chars,
        XQAPorts.question_words,
        XQAPorts.support_words,
        XQAPorts.word_in_question,
        XQAPorts.token_offsets,
    ])
def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Build an MC batch with dependency features or word/char features.

    With the config flag `use_dep_sa` the batch carries the dependency
    triples (`*_dep_i`, `*_dep_j`, `*_dep_type`) of question and support;
    otherwise it carries ids, batch-local word ids and character
    information. If `self.embeddings` is set, zero-padded token embeddings
    are added in both cases.

    Args:
        annotations: the instances of this batch.
        is_eval: forwarded unchanged on `Ports.is_eval`.
        with_answers: whether to add `Ports.Target.target_index`.

    Returns:
        The result of `numpify` on the assembled dict.
    """
    batch_size = len(annotations)

    # also add character information
    all_tokens = [ann.question_tokens for ann in annotations]
    all_tokens += [ann.support_tokens for ann in annotations]
    word_chars, word_lengths, tokens, _vocab, _rev_vocab = \
        preprocessing.unique_words_with_chars(
            all_tokens, self.shared_resources.char_vocab)
    question_words = tokens[:batch_size]
    support_words = tokens[batch_size:]

    q_lengths = [ann.question_length for ann in annotations]
    s_lengths = [ann.support_length for ann in annotations]

    if not self.shared_resources.config.get('use_dep_sa', False):
        xy_dict = {
            Ports.Input.question_length: q_lengths,
            Ports.Input.support_length: s_lengths,
            Ports.Input.sample_id: [ann.id for ann in annotations],
            Ports.Input.word_chars: word_chars,
            Ports.Input.word_char_length: word_lengths,
            Ports.Input.question_batch_words: question_words,
            Ports.Input.support_batch_words: support_words,
            Ports.is_eval: is_eval,
            Ports.Input.support: [ann.support_ids for ann in annotations],
            Ports.Input.question: [ann.question_ids for ann in annotations],
        }
    else:
        xy_dict = {
            Ports.Input.support_length: s_lengths,
            Ports.Input.support_dep_i: [ann.support_dep_i for ann in annotations],
            Ports.Input.support_dep_j: [ann.support_dep_j for ann in annotations],
            Ports.Input.support_dep_type: [ann.support_dep_type for ann in annotations],
            Ports.Input.question_length: q_lengths,
            Ports.Input.question_dep_i: [ann.question_dep_i for ann in annotations],
            Ports.Input.question_dep_j: [ann.question_dep_j for ann in annotations],
            Ports.Input.question_dep_type: [ann.question_dep_type for ann in annotations],
            Ports.is_eval: is_eval,
        }

    if self.embeddings is not None:
        emb_support = np.zeros([batch_size, max(s_lengths), self.embeddings.shape[-1]])
        emb_question = np.zeros([batch_size, max(q_lengths), self.embeddings.shape[-1]])
        for row, ann in enumerate(annotations):
            # tokens missing from the embeddings fall back to the default vector
            for col, token in enumerate(ann.support_tokens):
                emb_support[row, col] = self.embeddings.get(token, self.__default_vec)
            for col, token in enumerate(ann.question_tokens):
                emb_question[row, col] = self.embeddings.get(token, self.__default_vec)
        xy_dict[Ports.Input.emb_support] = emb_support
        xy_dict[Ports.Input.emb_question] = emb_question

    if with_answers:
        xy_dict[Ports.Target.target_index] = [ann.answer for ann in annotations]

    return numpify(xy_dict)
def get_batches(data, batch_size=32, pad=0, bucket_order=None, bucket_structure=None, exact_epoch=False):
    """
    Creates a restartable generator that yields batches of `data`.

    To avoid biases, it is advised to keep `bucket_order=None` and
    `bucket_structure=None` if computationally possible (batches are then
    sampled from all instances).

    Args:
        `data`: dict with (multi-dimensional) numpy arrays or (nested) lists;
            the first inner dimension (`num_instances`) should be the same
            over all data values.
        `batch_size`: the desired batch size
        `pad`: padding symbol in case data contains lists of lists of
            different sizes
        `bucket_order`: argument `order` in get_buckets (list with keys);
            `None` if no bucketing
        `bucket_structure`: argument `structure` in get_buckets; `None` if no
            bucketing
        `exact_epoch`: if `True`, the final batch of a bucket may be smaller,
            but each instance is seen exactly once per epoch. If `False`
            (default), incomplete final batches are dropped so that every
            batch has `batch_size` instances. It is forced to `True` when
            there are fewer instances than `n_buckets * batch_size`.

    Returns:
        a generator that generates a dict with the same keys as `data`, whose
        values are the rows of the (numpified) data selected for the batch;
        the 1st dimension is at most `batch_size`.
    """
    assert isinstance(data, dict)

    first_value = list(data.values())[0]
    # still need original data for length-based bucketing
    data_np = data if isinstance(first_value, np.ndarray) else numpify(data, pad)

    def remaining_fractions(bucket_map):
        # share of the not-yet-served instances held by every bucket
        total = float(sum(len(ids) for ids in bucket_map.values()))
        fractions = {}
        for bucket_id, ids in bucket_map.items():
            fractions[bucket_id] = len(ids) / total if total > 0. else 0.
        return fractions

    def shuffle_in_place(bucket_map):
        for bucket_id in sorted(bucket_map.keys()):  # sorted: to keep deterministic
            rs.shuffle(bucket_map[bucket_id])

    initial_buckets, _ = get_buckets(data, bucket_order, bucket_structure)
    n_buckets = len(initial_buckets)
    # if average instances/bucket smaller than batch_size: set exact_epoch = True
    # to avoid empty batches during debugging on small data samples
    exact_epoch = len(first_value) < n_buckets * batch_size or exact_epoch

    def bucket_generator():
        buckets2instances, _ = get_buckets(data, bucket_order, bucket_structure)
        shuffle_in_place(buckets2instances)
        while True:
            # sorted keys: to keep deterministic
            by_bucket_id = sorted(remaining_fractions(buckets2instances).items(),
                                  key=lambda item: item[0])
            bucket_ids, probs = zip(*by_bucket_id)
            if np.sum(probs) == 0.:
                # every bucket is exhausted
                return
            # sample bucket according to remaining size
            chosen = rs.choice(bucket_ids, replace=False, p=probs)
            remaining = buckets2instances[chosen]
            batch_indices = remaining[:batch_size]
            buckets2instances[chosen] = remaining[batch_size:]
            # if required by exact_epoch: also include last batch in bucket if too small
            if exact_epoch or len(batch_indices) == batch_size:
                yield {key: data_np[key][batch_indices] for key in data_np}

    return GeneratorWithRestart(bucket_generator)
def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                 with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
    """Build an MC batch and attach ELMo representations of the tokens.

    With the config flag `use_dep_sa` the batch carries the dependency
    triples of question and support; otherwise it carries ids, batch-local
    word ids and character information. That dict is numpified first; the
    ELMo outputs for support and question tokens are added afterwards, so
    they are stored as returned by `self.elmo` (detached, computed without
    gradient tracking).

    Args:
        annotations: the instances of this batch.
        is_eval: forwarded unchanged on `Ports.is_eval`.
        with_answers: whether to add `Ports.Target.target_index`.

    Returns:
        The numpified batch dict extended with `Ports.Input.emb_support` and
        `Ports.Input.emb_question`.
    """
    batch_size = len(annotations)
    tokens_question = [a.question_tokens for a in annotations]
    tokens_support = [a.support_tokens for a in annotations]

    word_chars, word_lengths, tokens, _vocab, _rev_vocab = \
        preprocessing.unique_words_with_chars(
            tokens_question + tokens_support,
            self.shared_resources.char_vocab)
    question_words, support_words = tokens[:batch_size], tokens[batch_size:]

    q_lengths = [a.question_length for a in annotations]
    s_lengths = [a.support_length for a in annotations]

    if self.shared_resources.config.get('use_dep_sa', False):
        xy_dict = {
            Ports.Input.support_length: s_lengths,
            Ports.Input.support_dep_i: [a.support_dep_i for a in annotations],
            Ports.Input.support_dep_j: [a.support_dep_j for a in annotations],
            Ports.Input.support_dep_type: [a.support_dep_type for a in annotations],
            Ports.Input.question_length: q_lengths,
            Ports.Input.question_dep_i: [a.question_dep_i for a in annotations],
            Ports.Input.question_dep_j: [a.question_dep_j for a in annotations],
            Ports.Input.question_dep_type: [a.question_dep_type for a in annotations],
            Ports.is_eval: is_eval,
        }
    else:
        xy_dict = {
            Ports.Input.question_length: q_lengths,
            Ports.Input.support_length: s_lengths,
            Ports.Input.sample_id: [a.id for a in annotations],
            Ports.Input.word_chars: word_chars,
            Ports.Input.word_char_length: word_lengths,
            Ports.Input.question_batch_words: question_words,
            Ports.Input.support_batch_words: support_words,
            Ports.is_eval: is_eval,
            Ports.Input.support: [a.support_ids for a in annotations],
            Ports.Input.question: [a.question_ids for a in annotations],
        }

    if with_answers:
        xy_dict[Ports.Target.target_index] = [a.answer for a in annotations]

    # numpify before the ELMo outputs are added, so those are left untouched
    xy_dict = numpify(xy_dict)

    # Elmo embeddings
    chars_support = batch_to_ids(tokens_support)
    chars_question = batch_to_ids(tokens_question)

    if torch.cuda.is_available():
        chars_support = chars_support.cuda()
        chars_question = chars_question.cuda()

    with torch.no_grad():
        emb_support = self.elmo(chars_support)['elmo_representations'][0].detach()
        emb_question = self.elmo(chars_question)['elmo_representations'][0].detach()

    xy_dict[Ports.Input.emb_support] = emb_support
    xy_dict[Ports.Input.emb_question] = emb_question

    return xy_dict
def create_batch(self, annotations, is_eval, with_answers):
    """Extend the parent batch with definitions of the predicted answers.

    Unless definitions are disabled (or, during training, randomly skipped
    according to the config value `training_fraction_with_definition`), the
    parent batch is built in eval mode and fed to the reader's model to
    obtain `topk` answer spans per question. For each distinct answer lemma
    string the knowledge store is queried for 'wikipedia_firstsent'
    assertions; at most three definitions per answer are kept (chosen by
    `sort_by_tfidf` when there are more). Definition words are added to the
    batch vocabulary and the word embeddings are recomputed.

    Returns:
        Either the parent's batch unchanged, or the extended batch after
        `numpify` on the definition and vocabulary ports.
    """
    parent = super(XQAAssertionDefinitionInputModule, self)
    frac = self.config.get('training_fraction_with_definition', 1.0)
    if not self.use_definitions or (frac < 1.0 and not is_eval and self._rng.random() > frac):
        return parent.create_batch(annotations, is_eval, with_answers)

    batch = parent.create_batch(annotations, True, with_answers)

    lemma_vocab = batch['__lemma_vocab']
    vocab = batch['__vocab']
    rev_vocab = batch['__rev_vocab']
    word_chars = batch[AssertionMRPorts.word_chars].tolist()
    word_lengths = batch[AssertionMRPorts.word_char_length].tolist()
    word2lemma = batch[AssertionMRPorts.word2lemma].tolist()
    support = batch[AssertionMRPorts.support]

    rev_lemma_vocab = {idx: lemma for lemma, idx in lemma_vocab.items()}
    topk = self.config['topk']
    self.reader.model_module.set_topk(topk)
    spans = self.reader.model_module(batch, [XQAPorts.answer_span])[XQAPorts.answer_span]

    def is_trimmable(lemma):
        # stop words and non-alphanumeric lemmas are stripped from the ends
        return lemma in spacy.en.STOP_WORDS or not lemma.isalnum()

    definitions = []
    definition_lengths = []
    definition2question = []

    seen_answer_lemmas = None
    doc_idx_map = None
    for span_idx, span in enumerate(spans):
        question_idx = span_idx // topk
        if span_idx % topk == 0:
            # first span of a new question: reset the per-question state
            seen_answer_lemmas = set()
            doc_idx_map = [
                support_idx
                for support_idx, q_id in enumerate(batch[XQAPorts.support2question])
                if q_id == question_idx
            ]

        doc_idx, start, end = span[0], span[1], span[2]
        answer_token_ids = support[doc_idx_map[doc_idx], start:end + 1]
        answer_lemmas = [rev_lemma_vocab[word2lemma[token_id]] for token_id in answer_token_ids]
        answer_lemma = ' '.join(answer_lemmas)
        if answer_lemma in seen_answer_lemmas:
            continue
        seen_answer_lemmas.add(answer_lemma)

        ks = self._knowledge_store.assertion_keys_for_subject(
            answer_lemma, resource='wikipedia_firstsent')
        if not ks:
            # remove leading or trailing stop words or non alnum words
            first, last = 0, len(answer_lemmas)
            while first < last and is_trimmable(answer_lemmas[first]):
                first += 1
            while last > first and is_trimmable(answer_lemmas[last - 1]):
                last -= 1
            answer_lemmas = answer_lemmas[first:last]
            answer_lemma = ' '.join(answer_lemmas)
            if answer_lemma in seen_answer_lemmas:
                continue
            seen_answer_lemmas.add(answer_lemma)
            ks = self._knowledge_store.assertion_keys_for_subject(
                answer_lemma, resource='wikipedia_firstsent')

        defns = [self._nlp(self._knowledge_store.get_assertion(key)) for key in ks]
        if len(defns) > 3:
            context = ' '.join(annotations[question_idx].question_lemmas +
                               annotations[question_idx].support_lemmas[doc_idx])
            candidates = [' '.join(tok.lemma_ for tok in defn) for defn in defns]
            indices_scores = sort_by_tfidf(context, candidates)
            # only select the top 3 definition with best match to the support and question
            defns = [defns[best] for best, _ in indices_scores[:3]]

        for defn in defns:
            definition_lengths.append(len(defn))
            definition2question.append(question_idx)
            defn_ids = []
            for tok in defn:
                word = tok.orth_
                if word not in vocab:
                    # unseen word: extend vocabulary, char features and lemma mapping
                    vocab[word] = len(vocab)
                    word_lengths.append(min(len(word), 20))
                    word_chars.append([self.char_vocab.get(c, 0) for c in word[:20]])
                    rev_vocab.append(word)
                    word2lemma.append(lemma_vocab.setdefault(tok.lemma_, len(lemma_vocab)))
                defn_ids.append(vocab[word])
            definitions.append(defn_ids)

    batch[DefinitionPorts.definitions] = definitions
    batch[DefinitionPorts.definition_lengths] = definition_lengths
    batch[DefinitionPorts.definition2question] = definition2question
    batch[AssertionMRPorts.word_chars] = word_chars
    batch[AssertionMRPorts.word_char_length] = word_lengths
    batch[AssertionMRPorts.word2lemma] = word2lemma
    batch[AssertionMRPorts.is_eval] = is_eval

    # the vocabulary may have grown, so the embedding matrix is rebuilt
    word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
    for word_id, word in enumerate(rev_vocab):
        word_embeddings[word_id] = self._get_emb(self.vocab(word))
    batch[AssertionMRPorts.word_embeddings] = word_embeddings

    return numpify(batch, keys=[
        DefinitionPorts.definitions,
        DefinitionPorts.definition_lengths,
        DefinitionPorts.definition2question,
        AssertionMRPorts.word_chars,
        AssertionMRPorts.word_char_length,
        AssertionMRPorts.word2lemma,
    ])
def create_batch(self, annotations, is_eval: bool, with_answers: bool):
    """Build a multi-support XQA batch enriched with knowledge-store assertions.

    Supports are sub-sampled during training when an annotation has more than
    `max_training_support` (config, default 2) of them. For every annotation,
    assertions are retrieved according to the config value `heuristic`
    ('pair' or 'tfidf', default 'pair'), ranked by descending score,
    de-duplicated by lemmatised text and capped at `self._limit` (0 disables
    retrieval). Assertion words extend the batch-local vocabulary.

    Args:
        annotations: the instances of this batch.
        is_eval: disables support sampling and empties `correct_start`.
        with_answers: whether to add the answer-span related ports.

    Returns:
        The result of `numpify` restricted to the output and training ports.

    Raises:
        ValueError: if assertions are requested and `heuristic` is neither
            'pair' nor 'tfidf'.
    """
    q_tokenized = [a.question_tokens for a in annotations]
    question_lengths = [a.question_length for a in annotations]

    max_training_support = self.config.get('max_training_support', 2)
    s_tokenized = []
    s_lemmas = []
    support_lengths = []
    wiq = []
    offsets = []
    support2question = []
    # aligns with support2question, used in output module to get correct index
    # to original set of supports
    selected_support = []
    all_spans = []
    for i, a in enumerate(annotations):
        s_lemmas.append([])
        all_spans.append([])
        if len(a.support_tokens) > max_training_support > 0 and not is_eval:
            # sample only 2 paragraphs and take first with double probability
            # (the best) to speed things up.
            # Following https://arxiv.org/pdf/1710.10723.pdf
            is_done = False
            any_answer = any(a.answer_spans)
            # sample until there is at least one possible answer (if any)
            while not is_done:
                selected = self._rng.sample(
                    range(0, len(a.support_tokens) + 1), max_training_support + 1)
                if 0 in selected and 1 in selected:
                    selected = [s - 1 for s in selected if s > 0]
                else:
                    selected = [max(0, s - 1) for s in selected[:max_training_support]]
                is_done = not any_answer or any(a.answer_spans[s] for s in selected)
            # `selected` already holds distinct 0-based support indices here.
            # They must not be shifted again, otherwise the supports used
            # would differ from the ones just checked for an answer.
        else:
            selected = set(range(len(a.support_tokens)))
        for s in selected:
            s_tokenized.append(a.support_tokens[s])
            s_lemmas[-1].append(a.support_lemmas[s])
            support_lengths.append(a.support_length[s])
            wiq.append(a.word_in_question[s])
            offsets.append(a.token_offsets[s])
            selected_support.append(a.selected_supports[s])
            support2question.append(i)
            if with_answers:
                all_spans[-1].append(a.answer_spans[s])

    word_chars, word_lengths, word_ids, vocab, rev_vocab = \
        preprocessing.unique_words_with_chars(q_tokenized + s_tokenized, self.char_vocab)

    question = word_ids[:len(q_tokenized)]
    support = word_ids[len(q_tokenized):]

    ass_lengths = []
    ass2question = []
    ass2unique = []
    lemma2idx = dict()
    question_arg_span = []
    support_arg_span = []
    assertion2question_arg_span = []
    assertion2support_arg_span = []
    question_arg_span_idx = dict()
    support_arg_span_idx = dict()

    word2lemma = [None] * len(rev_vocab)

    heuristic = self.config.get('heuristic', 'pair')

    # index of the first support of the current annotation within `support`
    s_offset = 0
    for i, annot in enumerate(annotations):
        # collect uniq lemmas:
        for k, l in enumerate(annot.question_lemmas):
            if l not in lemma2idx:
                lemma2idx[l] = len(lemma2idx)
            word2lemma[question[i][k]] = lemma2idx[l]
        for k, ls in enumerate(s_lemmas[i]):
            for k2, l in enumerate(ls):
                if l not in lemma2idx:
                    lemma2idx[l] = len(lemma2idx)
                word2lemma[support[s_offset + k][k2]] = lemma2idx[l]

        if self._limit == 0:
            # assertion retrieval disabled
            s_offset += len(s_lemmas[i])
            continue

        flat_support_lemmas = [l for ls in s_lemmas[i] for l in ls]
        if heuristic == 'pair':
            assertions, assertion_args = self._knowledge_store.get_connecting_assertion_keys(
                annot.question_lemmas, flat_support_lemmas, self._sources)
        elif heuristic == 'tfidf':
            assertions, assertion_args = self._knowledge_store.get_assertion_keys(
                flat_support_lemmas, self._sources)
            assertions = list(assertions.keys())
            assertion_strings = [self._knowledge_store.get_assertion(key) for key in assertions]
            scores = sort_by_tfidf(' '.join(annot.question_tokens), assertion_strings)
            assertions = {assertions[idx]: score for idx, score in scores}
        else:
            raise ValueError("unknown assertion heuristic: %r" % (heuristic,))

        sorted_assertions = sorted(assertions.items(), key=lambda x: -x[1])
        added_assertions = set()
        for key, _ in sorted_assertions:
            if len(added_assertions) == self._limit:
                break
            a = self._nlp(self._knowledge_store.get_assertion(key, cache=True))
            a_lemma = " ".join(t.lemma_ for t in a)
            if a_lemma in added_assertions:
                continue
            added_assertions.add(a_lemma)
            ass2question.append(i)
            ass_lengths.append(len(a))
            if heuristic == 'pair':
                q_arg_span = assertion_args[key][0]
                q_arg_span = (i, q_arg_span[0], q_arg_span[1])
                s_arg_start, s_arg_end = assertion_args[key][1]
                # the support span refers to the concatenated support lemmas;
                # locate the support it falls into and make it relative to it
                doc_idx = 0
                for ls in s_lemmas[i]:
                    if s_arg_start < len(ls):
                        break
                    doc_idx += 1
                    s_arg_start -= len(ls)
                    s_arg_end -= len(ls)
                s_arg_span = (s_offset + doc_idx, s_arg_start, s_arg_end)
                if q_arg_span not in question_arg_span_idx:
                    question_arg_span_idx[q_arg_span] = len(question_arg_span)
                    question_arg_span.append(assertion_args[key][0])
                if s_arg_span not in support_arg_span_idx:
                    support_arg_span_idx[s_arg_span] = len(support_arg_span)
                    support_arg_span.append(assertion_args[key][1])
                assertion2question_arg_span.append(question_arg_span_idx[q_arg_span])
                assertion2support_arg_span.append(support_arg_span_idx[s_arg_span])

            u_ass = []
            for t in a:
                w = t.orth_
                if w not in vocab:
                    # unseen word: extend vocabulary, char features and lemma mapping
                    vocab[w] = len(vocab)
                    word_lengths.append(min(len(w), 20))
                    word_chars.append([self.char_vocab.get(c, 0) for c in w[:20]])
                    rev_vocab.append(w)
                    if t.lemma_ not in lemma2idx:
                        lemma2idx[t.lemma_] = len(lemma2idx)
                    word2lemma.append(lemma2idx[t.lemma_])
                u_ass.append(vocab[w])
            ass2unique.append(u_ass)

        s_offset += len(s_lemmas[i])

    word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
    for i, w in enumerate(rev_vocab):
        word_embeddings[i] = self._get_emb(self.vocab(w))

    if not ass2unique:
        # no assertion at all: provide one empty assertion and empty span arrays
        ass2unique.append([])
        question_arg_span = support_arg_span = np.zeros([0, 2], dtype=np.int32)

    output = {
        AssertionMRPorts.word_chars: word_chars,
        AssertionMRPorts.word_char_length: word_lengths,
        AssertionMRPorts.question: question,
        AssertionMRPorts.support: support,
        AssertionMRPorts.support_length: support_lengths,
        AssertionMRPorts.question_length: question_lengths,
        AssertionMRPorts.is_eval: is_eval,
        AssertionMRPorts.word_embeddings: word_embeddings,
        AssertionMRPorts.assertion_lengths: ass_lengths,
        AssertionMRPorts.assertion2question: ass2question,
        AssertionMRPorts.assertions: ass2unique,
        AssertionMRPorts.word2lemma: word2lemma,
        AssertionMRPorts.question_arg_span: question_arg_span,
        AssertionMRPorts.support_arg_span: support_arg_span,
        AssertionMRPorts.assertion2question_arg_span: assertion2question_arg_span,
        AssertionMRPorts.assertion2support_arg_span: assertion2support_arg_span,
        XQAPorts.word_in_question: wiq,
        XQAPorts.support2question: support2question,
        XQAPorts.token_offsets: offsets,
        XQAPorts.selected_support: selected_support,
        '__vocab': vocab,
        '__rev_vocab': rev_vocab,
        '__lemma_vocab': lemma2idx,
    }

    if with_answers:
        spans = [s for a in all_spans for spans_per_support in a for s in spans_per_support]
        # map every span to the batch-wide index of the support it lies in
        span2support = []
        support_idx = 0
        for a in all_spans:
            for spans_per_support in a:
                span2support.extend([support_idx] * len(spans_per_support))
                support_idx += 1
        output.update({
            XQAPorts.answer_span_target: list(spans) if spans else np.zeros([0, 2], np.int32),
            XQAPorts.correct_start: [] if is_eval else [span[0] for span in spans],
            XQAPorts.answer2support_training: span2support,
        })

    # we can only numpify in here, because bucketing is not possible prior
    batch = numpify(output, keys=self.output_ports + self.training_ports)
    return batch