Esempio n. 1
0
    def create_batch(self, triples: List[List[int]], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Turn a list of triples into a batch, adding negatives for training.

        Args:
            triples: ``[subject, predicate, object]`` index triples.
            is_eval: not used by this method.
            with_answers: if true, every input triple gets target 1 and, per
                input triple, ``num_negative`` (config value, default 1)
                rounds of corruption are appended with target 0. Each round
                adds one triple with a random subject and one with a random
                object.

        Returns:
            The result of ``numpify`` on a dict holding the (possibly
            extended) triples under ``Ports.Input.question`` and, if
            ``with_answers``, the targets under ``Ports.Target.target_index``.
        """
        positives = list(triples)
        # Separate list object: it grows while negatives are appended, whereas
        # `positives` stays fixed and is safe to iterate over.
        _triples = list(positives)
        xy_dict = {Ports.Input.question: _triples}

        if with_answers:
            nb_entities = self.shared_resources.nb_entities
            num_negative = self.shared_resources.config.get('num_negative', 1)
            target = [1] * len(positives)

            for s, p, o in positives:
                for _ in range(num_negative):
                    # Subject and object are both entity indices, so both
                    # corruptions are drawn from the entity range (the object
                    # used to be drawn from the predicate range by mistake).
                    # NOTE(review): if `_kbp_rng` is a `random.Random`,
                    # `randint` includes the upper bound and may return
                    # `nb_entities` itself -- confirm the RNG type.
                    random_subject_index = self._kbp_rng.randint(
                        0, nb_entities)
                    random_object_index = self._kbp_rng.randint(
                        0, nb_entities)

                    _triples.append([random_subject_index, p, o])
                    _triples.append([s, p, random_object_index])
                    target.extend((0, 0))

            xy_dict[Ports.Target.target_index] = target

        return numpify(xy_dict)
Esempio n. 2
0
    def create_batch(self, annotations: List[XQAAnnotation], is_eval: bool, with_answers: bool) \
            -> Mapping[TensorPort, np.ndarray]:
        """Assemble one batch from a list of pre-computed XQA annotations.

        Args:
            annotations: the instances that make up the batch.
            is_eval: evaluation flag; it is forwarded in the batch, sets
                ``keep_prob`` to 1.0 (otherwise ``1 - self.dropout``) and
                leaves ``correct_start_training`` empty.
            with_answers: if true, flattened answer spans, their start
                positions and a span-to-question index are added.

        Returns:
            The dict produced by ``numpify`` for the listed ragged ports.
        """
        question_tokens = [ann.question_tokens for ann in annotations]
        support_tokens = [ann.support_tokens for ann in annotations]
        unique_words, unique_word_lengths, question2unique, support2unique = \
            unique_words_with_chars(question_tokens, support_tokens, self.char_vocab)

        if is_eval:
            keep_prob = 1.0
        else:
            keep_prob = 1 - self.dropout

        output = {
            XQAPorts.unique_word_chars: unique_words,
            XQAPorts.unique_word_char_length: unique_word_lengths,
            XQAPorts.question_words2unique: question2unique,
            XQAPorts.support_words2unique: support2unique,
            XQAPorts.emb_support: stack_and_pad([ann.support_embeddings for ann in annotations]),
            XQAPorts.support_length: [ann.support_length for ann in annotations],
            XQAPorts.emb_question: stack_and_pad([ann.question_embeddings for ann in annotations]),
            XQAPorts.question_length: [ann.question_length for ann in annotations],
            XQAPorts.word_in_question: [ann.word_in_question for ann in annotations],
            XQAPorts.keep_prob: keep_prob,
            XQAPorts.is_eval: is_eval,
            XQAPorts.token_char_offsets: [ann.token_offsets for ann in annotations],
        }

        if with_answers:
            # Flatten the per-question span lists and remember, for every
            # span, the index of the question it belongs to.
            flat_spans = []
            span_owner = []
            for q_idx, ann in enumerate(annotations):
                for span in ann.answer_spans:
                    flat_spans.append(span)
                    span_owner.append(q_idx)

            output[XQAPorts.answer_span] = flat_spans
            if is_eval:
                output[XQAPorts.correct_start_training] = []
            else:
                output[XQAPorts.correct_start_training] = [span[0] for span in flat_spans]
            output[XQAPorts.answer2question_training] = span_owner

        # Conversion to arrays has to happen here, after the batch is formed.
        ragged_ports = [
            XQAPorts.unique_word_chars,
            XQAPorts.question_words2unique,
            XQAPorts.support_words2unique,
            XQAPorts.word_in_question,
            XQAPorts.token_char_offsets,
        ]
        return numpify(output, keys=ragged_ports)
Esempio n. 3
0
    def create_batch(self, triples: List[List[int]], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Wrap triples as questions, with all-zero support and candidates.

        ``is_eval`` and ``with_answers`` are not used. The support and
        candidate ports each receive one ``0`` per triple.
        """
        n_triples = len(triples)

        batch = dict()
        batch[Ports.Input.multiple_support] = [0 for _ in range(n_triples)]
        batch[Ports.Input.question] = triples
        batch[Ports.Input.atomic_candidates] = [0 for _ in range(n_triples)]

        return numpify(batch)
Esempio n. 4
0
    def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Build a batch with character-level word information.

        Questions and supports are passed together through
        ``preprocessing.unique_words_with_chars``; the returned token ids are
        split back into a question part and a support part. Depending on the
        ``vocab_from_embeddings`` config flag, either dense embedding tensors
        (filled through ``self._get_emb``) or the raw id lists are added.

        Args:
            annotations: the instances of the batch.
            is_eval: forwarded under ``Ports.is_eval``.
            with_answers: if true, ``a.answer`` of every annotation is added
                under ``Ports.Target.target_index``.

        Returns:
            ``numpify`` applied to the assembled dict.
        """
        n = len(annotations)

        token_lists = [ann.question_tokens for ann in annotations] + \
            [ann.support_tokens for ann in annotations]
        # The helper also returns a vocab and its inverse; neither is needed here.
        word_chars, word_lengths, tokens, _vocab, _rev_vocab = \
            preprocessing.unique_words_with_chars(
                token_lists, self.shared_resources.char_vocab)
        question_words = tokens[:n]
        support_words = tokens[n:]

        q_lengths = [ann.question_length for ann in annotations]
        s_lengths = [ann.support_length for ann in annotations]

        xy_dict = {
            Ports.Input.question_length: q_lengths,
            Ports.Input.support_length: s_lengths,
            Ports.Input.sample_id: [ann.id for ann in annotations],
            Ports.Input.word_chars: word_chars,
            Ports.Input.word_char_length: word_lengths,
            Ports.Input.question_words: question_words,
            Ports.Input.support_words: support_words,
            Ports.is_eval: is_eval,
        }

        use_embeddings = self.shared_resources.config.get(
            "vocab_from_embeddings", False)
        if not use_embeddings:
            xy_dict[Ports.Input.support] = [ann.support_ids for ann in annotations]
            xy_dict[Ports.Input.question] = [ann.question_ids for ann in annotations]
        else:
            # Zero-initialised, so positions beyond a sequence's length stay 0.
            emb_support = np.zeros(
                [n, max(s_lengths), self.emb_matrix.shape[1]])
            emb_question = np.zeros(
                [n, max(q_lengths), self.emb_matrix.shape[1]])
            for row, ann in enumerate(annotations):
                for col, word_id in enumerate(ann.support_ids):
                    emb_support[row, col] = self._get_emb(word_id)
                for col, word_id in enumerate(ann.question_ids):
                    emb_question[row, col] = self._get_emb(word_id)

            xy_dict[Ports.Input.emb_support] = emb_support
            xy_dict[Ports.Input.emb_question] = emb_question

        if with_answers:
            xy_dict[Ports.Target.target_index] = [ann.answer for ann in annotations]

        return numpify(xy_dict)
Esempio n. 5
0
    def create_batch(self, annotations: List[Mapping[str, Any]], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Collect questions and candidates (and optionally targets) as a batch.

        Args:
            annotations: mappings with ``"question"`` and ``"candidates"``
                entries; ``"answers"`` is read only if ``with_answers``.
            is_eval: not used by this method.
            with_answers: if true, the first element of each ``"answers"``
                entry is used as the target index.

        Returns:
            ``numpify`` applied to the assembled dict.
        """
        questions = [ann["question"] for ann in annotations]
        candidates = [ann["candidates"] for ann in annotations]

        batch = dict()
        batch[Ports.Input.question] = questions
        batch[Ports.Input.atomic_candidates] = candidates

        if with_answers:
            batch[Ports.Target.target_index] = [
                ann["answers"][0] for ann in annotations
            ]

        return numpify(batch)
Esempio n. 6
0
 def create_batch(self, annotations: List[Mapping[str, any]], is_eval: bool,
                  with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
     """Gather per-instance fields of dict annotations into one batch.

     ``is_eval`` and ``with_answers`` are not consulted. Targets are added
     whenever the first annotation carries an ``"answers"`` entry.
     """
     # (port, annotation key) pairs, in the order they are put into the batch.
     port_fields = (
         (Ports.Input.support, "supports"),
         (Ports.Input.question, "question"),
         (Ports.Input.question_length, "question_lengths"),
         (Ports.Input.support_length, "support_lengths"),
         (Ports.Input.sample_id, "ids"),
     )

     batch = {}
     for port, field in port_fields:
         batch[port] = [ann[field] for ann in annotations]

     first = annotations[0]
     if "answers" in first:
         batch[Ports.Target.target_index] = [ann["answers"] for ann in annotations]

     return numpify(batch)
Esempio n. 7
0
    def create_batch(self, annotations: List[Mapping[str, any]], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Build a batch from dict annotations, optionally with embeddings.

        With the ``vocab_from_embeddings`` config flag set, questions and
        supports are emitted as dense tensors filled through
        ``self._get_emb``; otherwise the raw id lists are emitted. Lengths
        and sample ids are added in both cases. ``is_eval`` and
        ``with_answers`` are not consulted; targets are added whenever the
        first annotation carries an ``"answers"`` entry.

        Returns:
            ``numpify`` applied to the dict, restricted to the support,
            question and the two length ports.
        """
        q_lengths = [ann["question_lengths"] for ann in annotations]
        s_lengths = [ann["support_lengths"] for ann in annotations]

        if self.shared_resources.config.get("vocab_from_embeddings", False):
            n = len(annotations)
            # Zero-initialised, so positions beyond a sequence's length stay 0.
            emb_support = np.zeros(
                [n, max(s_lengths), self.emb_matrix.shape[1]])
            emb_question = np.zeros(
                [n, max(q_lengths), self.emb_matrix.shape[1]])
            for row, ann in enumerate(annotations):
                for col, word_id in enumerate(ann["supports"]):
                    emb_support[row, col] = self._get_emb(word_id)
                for col, word_id in enumerate(ann["question"]):
                    emb_question[row, col] = self._get_emb(word_id)

            xy_dict = {
                Ports.Input.embedded_support: emb_support,
                Ports.Input.embedded_question: emb_question,
            }
        else:
            xy_dict = {
                Ports.Input.support: [ann["supports"] for ann in annotations],
                Ports.Input.question: [ann["question"] for ann in annotations],
            }

        # Entries shared by both variants.
        xy_dict[Ports.Input.question_length] = q_lengths
        xy_dict[Ports.Input.support_length] = s_lengths
        xy_dict[Ports.Input.sample_id] = [ann['ids'] for ann in annotations]

        if "answers" in annotations[0]:
            xy_dict[Ports.Target.target_index] = [ann["answers"] for ann in annotations]

        convert_keys = [
            Ports.Input.support,
            Ports.Input.question,
            Ports.Input.question_length,
            Ports.Input.support_length,
        ]
        return numpify(xy_dict, keys=convert_keys)
Esempio n. 8
0
    def create_batch(self, annotations: List[Mapping[str, any]], is_eval: bool, with_answers: bool):
        """Build a batch of questions/supports augmented with assertions.

        For every annotation, assertions connecting its question lemmas and
        support lemmas are fetched from ``self._knowledge_store``, parsed with
        ``self.__nlp``, de-duplicated by their lemmatised text and capped at
        ``self._limit`` per annotation. Assertion tokens are added to the
        batch-local vocabulary built from the question and support tokens.

        Args:
            annotations: dicts providing at least ``question_tokens``,
                ``support_tokens``, ``question_lemmas``, ``support_lemmas``,
                ``question_lengths`` and ``support_lengths``; ``answers`` is
                optional.
            is_eval: forwarded under ``AssertionMRPorts.is_eval``.
            with_answers: not consulted; targets are added whenever the first
                annotation contains an ``"answers"`` entry.

        Returns:
            ``numpify`` applied to the output dict for the keys in
            ``self.output_ports + self.training_ports``. The dict also carries
            the batch vocabularies under ``'__vocab'``, ``'__rev_vocab'`` and
            ``'__lemma_vocab'``.
        """
        support_lengths = list()
        question_lengths = list()

        ass_lengths = []
        ass2question = []
        ass2unique = []
        lemma2idx = dict()
        # NOTE(review): answer_labels is filled below but never read again in
        # this method; targets are rebuilt from `annotations` at the end.
        answer_labels = []
        question_arg_span = []
        support_arg_span = []
        assertions2question_arg_span = []
        assertions2support_arg_span = []

        # Map (instance index, start, end) -> position in the span lists above.
        question_arg_span_idx = dict()
        support_arg_span_idx = dict()

        word_chars, word_lengths, tokens, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(
                [a["question_tokens"] for a in annotations] + [a["support_tokens"] for a in annotations],
                self.char_vocab)
        # Questions come first in the concatenated input, supports second.
        question, support = tokens[:len(annotations)], tokens[len(annotations):]

        # One lemma id per batch-vocabulary word; filled in while iterating.
        word2lemma = [None] * len(rev_vocab)

        # we have to create batches here and cannot precompute them because of the batch-specific wiq feature
        for i, annot in enumerate(annotations):
            support_lengths.append(annot['support_lengths'])
            question_lengths.append(annot['question_lengths'])

            if "answers" in annot:
                answer_labels.append(annot["answers"])

            # collect unique lemmas and link every batch word id to its lemma id
            for k, l in enumerate(annot['question_lemmas']):
                if l not in lemma2idx:
                    lemma2idx[l] = len(lemma2idx)
                word2lemma[question[i][k]] = lemma2idx[l]
            for k, l in enumerate(annot['support_lemmas']):
                if l not in lemma2idx:
                    lemma2idx[l] = len(lemma2idx)
                word2lemma[support[i][k]] = lemma2idx[l]

            assertions, assertion_args = self._knowledge_store.get_connecting_assertion_keys(
                annot['question_lemmas'], annot['support_lemmas'], self._sources)

            # Highest value first (presumably a relevance weight -- TODO confirm).
            sorted_assertionss = sorted(assertions.items(), key=lambda x: -x[1])
            added_assertionss = set()
            for key, _ in sorted_assertionss:
                # Stop once `self._limit` distinct assertions were added for this instance.
                if len(added_assertionss) == self._limit:
                    break
                a = self.__nlp(self._knowledge_store.get_assertion(key))
                a_lemma = " ".join(t.lemma_ for t in a)
                # Skip assertions whose lemmatised text was already added.
                if a_lemma in added_assertionss:
                    continue
                else:
                    added_assertionss.add(a_lemma)
                ass2question.append(i)
                ass_lengths.append(len(a))
                # Prefix spans with the instance index so that equal spans of
                # different instances get separate entries.
                q_arg_span = assertion_args[key][0]
                q_arg_span = (i, q_arg_span[0], q_arg_span[1])
                s_arg_span = assertion_args[key][1]
                s_arg_span = (i, s_arg_span[0], s_arg_span[1])
                if q_arg_span not in question_arg_span_idx:
                    question_arg_span_idx[q_arg_span] = len(question_arg_span)
                    question_arg_span.append(assertion_args[key][0])
                if s_arg_span not in support_arg_span_idx:
                    support_arg_span_idx[s_arg_span] = len(support_arg_span)
                    support_arg_span.append(assertion_args[key][1])
                assertions2question_arg_span.append(question_arg_span_idx[q_arg_span])
                assertions2support_arg_span.append(support_arg_span_idx[s_arg_span])

                # Encode the assertion with batch-vocabulary ids, extending the
                # vocabulary (and its char/lemma tables) for unseen words.
                u_ass = []
                for t in a:
                    w = t.orth_
                    if w not in vocab:
                        vocab[w] = len(vocab)
                        # Words are truncated to 20 characters; characters
                        # missing from the char vocab map to 0.
                        word_lengths.append(min(len(w), 20))
                        word_chars.append([self.char_vocab.get(c, 0) for c in w[:20]])
                        rev_vocab.append(w)
                        if t.lemma_ not in lemma2idx:
                            lemma2idx[t.lemma_] = len(lemma2idx)
                        word2lemma.append(lemma2idx[t.lemma_])
                    u_ass.append(vocab[w])
                ass2unique.append(u_ass)

        # Look up an embedding for every word of the (extended) batch vocabulary.
        word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
        for i, w in enumerate(rev_vocab):
            word_embeddings[i] = self._get_emb(self.shared_resources.vocab(w))

        # No assertion in the whole batch: add one empty placeholder assertion
        # and use empty [0, 2] span arrays (presumably to keep array shapes
        # well-formed downstream -- TODO confirm).
        if not ass2unique:
            ass2unique.append([])
            question_arg_span = support_arg_span = np.zeros([0, 2], dtype=np.int32)

        output = {
            AssertionMRPorts.word_chars: word_chars,
            AssertionMRPorts.word_char_length: word_lengths,
            AssertionMRPorts.question: question,
            AssertionMRPorts.support: support,
            AssertionMRPorts.support_length: support_lengths,
            AssertionMRPorts.question_length: question_lengths,
            AssertionMRPorts.is_eval: is_eval,
            AssertionMRPorts.word_embeddings: word_embeddings,
            AssertionMRPorts.assertion_lengths: ass_lengths,
            AssertionMRPorts.assertion2question: ass2question,
            AssertionMRPorts.assertions: ass2unique,
            AssertionMRPorts.word2lemma: word2lemma,
            AssertionMRPorts.question_arg_span: question_arg_span,
            AssertionMRPorts.support_arg_span: support_arg_span,
            AssertionMRPorts.assertion2question_arg_span: assertions2question_arg_span,
            AssertionMRPorts.assertion2support_arg_span: assertions2support_arg_span,
            # Plain (non-port) entries exposing the batch vocabularies.
            '__vocab': vocab,
            '__rev_vocab': rev_vocab,
            '__lemma_vocab': lemma2idx,
        }
        # Presence of answers is decided from the first annotation only.
        if "answers" in annotations[0]:
            output[Ports.Target.target_index] = [a["answers"] for a in annotations]

        return numpify(output, keys=self.output_ports + self.training_ports)
Esempio n. 9
0
    def create_batch(self, annotations: List[XQAAnnotation], is_eval: bool, with_answers: bool) \
            -> Mapping[TensorPort, np.ndarray]:
        """Build a multi-support extractive-QA batch.

        Every annotation holds one question and several supports. All selected
        supports of all annotations are laid out as one flat list, with
        ``support2question`` recording the owning question. During training
        (``not is_eval``), annotations with more than ``max_training_support``
        supports (config value, default 2) are sub-sampled.

        Args:
            annotations: the instances of the batch.
            is_eval: disables support sub-sampling, leaves
                ``XQAPorts.correct_start`` empty and is forwarded in the batch.
            with_answers: if true, answer spans of the selected supports are
                flattened and added together with a span-to-support index.

        Returns:
            ``numpify`` applied to the output dict for the listed ragged
            ports. The dict also carries ``'__vocab'`` and ``'__rev_vocab'``.
        """

        q_tokenized = [a.question_tokens for a in annotations]
        question_lengths = [a.question_length for a in annotations]

        max_training_support = self.config.get('max_training_support', 2)
        s_tokenized = []
        support_lengths = []
        wiq = []
        offsets = []
        support2question = []
        support_ids = []
        # aligns with support2question, used in output module to get correct index to original set of supports
        selected_support = []
        # all_spans[i] holds, per selected support of annotation i, its answer spans
        all_spans = []
        for i, a in enumerate(annotations):
            all_spans.append([])
            if len(a.support_tokens) > max_training_support > 0 and not is_eval:
                # sample only `max_training_support` paragraphs and take the first one (the best) with
                # double probability to speed things up. Following https://arxiv.org/pdf/1710.10723.pdf
                is_done = False
                any_answer = any(a.answer_spans)
                # sample until there is at least one possible answer (if any)
                while not is_done:
                    # Draw from one extra slot: sampled values 0 and 1 both stand for paragraph 0,
                    # every other value v stands for paragraph v - 1.
                    selected = self._rng.sample(range(0, len(a.support_tokens) + 1), max_training_support + 1)
                    if 0 in selected and 1 in selected:
                        # paragraph 0 was drawn twice: drop the duplicate and shift the rest
                        selected = [s - 1 for s in selected if s > 0]
                    else:
                        # keep the first `max_training_support` draws; max() maps value 0 to paragraph 0
                        selected = [max(0, s - 1) for s in selected[:max_training_support]]
                    is_done = not any_answer or any(a.answer_spans[s] for s in selected)
            else:
                # evaluation, or few enough supports: use all of them
                selected = set(range(len(a.support_tokens)))
            for s in selected:
                s_tokenized.append(a.support_tokens[s])
                support_lengths.append(a.support_length[s])
                wiq.append(a.word_in_question[s])
                offsets.append(a.token_offsets[s])
                selected_support.append(a.selected_supports[s])
                support_ids.append(a.support_ids[s])
                support2question.append(i)
                if with_answers:
                    all_spans[-1].append(a.answer_spans[s])

        # Questions come first in the concatenated input, supports second.
        word_chars, word_lengths, word_ids, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(q_tokenized + s_tokenized, self.char_vocab)

        # Zero-initialised, so positions beyond a sequence's length stay 0.
        emb_support = np.zeros([len(support_lengths), max(support_lengths), self.vocab.emb_length])
        emb_question = np.zeros([len(question_lengths), max(question_lengths), self.vocab.emb_length])

        for i, a in enumerate(annotations):
            for j, q_id in enumerate(a.question_ids):
                emb_question[i, j] = self._get_emb(q_id)
        for k, s_ids in enumerate(support_ids):
            for j, s_id in enumerate(s_ids):
                emb_support[k, j] = self._get_emb(s_id)

        output = {
            XQAPorts.word_chars: word_chars,
            XQAPorts.word_char_length: word_lengths,
            XQAPorts.question_words: word_ids[:len(q_tokenized)],
            XQAPorts.support_words: word_ids[len(q_tokenized):],
            XQAPorts.emb_support: emb_support,
            XQAPorts.support_length: support_lengths,
            XQAPorts.emb_question: emb_question,
            XQAPorts.question_length: question_lengths,
            XQAPorts.word_in_question: wiq,
            XQAPorts.support2question: support2question,
            XQAPorts.is_eval: is_eval,
            XQAPorts.token_offsets: offsets,
            XQAPorts.selected_support: selected_support,
            # Plain (non-port) entries exposing the batch vocabulary.
            '__vocab': vocab,
            '__rev_vocab': rev_vocab,
        }

        if with_answers:
            # Flatten spans over annotations and supports; span2support gives, for each span,
            # the index of its support in the flat support list of the batch.
            spans = [s for a in all_spans for spans_per_support in a for s in spans_per_support]
            span2support = []
            support_idx = 0
            for a in all_spans:
                for spans_per_support in a:
                    span2support.extend([support_idx] * len(spans_per_support))
                    support_idx += 1
            output.update({
                XQAPorts.answer_span: [span for span in spans],
                XQAPorts.correct_start: [] if is_eval else [span[0] for span in spans],
                XQAPorts.answer2support_training: span2support,
            })

        # we can only numpify in here, because bucketing is not possible prior
        batch = numpify(output, keys=[XQAPorts.word_chars,
                                      XQAPorts.question_words, XQAPorts.support_words,
                                      XQAPorts.word_in_question, XQAPorts.token_offsets])
        return batch
Esempio n. 10
0
    def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Build a batch with either dependency features or word/char inputs.

        With the ``use_dep_sa`` config flag set, the batch holds sequence
        lengths and the dependency triples (``*_dep_i``, ``*_dep_j``,
        ``*_dep_type``) of supports and questions. Otherwise it holds lengths,
        sample ids, character information, batch-local word ids and the
        support/question id lists. If ``self.embeddings`` is not ``None``,
        token embeddings are looked up by token and added in both cases.

        Args:
            annotations: the instances of the batch.
            is_eval: forwarded under ``Ports.is_eval``.
            with_answers: if true, ``a.answer`` of every annotation is added
                under ``Ports.Target.target_index``.

        Returns:
            ``numpify`` applied to the assembled dict.
        """
        n = len(annotations)

        # character information for all distinct words of the batch
        token_lists = [ann.question_tokens for ann in annotations] + \
            [ann.support_tokens for ann in annotations]
        word_chars, word_lengths, tokens, _vocab, _rev_vocab = \
            preprocessing.unique_words_with_chars(
                token_lists, self.shared_resources.char_vocab)
        question_words = tokens[:n]
        support_words = tokens[n:]

        q_lengths = [ann.question_length for ann in annotations]
        s_lengths = [ann.support_length for ann in annotations]

        if not self.shared_resources.config.get('use_dep_sa', False):
            xy_dict = {
                Ports.Input.question_length: q_lengths,
                Ports.Input.support_length: s_lengths,
                Ports.Input.sample_id: [ann.id for ann in annotations],
                Ports.Input.word_chars: word_chars,
                Ports.Input.word_char_length: word_lengths,
                Ports.Input.question_batch_words: question_words,
                Ports.Input.support_batch_words: support_words,
                Ports.is_eval: is_eval,
                Ports.Input.support: [ann.support_ids for ann in annotations],
                Ports.Input.question: [ann.question_ids for ann in annotations],
            }
        else:
            xy_dict = {
                Ports.Input.support_length: s_lengths,
                Ports.Input.support_dep_i: [ann.support_dep_i for ann in annotations],
                Ports.Input.support_dep_j: [ann.support_dep_j for ann in annotations],
                Ports.Input.support_dep_type: [ann.support_dep_type for ann in annotations],
                Ports.Input.question_length: q_lengths,
                Ports.Input.question_dep_i: [ann.question_dep_i for ann in annotations],
                Ports.Input.question_dep_j: [ann.question_dep_j for ann in annotations],
                Ports.Input.question_dep_type: [ann.question_dep_type for ann in annotations],
                Ports.is_eval: is_eval,
            }

        if self.embeddings is not None:
            # Zero-initialised, so positions beyond a sequence's length stay 0;
            # tokens missing from the embeddings fall back to the default vector.
            emb_support = np.zeros(
                [n, max(s_lengths), self.embeddings.shape[-1]])
            emb_question = np.zeros(
                [n, max(q_lengths), self.embeddings.shape[-1]])
            for row, ann in enumerate(annotations):
                for col, token in enumerate(ann.support_tokens):
                    emb_support[row, col] = self.embeddings.get(
                        token, self.__default_vec)
                for col, token in enumerate(ann.question_tokens):
                    emb_question[row, col] = self.embeddings.get(
                        token, self.__default_vec)

            xy_dict[Ports.Input.emb_support] = emb_support
            xy_dict[Ports.Input.emb_question] = emb_question

        if with_answers:
            xy_dict[Ports.Target.target_index] = [ann.answer for ann in annotations]

        return numpify(xy_dict)
Esempio n. 11
0
def get_batches(data, batch_size=32, pad=0, bucket_order=None, bucket_structure=None, exact_epoch=False):
    """
    Return a restartable generator over batches of `data`.

    Leaving `bucket_order=None` and `bucket_structure=None` (no bucketing, batches are drawn
    from all instances) is recommended whenever it is computationally feasible, to avoid biases.

    Args:
        `data`: dict of (multi-dimensional) numpy arrays or (nested) lists; the first dimension
            (number of instances) should agree across all values.
        `batch_size`: desired number of instances per batch.
        `pad`: padding symbol used when lists of different sizes are converted to arrays.
        `bucket_order`: passed to `get_buckets` as `order`; `None` for no bucketing.
        `bucket_structure`: passed to `get_buckets` as `structure`; `None` for no bucketing.
        `exact_epoch`: if `True`, the last batch of a bucket may be smaller than `batch_size`, and
            every instance is yielded exactly once per epoch. If `False` (default), incomplete
            batches are dropped so that all batches have the same size. The flag is forced to
            `True` when there are fewer instances than `n_buckets * batch_size`.

    Returns:
        a `GeneratorWithRestart` whose batches are dicts with the same keys as `data`; each value
        holds the rows of the corresponding array selected for that batch (at most `batch_size`).
    """
    assert isinstance(data, dict)

    first_value = list(data.values())[0]
    # Lists are converted (and padded) once up front; bucketing below still works on the raw
    # `data`, which is needed for length-based bucketing.
    if isinstance(first_value, np.ndarray):
        data_np = data
    else:
        data_np = numpify(data, pad)

    initial_buckets, _ = get_buckets(data, bucket_order, bucket_structure)
    # With fewer than batch_size instances per bucket on average, dropping incomplete batches
    # could yield nothing at all (e.g. when debugging on small samples), so keep them.
    if len(first_value) < len(initial_buckets) * batch_size:
        exact_epoch = True

    def remaining_fractions(buckets):
        # share of the not-yet-served instances that each bucket holds
        total = float(np.sum([len(ids) for ids in buckets.values()]))
        fractions = {}
        for bid, ids in buckets.items():
            fractions[bid] = len(ids) / total if total > 0. else 0.
        return fractions

    def bucket_generator():
        buckets, _ = get_buckets(data, bucket_order, bucket_structure)
        # sorted ids keep the sequence of random calls deterministic
        for bid in sorted(buckets.keys()):
            rs.shuffle(buckets[bid])

        while True:
            ordered = sorted(remaining_fractions(buckets).items(), key=lambda item: item[0])
            bids, probs = zip(*ordered)
            if np.sum(probs) == 0.:
                # every bucket is exhausted
                return
            # pick a bucket proportionally to the number of instances it has left
            bid = rs.choice(bids, replace=False, p=probs)
            batch_indices = buckets[bid][:batch_size]
            buckets[bid] = buckets[bid][batch_size:]
            # an incomplete trailing batch is only emitted when exact_epoch is set
            if exact_epoch or len(batch_indices) == batch_size:
                yield {key: data_np[key][batch_indices] for key in data_np}

    return GeneratorWithRestart(bucket_generator)
Esempio n. 12
0
    def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Build a batch and attach ELMo representations of both sequences.

        With the ``use_dep_sa`` config flag set, the batch holds sequence
        lengths and the dependency triples of supports and questions;
        otherwise it holds lengths, sample ids, character information,
        batch-local word ids and the support/question id lists. The dict is
        passed through ``numpify`` first; the ELMo outputs are added
        afterwards and therefore stay torch tensors.

        Args:
            annotations: the instances of the batch.
            is_eval: forwarded under ``Ports.is_eval``.
            with_answers: if true, ``a.answer`` of every annotation is added
                under ``Ports.Target.target_index``.

        Returns:
            The numpified dict, extended with ``Ports.Input.emb_support`` and
            ``Ports.Input.emb_question``.
        """
        word_chars, word_lengths, tokens, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(
                [a.question_tokens for a in annotations] + [a.support_tokens for a in annotations],
                self.shared_resources.char_vocab)
        # Questions come first in the concatenated input, supports second.
        question_words, support_words = tokens[:len(annotations)], tokens[
            len(annotations):]

        q_lengths = [a.question_length for a in annotations]
        s_lengths = [a.support_length for a in annotations]
        if self.shared_resources.config.get('use_dep_sa', False):
            xy_dict = {
                Ports.Input.support_length:
                s_lengths,
                Ports.Input.support_dep_i:
                [a.support_dep_i for a in annotations],
                Ports.Input.support_dep_j:
                [a.support_dep_j for a in annotations],
                Ports.Input.support_dep_type:
                [a.support_dep_type for a in annotations],
                Ports.Input.question_length:
                q_lengths,
                Ports.Input.question_dep_i:
                [a.question_dep_i for a in annotations],
                Ports.Input.question_dep_j:
                [a.question_dep_j for a in annotations],
                Ports.Input.question_dep_type:
                [a.question_dep_type for a in annotations],
                Ports.is_eval:
                is_eval
            }
        else:
            xy_dict = {
                Ports.Input.question_length: q_lengths,
                Ports.Input.support_length: s_lengths,
                Ports.Input.sample_id: [a.id for a in annotations],
                Ports.Input.word_chars: word_chars,
                Ports.Input.word_char_length: word_lengths,
                Ports.Input.question_batch_words: question_words,
                Ports.Input.support_batch_words: support_words,
                Ports.is_eval: is_eval,
                Ports.Input.support: [a.support_ids for a in annotations],
                Ports.Input.question: [a.question_ids for a in annotations]
            }

        if with_answers:
            xy_dict[Ports.Target.target_index] = [
                a.answer for a in annotations
            ]
        xy_dict = numpify(xy_dict)

        # ELMo embeddings are computed from the raw token strings.
        tokens_support = [a.support_tokens for a in annotations]
        tokens_question = [a.question_tokens for a in annotations]

        chars_support = batch_to_ids(tokens_support)
        chars_question = batch_to_ids(tokens_question)

        if torch.cuda.is_available():
            chars_support = chars_support.cuda()
            chars_question = chars_question.cuda()

        # No gradients are tracked for the ELMo forward pass.
        with torch.no_grad():
            emb_support = self.elmo(
                chars_support)['elmo_representations'][0].detach()
            emb_question = self.elmo(
                chars_question)['elmo_representations'][0].detach()

        # Added after numpify on purpose: these values remain torch tensors
        # (on the GPU when CUDA is available).
        xy_dict[Ports.Input.emb_support] = emb_support
        xy_dict[Ports.Input.emb_question] = emb_question

        return xy_dict
Esempio n. 13
0
    def create_batch(self, annotations, is_eval, with_answers):
        """Extend the parent batch with definitions of predicted answer candidates.

        If definitions are disabled (``self.use_definitions`` is false), or --
        during training with ``training_fraction_with_definition`` below 1.0 --
        a draw from ``self._rng`` exceeds that fraction, the parent batch is
        returned as is. Otherwise the parent batch is built with its eval flag
        forced to ``True``, ``self.reader.model_module`` is run on it to get
        ``topk`` answer spans per question, and for every distinct answer
        lemma string the knowledge store is queried for
        ``'wikipedia_firstsent'`` assertions. At most three definitions per
        answer are kept (ranked by ``sort_by_tfidf`` against the question and
        support lemmas); their words are added to the batch vocabulary.

        Args:
            annotations: pre-processed instances; ``question_lemmas`` and
                ``support_lemmas`` are read here.
            is_eval: evaluation flag; written back to the batch at the end.
            with_answers: forwarded unchanged to the parent ``create_batch``.

        Returns:
            The batch after ``numpify`` has been applied to the definition
            ports and the extended word-level ports.
        """
        definition_fraction = self.config.get(
            'training_fraction_with_definition', 1.0)
        parent = super(XQAAssertionDefinitionInputModule, self)

        # Decide whether this batch gets definitions at all; the random draw
        # only happens for training batches with a fraction below 1.0.
        skip_definitions = not self.use_definitions
        if (not skip_definitions and definition_fraction < 1.0
                and not is_eval):
            skip_definitions = self._rng.random() > definition_fraction
        if skip_definitions:
            return parent.create_batch(annotations, is_eval, with_answers)

        # The parent batch is built with is_eval=True; the caller's flag is
        # restored on the batch further down.
        batch = parent.create_batch(annotations, True, with_answers)

        lemma_vocab = batch['__lemma_vocab']
        vocab = batch['__vocab']
        rev_vocab = batch['__rev_vocab']
        word_chars = batch[AssertionMRPorts.word_chars].tolist()
        word_lengths = batch[AssertionMRPorts.word_char_length].tolist()
        word2lemma = batch[AssertionMRPorts.word2lemma].tolist()
        support = batch[AssertionMRPorts.support]

        # Snapshot of lemma id -> lemma string, taken before any definition
        # lemmas are added.
        id2lemma = {idx: lemma for lemma, idx in lemma_vocab.items()}

        topk = self.config['topk']
        model = self.reader.model_module
        model.set_topk(topk)
        spans = model(batch, [XQAPorts.answer_span])[XQAPorts.answer_span]

        def is_droppable(lemma):
            # stop words and non-alphanumeric lemmas are trimmed from the
            # ends of an answer when the first lookup finds nothing
            return lemma in spacy.en.STOP_WORDS or not lemma.isalnum()

        definitions = []
        definition_lengths = []
        definition2question = []

        seen_lemmas = None
        support_rows = None
        # spans are consumed in groups of `topk` consecutive entries per
        # question (presumably the order the model emits them in)
        for span_idx, span in enumerate(spans):
            question_idx, rank = divmod(span_idx, topk)
            if rank == 0:
                # first candidate of the next question: reset the seen
                # answers and collect the batch rows of its supports
                seen_lemmas = set()
                support_rows = [
                    row for row, q_id in enumerate(
                        batch[XQAPorts.support2question])
                    if q_id == question_idx
                ]

            doc_idx, start, end = span[0], span[1], span[2]
            token_ids = support[support_rows[doc_idx], start:end + 1]
            answer_lemmas = [
                id2lemma[word2lemma[token_id]] for token_id in token_ids
            ]
            answer_lemma = ' '.join(answer_lemmas)
            if answer_lemma in seen_lemmas:
                continue
            seen_lemmas.add(answer_lemma)
            keys = self._knowledge_store.assertion_keys_for_subject(
                answer_lemma, resource='wikipedia_firstsent')

            if not keys:
                # retry with leading/trailing stop words and non-alnum
                # lemmas removed
                first, last = 0, len(answer_lemmas)
                while first < last and is_droppable(answer_lemmas[first]):
                    first += 1
                while last > first and is_droppable(answer_lemmas[last - 1]):
                    last -= 1
                answer_lemmas = answer_lemmas[first:last]
                answer_lemma = ' '.join(answer_lemmas)
                if answer_lemma in seen_lemmas:
                    continue
                seen_lemmas.add(answer_lemma)
                keys = self._knowledge_store.assertion_keys_for_subject(
                    answer_lemma, resource='wikipedia_firstsent')

            defns = [
                self._nlp(self._knowledge_store.get_assertion(key))
                for key in keys
            ]
            if len(defns) > 3:
                # keep only the three definitions that best match the
                # question and the support the answer came from
                context = ' '.join(
                    annotations[question_idx].question_lemmas +
                    annotations[question_idx].support_lemmas[doc_idx])
                candidates = [
                    ' '.join(tok.lemma_ for tok in defn) for defn in defns
                ]
                ranked = sort_by_tfidf(context, candidates)
                defns = [defns[idx] for idx, _ in ranked[:3]]

            for defn in defns:
                definition_lengths.append(len(defn))
                definition2question.append(question_idx)
                defn_ids = []
                for token in defn:
                    word = token.orth_
                    if word not in vocab:
                        # new word: extend every word-aligned structure
                        vocab[word] = len(vocab)
                        word_lengths.append(min(len(word), 20))
                        word_chars.append(
                            [self.char_vocab.get(c, 0) for c in word[:20]])
                        rev_vocab.append(word)
                        word2lemma.append(
                            lemma_vocab.setdefault(token.lemma_,
                                                   len(lemma_vocab)))
                    defn_ids.append(vocab[word])
                definitions.append(defn_ids)

        batch.update({
            DefinitionPorts.definitions: definitions,
            DefinitionPorts.definition_lengths: definition_lengths,
            DefinitionPorts.definition2question: definition2question,
            AssertionMRPorts.word_chars: word_chars,
            AssertionMRPorts.word_char_length: word_lengths,
            AssertionMRPorts.word2lemma: word2lemma,
            AssertionMRPorts.is_eval: is_eval,
        })

        # embeddings are recomputed for the whole (extended) vocabulary
        word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
        for row, word in enumerate(rev_vocab):
            word_embeddings[row] = self._get_emb(self.vocab(word))
        batch[AssertionMRPorts.word_embeddings] = word_embeddings

        ports_to_numpify = [
            DefinitionPorts.definitions,
            DefinitionPorts.definition_lengths,
            DefinitionPorts.definition2question,
            AssertionMRPorts.word_chars,
            AssertionMRPorts.word_char_length,
            AssertionMRPorts.word2lemma,
        ]
        return numpify(batch, keys=ports_to_numpify)
Esempio n. 14
0
    def create_batch(self, annotations, is_eval: bool, with_answers: bool):
        """Turn pre-processed annotations into one batch with retrieved assertions.

        The method (1) picks the supports to use for every question, (2) maps
        all question/support tokens to batch-local unique word ids via
        ``preprocessing.unique_words_with_chars``, (3) unless ``self._limit``
        is 0, looks up assertions in ``self._knowledge_store`` according to
        the configured ``heuristic`` and adds their words to the batch
        vocabulary, and (4) optionally adds answer-span targets.

        Args:
            annotations: pre-processed instances. The attributes read here are
                ``question_tokens``, ``question_length``, ``question_lemmas``,
                ``support_tokens``, ``support_lemmas``, ``support_length``,
                ``word_in_question``, ``token_offsets``, ``selected_supports``
                and ``answer_spans``.
            is_eval: if false, a question with more than
                ``max_training_support`` supports (config, default 2; values
                <= 0 disable sampling) is reduced to a random sample of that
                many supports.
            with_answers: if true, answer-span targets are added to the batch.

        Returns:
            The output of
            ``numpify(output, keys=self.output_ports + self.training_ports)``.

        Raises:
            ValueError: if assertions have to be retrieved and the configured
                ``heuristic`` is neither ``'pair'`` nor ``'tfidf'``.
        """
        q_tokenized = [a.question_tokens for a in annotations]
        question_lengths = [a.question_length for a in annotations]

        max_training_support = self.config.get('max_training_support', 2)
        s_tokenized = []
        s_lemmas = []
        support_lengths = []
        wiq = []
        offsets = []
        support2question = []
        # aligns with support2question, used in output module to get correct index to original set of supports
        selected_support = []
        all_spans = []
        for i, a in enumerate(annotations):
            s_lemmas.append([])
            all_spans.append([])
            if len(a.support_tokens
                   ) > max_training_support > 0 and not is_eval:
                # sample only 2 paragraphs and take first with double probability (the best) to speed
                # things up. Following https://arxiv.org/pdf/1710.10723.pdf
                is_done = False
                any_answer = any(a.answer_spans)
                # sample until there is at least one possible answer (if any)
                while not is_done:
                    # Draw from len + 1 slots: slots 0 and 1 both stand for
                    # the first support, slot k > 1 for support k - 1.
                    selected = self._rng.sample(
                        range(0,
                              len(a.support_tokens) + 1),
                        max_training_support + 1)
                    if 0 in selected and 1 in selected:
                        # both slots of the first support were drawn: drop
                        # the duplicate and convert slots to support indices
                        selected = [s - 1 for s in selected if s > 0]
                    else:
                        selected = [
                            max(0, s - 1)
                            for s in selected[:max_training_support]
                        ]
                    is_done = not any_answer or any(a.answer_spans[s]
                                                    for s in selected)
                # `selected` already holds support indices here (they were
                # used as such in the answer check above). Shifting them a
                # second time would make the last support unreachable and
                # could drop the very support that contains the answer.
                selected = set(selected)
            else:
                selected = set(range(len(a.support_tokens)))
            for s in selected:
                s_tokenized.append(a.support_tokens[s])
                s_lemmas[-1].append(a.support_lemmas[s])
                support_lengths.append(a.support_length[s])
                wiq.append(a.word_in_question[s])
                offsets.append(a.token_offsets[s])
                selected_support.append(a.selected_supports[s])
                support2question.append(i)
                if with_answers:
                    all_spans[-1].append(a.answer_spans[s])

        word_chars, word_lengths, word_ids, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(q_tokenized + s_tokenized, self.char_vocab)

        # questions come first in word_ids, the selected supports follow
        question = word_ids[:len(q_tokenized)]
        support = word_ids[len(q_tokenized):]

        ass_lengths = []
        ass2question = []
        ass2unique = []
        lemma2idx = dict()
        question_arg_span = []
        support_arg_span = []
        assertion2question_arg_span = []
        assertion2support_arg_span = []
        question_arg_span_idx = dict()
        support_arg_span_idx = dict()

        # one lemma id per unique word; filled below and extended whenever an
        # assertion introduces a new word
        word2lemma = [None] * len(rev_vocab)

        heuristic = self.config.get('heuristic', 'pair')
        # index of the current question's first support within `support`
        s_offset = 0
        for i, annot in enumerate(annotations):
            # collect uniq lemmas:
            for k, lemma in enumerate(annot.question_lemmas):
                if lemma not in lemma2idx:
                    lemma2idx[lemma] = len(lemma2idx)
                word2lemma[question[i][k]] = lemma2idx[lemma]
            for k, doc_lemmas in enumerate(s_lemmas[i]):
                for k2, lemma in enumerate(doc_lemmas):
                    if lemma not in lemma2idx:
                        lemma2idx[lemma] = len(lemma2idx)
                    word2lemma[support[s_offset + k][k2]] = lemma2idx[lemma]

            if self._limit == 0:
                # assertion retrieval is switched off
                s_offset += len(s_lemmas[i])
                continue

            if heuristic == 'pair':
                assertions, assertion_args = self._knowledge_store.get_connecting_assertion_keys(
                    annot.question_lemmas,
                    [l for ls in s_lemmas[i] for l in ls], self._sources)
            elif heuristic == 'tfidf':
                assertions, assertion_args = self._knowledge_store.get_assertion_keys(
                    [l for ls in s_lemmas[i] for l in ls], self._sources)
                assertions = list(assertions.keys())
                assertion_strings = [
                    self._knowledge_store.get_assertion(key)
                    for key in assertions
                ]
                scores = sort_by_tfidf(' '.join(annot.question_tokens),
                                       assertion_strings)
                assertions = {assertions[idx]: score for idx, score in scores}
            else:
                # previously this fell through to an UnboundLocalError on
                # `assertions`; fail with an explicit message instead
                raise ValueError(
                    "Unknown assertion retrieval heuristic {!r}; expected "
                    "'pair' or 'tfidf'.".format(heuristic))

            # highest scoring assertions first
            sorted_assertions = sorted(assertions.items(), key=lambda x: -x[1])
            added_assertions = set()
            for key, _ in sorted_assertions:
                if len(added_assertions) == self._limit:
                    break
                assertion_doc = self._nlp(
                    self._knowledge_store.get_assertion(key, cache=True))
                # assertions with the same lemma sequence are added only once
                a_lemma = " ".join(t.lemma_ for t in assertion_doc)
                if a_lemma in added_assertions:
                    continue
                added_assertions.add(a_lemma)
                ass2question.append(i)
                ass_lengths.append(len(assertion_doc))
                if heuristic == 'pair':
                    q_arg_span = assertion_args[key][0]
                    q_arg_span = (i, q_arg_span[0], q_arg_span[1])
                    s_arg_start, s_arg_end = assertion_args[key][1]
                    # the support span refers to the concatenation of all
                    # supports of this question; find the document it falls
                    # into and make the offsets relative to that document
                    doc_idx = 0
                    for doc_lemmas in s_lemmas[i]:
                        if s_arg_start < len(doc_lemmas):
                            break
                        doc_idx += 1
                        s_arg_start -= len(doc_lemmas)
                        s_arg_end -= len(doc_lemmas)
                    s_arg_span = (s_offset + doc_idx, s_arg_start, s_arg_end)
                    # NOTE(review): the de-duplication keys use the
                    # question/document-qualified spans computed above, while
                    # the stored values are the spans exactly as returned by
                    # the knowledge store -- confirm this is intended.
                    if q_arg_span not in question_arg_span_idx:
                        question_arg_span_idx[q_arg_span] = len(
                            question_arg_span)
                        question_arg_span.append(assertion_args[key][0])
                    if s_arg_span not in support_arg_span_idx:
                        support_arg_span_idx[s_arg_span] = len(
                            support_arg_span)
                        support_arg_span.append(assertion_args[key][1])
                    assertion2question_arg_span.append(
                        question_arg_span_idx[q_arg_span])
                    assertion2support_arg_span.append(
                        support_arg_span_idx[s_arg_span])

                u_ass = []
                for t in assertion_doc:
                    w = t.orth_
                    if w not in vocab:
                        # new word: extend every word-aligned structure
                        vocab[w] = len(vocab)
                        word_lengths.append(min(len(w), 20))
                        word_chars.append(
                            [self.char_vocab.get(c, 0) for c in w[:20]])
                        rev_vocab.append(w)
                        if t.lemma_ not in lemma2idx:
                            lemma2idx[t.lemma_] = len(lemma2idx)
                        word2lemma.append(lemma2idx[t.lemma_])
                    u_ass.append(vocab[w])
                ass2unique.append(u_ass)

            s_offset += len(s_lemmas[i])

        word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
        for i, w in enumerate(rev_vocab):
            word_embeddings[i] = self._get_emb(self.vocab(w))

        if not ass2unique:
            # no assertion was retrieved for the whole batch: provide
            # placeholder values for the assertion ports
            ass2unique.append([])
            question_arg_span = support_arg_span = np.zeros([0, 2],
                                                            dtype=np.int32)

        output = {
            AssertionMRPorts.word_chars: word_chars,
            AssertionMRPorts.word_char_length: word_lengths,
            AssertionMRPorts.question: question,
            AssertionMRPorts.support: support,
            AssertionMRPorts.support_length: support_lengths,
            AssertionMRPorts.question_length: question_lengths,
            AssertionMRPorts.is_eval: is_eval,
            AssertionMRPorts.word_embeddings: word_embeddings,
            AssertionMRPorts.assertion_lengths: ass_lengths,
            AssertionMRPorts.assertion2question: ass2question,
            AssertionMRPorts.assertions: ass2unique,
            AssertionMRPorts.word2lemma: word2lemma,
            AssertionMRPorts.question_arg_span: question_arg_span,
            AssertionMRPorts.support_arg_span: support_arg_span,
            AssertionMRPorts.assertion2question_arg_span:
            assertion2question_arg_span,
            AssertionMRPorts.assertion2support_arg_span:
            assertion2support_arg_span,
            XQAPorts.word_in_question: wiq,
            XQAPorts.support2question: support2question,
            XQAPorts.token_offsets: offsets,
            XQAPorts.selected_support: selected_support,
            '__vocab': vocab,
            '__rev_vocab': rev_vocab,
            '__lemma_vocab': lemma2idx,
        }

        if with_answers:
            # flatten the per-question / per-support span lists and remember
            # for every span the batch index of the support it belongs to
            spans = [
                s for per_question in all_spans
                for spans_per_support in per_question
                for s in spans_per_support
            ]
            span2support = []
            support_idx = 0
            for per_question in all_spans:
                for spans_per_support in per_question:
                    span2support.extend([support_idx] * len(spans_per_support))
                    support_idx += 1
            output.update({
                XQAPorts.answer_span_target:
                spans if spans else np.zeros([0, 2], np.int32),
                XQAPorts.correct_start:
                [] if is_eval else [span[0] for span in spans],
                XQAPorts.answer2support_training:
                span2support,
            })

        # we can only numpify in here, because bucketing is not possible prior
        batch = numpify(output, keys=self.output_ports + self.training_ports)
        return batch