Python unique_words_with_charsの例

プログラミング言語: Python

名前空間/パッケージ名: jack.util.preprocessing

メソッド/関数: unique_words_with_chars

hotexamples.comのコード掲載数: 7

Python unique_words_with_chars - 7件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのjack.util.preprocessing.unique_words_with_charsの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: shared.py プロジェクト: mitchelljeff/hack1

    def create_batch(self, annotations: List[XQAAnnotation], is_eval: bool, with_answers: bool) \
            -> Mapping[TensorPort, np.ndarray]:
        """Collate a list of annotations into a single port -> value mapping.

        Args:
            annotations: the instances of this batch; their attributes are
                gathered into batch-level lists.
            is_eval: forwarded on the ``is_eval`` port. It also fixes the keep
                probability to 1.0 (otherwise ``1 - self.dropout`` is used) and
                empties the ``correct_start_training`` list.
            with_answers: when true, the answer-span ports are filled as well.

        Returns:
            Whatever ``numpify`` returns for the assembled mapping.
        """
        question_tokens = [ann.question_tokens for ann in annotations]
        support_tokens = [ann.support_tokens for ann in annotations]

        # Character-level view of the distinct words of the batch, plus the
        # index structures pointing from question/support words to them.
        char_ids, char_lengths, question_index, support_index = preprocessing.unique_words_with_chars(
            question_tokens, support_tokens, self.char_vocab)

        if is_eval:
            keep_prob = 1.0
        else:
            keep_prob = 1 - self.dropout

        feed = {
            XQAPorts.unique_word_chars: char_ids,
            XQAPorts.unique_word_char_length: char_lengths,
            XQAPorts.question_words2unique: question_index,
            XQAPorts.support_words2unique: support_index,
            XQAPorts.emb_support: preprocessing.stack_and_pad(
                [ann.support_embeddings for ann in annotations]),
            XQAPorts.support_length: [ann.support_length for ann in annotations],
            XQAPorts.emb_question: preprocessing.stack_and_pad(
                [ann.question_embeddings for ann in annotations]),
            XQAPorts.question_length: [ann.question_length for ann in annotations],
            XQAPorts.word_in_question: [ann.word_in_question for ann in annotations],
            XQAPorts.keep_prob: keep_prob,
            XQAPorts.is_eval: is_eval,
            XQAPorts.token_char_offsets: [ann.token_offsets for ann in annotations],
        }

        if with_answers:
            # Flatten the per-question span lists and remember, for every
            # span, the batch position of the question it belongs to.
            flat_spans = []
            span_owner = []
            for question_idx, ann in enumerate(annotations):
                for span in ann.answer_spans:
                    flat_spans.append(span)
                    span_owner.append(question_idx)

            feed[XQAPorts.answer_span] = flat_spans
            feed[XQAPorts.correct_start_training] = [] if is_eval else [span[0] for span in flat_spans]
            feed[XQAPorts.answer2question_training] = span_owner

        # conversion to numpy has to happen here: bucketing is not possible earlier
        return numpify(feed, keys=[XQAPorts.unique_word_chars,
                                   XQAPorts.question_words2unique, XQAPorts.support_words2unique,
                                   XQAPorts.word_in_question, XQAPorts.token_char_offsets])

コード例 #2

ファイルを表示

    def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Turn a list of annotations into a numpified batch dictionary.

        Args:
            annotations: the instances forming this batch.
            is_eval: forwarded unchanged on ``Ports.is_eval``.
            with_answers: when true, ``Ports.Target.target_index`` is filled
                from each annotation's ``answer``.

        Returns:
            The result of ``numpify`` on the assembled dictionary.
        """
        batch_size = len(annotations)

        # Character information: questions first, supports second, so the
        # returned per-sequence ids can be split at ``batch_size``.
        question_tokens = [ann.question_tokens for ann in annotations]
        support_tokens = [ann.support_tokens for ann in annotations]
        word_chars, word_lengths, word_ids, _vocab, _rev_vocab = preprocessing.unique_words_with_chars(
            question_tokens + support_tokens, self.shared_resources.char_vocab)
        question_words = word_ids[:batch_size]
        support_words = word_ids[batch_size:]

        q_lengths = [ann.question_length for ann in annotations]
        s_lengths = [ann.support_length for ann in annotations]

        batch = {
            Ports.Input.question_length: q_lengths,
            Ports.Input.support_length: s_lengths,
            Ports.Input.sample_id: [ann.id for ann in annotations],
            Ports.Input.word_chars: word_chars,
            Ports.Input.word_char_length: word_lengths,
            Ports.Input.question_words: question_words,
            Ports.Input.support_words: support_words,
            Ports.is_eval: is_eval,
        }

        if not self.shared_resources.config.get("vocab_from_embeddings", False):
            # Ship raw ids.
            batch[Ports.Input.support] = [ann.support_ids for ann in annotations]
            batch[Ports.Input.question] = [ann.question_ids for ann in annotations]
        else:
            # Ship zero-padded embedding tensors looked up id by id.
            longest_support = max(s_lengths)
            emb_dim = self.emb_matrix.shape[1]
            emb_support = np.zeros([batch_size, longest_support, emb_dim])
            emb_question = np.zeros([batch_size, max(q_lengths), emb_dim])
            for row, ann in enumerate(annotations):
                for col, word_id in enumerate(ann.support_ids):
                    emb_support[row, col] = self._get_emb(word_id)
                for col, word_id in enumerate(ann.question_ids):
                    emb_question[row, col] = self._get_emb(word_id)

            batch[Ports.Input.emb_support] = emb_support
            batch[Ports.Input.emb_question] = emb_question

        if with_answers:
            batch[Ports.Target.target_index] = [ann.answer for ann in annotations]

        return numpify(batch)

コード例 #3

ファイルを表示

    def create_batch(self, annotations: List[Mapping[str, any]], is_eval: bool, with_answers: bool):
        """Build one batch and attach knowledge-store assertions to every instance.

        Args:
            annotations: one mapping per instance. The keys read here are
                ``question_tokens``, ``support_tokens``, ``question_lemmas``,
                ``support_lemmas``, ``question_lengths``, ``support_lengths``
                and, optionally, ``answers``.
            is_eval: forwarded unchanged on ``AssertionMRPorts.is_eval``.
            with_answers: not consulted by this method; targets are emitted
                whenever the first annotation has an ``answers`` key.

        Returns:
            The assembled dictionary passed through ``numpify`` with
            ``keys=self.output_ports + self.training_ports``.
        """
        # NOTE(review): the signature annotates values with the builtin ``any``;
        # ``typing.Any`` was presumably intended - confirm before changing.
        support_lengths = list()
        question_lengths = list()

        ass_lengths = []
        ass2question = []
        ass2unique = []
        lemma2idx = dict()
        question_arg_span = []
        support_arg_span = []
        assertions2question_arg_span = []
        assertions2support_arg_span = []

        # (instance, start, end) -> position in question_arg_span / support_arg_span
        question_arg_span_idx = dict()
        support_arg_span_idx = dict()

        # Questions come first and supports second, so the returned id
        # sequences can be split at len(annotations).
        word_chars, word_lengths, tokens, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(
                [a["question_tokens"] for a in annotations] + [a["support_tokens"] for a in annotations],
                self.char_vocab)
        question, support = tokens[:len(annotations)], tokens[len(annotations):]

        # One slot per batch-vocabulary word; filled below with lemma ids.
        word2lemma = [None] * len(rev_vocab)

        # we have to create batches here and cannot precompute them because of the batch-specific wiq feature
        for i, annot in enumerate(annotations):
            support_lengths.append(annot['support_lengths'])
            question_lengths.append(annot['question_lengths'])

            # collect unique lemmas and map each batch word id to its lemma id
            for k, l in enumerate(annot['question_lemmas']):
                if l not in lemma2idx:
                    lemma2idx[l] = len(lemma2idx)
                word2lemma[question[i][k]] = lemma2idx[l]
            for k, l in enumerate(annot['support_lemmas']):
                if l not in lemma2idx:
                    lemma2idx[l] = len(lemma2idx)
                word2lemma[support[i][k]] = lemma2idx[l]

            assertions, assertion_args = self._knowledge_store.get_connecting_assertion_keys(
                annot['question_lemmas'], annot['support_lemmas'], self._sources)

            # Visit assertion keys in descending order of their associated value
            # (presumably a relevance score - confirm against the knowledge store).
            sorted_assertionss = sorted(assertions.items(), key=lambda x: -x[1])
            added_assertionss = set()
            for key, _ in sorted_assertionss:
                if len(added_assertionss) == self._limit:
                    break
                a = self.__nlp(self._knowledge_store.get_assertion(key))
                # Assertions with the same lemmatised text are added only once per instance.
                a_lemma = " ".join(t.lemma_ for t in a)
                if a_lemma in added_assertionss:
                    continue
                added_assertionss.add(a_lemma)
                ass2question.append(i)
                ass_lengths.append(len(a))
                # Argument spans are de-duplicated per instance via (i, start, end) keys.
                q_arg_span = assertion_args[key][0]
                q_arg_span = (i, q_arg_span[0], q_arg_span[1])
                s_arg_span = assertion_args[key][1]
                s_arg_span = (i, s_arg_span[0], s_arg_span[1])
                if q_arg_span not in question_arg_span_idx:
                    question_arg_span_idx[q_arg_span] = len(question_arg_span)
                    question_arg_span.append(assertion_args[key][0])
                if s_arg_span not in support_arg_span_idx:
                    support_arg_span_idx[s_arg_span] = len(support_arg_span)
                    support_arg_span.append(assertion_args[key][1])
                assertions2question_arg_span.append(question_arg_span_idx[q_arg_span])
                assertions2support_arg_span.append(support_arg_span_idx[s_arg_span])

                # Map assertion tokens to batch word ids, extending the batch
                # vocabulary (and its character/lemma tables) for unseen words.
                u_ass = []
                for t in a:
                    w = t.orth_
                    if w not in vocab:
                        vocab[w] = len(vocab)
                        # new words are truncated to 20 characters; unknown characters map to 0
                        word_lengths.append(min(len(w), 20))
                        word_chars.append([self.char_vocab.get(c, 0) for c in w[:20]])
                        rev_vocab.append(w)
                        if t.lemma_ not in lemma2idx:
                            lemma2idx[t.lemma_] = len(lemma2idx)
                        word2lemma.append(lemma2idx[t.lemma_])
                    u_ass.append(vocab[w])
                ass2unique.append(u_ass)

        # One embedding row per word of the (possibly extended) batch vocabulary.
        word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
        for i, w in enumerate(rev_vocab):
            word_embeddings[i] = self._get_emb(self.shared_resources.vocab(w))

        if not ass2unique:
            # No assertion was found for any instance: emit one empty assertion
            # row and empty (0, 2) span arrays.
            ass2unique.append([])
            question_arg_span = support_arg_span = np.zeros([0, 2], dtype=np.int32)

        output = {
            AssertionMRPorts.word_chars: word_chars,
            AssertionMRPorts.word_char_length: word_lengths,
            AssertionMRPorts.question: question,
            AssertionMRPorts.support: support,
            AssertionMRPorts.support_length: support_lengths,
            AssertionMRPorts.question_length: question_lengths,
            AssertionMRPorts.is_eval: is_eval,
            AssertionMRPorts.word_embeddings: word_embeddings,
            AssertionMRPorts.assertion_lengths: ass_lengths,
            AssertionMRPorts.assertion2question: ass2question,
            AssertionMRPorts.assertions: ass2unique,
            AssertionMRPorts.word2lemma: word2lemma,
            AssertionMRPorts.question_arg_span: question_arg_span,
            AssertionMRPorts.support_arg_span: support_arg_span,
            AssertionMRPorts.assertion2question_arg_span: assertions2question_arg_span,
            AssertionMRPorts.assertion2support_arg_span: assertions2support_arg_span,
            '__vocab': vocab,
            '__rev_vocab': rev_vocab,
            '__lemma_vocab': lemma2idx,
        }
        # Targets are keyed off the first annotation rather than ``with_answers``.
        if "answers" in annotations[0]:
            output[Ports.Target.target_index] = [a["answers"] for a in annotations]

        return numpify(output, keys=self.output_ports + self.training_ports)

コード例 #4

ファイルを表示

ファイル: shared.py プロジェクト: jg8610/jack

    def create_batch(self, annotations: List[XQAAnnotation], is_eval: bool, with_answers: bool) \
            -> Mapping[TensorPort, np.ndarray]:
        """Collate multi-support annotations into a single batch.

        Every annotation may carry several supports. During training
        (``not is_eval``), when an annotation has more supports than the
        ``max_training_support`` config value (default 2, and only if it is
        positive), a random subset is drawn with ``self._rng``; otherwise all
        supports are used. The chosen supports of all questions are laid out
        flat, and ``support2question`` records the owning question of each.

        Args:
            annotations: the instances of this batch.
            is_eval: disables support sub-sampling, is forwarded on the
                ``is_eval`` port and empties ``correct_start``.
            with_answers: when true, answer spans are collected and emitted.

        Returns:
            Whatever ``numpify`` returns for the assembled mapping.
        """
        q_tokenized = [ann.question_tokens for ann in annotations]
        question_lengths = [ann.question_length for ann in annotations]
        num_questions = len(q_tokenized)

        max_training_support = self.config.get('max_training_support', 2)

        # Flat, parallel lists: position k of each one describes the k-th chosen support.
        s_tokenized, support_lengths, wiq, offsets = [], [], [], []
        support2question, support_ids = [], []
        # parallel to support2question; lets the output module recover the index
        # into the original set of supports
        selected_support = []
        all_spans = []

        for question_idx, ann in enumerate(annotations):
            question_spans = []
            all_spans.append(question_spans)
            num_supports = len(ann.support_tokens)
            subsample = num_supports > max_training_support > 0 and not is_eval

            if subsample:
                # Speed-up following https://arxiv.org/pdf/1710.10723.pdf: keep only
                # a few paragraphs. Draws 0 and 1 both map to support 0, so the
                # first (best) support is twice as likely to be picked.
                has_answer = any(ann.answer_spans)
                # redraw until some chosen support has an answer (if any support has one)
                while True:
                    drawn = self._rng.sample(range(0, num_supports + 1), max_training_support + 1)
                    if 0 in drawn and 1 in drawn:
                        chosen = [d - 1 for d in drawn if d > 0]
                    else:
                        chosen = [max(0, d - 1) for d in drawn[:max_training_support]]
                    if not has_answer or any(ann.answer_spans[c] for c in chosen):
                        break
            else:
                chosen = set(range(num_supports))

            for s in chosen:
                s_tokenized.append(ann.support_tokens[s])
                support_lengths.append(ann.support_length[s])
                wiq.append(ann.word_in_question[s])
                offsets.append(ann.token_offsets[s])
                selected_support.append(ann.selected_supports[s])
                support_ids.append(ann.support_ids[s])
                support2question.append(question_idx)
                if with_answers:
                    question_spans.append(ann.answer_spans[s])

        word_chars, word_lengths, word_ids, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(q_tokenized + s_tokenized, self.char_vocab)

        # Zero-padded embedding tensors, filled id by id.
        emb_support = np.zeros([len(support_lengths), max(support_lengths), self.vocab.emb_length])
        emb_question = np.zeros([len(question_lengths), max(question_lengths), self.vocab.emb_length])

        for row, ann in enumerate(annotations):
            for col, q_id in enumerate(ann.question_ids):
                emb_question[row, col] = self._get_emb(q_id)
        for row, ids in enumerate(support_ids):
            for col, s_id in enumerate(ids):
                emb_support[row, col] = self._get_emb(s_id)

        feed = {
            XQAPorts.word_chars: word_chars,
            XQAPorts.word_char_length: word_lengths,
            XQAPorts.question_words: word_ids[:num_questions],
            XQAPorts.support_words: word_ids[num_questions:],
            XQAPorts.emb_support: emb_support,
            XQAPorts.support_length: support_lengths,
            XQAPorts.emb_question: emb_question,
            XQAPorts.question_length: question_lengths,
            XQAPorts.word_in_question: wiq,
            XQAPorts.support2question: support2question,
            XQAPorts.is_eval: is_eval,
            XQAPorts.token_offsets: offsets,
            XQAPorts.selected_support: selected_support,
            '__vocab': vocab,
            '__rev_vocab': rev_vocab,
        }

        if with_answers:
            # Flatten spans over questions and supports; span2support holds the
            # flat support position every span belongs to.
            flat_spans = []
            span2support = []
            support_pos = 0
            for per_question in all_spans:
                for per_support in per_question:
                    flat_spans.extend(per_support)
                    span2support.extend([support_pos] * len(per_support))
                    support_pos += 1

            feed[XQAPorts.answer_span] = list(flat_spans)
            feed[XQAPorts.correct_start] = [] if is_eval else [span[0] for span in flat_spans]
            feed[XQAPorts.answer2support_training] = span2support

        # conversion to numpy has to happen here: bucketing is not possible earlier
        return numpify(feed, keys=[XQAPorts.word_chars,
                                   XQAPorts.question_words, XQAPorts.support_words,
                                   XQAPorts.word_in_question, XQAPorts.token_offsets])

コード例 #5

ファイルを表示

    def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Turn a list of annotations into a numpified batch dictionary.

        The ``use_dep_sa`` config flag selects which set of input ports is
        filled: the dependency ports plus lengths, or the regular
        word/character ports. Token-level embeddings are added whenever
        ``self.embeddings`` is set.

        Args:
            annotations: the instances forming this batch.
            is_eval: forwarded unchanged on ``Ports.is_eval``.
            with_answers: when true, ``Ports.Target.target_index`` is filled
                from each annotation's ``answer``.

        Returns:
            The result of ``numpify`` on the assembled dictionary.
        """
        batch_size = len(annotations)

        # Character information: questions first, supports second, so the
        # returned per-sequence ids can be split at ``batch_size``.
        question_tokens = [ann.question_tokens for ann in annotations]
        support_tokens = [ann.support_tokens for ann in annotations]
        word_chars, word_lengths, word_ids, _vocab, _rev_vocab = preprocessing.unique_words_with_chars(
            question_tokens + support_tokens, self.shared_resources.char_vocab)
        question_words = word_ids[:batch_size]
        support_words = word_ids[batch_size:]

        q_lengths = [ann.question_length for ann in annotations]
        s_lengths = [ann.support_length for ann in annotations]

        if not self.shared_resources.config.get('use_dep_sa', False):
            batch = {
                Ports.Input.question_length: q_lengths,
                Ports.Input.support_length: s_lengths,
                Ports.Input.sample_id: [ann.id for ann in annotations],
                Ports.Input.word_chars: word_chars,
                Ports.Input.word_char_length: word_lengths,
                Ports.Input.question_batch_words: question_words,
                Ports.Input.support_batch_words: support_words,
                Ports.is_eval: is_eval,
                Ports.Input.support: [ann.support_ids for ann in annotations],
                Ports.Input.question: [ann.question_ids for ann in annotations],
            }
        else:
            batch = {
                Ports.Input.support_length: s_lengths,
                Ports.Input.support_dep_i: [ann.support_dep_i for ann in annotations],
                Ports.Input.support_dep_j: [ann.support_dep_j for ann in annotations],
                Ports.Input.support_dep_type: [ann.support_dep_type for ann in annotations],
                Ports.Input.question_length: q_lengths,
                Ports.Input.question_dep_i: [ann.question_dep_i for ann in annotations],
                Ports.Input.question_dep_j: [ann.question_dep_j for ann in annotations],
                Ports.Input.question_dep_type: [ann.question_dep_type for ann in annotations],
                Ports.is_eval: is_eval,
            }

        if self.embeddings is not None:
            # Zero-padded embedding tensors filled token by token; tokens
            # missing from ``self.embeddings`` get the default vector.
            longest_support = max(s_lengths)
            emb_dim = self.embeddings.shape[-1]
            emb_support = np.zeros([batch_size, longest_support, emb_dim])
            emb_question = np.zeros([batch_size, max(q_lengths), self.embeddings.shape[-1]])
            for row, ann in enumerate(annotations):
                for col, token in enumerate(ann.support_tokens):
                    emb_support[row, col] = self.embeddings.get(token, self.__default_vec)
                for col, token in enumerate(ann.question_tokens):
                    emb_question[row, col] = self.embeddings.get(token, self.__default_vec)

            batch[Ports.Input.emb_support] = emb_support
            batch[Ports.Input.emb_question] = emb_question

        if with_answers:
            batch[Ports.Target.target_index] = [ann.answer for ann in annotations]

        return numpify(batch)

コード例 #6

ファイルを表示

    def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        """Turn a list of annotations into a batch dictionary with ELMo embeddings.

        The ``use_dep_sa`` config flag selects which set of input ports is
        filled: the dependency ports plus lengths, or the regular
        word/character ports. The dictionary is numpified first; the ELMo
        outputs for supports and questions are attached afterwards and are
        therefore stored exactly as ``self.elmo`` returned them (detached).

        Args:
            annotations: the instances forming this batch.
            is_eval: forwarded unchanged on ``Ports.is_eval``.
            with_answers: when true, ``Ports.Target.target_index`` is filled
                from each annotation's ``answer``.

        Returns:
            The numpified dictionary extended with ``Ports.Input.emb_support``
            and ``Ports.Input.emb_question``.
        """
        # Questions come first and supports second, so the returned id
        # sequences can be split at len(annotations).
        word_chars, word_lengths, tokens, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(
                [a.question_tokens for a in annotations] + [a.support_tokens for a in annotations],
                self.shared_resources.char_vocab)
        question_words, support_words = tokens[:len(annotations)], tokens[
            len(annotations):]

        q_lengths = [a.question_length for a in annotations]
        s_lengths = [a.support_length for a in annotations]
        if self.shared_resources.config.get('use_dep_sa', False):
            xy_dict = {
                Ports.Input.support_length:
                s_lengths,
                Ports.Input.support_dep_i:
                [a.support_dep_i for a in annotations],
                Ports.Input.support_dep_j:
                [a.support_dep_j for a in annotations],
                Ports.Input.support_dep_type:
                [a.support_dep_type for a in annotations],
                Ports.Input.question_length:
                q_lengths,
                Ports.Input.question_dep_i:
                [a.question_dep_i for a in annotations],
                Ports.Input.question_dep_j:
                [a.question_dep_j for a in annotations],
                Ports.Input.question_dep_type:
                [a.question_dep_type for a in annotations],
                Ports.is_eval:
                is_eval
            }
        else:
            xy_dict = {
                Ports.Input.question_length: q_lengths,
                Ports.Input.support_length: s_lengths,
                Ports.Input.sample_id: [a.id for a in annotations],
                Ports.Input.word_chars: word_chars,
                Ports.Input.word_char_length: word_lengths,
                Ports.Input.question_batch_words: question_words,
                Ports.Input.support_batch_words: support_words,
                Ports.is_eval: is_eval,
                Ports.Input.support: [a.support_ids for a in annotations],
                Ports.Input.question: [a.question_ids for a in annotations]
            }

        if with_answers:
            xy_dict[Ports.Target.target_index] = [
                a.answer for a in annotations
            ]
        xy_dict = numpify(xy_dict)

        # Elmo embeddings, computed from the raw token lists
        tokens_support = [a.support_tokens for a in annotations]
        tokens_question = [a.question_tokens for a in annotations]

        chars_support = batch_to_ids(tokens_support)
        chars_question = batch_to_ids(tokens_question)

        # Move the character ids to the GPU when one is available.
        if torch.cuda.is_available():
            chars_support = chars_support.cuda()
            chars_question = chars_question.cuda()

        # Inference only: no gradients are tracked for the ELMo forward pass.
        with torch.no_grad():
            emb_support = self.elmo(
                chars_support)['elmo_representations'][0].detach()
            emb_question = self.elmo(
                chars_question)['elmo_representations'][0].detach()

        # Added after numpify, so these entries are not converted by it.
        xy_dict[Ports.Input.emb_support] = emb_support
        xy_dict[Ports.Input.emb_question] = emb_question

        return xy_dict

コード例 #7

ファイルを表示

ファイル: shared.py プロジェクト: 5l1v3r1/jack-1

    def create_batch(self, annotations, is_eval: bool, with_answers: bool):
        """Collate multi-support annotations and attach knowledge-store assertions.

        Every annotation may carry several supports. During training
        (``not is_eval``), when an annotation has more supports than the
        ``max_training_support`` config value (default 2, and only if it is
        positive), a random subset is drawn with ``self._rng``; otherwise all
        supports are used. The chosen supports of all questions are laid out
        flat. Afterwards up to ``self._limit`` assertions per question are
        fetched from ``self._knowledge_store`` (selection strategy given by
        the ``heuristic`` config value, ``'pair'`` or ``'tfidf'``) and
        expressed in terms of a batch-local vocabulary.

        Args:
            annotations: the instances of this batch.
            is_eval: disables support sub-sampling, is forwarded on the
                ``is_eval`` port and empties ``correct_start``.
            with_answers: when true, answer spans are collected and emitted.

        Returns:
            The assembled dictionary passed through ``numpify`` with
            ``keys=self.output_ports + self.training_ports``.
        """
        q_tokenized = [a.question_tokens for a in annotations]
        question_lengths = [a.question_length for a in annotations]

        max_training_support = self.config.get('max_training_support', 2)
        s_tokenized = []
        s_lemmas = []  # per question: lemma lists of its chosen supports
        support_lengths = []
        wiq = []
        offsets = []
        support2question = []
        # aligns with support2question, used in output module to get correct index to original set of supports
        selected_support = []
        all_spans = []
        for i, a in enumerate(annotations):
            s_lemmas.append([])
            all_spans.append([])
            if len(a.support_tokens) > max_training_support > 0 and not is_eval:
                # sample only 2 paragraphs and take first with double probability (the best) to speed
                # things up. Following https://arxiv.org/pdf/1710.10723.pdf
                # Draws 0 and 1 both map to support 0, which is what doubles its probability.
                is_done = False
                any_answer = any(a.answer_spans)
                # sample until there is at least one possible answer (if any)
                while not is_done:
                    selected = self._rng.sample(
                        range(0, len(a.support_tokens) + 1),
                        max_training_support + 1)
                    if 0 in selected and 1 in selected:
                        selected = [s - 1 for s in selected if s > 0]
                    else:
                        selected = [
                            max(0, s - 1)
                            for s in selected[:max_training_support]
                        ]
                    is_done = not any_answer or any(a.answer_spans[s]
                                                    for s in selected)
                # ``selected`` already holds support indices at this point. It must
                # not be shifted by -1 again: that would discard the support that
                # was just verified to contain an answer and would make the last
                # support unreachable.
                selected = set(selected)
            else:
                selected = set(range(len(a.support_tokens)))
            for s in selected:
                s_tokenized.append(a.support_tokens[s])
                s_lemmas[-1].append(a.support_lemmas[s])
                support_lengths.append(a.support_length[s])
                wiq.append(a.word_in_question[s])
                offsets.append(a.token_offsets[s])
                selected_support.append(a.selected_supports[s])
                support2question.append(i)
                if with_answers:
                    all_spans[-1].append(a.answer_spans[s])

        # Questions come first and supports second, so the returned id
        # sequences can be split at len(q_tokenized).
        word_chars, word_lengths, word_ids, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(q_tokenized + s_tokenized, self.char_vocab)

        question = word_ids[:len(q_tokenized)]
        support = word_ids[len(q_tokenized):]

        ass_lengths = []
        ass2question = []
        ass2unique = []
        lemma2idx = dict()
        question_arg_span = []
        support_arg_span = []
        assertion2question_arg_span = []
        assertion2support_arg_span = []
        # span key -> position in question_arg_span / support_arg_span
        question_arg_span_idx = dict()
        support_arg_span_idx = dict()

        # One slot per batch-vocabulary word; filled below with lemma ids.
        word2lemma = [None] * len(rev_vocab)

        heuristic = self.config.get('heuristic', 'pair')
        # Position of the current question's first support in the flat support list.
        s_offset = 0
        for i, annot in enumerate(annotations):
            # collect unique lemmas and map each batch word id to its lemma id
            for k, l in enumerate(annot.question_lemmas):
                if l not in lemma2idx:
                    lemma2idx[l] = len(lemma2idx)
                word2lemma[question[i][k]] = lemma2idx[l]
            for k, ls in enumerate(s_lemmas[i]):
                for k2, l in enumerate(ls):
                    if l not in lemma2idx:
                        lemma2idx[l] = len(lemma2idx)
                    word2lemma[support[s_offset + k][k2]] = lemma2idx[l]

            if self._limit == 0:
                # Assertions disabled: only advance the support offset.
                s_offset += len(s_lemmas[i])
                continue

            if heuristic == 'pair':
                # The supports' lemmas are concatenated into one flat list for the lookup.
                assertions, assertion_args = self._knowledge_store.get_connecting_assertion_keys(
                    annot.question_lemmas,
                    [l for ls in s_lemmas[i] for l in ls], self._sources)
            elif heuristic == 'tfidf':
                assertions, assertion_args = self._knowledge_store.get_assertion_keys(
                    [l for ls in s_lemmas[i] for l in ls], self._sources)
                assertions = list(assertions.keys())
                assertion_strings = [
                    self._knowledge_store.get_assertion(key)
                    for key in assertions
                ]
                # Re-score the candidate assertions against the question text.
                scores = sort_by_tfidf(' '.join(annot.question_tokens),
                                       assertion_strings)
                assertions = {assertions[i]: s for i, s in scores}

            # Visit assertion keys in descending order of their associated value.
            sorted_assertions = sorted(assertions.items(), key=lambda x: -x[1])
            added_assertions = set()
            for key, _ in sorted_assertions:
                if len(added_assertions) == self._limit:
                    break
                a = self._nlp(
                    self._knowledge_store.get_assertion(key, cache=True))
                # Assertions with the same lemmatised text are added only once per question.
                a_lemma = " ".join(t.lemma_ for t in a)
                if a_lemma in added_assertions:
                    continue
                else:
                    added_assertions.add(a_lemma)
                ass2question.append(i)
                ass_lengths.append(len(a))
                if heuristic == 'pair':
                    q_arg_span = assertion_args[key][0]
                    q_arg_span = (i, q_arg_span[0], q_arg_span[1])
                    s_arg_start, s_arg_end = assertion_args[key][1]
                    # Translate the offset into the concatenated lemma list into
                    # (support index, offset within that support).
                    doc_idx = 0
                    for ls in s_lemmas[i]:
                        if s_arg_start < len(ls):
                            break
                        else:
                            doc_idx += 1
                            s_arg_start -= len(ls)
                            s_arg_end -= len(ls)
                    s_arg_span = (s_offset + doc_idx, s_arg_start, s_arg_end)
                    if q_arg_span not in question_arg_span_idx:
                        question_arg_span_idx[q_arg_span] = len(
                            question_arg_span)
                        question_arg_span.append(assertion_args[key][0])
                    if s_arg_span not in support_arg_span_idx:
                        support_arg_span_idx[s_arg_span] = len(
                            support_arg_span)
                        # NOTE(review): the stored span is the untranslated one from
                        # assertion_args, while the de-duplication key uses the
                        # per-support offsets - confirm this is intended.
                        support_arg_span.append(assertion_args[key][1])
                    assertion2question_arg_span.append(
                        question_arg_span_idx[q_arg_span])
                    assertion2support_arg_span.append(
                        support_arg_span_idx[s_arg_span])

                # Map assertion tokens to batch word ids, extending the batch
                # vocabulary (and its character/lemma tables) for unseen words.
                u_ass = []
                for t in a:
                    w = t.orth_
                    if w not in vocab:
                        vocab[w] = len(vocab)
                        # new words are truncated to 20 characters; unknown characters map to 0
                        word_lengths.append(min(len(w), 20))
                        word_chars.append(
                            [self.char_vocab.get(c, 0) for c in w[:20]])
                        rev_vocab.append(w)
                        if t.lemma_ not in lemma2idx:
                            lemma2idx[t.lemma_] = len(lemma2idx)
                        word2lemma.append(lemma2idx[t.lemma_])
                    u_ass.append(vocab[w])
                ass2unique.append(u_ass)

            s_offset += len(s_lemmas[i])

        # One embedding row per word of the (possibly extended) batch vocabulary.
        word_embeddings = np.zeros([len(rev_vocab), self.emb_matrix.shape[1]])
        for i, w in enumerate(rev_vocab):
            word_embeddings[i] = self._get_emb(self.vocab(w))

        if not ass2unique:
            # No assertion was found for any question: emit one empty assertion
            # row and empty (0, 2) span arrays.
            ass2unique.append([])
            question_arg_span = support_arg_span = np.zeros([0, 2],
                                                            dtype=np.int32)

        output = {
            AssertionMRPorts.word_chars: word_chars,
            AssertionMRPorts.word_char_length: word_lengths,
            AssertionMRPorts.question: question,
            AssertionMRPorts.support: support,
            AssertionMRPorts.support_length: support_lengths,
            AssertionMRPorts.question_length: question_lengths,
            AssertionMRPorts.is_eval: is_eval,
            AssertionMRPorts.word_embeddings: word_embeddings,
            AssertionMRPorts.assertion_lengths: ass_lengths,
            AssertionMRPorts.assertion2question: ass2question,
            AssertionMRPorts.assertions: ass2unique,
            AssertionMRPorts.word2lemma: word2lemma,
            AssertionMRPorts.question_arg_span: question_arg_span,
            AssertionMRPorts.support_arg_span: support_arg_span,
            AssertionMRPorts.assertion2question_arg_span:
            assertion2question_arg_span,
            AssertionMRPorts.assertion2support_arg_span:
            assertion2support_arg_span,
            XQAPorts.word_in_question: wiq,
            XQAPorts.support2question: support2question,
            XQAPorts.token_offsets: offsets,
            XQAPorts.selected_support: selected_support,
            '__vocab': vocab,
            '__rev_vocab': rev_vocab,
            '__lemma_vocab': lemma2idx,
        }

        if with_answers:
            # Flatten spans over questions and supports; span2support holds the
            # flat support position every span belongs to.
            spans = [
                s for a in all_spans for spans_per_support in a
                for s in spans_per_support
            ]
            span2support = []
            support_idx = 0
            for a in all_spans:
                for spans_per_support in a:
                    span2support.extend([support_idx] * len(spans_per_support))
                    support_idx += 1
            output.update({
                # an empty (0, 2) array stands in when there are no spans at all
                XQAPorts.answer_span_target:
                [span
                 for span in spans] if spans else np.zeros([0, 2], np.int32),
                XQAPorts.correct_start:
                [] if is_eval else [span[0] for span in spans],
                XQAPorts.answer2support_training:
                span2support,
            })

        # we can only numpify in here, because bucketing is not possible prior
        batch = numpify(output, keys=self.output_ports + self.training_ports)
        return batch