    def preprocess(self, docs: List[Document], evidence, name='train'):
        out = []
        for doc in docs:
            for para_ix, para in enumerate(doc.paragraphs):
                for q in para.questions:
                    if q.answer:
                        ans = q.answer.answer_spans
                    else:
                        ans = np.zeros((0, 2), dtype=np.int32)
                    if self.text_process:
                        text, ans, inv = self.text_process.encode_paragraph(
                            q.words, [flatten_iterable(para.text)],
                            para.paragraph_num == 0, ans, para.spans)
                        new_para = SquadParagraphWithAnswers(
                            text, ans, doc.doc_id, para_ix, para.original_text,
                            inv)
                    else:
                        new_para = SquadParagraphWithAnswers(
                            flatten_iterable(para.text), ans, doc.doc_id,
                            para_ix, para.original_text, para.spans)
                    if name == "train":
                        out.append(
                            WeightedMultiParagraphQuestion(
                                q.question_id, q.words, q.answer.answer_text,
                                [new_para], q.weight))
                    else:
                        out.append(
                            MultiParagraphQuestion(q.question_id, q.words,
                                                   q.answer.answer_text,
                                                   [new_para]))
        return out
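Every example below leans on flatten_iterable, which is never shown in these snippets. A minimal stand-in with the same one-level flattening behavior (inferred from how it is called here, not necessarily the library's exact code):

from typing import Iterable, List, TypeVar

T = TypeVar("T")

def flatten_iterable(nested: Iterable[Iterable[T]]) -> List[T]:
    # Flatten exactly one level of nesting: [[a, b], [c]] -> [a, b, c]
    return [item for sub in nested for item in sub]

# flatten_iterable([["One", "fish"], ["two", "fish"]]) == ["One", "fish", "two", "fish"]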
Example #2
    def ranked_questions(self, docs: List[Document]) -> List[MultiParagraphQuestion]:
        out = []
        for doc in docs:
            scores = self.rank(flatten_iterable([q.words for q in x.questions] for x in doc.paragraphs),
                               [x.text for x in doc.paragraphs])
            q_ix = 0
            for para_ix, para in enumerate(doc.paragraphs):
                for q in para.questions:
                    para_scores = scores[q_ix]
                    para_ranks = np.argsort(para_scores)
                    selection = [i for i in para_ranks[:self.n_to_select]]

                    if self.force_answer and para_ix not in selection:
                        selection[-1] = para_ix

                    paras = []
                    for ix in selection:
                        if ix == para_ix and q.answer:
                            ans = q.answer.answer_spans
                        else:
                            ans = np.zeros((0, 2), dtype=np.int32)
                        p = doc.paragraphs[ix]
                        if self.text_process:
                            text, ans, inv = self.text_process.encode_paragraph(
                                q.words, [flatten_iterable(p.text)],
                                p.paragraph_num == 0, ans, p.spans)
                            paras.append(SquadParagraphWithAnswers(
                                text, ans, doc.doc_id, ix, p.original_text, inv))
                        else:
                            paras.append(SquadParagraphWithAnswers(
                                flatten_iterable(p.text), ans, doc.doc_id, ix,
                                p.original_text, p.spans))

                    out.append(MultiParagraphQuestion(q.question_id, q.words,
                                                      q.answer.answer_text, paras))
                    q_ix += 1
        return out
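The selection step above is just an argsort over per-paragraph scores plus an optional swap that forces the gold paragraph in. A self-contained numpy sketch of that step (names are illustrative, not from the repo; lower scores are assumed better, as with TF-IDF distances elsewhere in these examples):

import numpy as np

def select_paragraphs(para_scores, n_to_select, gold_ix, force_answer):
    # Rank paragraphs by score (ascending) and keep the top n
    selection = list(np.argsort(para_scores)[:n_to_select])
    if force_answer and gold_ix not in selection:
        selection[-1] = gold_ix  # replace the worst pick with the gold paragraph
    return selection

# select_paragraphs(np.array([0.3, 0.1, 0.9, 0.2]), 2, gold_ix=2, force_answer=True) -> [1, 2]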
def show_web_paragraphs():
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    stop_words = stop.words

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    for q, d in points:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        doc = corpus.evidence.get_document(d.doc_id)
        doc = splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.dists(q.question, doc)
        if len(ranked) < 2 or len(ranked[1][0].answer_spans) == 0:
            continue
        print(" ".join(q.question))
        print(q.answer.all_answers)
        for i, (para, dist) in enumerate(ranked[0:2]):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d, Dist=%.4f" % (para.start, i, dist))
            if len(para.answer_spans) == 0:
                continue
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for i, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[i] = bcolors.ERROR + text[i] + bcolors.ENDC
            print(" ".join(text))
        input()
Example #4
def extract_paragraph(text: List[List[List[str]]], start, end) -> List[List[str]]:
    out = []
    on_token = 0
    on_para = []
    for para in text:
        for sent in para:
            expected_len = max(on_token - start, 0)
            if (sum(len(s) for s in out) + len(on_para)) != expected_len:
                raise ValueError()
            if on_token + len(sent) <= start:
                on_token += len(sent)
                continue
            if (on_token + len(sent)) >= end:
                on_para += sent[:end - on_token]
                out.append(on_para)
                if len(flatten_iterable(out)) != end - start:
                    raise ValueError(len(flatten_iterable(out)), end - start)
                return out
            on_para += sent
            on_token += len(sent)
        if len(on_para) > 0:
            out.append(on_para)
            on_para = []

    out.append(on_para)
    if len(flatten_iterable(out)) != end - start:
        raise ValueError(len(flatten_iterable(out)), end-start)
    return out
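A quick sanity check for extract_paragraph; note that the expected_len assertion appears to require start to fall on a sentence boundary:

# Sentences: ["a","b","c"] | ["d","e"] || ["f","g","h"]  (token offsets 0-2, 3-4, 5-7)
text = [[["a", "b", "c"], ["d", "e"]], [["f", "g", "h"]]]
assert extract_paragraph(text, 3, 6) == [["d", "e"], ["f"]]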
Example #5
    def split(self, doc: List[List[List[str]]]):
        out = []
        on_token = 0
        for para in doc:
            flattened_para = flatten_iterable(para)
            end = on_token + len(flattened_para)
            out.append(ExtractedParagraph([flattened_para], on_token, end))
            on_token = end
        return out
Example #6
def test_splitter(splitter: DocumentSplitter,
                  n_sample,
                  n_answer_spans,
                  seed=None):
    rng = np.random.RandomState(seed)
    corpus = TriviaQaEvidenceCorpusTxt()
    docs = sorted(corpus.list_documents())
    rng.shuffle(docs)
    max_tokens = splitter.max_tokens
    read_n = splitter.reads_first_n
    for doc in docs[:n_sample]:
        text = corpus.get_document(doc, read_n)
        fake_answers = []
        offset = 0
        for para in text:
            flattened = flatten_iterable(para)
            fake_answer_starts = rng.choice(
                len(flattened),
                min(len(flattened) // 2, rng.randint(5)),
                replace=False)
            max_answer_lens = np.minimum(
                len(flattened) - fake_answer_starts, 30)
            fake_answer_ends = fake_answer_starts + np.floor(
                rng.uniform() * max_answer_lens).astype(np.int32)
            fake_answers.append(
                np.concatenate([
                    np.expand_dims(fake_answer_starts, 1),
                    np.expand_dims(fake_answer_ends, 1)
                ],
                               axis=1) + offset)
            offset += len(flattened)

        fake_answers = np.concatenate(fake_answers, axis=0)
        flattened = flatten_iterable(flatten_iterable(text))
        answer_strs = set(tuple(flattened[s:e + 1]) for s, e in fake_answers)

        paragraphs = splitter.split_annotated(text, fake_answers)

        for para in paragraphs:
            text = flatten_iterable(para.text)
            if max_tokens is not None and len(text) > max_tokens:
                raise ValueError(
                    "Paragraph has length %d, but max tokens was %d" %
                    (len(text), max_tokens))
            start, end = para.start, para.end
            if text != flattened[start:end]:
                raise ValueError(
                    "Paragraph is missing text, given bounds were %d-%d" %
                    (start, end))
            for s, e in para.answer_spans:
                if tuple(text[s:e + 1]) not in answer_strs:
                    print(s, e)
                    raise ValueError(
                        "Incorrect answer for paragraph %d-%d (%s)" %
                        (start, end, " ".join(text[s:e + 1])))
Example #7
    def run_evaluators(self,
                       sess: tf.Session,
                       dataset: Dataset,
                       name,
                       n_sample=None,
                       feed_dict=None) -> Evaluation:
        all_tensors_needed = list(
            set(flatten_iterable(x.values() for x in self.tensors_needed)))

        tensors = {x: [] for x in all_tensors_needed}

        if n_sample is None:
            batches, n_batches = dataset.get_epoch(), len(dataset)
        else:
            batches, n_batches = dataset.get_samples(n_sample)

        data_used = []

        for batch in tqdm(batches, total=n_batches, desc=name, ncols=80):
            feed_dict = self.model.encode(batch, is_train=False)
            output = sess.run(all_tensors_needed, feed_dict=feed_dict)
            data_used += batch
            for i in range(len(all_tensors_needed)):
                tensors[all_tensors_needed[i]].append(output[i])

        # flatten the input
        for k in all_tensors_needed:
            v = tensors[k]
            if len(k.shape) == 0:
                v = np.array(v)  # List of scalars
            elif any(x is None for x in k.shape.as_list()):
                # Variable sized tensors, so convert to flat python-list
                v = flatten_iterable(v)
            else:
                v = np.concatenate(v, axis=0)  # concat along the batch dim
            tensors[k] = v

        percent_filtered = dataset.percent_filtered()
        if percent_filtered is None:
            true_len = len(data_used)
        else:
            true_len = len(data_used) / (1 - percent_filtered)

        combined = None
        for ev, needed in zip(self.evaluators, self.tensors_needed):
            args = {k: tensors[v] for k, v in needed.items()}
            evaluation = ev.evaluate(data_used, true_len, **args)
            if evaluation is None:
                raise ValueError(ev)
            if combined is None:
                combined = evaluation
            else:
                combined.add(evaluation)

        return combined
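The per-tensor aggregation above (scalars stacked into an array, variable-sized outputs kept as a flat python list, fixed-shape outputs concatenated along the batch axis) can be pictured with plain numpy, no TF session needed:

import numpy as np

batch_outputs = [np.random.rand(8, 4), np.random.rand(8, 4), np.random.rand(3, 4)]
combined = np.concatenate(batch_outputs, axis=0)   # fixed trailing shape -> (19, 4)

per_batch_losses = [0.71, 0.66, 0.64]
losses = np.array(per_batch_losses)                # scalar per batch -> 1-D array

ragged = [[3, 1], [2], [5, 5, 5]]                  # variable-sized per-batch outputs
flat = [x for batch in ragged for x in batch]      # the flatten_iterable case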
    def split(self, doc: List[List[List[str]]]) -> List[ExtractedParagraph]:
        words = flatten_iterable(flatten_iterable(doc))
        on_word = 0
        out = []
        while True:
            end_word = on_word + np.random.randint(1, 7)
            if end_word >= len(words):
                out.append(ExtractedParagraph([words[on_word:]], on_word, len(words)))
                return out
            out.append(ExtractedParagraph([words[on_word:end_word]], on_word, end_word))
            on_word = end_word
Example #9
def read_input_data(model):
  data = []
  vocab = set()
  tokenizer = NltkAndPunctTokenizer()
  splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
  selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
  with open(OPTS.input_file) as f:
    for i, line in enumerate(f):
      try:
        document_raw, question_raw = line.strip().split('\t')
      except ValueError as e:
        print(line.strip())
        print('Error at line %d' % i)
        raise e
      document = re.split(r"\s*\n\s*", document_raw)
      question = tokenizer.tokenize_paragraph_flat(question_raw)
      doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
      split_doc = splitter.split(doc_toks)
      context = selector.prune(question, split_doc)
      if model.preprocessor is not None:
        context = [model.preprocessor.encode_text(question, x) for x in context]
      else:
        context = [flatten_iterable(x.text) for x in context]
      vocab.update(question)
      for txt in context:
        vocab.update(txt)
      ex = [ParagraphAndQuestion(x, question, None, "user-question%d"%i)
            for i, x in enumerate(context)]
      data.append((document_raw, question_raw, context, ex))
  return data, vocab
Example #10
def main():
    data = SquadCorpus()

    string_f1 = 0
    mapped_string_f1 = 0

    docs = data.get_train()
    n_questions = 0

    for doc in tqdm(docs):
        for para in doc.paragraphs:
            words = flatten_iterable(para.text)
            for question in para.questions:
                n_questions += 1
                span_answer = question.answer[0]
                span_str = " ".join(words[span_answer.para_word_start:
                                          span_answer.para_word_end + 1])
                raw_answer = span_answer.text
                mapped_str = para.get_original_text(
                    span_answer.para_word_start, span_answer.para_word_end)

                string_f1 += f1_score(raw_answer, span_str)
                mapped_string_f1 += f1_score(raw_answer, mapped_str)

    print(string_f1 / n_questions)
    print(mapped_string_f1 / n_questions)
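f1_score here is the usual SQuAD-style token-overlap F1 between two answer strings; a minimal reimplementation for reference (the repo's version presumably also normalizes case, punctuation, and articles, which is omitted here):

from collections import Counter

def token_f1(prediction: str, ground_truth: str) -> float:
    # Token-overlap F1 between two whitespace-tokenized strings
    pred_tokens = prediction.split()
    gold_tokens = ground_truth.split()
    n_same = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if n_same == 0:
        return 0.0
    precision = n_same / len(pred_tokens)
    recall = n_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

# token_f1("the red fox", "a red fox") -> 0.666...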
Example #11
def main():
    data = TriviaQaWebDataset()

    stop = NltkPlusStopWords()
    splitter = MergeParagraphs(400)
    selector = TopTfIdf(stop, 4)

    print("Loading data..")
    train = data.get_train()
    print("Start")
    for q in train:
        for doc in q.all_docs:
            if len(doc.answer_spans) > 3:
                text = splitter.split_annotated(
                    data.evidence.get_document(doc.doc_id), doc.answer_spans)
                text = selector.prune(q.question, text)
                for para in text:
                    if len(para.answer_spans) > 3:
                        print(q.question)
                        text = flatten_iterable(para.text)
                        for s, e in para.answer_spans:
                            text[s] = "{{{" + text[s]
                            text[e] = text[e] + "}}}"
                        print(" ".join(text))
                        input()
    def any_found(self, para):
        # Normalize the paragraph
        words = [w.lower().strip(self.strip) for w in flatten_iterable(para)]
        occurrences = []
        for answer_ix, answer in enumerate(self.answer_tokens):
            # Locations where the first word occurs
            word_starts = [i for i, w in enumerate(words) if answer[0] == w]
            n_tokens = len(answer)

            # Advance forward until we find all the words, skipping over articles
            for start in word_starts:
                end = start + 1
                ans_token = 1
                while ans_token < n_tokens and end < len(words):
                    next_word = words[end]
                    if answer[ans_token] == next_word:
                        ans_token += 1
                        end += 1
                    elif next_word in self.skip:
                        end += 1
                    else:
                        break
                if n_tokens == ans_token:
                    occurrences.append((start, end))
        return list(set(occurrences))
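A standalone version of the matching loop above, with a hypothetical skip set standing in for self.skip (the articles the detector is allowed to hop over):

def find_token_spans(para_words, answer_tokens, skip=("a", "an", "the")):
    # Return (start, end) spans where answer_tokens occur, allowing skip-words in between
    words = [w.lower() for w in para_words]
    spans = []
    for start in (i for i, w in enumerate(words) if w == answer_tokens[0]):
        end, ans_token = start + 1, 1
        while ans_token < len(answer_tokens) and end < len(words):
            if words[end] == answer_tokens[ans_token]:
                ans_token += 1
                end += 1
            elif words[end] in skip:
                end += 1
            else:
                break
        if ans_token == len(answer_tokens):
            spans.append((start, end))
    return spans

# find_token_spans("He crossed the Rubicon river".split(), ["crossed", "rubicon"]) == [(1, 4)]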
Example #13
def build_tokenized_corpus(input_root,
                           tokenizer,
                           output_dir,
                           skip_dirs=False,
                           n_processes=1,
                           wiki_only=False):
    if not exists(output_dir):
        makedirs(output_dir)

    all_files = _gather_files(input_root, output_dir, skip_dirs, wiki_only)

    if n_processes == 1:
        voc = build_tokenized_files(tqdm(all_files, ncols=80), input_root,
                                    output_dir, tokenizer)
    else:
        voc = set()
        from multiprocessing import Pool
        with Pool(n_processes) as pool:
            chunks = split(all_files, n_processes)
            chunks = flatten_iterable(group(c, 500) for c in chunks)
            pbar = tqdm(total=len(chunks), ncols=80)
            for v in pool.imap_unordered(
                    _build_tokenized_files_t,
                [[c, input_root, output_dir, tokenizer] for c in chunks]):
                voc.update(v)
                pbar.update(1)
            pbar.close()

    voc_file = join(output_dir, "vocab.txt")
    with open(voc_file, "w") as f:
        for word in sorted(voc):
            f.write(word)
            f.write("\n")
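Several examples chunk work with split and group before farming it out to a Pool. Their exact code is not shown here; the assumed semantics (n roughly equal contiguous parts, then fixed-size groups) can be sketched as:

def split(lst, n_parts):
    # Divide lst into n_parts contiguous chunks of near-equal size
    per_part, remainder = divmod(len(lst), n_parts)
    out, start = [], 0
    for i in range(n_parts):
        end = start + per_part + (1 if i < remainder else 0)
        out.append(lst[start:end])
        start = end
    return out

def group(lst, max_group_size):
    # Break lst into consecutive groups of at most max_group_size items
    return [lst[i:i + max_group_size] for i in range(0, len(lst), max_group_size)]

# split(list(range(7)), 3) == [[0, 1, 2], [3, 4], [5, 6]]
# group(list(range(5)), 2) == [[0, 1], [2, 3], [4]]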
Example #14
def print_questions(question, answers, context, answer_span):
    print(" ".join(question))
    print(answers)
    context = flatten_iterable(context)
    for s, e in answer_span:
        context[s] = "{{{" + context[s]
        context[e] = context[e] + "}}}"
    print(" ".join(context))
Example #15
def post_split_tokens(tokens: List[str]) -> List[str]:
    """
    Apply a small amount of extra splitting to the given tokens, in particular to avoid UNK tokens
    due to contractions, quotations, or other forms of punctuation. I haven't really done tests to see
    if/how much difference this makes, but it does avoid some common UNKs I noticed in SQuAD/TriviaQA.
    """
    return flatten_iterable([x for x in extra_split_chars_re.split(token) if x != ""]
                            for token in tokens)
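A rough illustration of the effect, using a hypothetical stand-in for extra_split_chars_re (the real pattern lives elsewhere in the repo) and the flatten_iterable stand-in from above:

import re

# Hypothetical pattern: split on hyphens and slashes but keep them as separate tokens
extra_split_chars_re = re.compile(r"([-/])")

tokens = ["state-of-the-art", "QA"]
split_tokens = flatten_iterable([x for x in extra_split_chars_re.split(t) if x != ""]
                                for t in tokens)
# -> ["state", "-", "of", "-", "the", "-", "art", "QA"]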
    def preprocess(self, questions: List[TriviaQaQuestion],
                   evidence) -> object:
        splitter = self.splitter
        para_filter = self.ranker

        with_paragraphs = []
        for q in questions:
            paras = []
            for doc in q.all_docs:
                if self.require_an_answer and len(doc.answer_spans) == 0:
                    continue
                text = evidence.get_document(doc.doc_id,
                                             splitter.reads_first_n)
                split = splitter.split_annotated(text, doc.answer_spans)
                paras.extend([
                    DocParagraphWithAnswers(x.text, x.start, x.end,
                                            x.answer_spans, doc.doc_id)
                    for x in split
                ])

            if para_filter is not None:
                paras = para_filter.prune(q.question, paras)

            if len(paras) == 0:
                continue
            if self.require_an_answer:
                if all(len(x.answer_spans) == 0 for x in paras):
                    continue

            if self.text_preprocess is not None:
                prepped = [
                    self.text_preprocess.encode_extracted_paragraph(
                        q.question, p) for p in paras
                ]
                if self.require_an_answer:
                    if all(len(x.answer_spans) == 0 for x in prepped):
                        continue
                doc_paras = []
                for i, (preprocessed, para) in enumerate(zip(prepped, paras)):
                    doc_paras.append(
                        DocumentParagraph(para.doc_id, para.start, para.end, i,
                                          preprocessed.answer_spans,
                                          preprocessed.text))
                with_paragraphs.append(
                    MultiParagraphQuestion(q.question_id, q.question,
                                           q.answer.all_answers, doc_paras))
            else:
                doc_paras = [
                    DocumentParagraph(x.doc_id, x.start, x.end, i,
                                      x.answer_spans, flatten_iterable(x.text))
                    for i, x in enumerate(paras)
                ]
                with_paragraphs.append(
                    MultiParagraphQuestion(q.question_id, q.question,
                                           q.answer.all_answers, doc_paras))

        return FilteredData(with_paragraphs, len(questions))
Example #17
def print_paragraph(question: TriviaQaQuestion,
                    para: ExtractedParagraphWithAnswers):
    print(" ".join(question.question))
    print(question.answer.all_answers)
    context = flatten_iterable(para.text)
    for s, e in para.answer_spans:
        context[s] = "{{{" + context[s]
        context[e] = context[e] + "}}}"
    print(" ".join(context))
def find_answer(documents, raw_question):

    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]

    global best_spans, conf

    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()

    question = tokenizer.tokenize_paragraph_flat(raw_question)

    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    splitter = MergeParagraphs(400)

    documents = [splitter.split(doc) for doc in documents]

    if len(documents) == 1:
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    context = [flatten_iterable(x.text) for x in context]

    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    encoded = model.encode(data, is_train=False)

    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)

    best_para = np.argmax(confid)
    ans = " ".join(
        context[best_para][spans[best_para][0]:spans[best_para][1] + 1])
    confidence = confid[best_para]

    return ans, confidence
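The final answer extraction is just an argmax over per-paragraph confidences followed by a token slice; a toy numpy version:

import numpy as np

context = [["The", "sky", "is", "blue", "today"],
           ["Grass", "is", "green"]]
best_spans = np.array([[3, 3], [2, 2]])   # inclusive (start, end) per paragraph
conf = np.array([7.2, 4.1])

best_para = np.argmax(conf)
start, end = best_spans[best_para]
answer = " ".join(context[best_para][start:end + 1])   # -> "blue"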
Example #19
    def test_segment_log_sum_exp(self):
        sess = self.sess
        with sess.as_default():
            for i in range(10):
                groups = []
                for group_id in range(10):
                    group = []
                    for _ in range(np.random.randint(1, 5)):
                        group.append(np.random.normal(0, 2, 10))
                    groups.append(group)

                flat_groups = np.stack(flatten_iterable(groups), axis=0)
                segments = np.array(
                    flatten_iterable([ix] * len(g)
                                     for ix, g in enumerate(groups)))
                actual = sess.run(segment_logsumexp(flat_groups, segments))
                expected = [
                    np.log(np.sum(np.exp(np.concatenate(g, axis=0))))
                    for g in groups
                ]
                self.assertTrue(np.allclose(actual, expected))
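The quantity being tested can be computed directly in numpy, which makes the expected values above easy to check by hand (this naive version skips the max-subtraction trick a stable implementation would use):

import numpy as np

def segment_logsumexp_np(values, segments):
    # Log-sum-exp over every element whose row belongs to the same segment id
    return np.array([
        np.log(np.sum(np.exp(values[segments == seg_id])))
        for seg_id in np.unique(segments)
    ])

values = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
segments = np.array([0, 0, 1])
print(segment_logsumexp_np(values, segments))  # one log-sum-exp value per segment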
def compute_answer_spans_par(questions: List[TriviaQaQuestion], corpus,
                             tokenizer, detector, n_processes: int):
    if n_processes == 1:
        word_tokenize = tokenizer.tokenize_paragraph_flat
        compute_answer_spans(questions, corpus, word_tokenize, detector)
        return questions
    from multiprocessing import Pool
    with Pool(n_processes) as p:
        chunks = split(questions, n_processes)
        questions = flatten_iterable(
            p.starmap(_compute_answer_spans_chunk,
                      [[c, corpus, tokenizer, detector] for c in chunks]))
        return questions
    def test_split_inv(self):
        paras = [
            "One fish two fish. Red fish blue fish",
            "Just one sentence",
            "How will an overhead score? The satisfactory juice returns against an inviting protein. "
            "How can a rat expand? The subway fishes throughout a struggle. The guaranteed herd pictures an "
            "episode into the accustomed damned. The garbage reigns beside the component!",
        ]
        tok = NltkAndPunctTokenizer()
        tokenized = [tok.tokenize_with_inverse(x) for x in paras]
        inv_split = RandomSplitter().split_inverse(tokenized)
        for para in inv_split:
            self.assertTrue(flatten_iterable(para.text) == [para.original_text[s:e] for s, e in para.spans])
def parse_race_data(source, name, tokenizer, use_tqdm=True) -> List[Document]:
    with open(source, 'r') as f:
        source_data = json.load(f)
    
    if use_tqdm:
        iter_files = tqdm(source_data['data'], ncols=80)
    else:
        iter_files = source_data['data']

    mult_options_dict = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    mult_answers_array = []

    for article_ix, article in enumerate(iter_files):
        article_ix = "%s-%d" % (name, article_ix)

        paragraphs = []

        for para_ix, para in enumerate(article['paragraphs']):
            questions = []
            context = para['context']

            tokenized = tokenizer.tokenize_with_inverse(context)
            # list of sentences + mapping from words -> original text index
            text, text_spans = tokenized.text, tokenized.spans
            flat_text = flatten_iterable(text)

            n_words = sum(len(sentence) for sentence in text)

            for question_ix, question in enumerate(para['qas']):
                # There are actually some multi-sentence questions, so we should have used
                # tokenizer.tokenize_paragraph_flat here, which would have produced slightly better
                # results in a few cases. However, all the results we report were
                # done using `tokenize_sentence`, so I am just going to leave it this way.
                question_text = tokenizer.tokenize_sentence(question['question'])
                choices_text = [tokenizer.tokenize_sentence(x) for x in question['choices']]
                mult_answer = question['answer']
                mult_answer_ix = mult_options_dict[mult_answer]

                # the old span calculation from 'build_squad_dataset' was dropped here

                mult_answers_array.append(mult_answer_ix)

                questions.append(Question(question['id'], question_text, mult_answer_ix, choices_text))

            paragraphs.append(Paragraph(text, questions, article_ix, para_ix, context, mult_answers_array))
            # TODO: choices may need to be passed here as well (like questions)

        yield Document(article_ix, article["title"], paragraphs)
Example #23
    def get_document(self, doc_id, n_tokens=None, flat=False):
        if self.file_id_map is None:
            file_id = doc_id
        else:
            file_id = self.file_id_map.get(doc_id)

        if file_id is None:
            return None

        file_id = join(self.directory, file_id + ".txt")
        if not exists(file_id):
            return None

        with open(file_id, "r") as f:
            if n_tokens is None:
                text = f.read()
                if flat:
                    return [
                        x for x in self._split_all.split(text) if len(x) > 0
                    ]
                else:
                    paragraphs = []
                    for para in self._split_para.split(text):
                        paragraphs.append(
                            [sent.split(" ") for sent in para.split("\n")])
                    return paragraphs
            else:
                paragraphs = []
                paragraph = []
                cur_tokens = 0
                for line in f:
                    if line == "\n":
                        if not flat and len(paragraph) > 0:
                            paragraphs.append(paragraph)
                            paragraph = []
                    else:
                        sent = line.split(" ")
                        sent[-1] = sent[-1].rstrip()
                        if len(sent) + cur_tokens > n_tokens:
                            if n_tokens != cur_tokens:
                                paragraph.append(sent[:n_tokens - cur_tokens])
                            break
                        else:
                            paragraph.append(sent)
                            cur_tokens += len(sent)
                if flat:
                    return flatten_iterable(paragraph)
                else:
                    if len(paragraph) > 0:
                        paragraphs.append(paragraph)
                    return paragraphs
Example #24
def contains_question_word():
    data = TriviaQaWebDataset()
    stop = NltkPlusStopWords(punctuation=True).words
    doc_filter = ContainsQuestionWord(NltkPlusStopWords(punctuation=True))
    splits = MergeParagraphs(400)
    # splits = Truncate(400)
    questions = data.get_dev()
    pairs = flatten_iterable([(q, doc) for doc in q.all_docs]
                             for q in questions)
    pairs.sort(key=lambda x: (x[0].question_id, x[1].doc_id))
    np.random.RandomState(0).shuffle(pairs)
    has_token = 0
    total = 0
    used = Counter()

    for q, doc in tqdm(pairs[:1000]):
        text = data.evidence.get_document(doc.doc_id, splits.reads_first_n)
        q_tokens = set(x.lower() for x in q.question)
        q_tokens -= stop
        for para in splits.split_annotated(text, doc.answer_spans):
            # if para.start == 0:
            #     continue
            if len(para.answer_spans) == 0:
                continue
            if any(x.lower() in q_tokens for x in flatten_iterable(para.text)):
                has_token += 1
                for x in flatten_iterable(para.text):
                    if x in q_tokens:
                        used[x] += 1
            # else:
            #     print_questions(q.question, q.answer.all_answers, para.text, para.answer_spans)
            #     input()
            total += 1
    for k, v in used.most_common(200):
        print("%s: %d" % (k, v))
    print(has_token / total)
Example #25
    def split(self, doc: List[List[List[str]]]):
        all_paragraphs = []

        on_doc_token = 0  # the word in the document the current paragraph starts at
        on_paragraph = []  # text we have collect for the current paragraph
        cur_tokens = 0  # number of tokens in the current paragraph

        word_ix = 0
        for para in doc:
            para = flatten_iterable(para)
            n_words = len(para)
            if self.top_n is not None and (word_ix + n_words) > self.top_n:
                if word_ix == self.top_n:
                    break
                para = para[:self.top_n - word_ix]
                n_words = self.top_n - word_ix

            start_token = word_ix
            end_token = start_token + n_words
            word_ix = end_token

            if cur_tokens + n_words > self.max_tokens:
                if cur_tokens != 0:  # end the current paragraph
                    all_paragraphs.append(
                        ExtractedParagraph(on_paragraph, on_doc_token,
                                           start_token))
                    on_paragraph = []
                    cur_tokens = 0

                if n_words >= self.max_tokens:  # either truncate the given paragraph, or begin a new paragraph
                    text = para[:self.max_tokens]
                    all_paragraphs.append(
                        ExtractedParagraph([text], start_token,
                                           start_token + self.max_tokens))
                    on_doc_token = end_token
                else:
                    on_doc_token = start_token
                    on_paragraph.append(para)
                    cur_tokens = n_words
            else:
                on_paragraph.append(para)
                cur_tokens += n_words

        if len(on_paragraph) > 0:
            all_paragraphs.append(
                ExtractedParagraph(on_paragraph, on_doc_token, word_ix))

        return all_paragraphs
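A stripped-down standalone rendering of the merging policy above (pack whole paragraphs until max_tokens would be exceeded, truncating any single paragraph longer than the budget); document offsets and the ExtractedParagraph wrapper are omitted for brevity:

def merge_paragraphs(doc, max_tokens):
    # doc: list of paragraphs, each a flat list of tokens; returns merged token chunks
    out, current = [], []
    for para in doc:
        para = para[:max_tokens]                 # an over-long paragraph is truncated
        if len(current) + len(para) > max_tokens:
            out.append(current)
            current = []
        current += para
    if current:
        out.append(current)
    return out

# merge_paragraphs([["a", "b", "c"], ["d", "e", "f"], ["g"]], max_tokens=5)
# -> [["a", "b", "c"], ["d", "e", "f", "g"]]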
    def encode(self,
               batch: List[ContextAndQuestion],
               is_train: bool,
               cached_doc=None):
        if len(batch) > self.max_batch_size:
            raise ValueError(
                "The model can only use a batch <= %d, but got %d" %
                (self.max_batch_size, len(batch)))
        data = self.encoder.encode(batch, is_train)
        data[self._question_char_ids_placeholder] = self._batcher.batch_sentences(
            [q.question for q in batch])
        data[self._is_train_placeholder] = is_train
        if cached_doc is not None:
            data[self._cached_doc_placeholder] = cached_doc
            context_word_dim = cached_doc.shape[1]
        else:
            data[self._cached_doc_placeholder] = np.zeros(
                (1, 1, self.document_embedding_dim))
            context_word_dim = data[self.encoder.context_words].shape[1]

        if not self.per_sentence:
            data[self._context_char_ids_placeholder] = \
                self._batcher.batch_sentences([x.get_context() for x in batch])
        else:
            data[self._context_char_ids_placeholder] = \
                self._batcher.batch_sentences(flatten_iterable([x.sentences for x in batch]))

            # Compute indices where context_sentence_ixs[sentence#, k, sentence_word#] = (batch#, k, batch_word#)
            # for each word. We use this to map the tokens built for the sentences back to
            # the format where sentences are flattened for each batch
            context_sentence_ixs = np.zeros(
                (len(batch), 3, context_word_dim, 3), dtype=np.int32)
            total_sent_ix = 0
            for ix, point in enumerate(batch):
                word_ix = 0
                for sent_ix, sent in enumerate(point.sentences):
                    for w_ix in range(len(sent)):
                        for k in range(3):
                            context_sentence_ixs[ix, k, word_ix] = [
                                total_sent_ix, k, w_ix
                            ]
                        word_ix += 1
                    total_sent_ix += 1
            data[self._context_sentence_ixs] = context_sentence_ixs
        return data
Example #27
def preprocess_par(questions: List,
                   evidence,
                   preprocessor,
                   n_processes=2,
                   chunk_size=200,
                   name=None):
    if chunk_size <= 0:
        raise ValueError("Chunk size must be >= 1, but got %s" % chunk_size)
    if n_processes is not None and n_processes <= 0:
        raise ValueError("n_processes must be >= 1 or None, but got %s" %
                         n_processes)
    n_processes = min(len(questions), n_processes)

    if n_processes == 1:
        out = preprocessor.preprocess(tqdm(questions, desc=name, ncols=80),
                                      evidence)
        preprocessor.finalize_chunk(out)
        return out
    else:
        from multiprocessing import Pool
        chunks = split(questions, n_processes)
        chunks = flatten_iterable([group(c, chunk_size) for c in chunks])
        print("Processing %d chunks with %d processes" %
              (len(chunks), n_processes))
        pbar = tqdm(total=len(questions), desc=name, ncols=80)
        lock = Lock()

        def call_back(results):
            preprocessor.finalize_chunk(results[0])
            with lock:  # FIXME Even with the lock, the progress bar still is jumping around
                pbar.update(results[1])

        with Pool(n_processes) as pool:
            results = [
                pool.apply_async(_preprocess_and_count,
                                 [c, evidence, preprocessor],
                                 callback=call_back) for c in chunks
            ]
            results = [r.get()[0] for r in results]

        pbar.close()
        output = results[0]
        for r in results[1:]:
            output += r
        return output
Example #28
def get_evidence_voc(corpus, n_processes=1):
    doc_ids = corpus.list_documents()
    voc = Counter()

    if n_processes == 1:
        for doc in tqdm(doc_ids):
            voc.update(corpus.get_document(doc, flat=True))
    else:
        from multiprocessing import Pool
        chunks = split(doc_ids, n_processes)
        chunks = flatten_iterable(group(x, 10000) for x in chunks)
        pbar = tqdm(total=len(chunks), ncols=80)
        with Pool(n_processes) as pool:
            for v in pool.imap_unordered(_extract_voc_tuple, [[corpus, c] for c in chunks]):
                voc += v
                pbar.update(1)
        pbar.close()

    return voc
    def any_found(self, para):
        words = [x.lower() for x in flatten_iterable(para)]
        occurrences = []
        for answer_ix, answer in enumerate(self.answer_tokens):
            word_starts = [i for i, w in enumerate(words) if answer[0] == w]
            n_tokens = len(answer)
            for start in word_starts:
                end = start + 1
                ans_token = 1
                while ans_token < n_tokens and end < len(words):
                    next_word = words[end]
                    if answer[ans_token] == next_word:
                        ans_token += 1
                        end += 1
                    else:
                        break
                if n_tokens == ans_token:
                    occurrences.append((start, end))
        return list(set(occurrences))
def check_preprocess():
    data = TriviaQaWebDataset()
    merge = MergeParagraphs(400)
    questions = data.get_dev()
    pre = WithIndicators(False)
    remove_cross = WithIndicators(True)
    rng = np.random.RandomState(0)
    rng.shuffle(questions)

    for q in tqdm(questions[:1000]):
        doc = rng.choice(q.all_docs, 1)[0]
        text = data.evidence.get_document(doc.doc_id, n_tokens=800)
        paras = merge.split_annotated(text, doc.answer_spans)
        para = paras[np.random.randint(0, len(paras))]
        built = pre.encode_extracted_paragraph(q.question, para)

        expected_text = flatten_iterable(para.text)
        if expected_text != [
                x for x in built.text if x not in pre.special_tokens()
        ]:
            raise ValueError()

        expected = [expected_text[s:e + 1] for s, e in para.answer_spans]
        expected = Counter([tuple(x) for x in expected])

        actual = [tuple(built.text[s:e + 1]) for s, e in built.answer_spans]
        actual_cleaned = Counter(
            tuple(z for z in x if z not in pre.special_tokens())
            for x in actual)
        if actual_cleaned != expected:
            raise ValueError()

        r_built = remove_cross.encode_extracted_paragraph(q.question, para)
        rc = Counter(
            tuple(r_built.text[s:e + 1]) for s, e in r_built.answer_spans)
        removed = Counter()
        for w in actual:
            if all(x not in pre.special_tokens() for x in w):
                removed[w] += 1

        if rc != removed:
            raise ValueError()