Example #1
    def build(self, answers: Set[str], save=True):
        client = TagmeClient()
        wiki_lookup = Wikipedia()

        page_sentences = defaultdict(list)
        for ans in answers:
            if ans not in wiki_lookup:
                continue
            wiki_page = wiki_lookup[ans]
            if len(wiki_page.text) != 0:
                sentences = nltk.sent_tokenize(wiki_page.text)
                random.shuffle(sentences)
                clean_sentences, all_mentions = client.tag_mentions(sentences)
                for sent, mentions in zip(clean_sentences, all_mentions):
                    page_mentions = {m.page for m in mentions}
                    n_mentions = len(page_mentions)
                    for page in page_mentions.intersection(answers):
                        # FIXME: everything below this raise is unreachable;
                        # the loop body still needs porting to
                        # extract_wiki_sentences (see Example #6).
                        raise NotImplementedError(
                            'Need to fix this to use extract_wiki_sentences')
                        stripped_sent = strip_title_references(page, sent)
                        page_sentences[page].append(
                            (n_mentions, stripped_sent))

        if save:
            with safe_open(self.location, 'wb') as f:
                pickle.dump(page_sentences, f)

        return page_sentences
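A minimal sketch of a call site, assuming a hypothetical `SentenceBuilder` class with a `location` attribute; neither appears in the snippet, which only shows the method. Note that as written, `build` raises `NotImplementedError` as soon as a tagged sentence mentions one of the target answers, so it cannot run to completion.

    # Hypothetical usage; `SentenceBuilder` and its constructor are assumed.
    builder = SentenceBuilder(location='output/page_sentences.pickle')
    answers = {'Albert_Einstein', 'Marie_Curie'}
    # As written, this raises NotImplementedError on the first matched mention.
    page_sentences = builder.build(answers, save=True)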
Example #2
    def build_many_docs(self,
                        pages,
                        documents,
                        use_wiki=True,
                        use_qb=True,
                        rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            log.info(
                'Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                for page, doc in tqdm.tqdm(documents):
                    self.answer_doc(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                for page in tqdm.tqdm(pages):
                    if page in wiki_lookup:
                        content = word_tokenize(wiki_lookup[page].text)
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                self.answer_doc(page=page,
                                                wiki_content=' '.join(
                                                    chunked_content)).save()
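For context, a hedged sketch of calling `build_many_docs`: each Wikipedia article is word-tokenized and split into 200-token chunks, and every chunk is indexed as its own document. The `ElasticSearchIndex` class and its `name` argument are assumptions; the snippet only shows the method.

    # Hypothetical call site; the wrapper class and constructor are assumed.
    index = ElasticSearchIndex(name='qb')
    pages = ['Albert_Einstein', 'Marie_Curie']
    documents = [
        ('Albert_Einstein', 'This physicist developed special relativity.'),
        ('Marie_Curie', 'This scientist discovered polonium and radium.'),
    ]
    index.build_many_docs(pages, documents, use_wiki=True, use_qb=True)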
Example #3
    def build_large_docs(self,
                         documents: Dict[str, str],
                         use_wiki=True,
                         use_qb=True,
                         rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info(f'Deleting index: {self.name}')
            self.delete()

        if self.exists():
            log.info(f'Index {self.name} exists')
        else:
            log.info(f'Index {self.name} does not exist')
            self.init()
            wiki_lookup = Wikipedia()
            log.info(
                'Indexing questions and corresponding wikipedia pages as large docs...'
            )
            for page in tqdm.tqdm(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''

                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''

                answer = self.answer_doc(page=page,
                                         wiki_content=wiki_content,
                                         qb_content=qb_content)
                answer.save(index=self.name)
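By contrast, `build_large_docs` writes one document per answer page, pairing the page's full Wikipedia text with its question content. A sketch under the same assumed wrapper:

    # Hypothetical call site; one large document per page, not per chunk.
    documents = {
        'Albert_Einstein': 'This physicist developed special relativity.',
    }
    index = ElasticSearchIndex(name='qb')
    index.build_large_docs(documents, use_wiki=True, use_qb=True,
                           rebuild_index=True)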
Example #4
    def build_many_docs(pages,
                        documents,
                        use_wiki=True,
                        use_qb=True,
                        rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info('Deleting index: {}'.format(INDEX_NAME))
            ElasticSearchIndex.delete()

        if ElasticSearchIndex.exists():
            log.info('Index {} exists'.format(INDEX_NAME))
        else:
            log.info('Index {} does not exist'.format(INDEX_NAME))
            Answer.init()
            log.info(
                'Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                bar = progressbar.ProgressBar()
                for page, doc in bar(documents):
                    Answer(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                wiki_lookup = Wikipedia()
                bar = progressbar.ProgressBar()
                for page in bar(pages):
                    if page in wiki_lookup:
                        content = word_tokenize(wiki_lookup[page].text)
                        for i in range(0, len(content), 200):
                            chunked_content = content[i:i + 200]
                            if len(chunked_content) > 0:
                                Answer(page=page,
                                       wiki_content=' '.join(
                                           chunked_content)).save()
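Example #4 is the module-level counterpart of Example #2: the same chunk-and-index logic, but written against a module-wide INDEX_NAME, with `ElasticSearchIndex` used directly and a top-level `Answer` document type, and with `progressbar` standing in for `tqdm`.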
Example #5
    def build_large_docs(documents: Dict[str, str],
                         use_wiki=True,
                         use_qb=True,
                         rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info('Deleting index: {}'.format(INDEX_NAME))
            ElasticSearchIndex.delete()

        if ElasticSearchIndex.exists():
            log.info('Index {} exists'.format(INDEX_NAME))
        else:
            log.info('Index {} does not exist'.format(INDEX_NAME))
            Answer.init()
            wiki_lookup = Wikipedia()
            log.info(
                'Indexing questions and corresponding wikipedia pages as large docs...'
            )
            bar = progressbar.ProgressBar()
            for page in bar(documents):
                if use_wiki and page in wiki_lookup:
                    wiki_content = wiki_lookup[page].text
                else:
                    wiki_content = ''

                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''

                answer = Answer(page=page,
                                wiki_content=wiki_content,
                                qb_content=qb_content)
                answer.save()
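All of the indexing variants above also honor the QB_REBUILD_INDEX environment variable, so a rebuild can be forced without touching call sites. A small sketch using the function from Example #5:

    import os

    # Setting the variable to '1' makes bool(int(...)) evaluate to True,
    # which deletes the existing index before it is rebuilt.
    os.environ['QB_REBUILD_INDEX'] = '1'
    build_large_docs({'Albert_Einstein': 'This physicist developed special relativity.'},
                     use_wiki=True, use_qb=True)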
Example #6
    def training_data(self) -> TrainingData:
        wiki_lookup = Wikipedia()
        wiki_content = []
        wiki_answers = []
        for ans in self.answers:
            if ans not in wiki_lookup:
                continue
            wiki_page = wiki_lookup[ans]
            if len(wiki_page.text) != 0:
                sentences = extract_wiki_sentences(
                    ans,
                    wiki_page.text,
                    self.n_sentences,
                    replace_title_mentions=self.replace_title_mentions)
                for sent in sentences:
                    wiki_content.append([sent])
                    wiki_answers.append(ans)

        return wiki_content, wiki_answers, None
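The returned triple pairs each single-sentence example with its answer page; the third slot is unused here. A sketch of consuming it, assuming a `dataset` object that exposes this method with `answers`, `n_sentences`, and `replace_title_mentions` already configured:

    # Hypothetical consumer; the construction of `dataset` is assumed.
    x_train, y_train, _ = dataset.training_data()
    for sentences, answer in zip(x_train, y_train):
        # Each x entry is a one-sentence list paired with its answer page.
        print(answer, '->', sentences[0])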