Ejemplo n.º 1
0
 def extract(args):
     """Unpack a (title, text) pair and return the title with its extracted wiki sentences.

     Relies on ``n_wiki_sentences`` and ``replace_title_mentions`` from the
     enclosing scope; shaped this way so it can be mapped over (title, text)
     tuples (e.g. by a process pool).
     """
     page_title, page_text = args
     wiki_sentences = extract_wiki_sentences(
         page_title,
         page_text,
         n_wiki_sentences,
         replace_title_mentions=replace_title_mentions)
     return page_title, wiki_sentences
Ejemplo n.º 2
0
    def training_data(self) -> TrainingData:
        """Assemble (content, answers, None) training tuples from Wikipedia pages.

        For every answer that has a non-empty Wikipedia page, each extracted
        sentence becomes one single-sentence example labeled with that answer.
        """
        lookup = Wikipedia()
        content, labels = [], []
        for answer in self.answers:
            if answer not in lookup:
                continue
            page = lookup[answer]
            if len(page.text) == 0:
                continue
            extracted = extract_wiki_sentences(
                answer, page.text, self.n_sentences,
                replace_title_mentions=self.replace_title_mentions)
            for sentence in extracted:
                content.append([sentence])
                labels.append(answer)

        return content, labels, None
Ejemplo n.º 3
0
    def training_data(self) -> TrainingData:
        """Return (wiki_content, wiki_answers, None) built from Wikipedia text.

        Answers missing from the Wikipedia lookup, or whose page text is
        empty, are skipped; every surviving sentence is wrapped in its own
        single-element list and paired with its answer label.
        """
        lookup = Wikipedia()
        wiki_content = []
        wiki_answers = []
        for answer in self.answers:
            if answer in lookup:
                page = lookup[answer]
                if len(page.text) != 0:
                    sentences = extract_wiki_sentences(
                        answer,
                        page.text,
                        self.n_sentences,
                        replace_title_mentions=self.replace_title_mentions)
                    wiki_content.extend([s] for s in sentences)
                    wiki_answers.extend(answer for _ in sentences)

        return wiki_content, wiki_answers, None
Ejemplo n.º 4
0
    def __init__(self,
                 path,
                 qnum_field,
                 sent_field,
                 page_field,
                 text_field,
                 unigram_field,
                 bigram_field,
                 trigram_field,
                 example_mode='sentence',
                 use_wiki=False,
                 n_wiki_sentences=3,
                 replace_title_mentions='',
                 **kwargs):
        """Build the QuizBowl dataset from the question JSON at *path*.

        Parameters:
            path: JSON file with a top-level 'questions' list; each entry is
                expected to carry 'qnum', 'sentences', and 'page' keys.
            qnum_field, sent_field, page_field: field objects (presumably
                torchtext Fields — confirm against callers) always attached
                to every example.
            text_field, unigram_field, bigram_field, trigram_field: optional
                field objects; any that is not None is fed the same sentence
                text for its respective key.
            example_mode: only 'sentence' is implemented; 'question' and
                'runs' raise NotImplementedError, anything else ValueError.
            use_wiki: when True and 'train' is a substring of *path*, load
                (downloading if needed) the wiki dump referenced by s3_wiki
                and append wiki sentences as extra training examples.
            n_wiki_sentences: number of wiki sentences extracted per answer
                page (only used in the use_wiki training branch).
            replace_title_mentions: forwarded to extract_wiki_sentences.
            **kwargs: passed through to the parent dataset constructor.
        """
        from unidecode import unidecode

        # Wiki augmentation applies only to training splits (detected by the
        # 'train' substring in the file path); the dump is downloaded once and
        # cached next to the question file.
        if use_wiki and 'train' in path:
            base_path = os.path.dirname(path)
            filename = os.path.basename(s3_wiki)
            output_file = os.path.join(base_path, filename)
            if not os.path.exists(output_file):
                download_from_url(s3_wiki, output_file)
            with open(output_file) as f:
                self.wiki_lookup = json.load(f)
        else:
            self.wiki_lookup = {}
        self.path = path
        self.example_mode = example_mode

        # All text-derived fields share the same 'text' key in the example
        # dict: each optional field that is present processes the same string.
        text_dependent_fields = []
        if text_field is not None:
            text_dependent_fields.append(('text', text_field))
        if unigram_field is not None:
            text_dependent_fields.append(('unigram', unigram_field))
        if bigram_field is not None:
            text_dependent_fields.append(('bigram', bigram_field))
        if trigram_field is not None:
            text_dependent_fields.append(('trigram', trigram_field))

        example_fields = {
            'qnum': [('qnum', qnum_field)],
            'sent': [('sent', sent_field)],
            'page': [('page', page_field)],
            'text': text_dependent_fields
        }

        # One example per sentence; answer pages are collected so the wiki
        # augmentation below only adds pages that actually appear as answers.
        examples = []
        answer_set = set()
        with open(path) as f:
            for ex in json.load(f)['questions']:
                if example_mode == 'sentence':
                    sentences = ex['sentences']
                    for i, s in enumerate(sentences):
                        examples.append(
                            Example.fromdict(
                                {
                                    'qnum': ex['qnum'],
                                    'sent': i,
                                    # unidecode normalizes non-ASCII question
                                    # text to plain ASCII.
                                    'text': unidecode(s),
                                    'page': ex['page']
                                }, example_fields))
                        answer_set.add(ex['page'])
                elif example_mode == 'question':
                    raise NotImplementedError(
                        'Question tokenization is not implemented yet, submit a PR!'
                    )
                elif example_mode == 'runs':
                    raise NotImplementedError(
                        'Run tokenization is not implemented yet, submit a PR!'
                    )
                else:
                    raise ValueError(
                        f"Valid modes are 'sentence', 'question', and 'runs', but '{example_mode}' was given"
                    )

        # Wiki-derived examples are marked with qnum == -1 to distinguish
        # them from real questions.
        if use_wiki and n_wiki_sentences > 0 and 'train' in path:
            for page in answer_set:
                if page in self.wiki_lookup:
                    sentences = extract_wiki_sentences(
                        page,
                        self.wiki_lookup[page]['text'],
                        n_wiki_sentences,
                        replace_title_mentions=replace_title_mentions)
                    for i, s in enumerate(sentences):
                        examples.append(
                            Example.fromdict(
                                {
                                    'qnum': -1,
                                    'sent': i,
                                    'text': s,
                                    'page': page
                                }, example_fields))

        # Dataset-level field mapping mirrors example_fields but keyed
        # per-field; optional fields are included only when provided.
        dataset_fields = {
            'qnum': qnum_field,
            'sent': sent_field,
            'page': page_field,
        }
        if text_field is not None:
            dataset_fields['text'] = text_field
        if unigram_field is not None:
            dataset_fields['unigram'] = unigram_field
        if bigram_field is not None:
            dataset_fields['bigram'] = bigram_field
        if trigram_field is not None:
            dataset_fields['trigram'] = trigram_field

        super(QuizBowl, self).__init__(examples, dataset_fields, **kwargs)