Ejemplo n.º 1
0
    def how_many_times_character_appears(self, passage):
        """Build a sample asking how often one random character occurs.

        A scope is drawn with ``randomize_scope``, character frequencies of
        the scoped text are computed with ``extract_letters_frequency``, and
        one of the remaining characters is picked at random.

        Args:
            passage: Passage text the sample is generated from.

        Returns:
            dict with ``question``, ``answer`` (frequency of the chosen
            character), ``spans`` (one ``(start, end)`` pair per occurrence,
            shifted by the position of the scoped text inside ``passage``)
            and ``span_type`` (``'CHAR'``).
        """
        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=0.5)

        frequency_by_char = extract_letters_frequency(scoped_text)

        # When a class limit is configured, only characters whose frequency
        # is strictly below it remain eligible.
        if self.limit_classes is not None:
            frequency_by_char = {
                char: count
                for char, count in frequency_by_char.items()
                if count < self.limit_classes
            }

        # NOTE(review): if no character is left here the random choice has
        # nothing to pick from and will raise -- confirm callers handle it.
        chosen_char = np.random.choice(list(frequency_by_char))
        answer = frequency_by_char[chosen_char]

        offset = passage.index(scoped_text)
        spans = []
        for position, char in enumerate(scoped_text):
            if char == chosen_char:
                spans.append((offset + position, offset + position + 1))

        question = randomize_instances_occurance_question(
            string_asked_about=f"'{chosen_char}'",
            string_category='character',
            context_scope=context_scope)

        return {
            'question': question,
            'answer': answer,
            'spans': spans,
            'span_type': 'CHAR',
        }
Ejemplo n.º 2
0
    def _how_many_common_template(self, passage, ner_categories,
                                  singular_string_category,
                                  plural_string_category):
        """Build a sample about the N-th most common named entity.

        A rank (most / second most / third most common) is drawn at random,
        entities are extracted with ``extract_ner`` and the answer is the
        occurrence count of the entity text at that rank.

        Args:
            passage: Passage text the sample is generated from.
            ner_categories: Forwarded unchanged to ``extract_ner``.
            singular_string_category: Category name used in the question.
            plural_string_category: Not used by this method.

        Returns:
            dict with ``question``, ``answer``, ``spans`` (positions 1 and 2
            of every extracted entry) and ``span_type`` (``'SPAN'``).

        Raises:
            IndexError: If there are fewer distinct entity texts than the
                drawn rank requires.
        """
        rank = np.random.randint(0, 3)
        rank_phrase = (
            'the most common',
            'the second most common',
            'the third most common',
        )[rank]

        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=1.00)

        entities = extract_ner(scoped_text, ner_categories)
        # Each entry is indexed as (text, start, end).
        spans = [(entity[1], entity[2]) for entity in entities]
        entity_texts = [entity[0] for entity in entities]

        ranked = Counter(entity_texts).most_common()
        _entity_text, answer = ranked[rank]

        question = randomize_common_question(
            string_category=f'{rank_phrase} {singular_string_category}',
            context_scope=context_scope)

        return {
            'question': question,
            'answer': answer,
            'spans': spans,
            'span_type': 'SPAN',
        }
Ejemplo n.º 3
0
    def _select_template(self, passage, pos_to_extract, string_category):
        """Build a sample about the N-th most common token of a POS class.

        A rank (most / second most / third most common) is drawn at random,
        tokens are extracted with ``extract_pos`` and the answer is the
        occurrence count of the token text at that rank.

        Args:
            passage: Passage text the sample is generated from.
            pos_to_extract: Forwarded unchanged to ``extract_pos``.
            string_category: Category name used in the question.

        Returns:
            dict with ``question``, ``answer``, ``spans`` (as returned by
            ``extract_pos``) and ``span_type`` (``'SPAN'``).

        Raises:
            IndexError: If there are fewer distinct token texts than the
                drawn rank requires.
        """
        rank = np.random.randint(0, 3)
        rank_phrase = (
            'the most common',
            'the second most common',
            'the third most common',
        )[rank]

        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=1.00)

        spans, token_texts = extract_pos(scoped_text,
                                         pos_to_extract=pos_to_extract)

        ranked = Counter(token_texts).most_common()
        _token_text, answer = ranked[rank]

        question = randomize_common_question(
            string_category=f'{rank_phrase} {string_category}',
            context_scope=context_scope)

        return {
            'question': question,
            'answer': answer,
            'spans': spans,
            'span_type': 'SPAN',
        }
Ejemplo n.º 4
0
    def how_many_nouns(self, passage):
        """Build a sample asking how many nouns the drawn scope contains.

        Args:
            passage: Passage text the sample is generated from.

        Returns:
            dict with ``question``, ``answer`` (number of spans that
            ``extract_pos`` returns for the ``'NN'`` tag), ``spans`` and
            ``span_type`` (``'SPAN'``).
        """
        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=0.25)

        # Only the span list is needed; the second return value is ignored.
        noun_spans, _unused = extract_pos(scoped_text, pos_to_extract=['NN'])

        question = randomize_total_question(string_category='noun',
                                            context_scope=context_scope)

        return {
            'question': question,
            'answer': len(noun_spans),
            'spans': noun_spans,
            'span_type': 'SPAN',
        }
Ejemplo n.º 5
0
    def how_many_sentences_in_total(self, passage):
        """Build a sample asking how many sentences the scope contains.

        The scoped text returned by ``randomize_scope`` replaces ``passage``
        for the rest of the method, so spans are offsets into that text.

        Args:
            passage: Passage text the sample is generated from.

        Returns:
            dict with ``question``, ``answer`` (number of sentences),
            ``spans`` (one ``(start, end)`` pair per sentence) and
            ``span_type`` (``'SENTENCE'``).

        Raises:
            ValueError: If a sentence returned by ``extract_sentences`` does
                not occur in the scoped text at all.
        """
        context_scope, sen_index, passage = randomize_scope(passage,
                                                            passage_prob=1.0)

        sentences = extract_sentences(passage)

        # Locate each sentence starting from the end of the previous one.
        # A plain ``passage.index(s)`` always reports the first occurrence,
        # so a repeated sentence (or one contained in an earlier sentence)
        # would be given the wrong span.
        spans = []
        cursor = 0
        for sentence in sentences:
            start = passage.find(sentence, cursor)
            if start == -1:
                # Not found after the cursor: fall back to a search over the
                # whole text, which raises ValueError when truly absent.
                start = passage.index(sentence)
            end = start + len(sentence)
            spans.append((start, end))
            cursor = end
        answer = len(sentences)

        question = randomize_total_question(string_category='sentence',
                                            context_scope=context_scope)

        sample_details = dict(question=question,
                              answer=answer,
                              spans=spans,
                              span_type='SENTENCE')

        return sample_details
Ejemplo n.º 6
0
    def how_many_words_in_total(self, passage):
        """Build a sample asking how many words the drawn scope contains.

        Args:
            passage: Passage text the sample is generated from.

        Returns:
            dict with ``question``, ``answer`` (number of items returned by
            ``extract_words``), ``spans`` (one ``(start, end)`` pair per
            word, shifted by the position of the scoped text inside
            ``passage``) and ``span_type`` (``'WORD'``).
        """
        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=0.0)

        tokens = extract_words(scoped_text)
        offset = passage.index(scoped_text)

        # ``token.idx`` is used as the token's start within the scoped text
        # and the length of its string form as its width.
        spans = []
        for token in tokens:
            begin = offset + token.idx
            spans.append((begin, begin + len(str(token))))

        question = randomize_total_question(string_category='word',
                                            context_scope=context_scope)

        return {
            'question': question,
            'answer': len(tokens),
            'spans': spans,
            'span_type': 'WORD',
        }
Ejemplo n.º 7
0
    def how_many_numbers(self, passage):
        """Build a sample asking how many numbers the drawn scope contains.

        Args:
            passage: Passage text the sample is generated from.

        Returns:
            dict with ``question``, ``answer`` (number of values returned by
            ``extract_passage_numbers``), ``spans`` (one ``(start, end)``
            pair per number, shifted by the position of the scoped text
            inside ``passage``) and ``span_type`` (``'WORD'``).
        """
        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=0.5)

        offset = passage.index(scoped_text)
        numbers, positions = extract_passage_numbers(scoped_text)

        # The span width is the length of each number's string form.
        spans = []
        for number, position in zip(numbers, positions):
            begin = offset + position
            spans.append((begin, begin + len(str(number))))

        question = randomize_total_question(string_category='number',
                                            context_scope=context_scope)

        return {
            'question': question,
            'answer': len(numbers),
            'spans': spans,
            'span_type': 'WORD',
        }
Ejemplo n.º 8
0
    def _how_many_template(self, passage, ner_categories,
                           singular_string_category, plural_string_category):
        """Build a sample asking how many entities of a category appear.

        Args:
            passage: Passage text the sample is generated from.
            ner_categories: Forwarded unchanged to ``extract_ner``.
            singular_string_category: Singular category name for the question.
            plural_string_category: Plural category name for the question.

        Returns:
            dict with ``question``, ``answer`` (number of extracted
            entities), ``spans`` (positions 1 and 2 of every extracted entry)
            and ``span_type`` (``'SPAN'``).
        """
        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=0.85)

        entities = extract_ner(scoped_text, ner_categories)
        # NOTE(review): spans are taken as returned by extract_ner on the
        # scoped text; no passage offset is added here -- confirm intended.
        spans = [(entity[1], entity[2]) for entity in entities]

        question = randomize_ner_question(
            singular_string_category=singular_string_category,
            plural_string_category=plural_string_category,
            context_scope=context_scope)

        return {
            'question': question,
            'answer': len(spans),
            'spans': spans,
            'span_type': 'SPAN',
        }
Ejemplo n.º 9
0
    def how_many_words_shorter_than(self, passage):
        """Build a sample asking how many words are shorter than a length.

        A target length between 1 and 5 (inclusive) is drawn at random and
        every word whose string form has fewer characters is counted.

        Args:
            passage: Passage text the sample is generated from.

        Returns:
            dict with ``question``, ``answer`` (number of matching words),
            ``spans`` (one ``(start, end)`` pair per matching word, shifted
            by the position of the scoped text inside ``passage``) and
            ``span_type`` (``'WORD'``).
        """
        context_scope, sen_index, text = randomize_scope(passage, passage_prob=0.2)

        words = extract_words(text)
        # ``np.random.random_integers(5)`` is deprecated; it drew from the
        # closed range [1, 5], which is ``randint(1, 6)`` (upper bound
        # exclusive).
        target_len = np.random.randint(1, 6)
        # Compare the length of the whole word. The previous code measured
        # ``tok[0]``, which is not the word's length and is inconsistent
        # with how the same objects are used for spans below.
        target_words = [tok for tok in words if len(str(tok)) < target_len]
        start_idx = passage.index(text)
        spans = [(start_idx + w.idx, start_idx + w.idx + len(str(w))) for w in target_words]
        answer = len(target_words)

        question = randomize_total_question(string_category=f'words shorter than {target_len} character',
                                            context_scope=context_scope)

        sample_details = dict(
            question=question,
            answer=answer,
            spans=spans,
            span_type='WORD'
        )

        return sample_details
Ejemplo n.º 10
0
    def how_many_title_case_words_in_total(self, passage):
        """Build a sample asking how many words start with an uppercase letter.

        Args:
            passage: Passage text the sample is generated from.

        Returns:
            dict with ``question``, ``answer`` (number of words whose first
            character is uppercase), ``spans`` (one ``(start, end)`` pair per
            such word, shifted by the position of the scoped text inside
            ``passage``) and ``span_type`` (``'WORD'``).
        """
        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=0.5)

        tokens = extract_words(scoped_text)
        # ``text[:1]`` is empty for an empty token, and ``''.isupper()`` is
        # False, so empty tokens are skipped without a separate length check.
        capitalised = [token for token in tokens if token.text[:1].isupper()]

        offset = passage.index(scoped_text)
        spans = []
        for token in capitalised:
            begin = offset + token.idx
            spans.append((begin, begin + len(str(token))))

        question = randomize_total_question(string_category='title case word',
                                            context_scope=context_scope)

        return {
            'question': question,
            'answer': len(capitalised),
            'spans': spans,
            'span_type': 'WORD',
        }
Ejemplo n.º 11
0
    def how_many_times_vowels_appears(self, passage):
        """Build a sample asking how many vowel characters the scope contains.

        Args:
            passage: Passage text the sample is generated from.

        Returns:
            dict with ``question``, ``answer`` (summed frequency of the
            retained vowel characters), ``spans`` (one ``(start, end)`` pair
            per retained vowel occurrence, shifted by the position of the
            scoped text inside ``passage``) and ``span_type`` (``'CHAR'``).
        """
        context_scope, _sen_index, scoped_text = randomize_scope(
            passage, passage_prob=0.0)

        frequency_by_char = extract_letters_frequency(scoped_text)

        # NOTE(review): with a class limit set, any character whose own
        # frequency reaches the limit is dropped, so such vowels are excluded
        # from both the answer and the spans -- confirm this is intended.
        if self.limit_classes is not None:
            frequency_by_char = {
                char: count
                for char, count in frequency_by_char.items()
                if count < self.limit_classes
            }

        vowels = ('a', 'e', 'i', 'u', 'o')
        vowel_counts = {
            char: count
            for char, count in frequency_by_char.items()
            if char.lower() in vowels
        }
        answer = sum(vowel_counts.values())

        offset = passage.index(scoped_text)
        spans = []
        for position, char in enumerate(scoped_text):
            if char in vowel_counts:
                spans.append((offset + position, offset + position + 1))

        question = randomize_total_question(string_category='vowel character',
                                            context_scope=context_scope)

        return {
            'question': question,
            'answer': answer,
            'spans': spans,
            'span_type': 'CHAR',
        }