Esempio n. 1
0
def _is_past_participle_verb(verb):
    """Tell whether ``verb`` is written in its past-participle form.

    The token's lemma has to be listed both by ``SLoader.get_verb_list()``
    and by ``SLoader.get_past_participle_list()``; the participle stored
    under that lemma is then compared with the lower-cased token text.

    Args:
        verb: Token-like object exposing ``lemma_`` and ``lower_``.

    Returns:
        bool: True only when the stored participle equals ``verb.lower_``.
    """
    lemma = verb.lemma_
    if lemma not in SLoader.get_verb_list():
        return False

    participles = SLoader.get_past_participle_list()
    if lemma not in participles:
        return False

    return participles[lemma] == verb.lower_
def _process_internal(question):
    """Turn the body of a "Which ..." question into a @placeholder sentence.

    The span is re-parsed with "Which " put in front and " ?" appended, the
    first sentence of that parse is classified with ``get_question_type``
    and handed to the ``process`` function of the matching handler module.
    When no handler matches, the first token is swapped for "@placeholder"
    and the last token for ".".

    Args:
        question: spaCy ``Span`` holding the question without "Which".

    Returns:
        The handler's result, or the fallback sentence as a string. Spans
        shorter than two tokens yield their text directly followed by
        "@placeholder".
    """
    assert isinstance(question, spacy.tokens.span.Span)

    if len(question) < 2:
        return str(question) + "@placeholder"

    text = "Which " + str(question) + " ?"
    parser = SLoader.get_full_spacy_nlp()
    question = list(parser(text).sents)[0]
    qtype = get_question_type(question)

    # Checked in this order; the first matching type wins.
    # IN_WHICH_OF and IN_WHICH_NOUN have no examples in the entire
    # SQuAD dataset.
    handlers = (
        (QType.WHICH_OF, which_of),
        (QType.IN_WHICH_OF, in_which_of),
        (QType.WHICH_NOUN, which_noun),
        (QType.WHICH_BE, which_be),
        (QType.IN_WHICH_NOUN, in_which_noun),
        (QType.WHICH_VERB, which_verb),
    )
    for kind, handler in handlers:
        if qtype == kind:
            return handler.process(question)

    # Fallback: "@placeholder" stands in for "Which", "." for the last token.
    words = [str(tok) for tok in question[1:]]
    words.insert(0, "@placeholder")
    words[-1] = '.'
    return ' '.join(words)
def process(question):
    """Rewrite an "In ..." question by moving the "In ..." part to the end.

    ``_split_question`` separates the question into a list of skipped
    words and the remaining span. The remaining clause (without trailing
    punctuation) is emitted first, followed by ", ", the skipped words
    (starting with a lower-case "in") and " ?". The result is parsed again
    and passed to ``_process_internal``.

    Args:
        question: spaCy ``Span`` with at least three tokens.

    Returns:
        Whatever ``_process_internal`` returns for the reordered question.
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 3)

    skipped, question = _split_question(question)
    assert (isinstance(skipped, list))
    assert (isinstance(question, spacy.tokens.span.Span))

    if len(skipped) >= 1:
        skipped[0] = "in"  # Not "In".
    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]

    words = [str(x) for x in question]
    if words:
        # Upper-case only the first character. str.capitalize() would also
        # lower-case the rest of the word ("McDonald" -> "Mcdonald"), which
        # corrupts the text before it is parsed again. The emptiness check
        # avoids an IndexError when nothing is left after stripping
        # punctuation.
        words[0] = words[0][:1].upper() + words[0][1:]

    # Swap main question and "In ...".
    text = ' '.join(words) + ", " + ' '.join(skipped) + " ?"
    nlp = SLoader.get_full_spacy_nlp()
    doc = nlp(text)
    question = doc[0:len(doc)]  # Convert to spaCy Span (not Doc).

    return _process_internal(question)
Esempio n. 4
0
def _process_with_verb(question):
    """Remove or relocate the auxiliary verb that opens ``question``.

    * "do" / "does": the auxiliary is dropped.
    * "did": the auxiliary is dropped and its head is replaced by the entry
      found in ``SLoader.get_past_tense_list()`` (the token text is kept
      when there is no entry).
    * a form of "be": the auxiliary is emitted right before its head
      instead of at the front.
    * anything else: the tokens are returned unchanged.

    Args:
        question: Sequence of token-like objects exposing ``text``,
            ``lower_``, ``lemma_`` and ``head``.

    Returns:
        list[str]: Token texts of the rewritten fragment; an empty list for
        an empty ``question``.
    """
    # An empty fragment has no leading verb to inspect.
    if len(question) == 0:
        return []

    to_remove = set()
    to_past = set()
    insert_before = {}

    verb = question[0]
    if verb.lower_ in ("does", "do"):
        to_remove.add(verb)
    elif verb.lower_ == "did":
        # Put main verb to past tense.
        to_remove.add(verb)
        to_past.add(verb.head)
    elif verb.lemma_ == "be":
        insert_before[verb.head] = verb
        to_remove.add(verb)

    out = []
    for token in question:
        if token in insert_before:
            out.append(insert_before[token].text)
        if token in to_remove:
            continue
        if token in to_past:
            past_tenses = SLoader.get_past_tense_list()
            out.append(past_tenses.get(token.text, token.text))
        else:
            out.append(token.text)

    # ``out`` is a fresh list of strings, so no copy is needed.
    return out
def _process_internal(question):
    """Turn the body of a "What ..." question into a @placeholder sentence.

    The span is re-parsed with "What " put in front and " ?" appended, the
    first sentence of that parse is classified with ``get_question_type``
    and handed to the ``process`` function of the matching handler module.
    When no handler matches, the first token is swapped for "@placeholder"
    and the last token for ".".

    Args:
        question: spaCy ``Span`` holding the question without "What".

    Returns:
        The handler's result, or the fallback sentence as a string. Spans
        shorter than two tokens yield their text directly followed by
        "@placeholder".
    """
    assert isinstance(question, spacy.tokens.span.Span)

    if len(question) < 2:
        return str(question) + "@placeholder"

    text = "What " + str(question) + " ?"
    parser = SLoader.get_full_spacy_nlp()
    question = list(parser(text).sents)[0]
    qtype = get_question_type(question)

    # Checked in this order; the first matching type wins.
    handlers = (
        (QType.WHAT_BE, what_be),
        (QType.WHAT_DO, what_do),
        (QType.IN_WHAT, in_what),
        (QType.WHAT_NOUN, what_noun),
        (QType.WHAT_VERB, what_verb),
    )
    for kind, handler in handlers:
        if qtype == kind:
            return handler.process(question)

    # Fallback: "@placeholder" stands in for "What", "." for the last token.
    words = [str(tok) for tok in question[1:]]
    words.insert(0, "@placeholder")
    words[-1] = '.'
    return ' '.join(words)
def process(question):
    """Rewrite a question as "@placeholder ( long ) <clause> .".

    Trailing punctuation and the two leading tokens are removed
    (presumably "How long", given the literal "long" in the output; the
    original comment said "How much"). Auxiliaries in the clause are
    handled as follows:

    * "do" forms are dropped; after "did" the head verb is replaced by its
      entry in ``SLoader.get_past_tense_list()``.
    * "have" forms are moved right before their head, or, when
      ``_has_aux_pass`` reports a passive auxiliary on the head, right
      before the first child of the head that is an "auxpass" dependent or
      the word "been".
    * every other auxiliary is left where it is.

    Args:
        question: spaCy ``Span`` with at least three tokens.

    Returns:
        str: The rewritten sentence, or "@placeholder" alone when two or
        fewer tokens remain after stripping punctuation.
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 3)

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]

    if len(question) <= 2:
        return "@placeholder"

    moved_before = {}   # token -> auxiliary to emit right before it
    dropped = set()     # auxiliaries removed from their original position
    needs_past = set()  # verbs to convert to past tense
    for tok in question:
        if tok.pos_ != "VERB" or tok.dep_.lower() != "aux":
            continue

        if tok.lemma_ == "do":
            if tok.lower_ == "did":
                needs_past.add(tok.head)
            dropped.add(tok)
            continue

        if not (tok.lemma_ == "have" and tok.head):
            # Other auxiliaries (e.g. "be") seem fine as they are.
            continue

        if not _has_aux_pass(tok.head):
            moved_before[tok.head] = tok
            dropped.add(tok)
            continue

        for child in tok.head.children:
            if child.dep_.lower() == "auxpass" or child.lower_ == "been":
                moved_before[child] = tok
                dropped.add(tok)
                break

    words = []
    for tok in question[2:]:  # Skip the two leading question words.
        if tok in moved_before:
            words.append(moved_before[tok].text)
        if tok in dropped:
            continue
        if tok in needs_past:
            past_forms = SLoader.get_past_tense_list()
            words.append(past_forms.get(tok.lower_, tok.text))
        else:
            words.append(tok.text)

    # The sentence always opens with the literal "@placeholder", which a
    # capitalisation step would leave untouched, so none is applied.
    return ' '.join(["@placeholder", "(", "long", ")"] + words + ["."])
def _process_with_verb(question):
    """Remove or relocate the auxiliary verb that opens ``question``.

    When the first token is not tagged "VERB" (or there are no tokens) the
    tokens are returned as strings, untouched. Otherwise:

    * "do" / "does" is dropped (no 1st/3rd person correction is made).
      E.g. "In which location do students ... spend their 3rd year?"
    * "did" is dropped and its head, if different from the auxiliary, is
      replaced by its entry in ``SLoader.get_past_tense_list()``.
    * a form of "be" attached as "aux"/"auxpass" to another verb is moved
      right before that verb ("In which season was online voting
      introduced?"); any other "be" is moved to the end ("In which
      direction is Puerto Rico from the island of Saint-Barthelemy?").

    Args:
        question: Sequence of token-like objects.

    Returns:
        list[str]: Token texts of the rewritten fragment.
    """
    if len(question) < 1 or question[0].pos_ != "VERB":
        return [str(x) for x in question]

    leading = question[0]
    dropped = set()
    needs_past = set()
    prefix_for = {}  # token -> auxiliary emitted right before it
    trailing = []    # auxiliaries appended at the end, in this order

    if leading.lower_ in ("does", "do"):
        dropped.add(leading)
    elif leading.lower_ == "did":
        dropped.add(leading)
        if leading != leading.head:
            needs_past.add(leading.head)
    elif leading.lemma_ == "be":
        dropped.add(leading)
        attached_to_verb = (
            leading.head
            and leading.dep_ in ["aux", "auxpass"]
            and (leading.head != leading and leading.head.pos_ == "VERB")
        )
        if attached_to_verb:
            prefix_for[leading.head] = leading
        else:
            trailing.append(leading)

    words = []
    for tok in question:
        if tok in prefix_for:
            words.append(prefix_for[tok].text)
        if tok in dropped:
            continue
        if tok in needs_past:
            past_forms = SLoader.get_past_tense_list()
            words.append(past_forms.get(tok.text, tok.text))
        else:
            words.append(tok.text)
    words.extend(tok.text for tok in trailing)

    return words
Esempio n. 8
0
def process(question):
    """Rewrite a "When do/does/did ..." question as "... in @placeholder .".

    Trailing punctuation and the two leading tokens are removed. When the
    second token is "did" and its head is another verb, that head is
    replaced by its entry in ``SLoader.get_past_tense_list()`` (looked up
    by lower-cased text, kept as is when there is no entry). The words
    "in", "@placeholder" and "." are appended and the first character of
    the sentence is upper-cased.

    Args:
        question: spaCy ``Span`` with at least three tokens.

    Returns:
        str: The rewritten sentence, or "@placeholder" alone when two or
        fewer tokens remain after stripping punctuation.
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 3)

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]

    if len(question) <= 2:
        return "@placeholder"

    verb = question[1]  # Do/does/did ...
    question = question[2:]  # Remove "When do/does/did" tokens.
    to_replace = {}
    if verb.head and verb.head != verb and verb.head.pos_ == "VERB":
        # Only "did" needs a correction; present tense is left alone.
        if verb.lower_ == "did":
            verb_text = verb.head.lower_
            past_tense = SLoader.get_past_tense_list()
            to_replace[verb.head] = past_tense.get(verb_text, verb_text)

    out = []
    for token in question:
        if token in to_replace:
            out.append(to_replace[token])
        else:
            out.append(token.text)
    out.extend(["in", "@placeholder", "."])

    # Upper-case only the first character. str.capitalize() would also
    # lower-case the rest of the word ("McDonald" -> "Mcdonald").
    out[0] = out[0][:1].upper() + out[0][1:]

    return ' '.join(out)
def process(question):
    """Rewrite a "How do/does/did ..." question as "... by @placeholder .".

    The leading "How" and the auxiliary after it are removed. For "do" and
    "does" the remaining words are kept as they are. Otherwise the main
    verb (the auxiliary's head, or failing that the first token tagged
    "VERB") is replaced by its entry in ``SLoader.get_past_tense_list()``.
    The sentence ends with a preposition chosen from the verb's lemma
    ("as" for verbs such as "describe", otherwise "by"), "@placeholder"
    and ".", and is passed through ``_to_capitalize``.

    Args:
        question: spaCy ``Span`` with at least three tokens.

    Returns:
        The value returned by ``_to_capitalize``; for inputs with two or
        fewer tokens after stripping punctuation, the span text followed by
        " @placeholder .".
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 3)

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]

    if len(question) <= 2:
        return str(question) + " @placeholder ."

    PREP = {
        'describe': 'as',
        'identify': 'as',
        'view': 'as',
        'define': 'as',
        'rate': 'as',
        'compare': 'as',
        'credit': 'as'  # Was credited as ...
    }

    def finish(words, governing_verb):
        # Shared tail: preposition, placeholder, full stop.
        words.append(PREP.get(governing_verb.lemma_, "by"))
        words.append("@placeholder")
        words.append(".")
        return _to_capitalize(words)

    question = question[1:]  # Remove "How".
    verb = question[0]
    rest = question[1:]  # Everything after the auxiliary.

    if verb.lower_ in ["do", "does"]:
        return finish([str(x) for x in rest], verb.head)

    # From here on the auxiliary is expected to be "did".
    if verb.head and verb.head != verb:
        main_verb = verb.head
    else:
        # Mostly spaCy "wrong" dependency trees.
        # Search for the first verb.
        main_verb = None
        for token in rest:
            if token.pos_ == "VERB":
                main_verb = token
                break

    if main_verb is None:
        # No verb is found. Do nothing (but remove "did").
        # How did Descartes' distinguish types of existence?
        # How did the actual sales of the G4's compare to the sales
        #       expectations?
        return finish([str(x) for x in rest], verb.head)

    words = []
    for token in rest:
        if token == main_verb:
            past_tense = SLoader.get_past_tense_list()
            words.append(past_tense.get(token.lower_, token.text))
        else:
            words.append(token.text)
    return finish(words, main_verb)
def process(question):
    """Rewrite a "How <measure> <verb> ...?" question as a statement.

    The words that follow "How" and belong to the measure expression
    (e.g. "far from each other") are collected and re-emitted as
    "@placeholder ( <measure> )" at a suitable position of the clause:

    * after the clause when the leading verb is a form of "do" (with the
      main verb put to past tense after "did");
    * right after the main verb when the leading verb is an auxiliary of
      another verb, the auxiliary itself being moved next to that verb;
    * right after the subject of the leading verb otherwise, or at the end
      when no subject is found.

    Args:
        question: spaCy ``Span`` with at least three tokens.

    Returns:
        str: The rewritten sentence with its first character upper-cased.
        Degenerate inputs yield "<text> @placeholder ." or
        "@placeholder <measure> [<rest>] .".

    Raises:
        AssertionError: On inputs that violate the structural assumptions
            checked below (e.g. the head verb is not part of the clause).
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 3)

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]

    if len(question) <= 2:
        return str(question) + " @placeholder ."

    def measure_only(measure_text, rest):
        # Used when (almost) nothing is left besides the measure.
        out = "@placeholder " + measure_text
        if len(rest) > 0:
            out = out + " " + str(rest)
        return out + " ."

    def finish(words):
        # Upper-case only the first character. str.capitalize() would also
        # lower-case the rest of the word ("McDonald" -> "Mcdonald").
        if words:
            words[0] = words[0][:1].upper() + words[0][1:]
        return ' '.join(words)

    # Skip HOW X VERB? (X)
    # How far from each other were the motors in Gramme's demonstrations?
    # => far from each other
    question = question[1:]  # Remove "How"
    advj = question[0]
    measure = []
    while len(question) >= 1:
        token = question[0]
        if token == advj or advj.is_ancestor(token):
            measure.append(token.text)
            question = question[1:]
        else:
            break
    assert (len(measure) >= 1)
    measure = ' '.join(measure)

    if len(question) <= 1:
        # How lond did the creation of Red Book CD - DA standard take?
        # long => lond => spaCy error.
        return measure_only(measure, question)

    if question[0].pos_ != "VERB":
        # Caused by a spaCy wrong dependency tree.
        # How [far] away was the plant located from the epicenter?
        # Extend the measure until the first verb.
        while len(question) >= 1 and question[0].pos_ != "VERB":
            measure += (" " + question[0].text)
            question = question[1:]

        if len(question) <= 1:
            # How far back to San Diego's roots in the arts and theater
            #       sector go?
            return measure_only(measure, question)

    assert (isinstance(measure, str))
    assert (len(question) > 1)
    assert (question[0].pos_ == "VERB")

    verb = question[0]
    rest = question[1:]  # The clause without the leading verb.
    slot = ["@placeholder", "(", measure, ")"]

    if verb.lemma_ == "do":
        # Insert @placeholder at the end.
        if verb.head == verb:
            # How often do temperatures on the coastal plain of NC drop
            #       below freezing at night?
            words = [str(x) for x in rest]
        else:
            # How far did the Arctic tern chick travel?
            # Correct main verb tense.
            main_verb = verb.head
            words = []
            for token in rest:
                if token == main_verb and verb.lower_ == "did":
                    past_tense = SLoader.get_past_tense_list()
                    words.append(past_tense.get(token.lemma_, token.text))
                else:
                    words.append(token.text)
        return finish(words + slot + ["."])

    if verb.head != verb:
        # How high had cotton revenues risen by the time of the American
        #           Civil War?
        main_verb = verb.head
        index = -1
        for i, token in enumerate(question):
            if token == main_verb:
                index = i
                break
        assert (index > 0 and index < len(question))

        # Put the leading verb before the main verb, or before the
        # auxiliary directly preceding it. The preceding token is only
        # considered from index 2 on: at index 1 it is the leading verb
        # itself, which is not part of ``rest`` and would never be emitted.
        insert_before = main_verb
        if index >= 2 and _is_aux_verb(question[index - 1], main_verb):
            insert_before = question[index - 1]

        words = []
        for token in rest:
            if token == insert_before:
                words.append(verb.text)
            words.append(token.text)
            if token == main_verb:
                words.extend(slot)
        words.append(".")
        return finish(words)

    # Look for the subject (as a child of the verb).
    # How old are most of the native language speakers in northern Catalonia?
    subj = None
    for child in verb.children:
        if child.dep_.lower() in ["nsubj", "nsubjpass"]:
            subj = child
            break

    if subj is None:
        # How large in square kilometers is Greater Hyderabad?
        # How simple is the process of transformation?
        # Insert verb and @placeholder at the end.
        words = [str(x) for x in rest]
        words.append(verb.text)
        words.extend(slot)
        words.append(".")
        return finish(words)

    # The verb goes after the last token (by character offset) of the
    # subject's subtree.
    insert_after = subj
    for child in subj.subtree:
        if child.idx > insert_after.idx:
            insert_after = child

    words = []
    for token in rest:
        words.append(token.text)
        if token == insert_after:
            words.append(verb.text)
            words.extend(slot)
    words.append(".")
    return finish(words)
Esempio n. 11
0
def process(question):
    """Rewrite an "On what ..." question as "... on @placeholder ( ... ) .".

    The two leading tokens are dropped. A leading run of tokens that stops
    at the first verb at the latest (e.g. "magazine" in "On what magazine
    was she the cover model?") is set aside and re-emitted in parentheses
    after the placeholder. The verb that opens the remaining clause is then
    handled:

    * "do" / "does" is removed;
    * "did" is removed and the main verb is replaced by its entry in
      ``SLoader.get_past_tense_list()``;
    * a form of "be" is moved before the verb it belongs to, or to the end
      of the clause when no such verb is found;
    * any other leading verb (e.g. "can", "would") is moved before the next
      verb, or to the end when there is none.

    Args:
        question: spaCy ``Span`` with at least four tokens.

    Returns:
        str: The rewritten sentence with its first character upper-cased,
        or "@placeholder" alone when three or fewer tokens remain after
        stripping punctuation.
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 4)

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]

    if len(question) <= 3:
        return "@placeholder"

    on = question[0]
    head = on.head
    question = question[2:]

    skipped = None
    if head != on:
        # On what magazine was she the cover model?
        # Skip the head of the first token and its descendants, but never
        # go past a verb.
        skipped = []
        while len(question) >= 1:
            if question[0].pos_ == "VERB":
                break
            if head.is_ancestor(question[0]) or head == question[0]:
                skipped.append(question[0].text)
                question = question[1:]
            else:
                break
    else:
        # Stop at the first verb.
        # On what devices can video games be used?
        # On what was the Philip Glass opera based?
        # On what occasions are š and ž replaced with sh and zh?
        # On what film was videoconferencing widely used?
        # On what was the mitrailleuse mounted?
        # On what do plants depend in their environment?
        index = 0
        for token in question:
            if token.pos_ == "VERB":
                break
            index += 1

        if index < len(question):
            skipped = [str(x) for x in question[0:index]]
            question = question[index:]
        else:
            # No verb found.
            # Does not happen in the entire SQuAD.
            skipped = []

    assert (isinstance(skipped, list))

    if len(question) == 0:
        # Does not happen in the entire SQuAD.
        # Nothing is left of the clause, so only the fixed frame is emitted.
        return "On @placeholder ."

    to_remove = set()
    insert_before = {}
    to_past = set()
    insert_at_end = []  # In this order.

    verb = question[0]
    if verb.pos_ == "VERB":
        # 99.99% of the cases.
        if verb.lower_ in ["do", "does"]:
            # Just remove.
            to_remove.add(verb)
        elif verb.lower_ == "did":
            to_remove.add(verb)
            if verb.head != verb and verb.head.pos_ == "VERB":
                # Normal case.
                to_past.add(verb.head)
            else:
                # 99% a spaCy tagging error.
                # Search for the first verb.
                main_verb = None
                for token in question[1:]:
                    if token.pos_ == "VERB":
                        # Very unlikely since it seems like
                        # a spaCy tagging problem.
                        main_verb = token
                        break
                    if token.lemma_ in SLoader.get_verb_list() and (
                            token in verb.children or token == verb.head):
                        # Some verbs are tagged as nouns.
                        # On what did a rescue helicopter crash with no
                        #       survivors?
                        main_verb = token
                        break

                # When nothing is found the clause is left as it is.
                if main_verb is not None:
                    to_past.add(main_verb)
        elif verb.lemma_ == "be":
            # On what date were the Belavezha Accords signed?
            # On what year was the USSR dissolved?
            to_remove.add(verb)
            if verb.head != verb and verb.head.pos_ == "VERB":
                insert_before[verb.head] = verb
            else:
                # Search for the first past participle; give up as soon as
                # another verb is met first.
                main_verb = None
                for token in question[1:]:
                    if _is_past_participle_verb(token):
                        main_verb = token
                        break
                    if token.pos_ == "VERB":
                        # Another verb entry. <VERB1> <VERB2>
                        break

                if main_verb is not None:
                    # On what day and month was Spectre released to the
                    #   Chinese market released?
                    # On what date is Twilight Princess HD scheduled
                    #   for Australian release?
                    insert_before[main_verb] = verb
                else:
                    # No main verb linked to "be" was found.
                    # On what magazine was she the cover model?
                    # On what day was the funeral of Donda West?
                    # On what season was Kristy Lee Cook a contestant on
                    #       American Idol?
                    insert_at_end.append(verb)
        else:
            # On what part of newer iPods can you find the buttons?
            # On what devices can video games be used?
            # On what day would most of the games televised on the ESPN
            #       networks be played?
            # On what day would AFL games be shown on NFL Network?
            # Search for the first verb.
            to_remove.add(verb)
            main_verb = None
            for token in question[1:]:
                if token.pos_ == "VERB":
                    main_verb = token
                    break

            if main_verb is not None:
                insert_before[main_verb] = verb
            else:
                insert_at_end.append(verb)

    out = []
    for token in question:
        if token in insert_before:
            out.append(insert_before[token].text)
        if token in to_remove:
            continue
        if token in to_past:
            past_tense = SLoader.get_past_tense_list()
            out.append(past_tense.get(token.lower_, token.text))
        else:
            out.append(token.text)
    for token in insert_at_end:
        out.append(token.text)

    question = out + ["on", "@placeholder"]
    if len(skipped) >= 1:
        question.extend(["(", ' '.join(skipped), ")"])
    question.append(".")

    # Upper-case only the first character. str.capitalize() would also
    # lower-case the rest of the word ("McDonald" -> "Mcdonald").
    question[0] = question[0][:1].upper() + question[0][1:]

    return ' '.join(question)
def split_in_sentences(text):
    """Split ``text`` into sentences with the full spaCy pipeline.

    Args:
        text: The string to segment.

    Returns:
        list: The sentence spans of the parsed document, in document order.
    """
    assert isinstance(text, str)
    pipeline = SLoader.get_full_spacy_nlp()
    parsed = pipeline(text)
    return [sentence for sentence in parsed.sents]