Esempio n. 1
0
def _is_past_participle_verb(verb):
    """Tell whether ``verb`` is written as the past participle of its lemma.

    The token's lemma must appear in both the verb list and the past
    participle table exposed by ``SLoader``; the table entry for that lemma
    is then compared with the token's lower-cased text.

    Args:
        verb: A token-like object exposing ``lemma_`` and ``lower_``.

    Returns:
        True if the table entry for the lemma equals ``verb.lower_``,
        False otherwise (including when the lemma is unknown).
    """
    lemma = verb.lemma_
    # Unknown lemmas can never match; bail out before the table lookup.
    if (lemma not in SLoader.get_verb_list()
            or lemma not in SLoader.get_past_participle_list()):
        return False

    participle = SLoader.get_past_participle_list()[lemma]
    return participle == verb.lower_
Esempio n. 2
0
def process(question):
    """Rewrite an "On what ...?" question as a statement with a placeholder.

    The first two tokens (expected to be "On what") are dropped, the leading
    auxiliary/verb is removed, converted or moved next to the main verb, and
    "on @placeholder" is appended. Tokens skipped between the first two
    tokens and the first verb are appended in parentheses.

    Args:
        question: A spaCy ``Span`` holding at least four tokens.

    Returns:
        The rewritten sentence as a single space-joined string. If three or
        fewer tokens remain once trailing punctuation is stripped, the bare
        string "@placeholder" is returned instead.

    Raises:
        AssertionError: If ``question`` is not a ``Span`` or has fewer than
            four tokens.
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 4)

    # Drop trailing punctuation (typically the final "?").
    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]

    if len(question) <= 3:
        return "@placeholder"

    on = question[0]
    head = on.head
    # Skip the first two tokens ("On what").
    question = question[2:]

    if head != on:
        # "On" is attached to another token: skip the tokens belonging to
        # that token's subtree, stopping at the first verb.
        # On what magazine was she the cover model?
        skipped = []
        while len(question) >= 1:
            token = question[0]
            if token.pos_ == "VERB":
                break
            if not (head.is_ancestor(token) or head == token):
                break
            skipped.append(token.text)
            question = question[1:]
    else:
        # Stop at the first verb.
        # On what devices can video games be used?
        # On what was the Philip Glass opera based?
        # On what occasions are š and ž replaced with sh and zh?
        # On what film was videoconferencing widely used?
        # On what was the mitrailleuse mounted?
        # On what do plants depend in their environment?
        index = next(
            (i for i, token in enumerate(question) if token.pos_ == "VERB"),
            len(question))

        if index < len(question):
            skipped = [str(x) for x in question[0:index]]
            question = question[index:]
        else:
            # No verb found.
            # Does not happen in the entire SQuAD.
            skipped = []

    if len(question) == 0:
        # Nothing is left after skipping.
        # Does not happen in the entire SQuAD.
        return "On @placeholder ."

    to_remove = set()
    insert_before = {}
    to_past = set()
    insert_at_end = []  # In this order.

    verb = question[0]
    if verb.pos_ == "VERB":
        # 99.99% of the cases.
        if verb.lower_ in ["do", "does"]:
            # Just remove.
            to_remove.add(verb)
        elif verb.lower_ == "did":
            # Remove "did" and put the main verb in the past tense instead.
            to_remove.add(verb)
            if verb.head != verb and verb.head.pos_ == "VERB":
                # Normal case.
                to_past.add(verb.head)
            else:
                # 99% a spaCy tagging error.
                # Search for the first verb.
                main_verb = None
                for token in question[1:]:
                    if token.pos_ == "VERB":
                        # Very unlikely since it seems like
                        # a spaCy tagging problem.
                        main_verb = token
                        break
                    if token.lemma_ in SLoader.get_verb_list() and (
                            token in verb.children or token == verb.head):
                        # Some verbs are tagged as nouns.
                        # On what did a rescue helicopter crash with no
                        #       survivors?
                        main_verb = token
                        break

                if main_verb is not None:
                    to_past.add(main_verb)
                # Otherwise nothing is converted; "did" is simply dropped.
        elif verb.lemma_ == "be":
            # Move the form of "be" right in front of the main verb.
            # On what date were the Belavezha Accords signed?
            # On what year was the USSR dissolved?
            to_remove.add(verb)
            if verb.head != verb and verb.head.pos_ == "VERB":
                insert_before[verb.head] = verb
            else:
                # Search for the first past participle.
                main_verb = None
                for token in question[1:]:
                    if _is_past_participle_verb(token):
                        main_verb = token
                        break
                    if token.pos_ == "VERB":
                        # Another verb entry. <VERB1> <VERB2>
                        break

                if main_verb is not None:
                    # On what day and month was Spectre released to the
                    #   Chinese market released?
                    # On what date is Twilight Princess HD scheduled
                    #   for Australian release?
                    insert_before[main_verb] = verb
                else:
                    # No main verb linked to "be" was found.
                    # On what magazine was she the cover model?
                    # On what day was the funeral of Donda West?
                    # On what season was Kristy Lee Cook a contestant on
                    #       American Idol?
                    insert_at_end.append(verb)
        else:
            # Any other leading verb is moved in front of the next verb, or
            # to the end of the sentence when there is none.
            # On what part of newer iPods can you find the buttons?
            # On what devices can video games be used?
            # On what day would most of the games televised on the ESPN
            #       networks be played?
            # On what day would AFL games be shown on NFL Network?
            to_remove.add(verb)
            main_verb = next(
                (token for token in question[1:] if token.pos_ == "VERB"),
                None)

            if main_verb is not None:
                insert_before[main_verb] = verb
            else:
                insert_at_end.append(verb)

    # Rebuild the sentence, applying the moves/removals collected above.
    out = []
    for token in question:
        if token in insert_before:
            out.append(insert_before[token].text)
        if token in to_remove:
            continue
        if token in to_past:
            past_tense = SLoader.get_past_tense_list()
            # Fall back to the original text when no past form is known.
            out.append(past_tense.get(token.lower_, token.text))
        else:
            out.append(token.text)
    for token in insert_at_end:
        out.append(token.text)

    question = out + ["on", "@placeholder"]
    if len(skipped) >= 1:
        question.extend(["(", ' '.join(skipped), ")"])
    question.append(".")

    # Upper-case only the first character. str.capitalize() would also
    # lower-case the rest of the token and mangle mixed-case words
    # (e.g. "McDonald" -> "Mcdonald").
    first = question[0]
    if not first.isupper():
        question[0] = first[:1].upper() + first[1:]

    return ' '.join(question)