コード例 #1
0
ファイル: When_QG.py プロジェクト: jinpoon/11611_project
def When_module(sent, sent_features):
    """Build a "when" question for ``sent``.

    The constituency parse of ``sent`` is handed to ``when_parseTraversal``,
    which fills two lists: candidate rewritten sentences and scored phrases
    (items whose element 0 is a phrase and element 1 a comparable score).
    If any scored phrase exists, the one with the lowest score is cut out of
    ``sent`` and the remainder is turned into a question; otherwise the first
    candidate sentence is used.

    Args:
        sent: Sentence text.
        sent_features: Kept for interface compatibility; not read here.

    Returns:
        The result of ``construct_when``, or ``None`` when the traversal
        produced nothing to work with.
    """
    nlp = StanfordNLP()
    candidates = []
    scored_phrases = []

    def triples_of(text):
        # First dependency parse yielded for ``text``, flattened to triples.
        return list(next(nlp.dependency_parse(text)).triples())

    constituency = nlp.parse(sent)
    when_parseTraversal(sent, constituency, candidates, scored_phrases)

    if scored_phrases:
        # Keep the first phrase carrying the strictly smallest score.
        lowest = float('Inf')
        chosen = ""
        for item in scored_phrases:
            if item[1] < lowest:
                chosen, lowest = item[0], item[1]
        trimmed = sent.replace(chosen, "")
        return construct_when(trimmed, triples_of(trimmed))

    if candidates:
        # Only the first candidate sentence is ever turned into a question.
        first = candidates[0]
        return construct_when(first, triples_of(first))

    return None
コード例 #2
0
def Where_Which_module(sent, sent_features):
    """Build "where"/"which" questions for ``sent``.

    ``where_which_inFirstPP`` fills a list of 4-tuples
    ``(flag, sentence, ner_set, pp_text)``. Only the first tuple is used:
    when its flag is truthy a single "which <place type>" question is built
    from the first word of ``pp_text``; otherwise the general traversal
    ``where_which_parseTraversal`` collects the questions.

    Args:
        sent: Sentence text.
        sent_features: Kept for interface compatibility; not read here.

    Returns:
        A list of questions, or ``None`` when no candidate was produced.
    """
    nlp = StanfordNLP()
    questions = []
    pp_candidates = []

    constituency = nlp.parse(sent)
    where_which_inFirstPP(sent, constituency, pp_candidates)

    # NER tag -> noun used in the question, checked in this priority order.
    place_nouns = (
        ("COUNTRY", "country"),
        ("LOCATION", "location"),
        ("CITY", "city"),
    )

    for use_first_pp, candidate, ner_tags, pp_text in pp_candidates:
        triples = list(next(nlp.dependency_parse(candidate)).triples())

        if not use_first_pp:
            where_which_parseTraversal(candidate, triples,
                                       nlp.ner(candidate), questions)
            return questions

        preposition = pp_text.split(" ")[0]
        place_noun = next(
            (noun for tag, noun in place_nouns if tag in ner_tags), "place")
        return [
            construct_where_which(candidate, triples, preposition, place_noun)
        ]
コード例 #3
0
ファイル: QA.py プロジェクト: jinpoon/11611_project
class QA():
    def __init__(self):
        """Create the parser wrapper, lookup tables and per-question state."""
        self.sNLP = StanfordNLP()

        # Lookup tables; initQstType() fills them in.
        self.dropType, self.typeNer, self.typePro = {}, {}, {}
        self.initQstType()

        # Scratch lists rebuilt for every question that is answered.
        self.candidateAnswer, self.candidateSentence = [], []

        self.qgPipeline = QGPipeline()
        # Score a candidate must exceed in fitness() to count as a match.
        self.threshold = 90

    def initQstType(self):
        self.typeSet = ['WHADJP', 'WHADVP', 'WHPP', 'WHAVP', 'WHNP']
        self.dropType['WHADJP'] = ['NP', 'CD']
        self.dropType['WHAVP'] = ['PP', 'SBAR']
        self.dropType['WHADVP'] = ['PP', 'SBAR']
        self.dropType['WHPP'] = ['PP']
        self.dropType['WHNP'] = ['NP']
        self.dropType['UK'] = ['NP', 'NN']
        self.auxWord = ['did', 'do', 'does', 'is', 'are', 'were', 'was']
        self.typePro['where'] = ['in', 'at', 'on', 'behind', 'next']
        self.typeNer['when'] = ['DATE']
        self.typeNer['where'] = [
            'CITY', 'STATE_OR_PROVINCE', 'ORGANIZATION', 'LOCATION', 'COUNTRY'
        ]

    def decideType(self, myParent):
        if self.qstFlag:
            return
        for node in myParent:
            #node.pretty_print()
            if self.qstFlag:
                return

            if isinstance(node, str): continue

            if node.label() in self.typeSet:
                self.thisType = node.label()
                myParent.remove(node)
                self.qstFlag = True
            self.decideType(node)
            if node.label() == 'ROOT':
                self.qstSim = node.leaves()
                self.qstSim = ' '.join(self.qstSim[:-1])

    def parseDep(self, x):
        """Reduce a dependency triple to a pair of lower-cased lemmas.

        Args:
            x: Triple whose elements 0 and 2 are sequences with the word at
                index 0.

        Returns:
            ``(lemma of x[0][0], lemma of x[2][0])``.
        """
        lemmatize = WordNetLemmatizer().lemmatize
        first, second = (lemmatize(x[pos][0].lower()) for pos in (0, 2))
        return (first, second)

    def bin_answer(self, question, sent):
        """Answer a yes/no question against a single sentence.

        First compares the subject relations of both dependency parses: if
        any subject pair of ``sent`` also occurs in ``question`` the answer is
        decided immediately ('No' when the question contains a ``neg``
        relation, otherwise 'Yes') with a score of 100. Failing that, the
        answer is 'Yes' when the fuzzy partial ratio of the lower-cased
        strings exceeds 90 and 'No' otherwise.

        Args:
            question: Question text.
            sent: Candidate sentence text.

        Returns:
            Tuple ``(answer, score)`` with ``answer`` in {'Yes', 'No'}.
        """
        subject_relations = ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']

        q_triples = list(next(self.sNLP.dependency_parse(question)).triples())
        s_triples = list(next(self.sNLP.dependency_parse(sent)).triples())

        q_subjects = []
        negated = False
        for triple in q_triples:
            if triple[1] in subject_relations:
                q_subjects.append(self.parseDep(triple))
            if triple[1] == 'neg':
                negated = True

        subject_match = False
        for triple in s_triples:
            if triple[1] in subject_relations:
                if self.parseDep(triple) in q_subjects:
                    subject_match = True

        if subject_match:
            return ('No', 100) if negated else ('Yes', 100)

        question = question.lower()
        sent = sent.lower()
        q_tokens = word_tokenize(question)
        s_tokens = word_tokenize(sent)

        # case 1: negations
        # NOTE(review): the answer computed by this pass is overwritten by
        # the similarity check below, so it currently has no effect on the
        # result -- confirm whether it was meant to flip the final answer.
        ans = ''
        for cue in ('not', 'never', "aren't"):
            if cue not in q_tokens:
                continue
            if cue not in s_tokens:
                ans = "Yes" if ans == "No" else "No"
            else:
                ans = "No" if ans == "Yes" else "Yes"

        # case 2: similarity
        sim = fuzz.partial_ratio(question, sent)
        ans = "Yes" if sim > 90 else "No"
        return (ans, sim)

    def qstType(self, qst):
        self.thisType = 'UK'
        self.qstFlag = False
        self.qstSim = None

        tree = self.sNLP.parser_sents([
            qst,
        ])
        for i in tree:
            self.decideType(i)

    def fitness(self, txt, qst):
        """Tell whether ``txt`` matches question ``qst`` above the threshold.

        For questions without a WH phrase (type ``'UK'``) the yes/no
        similarity from ``bin_answer`` is compared with ``self.threshold``.
        Otherwise ``txt`` and its simplified variants are parsed, every
        droppable constituent is removed in turn (see ``dropFragment``), and
        the best fuzzy partial ratio between the reduced question and the
        reduced sentences is compared with ``self.threshold``.

        Args:
            txt: Candidate sentence text.
            qst: Question text.

        Returns:
            ``True`` when the best score is strictly above the threshold.
        """
        self.qstType(qst)
        if self.thisType == 'UK':
            return self.bin_answer(qst, txt)[1] > self.threshold

        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []

        # The sentence itself plus every simplified variant of it.
        variants = [txt]
        parse_tree = self.qgPipeline.getParseTree(txt)
        clauses = self.qgPipeline.splitConj(parse_tree)
        variants.extend(self.qgPipeline.simplify_sentence(clauses))

        for variant in variants:
            for parsed in self.sNLP.parser_sents([variant]):
                # Drop the 1st, 2nd, ... matching constituent from a fresh
                # copy until a pass finds no constituent left to drop.
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    self.dropTime = 0
                    self.dropFragment(copy.deepcopy(parsed), qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1

        best_dis = 0
        best_ans = '_'
        for idx, words in enumerate(self.candidateSentence):
            score = fuzz.partial_ratio(self.qstSim, ' '.join(words))
            this_ans = ' '.join(self.candidateAnswer[idx])
            if self.qstSim is None:
                continue
            if score < best_dis:
                continue
            if score == best_dis:
                # Ties: adverbial/prepositional questions prefer the shorter
                # answer, noun-phrase questions the longer one.
                if (self.thisType in ('WHADVP', 'WHPP')
                        and len(this_ans) >= len(best_ans)):
                    continue
                if self.thisType == 'WHNP' and len(this_ans) <= len(best_ans):
                    continue
            best_dis = score
            best_ans = this_ans

        return self.threshold < best_dis

    def dropFragment(self, myParent, qstType):
        flag = 0
        for node in myParent:
            if isinstance(node, str): continue
            if self.dropTime > self.dropTotal:
                return
            if node.label() in self.dropType[qstType]:
                self.dropTime += 1
                if self.dropTime > self.dropTotal:
                    myParent.remove(node)
                    self.candidateAnswer.append(node.leaves())
                    self.findFlag = 1
                    return
            self.dropFragment(node, qstType)
            if node.label() == 'ROOT' and self.findFlag:
                # print(node.leaves())
                self.candidateSentence.append(node.leaves())

    def findFragment(self, myParent, qstType):
        for node in myParent:
            if isinstance(node, str): continue
            # node.pretty_print()
            if node.label() in self.dropType[qstType]:
                self.candidateAnswer.append((node.leaves(), node.label()))

            self.findFragment(node, qstType)

    def answerSpecial(self, txtList, tokens, qstType):
        # print(tokens[0])
        self.candidateAnswer = []
        self.finalAnswer = []
        self.candidateSentence = []
        for txt in txtList:
            tree = self.sNLP.parser_sents([
                txt,
            ])
            for i in tree:
                self.findFragment(i, qstType)
        for i in self.candidateAnswer:
            sentence = ' '.join(i[0])
            pos_tag = self.sNLP.ner(sentence)
            print(pos_tag)
            if pos_tag[1][1] in self.typeNer[qstType]:
                # print(pos_tag)
                self.finalAnswer.append(sentence)
        print(self.finalAnswer[0])

    def preProcessText(self, text):
        data = re.sub("\(.*\)", "", text)
        data = re.sub(' +', ' ', data).strip()
        return data

    def answer(self, txtList, qst):
        """Print the answer to ``qst`` found in the sentences of ``txtList``.

        Questions without a WH phrase (type ``'UK'``) are treated as yes/no:
        the answer of the sentence with the highest ``bin_answer`` score is
        printed. For the other types every sentence of 4 to 25 tokens (after
        ``preProcessText``) and its simplified variants are parsed, each
        droppable constituent is removed in turn (see ``dropFragment``), and
        the removed phrase belonging to the reduced sentence most similar to
        the reduced question is printed. Candidates lacking the NER tag
        expected for a who/when/where question are penalised by 10 points.

        Args:
            txtList: Iterable of sentence texts to search.
            qst: Question text.

        Returns:
            ``None``; the answer is written to standard output.
        """
        self.head = word_tokenize(qst)[0].lower()

        self.qstType(qst)
        if self.thisType == 'UK':
            best_score = 0
            best_ans = 'Yes'
            for txt in txtList:
                ans, sim = self.bin_answer(qst, txt)
                if sim > best_score:
                    best_ans, best_score = ans, sim
            print(best_ans + '.')
            return

        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []

        # Usable sentences plus every simplified variant of each.
        extendList = []
        for raw in txtList:
            cleaned = self.preProcessText(raw)
            token_count = len(word_tokenize(cleaned))
            if token_count < 4 or token_count > 25:
                continue

            extendList.append(cleaned)
            parse_tree = self.qgPipeline.getParseTree(cleaned)
            clauses = self.qgPipeline.splitConj(parse_tree)
            extendList.extend(self.qgPipeline.simplify_sentence(clauses))

        for variant in extendList:
            for parsed in self.sNLP.parser_sents([variant]):
                # Drop the 1st, 2nd, ... matching constituent from a fresh
                # copy until a pass finds no constituent left to drop.
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    self.dropTime = 0
                    self.dropFragment(copy.deepcopy(parsed), qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1

        # NER tags an answer should contain, keyed by the question's first
        # word; other question words carry no such expectation.
        expected_tags = {
            'who': ('PERSON', 'ORGANIZATION'),
            'when': ('DATE', ),
            'where': ('LOCATION', 'CITY', 'ORGANIZATION',
                      'STATE_OR_PROVINCE', 'COUNTRY'),
        }.get(self.head)

        best_dis = 0
        best_ans = '_'
        for idx, words in enumerate(self.candidateSentence):
            score = fuzz.partial_ratio(self.qstSim, ' '.join(words))
            this_ans = ' '.join(self.candidateAnswer[idx])
            if self.qstSim is None:
                continue
            if score < best_dis:
                continue
            if score == best_dis:
                # Ties: adverbial/prepositional questions prefer the shorter
                # answer, noun-phrase questions the longer one.
                if (self.thisType in ('WHADVP', 'WHPP')
                        and len(this_ans) >= len(best_ans)):
                    continue
                if self.thisType == 'WHNP' and len(this_ans) <= len(best_ans):
                    continue

            if expected_tags is not None:
                ners = getExhaustiveNERs(this_ans)
                if not any(tag in ners[0] for tag in expected_tags):
                    # Wrong entity type: needs a 10-point lead to win, and
                    # even then is recorded with the penalty applied.
                    if score - best_dis < 10:
                        continue
                    score = score - 10

            best_dis = score
            best_ans = this_ans

        if best_ans == '_':
            print('I cannot answer that question: ' + qst)
        else:
            print(best_ans.capitalize() + '.')

    def edit_distance(self, s1, s2):
        if len(s1) < len(s2):
            return self.edit_distance(s2, s1)
        # len(s1) >= len(s2)
        if len(s2) == 0:
            return len(s1)
        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            c1 = c1.lower()
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                c2 = c2.lower()
                insertions = previous_row[
                    j +
                    1] + 1  # j+1 instead of j since previous_row and current_row are one character longer
                deletions = current_row[j] + 1  # than s2
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

        for i in range(len(self.candidateSentence)):
            nowSentence = self.candidateSentence[i]

            score = self.edit_distance(nowSentence, tokens)
            best_candi = ' '.join(nowSentence)
            this_ans = ' '.join(self.candidateAnswer[i])
            if (score < best_dis
                    or (score == best_dis and len(this_ans) < len(best_ans))):
                best_dis = score
                best_ans = this_ans
        return best_dis
コード例 #4
0
ファイル: QA_utils.py プロジェクト: jinpoon/11611_project
class QuestionGeneration:
    def __init__(self):
        """Create the parser wrapper and the set of 'be' verb forms."""
        self.sNLP = StanfordNLP()
        # Forms of "to be" that beWork() searches for.
        self.beVerbs = set(("am", "is", "are", "was", "were"))

    def auxilaryWord(self, sub, POS_tag):
        # TODO lowercase
        # TODO will may...
        # TODO plural...
        # Jerry and I
        if sub.lower() in ('i', 'they', 'you'):
            return 'do'
        if sub.lower() in ('he', 'she'):
            return 'does'

    def beWork(self, sentence):
        # pos = nltk.pos_tag(sentence)
        j = None
        for i in range(len(sentence) - 1):
            if sentence[i] in self.beVerbs:
                j = i
                break
        if j is not None:
            temp = sentence[j]
            sentence.pop(j)
            sentence.insert(0, temp)
            #print(sentence)
            return sentence

        return

    # def getNounandVerbOfSentence(self, sentence):

    def QG(self, text):

        dep_parse_Tree = self.sNLP.dependency_parse(text)
        dep_parse_Tree = dep_parse_Tree.__next__()
        Ques_list = []

        # Yes or No question

        be_question = self.beWork(text)
        if be_question is not None:
            be_question += '?'
            Ques_list.append(be_question)

        # WHO question for Subject

        # create NER tags
        ner_tags = dict(self.sNLP.ner(text))
        pos_tag = self.sNLP.pos(text)
        #print(ner_tags)

        # get triples list of the dependency tree
        triples_list = list(dep_parse_Tree.triples())
        #print(triples_list)
        ##### LOOP THRU DEPENDENCY TREE AND CREATE QUESTIONS
        auxWord = 'xxx'
        for this in triples_list:
            # print(this)
            temp_text = '?'

            # for the subject question
            if this[1] in ['nsubj', 'csubj', 'nsubjpass']:
                subject = None
                sub_pos = None
                # in order of preference
                if this[2][1] in ['NNP', 'NNPS', 'PRP']:
                    subject = this[2][0]
                    sub_pos = this[2][1]
                elif this[0][1] in ['NNP', 'NNPS']:
                    subject = this[0][0]
                    sub_pos = this[0][1]
                elif this[2][1] in ['NN', 'NNS']:
                    subject = this[2][0]
                    sub_pos = this[2][1]

                #print("sub", subject)
                if subject is not None:  # need to add sub_pos
                    auxWord = self.auxilaryWord(subject, sub_pos)

                    if ner_tags[subject] in ['PERSON', 'TITLE', 'MISC'
                                             ]:  # check if its a PERSON NER
                        temp_text = self.contructQ(triples_list, subject, text,
                                                   None)
                        temp_text = temp_text.replace(subject, "Who").replace(
                            " .", "?")  # create question

                        # some string manipulation to get the ?
                        if "?" not in temp_text:
                            temp_text = temp_text + "?"
                            # print(text.replace(subject, "Who").replace(" .", "?"))

                    if ner_tags[
                            subject] == 'ORGANIZATION':  # if the subject is ORG
                        temp_text = text.replace(subject,
                                                 "Which organization").replace(
                                                     " .", "?")

                    if ner_tags[subject] == 'CITY':  # if the subject is CITY
                        temp_text = text.replace(subject,
                                                 "Which city").replace(
                                                     " .", "?")

                    if ner_tags[
                            subject] == 'COUNTRY':  # if the subject is CITY
                        temp_text = text.replace(subject,
                                                 "Which country").replace(
                                                     " .", "?")

                    if this[2][1] in ['PRP']:  # if the subject is preposition
                        temp_text = text.replace(subject,
                                                 "Who").replace(" .", "?")

                    if ner_tags[subject] in [
                            'O', 'LOCATION'
                    ] and temp_text == '?':  # if the subject is Other
                        temp_text = self.contructQ(triples_list, subject, text,
                                                   None)
                        if sub_pos == 'PRP' and subject.lower() in [
                                'they', 'he', 'she'
                        ]:
                            temp_text = temp_text.replace(subject,
                                                          "Who").replace(
                                                              " .", "?")
                        else:
                            temp_text = temp_text.replace(subject,
                                                          "What").replace(
                                                              " .", "?")

            # for number, How many questions
            elif this[1] in ['nummod']:
                numPhrase = this[2][0] + ' ' + this[0][0]
                targetWord = this[2][0]
                if ner_tags[targetWord] in ('NUMBERS'):
                    temp_text = text.replace(numPhrase, "").replace(" .", "?")
                    temp_text = "How many " + this[0][0] + " " + (
                        auxWord
                        if auxWord is not None else "") + " " + temp_text

            # for possessive questions
            elif this[1] in ['nmod:poss']:
                if this[2][1] in ['NNP']:
                    # if this[2][0][-1] == 's':
                    #     poss_word = this[2][0]
                    # else:
                    poss_word = this[2][0]  #+ " 's"
                    temp_text = self.contructQ(triples_list, this[2][0], text,
                                               None)
                    temp_text = temp_text.replace(poss_word, "Whose").replace(
                        " .", "?").replace("'s", "").replace(" '", "")
                    if not temp_text.startswith("Whose"):
                        temp_text = temp_text.replace("Whose",
                                                      "whose").replace(
                                                          " '", "")

            # for prop questions
            elif this[1] in ('case'):
                subject = this[0][0]
                propPhrase = this[2][0] + ' ' + this[0][0]
                # print(propPhrase)
                if ner_tags[subject] in ['CITY']:  # where
                    temp_text = text.replace(propPhrase, "").replace(
                        " .", "?")  # create question
                    temp_text = "Where " + (auxWord if auxWord is not None else
                                            "") + " " + temp_text
                    # some string manipulation to get the ?
                if ner_tags[subject] in ['DATE']:  # when
                    temp_text = text.replace(propPhrase, "").replace(" .", "?")
                    # print(auxWord, temp_text)
                    temp_text = "When " + (auxWord if auxWord is not None else
                                           "") + " " + temp_text

            elif this[1] in ('iobj', 'dobj'):
                # code to be written for questions on direct and indirect Objects
                pass
            #### endif

            if "?" not in temp_text:
                temp_text = temp_text + "?"
            if temp_text != '?':
                # print(temp_text)
                Ques_list.append(temp_text)

        return (Ques_list)

    #### in case of the subject has modifiers or the Subject is a part of a long NP remove all the related modifiers of the subject with the help of dependency tree
    #### same to be replicated for Object as well
    def contructQ(self, list_triples, subject, text, object):

        if subject is not None:
            text = text[text.find(
                subject
            ):]  ## removing unnecessary determinants (a, the, An) by slicing off until the subject word
            # print(text)
            dict_of_words_removed = {
            }  # subject related word removal to construct a question
            for thisTriple in list_triples:  ## loop thru dependency tree
                if thisTriple[0][0] == subject or thisTriple[0][
                        0] in dict_of_words_removed:
                    if thisTriple[1] not in ['nsubj', 'csubj']:
                        if (thisTriple[2][0]).lower() not in [
                                'the', 'a', 'an'
                        ]:  # skipping determinants as they can be present in other places of the sentence as well
                            text = re.sub(
                                ' +',
                                ' ', text.replace(thisTriple[2][0], '')).strip(
                                )  # removing subject related words
                            dict_of_words_removed[thisTriple[2][
                                0]] = 0  # adding the removed word so that other words that are connected to this can also be removed

        return (text)