Code Example #1
File: QA_utils.py Project: jinpoon/11611_project
 def __init__(self, dataFile):
     self.sNLP = StanfordNLP()
     self.punc = {'.', '?', '!', '\n'}
     # dataFile already holds the raw text (it is not a path to open).
     self.textData = self.preProcessText(dataFile)
     self.sentence_list = []
     self.tokenizePara(self.textData)
Code Example #2
File: QA.py Project: jinpoon/11611_project
 def __init__(self):
     self.sNLP = StanfordNLP()
     self.dropType = {}
     self.typeNer = {}
     self.typePro = {}
     self.initQstType()
     self.candidateAnswer = []
     self.candidateSentence = []
     self.qgPipeline = QGPipeline()
     self.threshold = 90
Code Example #3
File: When_QG.py Project: jinpoon/11611_project
def When_module(sent, sent_features):
    # Build a "When ...?" question from a sentence that carries a temporal
    # entity. sent_features is accepted for interface parity with the
    # other modules but is not used in this function.
    question = []
    structures = []
    sNLP = StanfordNLP()

    parse = sNLP.parse(sent)

    # Collect candidate questions and temporal (phrase, score) pairs from
    # the constituency parse.
    when_parseTraversal(sent, parse, question, structures)

    if len(structures) > 0:
        # Use the phrase with the smallest score: remove it from the
        # sentence and build the question from the remainder.
        prev_min = float('inf')
        whenPhrase = ""
        for phrase, score in structures:
            if score < prev_min:
                whenPhrase = phrase
                prev_min = score
        thisQ = sent.replace(whenPhrase, "")
        dep_tree = next(sNLP.dependency_parse(thisQ))
        return construct_when(thisQ, list(dep_tree.triples()))

    # Otherwise fall back to the first candidate question, if any.
    for q in question:
        dep_tree = next(sNLP.dependency_parse(q))
        return construct_when(q, list(dep_tree.triples()))

    return None
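A minimal usage sketch for When_module, not part of the project's files: it assumes a CoreNLP server is running and that when_parseTraversal and construct_when from When_QG.py are available. The example sentence and the commented output are illustrative only.

# Hypothetical usage of When_module (not in the original file).
sentence = "The armistice was signed in 1918."
q = When_module(sentence, sent_features={})
if q is not None:
    print(q)  # expected shape: a question such as "When was the armistice signed?"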
Code Example #4
def Where_Which_module(sent, sent_features):
    # Build a "Where/Which ...?" question from a sentence that carries a
    # location-type entity. sent_features is accepted for interface
    # parity with the other modules but is not used in this function.
    question = []
    simple_ques = []
    sNLP = StanfordNLP()

    parse = sNLP.parse(sent)

    where_which_inFirstPP(sent, parse, simple_ques)
    # Only the first candidate is used: both branches return immediately.
    for flag, thisSent, nerSet, thisPP in simple_ques:
        dep_tree = next(sNLP.dependency_parse(thisSent))
        dep_tree_list = list(dep_tree.triples())
        if flag:
            # The preposition opening the PP becomes the question's case
            # word; pick the most specific place word the NER tags allow.
            case = thisPP.split(" ")[0]
            if "COUNTRY" in nerSet:
                loc_type = "country"
            elif "LOCATION" in nerSet:
                loc_type = "location"
            elif "CITY" in nerSet:
                loc_type = "city"
            else:
                loc_type = "place"
            return [construct_where_which(thisSent, dep_tree_list, case, loc_type)]
        else:
            where_which_parseTraversal(thisSent, dep_tree_list,
                                       sNLP.ner(thisSent), question)
            return question
Code Example #5
def categorizeQs(sents, sent_to_Q_dict):
    # Route a sentence to each question-generation module whose trigger it
    # matches, appending (sentence, question) pairs to sent_to_Q_dict.
    sent_features = {}
    sNLP = StanfordNLP()
    normal_ners = sNLP.ner(sents)
    normal_ner_set = {t[1] for t in normal_ners}

    # Binary (yes/no) questions: triggered by an auxiliary verb.
    if any(w in sents for w in aux_words):
        thisQues = Binary_QG.bin_question(sents)
        for p_b in thisQues:
            if p_b is not None:
                sent_to_Q_dict["Binary"].append((sents, p_b))

    # Why questions: triggered by a causal keyword from why_keys.
    if any(w in sents for w in why_keys):
        thisQues = Why_QG.why_q(sents)
        if thisQues is not None:
            sent_to_Q_dict["Why"].append((sents, thisQues))

    # What/Who questions are always attempted.
    thisQues = What_Who_QG.What_Who_module(sents)
    for p_ in thisQues:
        if p_ is not None:
            sent_to_Q_dict["What_Who"].append((sents, p_))

    # Where/Which questions: triggered by a location-type entity.
    if normal_ner_set & {'LOCATION', 'COUNTRY', 'CITY'}:
        thisQ = Where_Which_QG.Where_Which_module(sents, sent_features)
        # Where_Which_module returns None when it finds no candidate.
        for p in thisQ or []:
            if p is not None:
                sent_to_Q_dict["Where_Which"].append((sents, p))

    # When questions: triggered by a DATE or TIME entity.
    if normal_ner_set & {'DATE', 'TIME'}:
        thisQ = When_QG.When_module(sents, sent_features)
        if thisQ is not None:
            sent_to_Q_dict["When"].append((sents, thisQ))
Code Example #6
File: Why_QG.py Project: jinpoon/11611_project
def why_q(sents):
    # Build a "Why ...?" question from a sentence that contains one of the
    # causal keywords in why_keys.

    sNLP = StanfordNLP()
    parse = sNLP.parse(sents)
    # Strip modifiers before building the question.
    sents = What_Who_QG.remove_modifiers(parse)

    tokens = word_tokenize(sents)
    pos_tags = nltk.pos_tag(tokens)
    # Lower-case the first token unless it is a proper noun.
    if pos_tags[0][1] not in ('NNP', 'NNPS'):
        pos_tags[0] = (pos_tags[0][0].lower(), pos_tags[0][1])
    q_list = copy.deepcopy(pos_tags)

    # Front the verb (or insert the right auxiliary) and prepend "Why".
    for i in range(len(pos_tags)):
        if pos_tags[i][1] == 'VBD':
            # Past-tense main verb: "X arrived" -> "Why did X arrive".
            q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), 'VBD')
            q_list.insert(0, ('Why did', 0))
            break
        elif pos_tags[i][1] == 'VBZ':
            if pos_tags[i][0] in aux_words:
                # Auxiliary: "X is Y" -> "Why is X Y".
                q_list.insert(0, q_list.pop(i))
                q_list.insert(0, ("Why", 0))
            else:
                # Third-person singular: "X runs" -> "Why does X run".
                q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), "VBZ")
                if q_list[i][0] == "do":
                    q_list.pop(i)
                q_list.insert(0, ("Why does", 0))
            break
        elif pos_tags[i][1] == 'VBP':
            # Non-third-person present: front the verb.
            q_list.insert(0, q_list.pop(i))
            q_list.insert(0, ("Why", 0))
            break

    # Re-capitalize the new first word.
    q_list[0] = (q_list[0][0][:1].upper() + q_list[0][0][1:], 0)

    question = ' '.join([w for w, _ in q_list])
    # Cut the question off just before the causal clause.
    ind = -1
    for k in why_keys:
        if question.find(k) != -1:
            ind = question.find(k)
            break
    if ind != -1:
        question = question[:ind - 1]
    question = question + "?"

    return question if question != "" else None
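A hypothetical call, assuming "because" is among the why_keys triggers; the exact output depends on remove_modifiers and the POS tagger:

# Hypothetical usage of why_q (not in the original file).
q = why_q("He is tired because he works at night.")
print(q)  # expected shape: "Why is he tired?"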
Code Example #7
File: QA_utils.py Project: jinpoon/11611_project
 def __init__(self):
     self.sNLP = StanfordNLP()
     self.sent_simpl = Simplification()
     self.QG = QuestionGeneration()
Code Example #8
File: QA_utils.py Project: jinpoon/11611_project
 def __init__(self):
     self.sNLP = StanfordNLP()
Code Example #9
File: QA_utils.py Project: jinpoon/11611_project
 def __init__(self):
     self.sNLP = StanfordNLP()
     self.beVerbs = {"am", "is", "are", "was", "were"}
Code Example #10
def getNerSet(phrase):
    # Return the set of NER tags CoreNLP assigns to the tokens of phrase.
    sNLP = StanfordNLP()
    return {t[1] for t in sNLP.ner(phrase)}
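A quick hypothetical call; the tag names follow CoreNLP's NER conventions, with 'O' marking untagged tokens:

# Hypothetical usage of getNerSet (not in the original file).
tags = getNerSet("Napoleon was born in 1769 in Corsica.")
print(tags)  # typically contains tags such as 'PERSON', 'DATE', and 'O'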
Code Example #11
def bin_question(sents):
    # Build a binary (yes/no) question by fronting an auxiliary verb, or
    # by inserting Did/Does/Do when the main verb carries the tense.

    sNLP = StanfordNLP()
    parse = sNLP.parse(sents)
    # Strip modifiers before building the question.
    sents = What_Who_QG.remove_modifiers(parse)

    aux_words = {'are', 'was', 'were', 'is', 'have', 'has'}
    question_set = []

    tokens = word_tokenize(sents)
    pos_tags = nltk.pos_tag(tokens)
    # Lower-case the first token unless it is a proper noun.
    if pos_tags[0][1] not in ('NNP', 'NNPS'):
        pos_tags[0] = (pos_tags[0][0].lower(), pos_tags[0][1])
    q_list = copy.deepcopy(pos_tags)

    for i in range(len(pos_tags)):
        if pos_tags[i][0] in aux_words:
            # Front the auxiliary: "X is Y" -> "is X Y".
            q_list.insert(0, q_list.pop(i))
            break
        elif pos_tags[i][1] == 'VBD':
            # Past tense: "X arrived" -> "Did X arrive".
            q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), 'VBD')
            q_list.insert(0, ('Did', 0))
            break
        elif pos_tags[i][1] == 'VBZ':
            # Third-person singular: "X runs" -> "Does X run".
            q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), "VBZ")
            q_list.insert(0, ("Does", 0))
            break
        elif pos_tags[i][1] == 'VBP':
            # Non-third-person present: "X run" -> "Do X run".
            q_list[i] = (wnl.lemmatize(pos_tags[i][0], pos='v'), "VBP")
            q_list.insert(0, ("Do", 0))
            break

    # Keep the result only if a question word actually ended up in front.
    if q_list[0][0].lower() in {'are', 'was', 'were', 'is', 'have', 'has',
                                'did', 'do', 'does'}:
        q_list[0] = (q_list[0][0][:1].upper() + q_list[0][0][1:], 0)
        question = ' '.join([w for w, _ in q_list])
        # Drop the trailing " ." left by tokenization and add a "?".
        question = question[:-2] + "?"
        question_set.append(question)

    return question_set
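A hypothetical call; the commented result shows the expected shape, though the exact string depends on remove_modifiers and the tagger:

# Hypothetical usage of bin_question (not in the original file).
qs = bin_question("The library is open on weekends.")
print(qs)  # expected shape: ["Is the library open on weekends?"]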
Code Example #12
File: What_Who_QG.py Project: jinpoon/11611_project
sNLP = StanfordNLP()


def getDecapitalized(sentence):
    # Lower-case the first word unless NER says it is a named entity.
    tokens = sNLP.word_tokenize(sentence)
    first = tokens[0]
    thisNER = sNLP.ner(sentence)
    if thisNER[0][1] not in [
            'PERSON', 'LOCATION', 'ORGANIZATION', 'CITY', 'NATIONALITY',
            'COUNTRY', 'TIME'
    ]:
        first = first.lower()
    return first + " " + " ".join(tokens[1:])
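Every snippet above depends on a StanfordNLP wrapper class that the excerpts never show. The calls used (parse(...).pretty_print(), dependency_parse(...).__next__().triples(), ner, pos, word_tokenize) line up with NLTK's CoreNLP clients, so a minimal sketch could look like the following; the project's real definition may differ, and a CoreNLP server must be running (e.g. on localhost:9000).

# A minimal sketch of the assumed StanfordNLP wrapper (not in the
# original files).  Method names and return shapes are chosen to match
# the call sites in the snippets above.
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser


class StanfordNLP:
    def __init__(self, url='http://localhost:9000'):
        self._parser = CoreNLPParser(url=url)
        self._dep_parser = CoreNLPDependencyParser(url=url)
        self._pos_tagger = CoreNLPParser(url=url, tagtype='pos')
        self._ner_tagger = CoreNLPParser(url=url, tagtype='ner')

    def parse(self, sentence):
        # Constituency parse; an nltk.Tree, so parse(...).pretty_print() works.
        return next(self._parser.raw_parse(sentence))

    def dependency_parse(self, sentence):
        # Iterator of DependencyGraph objects; callers above advance it
        # with __next__() and then read .triples().
        return self._dep_parser.raw_parse(sentence)

    def pos(self, sentence):
        # List of (token, POS tag) pairs.
        return self._pos_tagger.tag(self.word_tokenize(sentence))

    def ner(self, sentence):
        # List of (token, NER tag) pairs; 'O' marks untagged tokens.
        return self._ner_tagger.tag(self.word_tokenize(sentence))

    def word_tokenize(self, sentence):
        return list(self._parser.tokenize(sentence))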