# Example #1
# 0
def parse_questions():
  print "Parsing Questions..."
  parsed_questions = {}
  with open(DIR+'/questions.txt', 'r') as f:
    data = f.read()
    questions = re.split('[\s]*</top>[\s]*', data)
    if len(questions[-1].strip()) == 0: questions.pop()
    qc = QuestionClassifier.QuestionClassifier()
    for question in questions:
      question_number = int(re.search(r"<num>[\s]*Number:[\s]*([0-9]+)", question).group(1))
      question = re.search(r"<desc>[\s]*Description:[\s]*([a-zA-Z0-9\-\?\'\. ]+)", question).group(1)
      question_words = nltk.word_tokenize(question)
      question_pos = nltk.pos_tag(question_words)
      question_nes = nltk.ne_chunk(question_pos)
      question_tree = Chunker.chunker.parse(question_pos)
      question_classification = qc.classify(question)
      qwords, nouns, nes = [], [], []
      for part in question_nes:
        try:
          nes.append((part.node, part.leaves()[0][0]))
        except:
          if part[1] == 'WP' or part[1] == 'WRB':
            qwords.append(part[0])
          elif part[1] == 'NN' or part[1] == 'NNP':
            nouns.append(part[0])
      # print qwords, nouns, nes
      # print question_pos
      parsed_questions[question_number] = { "question": question, "pos": question_pos, "ne": question_nes, "parse_tree": question_tree, "question_classification": question_classification, "question_words": qwords, "nouns": nouns, "ne_words": nes }
  with open(DIR+'/parsed_questions.txt', 'wb') as f:
    pickle.dump(parsed_questions, f)
def similarityScore(sentence_1, sentence_2, qtype):
    """Score how likely sentence_2 answers the question sentence_1.

    qtype is a Li&Roth-style question-class label (e.g. "NUM:date",
    "HUM:ind", "LOC:other").  The score combines question-type-specific
    cues (money / number / date / location / person / reason detection)
    with WordNet synset and hypernym overlap between the two sentences.

    Returns:
        (score, specialFlag): score is a float (may go negative when no
        evidence at all is found); specialFlag is True when a strong
        type-specific cue matched, so callers can prefer such sentences.
    """
    words_1 = [i.lower() for i in nltk.word_tokenize(sentence_1) if i not in stop_words ]
    words_2 = [i.lower() for i in nltk.word_tokenize(sentence_2) if i not in stop_words ]

    # Generic numeric literal: optional sign, decimal/thousands
    # separator, optional scientific notation.
    nums = re.compile(r"[+-]?\d+(?:[\,\.]\d+)?(?:[eE][+-]?\d+)?")
    score = 0.0
    flag = False          # any evidence found at all
    specialFlag = False   # strong, question-type-specific evidence found

################## category specific detailing #####################################
    if "NUM:money".lower() in qtype.lower() or "NUM:cost".lower() in qtype.lower() or "cost" in sentence_1 or "money" in sentence_1:
        # Dollar amounts such as $.50, $1.50, $500, $5.
        money = re.compile('|'.join([
              r'^\$?\ ?(\d*\.\d{1,2})$',  # e.g., $.50, .50, $1.50, $.5, .5
              r'^\$?(\d*\.\d{1,2})$',  # e.g., $.50, .50, $1.50, $.5, .5
              r'^\$?(\d+)$',           # e.g., $500, $5, 500, 5
              r'^\$(\d+\.?)$',         # e.g., $5.
            ]))
        once = False
        for w2 in words_2:
            w2 = w2.split("-", 1)[0]
            w2 = re.sub('[ ]', '', w2)
            if once == False:
                # On the first token only, scan the whole sentence for a
                # money expression (strong cue) or any number (weak cue).
                once = True
                val = money.search(sentence_2)
                val1 = nums.search(sentence_2)
                if val != None:
                    score += 0.25
                    flag = True
                    specialFlag = True
                    break
                elif val1 != None:
                    score += 0.1
                    flag = True
                    specialFlag = True
                    break
            # Fall back to currency words and spelled-out numbers.
            if w2.lower() in currency:
                score += 0.25
                flag = True
                specialFlag = True
                break
            if w2.lower() in numbers:
                score += 0.15
                flag = True
                specialFlag = True
                break
    elif "NUM".lower() in qtype.lower() and "NUM:other".lower() not in qtype.lower() and "NUM:date".lower() not in qtype.lower():
        # Other numeric questions (count, period, ...): reward any token
        # containing a numeric literal.
        for w2 in words_2:
            w2 = w2.split("-", 1)[0]
            w2 = re.sub('[,!.]', '', w2)
            val = nums.search(w2)
            if val != None:
                score += 0.2
                flag = True
                specialFlag = True
                break

    elif "NUM:date".lower() in qtype.lower():
        for w2 in words_2:
            if w2 in date_words:
                # BUGFIX: was "score=+0.25", which overwrote the running
                # score with 0.25 (discarding any year bonus added by the
                # elif branch below in an earlier iteration) instead of
                # incrementing it.
                score += 0.25
                flag = True
                specialFlag = True
                break
            elif w2.isdigit():
                # Bare number in a plausible year range counts as a date.
                try:
                    if int(w2) > 1600 and int(w2) < 2100:
                        score += 0.25
                        specialFlag = True
                except ValueError:
                    # unicode digits (e.g. superscripts) pass isdigit()
                    # but are rejected by int()
                    pass

    eliminate_flag = False
    eliminate_location = dict()
    if "LOC".lower() in qtype.lower():
        # Locations already named in the question must not be counted as
        # evidence in the candidate answer.
        eliminate_location = extract_entities(sentence_1)
        eliminate_location = {k: v for k, v in eliminate_location.iteritems() if v == "GPE" or v == "LOCATION"}

    if "LOC".lower() in qtype.lower():
        entities = extract_entities(sentence_2)
        eliminate_flag = True
        for w2 in words_2:
            if w2.lower() in entities.keys():
                if (entities[w2.lower()] == "LOCATION" or entities[w2.lower()] == "GPE") and w2.lower() not in eliminate_location:
                    flag = True
                    if "LOC:other".lower() in qtype.lower():
                        score += 0.1
                    else:
                        score += 0.3
                    break

####################
# Persons possessed in the question ("NAME's ...") or named in a
# who-question are excluded from person evidence in the answer.
####################
    eliminate_entities = dict()
    if "'s" in sentence_1.lower() or (sentence_1.lower().strip(" ").startswith("who") and ("HUM:ind".lower() in qtype.lower() or "HUM:desc".lower() in qtype.lower())):
        eliminate_entities = extract_entities(sentence_1)
        eliminate_entities = {k: v for k, v in eliminate_entities.iteritems() if v == "PERSON"}

    if sentence_1.lower().strip(" ").startswith("who") and ("HUM:ind".lower() in qtype.lower() or "HUM:desc".lower() in qtype.lower()):
        entities = extract_entities(sentence_2)
        eliminate_flag = True
        for w2 in words_2:
            if w2.lower() in entities.keys():
                if entities[w2.lower()] == "PERSON" and w2.lower() not in eliminate_entities.keys():
                    flag = True
                    if len(eliminate_entities) > 0:
                        specialFlag = True
                    score += 0.3
                    break

    if "DESC:reason".lower() in qtype.lower():
        # Purpose constructions and explicit reason words.
        reasons = re.compile("to (see|do|visit)")
        if reasons.search(sentence_2) != None:
                score += 0.25
                specialFlag = True
                flag = True
        for w2 in words_2:
            if w2 in reason_words:
                score += 0.2
                specialFlag = True
                flag = True
                break

    # WordNet overlap: collect synsets and their hypernyms for both
    # sentences, reduced to lemma-name stems (text before the first ".").
    w1_synsets = []
    w1_hypersets = []
    w2_synsets = []
    w2_hypersets = []

    for w2 in words_2:
        w2_synsets.extend(wn.synsets(w2))
    for w1 in words_1:
        w1_synsets.extend(wn.synsets(w1))

    for ss in w1_synsets:
        w1_hypersets.extend(ss.hypernyms())
    for ss in w2_synsets:
        w2_hypersets.extend(ss.hypernyms())

    w1_synsets = set([ w1.name().split(".")[0] for w1 in w1_synsets])
    w2_synsets = set([ w2.name().split(".")[0] for w2 in w2_synsets])
    # Hypernym sets exclude names already present as direct synsets so
    # the two overlap terms below do not double-count.
    w1_hypersets = set([ w1.name().split(".")[0] for w1 in w1_hypersets if w1.name().split(".")[0] not in w1_synsets])
    w2_hypersets = set([ w2.name().split(".")[0] for w2 in w2_hypersets if w2.name().split(".")[0] not in w2_synsets])

    # Direct synset overlap, normalised by both set sizes.
    match = nonEmptyIntersectionNumber(w1_synsets, w2_synsets)
    if match > 0:
        score += (3.0*match)/((len(w1_synsets)+1) * (len(w2_synsets)+1))
        flag = True

    # Weaker signal: hypernym overlap.
    match = nonEmptyIntersection(w1_hypersets, w2_hypersets)
    if match > 0:
        score += (1.5 * match) /((len(w1_hypersets)+1) * (len(w2_hypersets)+1))
        flag = True

############ search for question related synsets in sentence ##################################
    match = nonEmptyIntersectionNumber(extract_keywords(sentence_1), extract_keywords(sentence_2))
    if match > 0:
        score += 0.08 * match
        flag = True

    # Reward answer words whose synsets/hypernyms match the WordNet
    # anchors for the expected answer type.
    qtype_synsets = QuestionClassifier.liroth_to_wordnet(qtype)
    if qtype_synsets != None:
        qtype_synsets_names = set([ q.name().split(".")[0] for q in qtype_synsets ])
        if nonEmptyIntersection(w2_hypersets.union(w2_synsets), qtype_synsets_names):
            score += 4.0/((len(qtype_synsets_names)+1) * (len(w2_hypersets)+1))
            flag = True

    if flag == False:
        # No evidence of any kind: penalise the sentence.
        score -= 0.2
    if specialFlag == True:
        # Mild boost when a strong type-specific cue was present.
        score *= 1.1
    return score, specialFlag
# Example #3
# 0
    parser = argparse.ArgumentParser(description="QUESTION ANSWER SYSTEM", epilog="", prog="")
    parser.add_argument("-i", "--input", required=True, help="Input file containing list of story files")
    parser.add_argument("-t", "--temp", default="story.txt", help="temp file to operate on story")
    parser.add_argument(
        "-o", "--output", type=str, default="myresponse.txt", help=" Output file where answers will be stored"
    )
    parser.add_argument("-c", "--coref", type=int, default=0, help=" For Coref resolution use -c 1")
    args = vars(parser.parse_args())
    input_file = args["input"]
    out_filename = args["output"]
    coref_flag = args["coref"]

    out = open(out_filename, "w")
    out.close()

    q_classifier = QuestionClassifier.get_classifier()
    arkref_temp_path = args["temp"]
    with open(input_file, "r") as inputListFile:
        input_dir = inputListFile.readline().strip("\n")
        storyFileList = []
        for line in inputListFile:
            storyFileList.append(line.strip("\n"))

    for file in storyFileList:
        file = input_dir + "/" + file + ".story"
        storyid, text = get_metadata(file)
        with open(arkref_temp_path, "w") as tempfile:
            tempfile.write(text)

        if coref_flag != 0:
            with open(arkref_temp_path, "w") as tempfile: