def parse_questions():
    print "Parsing Questions..."
    parsed_questions = {}
    with open(DIR + '/questions.txt', 'r') as f:
        data = f.read()
    questions = re.split(r'[\s]*</top>[\s]*', data)
    if len(questions[-1].strip()) == 0:
        questions.pop()
    qc = QuestionClassifier.QuestionClassifier()
    for question in questions:
        question_number = int(re.search(r"<num>[\s]*Number:[\s]*([0-9]+)", question).group(1))
        question = re.search(r"<desc>[\s]*Description:[\s]*([a-zA-Z0-9\-\?\'\. ]+)", question).group(1)
        question_words = nltk.word_tokenize(question)
        question_pos = nltk.pos_tag(question_words)
        question_nes = nltk.ne_chunk(question_pos)
        question_tree = Chunker.chunker.parse(question_pos)
        question_classification = qc.classify(question)
        qwords, nouns, nes = [], [], []
        for part in question_nes:
            try:
                # Named-entity chunks are Trees with a .node label.
                nes.append((part.node, part.leaves()[0][0]))
            except AttributeError:
                # Plain tokens are (word, POS) tuples.
                if part[1] == 'WP' or part[1] == 'WRB':
                    qwords.append(part[0])
                elif part[1] == 'NN' or part[1] == 'NNP':
                    nouns.append(part[0])
        # print qwords, nouns, nes
        # print question_pos
        parsed_questions[question_number] = {
            "question": question,
            "pos": question_pos,
            "ne": question_nes,
            "parse_tree": question_tree,
            "question_classification": question_classification,
            "question_words": qwords,
            "nouns": nouns,
            "ne_words": nes
        }
    with open(DIR + '/parsed_questions.txt', 'wb') as f:
        pickle.dump(parsed_questions, f)
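# --- Hedged usage sketch (not part of the original pipeline) ---
# parse_questions() pickles its result to DIR + '/parsed_questions.txt'; a later stage
# can reload that dictionary as shown below. load_parsed_questions() is an illustrative
# helper name; DIR and pickle are assumed to be the same module-level names used above.
def load_parsed_questions():
    with open(DIR + '/parsed_questions.txt', 'rb') as f:
        return pickle.load(f)

# Example: list each question number with its classified type.
# for qnum, qdata in sorted(load_parsed_questions().items()):
#     print qnum, qdata["question_classification"], qdata["question"]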
def similarityScore(sentence_1, sentence_2, qtype):
    words_1 = [i.lower() for i in nltk.word_tokenize(sentence_1) if i not in stop_words]
    words_2 = [i.lower() for i in nltk.word_tokenize(sentence_2) if i not in stop_words]
    nums = re.compile(r"[+-]?\d+(?:[\,\.]\d+)?(?:[eE][+-]?\d+)?")
    score = 0.0
    flag = False
    specialFlag = False

    ################## category specific detailing #####################################
    if "NUM:money".lower() in qtype.lower() or "NUM:cost".lower() in qtype.lower() or "cost" in sentence_1 or "money" in sentence_1:
        money = re.compile('|'.join([
            r'^\$?\ ?(\d*\.\d{1,2})$',  # e.g., $ .50, $ 1.50
            r'^\$?(\d*\.\d{1,2})$',     # e.g., $.50, .50, $1.50, $.5, .5
            r'^\$?(\d+)$',              # e.g., $500, $5, 500, 5
            r'^\$(\d+\.?)$',            # e.g., $5.
        ]))
        once = False
        for w2 in words_2:
            w2 = w2.split("-", 1)[0]
            w2 = re.sub('[ ]', '', w2)
            if once == False:
                # Check the whole sentence for a money/number expression only once.
                once = True
                val = money.search(sentence_2)
                val1 = nums.search(sentence_2)
                if val != None:
                    # print "NUM:money", val.group(0),
                    score += 0.25
                    flag = True
                    specialFlag = True
                    break
                elif val1 != None:
                    score += 0.1
                    flag = True
                    specialFlag = True
                    break
            if w2.lower() in currency:
                score += 0.25
                flag = True
                specialFlag = True
                break
            if w2.lower() in numbers:
                score += 0.15
                flag = True
                specialFlag = True
                break
    elif "NUM".lower() in qtype.lower() and "NUM:other".lower() not in qtype.lower() and "NUM:date".lower() not in qtype.lower():
        for w2 in words_2:
            w2 = w2.split("-", 1)[0]
            w2 = re.sub('[,!.]', '', w2)
            val = nums.search(w2)
            if val != None:
                # if "NUM:count".lower() in qtype.lower() and val.group(0).isdigit() and int(val.group(0)) > 1900 and int(val.group(0)) < 1999:
                #     # if the value is a year in the range 1900-1999 then it is a year..
                #     continue
                # print "NUM:", val.group(0),
                score += 0.2
                flag = True
                specialFlag = True
                break
            # elif "NUM:period".lower() in qtype.lower():
            #     if w2 in date_words or w2 in numbers:
            #         score += 0.2
            #         flag = True
            #         break
    elif "NUM:date".lower() in qtype.lower():
        for w2 in words_2:
            if w2 in date_words:
                score += 0.25
                flag = True
                specialFlag = True
                break
            elif w2.isdigit():
                try:
                    # Treat four-digit values in a plausible range as years.
                    if int(w2) > 1600 and int(w2) < 2100:
                        score += 0.25
                        specialFlag = True
                except ValueError:
                    pass

    eliminate_flag = False
    eliminate_location = dict()
    if "LOC".lower() in qtype.lower():
        # Locations already mentioned in the question should not count as answers.
        eliminate_location = extract_entities(sentence_1)
        eliminate_location = {k: v for k, v in eliminate_location.iteritems() if v == "GPE" or v == "LOCATION"}
    if "LOC".lower() in qtype.lower():
        entities = extract_entities(sentence_2)
        eliminate_flag = True
        for w2 in words_2:
            if w2.lower() in entities.keys():
                if (entities[w2.lower()] == "LOCATION" or entities[w2.lower()] == "GPE") and w2.lower() not in eliminate_location:
                    flag = True
                    if "LOC:other".lower() in qtype.lower():
                        score += 0.1
                    else:
                        score += 0.3
                    break

    ####################
    # If there is a NAME's (possessive) relation, the NAME itself should not be
    # counted for the similarity measure.
    # to do
    ####################
    eliminate_entities = dict()
    if "'s" in sentence_1.lower() or (sentence_1.lower().strip(" ").startswith("who") and ("HUM:ind".lower() in qtype.lower() or "HUM:desc".lower() in qtype.lower())):
        eliminate_entities = extract_entities(sentence_1)
        eliminate_entities = {k: v for k, v in eliminate_entities.iteritems() if v == "PERSON"}
        # print eliminate_entities
    if sentence_1.lower().strip(" ").startswith("who") and ("HUM:ind".lower() in qtype.lower() or "HUM:desc".lower() in qtype.lower()):
        entities = extract_entities(sentence_2)
        eliminate_flag = True
        for w2 in words_2:
            if w2.lower() in entities.keys():
                if entities[w2.lower()] == "PERSON" and w2.lower() not in eliminate_entities.keys():
                    # print w2.lower()
                    flag = True
                    if len(eliminate_entities) > 0:
                        specialFlag = True
                    score += 0.3
                    break

    if "DESC:reason".lower() in qtype.lower():
        reasons = re.compile("to (see|do|visit)")
        if reasons.search(sentence_2) != None:
            score += 0.25
            specialFlag = True
            flag = True
        for w2 in words_2:
            if w2 in reason_words:
                score += 0.2
                specialFlag = True
                flag = True
                break

    # Collect WordNet synsets and hypernyms for both sentences.
    w1_synsets = []
    w1_hypersets = []
    w2_synsets = []
    w2_hypersets = []
    for w2 in words_2:
        # if w2 not in eliminate_entities.keys():
        w2_synsets.extend(wn.synsets(w2))
    for w1 in words_1:
        # if w1 not in eliminate_entities.keys():
        w1_synsets.extend(wn.synsets(w1))
    for ss in w1_synsets:
        w1_hypersets.extend(ss.hypernyms())
    for ss in w2_synsets:
        w2_hypersets.extend(ss.hypernyms())
    w1_synsets = set([w1.name().split(".")[0] for w1 in w1_synsets])
    w2_synsets = set([w2.name().split(".")[0] for w2 in w2_synsets])
    w1_hypersets = set([w1.name().split(".")[0] for w1 in w1_hypersets if w1.name().split(".")[0] not in w1_synsets])
    w2_hypersets = set([w2.name().split(".")[0] for w2 in w2_hypersets if w2.name().split(".")[0] not in w2_synsets])
    # w2_hypersets = list(w2_hypersets).extend(w2_synsets)
    # print w1_synsets
    # print w1_hypersets
    # print w2_synsets
    # print w2_hypersets

    # for w1 in words_1:
    #     for w2 in words_2:
    #         if w1 == w2 and w1 not in ["!", ",", ".", "-", "(", ")", "\\", "/"]:
    #             # print w1, w2,
    #             # print sentence_2
    #             flag = True
    #             if nums.search(w1) != None:
    #                 # print w1
    #                 score += 3.0 / ((len(words_2) + 1) * (len(words_1) + 1))
    #             else:
    #                 score += 0.08 / ((len(words_2) + 1) * (len(words_1) + 1))
    #             if specialFlag == True:
    #                 break

    match = nonEmptyIntersectionNumber(w1_synsets, w2_synsets)
    if match > 0:
        score += (3.0 * match) / ((len(w1_synsets) + 1) * (len(w2_synsets) + 1))
        flag = True
    # for w1ss in w1_synsets:
    #     for w2ss in w2_synsets:
    #         if w1ss == w2ss:
    #             # print w1ss, w2ss,
    #             # print sentence_2
    #             flag = True
    #             score += 2.0 / ((len(w1_synsets) + 1) * (len(w2_synsets) + 1))
    #             if specialFlag == True:
    #                 break
    #     # if specialFlag == True:
    #     #     break

    match = nonEmptyIntersection(w1_hypersets, w2_hypersets)
    if match > 0:
        score += (1.5 * match) / ((len(w1_hypersets) + 1) * (len(w2_hypersets) + 1))
        flag = True
    # for w1ss in w1_hypersets:
    #     for w2ss in w2_hypersets:
    #         if w1ss == w2ss:
    #             # print w1ss, w2ss,
    #             # print sentence_2
    #             flag = True
    #             score += 0.5 / ((len(w1_hypersets) + 1) * (len(w2_hypersets) + 1))
    #             if specialFlag == True:
    #                 break
    #     if specialFlag == True:
    #         break

    ############ search for question related synsets in sentence ##################################
    match = nonEmptyIntersectionNumber(extract_keywords(sentence_1), extract_keywords(sentence_2))
    if match > 0:
        score += 0.08 * match
        flag = True

    qtype_synsets = QuestionClassifier.liroth_to_wordnet(qtype)
    if qtype_synsets != None:
        qtype_synsets_names = set([q.name().split(".")[0] for q in qtype_synsets])
        # print qtype_synsets_names
        if nonEmptyIntersection(w2_hypersets.union(w2_synsets), qtype_synsets_names):
            score += 4.0 / ((len(qtype_synsets_names) + 1) * (len(w2_hypersets) + 1))
            flag = True
        # for w2ss in w2_hypersets:
        #     for q in qtype_synsets_names:
        #         if w2ss == q:
        #             # print w2ss, q,
        #             flag = True
        #             score += 3.0 / ((len(qtype_synsets_names) + 1) * (len(w2_hypersets) + 1))

    if flag == False:
        # No evidence of a match at all: penalize the sentence.
        score -= 0.2
    # if "what" in sentence_1:
    #     score = score / 2.0
    # if "what" in sentence_1.lower() and ("known as" in sentence_2.lower() or "called as" in sentence_2.lower() or "named as" in sentence_2.lower()):
    #     score = score * 2.0 + 0.2
    if specialFlag == True:
        # Boost sentences that matched a category-specific pattern.
        score *= 1.1
    # if score > 0:
    #     print " --- ", score, sentence_2,
    return score, specialFlag
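# --- Hedged usage sketch (assumed caller, not from the original file) ---
# similarityScore(question, sentence, qtype) returns (score, specialFlag), so a simple
# answer ranker can pick the highest-scoring candidate sentence. best_answer_sentence
# and candidate_sentences are illustrative names only.
def best_answer_sentence(question, candidate_sentences, qtype):
    best_sentence, best_score = None, float("-inf")
    for sentence in candidate_sentences:
        sent_score, _special = similarityScore(question, sentence, qtype)
        if sent_score > best_score:
            best_sentence, best_score = sentence, sent_score
    return best_sentence, best_score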
parser = argparse.ArgumentParser(description="QUESTION ANSWER SYSTEM", epilog="", prog="")
parser.add_argument("-i", "--input", required=True, help="Input file containing list of story files")
parser.add_argument("-t", "--temp", default="story.txt", help="Temp file to operate on a story")
parser.add_argument(
    "-o", "--output", type=str, default="myresponse.txt",
    help="Output file where answers will be stored"
)
parser.add_argument("-c", "--coref", type=int, default=0, help="For coref resolution use -c 1")
args = vars(parser.parse_args())

input_file = args["input"]
out_filename = args["output"]
coref_flag = args["coref"]

out = open(out_filename, "w")
out.close()

q_classifier = QuestionClassifier.get_classifier()
arkref_temp_path = args["temp"]

with open(input_file, "r") as inputListFile:
    input_dir = inputListFile.readline().strip("\n")
    storyFileList = []
    for line in inputListFile:
        storyFileList.append(line.strip("\n"))

for file in storyFileList:
    file = input_dir + "/" + file + ".story"
    storyid, text = get_metadata(file)
    with open(arkref_temp_path, "w") as tempfile:
        tempfile.write(text)
    if coref_flag != 0:
        with open(arkref_temp_path, "w") as tempfile: