def answering_who(cleansedQuestion, stop_words_free_question, complete_sentence_list,
                  sentence_list, sent_person_list, sent_prof_list):
    """Answer a "who" question from a list of story sentences.

    Every sentence is scored with a set of hand-written rules (stemmed word
    overlap with the question, presence of person/profession entities, the
    word "name", verbs shared with the question, professions mentioned in
    the question). The first sentence with the highest score is selected and
    the answer is extracted from it: person names not already present in the
    question, else profession names, else noun phrases, else the sentence.

    Parameters
    ----------
    cleansedQuestion : str
        The question text; '?' is stripped before tagging.
    stop_words_free_question : str
        Unused; kept for interface compatibility.
    complete_sentence_list : list of str
        Full candidate sentences.
    sentence_list : list of str
        Sentences passed to ``WM.stemWordMatch``, index-aligned with
        ``complete_sentence_list`` (presumably the stop-word-free forms --
        confirm with the caller).
    sent_person_list, sent_prof_list : list of list
        Per-sentence person / profession entities, index-aligned with
        ``complete_sentence_list``.

    Returns
    -------
    str
        The extracted answer, or '' when there are no sentences to rank.
    """
    # Nothing to rank: max() over an empty score list would raise ValueError.
    if not complete_sentence_list:
        return ''

    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')
    stanford_stop_words_list = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
                                'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
                                'to', 'was', 'were', 'will', 'with']

    temp_q = cleansedQuestion.replace('?', '')

    # One lemmatizer for the whole call (it used to be rebuilt per sentence).
    lmtzr = WordNetLemmatizer()

    # Lemmas of the non-stop-word verbs that occur in the question.
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # Only the person and profession entities of the question are used below.
    (q_person_list, _q_org, _q_loc, _q_month, _q_time, _q_money,
     _q_percent, q_prof_list) = NER.named_entity_recognition(temp_q)

    # ------------------------------ scoring ------------------------------
    # NOTE(review): the source indentation was lost; the nesting below
    # (rule 3 under "question has no name", rules 4-6 applied to every
    # sentence, rule 6's else paired with the q_prof_list test) is the
    # reading that matches the rule comments -- confirm against history.
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        has_human = sent_person_list[i] != [] or sent_prof_list[i] != []

        # 1. Stemmed word overlap between question and sentence.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        if q_person_list == []:
            # 2. Question names nobody but the sentence does: confident (+6).
            if has_human:
                score = score + 6
            # 3. Question names nobody and the sentence uses the word
            #    "name": good clue (+4 per occurrence).
            for token in sentence.split():
                if lmtzr.lemmatize(token.lower()) == 'name':
                    score = score + 4

        # 4. Any sentence mentioning a person or profession (+4).
        if has_human:
            score = score + 4

        # 5. Each sentence verb whose lemma also occurs in the question (+6).
        for tagged in POS_Tagging.pos_tagging(sentence):
            if tagged[1] in verb_tags and lmtzr.lemmatize(tagged[0], 'v') in q_verblist:
                score = score + 6

        # 6. Question mentions a profession: reward each sentence word that
        #    repeats it (+18); otherwise reward sentences that contain any
        #    profession (+6).
        if q_prof_list != []:
            for token in sentence.split():
                if token.lower() in q_prof_list:
                    score = score + 18
        elif sent_prof_list[i] != []:
            score = score + 6

        sent_score_list.append(score)

    # Highest score wins; list.index() returns the earliest sentence on ties.
    index = sent_score_list.index(max(sent_score_list))
    temp_str = complete_sentence_list[index]
    s_plist = sent_person_list[index]
    s_proflist = sent_prof_list[index]

    # -------------------------- answer extraction --------------------------
    # NOTE(review): "not in temp_q" is a substring test against the question
    # string (kept as-is so multi-word names still match); for single words
    # it also drops any word that merely occurs inside a question word.
    if q_person_list == [] and s_plist == []:
        # Neither side has a name: the sentence minus the question's words.
        return ' '.join(word for word in temp_str.split() if word not in temp_q)

    answer_list = []
    if s_plist != []:
        # Names in the sentence that the question did not already mention
        # (handles "who defeated X" and names NER missed in the question).
        answer_list = [name for name in s_plist if name not in temp_q]
    elif s_proflist != []:
        # Only reachable when the question has a name ("Who is X?"): the
        # answer may be a profession.
        answer_list = list(s_proflist)

    if answer_list != []:
        return ' '.join(answer_list)

    # Fall back to noun phrases that are not part of the question.
    npfinal_list = [np for np in POS_Tagging.pos_noun_tagging(temp_str) if np not in temp_q]
    if npfinal_list != []:
        return ' '.join(npfinal_list)

    # Last resort: the whole sentence.
    return temp_str
def answering_what(cleansedQuestion, stop_words_free_question, complete_sentence_list,
                   sentence_list, sent_time_list, sent_person_list):
    """Answer a "what" question from a list of story sentences.

    Each sentence is scored (stemmed word overlap, month/date clue, "kind"
    and "name" clues, verbs shared with the question). For short questions
    (six words or fewer) a sentence that shares a word with the question and
    contains a delimited aside -- ``( ... )``, ``-- ... --`` or ``{ ... }`` --
    short-circuits the ranking and the delimited text is returned at once.
    Otherwise the first top-scoring sentence is returned with the question's
    words removed.

    Parameters
    ----------
    cleansedQuestion : str
        The question text; '"' and '?' are stripped.
    stop_words_free_question : str
        Unused; kept for interface compatibility.
    complete_sentence_list : list of str
        Full candidate sentences.
    sentence_list : list of str
        Sentences passed to ``WM.stemWordMatch``, index-aligned with
        ``complete_sentence_list``.
    sent_time_list : list of list
        Per-sentence time expressions, index-aligned.
    sent_person_list : list of list
        Unused; kept for interface compatibility.

    Returns
    -------
    str
        The extracted answer, or '' when there are no sentences to rank.
    """
    # Nothing to rank: max() over an empty score list would raise ValueError.
    if not complete_sentence_list:
        return ''

    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')
    stanford_stop_words_list = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
                                'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
                                'to', 'was', 'were', 'will', 'with']
    what_month = {'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may',
                  'june', 'jun', 'july', 'jul', 'august', 'aug', 'september', 'sep',
                  'october', 'oct', 'november', 'nov', 'december', 'dec'}
    abbreviation_list = [('Mt.', 'Mount')]

    temp_q = cleansedQuestion.replace('"', '').replace('?', '')

    # Expand abbreviations word by word. The old test ("word in 'Mt.'") was a
    # substring check, so a stray token such as "M" or "." rewrote every
    # occurrence of that character in the question.
    expanded = []
    for word in temp_q.split():
        for short_form, long_form in abbreviation_list:
            if word.rstrip('.') == short_form.rstrip('.'):
                word = long_form
                break
        expanded.append(word)
    temp_q = ' '.join(expanded)
    q_words = temp_q.split()

    lmtzr = WordNetLemmatizer()

    # Lemmas of the non-stop-word verbs that occur in the question.
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # ------------------------------ scoring ------------------------------
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        sent_words = sentence.split()

        # 1. Stemmed word overlap between question and sentence.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        for q_word in q_words:
            key = q_word.lower()
            if key in what_month:
                # 2. Month in the question and a date expression in the sentence.
                if sent_time_list[i] != []:
                    score = score + 4
            elif key == 'kind':
                # 3. "What kind" questions: sentences with "call" / "from".
                for m in sent_words:
                    if lmtzr.lemmatize(m, 'v') in ['call', 'from']:
                        score = score + 6
            elif key == 'name':
                # 4. "name" questions: sentences with name / call / known.
                for m in sent_words:
                    if lmtzr.lemmatize(m, 'v') in ['name', 'call', 'known']:
                        score = score + 20

        # 6. Each sentence verb whose lemma also occurs in the question (+6).
        for tagged in POS_Tagging.pos_tagging(sentence):
            if tagged[1] in verb_tags and lmtzr.lemmatize(tagged[0], 'v') in q_verblist:
                score = score + 6

        # 7. Definition-style questions ("What is X?"): return the delimited
        #    aside of the first sentence that shares a word with the question.
        if len(q_words) <= 6:
            if '(' in sentence:
                delimiters = ('(', ')')
            elif '--' in sentence:
                delimiters = ('--', '--')
            elif '{' in sentence:
                delimiters = ('{', '}')
            else:
                delimiters = None
            if delimiters is not None and any(w in sent_words for w in q_words):
                opener, closer = delimiters
                # Skip the whole opener (the "--" rule used "+ 1" and then
                # searched for the closer from the start, so it always
                # produced an empty slice).
                start_index = sentence.index(opener) + len(opener)
                end_index = sentence.find(closer, start_index)
                # An unclosed delimiter used to raise ValueError; now the
                # sentence is simply scored like any other.
                if end_index != -1:
                    return sentence[start_index:end_index].strip()

        sent_score_list.append(score)

    # ----------------------------- selection -----------------------------
    max_score_value = max(sent_score_list)
    final_sent_list = [sentence
                       for sentence, value in zip(complete_sentence_list, sent_score_list)
                       if value == max_score_value]

    # The earliest top-scoring sentence is used in both the unique and tie cases.
    words = final_sent_list[0].split()

    if len(final_sent_list) == 1:
        answer_list = []
        for k, word in enumerate(words):
            # "<number> per cent" is returned on its own. Bounds are checked
            # explicitly: the old guard ("k != 0 or k != len-1") was always
            # true, so a trailing "per" raised IndexError and a leading
            # "per cent" produced a wrapped negative slice.
            if (word.lower() == 'per' and k + 1 < len(words)
                    and words[k + 1].lower() == 'cent'):
                return ' '.join(words[max(k - 1, 0):k + 2])
            if word not in q_words:
                answer_list.append(word)
        return ' '.join(answer_list)

    return ' '.join(word for word in words if word not in q_words)
''' return [ taggeditem[0] for taggeditem in all_tagged_tokens if taggeditem[1][int(numchars) - 1] in alloweditems ] for i in range(len(all_filepaths)): lemmatized_text = [] text = et1.get_article_text(all_filepaths[i]) ## get article text #print text tokenize_sentences = et1.tokenize_sent_word(text) ## tokenize sentences # chack if tagging is needed if pos_tagging_needed in ['True', 'yes', 'y', 'Y', 'Yes']: text_tok = pos_t.tag_tokennized_text(tokenize_sentences) for s in text_tok: for word, POStag in s: lemmatized = wordnet_lemmatizer.lemmatize(word), POStag lemmatized_text.append(lemmatized) else: for s in tokenize_sentences: for word in s: lemmatized = wordnet_lemmatizer.lemmatize(word) lemmatized_text.append(lemmatized) ### filtering if pos_tagging_needed in ['True', 'yes', 'y', 'Y', 'Yes']: filtered_pos_tokens = filter_pos_tags(lemmatized_text) cleaned_text = fs.filter_spam_items(filtered_pos_tokens)
def answering_why(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list):
    """Answer a "why" question from a list of story sentences.

    Scoring is done in two passes. The first pass finds the "best" sentences:
    those with the highest stemmed word overlap with the question. The second
    pass scores every sentence from scratch: +3 for being a best sentence,
    +3 for immediately preceding one, +4 for immediately following one, +4
    for each occurrence of "want", "so" or "because", and +6 for each verb
    shared with the question. If exactly one sentence has the top score, the
    answer is that sentence from its first "so" / "because" / "to" onwards
    (the whole sentence if none occurs); on a tie the last tied sentence is
    returned unchanged.

    Sentences are compared and returned with '.', ',' and '!' removed.

    Parameters
    ----------
    cleansedQuestion : str
        The question text.
    stop_words_free_question : str
        Unused; kept for interface compatibility.
    complete_sentence_list : list of str
        Full candidate sentences. The list is no longer modified in place.
    sentence_list : list of str
        Sentences passed to ``WM.stemWordMatch``, index-aligned with
        ``complete_sentence_list``.

    Returns
    -------
    str
        The extracted answer, or '' when there are no sentences to rank.
    """
    # Nothing to rank: max() over an empty score list would raise ValueError.
    if not complete_sentence_list:
        return ''

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
        'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that',
        'the', 'to', 'was', 'were', 'will', 'with'
    ]

    temp_q = cleansedQuestion.replace('"', '').replace("'", '"').replace('?', '')

    # One lemmatizer for the whole call (it used to be rebuilt per sentence).
    lmtzr = WordNetLemmatizer()

    # Lemmas of the non-stop-word verbs that occur in the question.
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in ('VB', 'VBD', 'VBZ', 'VBN', 'VBP'):
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # Punctuation-free working copies. The cleaned text used to be written
    # back into complete_sentence_list, silently altering the caller's data.
    sentences = [
        sentence.replace('.', '').replace(',', '').replace('!', '')
        for sentence in complete_sentence_list
    ]

    # Pass 1: word-match score decides which sentences are "best".
    wm_scores = [
        WM.stemWordMatch(cleansedQuestion, sentence_list[i])
        for i in range(len(sentences))
    ]
    best_wm_score = max(wm_scores)
    # Indexed by the score list itself, so a sentence_list of a different
    # length can no longer cause an IndexError here.
    best_sent_index = set(
        i for i, value in enumerate(wm_scores) if value == best_wm_score)

    # Pass 2: score every sentence from zero.
    sent_score_list = []
    for i, sentence in enumerate(sentences):
        score = 0

        # 1. The sentence is one of the best word-match sentences.
        if i in best_sent_index:
            score = score + 3
        # 2. The sentence immediately precedes a best sentence.
        if i + 1 in best_sent_index:
            score = score + 3
        # 3. The sentence immediately follows a best sentence.
        if i - 1 in best_sent_index:
            score = score + 4

        # 4./5. Each occurrence of "want", "so" or "because" is a good clue.
        for word in sentence.split():
            if word.lower() in ('want', 'so', 'because'):
                score = score + 4

        # 6. Each sentence verb whose lemma also occurs in the question.
        # NOTE(review): 'VBP' is accepted for question verbs but not for
        # sentence verbs; kept as in the original -- confirm it is intended.
        for tagged in POS_Tagging.pos_tagging(sentence):
            if (tagged[1] in ('VB', 'VBD', 'VBZ', 'VBN')
                    and lmtzr.lemmatize(tagged[0], 'v') in q_verblist):
                score = score + 6

        sent_score_list.append(score)

    max_score_value = max(sent_score_list)
    final_sent_list = [
        sentence for sentence, value in zip(sentences, sent_score_list)
        if value == max_score_value
    ]

    if len(final_sent_list) == 1:
        # The reason usually starts at the first "so", "because" or "to".
        words = final_sent_list[0].split()
        for k, word in enumerate(words):
            if word.lower() in ('so', 'because', 'to'):
                return ' '.join(words[k:])
        return final_sent_list[0]

    # Tie: the sentence that appears last wins; no further trimming.
    return final_sent_list[-1]
def answering_why(cleansedQuestion, stop_words_free_question, complete_sentence_list, sentence_list):
    """Answer a "why" question from a list of story sentences.

    Scoring is done in two passes. The first pass finds the "best" sentences:
    those with the highest stemmed word overlap with the question. The second
    pass scores every sentence from scratch: +3 for being a best sentence,
    +3 for immediately preceding one, +4 for immediately following one, +4
    for each occurrence of "want", "so" or "because", and +6 for each verb
    shared with the question. If exactly one sentence has the top score, the
    answer is that sentence from its first "so" / "because" / "to" onwards
    (the whole sentence if none occurs); on a tie the last tied sentence is
    returned unchanged.

    Parameters
    ----------
    cleansedQuestion : str
        The question text.
    stop_words_free_question : str
        Unused; kept for interface compatibility.
    complete_sentence_list : list of str
        Full candidate sentences (used with their punctuation intact).
    sentence_list : list of str
        Sentences passed to ``WM.stemWordMatch``, index-aligned with
        ``complete_sentence_list``.

    Returns
    -------
    str
        The extracted answer, or '' when there are no sentences to rank.
    """
    # Nothing to rank: max() over an empty score list would raise ValueError.
    if not complete_sentence_list:
        return ''

    stanford_stop_words_list = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
                                'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
                                'to', 'was', 'were', 'will', 'with']
    clue_words = ('want', 'so', 'because')
    answer_markers = ('so', 'because', 'to')

    temp_q = cleansedQuestion.replace('"', '').replace("'", '"').replace('?', '')

    # One lemmatizer for the whole call (it used to be rebuilt per sentence).
    lmtzr = WordNetLemmatizer()

    # Lemmas of the non-stop-word verbs that occur in the question.
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in ('VB', 'VBD', 'VBZ', 'VBN', 'VBP'):
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # Pass 1: word-match score decides which sentences are "best".
    wm_scores = []
    for i in range(len(complete_sentence_list)):
        wm_scores.append(WM.stemWordMatch(cleansedQuestion, sentence_list[i]))
    best_wm_score = max(wm_scores)
    # Indexed by the score list itself, so a sentence_list of a different
    # length can no longer cause an IndexError here.
    best_sent_index = set(i for i, value in enumerate(wm_scores) if value == best_wm_score)

    # Pass 2: score every sentence from zero.
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        score = 0

        # 1. The sentence is one of the best word-match sentences.
        if i in best_sent_index:
            score = score + 3
        # 2. The sentence immediately precedes a best sentence.
        if i + 1 in best_sent_index:
            score = score + 3
        # 3. The sentence immediately follows a best sentence.
        if i - 1 in best_sent_index:
            score = score + 4

        # 4./5. Each occurrence of "want", "so" or "because" is a good clue.
        for word in sentence.split():
            if word.lower() in clue_words:
                score = score + 4

        # 6. Each sentence verb whose lemma also occurs in the question.
        # NOTE(review): 'VBP' is accepted for question verbs but not for
        # sentence verbs; kept as in the original -- confirm it is intended.
        for tagged in POS_Tagging.pos_tagging(sentence):
            if tagged[1] in ('VB', 'VBD', 'VBZ', 'VBN') and lmtzr.lemmatize(tagged[0], 'v') in q_verblist:
                score = score + 6

        sent_score_list.append(score)

    max_score_value = max(sent_score_list)
    final_sent_list = [sentence
                       for sentence, value in zip(complete_sentence_list, sent_score_list)
                       if value == max_score_value]

    if len(final_sent_list) == 1:
        # The reason usually starts at the first "so", "because" or "to".
        words = final_sent_list[0].split()
        for k, word in enumerate(words):
            if word.lower() in answer_markers:
                return ' '.join(words[k:])
        return final_sent_list[0]

    # Tie: the sentence that appears last wins; no further trimming.
    return final_sent_list[-1]
def answering_how(cleansedQuestion, stop_words_free_question, complete_sentence_list,
                  sentence_list, sent_time_list, sent_percent_list):
    """Answer a "how" question (how many / much / often / long).

    Each sentence is scored by stemmed word overlap with the question plus
    clue bonuses: for "many" questions +6 per numeric or number-word token,
    for "much" questions +6 per quantity/money token, and for "often"/"long"
    questions +10 once if the sentence contains a frequency/time word. The
    first top-scoring sentence is chosen (a trailing period is dropped) and
    the answer is extracted from it:

    * "<x> per cent" if the sentence contains "per cent";
    * for "many" questions, the numeric / number-word tokens;
    * for "much" questions, the numeric / quantity tokens;
    * otherwise the sentence minus the question's words.

    Parameters
    ----------
    cleansedQuestion : str
        The question text; '?' is stripped.
    stop_words_free_question : str
        Unused; kept for interface compatibility.
    complete_sentence_list : list of str
        Full candidate sentences.
    sentence_list : list of str
        Sentences passed to ``WM.stemWordMatch``, index-aligned with
        ``complete_sentence_list``.
    sent_time_list, sent_percent_list : list
        Unused; kept for interface compatibility.

    Returns
    -------
    str
        The extracted answer (possibly '' when a "many"/"much" sentence has
        no quantity token), or '' when there are no sentences to rank.
    """
    # Nothing to rank: max() over an empty score list would raise ValueError.
    if not complete_sentence_list:
        return ''

    much_list = {'thousand', 'thousands', 'hundred', 'hundreds', 'dollars', 'cents', 'million',
                 'billion', 'trillion', 'none', 'nothing', 'everything', 'few', 'something',
                 'grams', 'kilos', 'kilogram', 'kilograms', 'milligrams', 'mg', 'metre',
                 'centimetre', 'inches', 'feet', 'foot', 'ft', 'cent', 'percent', 'salary', 'pay',
                 'income', 'loss', 'profit', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
                 'eight', 'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
                 'eighty', 'ninety', 'hour', 'hours', 'minutes', 'seconds', 'second', 'minute',
                 'half', 'quarter', 'more', 'less', 'than'}
    many_list = {'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
                 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
                 'hundred', 'thousand', 'million', 'billion', 'trillion'}
    # 'millennium' and 'day' were fused into 'millenniumday' by a missing
    # comma (implicit string concatenation), so neither word ever matched.
    how_often = {'daily', 'weekly', 'bi-weekly', 'fortnightly', 'monthly', 'bi-monthly',
                 'quarterly', 'half-yearly', 'yearly', 'decade', 'millennium', 'day', 'everyday',
                 'night', 'afternoon', 'noon', 'hourly', 'hours', 'minutes', 'seconds', 'second',
                 'minute'}
    money_words = ('money', 'earn', 'salary', 'profit', 'loss')
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")
    abbreviation_list = [('Mt.', 'Mount')]

    # ------------------------- question processing -------------------------
    temp_q = cleansedQuestion.replace('?', '')

    # Expand abbreviations word by word. The old test ("word in 'Mt.'") was a
    # substring check, so a stray token such as "M" or "." rewrote every
    # occurrence of that character in the question.
    expanded = []
    for word in temp_q.split():
        for short_form, long_form in abbreviation_list:
            if word.rstrip('.') == short_form.rstrip('.'):
                word = long_form
                break
        expanded.append(word)
    temp_q = ' '.join(expanded)
    q_words = temp_q.split()

    # --------------------- sentence processing and scoring ---------------------
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        sent_words = sentence.split()

        # 1. Stemmed word overlap between question and sentence.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        for q_word in q_words:
            key = q_word.lower()
            if key == 'many':
                # 2. "many" + a number expression in the sentence.
                for m in sent_words:
                    if nums.match(m) or m in many_list:
                        score = score + 6
            elif key == 'much':
                # 3. "much" + a money / quantity expression in the sentence.
                for m in sent_words:
                    if m.lower() in money_words or m in much_list:
                        score = score + 6
            elif key == 'often' or key == 'long':
                # 4. "often"/"long" + a time expression; counted once.
                for m in sent_words:
                    if m in how_often:
                        score = score + 10
                        break

        sent_score_list.append(score)

    # ------------------------------ selection ------------------------------
    max_score_value = max(sent_score_list)
    final_sent_list = [sentence
                       for sentence, value in zip(complete_sentence_list, sent_score_list)
                       if value == max_score_value]

    # The earliest top-scoring sentence is used whether or not there is a tie.
    chosen = final_sent_list[0]
    # Drop a trailing period. The old check used .index('.'), which raised
    # ValueError for sentences without a period and ignored the trailing
    # period whenever an earlier one (e.g. in "3.5") existed.
    if chosen.endswith('.'):
        chosen = chosen[:-1]
    temp2 = chosen.split()

    # --------------------------- answer extraction ---------------------------
    # "<x> per cent" is most probably the answer to a how much/many question.
    # Bounds are checked explicitly: the old guard ("k != 0 or k != len-1")
    # was always true, so a trailing "per" raised IndexError and a leading
    # "per cent" produced a wrapped negative slice.
    for k, word in enumerate(temp2):
        if (word.lower() == 'per' and k + 1 < len(temp2)
                and temp2[k + 1].lower() == 'cent'):
            return ' '.join(temp2[max(k - 1, 0):k + 2])

    if 'many' in q_words:
        return ' '.join(m for m in temp2 if nums.match(m) or m in many_list)
    elif 'much' in q_words:
        return ' '.join(m for m in temp2 if nums.match(m) or m in much_list)

    return ' '.join(word for word in temp2 if word not in q_words)
def answering_how(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, sent_time_list,
                  sent_percent_list):
    """Answer a "how many / how much / how often / how long" question.

    Every story sentence is scored against the question, the first sentence
    with the highest score is selected, and a short answer is extracted
    from it.

    Args:
        cleansedQuestion: The question text; '?' characters are removed
            before it is split into words.
        stop_words_free_question: Unused; kept for interface compatibility.
        complete_sentence_list: Full text of each story sentence.
        sentence_list: Per-sentence text handed to WM.stemWordMatch, indexed
            in parallel with complete_sentence_list.
        sent_time_list: Unused; kept for interface compatibility.
        sent_percent_list: Unused; kept for interface compatibility.

    Returns:
        The answer string: a "<word> per cent" phrase if the selected
        sentence has one, otherwise its number/quantity words for "many" or
        "much" questions (the whole sentence if it has none), otherwise the
        sentence minus the words of the question. Returns '' when
        complete_sentence_list is empty.
    """
    # Nothing to score: there is no sentence an answer could come from.
    if not complete_sentence_list:
        return ''

    much_list = frozenset([
        'thousand', 'thousands', 'hundred', 'hundreds', 'dollars', 'cents',
        'million', 'billion', 'trillion', 'none', 'nothing', 'everything',
        'few', 'something', 'grams', 'kilos', 'kilogram', 'kilograms',
        'milligrams', 'mg', 'metre', 'centimetre', 'inches', 'feet', 'foot',
        'ft', 'cent', 'percent', 'salary', 'pay', 'income', 'loss', 'profit',
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
        'seventy', 'eighty', 'ninety', 'hour', 'hours', 'minutes', 'seconds',
        'second', 'minute', 'half', 'quarter', 'more', 'less', 'than',
    ])
    many_list = frozenset([
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
        'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million',
        'billion', 'trillion',
    ])
    # 'millennium' and 'day' are separate entries; without the comma between
    # them Python concatenates adjacent literals into 'millenniumday'.
    how_often = frozenset([
        'daily', 'weekly', 'bi-weekly', 'fortnightly', 'monthly',
        'bi-monthly', 'quarterly', 'half-yearly', 'yearly', 'decade',
        'millennium', 'day', 'everyday', 'night', 'afternoon', 'noon',
        'hourly', 'hours', 'minutes', 'seconds', 'second', 'minute',
    ])
    money_words = ('money', 'earn', 'salary', 'profit', 'loss')
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")
    abbreviation_list = [('Mt.', 'Mount')]

    ########################### QUESTION PROCESSING ##################
    temp_q = cleansedQuestion.replace('?', '')
    for abbreviation, expansion in abbreviation_list:
        # Expand only when the abbreviation is a whole token of the question.
        # A substring test against 'Mt.' would also fire for tokens such as
        # 'M' or '.' and rewrite unrelated parts of the question.
        if abbreviation in temp_q.split():
            temp_q = temp_q.replace(abbreviation, expansion)
    question_words = temp_q.split()
    lowered_question_words = [word.lower() for word in question_words]

    ################## SENTENCE PROCESSING AND SCORING ###################
    sent_score_list = []
    for i, complete_sentence in enumerate(complete_sentence_list):
        # 1. Base score: word match between the question and the sentence.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])
        sentence_words = complete_sentence.split()

        for q_word in lowered_question_words:
            if q_word == 'many':
                # 2. +6 for every number (digits or number word).
                score += 6 * sum(
                    1 for word in sentence_words
                    if nums.match(word) or word in many_list)
            elif q_word == 'much':
                # 3. +6 for every money or quantity expression.
                score += 6 * sum(
                    1 for word in sentence_words
                    if word.lower() in money_words or word in much_list)
            elif q_word in ('often', 'long'):
                # 4. +10 once if the sentence has any time/frequency word.
                if any(word in how_often for word in sentence_words):
                    score += 10
        sent_score_list.append(score)

    # Ties are resolved in favour of the sentence that appears first.
    best_sentence = complete_sentence_list[
        sent_score_list.index(max(sent_score_list))]

    # Drop a trailing period so the last word can match the word lists.
    # endswith() also copes with sentences that contain no period at all or
    # that contain an earlier one (e.g. "3.5" or "Mr.").
    if best_sentence.endswith('.'):
        best_sentence = best_sentence[:-1]
    temp2 = best_sentence.split()

    # "<word> per cent" is most probably the answer to a much/many question.
    # 'per' needs a word on both sides, so the first and last positions are
    # skipped.
    for k in range(1, len(temp2) - 1):
        if temp2[k].lower() == 'per' and temp2[k + 1].lower() == 'cent':
            return ' '.join(temp2[k - 1:k + 2])

    if 'many' in lowered_question_words:
        quantity_words = many_list
    elif 'much' in lowered_question_words:
        quantity_words = much_list
    else:
        quantity_words = None

    if quantity_words is not None:
        temp_solution = [word for word in temp2
                         if nums.match(word) or word in quantity_words]
        # Fall back to the whole sentence when it has no quantity word.
        return ' '.join(temp_solution) if temp_solution else ' '.join(temp2)

    # Other "how" questions: the sentence minus the question's own words.
    return ' '.join(word for word in temp2 if word not in question_words)
def answering_where(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list, dateline,
                    sent_loc_list):
    """Answer a "where" question from the story sentences or the dateline.

    Every sentence is scored against the question. If the best score beats
    the dateline score derived from the question wording, the answer is
    extracted from the first best-scoring sentence (after stripping double
    quotes, commas, '?' and '!' from it); otherwise the dateline is returned.

    Args:
        cleansedQuestion: The question text.
        stop_words_free_question: Question text passed to WM.stemWordMatch
            (presumably with stop words removed - confirm with caller).
        complete_sentence_list: Full text of each story sentence.
        sentence_list: Per-sentence text handed to WM.stemWordMatch, indexed
            in parallel with complete_sentence_list.
        dateline: Value returned when no sentence outscores the dateline.
        sent_loc_list: Per-sentence list of location strings, indexed in
            parallel with sentence_list.

    Returns:
        The answer string, or dateline unchanged. dateline is also returned
        when there are no sentences to score.
    """
    # No sentences at all: the dateline is the only candidate answer.
    if not complete_sentence_list:
        return dateline

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ]
    location_prepositions = frozenset([
        'above', 'across', 'after', 'against', 'along', 'among', 'around',
        'before', 'behind', 'below', 'beneath', 'beside', 'between', 'by',
        'down', 'from', 'in', 'inside', 'into', 'near', 'off', 'onto',
        'opposite', 'outside', 'over', 'surrounding', 'round', 'through',
        'towards', 'under', 'up',
    ])
    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')
    abbreviation_list = [('Mt.', 'Mount')]

    ########################### QUESTION PROCESSING ##################
    # Double quotes are dropped, single quotes become double quotes, and the
    # question mark is removed.
    temp_q = cleansedQuestion.replace('"', '').replace("'", '"').replace('?', '')
    for abbreviation, expansion in abbreviation_list:
        # Expand only when the abbreviation is a whole token of the question.
        # A substring test against 'Mt.' would also fire for tokens such as
        # 'M' or '.' and rewrite unrelated parts of the question.
        if abbreviation in temp_q.split():
            temp_q = temp_q.replace(abbreviation, expansion)
    question_words = temp_q.split()

    ################## DATELINE SCORE FOR THE QUESTION ##################
    lowered = [word.lower() for word in cleansedQuestion.split()]
    is_where_question = 'where' in lowered
    dateline_score = 0
    first_sentence_flag = False
    for i, word in enumerate(lowered):
        # 1. "happen" is a good clue that the dateline could be the answer.
        if word == 'happen':
            dateline_score += 4
        # 2. So is "take place".
        if word == 'take' and lowered[i + 1:i + 2] == ['place']:
            dateline_score += 4
        # 3. "this": slam dunk for the dateline unless the question contains
        #    "where", in which case the first sentence is used instead.
        if word == 'this':
            if is_where_question:
                first_sentence_flag = True
            else:
                dateline_score += 20
        # 4. "story": slam dunk for the dateline unless the question
        #    contains "where".
        if word == 'story' and not is_where_question:
            dateline_score += 20

    # The first-sentence shortcut does not depend on the sentence scores, so
    # it is taken before the expensive per-sentence tagging below.
    if first_sentence_flag:
        pos_np_list = POS_Tagging.pos_NNP_tagging(complete_sentence_list[0])
        if pos_np_list != []:
            return ' '.join(k for k in pos_np_list if k not in question_words)
        return complete_sentence_list[0]

    # Lemmatized question verbs, minus stop-word lemmas.
    lmtzr = WordNetLemmatizer()
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    question_has_from = any(word.lower() == 'from' for word in question_words)

    ################## SENTENCE SCORING ###################
    sent_score_list = []
    for i in range(len(sentence_list)):
        sentence_words = complete_sentence_list[i].split()
        # 1. Base score: word match between the question and the sentence.
        score = WM.stemWordMatch(stop_words_free_question, sentence_list[i])
        # 2. +4 for every location preposition in the sentence.
        score += 4 * sum(1 for word in sentence_words
                         if word in location_prepositions)
        # 3. +6 if the sentence contains a location entity.
        if sent_loc_list[i]:
            score += 6
        # 4. +6 if "from" occurs in both the question and the sentence.
        if question_has_from and 'from' in sentence_words:
            score += 6
        # 5. +6 for every sentence verb that also occurs in the question.
        for tagged in POS_Tagging.pos_tagging(complete_sentence_list[i]):
            if (tagged[1] in verb_tags
                    and lmtzr.lemmatize(tagged[0], 'v') in q_verblist):
                score += 6
        sent_score_list.append(score)

    if not sent_score_list:
        return dateline

    max_score_value = max(sent_score_list)
    # The dateline wins unless some sentence scores strictly higher.
    if max_score_value <= dateline_score:
        return dateline

    # Ties are resolved in favour of the sentence that appears first.
    index = sent_score_list.index(max_score_value)
    temp_str = complete_sentence_list[index]
    # Clean up the candidate sentence.
    temp_str = (temp_str.replace('"', '').replace(',', '')
                .replace('?', '').replace('!', ''))

    # Prefer the sentence's locations that the question does not already
    # mention (counters the case where the question has a location that NER
    # did not identify).
    answer_list = [loc for loc in sent_loc_list[index]
                   if loc not in question_words]
    if answer_list:
        return ' '.join(answer_list)

    # No usable location: the sentence minus the question's own words.
    return ' '.join(word for word in temp_str.split()
                    if word not in question_words)
def answering_where(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list, dateline,
                    sent_loc_list):
    """Answer a "where" question from the story sentences or the dateline.

    Every sentence is scored against the question. If the best score beats
    the dateline score derived from the question wording, the answer is
    extracted from the first best-scoring sentence (used as-is, without
    stripping punctuation); otherwise the dateline is returned.

    Args:
        cleansedQuestion: The question text.
        stop_words_free_question: Question text passed to WM.stemWordMatch
            (presumably with stop words removed - confirm with caller).
        complete_sentence_list: Full text of each story sentence.
        sentence_list: Per-sentence text handed to WM.stemWordMatch, indexed
            in parallel with complete_sentence_list.
        dateline: Value returned when no sentence outscores the dateline.
        sent_loc_list: Per-sentence list of location strings, indexed in
            parallel with sentence_list.

    Returns:
        The answer string, or dateline unchanged. dateline is also returned
        when there are no sentences to score.
    """
    # No sentences at all: the dateline is the only candidate answer.
    if not complete_sentence_list:
        return dateline

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ]
    location_prepositions = frozenset([
        'above', 'across', 'after', 'against', 'along', 'among', 'around',
        'before', 'behind', 'below', 'beneath', 'beside', 'between', 'by',
        'down', 'from', 'in', 'inside', 'into', 'near', 'off', 'onto',
        'opposite', 'outside', 'over', 'surrounding', 'round', 'through',
        'towards', 'under', 'up',
    ])
    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')
    abbreviation_list = [('Mt.', 'Mount')]

    ########################### QUESTION PROCESSING ##################
    # Double quotes are dropped, single quotes become double quotes, and the
    # question mark is removed.
    temp_q = cleansedQuestion.replace('"', '').replace("'", '"').replace('?', '')
    for abbreviation, expansion in abbreviation_list:
        # Expand only when the abbreviation is a whole token of the question.
        # A substring test against 'Mt.' would also fire for tokens such as
        # 'M' or '.' and rewrite unrelated parts of the question.
        if abbreviation in temp_q.split():
            temp_q = temp_q.replace(abbreviation, expansion)
    question_words = temp_q.split()

    ################## DATELINE SCORE FOR THE QUESTION ##################
    lowered = [word.lower() for word in cleansedQuestion.split()]
    is_where_question = 'where' in lowered
    dateline_score = 0
    first_sentence_flag = False
    for i, word in enumerate(lowered):
        # 1. "happen" is a good clue that the dateline could be the answer.
        if word == 'happen':
            dateline_score += 4
        # 2. So is "take place".
        if word == 'take' and lowered[i + 1:i + 2] == ['place']:
            dateline_score += 4
        # 3. "this": slam dunk for the dateline unless the question contains
        #    "where", in which case the first sentence is used instead.
        if word == 'this':
            if is_where_question:
                first_sentence_flag = True
            else:
                dateline_score += 20
        # 4. "story": slam dunk for the dateline unless the question
        #    contains "where".
        if word == 'story' and not is_where_question:
            dateline_score += 20

    # The first-sentence shortcut does not depend on the sentence scores, so
    # it is taken before the expensive per-sentence tagging below.
    if first_sentence_flag:
        pos_np_list = POS_Tagging.pos_NNP_tagging(complete_sentence_list[0])
        if pos_np_list != []:
            return ' '.join(k for k in pos_np_list if k not in question_words)
        return complete_sentence_list[0]

    # Lemmatized question verbs, minus stop-word lemmas.
    lmtzr = WordNetLemmatizer()
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    question_has_from = any(word.lower() == 'from' for word in question_words)

    ################## SENTENCE SCORING ###################
    sent_score_list = []
    for i in range(len(sentence_list)):
        sentence_words = complete_sentence_list[i].split()
        # 1. Base score: word match between the question and the sentence.
        score = WM.stemWordMatch(stop_words_free_question, sentence_list[i])
        # 2. +4 for every location preposition in the sentence.
        score += 4 * sum(1 for word in sentence_words
                         if word in location_prepositions)
        # 3. +6 if the sentence contains a location entity.
        if sent_loc_list[i]:
            score += 6
        # 4. +6 if "from" occurs in both the question and the sentence.
        if question_has_from and 'from' in sentence_words:
            score += 6
        # 5. +6 for every sentence verb that also occurs in the question.
        for tagged in POS_Tagging.pos_tagging(complete_sentence_list[i]):
            if (tagged[1] in verb_tags
                    and lmtzr.lemmatize(tagged[0], 'v') in q_verblist):
                score += 6
        sent_score_list.append(score)

    if not sent_score_list:
        return dateline

    max_score_value = max(sent_score_list)
    # The dateline wins unless some sentence scores strictly higher.
    if max_score_value <= dateline_score:
        return dateline

    # Ties are resolved in favour of the sentence that appears first.
    index = sent_score_list.index(max_score_value)
    temp_str = complete_sentence_list[index]

    # Prefer the sentence's locations that the question does not already
    # mention (counters the case where the question has a location that NER
    # did not identify).
    answer_list = [loc for loc in sent_loc_list[index]
                   if loc not in question_words]
    if answer_list:
        return ' '.join(answer_list)

    # No usable location: the sentence minus the question's own words.
    return ' '.join(word for word in temp_str.split()
                    if word not in question_words)
def answering_who(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, sent_person_list,
                  sent_prof_list):
    """Answer a "who" question from the best-scoring story sentence.

    Every sentence is scored against the question, the first sentence with
    the highest score is selected, and person names, profession names, noun
    phrases or the remaining words of that sentence are returned.

    Args:
        cleansedQuestion: The question text; '?' characters are removed
            before tagging and entity recognition.
        stop_words_free_question: Unused; kept for interface compatibility.
        complete_sentence_list: Full text of each story sentence.
        sentence_list: Per-sentence text handed to WM.stemWordMatch, indexed
            in parallel with complete_sentence_list.
        sent_person_list: Per-sentence list of person-name strings.
        sent_prof_list: Per-sentence list of profession strings.

    Returns:
        The answer string, or '' when complete_sentence_list is empty.
    """
    # Nothing to score: there is no sentence an answer could come from.
    if not complete_sentence_list:
        return ''

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for',
        'from', 'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
        'that', 'the', 'to', 'was', 'were', 'will', 'with',
    ]
    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')

    ########################### QUESTION PROCESSING ##################
    temp_q = cleansedQuestion.replace('?', '')
    question_words = temp_q.split()

    # One lemmatizer instance serves both the question and every sentence.
    lmtzr = WordNetLemmatizer()
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # Only the person and profession entities of the question are used; the
    # full unpacking documents the order of the eight returned lists.
    (q_person_list, _q_org_list, _q_loc_list, _q_month_list, _q_time_list,
     _q_money_list, _q_percent_list,
     q_prof_list) = NER.named_entity_recognition(temp_q)

    ################## SENTENCE SCORING ###################
    sent_score_list = []
    for i, complete_sentence in enumerate(complete_sentence_list):
        sentence_words = complete_sentence.split()
        mentions_human = bool(sent_person_list[i]) or bool(sent_prof_list[i])

        # 1. Base score: word match between the question and the sentence.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        if not q_person_list:
            # 2. Question has no name but the sentence has one: confident.
            if mentions_human:
                score += 6
            # 3. Question has no name and the sentence contains the word
            #    "name": good clue, once per occurrence.
            # NOTE(review): applied only when the question has no person
            # name, as the rule text states - confirm this nesting.
            score += 4 * sum(1 for word in sentence_words
                             if lmtzr.lemmatize(word.lower()) == 'name')

        # 4. Any sentence that refers to a human gets points.
        if mentions_human:
            score += 4

        # 5. +6 for every sentence verb that also occurs in the question.
        for tagged in POS_Tagging.pos_tagging(complete_sentence):
            if (tagged[1] in verb_tags
                    and lmtzr.lemmatize(tagged[0], 'v') in q_verblist):
                score += 6

        # 6. Question names a profession: +18 for every sentence word that
        #    is one of those professions. Otherwise +6 if the sentence
        #    mentions any profession.
        if q_prof_list:
            score += 18 * sum(1 for word in sentence_words
                              if word.lower() in q_prof_list)
        elif sent_prof_list[i]:
            score += 6

        sent_score_list.append(score)

    ################## ANSWER EXTRACTION ###################
    # Ties are resolved in favour of the sentence that appears first.
    index = sent_score_list.index(max(sent_score_list))
    temp_str = complete_sentence_list[index]
    s_plist = sent_person_list[index]
    s_proflist = sent_prof_list[index]

    if not q_person_list and not s_plist:
        # Neither side has a name: the sentence minus the question's words.
        # Whole-word comparison, so short words such as "or" are not dropped
        # merely because they occur inside a longer question word.
        return ' '.join(word for word in temp_str.split()
                        if word not in question_words)

    if s_plist:
        # Names the question does not already mention (e.g. "Who defeated
        # X?"). Substring match on the question text, because a name may
        # span several words.
        answer_list = [name for name in s_plist if name not in temp_q]
    elif s_proflist:
        # "Who is X?" type questions: the answer may be a profession.
        answer_list = list(s_proflist)
    else:
        answer_list = []

    if answer_list:
        return ' '.join(answer_list)

    # No usable name: fall back to the noun phrases the question lacks.
    npfinal_list = [phrase
                    for phrase in POS_Tagging.pos_noun_tagging(temp_str)
                    if phrase not in temp_q]
    if npfinal_list:
        return ' '.join(npfinal_list)

    # Last resort: the whole sentence.
    return temp_str
def answering_how(
    cleansedQuestion,
    stop_words_free_question,
    complete_sentence_list,
    sentence_list,
    sent_time_list,
    sent_percent_list,
):
    """Answer a "How ..." question by scoring candidate sentences.

    Each candidate sentence gets a base score from
    ``WM.stemWordMatch(stop_words_free_question, sentence_list[i])`` plus
    cue-specific bonuses, applied once per occurrence of the cue word in the
    question (compared case-insensitively):

    * ``many``  -- +6 for every sentence token that begins with a number or
      is a number word.
    * ``much``  -- +6 for every sentence token that is a money cue
      (case-insensitive) or a quantity word (case-sensitive).
    * ``often`` -- +10 for every sentence token (lower-cased) found in
      ``sent_time_list`` or in the built-in frequency vocabulary.

    The question is only used for cue detection and for the final word
    filter, so no POS tagging or abbreviation expansion is performed on it.

    Args:
        cleansedQuestion: The question text. ``?`` characters are dropped
            before cue detection; the unmodified text is used to filter
            question words out of a tied answer.
        stop_words_free_question: Question text handed to
            ``WM.stemWordMatch`` for the base score.
        complete_sentence_list: Candidate sentences; answers are built from
            these.
        sentence_list: Parallel to ``complete_sentence_list``; entry ``i`` is
            handed to ``WM.stemWordMatch`` (presumably the stop-word-free
            form of sentence ``i`` -- confirm against the caller).
        sent_time_list: Time expressions consulted for "often" questions.
        sent_percent_list: Accepted for interface compatibility; not used.

    Returns:
        A string:

        * ``""`` when there are no candidate sentences;
        * for a unique best sentence and a question containing the token
          ``many`` (exact case): the sentence's numeric tokens joined by
          spaces (possibly empty);
        * for a unique best sentence otherwise: that sentence unchanged;
        * for a tie: the earliest top-scoring sentence with every token that
          also appears in ``cleansedQuestion.split()`` removed.
    """
    # Nothing to score: return an empty answer instead of letting max() raise
    # ValueError on an empty sequence.
    if not complete_sentence_list:
        return ""

    quantity_words = frozenset([
        "thousand", "hundred", "dollars", "cents", "million", "billion",
        "trillion", "none", "nothing", "everything", "few", "something",
        "cent", "percent", "salary", "pay", "income", "loss", "profit",
        "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety",
    ])
    number_words = frozenset([
        "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety", "hundred", "thousand", "million",
        "billion", "trillion",
    ])
    money_cues = frozenset(["money", "earn", "salary", "profit", "loss"])
    # "millennium" and "day" are separate entries; without the comma between
    # them Python concatenates the adjacent literals into "millenniumday".
    frequency_words = frozenset([
        "daily", "weekly", "bi-weekly", "fortnightly", "monthly",
        "bi-monthly", "quarterly", "half-yearly", "yearly", "decade",
        "millennium", "day", "everyday", "night", "afternoon", "noon",
    ])
    # match() anchors at the start only, so tokens such as "3rd" or "12,"
    # count as numeric as well.
    number_re = re.compile(r"[+-]?\d+(?:\.\d+)?")

    def is_numeric(token):
        return bool(number_re.match(token)) or token in number_words

    ########################### QUESTION PROCESSING ##################
    question_tokens = cleansedQuestion.replace("?", "").split()
    question_cues = [token.lower() for token in question_tokens]

    ################## SENTENCE PROCESSING AND SCORING ###################
    scores = []
    for idx, full_sentence in enumerate(complete_sentence_list):
        # 1. Base score: stemmed word overlap with the question.
        score = WM.stemWordMatch(stop_words_free_question, sentence_list[idx])
        sentence_tokens = full_sentence.split()

        for cue in question_cues:
            if cue == "many":
                # 2. "many" + an expression of number: confident.
                score += 6 * sum(1 for tok in sentence_tokens if is_numeric(tok))
            elif cue == "much":
                # 3. "much" + an expression of money/quantity: confident.
                score += 6 * sum(
                    1 for tok in sentence_tokens
                    if tok.lower() in money_cues or tok in quantity_words
                )
            elif cue == "often":
                # 4. "often" + an expression of time: more than confident.
                # NOTE(review): membership is tested against the whole
                # sent_time_list, not a per-sentence entry. If the caller
                # passes one list per sentence (as the sibling answering_who
                # indexes its sent_* arguments), this half of the test never
                # matches -- confirm against the caller.
                score += 10 * sum(
                    1 for tok in sentence_tokens
                    if tok.lower() in sent_time_list
                    or tok.lower() in frequency_words
                )
        scores.append(score)

    ########################### ANSWER SELECTION ##################
    best_score = max(scores)
    best_sentences = [
        sentence
        for sentence_score, sentence in zip(scores, complete_sentence_list)
        if sentence_score == best_score
    ]

    if len(best_sentences) == 1:
        winner = best_sentences[0]
        if "many" in question_tokens:
            # A "how many" answer is just the numeric part of the sentence.
            return " ".join(tok for tok in winner.split() if is_numeric(tok))
        return winner

    # Tie: keep the earliest top-scoring sentence and strip the words that
    # already appear in the question.
    question_words = frozenset(cleansedQuestion.split())
    return " ".join(
        tok for tok in best_sentences[0].split() if tok not in question_words
    )