def answering_which(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list):
    """Answer a "which" question with the best word-match sentence.

    Each entry of ``sentence_list`` is scored against
    ``stop_words_free_question`` with ``WM.stemWordMatch``.  The first
    sentence reaching the highest score wins (ties go to the earliest
    sentence) and the answer text is taken from ``complete_sentence_list``
    at the same position.

    Args:
        cleansedQuestion: Unused; kept so the signature stays compatible
            with existing callers.
        stop_words_free_question: Question text handed to the scorer.
        complete_sentence_list: Sentences the returned answer is taken from.
        sentence_list: Sentences handed to the scorer; must be parallel to
            ``complete_sentence_list`` (same length, same order).

    Returns:
        The winning sentence from its first word "that" (any case) onwards,
        re-joined with single spaces, or the whole sentence when it has no
        such word.  An empty string when there are no sentences.
    """
    scored = [
        (WM.stemWordMatch(stop_words_free_question, sentence), complete)
        for sentence, complete in zip(sentence_list, complete_sentence_list)
    ]
    if not scored:
        # Nothing to choose from; the old code crashed in max() here.
        return ''

    best_score = max(score for score, _ in scored)
    # Ties are resolved in favour of the sentence that appears first.
    best_sentence = next(
        complete for score, complete in scored if score == best_score)

    words = best_sentence.split()
    for position, word in enumerate(words):
        if word.lower() == 'that':
            return ' '.join(words[position:])
    return best_sentence
class FindBooks(object):
    """Search a string for known patterns using two matchers.

    Patterns are read from ``file_patten``, one per line.  Patterns shorter
    than five characters are given to the ``kmp.KmpSearch`` matcher, all
    others to the ``WM.WuManber`` matcher.
    """

    # Path of the pattern file, opened relative to the working directory.
    file_patten = "pattens.txt"
    # Class-level defaults kept for backward compatibility.  Every instance
    # gets its own lists in __init__, so creating several FindBooks objects
    # no longer appends the same patterns to shared lists again and again.
    wu_pattens = []
    kmp_pattens = []
    # The matchers are created once, at class-definition time, and are
    # shared by all instances.
    Wu = WM.WuManber(BookNameConstraint)
    Kmp = kmp.KmpSearch(BookNameConstraint)

    def __init__(self):
        """Load the pattern file and initialise both matchers."""
        # The context manager closes the file even if reading fails.
        with open(self.file_patten) as pattern_file:
            patterns = [line.strip(" \n") for line in pattern_file]

        # NOTE(review): blank lines become empty patterns for the Kmp
        # matcher, exactly as before -- confirm KmpSearch tolerates that.
        self.kmp_pattens = [item for item in patterns if len(item) < 5]
        self.wu_pattens = [item for item in patterns if len(item) >= 5]

        self.Wu.InitPatten(self.wu_pattens)
        self.Kmp.InitPattens(self.kmp_pattens)

    def SearchBooks(self, string):
        """Run both matchers on ``string``.

        Returns:
            A two-element list: ``[Wu result, Kmp result]``.
        """
        return [self.Wu.Search(string), self.Kmp.Search(string)]
# Month names and their abbreviations, lower case.
_WHEN_MONTH_NAMES = frozenset([
    'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
    'may', 'june', 'jun', 'july', 'jul', 'august', 'aug', 'september',
    'sep', 'october', 'oct', 'november', 'nov', 'december', 'dec',
])
# Tokens reported as the date/time part of an answer: the month names plus
# every year from 1400 to 1999 (the same values the literal list spelled out).
_WHEN_TIME_VALUES = _WHEN_MONTH_NAMES | frozenset(
    str(year) for year in range(1400, 2000))
# Sentence words that trigger rule 2 and rule 3 respectively.
_WHEN_LAST_CLUES = frozenset(['first', 'last', 'since', 'ago'])
_WHEN_START_CLUES = frozenset(['start', 'begin', 'since', 'year'])


def _when_words(text):
    """Return the lower-cased words of ``text`` (a string or a token list)."""
    tokens = text.split() if hasattr(text, 'split') else list(text)
    words = [token.lower().strip('?.,!;:"\'') for token in tokens]
    return [word for word in words if word]


def _when_dateline_score(question_words):
    """Score how strongly the question points at the story's dateline."""
    score = 0
    for position, word in enumerate(question_words):
        if word == 'happen':
            score += 4
        if word == 'take' and question_words[position + 1:position + 2] == ['place']:
            score += 4
        if word in ('this', 'story'):
            score += 12
    return score


def answering_when(cleansedQuestion, stop_words_free_question, sentence_list,
                   dateline):
    """Answer a "when" question from the story sentences or the dateline.

    Scoring per sentence:
      1. +4 plus ``WM.stemWordMatch(cleansedQuestion, sentence)`` when
         ``NET.named_entity_tagging`` reports a time expression.
      2. +20 when the question contains "the last" and the sentence
         contains one of first/last/since/ago.
      3. +20 when the question contains start/begin and the sentence
         contains one of start/begin/since/year.

    The question is also scored for dateline clues ("happen", "take place",
    "this", "story").  The dateline is returned unless some sentence scores
    strictly higher than that dateline score.

    Fixes relative to the previous version: the question and the sentences
    are compared word by word (they used to be walked character by
    character, so rules 2/3 and the dateline clues could never fire), every
    top-scoring sentence is considered (a stale loop index used to pick the
    sentence), and an empty ``sentence_list`` yields the dateline instead of
    an exception.

    NOTE(review): rules 2 and 3 are applied to every sentence and add their
    bonus once per sentence; the collapsed source did not show the original
    nesting -- confirm against the intended rule set.

    Args:
        cleansedQuestion: The question, as a string or a list of tokens.
        stop_words_free_question: Unused; kept for interface compatibility.
        sentence_list: Story sentences (strings).
        dateline: Text returned when the dateline wins.

    Returns:
        The month/year tokens found in the top-scoring sentence(s) joined by
        spaces; if there are none, the top-scoring sentence(s) joined by
        spaces; or ``dateline``.  The answer is also printed.
    """
    question_words = _when_words(cleansedQuestion)
    asks_the_last = any(
        first == 'the' and second == 'last'
        for first, second in zip(question_words, question_words[1:]))
    asks_start = any(word in ('start', 'begin') for word in question_words)

    scores = []
    for sentence in sentence_list:
        score = 0
        _, _, _, time_list, _ = NET.named_entity_tagging(sentence)
        if time_list:
            score += 4 + WM.stemWordMatch(cleansedQuestion, sentence)

        sentence_words = set(_when_words(sentence))
        if asks_the_last and sentence_words & _WHEN_LAST_CLUES:
            score += 20
        if asks_start and sentence_words & _WHEN_START_CLUES:
            score += 20
        scores.append(score)

    dateline_score = _when_dateline_score(question_words)

    if not scores or max(scores) <= dateline_score:
        result = dateline
    else:
        best_score = max(scores)
        winners = [sentence for sentence, score in zip(sentence_list, scores)
                   if score == best_score]
        time_tokens = [token
                       for sentence in winners
                       for token in nltk.word_tokenize(sentence)
                       if token.lower() in _WHEN_TIME_VALUES]
        result = ' '.join(time_tokens) if time_tokens else ' '.join(winners)

    # Single-string form prints the same bytes as the old
    # "print 'Answer: ', result + '\n'" (hence the two spaces) and is valid
    # under both Python 2 and Python 3.
    print('Answer:  ' + result + '\n')
    return result
# Month names and their abbreviations, lower case.
_WHAT_MONTHS = frozenset([
    'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
    'may', 'june', 'jun', 'july', 'jul', 'august', 'aug', 'september',
    'sep', 'october', 'oct', 'november', 'nov', 'december', 'dec',
])
# Date expressions looked up word by word in a sentence.
# NOTE(review): the multi-word entries can never equal a single
# whitespace-separated word; they are kept unchanged from the original list.
_WHAT_DATE_EXPRESSIONS = frozenset([
    'yesterday', 'today', 'tomorrow', 'last week', 'this week', 'next week',
    'an hour ago', 'now', 'in an hour', 'recently', 'soon',
    'a little while ago', 'at this moment', 'in the near future',
    'a long time ago', 'these days', 'those days', 'future', 'present',
    'past', 'nowadays', 'eventually', 'morning', 'evening', 'night',
    'midnight', 'dawn', 'dusk', 'afternoon', 'noon', 'midday', 'am', 'pm',
    'sunrise', 'sunset', 'lunchtime', 'teatime', 'dinnertime', 'interval',
    'twilight', 'hourly', 'nightly', 'daily', 'monthly', 'weekly',
    'quarterly', 'yearly',
])


def answering_what(cleansedQuestion, stop_words_free_question,
                   complete_sentence_list, sentence_list, dateline):
    """Answer a "what" question with the highest scoring sentence.

    Scoring per sentence (question words come from
    ``nltk.word_tokenize(stop_words_free_question)``, lower-cased):
      1. ``WM.stemWordMatch(cleansedQuestion, sentence)``.
      2. +4 for each date expression in the sentence, per month name in the
         question.
      3. +6 for each sentence word stemming to "call"/"from", per "kind" in
         the question.
      4. +20 for each word of the complete sentence stemming to
         "name"/"call"/"known", per "name" in the question.
      5. +20 when the question has "name of"/"name for" and the tagger finds
         a person in the sentence.
      6. +6 per location in the sentence, per "country"/"countries"/
         "olympics" in the question.

    Rules 2 and 3 used to add to an undefined variable ``count`` and raised
    as soon as they matched; they now add to the sentence score.  The
    question is tokenised once and the entity tagger runs at most once per
    sentence.

    Args:
        cleansedQuestion: Question text handed to the word-match scorer.
        stop_words_free_question: Question text the rule keywords are read
            from.
        complete_sentence_list: Sentences the answer is taken from.
        sentence_list: Sentences used for scoring; parallel to
            ``complete_sentence_list``.
        dateline: Unused; kept for interface compatibility.

    Returns:
        The first sentence of ``complete_sentence_list`` with the highest
        score, or an empty string when there are no sentences.  The answer
        is also printed.
    """
    pairs = list(zip(sentence_list, complete_sentence_list))
    if not pairs:
        # The old code crashed in max() on an empty story.
        print('Answer:  ' + '\n')
        return ''

    question_words = [
        token.lower() for token in nltk.word_tokenize(stop_words_free_question)]
    month_hits = sum(1 for word in question_words if word in _WHAT_MONTHS)
    kind_hits = question_words.count('kind')
    name_hits = question_words.count('name')
    name_pp_hits = sum(
        1 for first, second in zip(question_words, question_words[1:])
        if first == 'name' and second in ('of', 'for'))
    country_hits = sum(
        1 for word in question_words
        if word in ('country', 'countries', 'olympics'))

    stem = SnowballStemmer('english').stem

    scores = []
    for sentence, complete in pairs:
        score = WM.stemWordMatch(cleansedQuestion, sentence)
        words = sentence.split()

        if month_hits:
            date_words = sum(
                1 for word in words if word in _WHAT_DATE_EXPRESSIONS)
            score += 4 * month_hits * date_words
        if kind_hits:
            kind_words = sum(
                1 for word in words if stem(word) in ('call', 'from'))
            score += 6 * kind_hits * kind_words
        if name_hits:
            name_words = sum(
                1 for word in complete.split()
                if stem(word) in ('name', 'call', 'known'))
            score += 20 * name_hits * name_words
        if name_pp_hits or country_hits:
            # Tag the sentence once and reuse the result for both rules.
            person_list, _, loc_list, _, _ = NET.named_entity_tagging(sentence)
            if person_list:
                score += 20 * name_pp_hits
            if loc_list:
                # More locations in the sentence mean more confidence.
                score += 6 * len(loc_list) * country_hits
        scores.append(score)

    # list.index returns the first maximum, i.e. the earliest sentence on ties.
    result = pairs[scores.index(max(scores))][1]

    # Single-string form prints the same bytes as the old
    # "print 'Answer: ', result + '\n'" (hence the two spaces) and is valid
    # under both Python 2 and Python 3.
    print('Answer:  ' + result + '\n')
    return result
# Words treated as a hint that a sentence describes a place.
# NOTE(review): the multi-word entries can never equal a single
# whitespace-separated word; they are kept unchanged from the original list.
_WHERE_PREPOSITIONS = frozenset([
    'in', 'at', 'near', 'inside', 'on', 'behind', 'above', 'under',
    'next to', 'below', 'between', 'around', 'outside', 'among',
    'on the right', 'across', 'front', 'opposite', 'before', 'beneath',
    'beside', 'against',
])


def _where_dateline_score(question):
    """Score how strongly ``question`` points at the story's dateline."""
    tokens = question.split() if hasattr(question, 'split') else list(question)
    words = [token.lower().strip('?.,!;:"\'') for token in tokens]
    words = [word for word in words if word]

    score = 0
    for position, word in enumerate(words):
        if word == 'happen':
            score += 4
        if word == 'take' and words[position + 1:position + 2] == ['place']:
            score += 4
        if word in ('this', 'story'):
            score += 12
    return score


def answering_where(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list, dateline):
    """Answer a "where" question from the story sentences or the dateline.

    Scoring per sentence:
      1. ``WM.stemWordMatch(cleansedQuestion, sentence)``.
      2. +4 when the complete sentence contains a location preposition.
      3. +6 when ``NET.named_entity_tagging`` finds a location in it.

    The question is also scored for dateline clues ("happen", "take place",
    "this", "story").  The dateline is returned unless some sentence scores
    strictly higher than that dateline score.

    Fixes relative to the previous version: dateline clues are compared word
    by word (the question string used to be walked character by character,
    so the dateline score was always 0), locations are reported in story
    order without duplicates (a ``set`` used to give an arbitrary order), the
    winning sentence is returned when no location is known at all (the
    answer used to be empty), and an empty story yields the dateline instead
    of an exception.

    Args:
        cleansedQuestion: The question, as a string or a list of tokens.
        stop_words_free_question: Unused; kept for interface compatibility.
        complete_sentence_list: Sentences the answer text is taken from.
        sentence_list: Sentences used for scoring; parallel to
            ``complete_sentence_list``.
        dateline: Text returned when the dateline wins.

    Returns:
        With exactly one top-scoring sentence: every distinct location
        string found in the story, joined by spaces.  With several
        top-scoring sentences: the first of them.  Otherwise ``dateline``.
        The answer is also printed.
    """
    scores = []
    location_strings = []
    for sentence, complete in zip(sentence_list, complete_sentence_list):
        score = WM.stemWordMatch(cleansedQuestion, sentence)
        if any(word.lower() in _WHERE_PREPOSITIONS for word in complete.split()):
            score += 4
        _, _, loc_list, _, _ = NET.named_entity_tagging(sentence)
        if loc_list:
            score += 6
            location_strings.append(' '.join(loc_list))
        scores.append(score)

    dateline_score = _where_dateline_score(cleansedQuestion)

    if not scores or max(scores) <= dateline_score:
        result = dateline
    else:
        best_score = max(scores)
        winners = [complete
                   for complete, score in zip(complete_sentence_list, scores)
                   if score == best_score]
        if len(winners) == 1:
            # Keep the long-standing behaviour of reporting every location
            # seen in the story, but in a stable, first-seen order.
            distinct = []
            seen = set()
            for location in location_strings:
                if location not in seen:
                    seen.add(location)
                    distinct.append(location)
            result = ' '.join(distinct) or winners[0]
        else:
            # Several sentences tie: answer with the earliest one.
            result = winners[0]

    # Single-string form prints the same bytes as the old
    # "print 'Answer: ', result + '\n'" (hence the two spaces) and is valid
    # under both Python 2 and Python 3.
    print('Answer:  ' + result + '\n')
    return result
# Part-of-speech tags treated as verbs when matching question and sentence.
_WHO_VERB_TAGS = ('VB', 'VBD', 'VBZ', 'VBN')
# Verb lemmas that are ignored when collecting the question's verbs.
# NOTE(review): 'buy' looks like a typo for 'by'; left unchanged.
_WHO_STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
    'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with',
])


def answering_who(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, sent_person_list,
                  sent_prof_list):
    """Answer a "who" question from the best scoring story sentence.

    Scoring per sentence:
      1. ``WM.stemWordMatch(cleansedQuestion, sentence)``.
      2. Question names no person: +6 when the sentence has a person or a
         profession.
      3. Question names no person: +4 per sentence word lemmatising to
         "name".
      4. +4 when the sentence has a person or a profession.
      5. +6 per sentence verb whose lemma is also a (non stop-word) verb of
         the question.
      6. Question names a profession: +18 per sentence word found in the
         question's profession list; otherwise +6 when the sentence has a
         profession.

    The earliest sentence with the highest score is the candidate.  The
    answer is then, in order of preference: the candidate's person names
    not occurring in the question; its professions (only when the question
    names a person and the sentence has no person); its noun phrases not
    occurring in the question; the whole sentence.  When neither question
    nor candidate has a person, the candidate's words that do not occur in
    the question are returned instead.

    Changes relative to the previous version: an empty story returns an
    empty string instead of raising, the lemmatizer is created once, and an
    unreachable "question has a location" branch was removed (every case it
    tested was already handled by an earlier branch).

    Args:
        cleansedQuestion: The question as a string.
        stop_words_free_question: Unused; kept for interface compatibility.
        complete_sentence_list: Sentences the answer is taken from.
        sentence_list: Sentences handed to the word-match scorer; parallel
            to ``complete_sentence_list``.
        sent_person_list: Per sentence, the list of person names in it.
        sent_prof_list: Per sentence, the list of professions in it.

    Returns:
        The answer string as described above.
    """
    if not complete_sentence_list:
        # The old code crashed in max() on an empty story.
        return ''

    question_text = cleansedQuestion.replace('?', '')
    lemmatizer = WordNetLemmatizer()

    question_verbs = set()
    for entry in POS_Tagging.pos_tagging(question_text):
        if entry[1] in _WHO_VERB_TAGS:
            lemma = lemmatizer.lemmatize(entry[0], 'v')
            if lemma not in _WHO_STOP_WORDS:
                question_verbs.add(lemma)

    (q_person_list, _org, _loc, _month, _time, _money, _percent,
     q_prof_list) = NER.named_entity_recognition(question_text)

    scores = []
    for position, complete in enumerate(complete_sentence_list):
        mentions_human = bool(
            sent_person_list[position] or sent_prof_list[position])
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[position])

        if not q_person_list:
            if mentions_human:
                score += 6
            score += 4 * sum(
                1 for word in complete.split()
                if lemmatizer.lemmatize(word.lower()) == 'name')

        if mentions_human:
            score += 4

        for entry in POS_Tagging.pos_tagging(complete):
            if (entry[1] in _WHO_VERB_TAGS
                    and lemmatizer.lemmatize(entry[0], 'v') in question_verbs):
                score += 6

        if q_prof_list:
            score += 18 * sum(
                1 for word in complete.split() if word.lower() in q_prof_list)
        elif sent_prof_list[position]:
            score += 6

        scores.append(score)

    # list.index returns the first maximum: ties go to the earliest sentence.
    best_index = scores.index(max(scores))
    candidate = complete_sentence_list[best_index]
    candidate_persons = sent_person_list[best_index]
    candidate_professions = sent_prof_list[best_index]

    # NOTE(review): every "not in question_text" test below is a substring
    # test against the question string (so a short word such as "he" is
    # dropped when the question contains "the").  This matches the previous
    # behaviour and keeps multi-word names working.

    if not q_person_list and not candidate_persons:
        # No name on either side: show the sentence minus the question words.
        return ' '.join(
            word for word in candidate.split() if word not in question_text)

    answer_parts = []
    if candidate_persons:
        # Names in the sentence that the question did not already mention.
        answer_parts = [name for name in candidate_persons
                        if name not in question_text]
    elif candidate_professions:
        # "Who is X"-style question: the answer may be a profession.
        answer_parts = list(candidate_professions)

    if answer_parts:
        return ' '.join(answer_parts)

    # Fall back to the sentence's noun phrases that are not in the question.
    noun_phrases = [phrase
                    for phrase in POS_Tagging.pos_noun_tagging(candidate)
                    if phrase not in question_text]
    if noun_phrases:
        return ' '.join(noun_phrases)
    return candidate
def answering_when(
    cleansedQuestion, stop_words_free_question, complete_sentence_list, sentence_list, dateline, month_list, time_list
):
    """Pick an answer for a "when" question from rule-scored story sentences.

    Each sentence gets a score from three rules (time expression + word match,
    "the last" clue, "start/begin" clue).  The question itself gets a
    "dateline" score.  If the best sentence score does not exceed the dateline
    score, the dateline is returned; otherwise time-related words are extracted
    from the best sentence.

    Args:
        cleansedQuestion: question text (string; split on whitespace here).
        stop_words_free_question: unused; kept so existing callers keep working.
        complete_sentence_list: sentences the answer text is extracted from.
        sentence_list: sentences used for scoring, index-aligned with
            complete_sentence_list (presumably a normalised form -- confirm
            against the caller).
        dateline: value returned when no sentence beats the dateline score.
        month_list: per-sentence lists of month expressions, index-aligned
            with sentence_list.
        time_list: per-sentence lists of time expressions, index-aligned with
            sentence_list.

    Returns:
        The dateline, or a space-joined string of answer words.  Answer words
        are de-duplicated in first-seen order so the result is deterministic.
        An empty sentence_list yields the dateline.
    """
    # Without sentences there is nothing to score; max() below would raise.
    if not sentence_list:
        return dateline

    stanford_stop_words_list = [
        "a", "an", "and", "are", "as", "at", "be", "buy", "for", "from",
        "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
        "to", "was", "were", "will", "with",
    ]
    time_nos = [
        "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
        "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
        "hundred", "thousand", "million", "billion", "trillion",
    ]
    time_prep = ["over", "period", "within", "inside", "under", "ago", "through", "past"]
    last_clues = ["first", "last", "since", "ago"]
    start_clues = ["start", "begin", "since", "year"]

    # Question text used to filter answer words: double quotes dropped, single
    # quotes turned into double quotes, question mark dropped.
    temp_q = cleansedQuestion.replace('"', "").replace("'", '"').replace("?", "")

    # The question does not change per sentence, so analyse it once.
    question_words = [word.lower() for word in cleansedQuestion.split()]
    the_last_hits = sum(
        1
        for m in range(len(question_words) - 1)
        if question_words[m] == "the" and question_words[m + 1] == "last"
    )
    start_hits = sum(1 for word in question_words if word in ["start", "begin"])

    ##################### SENTENCE SCORING #####################
    sent_score_list = []
    for i in range(len(sentence_list)):
        score = 0
        # 1. Sentence contains a time expression: good clue plus word match.
        if time_list[i] != [] or month_list[i] != []:
            score = score + 4 + WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        sentence_words = sentence_list[i].split()
        # 2. Question contains "the last" and sentence contains one of
        #    first/last/since/ago: +20 for every (occurrence, clue word) pair.
        if the_last_hits:
            score = score + 20 * the_last_hits * sum(1 for w in sentence_words if w in last_clues)
        # 3. Question contains start/begin and sentence contains one of
        #    start/begin/since/year: +20 for every pair.
        if start_hits:
            score = score + 20 * start_hits * sum(1 for w in sentence_words if w in start_clues)
        sent_score_list.append(score)

    ##################### DATELINE SCORE FOR THE QUESTION #####################
    dateline_score = 0
    last_position = len(question_words) - 1
    for i, word in enumerate(question_words):
        if word == "happen":
            dateline_score = dateline_score + 4
        if i != last_position and word == "take" and question_words[i + 1] == "place":
            dateline_score = dateline_score + 4
        if word == "this":
            dateline_score = dateline_score + 20
        if word == "story":
            dateline_score = dateline_score + 20

    max_score_value = max(sent_score_list)
    if max_score_value <= dateline_score:
        return dateline

    # Among the top-scoring sentences prefer the first one that has a month
    # expression; otherwise take the first top-scoring sentence.
    candidate_indices = [i for i in range(len(sentence_list)) if sent_score_list[i] == max_score_value]
    index = candidate_indices[0]
    for candidate in candidate_indices:
        if month_list[candidate] != []:
            index = candidate
            break
    temp_str = complete_sentence_list[index]

    ################### SENTENCE PROCESSING #######################
    s_monthlist = month_list[index]
    s_timelist = time_list[index]
    sentence_tokens = temp_str.split()

    # Fallback answer: the sentence minus tokens found in the question text.
    # NOTE(review): `k not in temp_q` is a substring test against the question
    # string (kept from the original), not a whole-word test.
    leftover = " ".join(k for k in sentence_tokens if k not in temp_q)

    if s_monthlist == [] and s_timelist == []:
        return leftover

    answer_list = []
    for month in s_monthlist:
        # Skip months that already appear in the question.
        if month not in temp_q:
            answer_list.append(month)

    if s_timelist != []:
        for j, token in enumerate(sentence_tokens):
            if token in s_timelist and j != 0 and token not in temp_q:
                previous = sentence_tokens[j - 1]
                # The word before a time word is usually a number or qualifier.
                answer_list.append(previous.lower())
                # If that word is only a stop word, also take the one before it.
                if previous in stanford_stop_words_list and j - 2 >= 0:
                    answer_list.append(sentence_tokens[j - 2].lower())
        for time_value in s_timelist:
            if time_value not in temp_q:
                answer_list.append(time_value)

    # Time prepositions, digits and number words complete the answer.
    for k in sentence_tokens:
        if k.lower() in time_prep:
            answer_list.append(k.lower())
        if k.isdigit():
            answer_list.append(k)
        if k.lower() in time_nos:
            answer_list.append(k.lower())

    if answer_list != []:
        # De-duplicate while keeping first-seen order (a plain set() would
        # emit the words in arbitrary order).
        seen = set()
        unique_answers = []
        for item in answer_list:
            if item not in seen:
                seen.add(item)
                unique_answers.append(item)
        return " ".join(unique_answers)
    return leftover
def answering_how(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_time_list,sent_percent_list):
    """Pick an answer for a "how" question (how many / much / often / long).

    Every sentence is scored by word match plus question-type specific clues;
    the first top-scoring sentence is then reduced to the words that look like
    the answer.

    Args:
        cleansedQuestion: question text (string).
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: sentences the answer is extracted from.
        sentence_list: sentences passed to the word-match scorer, index-aligned
            with complete_sentence_list.
        sent_time_list: unused; kept for interface compatibility.
        sent_percent_list: unused; kept for interface compatibility.

    Returns:
        A space-joined answer string: the "<x> per cent" phrase if present,
        else the number-like words for many/much questions (may be empty),
        else the sentence minus the question words.  Returns '' when there are
        no sentences.

    Side effects:
        Writes diagnostic lines to stdout (kept from the original; written as
        single-argument print calls so the text is the same on Python 2 and 3).
    """
    # Nothing to score; max() and final_sent_list[0] below would raise.
    if not complete_sentence_list or not sentence_list:
        return ''

    much_list=['thousand','thousands','hundred','hundreds','dollars','cents','million','billion','trillion','none','nothing','everything','few','something',
               'dollars','grams','kilos','kilogram','kilograms','milligrams','mg','metre','centimetre','inches','feet','foot','ft','cent','percent','salary','pay','income','loss','profit','one','two','three','four','five','six','seven','eight','nine','ten',
               'twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety',
               'hour','hours','minutes','seconds','second','minute','half','quarter','more','less','than']
    many_list=['one','two','three','four','five','six','seven','eight','nine','ten',
               'twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety','hundred',
               'thousand','million','billion','trillion']
    # A missing comma used to fuse 'millennium' and 'day' into 'millenniumday'.
    how_often=['daily','weekly','bi-weekly','fortnightly','monthly','bi-monthly','quarterly','half-yearly','yearly','decade','millennium',
               'day','everyday','night','afternoon','noon','hourly','hours','minutes','seconds','second','minute']
    money_words=['money','earn','salary','profit','loss']
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")
    abbreviation_list=[('Mt.','Mount')]

    ########################### QUESTION PROCESSING ##################
    temp_q=cleansedQuestion.replace('?','')
    for token in temp_q.split():
        for short_form, long_form in abbreviation_list:
            # Whole-token comparison: a substring test would also rewrite
            # tokens such as 't' or '.'.
            if token == short_form:
                temp_q=temp_q.replace(token,long_form)
    print('Question is : ' + temp_q)
    question_tokens=temp_q.split()

    ################## SENTENCE PROCESSING AND SCORING ###################
    sent_score_list=[]
    for i in range(0,len(complete_sentence_list)):
        # 1. Word match score
        score = WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        sentence_tokens=complete_sentence_list[i].split()
        for q_word in question_tokens:
            q_lower=q_word.lower()
            if q_lower=="many":
                # 2. "many" + a number expression in the sentence: +6 per hit
                for m in sentence_tokens:
                    if nums.match(m) or m in many_list:
                        score=score+6
            elif q_lower=="much":
                # 3. "much" + a money/quantity expression: +6 per hit
                for m in sentence_tokens:
                    if m.lower() in money_words or m in much_list:
                        score=score+6
            elif q_lower=='often' or q_lower=='long':
                # 4. "often"/"long" + a frequency/time word: +10 once
                for m in sentence_tokens:
                    if m in how_often:
                        score=score+10
                        break
        sent_score_list.append(score)

    print('Score list is: ' + str(sent_score_list))
    max_score_value=max(sent_score_list)

    # Sentences with the highest score, in story order
    final_sent_list=[]
    for i in range(0,len(sentence_list)):
        if sent_score_list[i]==max_score_value:
            final_sent_list.append(complete_sentence_list[i])
    print('Final sent list is: ' + str(final_sent_list))

    # The first top-scoring sentence is used whether or not there is a tie.
    best_sentence=final_sent_list[0]
    # Drop the final period only when it is the first period in the sentence
    # (so "Mt. Everest is tall." is left alone).  find() tolerates sentences
    # without any period.
    if best_sentence.find('.')==len(best_sentence)-1:
        best_sentence=best_sentence[:-1]
    temp2=best_sentence.split()

    # "<x> per cent" is most probably the answer to a how much/many question.
    # Stop one short of the end so temp2[k+1] always exists.
    for k in range(0,len(temp2)-1):
        if temp2[k].lower()=='per' and temp2[k+1].lower()=='cent':
            return ' '.join(temp2[max(k-1,0):k+2])

    temp_solution=[]
    if 'many' in question_tokens:
        for token in temp2:
            if nums.match(token) or token in many_list:
                print('Yes')
                temp_solution.append(token)
        print('Temp solution is: ' + str(temp_solution))
        return ' '.join(temp_solution)
    elif 'much' in question_tokens:
        for token in temp2:
            if nums.match(token) or token in much_list:
                temp_solution.append(token)
        return ' '.join(temp_solution)

    # Otherwise: the sentence minus the words of the question
    temp_result=[]
    for k in temp2:
        if k not in question_tokens:
            temp_result.append(k)
    return ' '.join(temp_result)
def answering_who(cleansedQuestion,stop_words_free_question,sentence_list):
    """Pick an answer for a "who" question from rule-scored sentences.

    Each sentence is scored by word match plus person-name clues.  With a
    single top-scoring sentence the answer is the person names tagged in it.
    On a tie the first top-scoring sentence is used: its names are returned
    unless they are empty or already occur in the question, in which case the
    whole sentence is returned.

    Args:
        cleansedQuestion: question text (string) used for scoring and for
            named-entity tagging of the question.
        stop_words_free_question: question text without stop words; used to
            check whether a candidate name already occurs in the question.
        sentence_list: candidate sentences (strings).

    Returns:
        The answer string ('' when sentence_list is empty or when the single
        winning sentence has no tagged person).

    Side effects:
        Prints the answer to stdout as "Answer:  <answer>" followed by a blank
        line (same text as the original Python 2 print statement).
    """
    # Nothing to score; max() below would raise on an empty list.
    if not sentence_list:
        result=''
        print('Answer:  ' + result + '\n')
        return result

    snowball_stemmer = SnowballStemmer('english')

    # The question does not change per sentence, so tag it once.  The first
    # element of the returned 5-tuple is treated as the person-name list.
    q_person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(cleansedQuestion)

    sent_score_list=[]
    # Person names per sentence, always a plain list at the sentence's index
    # (the original mixed (names, index) tuples with bare lists).
    master_person_list=[]
    for sentence in sentence_list:
        # 1. Score using word match rule
        score=WM.stemWordMatch(cleansedQuestion,sentence)

        sent_plist,sent_olist,sent_llist,sent_tlist,sent_proflist=NET.named_entity_tagging(sentence)
        master_person_list.append(sent_plist)

        if q_person_list==[]:
            # 2. Question has no name but the sentence does: +6 per name
            if sent_plist !=[]:
                score=score + 6*len(sent_plist)
            # 3. Question has no name and the sentence contains a word whose
            #    stem is "name": +4 per such word
            for word in sentence.split():
                if snowball_stemmer.stem(word.lower())=='name':
                    score=score +4
        else:
            # 4. Question has a name: reward the same names, or any name /
            #    the substring "name" in the sentence
            if sent_plist==q_person_list:
                score=score+4*len(sent_plist)
            elif sent_plist != [] or "name" in sentence:
                score=score+4
        sent_score_list.append(score)

    # Top-scoring sentences in story order; ties go to the first one.
    max_score_value=max(sent_score_list)
    candidate_indices=[i for i in range(0, len(sentence_list)) if sent_score_list[i]==max_score_value]
    best_index=candidate_indices[0]
    # Reuses the tagging done during scoring for the same sentence text
    # (assumes the tagger returns the same result for the same input).
    names=' '.join(master_person_list[best_index])

    if len(candidate_indices)==1:
        result=names
    elif names != '' and names not in stop_words_free_question:
        result=names
    else:
        # No usable name: fall back to the whole sentence
        result=sentence_list[best_index]

    print('Answer:  ' + result + '\n')
    return result
def train(FIS_name, data, target_col, mf, Ncentroids, overlap, alpha=0.5, iterations=50, sa=False, sa_plot=False):
    '''
    Train a FIS and write its properties to "<FIS_name>.FIS".

    The data is scaled and clustered, a rule base is learned with WM.learn
    and, when `sa` is set, replaced by the result of SA.search.

    Inputs (as described by the original author):
        FIS_name: base name of the output file; '.FIS' is appended
        data: numpy array of size > number of centroids x 2
        target_col: integer index of the target column
        mf: 'triangle', 'trapezoid' or 'Gaussian'
        Ncentroids: either an integer (the same for each feature) or an
            array of size = number of features
        overlap: number between 0 and 1; for a gaussian mf it is the
            variance, for triangle/trapezoid it is half of the base
        alpha: passed through to SA.search
        iterations: number of iterations for the simulated annealing
        sa: when true, run SA.search on the learned rule base
        sa_plot: passed to SA.search as its `plot` argument

    Returns:
        None. The method name, mf, overlap, target centroids, feature
        centroids and rule base are handed to write() for the FIS file.
    '''
    # Scale first; the per-column minima/maxima are needed by the annealer.
    scaled, lower_bounds, upper_bounds = scale(data)

    all_centroids = cluster(scaled, target_col, Ncentroids, plot=False)

    # Learn the WM rule base on the full (target included) data.
    rule_base = WM.learn(scaled, all_centroids, overlap, mf, target_col)

    # Separate the target column from the features, for centroids and data.
    targets = scaled[:, target_col]
    target_centroids = all_centroids[target_col]
    feature_centroids = np.delete(all_centroids, target_col, 0)
    features = np.delete(scaled, target_col, 1)

    if sa:
        # Simulated annealing produces the new rule base.
        method = 'WM+SA'
        rule_base = SA.search(features, targets, rule_base, alpha, feature_centroids,
                              overlap, mf, target_centroids,
                              lower_bounds[target_col], upper_bounds[target_col],
                              plot=sa_plot, iterations=iterations)
    else:
        method = 'WM'

    # Write FIS file in the format: FIS_name.FIS
    with open(FIS_name + '.FIS', "w") as fis_file:
        write(fis_file, method, mf, overlap, target_centroids, feature_centroids, rule_base)
def answering_where(cleansedQuestion, stop_words_free_question, complete_sentence_list, sentence_list, dateline):
    """Pick an answer for a "where" question from rule-scored sentences.

    Each sentence is scored by word match, presence of a location preposition
    and presence of a tagged location.  The question gets a "dateline" score;
    if no sentence beats it, the dateline is the answer.

    Args:
        cleansedQuestion: question text.  A string is split into words for the
            dateline rules; any other iterable is used as a word sequence.
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: sentences used for the preposition check and
            returned as the answer on a tie.
        sentence_list: sentences used for word match and entity tagging,
            index-aligned with complete_sentence_list.
        dateline: answer used when the dateline score is not exceeded.

    Returns:
        The dateline, the tagged locations (single best sentence), or the
        first best sentence (tie).  An empty sentence_list yields the dateline.

    Side effects:
        Prints the answer to stdout as "Answer:  <answer>" followed by a blank
        line (same text as the original Python 2 print statement).
    """
    # Nothing to score; max() below would raise on an empty list.
    if not sentence_list:
        result = dateline
        print('Answer:  ' + result + '\n')
        return result

    # NOTE(review): multi-word entries ('next to', 'on the right') can never
    # equal a single whitespace-separated token; kept as in the original.
    location_prepositions = [
        'in', 'at', 'near', 'inside', 'on', 'behind', 'above', 'under',
        'next to', 'below', 'between', 'around', 'outside', 'among',
        'on the right', 'across', 'front', 'opposite', 'before', 'beneath',
        'beside', 'against'
    ]

    sent_score_list = []
    master_loc_list = []  # joined location strings of every sentence that has one
    for i in range(0, len(sentence_list)):
        # 1. Word match score
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        # 2. A location preposition in the sentence is a good clue
        if any(token in location_prepositions for token in complete_sentence_list[i].split()):
            score = score + 4

        # 3. A tagged location entity in the sentence
        person_list, org_list, loc_list, time_list, prof_list = NET.named_entity_tagging(
            sentence_list[i])
        if loc_list != []:
            score = score + 6
            master_loc_list.append(' '.join(loc_list))

        sent_score_list.append(score)

    # For when and where questions the answer could also be the dateline.
    # The rules look at whole words; indexing the question string directly
    # would compare single characters and never match.
    if hasattr(cleansedQuestion, 'split'):
        question_words = cleansedQuestion.split()
    else:
        question_words = list(cleansedQuestion)

    dateline_score = 0
    last_position = len(question_words) - 1
    for i in range(0, len(question_words)):
        word = question_words[i].lower()
        # 1. "happen" is a good clue
        if word == 'happen':
            dateline_score = dateline_score + 4
        # 2. "take place" is a good clue
        if i != last_position and word == 'take' and question_words[i + 1].lower() == 'place':
            dateline_score = dateline_score + 4
        # 3. "this" is a slam dunk
        if word == 'this':
            dateline_score = dateline_score + 12
        # 4. "story" is a slam dunk
        if word == 'story':
            dateline_score = dateline_score + 12

    max_score_value = max(sent_score_list)

    if max_score_value > dateline_score:
        best_indices = [i for i in range(0, len(sent_score_list)) if sent_score_list[i] == max_score_value]
        if len(best_indices) == 1:
            # NOTE(review): as in the original, this joins the locations of
            # ALL sentences, not only the winning one -- confirm intent.
            # De-duplicated in first-seen order so the output is stable.
            seen = set()
            answer_loc = []
            for location in master_loc_list:
                if location not in seen:
                    seen.add(location)
                    answer_loc.append(location)
            result = ' '.join(answer_loc)
        else:
            # Tie: answer with the first top-scoring sentence
            result = complete_sentence_list[best_indices[0]]
    else:
        result = dateline

    print('Answer:  ' + result + '\n')
    return result
def answering_when(cleansedQuestion, stop_words_free_question, sentence_list, dateline):
    """Answer a WHEN question from the story sentences or from the dateline.

    Every sentence is scored with three rules:

    1. if ``NET.named_entity_tagging`` reports a time expression, the
       sentence earns 4 points plus ``WM.stemWordMatch(question, sentence)``;
    2. for each occurrence of the phrase "the last" in the question, every
       sentence word in {first, last, since, ago} earns 20 points;
    3. for each question word in {start, begin}, every sentence word in
       {start, begin, since, year} earns 20 points.

    The question also earns a dateline score (``happen`` +4, ``take place``
    +4, ``this`` +12, ``story`` +12).  Only when the best sentence score is
    strictly greater than the dateline score is an answer taken from the
    sentences; otherwise ``dateline`` is returned.

    The chosen answer is echoed to stdout as ``Answer:  <text>`` followed by
    a blank line.

    Args:
        cleansedQuestion: the question, either a whitespace separated string
            or a sequence of word tokens.
        stop_words_free_question: unused; kept so existing callers keep
            working.
        sentence_list: the story sentences, one string per sentence.
        dateline: text returned when no sentence beats the dateline score
            (also returned for an empty ``sentence_list``).

    Returns:
        The month names / years (1400-1999) found in the best scoring
        sentence(s) joined by spaces, or the best sentence(s) themselves when
        they contain no such token, or ``dateline``.
    """
    month_names = [
        'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
        'may', 'june', 'jun', 'july', 'jul', 'august', 'aug', 'september',
        'sep', 'october', 'oct', 'november', 'nov', 'december', 'dec',
    ]
    # Same vocabulary as the former 600-element literal: months + 1400..1999.
    when_time_values = set(month_names)
    when_time_values.update(str(year) for year in range(1400, 2000))

    last_cues = ('first', 'last', 'since', 'ago')
    start_cues = ('start', 'begin', 'since', 'year')

    def _words(text):
        # Strings are split on whitespace; token sequences are used as given.
        # Iterating a plain string would yield single characters, which can
        # never equal any of the cue words.
        return text.split() if hasattr(text, 'split') else list(text)

    def _report(answer):
        # One parenthesised argument prints the same text whether ``print``
        # is the Python 2 statement or the Python 3 function.  The explicit
        # ' ' reproduces the separator of ``print 'Answer: ', answer``.
        print('Answer: ' + ' ' + answer + '\n')
        return answer

    question_words = [word.lower() for word in _words(cleansedQuestion)]

    # Question-only facts, computed once instead of once per sentence.
    the_last_hits = sum(
        1 for first, second in zip(question_words, question_words[1:])
        if first == 'the' and second == 'last')
    start_hits = sum(1 for word in question_words if word in ('start', 'begin'))

    # NOTE(review): rules 2 and 3 are applied to every sentence, not only to
    # those with a time expression; the flattened source does not show the
    # nesting, so this follows the numbered rule comments -- confirm.
    scores = []
    for sentence in sentence_list:
        score = 0
        _persons, _orgs, _locs, time_list, _profs = NET.named_entity_tagging(sentence)
        if time_list:
            score += 4 + WM.stemWordMatch(cleansedQuestion, sentence)

        sentence_words = _words(sentence)
        if the_last_hits:
            score += 20 * the_last_hits * sum(
                1 for word in sentence_words if word in last_cues)
        if start_hits:
            score += 20 * start_hits * sum(
                1 for word in sentence_words if word in start_cues)
        scores.append(score)

    # The dateline of the story can itself be the answer.
    dateline_score = 0
    for position, word in enumerate(question_words):
        if word == 'happen':
            dateline_score += 4
        if word == 'take' and question_words[position + 1:position + 2] == ['place']:
            dateline_score += 4
        if word == 'this':
            dateline_score += 12
        if word == 'story':
            dateline_score += 12

    # An empty story has no best sentence; fall back to the dateline instead
    # of letting max() raise on an empty sequence.
    if not scores:
        return _report(dateline)

    best_score = max(scores)
    if best_score <= dateline_score:
        return _report(dateline)

    # NOTE(review): every top-scoring sentence is collected; in the flattened
    # source the ``for`` over the score table sits right behind a '#', so it
    # may have been commented out -- confirm against the original file.
    best_sentences = [
        sentence for sentence, score in zip(sentence_list, scores)
        if score == best_score
    ]

    # Prefer the bare month / year tokens over whole sentences.
    time_tokens = [
        token
        for sentence in best_sentences
        for token in nltk.word_tokenize(sentence)
        if token.lower() in when_time_values
    ]
    if time_tokens:
        return _report(' '.join(time_tokens))
    return _report(' '.join(best_sentences))
def answering_when(cleansedQuestion, stop_words_free_question, complete_sentence_list,
                   sentence_list, dateline, month_list, time_list):
    """Answer a WHEN question using pre-computed month/time entities.

    Scoring of sentence ``i``:

    1. if ``time_list[i]`` or ``month_list[i]`` is non-empty: 4 points plus
       ``WM.stemWordMatch(cleansedQuestion, sentence_list[i])``;
    2. for each "the last" bigram in the question, every sentence word in
       {first, last, since, ago} earns 20 points;
    3. for each question word in {start, begin}, every sentence word in
       {start, begin, since, year} earns 20 points.

    The question earns a dateline score (``happen`` +4, ``take place`` +4,
    ``this`` +20, ``story`` +20).  ``dateline`` is returned unless the best
    sentence score is strictly greater than that score.

    Among equally scored sentences the first one that has a month entity is
    preferred, otherwise the first one in story order.

    Args:
        cleansedQuestion: the question as a string.
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: full sentence strings; the answer text is
            taken from these.
        sentence_list: sentence strings used for scoring, parallel to
            ``complete_sentence_list``.
        dateline: fallback answer (also returned for an empty story).
        month_list: per-sentence lists of month expressions.
        time_list: per-sentence lists of time expressions.

    Returns:
        A space separated string: the month/time expressions of the chosen
        sentence together with neighbouring words, digits and number words,
        or the chosen sentence minus the words found in the question, or
        ``dateline``.
    """
    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ]
    time_nos = [
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
        'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million',
        'billion', 'trillion',
    ]
    time_prep = ['over', 'period', 'within', 'inside', 'under', 'ago',
                 'through', 'past']
    last_cues = ('first', 'last', 'since', 'ago')
    start_cues = ('start', 'begin', 'since', 'year')

    # Question text used for the "already mentioned in the question" tests:
    # double quotes dropped, single quotes turned into double quotes, '?'
    # dropped (kept exactly as the original did it).
    # NOTE(review): ``x not in temp_q`` is a substring test on this string,
    # so short words such as 'a' count as "in the question" whenever that
    # letter sequence occurs anywhere in it; left unchanged on purpose.
    temp_q = cleansedQuestion.replace('"', '').replace("'", '"').replace('?', '')

    # Question-only facts, computed once instead of once per sentence.
    question_words = [word.lower() for word in cleansedQuestion.split()]
    the_last_hits = sum(
        1 for first, second in zip(question_words, question_words[1:])
        if first == 'the' and second == 'last')
    start_hits = sum(1 for word in question_words if word in ('start', 'begin'))

    # NOTE(review): rules 2 and 3 are applied to every sentence, not only to
    # those with a time/month entity; the flattened source does not show the
    # nesting, so this follows the numbered rule comments -- confirm.
    scores = []
    for i, sentence in enumerate(sentence_list):
        score = 0
        if time_list[i] != [] or month_list[i] != []:
            score += 4 + WM.stemWordMatch(cleansedQuestion, sentence)
        words = sentence.split()
        if the_last_hits:
            score += 20 * the_last_hits * sum(1 for w in words if w in last_cues)
        if start_hits:
            score += 20 * start_hits * sum(1 for w in words if w in start_cues)
        scores.append(score)

    dateline_score = 0
    for position, word in enumerate(question_words):
        if word == 'happen':
            dateline_score += 4
        if word == 'take' and question_words[position + 1:position + 2] == ['place']:
            dateline_score += 4
        if word == 'this':
            dateline_score += 20
        if word == 'story':
            dateline_score += 20

    # An empty story has no best sentence: answer with the dateline instead
    # of letting max() raise on an empty sequence.
    if not scores:
        return dateline

    best_score = max(scores)
    if best_score <= dateline_score:
        return dateline

    # Tie-break: first candidate carrying a month entity, else the first one.
    candidates = [i for i, score in enumerate(scores) if score == best_score]
    index = next((i for i in candidates if month_list[i] != []), candidates[0])

    temp_str = complete_sentence_list[index]
    for unwanted in ('"', ',', '?', '!'):
        temp_str = temp_str.replace(unwanted, '')
    sentence_words = temp_str.split()

    def _sentence_without_question_words():
        return ' '.join(w for w in sentence_words if w not in temp_q)

    s_monthlist = month_list[index]
    s_timelist = time_list[index]
    if s_monthlist == [] and s_timelist == []:
        # No time or month expression: whole sentence minus question words.
        return _sentence_without_question_words()

    # Months not already named in the question.
    answer_list = [month for month in s_monthlist if month not in temp_q]

    if s_timelist != []:
        for j, word in enumerate(sentence_words):
            if j == 0 or word not in s_timelist or word in temp_q:
                continue
            # The word before a time value is usually its number or another
            # indication of the time; if that word is only a stop word, the
            # one before it is taken as well.
            previous = sentence_words[j - 1]
            answer_list.append(previous.lower())
            if previous in stanford_stop_words_list and j - 2 >= 0:
                answer_list.append(sentence_words[j - 2].lower())
        answer_list.extend(t for t in s_timelist if t not in temp_q)

    # Prepositions of time, digits and number words complete the answer.
    for word in sentence_words:
        lowered = word.lower()
        if lowered in time_prep:
            answer_list.append(lowered)
        if word.isdigit():
            answer_list.append(word)
        if lowered in time_nos:
            answer_list.append(lowered)

    # De-duplicate in first-seen order.  Joining a plain set() produced an
    # arbitrary word order that could differ from run to run.
    seen = set()
    ordered_answer = []
    for item in answer_list:
        if item not in seen:
            seen.add(item)
            ordered_answer.append(item)

    if ordered_answer:
        return ' '.join(ordered_answer)
    return _sentence_without_question_words()
def answering_who(cleansedQuestion, stop_words_free_question, complete_sentence_list,
                  sentence_list, sent_person_list, sent_prof_list):
    """Answer a WHO question with person/profession names or noun phrases.

    Scoring of sentence ``i``:

    1. ``WM.stemWordMatch(cleansedQuestion, sentence_list[i])``;
    2. question has no person name and the sentence has a person or a
       profession: +6;
    3. question has no person name: +4 for every sentence word whose lemma
       is ``name``;
    4. the sentence has a person or a profession: +4;
    5. +6 for every sentence verb (VB/VBD/VBZ/VBN) whose lemma is one of the
       question's non-stop-word verbs;
    6. if the question names a profession: +18 for every sentence word found
       in the question's profession list; otherwise +6 when the sentence has
       a profession.

    The first sentence with the highest score is used.  The answer is, in
    order of preference: the sentence minus question words (when neither
    question nor sentence has a person), the sentence's person names that do
    not occur in the question, the sentence's professions (question has a
    person, sentence has none), the sentence's noun phrases that do not
    occur in the question, or the whole sentence.

    Args:
        cleansedQuestion: the question as a string.
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: full sentence strings.
        sentence_list: sentence strings used for the word-match score,
            parallel to ``complete_sentence_list``.
        sent_person_list: per-sentence lists of person names.
        sent_prof_list: per-sentence lists of profession names.

    Returns:
        The answer as a string; ``''`` when there are no sentences.
    """
    # Nothing to score: avoid max() raising on an empty sequence below.
    if not complete_sentence_list:
        return ''

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for',
        'from', 'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
        'that', 'the', 'to', 'was', 'were', 'will', 'with',
    ]
    verb_tags = ['VB', 'VBD', 'VBZ', 'VBN']

    # NOTE(review): ``x not in temp_q`` below is a substring test on this
    # string, not a word-level comparison; kept as in the original.
    temp_q = cleansedQuestion.replace('?', '')

    # One lemmatizer for the whole call (it used to be re-created for every
    # sentence inside the scoring loop).
    lmtzr = WordNetLemmatizer()

    # Lemmas of the question's verbs, ignoring stop words such as be/do/have.
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    (q_person_list, q_org_list, q_loc_list, q_month_list, q_time_list,
     q_money_list, q_percent_list, q_prof_list) = NER.named_entity_recognition(temp_q)

    # NOTE(review): the nesting below is reconstructed from the numbered rule
    # comments because the source lines are flattened: rules 2-3 depend on
    # "question has no name", rules 4-6 apply to every sentence, and the
    # ``else`` of rule 6 pairs with the profession test.  Confirm against the
    # original file.
    scores = []
    for i, full_sentence in enumerate(complete_sentence_list):
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])
        mentions_human = sent_person_list[i] != [] or sent_prof_list[i] != []

        if q_person_list == []:
            if mentions_human:
                score += 6
            for word in full_sentence.split():
                if lmtzr.lemmatize(word.lower()) == 'name':
                    score += 4

        if mentions_human:
            score += 4

        for tagged in POS_Tagging.pos_tagging(full_sentence):
            if tagged[1] in verb_tags and lmtzr.lemmatize(tagged[0], 'v') in q_verblist:
                score += 6

        if q_prof_list != []:
            for word in full_sentence.split():
                if word.lower() in q_prof_list:
                    score += 18
        elif sent_prof_list[i] != []:
            score += 6

        scores.append(score)

    # Ties are resolved in favour of the sentence that appears first.
    index = scores.index(max(scores))
    temp_str = complete_sentence_list[index]
    s_plist = sent_person_list[index]
    s_proflist = sent_prof_list[index]

    if q_person_list == [] and s_plist == []:
        # No name on either side: the sentence minus the question's words.
        return ' '.join(word for word in temp_str.split() if word not in temp_q)

    # A former trailing branch for "question has no person but has a
    # location" was removed: the case above and the person-name case below
    # already cover every question without a person, so it could never run.
    answer_list = []
    if s_plist != []:
        # Names in the sentence that are not simply repeated from the
        # question (covers "Who defeated whom?" style questions).
        answer_list = [name for name in s_plist if name not in temp_q]
    elif s_proflist != []:
        # "Who is X?": the question has the name, the sentence the profession.
        answer_list = list(s_proflist)

    if answer_list:
        return ' '.join(answer_list)

    # Last resort: noun phrases of the sentence that are not in the question,
    # or the whole sentence when there are none.
    noun_phrases = [phrase for phrase in POS_Tagging.pos_noun_tagging(temp_str)
                    if phrase not in temp_q]
    if noun_phrases:
        return ' '.join(noun_phrases)
    return temp_str
def answering_how(cleansedQuestion, stop_words_free_question, complete_sentence_list,
                  sentence_list, dateline):
    """Answer a HOW (many / much / often) question.

    Scoring of each sentence:

    1. ``WM.stemWordMatch(cleansedQuestion, sentence_list[i])``;
    2. question contains ``many``: +6 for every word of the full sentence
       that starts with a number;
    3. question contains ``much`` and a money word (money, earn, salary,
       profit, loss): +20 for every sentence word in the quantity
       vocabulary; question contains ``much`` only: +6 for every word that
       starts with a number or is in that vocabulary.

    When exactly one sentence has the best score:

    * question contains ``often``: the frequency words of that sentence;
    * question contains ``many``: the numeric words of that sentence;
    * otherwise the sentence itself.

    When several sentences tie, the first of them is returned.  The answer
    is echoed to stdout as ``Answer:  <text>`` followed by a blank line.

    Args:
        cleansedQuestion: the question as a string.
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: full sentence strings; answers are taken
            from these.
        sentence_list: sentence strings used for the word-match score,
            parallel to ``complete_sentence_list``.
        dateline: unused; kept for interface compatibility.

    Returns:
        A list of words for the ``often`` / ``many`` cases (possibly empty),
        otherwise the chosen sentence as a string; ``''`` when there are no
        sentences.
    """
    # Nothing to score: avoid max() raising on an empty sequence below.
    if not complete_sentence_list or not sentence_list:
        return ''

    much_list = [
        'thousand', 'hundred', 'dollars', 'cents', 'million', 'billion',
        'none', 'nothing', 'everything', 'few', 'something', 'salary', 'pay',
        'income', 'loss', 'profit', 'one', 'two', 'three', 'four', 'five',
        'six', 'seven', 'eight', 'nine', 'ten',
    ]
    # 'millennium' and 'day' used to be fused into 'millenniumday' by a
    # missing comma, so neither word could ever match.
    how_often = [
        'daily', 'weekly', 'bi-weekly', 'fortnightly', 'monthly',
        'bi-monthly', 'quarterly', 'half-yearly', 'yearly', 'decade',
        'millennium', 'day', 'everyday', 'night', 'afternoon', 'noon',
    ]
    money_words = ('money', 'earn', 'salary', 'profit', 'loss')
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

    def _report(text):
        # One parenthesised argument prints the same text whether ``print``
        # is the Python 2 statement or the Python 3 function.  The explicit
        # ' ' reproduces the separator of ``print 'Answer: ', text``.
        print('Answer: ' + ' ' + text + '\n')

    # Question-only facts, computed once instead of once per sentence.
    question_words = cleansedQuestion.split()
    asks_many = 'many' in question_words
    asks_much = 'much' in question_words
    # The whole question is inspected: the old scan stopped at 'much', so a
    # money word following it ("How much money ...") was never noticed.
    asks_money = any(word in money_words for word in question_words)

    scores = []
    for full_sentence, reduced_sentence in zip(complete_sentence_list, sentence_list):
        score = WM.stemWordMatch(cleansedQuestion, reduced_sentence)
        words = full_sentence.split()
        if asks_many:
            score += 6 * sum(1 for word in words if nums.match(word))
        if asks_much and asks_money:
            # slam-dunk
            score += 20 * sum(1 for word in words if word in much_list)
        elif asks_much:
            score += 6 * sum(
                1 for word in words if nums.match(word) or word in much_list)
        scores.append(score)

    best_score = max(scores)
    best_sentences = [
        full_sentence
        for full_sentence, score in zip(complete_sentence_list, scores)
        if score == best_score
    ]

    if len(best_sentences) == 1:
        only = best_sentences[0]
        if 'often' in question_words:
            # The sentence usually carries a frequency expression: use it.
            solution = [word for word in only.split() if word in how_often]
            _report(' '.join(solution))
            return solution
        if asks_many:
            solution = [word for word in only.split() if nums.match(word)]
            _report(' '.join(solution))
            return solution
        _report(only)
        return only

    # Tie: the sentence that appears first in the story wins.
    result = best_sentences[0]
    _report(result)
    return result
def answering_why(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,dateline):
    """Pick the sentence most likely to answer a "why" question.

    NOTE(review): a second ``answering_why`` (four parameters) is defined
    later in this file; if both live at module level the later definition
    rebinds the name and this variant is never the one that gets called.

    Scoring happens in two passes:

    1. Every sentence gets the word-match score returned by
       ``WM.stemWordMatch(cleansedQuestion, sentence_list[i])``; the
       sentences sharing the highest score form the "best" set.
    2. Scores are recomputed from scratch using positional and lexical clues:
       +3 for being in the best set, +3 for immediately preceding a best
       sentence, +4 for immediately following one, and +4 for every token
       equal to "want", "so" or "because" (case-insensitive).

    Parameters:
        cleansedQuestion: question text handed to the word matcher.
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: original sentences; the answer is one of these.
        sentence_list: per-sentence text handed to the word matcher, indexed
            in parallel with ``complete_sentence_list``.
        dateline: unused; kept for interface compatibility.

    Returns:
        The highest-scoring sentence (the last one when several tie), or an
        empty string when ``complete_sentence_list`` is empty.  The answer is
        also printed as ``Answer:  <sentence>`` followed by a blank line.
    """
    # max() raises ValueError on an empty sequence, so answer "nothing"
    # explicitly instead of crashing when the story has no sentences.
    if not complete_sentence_list:
        print(' '.join(('Answer: ', '\n')))
        return ''

    # Pass 1: plain word-match score, used only to find the "best" sentences.
    wm_scores = [WM.stemWordMatch(cleansedQuestion, sentence_list[i])
                 for i in range(len(complete_sentence_list))]
    top_wm_score = max(wm_scores)
    best_sent_index = [i for i, value in enumerate(wm_scores)
                       if value == top_wm_score]

    # Pass 2: rescore every sentence from zero using the clue rules.
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        score = 0

        # 1. Member of the best set.  Compare by index: a substring test
        #    would also reward short sentences that merely occur inside a
        #    best sentence, and would count duplicates several times.
        if i in best_sent_index:
            score = score + 3

        for k in best_sent_index:
            # 2. Immediately precedes a best sentence.
            if i == k - 1:
                score = score + 3
            # 3. Immediately follows a best sentence.
            elif i == k + 1:
                score = score + 4

        # 4./5. Intent and causal markers, counted once per occurrence.
        for word in sentence.split():
            if word.lower() in ('want', 'so', 'because'):
                score = score + 4

        sent_score_list.append(score)

    max_score_value = max(sent_score_list)
    final_sent_list = [sentence
                       for sentence, value in zip(complete_sentence_list,
                                                  sent_score_list)
                       if value == max_score_value]

    # In case of a tie the sentence that comes last wins; with a single
    # candidate the last element is that candidate.
    result = final_sent_list[-1]

    # Single-argument print behaves the same as a statement (Python 2) and as
    # a function (Python 3); the join reproduces the separator that
    # ``print 'Answer: ', text`` inserts.
    print(' '.join(('Answer: ', result + '\n')))
    return result
def answering_how(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, sent_time_list,
                  sent_percent_list):
    """Answer a "how many / how much / how often / how long" question.

    Each sentence is scored with
    ``WM.stemWordMatch(cleansedQuestion, sentence_list[i])`` plus bonuses that
    depend on the words of the question (after '?' removal and the
    "Mt." -> "Mount" expansion):

    * "many":  +6 for every sentence token that starts with a number or is a
      spelled-out quantity word.
    * "much":  +6 for every sentence token that is a money word or appears in
      the quantity/measurement list.
    * "often" / "long": +10 once if the sentence contains a frequency word.

    The first top-scoring sentence is then trimmed to the most specific
    answer available: a "<x> per cent" phrase, the numeric/quantity tokens
    for many/much questions, or otherwise the sentence with the question's
    own words removed.

    Parameters:
        cleansedQuestion: question text.
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: original sentences the answer is drawn from.
        sentence_list: per-sentence text handed to the word matcher, indexed
            in parallel with ``complete_sentence_list``.
        sent_time_list: unused; kept for interface compatibility.
        sent_percent_list: unused; kept for interface compatibility.

    Returns:
        The answer string, or an empty string when there are no sentences.
    """
    # max() raises ValueError on an empty sequence; report "no answer".
    if not complete_sentence_list:
        return ''

    much_list = [
        'thousand', 'thousands', 'hundred', 'hundreds', 'dollars', 'cents',
        'million', 'billion', 'trillion', 'none', 'nothing', 'everything',
        'few', 'something', 'dollars', 'grams', 'kilos', 'kilogram',
        'kilograms', 'milligrams', 'mg', 'metre', 'centimetre', 'inches',
        'feet', 'foot', 'ft', 'cent', 'percent', 'salary', 'pay', 'income',
        'loss', 'profit', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety', 'hour', 'hours', 'minutes',
        'seconds', 'second', 'minute', 'half', 'quarter', 'more', 'less',
        'than'
    ]
    many_list = [
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
        'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million',
        'billion', 'trillion'
    ]
    # 'millennium' and 'day' are separate entries; without the comma Python
    # concatenates the adjacent literals into the single word 'millenniumday'.
    how_often = [
        'daily', 'weekly', 'bi-weekly', 'fortnightly', 'monthly',
        'bi-monthly', 'quarterly', 'half-yearly', 'yearly', 'decade',
        'millennium', 'day', 'everyday', 'night', 'afternoon', 'noon',
        'hourly', 'hours', 'minutes', 'seconds', 'second', 'minute'
    ]
    money_words = ['money', 'earn', 'salary', 'profit', 'loss']
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")
    abbreviation, expansion = 'Mt.', 'Mount'

    ########################### QUESTION PROCESSING ##################
    temp_q = cleansedQuestion.replace('?', '')
    # Expand the abbreviation only when it occurs as a whole token.  A
    # substring test against 'Mt.' would also fire for tokens such as 'M' or
    # '.' and rewrite unrelated characters of the question.
    if abbreviation in temp_q.split():
        temp_q = temp_q.replace(abbreviation, expansion)
    question_words = temp_q.split()
    # The question's verbs are deliberately not extracted here: none of the
    # scoring rules below consult them, so POS-tagging would be wasted work.

    ################## SENTENCE PROCESSING AND SCORING ###################
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        # 1. Word-match score.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])
        tokens = sentence.split()

        for q_word in question_words:
            q_lower = q_word.lower()
            if q_lower == 'many':
                # 2. Sentence contains an expression of number.
                for m in tokens:
                    if nums.match(m) or m in many_list:
                        score = score + 6
            elif q_lower == 'much':
                # 3. Sentence contains an expression of money or measure.
                for m in tokens:
                    if m.lower() in money_words or m in much_list:
                        score = score + 6
            elif q_lower == 'often' or q_lower == 'long':
                # 4. Sentence contains an expression of time; counted once.
                if any(m in how_often for m in tokens):
                    score = score + 10

        sent_score_list.append(score)

    max_score_value = max(sent_score_list)
    # The sentence which comes first wins when there are several candidates.
    chosen = complete_sentence_list[sent_score_list.index(max_score_value)]

    # Drop the full stop when the sentence's only period is its last
    # character.  find() returns -1 for a sentence without any period, where
    # index() would raise ValueError.
    if chosen.find('.') == len(chosen) - 1:
        chosen = chosen[:-1]
    temp2 = chosen.split()

    # A "<x> per cent" phrase is most probably the answer.  "per" needs a
    # token on both sides, so the first and last positions are skipped; this
    # also keeps temp2[k + 1] and temp2[k - 1] inside the list.
    for k in range(1, len(temp2) - 1):
        if temp2[k].lower() == 'per' and temp2[k + 1].lower() == 'cent':
            return ' '.join(temp2[k - 1:k + 2])

    if 'many' in question_words:
        temp_solution = [m for m in temp2
                         if nums.match(m) or m in many_list]
        return ' '.join(temp_solution if temp_solution != [] else temp2)
    elif 'much' in question_words:
        temp_solution = [m for m in temp2
                         if nums.match(m) or m in much_list]
        return ' '.join(temp_solution if temp_solution != [] else temp2)

    # Otherwise answer with whatever the sentence adds to the question.
    return ' '.join(k for k in temp2 if k not in question_words)
def answering_what(cleansedQuestion, stop_words_free_question,
                   complete_sentence_list, sentence_list, dateline):
    """Pick the sentence most likely to answer a "what" question.

    Every sentence starts from the word-match score returned by
    ``WM.stemWordMatch(cleansedQuestion, sentence_list[i])``.  Then, for each
    token of ``stop_words_free_question`` (tokenised with
    ``nltk.word_tokenize``), the following clues add to the score:

    * token is a month name: +4 per sentence token that is a date expression;
    * token is "kind": +6 per sentence token whose stem is "call" or "from";
    * token is "name": +20 per token of the complete sentence whose stem is
      "name", "call" or "known";
    * token is "name" followed by "of"/"for": +20 if the named-entity tagger
      reports at least one person in the sentence;
    * token is "country", "countries" or "olympics": +6 per location the
      named-entity tagger reports in the sentence.

    Parameters:
        cleansedQuestion: question text handed to the word matcher.
        stop_words_free_question: question text that is tokenised for the
            clue rules.
        complete_sentence_list: original sentences; the answer is one of these.
        sentence_list: per-sentence text, indexed in parallel with
            ``complete_sentence_list``.
        dateline: unused; kept for interface compatibility.

    Returns:
        The first highest-scoring sentence, or an empty string when
        ``complete_sentence_list`` is empty.  The answer is also printed as
        ``Answer:  <sentence>`` followed by a blank line.
    """
    # max() raises ValueError on an empty sequence; report "no answer".
    if not complete_sentence_list:
        print(' '.join(('Answer: ', '\n')))
        return ''

    what_month = [
        'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
        'may', 'may', 'june', 'jun', 'july', 'jul', 'august', 'aug',
        'september', 'sep', 'october', 'oct', 'november', 'nov', 'december',
        'dec'
    ]
    # Multi-word entries can never equal a single whitespace-separated token;
    # they are kept so the list stays the same as elsewhere in the file.
    date_expression_list = [
        'yesterday', 'today', 'tomorrow', 'last week', 'this week',
        'next week', 'an hour ago', 'now', 'in an hour', 'recently', 'soon',
        'a little while ago', 'at this moment', 'in the near future',
        'a long time ago', 'these days', 'those days', 'future', 'present',
        'past', 'nowadays', 'eventually', 'morning', 'evening', 'night',
        'midnight', 'dawn', 'dusk', 'afternoon', 'noon', 'midday', 'am', 'pm',
        'sunrise', 'sunset', 'lunchtime', 'teatime', 'dinnertime', 'interval',
        'twilight', 'hourly', 'nightly', 'daily', 'monthly', 'weekly',
        'quarterly', 'yearly'
    ]

    snowball_stemmer = SnowballStemmer('english')
    # The question does not change between sentences, so tokenise it once.
    question_tokens = nltk.word_tokenize(stop_words_free_question)
    last_position = len(question_tokens) - 1

    sent_score_list = []
    for i, complete_sentence in enumerate(complete_sentence_list):
        # 1. Word-match score.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        # Named-entity result for this sentence, fetched on first use and
        # reused for later question tokens.
        # NOTE(review): assumes NET.named_entity_tagging returns the same
        # result when called again with the same sentence - confirm.
        entities = None

        for j, token in enumerate(question_tokens):
            lowered = token.lower()

            # 2. Question mentions a month and the sentence contains a date
            #    expression.
            if lowered in what_month:
                for word in sentence_list[i].split():
                    if word in date_expression_list:
                        score = score + 4

            # 3. What "kind" questions: sentences containing "call"/"from".
            if lowered == 'kind':
                for word in sentence_list[i].split():
                    if snowball_stemmer.stem(word) in ['call', 'from']:
                        score = score + 6

            # 4. Question contains "name" and the sentence contains
            #    {name, call, known}.
            if lowered == 'name':
                for word in complete_sentence.split():
                    if snowball_stemmer.stem(word) in ['name', 'call',
                                                       'known']:
                        score = score + 20

            # 5. "name of"/"name for" and the sentence mentions a person.
            if (j != last_position and token == 'name'
                    and question_tokens[j + 1] in ['of', 'for']):
                if entities is None:
                    entities = NET.named_entity_tagging(sentence_list[i])
                person_list, org_list, loc_list, time_list, prof_list = entities
                if person_list != []:
                    #TODO Check if it also contains (proper_noun,head(PP))
                    score = score + 20

            # 6. Country-type question and the sentence contains locations:
            #    confidence grows with the number of locations.
            if lowered in ['country', 'countries', 'olympics']:
                if entities is None:
                    entities = NET.named_entity_tagging(sentence_list[i])
                person_list, org_list, loc_list, time_list, prof_list = entities
                if loc_list != []:
                    score = score + 6 * len(loc_list)

        sent_score_list.append(score)

    # The first sentence reaching the maximum score is the answer, whether or
    # not other sentences tie with it.
    max_score_value = max(sent_score_list)
    result = complete_sentence_list[sent_score_list.index(max_score_value)]

    # Single-argument print behaves the same as a statement (Python 2) and as
    # a function (Python 3); the join reproduces the separator that
    # ``print 'Answer: ', text`` inserts.
    print(' '.join(('Answer: ', result + '\n')))
    return result
def answering_why(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list):
    """Pick (and trim) the sentence most likely to answer a "why" question.

    Scoring happens in two passes:

    1. Every sentence gets the word-match score returned by
       ``WM.stemWordMatch(cleansedQuestion, sentence_list[i])``; the
       sentences sharing the highest score form the "best" set.
    2. Scores are recomputed from scratch: +3 for being in the best set, +3
       for immediately preceding a best sentence, +4 for immediately
       following one, +4 for every token equal to "want", "so" or "because"
       (case-insensitive), and +6 for every verb of the sentence whose lemma
       also appears among the question's non-stop-word verbs.

    Parameters:
        cleansedQuestion: question text.
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: original sentences; the answer is drawn from
            these.
        sentence_list: per-sentence text handed to the word matcher, indexed
            in parallel with ``complete_sentence_list``.

    Returns:
        With a single top-scoring sentence: that sentence from its first
        "so", "because" or "to" token onwards, or the whole sentence if it
        contains none of them.  With several tied sentences: the last of
        them, untrimmed.  An empty string when ``complete_sentence_list`` is
        empty.
    """
    # max() raises ValueError on an empty sequence; report "no answer"
    # before any tagging work is done.
    if not complete_sentence_list:
        return ''

    stanford_stop_words_list = ['a','an','and','are','as','at','be','buy','do','for','from',
                                'has','have','he','in','is','it','its','of','on','that','the',
                                'to','was','were','will','with']
    question_verb_tags = ['VB','VBD','VBZ','VBN','VBP']
    # NOTE(review): sentence verbs are matched without 'VBP' although the
    # question side includes it - confirm whether that asymmetry is intended.
    sentence_verb_tags = ['VB','VBD','VBZ','VBN']

    # NOTE(review): double quotes are stripped and apostrophes are then
    # turned into double quotes before tagging - confirm this is intended.
    temp_q = cleansedQuestion.replace('"','').replace("'",'"').replace('?','')

    # One lemmatizer serves both the question and every sentence.
    lmtzr = WordNetLemmatizer()

    # Lemmas of the question's verbs, ignoring stop-word verbs.
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in question_verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # Pass 1: plain word-match score, used only to find the "best" sentences.
    wm_scores = [WM.stemWordMatch(cleansedQuestion, sentence_list[i])
                 for i in range(len(complete_sentence_list))]
    top_wm_score = max(wm_scores)
    best_sent_index = [i for i, value in enumerate(wm_scores)
                       if value == top_wm_score]

    # Pass 2: rescore every sentence from zero using the clue rules.
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        score = 0

        # 1. Member of the best set.
        if i in best_sent_index:
            score = score + 3

        for k in best_sent_index:
            # 2. Immediately precedes a best sentence.
            if i == k - 1:
                score = score + 3
            # 3. Immediately follows a best sentence.
            elif i == k + 1:
                score = score + 4

        # 4./5. Intent and causal markers, counted once per occurrence.
        for word in sentence.split():
            if word.lower() in ('want', 'so', 'because'):
                score = score + 4

        # 6. A verb shared between question and sentence is a confident clue.
        for tagged in POS_Tagging.pos_tagging(sentence):
            if tagged[1] in sentence_verb_tags and lmtzr.lemmatize(tagged[0], 'v') in q_verblist:
                score = score + 6

        sent_score_list.append(score)

    max_score_value = max(sent_score_list)
    final_sent_list = [sentence
                       for sentence, value in zip(complete_sentence_list,
                                                  sent_score_list)
                       if value == max_score_value]

    if len(final_sent_list) == 1:
        # The reason usually starts at the first "so", "because" or "to";
        # answer with the words from that marker onwards.
        temp = final_sent_list[0].split()
        for k, word in enumerate(temp):
            if word.lower() in ('so', 'because', 'to'):
                return ' '.join(temp[k:])
        return final_sent_list[0]

    # In case of a tie, choose the sentence that comes last.  No trimming is
    # done here because most "why" answers span the entire sentence.
    return final_sent_list[-1]
def answering_where(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list, dateline,
                    sent_loc_list):
    """Answer a "where" question from the story sentences or the dateline.

    Every sentence gets a rule-based score (stemmed word overlap with the
    question, location prepositions, presence of a tagged location, a shared
    "from", a shared main verb).  Independently, a "dateline" score is
    derived from the wording of the question.  The best sentence is used
    only when it scores strictly higher than the dateline clues; otherwise
    ``dateline`` is returned.

    Args:
        cleansedQuestion: question text; used for verb extraction and for
            the dateline clues.
        stop_words_free_question: question text handed to
            ``WM.stemWordMatch`` (presumably already stripped of stop
            words, going by the name -- confirm with the caller).
        complete_sentence_list: story sentences used for token-level clues
            and for building the answer text.
        sentence_list: sentence forms handed to ``WM.stemWordMatch``;
            indexed in parallel with ``complete_sentence_list``.
        dateline: value returned unchanged when the dateline clues win.
        sent_loc_list: per-sentence collections of location strings,
            indexed in parallel with the sentence lists (assumed to be
            lists -- confirm with the caller).

    Returns:
        A space-joined string of locations / remaining sentence words, the
        first story sentence (or its proper nouns) for "where ... this"
        questions, or ``dateline``.
    """
    # Nothing to score: the dateline is the only possible answer.
    if not sentence_list and not complete_sentence_list:
        return dateline

    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')
    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ]
    location_prepositions = [
        'above', 'across', 'after', 'against', 'along', 'among', 'around',
        'before', 'behind', 'below', 'beneath', 'beside', 'between', 'by',
        'down', 'from', 'in', 'inside', 'into', 'near', 'off', 'onto',
        'opposite', 'outside', 'over', 'surrounding', 'round', 'through',
        'towards', 'under', 'up',
    ]
    abbreviation_list = [('Mt.', 'Mount')]

    # Normalise the question: drop double quotes, turn single quotes into
    # double quotes, drop the question mark.
    temp_q = cleansedQuestion.replace('"', '').replace("'", '"').replace('?', '')

    # Expand an abbreviation only when it occurs as a whole token.  The
    # previous substring test (`token in 'Mt.'`) fired for tokens such as
    # 't' or '.', and then replaced that fragment throughout the question.
    for abbreviation, expansion in abbreviation_list:
        if abbreviation in temp_q.split():
            temp_q = temp_q.replace(abbreviation, expansion)
    question_tokens = temp_q.split()

    # Lemmatised main verbs of the question, minus stop words.
    lmtzr = WordNetLemmatizer()
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # ---------------- dateline clues (question wording only) --------------
    question_words = [word.lower() for word in cleansedQuestion.split()]
    is_where_question = 'where' in question_words
    dateline_score = 0
    use_first_sentence = False
    for pos, word in enumerate(question_words):
        # "happen" and "take place" hint that the dateline is the answer.
        if word == 'happen':
            dateline_score += 4
        if word == 'take' and question_words[pos + 1:pos + 2] == ['place']:
            dateline_score += 4
        # "this": for a where-question answer from the first sentence,
        # otherwise it is a very strong dateline clue.
        if word == 'this':
            if is_where_question:
                use_first_sentence = True
            else:
                dateline_score += 20
        if word == 'story' and not is_where_question:
            dateline_score += 20

    # The first-sentence shortcut does not depend on the sentence scores,
    # so take it before doing any per-sentence tagging.
    if use_first_sentence and complete_sentence_list:
        proper_nouns = POS_Tagging.pos_NNP_tagging(complete_sentence_list[0])
        if proper_nouns:
            return ' '.join(noun for noun in proper_nouns
                            if noun not in question_tokens)
        return complete_sentence_list[0]

    # ---------------- per-sentence scores ---------------------------------
    question_has_from = any(tok.lower() == 'from' for tok in question_tokens)
    sent_score_list = []
    for i in range(len(sentence_list)):
        sentence_tokens = complete_sentence_list[i].split()

        # 1. Stemmed word overlap with the question.
        score = WM.stemWordMatch(stop_words_free_question, sentence_list[i])

        # 2. Every location preposition is a good clue (+4 each).
        for token in sentence_tokens:
            if token in location_prepositions:
                score += 4

        # 3. The sentence carries a tagged location.
        if sent_loc_list[i]:
            score += 6

        # 4. "from" appears in both the question and the sentence.
        if question_has_from and 'from' in sentence_tokens:
            score += 6

        # 5. Each verb shared with the question (+6 per occurrence).
        for tagged in POS_Tagging.pos_tagging(complete_sentence_list[i]):
            if (tagged[1] in verb_tags
                    and lmtzr.lemmatize(tagged[0], 'v') in q_verblist):
                score += 6

        sent_score_list.append(score)

    if not sent_score_list:
        return dateline

    max_score_value = max(sent_score_list)
    if max_score_value <= dateline_score:
        # The dateline clues are at least as strong as the best sentence.
        return dateline

    # On a tie the sentence that appears first in the story wins.
    index = sent_score_list.index(max_score_value)
    temp_str = complete_sentence_list[index]

    # Prefer tagged locations that the question did not already mention
    # (the question may contain a location the tagger missed).
    answer_list = [loc for loc in sent_loc_list[index]
                   if loc not in question_tokens]
    if answer_list:
        return ' '.join(answer_list)

    # Otherwise answer with the sentence minus the words of the question.
    return ' '.join(word for word in temp_str.split()
                    if word not in question_tokens)
def answering_why(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list):
    """Answer a "why" question by picking the best-scoring story sentence.

    Scoring is two-pass.  First the sentences with the highest stemmed word
    overlap with the question are collected as the "best" set.  Then every
    sentence is re-scored from scratch: membership in the best set,
    adjacency to a best sentence, the words "want" / "so" / "because", and
    verbs shared with the question.

    Args:
        cleansedQuestion: question text; used for word matching and verb
            extraction.
        stop_words_free_question: accepted for signature compatibility;
            not read by this function.
        complete_sentence_list: story sentences.  They are no longer
            modified in place: punctuation is stripped on a private copy.
        sentence_list: sentence forms handed to ``WM.stemWordMatch``;
            indexed in parallel with ``complete_sentence_list``.

    Returns:
        If exactly one sentence has the top score: the part of it starting
        at the first "so", "because" or "to", or the whole sentence when
        none of those words occurs.  On a tie: the last tied sentence.
        An empty string when there are no sentences.
    """
    if not complete_sentence_list:
        return ''

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for',
        'from', 'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
        'that', 'the', 'to', 'was', 'were', 'will', 'with',
    ]

    # Normalise the question: drop double quotes, turn single quotes into
    # double quotes, drop the question mark.
    temp_q = cleansedQuestion.replace('"', '').replace("'", '"').replace('?', '')

    # Lemmatised verbs of the question, minus stop words.
    lmtzr = WordNetLemmatizer()
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in ('VB', 'VBD', 'VBZ', 'VBN', 'VBP'):
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # Work on a copy: the original wrote the stripped text back into the
    # caller's list, silently altering the story for every later question.
    cleaned_sentences = [
        sentence.replace('.', '').replace(',', '').replace('!', '')
        for sentence in complete_sentence_list
    ]

    # Pass 1: word-match score decides the "best" sentences.
    word_match_scores = [
        WM.stemWordMatch(cleansedQuestion, sentence_list[i])
        for i in range(len(cleaned_sentences))
    ]
    best_word_match = max(word_match_scores)
    best_sent_index = [i for i, value in enumerate(word_match_scores)
                       if value == best_word_match]

    # Pass 2: scores start again from zero.
    sent_score_list = []
    for i, sentence in enumerate(cleaned_sentences):
        score = 0

        # 1. The sentence itself is one of the best matches.
        if i in best_sent_index:
            score += 3

        # 2./3. It immediately precedes (+3) or follows (+4) a best match.
        for k in best_sent_index:
            if i == k - 1:
                score += 3
            elif i == k + 1:
                score += 4

        # 4./5. Every "want", "so" or "because" is a good clue.
        for word in sentence.split():
            if word.lower() in ('want', 'so', 'because'):
                score += 4

        # 6. Every verb shared with the question is a confident clue.
        for tagged in POS_Tagging.pos_tagging(sentence):
            if (tagged[1] in ('VB', 'VBD', 'VBZ', 'VBN')
                    and lmtzr.lemmatize(tagged[0], 'v') in q_verblist):
                score += 6

        sent_score_list.append(score)

    max_score_value = max(sent_score_list)
    final_sent_list = [cleaned_sentences[i]
                       for i, value in enumerate(sent_score_list)
                       if value == max_score_value]

    if len(final_sent_list) == 1:
        # The reason usually follows "so", "because" or "to": answer with
        # everything from the first of those words onwards.
        words = final_sent_list[0].split()
        for pos, word in enumerate(words):
            if word.lower() in ('so', 'because', 'to'):
                return ' '.join(words[pos:])
        return final_sent_list[0]

    # On a tie choose the sentence that comes last in the story.
    return final_sent_list[-1]
def answering_where(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list, dateline,
                    sent_loc_list):
    """Answer a "where" question from the story sentences or the dateline.

    Sentences are scored with rule-based clues: stemmed word overlap with
    the question, location prepositions, a tagged location, a shared
    "from" and shared verbs.  The wording of the question yields a separate
    "dateline" score.  A sentence-based answer is produced only when the
    best sentence scores strictly higher than the dateline clues; otherwise
    ``dateline`` is returned.

    Args:
        cleansedQuestion: question text; used for verb extraction and for
            the dateline clues.
        stop_words_free_question: question text handed to
            ``WM.stemWordMatch`` (presumably without stop words, going by
            the name -- confirm with the caller).
        complete_sentence_list: story sentences used for token-level clues
            and for building the answer text.
        sentence_list: sentence forms handed to ``WM.stemWordMatch``;
            indexed in parallel with ``complete_sentence_list``.
        dateline: value returned unchanged when the dateline clues win.
        sent_loc_list: per-sentence collections of location strings,
            indexed in parallel with the sentence lists (assumed to be
            lists -- confirm with the caller).

    Returns:
        A space-joined string of locations or of the remaining words of
        the chosen sentence (quotes, commas, '?' and '!' removed), the
        first story sentence (or its proper nouns) for "where ... this"
        questions, or ``dateline``.
    """
    # With no sentences at all only the dateline can be returned.
    if not sentence_list and not complete_sentence_list:
        return dateline

    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')
    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ]
    location_prepositions = [
        'above', 'across', 'after', 'against', 'along', 'among', 'around',
        'before', 'behind', 'below', 'beneath', 'beside', 'between', 'by',
        'down', 'from', 'in', 'inside', 'into', 'near', 'off', 'onto',
        'opposite', 'outside', 'over', 'surrounding', 'round', 'through',
        'towards', 'under', 'up',
    ]
    abbreviation_list = [('Mt.', 'Mount')]

    # Question clean-up: no double quotes, single quotes become double
    # quotes, no question mark.
    temp_q = cleansedQuestion.replace('"', '')
    temp_q = temp_q.replace("'", '"')
    temp_q = temp_q.replace('?', '')

    # Abbreviations are expanded only on an exact token match.  Testing
    # `token in 'Mt.'` (a substring check) used to match stray tokens like
    # 't' or '.', which were then replaced everywhere in the question.
    for short_form, long_form in abbreviation_list:
        if short_form in temp_q.split():
            temp_q = temp_q.replace(short_form, long_form)
    question_tokens = temp_q.split()

    # Verbs of the question (lemmatised, stop words excluded).
    lmtzr = WordNetLemmatizer()
    q_verblist = []
    for tagged_word in POS_Tagging.pos_tagging(temp_q):
        if tagged_word[1] not in verb_tags:
            continue
        verb = lmtzr.lemmatize(tagged_word[0], 'v')
        if verb not in stanford_stop_words_list:
            q_verblist.append(verb)

    # ---------------- dateline clues (question wording only) --------------
    lowered_question = [word.lower() for word in cleansedQuestion.split()]
    asks_where = 'where' in lowered_question
    dateline_score = 0
    first_sentence_flag = False
    last_position = len(lowered_question) - 1
    for position, word in enumerate(lowered_question):
        # 1. "happen" suggests the dateline.
        if word == 'happen':
            dateline_score += 4
        # 2. So does "take place".
        if (position != last_position and word == 'take'
                and lowered_question[position + 1] == 'place'):
            dateline_score += 4
        # 3. "this": strong dateline clue, except for where-questions,
        #    which are answered from the first sentence instead.
        if word == 'this':
            if asks_where:
                first_sentence_flag = True
            else:
                dateline_score += 20
        # 4. "story" is an equally strong clue outside where-questions.
        if word == 'story' and not asks_where:
            dateline_score += 20

    # Sentence scores are irrelevant for the first-sentence shortcut, so
    # return before tagging every sentence.
    if first_sentence_flag and complete_sentence_list:
        opening_sentence = complete_sentence_list[0]
        pos_np_list = POS_Tagging.pos_NNP_tagging(opening_sentence)
        if not pos_np_list:
            return opening_sentence
        return ' '.join(name for name in pos_np_list
                        if name not in question_tokens)

    # ---------------- per-sentence scores ---------------------------------
    from_in_question = any(tok.lower() == 'from' for tok in question_tokens)
    sent_score_list = []
    for i in range(len(sentence_list)):
        full_sentence = complete_sentence_list[i]
        tokens = full_sentence.split()

        # 1. Stemmed word overlap with the question.
        score = WM.stemWordMatch(stop_words_free_question, sentence_list[i])
        # 2. +4 for each location preposition.
        score += 4 * len([tok for tok in tokens
                          if tok in location_prepositions])
        # 3. +6 when the sentence has a tagged location.
        if sent_loc_list[i]:
            score += 6
        # 4. +6 when "from" is in both question and sentence.
        if from_in_question and 'from' in tokens:
            score += 6
        # 5. +6 for each verb shared with the question.
        for tagged_word in POS_Tagging.pos_tagging(full_sentence):
            if (tagged_word[1] in verb_tags
                    and lmtzr.lemmatize(tagged_word[0], 'v') in q_verblist):
                score += 6

        sent_score_list.append(score)

    if not sent_score_list:
        return dateline

    max_score_value = max(sent_score_list)
    if not max_score_value > dateline_score:
        # Dateline clues match or beat the best sentence.
        return dateline

    # Ties are resolved in favour of the earliest sentence in the story.
    index = sent_score_list.index(max_score_value)

    # Tagged locations not already named in the question come first (the
    # question may hold a location the tagger failed to recognise).
    answer_list = [place for place in sent_loc_list[index]
                   if place not in question_tokens]
    if answer_list:
        return ' '.join(answer_list)

    # Fallback: the chosen sentence, stripped of quotes and punctuation,
    # minus every word that already occurs in the question.
    temp_str = complete_sentence_list[index].replace('"', '')
    temp_str = temp_str.replace(',', '').replace('?', '').replace('!', '')
    return ' '.join(word for word in temp_str.split()
                    if word not in question_tokens)
def answering_who(cleansedQuestion, stop_words_free_question, sentence_list):
    """Answer a "who" question with the person names of the best sentence.

    Each sentence is scored from its stemmed word overlap with the question
    plus named-entity clues, then the top-scoring sentence (the earliest
    one on a tie) supplies the answer.  The answer is also written to
    standard output as an ``Answer:`` line, as before.

    Args:
        cleansedQuestion: question text; used for word matching and for
            detecting person names in the question.
        stop_words_free_question: accepted for signature compatibility;
            no longer read (it only fed branches that all produced the
            same result).
        sentence_list: story sentences to score and answer from.

    Returns:
        The space-joined person names found in the chosen sentence.  When
        several sentences tie and the first of them contains no person
        name, that sentence itself.  An empty string when
        ``sentence_list`` is empty.
    """
    import sys

    def _emit(answer):
        # Same bytes as the Python 2 statement `print 'Answer: ', answer + '\n'`
        # (the print separator adds the second space, the statement the
        # second newline), but valid under Python 3 as well.
        sys.stdout.write('Answer:  ' + answer + '\n\n')
        return answer

    if not sentence_list:
        return _emit('')

    snowball_stemmer = SnowballStemmer('english')

    # The question does not change between sentences: tag it once.
    q_person_list, _q_orgs, _q_locs, _q_times, _q_profs = \
        NET.named_entity_tagging(cleansedQuestion)

    # One entry per sentence, always the plain list of person names.  The
    # original appended `(persons, index)` tuples in one branch and bare
    # lists in the other, so the tie-break below sometimes indexed into a
    # name string and joined its individual characters.
    master_person_list = []
    sent_score_list = []
    for sentence in sentence_list:
        # 1. Word match with the question.
        score = WM.stemWordMatch(cleansedQuestion, sentence)

        sent_plist, _orgs, _locs, _times, _profs = \
            NET.named_entity_tagging(sentence)
        master_person_list.append(sent_plist)

        if not q_person_list:
            # 2. The question names nobody: every person in the sentence
            #    is a confident clue.
            score += 6 * len(sent_plist)
            # 3. ...and every word stemming to "name" is a good clue.
            for word in sentence.split():
                if snowball_stemmer.stem(word.lower()) == 'name':
                    score += 4
        elif sent_plist == q_person_list:
            # 4. The question names someone and the sentence names exactly
            #    the same people.
            score += 4 * len(sent_plist)
        elif sent_plist != [] or "name" in sentence:
            # Any other person, or the text "name", is a weaker clue.
            score += 4

        sent_score_list.append(score)

    max_score_value = max(sent_score_list)
    # Earliest sentence with the top score.
    first_best = sent_score_list.index(max_score_value)
    names = ' '.join(master_person_list[first_best])

    if sent_score_list.count(max_score_value) == 1:
        # NOTE(review): the original tested the question's person and
        # profession entities here in three branches, but the first two
        # were identical and the third ("return the whole sentence") could
        # never be reached.  The effective behaviour is kept: answer with
        # the person names of the sentence.
        return _emit(names)

    # Several candidates: use the first one, falling back to the sentence
    # itself when it contains no person name.
    if names:
        return _emit(names)
    return _emit(sentence_list[first_best])

    # A disabled, unfinished draft followed here in the original; it was
    # never executed:
    # q_plist,q_olist,q_llist,q_tlist,q_proflist=NET.named_entity_tagging(stop_words_free_question)
def answering_why(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, dateline):
    """Answer a "why" question with the best-scoring story sentence.

    Scoring is two-pass.  The sentences with the highest stemmed word
    overlap with the question form the "best" set; afterwards each sentence
    is re-scored from zero on membership in that set, adjacency to a best
    sentence and the words "want" / "so" / "because".  The chosen sentence
    is written to standard output as an ``Answer:`` line and returned.

    Args:
        cleansedQuestion: question text handed to ``WM.stemWordMatch``.
        stop_words_free_question: accepted for signature compatibility;
            not read by this function.
        complete_sentence_list: story sentences to score and answer from.
        sentence_list: sentence forms handed to ``WM.stemWordMatch``;
            indexed in parallel with ``complete_sentence_list``.
        dateline: accepted for signature compatibility; not read by this
            function.

    Returns:
        The top-scoring sentence; on a tie, the one that comes last in the
        story.  An empty string when there are no sentences.
    """
    import sys

    def _emit(answer):
        # Same bytes as the Python 2 statement `print 'Answer: ', answer + '\n'`
        # (the print separator adds the second space, the statement the
        # second newline), but valid under Python 3 as well.
        sys.stdout.write('Answer:  ' + answer + '\n\n')
        return answer

    if not complete_sentence_list:
        return _emit('')

    # Pass 1: word-match score decides the "best" sentences.
    word_match_scores = [
        WM.stemWordMatch(cleansedQuestion, sentence_list[i])
        for i in range(len(complete_sentence_list))
    ]
    best_word_match = max(word_match_scores)
    best_sent_index = [i for i, value in enumerate(word_match_scores)
                       if value == best_word_match]

    # Pass 2: scores start again from zero.
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        score = 0

        # 1. The sentence is one of the best matches.  Membership is
        #    decided by index: the earlier substring test
        #    (`sentence in best_sentence_text`) also rewarded short or
        #    empty sentences that merely occur inside a best sentence, and
        #    counted a sentence once per best sentence containing it.
        if i in best_sent_index:
            score += 3

        # 2./3. It immediately precedes (+3) or follows (+4) a best match.
        for k in best_sent_index:
            if i == k - 1:
                score += 3
            elif i == k + 1:
                score += 4

        # 4./5. Every "want", "so" or "because" is a good clue.
        for word in sentence.split():
            if word.lower() in ('want', 'so', 'because'):
                score += 4

        sent_score_list.append(score)

    max_score_value = max(sent_score_list)
    final_sent_list = [complete_sentence_list[i]
                       for i, value in enumerate(sent_score_list)
                       if value == max_score_value]

    # A single winner is returned as is; on a tie the sentence that comes
    # last in the story is chosen.  Both cases are the last list element.
    return _emit(final_sent_list[-1])
def _extract_marked_text(sentence, opener, closer):
    """Return the stripped text that follows the first ``opener`` in ``sentence``.

    The text runs up to the next ``closer`` found after the opener. When no
    closer follows, the rest of the sentence is used if opener and closer are
    the same marker (a single "--" introducing an aside); otherwise the aside
    is malformed and ``None`` is returned. ``None`` is also returned when the
    opener does not occur at all.
    """
    start = sentence.find(opener)
    if start == -1:
        return None
    start += len(opener)
    end = sentence.find(closer, start)
    if end == -1:
        if opener != closer:
            return None
        end = len(sentence)
    return sentence[start:end].strip()


def answering_what(cleansedQuestion, stop_words_free_question, complete_sentence_list,
                   sentence_list, sent_time_list, sent_person_list):
    """Pick an answer to a "what" question from the candidate sentences.

    Definition shortcut: when the question has at most six words, the first
    sentence that contains a "(...)", "--...--" or "{...}" aside and shares at
    least one word with the question wins immediately, and the text of that
    aside is returned.

    Otherwise every sentence is scored; the clues add up:
      1. WM.stemWordMatch(cleansedQuestion, sentence_list[i]).
      2. +4 for each question word that is a month name, if
         sent_time_list[i] is not empty.
      3. For each "kind" in the question: +6 per sentence word whose verb
         lemma is "call" or "from".
      4. For each "name" in the question: +20 per sentence word whose verb
         lemma is "name", "call" or "known".
      5. +6 per sentence verb whose lemma is also a (non stop-word) verb of
         the question.

    The first sentence with the highest score is the answer sentence. Its
    words that also occur in the question are dropped. If it is the only
    sentence with the highest score and contains "per cent", the phrase
    "<previous word> per cent" is returned instead.

    Args:
        cleansedQuestion: the question text.
        stop_words_free_question: unused; kept for interface compatibility.
        complete_sentence_list: candidate sentences (strings).
        sentence_list: per-sentence input for WM.stemWordMatch, parallel to
            complete_sentence_list.
        sent_time_list: per-sentence collection, indexed like
            complete_sentence_list; only its emptiness is checked.
        sent_person_list: unused; kept for interface compatibility.

    Returns:
        The answer string; '' when there are no candidate sentences.
    """
    if not complete_sentence_list:
        # Nothing to score; max() further down would raise on an empty list.
        return ''

    stanford_stop_words = frozenset([
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
        'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ])
    month_names = frozenset([
        'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
        'may', 'june', 'jun', 'july', 'jul', 'august', 'aug', 'september',
        'sep', 'october', 'oct', 'november', 'nov', 'december', 'dec',
    ])
    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')
    abbreviation_list = [('Mt.', 'Mount')]
    # (opener, closer) pairs for inline definitions, tried in this order.
    definition_markers = (('(', ')'), ('--', '--'), ('{', '}'))

    # ---------------- Question processing ----------------
    temp_q = cleansedQuestion.replace('"', '').replace('?', '')
    for abbreviation, expansion in abbreviation_list:
        # Match whole tokens only (with or without the trailing period). A
        # substring test against 'Mt.' would also fire on 't' or '.', and a
        # global replace of such a token would mangle the whole question.
        variants = (abbreviation, abbreviation.rstrip('.'))
        words = temp_q.split()
        if any(word in variants for word in words):
            temp_q = ' '.join([expansion if word in variants else word
                               for word in words])
    question_tokens = temp_q.split()
    question_vocab = frozenset(question_tokens)

    # ---------------- Definition shortcut ----------------
    if len(question_tokens) <= 6:
        for sentence in complete_sentence_list:
            marker = None
            for opener, closer in definition_markers:
                if opener in sentence:
                    # Only the first marker kind present is considered.
                    marker = (opener, closer)
                    break
            if marker is None:
                continue
            sentence_tokens = sentence.split()
            if not any(word in sentence_tokens for word in question_tokens):
                continue
            definition = _extract_marked_text(sentence, marker[0], marker[1])
            if definition is not None:
                return definition
            # Malformed aside (no closing marker): fall through to scoring.

    # ---------------- Question verbs ----------------
    lmtzr = WordNetLemmatizer()
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words:
                q_verblist.append(lemma)

    # ---------------- Sentence scoring ----------------
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        sentence_tokens = sentence.split()

        # 1. Word match between question and sentence.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        for q_word in question_tokens:
            lowered = q_word.lower()
            if lowered in month_names:
                # 2. Question names a month and the sentence has a time expression.
                if sent_time_list[i] != []:
                    score += 4
            elif lowered == 'kind':
                # 3. "What kind" questions: sentences with "call" / "from".
                for word in sentence_tokens:
                    if lmtzr.lemmatize(word, 'v') in ('call', 'from'):
                        score += 6
            elif lowered == 'name':
                # 4. "name" questions: sentences with name / call / known.
                for word in sentence_tokens:
                    if lmtzr.lemmatize(word, 'v') in ('name', 'call', 'known'):
                        score += 20

        # 5. Reward sentence verbs that also appear in the question.
        for tagged in POS_Tagging.pos_tagging(sentence):
            if tagged[1] in verb_tags and lmtzr.lemmatize(tagged[0], 'v') in q_verblist:
                score += 6

        sent_score_list.append(score)

    # ---------------- Answer extraction ----------------
    max_score_value = max(sent_score_list)
    best_sentence = complete_sentence_list[sent_score_list.index(max_score_value)]
    best_tokens = best_sentence.split()

    if sent_score_list.count(max_score_value) == 1:
        # Unique winner: prefer an "N per cent" phrase when one is present.
        for k, word in enumerate(best_tokens):
            if (word.lower() == 'per' and k + 1 < len(best_tokens)
                    and best_tokens[k + 1].lower() == 'cent'):
                return ' '.join(best_tokens[max(k - 1, 0):k + 2])

    # On a tie the first top-scoring sentence is used. Either way, drop the
    # words that merely repeat the question.
    return ' '.join([word for word in best_tokens if word not in question_vocab])
def answering_how(cleansedQuestion, stop_words_free_question, complete_sentence_list,
                  sentence_list, sent_time_list, sent_percent_list):
    """Pick an answer to a "how" question from the candidate sentences.

    Every sentence is scored; the clues add up:
      1. WM.stemWordMatch(stop_words_free_question, sentence_list[i]).
      2. For each "many" in the question: +6 per sentence word that starts
         with a number or is a number word.
      3. For each "much" in the question: +6 per sentence word that is a
         money / quantity term.
      4. For each "often" in the question: +10 per sentence word that is in
         sent_time_list or is a frequency term.

    The first sentence with the highest score is the answer sentence:
      * if it is the only top-scoring sentence and the question contains
        "many", only its number-like words are returned (possibly '');
      * if it is the only top-scoring sentence otherwise, it is returned whole;
      * on a tie, the first top-scoring sentence is returned with the words
        that also occur in cleansedQuestion removed.

    Args:
        cleansedQuestion: the question text.
        stop_words_free_question: question text passed to WM.stemWordMatch.
        complete_sentence_list: candidate sentences (strings).
        sentence_list: per-sentence input for WM.stemWordMatch, parallel to
            complete_sentence_list.
        sent_time_list: time expressions checked for "often" questions.
        sent_percent_list: unused; kept for interface compatibility.

    Returns:
        The answer string; "" when there are no candidate sentences.
    """
    if not complete_sentence_list:
        # Nothing to score; max() further down would raise on an empty list.
        return ""

    # Money / quantity vocabulary for "how much" questions.
    much_terms = frozenset([
        "money", "earn", "salary", "profit", "loss",
        "thousand", "hundred", "dollars", "cents", "million", "billion",
        "trillion", "none", "nothing", "everything", "few", "something",
        "cent", "percent", "pay", "income",
        "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety",
    ])
    # Number words for "how many" questions.
    many_terms = frozenset([
        "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety", "hundred", "thousand", "million",
        "billion", "trillion",
    ])
    # Frequency vocabulary for "how often" questions. "millennium" and "day"
    # are separate entries; without the comma Python concatenates the two
    # adjacent literals into "millenniumday".
    how_often = frozenset([
        "daily", "weekly", "bi-weekly", "fortnightly", "monthly",
        "bi-monthly", "quarterly", "half-yearly", "yearly", "decade",
        "millennium", "day", "everyday", "night", "afternoon", "noon",
    ])
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")
    abbreviation_list = [("Mt.", "Mount")]

    def is_count_word(word):
        # A token counts as a number if it starts with digits or is a number word.
        return bool(nums.match(word)) or word.lower() in many_terms

    # ---------------- Question processing ----------------
    temp_q = cleansedQuestion.replace("?", "")
    for abbreviation, expansion in abbreviation_list:
        # Match whole tokens only (with or without the trailing period). A
        # substring test against "Mt." would also fire on "t" or ".", and a
        # global replace of such a token would mangle the whole question.
        variants = (abbreviation, abbreviation.rstrip("."))
        words = temp_q.split()
        if any(word in variants for word in words):
            temp_q = " ".join([expansion if word in variants else word
                               for word in words])
    question_words = [word.lower() for word in temp_q.split()]

    # ---------------- Sentence scoring ----------------
    sent_score_list = []
    for i, sentence in enumerate(complete_sentence_list):
        sentence_words = sentence.split()

        # 1. Word match between question and sentence.
        score = WM.stemWordMatch(stop_words_free_question, sentence_list[i])

        for q_word in question_words:
            if q_word == "many":
                # 2. "How many": reward expressions of number.
                score += 6 * sum(1 for w in sentence_words if is_count_word(w))
            elif q_word == "much":
                # 3. "How much": reward money / quantity expressions.
                score += 6 * sum(1 for w in sentence_words
                                 if w.lower() in much_terms)
            elif q_word == "often":
                # 4. "How often": reward time / frequency expressions.
                # NOTE(review): answering_what indexes sent_time_list per
                # sentence (sent_time_list[i]); if it is a list of lists this
                # membership test never matches -- confirm the intended shape.
                score += 10 * sum(1 for w in sentence_words
                                  if w.lower() in sent_time_list
                                  or w.lower() in how_often)

        sent_score_list.append(score)

    # ---------------- Answer extraction ----------------
    max_score_value = max(sent_score_list)
    best_sentence = complete_sentence_list[sent_score_list.index(max_score_value)]

    if sent_score_list.count(max_score_value) == 1:
        if "many" in question_words:
            # Keep only the number-like words of the winning sentence.
            return " ".join([w for w in best_sentence.split() if is_count_word(w)])
        return best_sentence

    # Tie: use the first top-scoring sentence and drop the words that merely
    # repeat the question.
    question_vocab = frozenset(cleansedQuestion.split())
    return " ".join([w for w in best_sentence.split() if w not in question_vocab])