Code example #1
def answering_which(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list):


    # Lists used in this function

    candidate_sent_list=[]
    sent_score_list=[]

    for i in range(0,len(complete_sentence_list)):
        score=0

        # 1. Find score for each sentence using word match score first

        #print 'The sentence is :',complete_sentence_list[i]
        #score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        score = score + WM.stemWordMatch(stop_words_free_question,sentence_list[i])

        sent_score_list.append(score)

    #print 'Score list is:',sent_score_list
    max_score_value=max(sent_score_list)

    # Finding the sentences which have the highest score and adding them to the best list


    final_sent_list=[]
    temp_result=[]

    for i in range(0,len(sentence_list)):
        if sent_score_list[i]==max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    #print 'Final sent list is:',final_sent_list


    if len(final_sent_list) == 1:
        temp = final_sent_list[0].split()
        for k in range(0, len(temp)):
            if temp[k].lower()=='that':
                return ' '.join(temp[k:])

        return final_sent_list[0]
    else:
        # Multiple sentences tie for the top score; take the one that appears first in the story
        result=final_sent_list[0]

        temp = result.split()
        for k in range(0, len(temp)):
            if temp[k].lower()=='that':
                return ' '.join(temp[k:])

        return result
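
For reference, the scoring above delegates to WM.stemWordMatch, which is defined elsewhere in the project and is not shown here. A minimal self-contained sketch of the same idea, using plain (unstemmed) word overlap as a stand-in for the real word-match score and keeping the max-score / "that"-clause tie-break, might look like this:

def word_overlap_score(question, sentence):
    # Stand-in for WM.stemWordMatch: count of shared lowercased tokens, no stemming
    q_tokens = set(question.lower().split())
    s_tokens = set(sentence.lower().split())
    return len(q_tokens & s_tokens)

def pick_best_sentence(question, sentences):
    scores = [word_overlap_score(question, s) for s in sentences]
    best = sentences[scores.index(max(scores))]   # first sentence with the top score
    tokens = best.split()
    for k in range(len(tokens)):
        if tokens[k].lower() == 'that':           # same "that"-clause heuristic as above
            return ' '.join(tokens[k:])
    return best

# pick_best_sentence('which dog won the race',
#                    ['The weather was sunny on Sunday',
#                     'Officials said that the small dog won the race'])
# -> 'that the small dog won the race'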
Code example #2
class FindBooks(object):
    file_patten = "pattens.txt"
    wu_pattens = []
    kmp_pattens = []
    Wu = WM.WuManber(BookNameConstraint)
    Kmp = kmp.KmpSearch(BookNameConstraint)

    def __init__(self):
        with open(self.file_patten) as pattern_file:
            all_pattens = pattern_file.readlines()
        for item in all_pattens:
            item = item.strip(" \n")
            if len(item) < 5:
                self.kmp_pattens.append(item)
            else:
                self.wu_pattens.append(item)
        self.Wu.InitPatten(self.wu_pattens)
        self.Kmp.InitPattens(self.kmp_pattens)

    def SearchBooks(self, string):
        rWu = self.Wu.Search(string)
        rKmp = self.Kmp.Search(string)
        return [rWu, rKmp]
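
WM.WuManber and kmp.KmpSearch are external matchers whose implementations are not shown, so the only visible policy is the split at length 5: short patterns go to a KMP matcher, longer ones to Wu-Manber. As a rough illustration of what the short-pattern path presumably does, a standalone Knuth-Morris-Pratt search for a single pattern can be sketched as:

def kmp_search(text, pattern):
    # Return the start index of every occurrence of pattern in text (plain KMP)
    if not pattern:
        return []
    # fail[i] = length of the longest proper prefix of pattern[:i+1] that is also its suffix
    fail = [0] * len(pattern)
    k = 0
    for i in range(1, len(pattern)):
        while k > 0 and pattern[i] != pattern[k]:
            k = fail[k - 1]
        if pattern[i] == pattern[k]:
            k += 1
        fail[i] = k
    # Scan the text, falling back through the table on mismatches
    hits = []
    k = 0
    for i in range(len(text)):
        while k > 0 and text[i] != pattern[k]:
            k = fail[k - 1]
        if text[i] == pattern[k]:
            k += 1
        if k == len(pattern):
            hits.append(i - k + 1)
            k = fail[k - 1]
    return hits

# kmp_search("abxabcabcaby", "abcaby") -> [6]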
Code example #3
def answering_when(cleansedQuestion, stop_words_free_question, sentence_list,
                   dateline):

    # Lists used in this function

    candidate_sent_list = []
    sent_score_list = {}
    final_sent_list = []

    when_year_verbs = ['play', 'fought']  #'win','lose','victorius']

    when_time_values = [
        'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
        'may', 'june', 'jun', 'july', 'jul', 'august', 'aug', 'september',
        'sep', 'october', 'oct', 'november', 'nov', 'december', 'dec'
    ] + [str(year) for year in range(1400, 2000)]

    #print 'Question is :',cleansedQuestion

    # 1. Check if the sentence contains "TIME" expression

    for i in range(0, len(sentence_list)):
        score = 0
        person_list, org_list, loc_list, time_list, prof_list = NET.named_entity_tagging(
            sentence_list[i])

        if time_list != []:  # Sentence contains a time expression

            candidate_sent_list.append(sentence_list[i])

            # Now compute the wordmatch score
            score = score + 4 + WM.stemWordMatch(cleansedQuestion,
                                                 sentence_list[i])
            #sent_score_list.append((score,i))

        # 2. Check if the question contains "the last" and the sentence contains any of
        # "first, last, since, ago", then score += slam_dunk

        q_words = cleansedQuestion.split()
        for m in range(0, len(q_words) - 1):
            if q_words[m].lower() == 'the' and q_words[m + 1].lower() == 'last':
                for sent_word in sentence_list[i].split():
                    if sent_word in ['first', 'last', 'since', 'ago']:
                        score = score + 20

        # 3. If the question contains {start, begin} and the sentence contains {start, begin, since, year}
        for word in q_words:
            if word.lower() in ['start', 'begin']:
                for sent_word in sentence_list[i].split():
                    if sent_word in ['start', 'begin', 'since', 'year']:
                        score = score + 20

        sent_score_list[i] = score

    #print 'Candidate sentences list is :',candidate_sent_list
    #print 'Sent score list is :', sent_score_list

    # For when and where questions the answer to the question could also be from the timeline of the story

    dateline_score = 0
    q_words = cleansedQuestion.split()
    for i in range(0, len(q_words)):
        # 1. If question contains "happen", it is a good clue that the dateline could be the answer
        if q_words[i].lower() == 'happen':
            dateline_score = dateline_score + 4

        # 2. If question contains "take place", it is a good clue that the dateline could be the answer
        if i != len(q_words) - 1 and q_words[i].lower() == 'take' and q_words[i + 1].lower() == 'place':
            dateline_score = dateline_score + 4

        # 3. If question contains "this", it is a slam dunk that the dateline could be the answer
        if q_words[i].lower() == 'this':
            dateline_score = dateline_score + 12

        # 4. If question contains "story", it is a slam dunk that the dateline could be the answer
        if q_words[i].lower() == 'story':
            dateline_score = dateline_score + 12

    #print 'Date line score for the question is :',dateline_score
    # Selecting the sentence that has the maximum score. If the dateline score is greater than max of sent_score choose
    # dateline_score else choose the maximum score from sent_score_list

    max_score_value = max(sent_score_list.values())

    #print 'Max value is :', max_score_value
    # Checking which of the scores is greater. IF score from sent_Score_list is greater than dateline score, then we find
    # the corresponding sentences and choose the best among them. Else we return the dateline as the result.
    if max_score_value > dateline_score:

        # Now we have to choose the best sentence among the sentences in candidate list

        # First step is to parse the stop-words free question and look for words in the question which might help us find
        #the answer

        #print 'Stopwords free question :', stop_words_free_question
        '''for i in stop_words_free_question:
            if i in when_year:
                final_sent_list.append('''

        # Giving preference to sentences which contain a year value #
        for i in sent_score_list.keys():
            '''temp=sentence_list[i].split()
            for j in range(0, len(temp)):
                if j in when_year:
                    print 'Year is true'
                    #final_sent_list.append(sentence_list[i])
                    final_sent_list.append(j)'''

            # If none of the sentences contain a year, then choose the one with maximum value
            if sent_score_list[i] == max_score_value:
                final_sent_list.append(sentence_list[i])

        #print 'Final sentence list is:',final_sent_list

        # Now from the sentences extracting out the years or the date /time values alone and representing them
        final_temp_list = []
        if len(final_sent_list) == 1:
            temp = nltk.word_tokenize(final_sent_list[0])
            for j in range(0, len(temp)):
                if temp[j].lower() in when_time_values:
                    #print 'year true'
                    final_temp_list.append(temp[j])

            if final_temp_list != []:
                result = ' '.join(final_temp_list)
                print 'Answer: ', result + '\n'
                #print '\n'
                return result
            else:
                print 'Answer: ', final_sent_list[0] + '\n'
                #print '\n'
                return final_sent_list[0]
        else:

            for i in range(0, len(final_sent_list)):
                temp = nltk.word_tokenize(final_sent_list[i])
                for j in range(0, len(temp)):
                    if temp[j].lower() in when_time_values:
                        #print 'year true'
                        final_temp_list.append(temp[j])

            if final_temp_list != []:
                result = ' '.join(final_temp_list)
                print 'Answer: ', result + '\n'
                #print '\n'
                return result
            else:
                print 'Answer: ', ' '.join(final_sent_list) + '\n'
                #print '\n'
                return ' '.join(final_sent_list)

    else:
        result = dateline
        print 'Answer: ', result + '\n'
        #print '\n'
        return result
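
The extraction step above keeps only month names and year strings from the best sentence. A self-contained sketch of that filter, assuming a plain split() in place of nltk.word_tokenize and rebuilding when_time_values the same way (month names plus the years 1400-1999):

MONTHS = ['january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
          'may', 'june', 'jun', 'july', 'jul', 'august', 'aug', 'september',
          'sep', 'october', 'oct', 'november', 'nov', 'december', 'dec']
WHEN_TIME_VALUES = set(MONTHS + [str(y) for y in range(1400, 2000)])

def extract_time_tokens(sentence):
    # Keep only the month names and year values found in the sentence
    kept = []
    for token in sentence.split():
        stripped = token.strip('.,')
        if stripped.lower() in WHEN_TIME_VALUES:
            kept.append(stripped)
    return ' '.join(kept) if kept else sentence   # fall back to the whole sentence

# extract_time_tokens('The battle was fought in June 1815.') -> 'June 1815'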
Code example #4
def answering_what(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,dateline):

    # Lists used in this function

    candidate_sent_list=[]
    sent_score_list=[]
    final_sent_list=[]
    master_loc_list=[]


    location_prepositions=['in','at','near','inside','on','behind','above','under','next to','below','between','around',
                           'outside','among','on the right', 'across','front','opposite','before','beneath','beside','against']

    what_year=[str(year) for year in range(1400, 2000)]

    what_month=['january','jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may', 'june', 'jun', 'july', 'jul','august','aug','september','sep','october','oct','november','nov','december','dec']

    date_expression_list=['yesterday','today','tomorrow','last week','this week','next week','an hour ago','now','in an hour',
          'recently','soon','a little while ago','at this moment','in the near future','a long time ago','these days',
          'those days','future','present','past','nowadays','eventually','morning', 'evening','night','midnight','dawn','dusk','afternoon','noon','midday',
          'am','pm','sunrise','sunset','lunchtime','teatime','dinnertime','interval','twilight',
          'hourly','nightly','daily','monthly','weekly','quarterly','yearly']

    #print 'Question is :',cleansedQuestion


    snowball_stemmer = SnowballStemmer('english')
    # 1. Find score for each sentence using word match score first

    for i in range(0,len(complete_sentence_list)):
        score=0

        score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])


        #2. Check if the question contains a month of the year and sentence contains date expression,then it is a clue

        temp=nltk.word_tokenize(stop_words_free_question)
        for j in range(0, len(temp)):
            if temp[j].lower() in what_month:
                temp2=sentence_list[i].split()
                for k in range(0,len(temp2)):
                    if temp2[k] in date_expression_list:
                        score=score+4

            # 3. What "kind" questions. Sentences containing "call" or "from"
            if temp[j].lower() =='kind':
                temp2=sentence_list[i].split()
                for k in range(0,len(temp2)):
                    if snowball_stemmer.stem(temp2[k]) in ['call','from']:
                        score=score+6

            # 4. If question contains "name" and the sentence contains {name,call,known}

            if temp[j].lower() =='name':
                temp2=complete_sentence_list[i].split()
                for k in range(0,len(temp2)):
                    if snowball_stemmer.stem(temp2[k]) in ['name','call','known']:
                        score=score+20

            #5. If question contains name + PP and contains(S,ProperNoun) and Head PP

            if j != len(temp) -1 and temp[j]=='name' and temp[j+1] in ['of','for']:
                 person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(sentence_list[i])
                 if person_list != []:
                     #TODO Check if it also contains (proper_noun,head(PP))
                     score=score +20

            # If the question contains "sport" related terms, answer should also have sport related terms
            '''if temp[j].lower() in ['sports','games','olympics']:
                temp2=sentence_list[i].split()
                for k in range(0,len(temp2)):
                    if snowball_stemmer.stem(temp2[k]) in ['soccer','hockey','baseball','cricket','rugby','ultimate']:
                        score=score+6'''

            # If the question contains a "country" term and the sentence contains a LOCATION entity, then it is a confident score
            if temp[j].lower() in ['country','countries','olympics']:
                person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(sentence_list[i])
                if loc_list != []:
                    score=score + 6*len(loc_list)  # Confidence score increases with increasing number of countries appearing in the sentence.



        sent_score_list.append(score)

    #print 'Sent score list values are:',sent_score_list

    # Selecting the sentence that has the maximum score.

    max_score_value =max(sent_score_list)
    #print 'Max value is :', max_score_value


    # Now we have to choose the best sentence among the sentences in candidate list.Choosing sentences
    # which have both maximum value and present in candidate list

    for i in range(0, len(sent_score_list)):
         if sent_score_list[i]==max_score_value:
                final_sent_list.append(complete_sentence_list[i])

    #print 'Final list is:', final_sent_list
    temp_solution=[]
    answer_loc=[]
    if len(final_sent_list) == 1:
        print 'Answer: ',final_sent_list[0] +'\n'
        #print '\n'
        return final_sent_list[0]

    else:

        # Multiple sentences tie for the top score; take the one that appears first in the story
        #result=' '.join(final_sent_list)
        result=final_sent_list[0]
        print 'Answer: ', result +'\n'
        #print '\n'
        return result
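
Rule 4 above ('name' in the question, a stemmed form of name/call/known in the sentence) relies on NLTK's SnowballStemmer. A small self-contained illustration of just that check, assuming nltk is installed; the +20 bonus is the value used in the code above:

from nltk.stem.snowball import SnowballStemmer

def name_rule_bonus(question, sentence, bonus=20):
    # Award the bonus when the question asks for a name and the sentence
    # contains a stemmed form of name/call/known
    stemmer = SnowballStemmer('english')
    if 'name' not in [w.lower() for w in question.split()]:
        return 0
    for word in sentence.split():
        if stemmer.stem(word.lower()) in ['name', 'call', 'known']:
            return bonus
    return 0

# name_rule_bonus('What is the name of the ship', 'The vessel was called Titanic') -> 20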
Code example #5
def answering_where(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,dateline):

    # Lists used in this function

    candidate_sent_list=[]
    sent_score_list=[]
    final_sent_list=[]
    master_loc_list=[]


    location_prepositions=['in','at','near','inside','on','behind','above','under','next to','below','between','around',
                           'outside','among','on the right', 'across','front','opposite','before','beneath','beside','against']
    when_year_verbs=['play','fought'] #'win','lose','victorius']

    when_year=[str(year) for year in range(1400, 2000)]

    #print 'Question is :',cleansedQuestion



    # 1. Find score for each sentence using word match score first

    for i in range(0,len(sentence_list)):
        score=0

        score= score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])


        #2. Check if the sentence contains location preposition, then it is a good clue
        temp=complete_sentence_list[i].split()
        flag=0
        for j in range(0, len(temp)):
            if temp[j] in location_prepositions:
                flag=1

        if flag == 1:
            score= score + 4

        # 3. Check if the sentence contains Location entity
        person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(sentence_list[i])

        if loc_list != []: # If sentence contains location
            score=score + 6

            candidate_sent_list.append(sentence_list[i])
            master_loc_list.append((' '.join(loc_list),i))


        sent_score_list.append(score)
    #print 'Master loc list is :',master_loc_list
    #print 'Candidate sentences based on Location entity are:',candidate_sent_list

    # For when and where questions the answer to the question could also be from the timeline of the story

    dateline_score=0
    q_words=cleansedQuestion.split()
    for i in range(0,len(q_words)):
        # 1. If question contains "happen", it is a good clue that the dateline could be the answer
        if q_words[i].lower()=='happen':
            dateline_score=dateline_score+4

        # 2. If question contains "take place", it is a good clue that the dateline could be the answer
        if i != len(q_words)-1 and q_words[i].lower()=='take' and q_words[i+1].lower()=='place':
            dateline_score=dateline_score+4

        # 3. If question contains "this", it is a slam dunk that the dateline could be the answer
        if q_words[i].lower()=='this':
            dateline_score=dateline_score+12

        # 4. If question contains "story", it is a slam dunk that the dateline could be the answer
        if q_words[i].lower()=='story':
            dateline_score=dateline_score+12

    #print 'Date line score for the question is :',dateline_score

    # Selecting the sentence that has the maximum score. If the dateline score is greater than max of sent_score choose
    # dateline_score else choose the maximum score from sent_score_list

    max_score_value =max(sent_score_list)
    #print 'Max value is :', max_score_value


    # Checking which of the scores is greater. IF score from sent_Score_list is greater than dateline score, then we find
    # the corresponding sentences and choose the best among them. Else we return the dateline as the result.
    if max_score_value > dateline_score:


        # Now we have to choose the best sentence among the sentences in candidate list.Choosing sentences
        # which have both maximum value and present in candidate list

        for i in range(0, len(sent_score_list)):
             if sent_score_list[i]==max_score_value:
                    final_sent_list.append((complete_sentence_list[i],i))

        #print 'Final sent list is:',final_sent_list
        #TODO - check which works better
        #TODO - based on the verbs in the question select the appropriate sentence from sentence list
        '''for i in range(0, len(sent_score_list)):
            if sent_score_list[i] in candidate_sent_list:
                if sent_score_list[i]==max_score_value:
                    final_sent_list.append(sentence_list[i])
                else:
                    final_temp_list.append(sentence_list[i])'''

        # Now from the sentences extracting out the years or the date /time values alone and representing them
        temp_solution=[]
        answer_loc=[]
        if len(final_sent_list) == 1:
            sent=final_sent_list[0][0]
            index=final_sent_list[0][1]
            #print index
            for i in range(0,len(master_loc_list)):
                answer_loc.append(master_loc_list[i][0])

            print 'Answer: ',' '.join(set(answer_loc))+'\n'
            #print '\n'
            return ' '.join(set(answer_loc))
            '''print master_loc_list[i]
                temp=master_loc_list[i][1]
                if temp==index:
                    result=master_loc_list[i][0]
                    print 'Result is :',master_loc_list[i][0]
                    return result'''
        else:
            # Multiple sentences tie for the top score; take the one that appears first in the story
            temp_solution.append(final_sent_list[0][0])

            result=' '.join(temp_solution)
            print 'Answer: ', result+'\n'
            #print '\n'
            return result

    else:
        result=dateline
        print 'Answer: ', result+'\n'
        #print '\n'
        return result
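
Clue 2 above awards +4 whenever a sentence contains a location preposition. A self-contained restatement of that check, keeping only the single-word prepositions (the multi-word entries such as 'next to' and 'on the right' in the original list can never match a single token produced by split()):

LOCATION_PREPOSITIONS = ['in', 'at', 'near', 'inside', 'on', 'behind', 'above',
                         'under', 'below', 'between', 'around', 'outside',
                         'among', 'across', 'front', 'opposite', 'before',
                         'beneath', 'beside', 'against']

def location_clue_bonus(sentence, bonus=4):
    # +4 when the sentence contains a location preposition
    for token in sentence.lower().split():
        if token in LOCATION_PREPOSITIONS:
            return bonus
    return 0

# location_clue_bonus('The treasure was hidden near the old lighthouse') -> 4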
Code example #6
def answering_who(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_person_list,sent_prof_list):

    # Lists used in this function

    sent_score_list=[]
    q_verblist=[]


    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','do','for','from',
                          'has','have','he','in','is','it','its','of','on','that','the',
                          'to','was','were','will','with']

    temp_q=cleansedQuestion
    #temp_q=temp_q.replace('"','')
    #temp_q=temp_q.replace("'",'"')
    temp_q=temp_q.replace('?','')

    lmtzr=WordNetLemmatizer()
    pos_list= POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0],'v'))



    #print 'Temp_q: ',temp_q

    q_person_list,q_org_list,q_loc_list,q_month_list,q_time_list,q_money_list,q_percent_list,q_prof_list = NER.named_entity_recognition(temp_q)

    for i in range(0, len(complete_sentence_list)):
        #print 'Sentence is :', complete_sentence_list[i]
        score=0

        # 1. Score using word match rule. Match words in question with the words in stop free sentence

        #print 'Sentence is :',sentence_list[i]
        score=score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        # 2. If question does not contain name but the answer contains NAME then you are confident(+6)
        if q_person_list==[]:

            #Giving more weights to sentences having more names in it
            if sent_person_list[i] !=[] or sent_prof_list[i] !=[]:
                #score=score + 6*len(sent_person_list) + 6* len(sent_prof_list)
                score=score + 6

            # 3. If question does not contain a name and answer contains the word "name" then good_clue (+4)
            lmtzr = WordNetLemmatizer()
            temp= complete_sentence_list[i].split()
            for k in range(0,len(temp)):
                if lmtzr.lemmatize(temp[k].lower())=='name':
                    score=score + 4

            #  4. Awards points to all sentences  that contain a name or reference to a human

            if sent_person_list[i] !=[] or sent_prof_list[i] !=[]:
                #score=score + 4*len(sent_person_list) + 4* len(sent_prof_list)
                score=score+4


        # 5. If the answer contains the exact verb found in the question after the "Who" or in fact in the whole question
        # then it is a confident clue and we reward it more

        sent_pos_list= POS_Tagging.pos_tagging(complete_sentence_list[i])

        '''for m in range(0, len(sent_pos_list)):
            if sent_pos_list[m][1] in ['VB','VBD','VBN','VBG','VBZ'] and sent_pos_list[m][0] in stop_words_free_question.split():
                score=score + 18
                #print 'Score now is :', score'''

        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(sent_pos_list[k][0],'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score=score + 6



        # 6. If the question contains a profession name, the answer has to be a person and sentence would have
        #the person name and the profession

        if q_prof_list!=[]:
            for k in complete_sentence_list[i].split():
                if k.lower() in q_prof_list:
                    #print 'Profession Yes !'
                    score=score+18

        else:  #Question contains name so the chances of answer being a profession name are decent
            if sent_prof_list[i] !=[]:
                score=score+6


        sent_score_list.append(score)

    #print 'Sent score list is :',sent_score_list


    # Selecting the sentence that has the maximum score. If it is a tie, we choose the sentence that appears first


    candidate_list=[]
    npfinal_list=[]
    temp_list=[]
    answer_list=[]

    max_score_value=max(sent_score_list)

    #print 'Max score is :',max_score_value

    for i in range(0, len(complete_sentence_list)):
        if sent_score_list[i]==max_score_value:
            candidate_list.append((complete_sentence_list[i],i))
    #print 'Candidate list is :',candidate_list


    #If there is only one sentence, then choose the sentence and then do the processing to display the answer

    if len(candidate_list)==1:

        temp_str= candidate_list[0][0]
        index=candidate_list[0][1]
        #Cleaning up the candidate sentence
        # Replacing double quotes with blank and single quotes with "
        #temp_str=temp_str.replace('"','')
        #temp_str=temp_str.replace("'",'"')
        #temp_str=temp_str.replace(',','').replace('?','').replace('!','')


    # If there are multiple candidates, then choose the sentence which appeared first in the story  and then do the processing
    else:
        # There is more than one candidate sentence; take the one which appears first in the story
        temp_str=candidate_list[0][0]
        index =candidate_list[0][1]

    ####################### SENTENCE PROCESSING TO FIND THE ANSWER ###############################

    #Just pick out the noun-phrase or PERSON names from the sentence

    #s_plist,s_orglist,s_loclist,s_monthlist,s_timelist,s_moneylist,s_percentlist,s_proflist=NER.named_entity_recognition(temp_str)
    s_plist=sent_person_list[index]
    s_proflist=sent_prof_list[index]

    #print 'Prof list is:',s_proflist

    #If the question has the name of a person, then the answer sentence would most probably contain
    #the name of a person, but it should not be the name of the person appearing in the question.
    #If we can't find any other name in the candidate sentence, we do POS tagging and display the NOUN phrases

    #print 'Question person list is:',q_person_list
    #print 'Sentence person list is:',s_plist

    result_list=[]
    q_loc_who_list=[]

    if q_person_list==[] and s_plist==[]:   #If both the question does not have a name and the sentence does not have a name,print the whole sentence minus words which appear in question

        '''pos_np_list= POS_Tagging.pos_noun_tagging(temp_str)
        if pos_np_list != []:
            for x in pos_np_list:
                if x not in temp_q and x[0].isupper():   #Noun phrases or names generally start with an upper case character
                    print 'First character caps',x
                    result_list.append(x)
            return ' '.join(result_list)'''

        for k in temp_str.split():
            if k not in temp_q:
                result_list.append(k)

        return ' '.join(result_list)

    elif q_person_list !=[] and s_plist !=[]:    #To counter situations when both question and sentence has names Ex. Who defeated who ?
        for k in s_plist:
            if k not in temp_q:
                answer_list.append(k)


    elif q_person_list==[] and s_plist !=[]:
        for i in range(0, len(s_plist)):
            if s_plist[i] not in q_person_list and s_plist[i] not in temp_q:  #To counter situations where question has a name and NER doesn't identify it
                answer_list.append(s_plist[i])


    elif q_person_list != [] and s_proflist !=[]:  #To counter situations for 'Who is X' type questions which could have a profession name in the answer
        for k in s_proflist:
            answer_list.append(k)

    elif q_person_list==[] and q_loc_list !=[]: # Who is <X> where ?
        #print 'Question has no name but has a location'
        for k in temp_str.split():
            if k not in temp_q:
                q_loc_who_list.append(k)
        if q_loc_who_list !=[]:
            return ' '.join(q_loc_who_list)

    '''elif q_person_list==[] and s_proflist !=[]:
        for k in s_proflist:
            answer_list.append(k)'''

    if answer_list != [] :#and flag==1:                #Indicating candidate sentence has a name other than that in question
        result= ' '.join(answer_list)
    else:

        #Pick out the noun phrase or nouns and then display them as answer

        np_list = POS_Tagging.pos_noun_tagging(temp_str)
        for x in np_list :
            if x not in temp_q:
                npfinal_list.append(x) #Removing all occurences of existing noun phrases from the question


        #print 'NP Final list after removal is',npfinal_list
        if npfinal_list !=[]:
            result=' '.join(npfinal_list)

        else:
            result=temp_str                  # Printing out the whole sentence

    #print 'Result is:',result
    return result
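
Rule 5 above rewards sentences whose verbs, once lemmatized, also occur in the question. The project's POS_Tagging module is not shown, so the sketch below substitutes nltk.pos_tag and nltk.word_tokenize (it needs the usual NLTK data: punkt, the perceptron tagger and wordnet); it illustrates the idea rather than reproducing the original code path:

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

VERB_TAGS = ('VB', 'VBD', 'VBZ', 'VBN')

def shared_verb_lemmas(question, sentence):
    # Return the verb lemmas that the question and the sentence have in common
    lmtzr = WordNetLemmatizer()

    def verb_lemmas(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(text))
        return set(lmtzr.lemmatize(word, 'v') for word, tag in tagged
                   if tag in VERB_TAGS)

    return verb_lemmas(question) & verb_lemmas(sentence)

# shared_verb_lemmas('Who defeated the champion?',
#                    'The challenger defeated the champion in round three.')
# -> set(['defeat']) with the standard NLTK tagger models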
Code example #7
def answering_when(
    cleansedQuestion, stop_words_free_question, complete_sentence_list, sentence_list, dateline, month_list, time_list
):

    # Lists used in this function

    candidate_list = []
    sent_score_list = []

    stanford_stop_words_list = [
        "a",
        "an",
        "and",
        "are",
        "as",
        "at",
        "be",
        "buy",
        "for",
        "from",
        "has",
        "he",
        "in",
        "is",
        "it",
        "its",
        "of",
        "on",
        "that",
        "the",
        "to",
        "was",
        "were",
        "will",
        "with",
    ]

    time_nos = [
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "twenty",
        "thirty",
        "forty",
        "fifty",
        "sixty",
        "seventy",
        "eighty",
        "ninety",
        "hundred",
        "thousand",
        "million",
        "billion",
        "trillion",
    ]

    temp_q = cleansedQuestion
    temp_q = temp_q.replace('"', "")
    temp_q = temp_q.replace("'", '"')
    temp_q = temp_q.replace("?", "")

    # print 'Question is :',temp_q

    # print 'Month list is :',month_list
    # print 'Time list is :',time_list
    # 1. Check if the sentence contains "TIME" expression

    # print 'Time list is :',time_list
    for i in range(0, len(sentence_list)):
        score = 0
        # print 'Sentence is :',complete_sentence_list[i]
        if time_list[i] != [] or month_list[i] != []:  # Sentence contains a time expression

            # Now compute the wordmatch score
            score = score + 4 + WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        # 2. Check if the Question contains "the last" and sentence contains any of "first,last,since,ago", then score+= slam_dunk

        temp = cleansedQuestion.split()
        for m in range(0, len(temp) - 1):
            if temp[m].lower() == "the" and temp[m + 1].lower() == "last":
                for sent in sentence_list[i].split():
                    if sent in ["first", "last", "since", "ago"]:
                        score = score + 20

            # 3. If the question contains {start,begin} and sentence contains {start,begin,since,year}
        for word in cleansedQuestion.split():
            if word.lower() in ["start", "begin"]:
                for sent in sentence_list[i].split():
                    if sent in ["start", "begin", "since", "year"]:
                        score = score + 20

        sent_score_list.append(score)

        # 4. Verb match ??

    # print 'Sent score list is :', sent_score_list

    ##################### COMPUTING THE DATE LINE SCORE FOR THE QUESTION #####################

    # For when and where questions the answer to the question could also be from the timeline of the story

    dateline_score = 0
    temp_list = cleansedQuestion.split()
    for i in range(0, len(temp_list)):
        # 1. If question contains "happen", it is a good clue that timeline could be answer
        if temp_list[i].lower() == "happen":
            dateline_score = dateline_score + 4

        # 2. If question contains "take place", it is a good clue that timeline could be answer
        if i != len(temp_list) - 1 and temp_list[i].lower() == "take" and temp_list[i + 1].lower() == "place":
            dateline_score = dateline_score + 4

        # 3. If question contains "this", it is slam_dunk that timeline could be answer
        if temp_list[i].lower() == "this":
            dateline_score = dateline_score + 20

        # 4. If question contains "story", it is slam_dunk that timeline could be answer

        if temp_list[i].lower() == "story":
            dateline_score = dateline_score + 20

    # print 'Date line score for the question is :',dateline_score

    # Selecting the sentence/sentences that has the maximum score.

    max_score_value = max(sent_score_list)

    # Creating candidate list of sentences based on the maximum sent score

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            candidate_list.append((complete_sentence_list[i], i))

    # print 'Candidate list is :',candidate_list

    # Checking which of the scores is greater. IF score from sent_Score_list is greater than dateline score, then we find
    # the corresponding sentences and choose the best among them. Else we return the dateline as the result.
    if max_score_value > dateline_score:

        # Now we have to choose the best sentence among the sentences in candidate list

        if len(candidate_list) == 1:

            temp_str = candidate_list[0][0]
            index = candidate_list[0][1]

        # If there are multiple candidates, then choose the sentence which appeared first in the story and then do the processing
        else:
            # There are more than one candidate sentences. Print the first sentence
            for k in range(0, len(candidate_list)):

                if month_list[candidate_list[k][1]] != []:  # Rewarding sentences with month

                    # Cleaning up the candidate sentence
                    temp_str = candidate_list[k][0]
                    index = candidate_list[k][1]
                    break
                else:
                    temp_str = candidate_list[0][0]
                    index = candidate_list[0][1]

        # Cleaning up the candidate sentence
        # Replacing double quotes with blank and single quotes with "
        # temp_str=temp_str.replace('"','')
        # temp_str=temp_str.replace("'",'"')
        # temp_str=temp_str.replace(',','').replace('?','').replace('!','')

        ################### SENTENCE PROCESSING #######################

        result_list = []
        answer_list = []

        s_monthlist = month_list[index]
        s_timelist = time_list[index]

        # print 'Month list:',s_monthlist
        # print 'Time list:', s_timelist

        if (
            s_monthlist == [] and s_timelist == []
        ):  # The selected sentence does not seem to have a time or month expression, then print whole sentence  minus the words in the question
            for k in temp_str.split():
                if k not in temp_q:
                    result_list.append(k)

            return " ".join(result_list)

        if s_monthlist != []:
            for i in range(0, len(s_monthlist)):
                if (
                    s_monthlist[i] not in temp_q
                ):  # To counter situations where question has a month and NER doesn't identify it
                    answer_list.append(s_monthlist[i])

        # If time list is not empty
        if s_timelist != []:

            temp_list = temp_str.split()
            for j in range(0, len(temp_list)):
                if temp_list[j] in s_timelist and j != 0 and temp_list[j] not in temp_q:  # and j!=len(temp_list)-1:
                    if temp_list[j - 1] in stanford_stop_words_list:
                        answer_list.append(
                            temp_list[j - 1].lower()
                        )  # Appending the word before the time list which is generally a number or indicative of the time
                        if j - 2 >= 0:
                            answer_list.append(temp_list[j - 2].lower())
                    else:
                        answer_list.append(
                            temp_list[j - 1].lower()
                        )  # Appending the word just before the time token, which gives the result in a few cases

            # Non-days time values
            for i in range(0, len(s_timelist)):
                if s_timelist[i] not in temp_q:  # and s_timelist[i] not in ['days']:
                    answer_list.append(s_timelist[i])

            # Time list values will usually have numbers or other prepositions before it which will give us the complete answer
            time_prep = ["over", "period", "within", "inside", "under", "ago", "through", "past"]

            for k in temp_str.split():
                if k.lower() in time_prep:
                    answer_list.append(k.lower())

                if k.isdigit():
                    answer_list.append(k)

                if k.lower() in time_nos:
                    answer_list.append(k.lower())

        # print 'Answer list is :',set(answer_list)

        temp_result = []

        if answer_list != []:
            result = " ".join(list(set(answer_list)))
            return result

        else:
            for k in temp_str.split():
                if k not in temp_q:
                    temp_result.append(k)

            return " ".join(temp_result)

    else:
        result = dateline
        return result
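
The time-phrase assembly above collects a time token's left-hand neighbour (often a number) plus nearby number words and time prepositions, then dedups with set(). A self-contained sketch of that heuristic, preserving token order instead of using set():

TIME_NOS = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
            'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
            'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million',
            'billion', 'trillion']
TIME_PREP = ['over', 'period', 'within', 'inside', 'under', 'ago', 'through', 'past']

def time_phrase_tokens(sentence, time_words):
    # Collect the time words plus the neighbouring numbers/prepositions that
    # usually complete the answer phrase (e.g. 'two days ago' around 'days')
    tokens = sentence.split()
    kept = []
    for j in range(len(tokens)):
        low = tokens[j].lower()
        if low in time_words and j > 0:
            kept.append(tokens[j - 1].lower())   # the word just before, often a number
            kept.append(low)
        elif low in TIME_PREP or low in TIME_NOS or tokens[j].isdigit():
            kept.append(low)
    deduped = []
    for t in kept:                               # order-preserving de-duplication
        if t not in deduped:
            deduped.append(t)
    return deduped

# time_phrase_tokens('The ship sank two days ago', ['days']) -> ['two', 'days', 'ago']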
Code example #8
def answering_how(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_time_list,sent_percent_list):

    # Lists used in this function

    candidate_sent_list=[]
    sent_score_list=[]
    final_sent_list=[]
    q_verblist=[]
    best=[] # List of the best scoring sentences based on word match with the question


    much_list=['thousand','thousands','hundred','hundreds','dollars','cents','million','billion','trillion','none','nothing','everything','few','something',
               'dollars','grams','kilos','kilogram','kilograms','milligrams','mg','metre','centimetre','inches','feet','foot','ft','cent','percent','salary','pay','income','loss','profit','one','two','three','four','five','six','seven','eight','nine','ten',
               'twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety',
               'hour','hours','minutes','seconds','second','minute','half','quarter','more','less','than']

    many_list=['one','two','three','four','five','six','seven','eight','nine','ten',
               'twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety','hundred',
               'thousand','million','billion','trillion']

    how_often=['daily','weekly','bi-weekly','fortnightly','monthly','bi-monthly','quarterly','half-yearly','yearly','decade','millennium',
               'day','everyday','night','afternoon','noon','hourly','hours','minutes','seconds','second','minute']
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

    measurement_verbs=[]

    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','do','for','from',
                          'has','have','he','in','is','it','its','of','on','that','the',
                          'to','was','were','will','with']

    abbreviation_list=[('Mt.','Mount')]


    ########################### QUESTION PROCESSING ##################

    temp_q=cleansedQuestion
    #temp_q=temp_q.replace('"','')
    #temp_q=temp_q.replace("'",'"')
    temp_q=temp_q.replace('?','')

    # Expand known abbreviations in the question (e.g. "Mt." -> "Mount")
    for abbr,expansion in abbreviation_list:
        temp_q=temp_q.replace(abbr,expansion)

    print 'Question is :',temp_q


    lmtzr=WordNetLemmatizer()
    pos_list= POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0],'v'))

    #print 'Question verb list is :',q_verblist

    #print 'Time list is:',sent_time_list

    ################## SENTENCE PROCESSING AND SCORING ###################

    for i in range(0,len(complete_sentence_list)):
        score=0

        # 1. Find score for each sentence using word match score first

        #print 'The sentence is :',complete_sentence_list[i]
        #score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])



        #2. If the question contains "many" and sentence contains an expression of number, then it is confident score

        for k in temp_q.split():
            if k.lower()=="many":
                for m in complete_sentence_list[i].split():
                    if nums.match(m) or m in many_list:
                        score=score + 6

            #3. If the question contains "much" and sentence contains an expression for distance or for money, then it is a confident score
            elif k.lower()=="much":
                for m in complete_sentence_list[i].split():
                    if m.lower()  in ['money','earn','salary','profit','loss'] or m in much_list:
                        score=score+6

            #4. If the question contains "often" and sentence contains an expression of time, then it is more than confident score
            elif k.lower()=='often' or k.lower() =='long':
                for m in complete_sentence_list[i].split():
                    if  m in how_often: #m.lower() in sent_time_list[i] or
                        score=score+10
                        break

        '''if much_flag==1 and money_flag==1:
            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if temp2[k] in much_list:
                    score=score +20 #slam-dunk

        elif much_flag==1:

            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if nums.match(temp2[k]) or temp2[k] in much_list:   # Implies answer contains a number
                    #print 'much Q - number or list sentence'
                    score=score+6'''

        sent_score_list.append(score)

    print 'Score list is:',sent_score_list
    max_score_value=max(sent_score_list)

    # Finding the sentences which have the highest score and adding them to the best list

    for i in range(0,len(sentence_list)):
        if sent_score_list[i]==max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    print 'Final sent list is:',final_sent_list

    temp_result=[]
    temp_solution=[]
    if len(final_sent_list) == 1:

        #If the question contains often, the sentence will usually contain a time expression. If so pick
        #that expression as the solution

        if final_sent_list[0].endswith('.'):
            req_string=final_sent_list[0][:-1]
            temp2=req_string.split()
        else:
            temp2=final_sent_list[0].split()

    else:

        if final_sent_list[0].endswith('.'):
            req_string=final_sent_list[0][:-1]
            temp2=req_string.split()
        else:
            temp2=final_sent_list[0].split()           #Picking the sentence which comes first when there are multiple candidates


    #If sentence contains per cent most probably it would be an answer to the how question (much or many)
    for k in range(0,len(temp2)):
        if k != 0 and k != len(temp2)-1:
            if temp2[k].lower()=='per' and temp2[k+1].lower()=='cent':
                return ' '.join(temp2[k-1:k+2])


    if 'many' in temp_q.split():
        #print 'many'

        for m in range(0,len(temp2)):
            if nums.match(temp2[m]) or temp2[m] in many_list:
                print 'Yes'
                temp_solution.append(temp2[m])

        print 'Temp solution is:',temp_solution
        return ' '.join(temp_solution)

    elif 'much' in temp_q.split():
        #print 'many'

        for m in range(0,len(temp2)):
            if nums.match(temp2[m]) or temp2[m] in much_list:
                temp_solution.append(temp2[m])

        return ' '.join(temp_solution)



    for k in temp2:
        if k not in temp_q.split():
            temp_result.append(k)

    return ' '.join(temp_result)
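The scoring rules above treat a token as an "expression of number" when it matches the nums pattern or appears in many_list/much_list. As a quick standalone illustration (the tokens below are made up and this snippet is not part of the original pipeline), the regex accepts plain integers and decimals but not spelled-out numbers, which is why the word lists are needed as well:

import re

nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

for token in ['32', '-3.5', '1987', 'twelve', 'cent']:
    print token, '->', bool(nums.match(token))
# 32 -> True, -3.5 -> True, 1987 -> True, twelve -> False, cent -> False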


def answering_who(cleansedQuestion,stop_words_free_question,sentence_list):

    # Declaring globals to be used in this function

    wordmatch_score_list=[]
    sent_containing_person_score_list=[]
    sent_containing_name_score_list=[]
    sent_containing_person_and_name_score_list=[]
    sent_containing_person_or_name_score_list=[]
    master_person_list=[]
    sent_score_list=[]

    #print 'Question is :',cleansedQuestion

    snowball_stemmer = SnowballStemmer('english')

    for i in range(0, len(sentence_list)):
        #print 'Sentence is :', sentence_list[i]
        score=0
        # 1. Score using word match rule
        wordmatch_score_list.append(WM.stemWordMatch(cleansedQuestion,sentence_list[i]))
        score=score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        # 2. If question does not contain name but the answer contains NAME then you are confident(+6)
        q_person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(cleansedQuestion)
        if q_person_list==[]:
            sent_plist,sent_olist,sent_llist,sent_tlist,sent_proflist=NET.named_entity_tagging(sentence_list[i])
            master_person_list.append((sent_plist,i))
            if sent_plist !=[]:
                score=score + 6*len(sent_plist)

            # 3. If question does not contain a name and answer contains the word "name" then good_clue (+4)
            temp= sentence_list[i].split()
            for k in range(0,len(temp)):
                if snowball_stemmer.stem(temp[k].lower())=='name':
                    score=score +4

        else:
            #Question has a name, and if the sentence contains the same name, then it is a good clue.

            #  4. Awards points to all sentences  that contain a name or reference to a human
            sent_plist,sent_olist,sent_llist,sent_tlist,sent_proflist=NET.named_entity_tagging(sentence_list[i])
            master_person_list.append(sent_plist)
            if sent_plist==q_person_list:
                score=score+4*len(sent_plist)

            elif sent_plist != [] or "name" in sentence_list[i]:
                score=score+4

            '''if sent_plist==[] and "name" in sentence_list[i]:
                sent_containing_name_score_list.append(4)
            else:
                sent_containing_name_score_list.append(0)'''
        sent_score_list.append(score)

    #print 'Sent score list is :',sent_score_list
    #print 'Master person list is:',master_person_list

    # Selecting the sentence that has the maximum score. If it is a tie, we choose the sentence that appears first
    # Preference is given to sentences which have a person name in them. If there is only one such sentence that is the answer


    candidate_list=[]
    final_result_set=[]
    temp_list=[]

    max_score_value=max(sent_score_list)

    #print 'Max score is :',max_score_value

    for i in range(0, len(sentence_list)):
        if sent_score_list[i]==max_score_value:
            candidate_list.append((sentence_list[i],i))
    #print 'Candidate list is :',candidate_list

    if len(candidate_list)==1:
        q_plist,q_olist,q_llist,q_tlist,q_proflist=NET.named_entity_tagging(stop_words_free_question)
        #If the question has a profession but not the name of a person, then the answer sentence most probably
        #contains the name of a person
        #print 'Question Person List',q_plist

        if q_plist == [] or q_proflist != []:
            #temp_result=master_person_list[candidate_list[0][1]][0]
            s_plist,s_olist,s_llist,s_tlist,s_proflist=NET.named_entity_tagging(candidate_list[0][0])
            result= ' '.join(s_plist)
            print 'Answer: ',result+'\n'
            #print '\n'
            return result

        elif q_plist != [] or q_proflist != []:
            #print candidate_list[0][1]
            s_plist,s_olist,s_llist,s_tlist,s_proflist=NET.named_entity_tagging(candidate_list[0][0])
            result= ' '.join(s_plist)
            print 'Answer: ',result+'\n'
            #print '\n'
            return result

        elif q_plist != [] or q_proflist == []:  # Implies the question has a name, so pick the sentence that contains the same name as the question
            result=candidate_list[0][0]
            print 'Answer: ',result+'\n'
            #print '\n'
            return result
    else:
        # There is more than one candidate sentence; pick the first one
        for k in range(0, len(candidate_list)):
            val=candidate_list[k][0]
            #print 'val is :',val
            index=candidate_list[k][1]
            #print 'index is :', index
            temp_list.append(index)
            break

        #result=' '.join(temp_list)
        x= master_person_list[temp_list[0]]
        #print 'x is :', x
        result2 = temp_list[0]
        #for i in range(0,len(x)):
        if x != []:
            temp=' '.join(x[0])
            if temp not in stop_words_free_question:
                final_result_set.append(temp)
        else:
            final_result_set.append(val)

        if final_result_set != []:
            print 'Answer: ',' '.join(final_result_set)+'\n'
            #print '\n'
            #print 'Result 2 is :',result2
            return ' '.join(final_result_set)
        else:
            print 'Answer: ',temp+'\n'
            #print '\n'
            return temp #' '.join(x)


    # Checking to see if the question contains a profession name. If so, the answer should be a sentence containing a name, and a higher weight
    # is given to the score from Rule 2. Else Rule 1 and Rule 2 are given equal weight.

Code example #10
0
def train(FIS_name,
          data,
          target_col,
          mf,
          Ncentroids,
          overlap,
          alpha=0.5,
          iterations=50,
          sa=False,
          sa_plot=False):
    '''
    Trains a FIS and writes all the properties of this FIS to a
    <FIS_name>.FIS file using the write function.

    Inputs:
        FIS_name: base name of the .FIS file that is written
        data: numpy array of size > number of centroids x 2
        target_col: integer index of the target column
        Ncentroids: either an integer (the same for each feature)
                    or an array of size = number of features
        mf: 'triangle', 'trapezoid' or 'Gaussian'
        overlap: number between 0 and 1,
                 when the mf is Gaussian, overlap is the variance;
                 when triangle/trapezoid, overlap is half of the base
        iterations: number of iterations for the simulated annealing
        sa: if True, refine the learned rule base with simulated annealing

    Output:
        writes <FIS_name>.FIS containing the method, mf, overlap and:
        RB: list of lists of integer rules
        target_centroids: list with scaled target centroids
        feature_centroids: the other feature centroids
    '''
    # scale the data
    data, min_x, max_x = scale(data)
    # get centroids
    centroids = cluster(data, target_col, Ncentroids, plot=False)
    # learn WM rules
    RB = WM.learn(data, centroids, overlap, mf, target_col)
    # return everything needed for testing
    target_centroids = centroids[target_col]
    # delete target centroid for testing
    feature_centroids = np.delete(centroids, target_col, 0)
    # delete target values for testing
    targets = data[:, target_col]
    data = np.delete(data, target_col, 1)
    method = 'WM'
    # for simulated annealing, get the new rule base
    if sa:
        method = 'WM+SA'
        RB = SA.search(data,
                       targets,
                       RB,
                       alpha,
                       feature_centroids,
                       overlap,
                       mf,
                       target_centroids,
                       min_x[target_col],
                       max_x[target_col],
                       plot=sa_plot,
                       iterations=iterations)
    # Write FIS file in the format:
    # FIS_name.FIS
    with open(FIS_name + '.FIS', "w") as fis_file:
        write(fis_file, method, mf, overlap, target_centroids,
              feature_centroids, RB)
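A minimal usage sketch of train (illustrative only: it assumes this module's scale, cluster, WM, SA and write helpers are importable alongside train, and the toy data, centroid count and output name below are made up):

import numpy as np

if __name__ == '__main__':
    # 20 rows: two feature columns plus one target column (index 2)
    data = np.random.rand(20, 3)
    train('demo_fis',            # writes demo_fis.FIS via write()
          data,
          target_col=2,
          mf='Gaussian',
          Ncentroids=3,
          overlap=0.2,
          sa=False)              # skip the simulated-annealing refinement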
Code example #11
0
def answering_where(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list, dateline):

    # Declaring globals to be used in this function

    candidate_sent_list = []
    sent_score_list = []
    final_sent_list = []
    master_loc_list = []

    location_prepositions = [
        'in', 'at', 'near', 'inside', 'on', 'behind', 'above', 'under',
        'next to', 'below', 'between', 'around', 'outside', 'among',
        'on the right', 'across', 'front', 'opposite', 'before', 'beneath',
        'beside', 'against'
    ]
    when_year_verbs = ['play', 'fought']  #'win','lose','victorius']

    when_year = [str(y) for y in range(1400, 2000)]

    #print 'Question is :',cleansedQuestion

    # 1. Find score for each sentence using word match score first

    for i in range(0, len(sentence_list)):
        score = 0

        score = score + WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        #2. Check if the sentence contains location preposition, then it is a good clue
        temp = complete_sentence_list[i].split()
        flag = 0
        for j in range(0, len(temp)):
            if temp[j] in location_prepositions:
                flag = 1

        if flag == 1:
            score = score + 4

        # 3. Check if the sentence contains Location entity
        person_list, org_list, loc_list, time_list, prof_list = NET.named_entity_tagging(
            sentence_list[i])

        if loc_list != []:  # If sentence contains location
            score = score + 6

            candidate_sent_list.append(sentence_list[i])
            master_loc_list.append((' '.join(loc_list), i))

        sent_score_list.append(score)
    #print 'Master loc list is :',master_loc_list
    #print 'Candidate sentences based on Location entity are:',candidate_sent_list

    # For when and where questions the answer to the question could also be from the timeline of the story

    dateline_score = 0
    q_words = cleansedQuestion.split()
    for i in range(0, len(q_words)):
        # 1. If question contains "happen", it is a good clue that timeline could be answer
        if q_words[i].lower() == 'happen':
            dateline_score = dateline_score + 4

        # 2. If question contains "take place", it is a good clue that timeline could be answer
        if i != len(q_words) - 1 and q_words[i].lower() == 'take' and q_words[i + 1].lower() == 'place':
            dateline_score = dateline_score + 4

        # 3. If question contains "this", it is slam_dunk that timeline could be answer
        if q_words[i].lower() == 'this':
            dateline_score = dateline_score + 12

        # 4. If question contains "story", it is slam_dunk that timeline could be answer

        if q_words[i].lower() == 'story':
            dateline_score = dateline_score + 12

    #print 'Date line score for the question is :',dateline_score

    # Selecting the sentence that has the maximum score. If the dateline score is greater than max of sent_score choose
    # dateline_score else choose the maximum score from sent_score_list

    max_score_value = max(sent_score_list)
    #print 'Max value is :', max_score_value

    # Checking which of the scores is greater. If the score from sent_score_list is greater than the dateline score, then we find
    # the corresponding sentences and choose the best among them. Else we return the dateline as the result.
    if max_score_value > dateline_score:

        # Now we have to choose the best sentence among those in the candidate list. Choosing sentences
        # which both have the maximum score and are present in the candidate list

        for i in range(0, len(sent_score_list)):
            if sent_score_list[i] == max_score_value:
                final_sent_list.append((complete_sentence_list[i], i))

        #print 'Final sent list is:',final_sent_list
        #TODO - check which works better
        #TODO - based on the verbs in the question select the appropriate sentence from sentence list
        '''for i in range(0, len(sent_score_list)):
            if sent_score_list[i] in candidate_sent_list:
                if sent_score_list[i]==max_score_value:
                    final_sent_list.append(sentence_list[i])
                else:
                    final_temp_list.append(sentence_list[i])'''

        # Now from the sentences extracting out the years or the date /time values alone and representing them
        temp_solution = []
        answer_loc = []
        if len(final_sent_list) == 1:
            sent = final_sent_list[0][0]
            index = final_sent_list[0][1]
            #print index
            for i in range(0, len(master_loc_list)):
                answer_loc.append(master_loc_list[i][0])

            print 'Answer: ', ' '.join(set(answer_loc)) + '\n'
            #print '\n'
            return ' '.join(set(answer_loc))
            '''print master_loc_list[i]
                temp=master_loc_list[i][1]
                if temp==index:
                    result=master_loc_list[i][0]
                    print 'Result is :',master_loc_list[i][0]
                    return result'''
        else:
            for i in range(0, len(final_sent_list)):
                temp = final_sent_list[i][0]
                temp_solution.append(temp)
                break

            result = ' '.join(temp_solution)
            print 'Answer: ', result + '\n'
            #print '\n'
            return result

    else:
        result = dateline
        print 'Answer: ', result + '\n'
        #print '\n'
        return result
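answering_where above and the answering_when variants below share the same dateline heuristic: "happen" and "take place" in the question are good clues, while "this" and "story" are slam dunks that the story's dateline (rather than a sentence) is the answer; the slam-dunk cues are weighted 12 here and 20 in a later variant. A simplified standalone sketch of that scoring (not taken from the original code; the sample question is made up):

def dateline_score(question):
    words = question.split()
    score = 0
    for i in range(0, len(words)):
        if words[i].lower() == 'happen':
            score += 4
        if i != len(words) - 1 and words[i].lower() == 'take' and words[i + 1].lower() == 'place':
            score += 4
        if words[i].lower() in ('this', 'story'):
            score += 12
    return score

print dateline_score('Where did this story take place ?')   # 12 + 12 + 4 = 28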
def answering_when(cleansedQuestion,stop_words_free_question,sentence_list,dateline):

    # Declaring globals to be used in this function

    candidate_sent_list=[]
    sent_score_list={}
    final_sent_list=[]

    when_year_verbs=['play','fought'] #'win','lose','victorius']

    when_time_values=['january','jan','february','feb','march','mar','april','apr','may','june','jun','july','jul','august','aug','september','sep','october','oct','november','nov','december','dec'] + [str(y) for y in range(1400, 2000)]

    #print 'Question is :',cleansedQuestion


    # 1. Check if the sentence contains "TIME" expression

    for i in range(0,len(sentence_list)):
        score=0
        person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(sentence_list[i])

        if time_list != []: # Sentence contains a time expression

            candidate_sent_list.append(sentence_list[i])

            # Now compute the wordmatch score
            score = score + 4 + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
            #sent_score_list.append((score,i))

        # 2. Check if the Question contains "the last" and sentence contains any of "first,last,since,ago", then score+= slam_dunk

        q_words = cleansedQuestion.split()
        for m in range(0, len(q_words)-1):
            if q_words[m].lower()=='the' and q_words[m+1].lower()=='last':
                for sent in sentence_list[i].split():
                    if sent in ['first','last','since','ago']:
                        score = score +20

        # 3. If the question contains {start,begin} and sentence contains {start,begin,since,year}
        for word in q_words:
            if word.lower() in ['start','begin']:
                for sent in sentence_list[i].split():
                    if sent in ['start','begin','since','year']:
                        score = score +20

        sent_score_list[i]=score

    #print 'Candidate sentences list is :',candidate_sent_list
    #print 'Sent score list is :', sent_score_list



    # For when and where questions the answer to the question could also be from the timeline of the story

    dateline_score=0
    temp_list=cleansedQuestion.split()
    for i in range(0, len(temp_list)):
        # 1. If question contains "happen", it is a good clue that timeline could be answer
        if temp_list[i].lower()=='happen':
            dateline_score= dateline_score+4

        # 2. If question contains "take place", it is a good clue that timeline could be answer
        if i != len(temp_list)-1 and temp_list[i].lower()=='take' and temp_list[i+1].lower()=='place':
            dateline_score=dateline_score+4

        # 3. If question contains "this", it is slam_dunk that timeline could be answer
        if temp_list[i].lower()=='this':
            dateline_score= dateline_score+12

        # 4. If question contains "story", it is slam_dunk that timeline could be answer

        if temp_list[i].lower()=='story':
            dateline_score= dateline_score+12

    #print 'Date line score for the question is :',dateline_score
    # Selecting the sentence that has the maximum score. If the dateline score is greater than max of sent_score choose
    # dateline_score else choose the maximum score from sent_score_list

    max_score_index=max(sent_score_list, key=lambda i: sent_score_list[i])

    score_values=sent_score_list.values()
    max_score_value =max(score_values)


    #print 'Max value is :', max_score_value
    # Checking which of the scores is greater. If the score from sent_score_list is greater than the dateline score, then we find
    # the corresponding sentences and choose the best among them. Else we return the dateline as the result.
    if max_score_value > dateline_score:


        # Now we have to choose the best sentence among the sentences in candidate list

        # First step is to parse the stop-words free question and look for words in the question which might help us find
        #the answer

        #print 'Stopwords free question :', stop_words_free_question

        '''for i in stop_words_free_question:
            if i in when_year:
                final_sent_list.append('''


        # Giving preference to sentences which contain a year value #
        for i in sent_score_list.keys():
            '''temp=sentence_list[i].split()
            for j in range(0, len(temp)):
                if j in when_year:
                    print 'Year is true'
                    #final_sent_list.append(sentence_list[i])
                    final_sent_list.append(j)'''

             # If none of the sentences contain a year, then choose the one with maximum value
            if sent_score_list[i]==max_score_value:
                final_sent_list.append(sentence_list[i])

        #print 'Final sentence list is:',final_sent_list


        # Now from the sentences extracting out the years or the date /time values alone and representing them
        final_temp_list=[]
        if len(final_sent_list) == 1:
            temp=nltk.word_tokenize(final_sent_list[0])
            for j in range(0, len(temp)):
                if temp[j].lower() in when_time_values:
                    #print 'year true'
                    final_temp_list.append(temp[j])

            if final_temp_list != []:
                result=' '.join(final_temp_list)
                print 'Answer: ', result+'\n'
                #print '\n'
                return result
            else:
                print 'Answer: ', final_sent_list[0]+'\n'
                #print '\n'
                return final_sent_list[0]
        else:

            for i in range(0,len(final_sent_list)):
                temp=nltk.word_tokenize(final_sent_list[i])
                for j in range(0, len(temp)):
                    if temp[j].lower() in when_time_values:
                        #print 'year true'
                        final_temp_list.append(temp[j])

            if final_temp_list != []:
                result=' '.join(final_temp_list)
                print 'Answer: ', result+'\n'
                #print '\n'
                return result
            else:
                print 'Answer: ', ' '.join(final_sent_list)+'\n'
                #print '\n'
                return ' '.join(final_sent_list)

    else:
        result=dateline
        print 'Answer: ', result +'\n'
        #print '\n'
        return result
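Unlike the other handlers, this variant stores sent_score_list as a dict keyed by sentence index. A made-up illustration of how the top-scoring sentence indices fall out of it (ties are resolved later by preferring the earliest sentence):

sent_score_list = {0: 4, 1: 10, 2: 10, 3: 7}    # hypothetical scores
max_score_value = max(sent_score_list.values())
best = [i for i in sorted(sent_score_list.keys()) if sent_score_list[i] == max_score_value]
print best    # [1, 2]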
Code example #13
0
def answering_when(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,dateline,month_list,time_list):

    # Declaring globals to be used in this function

    candidate_list=[]
    sent_score_list=[]


    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','for','from',
                          'has','he','in','is','it','its','of','on','that','the',
                          'to','was','were','will','with']



    time_nos=['one','two','three','four','five','six','seven','eight','nine','ten',
              'twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety','hundred',
              'thousand','million','billion','trillion']


    temp_q=cleansedQuestion
    temp_q=temp_q.replace('"','')
    temp_q=temp_q.replace("'",'"')
    temp_q=temp_q.replace('?','')

    #print 'Question is :',temp_q

    #print 'Month list is :',month_list
    #print 'Time list is :',time_list
    # 1. Check if the sentence contains "TIME" expression


    #print 'Time list is :',time_list
    for i in range(0,len(sentence_list)):
        score=0
        #print 'Sentence is :',complete_sentence_list[i]
        if time_list[i] != [] or month_list[i]!= []: # Sentence contains a time expression

            # Now compute the wordmatch score
            score = score + 4 + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        # 2. Check if the Question contains "the last" and sentence contains any of "first,last,since,ago", then score+= slam_dunk

        temp=cleansedQuestion.split()
        for m in range(0, len(temp)-1):
            if temp[m].lower()=='the' and temp[m+1].lower()=='last':
                for sent in  sentence_list[i].split():
                    if sent in ['first','last','since','ago']:
                        score = score +20


            # 3. If the question contains {start,begin} and sentence contains {start,begin,since,year}
        for word in cleansedQuestion.split():
            if word.lower() in ['start','begin']:
                for sent in  sentence_list[i].split():
                    if sent in ['start','begin','since','year']:
                        score = score +20

        sent_score_list.append(score)


        #4. Verb match ??


    #print 'Sent score list is :', sent_score_list


    ##################### COMPUTING THE DATE LINE SCORE FOR THE QUESTION #####################

    # For when and where questions the answer to the question could also be from the timeline of the story

    dateline_score=0
    temp_list=cleansedQuestion.split()
    for i in range(0, len(temp_list)):
        # 1. If question contains "happen", it is a good clue that timeline could be answer
        if temp_list[i].lower()=='happen':
            dateline_score= dateline_score+4

        # 2. If question contains "take place", it is a good clue that timeline could be answer
        if i != len(temp_list)-1 and temp_list[i].lower()=='take' and temp_list[i+1].lower()=='place':
            dateline_score=dateline_score+4

        # 3. If question contains "this", it is slam_dunk that timeline could be answer
        if temp_list[i].lower()=='this':
            dateline_score= dateline_score+20

        # 4. If question contains "story", it is slam_dunk that timeline could be answer

        if temp_list[i].lower()=='story':
            dateline_score= dateline_score+20

    #print 'Date line score for the question is :',dateline_score


    # Selecting the sentence/sentences that has the maximum score.

    max_score_value =max(sent_score_list)

    #Creating candidate list of sentences based on the maximum sent score

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            candidate_list.append((complete_sentence_list[i],i))

    #print 'Candidate list is :',candidate_list

    # Checking which of the scores is greater. If the score from sent_score_list is greater than the dateline score, then we find
    # the corresponding sentences and choose the best among them. Else we return the dateline as the result.
    if max_score_value > dateline_score:


        # Now we have to choose the best sentence among the sentences in candidate list

        if len(candidate_list)==1:

            temp_str= candidate_list[0][0]
            index=candidate_list[0][1]


        # If there are multiple candidates, then choose the sentence which appeared first in the story and then do the processing
        else:
            # There is more than one candidate sentence; prefer the first one that mentions a month
            for k in range(0, len(candidate_list)):

                if month_list[candidate_list[k][1]] !=[]:                      #Rewarding sentences with month

                    #Cleaning up the candidate sentence
                    temp_str=candidate_list[k][0]
                    index=candidate_list[k][1]
                    break
                else:
                    temp_str=candidate_list[0][0]
                    index =candidate_list[0][1]

            #Cleaning up the candidate sentence
            # Removing double quotes and stripping commas, question marks and exclamation marks
            temp_str=temp_str.replace('"','')
            #temp_str=temp_str.replace("'",'"')
            temp_str=temp_str.replace(',','').replace('?','').replace('!','')

        ################### SENTENCE PROCESSING #######################

        result_list=[]
        answer_list=[]

        s_monthlist=month_list[index]
        s_timelist=time_list[index]

        #print 'Month list:',s_monthlist
        #print 'Time list:', s_timelist


        if s_monthlist == [] and s_timelist == []:    #The selected sentence does not seem to have a time or month expression, then print whole sentence  minus the words in the question
            for k in temp_str.split():
                if k not in temp_q:
                    result_list.append(k)

            return ' '.join(result_list)


        if s_monthlist!=[]:
            for i in range(0, len(s_monthlist)):
                if s_monthlist[i] not in temp_q :   #To counter situations where question has a month and NER doesn't identify it
                    answer_list.append(s_monthlist[i])


        # If time list is not empty
        if s_timelist != []:

            temp_list=temp_str.split()
            for j in range(0, len(temp_list)):
                if temp_list[j] in s_timelist and j!=0 and temp_list[j] not in temp_q:#and j!=len(temp_list)-1:
                    if temp_list[j-1] in stanford_stop_words_list:
                        answer_list.append(temp_list[j-1].lower())      #Appending the word before the time list which is generally a number or indicative of the time
                        if j-2 >=0:
                            answer_list.append(temp_list[j-2].lower())
                    else:
                        answer_list.append(temp_list[j-1].lower())      #Appending the word before the time expression, which will be the result in a few cases

            #Non-days time values
            for i in range(0, len(s_timelist)):
                if s_timelist[i] not in temp_q : #and s_timelist[i] not in ['days']:
                    answer_list.append(s_timelist[i])

            # Time list values will usually have numbers or other prepositions before it which will give us the complete answer
            time_prep=['over','period','within','inside','under','ago','through','past']

            for k in temp_str.split():
                if k.lower() in time_prep:
                    answer_list.append(k.lower())

                if k.isdigit():
                    answer_list.append(k)

                if k.lower() in time_nos:
                    answer_list.append(k.lower())

        #print 'Answer list is :',set(answer_list)

        temp_result=[]

        if answer_list != []:
           result=' '.join(list(set(answer_list)))
           return result

        else:
            for k in temp_str.split():
                if k not in temp_q:
                    temp_result.append(k)

            return ' '.join(temp_result)

    else:
        result=dateline
        return result
Code example #14
0
def answering_who(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, sent_person_list,
                  sent_prof_list):

    # Declaring globals to be used in this function

    sent_score_list = []
    q_verblist = []

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
        'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that',
        'the', 'to', 'was', 'were', 'will', 'with'
    ]

    temp_q = cleansedQuestion
    #temp_q=temp_q.replace('"','')
    #temp_q=temp_q.replace("'",'"')
    temp_q = temp_q.replace('?', '')

    lmtzr = WordNetLemmatizer()
    pos_list = POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB', 'VBD', 'VBZ', 'VBN'] and lmtzr.lemmatize(
                pos_list[i][0], 'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0], 'v'))

    #print 'Temp_q: ',temp_q

    q_person_list, q_org_list, q_loc_list, q_month_list, q_time_list, q_money_list, q_percent_list, q_prof_list = NER.named_entity_recognition(
        temp_q)

    for i in range(0, len(complete_sentence_list)):
        #print 'Sentence is :', complete_sentence_list[i]
        score = 0

        # 1. Score using word match rule. Match words in question with the words in stop free sentence

        #print 'Sentence is :',sentence_list[i]
        score = score + WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        # 2. If question does not contain name but the answer contains NAME then you are confident(+6)
        if q_person_list == []:

            #Giving more weights to sentences having more names in it
            if sent_person_list[i] != [] or sent_prof_list[i] != []:
                #score=score + 6*len(sent_person_list) + 6* len(sent_prof_list)
                score = score + 6

            # 3. If question does not contain a name and answer contains the word "name" then good_clue (+4)
            lmtzr = WordNetLemmatizer()
            temp = complete_sentence_list[i].split()
            for k in range(0, len(temp)):
                if lmtzr.lemmatize(temp[k].lower()) == 'name':
                    score = score + 4

            #  4. Awards points to all sentences  that contain a name or reference to a human

            if sent_person_list[i] != [] or sent_prof_list[i] != []:
                #score=score + 4*len(sent_person_list) + 4* len(sent_prof_list)
                score = score + 4

        # 5. If the answer contains the exact verb found in the question after the "Who" or in fact in the whole question
        # then it is a confident clue and we reward it more

        sent_pos_list = POS_Tagging.pos_tagging(complete_sentence_list[i])
        '''for m in range(0, len(sent_pos_list)):
            if sent_pos_list[m][1] in ['VB','VBD','VBN','VBG','VBZ'] and sent_pos_list[m][0] in stop_words_free_question.split():
                score=score + 18
                #print 'Score now is :', score'''

        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in [
                    'VB', 'VBD', 'VBZ', 'VBN'
            ] and lmtzr.lemmatize(sent_pos_list[k][0], 'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score = score + 6

        # 6. If the question contains a profession name, the answer has to be a person and sentence would have
        #the person name and the profession

        if q_prof_list != []:
            for k in complete_sentence_list[i].split():
                if k.lower() in q_prof_list:
                    #print 'Profession Yes !'
                    score = score + 18

        else:  # The question has a name but no profession, so the chances of the answer being a profession name are decent
            if sent_prof_list[i] != []:
                score = score + 6

        sent_score_list.append(score)

    #print 'Sent score list is :',sent_score_list

    # Selecting the sentence that has the maximum score. If it is a tie, we choose the sentence that appears first

    candidate_list = []
    npfinal_list = []
    temp_list = []
    answer_list = []

    max_score_value = max(sent_score_list)

    #print 'Max score is :',max_score_value

    for i in range(0, len(complete_sentence_list)):
        if sent_score_list[i] == max_score_value:
            candidate_list.append((complete_sentence_list[i], i))
    #print 'Candidate list is :',candidate_list

    #If there is only one sentence, then choose the sentence and then do the processing to display the answer

    if len(candidate_list) == 1:

        temp_str = candidate_list[0][0]
        index = candidate_list[0][1]
        #Cleaning up the candidate sentence
        # Replacing double quotes with blank and single quotes with "
        #temp_str=temp_str.replace('"','')
        #temp_str=temp_str.replace("'",'"')
        #temp_str=temp_str.replace(',','').replace('?','').replace('!','')

    # If there are multiple candidates, then choose the sentence which appeared first in the story  and then do the processing
    else:
        # There is more than one candidate sentence; pick the first one
        for k in range(0, len(candidate_list)):

            #Cleaning up the candidate sentence

            temp_str = candidate_list[k][0]
            index = candidate_list[k][1]
            #temp_str=temp_str.replace('"','')
            #temp_str=temp_str.replace("'",'"')
            #temp_str=temp_str.replace(',','').replace('?','').replace('!','')

            break

    ####################### SENTENCE PROCESSING TO FIND THE ANSWER ###############################

    #Just pick out the noun-phrase or PERSON names from the sentence

    #s_plist,s_orglist,s_loclist,s_monthlist,s_timelist,s_moneylist,s_percentlist,s_proflist=NER.named_entity_recognition(temp_str)
    s_plist = sent_person_list[index]
    s_proflist = sent_prof_list[index]

    #print 'Prof list is:',s_proflist

    #If the question has the name of a person, then the answer sentence most probably contains
    #the name of a person, but it should not be the name of the person appearing in the question.
    #If we can't find any other name in the candidate sentence then we do POS tagging and display the NOUN phrases

    #print 'Question person list is:',q_person_list
    #print 'Sentence person list is:',s_plist

    result_list = []
    q_loc_who_list = []

    if q_person_list == [] and s_plist == []:  #If both the question does not have a name and the sentence does not have a name,print the whole sentence minus words which appear in question
        '''pos_np_list= POS_Tagging.pos_noun_tagging(temp_str)
        if pos_np_list != []:
            for x in pos_np_list:
                if x not in temp_q and x[0].isupper():   #Noun phrases or names generally start with an upper case character
                    print 'First character caps',x
                    result_list.append(x)
            return ' '.join(result_list)'''

        for k in temp_str.split():
            if k not in temp_q:
                result_list.append(k)

        return ' '.join(result_list)

    elif q_person_list != [] and s_plist != []:  #To counter situations when both question and sentence has names Ex. Who defeated who ?
        for k in s_plist:
            if k not in temp_q:
                answer_list.append(k)

    elif q_person_list == [] and s_plist != []:
        for i in range(0, len(s_plist)):
            if s_plist[i] not in q_person_list and s_plist[
                    i] not in temp_q:  #To counter situations where question has a name and NER doesn't identify it
                answer_list.append(s_plist[i])

    elif q_person_list != [] and s_proflist != []:  #To counter situations for 'Who is X' type questions which could have a profession name in the answer
        for k in s_proflist:
            answer_list.append(k)

    elif q_person_list == [] and q_loc_list != []:  # Who is <X> where ?
        #print 'Question has no name but has a location'
        for k in temp_str.split():
            if k not in temp_q:
                q_loc_who_list.append(k)
        if q_loc_who_list != []:
            return ' '.join(q_loc_who_list)
    '''elif q_person_list==[] and s_proflist !=[]:
        for k in s_proflist:
            answer_list.append(k)'''

    if answer_list != []:  #and flag==1:                #Indicating candidate sentence has a name other than that in question
        result = ' '.join(answer_list)
    else:

        #Pick out the noun phrase or nouns and then display them as answer

        np_list = POS_Tagging.pos_noun_tagging(temp_str)
        for x in np_list:
            if x not in temp_q:
                npfinal_list.append(x)  # Keeping only noun phrases that do not appear in the question

        #print 'NP Final list after removal is',npfinal_list
        if npfinal_list != []:
            result = ' '.join(npfinal_list)

        else:
            result = temp_str  # Printing out the whole sentence

    #print 'Result is:',result
    return result
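Rule 6 above is the strongest single cue in this handler: when the question names a profession and a candidate sentence repeats it, the sentence is rewarded heavily. A simplified standalone sketch (not the original code, which takes q_prof_list from NER and adds +18 per matching token; the example sentence is made up):

def profession_bonus(q_prof_list, sentence):
    # +18 as soon as a profession word from the question reappears in the sentence
    for tok in sentence.split():
        if tok.lower() in q_prof_list:
            return 18
    return 0

print profession_bonus(['captain'], 'The captain of the ship was hurt .')   # 18
print profession_bonus([], 'The captain of the ship was hurt .')            # 0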
Code example #15
0
def answering_how(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,dateline):

    # Declaring globals to be used in this function

    candidate_sent_list=[]
    sent_score_list=[]
    final_sent_list=[]
    best_sent_index=[]
    best=[] # List of the best scoring sentences based on word match with the question



    what_year=[str(y) for y in range(1400, 2000)]

    what_month=['january','jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may','may', 'june', 'jun', 'july', 'jul','august','aug','september','sep','october','oct','november','nov','december','dec']

    date_expression_list=['yesterday','today','tomorrow','last week','this week','next week','an hour ago','now','in an hour',
          'recently','soon','a little while ago','at this moment','in the near future','a long time ago','these days',
          'those days','future','present','past','nowadays','eventually','morning', 'evening','night','midnight','dawn','dusk','afternoon','noon','midday',
          'am','pm','sunrise','sunset','lunchtime','teatime','dinnertime','interval','twilight',
          'hourly','nightly','daily','monthly','weekly','quarterly','yearly']


    much_list=['thousand','hundred','dollars','cents','million','billion','none','nothing','everything','few','something',
               'salary','pay','income','loss','profit','one','two','three','four','five','six','seven','eight','nine','ten']

    how_often=['daily','weekly','bi-weekly','fortnightly','monthly','bi-monthly','quarterly','half-yearly','yearly','decade','millennium',
               'day','everyday','night','afternoon','noon']
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

    measurement_verbs=[]

    #print 'Question is :',cleansedQuestion


    snowball_stemmer = SnowballStemmer('english')

    # 1. Find score for each sentence using word match score first

    for i in range(0,len(complete_sentence_list)):
        score=0
        much_flag=0
        many_flag=0
        money_flag=0

        score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        #2. If the question contains "many" and sentence contains an expression of number, then it is confident score

        temp=cleansedQuestion.split()
        for j in range(0, len(temp)):
            if temp[j]=='many':
                many_flag=1
                break

        if many_flag==1:        #print 'In many'
            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if nums.match(temp2[k]):   # Implies answer contains a number
                    #print 'many Q - number sentence'
                    score=score+6


        #3. If the question contains "much" and sentence contains an expression for distance or for money, then it is a confident score

        temp=cleansedQuestion.split()
        for j in range(0, len(temp)):
            if temp[j]=='much':
                #print 'In much'
                much_flag=1

            if temp[j] in ['money','earn','salary','profit','loss']:
                money_flag=1



        if much_flag==1 and money_flag==1:
            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if temp2[k] in much_list:
                    score=score +20 #slam-dunk


        elif much_flag==1:

            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if nums.match(temp2[k]) or temp2[k] in much_list:   # Implies answer contains a number
                    #print 'much Q - number or list sentence'
                    score=score+6



        sent_score_list.append(score)

    #print 'Score list is:',sent_score_list
    max_score_value=max(sent_score_list)
    #print 'Max score is :',max_score_value


    # Finding the sentences which have the highest score and adding them to the best list

    for i in range(0,len(sentence_list)):
        if sent_score_list[i]==max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    #print 'Final sent list is:',final_sent_list

    temp_solution=[]
    answer_loc=[]
    if len(final_sent_list) == 1:

        #If the question contains "often", the sentence will usually contain a time expression. If so, pick
        #that expression as the solution

        temp=cleansedQuestion.split()
        if 'often' in temp:
            #print 'often'
            temp2=final_sent_list[0].split()
            for m in range(0,len(temp2)):
                if temp2[m] in how_often:
                    temp_solution.append(temp2[m])
            print 'Answer: ',' '.join(temp_solution)+'\n'
            #print '\n'
            return temp_solution

        if 'many' in temp:
            #print 'many'
            temp2=final_sent_list[0].split()
            for m in range(0,len(temp2)):
                if nums.match(temp2[m]):
                    temp_solution.append(temp2[m])
            print 'Answer: ',' '.join(temp_solution)+'\n'
            #print '\n'
            return temp_solution


        else:

            print 'Answer: ',final_sent_list[0]+'\n'
            #print '\n'
            return final_sent_list[0]

    else:
        # Choose the sentence that comes first, in case of a tie
        for k in range(0,len(final_sent_list)):
            result=final_sent_list[k]
            break

        print 'Answer: ', result+'\n'
        #print '\n'
        return result
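
A minimal, self-contained sketch of the numeric-token check used above for "how many" style questions; it assumes the same regex as in the listing, and the helper name and sample sentence are made up for illustration only.

import re

nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

def count_numeric_tokens(sentence):
    # Count whitespace-separated tokens that begin with a number,
    # mirroring how nums.match() is applied token by token above.
    return sum(1 for tok in sentence.split() if nums.match(tok))

# count_numeric_tokens("The fire destroyed 120 homes and 3.5 km of forest.")  -> 2
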
def answering_why(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,dateline):

    # Declaring globals to be used in this function

    candidate_sent_list=[]
    sent_score_list=[]
    final_sent_list=[]
    best_sent_index=[]
    best=[] # List of the best scoring sentences based on word match with the question


    location_prepositions=['in','at','near','inside','on','behind','above','under','next to','below','between','around',
                           'outside','among','on the right', 'across','front','opposite','before','beneath','beside','against']

    what_year=['1400', '1401', '1402', '1403', '1404', '1405', '1406', '1407', '1408', '1409', '1410', '1411', '1412', '1413', '1414', '1415', '1416', '1417', '1418', '1419', '1420', '1421', '1422', '1423', '1424', '1425', '1426', '1427', '1428', '1429', '1430', '1431', '1432', '1433', '1434', '1435', '1436', '1437', '1438', '1439', '1440', '1441', '1442', '1443', '1444', '1445', '1446', '1447', '1448', '1449', '1450', '1451', '1452', '1453', '1454', '1455', '1456', '1457', '1458', '1459', '1460', '1461', '1462', '1463', '1464', '1465', '1466', '1467', '1468', '1469', '1470', '1471', '1472', '1473', '1474', '1475', '1476', '1477', '1478', '1479', '1480', '1481', '1482', '1483', '1484', '1485', '1486', '1487', '1488', '1489', '1490', '1491', '1492', '1493', '1494', '1495', '1496', '1497', '1498', '1499', '1500', '1501', '1502', '1503', '1504', '1505', '1506', '1507', '1508', '1509', '1510', '1511', '1512', '1513', '1514', '1515', '1516', '1517', '1518', '1519', '1520', '1521', '1522', '1523', '1524', '1525', '1526', '1527', '1528', '1529', '1530', '1531', '1532', '1533', '1534', '1535', '1536', '1537', '1538', '1539', '1540', '1541', '1542', '1543', '1544', '1545', '1546', '1547', '1548', '1549', '1550', '1551', '1552', '1553', '1554', '1555', '1556', '1557', '1558', '1559', '1560', '1561', '1562', '1563', '1564', '1565', '1566', '1567', '1568', '1569', '1570', '1571', '1572', '1573', '1574', '1575', '1576', '1577', '1578', '1579', '1580', '1581', '1582', '1583', '1584', '1585', '1586', '1587', '1588', '1589', '1590', '1591', '1592', '1593', '1594', '1595', '1596', '1597', '1598', '1599', '1600', '1601', '1602', '1603', '1604', '1605', '1606', '1607', '1608', '1609', '1610', '1611', '1612', '1613', '1614', '1615', '1616', '1617', '1618', '1619', '1620', '1621', '1622', '1623', '1624', '1625', '1626', '1627', '1628', '1629', '1630', '1631', '1632', '1633', '1634', '1635', '1636', '1637', '1638', '1639', '1640', '1641', '1642', '1643', '1644', '1645', '1646', '1647', '1648', '1649', '1650', '1651', '1652', '1653', '1654', '1655', '1656', '1657', '1658', '1659', '1660', '1661', '1662', '1663', '1664', '1665', '1666', '1667', '1668', '1669', '1670', '1671', '1672', '1673', '1674', '1675', '1676', '1677', '1678', '1679', '1680', '1681', '1682', '1683', '1684', '1685', '1686', '1687', '1688', '1689', '1690', '1691', '1692', '1693', '1694', '1695', '1696', '1697', '1698', '1699', '1700', '1701', '1702', '1703', '1704', '1705', '1706', '1707', '1708', '1709', '1710', '1711', '1712', '1713', '1714', '1715', '1716', '1717', '1718', '1719', '1720', '1721', '1722', '1723', '1724', '1725', '1726', '1727', '1728', '1729', '1730', '1731', '1732', '1733', '1734', '1735', '1736', '1737', '1738', '1739', '1740', '1741', '1742', '1743', '1744', '1745', '1746', '1747', '1748', '1749', '1750', '1751', '1752', '1753', '1754', '1755', '1756', '1757', '1758', '1759', '1760', '1761', '1762', '1763', '1764', '1765', '1766', '1767', '1768', '1769', '1770', '1771', '1772', '1773', '1774', '1775', '1776', '1777', '1778', '1779', '1780', '1781', '1782', '1783', '1784', '1785', '1786', '1787', '1788', '1789', '1790', '1791', '1792', '1793', '1794', '1795', '1796', '1797', '1798', '1799', '1800', '1801', '1802', '1803', '1804', '1805', '1806', '1807', '1808', '1809', '1810', '1811', '1812', '1813', '1814', '1815', '1816', '1817', '1818', '1819', '1820', '1821', '1822', '1823', '1824', '1825', '1826', '1827', '1828', '1829', '1830', '1831', '1832', '1833', '1834', '1835', '1836', '1837', '1838', '1839', '1840', '1841', 
'1842', '1843', '1844', '1845', '1846', '1847', '1848', '1849', '1850', '1851', '1852', '1853', '1854', '1855', '1856', '1857', '1858', '1859', '1860', '1861', '1862', '1863', '1864', '1865', '1866', '1867', '1868', '1869', '1870', '1871', '1872', '1873', '1874', '1875', '1876', '1877', '1878', '1879', '1880', '1881', '1882', '1883', '1884', '1885', '1886', '1887', '1888', '1889', '1890', '1891', '1892', '1893', '1894', '1895', '1896', '1897', '1898', '1899', '1900', '1901', '1902', '1903', '1904', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913', '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']

    what_month=['january','jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may','may', 'june', 'jun', 'july', 'jul','august','aug','september','sep','october','oct','november','nov','december','dec']

    date_expression_list=['yesterday','today','tomorrow','last week','this week','next week','an hour ago','now','in an hour',
          'recently','soon','a little while ago','at this moment','in the near future','a long time ago','these days',
          'those days','future','present','past','nowadays','eventually','morning', 'evening','night','midnight','dawn','dusk','afternoon','noon','midday',
          'am','pm','sunrise','sunset','lunchtime','teatime','dinnertime','interval','twilight',
          'hourly','nightly','daily','monthly','weekly','quarterly','yearly']

    #print 'Question is :',cleansedQuestion


    snowball_stemmer = SnowballStemmer('english')

    # Find score for each sentence using word match score first

    for i in range(0,len(complete_sentence_list)):
        wm_score=0

        wm_score = wm_score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        sent_score_list.append(wm_score)

    #print 'Score list is:',sent_score_list
    max_score_value=max(sent_score_list)
    #print 'Max score is :',max_score_value
    # Finding the sentences which have the highest score and adding them to the best list

    for i in range(0,len(sentence_list)):
        if sent_score_list[i]==max_score_value:
            best.append((complete_sentence_list[i],i))

    #print 'Best list is:',best


    # Finding indices of the best sentences

    for j in range(0,len(best)):
        best_sent_index.append(best[j][1])

    # Re-setting the scores of all sentences to zero
    for i in range(0, len(sent_score_list)):
        sent_score_list[i]=0


    for i in range(0, len(complete_sentence_list)):
        score=0
        # 1. If the given sentence is in the best list, then reward them. It is a clue
        for j in range(0,len(best)):
            if complete_sentence_list[i] in best[j][0]:
                #print 'Yes'
                score=score + 3
        #print 'Score after 1 is :',score

        #2. If the sentence immediately precedes member of best, then it is a clue

        for k in best_sent_index:
            #print k
            if i==k-1:
                score=score + 3
            #3. If the sentence immediately follows member of best, then it is a good clue
            elif i==k+1:
                score=score + 4

        #4. If the sentence contains word "want", then it is a good clue
        temp=complete_sentence_list[i].split()
        for word in temp:
            if word.lower()=='want':
                #print 'Score increment rule 4'
                score=score+4
            elif word.lower() in ['so','because']:
                #print 'Score increment rule 5'
                score=score+4

        sent_score_list[i]=score

    #print 'Sent score list values are:',sent_score_list


    # Selecting the sentence that has the maximum score.

    max_score_value =max(sent_score_list)
    #print 'Max value is :', max_score_value


    # Now we have to choose the best sentence among the sentences in the candidate list. Choosing sentences
    # which have both the maximum value and are present in the candidate list

    #print 'len of sent_score_list:',len(sent_score_list)
    for i in range(0, len(sent_score_list)):
        if sent_score_list[i]==max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    #print 'Final list is:', final_sent_list
    temp_solution=[]
    answer_loc=[]
    if len(final_sent_list) == 1:
        print 'Answer: ',final_sent_list[0]+'\n'
        #print '\n'
        return final_sent_list[0]

    else:
        # Choose the sentence that comes at the last, in case of a tie
        for k in range(0,len(final_sent_list)):
            result=final_sent_list[k]
        print 'Answer: ', result+'\n'
        #print '\n'
        return result
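
As a hedged illustration of the neighbour-sentence rewards in the why-question scorer above (+3 for the sentence preceding a best word-match sentence, +4 for the one following it), here is a stand-alone helper; the function name and example indices are made up.

def context_bonus(index, best_indices):
    # Reward sentences adjacent to the best word-match sentences.
    bonus = 0
    for b in best_indices:
        if index == b - 1:    # immediately precedes a best sentence
            bonus += 3
        elif index == b + 1:  # immediately follows a best sentence
            bonus += 4
    return bonus

# context_bonus(5, [4])  -> 4   (sentence 5 follows best sentence 4)
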
コード例 #17
0
def answering_how(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, sent_time_list,
                  sent_percent_list):

    # Declaring globals to be used in this function

    candidate_sent_list = []
    sent_score_list = []
    final_sent_list = []
    q_verblist = []
    best = [
    ]  # List of the best scoring sentences based on word match with the question

    much_list = [
        'thousand', 'thousands', 'hundred', 'hundreds', 'dollars', 'cents',
        'million', 'billion', 'trillion', 'none', 'nothing', 'everything',
        'few', 'something', 'dollars', 'grams', 'kilos', 'kilogram',
        'kilograms', 'milligrams', 'mg', 'metre', 'centimetre', 'inches',
        'feet', 'foot', 'ft', 'cent', 'percent', 'salary', 'pay', 'income',
        'loss', 'profit', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety', 'hour', 'hours', 'minutes',
        'seconds', 'second', 'minute', 'half', 'quarter', 'more', 'less',
        'than'
    ]

    many_list = [
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
        'eighty', 'ninety', 'hundred', 'thousand', 'million', 'billion',
        'trillion'
    ]

    how_often = [
        'daily', 'weekly', 'bi-weekly', 'fortnightly', 'monthly', 'bi-monthly',
        'quarterly', 'half-yearly', 'yearly', 'decade', 'millennium',
        'day', 'everyday', 'night', 'afternoon', 'noon', 'hourly', 'hours',
        'minutes', 'seconds', 'second', 'minute'
    ]
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

    measurement_verbs = []

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
        'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that',
        'the', 'to', 'was', 'were', 'will', 'with'
    ]

    abbreviation_list = [('Mt.', 'Mount')]

    ########################### QUESTION PROCESSING ##################

    temp_q = cleansedQuestion
    #temp_q=temp_q.replace('"','')
    #temp_q=temp_q.replace("'",'"')
    temp_q = temp_q.replace('?', '')

    for k in temp_q.split():
        if k in abbreviation_list[0][0]:
            temp_q = temp_q.replace(k, abbreviation_list[0][1])

    #print 'Question is :',temp_q

    lmtzr = WordNetLemmatizer()
    pos_list = POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB', 'VBD', 'VBZ', 'VBN'] and lmtzr.lemmatize(
                pos_list[i][0], 'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0], 'v'))

    #print 'Question verb list is :',q_verblist

    #print 'Time list is:',sent_time_list

    ################## SENTENCE PROCESSING AND SCORING ###################

    for i in range(0, len(complete_sentence_list)):
        score = 0

        # 1. Find score for each sentence using word match score first

        #print 'The sentence is :',complete_sentence_list[i]
        #score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        score = score + WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        #2. If the question contains "many" and the sentence contains an expression of number, then it is a confident score

        for k in temp_q.split():
            if k.lower() == "many":
                for m in complete_sentence_list[i].split():
                    if nums.match(m) or m in many_list:
                        score = score + 6

            #3. If the question contains "much" and sentence contains an expression for distance or for money, then it is a confident score
            elif k.lower() == "much":
                for m in complete_sentence_list[i].split():
                    if m.lower() in [
                            'money', 'earn', 'salary', 'profit', 'loss'
                    ] or m in much_list:
                        score = score + 6

            #4. If the question contains "often" and the sentence contains an expression of time, then it is a highly confident score
            elif k.lower() == 'often' or k.lower() == 'long':
                for m in complete_sentence_list[i].split():
                    if m in how_often:  #m.lower() in sent_time_list[i] or
                        score = score + 10
                        break
        '''if much_flag==1 and money_flag==1:
            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if temp2[k] in much_list:
                    score=score +20 #slam-dunk

        elif much_flag==1:

            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if nums.match(temp2[k]) or temp2[k] in much_list:   # Implies answer contains a number
                    #print 'much Q - number or list sentence'
                    score=score+6'''

        sent_score_list.append(score)

    #print 'Score list is:',sent_score_list
    max_score_value = max(sent_score_list)

    # Finding the sentences which have the highest score and adding them to the best list

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    #print 'Final sent list is:',final_sent_list

    temp_result = []
    temp_solution = []
    if len(final_sent_list) == 1:

        #Strip a trailing full stop from the chosen sentence before doing
        #token-level answer extraction

        if final_sent_list[0].endswith('.'):
            req_string = final_sent_list[0][:-1]
            temp2 = req_string.split()
        else:
            temp2 = final_sent_list[0].split()

    else:

        if final_sent_list[0].endswith('.'):
            req_string = final_sent_list[0][:-1]
            temp2 = req_string.split()
        else:
            temp2 = final_sent_list[0].split(
            )  #Picking the sentence which comes first when there are multiple candidates

    #If the sentence contains "per cent", the surrounding phrase is most probably the answer to a how much / how many question
    for k in range(0, len(temp2)):
        if k != 0 and k != len(temp2) - 1:
            if temp2[k].lower() == 'per' and temp2[k + 1].lower() == 'cent':
                return ' '.join(temp2[k - 1:k + 2])

    if 'many' in temp_q.split():
        #print 'many'

        for m in range(0, len(temp2)):
            #print 'temp2[m]:',temp2[m]
            if nums.match(temp2[m]) or temp2[m] in many_list:
                #print 'Yes'
                temp_solution.append(temp2[m])

        #print 'Temp solution is:',temp_solution
        if temp_solution != []:
            return ' '.join(temp_solution)
        else:
            return ' '.join(temp2)

    elif 'much' in temp_q.split():
        #print 'many'

        for m in range(0, len(temp2)):
            if nums.match(temp2[m]) or temp2[m] in much_list:
                temp_solution.append(temp2[m])

        if temp_solution != []:
            return ' '.join(temp_solution)
        else:
            return ' '.join(temp2)

    for k in temp2:
        if k not in temp_q.split():
            temp_result.append(k)

    return ' '.join(temp_result)
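
A rough, self-contained sketch of the verb-extraction step used by answering_how, assuming NLTK's built-in word_tokenize and pos_tag in place of the project's POS_Tagging module (and assuming the relevant NLTK models are installed); the helper name and example are illustrative only.

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

def question_verbs(question, stop_words):
    # Collect lemmatized verbs (VB/VBD/VBZ/VBN) from the question,
    # skipping anything in the stop-word list.
    lmtzr = WordNetLemmatizer()
    verbs = []
    for word, tag in nltk.pos_tag(nltk.word_tokenize(question)):
        if tag in ('VB', 'VBD', 'VBZ', 'VBN'):
            lemma = lmtzr.lemmatize(word, 'v')
            if lemma not in stop_words:
                verbs.append(lemma)
    return verbs

# question_verbs("How far did the balloon travel?", ['do', 'be'])
# might return ['travel'], depending on the tagger output.
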
コード例 #18
0
def answering_what(cleansedQuestion, stop_words_free_question,
                   complete_sentence_list, sentence_list, dateline):

    # Declaring globals to be used in this function

    candidate_sent_list = []
    sent_score_list = []
    final_sent_list = []
    master_loc_list = []

    location_prepositions = [
        'in', 'at', 'near', 'inside', 'on', 'behind', 'above', 'under',
        'next to', 'below', 'between', 'around', 'outside', 'among',
        'on the right', 'across', 'front', 'opposite', 'before', 'beneath',
        'beside', 'against'
    ]

    what_year = [
        '1400', '1401', '1402', '1403', '1404', '1405', '1406', '1407', '1408',
        '1409', '1410', '1411', '1412', '1413', '1414', '1415', '1416', '1417',
        '1418', '1419', '1420', '1421', '1422', '1423', '1424', '1425', '1426',
        '1427', '1428', '1429', '1430', '1431', '1432', '1433', '1434', '1435',
        '1436', '1437', '1438', '1439', '1440', '1441', '1442', '1443', '1444',
        '1445', '1446', '1447', '1448', '1449', '1450', '1451', '1452', '1453',
        '1454', '1455', '1456', '1457', '1458', '1459', '1460', '1461', '1462',
        '1463', '1464', '1465', '1466', '1467', '1468', '1469', '1470', '1471',
        '1472', '1473', '1474', '1475', '1476', '1477', '1478', '1479', '1480',
        '1481', '1482', '1483', '1484', '1485', '1486', '1487', '1488', '1489',
        '1490', '1491', '1492', '1493', '1494', '1495', '1496', '1497', '1498',
        '1499', '1500', '1501', '1502', '1503', '1504', '1505', '1506', '1507',
        '1508', '1509', '1510', '1511', '1512', '1513', '1514', '1515', '1516',
        '1517', '1518', '1519', '1520', '1521', '1522', '1523', '1524', '1525',
        '1526', '1527', '1528', '1529', '1530', '1531', '1532', '1533', '1534',
        '1535', '1536', '1537', '1538', '1539', '1540', '1541', '1542', '1543',
        '1544', '1545', '1546', '1547', '1548', '1549', '1550', '1551', '1552',
        '1553', '1554', '1555', '1556', '1557', '1558', '1559', '1560', '1561',
        '1562', '1563', '1564', '1565', '1566', '1567', '1568', '1569', '1570',
        '1571', '1572', '1573', '1574', '1575', '1576', '1577', '1578', '1579',
        '1580', '1581', '1582', '1583', '1584', '1585', '1586', '1587', '1588',
        '1589', '1590', '1591', '1592', '1593', '1594', '1595', '1596', '1597',
        '1598', '1599', '1600', '1601', '1602', '1603', '1604', '1605', '1606',
        '1607', '1608', '1609', '1610', '1611', '1612', '1613', '1614', '1615',
        '1616', '1617', '1618', '1619', '1620', '1621', '1622', '1623', '1624',
        '1625', '1626', '1627', '1628', '1629', '1630', '1631', '1632', '1633',
        '1634', '1635', '1636', '1637', '1638', '1639', '1640', '1641', '1642',
        '1643', '1644', '1645', '1646', '1647', '1648', '1649', '1650', '1651',
        '1652', '1653', '1654', '1655', '1656', '1657', '1658', '1659', '1660',
        '1661', '1662', '1663', '1664', '1665', '1666', '1667', '1668', '1669',
        '1670', '1671', '1672', '1673', '1674', '1675', '1676', '1677', '1678',
        '1679', '1680', '1681', '1682', '1683', '1684', '1685', '1686', '1687',
        '1688', '1689', '1690', '1691', '1692', '1693', '1694', '1695', '1696',
        '1697', '1698', '1699', '1700', '1701', '1702', '1703', '1704', '1705',
        '1706', '1707', '1708', '1709', '1710', '1711', '1712', '1713', '1714',
        '1715', '1716', '1717', '1718', '1719', '1720', '1721', '1722', '1723',
        '1724', '1725', '1726', '1727', '1728', '1729', '1730', '1731', '1732',
        '1733', '1734', '1735', '1736', '1737', '1738', '1739', '1740', '1741',
        '1742', '1743', '1744', '1745', '1746', '1747', '1748', '1749', '1750',
        '1751', '1752', '1753', '1754', '1755', '1756', '1757', '1758', '1759',
        '1760', '1761', '1762', '1763', '1764', '1765', '1766', '1767', '1768',
        '1769', '1770', '1771', '1772', '1773', '1774', '1775', '1776', '1777',
        '1778', '1779', '1780', '1781', '1782', '1783', '1784', '1785', '1786',
        '1787', '1788', '1789', '1790', '1791', '1792', '1793', '1794', '1795',
        '1796', '1797', '1798', '1799', '1800', '1801', '1802', '1803', '1804',
        '1805', '1806', '1807', '1808', '1809', '1810', '1811', '1812', '1813',
        '1814', '1815', '1816', '1817', '1818', '1819', '1820', '1821', '1822',
        '1823', '1824', '1825', '1826', '1827', '1828', '1829', '1830', '1831',
        '1832', '1833', '1834', '1835', '1836', '1837', '1838', '1839', '1840',
        '1841', '1842', '1843', '1844', '1845', '1846', '1847', '1848', '1849',
        '1850', '1851', '1852', '1853', '1854', '1855', '1856', '1857', '1858',
        '1859', '1860', '1861', '1862', '1863', '1864', '1865', '1866', '1867',
        '1868', '1869', '1870', '1871', '1872', '1873', '1874', '1875', '1876',
        '1877', '1878', '1879', '1880', '1881', '1882', '1883', '1884', '1885',
        '1886', '1887', '1888', '1889', '1890', '1891', '1892', '1893', '1894',
        '1895', '1896', '1897', '1898', '1899', '1900', '1901', '1902', '1903',
        '1904', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912',
        '1913', '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921',
        '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930',
        '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939',
        '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948',
        '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957',
        '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966',
        '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975',
        '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984',
        '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
        '1994', '1995', '1996', '1997', '1998', '1999'
    ]

    what_month = [
        'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
        'may', 'may', 'june', 'jun', 'july', 'jul', 'august', 'aug',
        'september', 'sep', 'october', 'oct', 'november', 'nov', 'december',
        'dec'
    ]

    date_expression_list = [
        'yesterday', 'today', 'tomorrow', 'last week', 'this week',
        'next week', 'an hour ago', 'now', 'in an hour', 'recently', 'soon',
        'a little while ago', 'at this moment', 'in the near future',
        'a long time ago', 'these days', 'those days', 'future', 'present',
        'past', 'nowadays', 'eventually', 'morning', 'evening', 'night',
        'midnight', 'dawn', 'dusk', 'afternoon', 'noon', 'midday', 'am', 'pm',
        'sunrise', 'sunset', 'lunchtime', 'teatime', 'dinnertime', 'interval',
        'twilight', 'hourly', 'nightly', 'daily', 'monthly', 'weekly',
        'quarterly', 'yearly'
    ]

    #print 'Question is :',cleansedQuestion

    snowball_stemmer = SnowballStemmer('english')
    # 1. Find score for each sentence using word match score first

    for i in range(0, len(complete_sentence_list)):
        score = 0

        score = score + WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        #2. Check if the question contains a month of the year and the sentence contains a date expression, then it is a clue

        temp = nltk.word_tokenize(stop_words_free_question)
        flag = 0
        for j in range(0, len(temp)):
            if temp[j].lower() in what_month:
                temp2 = sentence_list[i].split()
                for k in range(0, len(temp2)):
                    if temp2[k] in date_expression_list:
                        score = score + 4

            # 3. What "kind" questions. Sentences containing "call" or "from"
            if temp[j].lower() == 'kind':
                temp2 = sentence_list[i].split()
                for k in range(0, len(temp2)):
                    if snowball_stemmer.stem(temp2[k]) in ['call', 'from']:
                        score = score + 6

            # 4. If question contains "name" and the sentence contains {name,call,known}

            if temp[j].lower() == 'name':
                temp2 = complete_sentence_list[i].split()
                for k in range(0, len(temp2)):
                    if snowball_stemmer.stem(
                            temp2[k]) in ['name', 'call', 'known']:
                        score = score + 20

            #5. If question contains name + PP and contains(S,ProperNoun) and Head PP

            if j != len(temp) - 1 and temp[j] == 'name' and temp[j + 1] in [
                    'of', 'for'
            ]:
                person_list, org_list, loc_list, time_list, prof_list = NET.named_entity_tagging(
                    sentence_list[i])
                if person_list != []:
                    #TODO Check if it also contains (proper_noun,head(PP))
                    score = score + 20

            # If the question contains "sport" related terms, answer should also have sport related terms
            '''if temp[j].lower() in ['sports','games','olympics']:
                temp2=sentence_list[i].split()
                for k in range(0,len(temp2)):
                    if snowball_stemmer.stem(temp2[k]) in ['soccer','hockey','baseball','cricket','rugby','ultimate']:
                        score=score+6'''

            # If the question contains a "country"-related word and the sentence contains a LOCATION entity, then it is a confident score
            if temp[j].lower() in ['country', 'countries', 'olympics']:
                person_list, org_list, loc_list, time_list, prof_list = NET.named_entity_tagging(
                    sentence_list[i])
                if loc_list != []:
                    score = score + 6 * len(
                        loc_list
                    )  # Confidence score increases with increasing number of countries appearing in the sentence.

        sent_score_list.append(score)

    #print 'Sent score list values are:',sent_score_list

    # Selecting the sentence that has the maximum score.

    max_score_value = max(sent_score_list)
    #print 'Max value is :', max_score_value

    # Now we have to choose the best sentence among the sentences in the candidate list. Choosing sentences
    # which have both the maximum value and are present in the candidate list

    for i in range(0, len(sent_score_list)):
        if sent_score_list[i] == max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    #print 'Final list is:', final_sent_list
    temp_solution = []
    answer_loc = []
    if len(final_sent_list) == 1:
        print 'Answer: ', final_sent_list[0] + '\n'
        #print '\n'
        return final_sent_list[0]

    else:

        for i in range(0, len(final_sent_list)):
            temp = final_sent_list[i]
            break

        #result=' '.join(final_sent_list)
        result = temp
        print 'Answer: ', result + '\n'
        #print '\n'
        return result
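
A hedged, stand-alone version of rule 4 above, which boosts sentences containing a stem of "name", "call" or "known" when the question asks for a name, using NLTK's SnowballStemmer; the helper name and example sentence are illustrative only.

from nltk.stem import SnowballStemmer

def mentions_naming(sentence):
    # True when any token stems to "name", "call" or "known".
    stemmer = SnowballStemmer('english')
    targets = ('name', 'call', 'known')
    return any(stemmer.stem(tok.lower()) in targets for tok in sentence.split())

# mentions_naming("The storm was named Hugo by forecasters.")  -> True (via "named")
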
def answering_why(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list):

    # Declaring globals to be used in this function

    sent_score_list=[]
    final_sent_list=[]
    best_sent_index=[]
    best=[] # List of the best scoring sentences based on word match with the question
    q_verblist=[]


    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','do','for','from',
                          'has','have','he','in','is','it','its','of','on','that','the',
                          'to','was','were','will','with']

    temp_q=cleansedQuestion
    temp_q=temp_q.replace('"','')
    temp_q=temp_q.replace("'",'"')
    temp_q=temp_q.replace('?','')

    #print 'Question is :',temp_q


    lmtzr=WordNetLemmatizer()
    pos_list= POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN','VBP'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0],'v'))

    #print 'Question verb list is :',q_verblist


    # Find score for each sentence using word match score first

    for i in range(0,len(complete_sentence_list)):
        wm_score=0
        #complete_sentence_list[i]=complete_sentence_list[i].replace('.','').replace(',','').replace('!','')

        wm_score = wm_score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        sent_score_list.append(wm_score)


    max_score_value=max(sent_score_list)
    #print 'Max score is :',max_score_value

    # Finding the sentences which have the highest score and adding them to the best list

    for i in range(0,len(sentence_list)):
        if sent_score_list[i]==max_score_value:
            best.append((complete_sentence_list[i],i))
            best_sent_index.append(i)

    #print 'Best list is:',best


    # Finding indices of the best sentences

    # Re-setting the scores of all sentences to zero
    for i in range(0, len(sent_score_list)):
        sent_score_list[i]=0


    for i in range(0, len(complete_sentence_list)):
        score=0
        # 1. If the given sentence is in the best list, then reward them. It is a clue
        if i in best_sent_index:
            score=score + 3

        #2. If the sentence immediately precedes member of best, then it is a clue

        for k in best_sent_index:
            #print k
            if i==k-1:
                score=score + 3
            #3. If the sentence immediately follows member of best, then it is a good clue
            elif i==k+1:
                score=score + 4

        #4. If the sentence contains word "want", then it is a good clue
        temp=complete_sentence_list[i].split()
        for word in temp:
            if word.lower()=='want':
                score=score+4
            #5. If the sentence contains word "so" or "because"  then it is a good clue
            elif word.lower() in ['so','because']:
                score=score+4

        #5. Matching the main verb in question and sentence. If so it is a confident clue

        sent_pos_list=POS_Tagging.pos_tagging(complete_sentence_list[i])
        lmtzr=WordNetLemmatizer()
        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(sent_pos_list[k][0],'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score=score + 6


        sent_score_list[i]=score

    #print 'Sent score list values are:',sent_score_list


    # Selecting the sentence that has the maximum score.

    max_score_value =max(sent_score_list)
    #print 'Max value is :', max_score_value


    # Now we have to choose the best sentence among the sentences in candidate list. Choosing sentences
    # which have both the maximum value and are present in the candidate list. For why questions we don't do more filtering
    # since most of the answers span the entire sentence

    for i in range(0, len(sent_score_list)):
        if sent_score_list[i]==max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    #print 'Final list is:', final_sent_list

    if len(final_sent_list) == 1:
        temp=final_sent_list[0].split()
        for k in range(0, len(temp)):
            if temp[k].lower() =='so':                         #If sentence contains "so", the answer is generally the words that come after so
                #index=final_sent_list[0].index("so")
                #return final_sent_list[0][k:]
                return ' '.join(temp[k:])
            if temp[k].lower() =='because':                         #If sentence contains "because", the answer is generally the words that come after because
                #index=final_sent_list[0].index("because")
                #return final_sent_list[0][k:]
                return ' '.join(temp[k:])
            if temp[k].lower() =='to':                         #If sentence contains "to", the answer is generally the words that come after to
                #index=final_sent_list[0].index("to")
                #return final_sent_list[0][k:]
                return ' '.join(temp[k:])

        return final_sent_list[0]

    else:
        # Choose the sentence that comes at the last, in case of a tie
        for k in range(0,len(final_sent_list)):
            result=final_sent_list[k]

        return result
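
A compact, hedged equivalent of the cue-word extraction above: for why-questions the answer is taken from the first "so", "because" or "to" onwards. The function name and example are made up.

def reason_span(sentence, cues=('so', 'because', 'to')):
    # Return the tail of the sentence starting at the first cue word,
    # or the whole sentence if no cue word is present.
    tokens = sentence.split()
    for k, tok in enumerate(tokens):
        if tok.lower() in cues:
            return ' '.join(tokens[k:])
    return sentence

# reason_span("He stayed home because he was ill.")  -> "because he was ill."
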
def answering_where(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,dateline,sent_loc_list):

    # Declaring globals to be used in this function

    candidate_list=[]
    sent_score_list=[]
    q_verblist=[]

    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','for','from',
                          'has','he','in','is','it','its','of','on','that','the',
                          'to','was','were','will','with']



    location_prepositions=['above','across','after','against','along','among','around',
                           'before','behind','below','beneath','beside','between','by','down','from',
                           'in','inside','into','near','off','onto','opposite','outside','over','surrounding',
                           'round','through','towards','under','up']


    abbreviation_list=[('Mt.','Mount')]


    temp_q=cleansedQuestion
    temp_q=temp_q.replace('"','')
    temp_q=temp_q.replace("'",'"')
    temp_q=temp_q.replace('?','')

    for k in temp_q.split():
        if k in abbreviation_list[0][0]:
            temp_q=temp_q.replace(k,abbreviation_list[0][1])

    #print 'Question is :',temp_q

    lmtzr=WordNetLemmatizer()
    pos_list= POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0],'v'))

    #print 'Question verb list is :',q_verblist
    #print 'Master location list is:',sent_loc_list

    # 1. Find score for each sentence using word match score first

    for i in range(0,len(sentence_list)):
        score=0

        #print 'Sentence is :',sentence_list[i]
        score= score + WM.stemWordMatch(stop_words_free_question,sentence_list[i])
        #print 'After wordmatch score is:',score

        #2. Check if the sentence contains location preposition, then it is a good clue

        for k in complete_sentence_list[i].split():
            if k in location_prepositions:
                score=score+4

        # 3. Check if the sentence contains Location entity

        if sent_loc_list[i] != []: # If sentence contains location
            score=score + 6


        # 4.  Reward sentences which have "from" in the question and in the answer too

        from_qflag=0
        cand_list=[]

        for k in temp_q.split():
            if k.lower()=='from':
                #print 'From qflag is true'
                from_qflag=1
        if from_qflag==1 and 'from' in complete_sentence_list[i].split():
            #print 'True:'
            '''if sent_loc_list[i] !=[]:
                for m in sent_loc_list[i]:
                    if m not in temp_q.split():
                        cand_list.append(m)
            if cand_list!=[]:
                return ' '.join(cand_list)
            else:
                for k in complete_sentence_list[i].split():
                    if k not in temp_q:
                        cand_list.append(k)
                return ' '.join(cand_list)'''
            score=score + 6

        # 5.  Reward sentences which contain a verb that appears in the question

        sent_pos_list=POS_Tagging.pos_tagging(complete_sentence_list[i])

        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(sent_pos_list[k][0],'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score=score + 6

        sent_score_list.append(score)

    #print 'Sent score list is :', sent_score_list

    ##################### COMPUTING THE DATE LINE SCORE FOR THE QUESTION #####################

    # For when and where questions the answer to the question could also be from the timeline of the story

    dateline_score=0
    first_sentence_flag=0
    temp_list=cleansedQuestion.split()

    flag=0
    for word in temp_list:
        if word.lower() == 'where':
            flag=1


    for i in range(0, len(temp_list)):
        # 1. If question contains "happen", it is a good clue that timeline could be answer
        if temp_list[i].lower()=='happen':
            dateline_score= dateline_score+4

        # 2. If question contains "take place", it is a good clue that timeline could be answer
        if i != len(temp_list)-1 and temp_list[i].lower()=='take' and temp_list[i+1].lower()=='place':
            dateline_score=dateline_score+4

        # 3. If question contains "this", it is slam_dunk that timeline could be answer for when type questions
        if temp_list[i].lower()=='this':
            if flag==0:
                dateline_score= dateline_score+20
            else:
                first_sentence_flag=1

        # 4. If question contains "story", it is slam_dunk that timeline could be answer

        if temp_list[i].lower()=='story' and flag==0:
            dateline_score= dateline_score+20

    #print 'Date line score for the question is :',dateline_score

    first_list=[]

    if first_sentence_flag==1:                        #Choose the first sentence as the answer
        pos_np_list=POS_Tagging.pos_NNP_tagging(complete_sentence_list[0])
        if pos_np_list !=[]:
            for k in pos_np_list:
                if k not in temp_q.split():
                    first_list.append(k)

            return ' '.join(first_list)
        else:
            return complete_sentence_list[0]

    # Selecting the sentence/sentences that has the maximum score.

    max_score_value =max(sent_score_list)

    #Creating candidate list of sentences based on the maximum sent score

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            candidate_list.append((complete_sentence_list[i],i))


    #print 'Candidate list is :',candidate_list

    # Checking which of the scores is greater. If the score from sent_score_list is greater than the dateline score, then we find
    # the corresponding sentences and choose the best among them. Else we return the dateline as the result.
    if max_score_value > dateline_score:

    # Now we have to choose the best sentence among the sentences in candidate list

        if len(candidate_list)==1:

            temp_str= candidate_list[0][0]
            index=candidate_list[0][1]

        # If there are multiple candidates, then choose the sentence which appeared first in the story and then do the processing
        else:
            # There are more than one candidate sentences. Print the first sentence
            for k in range(0, len(candidate_list)):

                temp_str=candidate_list[k][0]
                index=candidate_list[k][1]
                break

        #Cleaning up the candidate sentence
            # Replacing double quotes with blank and single quotes with "
            #temp_str=temp_str.replace('"','')
            #temp_str=temp_str.replace(',','').replace('?','').replace('!','')

        ################### SENTENCE PROCESSING #######################

        result_list=[]
        answer_list=[]

        s_loclist=sent_loc_list[index]
        #print 'Location list:', s_loclist


        if s_loclist==[]:   #The selected sentence does not seem to have a location expression, then print whole sentence  minus the words in the question
            '''nnp_list = POS_Tagging.pos_NNP_tagging(temp_str)
            if nnp_list != []:
                for k in nnp_list:
                    if k not in temp_q:
                        result_list.append(k)
                if result_list !=[]:
                    return ' '.join(result_list)'''

            for k in temp_str.split():
                if k not in temp_q.split():
                    result_list.append(k)
            if result_list !=[]:
                return ' '.join(result_list)


        if s_loclist!=[]:
            for i in range(0, len(s_loclist)):
                if s_loclist[i] not in temp_q.split() :   #To counter situations where question has a location and NER doesn't identify it
                    answer_list.append(s_loclist[i])

        #print 'Answer list is :',answer_list

        temp_result=[]
        np_result_list=[]

        if answer_list != []:
            result=' '.join(answer_list)
            return result

        else:

            '''np_list = POS_Tagging.pos_noun_tagging(temp_str)
            if np_list != []:
                for k in np_list:
                    if k not in temp_q:
                        np_result_list.append(k)

                return ' '.join(np_result_list)'''



            for k in temp_str.split():
                if k not in temp_q.split():
                    temp_result.append(k)

            return ' '.join(temp_result)

    # Dateline score is greater than the sent list score
    else:
        result=dateline
        return result
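
The fallback logic above boils down to a comparison between the best sentence score and the dateline score; below is a minimal sketch of that decision with made-up names and values, illustrative only.

def pick_where_answer(sentence_answer, sentence_score, dateline, dateline_score):
    # Prefer the extracted sentence answer only when it outscores the dateline;
    # otherwise fall back to the story's dateline.
    if sentence_score > dateline_score:
        return sentence_answer
    return dateline

# pick_where_answer("in Kathmandu", 8, "KATHMANDU, Nepal (AP)", 20)
# -> "KATHMANDU, Nepal (AP)"
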
コード例 #21
0
def answering_why(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list):

    # Declaring globals to be used in this function

    sent_score_list = []
    final_sent_list = []
    best_sent_index = []
    best = [
    ]  # List of the best scoring sentences based on word match with the question
    q_verblist = []

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
        'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that',
        'the', 'to', 'was', 'were', 'will', 'with'
    ]

    temp_q = cleansedQuestion
    temp_q = temp_q.replace('"', '')
    temp_q = temp_q.replace("'", '"')
    temp_q = temp_q.replace('?', '')

    #print 'Question is :',temp_q

    lmtzr = WordNetLemmatizer()
    pos_list = POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB', 'VBD', 'VBZ',
                              'VBN', 'VBP'] and lmtzr.lemmatize(
                                  pos_list[i][0],
                                  'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0], 'v'))

    #print 'Question verb list is :',q_verblist

    # Find score for each sentence using word match score first

    for i in range(0, len(complete_sentence_list)):
        wm_score = 0
        complete_sentence_list[i] = complete_sentence_list[i].replace(
            '.', '').replace(',', '').replace('!', '')

        wm_score = wm_score + WM.stemWordMatch(cleansedQuestion,
                                               sentence_list[i])
        sent_score_list.append(wm_score)

    max_score_value = max(sent_score_list)
    #print 'Max score is :',max_score_value

    # Finding the sentences which have the highest score and adding them to the best list

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            best.append((complete_sentence_list[i], i))
            best_sent_index.append(i)

    #print 'Best list is:',best

    # Finding indices of the best sentences
    '''for j in range(0,len(best)):
            best_sent_index.append(best[j][1])'''

    # Re-setting the scores of all sentences to zero
    for i in range(0, len(sent_score_list)):
        sent_score_list[i] = 0

    for i in range(0, len(complete_sentence_list)):
        score = 0
        # 1. If the given sentence is in the best list, then reward them. It is a clue
        if i in best_sent_index:
            score = score + 3

        #2. If the sentence immediately precedes member of best, then it is a clue

        for k in best_sent_index:
            #print k
            if i == k - 1:
                score = score + 3
            #3. If the sentence immediately follows member of best, then it is a good clue
            elif i == k + 1:
                score = score + 4

        #4. If the sentence contains word "want", then it is a good clue
        temp = complete_sentence_list[i].split()
        for word in temp:
            if word.lower() == 'want':
                score = score + 4
            #5. If the sentence contains word "so" or "because"  then it is a good clue
            elif word.lower() in ['so', 'because']:
                score = score + 4

        #5. Matching the main verb in question and sentence. If so it is a confident clue

        sent_pos_list = POS_Tagging.pos_tagging(complete_sentence_list[i])
        lmtzr = WordNetLemmatizer()
        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in [
                    'VB', 'VBD', 'VBZ', 'VBN'
            ] and lmtzr.lemmatize(sent_pos_list[k][0], 'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score = score + 6

        sent_score_list[i] = score

    #print 'Sent score list values are:',sent_score_list

    # Selecting the sentence that has the maximum score.

    max_score_value = max(sent_score_list)
    #print 'Max value is :', max_score_value

    # Now we have to choose the best sentence among the sentences in candidate list. Choosing sentences
    # which have both the maximum value and are present in the candidate list. For why questions we don't do more filtering
    # since most of the answers span the entire sentence

    for i in range(0, len(sent_score_list)):
        if sent_score_list[i] == max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    #print 'Final list is:', final_sent_list

    if len(final_sent_list) == 1:
        temp = final_sent_list[0].split()
        for k in range(0, len(temp)):
            if temp[k].lower(
            ) == 'so':  #If sentence contains "so", the answer is generally the words that come after so
                #index=final_sent_list[0].index("so")
                #return final_sent_list[0][k:]
                return ' '.join(temp[k:])
            if temp[k].lower(
            ) == 'because':  #If sentence contains "because", the answer is generally the words that come after because
                #index=final_sent_list[0].index("because")
                #return final_sent_list[0][k:]
                return ' '.join(temp[k:])
            if temp[k].lower(
            ) == 'to':  #If sentence contains "to", the answer is generally the words that come after to
                #index=final_sent_list[0].index("to")
                #return final_sent_list[0][k:]
                return ' '.join(temp[k:])

        return final_sent_list[0]

    else:
        # Choose the sentence that comes at the last, in case of a tie
        for k in range(0, len(final_sent_list)):
            result = final_sent_list[k]

        return result
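
A hedged sketch of rule 5 above (a +6 bonus when a candidate sentence shares a lemmatized main verb with the question), again assuming NLTK's pos_tag in place of the project's POS_Tagging module; names and the example are illustrative only.

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

def shares_main_verb(sentence, q_verblist):
    # True when the sentence contains a VB/VBD/VBZ/VBN token whose
    # verb lemma also appears in the question's verb list.
    lmtzr = WordNetLemmatizer()
    for word, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        if tag in ('VB', 'VBD', 'VBZ', 'VBN'):
            if lmtzr.lemmatize(word, 'v') in q_verblist:
                return True
    return False

# shares_main_verb("She wanted to leave early.", ['want'])
# -> True with a typical tagging of "wanted" as VBD.
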
コード例 #22
0
def answering_where(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list, dateline,
                    sent_loc_list):

    # Declaring globals to be used in this function

    candidate_list = []
    sent_score_list = []
    q_verblist = []

    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'for', 'from', 'has',
        'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
        'were', 'will', 'with'
    ]

    location_prepositions = [
        'above', 'across', 'after', 'against', 'along', 'among', 'around',
        'before', 'behind', 'below', 'beneath', 'beside', 'between', 'by',
        'down', 'from', 'in', 'inside', 'into', 'near', 'off', 'onto',
        'opposite', 'outside', 'over', 'surrounding', 'round', 'through',
        'towards', 'under', 'up'
    ]

    abbreviation_list = [('Mt.', 'Mount')]

    temp_q = cleansedQuestion
    temp_q = temp_q.replace('"', '')
    temp_q = temp_q.replace("'", '"')
    temp_q = temp_q.replace('?', '')

    for k in temp_q.split():
        if k in abbreviation_list[0][0]:
            temp_q = temp_q.replace(k, abbreviation_list[0][1])

    #print 'Question is :',temp_q

    lmtzr = WordNetLemmatizer()
    pos_list = POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB', 'VBD', 'VBZ', 'VBN'] and lmtzr.lemmatize(
                pos_list[i][0], 'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0], 'v'))

    #print 'Question verb list is :',q_verblist
    #print 'Master location list is:',sent_loc_list

    # 1. Find score for each sentence using word match score first

    for i in range(0, len(sentence_list)):
        score = 0

        #print 'Sentence is :',sentence_list[i]
        score = score + WM.stemWordMatch(stop_words_free_question,
                                         sentence_list[i])
        #print 'After wordmatch score is:',score

        #2. Check if the sentence contains location preposition, then it is a good clue

        for k in complete_sentence_list[i].split():
            if k in location_prepositions:
                score = score + 4

        # 3. Check if the sentence contains Location entity

        if sent_loc_list[i] != []:  # If sentence contains location
            score = score + 6

        # 4.  Reward sentences which have "from" in the question and in the answer too

        from_qflag = 0
        cand_list = []

        for k in temp_q.split():
            if k.lower() == 'from':
                #print 'From qflag is true'
                from_qflag = 1
        if from_qflag == 1 and 'from' in complete_sentence_list[i].split():
            #print 'True:'
            '''if sent_loc_list[i] !=[]:
                for m in sent_loc_list[i]:
                    if m not in temp_q.split():
                        cand_list.append(m)
            if cand_list!=[]:
                return ' '.join(cand_list)
            else:
                for k in complete_sentence_list[i].split():
                    if k not in temp_q:
                        cand_list.append(k)
                return ' '.join(cand_list)'''
            score = score + 6

        # 5.  Reward sentences which contain a verb that appears in the question

        sent_pos_list = POS_Tagging.pos_tagging(complete_sentence_list[i])

        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in [
                    'VB', 'VBD', 'VBZ', 'VBN'
            ] and lmtzr.lemmatize(sent_pos_list[k][0], 'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score = score + 6

        sent_score_list.append(score)

    #print 'Sent score list is :', sent_score_list

    ##################### COMPUTING THE DATE LINE SCORE FOR THE QUESTION #####################

    # For when and where questions the answer to the question could also be from the timeline of the story

    dateline_score = 0
    first_sentence_flag = 0
    temp_list = cleansedQuestion.split()

    flag = 0
    for word in temp_list:
        if word.lower() == 'where':
            flag = 1

    for i in range(0, len(temp_list)):
        # 1. If question contains "happen", it is a good clue that timeline could be answer
        if temp_list[i].lower() == 'happen':
            dateline_score = dateline_score + 4

        # 2. If question contains "take place", it is a good clue that timeline could be answer
        if i != len(temp_list) - 1 and temp_list[i].lower(
        ) == 'take' and temp_list[i + 1].lower() == 'place':
            dateline_score = dateline_score + 4

        # 3. If question contains "this", it is slam_dunk that timeline could be answer for when type questions
        if temp_list[i].lower() == 'this':
            if flag == 0:
                dateline_score = dateline_score + 20
            else:
                first_sentence_flag = 1

        # 4. If question contains "story", it is slam_dunk that timeline could be answer

        if temp_list[i].lower() == 'story' and flag == 0:
            dateline_score = dateline_score + 20

    #print 'Date line score for the question is :',dateline_score

    first_list = []

    if first_sentence_flag == 1:  #Choose the first sentence as the answer
        pos_np_list = POS_Tagging.pos_NNP_tagging(complete_sentence_list[0])
        if pos_np_list != []:
            for k in pos_np_list:
                if k not in temp_q.split():
                    first_list.append(k)

            return ' '.join(first_list)
        else:
            return complete_sentence_list[0]

    # Selecting the sentence/sentences that has the maximum score.

    max_score_value = max(sent_score_list)

    #Creating candidate list of sentences based on the maximum sent score

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            candidate_list.append((complete_sentence_list[i], i))

    #print 'Candidate list is :',candidate_list

    # Checking which of the scores is greater. If the score from sent_score_list is greater than the dateline score, then we find
    # the corresponding sentences and choose the best among them. Else we return the dateline as the result.
    if max_score_value > dateline_score:

        # Now we have to choose the best sentence among the sentences in candidate list

        if len(candidate_list) == 1:

            temp_str = candidate_list[0][0]
            index = candidate_list[0][1]

        # If there are multiple candidates, then choose the sentence which appeared first in the story and then do the processing
        else:
            # There are more than one candidate sentences. Print the first sentence
            for k in range(0, len(candidate_list)):

                temp_str = candidate_list[k][0]
                index = candidate_list[k][1]
                break

        # Cleaning up the candidate sentence (for either branch): strip double quotes, commas and question/exclamation marks
        temp_str = temp_str.replace('"', '')
        temp_str = temp_str.replace(',', '').replace('?', '').replace('!', '')

        ################### SENTENCE PROCESSING #######################

        result_list = []
        answer_list = []

        s_loclist = sent_loc_list[index]
        #print 'Location list:', s_loclist

        if s_loclist == []:  #The selected sentence does not seem to have a location expression, then print whole sentence  minus the words in the question
            '''nnp_list = POS_Tagging.pos_NNP_tagging(temp_str)
            if nnp_list != []:
                for k in nnp_list:
                    if k not in temp_q:
                        result_list.append(k)
                if result_list !=[]:
                    return ' '.join(result_list)'''

            for k in temp_str.split():
                if k not in temp_q.split():
                    result_list.append(k)
            if result_list != []:
                return ' '.join(result_list)

        if s_loclist != []:
            for i in range(0, len(s_loclist)):
                # To counter situations where the question has a location and NER doesn't identify it
                if s_loclist[i] not in temp_q.split():
                    answer_list.append(s_loclist[i])

        #print 'Answer list is :',answer_list

        temp_result = []
        np_result_list = []

        if answer_list != []:
            result = ' '.join(answer_list)
            return result

        else:
            '''np_list = POS_Tagging.pos_noun_tagging(temp_str)
            if np_list != []:
                for k in np_list:
                    if k not in temp_q:
                        np_result_list.append(k)

                return ' '.join(np_result_list)'''

            for k in temp_str.split():
                if k not in temp_q.split():
                    temp_result.append(k)

            return ' '.join(temp_result)

    # Dateline score is greater than the sent list score
    else:
        result = dateline
        return result
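
# A minimal standalone sketch of the "strip the question's words from the chosen
# sentence" step that the function above repeats in several places. The helper
# name (strip_question_words) is not part of the original code; it only
# illustrates the token-filtering idea on plain whitespace-split tokens.
def strip_question_words(candidate_sentence, question):
    question_tokens = set(question.split())
    kept = [tok for tok in candidate_sentence.split() if tok not in question_tokens]
    # Fall back to the whole sentence if every token also appears in the question
    return ' '.join(kept) if kept else candidate_sentence

# Example: strip_question_words('The battle took place in 1415 near Agincourt',
#                               'Where did the battle take place')
# returns 'The took in 1415 near Agincourt'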
コード例 #23
0
def answering_who(cleansedQuestion, stop_words_free_question, sentence_list):

    # Declaring globals to be used in this function

    wordmatch_score_list = []
    sent_containing_person_score_list = []
    sent_containing_name_score_list = []
    sent_containing_person_and_name_score_list = []
    sent_containing_person_or_name_score_list = []
    master_person_list = []
    sent_score_list = []

    #print 'Question is :',cleansedQuestion

    snowball_stemmer = SnowballStemmer('english')

    for i in range(0, len(sentence_list)):
        #print 'Sentence is :', sentence_list[i]
        score = 0
        # 1. Score using word match rule
        wordmatch_score_list.append(
            WM.stemWordMatch(cleansedQuestion, sentence_list[i]))
        score = score + WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        # 2. If question does not contain name but the answer contains NAME then you are confident(+6)
        q_person_list, org_list, loc_list, time_list, prof_list = NET.named_entity_tagging(
            cleansedQuestion)
        if q_person_list == []:
            sent_plist, sent_olist, sent_llist, sent_tlist, sent_proflist = NET.named_entity_tagging(
                sentence_list[i])
            master_person_list.append(sent_plist)
            if sent_plist != []:
                score = score + 6 * len(sent_plist)

            # 3. If question does not contain a name and answer contains the word "name" then good_clue (+4)
            temp = sentence_list[i].split()
            for k in range(0, len(temp)):
                if snowball_stemmer.stem(temp[k].lower()) == 'name':
                    score = score + 4

        else:
            #Question has a name, and if the sentence contains the same name, then it is a good clue.

            #  4. Awards points to all sentences  that contain a name or reference to a human
            sent_plist, sent_olist, sent_llist, sent_tlist, sent_proflist = NET.named_entity_tagging(
                sentence_list[i])
            master_person_list.append(sent_plist)
            if sent_plist == q_person_list:
                score = score + 4 * len(sent_plist)

            elif sent_plist != [] or "name" in sentence_list[i]:
                score = score + 4
            '''if sent_plist==[] and "name" in sentence_list[i]:
                sent_containing_name_score_list.append(4)
            else:
                sent_containing_name_score_list.append(0)'''
        sent_score_list.append(score)

    #print 'Sent score list is :',sent_score_list
    #print 'Master person list is:',master_person_list

    # Selecting the sentence that has the maximum score. If it is a tie, we choose the sentence that appears first
    # Preference is given to sentences which have a person name in them. If there is only one such sentence that is the answer

    candidate_list = []
    final_result_set = []
    temp_list = []

    max_score_value = max(sent_score_list)

    #print 'Max score is :',max_score_value

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            candidate_list.append((sentence_list[i], i))
    #print 'Candidate list is :',candidate_list

    if len(candidate_list) == 1:
        q_plist, q_olist, q_llist, q_tlist, q_proflist = NET.named_entity_tagging(
            stop_words_free_question)
        # If the question has a profession but not the name of a person, then the answer
        # sentence will most probably contain the name of a person
        #print 'Question Person List',q_plist

        if q_plist == [] or q_proflist != []:
            #temp_result=master_person_list[candidate_list[0][1]][0]
            s_plist, s_olist, s_llist, s_tlist, s_proflist = NET.named_entity_tagging(
                candidate_list[0][0])
            result = ' '.join(s_plist)
            print 'Answer: ', result + '\n'
            #print '\n'
            return result

        else:
            # Implies the question has a name but no profession, so return the
            # candidate sentence which contains that same name
            result = candidate_list[0][0]
            print 'Answer: ', result + '\n'
            return result
    else:
        # There are more than one candidate sentences. Print the first sentence
        for k in range(0, len(candidate_list)):
            val = candidate_list[k][0]
            #print 'val is :',val
            index = candidate_list[k][1]
            #print 'index is :', index
            temp_list.append(index)
            break

        #result=' '.join(temp_list)
        x = master_person_list[temp_list[0]]
        #print 'x is :', x
        result2 = temp_list[0]
        #for i in range(0,len(x)):
        if x != []:
            temp = ' '.join(x)
            if temp not in stop_words_free_question:
                final_result_set.append(temp)
        else:
            final_result_set.append(val)

        if final_result_set != []:
            print 'Answer: ', ' '.join(final_result_set) + '\n'
            #print '\n'
            #print 'Result 2 is :',result2
            return ' '.join(final_result_set)
        else:
            print 'Answer: ', temp + '\n'
            #print '\n'
            return temp  #' '.join(x)

    # Checking to see if the question contains profession name. If so the answer should be a sentence containing a name and higher weights
    # is given for the score from Rule 2. Else Rule 1 and Rule 2 are given equal weightage.
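
# The who-answering rules above lean on NET.named_entity_tagging, whose
# implementation is not shown here. As a rough, assumed stand-in, the sketch
# below pulls PERSON entities out of a sentence with NLTK's default chunker
# (requires the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words
# corpora to be downloaded). It only illustrates the kind of person list those
# rules expect and is not the original NET module.
import nltk

def extract_person_entities(sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    tree = nltk.ne_chunk(tagged)
    persons = []
    for node in tree:
        # Named-entity chunks come back as Tree objects; plain tokens are tuples
        if hasattr(node, 'label') and node.label() == 'PERSON':
            persons.append(' '.join(word for word, tag in node.leaves()))
    return persons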
コード例 #24
0
def answering_why(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, dateline):

    # Declaring globals to be used in this function

    candidate_sent_list = []
    sent_score_list = []
    final_sent_list = []
    best_sent_index = []
    best = []  # List of the best scoring sentences based on word match with the question

    location_prepositions = [
        'in', 'at', 'near', 'inside', 'on', 'behind', 'above', 'under',
        'next to', 'below', 'between', 'around', 'outside', 'among',
        'on the right', 'across', 'front', 'opposite', 'before', 'beneath',
        'beside', 'against'
    ]

    what_year = [str(y) for y in range(1400, 2000)]  # year strings '1400' through '1999'

    what_month = [
        'january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr',
        'may', 'june', 'jun', 'july', 'jul', 'august', 'aug',
        'september', 'sep', 'october', 'oct', 'november', 'nov', 'december',
        'dec'
    ]

    date_expression_list = [
        'yesterday', 'today', 'tomorrow', 'last week', 'this week',
        'next week', 'an hour ago', 'now', 'in an hour', 'recently', 'soon',
        'a little while ago', 'at this moment', 'in the near future',
        'a long time ago', 'these days', 'those days', 'future', 'present',
        'past', 'nowadays', 'eventually', 'morning', 'evening', 'night',
        'midnight', 'dawn', 'dusk', 'afternoon', 'noon', 'midday', 'am', 'pm',
        'sunrise', 'sunset', 'lunchtime', 'teatime', 'dinnertime', 'interval',
        'twilight', 'hourly', 'nightly', 'daily', 'monthly', 'weekly',
        'quarterly', 'yearly'
    ]

    #print 'Question is :',cleansedQuestion

    snowball_stemmer = SnowballStemmer('english')

    # Find score for each sentence using word march score first

    for i in range(0, len(complete_sentence_list)):
        wm_score = 0

        wm_score = wm_score + WM.stemWordMatch(cleansedQuestion,
                                               sentence_list[i])
        sent_score_list.append(wm_score)

    #print 'Score list is:',sent_score_list
    max_score_value = max(sent_score_list)
    #print 'Max score is :',max_score_value
    # Finding the sentences which has the highest score and adding them to the best list

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            best.append((complete_sentence_list[i], i))

    #print 'Best list is:',best

    # Finding indices of the best sentences

    for j in range(0, len(best)):
        best_sent_index.append(best[j][1])

    # Re-setting the scores of all sentences to zero
    for i in range(0, len(sent_score_list)):
        sent_score_list[i] = 0

    for i in range(0, len(complete_sentence_list)):
        score = 0
        # 1. If the given sentence is in the best list, then reward them. It is a clue
        for j in range(0, len(best)):
            if complete_sentence_list[i] in best[j][0]:
                #print 'Yes'
                score = score + 3
        #print 'Score after 1 is :',score

        #2. If the sentence immediately precedes member of best, then it is a clue

        for k in best_sent_index:
            #print k
            if i == k - 1:
                score = score + 3
            #3. If the sentence immediately follows member of best, then it is a good clue
            elif i == k + 1:
                score = score + 4

        #4. If the sentence contains word "want", then it is a good clue
        temp = complete_sentence_list[i].split()
        for word in temp:
            if word.lower() == 'want':
                #print 'Score increment rule 4'
                score = score + 4
            elif word.lower() in ['so', 'because']:
                #print 'Score increment rule 5'
                score = score + 4

        sent_score_list[i] = score

    #print 'Sent score list values are:',sent_score_list

    # Selecting the sentence that has the maximum score.

    max_score_value = max(sent_score_list)
    #print 'Max value is :', max_score_value

    # Now we have to choose the best sentence among the sentences in candidate list.Choosing sentences
    # which have both maximum value and present in candidate list

    #print 'len of sent_score_list:',len(sent_score_list)
    for i in range(0, len(sent_score_list)):
        if sent_score_list[i] == max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    #print 'Final list is:', final_sent_list
    temp_solution = []
    answer_loc = []
    if len(final_sent_list) == 1:
        print 'Answer: ', final_sent_list[0] + '\n'
        #print '\n'
        return final_sent_list[0]

    else:
        # Choose the sentence that comes at the last, in case of a tie
        for k in range(0, len(final_sent_list)):
            result = final_sent_list[k]
        print 'Answer: ', result + '\n'
        #print '\n'
        return result
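
# A compact sketch of the neighbourhood reward used by answering_why above:
# sentences in the best word-match set get +3, the sentence just before a best
# sentence gets +3, and the one just after gets +4. The helper name
# (why_adjacency_scores) is illustrative only and not part of the original code.
def why_adjacency_scores(num_sentences, best_sent_index):
    scores = [0] * num_sentences
    for i in range(num_sentences):
        if i in best_sent_index:
            scores[i] += 3          # the sentence itself matched the question well
        if i + 1 in best_sent_index:
            scores[i] += 3          # immediately precedes a best sentence
        if i - 1 in best_sent_index:
            scores[i] += 4          # immediately follows a best sentence
    return scores

# Example: with best_sent_index = [2], sentence 1 gets +3, sentence 2 gets +3,
# sentence 3 gets +4, and every other sentence stays at 0.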
コード例 #25
0
def answering_what(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_time_list,sent_person_list):

    # Declaring globals to be used in this function

    candidate_sent_list=[]
    sent_score_list=[]
    final_sent_list=[]
    q_verblist=[]


    stanford_stop_words_list=['a','an','and','are','as','at','be','buy','do','for','from',
                          'has','have','he','in','is','it','its','of','on','that','the',
                          'to','was','were','will','with']


    what_year=[str(y) for y in range(1400, 2000)]   # year strings '1400' through '1999'

    what_month=['january','jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may', 'june', 'jun', 'july', 'jul','august','aug','september','sep','october','oct','november','nov','december','dec']


    abbreviation_list=[('Mt.','Mount')]

    temp_q=cleansedQuestion
    temp_q=temp_q.replace('"','')
    #temp_q=temp_q.replace("'",'"')
    temp_q=temp_q.replace('?','')

    for k in temp_q.split():
        if k == abbreviation_list[0][0]:
            temp_q=temp_q.replace(k,abbreviation_list[0][1])

    #print 'Question is :',temp_q


    q_person_list,q_org_list,q_loc_list,q_month_list,q_time_list,q_money_list,q_percent_list,q_prof_list = NER.named_entity_recognition(temp_q)



    lmtzr=WordNetLemmatizer()
    pos_list= POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0],'v'))

    #print 'Question verb list is :',q_verblist


    for i in range(0,len(complete_sentence_list)):
        score=0

        #print complete_sentence_list[i]
        # 1. Word Match scoring function for each of the sentences
        score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        #print 'Score after wordmatch is :',score
        #2. Check if the question contains a month of the year and sentence contains date expression,then it is a clue
        for k in temp_q.split():
            if k.lower() in what_month:
                if sent_time_list[i] != []:
                    score=score + 4

                #print 'Score after Rule 2 is :',score
            # 3. What "kind" questions. Sentences containing "call" or "from"
            elif k.lower() =='kind':
                for m in complete_sentence_list[i].split():
                    if lmtzr.lemmatize(m,'v') in ['call','from']:
                        score=score+6
                #print 'Score after Rule 3 is :',score

            # 4. If question contains "name" and the sentence contains {name,call,known}

            elif k.lower() =='name':
                for m in complete_sentence_list[i].split():
                    if lmtzr.lemmatize(m,'v')  in ['name','call','known']:
                        score=score+20

                #print 'Score after Rule 4 is :',score

        '''if q_person_list !=[]:
            if sent_person_list[i] !=[]:
                score=score+6'''
        #print 'Score after Rule 4 is :',score


        #5. If question contains name + PP and contains(S,ProperNoun) and Head PP

        '''if j != len(temp) -1 and temp[j]=='name' and temp[j+1] in ['of','for']:
             person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(sentence_list[i])
             if person_list != []:
                 #TODO Check if it also contains (proper_noun,head(PP))
                 score=score +20'''

        # 6. Reward sentences which contain a verb that also appears in the question

        sent_pos_list=POS_Tagging.pos_tagging(complete_sentence_list[i])

        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(sent_pos_list[k][0],'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score=score + 6

        # 7. Definition type questions or what is X or what are X  questions ?

        temp_list=temp_q.split()


        if len(temp_list) <= 6:
            if '(' in complete_sentence_list[i]:
                for k in temp_list:
                    if k in complete_sentence_list[i].split():
                        start_index= complete_sentence_list[i].index('(') + 1
                        end_index=complete_sentence_list[i].index(')')
                        score=score+20
                        return complete_sentence_list[i][start_index:end_index]

            elif '--' in complete_sentence_list[i]:
                for k in temp_list:
                    if k in complete_sentence_list[i].split():
                        start_index= complete_sentence_list[i].index('--') + 2
                        end_index=complete_sentence_list[i].find('--', start_index)
                        if end_index == -1:
                            end_index=len(complete_sentence_list[i])
                        score=score+20
                        return complete_sentence_list[i][start_index:end_index]
            elif '{' in complete_sentence_list[i]:
                for k in temp_list:
                    if k in complete_sentence_list[i].split():
                        start_index= complete_sentence_list[i].index('{') + 1
                        end_index=complete_sentence_list[i].index('}')
                        score=score+20
                        return complete_sentence_list[i][start_index:end_index]



            # If the question contains "sport" related terms, answer should also have sport related terms
            '''if temp[j].lower() in ['sports','games','olympics']:
                temp2=sentence_list[i].split()
                for k in range(0,len(temp2)):
                    if snowball_stemmer.stem(temp2[k]) in ['soccer','hockey','baseball','cricket','rugby','ultimate']:
                        score=score+6'''

            # If the sentence contains a  "country" name and the sentence contains a LOCATION, then it is confident score
            '''if temp[j].lower() in ['country','countries','olympics']:
                person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(sentence_list[i])
                if loc_list != []:
                    score=score + 6*len(loc_list)'''  # Confidence score increases with increasing number of countries appearing in the sentence.



        sent_score_list.append(score)

    #print 'Sent score list values are:',sent_score_list

    # Selecting the sentence that has the maximum score.

    max_score_value =max(sent_score_list)


    # Now we have to choose the best sentence among the sentences in candidate list.Choosing sentences
    # which have both maximum value and present in candidate list

    for i in range(0, len(sent_score_list)):
         if sent_score_list[i]==max_score_value:
                final_sent_list.append(complete_sentence_list[i])

    #print 'Final list is:', final_sent_list

    answer_list=[]

    if len(final_sent_list) == 1:
        temp= final_sent_list[0].split()
        '''for k in range(0,len(temp)):
            if temp[k].lower() =='to':
                return ' '.join(temp[k:])'''

        #print temp_q.split()
        for k in range(0,len(temp)):

            if k != 0 and k != len(temp)-1:
                if temp[k].lower()=='per' and temp[k+1].lower()=='cent':
                    return ' '.join(temp[k-1:k+2])

            if temp[k] not in temp_q.split():
                #print temp[k]
                answer_list.append(temp[k])

        return ' '.join(answer_list)

    else:

        for i in range(0,len(final_sent_list)):
            result=final_sent_list[i]
            break

        temp= result.split()
        '''for k in range(0,len(temp)):
            if temp[k].lower() =='to':
                return ' '.join(temp[k:])
            else:
                temp=result'''



        for k in range(0, len(temp)):
            if temp[k] not in temp_q.split():
                answer_list.append(temp[k])

        return ' '.join(answer_list)
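
# Rule 7 above pulls a definition out of a matched sentence when the answer text
# is set off by '(...)', a pair of '--' dashes, or '{...}'. The helper below is a
# hedged, standalone sketch of that delimiter handling (the name
# extract_definition_span is illustrative and not part of the original code); it
# assumes the closing delimiter, when present, appears after the opening one.
def extract_definition_span(sentence):
    for opener, closer in [('(', ')'), ('--', '--'), ('{', '}')]:
        if opener in sentence:
            start = sentence.index(opener) + len(opener)
            end = sentence.find(closer, start)
            if end == -1:          # no closing delimiter: take the rest of the sentence
                end = len(sentence)
            return sentence[start:end].strip()
    return None

# Example: extract_definition_span('Mount Etna -- an active volcano -- erupted again')
# returns 'an active volcano'.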
def answering_how(
    cleansedQuestion, stop_words_free_question, complete_sentence_list, sentence_list, sent_time_list, sent_percent_list
):

    # Declaring globals to be used in this function

    candidate_sent_list = []
    sent_score_list = []
    final_sent_list = []
    q_verblist = []
    best = []  # List of the best scoring sentences based on word match with the question

    much_list = ["thousand", "hundred", "dollars", "cents", "million", "billion",
                 "trillion", "none", "nothing", "everything", "few", "something",
                 "cent", "percent", "salary", "pay", "income", "loss", "profit",
                 "one", "two", "three", "four", "five", "six", "seven", "eight",
                 "nine", "ten", "twenty", "thirty", "forty", "fifty", "sixty",
                 "seventy", "eighty", "ninety"]

    many_list = ["one", "two", "three", "four", "five", "six", "seven", "eight",
                 "nine", "ten", "twenty", "thirty", "forty", "fifty", "sixty",
                 "seventy", "eighty", "ninety", "hundred", "thousand", "million",
                 "billion", "trillion"]

    how_often = ["daily", "weekly", "bi-weekly", "fortnightly", "monthly",
                 "bi-monthly", "quarterly", "half-yearly", "yearly", "decade",
                 "millennium", "day", "everyday", "night", "afternoon", "noon"]
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

    measurement_verbs = []

    stanford_stop_words_list = ["a", "an", "and", "are", "as", "at", "be", "buy",
                                "do", "for", "from", "has", "have", "he", "in",
                                "is", "it", "its", "of", "on", "that", "the",
                                "to", "was", "were", "will", "with"]

    abbreviation_list = [("Mt.", "Mount")]

    ########################### QUESTION PROCESSING ##################

    temp_q = cleansedQuestion
    # temp_q=temp_q.replace('"','')
    # temp_q=temp_q.replace("'",'"')
    temp_q = temp_q.replace("?", "")

    for k in temp_q.split():
        if k == abbreviation_list[0][0]:
            temp_q = temp_q.replace(k, abbreviation_list[0][1])

    # print 'Question is :',temp_q

    lmtzr = WordNetLemmatizer()
    pos_list = POS_Tagging.pos_tagging(temp_q)

    for i in range(0, len(pos_list)):
        if (
            pos_list[i][1] in ["VB", "VBD", "VBZ", "VBN"]
            and lmtzr.lemmatize(pos_list[i][0], "v") not in stanford_stop_words_list
        ):
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0], "v"))

    # print 'Question verb list is :',q_verblist

    # print 'Time list is:',sent_time_list

    ################## SENTENCE PROCESSING AND SCORING ###################

    for i in range(0, len(complete_sentence_list)):
        score = 0

        # 1. Find score for each sentence using word march score first

        # print 'The sentence is :',complete_sentence_list[i]
        # score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        score = score + WM.stemWordMatch(stop_words_free_question, sentence_list[i])

        # 2. If the question contains "many" and sentence contains an expression of number, then it is confident score

        for k in temp_q.split():
            if k.lower() == "many":
                for m in complete_sentence_list[i].split():
                    if nums.match(m) or m in many_list:
                        score = score + 6

            # 3. If the question contains "much" and sentence contains an expression for distance or for money, then it is a confident score
            elif k.lower() == "much":
                for m in complete_sentence_list[i].split():
                    if m.lower() in ["money", "earn", "salary", "profit", "loss"] or m in much_list:
                        score = score + 6

            # 4. If the question contains "often" and sentence contains an expression of time, then it is more than confident score
            elif k.lower() == "often":
                for m in complete_sentence_list[i].split():
                    if m.lower() in how_often or m in sent_time_list[i]:
                        score = score + 10

        """if much_flag==1 and money_flag==1:
            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if temp2[k] in much_list:
                    score=score +20 #slam-dunk

        elif much_flag==1:

            temp2=complete_sentence_list[i].split()
            #print temp2
            for k in range(0, len(temp2)):
                if nums.match(temp2[k]) or temp2[k] in much_list:   # Implies answer contains a number
                    #print 'much Q - number or list sentence'
                    score=score+6"""

        sent_score_list.append(score)

    # print 'Score list is:',sent_score_list
    max_score_value = max(sent_score_list)

    # Finding the sentences which has the highest score and adding them to the best list

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:
            final_sent_list.append(complete_sentence_list[i])

    # print 'Final sent list is:',final_sent_list

    temp_result = []
    temp_solution = []
    if len(final_sent_list) == 1:

        # If the question contains "often", the sentence will usually contain a time expression. If so, pick
        # that expression as the solution

        """temp=cleansedQuestion.split()
        if 'often' in temp:
            #print 'often'
            temp2=final_sent_list[0].split()
            for m in range(0,len(temp2)):
                if temp2[m] in how_often:
                    temp_solution.append(temp2[m])
            #print 'Answer: ',' '.join(temp_solution)+'\n'
            #print '\n'
            return ' '.join(temp_solution)"""

        if "many" in temp_q.split():
            # print 'many'
            temp2 = final_sent_list[0].split()
            for m in range(0, len(temp2)):
                if nums.match(temp2[m]) or temp2[m] in many_list:
                    temp_solution.append(temp2[m])

            return " ".join(temp_solution)

        return final_sent_list[0]

        """for k in final_sent_list[0].split():
            if k not in cleansedQuestion.split():
                temp_result.append(k)

        return ' '.join(temp_result)"""

    else:
        # Choose the sentence that comes first, in case of a tie
        for k in range(0, len(final_sent_list)):
            result = final_sent_list[k]
            break

        for k in result.split():
            if k not in cleansedQuestion.split():
                temp_result.append(k)

        return " ".join(temp_result)