def answering_who(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_person_list,sent_prof_list):
    # Scores every story sentence against a "who" question, keeps the
    # sentence(s) whose score equals the maximum, and builds the answer from
    # the chosen candidate sentence (names / professions / remaining words).
    #
    # Visible scoring rules, per sentence i:
    #   * adds WM.stemWordMatch(cleansedQuestion, sentence_list[i])
    #   * +6 when NER finds no person in the question and sent_person_list[i]
    #     or sent_prof_list[i] is non-empty
    #   * +4 for every token of the sentence whose lemma is 'name' (same
    #     no-person-in-question case)
    #   * +6 for every VB/VBD/VBZ/VBN token whose verb lemma is in q_verblist
    #
    # NOTE(review): this copy of the function is incomplete and does not parse.
    # Names that are read but never assigned in the visible text: temp_q,
    # stanford_stop_words_list, q_verblist, score, sent_score_list,
    # max_score_value, candidate_list, s_plist, s_proflist, result_list,
    # answer_list, q_loc_who_list, npfinal_list. Several if/for/else headers
    # have no body, and the ''' opened near the end of the function is never
    # closed. Restore the dropped lines from the upstream file before use.

    # Declaring globals to be used in this function




    pos_list= POS_Tagging.pos_tagging(temp_q)

    # Intended to collect the question's content verbs (lemmatised, stop words
    # excluded); the statement that stores them is missing.
    # NOTE(review): lmtzr is only assigned further down, inside the sentence
    # loop, so as written it would be unbound here.
    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:

    #print 'Temp_q: ',temp_q

    # Named entities found in the question; only q_person_list, q_loc_list and
    # q_prof_list are read in the visible code.
    q_person_list,q_org_list,q_loc_list,q_month_list,q_time_list,q_money_list,q_percent_list,q_prof_list = NER.named_entity_recognition(temp_q)

    for i in range(0, len(complete_sentence_list)):
        #print 'Sentence is :', complete_sentence_list[i]

        # 1. Score using word match rule. Match words in question with the words in stop free sentence

        #print 'Sentence is :',sentence_list[i]
        score=score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        # 2. If question does not contain name but the answer contains NAME then you are confident(+6)
        if q_person_list==[]:

            #Giving more weights to sentences having more names in it
            if sent_person_list[i] !=[] or sent_prof_list[i] !=[]:
                #score=score + 6*len(sent_person_list) + 6* len(sent_prof_list)
                score=score + 6

            # 3. If question does not contain a name and answer contains the word "name" then good_clue (+4)
            lmtzr = WordNetLemmatizer()
            temp= complete_sentence_list[i].split()
            for k in range(0,len(temp)):
                if lmtzr.lemmatize(temp[k].lower())=='name':
                    score=score + 4

            #  4. Awards points to all sentences  that contain a name or reference to a human

            # NOTE(review): same condition as rule 2 above; the statement that
            # awarded the points is missing, so the weight is unknown.
            if sent_person_list[i] !=[] or sent_prof_list[i] !=[]:
                #score=score + 4*len(sent_person_list) + 4* len(sent_prof_list)

        # 5. If the answer contains the exact verb found in the question after the "Who" or in fact in the whole question
        # then it is a confident clue and we reward it more

        sent_pos_list= POS_Tagging.pos_tagging(complete_sentence_list[i])

        '''for m in range(0, len(sent_pos_list)):
            if sent_pos_list[m][1] in ['VB','VBD','VBN','VBG','VBZ'] and sent_pos_list[m][0] in stop_words_free_question.split():
                score=score + 18
                #print 'Score now is :', score'''

        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(sent_pos_list[k][0],'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score=score + 6

        # 6. If the question contains a profession name, the answer has to be a person and sentence would have
        #the person name and the profession

        # NOTE(review): both branches of this rule have lost their score
        # statements; the intended weights cannot be read from this copy.
        if q_prof_list!=[]:
            for k in complete_sentence_list[i].split():
                if k.lower() in q_prof_list:
                    #print 'Profession Yes !'

        else:  #Question contains name so the chances of answer being a profession name are decent
            if sent_prof_list[i] !=[]:


    #print 'Sent score list is :',sent_score_list

    # Selecting the sentence that has the maximum score. If it is a tie, we choose the sentence that appears first



    #print 'Max score is :',max_score_value

    # Candidates are indexed below as candidate_list[k][0] (sentence text) and
    # candidate_list[k][1] (sentence index); the append itself is missing.
    for i in range(0, len(complete_sentence_list)):
        if sent_score_list[i]==max_score_value:
    #print 'Candidate list is :',candidate_list

    #If there is only one sentence, then choose the sentence and then do the processing to display the answer

    if len(candidate_list)==1:

        temp_str= candidate_list[0][0]
        #Cleaning up the candidate sentence
        # Replacing double quotes with blank and single quotes with "

    # If there are multiple candidates, then choose the sentence which appeared first in the story  and then do the processing
        # There are more than one candidate sentences. Print the first sentence
        for k in range(0, len(candidate_list)):

            #Cleaning up the candidate sentence

            index =candidate_list[k][1]


    ####################### SENTENCE PROCESSING TO FIND THE ANSWER ###############################

    #Just pick out the noun-phrase or PERSON names from the sentence


    #print 'Prof list is:',s_proflist

    #If the question has a name of person, then the answer sentence should/would most probably
    #the name of a person but it should not be the name of the person appearing in the question.
    #If we can't find any other name in the candidate sentence then we do POS tagging and display the NOUN phrases

    #print 'Question person list is:',q_person_list
    #print 'Sentence person list is:',s_plist


    # NOTE(review): `k not in temp_q` below is a substring test against the
    # question string, not a whole-word test against its tokens.
    if q_person_list==[] and s_plist==[]:   #If both the question does not have a name and the sentence does not have a name,print the whole sentence minus words which appear in question

        '''pos_np_list= POS_Tagging.pos_noun_tagging(temp_str)
        if pos_np_list != []:
            for x in pos_np_list:
                if x not in temp_q and x[0].isupper():   #Noun phrases or names generally start with an upper case character
                    print 'First character caps',x
            return ' '.join(result_list)'''

        for k in temp_str.split():
            if k not in temp_q:

        return ' '.join(result_list)

    elif q_person_list !=[] and s_plist !=[]:    #To counter situations when both question and sentence has names Ex. Who defeated who ?
        for k in s_plist:
            if k not in temp_q:

    elif q_person_list==[] and s_plist !=[]:
        for i in range(0, len(s_plist)):
            if s_plist[i] not in q_person_list and s_plist[i] not in temp_q:  #To counter situations where question has a name and NER doesn't identify it

    elif q_person_list != [] and s_proflist !=[]:  #To counter situations for 'Who is X' type questions which could have a profession name in the answer
        for k in s_proflist:

    # NOTE(review): unreachable as written - every q_person_list==[] case is
    # already taken by the first (s_plist==[]) or third (s_plist!=[]) branch.
    elif q_person_list==[] and q_loc_list !=[]: # Who is <X> where ?
        #print 'Question has no name but has a location'
        for k in temp_str.split():
            if k not in temp_q:
        if q_loc_who_list !=[]:
            return ' '.join(q_loc_who_list)

    '''elif q_person_list==[] and s_proflist !=[]:
        for k in s_proflist:

    if answer_list != [] :#and flag==1:                #Indicating candidate sentence has a name other than that in question
        result= ' '.join(answer_list)

        #Pick out the noun phrase or nouns and then display them as answer

        np_list = POS_Tagging.pos_noun_tagging(temp_str)
        for x in np_list :
            if x not in temp_q:
                npfinal_list.append(x) #Removing all occurences of existing noun phrases from the question

        #print 'NP Final list after removal is',npfinal_list
        if npfinal_list !=[]:
            result=' '.join(npfinal_list)

            result=temp_str                  # Printing out the whole sentence

    #print 'Result is:',result
    return result
def answering_what(cleansedQuestion,stop_words_free_question,complete_sentence_list,sentence_list,sent_time_list,sent_person_list):
    # Scores every story sentence against a "what" question and answers from
    # the best-scoring sentence(s), returning the sentence words that are not
    # question tokens.
    #
    # Visible scoring rules, per sentence i:
    #   * adds WM.stemWordMatch(cleansedQuestion, sentence_list[i])
    #   * +4 for every question token found in what_month when
    #     sent_time_list[i] is non-empty
    #   * +6 for every VB/VBD/VBZ/VBN token whose verb lemma is in q_verblist
    # For questions of at most six tokens (len(temp_list) <= 6) a sentence
    # containing '(', '--' or '{' plus a temp_list token makes the function
    # return a slice of that sentence immediately.
    #
    # NOTE(review): this copy of the function is incomplete and does not parse.
    # Names that are read but never assigned in the visible text: temp_q,
    # abbreviation_list, lmtzr, stanford_stop_words_list, q_verblist, score,
    # sent_pos_list, temp_list, end_index, sent_score_list, final_sent_list,
    # answer_list, result. Several if/for headers have no body (including the
    # "kind" and "name" rules, whose weights are therefore unknown) and the
    # ''' markers further down are unbalanced, so live code and disabled code
    # are swapped for part of the body. Restore from the upstream file.
    # NOTE(review): `start_index = ...index('--') + 1` points at the second
    # '-' of the pair, unlike the one-character '(' and '{' cases.
    # NOTE(review): `if k !=0 or k!=len(temp)-1` near the end is always true,
    # so temp[k+1] can run past the last token when that token is 'per'.

    # Declaring globals to be used in this function



    # Year strings '1400' .. '1999'. Not read anywhere in the visible code.
    what_year=['1400', '1401', '1402', '1403', '1404', '1405', '1406', '1407', '1408', '1409', '1410', '1411', '1412', '1413', '1414', '1415', '1416', '1417', '1418', '1419', '1420', '1421', '1422', '1423', '1424', '1425', '1426', '1427', '1428', '1429', '1430', '1431', '1432', '1433', '1434', '1435', '1436', '1437', '1438', '1439', '1440', '1441', '1442', '1443', '1444', '1445', '1446', '1447', '1448', '1449', '1450', '1451', '1452', '1453', '1454', '1455', '1456', '1457', '1458', '1459', '1460', '1461', '1462', '1463', '1464', '1465', '1466', '1467', '1468', '1469', '1470', '1471', '1472', '1473', '1474', '1475', '1476', '1477', '1478', '1479', '1480', '1481', '1482', '1483', '1484', '1485', '1486', '1487', '1488', '1489', '1490', '1491', '1492', '1493', '1494', '1495', '1496', '1497', '1498', '1499', '1500', '1501', '1502', '1503', '1504', '1505', '1506', '1507', '1508', '1509', '1510', '1511', '1512', '1513', '1514', '1515', '1516', '1517', '1518', '1519', '1520', '1521', '1522', '1523', '1524', '1525', '1526', '1527', '1528', '1529', '1530', '1531', '1532', '1533', '1534', '1535', '1536', '1537', '1538', '1539', '1540', '1541', '1542', '1543', '1544', '1545', '1546', '1547', '1548', '1549', '1550', '1551', '1552', '1553', '1554', '1555', '1556', '1557', '1558', '1559', '1560', '1561', '1562', '1563', '1564', '1565', '1566', '1567', '1568', '1569', '1570', '1571', '1572', '1573', '1574', '1575', '1576', '1577', '1578', '1579', '1580', '1581', '1582', '1583', '1584', '1585', '1586', '1587', '1588', '1589', '1590', '1591', '1592', '1593', '1594', '1595', '1596', '1597', '1598', '1599', '1600', '1601', '1602', '1603', '1604', '1605', '1606', '1607', '1608', '1609', '1610', '1611', '1612', '1613', '1614', '1615', '1616', '1617', '1618', '1619', '1620', '1621', '1622', '1623', '1624', '1625', '1626', '1627', '1628', '1629', '1630', '1631', '1632', '1633', '1634', '1635', '1636', '1637', '1638', '1639', '1640', '1641', '1642', '1643', '1644', '1645', '1646', '1647', 
'1648', '1649', '1650', '1651', '1652', '1653', '1654', '1655', '1656', '1657', '1658', '1659', '1660', '1661', '1662', '1663', '1664', '1665', '1666', '1667', '1668', '1669', '1670', '1671', '1672', '1673', '1674', '1675', '1676', '1677', '1678', '1679', '1680', '1681', '1682', '1683', '1684', '1685', '1686', '1687', '1688', '1689', '1690', '1691', '1692', '1693', '1694', '1695', '1696', '1697', '1698', '1699', '1700', '1701', '1702', '1703', '1704', '1705', '1706', '1707', '1708', '1709', '1710', '1711', '1712', '1713', '1714', '1715', '1716', '1717', '1718', '1719', '1720', '1721', '1722', '1723', '1724', '1725', '1726', '1727', '1728', '1729', '1730', '1731', '1732', '1733', '1734', '1735', '1736', '1737', '1738', '1739', '1740', '1741', '1742', '1743', '1744', '1745', '1746', '1747', '1748', '1749', '1750', '1751', '1752', '1753', '1754', '1755', '1756', '1757', '1758', '1759', '1760', '1761', '1762', '1763', '1764', '1765', '1766', '1767', '1768', '1769', '1770', '1771', '1772', '1773', '1774', '1775', '1776', '1777', '1778', '1779', '1780', '1781', '1782', '1783', '1784', '1785', '1786', '1787', '1788', '1789', '1790', '1791', '1792', '1793', '1794', '1795', '1796', '1797', '1798', '1799', '1800', '1801', '1802', '1803', '1804', '1805', '1806', '1807', '1808', '1809', '1810', '1811', '1812', '1813', '1814', '1815', '1816', '1817', '1818', '1819', '1820', '1821', '1822', '1823', '1824', '1825', '1826', '1827', '1828', '1829', '1830', '1831', '1832', '1833', '1834', '1835', '1836', '1837', '1838', '1839', '1840', '1841', '1842', '1843', '1844', '1845', '1846', '1847', '1848', '1849', '1850', '1851', '1852', '1853', '1854', '1855', '1856', '1857', '1858', '1859', '1860', '1861', '1862', '1863', '1864', '1865', '1866', '1867', '1868', '1869', '1870', '1871', '1872', '1873', '1874', '1875', '1876', '1877', '1878', '1879', '1880', '1881', '1882', '1883', '1884', '1885', '1886', '1887', '1888', '1889', '1890', '1891', '1892', '1893', '1894', '1895', '1896', '1897', 
'1898', '1899', '1900', '1901', '1902', '1903', '1904', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913', '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999']

    # Month names and three-letter abbreviations, lower case ('may' is listed
    # twice because its abbreviation equals the full name).
    what_month=['january','jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may','may', 'june', 'jun', 'july', 'jul','august','aug','september','sep','october','oct','november','nov','december','dec']



    # NOTE(review): the body of this test (the abbreviation replacement) is
    # missing; `in` against a string is a substring test, not token equality.
    for k in temp_q.split():
        if k in abbreviation_list[0][0]:

    #print 'Question is :',temp_q

    # Named entities found in the question; none of the eight lists is read by
    # live code in the visible text.
    q_person_list,q_org_list,q_loc_list,q_month_list,q_time_list,q_money_list,q_percent_list,q_prof_list = NER.named_entity_recognition(temp_q)

    pos_list= POS_Tagging.pos_tagging(temp_q)

    # Intended to collect the question's content verbs; the statement that
    # stores them is missing.
    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(pos_list[i][0],'v') not in stanford_stop_words_list:

    #print 'Question verb list is :',q_verblist

    for i in range(0,len(complete_sentence_list)):

        #print complete_sentence_list[i]
        # 1. Word Match scoring function for each of the sentences
        score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])

        #print 'Score after wordmatch is :',score
        #2. Check if the question contains a month of the year and sentence contains date expression,then it is a clue
        for k in temp_q.split():
            if k.lower() in what_month:
                if sent_time_list[i] != []:
                    score=score + 4

                #print 'Score after Rule 2 is :',score
            # 3. What "kind" questions. Sentences containing "call" or "from"
            elif k.lower() =='kind':
                for m in complete_sentence_list[i].split():
                    if lmtzr.lemmatize(m,'v') in ['call','from']:
                #print 'Score after Rule 3 is :',score

            # 4. If question contains "name" and the sentence contains {name,call,known}

            elif k.lower() =='name':
                for m in complete_sentence_list[i].split():
                    if lmtzr.lemmatize(m,'v')  in ['name','call','known']:

                #print 'Score after Rule 4 is :',score

        '''if q_person_list !=[]:
            if sent_person_list[i] !=[]:
        #print 'Score after Rule 4 is :',score

        #5. If question contains name + PP and contains(S,ProperNoun) and Head PP

        '''if j != len(temp) -1 and temp[j]=='name' and temp[j+1] in ['of','for']:
             person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(sentence_list[i])
             if person_list != []:
                 #TODO Check if it also contains (proper_noun,head(PP))
                 score=score +20'''

         # 6.  Reward sentences which has the verb appearing in the question in its sentence


        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in ['VB','VBD','VBZ','VBN'] and lmtzr.lemmatize(sent_pos_list[k][0],'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score=score + 6

        # 7. Definition type questions or what is X or what are X  questions ?


        if len(temp_list) <= 6:
            if '(' in complete_sentence_list[i]:
                for k in temp_list:
                    if k in complete_sentence_list[i].split():
                        start_index= complete_sentence_list[i].index('(') + 1
                        return complete_sentence_list[i][start_index:end_index]

            elif '--' in complete_sentence_list[i]:
                for k in temp_list:
                    if k in complete_sentence_list[i].split():
                        start_index= complete_sentence_list[i].index('--') + 1
                        return complete_sentence_list[i][start_index:end_index]
            elif '{' in complete_sentence_list[i]:
                for k in temp_list:
                    if k in complete_sentence_list[i].split():
                        start_index= complete_sentence_list[i].index('{') + 1
                        return complete_sentence_list[i][start_index:end_index]

            # If the question contains "sport" related terms, answer should also have sport related terms
            '''if temp[j].lower() in ['sports','games','olympics']:
                for k in range(0,len(temp2)):
                    if snowball_stemmer.stem(temp2[k]) in ['soccer','hockey','baseball','cricket','rugby','ultimate']:

            # If the sentence contains a  "country" name and the sentence contains a LOCATION, then it is confident score
            '''if temp[j].lower() in ['country','countries','olympics']:
                person_list,org_list,loc_list,time_list,prof_list = NET.named_entity_tagging(sentence_list[i])
                if loc_list != []:
                    score=score + 6*len(loc_list)'''  # Confidence score increases with increasing number of countries appearing in the sentence.


    #print 'Sent score list values are:',sent_score_list

    # Selecting the sentence that has the maximum score.

    max_score_value =max(sent_score_list)

    # Now we have to choose the best sentence among the sentences in candidate list.Choosing sentences
    # which have both maximum value and present in candidate list

    for i in range(0, len(sent_score_list)):
         if sent_score_list[i]==max_score_value:

    #print 'Final list is:', final_sent_list


    if len(final_sent_list) == 1:
        temp= final_sent_list[0].split()
        '''for k in range(0,len(temp)):
            if temp[k].lower() =='to':
                return ' '.join(temp[k:])'''

        #print temp_q.split()
        for k in range(0,len(temp)):

            if k !=0 or k!=len(temp)-1:
                if temp[k].lower()=='per' and temp[k+1].lower()=='cent':
                    return ' '.join(temp[k-1:k+2])

            if temp[k] not in temp_q.split():
                #print temp[k]

        return ' '.join(answer_list)


        for i in range(0,len(final_sent_list)):

        temp= result.split()
        '''for k in range(0,len(temp)):
            if temp[k].lower() =='to':
                return ' '.join(temp[k:])

        for k in range(0, len(temp)):
            if temp[k] not in temp_q.split():

        return ' '.join(answer_list)
# Example no. 3
    return [
        taggeditem[0] for taggeditem in all_tagged_tokens
        if taggeditem[1][int(numchars) - 1] in alloweditems

# Lemmatise every article in all_filepaths; when POS tagging is enabled the
# (lemma, tag) pairs are then filtered by tag and cleaned of spam items.
for i in range(len(all_filepaths)):
    lemmatized_text = []
    text = et1.get_article_text(all_filepaths[i])  # get article text
    #print text
    tokenize_sentences = et1.tokenize_sent_word(text)  # tokenize sentences
    # check if tagging is needed
    if pos_tagging_needed in ['True', 'yes', 'y', 'Y', 'Yes']:
        text_tok = pos_t.tag_tokennized_text(tokenize_sentences)
        for s in text_tok:
            for word, POStag in s:
                lemmatized = wordnet_lemmatizer.lemmatize(word), POStag
                # Keep the (lemma, tag) pair: filter_pos_tags below reads
                # lemmatized_text, which previously stayed empty.
                lemmatized_text.append(lemmatized)
    else:
        # Untagged path: runs only when tagging is off, so bare lemmas are
        # never mixed into the tagged list.
        for s in tokenize_sentences:
            for word in s:
                lemmatized = wordnet_lemmatizer.lemmatize(word)
                lemmatized_text.append(lemmatized)

    ### filtering
    if pos_tagging_needed in ['True', 'yes', 'y', 'Y', 'Yes']:
        filtered_pos_tokens = filter_pos_tags(lemmatized_text)
        cleaned_text = fs.filter_spam_items(filtered_pos_tokens)
# Example no. 7
def answering_how(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, sent_time_list,
                  sent_percent_list=None):
    """Return the answer text for a "how ..." question.

    Every sentence is scored against the question, the first sentence with
    the highest score is selected, and the answer is cut out of it.

    Scoring, per sentence:
      1. the value of ``WM.stemWordMatch(cleansedQuestion, sentence_list[i])``
      2. question has "many": +6 per sentence token that is numeric or in
         ``many_list``
      3. question has "much": +6 per token that is a money word or in
         ``much_list``
      4. question has "often" or "long": +10 per token in ``how_often``

    Answer extraction from the selected sentence:
      * "<token> per cent" when the sentence contains "per cent";
      * for "many"/"much" questions, the numeric / quantity tokens, or the
        whole sentence when there are none;
      * otherwise the sentence tokens that are not question tokens.

    Args:
        cleansedQuestion: the question text.
        stop_words_free_question: unused by this function.
        complete_sentence_list: the story sentences; answers are taken from
            these.
        sentence_list: per-sentence text handed to ``WM.stemWordMatch``;
            indexed in parallel with ``complete_sentence_list``.
        sent_time_list: unused by this function.
        sent_percent_list: unused by this function.
            NOTE(review): the line carrying the trailing parameter(s) is
            missing from this copy of the file; the name and the ``None``
            default are a reconstruction - confirm against the call sites.

    Returns:
        The answer string; ``''`` when there are no sentences to score.
    """
    # NOTE(review): the copy this was restored from is cut off after 'less';
    # further entries may exist upstream.
    much_list = [
        'thousand', 'thousands', 'hundred', 'hundreds', 'dollars', 'cents',
        'million', 'billion', 'trillion', 'none', 'nothing', 'everything',
        'few', 'something', 'dollars', 'grams', 'kilos', 'kilogram',
        'kilograms', 'milligrams', 'mg', 'metre', 'centimetre', 'inches',
        'feet', 'foot', 'ft', 'cent', 'percent', 'salary', 'pay', 'income',
        'loss', 'profit', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'ten', 'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety', 'hour', 'hours', 'minutes',
        'seconds', 'second', 'minute', 'half', 'quarter', 'more', 'less',
    ]

    # NOTE(review): likewise cut off after 'billion' in the restored copy.
    many_list = [
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
        'eighty', 'ninety', 'hundred', 'thousand', 'million', 'billion',
    ]

    # 'millennium' and 'day' are separate entries; without the comma Python
    # concatenates the adjacent literals into 'millenniumday'.
    how_often = [
        'daily', 'weekly', 'bi-weekly', 'fortnightly', 'monthly', 'bi-monthly',
        'quarterly', 'half-yearly', 'yearly', 'decade', 'millennium',
        'day', 'everyday', 'night', 'afternoon', 'noon', 'hourly', 'hours',
        'minutes', 'seconds', 'second', 'minute',
    ]
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

    abbreviation_list = [('Mt.', 'Mount')]

    ########################### QUESTION PROCESSING ##################

    temp_q = cleansedQuestion.replace('?', '')

    # Expand an abbreviation only when it appears as a whole token. A
    # substring test against 'Mt.' would also fire for tokens such as 'M' or
    # 't' and then rewrite every such character in the question.
    for short_form, long_form in abbreviation_list:
        if short_form in temp_q.split():
            temp_q = temp_q.replace(short_form, long_form)

    question_words = temp_q.split()
    question_words_lower = [word.lower() for word in question_words]

    # No scoring rule below consumes the question's verbs, so the question
    # is not POS-tagged here.

    ################## SENTENCE PROCESSING AND SCORING ###################

    sent_score_list = []
    for i in range(len(complete_sentence_list)):
        # 1. Word-match score.
        score = WM.stemWordMatch(cleansedQuestion, sentence_list[i])
        sentence_words = complete_sentence_list[i].split()

        # Rules 2-4 are applied once per matching question token.
        for key in question_words_lower:
            if key == 'many':
                # 2. "many" + an expression of number.
                for m in sentence_words:
                    if nums.match(m) or m in many_list:
                        score = score + 6
            elif key == 'much':
                # 3. "much" + an expression of money or quantity.
                for m in sentence_words:
                    if m.lower() in ('money', 'earn', 'salary', 'profit',
                                     'loss') or m in much_list:
                        score = score + 6
            elif key == 'often' or key == 'long':
                # 4. "often"/"long" + an expression of time.
                for m in sentence_words:
                    if m in how_often:
                        score = score + 10

        sent_score_list.append(score)

    if not sent_score_list:
        # Nothing to choose from; max() would raise on an empty list.
        return ''

    max_score_value = max(sent_score_list)

    # All sentences that reach the best score, in story order.
    final_sent_list = [
        complete_sentence_list[i]
        for i, value in enumerate(sent_score_list)
        if value == max_score_value
    ]

    # With one or several candidates the sentence that comes first is used.
    best_sentence = final_sent_list[0]

    # Drop the full stop only when the sentence's first '.' is its last
    # character. find() keeps that rule but, unlike index(), does not raise
    # for a sentence that has no '.' at all.
    if best_sentence.find('.') == len(best_sentence) - 1:
        temp2 = best_sentence[:-1].split()
    else:
        temp2 = best_sentence.split()

    # "<x> per cent" is most probably the answer to a much/many question.
    # 'per' must have a token on both sides, so only interior positions are
    # examined (no IndexError on a trailing 'per', no negative slice start).
    for k in range(1, len(temp2) - 1):
        if temp2[k].lower() == 'per' and temp2[k + 1].lower() == 'cent':
            return ' '.join(temp2[k - 1:k + 2])

    # Case-insensitive, to agree with the scoring rules above.
    if 'many' in question_words_lower:
        quantity_words = many_list
    elif 'much' in question_words_lower:
        quantity_words = much_list
    else:
        quantity_words = None

    if quantity_words is not None:
        temp_solution = [
            word for word in temp2
            if nums.match(word) or word in quantity_words
        ]
        if temp_solution:
            return ' '.join(temp_solution)
        return ' '.join(temp2)

    # Default: the sentence minus the words already present in the question.
    temp_result = [word for word in temp2 if word not in question_words]
    return ' '.join(temp_result)
# Example no. 8
def answering_where(cleansedQuestion, stop_words_free_question,
                    complete_sentence_list, sentence_list, dateline,
                    sent_loc_list):
    """Return the answer text for a "where ..." question.

    Every sentence is scored against the question. The best sentence score
    is then compared with a "dateline score" computed from the question
    alone; the answer comes from the best sentence when its score is
    strictly higher, and is ``dateline`` otherwise.

    Sentence scoring, per sentence:
      1. the value of
         ``WM.stemWordMatch(stop_words_free_question, sentence_list[i])``
      2. +4 for every token that is a location preposition
      3. +6 when ``sent_loc_list[i]`` is non-empty
      4. +6 when both the question and the sentence contain "from"
      5. +6 for every VB/VBD/VBZ/VBN token whose verb lemma also occurs among
         the question's verbs

    Args:
        cleansedQuestion: the question text.
        stop_words_free_question: question text handed to
            ``WM.stemWordMatch``.
        complete_sentence_list: the story sentences; answers are taken from
            these.
        sentence_list: per-sentence text handed to ``WM.stemWordMatch``;
            indexed in parallel with ``complete_sentence_list``.
        dateline: returned as the answer when no sentence beats the dateline
            score.
        sent_loc_list: per-sentence lists of location strings, indexed in
            parallel with ``sentence_list``.

    Returns:
        The answer string.
    """
    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'for', 'from', 'has',
        'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
        'were', 'will', 'with'
    ]

    location_prepositions = [
        'above', 'across', 'after', 'against', 'along', 'among', 'around',
        'before', 'behind', 'below', 'beneath', 'beside', 'between', 'by',
        'down', 'from', 'in', 'inside', 'into', 'near', 'off', 'onto',
        'opposite', 'outside', 'over', 'surrounding', 'round', 'through',
        'towards', 'under', 'up'
    ]

    abbreviation_list = [('Mt.', 'Mount')]
    verb_tags = ('VB', 'VBD', 'VBZ', 'VBN')

    ########################### QUESTION PROCESSING ##################

    # Double quotes are dropped, single quotes become double quotes.
    temp_q = cleansedQuestion.replace('"', '')
    temp_q = temp_q.replace("'", '"')
    temp_q = temp_q.replace('?', '')

    # Expand an abbreviation only when it appears as a whole token. A
    # substring test against 'Mt.' would also fire for tokens such as 'M' or
    # 't' and then rewrite every such character in the question.
    for short_form, long_form in abbreviation_list:
        if short_form in temp_q.split():
            temp_q = temp_q.replace(short_form, long_form)

    question_words = temp_q.split()

    # Lemmatised question verbs that are not stop words.
    lmtzr = WordNetLemmatizer()
    q_verblist = []
    for tagged in POS_Tagging.pos_tagging(temp_q):
        if tagged[1] in verb_tags:
            lemma = lmtzr.lemmatize(tagged[0], 'v')
            if lemma not in stanford_stop_words_list:
                q_verblist.append(lemma)

    # The same for every sentence, so computed once.
    question_has_from = any(word.lower() == 'from' for word in question_words)

    ############ DATELINE SCORE AND FIRST-SENTENCE SHORTCUT ############

    # For when/where questions the answer may also be the story's dateline.
    # This part depends on the question only, so it is evaluated before the
    # per-sentence scoring.
    dateline_score = 0
    first_sentence_flag = 0
    temp_list = cleansedQuestion.split()

    has_where = any(word.lower() == 'where' for word in temp_list)

    for i in range(len(temp_list)):
        token = temp_list[i].lower()

        # 1. "happen" is a good clue that the dateline could be the answer.
        if token == 'happen':
            dateline_score = dateline_score + 4

        # 2. So is "take place".
        if (i != len(temp_list) - 1 and token == 'take'
                and temp_list[i + 1].lower() == 'place'):
            dateline_score = dateline_score + 4

        # 3. "this" without "where": slam dunk, and answer from the first
        #    sentence.
        if token == 'this' and not has_where:
            dateline_score = dateline_score + 20
            first_sentence_flag = 1

        # 4. "story" without "where": slam dunk.
        if token == 'story' and not has_where:
            dateline_score = dateline_score + 20

    if first_sentence_flag == 1 and complete_sentence_list:
        # Answer with the first sentence: its proper nouns when it has any
        # (minus those already in the question), else the sentence itself.
        pos_np_list = POS_Tagging.pos_NNP_tagging(complete_sentence_list[0])
        if pos_np_list != []:
            first_list = [k for k in pos_np_list if k not in question_words]
            return ' '.join(first_list)
        return complete_sentence_list[0]

    ################## SENTENCE PROCESSING AND SCORING ###################

    sent_score_list = []
    for i in range(len(sentence_list)):
        # 1. Word-match score.
        score = WM.stemWordMatch(stop_words_free_question, sentence_list[i])

        sentence_words = complete_sentence_list[i].split()

        # 2. Location prepositions are a good clue.
        for k in sentence_words:
            if k in location_prepositions:
                score = score + 4

        # 3. The sentence contains a location entity.
        if sent_loc_list[i] != []:
            score = score + 6

        # 4. "from" in both the question and the sentence.
        if question_has_from and 'from' in sentence_words:
            score = score + 6

        # 5. Verbs shared with the question.
        for tagged in POS_Tagging.pos_tagging(complete_sentence_list[i]):
            if (tagged[1] in verb_tags
                    and lmtzr.lemmatize(tagged[0], 'v') in q_verblist):
                score = score + 6

        sent_score_list.append(score)

    if not sent_score_list:
        # No sentence to compete with the dateline; max() would raise.
        return dateline

    max_score_value = max(sent_score_list)

    # Candidates: (sentence, index) for every sentence with the best score.
    candidate_list = [
        (complete_sentence_list[i], i)
        for i in range(len(sentence_list))
        if sent_score_list[i] == max_score_value
    ]

    # The sentence wins only when it scores strictly higher than the
    # dateline; otherwise the dateline is the answer.
    if max_score_value <= dateline_score:
        return dateline

    # With one or several candidates, the one that appears first in the
    # story is used.
    temp_str, index = candidate_list[0]

    # Clean the candidate the same way as the question so that tokens
    # compare equal: drop double quotes, turn single quotes into double
    # quotes, and strip , ? !
    temp_str = temp_str.replace('"', '')
    temp_str = temp_str.replace("'", '"')
    temp_str = temp_str.replace(',', '').replace('?', '').replace('!', '')

    ################### ANSWER EXTRACTION #######################

    # Prefer the sentence's locations that the question does not already
    # mention (this also covers locations in the question that NER missed).
    answer_list = [
        location for location in sent_loc_list[index]
        if location not in question_words
    ]
    if answer_list != []:
        return ' '.join(answer_list)

    # No usable location: the sentence minus the words of the question.
    temp_result = [k for k in temp_str.split() if k not in question_words]
    return ' '.join(temp_result)
# Example no. 10
def answering_who(cleansedQuestion, stop_words_free_question,
                  complete_sentence_list, sentence_list, sent_person_list,

    # Purpose: answer a "who" question. Every sentence is scored against the
    # question with the numbered rules below, the top-scoring sentence is
    # chosen, and the answer words are then extracted from that sentence.
    #
    # NOTE(review): this copy of the function has lost lines. The parameter
    # list above is never closed (the body also reads `sent_prof_list`, so a
    # final `sent_prof_list):` line is presumably missing), and several
    # statements further down have no body. Restore them before running.
    #
    # Parameters, as used by the visible code:
    #   cleansedQuestion         -- question string; '?' is stripped, then it
    #                               is POS-tagged; the unstripped value is
    #                               passed to WM.stemWordMatch.
    #   stop_words_free_question -- only referenced inside a disabled
    #                               (triple-quoted) block below.
    #   complete_sentence_list   -- sentence strings; indexed by i, split on
    #                               whitespace and POS-tagged.
    #   sentence_list            -- indexed in parallel with
    #                               complete_sentence_list and passed to
    #                               WM.stemWordMatch (presumably the
    #                               stop-word-free form -- confirm with caller).
    #   sent_person_list         -- per-sentence lists, indexed like
    #                               complete_sentence_list (presumably person
    #                               names found by NER -- confirm).
    #   sent_prof_list           -- per-sentence lists (presumably profession
    #                               words -- confirm).
    #
    # Returns: a string, either selected words joined with single spaces or,
    # as the last fallback, the whole chosen sentence.

    # Local working state (these are plain locals, not `global` declarations)

    sent_score_list = []
    q_verblist = []

    # Question verbs whose lemma is in this list are ignored when q_verblist
    # is built below.
    # NOTE(review): the closing `]` of this literal is missing in this copy.
    stanford_stop_words_list = [
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'buy', 'do', 'for', 'from',
        'has', 'have', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that',
        'the', 'to', 'was', 'were', 'will', 'with'

    # temp_q stays a plain string (the question without '?'); later
    # `x not in temp_q` checks are therefore substring tests.
    temp_q = cleansedQuestion
    temp_q = temp_q.replace('?', '')

    lmtzr = WordNetLemmatizer()
    pos_list = POS_Tagging.pos_tagging(temp_q)

    # Collect the verb lemma of every question token tagged VB/VBD/VBZ/VBN,
    # unless that lemma is in the stop list above.
    for i in range(0, len(pos_list)):
        if pos_list[i][1] in ['VB', 'VBD', 'VBZ', 'VBN'] and lmtzr.lemmatize(
                pos_list[i][0], 'v') not in stanford_stop_words_list:
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0], 'v'))

    #print 'Temp_q: ',temp_q

    # Of the eight lists unpacked here, the visible code only reads
    # q_person_list, q_loc_list and q_prof_list.
    # NOTE(review): the call's argument and closing parenthesis are missing in
    # this copy; the copy of this function at the top of the file passes temp_q.
    q_person_list, q_org_list, q_loc_list, q_month_list, q_time_list, q_money_list, q_percent_list, q_prof_list = NER.named_entity_recognition(

    # Score each sentence; `score` restarts at 0 for every sentence.
    for i in range(0, len(complete_sentence_list)):
        #print 'Sentence is :', complete_sentence_list[i]
        score = 0

        # 1. Score using word match rule. Match words in question with the words in stop free sentence

        #print 'Sentence is :',sentence_list[i]
        score = score + WM.stemWordMatch(cleansedQuestion, sentence_list[i])

        # 2. If question does not contain name but the answer contains NAME then you are confident(+6)
        if q_person_list == []:

            #Giving more weights to sentences having more names in it
            if sent_person_list[i] != [] or sent_prof_list[i] != []:
                #score=score + 6*len(sent_person_list) + 6* len(sent_prof_list)
                score = score + 6

            # 3. If question does not contain a name and answer contains the word "name" then good_clue (+4)
            # The +4 is added once per token whose lowercase lemma is 'name'.
            # A second lemmatizer is created here although `lmtzr` already
            # exists from the question processing above.
            lmtzr = WordNetLemmatizer()
            temp = complete_sentence_list[i].split()
            for k in range(0, len(temp)):
                if lmtzr.lemmatize(temp[k].lower()) == 'name':
                    score = score + 4

            #  4. Awards points to all sentences  that contain a name or reference to a human
            # Same condition as rule 2, so such a sentence gains 6 + 4 = 10
            # in total whenever the question has no person name.

            if sent_person_list[i] != [] or sent_prof_list[i] != []:
                #score=score + 4*len(sent_person_list) + 4* len(sent_prof_list)
                score = score + 4

        # 5. If the answer contains the exact verb found in the question after the "Who" or in fact in the whole question
        # then it is a confident clue and we reward it more

        sent_pos_list = POS_Tagging.pos_tagging(complete_sentence_list[i])
        '''for m in range(0, len(sent_pos_list)):
            if sent_pos_list[m][1] in ['VB','VBD','VBN','VBG','VBZ'] and sent_pos_list[m][0] in stop_words_free_question.split():
                score=score + 18
                #print 'Score now is :', score'''

        # +6 for every sentence verb (VB/VBD/VBZ/VBN) whose lemma is one of
        # the question verb lemmas collected in q_verblist.
        for k in range(0, len(sent_pos_list)):
            if sent_pos_list[k][1] in [
                    'VB', 'VBD', 'VBZ', 'VBN'
            ] and lmtzr.lemmatize(sent_pos_list[k][0], 'v') in q_verblist:
                #print 'Verb in question and sentence matches'
                score = score + 6

        # 6. If the question contains a profession name, the answer has to be a person and sentence would have
        #the person name and the profession
        # +18 for every sentence word whose lowercase form is in q_prof_list.

        if q_prof_list != []:
            for k in complete_sentence_list[i].split():
                if k.lower() in q_prof_list:
                    #print 'Profession Yes !'
                    score = score + 18

        # NOTE(review): this `else` pairs with `if q_prof_list != []`, so it
        # runs when the question has NO profession word; the trailing comment
        # about the question containing a name does not match that condition.
        else:  #Question contains name so the chances of answer being a profession name are decent
            if sent_prof_list[i] != []:
                score = score + 6

        # NOTE(review): nothing visible appends `score` to sent_score_list.
        # If that statement is really absent, max(sent_score_list) below
        # raises ValueError on the empty list.

    #print 'Sent score list is :',sent_score_list

    # Selecting the sentence that has the maximum score. If it is a tie, we choose the sentence that appears first

    candidate_list = []
    npfinal_list = []
    temp_list = []
    answer_list = []

    max_score_value = max(sent_score_list)

    #print 'Max score is :',max_score_value

    # candidate_list holds (sentence, index) pairs for every sentence that
    # reached the maximum score, in story order.
    for i in range(0, len(complete_sentence_list)):
        if sent_score_list[i] == max_score_value:
            candidate_list.append((complete_sentence_list[i], i))
    #print 'Candidate list is :',candidate_list

    #If there is only one sentence, then choose the sentence and then do the processing to display the answer

    if len(candidate_list) == 1:

        temp_str = candidate_list[0][0]
        index = candidate_list[0][1]
        #Cleaning up the candidate sentence
        # Replacing double quotes with blank and single quotes with "
        # NOTE(review): the clean-up statements announced above are not
        # present in this copy, and an `else:` appears to be missing before
        # the multi-candidate loop below (it is indented as part of this `if`).

    # If there are multiple candidates, then choose the sentence which appeared first in the story  and then do the processing
        # There are more than one candidate sentences. Print the first sentence
        # NOTE(review): as written, the loop overwrites temp_str/index on
        # every pass and so ends on the LAST candidate, not the first; a
        # guard or `break` may have been lost.
        for k in range(0, len(candidate_list)):

            #Cleaning up the candidate sentence

            temp_str = candidate_list[k][0]
            index = candidate_list[k][1]


    ####################### SENTENCE PROCESSING TO FIND THE ANSWER ###############################

    #Just pick out the noun-phrase or PERSON names from the sentence

    # Entity lists belonging to the chosen sentence.
    s_plist = sent_person_list[index]
    s_proflist = sent_prof_list[index]

    #print 'Prof list is:',s_proflist

    #If the question has a name of person, then the answer sentence should/would most probably
    #the name of a person but it should not be the name of the person appearing in the question.
    #If we can't find any other name in the candidate sentence then we do POS tagging and display the NOUN phrases

    #print 'Question person list is:',q_person_list
    #print 'Sentence person list is:',s_plist

    result_list = []
    q_loc_who_list = []

    # NOTE(review): in every branch below the innermost `if`/`for` has lost
    # its body in this copy (presumably an append to result_list, answer_list
    # or q_loc_who_list -- confirm against the original source).
    # All `... not in temp_q` tests are substring tests on the question string.
    if q_person_list == [] and s_plist == []:  #If both the question does not have a name and the sentence does not have a name,print the whole sentence minus words which appear in question
        '''pos_np_list= POS_Tagging.pos_noun_tagging(temp_str)
        if pos_np_list != []:
            for x in pos_np_list:
                if x not in temp_q and x[0].isupper():   #Noun phrases or names generally start with an upper case character
                    print 'First character caps',x
            return ' '.join(result_list)'''

        for k in temp_str.split():
            if k not in temp_q:

        # This branch always returns here, even when result_list is empty.
        return ' '.join(result_list)

    elif q_person_list != [] and s_plist != []:  #To counter situations when both question and sentence has names Ex. Who defeated who ?
        for k in s_plist:
            if k not in temp_q:

    # In this branch q_person_list is empty, so the first half of the test
    # below is always true; only the substring test against temp_q filters.
    elif q_person_list == [] and s_plist != []:
        for i in range(0, len(s_plist)):
            if s_plist[i] not in q_person_list and s_plist[
                    i] not in temp_q:  #To counter situations where question has a name and NER doesn't identify it

    elif q_person_list != [] and s_proflist != []:  #To counter situations for 'Who is X' type questions which could have a profession name in the answer
        for k in s_proflist:

    # Returns early only when q_loc_who_list ended up non-empty; otherwise
    # control falls through to the code after this chain.
    elif q_person_list == [] and q_loc_list != []:  # Who is <X> where ?
        #print 'Question has no name but has a location'
        for k in temp_str.split():
            if k not in temp_q:
        if q_loc_who_list != []:
            return ' '.join(q_loc_who_list)
    '''elif q_person_list==[] and s_proflist !=[]:
        for k in s_proflist:

    if answer_list != []:  #and flag==1:                #Indicating candidate sentence has a name other than that in question
        result = ' '.join(answer_list)

        #Pick out the noun phrase or nouns and then display them as answer

        np_list = POS_Tagging.pos_noun_tagging(temp_str)
        for x in np_list:
            if x not in temp_q:
                )  #Removing all occurences of existing noun phrases from the question

        #print 'NP Final list after removal is',npfinal_list
        if npfinal_list != []:
            result = ' '.join(npfinal_list)

            result = temp_str  # Printing out the whole sentence

    #print 'Result is:',result
    return result
def answering_how(
    cleansedQuestion, stop_words_free_question, complete_sentence_list, sentence_list, sent_time_list, sent_percent_list

    # Purpose: answer a "how" question ("how many", "how much", "how often").
    # Every sentence is scored against the question with the numbered rules
    # below and the answer is derived from the best-scoring sentence(s).
    #
    # NOTE(review): this copy of the function has lost lines. The parameter
    # list above is never closed, several list literals have no contents or
    # closing bracket, some `if` statements have no body, and the
    # triple-quoted (disabled) regions are unbalanced. Restore before running.
    #
    # Parameters, as used by the visible code:
    #   cleansedQuestion         -- question string; '?' is stripped from a
    #                               copy, and its words are used to filter the
    #                               final answer.
    #   stop_words_free_question -- passed to WM.stemWordMatch for scoring.
    #   complete_sentence_list   -- sentence strings, split on whitespace for
    #                               the keyword rules.
    #   sentence_list            -- indexed in parallel with
    #                               complete_sentence_list; passed to
    #                               WM.stemWordMatch.
    #   sent_time_list           -- used for membership tests in the "often"
    #                               rule (element type not visible here).
    #   sent_percent_list        -- not referenced in the visible body.
    #
    # Returns: a string (joined answer words, or a whole sentence).

    # Local working state (these are plain locals, not `global` declarations)

    candidate_sent_list = []
    sent_score_list = []
    final_sent_list = []
    q_verblist = []
    best = []  # List of the best scoring sentences based on word match with the question

    # NOTE(review): the contents and closing brackets of the three keyword
    # lists below are missing in this copy.
    much_list = [

    many_list = [

    # NOTE(review): "millennium" "day" has no comma between the two literals,
    # so Python concatenates them into the single string "millenniumday";
    # a comma is presumably intended.
    how_often = [
        "millennium" "day",
    # Optionally signed integer or decimal number. It is used with .match(),
    # which anchors only at the start, so a token such as "3rd" also matches.
    nums = re.compile(r"[+-]?\d+(?:\.\d+)?")

    measurement_verbs = []

    # NOTE(review): contents and closing bracket missing in this copy.
    stanford_stop_words_list = [

    abbreviation_list = [("Mt.", "Mount")]

    ########################### QUESTION PROCESSING ##################

    temp_q = cleansedQuestion
    # temp_q=temp_q.replace('"','')
    # temp_q=temp_q.replace("'",'"')
    temp_q = temp_q.replace("?", "")

    # Expand the abbreviation "Mt." to "Mount" in the question.
    # NOTE(review): `k in "Mt."` is a substring test, so words such as "M",
    # "t" or "." also pass, and str.replace then rewrites every occurrence of
    # that substring in temp_q. An equality test is presumably intended.
    for k in temp_q.split():
        if k in abbreviation_list[0][0]:
            temp_q = temp_q.replace(k, abbreviation_list[0][1])

    # print 'Question is :',temp_q

    lmtzr = WordNetLemmatizer()
    pos_list = POS_Tagging.pos_tagging(temp_q)

    # Collect the verb lemma of every question token tagged VB/VBD/VBZ/VBN,
    # unless that lemma is in the stop list.
    # NOTE(review): the closing `):` of this condition is missing in this copy.
    for i in range(0, len(pos_list)):
        if (
            pos_list[i][1] in ["VB", "VBD", "VBZ", "VBN"]
            and lmtzr.lemmatize(pos_list[i][0], "v") not in stanford_stop_words_list
            q_verblist.append(lmtzr.lemmatize(pos_list[i][0], "v"))

    # print 'Question verb list is :',q_verblist

    # print 'Time list is:',sent_time_list

    ################## SENTENCE PROCESSING AND SCORING ###################

    # Score each sentence; `score` restarts at 0 for every sentence.
    for i in range(0, len(complete_sentence_list)):
        score = 0

        # 1. Find score for each sentence using word match score first
        # The stop-word-free question is used here, not cleansedQuestion.

        # print 'The sentence is :',complete_sentence_list[i]
        # score = score + WM.stemWordMatch(cleansedQuestion,sentence_list[i])
        score = score + WM.stemWordMatch(stop_words_free_question, sentence_list[i])

        # 2. If the question contains "many" and sentence contains an expression of number, then it is confident score
        # The bonus is added once per matching sentence token, for each
        # occurrence of the keyword in the question.

        for k in temp_q.split():
            if k.lower() == "many":
                for m in complete_sentence_list[i].split():
                    if nums.match(m) or m in many_list:
                        score = score + 6

            # 3. If the question contains "much" and sentence contains an expression for distance or for money, then it is a confident score
            elif k.lower() == "much":
                for m in complete_sentence_list[i].split():
                    if m.lower() in ["money", "earn", "salary", "profit", "loss"] or m in much_list:
                        score = score + 6

            # 4. If the question contains "often" and sentence contains an expression of time, then it is more than confident score
            # NOTE(review): the membership test uses the whole sent_time_list,
            # not sent_time_list[i] -- confirm this is intended.
            elif k.lower() == "often":
                for m in complete_sentence_list[i].split():
                    if m.lower() in sent_time_list or m.lower() in how_often:
                        score = score + 10

        # NOTE(review): the triple-quoted block opened on the next line is not
        # closed until the `return ' '.join(temp_solution)` line further down.
        # As this copy stands, the max-score selection code in between is
        # therefore part of the string and never runs; the quote delimiters
        # that separated it from the disabled code appear to have been lost,
        # along with the statement that appends `score` to sent_score_list.
        """if much_flag==1 and money_flag==1:
            #print temp2
            for k in range(0, len(temp2)):
                if temp2[k] in much_list:
                    score=score +20 #slam-dunk

        elif much_flag==1:

            #print temp2
            for k in range(0, len(temp2)):
                if nums.match(temp2[k]) or temp2[k] in much_list:   # Implies answer contains a number
                    #print 'much Q - number or list sentence'


    # print 'Score list is:',sent_score_list
    max_score_value = max(sent_score_list)

    # Finding the sentences which has the highest score and adding them to the best list

    for i in range(0, len(sentence_list)):
        if sent_score_list[i] == max_score_value:

    # print 'Final sent list is:',final_sent_list

    temp_result = []
    temp_solution = []
    if len(final_sent_list) == 1:

        # If the question contains often, the sentence will usually contain a time expression.If so pick
        # that expression as the solution

        if 'often' in temp:
            #print 'often'
            for m in range(0,len(temp2)):
                if temp2[m] in how_often:
            #print 'Answer: ',' '.join(temp_solution)+'\n'
            #print '\n'
            return ' '.join(temp_solution)"""

        # "How many": look at the tokens of the single best sentence that
        # start with a number or are in many_list, then return temp_solution
        # joined with spaces.
        # NOTE(review): the body of the inner `if` (presumably an append to
        # temp_solution) is missing in this copy.
        if "many" in temp_q.split():
            # print 'many'
            temp2 = final_sent_list[0].split()
            for m in range(0, len(temp2)):
                if nums.match(temp2[m]) or temp2[m] in many_list:

            return " ".join(temp_solution)

        # Otherwise the single best sentence is returned unchanged.
        return final_sent_list[0]

        """for k in final_sent_list[0].split():
            if k not in cleansedQuestion.split():

        return ' '.join(temp_result)"""

        # NOTE(review): the code below follows an unconditional return at the
        # same indentation; an `else:` for the multi-sentence case appears to
        # be missing in this copy.
        # Choose the sentence that comes at the last, in case of a tie
        for k in range(0, len(final_sent_list)):
            result = final_sent_list[k]

        # Keep the words of the chosen sentence that are not question words.
        # NOTE(review): the `if` body (presumably an append to temp_result) is
        # missing in this copy.
        for k in result.split():
            if k not in cleansedQuestion.split():

        return " ".join(temp_result)