def wrapWPost(keyword, maxPage=1, pastDay=7):
    """Crawl Washington Post search results for ``keyword``, day by day.

    Starting with today's date and stepping back one day at a time for
    ``pastDay`` days, requests ``maxPage`` search-result pages per day. Each
    downloaded page is decoded as cp949 (undecodable bytes replaced), fed to
    a fresh ``WPostParser``, and the parser is then asked to store its
    articles via ``storeArticle(keyword, searchDate)``.

    A page that cannot be downloaded or parsed is reported on stdout and
    skipped; the crawl continues with the next page.

    Args:
        keyword: Search term. Spaces are sent as ``%20`` in the query string;
            the unmodified keyword is what is passed to ``storeArticle``.
        maxPage: Number of result pages requested per day. The ``cp`` query
            parameter is 1-based.
        pastDay: Number of days to cover, counting back from today.

    Returns:
        None. Progress and errors are printed to stdout.
    """
    # NOTE(review): a second definition of wrapWPost appears later in this
    # file; if both are module-level, the later one replaces this one.
    searchDate = date.today()
    oneDay = timedelta(days=1)

    # A raw space in the query string makes the request URL malformed, so
    # escape it the same way wrapNYTimes does. Already-escaped keywords pass
    # through unchanged.
    urlKeyword = keyword.replace(' ', '%20')

    for _ in range(pastDay):
        # Zero-padded YYYYMMDD, used as both start and end date of the search.
        sd = searchDate.strftime('%Y%m%d')

        for j in range(maxPage):
            pageNo = j + 1
            wp = WPostParser()
            url = 'http://www.washingtonpost.com/newssearch/search.html?sa=as&sd=%s&ed=%s&st=%s&cp=%d' % (sd, sd, urlKeyword, pageNo)
            # Appended separately: this part contains literal '%22'/'%24'
            # sequences that must not go through %-formatting.
            url += '&fa_1_sourcenavigator=%22The+Washington+Post%22&fa_1_sourcenavigator=washingtonpost.com&fa_1_mediatypenavigator=^Articles%24'

            # Best-effort crawl: report and skip on failure, but let
            # KeyboardInterrupt / SystemExit propagate (hence not bare except).
            try:
                text = urlopen(url).read()
            except Exception:
                print('error occur during connect to url %s and read contents' % url)
                continue

            try:
                wp.feed(text.decode('cp949', errors='replace'))
            except Exception:
                print('error occur during parsing %s' % url)
                continue

            print('wrapping WashingtonPost : %s, page %d' % (searchDate, pageNo))
            print(url)
            wp.storeArticle(keyword, searchDate)
            wp.close()

        searchDate -= oneDay

    print('done')
def wrapWPost(keyword, maxPage=1, pastDay=7):
    """Crawl Washington Post search results for ``keyword``, day by day.

    Starting with today's date and stepping back one day at a time for
    ``pastDay`` days, requests ``maxPage`` search-result pages per day. Each
    downloaded page is decoded as cp949 (undecodable bytes replaced), fed to
    a fresh ``WPostParser``, and the parser is then asked to store its
    articles via ``storeArticle(keyword, searchDate)``.

    A page that cannot be downloaded or parsed is reported on stdout and
    skipped; the crawl continues with the next page.

    Args:
        keyword: Search term. Spaces are sent as ``%20`` in the query string;
            the unmodified keyword is what is passed to ``storeArticle``.
        maxPage: Number of result pages requested per day. The ``cp`` query
            parameter is 1-based.
        pastDay: Number of days to cover, counting back from today.

    Returns:
        None. Progress and errors are printed to stdout.
    """
    # NOTE(review): an earlier definition of wrapWPost appears in this file;
    # if both are module-level, this one is the definition that takes effect.
    searchDate = date.today()
    oneDay = timedelta(days=1)

    # A raw space in the query string makes the request URL malformed, so
    # escape it the same way wrapNYTimes does. Already-escaped keywords pass
    # through unchanged.
    urlKeyword = keyword.replace(' ', '%20')

    for _ in range(pastDay):
        # Zero-padded YYYYMMDD, used as both start and end date of the search.
        sd = searchDate.strftime('%Y%m%d')

        for j in range(maxPage):
            pageNo = j + 1
            wp = WPostParser()
            url = 'http://www.washingtonpost.com/newssearch/search.html?sa=as&sd=%s&ed=%s&st=%s&cp=%d' % (sd, sd, urlKeyword, pageNo)
            # Appended separately: this part contains literal '%22'/'%24'
            # sequences that must not go through %-formatting.
            url += '&fa_1_sourcenavigator=%22The+Washington+Post%22&fa_1_sourcenavigator=washingtonpost.com&fa_1_mediatypenavigator=^Articles%24'

            # Best-effort crawl: report and skip on failure, but let
            # KeyboardInterrupt / SystemExit propagate (hence not bare except).
            try:
                text = urlopen(url).read()
            except Exception:
                print('error occur during connect to url %s and read contents' % url)
                continue

            try:
                wp.feed(text.decode('cp949', errors='replace'))
            except Exception:
                print('error occur during parsing %s' % url)
                continue

            print('wrapping WashingtonPost : %s, page %d' % (searchDate, pageNo))
            print(url)
            wp.storeArticle(keyword, searchDate)
            wp.close()

        searchDate -= oneDay

    print('done')
def wrapNYTimes(keyword, maxPage=1, pastDay=7):
    """Crawl New York Times search results for ``keyword``, day by day.

    For each of the ``pastDay`` days ending today, issues one search request
    restricted to that single day, passes the parsed response to
    ``resultNum`` to learn how many results there are, and then fetches up
    to ``maxPage`` result pages. Each page is decoded as UTF-8 (undecodable
    bytes replaced), parsed with ``BeautifulSoup`` and handed to
    ``storeArticles``.

    A request that fails is reported on stdout and skipped. The crawl always
    moves on to the next page or the next day, so one failed day does not
    shorten the date range that is covered.

    Args:
        keyword: Search term. Spaces are replaced by ``%20`` before use; note
            that this escaped form is also what is passed to
            ``storeArticles``.
        maxPage: Maximum number of result pages fetched per day.
        pastDay: Number of days to cover, counting back from today.

    Returns:
        None. Progress and errors are printed to stdout.
    """
    # NOTE(review): a second definition of wrapNYTimes appears later in this
    # file; if both are module-level, the later one replaces this one.
    today = date.today()
    keyword = keyword.replace(' ', '%20')
    dayQuery = 'http://query.nytimes.com/search/query?query=%s&daterange=period&year1=%d&mon1=%s&day1=%s&year2=%d&mon2=%s&day2=%s'

    for daysBack in range(pastDay):
        # Derive the date from the loop index so that the `continue` below
        # cannot leave the crawl stuck on the same day.
        searchDate = today - timedelta(days=daysBack)
        y = searchDate.year
        m = '%02d' % searchDate.month
        d = '%02d' % searchDate.day

        url = dayQuery % (keyword, y, m, d, y, m, d)
        # Best-effort crawl: report and skip on failure, but let
        # KeyboardInterrupt / SystemExit propagate (hence not bare except).
        try:
            page = urlopen(url).read()
        except Exception:
            print('error occur during connect to url %s and read contents' % url)
            continue

        # resultNum is presumably the total hit count for the day (confirm
        # against its definition). The frow offsets below step by 10, i.e.
        # ten results per page, so round up to whole pages and cap at maxPage.
        n = resultNum(BeautifulSoup(page))
        pageNum = min(maxPage, (n + 9) // 10)

        for j in range(pageNum):
            url = (dayQuery + '&frow=%d') % (keyword, y, m, d, y, m, d, j * 10)
            try:
                page = urlopen(url).read()
            except Exception:
                print('error occur during connect to url %s and read contents' % url)
                continue

            print('wrapping NYTimes : %s, page %d' % (searchDate, j + 1))
            print(url)
            soup = BeautifulSoup(page.decode('utf8', 'replace'))
            storeArticles(soup, keyword, searchDate)

    print('done')
def wrapNYTimes(keyword, maxPage=1, pastDay=7):
    """Crawl New York Times search results for ``keyword``, day by day.

    For each of the ``pastDay`` days ending today, issues one search request
    restricted to that single day, passes the parsed response to
    ``resultNum`` to learn how many results there are, and then fetches up
    to ``maxPage`` result pages. Each page is decoded as UTF-8 (undecodable
    bytes replaced), parsed with ``BeautifulSoup`` and handed to
    ``storeArticles``.

    A request that fails is reported on stdout and skipped. The crawl always
    moves on to the next page or the next day, so one failed day does not
    shorten the date range that is covered.

    Args:
        keyword: Search term. Spaces are replaced by ``%20`` before use; note
            that this escaped form is also what is passed to
            ``storeArticles``.
        maxPage: Maximum number of result pages fetched per day.
        pastDay: Number of days to cover, counting back from today.

    Returns:
        None. Progress and errors are printed to stdout.
    """
    # NOTE(review): an earlier definition of wrapNYTimes appears in this file;
    # if both are module-level, this one is the definition that takes effect.
    today = date.today()
    keyword = keyword.replace(' ', '%20')
    dayQuery = 'http://query.nytimes.com/search/query?query=%s&daterange=period&year1=%d&mon1=%s&day1=%s&year2=%d&mon2=%s&day2=%s'

    for daysBack in range(pastDay):
        # Derive the date from the loop index so that the `continue` below
        # cannot leave the crawl stuck on the same day.
        searchDate = today - timedelta(days=daysBack)
        y = searchDate.year
        m = '%02d' % searchDate.month
        d = '%02d' % searchDate.day

        url = dayQuery % (keyword, y, m, d, y, m, d)
        # Best-effort crawl: report and skip on failure, but let
        # KeyboardInterrupt / SystemExit propagate (hence not bare except).
        try:
            page = urlopen(url).read()
        except Exception:
            print('error occur during connect to url %s and read contents' % url)
            continue

        # resultNum is presumably the total hit count for the day (confirm
        # against its definition). The frow offsets below step by 10, i.e.
        # ten results per page, so round up to whole pages and cap at maxPage.
        n = resultNum(BeautifulSoup(page))
        pageNum = min(maxPage, (n + 9) // 10)

        for j in range(pageNum):
            url = (dayQuery + '&frow=%d') % (keyword, y, m, d, y, m, d, j * 10)
            try:
                page = urlopen(url).read()
            except Exception:
                print('error occur during connect to url %s and read contents' % url)
                continue

            print('wrapping NYTimes : %s, page %d' % (searchDate, j + 1))
            print(url)
            soup = BeautifulSoup(page.decode('utf8', 'replace'))
            storeArticles(soup, keyword, searchDate)

    print('done')