def get_one_page(utils):
    """Fetch one Apple Daily page and save it as a standalone HTML file."""
    time_one_page = time()
    news_api = AppleNews(html_r(), 'http://www.appledaily.com.tw/')
    api = SaveAppleNewsToHtml(news_api.home_url)
    api.store_in = "one_page/"
    utils.mkdir(api.store_in, True)
    utils.mkdir(api.store_in + '/img', True)
    f = open(api.store_in + 'apple.html', 'w')
    PageContent = utils.GetPage(opt.page)
    PageContent = news_api.page_parser(PageContent)
    api.PastHeader(f, "")
    PageContent = api.page_compose(PageContent)
    api.PastEntry(f, "", "", ''.join(PageContent), "")
    api.PastTail(f)
    f.close()
    logger.info("get one page spent %d sec", time() - time_one_page)
    sys.exit()
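
# The fragment below runs inside the per-item crawl loop (the loop header is
# not part of this section). Each `NewsList` is assumed to be a single
# headline entry exposing 'href' and 'title' keys, based on how it is
# indexed below.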
time_start_item = time()
try:
    url = NewsList['href']
    if 'http' not in url:
        url = news_api.home_url + url
    page = news_api.get_page(url)
except IOError:
    # abandon this item on network failure
    logger.info('The item spent %d secs' % (time() - time_start_item))
    continue
logger.info('【' + subClassify + '】' + NewsList['title'])
summary = []
try:
    result = news_api.page_parser(page)
except:
    logger.critical('parse failure %s' % url)
    traceback.print_exc(file=sys.stdout)
    logger.info('The item spent %d secs' % (time() - time_start_item))
    continue
# blacklist filter: skip articles whose title or body contains a blocked
# word; bl[:-1] strips the trailing newline from each blacklist entry
if blacklist:
    articles = NewsList['title'] + \
        ''.join([a['header'] + a['text'] for a in result['article']])
    if [bl for bl in blacklist if bl[:-1] in articles]:
        print '【' + subClassify + '】' + NewsList['title'] + ' is blocked'
        # record the blocked entry using the resolved url
        # (the raw href may already be absolute)
        api.PastEntry(blf, NewsList['title'], url,
                      api.page_compose(result), subClassify)