Example #1
0
def get_one_page(utils):
    """Fetch one Apple Daily page, render it to one_page/apple.html, then exit.

    Builds the news/HTML helper objects, prepares the output directories,
    downloads and parses the page selected by the module-level ``opt.page``
    (presumably a CLI option object -- defined outside this chunk), writes the
    composed HTML, logs the elapsed time, and terminates the process.

    :param utils: helper object providing ``mkdir(path, exist_ok)`` and
                  ``GetPage(page)`` (project-local; exact contract defined
                  elsewhere in the project).
    """
    time_one_page = time()
    news_api = AppleNews(html_r(), 'http://www.appledaily.com.tw/')
    api = SaveAppleNewsToHtml(news_api.home_url)
    api.store_in = "one_page/"
    utils.mkdir(api.store_in, True)
    utils.mkdir(api.store_in + '/img', True)

    # 'with' guarantees the output file is flushed and closed even if
    # fetching/parsing raises -- the original leaked the open handle.
    with open(api.store_in + 'apple.html', 'w') as f:
        PageContent = utils.GetPage(opt.page)
        PageContent = news_api.page_parser(PageContent)
        api.PastHeader(f, "")
        PageContent = api.page_compose(PageContent)
        api.PastEntry(f, "", "", ''.join(PageContent), "")
        api.PastTail(f)

    logger.info("get one page spend %d sec", time() - time_one_page)
    sys.exit()
Example #2
0
                time_start_item = time()
                try:
                    url = NewsList['href']
                    if not 'http' in url:
                        url = news_api.home_url + url
                    page = news_api.get_page(url)
                except IOError:
                    #abandent
                    logger.info('The item spend %d secs' % (time() - time_start_item))
                    continue

                logger.info('【' + subClassify + '】' + NewsList['title'])
                summary = []

                try:
                    result =  news_api.page_parser(page)
                except:
                    logger.critical('parse failur %s' % url)
                    traceback.print_exc(file=sys.stdout)
                    logger.info('The item spend %d secs' % (time() - time_start_item))
                    continue

                # blacklist filter
                if blacklist:
                    articles = NewsList['title'] + \
                        ''.join([ a['header'] + a['text'] for a in result['article'] ])

                    if [ bl for bl in blacklist if bl[:-1] in articles ]:
                        print '【' + subClassify + '】' + NewsList['title'] + ' be blocked'
                        api.PastEntry(blf, NewsList['title'], news_api.home_url +
                                NewsList['href'], api.page_compose(result), subClassify)