Example no. 1
(score: 0)
    def __init__(self, start='', end=''):
        """Create the scraper and immediately start crawling.

        start/end: date-range strings handed to init_date; their format is
        whatever ImgPageLinks expects -- presumably YYYYMMDD, verify
        against that class.
        """
        self.TAG = WsjImg.__name__  # log tag for Log.i calls
        self.init_date(start, end)
        self.db = WsjPersist()  # persistence backend for parsed pages

        # Dispatch table: URL prefix -> handler method on this instance.
        self.callbacks = {
                'http://cn.wsj.com/gb/pho.asp': self.find_links, 
                'http://cn.wsj.com/gb/20': self.parse_page,
                'http://cn.wsj.com/pictures/photo/': self.save_img
        }
        self.spider = Spider('WsjImg')
        # NOTE(review): proxy host and credentials are hard-coded in source;
        # they should be moved to configuration / environment variables.
        self.spider.set_proxy('proxy-amer.delphiauto.net:8080', 'rzfwch', '8ik,mju7')
        self.spider.add_callbacks(self.callbacks)
        self.spider.add_urls(self.starts)
        # Crawling starts as a side effect of construction.
        self.spider.start()
Example no. 2
(score: 0)
class WsjImg:
    """Scraper for cn.wsj.com photo-news pages.

    Registers URL-prefix callbacks on a Spider instance and begins
    crawling as soon as the object is constructed: the photo index page
    yields photo-news page links, each page yields image urls, and the
    images plus page metadata are persisted under DIR_ROOT.
    """

    site_root = 'http://cn.wsj.com/'
    page_root = 'http://cn.wsj.com/gb/'
    img_root = 'http://cn.wsj.com/pictures/photo/'
    starts = ['http://cn.wsj.com/gb/pho.asp']

    # Path-segment indexes for a page url, e.g.
    # ['', 'gb', '20130528', 'PHO184538.asp']
    idx_page_date = 2
    idx_page_filename = 3
    # Path-segment indexes for an image url, e.g.
    # ['', 'pictures', 'photo', 'BJ20141226094555', '01.jpg']
    idx_img_dir = 3
    idx_img_filename = 4

    # Persistence directory names.
    DIR_BASE = 'base'   # static assets copied next to each saved page
    DIR_ROOT = 'dat'    # root output directory
    DIR_IMG = 'img'

    def __init__(self, start='', end=''):
        """Create the scraper and immediately start crawling.

        start/end: date-range strings handed to init_date; their format is
        whatever ImgPageLinks expects -- presumably YYYYMMDD, verify
        against that class.
        """
        self.TAG = WsjImg.__name__  # log tag for Log.i calls
        self.init_date(start, end)
        self.db = WsjPersist()  # persistence backend for parsed pages

        # Dispatch table: URL prefix -> handler method on this instance.
        self.callbacks = {
                'http://cn.wsj.com/gb/pho.asp': self.find_links,
                'http://cn.wsj.com/gb/20': self.parse_page,
                'http://cn.wsj.com/pictures/photo/': self.save_img
        }
        self.spider = Spider('WsjImg')
        # SECURITY: proxy host and credentials are hard-coded in source;
        # move them to configuration / environment variables.
        self.spider.set_proxy('proxy-amer.delphiauto.net:8080', 'rzfwch', '8ik,mju7')
        self.spider.add_callbacks(self.callbacks)
        self.spider.add_urls(self.starts)
        # Crawling starts as a side effect of construction.
        self.spider.start()

    def init_date(self, strStart='', strEnd=''):
        '''Record the start/end date strings that bound the crawl.'''
        self.strStart = strStart
        self.strEnd = strEnd

    def find_links(self, url, response):
        '''Parse the photo-news index page and queue photo-news page urls.'''
        Log.i(self.TAG, 'find links in %s' % url)
        links = ImgPageLinks(response, self.strStart, self.strEnd)
        urls = links.getLinks(response)
        self.spider.add_urls(urls)

    def parse_page(self, url, response):
        '''Parse one photo-news page: save content, metadata, images and
        any further photo-news page links it contains.'''
        # A photo page can itself link to other photo pages.
        self.find_links(url, response)

        imgPage = ImgPage(url, response)
        imgPage.clear()
        imgPage.parseImgUrls()
        # `.keys()` removed: len()/iteration work on the dict directly
        # (avoids building a throwaway list under Python 2).
        if len(imgPage.imgUrls) > 1:
            imgPage.save(os.path.join(WsjImg.DIR_ROOT, imgPage.filePath))

            # Persist the page metadata as JSON alongside the page.
            with open(os.path.join(WsjImg.DIR_ROOT, imgPage.data['path']), 'w') as f:
                f.write(json.dumps(imgPage.data))

            imgPage.persistToDB(self.db)
            self.db.updateArt(url, imgPage.title, imgPage.summary)

            # Download every image referenced by the page.
            self.save_imgs(imgPage)

            # Copy static base assets into the page's date directory.
            self.spider.fetch.copyall(WsjImg.DIR_BASE, os.path.join(WsjImg.DIR_ROOT, imgPage.pageDate))
        else:
            # print() form behaves identically under Python 2 and is
            # forward-compatible with Python 3.
            print('no link find in %s' % url)

    def save_img(self, url, response):
        '''Image urls are downloaded by save_imgs(); just log and skip.'''
        print('ignore %s' % url)

    def save_imgs(self, imgPage):
        '''Download every image referenced by imgPage under DIR_ROOT.'''
        for url, info in imgPage.imgUrls.items():
            dstfile = os.path.join(WsjImg.DIR_ROOT, info['path'])
            self.spider.download(url, dstfile)