Example #1

class shanghaiSpider(spiderBase):
    def __init__(self):
        spiderBase.__init__(self)
        self.url_base = 'http://www.sh.sgcc.com.cn/sdnw2010/'
        # self.logger = login.initLog('xz.log')
        print 'shanghaiSpider'
        self.session = self.init_session()
        self.folder_base = u'上海'  # base output folder name ("Shanghai")

        # load shanghai.js and evaluate it with the embedded JS engine
        js_str = common_utils.read_file_content('shanghai.js')
        self.jsEngineMgr = jsEngineMgr.initJsEngine()
        self.jsShowMenu2 = self.jsEngineMgr.eval(js_str.encode('utf-8'))


        # output folder name for each section
        self.section_folder_map = {}
        # listing URL for each section
        self.section_url_map = {}
        # per-section POST data and Referer headers
        self.post_data_map = {}
        self.referer_map = {}

        self.section_key = [
            # State Grid news (国网要闻)
            'gwxw',
            # company news (公司要闻)
            'gsyw',
            # headquarters news (本部新闻)
            'bbxw',
            # grassroots information (基层信息)
            'jcxx',
            # media coverage (媒体报道)
            'mtbd',
            # industry news (行业资讯)
            'hyzx',
        ]

        # State Grid news
        self.section_url_map[self.section_key[0]] = 'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MjE5NDU3NjE%3D'
        # company news
        self.section_url_map[self.section_key[1]] = 'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub_gsyw.xml&siteCode=sdnw'
        # headquarters news
        self.section_url_map[self.section_key[2]] = 'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MTAwMTg0'
        # grassroots information
        self.section_url_map[self.section_key[3]] = 'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MTAwMTg3'
        # media coverage
        self.section_url_map[self.section_key[4]] = 'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MTAwMTg1'
        # industry news
        self.section_url_map[self.section_key[5]] = 'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MTAwMTg2'

        # State Grid news
        self.section_folder_map[self.section_key[0]] = u'国网要闻'
        # company news
        self.section_folder_map[self.section_key[1]] = u'公司要闻'
        # headquarters news
        self.section_folder_map[self.section_key[2]] = u'本部新闻'
        # grassroots information
        self.section_folder_map[self.section_key[3]] = u'基层信息'
        # media coverage
        self.section_folder_map[self.section_key[4]] = u'媒体报道'
        # industry news
        self.section_folder_map[self.section_key[5]] = u'行业资讯'
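
The urlChannelId query parameters above are URL-encoded Base64 strings that decode to plain numeric channel ids, e.g. 'MjE5NDU3NjE%3D' -> '21945761' and 'MTAwMTg0' -> '100184'. A minimal sketch of that decoding in Python 2 (the helper name decode_channel_id is illustrative, not part of the project):

import base64
import urllib

def decode_channel_id(encoded):
    # undo the URL escaping ('%3D' -> '=') and then the Base64 encoding
    return base64.b64decode(urllib.unquote(encoded))

print decode_channel_id('MjE5NDU3NjE%3D')   # -> '21945761' (the State Grid news channel)
print decode_channel_id('MTAwMTg0')         # -> '100184'   (the headquarters news channel)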
Example #2
            # (disabled) skip articles whose publish year is not 2015:
            #if time.strptime(pubdt, '%Y-%m-%d').tm_year != 2015:
                #self.logger.error(u'不下载 发布时间:' + pubdt + u' ' + article_item.title)
                #return

        # contentHtml holds the requests response for the article page, file_name the save path
        if contentHtml.status_code == requests.codes.ok:
            self.logger.info(u'返回成功')  # "request succeeded"
            common_utils.write_to_file_with_stream(contentHtml.content, file_name)
        else:
            self.logger.error(u'下载失败!!!:' + file_name)  # "download failed"


if __name__ == '__main__':
    # set up the spider: JS engine, log file, output folder and the crawl date limit
    js_str = common_utils.read_file_content('shanghai.js')
    shanghai_spider = shanghaiSpider()
    shanghai_spider.init_log(u'上海.log')
    shanghai_spider.jsEngineMgr = jsEngineMgr.initJsEngine()
    shanghai_spider.jsShowMenu2 = shanghai_spider.jsEngineMgr.eval(js_str.encode('utf-8'))
    shanghai_spider.set_save_folder_path(globalconf.save_folder['shanghai'])
    shanghai_spider.init_mkdir_folder()
    # split the configured limit into its date and time parts
    str_limit_date = globalconf.spider_limit_date_time['shanghai']
    shanghai_spider.set_limit_date_time(str_limit_date[0:8], str_limit_date[9:])

    # walk every configured section, page by page, and download each listed article
    for section_item in shanghai_spider.section_key:
        shanghai_spider.logger.info(u"获取栏目:" + section_item + ":" + shanghai_spider.section_folder_map[section_item])  # "fetching section"
        for page_num in range(shanghai_spider.page_number):
            article_list = shanghai_spider.stripy_article_list(section_item, page_num)
            for item in article_list:
                shanghai_spider.stripy_article_context(item)
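
The common_utils helpers used above are not shown in these excerpts; as a rough, assumed sketch (not the project's actual code), they could look like this:

import codecs

def write_to_file_with_stream(content, file_name):
    # assumed behaviour: dump the raw bytes of the HTTP response body to file_name
    with open(file_name, 'wb') as f:
        f.write(content)

def read_file_content(file_name):
    # assumed behaviour: read the whole file as UTF-8 text
    # (the callers re-encode the result before handing it to the JS engine)
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        return f.read()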