Example #1
class GovBuy(object):
    '''陕西公共资源交易信息网 (Shaanxi public resource trading network)'''
    def __init__(self):
        name = 'shaanxi_sxggzyjy_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.sxggzyjy.cn/jydt/001001/subPage_jyxx.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shaanxip_list1',
                             dbset='shaanxip_set1')

    def is_running(self):
        # Finished once the Redis work list is empty but the dedup set shows
        # that at least one item was ever enqueued.
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            print(url)
            # self.load_get_html(url)
        else:
            title = selector.xpath('//h3[@class="article-title"]/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    status = re.search(
                        r'(招标|中标|预|采购|更正|结果|补充|询价)公告$',
                        title).group()
                except AttributeError:
                    status = '公告'
            else:
                title = None
                status = '公告'

            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//div[@class="info-source"]//text()')
            if publish_date != []:
                date_match = re.search(r'(\d{4}-\d+-\d{1,2})',
                                       ''.join(publish_date))
                publish_date = date_match.group() if date_match else None
                # publish_date = re.sub(r'\/','-',re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',''.join(publish_date)).group())
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            # print(publish_date)
            area_name = self.get_area('陕西', title)
            # area_name = '四川-成都'
            # print(area_name)

            source = 'http://www.sxggzyjy.cn/'

            table_ele = selector.xpath('//div[@class="ewb-main"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '陕西省公共资源交易中心'
            result_dict['en_name'] = 'Shaanxi Province Public resource'
            # print(result_dict)

            # print('Queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            url = 'http://www.sxggzyjy.cn/jydt/001001/{}.html'.format(page)

            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error: {}'.format(e))
            self.load_get(categoryId, types, page)  # naive retry; recurses until the page loads
        else:
            print('Page {}'.format(page))
            url_li = selector.xpath('//ul[@class="ewb-list"]/li/a/@href')

            for url in url_li:
                urls = 'http://www.sxggzyjy.cn' + url
                # self.load_get_html(urls)

                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
            # {'categoryId':'', 'types':'','all_page': 1845},
            {
                'categoryId': '',
                'types': '',
                'all_page': 2
            },
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
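
All of these examples share the same skeleton: run() seeds list pages from gevent greenlets while a background thread drains the Redis queue via init(). The listings omit the entry point; a minimal driver sketch, assuming gevent monkey-patching so the blocking requests calls cooperate with the greenlets:

# Hypothetical entry point (not part of the original listings).
from gevent import monkey
monkey.patch_all()   # patch sockets/threads before requests does any I/O

if __name__ == '__main__':
    spider = GovBuy()   # e.g. the Shaanxi spider above
    spider.main()       # seeds the Redis queue, then workers drain it into MongoDB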
Example #2
class GovBuy(object):
    '''内蒙政府采购网 (Inner Mongolia government procurement network)'''
    def __init__(self):
        name = 'neimeng_nmgp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.nmgp.gov.cn/wp-content/themes/caigou_pcweb/skin/css/css.css?ver=2.0',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }


        self.rq = Rdis_Queue(host='localhost', dblist='neimeng_list1', dbset='neimeng_set1')



    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get(self, params):
        try:
            url = 'http://www.nmgp.gov.cn/zfcgwslave/web/index.php'
            response = requests.get(url=url, headers=self.headers, params=params).json()
        except Exception as e:
            print('load_get error: {}'.format(e))
        else:
            if len(response) >= 1:
                response_li = response[0]
            else:
                return
            for ret_dict in response_li:
                if not self.rq.in_rset(ret_dict):
                    self.rq.add_to_rset(ret_dict)
                    self.rq.pull_to_rlist(ret_dict)

    def load_get_html(self, ret_dict):
        # print(ret_dict)
        if ret_dict is None:
            return
        try:
            # The queue stores each record as a dict-literal string;
            # ast.literal_eval (requires `import ast`) is a safe replacement for eval.
            ret = ast.literal_eval(ret_dict)
            url = 'http://www.nmgp.gov.cn/ay_post/post.php?tb_id=' + ret['ay_table_tag'] + '&p_id=' + ret['wp_mark_id']

            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
        else:
            # print(ret)
            _id = self.hash_to_md5(url)
            title = ret['TITLE_ALL']
            try:
                status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
            except:
                status = '公告'

            # print(title)
            publish_date = selector.xpath('//*[@id="info-box"]/span/text()')
            if publish_date != []:
                date_match = re.search(r'\d+年\d+月\d+日', publish_date[0])
                publish_date = date_match.group() if date_match else None
            else:
                publish_date = None
            # print(publish_date)
            # return
            end_date = ret['ENDDATE']  # present in the payload but not stored below
            soup = BeautifulSoup(response, 'html.parser')
            content_html = soup.find(id='s-main-2').div.div
            # print(content_html)
            # print(content)
            source = 'http://www.nmgp.gov.cn/'
            area_name = self.get_area('内蒙古', title)

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['publish_date'] = publish_date
            result_dict['source'] = source
            result_dict['area_name'] = area_name

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '内蒙古自治区政府采购网'
            result_dict['en_name'] = 'NeiMengGu District Government Procurement'

            # print(result_dict)

            print('Queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)


    def init(self):
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                # self.load_get_html(self.rq.get_to_rlist())
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            # {'type_name':1, 'all_page': 5268},
            # {'type_name':2, 'all_page': 735},
            # {'type_name':3, 'all_page': 4482},
            # {'type_name':4, 'all_page': 101},
            # {'type_name':5, 'all_page': 925},
            # {'type_name':6, 'all_page': 2386},
            # {'type_name':7, 'all_page': 101},
            # {'type_name':8, 'all_page': 25},
            {'type_name':1, 'all_page': 2},
            {'type_name':2, 'all_page': 2},
            {'type_name':3, 'all_page': 2},
            {'type_name':4, 'all_page': 2},
            {'type_name':5, 'all_page': 2},
            {'type_name':6, 'all_page': 2},
            {'type_name':7, 'all_page': 2},
            {'type_name':8, 'all_page': 1},
                   ]
        for task in task_li:
            for page in range(1,task['all_page'] + 1):
                params = {
                    'r': 'zfcgw/anndata',
                    'type_name': task['type_name'],
                    'byf_page': str(page),
                    'fun': 'cggg',
                }
                if self.rq.r_len() > 8000:
                    time.sleep(3)
                self.load_get(params)
                print('Page {}'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()


    def main(self):
        self.run()
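
Every spider in this listing depends on an Rdis_Queue helper that is not included here: a Redis list acts as the work queue and a Redis set records everything ever enqueued, so in_rset/add_to_rset give de-duplication while pull_to_rlist/get_to_rlist hand work to the greenlets. A minimal sketch of that interface, assuming the standard redis-py client (the str() round-tripping is inferred from the ast.literal_eval call site in Example #2):

import redis

class Rdis_Queue(object):
    '''Sketch of the queue helper the spiders assume (not from the source).'''
    def __init__(self, host='localhost', dblist='list1', dbset='set1'):
        self.client = redis.Redis(host=host)   # default port 6379, db 0
        self.dblist = dblist                   # list of pending work items
        self.dbset = dbset                     # set of items ever enqueued

    def r_len(self):
        return self.client.llen(self.dblist)

    def rset_info(self):
        return self.client.smembers(self.dbset)

    def in_rset(self, value):
        return self.client.sismember(self.dbset, str(value))

    def add_to_rset(self, value):
        self.client.sadd(self.dbset, str(value))

    def pull_to_rlist(self, value):
        # str() so dict records (Example #2) round-trip via ast.literal_eval
        self.client.lpush(self.dblist, str(value))

    def get_to_rlist(self):
        item = self.client.rpop(self.dblist)   # None when the list is empty
        return item.decode('utf-8') if item is not None else None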
Example #3
class GovBuy(object):
    '''山西采购电子商城 (Shanxi government procurement e-mall)'''
    def __init__(self):
        name = 'shanxi_sxzfcg_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.sxzfcg.cn/view.php?nav=61',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shanxi_sxzfcg_cn_list1',
                             dbset='shanxi_sxzfcg_cn_set1')

    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            print(url)
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            # print(url)
            title = selector.xpath('//div[@valign="middle"]/h2/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    status = re.search(
                        r'(招标|中标|预|采购|更正|结果|补充|询价)公告$',
                        title).group()
                except AttributeError:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//td[@bgcolor="#E6E6E6"]//text()')
            if publish_date != []:
                # publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
                date_match = re.search(r'(\d{8}|\d{4}年\d+月\d{1,2})',
                                       ''.join(publish_date))
                publish_date = (re.sub(r'年|月', '-', date_match.group())
                                if date_match else None)
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            # print(publish_date, title)
            area_name = '山西'
            # print(area_name)

            source = 'http://www.sxzfcg.cn/'
            # print(url)
            # print(response)

            table_ele = selector.xpath('//td[@class="c_pt"]/table/tr[2]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '山西省省级政府采购中心'
            result_dict['en_name'] = 'Shanxi Government Procurement Center'

            print(publish_date)

            # print('Queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            params = (
                ('nav', types),
                ('page', page),
            )
            url = 'http://www.sxzfcg.cn/view.php'
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            print('Page {}'.format(page))
            # div_ele_li = selector.xpath('//tbody[@id="bulletininfotable_table_body"]/tr')
            url_li = selector.xpath('//tr[@class="odd"]/td/a/@href')
            # for div_ele in div_ele_li:
            for url in url_li:
                urls = 'http://www.sxzfcg.cn/{}'.format(url)
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        flag = 1
        task_li = [
            {
                'categoryId': '',
                'types': '61',
                'all_page': flag
            },
            {
                'categoryId': '',
                'types': '62',
                'all_page': flag
            },
            {
                'categoryId': '',
                'types': '63',
                'all_page': flag
            },
            {
                'categoryId': '',
                'types': '64',
                'all_page': flag
            },
            {
                'categoryId': '',
                'types': '65',
                'all_page': flag
            },
            {
                'categoryId': '',
                'types': '66',
                'all_page': flag
            },
            {
                'categoryId': '',
                'types': '67',
                'all_page': flag
            },
            {
                'categoryId': '',
                'types': '68',
                'all_page': flag
            },
            {
                'categoryId': '',
                'types': '69',
                'all_page': flag
            },
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
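
StorageSetting, the other shared helper, wraps the MongoDB collection that save_to_mongo writes into. A plausible minimal version, assuming pymongo; the database name here is a placeholder, and only the spider's name is taken from the listings:

import pymongo

class StorageSetting(object):
    '''Sketch of the storage helper the spiders assume (not from the source).'''
    def __init__(self, name, host='localhost', db_name='govbuy'):
        client = pymongo.MongoClient(host=host)
        self.find_collection = client[db_name][name]   # exposed to the spiders

    def saves(self, result_dic):
        # _id is the MD5 of the detail URL, so an upsert makes re-crawls
        # idempotent instead of raising DuplicateKeyError on duplicates.
        self.find_collection.replace_one({'_id': result_dic['_id']},
                                         result_dic, upsert=True)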
Example #4
class GovBuy(object):
    '''深圳政府采购网 (Shenzhen government procurement network)'''
    def __init__(self):
        name = 'shenzhen_zfcg_sz_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            'Referer':
            'http://61.144.227.212/was5/web/search?page=4096^&channelid=261279^&orderby=-DOCRELTIME^&perpage=10^&outlinepage=5^&searchscope=^&timescope=^&timescopecolumn=^&orderby=-DOCRELTIME^&chnlid=^&andsen=^&total=^&orsen=^&exclude=',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Origin': 'http://61.144.227.212',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shenzhen_list1',
                             dbset='shenzhen_set1')

    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def load_get_html(self, url):
        try:
            # print(url)
            response = requests.get(
                url=url, headers=self.headers).content.decode('gb2312')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
        else:
            title = selector.xpath(
                '//*[@id="content"]/div/div[2]/div/h4/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//*[@id="content"]/div/div[2]/div/h6/label//text()')
            if publish_date != []:
                date_match = re.search(r'(\d+-\d+-\d+)',
                                       ''.join(publish_date))
                publish_date = date_match.group() if date_match else None
            else:
                publish_date = None
            soup = BeautifulSoup(response, 'html.parser')
            content_html = soup.find(class_='main')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['publish_date'] = publish_date
            result_dict['source'] = 'http://www.zfcg.sz.gov.cn/'
            result_dict['area_name'] = '深圳'

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '深圳市政府采购监管网'
            result_dict['en_name'] = 'Shenzhen Government Procurement'

            # print(result_dict)

            print('Queue length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)

    def load_get(self, page):
        try:
            params = (
                ('page', str(page)),
                ('channelid', '261279'),
                ('orderby', ['-DOCRELTIME', '-DOCRELTIME']),
                ('perpage', '10'),
                ('outlinepage', '5'),
                ('searchscope', ''),
                ('timescope', ''),
                ('timescopecolumn', ''),
                ('chnlid', ''),
                ('andsen', ''),
                ('total', ''),
                ('orsen', ''),
                ('exclude', ''),
            )
            data = [
                ('showother', 'false'),
                ('showtype', 'txt'),
                ('classnum', '20'),
                ('classcol', 'CTYPE'),
                ('channelid', '261279'),
                ('orderby', '-DOCRELTIME'),
            ]
            url = 'http://61.144.227.212/was5/web/search'
            response = self.session.post(url=url,
                                         headers=self.headers,
                                         params=params,
                                         data=data).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath('//div[@class="r_list"]/dl/dd/a/@href')
            print('Page {}'.format(page))
        except Exception as e:
            print('load_get error: {}'.format(e))
        else:

            for url in url_li:
                # print(url)
                if not self.rq.in_rset(url):
                    self.rq.add_to_rset(url)
                    self.rq.pull_to_rlist(url)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                # self.load_get_html(self.rq.get_to_rlist())
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 43879},
            {
                'all_page': 5
            },
        ]
        count = 3
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
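
This example hard-codes decode('gb2312') and Example #7 below hard-codes 'gb18030'; a mismatched page raises UnicodeDecodeError and gets skipped. Where the charset is uncertain, requests can sniff it instead; a hedged alternative sketch:

import requests

def fetch_decoded(url, headers=None):
    # Let requests sniff the charset rather than hard-coding gb2312/gb18030.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding   # guessed from the response body
    return resp.text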
Example #5
class GovBuy(object):
    '''重庆政府采购网 (Chongqing government procurement network)'''
    def __init__(self):
        name = 'chongqing_cqgp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Referer':
            'https://www.cqgp.gov.cn/notices/list?source=41,42^&area=^%^E9^%^87^%^8D^%^E5^%^BA^%^86^%^E5^%^B8^%^82^&purches=^%^E9^%^87^%^87^%^E8^%^B4^%^AD^%^E5^%^85^%^AC^%^E5^%^91^%^8A',
            'Connection': 'keep-alive',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='chongqing_list1',
                             dbset='chongqing_set1')

    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, pid):
        if pid is None:
            return
        try:
            proxies = proxy_pool.proxies()
            url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable/{}'.format(
                pid)
            response = requests.get(url=url,
                                    headers=self.headers,
                                    proxies=proxies,
                                    timeout=10).json()
            # selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            self.load_get_html(pid)  # naive retry; recurses until the request succeeds
        else:
            title = response['notice']['title']
            try:
                status = response['notice']['projectPurchaseWayName']
            except:
                status = '公告'
            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            # publish_date = selector.xpath('//div[@class="content_about"]/span[2]/em/text()')
            publish_date = response['notice']['issueTime']
            if publish_date:   # a string in the JSON payload, not a list
                date_match = re.search(r'(\d{4}-\d+-\d{1,2})', publish_date)
                publish_date = date_match.group() if date_match else None
            else:
                publish_date = None
            # print(publish_date)
            area_name = '重庆'

            # print(area_name)

            source = 'https://www.cqgp.gov.cn/'

            content_html = response['notice']['html']

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '重庆市政府采购网'
            result_dict['en_name'] = 'Chongqing City Government Procurement'
            # print(result_dict)

            print('Queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, page):
        try:
            params = (
                ('pi', page),
                ('ps', '20'),
                ('timestamp', str(int(time.time() * 1000))),
            )
            proxies = proxy_pool.proxies()
            url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable'
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params,
                                    proxies=proxies,
                                    timeout=5).json()
            # selector = etree.HTML(response)
        except Exception as e:
            print('load_get error: {}'.format(e))
            self.load_get(page)  # naive retry; recurses until the page loads
        else:
            print('Page {}'.format(page))
            response_li = response['notices']
            for data_dict in response_li:
                pid = data_dict['id']
                # print(pid)
                # self.load_get_html(pid)
                # time.sleep(2)
                if not self.rq.in_rset(pid):
                    self.rq.add_to_rset(pid)
                    self.rq.pull_to_rlist(pid)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [

            # {'all_page': 18647},
            {
                'all_page': 3
            },
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    # self.load_get(types, page)
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
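
Example #5 routes both the list and detail requests through proxy_pool.proxies(), which is defined elsewhere. The call sites imply it returns a requests-style proxy mapping; a minimal stand-in, with a placeholder proxy address:

import random

class ProxyPool(object):
    '''Stand-in for the external proxy_pool module (assumed, not from the source).'''
    def __init__(self, proxy_urls):
        self.proxy_urls = proxy_urls

    def proxies(self):
        # requests expects a dict mapping scheme to proxy URL
        url = random.choice(self.proxy_urls)
        return {'http': url, 'https': url}

proxy_pool = ProxyPool(['http://127.0.0.1:8888'])   # placeholder address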
Example #6
class GovBuy(object):
    '''苏州公共资源交易信息网 (Suzhou public resource trading network)'''
    def __init__(self):
        name = 'suzhou_szzyjy_fwzx_suzhou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'X-DevTools-Emulate-Network-Conditions-Client-Id': '06AB3D9C05E9FDAB1EDDAD36BA60296F',
            'Referer': 'http://ggzy.hefei.gov.cn/jyxx/002001/002001001/3.html',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost', dblist='suzhou_szzyjy_fwzx_suzhou_gov_cn_list1', dbset='suzhou_szzyjy_fwzx_suzhou_gov_cn_set1')

    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            # selector_div = etree.HTML(str(div))

            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            title = selector.xpath('//h2[@class="word-title"]/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s','',''.join(title))
                try:
                    status = re.search(r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title).group()
                except AttributeError:
                    status = '公告'
            else:
                title = None
                status = '公告'

            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//h4[@class="word-info"]//text()')
            if publish_date != []:
                # publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
                date_match = re.search(r'(\d{8}|\d{4}/\d+/\d{1,2})', ''.join(publish_date))
                publish_date = re.sub(r'/', '-', date_match.group()) if date_match else None
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            print(publish_date, title)
            # area_name = self.get_area()
            area_name = '江苏-苏州'

            source = 'http://szzyjy.fwzx.suzhou.gov.cn'

            table_ele = selector.xpath('//div[@class="border"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '苏州市公共资源交易中心'
            result_dict['en_name'] = 'Suzhou City Public resource'
            # print(result_dict)

            print('Queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            params = (
                ('paging', page),
            )
            url = 'http://szzyjy.fwzx.suzhou.gov.cn/Front/jyzx/{}/'.format(types)
            response = requests.get(url=url, headers=self.headers, params=params).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            print('Page {}'.format(page))
            # div_ele_li = selector.xpath('//ul[@class="ewb-right-item"]/li')
            try:
                url_li = selector.xpath('//*[@class="mr-content"]/div[1]/table/tr/td[1]/a/@href')
            except Exception:
                time.sleep(3)
                self.load_get(categoryId, types, page)
                return  # url_li would be unbound after the retry, so stop here


            # for div_ele in div_ele_li:
            for url in url_li:
                # div = etree.tostring(div_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')
                urls = 'http://szzyjy.fwzx.suzhou.gov.cn' + url

                # self.load_get_html(urls)

                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
                {'categoryId':'', 'types':'002004/002004001','all_page': 2},
                {'categoryId':'', 'types':'002004/002004002','all_page': 2},
                {'categoryId':'', 'types':'002004/002004003','all_page': 2},
                {'categoryId':'', 'types':'002004/002004004','all_page': 1},
                {'categoryId':'', 'types':'002004/002004005','all_page': 2},
                {'categoryId':'', 'types':'002004/002004006','all_page': 1},
            ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [gevent.spawn(self.load_get, categoryId, types, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
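
The transform() call inside every get_area takes a list of address strings and yields rows with 省/市/区 (province/city/district) columns, which get_area then flattens back to 'province-city'. This matches the cpca package (chinese province city area), whose early releases accepted the umap= keyword seen here; that origin is an inference, not stated in the listings. A small usage sketch under that assumption:

from cpca import transform   # assumed origin of transform()

location_str = ['苏州市公共资源交易中心采购公告']
df = transform(location_str, umap={})   # one row with 省/市/区 columns
print(df)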
Example #7
class GovBuy(object):
    '''上海公共资源交易信息网 (Shanghai public resource trading network)'''
    def __init__(self):
        name = 'shanghai_ztb_shmh_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'X-DevTools-Emulate-Network-Conditions-Client-Id':
            'C30FE2988AF840A005E144C01A1874D4',
            'Referer':
            'http://ztb.shmh.gov.cn/mhztb_site/html/shmhztb_subject/shmhztb_subject_zfcg_cggg/List/list_350.htm',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shanghai_ztb_shmh_gov_cn_list1',
                             dbset='shanghai_ztb_shmh_gov_cn_set1')

    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            # selector_div = etree.HTML(str(div))
            response = requests.get(
                url=url, headers=self.headers).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            # print(url)
            title = selector.xpath('//div[@class="title"]/h2/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    status = re.search(
                        r'(招标|中标|预|采购|更正|结果|补充|询价)公告$',
                        title).group()
                except AttributeError:
                    status = '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//div[@class="title"]/h3//text()')
            if publish_date != []:
                date_match = re.search(r'(\d{4}-\d+-\d{1,2})',
                                       ''.join(publish_date))
                publish_date = date_match.group() if date_match else None
                # publish_date = re.sub(r'\/','-',re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',''.join(publish_date)).group())
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            # print(publish_date, title)
            area_name = '上海'
            # area_name = '浙江-杭州'
            # print(area_name)

            source = 'http://ztb.shmh.gov.cn/'
            # print(url)
            # print(response)

            table_ele = selector.xpath('//div[@class="list_right"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '上海市闵行区公共资源交易网'
            result_dict['en_name'] = 'Minhang District Public resource'
            # print(result_dict)

            print('Queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            url = 'http://ztb.shmh.gov.cn/mhztb_site/html/shmhztb_subject/{}/List/list_{}.htm'.format(
                types, page)
            response = requests.get(
                url=url, headers=self.headers).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            print('Page {}'.format(page))
            # div_ele_li = selector.xpath('//ul[@class="ewb-right-item"]/li')
            url_li = selector.xpath('//ul[@id="list_ul"]/li/a/@href')
            # for div_ele in div_ele_li:
            for url in url_li:
                # response_li = response['result']['records']
                # for data_dic in response_li:
                # div = etree.tostring(div_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')
                urls = 'http://ztb.shmh.gov.cn/mhztb_site/html/shmhztb_subject/' + re.sub(
                    r'\.\./\.\./', '', url)

                # print(data_dic)
                # self.load_get_html(urls)

                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'categoryId': '',
                'types': 'shmhztb_subject_zfcg_cggg',
                'all_page': 2
            },
            {
                'categoryId': '',
                'types': 'shmhztb_subject_zfcg_jggg',
                'all_page': 2
            },
            {
                'categoryId': '',
                'types': 'shmhztb_subject_zfcg_dyly',
                'all_page': 1
            },
            {
                'categoryId': '',
                'types': 'shmhztb_subject_jsgc_zbxx',
                'all_page': 2
            },
            {
                'categoryId': '',
                'types': 'shmhztb_subject_jsgc_zgbxx',
                'all_page': 1
            },
            {
                'categoryId': '',
                'types': 'shmhztb_subject_ggzy_jyxx',
                'all_page': 1
            },
            {
                'categoryId': '',
                'types': 'shmhztb_subject_ggzy_cjxx',
                'all_page': 1
            },
        ]
        count = 2
        for task in task_li:
            for page in range(0, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
Example #8
class GovBuy(object):
    '''江苏政府采购网 (Jiangsu government procurement network)'''
    def __init__(self):
        name = 'jiangsu_ccgp-jiangsu_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/index_1.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='jiangsu_list1',
                             dbset='jiangsu_set1')

    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            return None
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
        else:
            title = selector.xpath('//div[@class="dtit"]/h1/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//div[@class="detail_bz"]/span/text()')
            if publish_date != []:
                date_match = re.search(r'(\d{4}-\d+-\d+)',
                                       ''.join(publish_date))
                publish_date = date_match.group() if date_match else None
            else:
                publish_date = None
            # print(publish_date)
            area_name = self.get_area('江苏', title)
            # print(area_name)

            source = 'http://www.ccgp-jiangsu.gov.cn/'

            table = selector.xpath('//div[@class="detail"]')
            if table != []:
                table = table[0]
            else:
                return
            content_html = etree.tostring(table,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            # print(content_html)

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '江苏政府采购网'
            retult_dict['en_name'] = 'Jiangsu Government Procurement'
            # print(retult_dict)

            print('list length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)

    def load_get(self, base_url, page):
        try:
            if page == 0:
                url = base_url
            else:
                url = base_url + 'index_' + str(page) + '.html'
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # self.load_get(url)
        else:
            # print('第{}页'.format(page))
            url_li = selector.xpath('//div[@class="list_list"]/ul/li/a/@href')
            if url_li == []:
                url_li = selector.xpath(
                    '//div[@class="list_list02"]/ul/li/a/@href')

            for url in url_li:
                urls = base_url + url.replace('./', '')
                # print(urls)
                # self.load_get_html((urls))
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        flag = 2
        task_li = [
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cgyg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/htgg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/xqyj/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/ysgg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/shengji/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/suzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/nanjing/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/wuxi/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/changzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/zhenjiang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/nantong/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/taizhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/yangzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/yancheng/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/huaian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/suqian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/lianyungang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/xuzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/shengji/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/suzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/nanjing/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/wuxi/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/changzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/zhenjiang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/nantong/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/taizhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/yangzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/yancheng/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/huaian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/suqian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/lianyungang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/xuzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/shengji/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/suzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/nanjing/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/wuxi/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/changzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/zhenjiang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/nantong/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/taizhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/yangzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/yancheng/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/huaian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/suqian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/lianyungang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/xuzhou/',
                'all_page': flag
            },
        ]
        count = 3
        for task in task_li:
            for page in range(0, task['all_page'] + 1, count):
                try:
                    base_url = task['url']

                    # self.load_get(base_url, page)
                    spawns = [
                        gevent.spawn(self.load_get, base_url, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    print('page {}'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
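
All of these crawlers are driven the same way: instantiate the class, which wires up MongoDB storage through StorageSetting and a Redis-backed URL queue through Rdis_Queue, then call main(). No entry point is shown in any of the examples, so the one below is hypothetical; the gevent monkey-patch is an assumption on my part, but without it the gevent.spawn calls around blocking requests I/O execute one after another instead of overlapping.

# Hypothetical entry point (assumes a local Redis and MongoDB plus the
# project's own StorageSetting / Rdis_Queue helpers are importable).
from gevent import monkey
monkey.patch_all()  # ideally run before requests is first imported, so greenlets overlap

if __name__ == '__main__':
    GovBuy().main()
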
Example #9
class GovBuy(object):
    '''西藏政府采购网'''
    def __init__(self):
        name = 'xizang_ccgp-xizang_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.ccgp-xizang.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.ccgp-xizang.gov.cn/shopHome/morePolicyNews.action?categoryId=124',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost',
                             dblist='xizang_list1',
                             dbset='xizang_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set has entries.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
            self.load_get_html(url)  # naive retry with no depth limit; a dead URL recurses forever
        else:
            # print(response)

            title = selector.xpath('//h2[@class="sd"]/font/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'

            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//h3[@class="wzxq"]/text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            print(publish_date, title)
            # print (title)
            area_name = self.get_area('西藏', title)

            # print(area_name)

            source = 'http://www.ccgp-xizang.gov.cn/'

            table_ele = selector.xpath('//div[@class="neirong"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '西藏自治区政府采购网'
            retult_dict['en_name'] = 'Xizang Government Procurement'
            # print(retult_dict)

            print('list length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, page):
        try:
            params = {'categoryId': categoryId}

            data = {'currentPage': str(page)}
            url = 'http://www.ccgp-xizang.gov.cn/shopHome/morePolicyNews.action'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params,
                                     data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            self.load_get(categoryId, page)  # naive retry; a persistent failure recurses indefinitely
        else:
            print('page {}'.format(page))
            url_li = selector.xpath(
                '//div[@id="news_div"]/ul/li/div[1]/a/@href')
            # print(url_li)
            # return
            for url in url_li:
                urls = 'http://www.ccgp-xizang.gov.cn' + url
                # print(urls)
                # self.load_get_html(urls)

                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'categoryId': '124',
                'all_page': 2
            },
            {
                'categoryId': '125',
                'all_page': 2
            },
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:

                    categoryId = task['categoryId']

                    # self.load_get(categoryId, page)

                    spawns = [
                        gevent.spawn(self.load_get, categoryId, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
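
StorageSetting and Rdis_Queue are project helpers that never appear in these examples. From the call sites, the queue is evidently a Redis list of pending items paired with a Redis set used for de-duplication. The following reconstruction is only a plausible sketch inferred from how r_len, rset_info, in_rset, add_to_rset, pull_to_rlist and get_to_rlist are used above:

import redis

class Rdis_Queue(object):
    '''Plausible reconstruction: a pending list plus a de-duplication set.'''
    def __init__(self, host='localhost', dblist='list1', dbset='set1'):
        self.client = redis.Redis(host=host)
        self.dblist = dblist  # Redis list of items still to crawl
        self.dbset = dbset    # Redis set of everything ever queued

    def r_len(self):
        return self.client.llen(self.dblist)      # pending count

    def rset_info(self):
        return self.client.smembers(self.dbset)   # all seen members

    def in_rset(self, value):
        return self.client.sismember(self.dbset, value)

    def add_to_rset(self, value):
        self.client.sadd(self.dbset, value)

    def pull_to_rlist(self, value):
        self.client.lpush(self.dblist, value)

    def get_to_rlist(self):
        item = self.client.rpop(self.dblist)      # None once the list is empty
        return item.decode('utf-8') if item else None
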
Example #10
class GovBuy(object):
    '''乌鲁木齐公共资源交易信息网'''
    def __init__(self):
        name = 'wulumuqi_ggzy_wlmq_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://ggzy.wlmq.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'text/plain',
            'Accept': '*/*',
            'Referer': 'http://ggzy.wlmq.gov.cn/generalpage.do?method=showList&fileType=201605-048&faname=201605-046',
            'Connection': 'keep-alive',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost', dblist='wulumuqi_list1', dbset='wulumuqi_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set has entries.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self,result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self,pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >=2 and area_li[1] !='':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, info_id):
        if info_id is None:
            return
        try:
            url = 'http://ggzy.wlmq.gov.cn/infopublish.do?method=infoPublishView&infoid=' + info_id
            response = requests.get(url=url, headers=self.headers).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
            print(url)
            # self.load_get_html(url)
        else:
            title = selector.xpath('//div[@class="title"]/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    # match the announcement-type suffix, e.g. 招标公告 / 中标公告
                    status = re.search(r'(招标|中标|预|采购|更正|结果|补充|询价)公告$',
                                       title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'

            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//td[@class="td_name"]//text()')
            if publish_date != []:
                # publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
                publish_date = re.search(r'(\d{8}|\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            # print(publish_date)
            # area_name = self.get_area('云南',title)
            area_name = '新疆-乌鲁木齐'
            # print(area_name)

            source = 'http://ggzy.wlmq.gov.cn/'

            table_ele  = selector.xpath('//div[@class="w_content_main"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '乌鲁木齐市公共资源交易网'
            retult_dict['en_name'] = 'Urumqi City Public resource'
            
            # print(retult_dict)

            print('list length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(retult_dict)

    def load_get(self,categoryId, types, page):
        try:
            # DWR batch-call body, rebuilt as a template for readability
            data = (
                'callCount=1\n\n'
                'page=/generalpage.do?method=showList&fileType={cid}&faname=201605-046\n\n'
                'httpSessionId=\n\n'
                'scriptSessionId=A0890501B5665F11F1222EBC440FC5FC644\n\n'
                'c0-scriptName=projectDWR\n\n'
                'c0-methodName=queryItemInfoByIndustryType2\n\n'
                'c0-id=0\n\n'
                'c0-e1=string:packTable\n\n'
                'c0-e2=string:{cid}\n\n'
                'c0-e3=number:{page}\n\n'
                'c0-e4=string:15\n\n'
                'c0-e5=string:true\n\n'
                'c0-e6=string:packTable\n\n'
                'c0-e7=string:982\n\n'
                'c0-param0=Object_Object:{{flag:reference:c0-e1, name:reference:c0-e2, '
                'currentPage:reference:c0-e3, pageSize:reference:c0-e4, '
                'isPage:reference:c0-e5, tabId:reference:c0-e6, '
                'totalRows:reference:c0-e7}}\n\n'
                'batchId=3\n\n'
            ).format(cid=categoryId, page=page)
            url = 'http://ggzy.wlmq.gov.cn/dwr/call/plaincall/projectDWR.queryItemInfoByIndustryType2.dwr'
            response = requests.post(url=url, headers=self.headers, data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            self.load_get(categoryId, types, page)
        else:
            print('page {}'.format(page))
            info_id_il = re.findall(r"""\[\'FILE_ID\'\]\=\"(.*?)\"\;""", response)
            print(info_id_il)
            for pid in info_id_il:
                # print(info_id)
                # self.load_get_html(pid)

                if not self.rq.in_rset(pid):
                    self.rq.add_to_rset(pid)
                    self.rq.pull_to_rlist(pid)

    def init(self):
        count = 1
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
                {'categoryId':'201605-048', 'types':'','all_page': 2},
                {'categoryId':'201605-049', 'types':'','all_page': 1},
                {'categoryId':'201605-050', 'types':'','all_page': 2},
                {'categoryId':'201605-051', 'types':'','all_page': 1},
                {'categoryId':'201605-052', 'types':'','all_page': 1},
                {'categoryId':'201605-053', 'types':'','all_page': 1},
                {'categoryId':'201605-039', 'types':'','all_page': 2},
                {'categoryId':'201605-041', 'types':'','all_page': 1},
                {'categoryId':'201605-042', 'types':'','all_page': 1},
                {'categoryId':'201605-043', 'types':'','all_page': 2},
                {'categoryId':'201605-044', 'types':'','all_page': 2},
                {'categoryId':'201605-045', 'types':'','all_page': 2},
            ]
        count = 3
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:

                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [gevent.spawn(self.load_get, categoryId, types, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        self.run()
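
The Urumqi site exposes a DWR endpoint, so load_get gets back JavaScript rather than JSON and scrapes the FILE_ID values out of the script text with a regex. A tiny self-contained illustration of that extraction, run against a made-up reply shaped like what the pattern expects:

import re

# hypothetical fragment of a DWR reply; real ones assign many more fields
dwr_reply = 's0[\'FILE_ID\']="201605-048-001";s1[\'FILE_ID\']="201605-048-002";'
file_ids = re.findall(r"""\[\'FILE_ID\'\]\=\"(.*?)\"\;""", dwr_reply)
print(file_ids)  # ['201605-048-001', '201605-048-002']
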
Example #11
class GovBuy(object):
    '''河北-政府采购网'''
    def __init__(self):
        name = 'hebei_ccgp-hebei_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'If-None-Match': '594gpnM6qpxwGpEvFYoNJpzY8YE=',
            'If-Modified-Since': 'Mon, 23 Jul 2018 02:32:18 GMT',
            'Referer': 'http://www.ccgp-hebei.gov.cn/province/cggg/zhbgg/index_3.html',
            'X-DevTools-Emulate-Network-Conditions-Client-Id': 'F24524FAD50B25DB7D7D89DBCEA53767',
            'Intervention': '<https://www.chromestatus.com/feature/5718547946799104>; level=warning',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost', dblist='hebei_list1', dbset='hebei_set1')



    def is_running(self):
        # Finished once the pending list is drained and the dedup set has entries.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self,result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self,pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >=2 and area_li[1] !='':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self,url):
        if url is None:
            return
        try:
            response = self.session.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
        else:

            title = selector.xpath('//span[@class="txt2"]/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'


            publish_date = selector.xpath('//body/table/tr/td/table/tr[4]/td/table/tr[7]/td/span/text()')
            # print(publish_date)
            if publish_date !=[]:
                publish_date = re.sub(r'\r|\n|\s|发布时间:','',publish_date[0])
            else:
                publish_date = None
            soup = BeautifulSoup(response, 'lxml')  # explicit parser avoids bs4's parser-guessing warning
            content_html = soup.find('body').table.tr.td.table
            # print(content_html)
            area_name = self.get_area('河北',title)
            source = 'http://www.ccgp-hebei.gov.cn/province/'


            _id = self.hash_to_md5(url)

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source
            retult_dict['publish_date'] = publish_date
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '河北省政府采购网'
            retult_dict['en_name'] = 'Hebei Province Government Procurement'

            # print(retult_dict)

            print('list length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)


    def load_get(self, params):
        try:
            url = 'http://search.hebcz.gov.cn:8080/was5/web/search'
            response = self.session.get(url=url, headers=self.headers, params=params).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            url_li = selector.xpath('//tr[@id="biaoti"]/td[2]/a/@href')
            for url in url_li:
                # self.load_get_html(url)
                if not self.rq.in_rset(url):
                    self.rq.add_to_rset(url)
                    self.rq.pull_to_rlist(url)


    def init(self):
        count = 10
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1  # shrink the batch once the queue is nearly drained
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        #     city_id_li = [
        #         '130100000','130181000','130200000','130300000','130400000','130500000',
        #         '130600000','130682000','130700000','130800000','130900000','131000000','131100000',
        #         '139900000']
        count = 2
        task_li = [
                {'lanmu':'zhbgg','code':130000000,'all_page': count},
                {'lanmu':'zbgg','code':130000000,'all_page': count},
                {'lanmu':'zhbgg','code':130181000,'all_page': count},
                {'lanmu':'zbgg','code':130181000,'all_page': count},
                {'lanmu':'zhbgg','code':130200000,'all_page': count},
                {'lanmu':'zbgg','code':130200000,'all_page': count},
                {'lanmu':'zhbgg','code':130300000,'all_page': count},
                {'lanmu':'zbgg','code':130300000,'all_page': count},
                {'lanmu':'zhbgg','code':130400000,'all_page': count},
                {'lanmu':'zbgg','code':130400000,'all_page': count},
                {'lanmu':'zhbgg','code':130500000,'all_page': count},
                {'lanmu':'zbgg','code':130500000,'all_page': count},
                {'lanmu':'zhbgg','code':130600000,'all_page': count},
                {'lanmu':'zbgg','code':130600000,'all_page': count},
                {'lanmu':'zhbgg','code':130682000,'all_page': count},
                {'lanmu':'zbgg','code':130682000,'all_page': count},
                {'lanmu':'zhbgg','code':130700000,'all_page': count},
                {'lanmu':'zbgg','code':130700000,'all_page': count},
                {'lanmu':'zhbgg','code':130800000,'all_page': count},
                {'lanmu':'zbgg','code':130800000,'all_page': count},
                {'lanmu':'zhbgg','code':130900000,'all_page': count},
                {'lanmu':'zbgg','code':130900000,'all_page': count},
                {'lanmu':'zhbgg','code':131000000,'all_page': count},
                {'lanmu':'zbgg','code':131000000,'all_page': count},
                {'lanmu':'zhbgg','code':131100000,'all_page': count},
                {'lanmu':'zbgg','code':131100000,'all_page': count},
                {'lanmu':'zhbgg','code':139900000,'all_page': count},
                {'lanmu':'zbgg','code':139900000,'all_page': count},
            ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                params = {
                    'page': str(page),
                    'channelid':'228483',
                    'perpage':'50',
                    'outlinepage':'10',
                    'lanmu': task['lanmu'],
                    'admindivcode': task['code'],
                    }

                try:
                    self.load_get(params)

                    # spawns = [gevent.spawn(self.load_get, page + i) for i in range(count)]
                    # gevent.joinall(spawns)
                except Exception as e:
                    print(e)
                print('page {}'.format(page))
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
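
get_area, copied verbatim into every example, delegates to a transform() call that looks like the cpca province/city/district parser (a guess, since the imports are not shown). The real work is string post-processing of the returned table: collapse whitespace, drop the '省市区0' header residue that str() leaves on a one-row DataFrame, rewrite 省 and 市 into '-' separators, and fall back to the caller-supplied province. Here is the same logic as a standalone sketch, fed a fake transform() result so it runs without the library:

import re

def area_from_transform_text(pro, df_text):
    '''Assumed-equivalent rewrite of the get_area() post-processing above.'''
    cleaned = re.sub(r'\r|\n|\s', '', df_text)    # squash whitespace
    cleaned = re.sub(r'省市区0', '', cleaned)      # drop the DataFrame header/index residue
    area_str = re.sub(r'省|市', '-', cleaned)      # province/city markers become separators
    area_li = [pro] if area_str == '' else area_str.split('-')
    if len(area_li) >= 2 and area_li[1] != '':
        return '-'.join(area_li[:2])              # 'province-city'
    return area_li[0]

# fake transform() output shaped like a one-row province/city/district table
print(area_from_transform_text('河北', '省市区0河北省石家庄市长安区'))  # 河北-石家庄
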
Example #12
class GovBuy(object):
    '''山西政府采购网'''
    def __init__(self):
        name = 'shanxi_ccgp-shanxi_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Referer': 'http://www.ccgp-shanxi.gov.cn/view.php?nav=104',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shanxi_list1',
                             dbset='shanxi_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set has entries.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get(self, params):
        try:
            url = 'http://www.ccgp-shanxi.gov.cn/view.php'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params).content.decode('utf-8')

            selector = etree.HTML(response)
            url_li = selector.xpath(
                '//*[@id="node_list"]/tbody/tr/td[1]/a/@href')
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            # print(url_li)
            if url_li != []:
                for url in url_li:
                    url = 'http://www.ccgp-shanxi.gov.cn/' + url
                    if not self.rq.in_rset(url):
                        self.rq.add_to_rset(url)
                        self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        if url is None:  # an empty queue hands back None
            return
        try:
            # print(url)
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
        else:
            # print(response)
            _id = self.hash_to_md5(url)
            # # print(_id)
            title = selector.xpath(
                '//tr[@class="bk5"]/td/table/tr/td/table/tr/td/div/h2/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            # print(title)
            publish_date = selector.xpath(
                '//tr[@class="bk5"]/td/table/tr[2]/td//text()')
            # print(publish_date)
            if publish_date != []:
                # re.search returns a match object or None, never a list
                match = re.search(r'(\d+年\d+月\d+日)', publish_date[2])
                publish_date = match.group() if match else None
            else:
                publish_date = None
            # print(publish_date)
            soup = BeautifulSoup(response, 'lxml')  # explicit parser avoids bs4's parser-guessing warning
            content_html = soup.find(class_='bk5')
            # print(content_html)

            source = 'http://www.ccgp-shanxi.gov.cn/'
            area_name = self.get_area('山西', title)
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['publish_date'] = publish_date
            retult_dict['source'] = source
            retult_dict['area_name'] = area_name

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '中国山西政府采购'
            retult_dict['en_name'] = 'Shanxi Government Procurement'
            # print(retult_dict)
            print('list length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)

    def init(self):
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            spawns = [
                gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                for i in range(count)
            ]
            gevent.joinall(spawns)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            #{'nav':100, 'end_page':14705,'status':'招标公告'},
            #{'nav':104, 'end_page':13667,'status':'结果公告'},
            #{'nav':105, 'end_page':2291,'status':'变更公告'},
            #{'nav':116, 'end_page':747,'status':'单一来源公告'},
            #{'nav':131, 'end_page':249,'status':'招标预公告'},
            #{'nav':132, 'end_page':1,'status':'邀请公告'},
            #{'nav':153, 'end_page':7279,'status':'合同公告'},
            {
                'nav': 100,
                'end_page': 4,
                'status': '招标公告'
            },
            {
                'nav': 104,
                'end_page': 3,
                'status': '结果公告'
            },
            {
                'nav': 105,
                'end_page': 2,
                'status': '变更公告'
            },
            {
                'nav': 116,
                'end_page': 2,
                'status': '单一来源公告'
            },
            {
                'nav': 131,
                'end_page': 1,
                'status': '招标预公告'
            },
            {
                'nav': 132,
                'end_page': 1,
                'status': '邀请公告'
            },
            {
                'nav': 153,
                'end_page': 1,
                'status': '合同公告'
            },
        ]
        for task in task_li:
            for page in range(1, task['end_page'] + 1):
                params = {
                    'app': '',
                    'type': '',
                    'nav': task['nav'],
                    'page': str(page)
                }
                self.load_get(params)
                print('page {}'.format(page))

    def main(self):
        self.run()
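
Shanxi is the one site here that prints dates as 2018年7月23日 instead of an ISO form, which is why its regex differs and why the value is stored unnormalized. If the YYYY-MM-DD shape the other crawlers store were wanted, a small normalizer would do; this helper is hypothetical, not part of the original:

import re

def normalize_cn_date(text):
    '''Turn '...2018年7月23日...' into '2018-07-23'; None when no date is present.'''
    m = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', text)
    if not m:
        return None
    year, month, day = m.groups()
    return '{}-{:0>2}-{:0>2}'.format(year, month, day)

print(normalize_cn_date('发布时间:2018年7月23日'))  # 2018-07-23
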
Example #13
class GovBuy(object):
    '''天津市政府采购网'''
    def __init__(self):
        name = 'tianjin_city_gov_buy'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.tjgp.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer':
            'http://www.tjgp.gov.cn/portal/topicView.do?method=view&view=Infor&id=1665&ver=2&st=1&stmp=1532324224291',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='tianjin_list1',
                             dbset='tianjin_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set has entries.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_post(self, data):
        try:
            response = requests.post(
                'http://www.tjgp.gov.cn/portal/topicView.do',
                headers=self.headers,
                data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_post error:{}'.format(e))
        else:
            url_li = selector.xpath('//*[@id="reflshPage"]/ul/li/a/@href')
            if url_li != []:
                for url in url_li:
                    url = 'http://www.tjgp.gov.cn' + url
                    if not self.rq.in_rset(url):
                        self.rq.add_to_rset(url)
                        self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        if url is None:  # an empty queue hands back None
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
        else:

            _id = self.hash_to_md5(url)
            # print(_id)
            title = selector.xpath(
                '//body/table/tbody/tr/td/div/p[1]/font/b/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            # print(title)
            publish_date = selector.xpath(
                '//body/table/tbody/tr/td/div/p[3]/text()')
            if publish_date != []:
                publish_date = publish_date[0]
            else:
                publish_date = None
            # print(publish_date)
            source = 'http://www.tjgp.gov.cn/'
            area_name = self.get_area('', title)

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['publish_date'] = publish_date
            retult_dict['source'] = source
            retult_dict['area_name'] = area_name
            # #
            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(response)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '天津市政府采购网'
            retult_dict['en_name'] = 'Tianjin government Procurement'

            # print(retult_dict)

            print('list length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(retult_dict)

    def init(self):
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            spawns = [
                gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                for i in range(count)
            ]
            gevent.joinall(spawns)

    def run(self):
        threading.Thread(target=self.init).start()
        count = 5
        task_li = [
            {
                'id': '1665',
                'end_page': count
            },
            {
                'id': '1664',
                'end_page': count
            },
            {
                'id': '1666',
                'end_page': count
            },
            {
                'id': '2013',
                'end_page': count
            },
            {
                'id': '2014',
                'end_page': count
            },
            {
                'id': '2015',
                'end_page': count
            },
            {
                'id': '2016',
                'end_page': count
            },
        ]
        for task in task_li:
            for page in range(1, task['end_page'] + 1):
                data = [
                    ('method', 'view'),
                    ('page', str(page)),
                    ('id', task['id']),
                    ('step', '1'),
                    ('view', 'Infor'),
                    ('st', '1'),
                    ('ldateQGE', ''),
                    ('ldateQLE', ''),
                ]
                self.load_post(data)
                print('page {}'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
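
Every example shares one shape: run() plays producer, paging through list endpoints and pushing detail URLs into Redis, while init(), started on a plain thread, plays consumer and drains them in small gevent batches that shrink to 1 as the queue empties. The skeleton below restates that pattern over an in-memory queue stub so it runs standalone; it is a sketch of the shared structure, not code from the source:

import threading
import gevent

class MemQueue(object):
    '''In-memory stand-in for Rdis_Queue, just to make the sketch runnable.'''
    def __init__(self):
        self.pending, self.seen = [], set()
    def r_len(self):
        return len(self.pending)
    def rset_info(self):
        return self.seen
    def push(self, url):
        if url not in self.seen:              # in_rset + add_to_rset + pull_to_rlist
            self.seen.add(url)
            self.pending.append(url)
    def get_to_rlist(self):
        return self.pending.pop(0) if self.pending else None

def crawl_detail(url):
    if url is None:                           # an empty queue hands back None
        return
    print('crawling', url)

def consume(rq):                              # plays the role of init()
    count = 4
    while not (rq.r_len() == 0 and len(rq.rset_info()) > 0):
        if rq.r_len() <= count:
            count = 1                         # shrink batches as the queue drains
        gevent.joinall([gevent.spawn(crawl_detail, rq.get_to_rlist())
                        for _ in range(count)])

rq = MemQueue()
for page in range(3):                         # the real crawlers seed while consuming,
    rq.push('http://example.com/notice/{}'.format(page))  # hence their restart-init safety nets
worker = threading.Thread(target=consume, args=(rq,))
worker.start()
worker.join()
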
Example #14
class GovBuy(object):
    '''广州政府采购网'''
    def __init__(self):
        name = 'guangzhou_gzg2b_gzfinance_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://gzg2b.gzfinance.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer':
            'http://gzg2b.gzfinance.gov.cn/gzgpimp/portalindex.do?method=goInfogsgg&linkId=gsgg',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangzhou_list1',
                             dbset='guangzhou_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set has entries.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get(self, data):
        try:
            url = 'http://gzg2b.gzfinance.gov.cn/gzgpimp/portalsys/portal.do'
            params = (
                ('method', 'queryHomepageList'),
                ('t_k', 'null'),
            )
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params,
                                     data=data).json()
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            response_li = response['rows']
            for ret_dict in response_li:
                # row dicts presumably round-trip Redis as strings (str() on write, eval() on read)
                if not self.rq.in_rset(ret_dict):
                    self.rq.add_to_rset(ret_dict)
                    self.rq.pull_to_rlist(ret_dict)

    def load_get_html(self, ret_dict):
        if ret_dict is None:
            return
        try:
            ret = eval(ret_dict)  # rebuilds the queued dict; see the json-based sketch below
            url = 'http://gzg2b.gzfinance.gov.cn/gzgpimp/portalsys/portal.do?method=pubinfoView&&info_id=' + ret[
                'info_id'] + '&t_k=null'
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
        else:
            _id = self.hash_to_md5(url)
            title = ret['title']
            status = ret['info_key']
            publish_date = ret['finish_day']
            soup = BeautifulSoup(response, 'lxml')  # explicit parser avoids bs4's parser-guessing warning
            content_html = soup.find(class_='row').div
            # print(content_html)

            source = 'http://gzg2b.gzfinance.gov.cn/'
            area_name = self.get_area('广州', title)

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['publish_date'] = publish_date
            retult_dict['source'] = source
            retult_dict['area_name'] = area_name

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '广州市政府采购平台 '
            retult_dict[
                'en_name'] = 'Guangzhou Government Procurement Platform'

            # print(retult_dict)

            print('list length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(retult_dict)

    def init(self):
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                # self.load_get_html(self.rq.get_to_rlist())
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 329},
            {
                'all_page': 5
            },
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                data = [
                    ('current', str(page)),
                    ('rowCount', '10'),
                    ('searchPhrase', ''),
                    ('title_name', ''),
                    ('porid', 'zbcggg'),
                    ('kwd', ''),
                ]

                self.load_get(data)
                print('page {}'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
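
The Guangzhou crawler is the odd one out in that it queues whole row dicts, which survive Redis as their str() form and are rebuilt with eval() on the consumer side. eval() on text that originated from a remote server is risky; serializing through json keeps the round trip explicit and safe. A hypothetical drop-in for both sides:

import json

# producer side (load_get): queue each row as JSON text instead of str(dict)
row = {'info_id': '123', 'title': '某某采购公告', 'info_key': '公告', 'finish_day': '2018-07-23'}
payload = json.dumps(row, ensure_ascii=False)

# consumer side (load_get_html): rebuild the dict without eval()
ret = json.loads(payload)
print(ret['info_id'])  # 123
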
Example #15
class GovBuy(object):
    '''广西政府采购网'''
    def __init__(self):
        name = 'guangxi_gxzfcg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gxzfcg.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer':
            'http://www.gxzfcg.gov.cn/CmsNewsController/search/chnlCodes-/distin-/beginDate-0/endDate-0/p-20/c-3/0-0.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangxi_list1',
                             dbset='guangxi_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set has entries.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get(self, url):
        try:
            data = [
                ('searchKey', ''),
                ('title', ''),
                ('str1', 'undefined'),
                ('str2', 'undefined'),
                ('cmsNews.title', ''),
                ('cmsNews.buyerName', ''),
                ('cmsNews.str2', ''),
                ('cmsNews.str3', ''),
                ('cmsNews.str1', ''),
                ('cmsNews.str5', ''),
                ('cmsNews.str6', ''),
                ('cmsNews.str8', ''),
                ('cmsNews.agentName', ''),
                ('cmsNews.startPubdate', ''),
                ('cmsNews.endPubdate', ''),
            ]
            response = self.session.post(url=url,
                                         headers=self.headers,
                                         data=data).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath(
                '//*[@id="channelBody"]/div[2]/ul/li/a/@href')
        except Exception as e:
            print('load_get error: {}'.format(e))
        else:
            for url in url_li:
                url = 'http://www.gxzfcg.gov.cn' + url
                if not self.rq.in_rset(url):
                    self.rq.add_to_rset(url)
                    self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        try:
            response = self.session.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
        else:
            title = selector.xpath(
                '//*[@id="bodyMain"]/div/div/div[2]/div[2]/div[1]/h1/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//*[@id="bodyMain"]/div/div/div[2]/div[2]/div[1]/span//text()'
            )
            if publish_date != []:
                publish_date = re.search(r'(\d+\-\d+\-\d+)',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            # print(publish_date)
            soup = BeautifulSoup(response, 'lxml')  # explicit parser avoids a bs4 warning
            content_html = soup.find(class_='frameReport')
            if content_html is None:
                raise EOFError

            source = 'http://www.gxzfcg.gov.cn/'
            area_name = self.get_area('广西', title)

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['publish_date'] = publish_date
            result_dict['source'] = source
            result_dict['area_name'] = area_name
            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '广西壮族自治区政府采购网'
            result_dict['en_name'] = 'Guangxi Zhuang Autonomous Region Government Procurement'

            print('queue length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)

    def init(self):
        count = 3
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                # self.load_get_html(self.rq.get_to_rlist())
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 49876},  # full site history
            {'all_page': 5},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                url = 'http://www.gxzfcg.gov.cn/CmsNewsController/search/chnlCodes-/distin-/beginDate-0/endDate-0/p-20/c-' + str(
                    page) + '/0-0.html'
                self.load_get(url)
                print('page {}'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
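Using the MD5 of the detail URL as the Mongo _id makes re-crawls idempotent: saving the same announcement twice simply overwrites one document. StorageSetting is another helper missing from these listings; below is a plausible minimal version assuming pymongo. Only the observed usage (a find_collection attribute and saves(dict) with a pre-computed _id) is known from the code above; the host and database name are made up.

import pymongo

class StorageSetting(object):
    '''Sketch of the Mongo wrapper used by every example; not the original.'''
    def __init__(self, name, host='localhost', db='govbuy'):
        self._collection = pymongo.MongoClient(host)[db][name]

    @property
    def find_collection(self):
        return self._collection

    def saves(self, result_dic):
        # replace_one with upsert keeps writes idempotent under the md5 _id
        self._collection.replace_one({'_id': result_dic['_id']},
                                     result_dic, upsert=True)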
Example #16
class GovBuy(object):
    '''广西公共资源交易信息网'''
    def __init__(self):
        name = 'guangxi_gxzbtb_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.cookies = {
            'ASP.NET_SessionId': 'trbofu0uet0aywbdhr35s0x4',
            '__CSRFCOOKIE': '6f7e275f-5762-4569-8ea2-ae98d3b0379d',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gxzbtb.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.gxzbtb.cn/gxzbw/jyxx/001010/001010001/MoreInfo.aspx?CategoryNum=001010001',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangxi_gxzbtb_cn_list1',
                             dbset='guangxi_gxzbtb_cn_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set
        # shows that at least one URL was ever enqueued.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            # print(url)
            title = selector.xpath('//td[@id="tdTitle"]/font//text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    # match endings such as 招标公告 / 中标公告 / 预公告
                    status = re.search(
                        r'(?:招标|中标|采购|更正|结果|补充|询价|预)公告$',
                        title).group()
                except AttributeError:
                    status = '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//td[@id="tdTitle"]/font[2]//text()')
            if publish_date != []:
                # publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
                publish_date = re.sub(
                    r'\/', '-',
                    re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',
                              ''.join(publish_date)).group())
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            # print(publish_date, title)
            # area_name = self.get_area('', title)
            area_name = '广西'

            # print(area_name)

            source = 'http://www.gxzbtb.cn/'

            table_ele = selector.xpath('//table[@id="tblInfo"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source
            result_dict['publish_date'] = publish_date
            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '广西壮族自治区公共资源交易中心'
            result_dict['en_name'] = 'Guangxi Zhuang Autonomous Region Public Resources Trading Center'

            print('queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            params = (('CategoryNum', types), )

            data = {
                # NOTE: the ASP.NET form state below was captured from a single
                # browser session; these tokens expire and must be re-captured
                # before a live run.
                '__CSRFTOKEN':
                '/wEFJDZmN2UyNzVmLTU3NjItNDU2OS04ZWEyLWFlOThkM2IwMzc5ZA==',
                '__VIEWSTATE':
                'z6UhCTu3jqnsz47aHWA7gSWW/wG9YleyN9akSy8SDfrTIhkXv/8D27JYdKJG/ZWKPqb0smc7bS8/xKHcu0vIwbRrxU6DQIlkQJ3m97wtYMFtK8KpjZwIdMqSgnw1q3DjBu9vEraO4xfqzJacAXSCukutXa8BPCyLevI3U1naYHFUSSNqQhNo9cICs8Kqr8n8HNpvSRjqJB8CTAWoGlc8x6IeC/j50VdUETRudT9/w6Xc0To0rsY/yH+VbMRbAzIFlzNvQP/dmUPEdjMSRkvyULU6ZIgal19QWLJXJSGioQKJ3StzC9BqsgyaCobteQoKLs8/h20aMOCs2YO/oSpUVr0AaapTqrGIMhrM/IaPn8N00monNce5uU1fWffkBK6zL4mJijgPTmuyCrA3/AUY5La8VvP1v2NUScoKAdjRaPypTDlh1+ZUt3x7ZdwcmWd7bwgAK42uneSLZWSC7Er0k9VcuPZTR4E/I8fbEzJWf4Bf9pI2hs5actOcnD4ETfu5m/dgfq1mgz4uTrYIRcqE1xOmE3WDJNircnYO4xVTI49MDYfgDcGtjWRiBZHd788/5abVt7h9sXkzXPHboi7zVv4haS8yZsIEeQG3F8MFVtM7H9+/Kbss3uPC5I5V/dDw54S2zejVmbAx9dU68wJfL1+c28EbvOUOWgOE6dCuFmTS3nSiBLMjwUeRtKwhvP1RA5MsKD4WI8JtqR545DULfQz0XJCh1PlO+Nd5L661UuspC/PvgWRoNQEoaVpLJK1S9UuPAdNnAqXdMuJdZZPu60+Jdig7zOBSEGbmwNvmXB0vphagqkqxf1nruFS0OGP/D7OJcbcObzotOwp1GpMmBdqg6hiDh2nccyFZ+E4DUv5NssGK4Zj7dY2jhMBv8bvkIwaY+uLYMLJwJ0DqhNyP2JTKv/FmENt0pjfytL0EU2HRmLTcPgJWgdQ2IZ7GZCYhkSzBfkkJOmVx7t+UZervSy+hZBsVsaz0DpKZ9JMVXfOYVzQNZt+VmcmIn9feEEJH6F8t4gYlC1pvrfcFcBVI8ndumFTtsYjnHhD7gMG8q64rCOoD0DAB0si2OdOndIUczT0RlhLkpqa9RA1nQ9kj75RJGe/dm4NGfCqqUHcRJTNbixZUPdA2pJNzYhRAMLQBqGmFANV+tvqB2yDiJg86H6ouBO7v2+SXxkp70ZBxv8CiAHw2kKEWoxfqmnMi552GiJRSrpOhcw3ylrYA3dINyJjtDJ9ZNYxLGWD5Vzu170wePz/foMZ6o2+8zWgEBc3PDx2l3UTG1TUwt8dbQbokscyKFWtCoo/qs9kkZS1KYBZ2NKe8K3EswLU3d5pHJsUtFhe2QtHhMolkwM3LTzBIIBl1QyPS6HDnCasCFHNbGX2/k/TMPLEBrOpdsRo1YhhhNMYz8pwQbwTxo4WRsmmQBvPUkTSlk04Iasp9Tm8/2WO/NIFs0Y/h0BvdanXJHwP8blNEMZCd5qmP02LdYGygy6hs2uU79m+VM6wtHIbYxkC2AKuDkErEqaOuQCNNiQfyP6e0oaZwNcWQOOaQDWsz9/F12QCDwx3X2ihIaG2v1YCQHKC/lfnBQ8o9Q9LvrLuZ4yjVbaO3B1eM0Q46zuTbT9KbhLwkFm8LH/2JM/OOvvUDNN7A4z42REh3kZWC0eXOyCDD1HvNdOFxluC6GRtEdv6/31i/PKLPr7te5VCIGCFjF9oCxquF9+2ecYtlmqcvbEvCKnPx1JDO1BloI0i8TqwjDmqyaORTTYzrtJwnXjKiX+8TKDC9yyOMXA1qbTt/KZPTpHI1R0P9qQ7Pk8AwKeL1y5g68OMHUqOsSyJuo6SNety/xymBke5m1FG2zE9M7OOqSGtV6NCKXNoSgi3laUmbAMZ8x+GOQnXrfpMGvtPUPIUY9zEvUiEDeKKQSnqlxf5LwEWyWlEuzSjO4+8nIGVC9nUb/YEIM5o2wiC1lMXl2d2tDQ/Mau5M3B6qmSLx5QP3nfjDKsoFqN0tQxlo/TBCKLXAUOHM8zTrEBY/xkb6tvnijW+leCYPSKURnheByCjFWSPnlz6C8tiktma/JzVph5blcc3thpmHiGp87enSqKQkjIf8RJkVeM+ENhg0gAndrokhhPBiS+MTNCuX2zXimlH2dpTY7JKu4uSyltVswpG2mLWFGegTeKLsBjVks8je/eeJvAaevVRh9mNOD4Cj8jh6/6taR2ee0/EjYlDIkrCNstLNBcQ35u7NQOesHpN9j7Zxf1iSsz1ChY/fS3w+3AVg7hnZA6yr1pUa54NWQEakrAgjpNUTzdTkSfyLkGmdqSXZEady89XXYBKDfF6rkDa8hb19ujrWQZn9m9K22OeAw1k3w8wl29I2LMno86bezhCDhZRVa2RrsbsYAtJ+TMnEdWUuINhSrEbe9zRRga2N4BJv+eopnSClJYNNgkMiNVEOdWnfDVa9Wb9iqVRYfjBKfZRv8g4/tlMr6ygKYPBRprLiv1VQT9M+5hkhLWgtGeyOzTGxfiZG6QHnqSL2g/A+nu5Ij3fGoVDEPPj3Adcqk6AUrcY+XaJxR9wVVz927mFfFq5kxjo12Sw2ak1pS7faIy7o9Fk7Y9XKh1qu35ltABHEqiVIeb/dymZ7oLV+AClQeLbbmciJ7NKrdzTwRxanqOirpiPl5MnJtQxROEbt6lYeRG1RzEUsKMlp/L5v2aBRnkVWC4odd6FafVJw1NFDAhtVrI3uGta566tdsuT+FYaXOtELa/hUjBES+jWAJ0+qrDVff6ilka90N5wpQ55cjCwAs1VtaLa6b/zuin4h6+wfwtJnEGBfXND+1AQSbrveJHojhedFjPAYsSG988yhO0A1+TdQWGoJQmlEINiELipfNz/CUCbHENz431cxEjZV6No6qEXLUVXcbXp0BRB8sOZWtmbJ5LaLzS+unRSRN9RMk/80ct6AuINtSE2MCwrBpkrB3DhkebVRwWxxODsfGOj20j5pVpeI8jF75k/9igiTP/+3+N20FTsoJ/fVXevJ2YTUHIrJZc2j3bNDZ6LuHcJbEjS5DQat9WGeZa2FzDRba3ikBTxMevju8T9I2s19yFeztg72WQTcyDhN0I/TryQNcqZq67e8ScokSwQ1pE95EkIBdxk+7J9IIm1KHGp7P1T6PmxBqSyCyJT53AJgQxbhG2N+2NCpIk0ZfKA9Apvg/UfBFli/pa42N1XCdVnLwWW9wOY+vSbuo9Fnf91wTW1SrH1cZCrcWDFzJTlB703WUdA97ZyWuRMwypjXj5RGpTRi1R/maM3DwIcC6ktl+aczr8jK94UVPZ2iNVmgk/Ml92vly8vycYSTkHvFCHmw0gzSyhBjaCDSEL80nw4T4XjrrNfohWQRYDnk+isTfbfmpt6KRz8yIczndwTZdSN5rYigqeAJMd9DAxm28DcGCUk1nOyeASMtByfmPDd/jp6ihDR8Uj10eaty7X0LyjvB3Ol4kjvNucSPwJhwe6PCULDCMKKM9EQFTs0UiiyAhA/1N52njX2EpWDLOnT8yfMMDfDOwdwex/
3DVo22nYjzTArBjbjJ4N6RtPW0rrWXJNJFHpm6ZSUTFZXgtZw+wAvBxRWiuXsvQqUYS4a25rN1/8aIaKxV9rxhSTZzF7l9K5S0wvjF1+kwarDs/M5SQT8pZtdEnySC5tgn057VgiCpEHbCWYm16zWPv7ARLsRV8D21nmMoYAJqJ5jZZMcrVTMuutYG7zc7W2rmjt2Nto/enbDGWgBeyMCsCPPA6+VYvOXWV6JTCwwCUQ//+LH4z1Kokk02ObYuNfwh0x4ilnU6JYM9t65ExOl7shHpKQUHrXwtwDi49hZNTD78s3yPOJYa5E9delhUSFFCAqH5/AxgSFKMOJXyBgsQlntLLWlYGCUabX61ClQuf3flIQ80RBZKlwA6qTpW3dS4EcgCP4beaujMVq/ifreAkY3hGwZwbdXViux7rLJTdj188Bim8KVbCYfIwWWoin8Nsi/rZiPorqikSMdyEw9VoWtIMz6/PNeJY5mh68hzeCGFKEIRNDPy+wMlMbh1Q1vzj1RTQa7sMAaDrq99gx3oc+CXHZKpbVwPOk/HwjJ6JM90TNrZdBIL0+PW98LgriR5FuqoUFp4DUHMSW0YjZDqj+MUq9OMFhOCFUTzg53NkBlgvKdzzr8Afve7xL9pXCcvXdRPxCHW78Hj1cJn/zmOe19RissiNTqUS5ArxaCeiD3IEmVKJboz2B2E7kp+mwpjCvx0IJ9HUUGJiBeP2ayo9SGOxZPfKVZ3hLV5Yrk2kyOagPI9ZA7kNzCRQO0+cgObPKve9kqANbcB7CxIWP7yVTTMGN2hHwzK731hA4nU7VXT2af6fO1/A42/DHaqmLqgBNBij6ihMW+xtOUmfJ9Fft/+9fTMps9rvznPluGxp4LwmLiugk9OEg+5qzJMzpec/zYFU0L3GWPiMJpcrBgO4uZ9Sl+beLk11GzbrFgcL+3Uhb7dzgxZvAaE4kHPbx2W4VDJGCuXdiTTlPZFwV2KTE2k7U37bP7IvgRDSu18ZjXqS0ckwDqd/jbXwmc84FLEo73rs9D050kmeYREx9c/GJHs6bR5bTIKkrCorEXJ+I1LNItiyYpgQ0fCsutxe91UwVLh4IV1l+jmjQoOeY99vzYmqJ/mv1FbWuqTSFZzHOIJmxpY3hSHGsnjh3fTlCwp2vb7OI2OcS4hdPfm/wUwiMoO4o0+MEEIZq4s2/243WkXxnQv4x8eGJkbBvhlhKgNOoNxwB5wgAAnhXkH3PH08VS1skVudmUwMNChQMwKnQr44CUMhYsmy3PXftAeLMBvTjSAngfdupJU6mV6hQHcioY+uk3cq0AfBtDdRKa/ANMFXNFt45zbANxG2wtfbaGLKmSETIPxshs5KupcFM+E0ikl+/iO8sLV7tbIqPmgzKG4kuovGfVw/Io8Z+ol113M9419oCHr8M9LZcqOw1HbQcCC2hDQCyW9aiCEryPZyUN0c84vCukRQACb0YeTBu8Hl693+QJd0KVAJ8c05wTRa0xBjdsTdZ2jVGdSez42wtoI/ZaMsjcOFKrjaeMuzH5ZWNJROiawSaucbQfRtrfvIXBDaOacqMEIFp3qU9wlzUYAAJhhHp0I2DeM4moOILIdIS0hflR4p2MLF8VR9TO6sy3qaQ+omHxh4mWqVin/PqYKElWtTbxMOCM5U5sxJHVw+MsnD9lcqpWRyunuYDGtMdDLOXHUxRsoqk7O0X3gB2Pta+ffxXL5yNMsQAMBqzvlO/x6N6gWQxkySjqMwrj+oeKs/uVWuSbxvnsGAkR1k4XobilSn8pN4Tws3cnNH848CYCoLrOEIXGQOFfm5IqLBami3ECbfrxZOnlctJ2O2FMMtM4oKK889EbGznvm76A2lOEmgIMhPDFsNwca6AJRIP+AbZVafFTK/pjG/DQR+Onj5x1ArfG7xkX1GcgsKqlPk1XC+SyBa1Q0/BE+lvrYD7/ozLSA9t87Gsm0/+fFpWr7+Dx7dKA1qQfhE5TU+uhAn5iz6m/4mcH1JTKhW2EZVdLI34Fg8MVPBHDoGwcnYGw54D9UT1dHjUdYKXDmkECVg9t/fGLNAryddSE8gwBmGQPQBCg8ACFDG1Vz7pz4DwtIHtc+vs8Q0tjuCRut2S7fexj9jEXaUHUaUiY9yMHL6g/3X9/7WsxsQ9BVauhusCPC4WjsKkFny/W7felQcWbX9OJ/73kRA78BuG1yWPN1xEkZFe9IWQhMCCOKZ+xXJs7IBi4bsctunx/TyWznFXi5mUtVyLgEG1JAG/7MvLXxyJrg2RhViCrMv/zWdjxuaGL1oPA2JINl9QnSsWFMYJwsUFy93HIP2KIILJzam7R0Q23+Xj0ioiO9tFl5PGAlLJEMhRVREnayraf5PKcmAYJsJNguyoTJhfyFCsC0kA74a9S7YwXiBnr7SLHNuVvBACVyvcSqGsVb/hXDDwzdW+UTXiklYnH5U7POZNSkXq539j+FG71Ndxsxz906PmTb/ZU6d3X1Zlm583SRB8VzYf5qCXrHJCK7d98zytr9XKoUH1rIItoqalLp67udBMEOqRrdiG5GYV/P117dunqKt8cVryDjUuiFfkNNRSSBknnFEVuIXdeFOo/tgfX6AqU5sDmjajo88fRSOnnDkAK0YazroIwporIjp8QxTCv+HLLpt1FsQWnxI7gc1hNaUnzCkTuoTTwLzIAKzJ5iWfgJvu2voRLFZ7crFe8gJ5eCZ3x3O6uvxvkhit0XYFsuPL2A7b4agWGb+fXNbdccCoKVo1mZjI5EX6medskd6mcEEKscxBWb8skl4azvlcA8v4l58nkVF3P6puR3nR+nMlT+igLAEttSfIO4aKH2ry5R4D14InwrKbURhOZOiLmilVjqtTZJ1gI/pLg9F7d4FLpG0qINV+srl1aC56zfI4MkXjroArUE85yO/XmgrqWHS+PIFlUZyAEk7tb0HcK5J/vAt0MGEsya6QGk2+6nBi2QDEdcykbe9GKcJSv4JKlzjngqjh5yjz1PY2Ui4QsuAQYfVpLOJXFtsVyXxl9OnWNAkIcyRdjR+UyJUqeMrJmvCGZvyDkr0heFp+W0XN1aW6fOlB4wURO6wvvmT+f2cOR8e8oRW3UdURs+UPBVQSkaU8fHDFecHkfvruVuN0JhKFDGijcGQEicA2sSHfgSrzv38aOwCUmxsPaSIdLqYlz+Q+GzPkMFQpkQt43C1yLaEh8FSOkixOV/P2Y3q5PsII3yfgdHf6aTGAy3OPK7eWc4Yo/avmsj21hPcJDoJk3iMYGQE/kGwueljbGLkESjROGcbJOe7qwavRbM5Ok+TgKmR1kEeKJ7rU3UWh9Ttz+oBd+SZUXzbphYUvPLH1GLR0J8qW41Yv6WmL7Zg5XMYw6OmCWInmkSCQPoTUEhrkagnscZ7OFpdls0QE7tFTHKmzXU66cAD86BZofRkBTdYI0bk61VLr6hXV81YSBQTBVZu8FkAYYfI40l7FHDi/3fNQQ6vGGlSCz5ULlF4QEeBA5rzPBkzpcK22e+bl6YBOnnpx3N7edak3Auc96oGVFabec8QM3CUI4G98rt3A/OG
Qw9iu6P8WFfbuBQnCtva4pFCrJorA/6QXda247/pRL7ov5lMMc1qqLrYzxLgTUoYs0CCgIosEhucryWseQ9c5KzY+r0pChkUkKhkmXUxMqO6+5pFZ/ef1Oy4KXQYUMR+RU/obNSHyyB+L70Sw/xJCeGy9d9bCjMmkDL0t9elhkn0unvzObirMrHPh4h4FXYx6rxyfqdcz8w7KsElalaFk4PIQKupZTp+UayvTCKNPwLuaEXQr5tXccra5niBnN+TAWRzWKXefACVlF1xiVE3mhbH/M6gdTYp/Pj6fxWoP7pQG5lolcJsn84BG8yt2DYJUknDNBw992dolm7mpFWDbFySsKcyZfXTl9qxNUTG8ge19reYz+pNZANlWEQf2tG+StIiFZVZkj/X9DQECCuvK1aCPfb7jop14pPtOC9iNIjBG2/MvwoiqsDLz0IZMMA+Yz//STFJBO/mDzll0Js+znxQTl2VOuTxOpZ4SQvPnp2jPxVW/+EaA1PCQhOvy0x2kkH1K+KPsIJkQvLG7XbS0C+qOqvmccjBRN0iwf0DD+tqjYVUZ/EkLd7vtQEKL00HMKkdErClQxRPD/1bTe1aw3OUfegjlohma7sjZCQPrD/7Z81oVOZfLBxTM9kYwx5DdvZP8K2g/v3qjtEac4oT71W/a3yLRGllWEuKf6d08Yq2LrR5jcNy22U0B9R0exyFKegatzOOCoyxzQ4/GRGNuRXdvdnzwZqUCxY4war/yVplduX9R8pq+wZZLvFF9T1AN13JSKbB82LG/D7dMgZpw+Av8ur98jpUn8RoTPWaLAyEVFaYPSy5QT6vDHtXFXD8PVi2ET3uWpKCrVPRiy6sYGHB75XzN2MvXsqvRr7voBo4Rl4TXbZaznSxwxYLzHmIM8XzLekBxOGg+p6ROERQ0Bw0MYscv5TDPunfts+tIU2ykVfyfkt+4wyzX32uOseAi7rn40pXw2fixSAc8lBe3h7myKkGvkn2EkxmKsvs+6ML3TeoTherBgPi+8V3cCgIakdNXCyq7Dm9HeZ4yJEmgWaAHkLZq6C4ZmrJ2ZVXVFc8zxGao/IHFQCrsNMXa4WcnDdLKl/88v9A+W4nLQmDIcvU+rfQKGhBp2XbnEWrzewVw9d8ysuyeqiJyjvjjBIbLK+AZapva7xG74cN9FNuGWOdt7+pxiZes94+9ERUbT/Sxhdca+sGV1E9ueSv8Bw4FZ0l7qFOs2AyO65DUTekPwM3H84MMyRDXrVi733KMjduEnhjQtfoEYidQBuvpOUm5opb7xiVGgtDqtYU/P2D4Ztf1x+n/r7aZqytfI+8CJwKh9qhhgT5NKH4Bp/AuJVJqHsZIdUUNxrUhCprv8RU74Q1y3DimHkHr+yqr3LU7flZ1MnZQF+VZ51PgQTfhrGgsLCs73jPMgv9jLsRpNxs5K7EIThZUiiDMgdP4jicfrsI7e0XT9D9Nmpvwj2flU2pBkGNO9v+1YYpK2hb71KAxj9kE8KrKshiJHv9WU1RqRmWmIfvIvi+BjfaIMeywTCFcMKWFPret7zY8ZhJqvaowFoCyhNiLYWFjvKqeTZbeJti/O7AKjavWn8fa5LoHqGiIQeGjp5izIEbD79R7CaNNmE0suHKFFSjOqU1yrQQ8saoMkT3wHspM4A42gOD/HGFu0fNm/RGNZpBAqHmwOJ/6AhkKU2scSX0QXZrRXhLjABqvBo0z/OcCJSgTx3aUECmBfggWoSJcFUnfREqLlaecxEfme1ZKGkpNvJBwnNCscy9FQfQsgw0ryS1AzcUyX/VNgW/3ny4edpDK8dcVVmXJhft4c1yH+QLA1be0clmpLf64M8t4pkD0LGSXSNL7UqVHfkSiyaSWttwjdGmELYSQohx4nWEsPUO+tze1TYeBlsgVdH6UctVzFTuop5jLUVR3oHOBScFashAOHcDalzVcbzpJ0vn2n7YeCN51+5gPhgzWqd4bP1xNo0Gr0VTCVWpAqoTlRj6HekLhriSxZ6peiDarmlLp40AYGgViIf2Zh7uWa71YEDHlx+MT/VIQxPrSnnbCHGT+a2hzy54AoynEdL73gkCj8JgKce73cjtoPaFfRG+FLk/R+07JI+Al8cV07RwQysZSFZS1nkAvntcMgtBJLudhlc3IMP9k3l6EOxZqHJyZGoOwTfCMjLg2P7Z7SScEpykE+lzWFU0W6O+4OGpE8K/zYbErAdtF7JCLJLVA9AvKCyeDmQGdRGbhcPhnskFIGpL9RNHKZHAh8eTKQdE/Dk/K3+0wPGXD45iCk6lbgu9S5x0uE/kcVWDb3TfrvvoqycGwdnxALI7/lFVlb0sxrDrnNOuEG0canG+RKOKIPJZYa5FyVu1tXpr+2kYvvcsIwxVVzTl7/jOo4Fnmb8b/7QxXfZ3UVVLj+8N+P6M0qUCsiaE3pGnGy1HxEwfC7nIJfki3+tBIBa0hDnw1cxfQi32uvdlqyeV0/VX1O2tg1Dj3ihbMrG2KQ+YTKNjinDUA3QmF3K1ipKk2+xoilF8vuCQaEJVJIaDOOIwX4x+/4d0n1Q68MaLgw5mxK3dv1A8hr60kZ8fYPVvNkMCBeLo7cMVs4swMuVSjM6CxrQsBId/+JktBo7RHJakj7aeWdZ+g8ITxx54oNQutt9+h2QTKGpSDyU0j6kF61rn5M+H3MB1ZN8dLE22fcXjzHFGAKvJzJM/7w5LDQ/Oh/Q0Z66oDeacr+NtAjsok7FIn91NLerbGoy4rjKNc83qyoKwdDmhWrokeneCS5kqgTG2b17cGb1ynyBNKBFTqDtbnkFTK68vsJtP0hzN7hXfINKOTGUEPdKTE5WPyt2ZrYOoz0VnA01EewJa7gXUec6x/kBp7X9240r01ywuRGrw5l+JXtiUmYjOteA7iSqWwbwqnFWwAlBgfXIvw9hcrVN3eXxX4K3fIucHS2ibM3KE/e1VviGkCU8/K2OMzQHOuiQiix9UPiwF4oUGRalHxBchirfetT5yIhGsdLah1X3CyDEaxUA6Cos6rV4gB6ouVnVVw8pqJVY5JX+rYc161tRLFVmtrQZhbstM9Gbc5dJpHJl6xql//rGdgAcEnv5jm2xe2BHY4Wn5y4P5PGeuNe1fKBcnLlgpjK4dHMB0NUfLH4o2E767ZXB8rfndv19ZMfhMIU2E2x+A0MZNhKhy2mefFaj+wQ0OVddKhEoXYMETtGaP0pn2jfwd/r8jBwn62zgNRmZFfhJ4OqbYydTuuhuZQyfZpLlF3uWxE8tqNWWzLRRWTZVwBAzKexPEuzsVIiKkrXX94kYze8kt8KcoDkN19jrSVyomHZMBk94OnNouLVONkXWkDxvDIbVMvXSJs+uqk7228DmhZplBwNSpaVg19q9Ny0vCvio98Dh78Pqi12XYaKRohe7RuJbzUrwunTW4hsV5xAreCy9n2DtRKWWI1v7rw4L/750nS1LtOJXUDbG0FLCpRyHmVhckad+YXGK/V6QtTtVDOp+DqH/7mlgeTkOjzuXej5i03PaZhez4eXw6Cozt0BqmbbaOK7aBv1GdZTWWVlQ7A7fnGGC
xFoElmuksWIKIzhwqf89a0Lnk0TjF37f55mvnr5F3XVW4TlVUhsKhsHIANNqb/xKBFdxWSjMJg12V/5DeItXrcpr3pI8KJayTCpBOqbzcfhk+fMjMmDY6/+f1E+nMpqRYfzecMDYwHwpPV0F9DT5xzddj/vFPQMWgR46dz/jVaakX804jDbJ3xCVBGa4EpCLR3Br8Tmi7lA8RKoRgxEayH4PYHpI++Zi+VdU9X5R0ANvWmFqtzzv2XuCg4dPwIwFAfmeisnvis81lF4xei5s7bTlubyuMo13VKRbMAYj92exfPxrwl5N+9qbnmIzidl7/mmGq5pNHJ6zUOXizulKFbnpJw2S65Aun2jmaWdQinTF7Nv+Jxcd+4GSkkUPcQNhIwoE7rIF2PaLBSPFwEYkro/FnxsWElzk8z1ReQikPzMGh4+GnW2dzU0qF+G4X0CNiVewq1of+B6jQotyvLXtmsinINsLZ+EtE1J7ld4El1EMvTPD4hyVHmU5TMlKq320KlRFE9h33vszSAjEmhnM695IoF9R8jlHQ7uDJ7n05l1da3nugwlRewsC5sQtuOQ2+DQq2MKwGKDe/FckChLyWE04XHP+pDmSnNzjzjScWJswnucFfv+ThapwkyJHzGIU6kFd1RXXSnusEkker69Er4NvK4MIYUIqUBXBBIKdOCD/90q8FB/22tu7JITuKl6c3vPlcSI5zUNdClEl99ccvLc2nY9ggGVe028=',
                '__VIEWSTATEGENERATOR': '16D6DBB1',
                '__EVENTTARGET': 'MoreInfoList1$Pager',
                '__EVENTARGUMENT': page,
                '__VIEWSTATEENCRYPTED': '',
            }
            url = 'http://www.gxzbtb.cn/gxzbw/jyxx/{}/MoreInfo.aspx'.format(
                categoryId)
            response = requests.post(
                url=url,
                headers=self.headers,
                params=params,
                data=data,
                cookies=self.cookies).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            print('page {}'.format(page))
            url_li = selector.xpath(
                '//table[@id="MoreInfoList1_DataGrid1"]/tr/td[2]/a/@href')
            for url in url_li:
                urls = 'http://www.gxzbtb.cn' + url
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        flag = 1
        task_li = [
            {'categoryId': '001010/001010001', 'types': '001010001', 'all_page': flag},
            {'categoryId': '001010/001010002', 'types': '001010002', 'all_page': flag},
            {'categoryId': '001010/001010004', 'types': '001010004', 'all_page': flag},
            {'categoryId': '001001/001001001', 'types': '001001001', 'all_page': flag},
            {'categoryId': '001001/001001002', 'types': '001001002', 'all_page': flag},
            {'categoryId': '001001/001001004', 'types': '001001004', 'all_page': flag},
            {'categoryId': '001001/001001005', 'types': '001001005', 'all_page': flag},
            {'categoryId': '001004/001004001', 'types': '001004001', 'all_page': flag},
            {'categoryId': '001004/001004002', 'types': '001004002', 'all_page': flag},
            {'categoryId': '001004/001004004', 'types': '001004004', 'all_page': flag},
            {'categoryId': '001004/001004005', 'types': '001004005', 'all_page': flag},
            {'categoryId': '001007/001007001', 'types': '001007001', 'all_page': flag},
            {'categoryId': '001011/001011001', 'types': '001011001', 'all_page': flag},
            {'categoryId': '001011/001011002', 'types': '001011002', 'all_page': flag},
            {'categoryId': '001012/001012001', 'types': '001012001', 'all_page': flag},
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
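The load_get above posts a __VIEWSTATE/__CSRFTOKEN blob captured from one browser session, so the request stops working as soon as that server-side session state expires. A more durable approach for ASP.NET WebForms pagers is to GET the list page first and lift the hidden form fields from the live HTML before firing the pager postback. A hedged sketch follows: the control name 'MoreInfoList1$Pager' is taken from the listing, everything else is a generic assumption rather than the original author's code.

import requests
from lxml import etree

def aspnet_pager_post(session, url, page, params=None):
    '''Fetch a WebForms list page, copy its hidden inputs, then post back
    the pager event, so no hard-coded __VIEWSTATE blob is needed.'''
    html = session.get(url, params=params).content.decode('utf-8')
    selector = etree.HTML(html)
    data = {}
    # __VIEWSTATE, __VIEWSTATEGENERATOR, __CSRFTOKEN, ... all live in
    # hidden <input> elements on the rendered page
    for inp in selector.xpath('//input[@type="hidden"]'):
        data[inp.get('name')] = inp.get('value') or ''
    data['__EVENTTARGET'] = 'MoreInfoList1$Pager'
    data['__EVENTARGUMENT'] = str(page)
    return session.post(url, params=params, data=data).content.decode('utf-8')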
Example #17
class GovBuy(object):
    '''陕西政府采购网'''
    def __init__(self):
        name = 'shaanxi_ccgp-shaanxi_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.ccgp-shaanxi.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Proxy-Authorization': 'Basic MTYzOTY2MzE2ODphamxhNTJ0bQ==',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'text/html, */*; q=0.01',
            'Referer': 'http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&index=3&province=province',
            'X-Requested-With': 'XMLHttpRequest',
            'Proxy-Connection': 'keep-alive',
        }

        self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost', dblist='shaanxi_list1', dbset='shaanxi_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set
        # shows that at least one URL was ever enqueued.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self,result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self,pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >=2 and area_li[1] !='':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers, verify=False).content.decode("utf-8")
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
            # self.load_get_html(li)
        else:
            title = selector.xpath('//h1[@class="content-tit"]/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s','',title[0])
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//div[@class="content_about"]/span[2]/em/text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
            else:
                publish_date = None
            # print(publish_date)
            area_name = self.get_area('陕西', title)
            # print(area_name)

            source = 'http://www.ccgp-shaanxi.gov.cn/'

            table_ele = selector.xpath('//div[@class="contain detail-con"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source
            result_dict['publish_date'] = publish_date
            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '陕西省政府采购网'
            result_dict['en_name'] = 'Shaanxi Province Government Procurement'

            self.save_to_mongo(result_dict)


    def load_get(self,noticetype, page):
        try:
            params = (
                ('noticetype', noticetype),
            )
            data = {
                "parameters['purcatalogguid']": "",
                "page.pageNum": page,
                "parameters['title']": "",
                "parameters['startdate']": "",
                "parameters['enddate']": "",
                "parameters['regionguid']": 610001,
                "parameters['projectcode']": "",
                "province": "",
                "parameters['purmethod']": "",
            }
            url = 'http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do'
            response = requests.post(url=url, headers=self.headers, params=params, data=data, verify=False).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # self.load_get(types,page)
        else:
            print('page {}'.format(page))
            url_li = selector.xpath('//div[@class="list-box"]/table/tbody/tr/td[3]/a/@href')
            for url in url_li:
                # self.load_get_html(url)
                if not self.rq.in_rset(url):
                    self.rq.add_to_rset(url)
                    self.rq.pull_to_rlist(url)

    def init(self):
        count = 3
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
                {'noticetype':'3', 'all_page': 2},
                {'noticetype':'5', 'all_page': 2},
                {'noticetype':'4', 'all_page': 2},
                {'noticetype':'6', 'all_page': 2},
                {'noticetype':'99', 'all_page': 1},
                {'noticetype':'1', 'all_page': 1},
            ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    noticetype = task['noticetype']
                    # self.load_get(types, page)
                    spawns = [gevent.spawn(self.load_get, noticetype, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
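One thing every example quietly depends on: gevent.spawn only provides real concurrency here if the blocking sockets used by requests have been monkey-patched, and the patch must run before socket-using modules are imported. None of the listings show it, so presumably it lives in the omitted common header of the file; a minimal sketch of what that header would contain:

from gevent import monkey
monkey.patch_all()  # must execute before `requests` is imported

import gevent
import requests

def fetch(url):
    # with the patch applied, each greenlet yields while waiting on I/O
    return requests.get(url, timeout=10).status_code

jobs = [gevent.spawn(fetch, u) for u in ['http://example.com'] * 3]
gevent.joinall(jobs)
print([job.value for job in jobs])

Without the patch the spawned greenlets still run, but each requests.get blocks the event loop and the "concurrent" fetches execute one after another.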
Example #18
class GovBuy(object):
    '''广东采购电子商城'''
    def __init__(self):
        name = 'guangdong_gpcgd_com'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gpcgd.com',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gpcgd.com/gpcgd/portal/portal-news!list',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }


        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost', dblist='guangdong_gpcgd_com_list1', dbset='guangdong_gpcgd_com_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set
        # shows that at least one URL was ever enqueued.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self,result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self,pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >=2 and area_li[1] !='':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, pid):
        if pid is None:
            return
        try:
            url = 'http://www.gpcgd.com/gpcgd/portal/portal-news!detailNews?portalNews.id={}'.format(pid)
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            print(url)
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            # print(url)
            title = selector.xpath('//div[@class="pub_title"]/h1/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s','',''.join(title))
                try:
                    status = re.search(r'(?:招标|中标|采购|更正|结果|补充|询价|预)公告$', title).group()
                except AttributeError:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//div[@class="pub_note"]//text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
                # publish_date = re.sub(r'\/','-',re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',''.join(publish_date)).group())
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            print(publish_date, title)
            area_name = '广东'
            # print(area_name)

            source = 'http://www.gpcgd.com/'
            # print(url)
            # print(response)

            table_ele = selector.xpath('//div[@class="pub_cont_details"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source
            result_dict['publish_date'] = publish_date
            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '广东省政府采购中心'
            result_dict['en_name'] = 'Guangdong Government Procurement Center'

            print('queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self,categoryId, types, page):
        try:
            data = [
                ('portalNews.typeId', types),
                ('pageNum', page),
            ]
            url = 'http://www.gpcgd.com/gpcgd/portal/portal-news!list'
            response = requests.post(url=url, headers=self.headers, data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            print('page {}'.format(page))
            # detail ids are embedded in onclick="detailNews('...')" attributes
            url_li = re.findall(r'onclick=\"detailNews\(\'(.*?)\'\)\"', response)
            for pid in url_li:
                if not self.rq.in_rset(pid):
                    self.rq.add_to_rset(pid)
                    self.rq.pull_to_rlist(pid)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
                {'categoryId':'', 'types':'90011','all_page': 1},
                {'categoryId':'', 'types':'90013','all_page': 1},
                {'categoryId':'', 'types':'40011','all_page': 2},
                {'categoryId':'', 'types':'40012','all_page': 2},
                {'categoryId':'', 'types':'40013','all_page': 1},
                {'categoryId':'', 'types':'40014','all_page': 1},
                {'categoryId':'', 'types':'40015','all_page': 1},
                {'categoryId':'', 'types':'40016','all_page': 1},
            ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [gevent.spawn(self.load_get, categoryId, types, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
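A recurring bug in the original listings is worth spelling out: a pattern like r'["招标","中标",...]{1,2}公告$' is a character class, so it matches any one or two characters drawn from the set (including the quote marks and commas) rather than the whole words the author intended. The corrected classes above use an alternation group instead; the difference is easy to demonstrate:

import re

broken = r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$'
fixed = r'(?:招标|中标|采购|更正|结果|补充|询价|预)公告$'

print(re.search(broken, '预标公告').group())       # '预标公告': nonsense word accepted
print(re.search(fixed, '预标公告'))                # None: not a real announcement type
print(re.search(fixed, '某项目中标公告').group())  # '中标公告'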
Example #19
class GovBuy(object):
    '''南宁公共资源交易信息网'''
    def __init__(self):
        name = 'nanning_nnggzy_net'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.cookies = {
            'yunsuo_session_verify': '2c0b046605eb7acf81b64a462d5a88e3',
            'ASP.NET_SessionId': 'k2oz1d45keci5055fe5br43f',
            '_gscu_1349052524': '33974463sf7nus87',
            '_gscbrs_1349052524': '1',
            '_gscs_1349052524': '3397446376zl7787|pv:1',
            '__CSRFCOOKIE': 'e0612cbd-55e6-4892-9a1a-bad08d9eafed',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.nnggzy.net',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.nnggzy.net/nnzbwmanger/ShowInfo/more.aspx?categoryNum=001001001',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost', dblist='nanning_nnggzy_net_list1', dbset='nanning_nnggzy_net_set1')

    def is_running(self):
        # Finished once the pending list is drained and the dedup set
        # shows that at least one URL was ever enqueued.
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self,result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self,pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >=2 and area_li[1] !='':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            # print(url)
            title = selector.xpath('//span[@id="lblTitle"]//text()')
            if title != []:
                title = re.sub(r'\r|\n|\s','',''.join(title))
                try:
                    status = re.search(r'(?:招标|中标|采购|更正|结果|补充|询价|预)公告$', title).group()
                except AttributeError:
                    status = '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)
            publish_date = selector.xpath('//td[@id="tdTitle"]/font[2]//text()')
            if publish_date != []:
                # publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
                publish_date = re.sub(r'\/','-',re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',''.join(publish_date)).group())
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            # print(publish_date, title)
            area_name = '广西-南宁'
            # print(area_name)

            source = 'http://www.nnggzy.net/'
            # print(url)
            # print(response)

            table_ele = selector.xpath('//table[@id="tblInfo"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return
            content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source
            result_dict['publish_date'] = publish_date
            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '南宁公共资源交易中心'
            result_dict['en_name'] = 'Nanning Public Resources Trading Center'

            print('queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self,categoryId, types, page):
        try:
            params = (
                ('categoryNum', types),
            )
            data = {
                # NOTE: the ASP.NET form state below was captured from a single
                # browser session; these tokens expire and must be re-captured.
                '__CSRFTOKEN': '/wEFJGUwNjEyY2JkLTU1ZTYtNDg5Mi05YTFhLWJhZDA4ZDllYWZlZA==',
                '__VIEWSTATE': 'ENWLp05tdLmofrQhOLBfOX9cVYZhmfSq4eEj5SOhG3tVllYGKwfS2gNjrfHqQfJt00vRa4vShYV8i/62pAOVCBYN6NwfCyeIgcbloQOvnnK1HovJnjtSKsT+IfP4pZPpdfEmcQrHQuVrp/c4MQkBmDNoJjrz4Nn2fx7yMCqrPxTvjhKVUwSnzvYeSCoiCV0swXlJan7hXMX6riIuKWwYCjcbZGQiMYHaJk2CIOtpIaILEHYCuQavxbUsTSQd+bEr6Uoq+U9UTHi/v4X+GvNREQOJkHpNcedt8lvkCzIpNIgCjeLIU55XGzTwkD2TE/QqVXLRkPZWZWyztjBOaqx0aN3WDsSzZO+tjgYc6tDIEX+f/o/nBNEWYpHVZb0tp4jMr9y5mPKn6q4uCQ3vAMAduJtL294HmwSSpiSeuRlsX/Epm2mk7NUbuOVuduji1cDQwkaU2lv0LFEY0IZHX52tpOKGQSOolksXGnsr+v2sLDxZ5SOKrlBu4eup1c+oYNbWZK68ytKsa1rJ0jcjjBYsET9bwr6Ps2+MytJtrTowfDuBDkM26PdZegUqdKSltqzkFu06Mo+oz4UazIB2ry5tb/62jTw3P9NNf24bOFG+YtbCOCbV7m4gqJ7OAcaPXFCEchokQd6ti+bkSTen4N1SceegmpAWcfkd55vFDY8bFF3iq0SXvUw3MOW22BhVoNyvguEZleEJr/y6fg/q+W0BTQGgmrruxJYwRXaxdrsbtx+3pIrue7P9dUZaJzlNJm+5TxFCp9D+2sOjnP5KKoG5tbntOHZK4iafoYC4za2YfnbWvW5lQ7ioDLPw0dIRPgV04rPG0cMPDWao7pAg95IBivlZKxQpjIDz871zLk5sYhEDFUH3pad6QlR3pD5elWsdc+4udjICLfJ+GSvcfJSdqc4aM5TRFZP6aTSFLUfn+An/OhTr50aYn/J0uO3arGlp0cvtn4/E27o71tuXbGuPYW3mrg2np9+XkFXLIfx5jJIi0eQtod3GKtILYhLgiDfsc5qCG3ueZy1QB7UnovrD3BU5oVbBMXHuzTNJUOrz7JakdrmXxgLEV3f0L2V5YrL9h8wmHuDmKPxORqxdaU/yejRJaEUjrjuBwC+rZsZouv7+o9Sg5F45qSjX+aZmWiwglICMdUdFQiT73B3ljjqeAZiCUoILeWM6dOEe6vSqxPYuELnnXSEimR0OssVqN7KHDmcq6Q+9dvyuzwGbAabNEN+TbtppMHloyNXZY6qLVTAgfbHr2Ir2tBTeHt07WXij4Cz/1/e/sUaF8Zl/Rgb1cC7GUIfA4AAyAW4a09zcej42E1N8ba3BcVLzBQ/ZrIZhFrX/ETaMMhCQMlGCIBZOn2nvItSDjpOfrlUxDfERHm+ftrsH44UtFpYP0HYR+0c4K6VISaunySinTWB9Rh6e6ax361mkRRNIs27AfMdqxByuVKeJii8nTUXEIFYaAozDczi4gDZrPXJZ/zAljjtBdfIaHFspPRDbe3Gdf3pZAWw9QzjuMkrppg2mDLh2wOJ//7mcKhMkBuXGJblNbFDzW7fdNvnSWF810t7hHL+v0kA1dZGw2Cl3RZD/IVTsc8JGGaKbpeN/AznRzMAH3/R9tXObE7nQ38J67dAtxmBtCdSD+IDqhumPWs9hlB3M0Jv/iJ8340VR4z/pjGe6Sy0OLyt4EscTB0mCc+AVnc5c94EpA71bWLT3z2zMhnoQTTlW1B2+8BJ5A0y4sue4wPP/KpII9x3A4WXc4r0pj4+xf8mom4PQip/9Cq8SUFxA+1jZsKWEolrMkE2WXU7/Qyv6+CGwsclG65FPkUw7+eOcQseuj7tq2ChjQyIvR1/2z3Z74Vl8GRTf0zMSMVfdPhFqdiS1Lkk+DQ57XOXhBenN5IJRXPdxlE7TFWYCPG7FZACHuwxnuVdODhR7bWxB8Zq6ySYBqYkXIjCYuzjwFJkgTLPT6LyUXhUgkFdoyj7KU08VcqkZrft+2Fc4PJ8BiuoZ89ScQwKGhvGuf9qbMwAR8m7aKyT3ZSwS99eNIq80uEWX4DXV0z2ipB95IfgooB1eQOFUoRba43Ld/KHztxb1eUZszZZxoG4mS0S5YS5An83vrap1BdQM0t0qKxEkUAFHfM9HxqbyU7cJ4yVgizRPGKwXX7NCb7m+UbhVP7bUImNszweAekuvVS9mypOc7it54YW1N8YZJrv36XocUOSfg+Mw+sb744qLh9uQ5ihOzD6sD+Jgb+fY+VBtheVUNMfCRlbishYUBvKz1/SXdZVo2nBLL4iWRFxUiRJJwxXbknOlRVA2HTMPGKLm+YBQJrw3U5Vq3tlJqWcPLG+5/+k52m4CN1EQZyS4wL59RSyYfdPvy+thc7b634/HSBS12dv+I7v4Pjvh3dJKB2QioEQtAcY8RKdLd1LwT9B61L0+4QjU4SBTAcaZ4wBTQfqdl650sOMlnejc/kCCPYY0ejM7Ze3MhsSkHFcFuZdhfumQW6Vx3LFoiAYJVXYZabgiHAIyZO2zdHfBwDZrv6RH4OLKnaad1WePqE6yYhwhCWF/ePXq6slEwKO8aZa3x86t1U6Vio2pmvq+pkk9gjcjXZUTwweKbVn6V1aU+wW/sYQJp3J1n1IsjiYSQDcmYXJP1c3pRvvdYNmuVDAVLHvtQwGioULwXtVAeMwoRXYwGf4JPEVQZHMX13wejqIwlbvUeIuna5IR7NTAUO1DCePkvJ7sq/WbNM5AaCy0wMOD8wovuD80YXPCgkIe7FFXqrpJKaOMqSk6+TxuPUKKNYv7tzYOFgx7G1BUcmYTpbKKBsvCKTfz7RmdA0fKB+i3ZHKztbvbsuYBw46LYIj/dlUyf6XcVnWl/akMCUI6O0Tm9r0s+zCI6YYyR+VfrbpHTPtnTuitVLLkJgdwixOkP9bwjivvW2E8CxGg2XktW2umjDOBY7ps4iPxBncmrs5Q1f9FnzwBMs0VOJDlfeNjlWHfynn2XYA5/Of319cxK9uibdseUUMP2MyQLwMm/1+z7eKeq2y+vU/dZEvAoTkUPV0ugupubWnDwH/wT90mqm5eNCHXEbs8JQ5sNkqCCNVrEPIty9hoDSmiibSjmSDOqiXiej6UOF+N5e8Ux0Os+Tie7B0roZKHAPBlgV3+fQbRpcenbkJnrLCkjGaDxorC30FUFqmOW1wxjELB0+vSSeloW0CkvKKyoepHhEMajyXQC0C7atZc7pkALHGmYs3T/2B+EJ0ITWJoqzYdtCRTL1zPqQP/yNkLxSF7LztLAb2SMrXbIl4ow+u6BGycfil1kGEH5uxOREAL8RLLZOi0FpW+m8/IHKqvjgxffKQLfryKpmsk5SH1oscZ5lzGcFc3N6HTZHs1ZeSMjVkRLWE0DSaAcx6juoFJ6bqCxMnqnyONC/iB0xuOEAQ0xValejZUsXhiz0tTcO17yBTP1v0R/uNF7tyStYuGkWIDGXzNiNrDPM6bnprYQleqSxjs2Zwa3TxM5ePrlVLaWgQmTiEsf+262eMTblCDPnH
sXRRbTWByq7MobtPVUjafUpLzE/WzNl7YVf+vW35UDiVemsV2judqeajuXiALujUedXjS2BfDpfyYllHOgVOvNQQB1ly1dKvG026Krsi4DDrpdJldbaxKzENq01T/oKT4l0ag8VsHIRhBOwhnP2NKQqc6klqbcynsShUdAAykD0quW02Xw/xxWHjMyD3NXj22W24ZJwTEIjePAV6v3S4h7zPsFp53ok7zbhHENi7sAa5kIZx9AlyguXJqGEzFCy03m0rW+1aRWdzd+ZnhVcubVsdABsAXMpIt0hG0oR94fJr1jbRADnZhMXE02d0NSbEKHcDoFMSi8ryUmi8RGWUy56VS2d4W3wRoUZ6QGDn6rEUlOHuanuAYIBBFsRfX6EHs0dEM4eXd+du1lqo0lXY+64KcSEywAu4HoAMkN+I5S3ojdVZP0HSCqE8AcyG2O+rV2cuJy55O4896HNPdMQvFnY4SyGjU0cvx9UKTi5wx+A+s0Rb7kfUmJQ9gGNTFebyCk+Z3M1oMCdfcGiDBBt3F+c4pZjCYEwBmSxVz6NoH6nRSEd71JcLvxMtJtW7lgzTbF15Uc08UmXfFGpZ1s5pg22k+OstSqUXO5TjoPyMoFLmmHeNxkX0HVk5XehlCWloHUYQk3nwGnUvXNIguY65yCCI0EJ3HY7GajyAZMQGntqb1vkmqSCWnlK1MV9EMW9Dm+5pF8lYSFqPWglPaU6QzWVyUfK1MAE6OXTOxBotQe29GGr0CErAcM8TeFQ9Rd4/grSpExwKVyUZrl8stZXbSxKMqMjealLAstbB9jIrQ6cJ1ThqOaabVFB6/DwBRqsRxjNn6/1NynP1WivrP0LT4d8lLUPXm/JQDqrF4/mFmZP4LMdmUx0ni7o1MRX0iMNYIMEkRrJzKdCsqVZly5AeGKzdIU11bYQfOlmU8JG1bSww12ci6d97pBhhASMPM3DQlm/N/m8BjjooglCiOgA6onr9NsVoTeUy6vqzWTRNnpE/dNH60ityKU9EB8ojOgBR1+omtvgbRAXcv4eh3zBgN+2rtAWXWMZl5xZiWldFPXe/Rp2AkSuhPgHw7KesAixhgQ409wtCy1sKbJRDYHxSTxlzqIa0zZPnDq+K+yJ851UG25CgQCgrAHdNoDjL/FfkkoD4DQ0ZLC6aXRPl2p+eaDNjNzzFC1m4xwpf532gmabiDrU3Jbqr8kOefR4KYSEHu7U0+zmmhz0KN7tbJ8AztoyolRXxPmvfPuVK3d5FecLPc4vG0dfyPI9TaRwBxWa1czJcW+xp3LPYTulGqHjU6RQEm+OMFrun1L067L88VCbwl1niEkzAbsgiUiaDL5IrtozIKwhN6KR0ytQUpbCEQvW5DAJteaiiu2wuouoCUmD6IbQJlge51yex8DfrojDxfYxvHzEWFv9xj8eq8DuSh4h6frFxhZtL/tpZxh33AJ+lBVrb89G2DAqbzKSnbTCaI55QXuMH3IBuVkqjBua+Z7AnEqGkEqV7JHEZsuojcpnUH/bWZQqBjRrH3hts6R+A7FC1EgYeF117OdzeIxHGsioFCt8qvMlrT+Ihr71+DLGzeL7xm4ZP1PSYyIruaR2xP4oRIQ6wJGbkLG1erqM7nIfyGMo3NK09J2BcK5JqeeooMY7pPDiMYIrXtfWi2vyFL1MnBIXYXuVjQEehV6rE0gp+x7d2S9UPzdyt6ihrTNi4E3cmX6wj1qK1LkHmgA4zvfx2Fvj9l9rCjOe4DHNNlSl691RVk7xmRnIuTrbNEXk6MEbCD4XmbRdHn3kA2fWwJsScwLmHJdvsyJGzNKu4aMiuuFa2a/8mocXFdcvJz3WvVyhxHxRqUj0J7DlTgwuOiZCYNHGD+mVMl8rn80/d+UzsUdGEMdlLB1HfEIarGe2//EsBx8Bz5ohIHAnXERuvCUGAxLNka7g4qhCvXM9GX4fRMqivAHWIl+znUGKDF4/7aSpMY+aOdiFCa4wL1X8UEijsVNR43Aw8aZBBjVULAVy6vsYRmSX5Jn9f6ImbPiiQHf3M1Ux5hsMHp5+EZGAuW5DHM0Ey36iMXLfzXhX+ckJ2qB2JWKWEtfcNZQq1h/NakvZTMdvy9EH324lC5DAerKM4S+cTGPPIZda7YAnbTC+OaeSZWzFsNdnZWAoZiG57ia3XXF7zuGTWoVc5Cqv8CiGOpt4B8jQg4VRQznKVR+Vof1DoohzHKBw9kyvAx9ILGXd7WnqdtelwWhGO+aDcJkJoBqW8XjzXIw1q7bJY/PXA69dbpwuT/EwsCNpcvqk1put80GrJeE4VRijcprs6X6iPsCJslNWlWmE6JaVJgg44EPkSIogapPjzM4KbtHdWWrrF1VoUkr8tqH0O/WW1sl3kbr716Kcs+ZgBc8jiXSVp8gSpDHiad8SPXrjtTn0G1NY+CIw8EsYoJWrD2RCWxsa7PGJf+qa16B6UJe9R4Yl5l2BapFFeDsoD9lprf91z6OaOjE9vQVzRVzgBs12evLo4SncT27O1sfvBhfnuQ8XfRGpaZeE5aF/4VxOwAbVNatdRqEi9O3aSBCECgMY6mOwhnuy1/aJ8d3AGQdSRttqo5QET7zwPEIT6LaOQM+ZE/2Nok1zVa4+PA1Dbht8BDO4RD6xljBkoYiO3sb21Aogm3P2+xUkl5cJdx+UBWWOrHZkKNGuuy0U9gQ4yHpBDlSdw+EhQ5Xjkc27FL2jeQ+8GfcDdUC6CyQxYdoCVMTgU838S4e+XCq4xibtZ5RU+Ly1WNb0wgJeNbi6WPmuEdkzQ4jYmOt5YWTuM9jAzfgjSZPcmD89p+KD6P1g+KO2aU+pSYyANDpwPwrzg9+qQ8LFKo3g2Ctg4Ns5av5H8rIqExKkr4U6sSJQyzj6BOGx4aNDlRw9+zGeT2SugkaQ+nv0z67AdvUNBeYmwiEtm6GFfNqZ49/ugdrmTNQGY11ESBICili9nkj/fKpA9EqVfb4JknmbAPx6eekT88+FAcfqr+9MWrmchPQkmHkGT6aGSesKEBF/Duj1PAC9IqyXhI/wu5EuBuQchRND+xQyBOpzVbMEFo/cYabGi5jAbGxhzTzJ0RBmv+uHEVpshzsSMrUvMHhhV+MVYxCjcm0WXIHQy+4xG6tlCEpKj3fUg3HUpXdlenDAphGkqz++e7doW2rXLrW3Nojsk1NzoXj1vjSCbOWCFagh5oeUlELJOdIiU76RlOKcp2ymLPZMNyjavz0lgqpfMrqNr9sC82xLN5pu64LrFiow7Go7oR6lM5lWoM+T28dRnijTf4Yh8qhcjqVXhC0IU1eWKQ4lCV1KNYD64svpkXqWw16I4uJJ4LWN5Xz/4lTLqv/Dit74w77VB2bB2ndDH/F+Zjp9cyB28jWHtqOmeCmXXFjT4UNQn67wKcDT0+qqhum08I3NSzZ8m4Lu10rCfISQlplVrpu8DZx/fVl3g5TnxOYtSG81AYs5gunv2qPn/zega1hUdkLF3UGeVdVwKLziEv4dosEsRvVOWeSb5dwi28Z3xjRbQjRIwu+X8kfY7spMPTRJszAXLFpRDYvC+SaKKf4b
jg+xyuN5zzrRgKLZkoMa0FSY1ErDmmH4DrGRZODDvKNeGnJhPlvYOZOYl4xJe6KgggUxFfyTKhg/N+HcZMxj91jlb+VZmWZkJ8FehLIiFWHZIRL1fVUDRzZ5/sidEfbsMQUK7x2o2emqHPqbTQa304fLiCYwVxdrwVxtRmBw6mfH8faVzMTdRIDWRgDIdBn9Th1ZchZy1UoiW4jdeeGS9CXsmySuLt7UTXT0PNrjfq18dldg2kVQM5Wjj4u0PWeMwQI265DF30bMAYfORb7mNEPMCCfG8nVUeTZryY3bSuJRPK/9eJXn62S8MTw5AMyLmw0XFOpGS6FjmUVE2OgtpPRuioqMFdEePfOV4k5Q1PsLsuKeYjK77KJQpNLH1R44yIOiiNqYR0INEAv2IZdDBepT61XVqU2CM1DzZbhzZbfBnNHdW84qZsaVNRHUV35hPZXpwwD6XE99fCaBbuT9e7biAyYbC8hEU9q+Jm+8cT7Xjn0/xJnVmO0K3wRGzUh6J6IBbUjsWJ4w20IASj7nHDKiLlG+nxL/xVwatvKsokbJpxojvxilKYyUKb/c9ywKd+oliYEsgIF3yKh+h6ZtmOyWZbEE8tdY4K+/yMGCx1MyCPGR0OozvwNLTEYm3GyWvCX5dyrPRPcLZ6AqgcGny744/l7HZWAGsbhaRyaA093l3Xvq3uuTa0ZG9PshZ+eMe0DDnGzUrI/TiQl9/5oGsQrZcnIMazVtHJSg2Wm0mEJmCn8cMAFiVHitlPtvtMLD3xNDF1IVCWXDYRxBiWwRG1EVwvyQt8A6jfc6svYHcKBr3EI9tre+bEu/ejdQ6r8PJjdJGEE/dhy6fwFmj8vZfLy9KT7GyJUXCJ9fpyj6tKgAwQwndnGICc1hgx5hC4q30znuAm/350a8X6CiYoTK8DIMc1PJR9QsRgEUabCH+aaXnoYy+DkScgpV3XFiUIfjWwrYkO0F9JHoJnSE9g+kKGcJ1Y8o2nsX7rbkfSCbw6PfiU3LpiO7CP+dnV1Hhrfy+Pnd1Dn03pfbCCVi7J8ZyWO891RMVh5t1cWiTvCiu8UhoE6GP5qLI3+zJtl1ANlGo915hJJCZTTHQ9t+9Gpn9oPsnnUkcAT/5GJKjv5fGG2dinkqToYtBVoo1pOJPjsCU1GsbW9/vXb3CUE4oLj4TyVrxtx8mYG9+FChmEOXXjyNk2X0TYb6loXn15yh0WfNmAcau4xHWlXAQBanVljzFNyXmSWzRvNCz/dJdLKELEz/aQ+agxhwjBD8S5dwMCkeas8uVTPNveJSf8Rva6HcUujGSi67RDlq6WfTC9Lxbnsv6vWURc4E2a+KNp4bH044H6IUDxm3LE8M64hL/+E2taoURouHGTM2ZRe9a7hejwkKDtMM/8keVUEfySrI7h6OReilVhOiSbXFWrB8kkwCG4NIe80vh59jRalYXvQj6G8vCXZHXuXvDHNabFmW8lw5HAvEFghyhAapVgCyPGHSACwfemUfMIlq8UX8C3fv4NQ9UjdJBonk+R9i/RwsRWwj/g0j/MfI+Uq33Sw/OkGuPQfvuObMkP74QRjJeRa6gwC49qsXLyPpkKW23hx4VI7dH8GsgzVosz8eXiufjjm6KteBnu8gV0EK4+kllvZzLJ1gILXRsC+LzGglNspk36cxJ3HqGvjU+DVPng2fbq0vkk0m9je0qeGB40GkzeCdbS3yJDcnm7HgTzRotTOL/peeEgBYctZqR00a9Gp3C4Sy36hSKd8gfJZKU/gq6LgXR+Nr+JgKfzrY6Sj0xiSNrhDLDrh9PJipSVhfOZNwFeqR2S5pcrUEN4BHRf+HrtF9Lno/WxSRPGhyQvY28mt0LQV9cqMIj9wxPKz3JefbKS7ILWYPNodvUDJWaUUmGoho+1FqIdvbSvWGbHCJRq82V6Irk1X8dRXEZyTXGqOhNGm1RWuTsYOijFPaCK9OQs5Tlun/FTcX9UH/uzVBRcAzXLreD/Tld50KBX/lFVQ6P5bpVQKFUEaekO8IJCK1m7fn5PUyCtNPrEQj0MhzPJ8559Dw8hr9++Hhmd1CJQT5ByusUcrxuS7yMmLFUvMy4bDS6VwTX9GJ3QSSbcB9SU9tS3eu/RklHAdHPF1AdNU8O/gKQH8t46L7Guttkofhi+p2YUWc2DnGVIyK0tme0xSyh5EYmqV0gkoqphzlgNvyfErKJwwt4WIM6inqIl8VE4ZSfyHlzdzN0KeBOKWtclt4TVaseFfDXSNUtxQDbv7lo8NlyOS+5588XCMXVVkyBmSCOu6v79l2AZRlTKfN+TIGDvVrxk9f+E8AL6GbJPKe39fmX4XNRWij+CLbTaiZOKvS6RE1cKP8shBB5rVMnpBGfqIRgmaKVs0pxDzsGilnf6+TDP6NgJQMNmRhjnnWQ7u50WHcV//1vVsNKNjEABuErRcgPTLbCXGOs9jLesg5RMYfr5ZcCYyng1it+tzq0GWaUosIOlKoQFDJQrYmIJwh1zq4M4xSw3ed5nYSZQnM1uOFS4EKaNBo9dT4qFh59156kj4JrVKHbhwr8pj6bwLmu4x7Qp3e7WF0GIv9Zv47um3hk/iFGhyosTtw3wyzky+r1X0PxW1hEIdJ8FZtGd6jQXEXrax7ng3RK/DcgcHGgrHG0wjHX1sF0LsFmPT2kgB61L6G+2BLjBtvoFofPAB5SiUjKli/4p1HIdtMGpsc12UAFYYyACK6fB2OQidLPjsnqImq7PrfJKjJrFKY5DQC6tvi3U6H/J6hJfedMGKxbwhEX6ncatJFCO0wxeknSUBfsXrKkiGq2McXHk6KtzdpohonTVS/b90IeOdju3vGvVPIHEyYpygbOS/6+mFBs29s20eomFzXyFzoZwBEeq4zgGDx3FtbDtH2/htWPgfzstnWwN/bnS0mMJoZwZIwhv7X95vrTIntb1QN7KrsRju0GkM8wWi0BRkdzaXS7+5kBxkZYCm1xvgdhgF1RdCMnsXjpH5Y7zZB/XwYXOYZ8QH7HC5d6Edqnp7a/Gs2dLUS9ie9XG2PX3ozrX7HD73reZBk4WuRj41vwtudVQazcR7qCIqtvK7ZkPQyS/PUYtJuw2yp2XXR9re6HIQ6chIKhRWx1NHr9ST9b0nv6JU4PDk3JKNUtSkt/oGC41pxEYV7DTZyM2giW6C7t9q0+zhiY7cOY88rqFa0pLWZ4MNe8VJGQH1ANvh7Vy3P/4IPMY/Xf/EypgV9zD44IpvRD+qP2cp+cGORi8+Pi1QkSJ9C1F7OKzJXb8qdMPE9IlwNyU2DAa6D+G0Fqh7hNN8NZb4jm8hT35IgJUkLBLuIaJxSzKcnwsctTZRPbzkJ2erenCFA2D/PlBfUGDVJ4fMLtzOZ/5/KYQ95Z5CdFsE0UQN74mALE08Qqh+pmLTi9KR5lXtGOMVN++3AGIeE/VFncOdeSQcMrAEdnkBHjFdhsDa6iZvJtQhIKxRCpy1OF/blvyDgijDB876WbV1X9dReLVPz1cDZqj/18mZKW2seYoBVfsMTyHmDoKY65wAe3pftO+nDfbc/Clsm/V70tvqMYMN
6qUneHVGXIGQcxv2K+DnXwlb49vdtC541m+5ViNt0VD5Mku/cZzxM/5z2gdDR0w8pNsHMUGD8dCu6fwaO8I+/qmhB4oP3VZNX4X48QshIAmsgsqdZ3SDQAIj452a5LZSLELyS+VkfC1Fa1KZ4WUXPf/LWGD5IXphba+AthqnNK+/mmGRvkS6RgV56o81rfmNZx/qbozYHoxEwc1XUdWyPF9QM0qkdelmGlcfMrB+QGCGYXQxWwk5NfcvqwjVxGsH0maJaFt3clwV0r/KgKCphG2ehJ4GWfp5OmazDCLWVmtVF+HjXYctPfIvSaaPjHv4fcRiD9m/+iY7WBfvUWb22Cwvu1LwaiPw6HNKl9uJD5C2px7gpTM509ZKgrOD2EhT6XihdEv5P72o7ay6f06/adMU/vPXqxuvdrTolGkvqhVqHU7kvgJmJ5hGLkocfr5SSQo0no3g9e6VutYpzFEwg+rfzZWfN0fc3ejCK6ajPvT6Ztp7+PhsJgNWcU4WfL+JSZcDUNE6Gq7Wo5cy0jbQszKQcxQRBhsZ1ewGyE2f6xwrJ+uoEEoACn2oZLbaAGJUoSMKOSep6lU2f9yfgWX400J/FQq0bt/+3DIE22hA8xbdD2SNtbp0xrkNJEsMH8pevSE6Sxt7f8dqKNUfq+LOGJ3Pacl0uoQbSB0oEJa//TI77KvbVJP1rtiOFZVW6ZB9nKQCBAMGqjPpPfbyAi3FvKpp3tVktbMOEjP2hlAE52t5IS0yXwmyuPasCbqjn7R1Ws0O6ner2W37BDvFUFgfdTMEJOS8trekVQJcw0MbTahtpLiYkPSCsEdph2IpdCqD7x+Yj/pgerwBz9hzlRkx4080cksWNYAf2yJOUy2wIkcLFVarxQNDSzmptKyCY2gUMNLvbSM7A2d21jHJ5KbVYvi/ZMRX83UxG7MTmU72vm9iQJHaQs0afHZghZwI6mduPEEA7rNkr41RcCnikhFhIy9s+GF5HchGW+xzEqMgoppXAxckyT0+EKuwwfpmOuaxhghJLS0h5CSoV/wkVVtX22CqKDpn8qGo4EtbdHVHcDZUd++1YHYn7r/iwVF1ThD3ZCKYHhu0CZBvm0DEZNEOipnNsIHZ7AINYFFGUaAFYXMF9JBxUFXJkn0jTaVRglAHyW+T6WRgMy+9lJhbF/3eCKi8F4/t8iArkkT37h0pgnsWaPJK8jBFnLXc6onwlg9KciYKCIvjcC48iwK3nJPvZowyNoYQK/9Zm36rXXnXJUj38A0TegB0rbCmVGaN9pwVUSrhjdDZ9m4WVswNJD5IEZjAZkEOcsRE+G7lIEqiPdFZUr5u+wWLdv5B/7fu6EGQiHYDTOSXtp5GRdJSayg8gDVkeEgqCs8YFsDUwtg32Y6vKZpmU7zdrpd0OAaouN5r/buoGCqzn9yVrethR1XLmyL3HWD+C7/9U1Ym5pInakUacwVQl/emzJwRzazJG4ezqARdo/pifOOlS/oDNd9n5b9dcu1ibq8D3va2EpPuRpsKG1jSqfafZR2iuCbtaSb9JW2NFQ615xJRltHTujWyQSIq5OcMTtCKFGJ8Rkc/HMaPQSw03KD4PtZlMckeHcfx39qEU8y5nACalGWgr8CAQaHSfNb7UdGuZ8H57ufOvMZTring3umxPIKb5HVPQBYmNYicfMnge8v/EL+c/7RaBlnnGvf4EXza8lT/vqSsc7pLZegLyBnzSLYBQ2v6psJpa9WukqZ6dOpzjaLL4hUVA2fJryd3r/m4qPAhH7jP5n2D6NDYcaXy4ytCPiAMdbSt7sFHVfPpKldbVd2eLWuKkXq3kTXx0E7TosdLRUpyES6HQPptIZujEaEvbQ6PZJ6EBYwdGt+mxI9zEOMALaCTE5NUQ+0sINoFp+6BVbiXARCTQ09Or18XYYHpAm5jPLmF1zOveJ2NmtR/dzsjbdFQe04NmWroPeX8M8/H308CfyIoUZpFTGR7auT3F7mm1Tl7XlVZOVWG68D4z6kmtn6PyEilq32JOPzjyBVBEKWm/l3j2wy9Bj/ulrB3hfGDxUjRIYNhalz/S2XV1TKa+Oa/jti3PYxjJ3+rTCffSJq+LZm0MFTk8eQLGW6cGJOFZfFaYvdoI/XalfSvQkkZA62jAh6R5C7XbVebNsoqNcYSCB2IyRt15LIRf1eXbBO5Cxr8cY/If7GlycO3k5jOAeT8shklFvlDIAmYbHCpiD9JAY01woeqbk8XX9xBpKV6RVDRPLUBeJQ',
                '__EVENTTARGET': 'MoreInfoList1$Pager',
                '__EVENTARGUMENT': str(page),
                '__VIEWSTATEENCRYPTED': '',
            }
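            # The fields above are standard ASP.NET WebForms postback state:
            # __VIEWSTATE echoes the server-rendered page state, while
            # __EVENTTARGET / __EVENTARGUMENT name the pager control and the
            # requested page number, which is how this site paginates.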
            url = 'http://www.nnggzy.net/nnzbwmanger/ShowInfo/more.aspx'
            response = requests.post(url=url, headers=self.headers, data=data, params=params, cookies=self.cookies).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            print('page {}'.format(page))

            # div_ele_li = selector.xpath('//ul[@class="ewb-right-item"]/li')
            url_li = selector.xpath('//td[@id="MoreInfoList1_tdcontent"]//a/@href')

            # for div_ele in div_ele_li:

            for url in url_li:
                urls = 'http://www.nnggzy.net' + url
                # print(urls)

            # for data_dic in response_li:
                # div = etree.tostring(div_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

                # print(data_dic)
                # self.load_get_html(urls)

                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
                {'categoryId':'', 'types':'001001001','all_page': 1},
                {'categoryId':'', 'types':'001001002','all_page': 2},
                {'categoryId':'', 'types':'001001004','all_page': 1},
                {'categoryId':'', 'types':'001001005','all_page': 2},
                {'categoryId':'', 'types':'001001006','all_page': 1},
                {'categoryId':'', 'types':'001004001','all_page': 2},
                {'categoryId':'', 'types':'001004002','all_page': 1},
                {'categoryId':'', 'types':'001004004','all_page': 2},
                {'categoryId':'', 'types':'001010001','all_page': 1},
            ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [gevent.spawn(self.load_get, categoryId, types, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                    # print('page {}'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
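
# Every GovBuy example in this listing depends on a small Redis-backed queue
# class, Rdis_Queue, that is not shown. A minimal sketch of what such a
# helper could look like, assuming redis-py, with the set (dbset) used for
# URL dedupe and the list (dblist) holding pending work:
import redis


class Rdis_Queue(object):
    def __init__(self, host='localhost', dblist='url_list', dbset='url_set'):
        self.r = redis.StrictRedis(host=host, decode_responses=True)
        self.dblist = dblist  # pending-URL FIFO
        self.dbset = dbset    # seen-URL set

    def r_len(self):
        return self.r.llen(self.dblist)     # number of pending URLs

    def rset_info(self):
        return self.r.smembers(self.dbset)  # every URL seen so far

    def in_rset(self, url):
        return self.r.sismember(self.dbset, url)

    def add_to_rset(self, url):
        self.r.sadd(self.dbset, url)

    def pull_to_rlist(self, url):
        self.r.rpush(self.dblist, url)      # enqueue

    def get_to_rlist(self):
        return self.r.lpop(self.dblist)     # dequeue; None when empty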
Example #20
class GovBuy(object):
    '''Hainan Government Procurement Network'''
    def __init__(self):
        name = 'hainan_ccgp-hainan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection':
            'keep-alive',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.ccgp-hainan.gov.cn/thirdparty/My97DatePicker/My97DatePicker.html',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh,zh-CN;q=0.9',
            'X-DevTools-Emulate-Network-Conditions-Client-Id':
            'EAC4BA3425D26FC6B117994EFF4DEC28',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='hainan_list1',
                             dbset='hainan_set1')

    def is_running(self):
        # The crawl is finished once the pending list is drained while the
        # dedupe set already holds URLs (i.e. work was queued and consumed).
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def get_area(self, pro, strs):
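        # Heuristic region lookup: feed the title through the external
        # transform() address parser and fall back to the supplied
        # province/city name when nothing can be recognized.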
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def load_get_html(self, url):
        try:
            # print(url)
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
        else:
            title = selector.xpath('//div[@class="nei03_02"]/div[1]/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)
            publish_date = selector.xpath(
                '//div[@class="nei03_02"]/div[2]//text()')
            if publish_date != []:
                publish_date = re.search(r'(\d+\-\d+\-\d+)',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            soup = BeautifulSoup(response, 'lxml')
            content_html = soup.find(class_='nei03_02')

            source = 'http://www.ccgp-hainan.gov.cn/'
            area_name = self.get_area('海南', title)

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['publish_date'] = publish_date
            result_dict['source'] = source
            result_dict['area_name'] = area_name

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '中国海南政府采购网'
            result_dict['en_name'] = 'Hainan Province Government Procurement'

            # print(result_dict)

            print('list length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)

    def load_get(self, params):
        try:
            url = 'http://www.ccgp-hainan.gov.cn/cgw/cgw_list.jsp'
            response = self.session.get(url=url,
                                        headers=self.headers,
                                        params=params).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath(
                '//div[@class="nei02_04_01"]/ul/li/em/a/@href')
        except Exception as e:
            print('load_get error: {}'.format(e))
        else:
            for url in url_li:
                url = 'http://www.ccgp-hainan.gov.cn' + url
                if not self.rq.in_rset(url):
                    self.rq.add_to_rset(url)
                    self.rq.pull_to_rlist(url)

    def init(self):
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                # self.load_get_html(self.rq.get_to_rlist())
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            # {'all_page': 2521},
            {
                'all_page': 5
            },
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = (
                    ('currentPage', str(page)),
                    ('begindate', ''),
                    ('enddate', ''),
                    ('title', ''),
                    ('bid_type', ''),
                    ('proj_number', ''),
                    ('zone', ''),
                )

                self.load_get(params)
                print('page {}'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
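
# Driver sketch (an assumption; the listing shows no entry point). Each
# example class is presumably launched the same way:
if __name__ == '__main__':
    GovBuy().main()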
Example #21
class GovBuy(object):
    '''Yunnan Public Resource Trading Network'''
    def __init__(self):
        name = 'yunnan_ynggzyxx_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'https://www.ynggzyxx.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'https://www.ynggzyxx.gov.cn/res/css/basic.css',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost',
                             dblist='yunnan_list1',
                             dbset='yunnan_set1')

    def is_running(self):
        # The crawl is finished once the pending list is drained while the
        # dedupe set already holds URLs (i.e. work was queued and consumed).
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            print(url)
            # self.load_get_html(url)
        else:
            title = selector.xpath('//h3[@class="detail_t"]/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                try:
                    status = re.search(
                        r'(招标|预|采购|更正|结果|补充)公告$',
                        title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'

            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//p[@class="kdg"]//text()')
            if publish_date != []:
                # publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
                publish_date = re.search(r'(\d{8}|\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
                if '-' not in publish_date:
                    publish_date = '{}-{}-{}'.format(publish_date[0:4],
                                                     publish_date[4:6],
                                                     publish_date[6:8])
            else:
                publish_date = None
            # print(publish_date)
            area_name = self.get_area('云南', title)
            # print(area_name)

            source = 'https://www.ynggzyxx.gov.cn/'

            table_ele = selector.xpath('//div[@class="page_contect bai_bg"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '云南省公共资源交易网'
            result_dict['en_name'] = 'Yunnan Province Public resource'
            # print(result_dict)

            # print('list length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            data = [
                ('currentPage', str(page)),
                ('area', '000'),
                ('industriesTypeCode', ''),
                ('scrollValue', categoryId),
                ('purchaseProjectCode', ''),
                ('bulletinTitle', ''),
                ('secondArea', ''),
            ]
            url = 'https://www.ynggzyxx.gov.cn/jyxx/{}'.format(types)
            response = requests.post(url=url, headers=self.headers,
                                     data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error: {}'.format(e))
            # Retry the same page; note this recurses without a limit if
            # the error persists.
            self.load_get(categoryId, types, page)
        else:
            print('page {}'.format(page))
            # url_li =selector.xpath('//*[@id="data_tab"]/tbody/tr/td[3]/a/@href')
            url_li = selector.xpath('//*[@id="data_tab"]/tbody/tr/td/a/@href')
            # print(url_li)
            for url in url_li:
                urls = 'https://www.ynggzyxx.gov.cn' + url

                # self.load_get_html(urls)

                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
            # {'categoryId':'1002', 'types':'jsgcZbgg','all_page': 2538},
            # {'categoryId':'1000', 'types':'jsgcBgtz','all_page': 940},
            # {'categoryId':'842', 'types':'jsgcZbjggs','all_page': 3417},
            # {'categoryId':'942', 'types':'jsgcpbjggs','all_page': 917},
            # {'categoryId':'825', 'types':'zfcg/cggg','all_page': 2522},
            # {'categoryId':'626', 'types':'zfcg/gzsx','all_page': 646},
            # {'categoryId':'843', 'types':'zfcg/zbjggs','all_page': 2033},
            # {'categoryId':'963', 'types':'zfcg/zfcgYcgg','all_page': 227},
            {
                'categoryId': '1002',
                'types': 'jsgcZbgg',
                'all_page': 2
            },
            {
                'categoryId': '1000',
                'types': 'jsgcBgtz',
                'all_page': 2
            },
            {
                'categoryId': '842',
                'types': 'jsgcZbjggs',
                'all_page': 2
            },
            {
                'categoryId': '942',
                'types': 'jsgcpbjggs',
                'all_page': 2
            },
            {
                'categoryId': '825',
                'types': 'zfcg/cggg',
                'all_page': 1
            },
            {
                'categoryId': '626',
                'types': 'zfcg/gzsx',
                'all_page': 1
            },
            {
                'categoryId': '843',
                'types': 'zfcg/zbjggs',
                'all_page': 2
            },
            {
                'categoryId': '963',
                'types': 'zfcg/zfcgYcgg',
                'all_page': 1
            },
        ]
        count = 3
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:

                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    # print('page {}'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
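
# Concurrency pattern shared by the run()/init() methods above: list pages
# are fetched in batches of `count` greenlets while a background thread
# drains the Redis queue. A stripped-down sketch of the page fan-out,
# assuming gevent and any fetch(page) callable:
import gevent


def crawl_pages(fetch, last_page, count=3):
    for page in range(1, last_page + 1, count):
        jobs = [gevent.spawn(fetch, page + i) for i in range(count)]
        gevent.joinall(jobs)  # finish the whole batch before the next one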
Example #22
class GovBuy(object):
    '''Haikou Government Procurement Network'''
    def __init__(self):
        name = 'haikou_ggzy_haikou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://ggzy.haikou.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer':
            'http://ggzy.haikou.gov.cn/login.do?method=newsecond&param=431241696e6465783d3326747970653d5a435f4a59',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='haikou_list1',
                             dbset='haikou_set1')

    def is_running(self):
        # The crawl is finished once the pending list is drained while the
        # dedupe set already holds URLs (i.e. work was queued and consumed).
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        try:
            if url is None:
                return
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
        else:
            title = selector.xpath('//div[@class="part_1"]/div[1]/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//div[@class="part_1"]/div[2]//text()')

            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d+)',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            # print(publish_date)
            soup = BeautifulSoup(response, 'lxml')
            content_html = soup.find(class_='content_wrap')

            area_name = self.get_area('海口', title)

            source = 'http://ggzy.haikou.gov.cn'

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '海口公共资源交易网'
            result_dict['en_name'] = 'Haikou Public resource'

            # print(result_dict)

            print('list length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)

    def load_get(self, data):
        try:
            params = (('method', 'getSecondTableInfo'), )
            url = 'http://ggzy.haikou.gov.cn/login.do'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params,
                                     data=data).json()
        except Exception as e:
            print('load_get error: {}'.format(e))
        else:
            response_li = response['result']
            for dic in response_li:
                key_str = 'flag=3&name=' + dic['FLAG'] + '&key=' + dic['KEYID']
                es = EncodeStr(key_str)
                encodestr = es.encodes()
                urls = 'http://ggzy.haikou.gov.cn/login.do?method=newDetail&param=' + encodestr
                # print(urls)
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        flag = 3
        task_li = [
            {
                'type': 'GC_JY',
                'all_page': flag
            },
            {
                'type': 'GC_GS',
                'all_page': flag
            },
            {
                'type': 'GC_JG',
                'all_page': flag
            },
            {
                'type': 'ZC_JY',
                'all_page': flag
            },
            {
                'type': 'ZC_JG',
                'all_page': flag
            },
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                data = [
                    ('currentPage', str(page)),
                    ('pageSize', '20'),
                    ('flag', '3'),
                    ('type', task['type']),
                    ('notice_title', ''),
                ]
                try:
                    self.load_get(data)
                    print('page {}'.format(page))
                    # spawns = [gevent.spawn(self.load_get, page + i) for i in range(count)]
                    # gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
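
# The Haikou example relies on an EncodeStr helper that this listing does
# not include. The 'param' values in its URLs look like hex-encoded query
# strings ('696e6465783d33' decodes to 'index=3'), so a plausible, but
# unverified, sketch is a plain hex encoder; the real site may prepend
# extra bytes:
import binascii


class EncodeStr(object):
    def __init__(self, key_str):
        self.key_str = key_str

    def encodes(self):
        # 'flag=3&name=X&key=Y' -> '666c61673d33...' (UTF-8 bytes as hex)
        return binascii.hexlify(self.key_str.encode('utf-8')).decode('ascii')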
Example #23
class GovBuy(object):
    '''Nanjing Public Resource Trading Network'''
    def __init__(self):
        name = 'nanjing_ggzy_njzwfw_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html, */*; q=0.01',
            'Referer':
            'http://ggzy.njzwfw.gov.cn/njweb/gycq/stateProperty.html',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost',
                             dblist='nanjing_ggzy_njzwfw_gov_cn_list1',
                             dbset='nanjing_ggzy_njzwfw_gov_cn_set1')

    def is_running(self):
        # The crawl is finished once the pending list is drained while the
        # dedupe set already holds URLs (i.e. work was queued and consumed).
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            # selector_div = etree.HTML(str(div))

            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            title = selector.xpath('//div[@class="article-info"]/h1/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    status = re.search(
                        r'(招标|中标|预|采购|更正|结果|补充|询价)公告$',
                        title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//p[@class="info-sources"]//text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
                # publish_date = re.sub(r'\/','-',re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',''.join(publish_date)).group())
                # if '-' not in publish_date:
                #     publish_date = '{}-{}-{}'.format(publish_date[0:4],publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            print(publish_date, title)
            # area_name = self.get_area()
            area_name = '江苏-南京'

            source = 'http://ggzy.njzwfw.gov.cn/'

            table_ele = selector.xpath('//div[@class="ewb-main"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '南京市公共资源交易平台'
            result_dict['en_name'] = 'Nanjing City Public resource'
            # print(result_dict)

            print('list length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            params = (('_', categoryId), )
            url = 'http://ggzy.njzwfw.gov.cn/njweb/{}/{}.html'.format(
                types, page)
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            print('page {}'.format(page))
            # div_ele_li = selector.xpath('//ul[@class="ewb-right-item"]/li')
            url_li = re.findall(r"window.open\(\'(.*?)\'\)", response)

            # for div_ele in div_ele_li:
            for url in url_li:
                # div = etree.tostring(div_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')
                urls = 'http://ggzy.njzwfw.gov.cn' + url

                # self.load_get_html(urls)

                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'categoryId': '69171',
                'types': 'zfcg/067001/067001001',
                'all_page': 2
            },
            {
                'categoryId': '27720',
                'types': 'zfcg/067002/067002001',
                'all_page': 2
            },
            {
                'categoryId': '344',
                'types': 'fjsz/068001/068001001',
                'all_page': 2
            },
            {
                'categoryId': '21869',
                'types': 'fjsz/068002/068002001',
                'all_page': 3
            },
            {
                'categoryId': '48706',
                'types': 'fjsz/068003/068003001',
                'all_page': 2
            },
            {
                'categoryId': '95248',
                'types': 'fjsz/068005/068005002',
                'all_page': 3
            },
            {
                'categoryId': '74362',
                'types': 'gchw/070001',
                'all_page': 1
            },
            {
                'categoryId': '83799',
                'types': 'gchw/070003',
                'all_page': 1
            },
            {
                'categoryId': '81835',
                'types': 'gchw/070004',
                'all_page': 1
            },
            {
                'categoryId': '4620',
                'types': 'jtsw/069001/069001001',
                'all_page': 1
            },
            {
                'categoryId': '11321',
                'types': 'jtsw/069003',
                'all_page': 1
            },
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    # print('page {}'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
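
# Why load_get() above scrapes with a regex instead of an href XPath: the
# Nanjing list markup hides detail links inside onclick handlers. A small
# self-contained illustration with made-up HTML:
import re

sample = '<a onclick="window.open(\'/njweb/zfcg/067001/1.html\')">title</a>'
print(re.findall(r"window.open\(\'(.*?)\'\)", sample))
# -> ['/njweb/zfcg/067001/1.html']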
Example #24
class GovBuy(object):
    '''Jinan Public Resource Trading Network'''
    def __init__(self):
        name = 'jinan_jngp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            '*/*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh,zh-CN;q=0.9',
            'Connection':
            'keep-alive',
            'Host':
            'jnggzy.jinan.gov.cn',
            'Origin':
            'http://jnggzy.jinan.gov.cn',
            'Referer':
            'http://jnggzy.jinan.gov.cn/jnggzyztb/front/noticelist.do?type=1&xuanxiang=1&area=',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='jinan_jngp_gov_cn_list1',
                             dbset='jinan_jngp_gov_cn_set1')

    def is_running(self):
        # The crawl is finished once the pending list is drained while the
        # dedupe set already holds URLs (i.e. work was queued and consumed).
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            # selector_div = etree.HTML(str(div))
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            # print(url)
            title = selector.xpath('//div[@class="list"]/h1//text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    status = re.search(
                        r'(招标|中标|预|采购|更正|结果|补充|询价)公告$',
                        title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)
            publish_date = selector.xpath(
                '//div[@class="list"]/div/span//text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            area_name = '山东-济南'
            # print(area_name)
            source = 'http://jnggzy.jinan.gov.cn/'

            table_ele = selector.xpath('//div/div[@class="list"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return
            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '济南公共资源交易中心'
            result_dict['en_name'] = 'Jinan Public resource'

            # print('list length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            params = {
                'area': '',
                'type': types,
                'xuanxiang': categoryId,
                'subheading': '',
                'pagenum': page,
            }

            url = 'http://jnggzy.jinan.gov.cn/jnggzyztb/front/search.do'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     data=params).json()
            response_str = response['params']['str']
            selector = etree.HTML(response_str)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            # print(response)
            print('page {}'.format(page))
            id_li = selector.xpath('//ul/li/a/@onclick')
            if len(id_li) > 0:
                iid_li = [re.sub(r'.*?\(|\).*', '', i) for i in id_li]
                for iid in iid_li:
                    url = 'http://jnggzy.jinan.gov.cn/jnggzyztb/front/showNotice.do?iid={}&xuanxiang={}'.format(
                        iid, categoryId)
                    # self.load_get_html(url)
                    if not self.rq.in_rset(url):
                        self.rq.add_to_rset(url)
                        self.rq.pull_to_rlist(url)
            else:
                url_li = selector.xpath('//ul/li/a/@href')
                for url in url_li:
                    urls = 'http://jnggzy.jinan.gov.cn' + url
                    # self.load_get_html(urls)
                    if not self.rq.in_rset(urls):
                        self.rq.add_to_rset(urls)
                        self.rq.pull_to_rlist(urls)

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'categoryId': '招标公告',
                'types': '1',
                'all_page': 4
            },
            {
                'categoryId': '中标公示',
                'types': '1',
                'all_page': 4
            },
            {
                'categoryId': '变更公告',
                'types': '1',
                'all_page': 4
            },
            {
                'categoryId': '废标公告',
                'types': '1',
                'all_page': 4
            },
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']

                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    # print('page {}'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
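
# The Jinan search endpoint answers with JSON whose params.str field carries
# an HTML fragment, which is why load_get() re-parses it with lxml. An
# illustration with made-up data:
from lxml import etree

fake = {'params': {'str': '<ul><li><a onclick="showNotice(123)">t</a></li></ul>'}}
sel = etree.HTML(fake['params']['str'])
print(sel.xpath('//ul/li/a/@onclick'))  # -> ['showNotice(123)']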
Example #25
class GovBuy(object):
    '''Henan Government Procurement Network'''
    def __init__(self):
        name = 'henan_hngp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.cookies = {
            'sId': '7c61a3bff6dc4969a336157b5f3dfb1d',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.hngp.gov.cn/henan/search?appCode=H60&pageSize=16&keyword=&dljg=&cgr=&year=2015&pageNo=15',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        self.session.headers.update(self.headers)
        self.session.cookies.update(self.cookies)

        self.rq = Rdis_Queue(host='localhost',
                             dblist='henan_list1',
                             dbset='henan_set1')

    def is_running(self):
        # The crawl is finished once the pending list is drained while the
        # dedupe set already holds URLs (i.e. work was queued and consumed).
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            urls_li = re.findall(r'get\(\"(.*?\.htm)\"', response)
            if len(urls_li) < 1:
                return
            urls = 'http://www.hngp.gov.cn' + urls_li[0]
            # print(url)
            response1 = requests.get(
                url=urls, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
        else:
            title = selector.xpath(
                '//*[@id="ng-app"]/body/div[3]/div[1]/h1/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//*[@id="ng-app"]/body/div[3]/div[1]/div[1]/span//text()')

            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d+)',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            # print(publish_date)
            content_html = response1.lower()
            if len(content_html) < 100:
                return
            area_name = self.get_area('河南', title)

            source = 'http://www.hngp.gov.cn'

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '河南省政府采购网'
            result_dict['en_name'] = 'Henan Province Government Procurement'

            print(result_dict)

            print('list length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)

    def load_get(self, page):
        try:
            params = {
                'appCode': 'H60',
                'pageSize': 10,
                'keyword': '',
                'dljg': '',
                'cgr': '',
                'year': '2019',
                'pageNo': page,
            }

            url = 'http://www.hngp.gov.cn/henan/search'
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params,
                                    cookies=self.cookies)
            selector = etree.HTML(response.content.decode('utf-8'))
            url_li = selector.xpath('//div[@class="List2"]/ul/li/a/@href')
            # print(response.url)
            self.headers['Referer'] = response.url
        except Exception as e:
            print('load_get error: {}'.format(e))
        else:
            print('page {}'.format(page))
            # print(url_li)
            # return
            for url in url_li:
                url = 'http://www.hngp.gov.cn' + url
                # print(url)
                self.load_get_html(url)
                if not self.rq.in_rset(url):
                    self.rq.add_to_rset(url)
                    self.rq.pull_to_rlist(url)

    def init(self):
        count = 5
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            #{'all_page': 500},
            {
                'all_page': 3
            },
        ]
        count = 2
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
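
# Henan detail pages load their body in two hops: the first response only
# contains a JS call such as get("...htm"), which load_get_html() extracts
# with a regex and fetches separately. An illustration with a made-up
# snippet:
import re

wrapper = '$.get("/webfile/henan/2019/0101/123.htm", function(data){});'
print(re.findall(r'get\(\"(.*?\.htm)\"', wrapper))
# -> ['/webfile/henan/2019/0101/123.htm']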
Example #26
class GovBuy(object):
    '''Nanchang Public Resource Trading Network'''
    def __init__(self):
        name = 'nanchang_ncztb_nc_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.cookies = {
            'ASP.NET_SessionId': 'kxgkxo45v04bzs55ie3tib55',
            '__CSRFCOOKIE': 'ad60f543-41c8-481d-b0cf-accadc73c516',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://ncztb.nc.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://ncztb.nc.gov.cn/nczbw/jyxx/002001/002001002/MoreInfo.aspx?CategoryNum=002001002',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # self.session = requests.session()
        # pq = ProxyQueue()
        # self.pq_run = pq.run()
        # self.proxy_queue = pq.proxy_queue

        self.rq = Rdis_Queue(host='localhost', dblist='nanchang_ncztb_nc_gov_cn_list1', dbset='nanchang_ncztb_nc_gov_cn_set1')

    def is_running(self):
        # The crawl is finished once the pending list is drained while the
        # dedupe set already holds URLs (i.e. work was queued and consumed).
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = (area_str.split('-'))
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            # selector_div = etree.HTML(str(div))

            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error: {}'.format(e))
            # print(url)
            # self.load_get_html(url)
        else:
            # print(url)
            title = selector.xpath('//td[@id="tdTitle"]/font//text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
                try:
                    status = re.search(r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//td[@id="tdTitle"]/font[2]//text()')
            if publish_date != []:
                publish_date = re.sub(r'\/', '-', re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})', ''.join(publish_date)).group())
                if '-' not in publish_date:
                    # Normalise bare YYYYMMDD matches to YYYY-MM-DD.
                    publish_date = '{}-{}-{}'.format(publish_date[0:4], publish_date[4:6], publish_date[6:8])
            else:
                publish_date = None
            # print(publish_date, title)
            # area_name = self.get_area('福建', title)
            area_name = '江西-南昌'

            # print(area_name)

            source = 'http://ncztb.nc.gov.cn'

            table_ele = selector.xpath('//table[@id="tblInfo"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '江西省南昌公共资源交易网'
            result_dict['en_name'] = 'Nanchang Public Resources Trading'
            # print(result_dict)

            print('list length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        try:
            params = (
                ('CategoryNum', types),
            )

            url = 'http://ncztb.nc.gov.cn/nczbw/jyxx/{}/MoreInfo.aspx'.format(categoryId)

            # The __CSRFTOKEN/__VIEWSTATE postback fields are session-specific,
            # so read the current values from the listing page's hidden inputs
            # rather than replaying a captured token (this assumes both appear
            # as hidden inputs, as they do on ASP.NET WebForms pages).
            list_page = requests.get(url=url, headers=self.headers, params=params,
                                     cookies=self.cookies).content.decode('utf-8')
            list_selector = etree.HTML(list_page)

            def hidden_value(name):
                value = list_selector.xpath('//input[@name="{}"]/@value'.format(name))
                return value[0] if value else ''

            data = {
                '__CSRFTOKEN': hidden_value('__CSRFTOKEN'),
                '__VIEWSTATE': hidden_value('__VIEWSTATE'),
                '__EVENTTARGET': 'MoreInfoList1$Pager',
                '__EVENTARGUMENT': page
            }

            response = requests.post(url=url, headers=self.headers, params=params, data=data, cookies=self.cookies).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # time.sleep(3)
            # self.load_get(categoryId, types, page)
        else:
            print('page {}'.format(page))
            # div_ele_li = selector.xpath('//ul[@class="ewb-right-item"]/li')
            url_li = selector.xpath('//table[@id="MoreInfoList1_DataGrid1"]/tr/td[2]/a/@href')
            # for div_ele in div_ele_li:
            for url in url_li:
                # div = etree.tostring(div_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')
                urls = 'http://ncztb.nc.gov.cn' + url

                # self.load_get_html(urls)

                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)
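
    # New detail urls are deduplicated against the Redis set before being
    # queued on the Redis list; init() below drains that list concurrently.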

    def init(self):
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        # print(os.getppid())
        threading.Thread(target=self.init).start()
        flag = 2
        task_li = [
                {'categoryId':'002001/002001002', 'types':'002001002','all_page': flag},
                {'categoryId':'002001/002001004', 'types':'002001004','all_page': flag},
                {'categoryId':'002001/002001005', 'types':'002001005','all_page': flag},
                {'categoryId':'002002/002002002', 'types':'002002002','all_page': flag},
                {'categoryId':'002002/002002005', 'types':'002002005','all_page': flag},
                {'categoryId':'002003/002003001', 'types':'002003001','all_page': flag},
                {'categoryId':'002003/002003004', 'types':'002003004','all_page': flag},
                {'categoryId':'002009/002009001', 'types':'002009001','all_page': flag},
                {'categoryId':'002009/002009004', 'types':'002009004','all_page': flag},
                {'categoryId':'002004/002004001', 'types':'002004001','all_page': flag},
                {'categoryId':'002004/002004002', 'types':'002004002','all_page': flag},
                {'categoryId':'002004/002004003', 'types':'002004003','all_page': flag},
                {'categoryId':'002004/002004004', 'types':'002004004','all_page': flag},
                {'categoryId':'002004/002004005', 'types':'002004005','all_page': flag},
                {'categoryId':'002005/002005002', 'types':'002005002','all_page': flag},
                {'categoryId':'002010/002010001', 'types':'002010001','all_page': flag},
                {'categoryId':'002010/002010002', 'types':'002010002','all_page': flag},
                {'categoryId':'002010/002010004', 'types':'002010004','all_page': flag},
            ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']

                    # self.load_get(categoryId, page)

                    spawns = [gevent.spawn(self.load_get, categoryId, types, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)
        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()


    def main(self):
        self.run()
Example #27
0
class GovBuy(object):
    '''Hangzhou government procurement network (杭州政府采购网)'''
    def __init__(self):
        name = 'hangzhou_cg_hzft_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Cache-Control': 'max-age=0',
            'Origin': 'http://cg.hzft.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://cg.hzft.gov.cn/www/noticelist.do',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='hangzhou_list1',
                             dbset='hangzhou_set1')

    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, url):
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="detail_con"]/h1/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            # print(title)
            # print(status)
            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//div[@class="content_about"]//text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d+)',
                                         ''.join(publish_date)).group()
            else:
                publish_date = None
            # print(publish_date)

            area_name = '杭州'

            source = 'http://cg.hzft.gov.cn/'

            soup = BeautifulSoup(response, 'html.parser')
            content_html = soup.find(class_='detail_con')
            # print(content_html)

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '杭州市政府采购网'
            result_dict['en_name'] = 'Hangzhou Government Procurement'
            # print(result_dict)

            print('list length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)

    def load_get(self, data, retries=3):
        try:
            url = 'http://cg.hzft.gov.cn/www/noticelist.do'
            # proxies = self.proxy_queue.get()
            response = requests.post(url=url, headers=self.headers,
                                     data=data).text
            selector = etree.HTML(response)
        except Exception:
            print('load_post error')
            if retries > 0:
                # Bounded retry instead of unbounded recursion on every failure.
                self.load_get(data, retries - 1)
        else:
            # print('第{}页'.format(page))
            url_li = selector.xpath('//ul[@class="c_list_item"]/li/a/@href')
            print(url_li)
            for url in url_li:
                urls = 'http://cg.hzft.gov.cn/' + url

                # self.load_get_html(urls)
                if not self.rq.in_rset(urls):
                    self.rq.add_to_rset(urls)
                    self.rq.pull_to_rlist(urls)

    def init(self):
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'regionguid': '330100',
                'noticetype': '3,3001,3002,3008,3009,3011,3014,4001,4002',
                'all_page': 1
            },
            {
                'regionguid': '3',
                'noticetype': '5,6',
                'all_page': 1
            },
            {
                'regionguid': '',
                'noticetype': '1,3012',
                'all_page': 1
            },
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                data = {
                    'page.pageNum': page,
                    "parameters['regionguid']": task['regionguid'],
                    "parameters['noticetype']": task['noticetype'],
                    "parameters['title']": ''
                }
                try:
                    self.load_get(data)
                    # spawns = [gevent.spawn(self.load_get, page + i) for i in range(count)]
                    # gevent.joinall(spawns)
                    print('page {}'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
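Every example relies on a Rdis_Queue helper that is never shown. Its interface
can be inferred from the call sites; a plausible redis-py reconstruction, in
which the method names mirror the usage above but every implementation detail
is an assumption, is:

# Hypothetical reconstruction of the Rdis_Queue helper: one Redis list holds
# pending work and one Redis set records everything ever queued, which is what
# gives these scrapers their url-level deduplication.
import redis

class Rdis_Queue(object):
    def __init__(self, host='localhost', dblist='list1', dbset='set1'):
        self.r = redis.StrictRedis(host=host, decode_responses=True)
        self.dblist = dblist  # pending-work queue
        self.dbset = dbset    # dedupe set

    def r_len(self):
        return self.r.llen(self.dblist)

    def rset_info(self):
        return self.r.smembers(self.dbset)

    def in_rset(self, value):
        return self.r.sismember(self.dbset, value)

    def add_to_rset(self, value):
        self.r.sadd(self.dbset, value)

    def pull_to_rlist(self, value):
        self.r.lpush(self.dblist, value)

    def get_to_rlist(self):
        # Returns None when the queue is empty, which is why the consumers
        # guard with "if url is None".
        return self.r.rpop(self.dblist)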
class GovBuy(object):
    '''Suzhou government procurement network (苏州政府采购网)'''
    def __init__(self):
        name = 'suzhou_zfcg_suzhou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.zfcg.suzhou.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.zfcg.suzhou.gov.cn/html/search.shtml?title=&choose=&projectType=0&zbCode=&appcode=',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost', dblist='suzhou_list1', dbset='suzhou_set1')

    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get_html(self, pid):
        if pid is None:
            return
        try:
            url = 'http://www.zfcg.suzhou.gov.cn/html/project/' + pid + '.shtml'
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="M_title"]/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s','',title[0])
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//div[@class="date"]/span/text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',''.join(publish_date)).group()
            else:
                publish_date = None
            # print(publish_date)
            area_name = '江苏-苏州'
            # print(area_name)

            source = 'http://www.zfcg.suzhou.gov.cn/'

            table_ele = selector.xpath('//div[@id="tab1"]')
            if table_ele == []:
                return
            table_ele = table_ele[0]

            content_html = etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '苏州市政府采购网'
            result_dict['en_name'] = 'Suzhou City Government Procurement'
            # print(result_dict)

            print('list length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)


    def load_get(self, types, page, retries=3):
        try:
            data = [
                ('title', ''),
                ('choose', ''),
                ('type', types),
                ('zbCode', ''),
                ('appcode', ''),
                ('page', page),
                ('rows', '30'),
            ]
            url = 'http://www.zfcg.suzhou.gov.cn/content/searchContents.action'
            response = requests.post(url=url, headers=self.headers, data=data).json()
        except Exception:
            print('load_post error')
            if retries > 0:
                # Bounded retry instead of unbounded recursion on every failure.
                self.load_get(types, page, retries - 1)
        else:
            print('page {}'.format(page))
            # print(response)
            response_li = response['rows']
            if response_li == []:
                return

            for project_id in response_li:
                pid = project_id['PROJECTID']

                # self.load_get_html(pid)
                if not self.rq.in_rset(pid):
                    self.rq.add_to_rset(pid)
                    self.rq.pull_to_rlist(pid)
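
    # Unlike the url-queueing scrapers above, this site is paged through a JSON
    # search API; only each PROJECTID is queued, and load_get_html() rebuilds
    # the detail-page url from it.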

    def init(self):
        count = 3
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        task_li = [
                {'type': '0', 'all_page': 2},
                {'type': '1', 'all_page': 2},
                {'type': '2', 'all_page': 2},
            ]
        count = 3
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                try:
                    types = task['type']

                    # self.load_get(base_url, page)
                    spawns = [gevent.spawn(self.load_get,types, page + i) for i in range(count)]
                    gevent.joinall(spawns)
                    # print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()


    def main(self):
        self.run()
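The storage side rests on another unshown helper, StorageSetting. Given how it
is used (a find_collection handle plus saves(result_dic) keyed on the md5 _id),
it reads as a thin MongoDB wrapper; a plausible pymongo sketch, with the
database name and the upsert behaviour both assumptions, is:

# Hypothetical reconstruction of StorageSetting: saves() is assumed to upsert
# on _id, so the md5-of-url ids computed by the scrapers deduplicate records
# across repeated crawls.
import pymongo

class StorageSetting(object):
    def __init__(self, name, host='localhost', port=27017, db='govbuy'):
        client = pymongo.MongoClient(host, port)
        self.find_collection = client[db][name]

    def saves(self, result_dic):
        self.find_collection.update_one(
            {'_id': result_dic['_id']},
            {'$set': result_dic},
            upsert=True,
        )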
class GovBuy(object):
    '''Hohhot government procurement network (呼和浩特政府采购网)'''
    def __init__(self):
        name = 'huheaote_hhgp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.hhgp.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*',
            'Referer': 'http://www.hhgp.gov.cn/huShi_web_login',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'Content-Length': '0',
        }


        self.rq = Rdis_Queue(host='localhost', dblist='huhehaote_list1', dbset='huhehaote_set1')


    def is_running(self):
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        m = hashlib.md5()
        sign_str = sign_str.encode('utf-8')
        m.update(sign_str)
        sign = m.hexdigest()
        return sign

    def now_time(self):
        time_stamp = datetime.datetime.now()
        return time_stamp.strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        self.coll.saves(result_dic)
        self.is_running()

    def get_area(self, pro, strs):
        location_str = [strs]
        try:
            df = transform(location_str, umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\r|\n|\s', '', str(df))))
        except Exception:
            pass
        else:
            if area_str == '':
                area_li = [pro]
            else:
                area_li = area_str.split('-')
            if len(area_li) >= 2 and area_li[1] != '':
                return '-'.join(area_li[:2])
            else:
                return area_li[0]

    def load_get(self,params):
        try:
            url = 'http://www.hhgp.gov.cn/huShi_web_login/showAllMessage'
            response = requests.post(url=url, headers=self.headers, params=params).json()
            response_str = response['0']
            selector = etree.HTML(response_str)
        except Exception as e:
            print('load_get error:{}'.format(e))
        else:
            url_li = selector.xpath('//li/span[1]/a/@href')
            for url in url_li:
                url = 'http://www.hhgp.gov.cn' + url
                if not self.rq.in_rset(url):
                    self.rq.add_to_rset(url)
                    self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
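        # Scrape one detail page: extract the title, status, and publish
        # date, keep the announcement body as raw HTML, and persist the record.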
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
        else:
            # print(response)
            _id = self.hash_to_md5(url)
            title = selector.xpath('//*[@id="content"]/div/div[2]/div/div/h1/text()')
            if title != []:
                title = title[0]
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            publish_date = selector.xpath('//*[@id="content"]/div/div[2]/div/div/i/text()')
            if publish_date != []:
                publish_date = re.search(r'(\d+\-\d+\-\d+)', publish_date[0]).group()
            else:
                publish_date = None
            # print(publish_date)
            # Explicit parser avoids the bs4 "no parser specified" warning;
            # lxml is already a dependency via etree.
            soup = BeautifulSoup(response, 'lxml')
            content_html = soup.find(class_='content')
            source = 'http://www.hhgp.gov.cn/'
            area_name = self.get_area('呼和浩特', title)

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['publish_date'] = publish_date
            result_dict['source'] = source
            result_dict['area_name'] = area_name
            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '呼和浩特市政府采购网'
            result_dict['en_name'] = 'Hohhot City Government Procurement'

            # print(result_dict)

            print('queue length = {}'.format(self.rq.r_len()))

            self.save_to_mongo(result_dict)


    def init(self):
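        # Consumer: same drain-the-queue pattern as above, with up to six
        # concurrent greenlets while the work list is well stocked.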
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                # self.load_get_html(self.rq.get_to_rlist())
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
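        # Producer: start a consumer thread, then request every listing page
        # for each category; the dotted `code` values appear to be the site's
        # category identifiers, and the commented-out entries below preserve
        # larger page counts, presumably from a full crawl.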
        threading.Thread(target=self.init).start()
        task_li = [
            # {'code': '265.266.304', 'all_page': 29},
            # {'code': '265.266.269', 'all_page': 70},
            # {'code': '265.266.270', 'all_page': 67},
            # {'code': '265.266.271', 'all_page': 217},
            # {'code': '265.266.303', 'all_page': 58},
            # {'code': '265.266.404', 'all_page': 1},
            # {'code': '265.266.403', 'all_page': 14},
            # {'code': '265.266.343', 'all_page': 21},
            {'code': '265.266.304', 'all_page': 1},
            {'code': '265.266.269', 'all_page': 1},
            {'code': '265.266.270', 'all_page': 1},
            {'code': '265.266.271', 'all_page': 1},
            {'code': '265.266.303', 'all_page': 1},
            {'code': '265.266.404', 'all_page': 1},
            {'code': '265.266.403', 'all_page': 1},
            {'code': '265.266.343', 'all_page': 1},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = (
                    ('code', task['code']),
                    ('pageNo', str(page)),
                    ('check', '1'),
                )
                self.load_get(params)
                print('page {}'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        self.run()
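

# A minimal entry-point sketch (an assumption; the examples do not show how
# they are launched):
if __name__ == '__main__':
    spider = GovBuy()  # the Hohhot spider defined directly above
    spider.main()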