Example #1
0
class GovBuy(object):
    '''Crawler for the Shaanxi Public Resource Trading Information Network.

    ``run`` scans the listing pages and queues every detail URL that is not
    yet in the de-duplication set of ``self.rq``; ``init`` drains that queue
    and passes each URL to ``load_get_html``, which extracts the
    announcement fields and hands the record to ``self.coll.saves``.
    '''

    # Announcement type directly in front of the trailing "公告".  The
    # keywords have to be whole-word alternatives: a bracketed character
    # class would match any one or two characters out of the list (quotes
    # and commas included) and yield values such as "购预公告".
    _STATUS_RE = re.compile(r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$')
    _PUBLISH_DATE_RE = re.compile(r'(\d{4}\-\d+\-\d{1,2})')
    # Seconds after which an unresponsive server aborts a request instead
    # of blocking the calling worker indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, the request headers and the URL queue.'''
        name = 'shaanxi_sxggzyjy_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.sxggzyjy.cn/jydt/001001/subPage_jyxx.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shaanxip_list1',
                             dbset='shaanxip_set1')

    def is_running(self):
        '''Return False once the queue is empty while the seen-set is not.

        An empty queue combined with an empty set still counts as running.
        NOTE(review): presumably so that workers keep polling before the
        first URL has been queued - confirm.
        '''
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of *sign_str* (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Hand *result_dic* to ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the return value is discarded; the call is kept in
        # case the queue accessors have side effects - confirm and drop.
        self.is_running()

    def get_area(self, pro, strs):
        '''Return the "province-city" detected in *strs*.

        :param pro: province name used when nothing can be detected.
        :param strs: free text, normally the announcement title.
        :return: ``'<province>-<city>'`` when a second component is found,
            otherwise the first component; *pro* when detection fails or
            finds nothing (instead of an implicit ``None``).
        '''
        try:
            df = transform([strs], umap={})
            # Presumably str(df) renders a header ("省 市 区"), the row
            # index "0" and the detected names - TODO confirm.  ``\s``
            # already covers carriage returns and line feeds.
            area_str = re.sub(r'省市区0', '', re.sub(r'\s', '', str(df)))
            area_str = re.sub(r'省|市', '-', area_str)
        except Exception:
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @classmethod
    def _parse_status(cls, title):
        '''Return the announcement type ending *title*, or "公告".'''
        try:
            match = cls._STATUS_RE.search(title)
        except TypeError:  # title is None or not text
            match = None
        return match.group() if match else '公告'

    @classmethod
    def _parse_publish_date(cls, texts):
        '''Return the first ``YYYY-M-D`` date in the joined *texts*, or None.'''
        match = cls._PUBLISH_DATE_RE.search(''.join(texts))
        return match.group() if match else None

    @staticmethod
    def _page_batches(all_page, count):
        '''Yield lists of up to *count* consecutive pages covering 1..all_page.'''
        for first in range(1, all_page + 1, count):
            yield list(range(first, min(first + count, all_page + 1)))

    def load_get_html(self, url):
        '''Download one detail page, extract its fields and store the record.

        Nothing is stored when *url* is None, when the download or parse
        fails (the error and the URL are printed) or when the page has no
        ``div.ewb-main`` content block.

        :param url: absolute URL of the announcement detail page.
        '''
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self._REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            print(url)
            return

        # Without a content block there is nothing worth storing.
        table_ele = selector.xpath('//div[@class="ewb-main"]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        title = selector.xpath('//h3[@class="article-title"]/text()')
        title = re.sub(r'\r|\n|\s', '', title[0]) if title else None

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = self._parse_status(title)
        retult_dict['area_name'] = self.get_area('陕西', title)
        retult_dict['source'] = 'http://www.sxggzyjy.cn/'
        retult_dict['publish_date'] = self._parse_publish_date(
            selector.xpath('//div[@class="info-source"]//text()'))
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '陕西省公共资源交易中心'
        retult_dict['en_name'] = 'Shaanxi Province Public resource'

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page, retries=3):
        '''Fetch one listing page and queue its unseen detail URLs.

        :param categoryId: unused; kept for interface compatibility.
        :param types: unused; kept for interface compatibility.
        :param page: listing page number inserted into the URL.
        :param retries: number of download attempts before the page is
            given up (bounded, so a dead server cannot recurse forever).
        '''
        url = 'http://www.sxggzyjy.cn/jydt/001001/{}.html'.format(page)
        for _ in range(max(1, retries)):
            try:
                response = requests.get(
                    url=url, headers=self.headers,
                    timeout=self._REQUEST_TIMEOUT).content.decode('utf-8')
                selector = etree.HTML(response)
            except Exception as e:
                print('load_get error:{}'.format(e))
            else:
                break
        else:
            # Every attempt failed.
            return

        print('第{}页'.format(page))
        for href in selector.xpath('//ul[@class="ewb-list"]/li/a/@href'):
            urls = 'http://www.sxggzyjy.cn' + href
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        '''Drain the URL queue until ``is_running`` reports completion.'''
        count = 2
        while self.is_running():
            # Decide the batch size on every pass: a single worker while the
            # queue is short, ``count`` workers again once it has refilled.
            batch = count if self.rq.r_len() > count else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a worker thread, scan the listing pages, then start another
        worker if URLs are still queued.'''
        threading.Thread(target=self.init).start()
        # A full crawl used 'all_page': 1845.
        task_li = [
            {
                'categoryId': '',
                'types': '',
                'all_page': 2
            },
        ]
        count = 2
        for task in task_li:
            # Batches never run past all_page, even when it is not a
            # multiple of count.
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page) for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point; delegates to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Inner Mongolia Government Procurement Network.

    ``run`` pages through the announcement JSON API and queues every entry
    that is not yet in the de-duplication set of ``self.rq``; ``init``
    drains that queue and passes each entry to ``load_get_html``, which
    downloads the detail page and hands the record to ``self.coll.saves``.
    '''

    # Two CJK characters directly in front of the trailing "公告".
    _STATUS_RE = re.compile(r'[\u4e00-\u9fa5]{2}公告$')
    _PUBLISH_DATE_RE = re.compile(r'\d+年\d+月\d+日')
    # Seconds after which an unresponsive server aborts a request instead
    # of blocking the calling worker indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, the request headers and the queue.'''
        name = 'neimeng_nmgp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.nmgp.gov.cn/wp-content/themes/caigou_pcweb/skin/css/css.css?ver=2.0',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost', dblist='neimeng_list1', dbset='neimeng_set1')

    def is_running(self):
        '''Return False once the queue is empty while the seen-set is not.

        An empty queue combined with an empty set still counts as running.
        NOTE(review): presumably so that workers keep polling before the
        first entry has been queued - confirm.
        '''
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of *sign_str* (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Hand *result_dic* to ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the return value is discarded; the call is kept in
        # case the queue accessors have side effects - confirm and drop.
        self.is_running()

    def get_area(self, pro, strs):
        '''Return the "province-city" detected in *strs*.

        :param pro: province name used when nothing can be detected.
        :param strs: free text, normally the announcement title.
        :return: ``'<province>-<city>'`` when a second component is found,
            otherwise the first component; *pro* when detection fails or
            finds nothing (instead of an implicit ``None``).
        '''
        try:
            df = transform([strs], umap={})
            # Presumably str(df) renders a header ("省 市 区"), the row
            # index "0" and the detected names - TODO confirm.  ``\s``
            # already covers carriage returns and line feeds.
            area_str = re.sub(r'省市区0', '', re.sub(r'\s', '', str(df)))
            area_str = re.sub(r'省|市', '-', area_str)
        except Exception:
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @classmethod
    def _parse_status(cls, title):
        '''Return the announcement type ending *title*, or "公告".'''
        try:
            match = cls._STATUS_RE.search(title)
        except TypeError:  # title is None or not text
            match = None
        return match.group() if match else '公告'

    @classmethod
    def _parse_publish_date(cls, texts):
        '''Return the ``Y年M月D日`` date in the first of *texts*, or None.'''
        if not texts:
            return None
        match = cls._PUBLISH_DATE_RE.search(texts[0])
        return match.group() if match else None

    def load_get(self, params):
        '''Fetch one page of the announcement API and queue unseen entries.

        :param params: query parameters for the ``index.php`` endpoint.
        '''
        try:
            url = 'http://www.nmgp.gov.cn/zfcgwslave/web/index.php'
            response = requests.get(url=url, headers=self.headers,
                                    params=params,
                                    timeout=self._REQUEST_TIMEOUT).json()
        except Exception as e:
            print('load_get error:{}'.format(e))
            return
        if not response:
            return
        # The entries are taken from the first element of the payload.
        for ret_dict in response[0]:
            if not self.rq.in_rset(ret_dict):
                self.rq.add_to_rset(ret_dict)
                self.rq.pull_to_rlist(ret_dict)

    def load_get_html(self, ret_dict):
        '''Download the detail page of one queued entry and store the record.

        Nothing is stored when *ret_dict* is None, when the entry cannot be
        decoded or downloaded (the error is printed) or when the page lacks
        the ``#s-main-2`` container.

        :param ret_dict: queue item holding the textual form of one API
            entry with the keys ``ay_table_tag``, ``wp_mark_id`` and
            ``TITLE_ALL``.
        '''
        if ret_dict is None:
            return
        try:
            # NOTE(security): the entries queued by load_get come from the
            # remote API, and eval() executes this text as Python code.
            # ast.literal_eval looks like a safe drop-in - confirm how
            # Rdis_Queue serialises items before switching.
            ret = eval(ret_dict)
            url = 'http://www.nmgp.gov.cn/ay_post/post.php?tb_id=' + ret['ay_table_tag'] + '&p_id=' + ret['wp_mark_id']
            title = ret['TITLE_ALL']

            response = requests.get(url=url, headers=self.headers,
                                    timeout=self._REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (lxml is already used via etree).
        soup = BeautifulSoup(response, 'lxml')
        container = soup.find(id='s-main-2')
        if container is None or container.div is None:
            # The attribute chain below would raise here, so nothing could
            # be stored for such a page anyway.
            return
        content_html = container.div.div

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = self._parse_status(title)
        retult_dict['publish_date'] = self._parse_publish_date(
            selector.xpath('//*[@id="info-box"]/span/text()'))
        retult_dict['source'] = 'http://www.nmgp.gov.cn/'
        retult_dict['area_name'] = self.get_area('内蒙古', title)

        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '内蒙古自治区政府采购网 '
        retult_dict['en_name'] = 'NeiMengGu District Government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def init(self):
        '''Drain the queue until ``is_running`` reports completion.'''
        count = 6
        while self.is_running():
            # Decide the batch size on every pass: a single worker while the
            # queue is short, ``count`` workers again once it has refilled.
            batch = count if self.rq.r_len() > count else 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(batch)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a worker thread, page through every announcement type, then
        start another worker if entries are still queued.'''
        threading.Thread(target=self.init).start()
        # Full-crawl page counts for type_name 1..8 were
        # 5268, 735, 4482, 101, 925, 2386, 101 and 25.
        task_li = [
            {'type_name': 1, 'all_page': 2},
            {'type_name': 2, 'all_page': 2},
            {'type_name': 3, 'all_page': 2},
            {'type_name': 4, 'all_page': 2},
            {'type_name': 5, 'all_page': 2},
            {'type_name': 6, 'all_page': 2},
            {'type_name': 7, 'all_page': 2},
            {'type_name': 8, 'all_page': 1},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = {
                    'r': 'zfcgw/anndata',
                    'type_name': task['type_name'],
                    'byf_page': str(page),
                    'fun': 'cggg',
                }
                # Pause before the next listing request while the backlog
                # is large.
                if self.rq.r_len() > 8000:
                    time.sleep(3)
                self.load_get(params)
                print('第{}页'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point; delegates to ``run``.'''
        self.run()
Example #3
0
class GovBuy(object):
    '''Crawler for the Shanxi procurement e-mall (www.sxzfcg.cn).

    ``run`` scans the listing pages of each navigation category and queues
    every detail URL that is not yet in the de-duplication set of
    ``self.rq``; ``init`` drains that queue and passes each URL to
    ``load_get_html``, which extracts the announcement fields and hands the
    record to ``self.coll.saves``.
    '''

    # Announcement type directly in front of the trailing "公告".  The
    # keywords have to be whole-word alternatives: a bracketed character
    # class would match any one or two characters out of the list (quotes
    # and commas included) and yield values such as "购预公告".
    _STATUS_RE = re.compile(r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$')
    _PUBLISH_DATE_RE = re.compile(r'(\d{8}|\d{4}年\d+月\d{1,2})')
    # Seconds after which an unresponsive server aborts a request instead
    # of blocking the calling worker indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, the request headers and the URL queue.'''
        name = 'shanxi_sxzfcg_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.sxzfcg.cn/view.php?nav=61',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shanxi_sxzfcg_cn_list1',
                             dbset='shanxi_sxzfcg_cn_set1')

    def is_running(self):
        '''Return False once the queue is empty while the seen-set is not.

        An empty queue combined with an empty set still counts as running.
        NOTE(review): presumably so that workers keep polling before the
        first URL has been queued - confirm.
        '''
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of *sign_str* (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Hand *result_dic* to ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the return value is discarded; the call is kept in
        # case the queue accessors have side effects - confirm and drop.
        self.is_running()

    def get_area(self, pro, strs):
        '''Return the "province-city" detected in *strs*.

        :param pro: province name used when nothing can be detected.
        :param strs: free text, normally the announcement title.
        :return: ``'<province>-<city>'`` when a second component is found,
            otherwise the first component; *pro* when detection fails or
            finds nothing (instead of an implicit ``None``).
        '''
        try:
            df = transform([strs], umap={})
            # Presumably str(df) renders a header ("省 市 区"), the row
            # index "0" and the detected names - TODO confirm.  ``\s``
            # already covers carriage returns and line feeds.
            area_str = re.sub(r'省市区0', '', re.sub(r'\s', '', str(df)))
            area_str = re.sub(r'省|市', '-', area_str)
        except Exception:
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @classmethod
    def _parse_status(cls, title):
        '''Return the announcement type ending *title*, or "公告".'''
        try:
            match = cls._STATUS_RE.search(title)
        except TypeError:  # title is None or not text
            match = None
        return match.group() if match else '公告'

    @classmethod
    def _parse_publish_date(cls, texts):
        '''Return the first date in the joined *texts* as ``Y-M-D``, or None.

        Both ``YYYY年M月D`` and the compact ``YYYYMMDD`` form are accepted;
        either one is returned dash-separated.
        '''
        match = cls._PUBLISH_DATE_RE.search(''.join(texts))
        if match is None:
            return None
        found = match.group()
        if found.isdigit():
            # Compact form: insert the separators the other form carries.
            return '{}-{}-{}'.format(found[0:4], found[4:6], found[6:8])
        return re.sub(r'年|月', '-', found)

    @staticmethod
    def _page_batches(all_page, count):
        '''Yield lists of up to *count* consecutive pages covering 1..all_page.'''
        for first in range(1, all_page + 1, count):
            yield list(range(first, min(first + count, all_page + 1)))

    def load_get_html(self, url):
        '''Download one detail page, extract its fields and store the record.

        Nothing is stored when *url* is None, when the download or parse
        fails (the error is printed) or when the page has no content row
        under ``td.c_pt``.

        :param url: absolute URL of the announcement detail page.
        '''
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self._REQUEST_TIMEOUT).content.decode('utf-8')
            print(url)
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        # Without a content row there is nothing worth storing.
        table_ele = selector.xpath('//td[@class="c_pt"]/table/tr[2]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        title = selector.xpath('//div[@valign="middle"]/h2/text()')
        title = re.sub(r'\r|\n|\s', '', ''.join(title)) if title else None
        publish_date = self._parse_publish_date(
            selector.xpath('//td[@bgcolor="#E6E6E6"]//text()'))

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = self._parse_status(title)
        retult_dict['area_name'] = '山西'
        retult_dict['source'] = 'http://www.sxzfcg.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '山西省省级政府采购中心'
        retult_dict['en_name'] = 'Shanxi Government Procurement Center'

        print(publish_date)

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        '''Fetch one listing page and queue its unseen detail URLs.

        :param categoryId: unused; kept for interface compatibility.
        :param types: value of the ``nav`` query parameter.
        :param page: value of the ``page`` query parameter.
        '''
        try:
            params = (
                ('nav', types),
                ('page', page),
            )
            url = 'http://www.sxzfcg.cn/view.php'
            response = requests.get(
                url=url,
                headers=self.headers,
                params=params,
                timeout=self._REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        for href in selector.xpath('//tr[@class="odd"]/td/a/@href'):
            urls = 'http://www.sxzfcg.cn/{}'.format(href)
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        '''Drain the URL queue until ``is_running`` reports completion.'''
        count = 2
        while self.is_running():
            # Decide the batch size on every pass: a single worker while the
            # queue is short, ``count`` workers again once it has refilled.
            batch = count if self.rq.r_len() > count else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a worker thread, scan every category, then start another
        worker if more than ten URLs are still queued.'''
        threading.Thread(target=self.init).start()
        flag = 1
        # One task per navigation category 61..69, each limited to ``flag``
        # listing pages.
        task_li = [
            {
                'categoryId': '',
                'types': str(nav),
                'all_page': flag
            } for nav in range(61, 70)
        ]
        count = 2
        for task in task_li:
            # Batches never run past all_page, even when it is not a
            # multiple of count.
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page) for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point; delegates to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Shenzhen Government Procurement Network.

    ``run`` pages through the search endpoint and queues every detail URL
    that is not yet in the de-duplication set of ``self.rq``; ``init``
    drains that queue and passes each URL to ``load_get_html``, which
    extracts the announcement fields and hands the record to
    ``self.coll.saves``.
    '''

    # Two CJK characters directly in front of the trailing "公告".
    _STATUS_RE = re.compile(r'[\u4e00-\u9fa5]{2}公告$')
    _PUBLISH_DATE_RE = re.compile(r'(\d+\-\d+\-\d+)')
    # Seconds after which an unresponsive server aborts a request instead
    # of blocking the calling worker indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, headers, HTTP session and URL queue.'''
        name = 'shenzhen_zfcg_sz_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        # NOTE(review): the "^" characters in the Referer look like shell
        # escaping left over from a copied command line - confirm whether
        # the server cares before cleaning them up.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            'Referer':
            'http://61.144.227.212/was5/web/search?page=4096^&channelid=261279^&orderby=-DOCRELTIME^&perpage=10^&outlinepage=5^&searchscope=^&timescope=^&timescopecolumn=^&orderby=-DOCRELTIME^&chnlid=^&andsen=^&total=^&orsen=^&exclude=',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Origin': 'http://61.144.227.212',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shenzhen_list1',
                             dbset='shenzhen_set1')

    def is_running(self):
        '''Return False once the queue is empty while the seen-set is not.

        An empty queue combined with an empty set still counts as running.
        NOTE(review): presumably so that workers keep polling before the
        first URL has been queued - confirm.
        '''
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of *sign_str* (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Hand *result_dic* to ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the return value is discarded; the call is kept in
        # case the queue accessors have side effects - confirm and drop.
        self.is_running()

    @classmethod
    def _parse_status(cls, title):
        '''Return the announcement type ending *title*, or "公告".'''
        try:
            match = cls._STATUS_RE.search(title)
        except TypeError:  # title is None or not text
            match = None
        return match.group() if match else '公告'

    @classmethod
    def _parse_publish_date(cls, texts):
        '''Return the first ``Y-M-D`` date in the joined *texts*, or None.'''
        match = cls._PUBLISH_DATE_RE.search(''.join(texts))
        return match.group() if match else None

    @staticmethod
    def _page_batches(all_page, count):
        '''Yield lists of up to *count* consecutive pages covering 1..all_page.'''
        for first in range(1, all_page + 1, count):
            yield list(range(first, min(first + count, all_page + 1)))

    def load_get_html(self, url):
        '''Download one detail page, extract its fields and store the record.

        Nothing is stored when *url* is None or when the download, decode
        or parse fails (the error is printed).

        :param url: URL of the announcement detail page.
        '''
        if url is None:
            return
        try:
            # NOTE(review): pages using characters outside GB2312 fail to
            # decode and are skipped; "gb18030" may be the safer codec -
            # confirm against the site.
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self._REQUEST_TIMEOUT).content.decode('gb2312')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title = selector.xpath(
            '//*[@id="content"]/div/div[2]/div/h4/text()')
        title = title[0] if title else None

        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (lxml is already used via etree).
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='main')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = self._parse_status(title)
        retult_dict['publish_date'] = self._parse_publish_date(
            selector.xpath(
                '//*[@id="content"]/div/div[2]/div/h6/label//text()'))
        retult_dict['source'] = 'http://www.zfcg.sz.gov.cn/'
        retult_dict['area_name'] = '深圳'

        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '深圳市政府采购监管网 '
        retult_dict['en_name'] = 'Shenzhen Government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, page):
        '''Fetch one page of search results and queue its unseen URLs.

        :param page: result page number sent as the ``page`` parameter.
        '''
        try:
            params = (
                ('page', str(page)),
                ('channelid', '261279'),
                ('orderby', ['-DOCRELTIME', '-DOCRELTIME']),
                ('perpage', '10'),
                ('outlinepage', '5'),
                ('searchscope', ''),
                ('timescope', ''),
                ('timescopecolumn', ''),
                ('chnlid', ''),
                ('andsen', ''),
                ('total', ''),
                ('orsen', ''),
                ('exclude', ''),
            )
            data = [
                ('showother', 'false'),
                ('showtype', 'txt'),
                ('classnum', '20'),
                ('classcol', 'CTYPE'),
                ('channelid', '261279'),
                ('orderby', '-DOCRELTIME'),
            ]
            url = 'http://61.144.227.212/was5/web/search'
            response = self.session.post(
                url=url,
                headers=self.headers,
                params=params,
                data=data,
                timeout=self._REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath('//div[@class="r_list"]/dl/dd/a/@href')
            print('第{}页'.format(page))
        except Exception as e:
            print('load_post error:{}'.format(e))
            return

        for detail_url in url_li:
            if not self.rq.in_rset(detail_url):
                self.rq.add_to_rset(detail_url)
                self.rq.pull_to_rlist(detail_url)

    def init(self):
        '''Drain the URL queue until ``is_running`` reports completion.'''
        count = 2
        while self.is_running():
            # Decide the batch size on every pass: a single worker while the
            # queue is short, ``count`` workers again once it has refilled.
            batch = count if self.rq.r_len() > count else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a worker thread, scan the result pages, then start another
        worker if URLs are still queued.'''
        threading.Thread(target=self.init).start()
        # A full crawl used 'all_page': 43879.
        task_li = [
            {
                'all_page': 5
            },
        ]
        count = 3
        for task in task_li:
            # Batches never run past all_page, even when it is not a
            # multiple of count.
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page) for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point; delegates to ``run``.'''
        self.run()
Example #5
0
class GovBuy(object):
    """Crawler for the Chongqing Government Procurement site (www.cqgp.gov.cn).

    ``run`` pages through the notice-list API and queues notice ids that are
    not yet in the queue's seen-set; ``init`` runs in a background thread,
    takes ids off the queue and stores each notice via ``self.coll``.
    """

    def __init__(self):
        """Create the storage handle, request headers, session and queue."""
        name = 'chongqing_cqgp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            # The original value still carried cmd.exe caret escapes ("^&",
            # "^%") left over from a copied curl command; they are stripped
            # here so the header is a valid percent-encoded URL.
            'Referer':
            'https://www.cqgp.gov.cn/notices/list?source=41,42&area=%E9%87%8D%E5%BA%86%E5%B8%82&purches=%E9%87%87%E8%B4%AD%E5%85%AC%E5%91%8A',
            'Connection': 'keep-alive',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='chongqing_list1',
                             dbset='chongqing_set1')

    def is_running(self):
        """Return False once the work list is empty but the seen-set is not."""
        finished = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not finished

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one scraped record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; this call looks removable.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive a ``province[-city]`` label from free text.

        :param pro: fallback label used when nothing can be extracted.
        :param strs: text handed to ``transform`` for region detection.
        :return: ``'province-city'`` when a city was found, else the first
            extracted component, else ``pro``.
        """
        try:
            df = transform([strs], umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # The original bare ``except: pass`` returned None here; fall back
            # to the caller-supplied label like the empty-result branch does.
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, pid, max_retries=5):
        """Fetch one notice by id and store it.

        :param pid: notice id; ``None`` is ignored.
        :param max_retries: number of request attempts before giving up.  The
            original retried by unbounded recursion, which ends in a
            ``RecursionError`` when the failure is persistent.
        """
        if pid is None:
            return
        url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable/{}'.format(
            pid)
        for _ in range(max_retries):
            try:
                proxies = proxy_pool.proxies()
                response = requests.get(url=url,
                                        headers=self.headers,
                                        proxies=proxies,
                                        timeout=10).json()
            except Exception as e:
                print('laod_get_html error:{}'.format(e))
            else:
                break
        else:
            # Every attempt failed; drop this id instead of recursing forever.
            return

        notice = response['notice']
        title = notice['title']
        status = notice.get('projectPurchaseWayName', '公告')

        # ``issueTime`` comes from JSON, so it is a string or missing; the
        # original compared it to ``[]`` and called ``.group()`` unguarded.
        issue_time = notice.get('issueTime')
        date_match = (re.search(r'(\d{4}\-\d+\-\d{1,2})', issue_time)
                      if issue_time else None)
        publish_date = date_match.group() if date_match else None

        result = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '重庆',
            'source': 'https://www.cqgp.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(notice['html']),
            'create_time': self.now_time(),
            'zh_name': '重庆市政府采购网',
            'en_name': 'Chongqing City Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(result)

    def load_get(self, page, max_retries=5):
        """Fetch one page of the notice list and queue unseen notice ids.

        :param page: page index sent as the ``pi`` query parameter.
        :param max_retries: number of request attempts before giving up
            (replaces the original unbounded recursive retry).
        """
        url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable'
        for _ in range(max_retries):
            try:
                # Rebuilt per attempt so each request gets a fresh timestamp.
                params = (
                    ('pi', page),
                    ('ps', '20'),
                    ('timestamp', str(int(time.time() * 1000))),
                )
                proxies = proxy_pool.proxies()
                response = requests.get(url=url,
                                        headers=self.headers,
                                        params=params,
                                        proxies=proxies,
                                        timeout=5).json()
            except Exception as e:
                print('load_get error:{}'.format(e))
            else:
                break
        else:
            return

        print('第{}页'.format(page))
        for data_dict in response['notices']:
            pid = data_dict['id']
            if not self.rq.in_rset(pid):
                self.rq.add_to_rset(pid)
                self.rq.pull_to_rlist(pid)

    def init(self):
        """Consume queued notice ids until ``is_running()`` turns false."""
        count = 2  # upper bound on greenlets spawned per batch
        while self.is_running():
            # Choose the batch size per pass; the original overwrote ``count``
            # and therefore never went back above one greenlet.
            batch = 1 if self.rq.r_len() <= count else count
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread and crawl list pages ``1..all_page``."""
        threading.Thread(target=self.init).start()
        task_li = [
            # A full crawl used {'all_page': 18647}.
            {
                'all_page': 3
            },
        ]
        count = 2  # list pages fetched concurrently
        for task in task_li:
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                # Clamp so the final batch stays within ``all_page``
                # (previously page 4 was requested for all_page=3).
                batch_end = min(page + count, last_page + 1)
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page_no)
                        for page_no in range(page, batch_end)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
Example #6
0
class GovBuy(object):
    """Crawler for the Suzhou public resource trading site.

    ``run`` walks the listing pages of each category and queues detail URLs
    that are not yet in the queue's seen-set; ``init`` runs in a background
    thread, takes URLs off the queue and stores each page via ``self.coll``.
    """

    # Announcement kinds recognised at the end of a title.  The original
    # pattern wrapped these words in ``[...]``, which is a character class:
    # it matched any one or two of the listed characters (including the quote
    # and comma), e.g. "标公告" in "目标公告".
    STATUS_RE = re.compile(r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$')
    DATE_RE = re.compile(r'(\d{8}|\d{4}\/\d+\/\d{1,2})')

    def __init__(self):
        """Create the storage handle, request headers and the work queue."""
        name = 'suzhou_szzyjy_fwzx_suzhou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'X-DevTools-Emulate-Network-Conditions-Client-Id': '06AB3D9C05E9FDAB1EDDAD36BA60296F',
            # NOTE(review): this Referer points at a different site
            # (ggzy.hefei.gov.cn); it looks copied from another crawler.
            'Referer': 'http://ggzy.hefei.gov.cn/jyxx/002001/002001001/3.html',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='suzhou_szzyjy_fwzx_suzhou_gov_cn_list1',
                             dbset='suzhou_szzyjy_fwzx_suzhou_gov_cn_set1')

    def is_running(self):
        """Return False once the work list is empty but the seen-set is not."""
        finished = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not finished

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one scraped record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; this call looks removable.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive a ``province[-city]`` label from free text.

        :param pro: fallback label used when nothing can be extracted.
        :param strs: text handed to ``transform`` for region detection.
        :return: ``'province-city'`` when a city was found, else the first
            extracted component, else ``pro``.
        """
        try:
            df = transform([strs], umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # The original bare ``except: pass`` returned None here.
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _parse_status(self, title):
        """Return the announcement kind ending ``title``, or the generic '公告'."""
        found = self.STATUS_RE.search(title)
        return found.group() if found else '公告'

    def _parse_publish_date(self, text_nodes):
        """Return the first date in ``text_nodes`` with '/' replaced by '-'.

        Returns None when the list is empty or holds no date; the original
        called ``.group()`` on the failed match and raised AttributeError.
        """
        found = self.DATE_RE.search(''.join(text_nodes))
        return found.group().replace('/', '-') if found else None

    def load_get_html(self, url):
        """Download one detail page, extract its fields and store the record.

        :param url: absolute detail-page URL; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            # A timeout keeps one stalled connection from blocking the batch.
            response = requests.get(url=url, headers=self.headers, timeout=10).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Nothing parseable came back.
            return

        title_nodes = selector.xpath('//h2[@class="word-title"]/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_nodes))
            status = self._parse_status(title)
        else:
            title = None
            status = '公告'

        _id = self.hash_to_md5(url)
        publish_date = self._parse_publish_date(
            selector.xpath('//h4[@class="word-info"]//text()'))
        print(publish_date, title)

        body_nodes = selector.xpath('//div[@class="border"]')
        if not body_nodes:
            return
        content_html = etree.tostring(body_nodes[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        result = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': '江苏-苏州',
            'source': 'http://szzyjy.fwzx.suzhou.gov.cn',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '苏州市公共资源交易中心',
            'en_name': 'Suzhou City Public resource',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(result)

    def load_get(self, categoryId, types, page, max_retries=3):
        """Fetch one listing page and queue the detail URLs not seen before.

        :param categoryId: unused; kept for call compatibility.
        :param types: category path inserted into the listing URL.
        :param page: page number sent as the ``paging`` query parameter.
        :param max_retries: attempts made when link extraction fails.  The
            original retried by unbounded recursion and then fell through to
            the loop below with ``url_li`` unbound.
        """
        list_url = 'http://szzyjy.fwzx.suzhou.gov.cn/Front/jyzx/{}/'.format(types)
        params = (
            ('paging', page),
        )
        for _ in range(max_retries):
            try:
                response = requests.get(url=list_url, headers=self.headers, params=params, timeout=10).content.decode('utf-8')
                selector = etree.HTML(response)
            except Exception as e:
                print('load_get error:{}'.format(e))
                return
            print('第{}页'.format(page))
            try:
                url_li = selector.xpath('//*[@class="mr-content"]/div[1]/table/tr/td[1]/a/@href')
            except Exception:
                # E.g. parsing produced no tree; pause, then fetch again.
                time.sleep(3)
            else:
                break
        else:
            return

        for href in url_li:
            urls = 'http://szzyjy.fwzx.suzhou.gov.cn' + href
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued detail URLs until ``is_running()`` turns false."""
        count = 2  # upper bound on greenlets spawned per batch
        while self.is_running():
            # Choose the batch size per pass; the original overwrote ``count``
            # and therefore never went back above one greenlet.
            batch = 1 if self.rq.r_len() <= count else count
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(batch)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread and crawl every category's listing pages.

        If more than 10 entries are still queued afterwards, a second
        consumer thread is started.
        """
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '', 'types': '002004/002004001', 'all_page': 2},
            {'categoryId': '', 'types': '002004/002004002', 'all_page': 2},
            {'categoryId': '', 'types': '002004/002004003', 'all_page': 2},
            {'categoryId': '', 'types': '002004/002004004', 'all_page': 1},
            {'categoryId': '', 'types': '002004/002004005', 'all_page': 2},
            {'categoryId': '', 'types': '002004/002004006', 'all_page': 1},
        ]
        count = 1  # listing pages fetched concurrently
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                # Clamp so a larger ``count`` can never request pages beyond
                # ``all_page``.
                batch_end = min(page + count, last_page + 1)
                try:
                    spawns = [gevent.spawn(self.load_get, categoryId, types, page_no) for page_no in range(page, batch_end)]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
Example #7
0
class GovBuy(object):
    """Crawler for the Shanghai Minhang public resource trading site.

    ``run`` walks the listing pages of each category and queues detail URLs
    that are not yet in the queue's seen-set; ``init`` runs in a background
    thread, takes URLs off the queue and stores each page via ``self.coll``.
    """

    # Announcement kinds recognised at the end of a title.  The original
    # pattern wrapped these words in ``[...]``, which is a character class:
    # it matched any one or two of the listed characters (including the quote
    # and comma), e.g. "标公告" in "目标公告".
    STATUS_RE = re.compile(r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$')
    DATE_RE = re.compile(r'(\d{4}\-\d+\-\d{1,2})')

    def __init__(self):
        """Create the storage handle, request headers and the work queue."""
        name = 'shanghai_ztb_shmh_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'X-DevTools-Emulate-Network-Conditions-Client-Id':
            'C30FE2988AF840A005E144C01A1874D4',
            'Referer':
            'http://ztb.shmh.gov.cn/mhztb_site/html/shmhztb_subject/shmhztb_subject_zfcg_cggg/List/list_350.htm',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shanghai_ztb_shmh_gov_cn_list1',
                             dbset='shanghai_ztb_shmh_gov_cn_set1')

    def is_running(self):
        """Return False once the work list is empty but the seen-set is not."""
        finished = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not finished

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one scraped record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; this call looks removable.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive a ``province[-city]`` label from free text.

        :param pro: fallback label used when nothing can be extracted.
        :param strs: text handed to ``transform`` for region detection.
        :return: ``'province-city'`` when a city was found, else the first
            extracted component, else ``pro``.
        """
        try:
            df = transform([strs], umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # The original bare ``except: pass`` returned None here.
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _parse_status(self, title):
        """Return the announcement kind ending ``title``, or the generic '公告'."""
        found = self.STATUS_RE.search(title)
        return found.group() if found else '公告'

    def _parse_publish_date(self, text_nodes):
        """Return the first ``YYYY-M-D`` date found in ``text_nodes``.

        Returns None when the list is empty or holds no date; the original
        called ``.group()`` on the failed match and raised AttributeError.
        """
        found = self.DATE_RE.search(''.join(text_nodes))
        return found.group() if found else None

    def load_get_html(self, url):
        """Download one detail page, extract its fields and store the record.

        :param url: absolute detail-page URL; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            # A timeout keeps one stalled connection from blocking the batch.
            response = requests.get(url=url,
                                    headers=self.headers,
                                    timeout=10).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Nothing parseable came back.
            return

        title_nodes = selector.xpath('//div[@class="title"]/h2/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_nodes))
            status = self._parse_status(title)
        else:
            title = None
            status = '公告'

        _id = self.hash_to_md5(url)
        publish_date = self._parse_publish_date(
            selector.xpath('//div[@class="title"]/h3//text()'))

        body_nodes = selector.xpath('//div[@class="list_right"]')
        if not body_nodes:
            return
        content_html = etree.tostring(body_nodes[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        result = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': '上海',
            'source': 'http://ztb.shmh.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '上海市闵行区公共资源交易网',
            'en_name': 'Minhang District Public resource',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(result)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and queue the detail URLs not seen before.

        :param categoryId: unused; kept for call compatibility.
        :param types: category directory inserted into the listing URL.
        :param page: page index used in ``list_<page>.htm``.
        """
        list_url = 'http://ztb.shmh.gov.cn/mhztb_site/html/shmhztb_subject/{}/List/list_{}.htm'.format(
            types, page)
        try:
            response = requests.get(url=list_url,
                                    headers=self.headers,
                                    timeout=10).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return
        if selector is None:
            return

        print('第{}页'.format(page))
        for href in selector.xpath('//ul[@id="list_ul"]/li/a/@href'):
            # Raw string: the original non-raw pattern relied on invalid
            # escape sequences ("\." and "\/"), which newer Pythons warn on.
            urls = ('http://ztb.shmh.gov.cn/mhztb_site/html/shmhztb_subject/'
                    + re.sub(r'\.\.\/\.\.\/', '', href))
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued detail URLs until ``is_running()`` turns false."""
        count = 2  # upper bound on greenlets spawned per batch
        while self.is_running():
            # Choose the batch size per pass; the original overwrote ``count``
            # and therefore never went back above one greenlet.
            batch = 1 if self.rq.r_len() <= count else count
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread and crawl every category's listing pages.

        Pages ``0..all_page`` of each category are fetched ``count`` at a
        time.  If more than 10 entries are still queued afterwards, a second
        consumer thread is started.
        """
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '', 'types': 'shmhztb_subject_zfcg_cggg', 'all_page': 2},
            {'categoryId': '', 'types': 'shmhztb_subject_zfcg_jggg', 'all_page': 2},
            {'categoryId': '', 'types': 'shmhztb_subject_zfcg_dyly', 'all_page': 1},
            {'categoryId': '', 'types': 'shmhztb_subject_jsgc_zbxx', 'all_page': 2},
            {'categoryId': '', 'types': 'shmhztb_subject_jsgc_zgbxx', 'all_page': 1},
            {'categoryId': '', 'types': 'shmhztb_subject_ggzy_jyxx', 'all_page': 1},
            {'categoryId': '', 'types': 'shmhztb_subject_ggzy_cjxx', 'all_page': 1},
            # NOTE(review): duplicate of the previous entry in the original
            # list; possibly meant to be a different category.
            {'categoryId': '', 'types': 'shmhztb_subject_ggzy_cjxx', 'all_page': 1},
        ]
        count = 2  # listing pages fetched concurrently
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            last_page = task['all_page']
            for page in range(0, last_page + 1, count):
                # Clamp so the final batch stays within ``all_page``
                # (previously page 3 was requested for all_page=2).
                batch_end = min(page + count, last_page + 1)
                try:
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types, page_no)
                        for page_no in range(page, batch_end)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
class GovBuy(object):
    '''江苏政府采购网'''
    def __init__(self):
        """Create the storage handle, request headers, session and queue."""
        site_key = 'jiangsu_ccgp-jiangsu_gov_cn'
        storage = StorageSetting(site_key)
        self.coll = storage
        self.collection = storage.find_collection

        # Browser-like headers sent with every request.
        user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
        accept = ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8')
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent,
            'Accept': accept,
            'Referer': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/index_1.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.session = requests.session()

        queue_options = {
            'host': 'localhost',
            'dblist': 'jiangsu_list1',
            'dbset': 'jiangsu_set1',
        }
        self.rq = Rdis_Queue(**queue_options)

    def is_running(self):
        is_runing = True
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        else:
            return is_runing

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` encoded as UTF-8."""
        payload = sign_str.encode('utf-8')
        return hashlib.md5(payload).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())

    def save_to_mongo(self, result_dic):
        """Persist one scraped record through ``self.coll.saves``.

        NOTE(review): the backend is whatever ``StorageSetting`` provides --
        presumably MongoDB, going by the method name; confirm.
        """
        self.coll.saves(result_dic)
        # The return value is discarded, so this call does not affect control
        # flow here.  NOTE(review): looks removable.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive a ``province[-city]`` label from free text.

        :param pro: fallback label used when nothing can be extracted.
        :param strs: text handed to ``transform`` for region detection
            (presumably a place-name parser returning a table -- confirm).
        :return: ``'province-city'`` when a city component was found, else
            the first extracted component, else ``pro``.
        """
        try:
            df = transform([strs], umap={})
            # Flatten the printed table and turn the province/city suffixes
            # into '-' separators.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # The original bare ``except: pass`` made the method return None
            # on any failure; fall back to the caller-supplied label, exactly
            # as the empty-result branch below does.
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, url):
        """Download one detail page, extract its fields and store the record.

        :param url: absolute detail-page URL; ``None`` is ignored.

        Nothing is stored when the request fails, the page cannot be parsed,
        or the ``div.detail`` content block is missing.
        """
        if url is None:
            return
        try:
            # A timeout keeps one stalled connection from blocking the batch.
            response = requests.get(url=url,
                                    headers=self.headers,
                                    timeout=10).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Nothing parseable came back.
            return

        title_nodes = selector.xpath('//div[@class="dtit"]/h1/text()')
        status = '公告'
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', title_nodes[0])
            # The two characters before a trailing "公告" name the notice kind.
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            if status_match:
                status = status_match.group()
        else:
            title = None

        _id = self.hash_to_md5(url)

        date_nodes = selector.xpath('//div[@class="detail_bz"]/span/text()')
        # Guard the match: the original called ``.group()`` unconditionally
        # and raised AttributeError when the text held no date.
        date_match = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(date_nodes))
        publish_date = date_match.group() if date_match else None

        area_name = self.get_area('江苏', title)

        source = 'http://www.ccgp-jiangsu.gov.cn/'

        detail_nodes = selector.xpath('//div[@class="detail"]')
        if not detail_nodes:
            return
        content_html = etree.tostring(detail_nodes[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        result = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': area_name,
            'source': source,
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '江苏政府采购网',
            'en_name': 'Jiangsu Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result)

    def load_get(self, base_url, page):
        """Fetch one listing page and queue the detail URLs not seen before.

        :param base_url: category directory URL (ending in ``/``).
        :param page: page index; 0 requests ``base_url`` itself, any other
            value requests ``index_<page>.html`` beneath it.
        """
        if page == 0:
            url = base_url
        else:
            url = base_url + 'index_' + str(page) + '.html'
        try:
            # A timeout keeps one stalled connection from blocking the batch.
            response = requests.get(url=url,
                                    headers=self.headers,
                                    timeout=10).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception:
            # Narrowed from a bare ``except`` so interpreter-level signals
            # (KeyboardInterrupt, SystemExit, ...) are no longer swallowed.
            print('load_post error')
            return
        if selector is None:
            # Nothing parseable came back.
            return

        # Listings use one of two container classes; try the second only
        # when the first yields nothing.
        hrefs = selector.xpath('//div[@class="list_list"]/ul/li/a/@href')
        if not hrefs:
            hrefs = selector.xpath('//div[@class="list_list02"]/ul/li/a/@href')

        for href in hrefs:
            urls = base_url + href.replace('./', '')
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued entries until ``is_running()`` reports completion.

        Each pass hands whatever ``self.rq.get_to_rlist()`` returns to
        ``load_get_html`` inside gevent greenlets and waits for the batch.
        """
        count = 8  # upper bound on greenlets spawned per batch
        while self.is_running():
            # Pick the batch size on every pass.  The original overwrote
            # ``count`` itself, so after the queue ran low a single time the
            # consumer stayed at one greenlet for the rest of the run.
            batch = 1 if self.rq.r_len() <= count else count
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        threading.Thread(target=self.init).start()
        flag = 2
        task_li = [
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cgyg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/htgg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/xqyj/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/ysgg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/shengji/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/suzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/nanjing/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/wuxi/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/changzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/zhenjiang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/nantong/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/yangzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/yancheng/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/huaian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/suqian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/lianyungang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/xuzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/shengji/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/suzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/nanjing/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/wuxi/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/changzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/zhenjiang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/nantong/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/taizhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/yangzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/yancheng/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/huaian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/suqian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/lianyungang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/gzgg/xuzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/shengji/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/suzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/nanjing/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/wuxi/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/changzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/zhenjiang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/nantong/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/taizhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/yangzhou/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/yancheng/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/huaian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/suqian/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/lianyungang/',
                'all_page': flag
            },
            {
                'url': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cjgg/xuzhou/',
                'all_page': flag
            },
        ]
        count = 3
        for task in task_li:
            for page in range(0, task['all_page'] + 1, count):
                try:
                    base_url = task['url']

                    # self.load_get(base_url, page)
                    spawns = [
                        gevent.spawn(self.load_get, base_url, page + i)
                        for i in range(count)
                    ]
                    gevent.joinall(spawns)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point: delegate to ``run`` and discard its result.'''
        crawl = self.run
        crawl()
Example #9
0
class GovBuy(object):
    '''Crawler for the Tibet (Xizang) government procurement site.

    ``load_get`` fetches list pages from www.ccgp-xizang.gov.cn and hands every
    detail URL it has not seen before to ``self.rq``. ``init`` takes URLs back
    out of ``self.rq`` and passes each one to ``load_get_html``, which parses
    the detail page and stores one record through ``self.coll``.
    '''

    # Number of attempts per HTTP request. The original code retried by calling
    # itself recursively without a bound, which ends in RecursionError when the
    # site stays unreachable.
    MAX_RETRIES = 3
    # Seconds after which a request is abandoned; without a timeout a stalled
    # connection blocks its greenlet forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, the request headers and the work queue.'''
        name = 'xizang_ccgp-xizang_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.ccgp-xizang.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.ccgp-xizang.gov.cn/shopHome/morePolicyNews.action?categoryId=124',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='xizang_list1',
                             dbset='xizang_set1')

    def is_running(self):
        '''Return False when the queue list is empty but the set is not.

        In every other case (including "both empty") the result is True.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist ``result_dic`` through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # Result is unused; the call is kept from the original code.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area label from the text ``strs``.

        :param pro: fallback label used when no area text is extracted.
        :param strs: text handed to ``transform`` (normally the page title).
        :return: ``'<first>-<second>'`` when two non-empty parts are found,
            otherwise the first part; ``None`` when extraction raises.
        '''
        try:
            # NOTE(review): ``transform`` is defined outside this class;
            # presumably an address parser returning a DataFrame - confirm.
            df = transform([strs], umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Best effort: callers store None as the area when parsing fails.
            return None

        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, url):
        '''Fetch one detail page, parse it and store the resulting record.

        :param url: absolute detail-page URL; ``None`` is ignored.
        :return: always ``None``. Nothing is stored when the page cannot be
            fetched within ``MAX_RETRIES`` attempts or has no content block.
        '''
        if url is None:
            return

        selector = None
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.get(
                    url=url, headers=self.headers,
                    timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
                selector = etree.HTML(response)
                break
            except Exception as e:
                print('laod_get_html error:{}'.format(e))
        if selector is None:
            return

        status = '公告'
        title = selector.xpath('//h2[@class="sd"]/font/text()')
        if title:
            title = re.sub(r'\r|\n|\s', '', title[0])
            # The two CJK characters preceding a trailing "公告" become the status.
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            if status_match is not None:
                status = status_match.group()
        else:
            title = None

        _id = self.hash_to_md5(url)

        # A missing or unparsable date yields None instead of an
        # AttributeError from calling .group() on a failed match.
        date_text = ''.join(selector.xpath('//h3[@class="wzxq"]/text()'))
        date_match = re.search(r'(\d{4}\-\d+\-\d{1,2})', date_text)
        publish_date = date_match.group() if date_match else None
        print(publish_date, title)

        area_name = self.get_area('西藏', title)
        source = 'http://www.ccgp-xizang.gov.cn/'

        table_ele = selector.xpath('//div[@class="neirong"]')
        if not table_ele:
            return

        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': area_name,
            'source': source,
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '西藏自治区政府采购网',
            'en_name': 'Xizang Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, page):
        '''Fetch one list page and enqueue the detail URLs not seen before.

        :param categoryId: value of the ``categoryId`` query parameter.
        :param page: page number, posted as ``currentPage``.
        :return: always ``None``; gives up after ``MAX_RETRIES`` failures.
        '''
        url = 'http://www.ccgp-xizang.gov.cn/shopHome/morePolicyNews.action'
        params = {'categoryId': categoryId}
        data = {'currentPage': str(page)}

        selector = None
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.post(
                    url=url,
                    headers=self.headers,
                    params=params,
                    data=data,
                    timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
                selector = etree.HTML(response)
                break
            except Exception as e:
                print('load_get error:{}'.format(e))
        if selector is None:
            return

        print('第{}页'.format(page))
        url_li = selector.xpath('//div[@id="news_div"]/ul/li/div[1]/a/@href')
        for href in url_li:
            urls = 'http://www.ccgp-xizang.gov.cn' + href
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        '''Worker loop: process queued detail URLs while ``is_running()``.'''
        count = 2
        while self.is_running():
            # Once the queue is short the batch size drops to 1 and stays there.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start the worker thread, then crawl every configured list page.'''
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'categoryId': '124',
                'all_page': 2
            },
            {
                'categoryId': '125',
                'all_page': 2
            },
        ]
        count = 2
        for task in task_li:
            categoryId = task['categoryId']
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                try:
                    # Clamp each batch so no page beyond ``all_page`` is
                    # requested when it is not a multiple of ``count``.
                    batch_end = min(page + count, last_page + 1)
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, page_no)
                        for page_no in range(page, batch_end)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point; runs the crawl.'''
        self.run()
Example #10
0
class GovBuy(object):
    '''Crawler for the Urumqi public resource trading site (ggzy.wlmq.gov.cn).

    ``load_get`` posts a DWR list query and hands every ``FILE_ID`` it has not
    seen before to ``self.rq``. ``init`` takes ids back out of ``self.rq`` and
    passes each one to ``load_get_html``, which parses the detail page and
    stores one record through ``self.coll``.
    '''

    # Number of attempts for the list request. The original code retried by
    # unbounded recursion, which ends in RecursionError when the site is down.
    MAX_RETRIES = 3
    # Seconds after which a request is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, the request headers and the work queue.'''
        name = 'wulumuqi_ggzy_wlmq_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://ggzy.wlmq.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'text/plain',
            'Accept': '*/*',
            'Referer': 'http://ggzy.wlmq.gov.cn/generalpage.do?method=showList&fileType=201605-048&faname=201605-046',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost', dblist='wulumuqi_list1', dbset='wulumuqi_set1')

    def is_running(self):
        '''Return False when the queue list is empty but the set is not.

        In every other case (including "both empty") the result is True.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist ``result_dic`` through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # Result is unused; the call is kept from the original code.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area label from the text ``strs``.

        :param pro: fallback label used when no area text is extracted.
        :param strs: text handed to ``transform``.
        :return: ``'<first>-<second>'`` when two non-empty parts are found,
            otherwise the first part; ``None`` when extraction raises.
        '''
        try:
            # NOTE(review): ``transform`` is defined outside this class;
            # presumably an address parser returning a DataFrame - confirm.
            df = transform([strs], umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Best effort: None is returned when parsing fails.
            return None

        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, info_id):
        '''Fetch one detail page by id, parse it and store the record.

        :param info_id: value of the ``infoid`` query parameter; ``None`` is
            ignored.
        :return: always ``None``. Nothing is stored when the request fails or
            the page has no content block.
        '''
        if info_id is None:
            return
        # Built before the try block so the error branch can always print it.
        url = 'http://ggzy.wlmq.gov.cn/infopublish.do?method=infoPublishView&infoid=' + info_id
        try:
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            print(url)
            return

        status = '公告'
        title = selector.xpath('//div[@class="title"]/text()')
        # xpath() returns a list; the original compared it with '' (always
        # unequal) and then crashed on title[0] for pages without a title.
        if title:
            title = re.sub(r'\r|\n|\s', '', title[0])
            # Alternation of the known announcement kinds. The original wrote
            # them inside [...], which is a character class and also matched
            # quotes, commas and arbitrary mixes of the listed characters.
            status_match = re.search(r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
            if status_match is not None:
                status = status_match.group()
        else:
            title = None

        _id = self.hash_to_md5(url)

        # A missing or unparsable date yields None instead of an
        # AttributeError from calling .group() on a failed match.
        date_text = ''.join(selector.xpath('//td[@class="td_name"]//text()'))
        date_match = re.search(r'(\d{8}|\d{4}\-\d+\-\d{1,2})', date_text)
        publish_date = date_match.group() if date_match else None

        area_name = '新疆-乌鲁木齐'
        source = 'http://ggzy.wlmq.gov.cn/'

        table_ele = selector.xpath('//div[@class="w_content_main"]')
        if not table_ele:
            return

        content_html = etree.tostring(table_ele[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        retult_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': area_name,
            'source': source,
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '乌鲁木齐市公共资源交易网',
            'en_name': 'Urumqi City Public resource',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        '''Post one DWR list query and enqueue the ids not seen before.

        :param categoryId: category code placed in the request body.
        :param types: unused; kept so existing callers keep working.
        :param page: page number placed in the request body.
        :return: always ``None``; gives up after ``MAX_RETRIES`` failures.
        '''
        data = 'callCount=1\n\npage=/generalpage.do?method=showList&fileType='+categoryId+'&faname=201605-046\n\nhttpSessionId=\n\nscriptSessionId=A0890501B5665F11F1222EBC440FC5FC644\n\nc0-scriptName=projectDWR\n\nc0-methodName=queryItemInfoByIndustryType2\n\nc0-id=0\n\nc0-e1=string:packTable\n\nc0-e2=string:'+categoryId+'\n\nc0-e3=number:'+str(page)+'\n\nc0-e4=string:15\n\nc0-e5=string:true\n\nc0-e6=string:packTable\n\nc0-e7=string:982\n\nc0-param0=Object_Object:{flag:reference:c0-e1, name:reference:c0-e2, currentPage:reference:c0-e3, pageSize:reference:c0-e4, isPage:reference:c0-e5, tabId:reference:c0-e6, totalRows:reference:c0-e7}\n\nbatchId=3\n\n'
        url = 'http://ggzy.wlmq.gov.cn/dwr/call/plaincall/projectDWR.queryItemInfoByIndustryType2.dwr'

        response = None
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.post(
                    url=url, headers=self.headers, data=data,
                    timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
                break
            except Exception as e:
                print('load_get error:{}'.format(e))
        if response is None:
            return

        print('第{}页'.format(page))
        # Ids are pulled out of the reply text with a regex; the reply is not
        # parsed as HTML because nothing below needs a parsed tree.
        info_id_il = re.findall(r"""\[\'FILE_ID\'\]\=\"(.*?)\"\;""", response)
        print(info_id_il)
        for pid in info_id_il:
            if not self.rq.in_rset(pid):
                self.rq.add_to_rset(pid)
                self.rq.pull_to_rlist(pid)

    def init(self):
        '''Worker loop: process one queued id per pass while ``is_running()``.'''
        while self.is_running():
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist())]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start the worker thread, then crawl every configured list page.'''
        threading.Thread(target=self.init).start()
        task_li = [
                {'categoryId':'201605-048', 'types':'','all_page': 2},
                {'categoryId':'201605-049', 'types':'','all_page': 1},
                {'categoryId':'201605-050', 'types':'','all_page': 2},
                {'categoryId':'201605-051', 'types':'','all_page': 1},
                {'categoryId':'201605-052', 'types':'','all_page': 1},
                {'categoryId':'201605-053', 'types':'','all_page': 1},
                {'categoryId':'201605-039', 'types':'','all_page': 2},
                {'categoryId':'201605-041', 'types':'','all_page': 1},
                {'categoryId':'201605-042', 'types':'','all_page': 1},
                {'categoryId':'201605-043', 'types':'','all_page': 2},
                {'categoryId':'201605-044', 'types':'','all_page': 2},
                {'categoryId':'201605-045', 'types':'','all_page': 2},
            ]
        count = 3
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                try:
                    # Clamp each batch to ``all_page``. The original always
                    # spawned ``count`` pages, so a task with 1 or 2 pages
                    # still requested pages 1-3.
                    batch_end = min(page + count, last_page + 1)
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types, page_no)
                        for page_no in range(page, batch_end)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point; runs the crawl.'''
        self.run()
Example #11
0
class GovBuy(object):
    '''Crawler for the Hebei government procurement site (ccgp-hebei.gov.cn).

    ``load_get`` queries the search endpoint and hands every detail URL it has
    not seen before to ``self.rq``. ``init`` takes URLs back out of ``self.rq``
    and passes each one to ``load_get_html``, which parses the detail page and
    stores one record through ``self.coll``.
    '''

    # Seconds after which a request is abandoned; without a timeout a stalled
    # connection blocks its greenlet forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, headers, HTTP session and work queue.'''
        name = 'hebei_ccgp-hebei_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'If-None-Match': '594gpnM6qpxwGpEvFYoNJpzY8YE=',
            'If-Modified-Since': 'Mon, 23 Jul 2018 02:32:18 GMT',
            'Referer': 'http://www.ccgp-hebei.gov.cn/province/cggg/zhbgg/index_3.html',
            'X-DevTools-Emulate-Network-Conditions-Client-Id': 'F24524FAD50B25DB7D7D89DBCEA53767',
            'Intervention': '<https://www.chromestatus.com/feature/5718547946799104>; level=warning',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost', dblist='hebei_list1', dbset='hebei_set1')

    def is_running(self):
        '''Return False when the queue list is empty but the set is not.

        In every other case (including "both empty") the result is True.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist ``result_dic`` through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # Result is unused; the call is kept from the original code.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area label from the text ``strs``.

        :param pro: fallback label used when no area text is extracted.
        :param strs: text handed to ``transform`` (normally the page title).
        :return: ``'<first>-<second>'`` when two non-empty parts are found,
            otherwise the first part; ``None`` when extraction raises.
        '''
        try:
            # NOTE(review): ``transform`` is defined outside this class;
            # presumably an address parser returning a DataFrame - confirm.
            df = transform([strs], umap={})
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Best effort: callers store None as the area when parsing fails.
            return None

        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, url):
        '''Fetch one detail page, parse it and store the resulting record.

        :param url: detail-page URL; ``None`` is ignored.
        :return: always ``None``. Nothing is stored when the request fails or
            the expected nested content table is missing.
        '''
        if url is None:
            return
        try:
            response = self.session.get(
                url=url, headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        status = '公告'
        title = selector.xpath('//span[@class="txt2"]/text()')
        if title:
            title = title[0]
            # The two CJK characters preceding a trailing "公告" become the status.
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            if status_match is not None:
                status = status_match.group()
        else:
            title = None

        publish_date = selector.xpath('//body/table/tr/td/table/tr[4]/td/table/tr[7]/td/span/text()')
        if publish_date:
            publish_date = re.sub(r'\r|\n|\s|发布时间:', '', publish_date[0])
        else:
            publish_date = None

        # Name the parser explicitly: without it BeautifulSoup warns and picks
        # whichever parser happens to be installed.
        soup = BeautifulSoup(response, 'lxml')
        try:
            content_html = soup.find('body').table.tr.td.table
        except AttributeError:
            # Some element of the body > table > tr > td > table chain is absent.
            content_html = None
        if content_html is None:
            return

        area_name = self.get_area('河北', title)
        source = 'http://www.ccgp-hebei.gov.cn/province/'

        _id = self.hash_to_md5(url)

        retult_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': area_name,
            'source': source,
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '河北省政府采购网',
            'en_name': 'Hebei Province Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, params):
        '''Run one search query and enqueue the detail URLs not seen before.

        :param params: query-string parameters for the search endpoint.
        :return: always ``None``; a failed request is logged and skipped.
        '''
        url = 'http://search.hebcz.gov.cn:8080/was5/web/search'
        try:
            response = self.session.get(
                url=url, headers=self.headers, params=params,
                timeout=self.REQUEST_TIMEOUT).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        for detail_url in selector.xpath('//tr[@id="biaoti"]/td[2]/a/@href'):
            if not self.rq.in_rset(detail_url):
                self.rq.add_to_rset(detail_url)
                self.rq.pull_to_rlist(detail_url)

    def init(self):
        '''Worker loop: process queued detail URLs while ``is_running()``.'''
        count = 1
        while self.is_running():
            # NOTE(review): the batch size jumps to 10 as soon as the queue
            # holds at most one item and never drops back; kept as written -
            # confirm this is the intended concurrency.
            if self.rq.r_len() <= count:
                count = 10
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start the worker thread, then query every (column, region) pair.'''
        threading.Thread(target=self.init).start()

        pages_per_task = 2
        region_codes = [
            130000000, 130181000, 130200000, 130300000, 130400000,
            130500000, 130600000, 130682000, 130700000, 130800000,
            130900000, 131000000, 131100000, 139900000,
        ]
        # Same tasks in the same order as the original literal list: for each
        # region code, first 'zhbgg' and then 'zbgg'.
        task_li = [
            {'lanmu': lanmu, 'code': code, 'all_page': pages_per_task}
            for code in region_codes
            for lanmu in ('zhbgg', 'zbgg')
        ]

        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = {
                    'page': str(page),
                    'channelid': '228483',
                    'perpage': '50',
                    'outlinepage': '10',
                    'lanmu': task['lanmu'],
                    'admindivcode': task['code'],
                }

                try:
                    self.load_get(params)
                except Exception as e:
                    print(e)
                print('第{}页'.format(page))
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point; runs the crawl.'''
        self.run()
Example #12
0
class GovBuy(object):
    '''Crawler for the Shanxi Government Procurement Network (ccgp-shanxi.gov.cn).

    ``run`` walks the list pages and queues every announcement URL that is not
    yet in the ``Rdis_Queue`` seen-set; ``init`` pops queued URLs, downloads
    and parses each announcement and saves it through ``StorageSetting``.
    '''

    # Upper bound (seconds) on every HTTP request so that a stalled server
    # cannot block the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Set up storage, the request headers and the URL queue.'''
        name = 'shanxi_ccgp-shanxi_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Referer': 'http://www.ccgp-shanxi.gov.cn/view.php?nav=104',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shanxi_list1',
                             dbset='shanxi_set1')

    def is_running(self):
        '''Return ``False`` once the work list is empty while the seen-set is not.

        While the seen-set is still empty nothing has been queued yet, so the
        consumer loop keeps polling instead of stopping.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one parsed record through the storage helper.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result of this call is discarded; it is kept only
        # because the original code made it after every save.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province[-city]`` label from free text.

        :param pro: label returned when the cleaned lookup result is empty.
        :param strs: text handed to ``transform`` (here the announcement title).
        :return: the label, or ``None`` when the lookup raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' was probably meant to be '\r|\n'; '\s'
            # already removes those, so the pattern is left untouched.
            compact = re.sub(r'/r|/n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', compact))
        except Exception:
            # Best effort: an unusable title must not abort the record.
            return None
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _title_and_status(title_nodes):
        '''Return ``(title, status)`` built from the title text nodes.

        ``status`` is the trailing four characters ending in "公告" when the
        title ends that way, otherwise the generic "公告".
        '''
        if not title_nodes:
            return None, '公告'
        title = title_nodes[0]
        matched = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
        return title, (matched.group() if matched else '公告')

    @staticmethod
    def _parse_publish_date(text_nodes):
        '''Return the "N年N月N日" date found in the third text node, else ``None``.'''
        # The original indexed [2] unconditionally and compared the match with
        # [], so short node lists or a missing date raised instead of giving None.
        if len(text_nodes) < 3:
            return None
        matched = re.search(r'(\d+年\d+月\d+日)', text_nodes[2])
        return matched.group(0) if matched else None

    def load_get(self, params):
        '''Fetch one list page and queue every announcement link not seen before.

        :param params: query-string parameters sent to ``view.php``.
        '''
        try:
            url = 'http://www.ccgp-shanxi.gov.cn/view.php'
            response = requests.post(
                url=url,
                headers=self.headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath(
                '//*[@id="node_list"]/tbody/tr/td[1]/a/@href')
        except Exception as e:
            print('load_post error: {}'.format(e))
            return
        for href in url_li:
            detail_url = 'http://www.ccgp-shanxi.gov.cn/' + href
            if not self.rq.in_rset(detail_url):
                self.rq.add_to_rset(detail_url)
                self.rq.pull_to_rlist(detail_url)

    def load_get_html(self, url):
        '''Download one announcement page, parse it and save the record.

        :param url: announcement URL popped from the queue; ``None`` (nothing
            popped) is ignored.
        '''
        if url is None:
            return
        try:
            response = requests.get(
                url=url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error: {}'.format(e))
            return

        _id = self.hash_to_md5(url)
        title, status = self._title_and_status(
            selector.xpath(
                '//tr[@class="bk5"]/td/table/tr/td/table/tr/td/div/h2/text()'))
        publish_date = self._parse_publish_date(
            selector.xpath('//tr[@class="bk5"]/td/table/tr[2]/td//text()'))
        # Name the parser explicitly: lxml is what bs4 picks anyway when it is
        # installed, and this avoids the "no parser specified" warning.
        content_html = BeautifulSoup(response, 'lxml').find(class_='bk5')

        source = 'http://www.ccgp-shanxi.gov.cn/'
        area_name = self.get_area('山西', title)

        retult_dict = dict()
        retult_dict['_id'] = _id
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['publish_date'] = publish_date
        retult_dict['source'] = source
        retult_dict['area_name'] = area_name
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '中国山西政府采购'
        retult_dict['en_name'] = 'Shanxi Government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def init(self):
        '''Consumer loop: fetch queued announcements in batches of greenlets.'''
        count = 6
        while self.is_running():
            # Once the queue is short, fall back to one fetch per round.
            if self.rq.r_len() <= count:
                count = 1
            spawns = [
                gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                for _ in range(count)
            ]
            gevent.joinall(spawns)

    def run(self):
        '''Start the consumer thread, then queue links from every list page.'''
        threading.Thread(target=self.init).start()
        # Only the newest pages of each channel are crawled. Full page counts
        # recorded in the original source: nav 100 -> 14705, 104 -> 13667,
        # 105 -> 2291, 116 -> 747, 131 -> 249, 132 -> 1, 153 -> 7279.
        task_li = [
            {'nav': 100, 'end_page': 4, 'status': '招标公告'},
            {'nav': 104, 'end_page': 3, 'status': '结果公告'},
            {'nav': 105, 'end_page': 2, 'status': '变更公告'},
            {'nav': 116, 'end_page': 2, 'status': '单一来源公告'},
            {'nav': 131, 'end_page': 1, 'status': '招标预公告'},
            {'nav': 132, 'end_page': 1, 'status': '邀请公告'},
            {'nav': 153, 'end_page': 1, 'status': '合同公告'},
        ]
        for task in task_li:
            for page in range(1, task['end_page'] + 1):
                params = {
                    'app': '',
                    'type': '',
                    'nav': task['nav'],
                    'page': str(page)
                }
                self.load_get(params)
                print('第{}页'.format(page))

    def main(self):
        '''Entry point: run the crawl.'''
        self.run()
Example #13
0
class GovBuy(object):
    '''Crawler for the Tianjin government procurement site (tjgp.gov.cn).

    ``run`` posts to the topic list endpoint and queues every announcement URL
    that is not yet in the ``Rdis_Queue`` seen-set; ``init`` pops queued URLs,
    downloads and parses each page and saves it through ``StorageSetting``.
    '''

    # Upper bound (seconds) on every HTTP request so that a stalled server
    # cannot block the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Set up storage, the request headers and the URL queue.'''
        name = 'tianjin_city_gov_buy'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.tjgp.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            # The original value carried '^&' sequences (Windows-cmd escaping
            # left over from a copied cURL command); the carets are not part
            # of the real URL and have been removed.
            'Referer':
            'http://www.tjgp.gov.cn/portal/topicView.do?method=view&view=Infor&id=1665&ver=2&st=1&stmp=1532324224291',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='tianjin_list1',
                             dbset='tianjin_set1')

    def is_running(self):
        '''Return ``False`` once the work list is empty while the seen-set is not.

        While the seen-set is still empty nothing has been queued yet, so the
        consumer loop keeps polling instead of stopping.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one parsed record through the storage helper.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result of this call is discarded; it is kept only
        # because the original code made it after every save.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province[-city]`` label from free text.

        :param pro: label returned when the cleaned lookup result is empty.
        :param strs: text handed to ``transform`` (here the announcement title).
        :return: the label, or ``None`` when the lookup raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' was probably meant to be '\r|\n'; '\s'
            # already removes those, so the pattern is left untouched.
            compact = re.sub(r'/r|/n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', compact))
        except Exception:
            # Best effort: an unusable title must not abort the record.
            return None
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _title_and_status(title_nodes):
        '''Return ``(title, status)`` built from the title text nodes.

        ``status`` is the trailing four characters ending in "公告" when the
        title ends that way, otherwise the generic "公告".
        '''
        if not title_nodes:
            return None, '公告'
        title = title_nodes[0]
        matched = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
        return title, (matched.group() if matched else '公告')

    def load_post(self, data):
        '''Post one list-page query and queue every link not seen before.

        :param data: form fields sent to ``topicView.do``.
        '''
        try:
            response = requests.post(
                'http://www.tjgp.gov.cn/portal/topicView.do',
                headers=self.headers,
                data=data,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_post error: {}'.format(e))
            return
        for href in selector.xpath('//*[@id="reflshPage"]/ul/li/a/@href'):
            detail_url = 'http://www.tjgp.gov.cn' + href
            if not self.rq.in_rset(detail_url):
                self.rq.add_to_rset(detail_url)
                self.rq.pull_to_rlist(detail_url)

    def load_get_html(self, url):
        '''Download one announcement page, parse it and save the record.

        :param url: announcement URL popped from the queue; ``None`` (nothing
            popped) is ignored.
        '''
        if url is None:
            return
        try:
            response = requests.get(
                url=url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error: {}'.format(e))
            return

        _id = self.hash_to_md5(url)
        title, status = self._title_and_status(
            selector.xpath('//body/table/tbody/tr/td/div/p[1]/font/b/text()'))
        date_nodes = selector.xpath('//body/table/tbody/tr/td/div/p[3]/text()')
        publish_date = date_nodes[0] if date_nodes else None
        source = 'http://www.tjgp.gov.cn/'
        area_name = self.get_area('', title)

        retult_dict = dict()
        retult_dict['_id'] = _id
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['publish_date'] = publish_date
        retult_dict['source'] = source
        retult_dict['area_name'] = area_name
        retult_dict['detail_url'] = url
        # The whole page is stored, not an extracted fragment.
        retult_dict['content_html'] = str(response)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '天津市政府采购网'
        retult_dict['en_name'] = 'Tianjin government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def init(self):
        '''Consumer loop: fetch queued announcements in batches of greenlets.'''
        count = 6
        while self.is_running():
            # Once the queue is short, fall back to one fetch per round.
            if self.rq.r_len() <= count:
                count = 1
            spawns = [
                gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                for _ in range(count)
            ]
            gevent.joinall(spawns)

    def run(self):
        '''Start the consumer thread, then queue links from every topic list.'''
        threading.Thread(target=self.init).start()
        count = 5
        # NOTE(review): '1664' was listed twice in the original task list.
        # The repeat only re-fetched pages whose links were already in the
        # seen-set, so it is dropped here; confirm whether another topic id
        # was intended.
        topic_ids = ['1665', '1664', '1666', '2013', '2014', '2015', '2016']
        task_li = [{'id': topic_id, 'end_page': count} for topic_id in topic_ids]
        for task in task_li:
            for page in range(1, task['end_page'] + 1):
                data = [
                    ('method', 'view'),
                    ('page', str(page)),
                    ('id', task['id']),
                    ('step', '1'),
                    ('view', 'Infor'),
                    ('st', '1'),
                    ('ldateQGE', ''),
                    ('ldateQLE', ''),
                ]
                self.load_post(data)
                print('第{}页'.format(page))

        # Start another consumer if links are still waiting after all list
        # pages have been read.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: run the crawl.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Guangzhou government procurement platform (gzg2b.gzfinance.gov.cn).

    ``run`` queries the JSON list endpoint and queues every row that is not
    yet in the ``Rdis_Queue`` seen-set; ``init`` pops queued rows, downloads
    the matching detail page and saves the record through ``StorageSetting``.
    '''

    # Upper bound (seconds) on every HTTP request so that a stalled server
    # cannot block the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Set up storage, the request headers and the row queue.'''
        name = 'guangzhou_gzg2b_gzfinance_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://gzg2b.gzfinance.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            # The original value carried a '^&' sequence (Windows-cmd escaping
            # left over from a copied cURL command); the caret is not part of
            # the real URL and has been removed.
            'Referer':
            'http://gzg2b.gzfinance.gov.cn/gzgpimp/portalindex.do?method=goInfogsgg&linkId=gsgg',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangzhou_list1',
                             dbset='guangzhou_set1')

    def is_running(self):
        '''Return ``False`` once the work list is empty while the seen-set is not.

        While the seen-set is still empty nothing has been queued yet, so the
        consumer loop keeps polling instead of stopping.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one parsed record through the storage helper.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result of this call is discarded; it is kept only
        # because the original code made it after every save.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province[-city]`` label from free text.

        :param pro: label returned when the cleaned lookup result is empty.
        :param strs: text handed to ``transform`` (here the announcement title).
        :return: the label, or ``None`` when the lookup raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' was probably meant to be '\r|\n'; '\s'
            # already removes those, so the pattern is left untouched.
            compact = re.sub(r'/r|/n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', compact))
        except Exception:
            # Best effort: an unusable title must not abort the record.
            return None
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _detail_url(info_id):
        '''Build the detail-page URL for one list row.'''
        # The doubled '&&' is preserved: the record ``_id`` is the MD5 of this
        # exact string, so normalising it would change every stored id.
        return ('http://gzg2b.gzfinance.gov.cn/gzgpimp/portalsys/portal.do'
                '?method=pubinfoView&&info_id=' + info_id + '&t_k=null')

    def load_get(self, data):
        '''Query one page of the list endpoint and queue every unseen row.

        :param data: form fields (paging and filter values) for the query.
        '''
        try:
            url = 'http://gzg2b.gzfinance.gov.cn/gzgpimp/portalsys/portal.do'
            params = (
                ('method', 'queryHomepageList'),
                ('t_k', 'null'),
            )
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params,
                                     data=data,
                                     timeout=self.REQUEST_TIMEOUT).json()
            # Read 'rows' inside the try so a payload without it is reported
            # here instead of escaping and aborting ``run``.
            response_li = response['rows']
        except Exception as e:
            print('load_post error: {}'.format(e))
            return
        for ret_dict in response_li:
            if not self.rq.in_rset(ret_dict):
                self.rq.add_to_rset(ret_dict)
                self.rq.pull_to_rlist(ret_dict)

    def load_get_html(self, ret_dict):
        '''Download the detail page for one queued row and save the record.

        :param ret_dict: textual form of a list row as returned by the queue;
            ``None`` (nothing popped) is ignored.
        '''
        if ret_dict is None:
            return
        try:
            # SECURITY(review): eval() executes whatever text comes back from
            # the queue, and that text originates from the remote list
            # endpoint. Consider ast.literal_eval or storing JSON instead.
            ret = eval(ret_dict)
            url = self._detail_url(ret['info_id'])
            response = requests.get(
                url=url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            # Parsed only so that unparseable responses are rejected here, as
            # in the original; the tree itself is not used.
            etree.HTML(response)
        except Exception as e:
            # The original format string had no placeholder, so the exception
            # text was silently dropped.
            print('laod_get_html error:{}'.format(e))
            return

        _id = self.hash_to_md5(url)
        title = ret['title']
        status = ret['info_key']
        publish_date = ret['finish_day']
        # Name the parser explicitly: lxml is what bs4 picks anyway when it is
        # installed, and this avoids the "no parser specified" warning.
        container = BeautifulSoup(response, 'lxml').find(class_='row')
        if container is None:
            # Previously this dereferenced None and killed the greenlet with
            # an AttributeError; the record is still skipped, but explicitly.
            print('laod_get_html error:{}'.format('no "row" block in ' + url))
            return
        content_html = container.div

        source = 'http://gzg2b.gzfinance.gov.cn/'
        area_name = self.get_area('广州', title)

        retult_dict = dict()
        retult_dict['_id'] = _id
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['publish_date'] = publish_date
        retult_dict['source'] = source
        retult_dict['area_name'] = area_name
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '广州市政府采购平台 '
        retult_dict['en_name'] = 'Guangzhou Government Procurement Platform'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def init(self):
        '''Consumer loop: fetch queued rows in batches of greenlets.'''
        count = 6
        while self.is_running():
            # Once the queue is short, fall back to one fetch per round.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start the consumer thread, then queue rows from every list page.'''
        threading.Thread(target=self.init).start()
        # Only the newest pages are crawled; the original source recorded 329
        # pages for the full history.
        task_li = [
            {'all_page': 5},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                data = [
                    ('current', str(page)),
                    ('rowCount', '10'),
                    ('searchPhrase', ''),
                    ('title_name', ''),
                    ('porid', 'zbcggg'),
                    ('kwd', ''),
                ]
                self.load_get(data)
                print('第{}页'.format(page))

        # Start another consumer if rows are still waiting after all list
        # pages have been read.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: run the crawl.'''
        self.run()
Example #15
0
class GovBuy(object):
    '''Crawler for the Guangxi Government Procurement Network (gxzfcg.gov.cn).

    ``run`` posts to the paged search URLs and queues every announcement URL
    that is not yet in the ``Rdis_Queue`` seen-set; ``init`` pops queued URLs,
    downloads and parses each page and saves it through ``StorageSetting``.
    '''

    # Upper bound (seconds) on every HTTP request so that a stalled server
    # cannot block the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Set up storage, the request headers, the HTTP session and the queue.'''
        name = 'guangxi_gxzfcg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gxzfcg.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer':
            'http://www.gxzfcg.gov.cn/CmsNewsController/search/chnlCodes-/distin-/beginDate-0/endDate-0/p-20/c-3/0-0.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangxi_list1',
                             dbset='guangxi_set1')

    def is_running(self):
        '''Return ``False`` once the work list is empty while the seen-set is not.

        While the seen-set is still empty nothing has been queued yet, so the
        consumer loop keeps polling instead of stopping.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one parsed record through the storage helper.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result of this call is discarded; it is kept only
        # because the original code made it after every save.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province[-city]`` label from free text.

        :param pro: label returned when the cleaned lookup result is empty.
        :param strs: text handed to ``transform`` (here the announcement title).
        :return: the label, or ``None`` when the lookup raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' was probably meant to be '\r|\n'; '\s'
            # already removes those, so the pattern is left untouched.
            compact = re.sub(r'/r|/n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', compact))
        except Exception:
            # Best effort: an unusable title must not abort the record.
            return None
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _title_and_status(title_nodes):
        '''Return ``(title, status)`` built from the title text nodes.

        ``status`` is the trailing four characters ending in "公告" when the
        title ends that way, otherwise the generic "公告".
        '''
        if not title_nodes:
            return None, '公告'
        title = title_nodes[0]
        matched = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
        return title, (matched.group() if matched else '公告')

    @staticmethod
    def _parse_publish_date(text_nodes):
        '''Return the first ``N-N-N`` date in the joined text nodes, else ``None``.'''
        # The original called .group() on the search result directly and so
        # raised AttributeError whenever the text held no date.
        matched = re.search(r'(\d+\-\d+\-\d+)', ''.join(text_nodes))
        return matched.group() if matched else None

    def load_get(self, url):
        '''Post the (empty) search form to one list page and queue unseen links.

        :param url: URL of the paged search result to read.
        '''
        data = [
            ('searchKey', ''),
            ('title', ''),
            ('str1', 'undefined'),
            ('str2', 'undefined'),
            ('cmsNews.title', ''),
            ('cmsNews.buyerName', ''),
            ('cmsNews.str2', ''),
            ('cmsNews.str3', ''),
            ('cmsNews.str1', ''),
            ('cmsNews.str5', ''),
            ('cmsNews.str6', ''),
            ('cmsNews.str8', ''),
            ('cmsNews.agentName', ''),
            ('cmsNews.startPubdate', ''),
            ('cmsNews.endPubdate', ''),
        ]
        try:
            response = self.session.post(
                url=url,
                headers=self.headers,
                data=data,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath(
                '//*[@id="channelBody"]/div[2]/ul/li/a/@href')
        except Exception as e:
            print('load_post error: {}'.format(e))
            return
        for href in url_li:
            detail_url = 'http://www.gxzfcg.gov.cn' + href
            if not self.rq.in_rset(detail_url):
                self.rq.add_to_rset(detail_url)
                self.rq.pull_to_rlist(detail_url)

    def load_get_html(self, url):
        '''Download one announcement page, parse it and save the record.

        :param url: announcement URL popped from the queue; ``None`` (nothing
            popped) is ignored.
        :raises EOFError: when the page has no ``frameReport`` content block.
        '''
        if url is None:
            return
        try:
            response = self.session.get(
                url=url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            # The original format string had no placeholder, so the exception
            # text was silently dropped.
            print('laod_get_html error:{}'.format(e))
            return

        title, status = self._title_and_status(
            selector.xpath(
                '//*[@id="bodyMain"]/div/div/div[2]/div[2]/div[1]/h1/text()'))
        _id = self.hash_to_md5(url)
        publish_date = self._parse_publish_date(
            selector.xpath(
                '//*[@id="bodyMain"]/div/div/div[2]/div[2]/div[1]/span//text()'
            ))
        # Name the parser explicitly: lxml is what bs4 picks anyway when it is
        # installed, and this avoids the "no parser specified" warning.
        content_html = BeautifulSoup(response, 'lxml').find(class_='frameReport')
        if content_html is None:
            # Same exception type as before, now with a message that says
            # which page was unusable.
            raise EOFError('no frameReport block in {}'.format(url))

        source = 'http://www.gxzfcg.gov.cn/'
        area_name = self.get_area('广西', title)

        retult_dict = dict()
        retult_dict['_id'] = _id
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['publish_date'] = publish_date
        retult_dict['source'] = source
        retult_dict['area_name'] = area_name
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '广西壮族自治区政府采购网 '
        retult_dict['en_name'] = 'Guangxi Zhuang National Government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def init(self):
        '''Consumer loop: fetch queued announcements in batches of greenlets.'''
        count = 3
        while self.is_running():
            # Once the queue is short, fall back to one fetch per round.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start the consumer thread, then queue links from every list page.'''
        threading.Thread(target=self.init).start()
        # Only the newest pages are crawled; the original source recorded
        # 49876 pages for the full history.
        task_li = [
            {'all_page': 5},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                url = 'http://www.gxzfcg.gov.cn/CmsNewsController/search/chnlCodes-/distin-/beginDate-0/endDate-0/p-20/c-' + str(
                    page) + '/0-0.html'
                self.load_get(url)
                print('第{}页'.format(page))

        # Start another consumer if links are still waiting after all list
        # pages have been read.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: run the crawl.'''
        self.run()
Example #16
0
class GovBuy(object):
    """Crawler for the Guangxi public resources trading site (www.gxzbtb.cn).

    ``run`` walks the listing pages of every notice category and queues the
    detail URLs found there on ``self.rq``; ``init`` drains that queue and
    ``load_get_html`` stores one record per detail page through ``self.coll``.
    """

    # Title suffix naming the notice type.  This has to be an alternation:
    # a character class would accept any one or two of the listed characters
    # (e.g. "标公告" at the end of "目标公告").
    STATUS_PATTERN = r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$'
    # Status used when the title is missing or carries no known suffix.
    DEFAULT_STATUS = '公告'
    # Either a compact 8-digit date or a slash-separated year/month/day.
    DATE_PATTERN = r'(\d{8}|\d{4}\/\d+\/\d{1,2})'
    # Category numbers to crawl; the listing path of each one is
    # "<first six digits>/<category number>".
    CATEGORY_NUMS = (
        '001010001', '001010002', '001010004',
        '001001001', '001001002', '001001004', '001001005',
        '001004001', '001004002', '001004004', '001004005',
        '001007001',
        '001011001', '001011002',
        '001012001',
    )

    def __init__(self):
        name = 'guangxi_gxzbtb_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        # Session and CSRF cookies sent with every listing request.
        self.cookies = {
            'ASP.NET_SessionId': 'trbofu0uet0aywbdhr35s0x4',
            '__CSRFCOOKIE': '6f7e275f-5762-4569-8ea2-ae98d3b0379d',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gxzbtb.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.gxzbtb.cn/gxzbw/jyxx/001010/001010001/MoreInfo.aspx?CategoryNum=001010001',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # Work queue (list) plus the set of URLs already seen.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangxi_gxzbtb_cn_list1',
                             dbset='guangxi_gxzbtb_cn_set1')

    def is_running(self):
        """Return False once the queue is empty while the seen-set is not.

        An empty queue together with an empty seen-set still counts as
        running, so a consumer started before anything was queued keeps going.
        """
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # The returned flag is not used here; the call is kept as it was.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area label from free text via ``transform``.

        Args:
            pro: Province name used when no location can be derived.
            strs: Text (typically a notice title) handed to ``transform``.

        Returns:
            The first two ``-``-separated components of the cleaned-up
            ``transform`` output when a second one exists, otherwise the
            first component; ``pro`` when the output is empty or the lookup
            fails.
        """
        try:
            df = transform([strs], umap={})
            # Drop whitespace and the "省市区0" header/index residue, then
            # turn the province/city markers into separators.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\s', '', str(df))))
        except Exception as e:
            # Best effort only: report the failure and fall back to the
            # province instead of silently returning None.
            print('get_area error:{}'.format(e))
            return pro

        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _extract_status(self, title):
        """Return the notice-type suffix of ``title`` or the default status."""
        found = re.search(self.STATUS_PATTERN, title)
        return found.group() if found else self.DEFAULT_STATUS

    def _extract_publish_date(self, text_nodes):
        """Return the first date found in ``text_nodes`` with ``/`` as ``-``.

        Returns ``None`` when there are no text nodes or none holds a date.
        """
        if not text_nodes:
            return None
        found = re.search(self.DATE_PATTERN, ''.join(text_nodes))
        if found is None:
            return None
        return found.group().replace('/', '-')

    @staticmethod
    def _page_batches(all_page, count):
        """Split pages ``1..all_page`` into consecutive batches of ``count``."""
        pages = list(range(1, all_page + 1))
        return [pages[i:i + count] for i in range(0, len(pages), count)]

    def _build_tasks(self, all_page):
        """Return one task dict per category, each crawling ``all_page`` pages."""
        return [{
            'categoryId': '{}/{}'.format(num[:6], num),
            'types': num,
            'all_page': all_page,
        } for num in self.CATEGORY_NUMS]

    def load_get_html(self, url):
        """Fetch one detail page and store it as a record.

        Args:
            url: Detail-page URL taken from the queue; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector.xpath('//td[@id="tdTitle"]/font//text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_nodes))
            status = self._extract_status(title)
        else:
            title = None
            status = self.DEFAULT_STATUS

        publish_date = self._extract_publish_date(
            selector.xpath('//td[@id="tdTitle"]/font[2]//text()'))

        table_ele = selector.xpath('//table[@id="tblInfo"]')
        if not table_ele:
            # Without the content table there is nothing to store.
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = '广西'
        retult_dict['source'] = 'http://www.gxzbtb.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '广西壮族自治区公共资源交易中心'
        retult_dict['en_name'] = 'Guangxi Zhuang National Public Resources'

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and queue every detail URL not seen before.

        Args:
            categoryId: Path segment of the category listing.
            types: Category number sent as the ``CategoryNum`` query value.
            page: Page number sent as the pager event argument.
        """
        try:
            params = (('CategoryNum', types), )
            data = {
                # NOTE(review): the spaces inside this token look like a
                # copy/formatting artifact - confirm against a live request.
                '__CSRFTOKEN':
                '/ wEFJDZmN2UyNzVmLTU3NjItNDU2OS04ZWEyLWFlOThkM2IwMzc5ZA ==',
                '__VIEWSTATE':
                '',
                '__VIEWSTATEGENERATOR': '16D6DBB1',
                '__EVENTTARGET': 'MoreInfoList1$Pager',
                '__EVENTARGUMENT': page,
                '__VIEWSTATEENCRYPTED': '',
            }
            url = 'http://www.gxzbtb.cn/gxzbw/jyxx/{}/MoreInfo.aspx'.format(
                categoryId)
            response = requests.post(
                url=url,
                headers=self.headers,
                params=params,
                data=data,
                cookies=self.cookies).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        href_li = selector.xpath(
            '//table[@id="MoreInfoList1_DataGrid1"]/tr/td[2]/a/@href')
        for href in href_li:
            urls = 'http://www.gxzbtb.cn' + href
            # The seen-set keeps a URL from being queued twice.
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued detail URLs until ``is_running`` reports False."""
        count = 2
        while self.is_running():
            # Near the end of the queue, fetch one page per round.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread, then crawl every category listing."""
        threading.Thread(target=self.init).start()
        pages_per_category = 1
        count = 1  # listing pages fetched concurrently
        for task in self._build_tasks(pages_per_category):
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page) for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # Items still queued after listing: start another consumer thread.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
Example #17
0
class GovBuy(object):
    """Crawler for the Shaanxi government procurement site (ccgp-shaanxi.gov.cn).

    ``run`` walks the notice listings and queues the detail URLs on
    ``self.rq``; ``init`` drains that queue and ``load_get_html`` stores one
    record per detail page through ``self.coll``.
    """

    # Two CJK characters directly in front of a trailing "公告".
    STATUS_PATTERN = r'[\u4e00-\u9fa5]{2}公告$'
    # Status used when the title is missing or has no such suffix.
    DEFAULT_STATUS = '公告'
    # Dash-separated year-month-day.
    DATE_PATTERN = r'(\d{4}\-\d+\-\d{1,2})'

    def __init__(self):
        name = 'shaanxi_ccgp-shaanxi_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.ccgp-shaanxi.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            # NOTE(review): hard-coded Basic credential committed in source;
            # consider loading it from configuration instead.
            'Proxy-Authorization': 'Basic MTYzOTY2MzE2ODphamxhNTJ0bQ==',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'text/html, */*; q=0.01',
            'Referer': 'http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&index=3&province=province',
            'X-Requested-With': 'XMLHttpRequest',
            'Proxy-Connection': 'keep-alive',
        }

        self.session = requests.session()

        # Work queue (list) plus the set of URLs already seen.
        self.rq = Rdis_Queue(host='localhost', dblist='shaanxi_list1', dbset='shaanxi_set1')

    def is_running(self):
        """Return False once the queue is empty while the seen-set is not.

        An empty queue together with an empty seen-set still counts as
        running, so a consumer started before anything was queued keeps going.
        """
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # The returned flag is not used here; the call is kept as it was.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area label from free text via ``transform``.

        Args:
            pro: Province name used when no location can be derived.
            strs: Text (typically a notice title) handed to ``transform``.

        Returns:
            The first two ``-``-separated components of the cleaned-up
            ``transform`` output when a second one exists, otherwise the
            first component; ``pro`` when the output is empty or the lookup
            fails (for instance because no title was found).
        """
        try:
            df = transform([strs], umap={})
            # Drop whitespace and the "省市区0" header/index residue, then
            # turn the province/city markers into separators.
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\s', '', str(df))))
        except Exception as e:
            # Best effort only: report the failure and fall back to the
            # province instead of silently returning None.
            print('get_area error:{}'.format(e))
            return pro

        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _extract_status(self, title):
        """Return the notice-type suffix of ``title`` or the default status."""
        found = re.search(self.STATUS_PATTERN, title)
        return found.group() if found else self.DEFAULT_STATUS

    def _extract_publish_date(self, text_nodes):
        """Return the first ``Y-M-D`` date in ``text_nodes``, else ``None``."""
        if not text_nodes:
            return None
        found = re.search(self.DATE_PATTERN, ''.join(text_nodes))
        return found.group() if found else None

    @staticmethod
    def _page_batches(all_page, count):
        """Split pages ``1..all_page`` into consecutive batches of ``count``.

        The last batch is shorter when ``all_page`` is not a multiple of
        ``count``, so no page beyond ``all_page`` is ever requested.
        """
        pages = list(range(1, all_page + 1))
        return [pages[i:i + count] for i in range(0, len(pages), count)]

    def load_get_html(self, url):
        """Fetch one detail page and store it as a record.

        Args:
            url: Detail-page URL taken from the queue; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers, verify=False).content.decode("utf-8")
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector.xpath('//h1[@class="content-tit"]/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', title_nodes[0])
            status = self._extract_status(title)
        else:
            title = None
            status = self.DEFAULT_STATUS

        publish_date = self._extract_publish_date(
            selector.xpath('//div[@class="content_about"]/span[2]/em/text()'))
        area_name = self.get_area('陝西', title)

        table_ele = selector.xpath('//div[@class="contain detail-con"]')
        if not table_ele:
            # Without the content container there is nothing to store.
            return
        content_html = etree.tostring(table_ele[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = area_name
        retult_dict['source'] = 'http://www.ccgp-shaanxi.gov.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '陕西省政府采购网'
        retult_dict['en_name'] = 'Shaanxi Province Government Procurement'

        self.save_to_mongo(retult_dict)

    def load_get(self, noticetype, page):
        """Fetch one listing page and queue every detail URL not seen before.

        Args:
            noticetype: Value of the ``noticetype`` query parameter.
            page: Page number sent as ``page.pageNum``.
        """
        try:
            params = (
                ('noticetype', noticetype),
            )
            data = {
                "parameters['purcatalogguid']": "",
                "page.pageNum": page,
                "parameters['title']": "",
                "parameters['startdate']": "",
                "parameters['enddate']": "",
                "parameters['regionguid']": 610001,
                "parameters['projectcode']": "",
                "province": "",
                "parameters['purmethod']": "",
            }
            url = 'http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do'
            response = requests.post(url=url, headers=self.headers, params=params, data=data, verify=False).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        href_li = selector.xpath('//div[@class="list-box"]/table/tbody/tr/td[3]/a/@href')
        for href in href_li:
            # The seen-set keeps a URL from being queued twice.
            if not self.rq.in_rset(href):
                self.rq.add_to_rset(href)
                self.rq.pull_to_rlist(href)

    def init(self):
        """Consume queued detail URLs until ``is_running`` reports False."""
        count = 3
        while self.is_running():
            # Near the end of the queue, fetch one page per round.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread, then crawl every notice-type listing."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'noticetype': '3', 'all_page': 2},
            {'noticetype': '5', 'all_page': 2},
            {'noticetype': '4', 'all_page': 2},
            {'noticetype': '6', 'all_page': 2},
            {'noticetype': '99', 'all_page': 1},
            {'noticetype': '1', 'all_page': 1},
        ]
        count = 2  # listing pages fetched concurrently
        for task in task_li:
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, task['noticetype'], page) for page in pages]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # Items still queued after listing: start another consumer thread.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
class GovBuy(object):
    """Crawler for the Guangdong procurement e-mall site (www.gpcgd.com).

    ``run`` walks the news listings and queues the news ids on ``self.rq``;
    ``init`` drains that queue and ``load_get_html`` stores one record per
    detail page through ``self.coll``.
    """

    # Title suffix naming the notice type.  This has to be an alternation:
    # a character class would accept any one or two of the listed characters
    # (e.g. "标公告" at the end of "目标公告").
    STATUS_PATTERN = r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$'
    # Status used when the title is missing or carries no known suffix.
    DEFAULT_STATUS = '公告'
    # Dash-separated year-month-day.
    DATE_PATTERN = r'(\d{4}\-\d+\-\d{1,2})'

    def __init__(self):
        name = 'guangdong_gpcgd_com'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gpcgd.com',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # NOTE(review): the "^" before "!" looks like a shell-escape
            # leftover - confirm the intended Referer.
            'Referer': 'http://www.gpcgd.com/gpcgd/portal/portal-news^!list',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # Work queue (list) plus the set of news ids already seen.
        self.rq = Rdis_Queue(host='localhost', dblist='guangdong_gpcgd_com_list1', dbset='guangdong_gpcgd_com_set1')

    def is_running(self):
        """Return False once the queue is empty while the seen-set is not.

        An empty queue together with an empty seen-set still counts as
        running, so a consumer started before anything was queued keeps going.
        """
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # The returned flag is not used here; the call is kept as it was.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area label from free text via ``transform``.

        Args:
            pro: Province name used when no location can be derived.
            strs: Text (typically a notice title) handed to ``transform``.

        Returns:
            The first two ``-``-separated components of the cleaned-up
            ``transform`` output when a second one exists, otherwise the
            first component; ``pro`` when the output is empty or the lookup
            fails.
        """
        try:
            df = transform([strs], umap={})
            # Drop whitespace and the "省市区0" header/index residue, then
            # turn the province/city markers into separators.
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\s', '', str(df))))
        except Exception as e:
            # Best effort only: report the failure and fall back to the
            # province instead of silently returning None.
            print('get_area error:{}'.format(e))
            return pro

        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _extract_status(self, title):
        """Return the notice-type suffix of ``title`` or the default status."""
        found = re.search(self.STATUS_PATTERN, title)
        return found.group() if found else self.DEFAULT_STATUS

    def _extract_publish_date(self, text_nodes):
        """Return the first ``Y-M-D`` date in ``text_nodes``, else ``None``."""
        if not text_nodes:
            return None
        found = re.search(self.DATE_PATTERN, ''.join(text_nodes))
        return found.group() if found else None

    @staticmethod
    def _page_batches(all_page, count):
        """Split pages ``1..all_page`` into consecutive batches of ``count``.

        The last batch is shorter when ``all_page`` is not a multiple of
        ``count``, so no page beyond ``all_page`` is ever requested.
        """
        pages = list(range(1, all_page + 1))
        return [pages[i:i + count] for i in range(0, len(pages), count)]

    def load_get_html(self, pid):
        """Fetch the detail page of one news item and store it as a record.

        Args:
            pid: News id taken from the queue; ``None`` is ignored.
        """
        if pid is None:
            return
        try:
            url = 'http://www.gpcgd.com/gpcgd/portal/portal-news!detailNews?portalNews.id={}'.format(pid)
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            print(url)
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector.xpath('//div[@class="pub_title"]/h1/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_nodes))
            status = self._extract_status(title)
        else:
            title = None
            status = self.DEFAULT_STATUS

        publish_date = self._extract_publish_date(
            selector.xpath('//div[@class="pub_note"]//text()'))
        print(publish_date, title)

        table_ele = selector.xpath('//div[@class="pub_cont_details"]')
        if not table_ele:
            # Without the content container there is nothing to store.
            return
        content_html = etree.tostring(table_ele[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = '广东'
        retult_dict['source'] = 'http://www.gpcgd.com/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '广东省政府采购中心'
        retult_dict['en_name'] = 'Guangdong Government Procurement Center'

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and queue every news id not seen before.

        Args:
            categoryId: Not used by this site; kept for a uniform signature.
            types: Value of the ``portalNews.typeId`` form field.
            page: Page number sent as ``pageNum``.
        """
        try:
            data = [
                ('portalNews.typeId', types),
                ('pageNum', page),
            ]
            url = 'http://www.gpcgd.com/gpcgd/portal/portal-news!list'
            response = requests.post(url=url, headers=self.headers, data=data).content.decode('utf-8')
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        # The ids are read straight from the onclick handlers in the raw HTML.
        pid_li = re.findall(r'onclick\=\"detailNews\(\'(.*?)\'\)\"', response)
        for pid in pid_li:
            # The seen-set keeps an id from being queued twice.
            if not self.rq.in_rset(pid):
                self.rq.add_to_rset(pid)
                self.rq.pull_to_rlist(pid)

    def init(self):
        """Consume queued news ids until ``is_running`` reports False."""
        count = 2
        while self.is_running():
            # Near the end of the queue, fetch one page per round.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread, then crawl every news-type listing."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '', 'types': '90011', 'all_page': 1},
            {'categoryId': '', 'types': '90013', 'all_page': 1},
            {'categoryId': '', 'types': '40011', 'all_page': 2},
            {'categoryId': '', 'types': '40012', 'all_page': 2},
            {'categoryId': '', 'types': '40013', 'all_page': 1},
            {'categoryId': '', 'types': '40014', 'all_page': 1},
            {'categoryId': '', 'types': '40015', 'all_page': 1},
            {'categoryId': '', 'types': '40016', 'all_page': 1},
        ]
        count = 2  # listing pages fetched concurrently
        for task in task_li:
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, task['categoryId'], task['types'], page) for page in pages]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # A sizeable backlog after listing: start another consumer thread.
        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
Example #19
0
class GovBuy(object):
    """Crawler for the Nanning public resources trading site (www.nnggzy.net).

    ``run`` walks the category listings and queues the detail URLs on
    ``self.rq``; ``init`` drains that queue and ``load_get_html`` stores one
    record per detail page through ``self.coll``.
    """

    # Title suffix naming the notice type.  This has to be an alternation:
    # a character class would accept any one or two of the listed characters
    # (e.g. "标公告" at the end of "目标公告").
    STATUS_PATTERN = r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$'
    # Status used when the title is missing or carries no known suffix.
    DEFAULT_STATUS = '公告'
    # Either a compact 8-digit date or a slash-separated year/month/day.
    DATE_PATTERN = r'(\d{8}|\d{4}\/\d+\/\d{1,2})'

    def __init__(self):
        name = 'nanning_nnggzy_net'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        # Session, tracking and CSRF cookies sent with every listing request.
        self.cookies = {
            'yunsuo_session_verify': '2c0b046605eb7acf81b64a462d5a88e3',
            'ASP.NET_SessionId': 'k2oz1d45keci5055fe5br43f',
            '_gscu_1349052524': '33974463sf7nus87',
            '_gscbrs_1349052524': '1',
            # NOTE(review): the "^" before "|" looks like a shell-escape
            # leftover - confirm the intended cookie value.
            '_gscs_1349052524': '3397446376zl7787^|pv:1',
            '__CSRFCOOKIE': 'e0612cbd-55e6-4892-9a1a-bad08d9eafed',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.nnggzy.net',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.nnggzy.net/nnzbwmanger/ShowInfo/more.aspx?categoryNum=001001001',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # Work queue (list) plus the set of URLs already seen.
        self.rq = Rdis_Queue(host='localhost', dblist='nanning_nnggzy_net_list1', dbset='nanning_nnggzy_net_set1')

    def is_running(self):
        """Return False once the queue is empty while the seen-set is not.

        An empty queue together with an empty seen-set still counts as
        running, so a consumer started before anything was queued keeps going.
        """
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # The returned flag is not used here; the call is kept as it was.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area label from free text via ``transform``.

        Args:
            pro: Province name used when no location can be derived.
            strs: Text (typically a notice title) handed to ``transform``.

        Returns:
            The first two ``-``-separated components of the cleaned-up
            ``transform`` output when a second one exists, otherwise the
            first component; ``pro`` when the output is empty or the lookup
            fails.
        """
        try:
            df = transform([strs], umap={})
            # Drop whitespace and the "省市区0" header/index residue, then
            # turn the province/city markers into separators.
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'\s', '', str(df))))
        except Exception as e:
            # Best effort only: report the failure and fall back to the
            # province instead of silently returning None.
            print('get_area error:{}'.format(e))
            return pro

        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _extract_status(self, title):
        """Return the notice-type suffix of ``title`` or the default status."""
        found = re.search(self.STATUS_PATTERN, title)
        return found.group() if found else self.DEFAULT_STATUS

    def _extract_publish_date(self, text_nodes):
        """Return the first date found in ``text_nodes`` with ``/`` as ``-``.

        Returns ``None`` when there are no text nodes or none holds a date.
        """
        if not text_nodes:
            return None
        found = re.search(self.DATE_PATTERN, ''.join(text_nodes))
        if found is None:
            return None
        return found.group().replace('/', '-')

    @staticmethod
    def _page_batches(all_page, count):
        """Split pages ``1..all_page`` into consecutive batches of ``count``.

        The last batch is shorter when ``all_page`` is not a multiple of
        ``count``, so no page beyond ``all_page`` is ever requested.
        """
        pages = list(range(1, all_page + 1))
        return [pages[i:i + count] for i in range(0, len(pages), count)]

    def load_get_html(self, url):
        """Fetch one detail page and store it as a record.

        Args:
            url: Detail-page URL taken from the queue; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector.xpath('//span[@id="lblTitle"]//text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_nodes))
            status = self._extract_status(title)
        else:
            title = None
            status = self.DEFAULT_STATUS

        publish_date = self._extract_publish_date(
            selector.xpath('//td[@id="tdTitle"]/font[2]//text()'))

        table_ele = selector.xpath('//table[@id="tblInfo"]')
        if not table_ele:
            # Without the content table there is nothing to store.
            return
        content_html = etree.tostring(table_ele[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = '广西-南宁'
        retult_dict['source'] = 'http://www.nnggzy.net/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '南宁公共资源交易中心'
        retult_dict['en_name'] = 'Nanning Public resource'

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and queue every detail URL not seen before.

        Args:
            categoryId: Not used by this site; kept for a uniform signature.
            types: Category number sent as the ``categoryNum`` query value.
            page: Page number sent as the pager event argument.
        """
        try:
            params = (
                ('categoryNum', types),
            )
            data = {
                '__CSRFTOKEN': '/wEFJGUwNjEyY2JkLTU1ZTYtNDg5Mi05YTFhLWJhZDA4ZDllYWZlZA==',
                '__VIEWSTATE': '',
                '__EVENTTARGET': 'MoreInfoList1$Pager',
                '__EVENTARGUMENT': str(page),
                '__VIEWSTATEENCRYPTED': '',
            }
            url = 'http://www.nnggzy.net/nnzbwmanger/ShowInfo/more.aspx'
            response = requests.post(url=url, headers=self.headers, data=data, params=params, cookies=self.cookies).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        href_li = selector.xpath('//td[@id="MoreInfoList1_tdcontent"]//a/@href')
        for href in href_li:
            urls = 'http://www.nnggzy.net' + href
            # The seen-set keeps a URL from being queued twice.
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued detail URLs until ``is_running`` reports False."""
        count = 2
        while self.is_running():
            # Near the end of the queue, fetch one page per round.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread, then crawl every category listing."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '', 'types': '001001001', 'all_page': 1},
            {'categoryId': '', 'types': '001001002', 'all_page': 2},
            {'categoryId': '', 'types': '001001004', 'all_page': 1},
            {'categoryId': '', 'types': '001001005', 'all_page': 2},
            {'categoryId': '', 'types': '001001006', 'all_page': 1},
            {'categoryId': '', 'types': '001004001', 'all_page': 2},
            {'categoryId': '', 'types': '001004002', 'all_page': 1},
            {'categoryId': '', 'types': '001004004', 'all_page': 2},
            {'categoryId': '', 'types': '001010001', 'all_page': 1},
        ]
        count = 2  # listing pages fetched concurrently
        for task in task_li:
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, task['categoryId'], task['types'], page) for page in pages]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # A sizeable backlog after listing: start another consumer thread.
        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
Example #20
0
class GovBuy(object):
    """Crawler for the Hainan government procurement site (ccgp-hainan.gov.cn).

    ``run`` walks the list pages and queues every detail URL not seen before
    in ``self.rq``; ``init`` pops those URLs and lets ``load_get_html`` parse
    each page and store the record through ``self.coll``.
    """

    def __init__(self):
        """Create the storage handle, request headers, HTTP session and URL queue."""
        name = 'hainan_ccgp-hainan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.ccgp-hainan.gov.cn/thirdparty/My97DatePicker/My97DatePicker.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'X-DevTools-Emulate-Network-Conditions-Client-Id':
            'EAC4BA3425D26FC6B117994EFF4DEC28',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='hainan_list1',
                             dbset='hainan_set1')

    def is_running(self):
        """Return False once ``rq.r_len()`` is 0 while ``rq.rset_info()`` is non-empty.

        In every other case, including when both are still empty, True is
        returned.
        """
        queue_empty = self.rq.r_len() == 0
        return not (queue_empty and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def get_area(self, pro, strs):
        """Derive an area name from ``strs``, falling back to ``pro``.

        ``strs`` is passed to ``transform`` and the stringified result is
        reduced to ``-``-separated parts. The first two parts joined by ``-``
        are returned when the second one is non-empty, otherwise the first
        part alone.

        :param pro: value returned when nothing is recognised or the lookup
            raises.
        :param strs: text to analyse (callers pass the page title).
        """
        try:
            df = transform([strs], umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # A failed lookup (e.g. strs is None) used to fall through a bare
            # ``except: pass`` and return None; use the caller's default.
            return pro
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def save_to_mongo(self, result_dic):
        """Store one parsed record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded, so this call looks redundant
        # - confirm it has no needed side effect before removing it.
        self.is_running()

    def load_get_html(self, url):
        """Download one announcement page, extract its fields and store them.

        :param url: absolute URL of the detail page; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            # The format string used to lack its placeholder, so the actual
            # error was never printed.
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector.xpath('//div[@class="nei03_02"]/div[1]/text()')
        if title_nodes:
            title = title_nodes[0]
            # Status is the two Chinese characters in front of a trailing "公告"
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            status = status_match.group() if status_match else '公告'
        else:
            title = None
            status = '公告'

        date_text = ''.join(
            selector.xpath('//div[@class="nei03_02"]/div[2]//text()'))
        # A missing date yields None instead of an AttributeError that would
        # abort the whole record.
        date_match = re.search(r'(\d+\-\d+\-\d+)', date_text)
        publish_date = date_match.group() if date_match else None

        # Name the parser explicitly instead of letting BeautifulSoup guess;
        # assumes ``etree`` is lxml.etree (``etree.HTML`` is lxml API), so
        # lxml is installed.
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='nei03_02')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['publish_date'] = publish_date
        retult_dict['source'] = 'http://www.ccgp-hainan.gov.cn/'
        retult_dict['area_name'] = self.get_area('海南', title)

        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '中国海南政府采购网 '
        retult_dict['en_name'] = 'Hainan Province Government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, params):
        """Fetch one list page and queue every detail URL not seen before.

        :param params: query-string parameters sent to ``cgw_list.jsp``.
        """
        list_url = 'http://www.ccgp-hainan.gov.cn/cgw/cgw_list.jsp'
        try:
            response = self.session.get(url=list_url,
                                        headers=self.headers,
                                        params=params).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath(
                '//div[@class="nei02_04_01"]/ul/li/em/a/@href')
        except Exception as e:
            # Narrowed from a bare ``except`` and the cause is now reported.
            print('load_post error:{}'.format(e))
            return

        for href in url_li:
            url = 'http://www.ccgp-hainan.gov.cn' + href
            if not self.rq.in_rset(url):
                self.rq.add_to_rset(url)
                self.rq.pull_to_rlist(url)

    def init(self):
        """Consume queued URLs until ``is_running()`` reports False.

        Every round pops URLs from ``self.rq`` and hands each one to
        ``load_get_html`` in its own gevent greenlet: ``max_batch`` URLs while
        the queue holds more than ``max_batch`` entries, otherwise one.
        """
        max_batch = 6
        while self.is_running():
            # Size the batch on every round; it used to drop to 1 for good
            # the first time the queue ran short.
            batch = max_batch if self.rq.r_len() > max_batch else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start a consumer thread, crawl the list pages one by one, then
        start a second consumer if URLs are still queued."""
        threading.Thread(target=self.init).start()
        task_li = [
            # A commented-out entry in the original configured 2521 pages;
            # only the first 5 list pages are crawled.
            {'all_page': 5},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = (
                    ('currentPage', str(page)),
                    ('begindate', ''),
                    ('enddate', ''),
                    ('title', ''),
                    ('bid_type', ''),
                    ('proj_number', ''),
                    ('zone', ''),
                )

                self.load_get(params)
                print('第{}页'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: delegate to ``run``."""
        self.run()
Example #21
0
class GovBuy(object):
    """Crawler for the Yunnan public resource trading site (ynggzyxx.gov.cn).

    ``run`` posts to the list pages and queues every detail URL not seen
    before in ``self.rq``; ``init`` pops those URLs and lets
    ``load_get_html`` parse each page and store the record through
    ``self.coll``.
    """

    def __init__(self):
        """Create the storage handle, request headers and URL queue."""
        name = 'yunnan_ynggzyxx_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'https://www.ynggzyxx.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'https://www.ynggzyxx.gov.cn/res/css/basic.css',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='yunnan_list1',
                             dbset='yunnan_set1')

    def is_running(self):
        """Return False once ``rq.r_len()`` is 0 while ``rq.rset_info()`` is non-empty.

        In every other case, including when both are still empty, True is
        returned.
        """
        queue_empty = self.rq.r_len() == 0
        return not (queue_empty and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Store one parsed record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded, so this call looks redundant
        # - confirm it has no needed side effect before removing it.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name from ``strs``, falling back to ``pro``.

        ``strs`` is passed to ``transform`` and the stringified result is
        reduced to ``-``-separated parts. The first two parts joined by ``-``
        are returned when the second one is non-empty, otherwise the first
        part alone.

        :param pro: value returned when nothing is recognised or the lookup
            raises.
        :param strs: text to analyse (callers pass the page title).
        """
        try:
            df = transform([strs], umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # A failed lookup (e.g. strs is None) used to fall through a bare
            # ``except: pass`` and return None; use the caller's default.
            return pro
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, url):
        """Download one announcement page, extract its fields and store them.

        Nothing is stored when the request fails or the page has no
        ``page_contect bai_bg`` content block.

        :param url: absolute URL of the detail page; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            print(url)
            return

        title_nodes = selector.xpath('//h3[@class="detail_t"]/text()')
        # xpath returns a list; the old ``title != ''`` test was always true
        # and raised IndexError on pages without a title node.
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', title_nodes[0])
            # Alternation of the known announcement kinds. The previous
            # pattern put them in a character class, which matched arbitrary
            # one- or two-character mixes (quotes and commas included).
            status_match = re.search(
                r'(?:招标|预|采购|更正|结果|补充)公告$', title)
            status = status_match.group() if status_match else '公告'
        else:
            title = None
            status = '公告'

        date_text = ''.join(selector.xpath('//p[@class="kdg"]//text()'))
        # A missing date yields None instead of an AttributeError that would
        # abort the whole record.
        date_match = re.search(r'(\d{8}|\d{4}\-\d+\-\d{1,2})', date_text)
        if date_match:
            publish_date = date_match.group()
            if '-' not in publish_date:
                # Eight bare digits: split into year-month-day
                publish_date = '{}-{}-{}'.format(publish_date[0:4],
                                                 publish_date[4:6],
                                                 publish_date[6:8])
        else:
            publish_date = None

        area_name = self.get_area('云南', title)

        content_nodes = selector.xpath('//div[@class="page_contect bai_bg"]')
        if not content_nodes:
            return
        content_html = etree.tostring(content_nodes[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = area_name
        retult_dict['source'] = 'https://www.ynggzyxx.gov.cn/'

        retult_dict['publish_date'] = publish_date

        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '云南省公共资源交易网'
        retult_dict['en_name'] = 'Yunnan Province Public resource'

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page, retries=3):
        """Post for one list page and queue every detail URL not seen before.

        :param categoryId: value sent as the ``scrollValue`` form field.
        :param types: path segment appended to ``/jyxx/``.
        :param page: page number sent as ``currentPage``.
        :param retries: how many more attempts are made after a failure. The
            method used to call itself without any limit, which recursed
            endlessly while the site stayed unreachable.
        """
        data = [
            ('currentPage', str(page)),
            ('area', '000'),
            ('industriesTypeCode', ''),
            ('scrollValue', categoryId),
            ('purchaseProjectCode', ''),
            ('bulletinTitle', ''),
            ('secondArea', ''),
        ]
        url = 'https://www.ynggzyxx.gov.cn/jyxx/{}'.format(types)
        try:
            response = requests.post(url=url, headers=self.headers,
                                     data=data).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            if retries > 0:
                self.load_get(categoryId, types, page, retries - 1)
            return

        print('第{}页'.format(page))
        for href in selector.xpath('//*[@id="data_tab"]/tbody/tr/td/a/@href'):
            urls = 'https://www.ynggzyxx.gov.cn' + href
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued URLs until ``is_running()`` reports False.

        Every round pops URLs from ``self.rq`` and hands each one to
        ``load_get_html`` in its own gevent greenlet: ``max_batch`` URLs while
        the queue holds more than ``max_batch`` entries, otherwise one.
        """
        max_batch = 6
        while self.is_running():
            # Size the batch on every round; it used to drop to 1 for good
            # the first time the queue ran short.
            batch = max_batch if self.rq.r_len() > max_batch else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start a consumer thread, crawl every configured list page, then
        start a second consumer if URLs are still queued.

        List pages are requested through ``load_get`` in groups of up to
        ``count`` greenlets.
        """
        threading.Thread(target=self.init).start()
        # Page counts from the commented-out full configuration:
        #   jsgcZbgg 2538, jsgcBgtz 940, jsgcZbjggs 3417, jsgcpbjggs 917,
        #   zfcg/cggg 2522, zfcg/gzsx 646, zfcg/zbjggs 2033, zfcg/zfcgYcgg 227
        task_li = [
            {'categoryId': '1002', 'types': 'jsgcZbgg', 'all_page': 2},
            {'categoryId': '1000', 'types': 'jsgcBgtz', 'all_page': 2},
            {'categoryId': '842', 'types': 'jsgcZbjggs', 'all_page': 2},
            {'categoryId': '942', 'types': 'jsgcpbjggs', 'all_page': 2},
            {'categoryId': '825', 'types': 'zfcg/cggg', 'all_page': 1},
            {'categoryId': '626', 'types': 'zfcg/gzsx', 'all_page': 1},
            {'categoryId': '843', 'types': 'zfcg/zbjggs', 'all_page': 2},
            {'categoryId': '963', 'types': 'zfcg/zfcgYcgg', 'all_page': 1},
        ]
        count = 3
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            last_page = task['all_page']
            for first_page in range(1, last_page + 1, count):
                # Clamp the group to last_page: spawning first_page + i for
                # every i in range(count) requested pages past 'all_page'.
                group_end = min(first_page + count, last_page + 1)
                try:
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types, page)
                        for page in range(first_page, group_end)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: delegate to ``run``."""
        self.run()
Example #22
0
class GovBuy(object):
    """Crawler for the Haikou procurement / public resource site (ggzy.haikou.gov.cn).

    ``run`` posts to the list endpoint and queues every detail URL not seen
    before in ``self.rq``; ``init`` pops those URLs and lets
    ``load_get_html`` parse each page and store the record through
    ``self.coll``.
    """

    def __init__(self):
        """Create the storage handle, request headers, HTTP session and URL queue."""
        name = 'haikou_ggzy_haikou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://ggzy.haikou.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # The query string used to read "newsecond^&param=": the caret
            # looks like a shell escape left over from a copied command line,
            # not part of the URL.
            'Referer':
            'http://ggzy.haikou.gov.cn/login.do?method=newsecond&param=431241696e6465783d3326747970653d5a435f4a59',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='haikou_list1',
                             dbset='haikou_set1')

    def is_running(self):
        """Return False once ``rq.r_len()`` is 0 while ``rq.rset_info()`` is non-empty.

        In every other case, including when both are still empty, True is
        returned.
        """
        queue_empty = self.rq.r_len() == 0
        return not (queue_empty and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Store one parsed record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded, so this call looks redundant
        # - confirm it has no needed side effect before removing it.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name from ``strs``, falling back to ``pro``.

        ``strs`` is passed to ``transform`` and the stringified result is
        reduced to ``-``-separated parts. The first two parts joined by ``-``
        are returned when the second one is non-empty, otherwise the first
        part alone.

        :param pro: value returned when nothing is recognised or the lookup
            raises.
        :param strs: text to analyse (callers pass the page title).
        """
        try:
            df = transform([strs], umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # A failed lookup (e.g. strs is None) used to fall through a bare
            # ``except: pass`` and return None; use the caller's default.
            return pro
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, url):
        """Download one announcement page, extract its fields and store them.

        :param url: absolute URL of the detail page; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector.xpath('//div[@class="part_1"]/div[1]/text()')
        if title_nodes:
            title = title_nodes[0]
            # Status is the two Chinese characters in front of a trailing "公告"
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            status = status_match.group() if status_match else '公告'
        else:
            title = None
            status = '公告'

        date_text = ''.join(
            selector.xpath('//div[@class="part_1"]/div[2]//text()'))
        # A missing date yields None instead of an AttributeError that would
        # abort the whole record.
        date_match = re.search(r'(\d{4}\-\d+\-\d+)', date_text)
        publish_date = date_match.group() if date_match else None

        # Name the parser explicitly instead of letting BeautifulSoup guess;
        # assumes ``etree`` is lxml.etree (``etree.HTML`` is lxml API), so
        # lxml is installed.
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='content_wrap')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = self.get_area('海口', title)
        retult_dict['source'] = 'http://ggzy.haikou.gov.cn'

        retult_dict['publish_date'] = publish_date

        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '海口公共资源交易网'
        retult_dict['en_name'] = 'Hiakou Public resource'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, data):
        """Post for one list page and queue every detail URL not seen before.

        Each entry of the JSON ``result`` list is turned into a detail URL
        whose ``param`` value is produced by ``EncodeStr`` from the entry's
        ``FLAG`` and ``KEYID`` fields.

        :param data: form fields posted to ``login.do``.
        :raises KeyError: when the response or an entry lacks an expected
            key (``run`` catches and prints it).
        """
        params = (('method', 'getSecondTableInfo'), )
        url = 'http://ggzy.haikou.gov.cn/login.do'
        try:
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params,
                                     data=data).json()
        except Exception as e:
            # Narrowed from a bare ``except`` and the cause is now reported.
            print('load_post error:{}'.format(e))
            return

        for dic in response['result']:
            key_str = 'flag=3&name=' + dic['FLAG'] + '&key=' + dic['KEYID']
            encodestr = EncodeStr(key_str).encodes()
            urls = 'http://ggzy.haikou.gov.cn/login.do?method=newDetail&param=' + encodestr
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued URLs until ``is_running()`` reports False.

        Every round pops URLs from ``self.rq`` and hands each one to
        ``load_get_html`` in its own gevent greenlet: ``max_batch`` URLs while
        the queue holds more than ``max_batch`` entries, otherwise one.
        """
        max_batch = 6
        while self.is_running():
            # Size the batch on every round; it used to drop to 1 for good
            # the first time the queue ran short.
            batch = max_batch if self.rq.r_len() > max_batch else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start a consumer thread, crawl the list pages of every notice type
        one by one, then start a second consumer if URLs are still queued."""
        threading.Thread(target=self.init).start()
        flag = 3  # number of list pages crawled per notice type
        notice_types = ['GC_JY', 'GC_GS', 'GC_JG', 'ZC_JY', 'ZC_JG']
        task_li = [{'type': kind, 'all_page': flag} for kind in notice_types]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                data = [
                    ('currentPage', str(page)),
                    ('pageSize', '20'),
                    ('flag', '3'),
                    ('type', task['type']),
                    ('notice_title', ''),
                ]
                try:
                    self.load_get(data)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: delegate to ``run``."""
        self.run()
Example #23
0
class GovBuy(object):
    """Crawler for the Nanjing public resource trading site (ggzy.njzwfw.gov.cn).

    ``run`` fetches the list pages and queues every detail URL not seen
    before in ``self.rq``; ``init`` pops those URLs and lets
    ``load_get_html`` parse each page and store the record through
    ``self.coll``.
    """

    def __init__(self):
        """Create the storage handle, request headers and URL queue."""
        name = 'nanjing_ggzy_njzwfw_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html, */*; q=0.01',
            'Referer':
            'http://ggzy.njzwfw.gov.cn/njweb/gycq/stateProperty.html',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='nanjing_ggzy_njzwfw_gov_cn_list1',
                             dbset='nanjing_ggzy_njzwfw_gov_cn_set1')

    def is_running(self):
        """Return False once ``rq.r_len()`` is 0 while ``rq.rset_info()`` is non-empty.

        In every other case, including when both are still empty, True is
        returned.
        """
        queue_empty = self.rq.r_len() == 0
        return not (queue_empty and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Store one parsed record through ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded, so this call looks redundant
        # - confirm it has no needed side effect before removing it.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name from ``strs``, falling back to ``pro``.

        ``strs`` is passed to ``transform`` and the stringified result is
        reduced to ``-``-separated parts. The first two parts joined by ``-``
        are returned when the second one is non-empty, otherwise the first
        part alone.

        :param pro: value returned when nothing is recognised or the lookup
            raises.
        :param strs: text to analyse.
        """
        try:
            df = transform([strs], umap={})
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # A failed lookup (e.g. strs is None) used to fall through a bare
            # ``except: pass`` and return None; use the caller's default.
            return pro
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, url):
        """Download one announcement page, extract its fields and store them.

        Nothing is stored when the request fails or the page has no
        ``ewb-main`` content block.

        :param url: absolute URL of the detail page; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector.xpath('//div[@class="article-info"]/h1/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_nodes))
            # Alternation of the known announcement kinds. The previous
            # pattern put them in a character class, which matched arbitrary
            # one- or two-character mixes (quotes and commas included).
            status_match = re.search(
                r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
            status = status_match.group() if status_match else '公告'
        else:
            title = None
            status = '公告'

        date_text = ''.join(
            selector.xpath('//p[@class="info-sources"]//text()'))
        # A missing date yields None instead of an AttributeError that would
        # abort the whole record.
        date_match = re.search(r'(\d{4}\-\d+\-\d{1,2})', date_text)
        publish_date = date_match.group() if date_match else None
        print(publish_date, title)

        content_nodes = selector.xpath('//div[@class="ewb-main"]')
        if not content_nodes:
            return
        content_html = etree.tostring(content_nodes[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        # The area is fixed for this site rather than derived from the title
        retult_dict['area_name'] = '江苏-南京'
        retult_dict['source'] = 'http://ggzy.njzwfw.gov.cn/'

        retult_dict['publish_date'] = publish_date

        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '南京市公共资源交易平台'
        retult_dict['en_name'] = 'Nanjing City Public resource'

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one list page and queue every detail URL not seen before.

        Detail links are taken from the ``window.open('...')`` calls found in
        the page source.

        :param categoryId: value sent as the ``_`` query parameter.
        :param types: path below ``/njweb/`` that identifies the list.
        :param page: page number used as the file name (``<page>.html``).
        """
        params = (('_', categoryId), )
        url = 'http://ggzy.njzwfw.gov.cn/njweb/{}/{}.html'.format(types, page)
        try:
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params).content.decode('utf-8')
            # The tree itself is unused; parsing is kept so that a page lxml
            # rejects is reported and skipped as before.
            etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        for href in re.findall(r"window.open\(\'(.*?)\'\)", response):
            urls = 'http://ggzy.njzwfw.gov.cn' + href
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued URLs until ``is_running()`` reports False.

        Every round pops URLs from ``self.rq`` and hands each one to
        ``load_get_html`` in its own gevent greenlet: ``max_batch`` URLs while
        the queue holds more than ``max_batch`` entries, otherwise one.
        """
        max_batch = 2
        while self.is_running():
            # Size the batch on every round; it used to drop to 1 for good
            # the first time the queue ran short.
            batch = max_batch if self.rq.r_len() > max_batch else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start a consumer thread, crawl every configured list page, then
        start a second consumer if more than 10 URLs are still queued.

        List pages are requested through ``load_get`` in groups of up to
        ``count`` greenlets.
        """
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '69171', 'types': 'zfcg/067001/067001001', 'all_page': 2},
            {'categoryId': '27720', 'types': 'zfcg/067002/067002001', 'all_page': 2},
            {'categoryId': '344', 'types': 'fjsz/068001/068001001', 'all_page': 2},
            {'categoryId': '21869', 'types': 'fjsz/068002/068002001', 'all_page': 3},
            {'categoryId': '48706', 'types': 'fjsz/068003/068003001', 'all_page': 2},
            {'categoryId': '95248', 'types': 'fjsz/068005/068005002', 'all_page': 3},
            {'categoryId': '74362', 'types': 'gchw/070001', 'all_page': 1},
            {'categoryId': '83799', 'types': 'gchw/070003', 'all_page': 1},
            {'categoryId': '81835', 'types': 'gchw/070004', 'all_page': 1},
            {'categoryId': '4620', 'types': 'jtsw/069001/069001001', 'all_page': 1},
            {'categoryId': '11321', 'types': 'jtsw/069003', 'all_page': 1},
        ]
        count = 2
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            last_page = task['all_page']
            for first_page in range(1, last_page + 1, count):
                # Clamp the group to last_page: spawning first_page + i for
                # every i in range(count) requested pages past 'all_page'.
                group_end = min(first_page + count, last_page + 1)
                try:
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types, page)
                        for page in range(first_page, group_end)
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: delegate to ``run``."""
        self.run()
Example #24
0
class GovBuy(object):
    """Crawler for the Jinan public resource trading site (jnggzy.jinan.gov.cn).

    ``run`` walks the list pages and queues unseen detail URLs in ``self.rq``;
    ``init`` drains that queue and ``load_get_html`` parses each notice and
    stores it through ``self.coll``.
    """

    # Seconds allowed per HTTP request, so a stalled connection cannot hang a
    # worker forever.
    REQUEST_TIMEOUT = 30

    # Notice kinds recognised as the "...公告" suffix of a title. Alternation is
    # required here: a character class would match arbitrary mixes of the
    # individual characters (and the quote/comma characters as well).
    STATUS_PATTERN = r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$'

    def __init__(self):
        """Set up storage, the request headers and the URL queue."""
        name = 'jinan_jngp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'jnggzy.jinan.gov.cn',
            # Origin and Referer must be well-formed URLs without any spaces.
            'Origin': 'http://jnggzy.jinan.gov.cn',
            'Referer':
            'http://jnggzy.jinan.gov.cn/jnggzyztb/front/noticelist.do?type=1&xuanxiang=1&area=',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='jinan_jngp_gov_cn_list1',
                             dbset='jinan_jngp_gov_cn_set1')

    def is_running(self):
        """Tell the consumer loop whether it should keep polling.

        Returns:
            bool: ``False`` only when ``self.rq.r_len()`` is 0 while
            ``self.rq.rset_info()`` is non-empty; ``True`` otherwise.
        """
        queue_drained = self.rq.r_len() == 0
        if queue_drained and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Hand ``result_dic`` to the storage backend via ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # Result is discarded; kept so the sequence of queue calls is unchanged.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive a ``province-city`` label from free text.

        Args:
            pro: Fallback province name.
            strs: Text handed to ``transform`` for location extraction.

        Returns:
            str: ``'province-city'`` when a second component is found,
            otherwise the first component alone. ``pro`` is returned when
            ``transform`` fails or yields nothing.
        """
        try:
            df = transform([strs], umap={})
            # Strip all whitespace from the stringified result, drop the
            # '省市区0' residue (presumably header + row index), then turn the
            # 省/市 suffixes into '-' separators.
            flattened = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flattened))
        except Exception as e:
            # Fall back to the province instead of silently returning None.
            print('get_area error:{}'.format(e))
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _parse_status(self, title):
        """Return the trailing notice kind of ``title`` or the generic '公告'."""
        matched = re.search(self.STATUS_PATTERN, title)
        return matched.group() if matched else '公告'

    def _enqueue(self, url):
        """Queue ``url`` for the consumer unless it was already recorded."""
        if not self.rq.in_rset(url):
            self.rq.add_to_rset(url)
            self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        """Fetch one notice page, extract its fields and store the record.

        Args:
            url: Detail page URL; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        # Pages without the notice container are skipped entirely.
        table_ele = selector.xpath('//div/div[@class="list"]')
        if table_ele == []:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        title = selector.xpath('//div[@class="list"]/h1//text()')
        if title != []:
            title = re.sub(r'\r|\n|\s', '', ''.join(title))
            status = self._parse_status(title)
        else:
            title = None
            status = '公告'

        # A missing or unparsable date yields None instead of raising.
        date_text = ''.join(
            selector.xpath('//div[@class="list"]/div/span//text()'))
        date_match = re.search(r'(\d{4}\-\d+\-\d{1,2})', date_text)
        publish_date = date_match.group() if date_match else None

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '山东-济南',
            'source': 'http://jnggzy.jinan.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '济南公共资源交易中心',
            'en_name': 'Jinan Public resource',
        }
        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one list page and queue every notice URL found on it.

        Args:
            categoryId: Sent as the ``xuanxiang`` form field and reused in
                the detail URL.
            types: Sent as the ``type`` form field.
            page: Page number sent as ``pagenum``.
        """
        try:
            params = {
                'area': '',
                'type': types,
                'xuanxiang': categoryId,
                'subheading': '',
                'pagenum': page,
            }
            url = 'http://jnggzy.jinan.gov.cn/jnggzyztb/front/search.do'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     data=params,
                                     timeout=self.REQUEST_TIMEOUT).json()
            selector = etree.HTML(response['params']['str'])
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print(response)
        print('第{}页'.format(page))
        id_li = selector.xpath('//ul/li/a/@onclick')
        if len(id_li) > 0:
            # The notice id is the argument of the onclick handler call.
            for onclick in id_li:
                iid = re.sub(r'.*?\(|\).*', '', onclick)
                self._enqueue(
                    'http://jnggzy.jinan.gov.cn/jnggzyztb/front/showNotice.do?iid={}&xuanxiang={}'
                    .format(iid, categoryId))
        else:
            for href in selector.xpath('//ul/li/a/@href'):
                self._enqueue('http://jnggzy.jinan.gov.cn' + href)

    def init(self):
        """Consume queued URLs in small concurrent batches until done."""
        count = 2
        while self.is_running():
            # Decide the batch size on every pass so that concurrency
            # recovers once the queue has been refilled.
            batch = count if self.rq.r_len() > count else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread and crawl every configured list page."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'categoryId': '招标公告', 'types': '1', 'all_page': 4},
            {'categoryId': '中标公示', 'types': '1', 'all_page': 4},
            {'categoryId': '变更公告', 'types': '1', 'all_page': 4},
            {'categoryId': '废标公告', 'types': '1', 'all_page': 4},
        ]
        count = 2
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                try:
                    # Never request pages beyond the configured last page.
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                        if page + i <= last_page
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: delegates to ``run``."""
        self.run()
Example #25
0
class GovBuy(object):
    """Crawler for the Henan government procurement site (www.hngp.gov.cn).

    ``run`` walks the search result pages and queues unseen detail URLs in
    ``self.rq``; ``init`` drains that queue and ``load_get_html`` parses each
    notice and stores it through ``self.coll``.
    """

    # Seconds allowed per HTTP request, so a stalled connection cannot hang a
    # worker forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up storage, cookies, headers, a session and the URL queue."""
        name = 'henan_hngp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.cookies = {
            'sId': '7c61a3bff6dc4969a336157b5f3dfb1d',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.hngp.gov.cn/henan/search?appCode=H60&pageSize=16&keyword=&dljg=&cgr=&year=2015&pageNo=15',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        self.session.headers.update(self.headers)
        self.session.cookies.update(self.cookies)

        self.rq = Rdis_Queue(host='localhost',
                             dblist='henan_list1',
                             dbset='henan_set1')

    def is_running(self):
        """Tell the consumer loop whether it should keep polling.

        Returns:
            bool: ``False`` only when ``self.rq.r_len()`` is 0 while
            ``self.rq.rset_info()`` is non-empty; ``True`` otherwise.
        """
        queue_drained = self.rq.r_len() == 0
        if queue_drained and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Hand ``result_dic`` to the storage backend via ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # Result is discarded; kept so the sequence of queue calls is unchanged.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive a ``province-city`` label from free text.

        Args:
            pro: Fallback province name.
            strs: Text handed to ``transform`` for location extraction.

        Returns:
            str: ``'province-city'`` when a second component is found,
            otherwise the first component alone. ``pro`` is returned when
            ``transform`` fails or yields nothing.
        """
        try:
            df = transform([strs], umap={})
            # Strip all whitespace from the stringified result, drop the
            # '省市区0' residue (presumably header + row index), then turn the
            # 省/市 suffixes into '-' separators.
            flattened = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flattened))
        except Exception as e:
            # Fall back to the province instead of silently returning None.
            print('get_area error:{}'.format(e))
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _enqueue(self, url):
        """Queue ``url`` for the consumer unless it was already recorded."""
        if not self.rq.in_rset(url):
            self.rq.add_to_rset(url)
            self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        """Fetch one notice, extract its fields and store the record.

        The title and date come from the landing page at ``url``; the stored
        content is the separate ``.htm`` document that page references.

        Args:
            url: Detail page URL; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            # The landing page references the notice body in a get("....htm")
            # call; without it there is nothing to store.
            urls_li = re.findall(r'get\(\"(.*?\.htm)\"', response)
            if len(urls_li) < 1:
                return
            urls = 'http://www.hngp.gov.cn' + urls_li[0]
            response1 = requests.get(
                url=urls, headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        content_html = response1.lower()
        if len(content_html) < 100:
            return

        title = selector.xpath(
            '//*[@id="ng-app"]/body/div[3]/div[1]/h1/text()')
        status = '公告'
        if title != []:
            title = title[0]
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            if status_match:
                status = status_match.group()
        else:
            title = None

        # A missing or unparsable date yields None instead of raising.
        date_text = ''.join(
            selector.xpath(
                '//*[@id="ng-app"]/body/div[3]/div[1]/div[1]/span//text()'))
        date_match = re.search(r'(\d{4}\-\d+\-\d+)', date_text)
        publish_date = date_match.group() if date_match else None

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': self.get_area('河南', title),
            'source': 'http://www.hngp.gov.cn',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '河南省政府采购网 ',
            'en_name': 'Henan Province Government Procurement',
        }

        print(retult_dict)

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, page):
        """Fetch one search result page and queue every notice URL on it.

        Args:
            page: Page number sent as ``pageNo``.
        """
        try:
            params = {
                'appCode': 'H60',
                'pageSize': 10,
                'keyword': '',
                'dljg': '',
                'cgr': '',
                'year': '2019',
                'pageNo': page,
            }
            url = 'http://www.hngp.gov.cn/henan/search'
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params,
                                    cookies=self.cookies,
                                    timeout=self.REQUEST_TIMEOUT)
            selector = etree.HTML(response.content.decode('utf-8'))
            url_li = selector.xpath('//div[@class="List2"]/ul/li/a/@href')
            self.headers['Referer'] = response.url
        except Exception as e:
            print('load_post error:{}'.format(e))
            return

        print('第{}页'.format(page))
        # Only queue the URLs here; fetching them inline as well would
        # download every notice twice (once here, once in the consumer).
        for href in url_li:
            self._enqueue('http://www.hngp.gov.cn' + href)

    def init(self):
        """Consume queued URLs in small concurrent batches until done."""
        count = 5
        while self.is_running():
            # Decide the batch size on every pass so that concurrency
            # recovers once the queue has been refilled.
            batch = count if self.rq.r_len() > count else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread and crawl every configured list page."""
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'all_page': 3
            },
        ]
        count = 2
        for task in task_li:
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                try:
                    # Never request pages beyond the configured last page
                    # (with an odd page count the final batch is partial).
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count) if page + i <= last_page
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: delegates to ``run``."""
        self.run()
Example #26
0
class GovBuy(object):
    """Crawler for the Nanchang public resource trading site (ncztb.nc.gov.cn).

    ``run`` walks the list pages and queues unseen detail URLs in ``self.rq``;
    ``init`` drains that queue and ``load_get_html`` parses each notice and
    stores it through ``self.coll``.
    """

    # Seconds allowed per HTTP request, so a stalled connection cannot hang a
    # worker forever.
    REQUEST_TIMEOUT = 30

    # Notice kinds recognised as the "...公告" suffix of a title. Alternation is
    # required here: a character class would match arbitrary mixes of the
    # individual characters (and the quote/comma characters as well).
    STATUS_PATTERN = r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$'

    def __init__(self):
        """Set up storage, cookies, request headers and the URL queue."""
        name = 'nanchang_ncztb_nc_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.cookies = {
            'ASP.NET_SessionId': 'kxgkxo45v04bzs55ie3tib55',
            '__CSRFCOOKIE': 'ad60f543-41c8-481d-b0cf-accadc73c516',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://ncztb.nc.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://ncztb.nc.gov.cn/nczbw/jyxx/002001/002001002/MoreInfo.aspx?CategoryNum=002001002',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='nanchang_ncztb_nc_gov_cn_list1',
                             dbset='nanchang_ncztb_nc_gov_cn_set1')

    def is_running(self):
        """Tell the consumer loop whether it should keep polling.

        Returns:
            bool: ``False`` only when ``self.rq.r_len()`` is 0 while
            ``self.rq.rset_info()`` is non-empty; ``True`` otherwise.
        """
        queue_drained = self.rq.r_len() == 0
        if queue_drained and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Hand ``result_dic`` to the storage backend via ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # Result is discarded; kept so the sequence of queue calls is unchanged.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive a ``province-city`` label from free text.

        Args:
            pro: Fallback province name.
            strs: Text handed to ``transform`` for location extraction.

        Returns:
            str: ``'province-city'`` when a second component is found,
            otherwise the first component alone. ``pro`` is returned when
            ``transform`` fails or yields nothing.
        """
        try:
            df = transform([strs], umap={})
            # Strip all whitespace from the stringified result, drop the
            # '省市区0' residue (presumably header + row index), then turn the
            # 省/市 suffixes into '-' separators.
            flattened = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flattened))
        except Exception as e:
            # Fall back to the province instead of silently returning None.
            print('get_area error:{}'.format(e))
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _parse_status(self, title):
        """Return the trailing notice kind of ``title`` or the generic '公告'."""
        matched = re.search(self.STATUS_PATTERN, title)
        return matched.group() if matched else '公告'

    def _enqueue(self, url):
        """Queue ``url`` for the consumer unless it was already recorded."""
        if not self.rq.in_rset(url):
            self.rq.add_to_rset(url)
            self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        """Fetch one notice page, extract its fields and store the record.

        Args:
            url: Detail page URL; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        # Pages without the notice table are skipped entirely.
        table_ele = selector.xpath('//table[@id="tblInfo"]')
        if table_ele == []:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        title = selector.xpath('//td[@id="tdTitle"]/font//text()')
        if title != []:
            title = re.sub(r'\r|\n|\s', '', ''.join(title))
            status = self._parse_status(title)
        else:
            title = None
            status = '公告'

        # The date is either 8 digits or Y/M/D; slashes become dashes. A
        # missing or unparsable date yields None instead of raising.
        date_text = ''.join(
            selector.xpath('//td[@id="tdTitle"]/font[2]//text()'))
        date_match = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})', date_text)
        if date_match:
            publish_date = re.sub(r'\/', '-', date_match.group())
        else:
            publish_date = None

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '江西-南昌',
            'source': 'http://ncztb.nc.gov.cn',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '江西省南昌公共资源交易网',
            'en_name': 'Nanchang Public resource',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one list page and queue every notice URL found on it.

        Args:
            categoryId: Path segment inserted into the list page URL.
            types: Sent as the ``CategoryNum`` query parameter.
            page: Page number sent as ``__EVENTARGUMENT``.
        """
        try:
            params = (
                ('CategoryNum', types),
            )
            data = {
                '__CSRFTOKEN': '/wEFJGFkNjBmNTQzLTQxYzgtNDgxZC1iMGNmLWFjY2FkYzczYzUxNg==',
                '__VIEWSTATE': '',
                '__EVENTTARGET': 'MoreInfoList1$Pager',
                '__EVENTARGUMENT': page
            }
            url = 'http://ncztb.nc.gov.cn/nczbw/jyxx/{}/MoreInfo.aspx'.format(categoryId)
            response = requests.post(
                url=url, headers=self.headers, params=params, data=data,
                cookies=self.cookies,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        url_li = selector.xpath(
            '//table[@id="MoreInfoList1_DataGrid1"]/tr/td[2]/a/@href')
        for href in url_li:
            self._enqueue('http://ncztb.nc.gov.cn' + href)

    def init(self):
        """Consume queued URLs in small concurrent batches until done."""
        count = 2
        while self.is_running():
            # Decide the batch size on every pass so that concurrency
            # recovers once the queue has been refilled.
            batch = count if self.rq.r_len() > count else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread and crawl every configured list page."""
        threading.Thread(target=self.init).start()
        flag = 2
        # Each category lives under a path made of its first six digits
        # followed by the full code, e.g. '002001/002001002'.
        codes = [
            '002001002', '002001004', '002001005',
            '002002002', '002002005',
            '002003001', '002003004',
            '002009001', '002009004',
            '002004001', '002004002', '002004003', '002004004', '002004005',
            '002005002',
            '002010001', '002010002', '002010004',
        ]
        task_li = [
            {
                'categoryId': '{}/{}'.format(code[:6], code),
                'types': code,
                'all_page': flag
            } for code in codes
        ]
        count = 1
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                try:
                    # Never request pages beyond the configured last page.
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types,
                                     page + i) for i in range(count)
                        if page + i <= last_page
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: delegates to ``run``."""
        self.run()
Example #27
0
class GovBuy(object):
    """Crawler for the Hangzhou government procurement site (cg.hzft.gov.cn).

    ``run`` posts the list queries and queues unseen detail URLs in
    ``self.rq``; ``init`` drains that queue and ``load_get_html`` parses each
    notice and stores it through ``self.coll``.
    """

    # Seconds allowed per HTTP request, so a stalled connection cannot hang a
    # worker forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up storage, request headers, a session and the URL queue."""
        name = 'hangzhou_cg_hzft_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Cache-Control': 'max-age=0',
            'Origin': 'http://cg.hzft.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://cg.hzft.gov.cn/www/noticelist.do',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='hangzhou_list1',
                             dbset='hangzhou_set1')

    def is_running(self):
        """Tell the consumer loop whether it should keep polling.

        Returns:
            bool: ``False`` only when ``self.rq.r_len()`` is 0 while
            ``self.rq.rset_info()`` is non-empty; ``True`` otherwise.
        """
        queue_drained = self.rq.r_len() == 0
        if queue_drained and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Hand ``result_dic`` to the storage backend via ``self.coll.saves``."""
        self.coll.saves(result_dic)
        # Result is discarded; kept so the sequence of queue calls is unchanged.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive a ``province-city`` label from free text.

        Args:
            pro: Fallback province name.
            strs: Text handed to ``transform`` for location extraction.

        Returns:
            str: ``'province-city'`` when a second component is found,
            otherwise the first component alone. ``pro`` is returned when
            ``transform`` fails or yields nothing.
        """
        try:
            df = transform([strs], umap={})
            # Strip all whitespace from the stringified result, drop the
            # '省市区0' residue (presumably header + row index), then turn the
            # 省/市 suffixes into '-' separators.
            flattened = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flattened))
        except Exception as e:
            # Fall back to the province instead of silently returning None.
            print('get_area error:{}'.format(e))
            return pro
        area_li = area_str.split('-') if area_str else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _enqueue(self, url):
        """Queue ``url`` for the consumer unless it was already recorded."""
        if not self.rq.in_rset(url):
            self.rq.add_to_rset(url)
            self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        """Fetch one notice page, extract its fields and store the record.

        Args:
            url: Detail page URL; ``None`` is ignored, matching what ``init``
                may pass when ``get_to_rlist`` has nothing to hand out.
        """
        if url is None:
            return
        try:
            response = requests.get(url=url,
                                    headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed. Pages without the notice container
        # are skipped instead of being stored as the string 'None'.
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='detail_con')
        if content_html is None:
            return

        title = selector.xpath('//div[@class="detail_con"]/h1/text()')
        status = '公告'
        if title != []:
            title = title[0]
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            if status_match:
                status = status_match.group()
        else:
            title = None

        # A missing or unparsable date yields None instead of raising.
        date_text = ''.join(
            selector.xpath('//div[@class="content_about"]//text()'))
        date_match = re.search(r'(\d{4}\-\d+\-\d+)', date_text)
        publish_date = date_match.group() if date_match else None

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '杭州',
            'source': 'http://cg.hzft.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '杭州市政府采购网',
            'en_name': 'Hangzhou Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, data, retries=3):
        """Post one list query and queue every notice URL in the answer.

        Args:
            data: Form fields for ``noticelist.do``.
            retries: Maximum number of attempts before giving up. A bounded
                loop replaces unbounded recursion, which could never end
                while the site stayed unreachable.
        """
        url = 'http://cg.hzft.gov.cn/www/noticelist.do'
        selector = None
        for _ in range(retries):
            try:
                response = requests.post(url=url,
                                         headers=self.headers,
                                         data=data,
                                         timeout=self.REQUEST_TIMEOUT).text
                selector = etree.HTML(response)
                break
            except Exception as e:
                print('load_post error:{}'.format(e))
        if selector is None:
            return

        url_li = selector.xpath('//ul[@class="c_list_item"]/li/a/@href')
        print(url_li)
        for href in url_li:
            self._enqueue('http://cg.hzft.gov.cn/' + href)

    def init(self):
        """Consume queued URLs in small concurrent batches until done."""
        count = 8
        while self.is_running():
            # Decide the batch size on every pass so that concurrency
            # recovers once the queue has been refilled.
            batch = count if self.rq.r_len() > count else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread and crawl every configured list query."""
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'regionguid': '330100',
                'noticetype': '3,3001,3002,3008,3009,3011,3014,4001,4002',
                'all_page': 1
            },
            {
                'regionguid': '3',
                'noticetype': '5,6',
                'all_page': 1
            },
            {
                'regionguid': '',
                'noticetype': '1,3012',
                'all_page': 1
            },
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                data = {
                    'page.pageNum': page,
                    "parameters['regionguid']": task['regionguid'],
                    "parameters['noticetype']": task['noticetype'],
                    "parameters['title']": ''
                }
                try:
                    self.load_get(data)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: delegates to ``run``."""
        self.run()
class GovBuy(object):
    '''Crawler for the Suzhou government procurement site (zfcg.suzhou.gov.cn).

    ``run`` pages through the site's search endpoint and queues project ids
    that are not yet in the queue's set; ``init`` takes ids off the queue,
    downloads each project page and hands the extracted record to storage.
    '''

    def __init__(self):
        '''Create the storage handle, request headers, HTTP session and work queue.'''
        name = 'suzhou_zfcg_suzhou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.zfcg.suzhou.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.zfcg.suzhou.gov.cn/html/search.shtml?title=&choose=&projectType=0&zbCode=&appcode=',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        # Not used by the methods of this class (they call ``requests``
        # directly); kept because it is a public attribute.
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost', dblist='suzhou_list1', dbset='suzhou_set1')

    def is_running(self):
        '''Return False only when ``rq.r_len()`` is 0 and ``rq.rset_info()`` is non-empty.

        In every other case (including "nothing queued yet") return True.
        '''
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` encoded as UTF-8.'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Pass ``result_dic`` to ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; the call is retained only
        # because the queue helper is opaque from here - confirm it can go.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area name from ``strs``, falling back to ``pro``.

        ``strs`` is passed to ``transform`` (defined elsewhere) and the text
        form of its result is reduced to dash-separated parts.

        :param pro: fallback name, returned when nothing can be derived.
        :param strs: text to analyse.
        :return: the first two parts joined by ``-`` when the second part is
            non-empty, otherwise the first part; ``pro`` when the derived
            text is empty or the analysis raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' matches the literal two-character
            # sequences, not CR/LF; real CR/LF are removed by \s anyway.
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception as e:
            # A failed analysis used to yield an implicit None; use the same
            # fallback as the "nothing recognised" case below.
            print('get_area error:{}'.format(e))
            return pro

        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _search_text(pattern, text):
        '''Return the first match of ``pattern`` in ``text``, or None when there is none.'''
        found = re.search(pattern, text)
        return found.group() if found else None

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages ``1..all_page`` into consecutive lists of at most ``count`` pages.'''
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(1, all_page + 1, count)
        ]

    def load_get_html(self, pid):
        '''Download one project page, extract its fields and store the record.

        :param pid: project id used to build the page URL; ``None`` (nothing
            was taken from the queue) is ignored.

        Download or parse errors are printed and the page is skipped.  A page
        without the ``div#tab1`` content block is skipped as well instead of
        raising ``IndexError``.
        '''
        if pid is None:
            return
        try:
            url = 'http://www.zfcg.suzhou.gov.cn/html/project/' + pid + '.shtml'
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector.xpath('//div[@class="M_title"]/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', title_nodes[0])
            # Status is the trailing "<two CJK chars>公告" of the title when
            # present, otherwise the generic '公告'.
            status = self._search_text(r'[\u4e00-\u9fa5]{2}公告$', title) or '公告'
        else:
            title = None
            status = '公告'

        _id = self.hash_to_md5(url)

        # No date nodes, or nodes without a date, both give None.
        date_nodes = selector.xpath('//div[@class="date"]/span/text()')
        publish_date = self._search_text(r'(\d{4}\-\d+\-\d{1,2})', ''.join(date_nodes))

        area_name = '江苏-苏州'
        source = 'http://www.zfcg.suzhou.gov.cn/'

        table_nodes = selector.xpath('//div[@id="tab1"]')
        if not table_nodes:
            print('load_get_html error: no div#tab1 in {}'.format(url))
            return
        content_html = etree.tostring(
            table_nodes[0], encoding="utf-8", pretty_print=True, method="html"
        ).decode('utf-8')

        # Key order matches the order in which the fields were originally set.
        retult_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': area_name,
            'source': source,
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '苏州市政府采购网',
            'en_name': 'Suzhou City Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, types, page, max_tries=3):
        '''Fetch one page of search results and queue the unseen project ids.

        :param types: value sent as the ``type`` form field.
        :param page: page number to request.
        :param max_tries: how many times the request is attempted before the
            page is given up.  Retrying used to recurse without limit, which
            ends in ``RecursionError`` when the endpoint stays unreachable.
        '''
        data = [
            ('title', ''),
            ('choose', ''),
            ('type', types),
            ('zbCode', ''),
            ('appcode', ''),
            ('page', page),
            ('rows', '30'),
        ]
        url = 'http://www.zfcg.suzhou.gov.cn/content/searchContents.action'

        for _ in range(max_tries):
            try:
                response = requests.post(url=url, headers=self.headers, data=data).json()
            except Exception:
                print('load_post error')
            else:
                break
        else:
            # Every attempt failed; skip this page.
            return

        print('第{}页'.format(page))
        response_li = response['rows']
        if not response_li:
            return

        for project in response_li:
            pid = project['PROJECTID']
            if not self.rq.in_rset(pid):
                self.rq.add_to_rset(pid)
                self.rq.pull_to_rlist(pid)

    def init(self):
        '''Consume queued project ids until ``is_running()`` reports False.

        The batch size is chosen on every pass (3 while more than 3 ids are
        queued, otherwise 1) so it grows back when the queue is refilled.
        Errors while spawning or joining a batch are printed and the loop
        goes on.
        '''
        max_batch = 3
        while self.is_running():
            batch = max_batch if self.rq.r_len() > max_batch else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a consumer thread, crawl the listing pages, then restart the consumer if needed.

        Listing pages are requested in concurrent batches of up to 3 and never
        beyond a task's ``all_page``.  If ids remain queued afterwards, a
        second ``init`` thread is started.
        '''
        threading.Thread(target=self.init).start()
        task_li = [
            {'type': '0', 'all_page': 2},
            {'type': '1', 'all_page': 2},
            {'type': '2', 'all_page': 2},
        ]
        count = 3
        for task in task_li:
            for pages in self._page_batches(task['all_page'], count):
                try:
                    types = task['type']
                    spawns = [gevent.spawn(self.load_get, types, page) for page in pages]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: delegate to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Hohhot government procurement site (www.hhgp.gov.cn).

    ``run`` requests the listing pages for a fixed set of category codes and
    queues detail-page URLs that are not yet in the queue's set; ``init``
    takes URLs off the queue, downloads each page and hands the extracted
    record to storage.
    '''

    def __init__(self):
        '''Create the storage handle, request headers and work queue.'''
        name = 'huheaote_hhgp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.hhgp.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*',
            'Referer': 'http://www.hhgp.gov.cn/huShi_web_login',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'Content-Length': '0',
        }

        self.rq = Rdis_Queue(host='localhost', dblist='huhehaote_list1', dbset='huhehaote_set1')

    def is_running(self):
        '''Return False only when ``rq.r_len()`` is 0 and ``rq.rset_info()`` is non-empty.

        In every other case (including "nothing queued yet") return True.
        '''
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` encoded as UTF-8.'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Pass ``result_dic`` to ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; the call is retained only
        # because the queue helper is opaque from here - confirm it can go.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area name from ``strs``, falling back to ``pro``.

        ``strs`` is passed to ``transform`` (defined elsewhere) and the text
        form of its result is reduced to dash-separated parts.

        :param pro: fallback name, returned when nothing can be derived.
        :param strs: text to analyse; may be None when a page has no title.
        :return: the first two parts joined by ``-`` when the second part is
            non-empty, otherwise the first part; ``pro`` when the derived
            text is empty or the analysis raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' matches the literal two-character
            # sequences, not CR/LF; real CR/LF are removed by \s anyway.
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception as e:
            # A failed analysis used to yield an implicit None; use the same
            # fallback as the "nothing recognised" case below.
            print('get_area error:{}'.format(e))
            return pro

        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _search_text(pattern, text):
        '''Return the first match of ``pattern`` in ``text``, or None when there is none.'''
        found = re.search(pattern, text)
        return found.group() if found else None

    def load_get(self, params):
        '''Fetch one listing page and queue the unseen detail-page URLs.

        :param params: query parameters sent with the POST request.

        The JSON reply's ``'0'`` entry is parsed as HTML and the link targets
        found in it are prefixed with the site root.  Any failure while
        fetching or parsing is reported and the page is skipped.
        '''
        try:
            url = 'http://www.hhgp.gov.cn/huShi_web_login/showAllMessage'
            response = requests.post(url=url, headers=self.headers, params=params).json()
            response_str = response['0']
            selector = etree.HTML(response_str)
        except Exception:
            print('load_post error')
            return

        for href in selector.xpath('//li/span[1]/a/@href'):
            detail_url = 'http://www.hhgp.gov.cn' + href
            if not self.rq.in_rset(detail_url):
                self.rq.add_to_rset(detail_url)
                self.rq.pull_to_rlist(detail_url)

    def load_get_html(self, url):
        '''Download one detail page, extract its fields and store the record.

        :param url: detail-page URL; ``None`` (nothing was taken from the
            queue) is ignored.

        Download or parse errors are printed, including the exception text,
        and the page is skipped.
        '''
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            # The placeholder was missing, so the exception was never shown.
            print('laod_get_html error:{}'.format(e))
            return

        _id = self.hash_to_md5(url)

        title_nodes = selector.xpath('//*[@id="content"]/div/div[2]/div/div/h1/text()')
        if title_nodes:
            title = title_nodes[0]
            # Status is the trailing "<two CJK chars>公告" of the title when
            # present, otherwise the generic '公告'.
            status = self._search_text(r'[\u4e00-\u9fa5]{2}公告$', title) or '公告'
        else:
            title = None
            status = '公告'

        # A date node without a recognisable date gives None instead of
        # raising AttributeError on a failed match.
        date_nodes = selector.xpath('//*[@id="content"]/div/div[2]/div/div/i/text()')
        publish_date = self._search_text(r'(\d+\-\d+\-\d+)', date_nodes[0]) if date_nodes else None

        # Name the parser explicitly: lxml is what BeautifulSoup selects by
        # default when it is installed (it backs ``etree`` above), so results
        # stay the same and the "no parser specified" warning goes away.
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='content')

        source = 'http://www.hhgp.gov.cn/'
        area_name = self.get_area('呼和浩特', title)

        # Key order matches the order in which the fields were originally set.
        retult_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'publish_date': publish_date,
            'source': source,
            'area_name': area_name,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '呼和浩特市政府采购网 ',
            'en_name': 'Huhhot City Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def init(self):
        '''Consume queued URLs until ``is_running()`` reports False.

        The batch size is chosen on every pass (6 while more than 6 URLs are
        queued, otherwise 1) so it grows back when the queue is refilled.
        Errors while spawning or joining a batch are printed and the loop
        goes on.
        '''
        max_batch = 6
        while self.is_running():
            batch = max_batch if self.rq.r_len() > max_batch else 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a consumer thread, crawl the listing pages, then restart the consumer if needed.

        Each task names a category code and how many listing pages to request
        for it.  If URLs remain queued afterwards, a second ``init`` thread is
        started.
        '''
        threading.Thread(target=self.init).start()
        # Page counts previously used for these codes, in the same order:
        # 29, 70, 67, 217, 58, 1, 14, 21.
        task_li = [
            {'code': '265.266.304', 'all_page': 1},
            {'code': '265.266.269', 'all_page': 1},
            {'code': '265.266.270', 'all_page': 1},
            {'code': '265.266.271', 'all_page': 1},
            {'code': '265.266.303', 'all_page': 1},
            {'code': '265.266.404', 'all_page': 1},
            {'code': '265.266.403', 'all_page': 1},
            {'code': '265.266.343', 'all_page': 1},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = (
                    ('code', task['code']),
                    ('pageNo', str(page)),
                    ('check', '1'),
                )
                self.load_get(params)
                print('第{}页'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: delegate to ``run``.'''
        self.run()