class GovBuy(object):
    """Crawler for the Guangxi public resource trading site (www.gxzbtb.cn).

    ``run`` walks the listing pages through ``load_get``, which pushes every
    unseen detail URL onto ``self.rq``.  ``init`` pops URLs from that queue
    and hands them to ``load_get_html``, which parses one notice page and
    stores the resulting record through ``self.coll``.
    """

    # Status stored when the title does not end in a recognised "...公告" tag.
    DEFAULT_STATUS = '公告'

    # NOTE(review): this is a character class (it also matches the quote and
    # comma characters), not an alternation of words.  It is kept unchanged
    # so that stored status values stay the same -- confirm before tightening.
    STATUS_PATTERN = r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$'

    def __init__(self):
        name = 'guangxi_gxzbtb_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        # NOTE(review): hard-coded session / CSRF cookie values, presumably
        # captured from a browser session -- confirm the site still accepts them.
        self.cookies = {
            'ASP.NET_SessionId': 'trbofu0uet0aywbdhr35s0x4',
            '__CSRFCOOKIE': '6f7e275f-5762-4569-8ea2-ae98d3b0379d',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gxzbtb.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.gxzbtb.cn/gxzbw/jyxx/001010/001010001/MoreInfo.aspx?CategoryNum=001010001',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # Work queue (list) plus "already seen" set for detail URLs.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='guangxi_gxzbtb_cn_list1',
                             dbset='guangxi_gxzbtb_cn_set1')

    def is_running(self):
        """Return False once the queue is empty and the seen-set is not.

        While nothing has been recorded in the seen-set yet, the crawler is
        still considered to be running.
        """
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of *sign_str* (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed record through the storage helper."""
        # The original also called is_running() here and discarded the
        # result; that only cost two extra queue round-trips per record.
        self.coll.saves(result_dic)

    def get_area(self, pro, strs):
        """Derive a ``province[-city]`` label from free text.

        :param pro: fallback province name.
        :param strs: text (normally a notice title) passed to ``transform``.
        :return: ``'province-city'`` when a city was recognised, otherwise the
            first recognised component; *pro* when nothing was recognised or
            when the lookup fails.
        """
        try:
            df = transform([strs], umap={})
            # Flatten the printed result, drop the header/index residue and
            # turn the province/city suffix characters into separators.
            flat = re.sub(r'\r|\n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception:
            # Best effort: a failed lookup must not leave the area undefined.
            return pro

        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _parse_title_status(self, text_nodes):
        """Return ``(title, status)`` extracted from the title text nodes."""
        if not text_nodes:
            return None, self.DEFAULT_STATUS
        title = re.sub(r'\r|\n|\s', '', ''.join(text_nodes))
        match = re.search(self.STATUS_PATTERN, title)
        return title, (match.group() if match else self.DEFAULT_STATUS)

    @staticmethod
    def _parse_publish_date(text_nodes):
        """Return the first date found in *text_nodes* with '/' turned into '-'.

        Returns None when there is no text or no date-like token, instead of
        raising on a failed match.
        """
        if not text_nodes:
            return None
        match = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})', ''.join(text_nodes))
        if match is None:
            return None
        return match.group().replace('/', '-')

    def load_get_html(self, url):
        """Download one notice page, parse it and store the record.

        :param url: absolute detail-page URL; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title, status = self._parse_title_status(
            selector.xpath('//td[@id="tdTitle"]/font//text()'))
        publish_date = self._parse_publish_date(
            selector.xpath('//td[@id="tdTitle"]/font[2]//text()'))

        # Pages without the content table are skipped entirely.
        table_ele = selector.xpath('//table[@id="tblInfo"]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        result = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '广西',
            'source': 'http://www.gxzbtb.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '广西壮族自治区公共资源交易中心',
            'en_name': 'Guangxi Zhuang National Public Resources',
        }

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and enqueue every unseen detail URL.

        :param categoryId: category path segment used in the listing URL.
        :param types: value sent as the ``CategoryNum`` query parameter.
        :param page: page number sent as the pager's event argument.
        """
        params = (('CategoryNum', types), )
        data = {
            '__CSRFTOKEN':
            '/ wEFJDZmN2UyNzVmLTU3NjItNDU2OS04ZWEyLWFlOThkM2IwMzc5ZA ==',
            '__VIEWSTATE':
            '',
            '__VIEWSTATEGENERATOR': '16D6DBB1',
            '__EVENTTARGET': 'MoreInfoList1$Pager',
            '__EVENTARGUMENT': page,
            '__VIEWSTATEENCRYPTED': '',
        }
        url = 'http://www.gxzbtb.cn/gxzbw/jyxx/{}/MoreInfo.aspx'.format(
            categoryId)
        try:
            response = requests.post(
                url=url,
                headers=self.headers,
                params=params,
                data=data,
                cookies=self.cookies).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        hrefs = selector.xpath(
            '//table[@id="MoreInfoList1_DataGrid1"]/tr/td[2]/a/@href')
        for href in hrefs:
            urls = 'http://www.gxzbtb.cn' + href
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued detail URLs until ``is_running`` reports False."""
        max_batch = 2
        while self.is_running():
            # Re-evaluated every round: fetch one at a time only while the
            # queue is short.  The original lowered the batch size once and
            # never raised it again.
            batch = 1 if self.rq.r_len() <= max_batch else max_batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread, then crawl every configured listing."""
        threading.Thread(target=self.init).start()
        flag = 1  # number of listing pages crawled per category
        category_ids = (
            '001010/001010001',
            '001010/001010002',
            '001010/001010004',
            '001001/001001001',
            '001001/001001002',
            '001001/001001004',
            '001001/001001005',
            '001004/001004001',
            '001004/001004002',
            '001004/001004004',
            '001004/001004005',
            '001007/001007001',
            '001011/001011001',
            '001011/001011002',
            '001012/001012001',
        )
        # 'types' is always the last segment of the category path.
        task_li = [{
            'categoryId': category_id,
            'types': category_id.split('/')[-1],
            'all_page': flag,
        } for category_id in category_ids]

        count = 1  # listing pages fetched concurrently
        for task in task_li:
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                try:
                    # Clamp so a batch never requests pages past last_page.
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page + i)
                        for i in range(count) if page + i <= last_page
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # Start a second consumer if URLs are still queued after listing.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
class GovBuy(object):
    """Crawler for the Shenzhen government procurement site.

    ``run`` walks the search result pages through ``load_get``, which pushes
    every unseen detail URL onto ``self.rq``.  ``init`` pops URLs from that
    queue and hands them to ``load_get_html``, which parses one notice page
    and stores the resulting record through ``self.coll``.
    """

    # Status stored when the title does not end in "<two CJK chars>公告".
    DEFAULT_STATUS = '公告'

    def __init__(self):
        name = 'shenzhen_zfcg_sz_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            # NOTE(review): the '^&' sequences look like shell-escaping
            # residue from a copied command line -- confirm they are intended.
            'Referer':
            'http://61.144.227.212/was5/web/search?page=4096^&channelid=261279^&orderby=-DOCRELTIME^&perpage=10^&outlinepage=5^&searchscope=^&timescope=^&timescopecolumn=^&orderby=-DOCRELTIME^&chnlid=^&andsen=^&total=^&orsen=^&exclude=',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Origin': 'http://61.144.227.212',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.session = requests.session()

        # Work queue (list) plus "already seen" set for detail URLs.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='shenzhen_list1',
                             dbset='shenzhen_set1')

    def is_running(self):
        """Return False once the queue is empty and the seen-set is not.

        While nothing has been recorded in the seen-set yet, the crawler is
        still considered to be running.
        """
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of *sign_str* (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed record through the storage helper."""
        # The original also called is_running() here and discarded the
        # result; that only cost two extra queue round-trips per record.
        self.coll.saves(result_dic)

    def _parse_title_status(self, text_nodes):
        """Return ``(title, status)`` from the heading text nodes."""
        if not text_nodes:
            return None, self.DEFAULT_STATUS
        title = text_nodes[0]
        match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
        return title, (match.group() if match else self.DEFAULT_STATUS)

    @staticmethod
    def _parse_publish_date(text_nodes):
        """Return the first ``N-N-N`` token in *text_nodes*, or None.

        Returns None when there is no text or no date-like token, instead of
        raising on a failed match.
        """
        if not text_nodes:
            return None
        match = re.search(r'(\d+\-\d+\-\d+)', ''.join(text_nodes))
        return match.group() if match else None

    def load_get_html(self, url):
        """Download one notice page, parse it and store the record.

        :param url: detail-page URL; ``None`` is ignored, matching the other
            crawlers in this file.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('gb2312')
            selector = etree.HTML(response)
        except Exception as e:
            # The original format string had no placeholder, so the
            # exception text was silently dropped.
            print('laod_get_html error:{}'.format(e))
            return

        title, status = self._parse_title_status(
            selector.xpath('//*[@id="content"]/div/div[2]/div/h4/text()'))
        publish_date = self._parse_publish_date(
            selector.xpath(
                '//*[@id="content"]/div/div[2]/div/h6/label//text()'))

        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (and no warning is emitted).
        # NOTE(review): assumes lxml is installed, as ``etree`` suggests.
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='main')
        if content_html is None:
            # Skip pages without a content node rather than storing the
            # literal string 'None'; the sibling crawlers skip them too.
            return

        result = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'publish_date': publish_date,
            'source': 'http://www.zfcg.sz.gov.cn/',
            'area_name': '深圳',
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '深圳市政府采购监管网 ',
            'en_name': 'Shenzhen Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result)

    def load_get(self, page):
        """Fetch one search-result page and enqueue every unseen detail URL.

        :param page: page number sent as the ``page`` query parameter.
        """
        params = (
            ('page', str(page)),
            ('channelid', '261279'),
            ('orderby', ['-DOCRELTIME', '-DOCRELTIME']),
            ('perpage', '10'),
            ('outlinepage', '5'),
            ('searchscope', ''),
            ('timescope', ''),
            ('timescopecolumn', ''),
            ('chnlid', ''),
            ('andsen', ''),
            ('total', ''),
            ('orsen', ''),
            ('exclude', ''),
        )
        data = [
            ('showother', 'false'),
            ('showtype', 'txt'),
            ('classnum', '20'),
            ('classcol', 'CTYPE'),
            ('channelid', '261279'),
            ('orderby', '-DOCRELTIME'),
        ]
        url = 'http://61.144.227.212/was5/web/search'
        try:
            response = self.session.post(url=url,
                                         headers=self.headers,
                                         params=params,
                                         data=data).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath('//div[@class="r_list"]/dl/dd/a/@href')
        except Exception as e:
            # Report the cause; the original bare ``except`` hid it.
            print('load_post error:{}'.format(e))
            return

        print('第{}页'.format(page))
        for detail_url in url_li:
            if not self.rq.in_rset(detail_url):
                self.rq.add_to_rset(detail_url)
                self.rq.pull_to_rlist(detail_url)

    def init(self):
        """Consume queued detail URLs until ``is_running`` reports False."""
        max_batch = 2
        while self.is_running():
            # Re-evaluated every round: fetch one at a time only while the
            # queue is short.  The original lowered the batch size once and
            # never raised it again.
            batch = 1 if self.rq.r_len() <= max_batch else max_batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread, then crawl the configured pages."""
        threading.Thread(target=self.init).start()
        # The original kept a commented-out task with all_page=43879,
        # presumably the size of a full crawl.
        task_li = [
            {
                'all_page': 5
            },
        ]
        count = 3  # search pages fetched concurrently
        for task in task_li:
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                try:
                    # Clamp so a batch never requests pages past last_page
                    # (the original fetched page 6 for all_page=5).
                    spawns = [
                        gevent.spawn(self.load_get, page + i)
                        for i in range(count) if page + i <= last_page
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # Start a second consumer if URLs are still queued after listing.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
# Example #3
# 0
class GovBuy(object):
    """Crawler for the Shaanxi public resource trading site (www.sxggzyjy.cn).

    ``run`` walks the listing pages through ``load_get``, which pushes every
    unseen detail URL onto ``self.rq``.  ``init`` pops URLs from that queue
    and hands them to ``load_get_html``, which parses one notice page and
    stores the resulting record through ``self.coll``.
    """

    # Status stored when the title does not end in a recognised "...公告" tag.
    DEFAULT_STATUS = '公告'

    # NOTE(review): this is a character class (it also matches the quote and
    # comma characters), not an alternation of words.  It is kept unchanged
    # so that stored status values stay the same -- confirm before tightening.
    STATUS_PATTERN = r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$'

    # Attempts made per listing page before giving up.
    MAX_RETRIES = 3

    def __init__(self):
        name = 'shaanxi_sxggzyjy_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.sxggzyjy.cn/jydt/001001/subPage_jyxx.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # Work queue (list) plus "already seen" set for detail URLs.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='shaanxip_list1',
                             dbset='shaanxip_set1')

    def is_running(self):
        """Return False once the queue is empty and the seen-set is not.

        While nothing has been recorded in the seen-set yet, the crawler is
        still considered to be running.
        """
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of *sign_str* (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed record through the storage helper."""
        # The original also called is_running() here and discarded the
        # result; that only cost two extra queue round-trips per record.
        self.coll.saves(result_dic)

    def get_area(self, pro, strs):
        """Derive a ``province[-city]`` label from free text.

        :param pro: fallback province name.
        :param strs: text (normally a notice title) passed to ``transform``.
        :return: ``'province-city'`` when a city was recognised, otherwise the
            first recognised component; *pro* when nothing was recognised or
            when the lookup fails (for example when *strs* is None).
        """
        try:
            df = transform([strs], umap={})
            # Flatten the printed result, drop the header/index residue and
            # turn the province/city suffix characters into separators.
            flat = re.sub(r'\r|\n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception:
            # Best effort: a failed lookup must not leave the area undefined.
            return pro

        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def _parse_title_status(self, text_nodes):
        """Return ``(title, status)`` from the article-title text nodes."""
        if not text_nodes:
            return None, self.DEFAULT_STATUS
        title = re.sub(r'\r|\n|\s', '', text_nodes[0])
        match = re.search(self.STATUS_PATTERN, title)
        return title, (match.group() if match else self.DEFAULT_STATUS)

    @staticmethod
    def _parse_publish_date(text_nodes):
        """Return the first ``YYYY-M-D`` token in *text_nodes*, or None.

        Returns None when there is no text or no date-like token, instead of
        raising on a failed match.
        """
        if not text_nodes:
            return None
        match = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(text_nodes))
        return match.group() if match else None

    def load_get_html(self, url):
        """Download one notice page, parse it and store the record.

        :param url: absolute detail-page URL; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            print(url)
            return

        title, status = self._parse_title_status(
            selector.xpath('//h3[@class="article-title"]/text()'))
        publish_date = self._parse_publish_date(
            selector.xpath('//div[@class="info-source"]//text()'))
        area_name = self.get_area('陕西', title)

        # Pages without the main content container are skipped entirely.
        main_ele = selector.xpath('//div[@class="ewb-main"]')
        if not main_ele:
            return
        content_html = etree.tostring(main_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        result = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': area_name,
            'source': 'http://www.sxggzyjy.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '陕西省公共资源交易中心',
            'en_name': 'Shaanxi Province Public resource',
        }
        self.save_to_mongo(result)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and enqueue every unseen detail URL.

        The request is attempted at most ``MAX_RETRIES`` times.  The original
        retried through unbounded recursion, which never terminated cleanly
        when the failure was persistent.

        :param categoryId: unused; kept for signature compatibility.
        :param types: unused; kept for signature compatibility.
        :param page: listing page name used in the URL.
        """
        url = 'http://www.sxggzyjy.cn/jydt/001001/{}.html'.format(page)

        selector = None
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.get(
                    url=url, headers=self.headers).content.decode('utf-8')
                selector = etree.HTML(response)
            except Exception as e:
                print('load_get error:{}'.format(e))
                continue
            break
        if selector is None:
            return

        print('第{}页'.format(page))
        for href in selector.xpath('//ul[@class="ewb-list"]/li/a/@href'):
            urls = 'http://www.sxggzyjy.cn' + href
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Consume queued detail URLs until ``is_running`` reports False."""
        max_batch = 2
        while self.is_running():
            # Re-evaluated every round: fetch one at a time only while the
            # queue is short.  The original lowered the batch size once and
            # never raised it again.
            batch = 1 if self.rq.r_len() <= max_batch else max_batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread, then crawl the configured pages."""
        threading.Thread(target=self.init).start()
        # The original kept a commented-out task with all_page=1845,
        # presumably the size of a full crawl.
        task_li = [
            {
                'categoryId': '',
                'types': '',
                'all_page': 2
            },
        ]
        count = 2  # listing pages fetched concurrently
        for task in task_li:
            last_page = task['all_page']
            for page in range(1, last_page + 1, count):
                try:
                    # Clamp so a batch never requests pages past last_page.
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page + i)
                        for i in range(count) if page + i <= last_page
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # Start a second consumer if URLs are still queued after listing.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
# Example #4
# 0
class GovBuy(object):
    """Crawler for the Hunan public resource / procurement site.

    ``run`` calls ``load_get`` for each configured listing page.
    ``load_get_html`` turns one listing entry (a dict) into a stored record.

    NOTE(review): in this class nothing pushes entries onto ``self.rq`` and
    ``run`` does not start ``init``; the listing step looks unfinished.
    """

    def __init__(self):
        name = 'hunan_hncg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # Work queue (list) plus "already seen" set.
        self.rq = Rdis_Queue(host='localhost',
                             dblist='hunan_hncg_gov_cn_list1',
                             dbset='hunan_hncg_gov_cn_set1')

    def is_running(self):
        """Return False once the queue is empty and the seen-set is not.

        While nothing has been recorded in the seen-set yet, the crawler is
        still considered to be running.
        """
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of *sign_str* (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one parsed record through the storage helper."""
        # The original also called is_running() here and discarded the
        # result; that only cost two extra queue round-trips per record.
        self.coll.saves(result_dic)

    def get_area(self, pro, strs):
        """Derive a ``province[-city]`` label from free text.

        :param pro: fallback province name.
        :param strs: text (normally a notice title) passed to ``transform``.
        :return: ``'province-city'`` when a city was recognised, otherwise the
            first recognised component; *pro* when nothing was recognised or
            when the lookup fails.
        """
        try:
            df = transform([strs], umap={})
            # Flatten the printed result, drop the header/index residue and
            # turn the province/city suffix characters into separators.
            flat = re.sub(r'\r|\n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception:
            # Best effort: a failed lookup must not leave the area undefined.
            return pro

        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, data_dic):
        """Download the notice described by *data_dic* and store the record.

        :param data_dic: listing entry providing ``NOTICE_ID``,
            ``NOTICE_TITLE``, ``PRCM_MODE_NAME`` and ``NEWWORK_DATE``;
            ``None`` is ignored.
        """
        if data_dic is None:
            return
        try:
            url = 'http://www.hncg.gov.cn/portal/protalAction!viewNoticeContent.action?noticeId={}'.format(
                data_dic['NOTICE_ID'])

            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        # Title, status and date come from the listing entry, not the page.
        title = data_dic['NOTICE_TITLE']
        status = data_dic['PRCM_MODE_NAME']
        publish_date = data_dic['NEWWORK_DATE']
        print(publish_date, title)

        # The whole document is stored as the content.
        html_ele = selector.xpath('//html')
        if not html_ele:
            return
        content_html = etree.tostring(html_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        result = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '湖南',
            'source': 'http://www.hncg.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': content_html,
            'create_time': self.now_time(),
            'zh_name': '湖南省公共资源交易中心',
            'en_name': 'Hunan Province Public resource',
        }
        print(result)
        self.save_to_mongo(result)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and print what was received.

        :param categoryId: unused; kept for signature compatibility.
        :param types: unused; kept for signature compatibility.
        :param page: page number substituted into the listing URL.
        """
        params = {
            'title': '',
            'origin': '',
            'inDates': '1',
            'channelId': '845',
            'ext': '',
            'beginTime': '',
            'endTime': '',
        }
        url = 'http://www.hnsggzy.com/queryContent_{}-jygk.jspx'.format(page)
        try:
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params).text
            print(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        # NOTE(review): ``response`` is the raw response text, so this loop
        # prints it one character at a time.  It looks like unfinished debug
        # code that was meant to iterate parsed entries and pass each one to
        # load_get_html; behaviour is kept until the payload format is
        # confirmed.
        for data_dic in response:
            print(data_dic)

    def init(self):
        """Consume queued entries until ``is_running`` reports False."""
        max_batch = 6
        while self.is_running():
            # Re-evaluated every round: fetch one at a time only while the
            # queue is short.  The original lowered the batch size once and
            # never raised it again.
            batch = 1 if self.rq.r_len() <= max_batch else max_batch
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl the configured listing pages."""
        # The original kept a commented-out task with all_page=1000,
        # presumably the size of a full crawl.
        task_li = [
            {
                'categoryId': '',
                'types': '',
                'all_page': 2
            },
        ]
        count = 2  # listing pages fetched concurrently
        for task in task_li:
            last_page = task['all_page']
            # Paging starts at 0 here, unlike the sibling crawlers.
            for page in range(0, last_page + 1, count):
                try:
                    # Clamp so a batch never requests pages past last_page.
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page + i)
                        for i in range(count) if page + i <= last_page
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
# Example #5
# 0
class GovBuy(object):
    '''Crawler for the Shanxi procurement e-mall (www.sxzfcg.cn).

    ``run`` walks the list pages and queues every detail URL that
    ``self.rq`` has not seen yet; ``init`` pops queued URLs and passes them
    to ``load_get_html``, which parses one announcement and stores it
    through ``self.coll``.
    '''

    # Seconds before an HTTP request is abandoned, so that a stalled server
    # cannot block a worker forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, request headers and URL queue.'''
        name = 'shanxi_sxzfcg_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.sxzfcg.cn/view.php?nav=61',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='shanxi_sxzfcg_cn_list1',
                             dbset='shanxi_sxzfcg_cn_set1')

    def is_running(self):
        '''Return False only when ``rq.r_len()`` is 0 and ``rq.rset_info()``
        is non-empty; True in every other case.'''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Store one parsed record through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # The return value is ignored; kept because the call may have side
        # effects inside the queue object.  TODO confirm and drop if not.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` label from free text.

        ``strs`` is passed to ``transform`` and the string form of the result
        is reduced to dash-separated parts.  Returns the first two parts
        joined by ``-`` when a non-empty second part exists, otherwise the
        first part (``pro`` when nothing is left after clean-up).  Returns
        None when the lookup raises.
        '''
        try:
            df = transform([strs], umap={})
            # Strip real whitespace characters; the previous pattern removed
            # the literal two-character sequences "/r" and "/n" instead.
            flattened = re.sub(r'\r|\n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-',
                              re.sub(r'省市区0', '', flattened))
        except Exception as e:
            print('get_area error:{}'.format(e))
            return None
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _title_and_status(parts):
        '''Return ``(title, status)`` from the text nodes of the title tag.

        The title is the joined text with all whitespace removed, or None
        when ``parts`` is empty.  The status is the trailing "…公告" phrase
        matched in the title, defaulting to '公告'.
        '''
        if not parts:
            return None, '公告'
        title = re.sub(r'\r|\n|\s', '', ''.join(parts))
        # NOTE(review): this is a character class (it also admits quotes and
        # commas), not an alternation of words; kept unchanged so stored
        # status values stay comparable with earlier runs.
        match = re.search(
            r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$', title)
        return title, (match.group() if match else '公告')

    @staticmethod
    def _extract_publish_date(text):
        '''Return the first date found in ``text``, or None when there is none.

        Eight consecutive digits are returned unchanged; a ``YYYY年M月D`` date
        has its 年/月 separators replaced with ``-``.
        '''
        match = re.search(r'(\d{8}|\d{4}年\d+月\d{1,2})', text)
        if match is None:
            return None
        return re.sub(r'年|月', '-', match.group())

    @staticmethod
    def _page_batches(first, last, size):
        '''Yield lists of at most ``size`` consecutive pages in [first, last].'''
        for start in range(first, last + 1, size):
            yield list(range(start, min(start + size, last + 1)))

    def load_get_html(self, url):
        '''Download one announcement page, parse it and store the record.

        Does nothing when ``url`` is None.  Download or parse errors are
        printed and the page is skipped.  Pages without the expected content
        table are skipped as well.
        '''
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            print(url)
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title, status = self._title_and_status(
            selector.xpath('//div[@valign="middle"]/h2/text()'))

        date_parts = selector.xpath('//td[@bgcolor="#E6E6E6"]//text()')
        # A page whose header cell carries no recognisable date yields None
        # instead of raising AttributeError on a failed match.
        publish_date = (self._extract_publish_date(''.join(date_parts))
                        if date_parts else None)

        table_ele = selector.xpath('//td[@class="c_pt"]/table/tr[2]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '山西',
            'source': 'http://www.sxzfcg.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '山西省省级政府采购中心',
            'en_name': 'Shanxi Government Procurement Center',
        }

        print(publish_date)

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        '''Fetch one list page and queue every detail URL not seen before.

        ``types`` is sent as the ``nav`` query parameter and ``page`` as
        ``page``.  ``categoryId`` is not used by this site.  Download or
        parse errors are printed and the page is skipped.
        '''
        try:
            params = (
                ('nav', types),
                ('page', page),
            )
            url = 'http://www.sxzfcg.cn/view.php'
            response = requests.get(
                url=url,
                headers=self.headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        for url in selector.xpath('//tr[@class="odd"]/td/a/@href'):
            urls = 'http://www.sxzfcg.cn/{}'.format(url)
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        '''Consume queued detail URLs until ``is_running`` reports False.

        Up to ``count`` greenlets are used per pass; when the queue holds
        ``count`` items or fewer a single greenlet is spawned.
        '''
        count = 2
        while self.is_running():
            # Decide the batch size per pass.  Overwriting ``count`` would
            # pin the worker at one greenlet for the rest of the run.
            batch = 1 if self.rq.r_len() <= count else count
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a consumer thread, then crawl the list pages of every task.

        A second consumer thread is started afterwards when more than ten
        URLs are still queued.
        '''
        threading.Thread(target=self.init).start()
        flag = 1
        # One task per ``nav`` channel, 61 through 69.
        task_li = [
            {'categoryId': '', 'types': str(nav), 'all_page': flag}
            for nav in range(61, 70)
        ]
        count = 2
        for task in task_li:
            # Batches are clamped to ``all_page`` so no page past the
            # configured last one is requested.
            for pages in self._page_batches(1, task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page) for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: delegates to ``run``.'''
        self.run()
# Example #6
# 0
class GovBuy(object):
    '''Crawler for the Yinchuan public resource trading site (www.ycsggzy.cn).

    ``run`` walks the list endpoint page by page; ``load_get`` extracts the
    article keys from each page and passes them straight to
    ``load_get_html``, which parses one article and stores it through
    ``self.coll``.
    '''

    # Seconds before an HTTP request is abandoned, so that a stalled server
    # cannot block a worker forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, request headers and URL queue.'''
        name = 'yinchuan_ycsggzy_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.ycsggzy.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer': 'http://www.ycsggzy.cn/morelink.html?type=12^&index=0',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='yinchuan_ycsggzy_cn_list1',
                             dbset='yinchuan_ycsggzy_cn_set1')

    def is_running(self):
        '''Return False only when ``rq.r_len()`` is 0 and ``rq.rset_info()``
        is non-empty; True in every other case.'''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Store one parsed record through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # The return value is ignored; kept because the call may have side
        # effects inside the queue object.  TODO confirm and drop if not.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` label from free text.

        ``strs`` is passed to ``transform`` and the string form of the result
        is reduced to dash-separated parts.  Returns the first two parts
        joined by ``-`` when a non-empty second part exists, otherwise the
        first part (``pro`` when nothing is left after clean-up).  Returns
        None when the lookup raises.
        '''
        try:
            df = transform([strs], umap={})
            # Strip real whitespace characters; the previous pattern removed
            # the literal two-character sequences "/r" and "/n" instead.
            flattened = re.sub(r'\r|\n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-',
                              re.sub(r'省市区0', '', flattened))
        except Exception as e:
            print('get_area error:{}'.format(e))
            return None
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _title_and_status(parts):
        '''Return ``(title, status)`` from the text nodes of the title tag.

        The title is the joined text with all whitespace removed, or None
        when ``parts`` is empty.  The status is the trailing "…公告" phrase
        matched in the title, defaulting to '公告'.
        '''
        if not parts:
            return None, '公告'
        title = re.sub(r'\r|\n|\s', '', ''.join(parts))
        # NOTE(review): this is a character class (it also admits quotes and
        # commas), not an alternation of words; kept unchanged so stored
        # status values stay comparable with earlier runs.
        match = re.search(
            r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$', title)
        return title, (match.group() if match else '公告')

    @staticmethod
    def _extract_publish_date(text):
        '''Return the first date found in ``text``, or None when there is none.

        Eight consecutive digits are returned unchanged; a ``YYYY/M/D`` date
        has its slashes replaced with ``-``.
        '''
        match = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})', text)
        if match is None:
            return None
        return re.sub(r'\/', '-', match.group())

    @staticmethod
    def _page_batches(first, last, size):
        '''Yield lists of at most ``size`` consecutive pages in [first, last].'''
        for start in range(first, last + 1, size):
            yield list(range(start, min(start + size, last + 1)))

    def load_get_html(self, key):
        '''Download the article identified by ``key``, parse and store it.

        Does nothing when ``key`` is None.  Download or parse errors are
        printed and the article is skipped.  Responses without a ``<ul>``
        element are skipped as well.
        '''
        if key is None:
            return
        url = 'http://www.ycsggzy.cn/Ajax/article.ashx'
        try:
            data = {
                'czlx': 'article',
                'cxcs': '12|0|{}'.format(key),
            }
            response = requests.post(
                url=url, headers=self.headers, data=data,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title, status = self._title_and_status(
            selector.xpath('//p[@class="a_title"]/text()'))

        date_parts = selector.xpath('//p[@class="box_p"]//text()')
        # An article whose info line carries no recognisable date yields
        # None instead of raising AttributeError on a failed match.
        publish_date = (self._extract_publish_date(''.join(date_parts))
                        if date_parts else None)

        table_ele = selector.xpath('//ul')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = {
            # The endpoint URL is shared by all articles, so the key is what
            # makes the id unique.
            '_id': self.hash_to_md5(url + key),
            'title': title,
            'status': status,
            'area_name': '宁夏-银川',
            'source': 'http://www.ycsggzy.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '银川公共资源交易中心',
            'en_name': 'Yinchuan City Public resource',
        }
        print(retult_dict)

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        '''Fetch one list page and process every article key found on it.

        ``categoryId``, ``types`` and ``page`` are sent in the ``cxcs`` form
        field together with a fixed trailing ``20``.  Download errors are
        printed and the page is skipped.
        '''
        try:
            data = [
                ('czlx', 'linetxt'),
                ('cxcs', '{}|{}|{}|20'.format(categoryId, types, page)),
            ]
            url = 'http://www.ycsggzy.cn/Ajax/morelink.ashx'
            response = requests.post(
                url=url, headers=self.headers, data=data,
                timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        for key in re.findall(r'key=(.*?)\"\>', response):
            self.load_get_html(key)

    def init(self):
        '''Consume queued keys until ``is_running`` reports False.

        Up to ``count`` greenlets are used per pass; when the queue holds
        ``count`` items or fewer a single greenlet is spawned.
        '''
        count = 3
        while self.is_running():
            # Decide the batch size per pass.  Overwriting ``count`` would
            # pin the worker at one greenlet for the rest of the run.
            batch = 1 if self.rq.r_len() <= count else count
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a consumer thread, then crawl the list pages of every task.

        A second consumer thread is started afterwards when the queue is not
        empty.
        '''
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'categoryId': '12',
                'types': '0',
                'all_page': 2
            },
            {
                'categoryId': '12',
                'types': '2',
                'all_page': 1
            },
            {
                'categoryId': '17',
                'types': '0',
                'all_page': 2
            },
            {
                'categoryId': '17',
                'types': '1',
                'all_page': 2
            },
            {
                'categoryId': '17',
                'types': '2',
                'all_page': 1
            },
        ]
        count = 2
        for task in task_li:
            # Batches are clamped to ``all_page`` so single-page tasks do
            # not also request page 2.
            for pages in self._page_batches(1, task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page) for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: delegates to ``run``.'''
        self.run()
# Example #7
# 0
class GovBuy(object):
    '''Crawler for the Suzhou public resource trading site
    (szzyjy.fwzx.suzhou.gov.cn).

    ``run`` walks the list pages and queues every detail URL that
    ``self.rq`` has not seen yet; ``init`` pops queued URLs and passes them
    to ``load_get_html``, which parses one announcement and stores it
    through ``self.coll``.
    '''

    # Seconds before an HTTP request is abandoned, so that a stalled server
    # cannot block a worker forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, request headers and URL queue.'''
        name = 'suzhou_szzyjy_fwzx_suzhou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        # NOTE(review): the Referer points at a different host
        # (ggzy.hefei.gov.cn); confirm whether that is intentional.
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'X-DevTools-Emulate-Network-Conditions-Client-Id': '06AB3D9C05E9FDAB1EDDAD36BA60296F',
            'Referer': 'http://ggzy.hefei.gov.cn/jyxx/002001/002001001/3.html',
        }

        self.rq = Rdis_Queue(host='localhost', dblist='suzhou_szzyjy_fwzx_suzhou_gov_cn_list1', dbset='suzhou_szzyjy_fwzx_suzhou_gov_cn_set1')

    def is_running(self):
        '''Return False only when ``rq.r_len()`` is 0 and ``rq.rset_info()``
        is non-empty; True in every other case.'''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Store one parsed record through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # The return value is ignored; kept because the call may have side
        # effects inside the queue object.  TODO confirm and drop if not.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` label from free text.

        ``strs`` is passed to ``transform`` and the string form of the result
        is reduced to dash-separated parts.  Returns the first two parts
        joined by ``-`` when a non-empty second part exists, otherwise the
        first part (``pro`` when nothing is left after clean-up).  Returns
        None when the lookup raises.
        '''
        try:
            df = transform([strs], umap={})
            # Strip real whitespace characters; the previous pattern removed
            # the literal two-character sequences "/r" and "/n" instead.
            flattened = re.sub(r'\r|\n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flattened))
        except Exception as e:
            print('get_area error:{}'.format(e))
            return None
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _title_and_status(parts):
        '''Return ``(title, status)`` from the text nodes of the title tag.

        The title is the joined text with all whitespace removed, or None
        when ``parts`` is empty.  The status is the trailing "…公告" phrase
        matched in the title, defaulting to '公告'.
        '''
        if not parts:
            return None, '公告'
        title = re.sub(r'\r|\n|\s', '', ''.join(parts))
        # NOTE(review): this is a character class (it also admits quotes and
        # commas), not an alternation of words; kept unchanged so stored
        # status values stay comparable with earlier runs.
        match = re.search(r'["招标","中标","预","采购","更正","结果","补充","询价"]{1,2}公告$', title)
        return title, (match.group() if match else '公告')

    @staticmethod
    def _extract_publish_date(text):
        '''Return the first date found in ``text``, or None when there is none.

        Eight consecutive digits are returned unchanged; a ``YYYY/M/D`` date
        has its slashes replaced with ``-``.
        '''
        match = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})', text)
        if match is None:
            return None
        return re.sub(r'\/', '-', match.group())

    @staticmethod
    def _page_batches(first, last, size):
        '''Yield lists of at most ``size`` consecutive pages in [first, last].'''
        for start in range(first, last + 1, size):
            yield list(range(start, min(start + size, last + 1)))

    def load_get_html(self, url):
        '''Download one announcement page, parse it and store the record.

        Does nothing when ``url`` is None.  Download or parse errors are
        printed and the page is skipped.  Pages without the expected content
        container are skipped as well.
        '''
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title, status = self._title_and_status(selector.xpath('//h2[@class="word-title"]/text()'))

        date_parts = selector.xpath('//h4[@class="word-info"]//text()')
        # A page whose info line carries no recognisable date yields None
        # instead of raising AttributeError on a failed match.
        publish_date = self._extract_publish_date(''.join(date_parts)) if date_parts else None
        print(publish_date, title)

        table_ele = selector.xpath('//div[@class="border"]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '江苏-苏州',
            'source': 'http://szzyjy.fwzx.suzhou.gov.cn',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '苏州市公共资源交易中心',
            'en_name': 'Suzhou City Public resource',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page, retries=3):
        '''Fetch one list page and queue every detail URL not seen before.

        ``types`` selects the channel path and ``page`` is sent as the
        ``paging`` query parameter.  ``categoryId`` is only forwarded on
        retry.  Download errors are printed and the page is skipped.  When
        the link query itself fails the page is fetched again after three
        seconds, at most ``retries`` more times.
        '''
        try:
            params = (
                ('paging', page),
            )
            url = 'http://szzyjy.fwzx.suzhou.gov.cn/Front/jyzx/{}/'.format(types)
            response = requests.get(url=url, headers=self.headers, params=params, timeout=self.REQUEST_TIMEOUT).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        try:
            url_li = selector.xpath('//*[@class="mr-content"]/div[1]/table/tr/td[1]/a/@href')
        except Exception as e:
            print('load_get error:{}'.format(e))
            if retries > 0:
                time.sleep(3)
                self.load_get(categoryId, types, page, retries - 1)
            # The retry has handled the page (or given up); falling through
            # would iterate over a ``url_li`` that was never assigned.
            return

        for url in url_li:
            urls = 'http://szzyjy.fwzx.suzhou.gov.cn' + url
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        '''Consume queued detail URLs until ``is_running`` reports False.

        Up to ``count`` greenlets are used per pass; when the queue holds
        ``count`` items or fewer a single greenlet is spawned.
        '''
        count = 2
        while self.is_running():
            # Decide the batch size per pass.  Overwriting ``count`` would
            # pin the worker at one greenlet for the rest of the run.
            batch = 1 if self.rq.r_len() <= count else count
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(batch)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start a consumer thread, then crawl the list pages of every task.

        A second consumer thread is started afterwards when more than ten
        URLs are still queued.
        '''
        threading.Thread(target=self.init).start()
        task_li = [
                {'categoryId':'', 'types':'002004/002004001','all_page': 2},
                {'categoryId':'', 'types':'002004/002004002','all_page': 2},
                {'categoryId':'', 'types':'002004/002004003','all_page': 2},
                {'categoryId':'', 'types':'002004/002004004','all_page': 1},
                {'categoryId':'', 'types':'002004/002004005','all_page': 2},
                {'categoryId':'', 'types':'002004/002004006','all_page': 1},
            ]
        count = 1
        for task in task_li:
            for pages in self._page_batches(1, task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, task['categoryId'], task['types'], page) for page in pages]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: delegates to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Guiyang government procurement site (www.gygp.gov.cn).

    ``run`` walks the list pages; ``load_get`` extracts the detail links
    from each page and passes them straight to ``load_get_html``, which
    parses one announcement and stores it through ``self.coll``.
    '''

    # Seconds before an HTTP request is abandoned, so that a stalled server
    # cannot block a worker forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        '''Create the storage handle, request headers, session and queue.'''
        name = 'guiyang_gygp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gygp.gov.cn/list-37-1.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='guiyang_list1',
                             dbset='guiyang_set1')

    def is_running(self):
        '''Return False only when ``rq.r_len()`` is 0 and ``rq.rset_info()``
        is non-empty; True in every other case.'''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Store one parsed record through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # The return value is ignored; kept because the call may have side
        # effects inside the queue object.  TODO confirm and drop if not.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` label from free text.

        ``strs`` is passed to ``transform`` and the string form of the result
        is reduced to dash-separated parts.  Returns the first two parts
        joined by ``-`` when a non-empty second part exists, otherwise the
        first part (``pro`` when nothing is left after clean-up).  Returns
        None when the lookup raises.
        '''
        try:
            df = transform([strs], umap={})
            # Strip real whitespace characters; the previous pattern removed
            # the literal two-character sequences "/r" and "/n" instead.
            flattened = re.sub(r'\r|\n|\s', '', str(df))
            area_str = re.sub(r'省|市', '-',
                              re.sub(r'省市区0', '', flattened))
        except Exception as e:
            print('get_area error:{}'.format(e))
            return None
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _title_and_status(parts):
        '''Return ``(title, status)`` from the text nodes of the title tag.

        The title is the first text node with all whitespace removed, or
        None when ``parts`` is empty.  The status is the two CJK characters
        plus '公告' that end the title, defaulting to '公告'.
        '''
        if not parts:
            return None, '公告'
        title = re.sub(r'\r|\n|\s', '', parts[0])
        match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
        return title, (match.group() if match else '公告')

    @staticmethod
    def _extract_publish_date(text):
        '''Return the first ``YYYY-M-D`` date in ``text``, or None.'''
        match = re.search(r'(\d{4}\-\d+\-\d{1,2})', text)
        return match.group() if match else None

    @staticmethod
    def _page_batches(first, last, size):
        '''Yield lists of at most ``size`` consecutive pages in [first, last].'''
        for start in range(first, last + 1, size):
            yield list(range(start, min(start + size, last + 1)))

    def load_get_html(self, url, retries=3):
        '''Download one announcement page, parse it and store the record.

        Does nothing when ``url`` is None.  A failed download or parse is
        printed and tried again at most ``retries`` more times.  Pages
        without the expected content container are skipped.
        '''
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers,
                timeout=self.REQUEST_TIMEOUT).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            # Bounded retry: an unbounded one recursed until RecursionError
            # whenever a URL failed persistently.
            if retries > 0:
                self.load_get_html(url, retries - 1)
            return

        title, status = self._title_and_status(
            selector.xpath('//div[@class="biaoti"]/text()'))

        date_parts = selector.xpath('//div[@class="fbsj"]/span/text()')
        # A page whose date line carries no recognisable date yields None
        # instead of raising AttributeError on a failed match.
        publish_date = (self._extract_publish_date(''.join(date_parts))
                        if date_parts else None)

        table_ele = selector.xpath('//div[@class="content_box"]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '贵州-贵阳',
            'source': 'http://www.gygp.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '贵阳市政府采购网',
            'en_name': 'Guiyang City Government Procurement',
        }

        self.save_to_mongo(retult_dict)

    def load_get(self, types, page, retries=3):
        '''Fetch one list page and process every detail link found on it.

        The page URL is built from the ``types`` prefix and the ``page``
        number.  A failed download or parse is printed and tried again at
        most ``retries`` more times.
        '''
        try:
            url = 'http://www.gygp.gov.cn/' + types + str(page) + '.html'
            response = requests.get(url=url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            # Bounded retry, for the same reason as in ``load_get_html``.
            if retries > 0:
                self.load_get(types, page, retries - 1)
            return

        print('第{}页'.format(page))
        url_li = selector.xpath(
            '//div[@class="right_top_content"]/ul/li/span/a/@href')
        for url in url_li:
            urls = 'http://gyzfcg.gyggzy.cn' + url
            self.load_get_html(urls)

    def init(self):
        '''Consume queued detail URLs until ``is_running`` reports False.

        Up to ``count`` greenlets are used per pass; when the queue holds
        ``count`` items or fewer a single greenlet is spawned.
        '''
        count = 2
        while self.is_running():
            # Decide the batch size per pass.  Overwriting ``count`` would
            # pin the worker at one greenlet for the rest of the run.
            batch = 1 if self.rq.r_len() <= count else count
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(batch)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Crawl the list pages of every task, ``count`` pages at a time.'''
        task_li = [
            {
                'type': 'list-12-',
                'all_page': 3
            },
            {
                'type': 'list-13-',
                'all_page': 3
            },
            {
                'type': 'list-27-',
                'all_page': 2
            },
            {
                'type': 'list-36-',
                'all_page': 1
            },
            {
                'type': 'list-28-',
                'all_page': 1
            },
            {
                'type': 'list-37-',
                'all_page': 1
            },
        ]
        count = 2
        for task in task_li:
            # Batches are clamped to ``all_page`` so that no page past the
            # configured last one is requested.
            for pages in self._page_batches(1, task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, task['type'], page)
                        for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point: delegates to ``run``.'''
        self.run()
# Example #9
# 0
class GovBuy(object):
    """Crawler for the Heilongjiang Government Procurement site (www.hljcg.gov.cn).

    ``run`` walks the configured listing categories with ``load_get``; every
    announcement link found on a listing page is downloaded and parsed by
    ``load_get_html`` and the resulting record is passed to
    ``self.coll.saves``.  ``init`` is an optional consumer for items left in
    the Redis work list.
    """
    def __init__(self):
        """Create the storage handle, the request headers and the Redis queue."""
        name = 'heilongjiang_hljcg_gov_cn'
        self.coll = StorageSetting(name)

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.hljcg.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # Previously contained a stray '^' before '!'; this is the same
            # URL that load_get posts to.
            'Referer': 'http://www.hljcg.gov.cn/xwzs!queryXwxxqx.action',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='heilongjiang_list1',
                             dbset='heilongjiang_set1')

    def is_running(self):
        """Return False once the work list is empty and the seen-set is not.

        The seen-set is only inspected when the list is empty.
        """
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Hand one parsed record to the storage backend."""
        self.coll.saves(result_dic)
        # Result is unused; the call is kept so the sequence of queue
        # look-ups stays the same as before.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name such as ``province-city`` from free text.

        :param pro: province name used as the fallback result
        :param strs: text (normally the title) handed to ``transform``
        :return: ``'province-city'`` when a city is recognised, the first
            recognised component otherwise, or ``pro`` when nothing is
            recognised or the lookup fails
        """
        try:
            df = transform([strs], umap={})
            # Flatten the printed frame, drop the header/index residue and
            # turn the province/city suffix characters into separators.
            flat = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception:
            # Previously this path silently yielded None (e.g. for a missing
            # title); fall back to the known province instead.
            return pro
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _status_of(title):
        """Return the trailing '<two CJK chars>公告' of ``title``, else '公告'."""
        found = re.search(r'[\u4e00-\u9fa5]{2}公告$', title or '')
        return found.group() if found else '公告'

    @staticmethod
    def _first_date(texts):
        """Return the first ``YYYY-M-D`` date in the joined ``texts`` or None."""
        found = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(texts))
        return found.group() if found else None

    @staticmethod
    def _page_batches(first_page, last_page, batch_size):
        """Split ``first_page..last_page`` (inclusive) into lists of at most
        ``batch_size`` consecutive page numbers."""
        pages = list(range(first_page, last_page + 1))
        return [
            pages[start:start + batch_size]
            for start in range(0, len(pages), batch_size)
        ]

    def load_get_html(self, url, retries=3):
        """Download one announcement page, parse it and store the record.

        :param url: absolute detail-page URL; ``None`` is ignored
        :param retries: how many extra attempts to make after a failed
            download/parse (the earlier unbounded self-recursion could loop
            forever on a permanently failing URL)
        """
        if url is None:
            return
        selector = None
        for _ in range(retries + 1):
            try:
                response = requests.get(
                    url=url, headers=self.headers,
                    allow_redirects=False).content.decode('utf-8')
                selector = etree.HTML(response)
            except Exception as e:
                print('laod_get_html error:{}'.format(e))
            else:
                break
        if selector is None:
            # Every attempt failed, or the document could not be parsed.
            return

        title_nodes = selector.xpath('//div[@class="mtt"]/p[1]/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', title_nodes[0])
        else:
            title = None
        status = self._status_of(title)

        # A missing or unparsable date now yields None instead of raising.
        publish_date = self._first_date(
            selector.xpath('//div[@class="mtt"]/p[2]/text()'))
        area_name = self.get_area('黑龙江', title)

        body_nodes = selector.xpath('//div[@class="xxej"]')
        if not body_nodes:
            # No announcement body on the page: nothing worth storing.
            return
        content_html = etree.tostring(body_nodes[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = area_name
        retult_dict['source'] = 'http://www.hljcg.gov.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '黑龙江省政府采购网'
        retult_dict['en_name'] = 'Heilongjiang Government Procurement'

        self.save_to_mongo(retult_dict)

    def load_get(self, lbbh, page, retries=3):
        """Fetch one listing page and process every announcement it links to.

        :param lbbh: category code posted as ``lbbh`` / ``xwzsPage.LBBH``
        :param page: 1-based page number
        :param retries: extra attempts after a failed request (bounded
            replacement for the earlier unbounded self-recursion)
        """
        data = [
            ('xwzsPage.pageNo', page),
            ('xwzsPage.pageSize', '20'),
            ('xwzsPage.pageCount', '1293'),
            ('lbbh', lbbh),
            ('xwzsPage.LBBH', lbbh),
            ('xwzsPage.zlbh', ''),
            ('xwzsPage.GJZ', ''),
        ]
        url = 'http://www.hljcg.gov.cn/xwzs!queryXwxxqx.action'
        selector = None
        for _ in range(retries + 1):
            try:
                response = requests.post(url=url, headers=self.headers,
                                         data=data).content.decode('utf-8')
                selector = etree.HTML(response)
            except Exception as e:
                print('load_get error:{}'.format(e))
            else:
                break
        if selector is None:
            return

        print('第{}页'.format(page))
        onclick_li = selector.xpath(
            '//div[@class="yahoo"]/div/span[1]/a/@onclick')
        for onclick in onclick_li:
            hrefs = re.findall(r"href='(.*?)'", onclick)
            if not hrefs:
                # onclick handler without an href target: skip it rather
                # than abort the rest of the page with an IndexError.
                continue
            # Detail pages are processed inline; the Redis queue is not used
            # on this path.
            self.load_get_html('http://www.hljcg.gov.cn' + hrefs[0])

    def init(self):
        """Drain the Redis work list through ``load_get_html``.

        Starts with batches of 6 greenlets; once the list length is seen at
        or below the batch size the batch size drops to 1 for good.
        """
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl the configured categories, then drain any queued leftovers."""
        # Full-history page counts noted by the original author:
        # lbbh 4 -> 1293, 30 -> 26, 99 -> 112, 98 -> 18, 5 -> 668.
        task_li = [
            {'lbbh': '4', 'all_page': 2},
            {'lbbh': '30', 'all_page': 1},
            {'lbbh': '99', 'all_page': 1},
            {'lbbh': '98', 'all_page': 1},
            {'lbbh': '5', 'all_page': 2},
        ]
        count = 3
        for task in task_li:
            lbbh = task['lbbh']
            # Batches never go past all_page (the old ``page + i`` form
            # always requested ``count`` pages, e.g. pages 2 and 3 of 1).
            for batch in self._page_batches(1, task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, lbbh, page)
                        for page in batch
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: run the crawl."""
        self.run()
class GovBuy(object):
    """Crawler for the Guangdong procurement e-mall news site (www.gpcgd.com).

    ``load_get`` collects news ids from listing pages and pushes unseen ones
    onto the Redis work list; ``init`` consumes that list and lets
    ``load_get_html`` download, parse and store each news item.
    """
    def __init__(self):
        """Create the storage handle, the request headers and the Redis queue."""
        name = 'guangdong_gpcgd_com'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://www.gpcgd.com',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # Previously contained a stray '^' before '!'; this is the same
            # URL that load_get posts to.
            'Referer': 'http://www.gpcgd.com/gpcgd/portal/portal-news!list',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost', dblist='guangdong_gpcgd_com_list1', dbset='guangdong_gpcgd_com_set1')

    def is_running(self):
        """Return False once the work list is empty and the seen-set is not.

        The seen-set is only inspected when the list is empty.
        """
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Hand one parsed record to the storage backend."""
        self.coll.saves(result_dic)
        # Result is unused; the call is kept so the sequence of queue
        # look-ups stays the same as before.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name such as ``province-city`` from free text.

        :param pro: province name used as the fallback result
        :param strs: text handed to ``transform``
        :return: ``'province-city'`` when a city is recognised, the first
            recognised component otherwise, or ``pro`` when nothing is
            recognised or the lookup fails
        """
        try:
            df = transform([strs], umap={})
            # Flatten the printed frame, drop the header/index residue and
            # turn the province/city suffix characters into separators.
            flat = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception:
            # Previously this path silently yielded None.
            return pro
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _status_of(title):
        """Return the announcement kind at the end of ``title``.

        Recognised kinds are the listed keywords followed by '公告'.  The
        earlier pattern wrapped the quoted keywords in ``[...]``, i.e. a
        character class, so single characters (and even '"' or ',') in front
        of '公告' were accepted, producing fragments like '果公告'.
        """
        found = re.search(
            r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$', title or '')
        return found.group() if found else '公告'

    @staticmethod
    def _first_date(texts):
        """Return the first ``YYYY-M-D`` date in the joined ``texts`` or None."""
        found = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(texts))
        return found.group() if found else None

    @staticmethod
    def _page_batches(first_page, last_page, batch_size):
        """Split ``first_page..last_page`` (inclusive) into lists of at most
        ``batch_size`` consecutive page numbers."""
        pages = list(range(first_page, last_page + 1))
        return [
            pages[start:start + batch_size]
            for start in range(0, len(pages), batch_size)
        ]

    def load_get_html(self, pid):
        """Download one news item by id, parse it and store the record.

        :param pid: news id taken from the work list; ``None`` is ignored
        """
        if pid is None:
            return
        url = 'http://www.gpcgd.com/gpcgd/portal/portal-news!detailNews?portalNews.id={}'.format(pid)
        try:
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            print(url)
            selector = etree.HTML(response)
        except Exception as e:
            # No retry here: a failed item is reported and dropped.
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            return

        title_parts = selector.xpath('//div[@class="pub_title"]/h1/text()')
        if title_parts:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_parts))
        else:
            title = None
        status = self._status_of(title)

        # A missing or unparsable date now yields None instead of raising.
        publish_date = self._first_date(
            selector.xpath('//div[@class="pub_note"]//text()'))
        print(publish_date, title)

        body_nodes = selector.xpath('//div[@class="pub_cont_details"]')
        if not body_nodes:
            # No article body on the page: nothing worth storing.
            return
        content_html = etree.tostring(body_nodes[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = '广东'
        retult_dict['source'] = 'http://www.gpcgd.com/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '广东省政府采购中心'
        retult_dict['en_name'] = 'Guangdong Government Procurement Center'

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one listing page and queue every news id not seen before.

        :param categoryId: accepted for interface compatibility; not sent
        :param types: value posted as ``portalNews.typeId``
        :param page: 1-based page number posted as ``pageNum``
        """
        data = [
            ('portalNews.typeId', types),
            ('pageNum', page),
        ]
        url = 'http://www.gpcgd.com/gpcgd/portal/portal-news!list'
        try:
            response = requests.post(url=url, headers=self.headers, data=data).content.decode('utf-8')
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        # The ids live in inline onclick handlers, so they are pulled from
        # the raw markup rather than via XPath.
        for pid in re.findall(r'onclick\=\"detailNews\(\'(.*?)\'\)\"', response):
            if not self.rq.in_rset(pid):
                self.rq.add_to_rset(pid)
                self.rq.pull_to_rlist(pid)

    def init(self):
        """Drain the Redis work list through ``load_get_html``.

        Starts with batches of 2 greenlets; once the list length is seen at
        or below the batch size the batch size drops to 1 for good.
        """
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer, fill the work list from the listing pages and
        make sure whatever was queued actually gets consumed."""
        consumer = threading.Thread(target=self.init)
        consumer.start()
        task_li = [
            {'categoryId': '', 'types': '90011', 'all_page': 1},
            {'categoryId': '', 'types': '90013', 'all_page': 1},
            {'categoryId': '', 'types': '40011', 'all_page': 2},
            {'categoryId': '', 'types': '40012', 'all_page': 2},
            {'categoryId': '', 'types': '40013', 'all_page': 1},
            {'categoryId': '', 'types': '40014', 'all_page': 1},
            {'categoryId': '', 'types': '40015', 'all_page': 1},
            {'categoryId': '', 'types': '40016', 'all_page': 1},
        ]
        count = 2
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            # Batches never go past all_page (the old ``page + i`` form
            # always requested ``count`` pages, e.g. page 2 of 1).
            for batch in self._page_batches(1, task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, categoryId, types, page) for page in batch]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        pending = self.rq.r_len()
        # As before, add a second consumer for a large backlog.  In addition,
        # restart one when the first consumer has already exited (it stops
        # immediately if it saw an empty list with a non-empty seen-set);
        # otherwise a backlog of 10 items or fewer was left unprocessed.
        if pending > 10 or (pending > 0 and not consumer.is_alive()):
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: run the crawl."""
        self.run()
Exemple #11
0
class GovBuy(object):
    """Crawler for the Nanchang Government Procurement site (www.ncszfcg.gov.cn).

    ``load_get`` fetches a listing page and passes each ``<li>`` entry (as an
    HTML string) to ``load_get_html``, which takes title and date from the
    entry itself, downloads the linked detail page and stores the record.
    """
    def __init__(self):
        """Create the storage handle, headers, HTTP session and Redis queue."""
        name = 'nanchang_ncszfcg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.ncszfcg.gov.cn/index2018.cfm',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost', dblist='nanchang_list1', dbset='nanchang_set1')

    def is_running(self):
        """Return False once the work list is empty and the seen-set is not.

        The seen-set is only inspected when the list is empty.
        """
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Hand one parsed record to the storage backend."""
        self.coll.saves(result_dic)
        # Result is unused; the call is kept so the sequence of queue
        # look-ups stays the same as before.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name such as ``province-city`` from free text.

        :param pro: province name used as the fallback result
        :param strs: text handed to ``transform``
        :return: ``'province-city'`` when a city is recognised, the first
            recognised component otherwise, or ``pro`` when nothing is
            recognised or the lookup fails
        """
        try:
            df = transform([strs], umap={})
            # Flatten the printed frame, drop the header/index residue and
            # turn the province/city suffix characters into separators.
            flat = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception:
            # Previously this path silently yielded None.
            return pro
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _status_of(title):
        """Return the trailing '<two CJK chars>公告' of ``title``, else '公告'."""
        found = re.search(r'[\u4e00-\u9fa5]{2}公告$', title or '')
        return found.group() if found else '公告'

    @staticmethod
    def _first_date(texts):
        """Return the first ``YYYY-M-D`` date in the joined ``texts`` or None."""
        found = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(texts))
        return found.group() if found else None

    @staticmethod
    def _page_batches(first_page, last_page, batch_size):
        """Split ``first_page..last_page`` (inclusive) into lists of at most
        ``batch_size`` consecutive page numbers."""
        pages = list(range(first_page, last_page + 1))
        return [
            pages[start:start + batch_size]
            for start in range(0, len(pages), batch_size)
        ]

    def load_get_html(self, li):
        """Process one listing entry: fetch its detail page and store it.

        :param li: HTML source of a single ``<li>`` listing entry; ``None``
            is ignored
        """
        if li is None:
            return
        try:
            selector_li = etree.HTML(str(li))
            url = 'http://www.ncszfcg.gov.cn/' + selector_li.xpath('//li/a/@href')[0]
            print(url)
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            # Covers entries without a link as well as network/parse errors.
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            return

        # Title and date come from the listing entry, not the detail page.
        title_attr = selector_li.xpath('//li/a/@title')
        title = title_attr[0] if title_attr else None
        status = self._status_of(title)

        # A missing or unparsable date now yields None instead of raising.
        publish_date = self._first_date(selector_li.xpath('//li/div/text()'))
        print(publish_date, title)

        body_nodes = selector.xpath('//div[@class="ewb-detail-box"]')
        if not body_nodes:
            # Detail box missing: skip the entry instead of failing with an
            # IndexError on ``[0]``.
            return
        content_html = etree.tostring(body_nodes[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = '江西-南昌'
        retult_dict['source'] = 'http://www.ncszfcg.gov.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '南昌市政府采购网'
        retult_dict['en_name'] = 'Nanchang Government Procurement'

        self.save_to_mongo(retult_dict)

    def load_get(self, page, retries=3):
        """Fetch one listing page and process each of its entries.

        :param page: 1-based page number sent as the ``Page`` parameter
        :param retries: extra attempts after a failed request; replaces the
            earlier unbounded self-recursion behind a bare ``except``
        """
        params = {
            'sid': '100002',
            'Page': page,
        }
        url = 'http://www.ncszfcg.gov.cn/more2018.cfm'
        selector = None
        for _ in range(retries + 1):
            try:
                response = requests.get(url=url, headers=self.headers, params=params).text
                selector = etree.HTML(response)
            except Exception as e:
                # The error itself is now reported, not just the fact.
                print('load_post error:{}'.format(e))
            else:
                break
        if selector is None:
            return

        print('第{}页'.format(page))
        for ul_li in selector.xpath('//ul[@class="listbox"]/li'):
            li = etree.tostring(ul_li, pretty_print=True, encoding='utf-8', method='html').decode('utf-8')
            # Entries are processed inline; the Redis queue is not used on
            # this path.
            self.load_get_html(li)

    def init(self):
        """Drain the Redis work list through ``load_get_html``.

        Starts with batches of 8 greenlets; once the list length is seen at
        or below the batch size the batch size drops to 1 for good.
        """
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl the configured number of listing pages in batches of 3."""
        # The Redis consumer (self.init) is not started: load_get handles
        # every entry itself.
        task_li = [
            # Full history was 909 pages according to the original author.
            {'all_page': 3},
        ]
        count = 3
        for task in task_li:
            # Batches are clamped to all_page so a page count that is not a
            # multiple of ``count`` does not request non-configured pages.
            for batch in self._page_batches(1, task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, page) for page in batch]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point: run the crawl."""
        self.run()
class GovBuy(object):
    """Crawler for the Jiangsu Government Procurement site (www.ccgp-jiangsu.gov.cn).

    ``load_get`` collects detail-page URLs from listing pages and pushes
    unseen ones onto the Redis work list; ``init`` consumes that list and
    lets ``load_get_html`` download, parse and store each announcement.
    """

    _BASE_URL = 'http://www.ccgp-jiangsu.gov.cn/cgxx/'
    # Category roots that are crawled directly.
    _CATEGORIES = ('cggg', 'cgyg', 'gzgg', 'cjgg', 'htgg', 'xqyj', 'ysgg')
    # Categories that additionally have one sub-listing per region.
    _REGIONAL_CATEGORIES = ('cggg', 'gzgg', 'cjgg')
    _REGIONS = ('shengji', 'suzhou', 'nanjing', 'wuxi', 'changzhou',
                'zhenjiang', 'nantong', 'taizhou', 'yangzhou', 'yancheng',
                'huaian', 'suqian', 'lianyungang', 'xuzhou')

    def __init__(self):
        """Create the storage handle, headers, HTTP session and Redis queue."""
        name = 'jiangsu_ccgp-jiangsu_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.ccgp-jiangsu.gov.cn/cgxx/cggg/index_1.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='jiangsu_list1',
                             dbset='jiangsu_set1')

    def is_running(self):
        """Return False once the work list is empty and the seen-set is not.

        The seen-set is only inspected when the list is empty.
        """
        if self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0:
            return False
        return True

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Hand one parsed record to the storage backend."""
        self.coll.saves(result_dic)
        # Result is unused; the call is kept so the sequence of queue
        # look-ups stays the same as before.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name such as ``province-city`` from free text.

        :param pro: province name used as the fallback result
        :param strs: text (normally the title) handed to ``transform``
        :return: ``'province-city'`` when a city is recognised, the first
            recognised component otherwise, or ``pro`` when nothing is
            recognised or the lookup fails
        """
        try:
            df = transform([strs], umap={})
            # Flatten the printed frame, drop the header/index residue and
            # turn the province/city suffix characters into separators.
            flat = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception:
            # Previously this path silently yielded None (e.g. for a missing
            # title); fall back to the known province instead.
            return pro
        area_li = area_str.split('-') if area_str != '' else [pro]
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _status_of(title):
        """Return the trailing '<two CJK chars>公告' of ``title``, else '公告'."""
        found = re.search(r'[\u4e00-\u9fa5]{2}公告$', title or '')
        return found.group() if found else '公告'

    @staticmethod
    def _first_date(texts):
        """Return the first ``YYYY-M-D`` date in the joined ``texts`` or None."""
        found = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(texts))
        return found.group() if found else None

    @staticmethod
    def _page_batches(first_page, last_page, batch_size):
        """Split ``first_page..last_page`` (inclusive) into lists of at most
        ``batch_size`` consecutive page numbers."""
        pages = list(range(first_page, last_page + 1))
        return [
            pages[start:start + batch_size]
            for start in range(0, len(pages), batch_size)
        ]

    @staticmethod
    def _page_url(base_url, page):
        """Return the listing URL for ``page``; page 0 is the bare index."""
        if page == 0:
            return base_url
        return base_url + 'index_' + str(page) + '.html'

    @classmethod
    def _listing_urls(cls):
        """Return every listing root to crawl, category roots first.

        The hand-written list this replaces repeated ``cggg/`` in the slot
        where the ``gzgg``/``cjgg`` lists have ``taizhou/``; the Taizhou
        sub-listing of ``cggg`` is therefore included here instead of the
        duplicate.
        """
        urls = [cls._BASE_URL + cat + '/' for cat in cls._CATEGORIES]
        for cat in cls._REGIONAL_CATEGORIES:
            urls.extend(
                cls._BASE_URL + cat + '/' + region + '/'
                for region in cls._REGIONS)
        return urls

    def load_get_html(self, url):
        """Download one announcement page, parse it and store the record.

        :param url: absolute detail-page URL; ``None`` is ignored
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            # No retry here: a failed item is reported and dropped.
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            return

        title_nodes = selector.xpath('//div[@class="dtit"]/h1/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', title_nodes[0])
        else:
            title = None
        status = self._status_of(title)

        # A missing or unparsable date now yields None instead of raising.
        publish_date = self._first_date(
            selector.xpath('//div[@class="detail_bz"]/span/text()'))
        area_name = self.get_area('江苏', title)

        body_nodes = selector.xpath('//div[@class="detail"]')
        if not body_nodes:
            # No announcement body on the page: nothing worth storing.
            return
        content_html = etree.tostring(body_nodes[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = area_name
        retult_dict['source'] = 'http://www.ccgp-jiangsu.gov.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = content_html
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '江苏政府采购网'
        retult_dict['en_name'] = 'Jiangsu Government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, base_url, page):
        """Fetch one listing page and queue every detail URL not seen before.

        :param base_url: listing root ending in '/'
        :param page: 0 for the index page, otherwise the ``index_<n>.html``
            page number
        """
        url = self._page_url(base_url, page)
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            # The error itself is now reported, not just the fact.
            print('load_post error:{}'.format(e))
            return
        if selector is None:
            return

        url_li = selector.xpath('//div[@class="list_list"]/ul/li/a/@href')
        if url_li == []:
            # Some listings use a differently named container.
            url_li = selector.xpath(
                '//div[@class="list_list02"]/ul/li/a/@href')

        for href in url_li:
            # Links are relative to the listing root ('./...').
            urls = base_url + href.replace('./', '')
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        """Drain the Redis work list through ``load_get_html``.

        Starts with batches of 8 greenlets; once the list length is seen at
        or below the batch size the batch size drops to 1 for good.
        """
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer, fill the work list from all listing roots and
        make sure whatever was queued actually gets consumed."""
        consumer = threading.Thread(target=self.init)
        consumer.start()
        flag = 2  # last page index fetched per listing (pages 0..flag)
        task_li = [
            {'url': listing_url, 'all_page': flag}
            for listing_url in self._listing_urls()
        ]
        count = 3
        for task in task_li:
            base_url = task['url']
            # Batches are clamped to all_page, so changing ``flag`` no longer
            # makes the crawl request pages beyond it.
            for batch in self._page_batches(0, task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, base_url, page)
                        for page in batch
                    ]
                    gevent.joinall(spawns)
                    print('第{}页'.format(batch[0]))
                except Exception as e:
                    print(e)

        # The consumer exits immediately when it starts with an empty list
        # and a non-empty seen-set; restart it so the URLs queued above are
        # not left waiting for the next run.
        if self.rq.r_len() > 0 and not consumer.is_alive():
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point: run the crawl."""
        self.run()
class GovBuy(object):
    """Scraper for the Changchun government procurement site (www.cczfcg.gov.cn).

    ``run`` walks the listing pages through ``load_get``; every notice link
    found there is downloaded and parsed by ``load_get_html`` and the
    resulting record is handed to ``self.coll.saves``.
    """

    def __init__(self):
        """Set up the storage handle, HTTP session/headers and the Redis queue."""
        name = 'changchun_cczfcg_gov_cn'
        self.coll = StorageSetting(name)

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'X-Requested-With': 'ShockwaveFlash/30.0.0.134',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost', dblist='changchun_list1', dbset='changchun_set1')

    def is_running(self):
        """Return False once the queue list is empty while the seen-set is not.

        In every other situation the scraper is considered to be running.
        """
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record through the storage handle."""
        self.coll.saves(result_dic)
        # Return value is discarded; the call is kept from the original code.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name from ``strs``, falling back to ``pro``.

        :param pro: default area returned when nothing is recognised.
        :param strs: text passed to ``transform`` for location detection.
        :return: ``'<province>-<city>'``, a single area name, or ``None``
            when the lookup raises.
        """
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' looks like a typo for '\r|\n'; '\s'
            # already removes those characters, so the pattern is left as is.
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Best effort: an unparsable title simply yields no area.
            return None
        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _parse_status(title):
        """Return the trailing '<two CJK chars>公告' of ``title`` or '公告'."""
        if title:
            matched = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            if matched:
                return matched.group()
        return '公告'

    @staticmethod
    def _parse_date(text):
        """Return the first ``YYYY-M-D`` date found in ``text`` or ``None``."""
        matched = re.search(r'(\d{4}\-\d+\-\d+)', text or '')
        return matched.group() if matched else None

    def load_get_html(self, url):
        """Download one notice page, extract its fields and store the record.

        :param url: absolute URL of the notice; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = self.session.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Nothing parsable came back (e.g. an empty body).
            return

        # The title lives either in a <center><span> or in a table caption.
        title_nodes = selector.xpath('//*[@id="wrap"]/div[1]/div[2]/div/div[2]/center/span/text()')
        if not title_nodes:
            title_nodes = selector.xpath('//*[@id="wrap"]/div[1]/div[2]/div/div[2]/table[1]/caption/text()')
        title = title_nodes[0] if title_nodes else None
        # The status must be derived from the title string, not the xpath list.
        status = self._parse_status(title)

        # Records are keyed by title; fall back to the URL when no title exists
        # so that a missing title no longer aborts the whole listing page.
        _id = self.hash_to_md5(title if title is not None else url)

        date_nodes = selector.xpath('//*[@id="wrap"]/div[1]/div[2]/div/div[2]/p[2]/text()')
        publish_date = self._parse_date(''.join(date_nodes))

        # 'lxml' is named explicitly to avoid bs4's "no parser specified" warning.
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='details')

        area_name = self.get_area('长春', title)

        result_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': area_name,
            'source': 'http://www.cczfcg.gov.cn',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '长春市政府采购网',
            'en_name': 'Changchun City Government Procurement',
        }
        self.save_to_mongo(result_dict)

    def load_get(self, params):
        """Fetch one listing page and process every notice linked from it.

        :param params: query parameters for ``bid_list.action``.
        """
        try:
            url = 'http://www.cczfcg.gov.cn/article/bid_list.action'
            response = self.session.get(url=url, headers=self.headers, params=params).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return
        for href in selector.xpath('//*[@id="row"]/tbody/tr/td/a/@href'):
            self.load_get_html('http://www.cczfcg.gov.cn' + href)

    def init(self):
        """Consume queued URLs with greenlets while ``is_running`` is true."""
        count = 1
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl the configured listing pages for every type/field pair."""
        # Earlier configurations used all_page values 35, 129, 32 and 130
        # for these four tasks respectively.
        task_li = [
            {'type': 1, 'field': 1, 'all_page': 1},
            {'type': 1, 'field': 2, 'all_page': 1},
            {'type': 2, 'field': 1, 'all_page': 1},
            {'type': 2, 'field': 2, 'all_page': 1},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = {
                    'field': task['field'],
                    'title': '',
                    'd-16544-p': str(page),
                    'getList': '搜索',
                    'type': task['type'],
                    '__fp': 'V7VgOK3HYWUBON82huO8GA ==',
                    '_sourcePage': '1dxhayx - Cv4gbrz1QGkYn6WfINWh0k0sL4lzLkek3lM =',
                }
                try:
                    self.load_get(params)
                except Exception as e:
                    print(e)
                print('第{}页'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
class GovBuy(object):
    """Scraper for the Hangzhou public resource trading site (www.hzctc.cn).

    ``run`` posts to the JSON listing endpoint through ``load_get``; each
    returned row is turned into a detail-page request by ``load_get_html``
    and the parsed record is handed to ``self.coll.saves``.
    """

    def __init__(self):
        """Set up the storage handle, request headers and the Redis queue."""
        name = 'hangzhou_hzctc_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.hzctc.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer':
            'http://www.hzctc.cn/SecondPage/ProjectAfficheList?area=^&afficheType=22^&proID=^&title=',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='hangzhou_hzctc_cn_list1',
                             dbset='hangzhou_hzctc_cn_set1')

    def is_running(self):
        """Return False once the queue list is empty while the seen-set is not."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record through the storage handle."""
        self.coll.saves(result_dic)
        # Return value is discarded; the call is kept from the original code.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name from ``strs``, falling back to ``pro``.

        :return: ``'<province>-<city>'``, a single area name, or ``None``
            when the lookup raises.
        """
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' looks like a typo for '\r|\n'; '\s'
            # already removes those characters, so the pattern is left as is.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            return None
        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _parse_status(title):
        """Return the notice kind at the end of ``title`` or '公告'.

        The keywords are matched as whole alternatives; the previous
        character-class form could return fragments such as '标预公告'.
        """
        if title:
            matched = re.search(
                r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
            if matched:
                return matched.group()
        return '公告'

    @staticmethod
    def _parse_date(text):
        """Return the first ``YYYY-M-D`` date found in ``text`` or ``None``."""
        matched = re.search(r'(\d{4}\-\d+\-\d{1,2})', text or '')
        return matched.group() if matched else None

    @staticmethod
    def _page_batches(last_page, size, first_page=1):
        """Split ``first_page..last_page`` into lists of at most ``size`` pages."""
        pages = list(range(first_page, last_page + 1))
        return [pages[i:i + size] for i in range(0, len(pages), size)]

    def load_get_html(self, data_dic):
        """Fetch the detail page for one listing row and store the record.

        :param data_dic: one element of the listing response's ``rows``;
            ``None`` is ignored. Reads the keys ``ID``, ``IsInner`` (or
            ``TenderID``), ``TenderName`` and ``PublishStartTime``.
        """
        if data_dic is None:
            return
        try:
            try:
                url = 'http://www.hzctc.cn/AfficheShow/Home?AfficheID={}&IsInner={}&ModuleID=22'.format(
                    data_dic['ID'], data_dic['IsInner'])
            except KeyError:
                # Rows without 'IsInner' are addressed through the bid-record page.
                url = 'http://www.hzctc.cn/OpenBidRecord/Index?id={}&tenderID={}&ModuleID=22'.format(
                    data_dic['ID'], data_dic['TenderID'])
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            return

        tender_name = data_dic.get('TenderName')
        if tender_name is not None:
            title = re.sub(r'\r|\n|\s', '', str(tender_name))
        else:
            title = None
        status = self._parse_status(title)

        _id = self.hash_to_md5(url)

        publish_date = self._parse_date(
            str(data_dic.get('PublishStartTime') or ''))
        print(publish_date, title)

        table_ele = selector.xpath('//div[@class="MainList"]')
        if not table_ele:
            # No content container on the page: nothing worth storing.
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        result_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': '浙江-杭州',
            'source': 'http://www.hzctc.cn',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '杭州市公共资源交易网',
            'en_name': 'Hangzhou Public resource',
        }
        self.save_to_mongo(result_dict)

    def load_get(self, categoryId, types, page):
        """Request one page of the notice listing and process its rows.

        :param categoryId: value sent as ``afficheType``.
        :param types: accepted for interface compatibility; not used.
        :param page: page number to request.
        """
        try:
            data = [
                ('area', ''),
                ('afficheType', categoryId),
                ('IsToday', ''),
                ('title', ''),
                ('proID', ''),
                ('number', ''),
                ('_search', 'false'),
                ('nd', str(int(time.time() * 1000))),
                ('rows', '10'),
                ('page', page),
                ('sidx', 'PublishStartTime'),
                ('sord', 'desc'),
            ]
            url = 'http://www.hzctc.cn/SecondPage/GetNotice'
            response = requests.post(url=url, headers=self.headers,
                                     data=data).json()
        except Exception as e:
            print('load_get error:{}'.format(e))
            return
        print('第{}页'.format(page))
        for data_dic in response.get('rows') or []:
            self.load_get_html(data_dic)

    def init(self):
        """Consume queued items with greenlets while ``is_running`` is true."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl ``all_page`` listing pages of every configured category."""
        task_li = [
            {'categoryId': '22', 'types': 'jyxx', 'all_page': 2},
            {'categoryId': '27', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '23', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '465', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '24', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '486', 'types': 'jyxx', 'all_page': 2},
            {'categoryId': '25', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '28', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '26', 'types': 'jyxx', 'all_page': 1},
            {'categoryId': '32', 'types': 'jyxx', 'all_page': 1},
        ]
        count = 2  # number of listing pages fetched concurrently
        for task in task_li:
            # Batches never go past all_page (the old loop requested page+1
            # even when it exceeded the configured limit).
            for batch in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page) for page in batch
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
# Example #15
# 0
class GovBuy(object):
    """Scraper for the Hainan government procurement site (www.ccgp-hainan.gov.cn).

    ``load_get`` collects notice URLs from the listing pages into the Redis
    queue; ``init`` pops them and ``load_get_html`` parses and stores each
    notice.
    """

    def __init__(self):
        """Set up the storage handle, HTTP session/headers and the Redis queue."""
        name = 'hainan_ccgp-hainan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection':
            'keep-alive',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer':
            'http://www.ccgp-hainan.gov.cn/thirdparty/My97DatePicker/My97DatePicker.html',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh,zh-CN;q=0.9',
            'X-DevTools-Emulate-Network-Conditions-Client-Id':
            'EAC4BA3425D26FC6B117994EFF4DEC28',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='hainan_list1',
                             dbset='hainan_set1')

    def is_running(self):
        """Return False once the queue list is empty while the seen-set is not."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def get_area(self, pro, strs):
        """Derive an area name from ``strs``, falling back to ``pro``.

        :return: ``'<province>-<city>'``, a single area name, or ``None``
            when the lookup raises.
        """
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' looks like a typo for '\r|\n'; '\s'
            # already removes those characters, so the pattern is left as is.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            return None
        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def save_to_mongo(self, result_dic):
        """Persist one record through the storage handle."""
        self.coll.saves(result_dic)
        # Return value is discarded; the call is kept from the original code.
        self.is_running()

    @staticmethod
    def _parse_status(title):
        """Return the trailing '<two CJK chars>公告' of ``title`` or '公告'."""
        if title:
            matched = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            if matched:
                return matched.group()
        return '公告'

    @staticmethod
    def _parse_date(text):
        """Return the first ``N-N-N`` digit group found in ``text`` or ``None``."""
        matched = re.search(r'(\d+\-\d+\-\d+)', text or '')
        return matched.group() if matched else None

    def load_get_html(self, url):
        """Download one notice page, extract its fields and store the record.

        :param url: absolute URL of the notice; ``None`` is ignored.
        """
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            # The placeholder was missing before, so the error was never shown.
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            return

        title_nodes = selector.xpath('//div[@class="nei03_02"]/div[1]/text()')
        title = title_nodes[0] if title_nodes else None
        status = self._parse_status(title)

        _id = self.hash_to_md5(url)

        date_nodes = selector.xpath('//div[@class="nei03_02"]/div[2]//text()')
        publish_date = self._parse_date(''.join(date_nodes))

        # 'lxml' is named explicitly to avoid bs4's "no parser specified" warning.
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='nei03_02')

        area_name = self.get_area('海南', title)

        result_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'publish_date': publish_date,
            'source': 'http://www.ccgp-hainan.gov.cn/',
            'area_name': area_name,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '中国海南政府采购网 ',
            'en_name': 'Hainan Province Government Procurement',
        }

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result_dict)

    def load_get(self, params):
        """Fetch one listing page and enqueue every notice URL not seen yet.

        :param params: query parameters for ``cgw_list.jsp``.
        """
        try:
            url = 'http://www.ccgp-hainan.gov.cn/cgw/cgw_list.jsp'
            response = self.session.get(url=url,
                                        headers=self.headers,
                                        params=params).content.decode('utf-8')
            selector = etree.HTML(response)
            url_li = selector.xpath(
                '//div[@class="nei02_04_01"]/ul/li/em/a/@href')
        except Exception as e:
            print('load_get error:{}'.format(e))
            return
        for href in url_li:
            detail_url = 'http://www.ccgp-hainan.gov.cn' + href
            if not self.rq.in_rset(detail_url):
                self.rq.add_to_rset(detail_url)
                self.rq.pull_to_rlist(detail_url)

    def init(self):
        """Consume queued URLs with greenlets while ``is_running`` is true."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the consumer thread and enqueue notices from the listing pages."""
        threading.Thread(target=self.init).start()
        # An earlier configuration used all_page = 2521.
        task_li = [
            {'all_page': 5},
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                params = (
                    ('currentPage', str(page)),
                    ('begindate', ''),
                    ('enddate', ''),
                    ('title', ''),
                    ('bid_type', ''),
                    ('proj_number', ''),
                    ('zone', ''),
                )

                self.load_get(params)
                print('第{}页'.format(page))

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
# Example #16
# 0
class GovBuy(object):
    """Scraper for the Fujian government procurement site (cz.fjzfcg.gov.cn).

    ``run`` fetches listing pages through ``load_get``; each table row is
    passed as HTML to ``load_get_html``, which downloads the linked notice
    and stores the parsed record.
    """

    def __init__(self, source, base_url, all_page):
        """Set up storage, headers and queue.

        :param source: prefix prepended to the relative notice links.
        :param base_url: prefix of the listing URL.
        :param all_page: number of listing pages to crawl.
        """
        name = 'fujian_cz_fjzfcg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://cz.fjzfcg.gov.cn/3500/noticelist/d03180adb4de41acbb063875889f9af1/?page=1',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        self.session = requests.session()
        self.source = source
        self.base_url = base_url
        self._all_page = all_page

        self.rq = Rdis_Queue(host='localhost', dblist='fujian_list1', dbset='fujian_set1')

    def is_running(self):
        """Return False once the queue list is empty while the seen-set is not."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hexadecimal MD5 digest of ``sign_str`` (UTF-8 encoded)."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Persist one record through the storage handle."""
        self.coll.saves(result_dic)
        # Return value is discarded; the call is kept from the original code.
        self.is_running()

    def get_area(self, pro, strs):
        """Derive an area name from ``strs``, falling back to ``pro``.

        :return: ``'<province>-<city>'``, a single area name, or ``None``
            when the lookup raises.
        """
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r|/n' looks like a typo for '\r|\n'; '\s'
            # already removes those characters, so the pattern is left as is.
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            return None
        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _first_or(values, default=None):
        """Return the first element of ``values`` or ``default`` when empty."""
        return values[0] if values else default

    @staticmethod
    def _page_batches(last_page, size, first_page=1):
        """Split ``first_page..last_page`` into lists of at most ``size`` pages."""
        pages = list(range(first_page, last_page + 1))
        return [pages[i:i + size] for i in range(0, len(pages), size)]

    def load_get_html(self, tr):
        """Parse one listing row, download its notice and store the record.

        :param tr: HTML of a ``<tr>`` from the listing table; ``None`` is
            ignored. Cells 1, 2, 4 and 5 supply area, status, title/link and
            publish date respectively.
        """
        if tr is None:
            return
        try:
            selector_tr = etree.HTML(str(tr))
            url = self.source + selector_tr.xpath('//tr/td[4]/a/@href')[0]
            response = requests.get(url=url, headers=self.headers).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            return

        title = self._first_or(selector_tr.xpath('//tr/td[4]/a/text()'))
        status = self._first_or(selector_tr.xpath('//tr/td[2]/text()'))
        _id = self.hash_to_md5(url)
        publish_date = self._first_or(selector_tr.xpath('//tr/td[5]/text()'))

        region_text = self._first_or(selector_tr.xpath('//tr/td[1]/text()'), '福建')
        area_name = self.get_area('福建', region_text)
        print(area_name)

        content_nodes = selector.xpath('//*[@id="print-content"]')
        if not content_nodes:
            # Page without the printable container: skip instead of raising
            # IndexError and aborting the rest of the listing page.
            return
        content_html = etree.tostring(content_nodes[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        result_dict = {
            '_id': _id,
            'title': title,
            'status': status,
            'area_name': area_name,
            # NOTE(review): hard-coded and different from self.source, which
            # is used to build detail_url - confirm which one is intended.
            'source': 'http://117.27.88.250:9306/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '福建省政府采购网',
            'en_name': 'Fujian Province Government Procurement',
        }
        self.save_to_mongo(result_dict)

    def load_get(self, page):
        """Fetch one listing page and process each of its table rows.

        :param page: page number sent as the ``page`` query parameter.
        """
        try:
            params = {
                'page': str(page),
            }
            url = self.base_url + 'noticelist/d03180adb4de41acbb063875889f9af1/'
            print(url)

            response = requests.get(url=url, headers=self.headers, params=params).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return
        print('第{}页'.format(page))
        for tr_ele in selector.xpath('//div[@class="wrapTable"]/table/tbody/tr'):
            tr = etree.tostring(tr_ele, pretty_print=True, encoding='utf-8', method='html').decode('utf-8')
            self.load_get_html(tr)

    def init(self):
        """Consume queued rows with greenlets while ``is_running`` is true."""
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl listing pages 1..``all_page``, four pages at a time."""
        # An earlier configuration used all_page = 9111.
        task_li = [
            {'all_page': self._all_page},
        ]
        count = 4  # number of listing pages fetched concurrently
        for task in task_li:
            # Batches never go past all_page (the old loop requested up to
            # three pages beyond the configured limit).
            for batch in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, page) for page in batch]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point; delegates to ``run``."""
        self.run()
# Example #17
# 0
class GovBuy(object):
    """Crawler for the Qingdao government procurement site (ccgp-qingdao.gov.cn).

    Notice ids are read from the site's DWR list endpoint; each notice page is
    then downloaded, parsed and stored through ``StorageSetting``.
    """

    def __init__(self):
        """Set up storage, request headers, an HTTP session and the Redis queue."""
        name = 'qingdao_ccgp-qingdao_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'https://www.ccgp-qingdao.gov.cn',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'text/plain',
            'Accept': '*/*',
            # The copied URL contained a Windows-cmd escape ("^&"); the caret
            # is not part of the real address and has been removed.
            'Referer':
            'https://www.ccgp-qingdao.gov.cn/sdgp2014/site/channelall370200.jsp?colcode=0401&flag=0401',
            'Connection': 'keep-alive',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='qingdao_list1',
                             dbset='qingdao_set1')

    def is_running(self):
        """Return False when the Redis list is empty while the Redis set has entries."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Store one parsed record through ``self.coll.saves``."""
        # The original also called is_running() here and discarded the result.
        self.coll.saves(result_dic)

    def get_area(self, pro, strs):
        """Derive a 'province-city' label from free text, falling back to ``pro``.

        Args:
            pro: Province name returned when no location can be extracted.
            strs: Text handed to ``transform`` for location extraction.

        Returns:
            'province-city' when a second component was found, otherwise the
            first component, or ``pro`` when extraction fails or finds nothing.
        """
        try:
            df = transform([strs], umap={})
            # Work on the printed frame with all whitespace removed.  The
            # original pattern '/r|/n|\s' had mistyped escapes; \s covers CR/LF.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\s', '', str(df))))
        except Exception:
            # Previously swallowed with a bare except, which returned None.
            return pro
        if area_str == '':
            return pro
        area_li = area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _page_batches(all_page, count, first=1):
        """Split pages ``first..all_page`` into consecutive batches of at most ``count``."""
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(first, all_page + 1, count)
        ]

    def load_get_html(self, ids):
        """Download one notice page, extract its fields and store the record.

        Args:
            ids: Notice id used in the detail URL; ``None`` is ignored.

        Download/parse failures are printed and the notice is skipped.  Pages
        without a ``div.cont`` element are skipped as well.
        """
        if ids is None:
            return
        url = 'http://www.ccgp-qingdao.gov.cn/sdgp2014/site/read370200.jsp?id=' + str(
            ids)
        try:
            response = requests.get(url=url,
                                    headers=self.headers,
                                    verify=False).content.decode("gb18030")
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Nothing parseable came back.
            return

        title = selector.xpath('//div[@class="biaot"]/text()')
        if title:
            title = re.sub(r'\r|\n|\s', '', title[0])
            # Status is the two Chinese characters preceding a trailing '公告'.
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            status = status_match.group() if status_match else '公告'
        else:
            title = None
            status = '公告'

        # A missing or malformed date yields None instead of raising, so one
        # bad page no longer aborts the rest of the list page.
        date_text = ''.join(selector.xpath('//div[@class="biaotq"]/text()'))
        date_match = re.search(r'(\d{4}年\d+月\d{1,2})', date_text)
        publish_date = (re.sub(r'年|月', '-', date_match.group())
                        if date_match else None)

        table_ele = selector.xpath('//div[@class="cont"]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = '山东-青島'
        retult_dict['source'] = 'https://www.ccgp-qingdao.gov.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '青岛市政府采购网'
        retult_dict['en_name'] = 'Qingdao City Government Procurement'

        self.save_to_mongo(retult_dict)

    def load_get(self, types, page):
        """Query one list page through the DWR endpoint and process its notices.

        Args:
            types: Column code, sent as ``_COLCODE``.
            page: Page index, sent as ``_INDEX``.

        Failures are printed and the page is skipped.
        """
        try:
            url = 'http://www.ccgp-qingdao.gov.cn/sdgp2014/dwr/call/plaincall/dwrmng.queryWithoutUi.dwr'

            data = {
                'callCount': '1',
                'windowName': '',
                'c0-scriptName': 'dwrmng',
                'c0-methodName': 'queryWithoutUi',
                'c0-id': '0',
                'c0-param0': 'number:7',
                'c0-e1': 'string:' + types,
                'c0-e2': 'string:' + str(page),
                'c0-e3': 'number:10',
                'c0-e4': 'string:',
                'c0-e5': 'null:null',
                'c0-param1':
                'Object_Object:{_COLCODE:reference:c0-e1, _INDEX:reference:c0-e2, _PAGESIZE:reference:c0-e3, _REGION:reference:c0-e4, _KEYWORD:reference:c0-e5}',
                'batchId': '8',
                'page':
                '%2Fsdgp2014%2Fsite%2Fchannelall370200.jsp%3Fcolcode%3D0401%26flag%3D0401',
                'httpSessionId': '',
                'scriptSessionId': '9BCA99F81A827529F202FF26A81421A0',
            }
            response = requests.post(url=url,
                                     headers=self.headers,
                                     data=data,
                                     verify=False).text

            a = re.findall(r'rsltStringValue:"(.*?)"', response)[0]
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        # Entries are '?'-separated; the id is the first comma-separated field.
        for entry in a.split('?'):
            ids = entry.split(',')[0]
            if ids:
                # An empty result string would otherwise fetch '...?id='.
                self.load_get_html(ids)

    def init(self):
        """Consume the Redis list with ``load_get_html`` until ``is_running()`` is False."""
        count = 1
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl every configured column, two list pages at a time."""
        task_li = [
            {'types': '0401', 'all_page': 3},
            {'types': '0402', 'all_page': 3},
            {'types': '0403', 'all_page': 2},
            {'types': '0404', 'all_page': 2},
            {'types': '0405', 'all_page': 2},
            {'types': '0406', 'all_page': 1},
        ]
        count = 2
        for task in task_li:
            types = task['types']
            # Batches are clamped to all_page.  The original also called
            # load_get(types, page) synchronously before spawning the same
            # page again, so every batch's first page was fetched twice.
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, types, p) for p in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point: delegate to ``run``."""
        self.run()
Exemple #18
0
class GovBuy(object):
    """Crawler for the Chongqing government procurement site (cqgp.gov.cn).

    ``load_get`` pushes unseen notice ids from the JSON list API onto the
    Redis queue; ``init`` (started in a thread by ``run``) pops them and
    ``load_get_html`` downloads and stores each notice.
    """

    def __init__(self):
        """Set up storage, request headers, an HTTP session and the Redis queue."""
        name = 'chongqing_cqgp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            # The copied URL was littered with Windows-cmd '^' escapes; they
            # are not part of the real address and have been removed.
            'Referer':
            'https://www.cqgp.gov.cn/notices/list?source=41,42&area=%E9%87%8D%E5%BA%86%E5%B8%82&purches=%E9%87%87%E8%B4%AD%E5%85%AC%E5%91%8A',
            'Connection': 'keep-alive',
        }

        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='chongqing_list1',
                             dbset='chongqing_set1')

    def is_running(self):
        """Return False when the Redis list is empty while the Redis set has entries."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Store one parsed record through ``self.coll.saves``."""
        # The original also called is_running() here and discarded the result.
        self.coll.saves(result_dic)

    def get_area(self, pro, strs):
        """Derive a 'province-city' label from free text, falling back to ``pro``.

        Args:
            pro: Province name returned when no location can be extracted.
            strs: Text handed to ``transform`` for location extraction.

        Returns:
            'province-city' when a second component was found, otherwise the
            first component, or ``pro`` when extraction fails or finds nothing.
        """
        try:
            df = transform([strs], umap={})
            # Work on the printed frame with all whitespace removed.  The
            # original pattern '/r|/n|\s' had mistyped escapes; \s covers CR/LF.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\s', '', str(df))))
        except Exception:
            # Previously swallowed with a bare except, which returned None.
            return pro
        if area_str == '':
            return pro
        area_li = area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _page_batches(all_page, count, first=1):
        """Split pages ``first..all_page`` into consecutive batches of at most ``count``."""
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(first, all_page + 1, count)
        ]

    def _get_json(self, url, label, params=None, timeout=10, retries=5):
        """GET ``url`` through a pooled proxy and return the decoded JSON, or None.

        Each attempt asks ``proxy_pool`` for a fresh proxy.  After ``retries``
        failed attempts None is returned; the original retried by calling
        itself recursively without any limit.
        """
        for _ in range(retries):
            try:
                proxies = proxy_pool.proxies()
                return requests.get(url=url,
                                    headers=self.headers,
                                    params=params,
                                    proxies=proxies,
                                    timeout=timeout).json()
            except Exception as e:
                print('{} error:{}'.format(label, e))
        return None

    def load_get_html(self, pid, retries=5):
        """Download one notice from the JSON API and store it.

        Args:
            pid: Notice id; ``None`` is ignored.
            retries: Maximum number of download attempts.

        Notices that cannot be downloaded, or whose payload lacks the
        ``notice`` object or its ``html`` field, are skipped.
        """
        if pid is None:
            return
        url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable/{}'.format(
            pid)
        response = self._get_json(url, 'laod_get_html', timeout=10,
                                  retries=retries)
        notice = response.get('notice') if isinstance(response, dict) else None
        if not notice:
            return

        # issueTime is a string; guard both a missing value and a non-match
        # (the original compared it with [] and called .group() blindly).
        issue_time = notice.get('issueTime')
        date_match = (re.search(r'(\d{4}\-\d+\-\d{1,2})', issue_time)
                      if issue_time else None)
        publish_date = date_match.group() if date_match else None

        content_html = notice.get('html')
        if content_html is None:
            return

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = notice.get('title')
        retult_dict['status'] = notice.get('projectPurchaseWayName', '公告')
        retult_dict['area_name'] = '重庆'
        retult_dict['source'] = 'https://www.cqgp.gov.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '重庆市政府采购网'
        retult_dict['en_name'] = 'Chongqing City Government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, page, retries=5):
        """Fetch one list page and queue every notice id not seen before.

        Args:
            page: Page index, sent as ``pi``.
            retries: Maximum number of download attempts.
        """
        params = (
            ('pi', page),
            ('ps', '20'),
            ('timestamp', str(int(time.time() * 1000))),
        )
        url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable'
        response = self._get_json(url, 'load_get', params=params, timeout=5,
                                  retries=retries)
        if not isinstance(response, dict):
            return

        print('第{}页'.format(page))
        for data_dict in response.get('notices') or []:
            pid = data_dict['id']
            # The Redis set de-duplicates; the list is the work queue.
            if not self.rq.in_rset(pid):
                self.rq.add_to_rset(pid)
                self.rq.pull_to_rlist(pid)

    def init(self):
        """Consume the Redis list with ``load_get_html`` until ``is_running()`` is False."""
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Start the queue consumer thread, then crawl the list pages in pairs."""
        threading.Thread(target=self.init).start()
        task_li = [
            {'all_page': 3},
        ]
        count = 2
        for task in task_li:
            # Batches are clamped so pages past all_page are never requested.
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, p) for p in pages]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point: delegate to ``run``."""
        self.run()
class GovBuy(object):
    """Crawler for the Zhejiang public resource trading platform.

    List data comes from the full-text-search JSON endpoint on
    zjpubservice.com; each record's detail page is downloaded from
    zjpubservice.zjzwfw.gov.cn, parsed and stored through ``StorageSetting``.
    """

    def __init__(self):
        """Set up storage, request headers and the Redis queue."""
        name = 'zhejiang_zjpubservice_zjzwfw_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.zjpubservice.com/002/infogov.html',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(
            host='localhost',
            dblist='zhejiang_zjpubservice_zjzwfw_gov_cn_list1',
            dbset='zhejiang_zjpubservice_zjzwfw_gov_cn_set1')

    def is_running(self):
        """Return False when the Redis list is empty while the Redis set has entries."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Store one parsed record through ``self.coll.saves``."""
        # The original also called is_running() here and discarded the result.
        self.coll.saves(result_dic)

    def get_area(self, pro, strs):
        """Derive a 'province-city' label from free text, falling back to ``pro``.

        Args:
            pro: Province name returned when no location can be extracted.
            strs: Text handed to ``transform`` for location extraction.

        Returns:
            'province-city' when a second component was found, otherwise the
            first component, or ``pro`` when extraction fails or finds nothing.
        """
        try:
            df = transform([strs], umap={})
            # Work on the printed frame with all whitespace removed.  The
            # original pattern '/r|/n|\s' had mistyped escapes; \s covers CR/LF.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\s', '', str(df))))
        except Exception:
            # Previously swallowed with a bare except, which returned None.
            return pro
        if area_str == '':
            return pro
        area_li = area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _page_batches(all_page, count, first=1):
        """Split pages ``first..all_page`` into consecutive batches of at most ``count``."""
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(first, all_page + 1, count)
        ]

    @staticmethod
    def _parse_status(title):
        """Return the trailing '<kind>公告' of ``title``, or '公告' when none matches.

        The original pattern listed the keywords inside a character class
        (``["招标","中标",...]{1,2}``), which matches any one or two of those
        characters, quotes and commas included, e.g. '果公告' in '效果公告'.
        """
        match = re.search(r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
        return match.group() if match else '公告'

    def load_get_html(self, data_dic):
        """Download the detail page for one search record and store it.

        Args:
            data_dic: One record from the search API; this method reads its
                ``link``, ``title``, ``date`` and ``remark5`` keys.  ``None``
                is ignored.

        Download/parse failures are printed and the record is skipped, as are
        pages without a ``div.article_bd`` element.
        """
        if data_dic is None:
            return
        try:
            urls = data_dic['link']
            url_li = re.findall(
                r'infoid\=(.*?)\&categorynum\=(.*?)\&infodate\=(.*)', urls)[0]
            # Detail URL layout: /<cat[:3]>/<cat[:6]>/<cat>/<infodate>/<infoid>.html
            url = 'http://zjpubservice.zjzwfw.gov.cn/{}/{}/{}/{}/{}.html'.format(
                url_li[1][:3], url_li[1][:6], url_li[1], url_li[2], url_li[0])

            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Nothing parseable came back.
            return

        raw_title = data_dic.get('title')
        if raw_title:
            title = re.sub(r'\r|\n|\s', '', raw_title)
            status = self._parse_status(title)
        else:
            title = None
            status = '公告'

        # A missing or malformed date yields None instead of raising.
        date_match = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                               data_dic.get('date') or '')
        publish_date = date_match.group() if date_match else None
        print(publish_date, title)

        area_name = self.get_area('浙江', data_dic.get('remark5'))

        table_ele = selector.xpath('//div[@class="article_bd"]')
        if not table_ele:
            return
        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = title
        retult_dict['status'] = status
        retult_dict['area_name'] = area_name
        retult_dict['source'] = 'http://zjpubservice.zjzwfw.gov.cn/'
        retult_dict['publish_date'] = publish_date
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '浙江省公共资源交易服务平台'
        retult_dict['en_name'] = 'Zhejiang Public resource'
        print(retult_dict)

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        """Fetch one page of search results and process every record.

        Args:
            categoryId: Accepted for interface compatibility; not used.
            types: Category code, sent as ``rmk1``.
            page: Sent as ``pn``.

        Failures are printed and the page is skipped.
        """
        try:
            params = (
                ('format', 'json'),
                ('sort', '0'),
                ('rmk1', types),
                ('pn', page),
                ('rn', '20'),
                ('idx_cgy', 'web'),
            )
            url = 'http://www.zjpubservice.com/fulltextsearch/rest/getfulltextdata'
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params).json()
            # A leftover debug "print(response); return" used to sit here; the
            # return inside try skipped the else branch, so no record was ever
            # processed.
            response_li = response['result']['records']
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        for data_dic in response_li:
            self.load_get_html(data_dic)

    def init(self):
        """Consume the Redis list with ``load_get_html`` until ``is_running()`` is False."""
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl every configured category, two list pages at a time."""
        task_li = [
            {'categoryId': '', 'types': types, 'all_page': all_page}
            for types, all_page in (
                ('002001001', 3),
                ('002001002', 1),
                ('002001003', 3),
                ('002001004', 3),
                ('002001005', 3),
                ('002002001', 3),
                ('002002002', 3),
                ('002003001', 1),
                ('002003002', 1),
                ('002004001', 2),
                ('002004002', 1),
                ('002005001', 3),
                ('002005002', 1),
            )
        ]
        count = 2
        for task in task_li:
            categoryId = task['categoryId']
            types = task['types']
            # Paging starts at 0 as before; batches are clamped so pages past
            # all_page are never requested.
            for pages in self._page_batches(task['all_page'], count, first=0):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types, p)
                        for p in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point: delegate to ``run``."""
        self.run()
Exemple #20
0
class GovBuy(object):
    """Crawler for the Hunan government procurement site (ccgp-hunan.gov.cn).

    Notice metadata comes from the JSON list endpoint; the notice body is the
    first ``<table>`` of the notice content page.
    """

    def __init__(self):
        """Set up storage, request headers, an HTTP session and the Redis queue."""
        name = 'hunan_ccgp-hunan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.ccgp-hunan.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer':
            'http://www.ccgp-hunan.gov.cn/page/notice/more.jsp?noticeTypeID=prcmNotices',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='hunan_list1',
                             dbset='hunan_set1')

    def is_running(self):
        """Return False when the Redis list is empty while the Redis set has entries."""
        return not (self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0)

    def hash_to_md5(self, sign_str):
        """Return the hex MD5 digest of ``sign_str`` encoded as UTF-8."""
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        """Store one parsed record through ``self.coll.saves``."""
        # The original also called is_running() here and discarded the result.
        self.coll.saves(result_dic)

    def get_area(self, pro, strs):
        """Derive a 'province-city' label from free text, falling back to ``pro``.

        Args:
            pro: Province name returned when no location can be extracted.
            strs: Text handed to ``transform`` for location extraction.

        Returns:
            'province-city' when a second component was found, otherwise the
            first component, or ``pro`` when extraction fails or finds nothing.
        """
        try:
            df = transform([strs], umap={})
            # Work on the printed frame with all whitespace removed.  The
            # original pattern '/r|/n|\s' had mistyped escapes; \s covers CR/LF.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'\s', '', str(df))))
        except Exception:
            # Previously swallowed with a bare except, which returned None.
            return pro
        if area_str == '':
            return pro
        area_li = area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _page_batches(all_page, count, first=1):
        """Split pages ``first..all_page`` into consecutive batches of at most ``count``."""
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(first, all_page + 1, count)
        ]

    def load_get_html(self, data_dict):
        """Download the content page of one notice and store the record.

        Args:
            data_dict: One row from the list endpoint; this method reads its
                ``NOTICE_ID``, ``NOTICE_TITLE``, ``NOTICE_NAME`` and
                ``NEWWORK_DATE`` keys.  ``None`` is ignored.

        Download failures are printed and the notice is skipped.  Pages
        without a ``<table>`` are skipped instead of being stored with the
        literal text 'None' as their content.
        """
        if data_dict is None:
            return
        try:
            url = 'http://www.ccgp-hunan.gov.cn/mvc/viewNoticeContent.do?noticeId=' + str(
                data_dict['NOTICE_ID'])
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        # Name the parser explicitly: without it bs4 guesses (and warns), so
        # results could differ between machines.  lxml is already a
        # dependency of this file.
        content_html = BeautifulSoup(response, 'lxml').find('table')
        if content_html is None:
            return

        retult_dict = dict()
        retult_dict['_id'] = self.hash_to_md5(url)
        retult_dict['title'] = data_dict.get('NOTICE_TITLE')
        retult_dict['status'] = data_dict.get('NOTICE_NAME')
        retult_dict['area_name'] = '湖南'
        retult_dict['source'] = 'http://www.ccgp-hunan.gov.cn/'
        retult_dict['publish_date'] = data_dict.get('NEWWORK_DATE')
        retult_dict['detail_url'] = url
        retult_dict['content_html'] = str(content_html)
        retult_dict['create_time'] = self.now_time()
        retult_dict['zh_name'] = '湖南政府采购网'
        retult_dict['en_name'] = 'Hunan Government Procurement'

        self.save_to_mongo(retult_dict)

    def load_get(self, page, start_date='2019-01-17', end_date='2019-12-31',
                 retries=5):
        """Fetch one page of the notice list and process every row.

        Args:
            page: Page number, sent as ``page``.
            start_date: Sent as ``startDate``; defaults to the value that was
                previously hard-coded.
            end_date: Sent as ``endDate``; defaults to the value that was
                previously hard-coded.
            retries: Maximum number of download attempts.  The original
                retried by unbounded recursion behind a bare ``except``.
        """
        data = [
            ('pType', ''),
            ('prcmPrjName', ''),
            ('prcmItemCode', ''),
            ('prcmOrgName', ''),
            ('startDate', start_date),
            ('endDate', end_date),
            ('prcmPlanNo', ''),
            ('page', page),
            ('pageSize', '18'),
        ]
        url = 'http://www.ccgp-hunan.gov.cn/mvc/getNoticeList4Web.do'
        response = None
        for _ in range(retries):
            try:
                response = requests.post(url=url, headers=self.headers,
                                         data=data).json()
                break
            except Exception as e:
                print('load_post error:{}'.format(e))
        if not isinstance(response, dict):
            return

        print('第{}页'.format(page))
        for data_dict in response.get('rows') or []:
            print(data_dict)
            self.load_get_html(data_dict)

    def init(self):
        """Consume the Redis list with ``load_get_html`` until ``is_running()`` is False."""
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        """Crawl the configured list pages one at a time."""
        task_li = [
            {'all_page': 3},
        ]
        count = 1
        for task in task_li:
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, p) for p in pages]
                    gevent.joinall(spawns)
                    print('第{}页'.format(pages[0]))
                except Exception as e:
                    print(e)

    def main(self):
        """Entry point: delegate to ``run``."""
        self.run()
class GovBuy(object):
    '''Crawler for the Zhejiang government procurement site (zjzfcg.gov.cn).

    `load_get` reads one page of the notice list from the site's JSON API,
    `load_get_html` downloads the body of a single notice and stores the
    assembled record through `self.coll.saves`.
    '''
    def __init__(self):
        name = 'zhejiang_manager_zjzfcg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        # Browser-like headers sent with every request.
        self.headers = {
            'Origin': 'http://www.zjzfcg.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer':
            'http://www.zjzfcg.gov.cn/purchaseNotice/index.html?categoryId=10',
            'Connection': 'keep-alive',
        }

        # NOTE: no method of this class uses the session; requests are made
        # through the module-level `requests.get`.
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='zhejinag_list1',
                             dbset='zhejiang_set1')

    def is_running(self):
        '''Return False when `rq.r_len()` is 0 while `rq.rset_info()` is non-empty, else True.'''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of `sign_str` encoded as UTF-8.'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as "YYYY-MM-DD HH:MM:SS".'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one record through `self.coll.saves`.'''
        self.coll.saves(result_dic)
        # The result of is_running() is discarded here; the call is kept
        # unchanged from the original implementation.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area name from free text.

        `strs` is passed to `transform` and the printed result is reduced to
        a '-'-separated string. Returns the first two parts joined by '-'
        when a non-empty second part exists, otherwise the first part;
        `pro` is used when nothing was recognised. Returns None if the
        transformation raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r' and '/n' look like typos for '\r' and '\n';
            # '\s' already removes those characters, so the pattern is kept.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Narrowed from a bare `except:` so that BaseException subclasses
            # (KeyboardInterrupt, SystemExit, ...) are no longer swallowed.
            return None
        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, data_dict):
        '''Download the body of one notice and store the resulting record.

        `data_dict` is one entry of the list API response; the keys read are
        'id', 'title', 'typeName', 'pubDate' and 'districtName'. Request
        failures and malformed entries are printed and skipped; a response
        without a 'noticeContent' field is skipped silently.
        '''
        try:
            proxies = proxy_pool.proxies()
            params = {
                'noticeId': data_dict['id'],
                'url': 'http://notice.zcy.gov.cn/new/noticeDetail',
            }
            url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/getRemoteResults'
            # A timeout keeps a stalled proxy from blocking the crawl forever.
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params,
                                    proxies=proxies,
                                    timeout=10)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        try:
            title = data_dict['title']
            status = data_dict['typeName']
            # 'pubDate' is divided by 1000 before being given to localtime().
            publish_date = time.strftime(
                "%Y-%m-%d", time.localtime(int(data_dict['pubDate']) / 1000))
            area_name = data_dict['districtName']
        except (KeyError, TypeError, ValueError, OverflowError, OSError) as e:
            # One malformed entry must not abort the rest of the page.
            print('laod_get_html error:{}'.format(e))
            return

        try:
            content_html = response.json()['noticeContent']
        except Exception:
            return

        retult_dict = {
            '_id': self.hash_to_md5(response.url),
            'title': title,
            'status': status,
            'area_name': area_name,
            'source': 'http://www.zjzfcg.gov.cn/',
            'publish_date': publish_date,
            # NOTE(review): this is the API endpoint without its query
            # string, so it is identical for every record, while '_id' is
            # derived from response.url -- confirm whether that is intended.
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '浙江政府采购网',
            'en_name': 'Zhejiang government Procurement',
        }

        print(retult_dict)

        self.save_to_mongo(retult_dict)

    def load_get(self, page, max_retries=10):
        '''Fetch one page of the notice list and process every entry on it.

        The request is attempted up to `max_retries` times, asking
        `proxy_pool` for a proxy on each attempt. The original implementation
        retried by calling itself recursively without a limit, which ends in
        a RecursionError when the endpoint stays unreachable.
        '''
        params = {
            'pageSize': '15',
            'pageNo': page,
            'url': 'http://notice.zcy.gov.cn/new/noticeSearch',
            'noticeType': '0',
        }
        url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/getRemoteResults'
        for _ in range(max_retries):
            try:
                proxies = proxy_pool.proxies()
                response = requests.get(url=url,
                                        headers=self.headers,
                                        params=params,
                                        proxies=proxies,
                                        timeout=5).json()
            except Exception as e:
                print('load_post error{}'.format(e))
            else:
                break
        else:
            print('load_post error: giving up on page {}'.format(page))
            return

        print('第{}页'.format(page))
        for data_dict in response['articles']:
            self.load_get_html(data_dict)

    def init(self):
        '''Consume items from `self.rq` in greenlet batches while `is_running()` holds.

        The batch size starts at 8 and drops to 1 for good once the queue
        length is at or below it. Exceptions are printed and the loop goes on.
        '''
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages 1..all_page into consecutive batches of at most `count` pages.'''
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(1, all_page + 1, count)
        ]

    def run(self):
        '''Crawl the list pages of every task, `count` pages at a time.

        Batches never go past the task's `all_page`; the original spawned
        `page + i` unconditionally and therefore requested page 4 for a
        three-page task.
        '''
        task_li = [
            {
                'all_page': 3
            },
        ]
        count = 2
        for task in task_li:
            for batch in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page) for page in batch
                    ]
                    gevent.joinall(spawns)
                    print('第{}页'.format(batch[0]))
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point: run the crawl.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Jinan government procurement site (jncz.jinan.gov.cn).

    `load_get` downloads one list page of a column, `load_get_html` follows
    the link of one list row, extracts the notice and stores it through
    `self.coll.saves`.
    '''
    def __init__(self):
        name = 'jinan_jncz_jinan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Proxy-Connection': 'keep-alive',
            # SECURITY NOTE(review): hard-coded Basic credential committed to
            # source and sent as a request header; it should be rotated and
            # loaded from configuration. Left unchanged to preserve behavior.
            'Proxy-Authorization': 'Basic MTYzOTY2MzE2ODphamxhNTJ0bQ==',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Referer':
            'http://119.164.253.173:8080/jngp2016/site/list.jsp?curpage=3&colid=121',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        # NOTE: no method of this class uses the session.
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='jinan_list1',
                             dbset='jinan_set1')

    def is_running(self):
        '''Return False when `rq.r_len()` is 0 while `rq.rset_info()` is non-empty, else True.'''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of `sign_str` encoded as UTF-8.'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as "YYYY-MM-DD HH:MM:SS".'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one record through `self.coll.saves`.'''
        self.coll.saves(result_dic)
        # The result of is_running() is discarded here; the call is kept
        # unchanged from the original implementation.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area name from free text.

        `strs` is passed to `transform` and the printed result is reduced to
        a '-'-separated string. Returns the first two parts joined by '-'
        when a non-empty second part exists, otherwise the first part;
        `pro` is used when nothing was recognised. Returns None if the
        transformation raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r' and '/n' look like typos for '\r' and '\n';
            # '\s' already removes those characters, so the pattern is kept.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Narrowed from a bare `except:` so that BaseException subclasses
            # (KeyboardInterrupt, SystemExit, ...) are no longer swallowed.
            return None
        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _parse_status(title):
        '''Return the notice type matched at the end of `title`, or the generic default.'''
        if not title:
            return '公告'
        match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
        return match.group() if match else '公告'

    @staticmethod
    def _parse_date(texts):
        '''Return the first YYYY-M-D date found in the joined `texts`, or None.'''
        if not texts:
            return None
        match = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(texts))
        # The original called .group() unconditionally and raised
        # AttributeError on a row without a date, which aborted the rest of
        # the list page.
        return match.group() if match else None

    def load_get_html(self, tr):
        '''Follow the link of one list row and store the notice it points to.

        `tr` is the serialized HTML of a table row, or None (ignored). Title
        and date come from the row itself; the content is the first
        `//body/table` of the detail page. Request and parse failures are
        printed and the row is skipped; a detail page without that table is
        skipped silently.
        '''
        if tr is None:
            return
        try:
            selector_li = etree.HTML(str(tr))
            tr_li = selector_li.xpath('//tr/td[2]/a/@href')[0]
            url = 'http://119.164.253.173:8080' + tr_li
            proxies = proxy_pool.proxies()
            response = requests.get(url=url,
                                    headers=self.headers,
                                    proxies=proxies,
                                    timeout=10).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title = selector_li.xpath('//tr/td[2]/a/text()')
        title = re.sub(r'\r|\n|\s', '', title[0]) if title != [] else None
        status = self._parse_status(title)

        publish_date = self._parse_date(selector_li.xpath('//tr/td/text()'))

        try:
            table_ele = selector.xpath('//body/table')
        except Exception:
            return
        if table_ele == []:
            return

        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '山东-济南',
            'source': 'http://jncz.jinan.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '济南市财政局',
            'en_name': 'Jinan Finance Bureau',
        }

        self.save_to_mongo(retult_dict)

    def load_get(self, colid, page, max_retries=10):
        '''Download one list page of column `colid` and process each row.

        The request is attempted up to `max_retries` times, asking
        `proxy_pool` for a proxy on each attempt. The original implementation
        retried by calling itself recursively without a limit, which ends in
        a RecursionError when the server stays unreachable.
        '''
        params = (
            ('curpage', page),
            ('colid', colid),
        )
        url = 'http://119.164.253.173:8080/jngp2016/site/list.jsp'
        for _ in range(max_retries):
            try:
                proxies = proxy_pool.proxies()
                response = requests.get(url=url,
                                        headers=self.headers,
                                        params=params,
                                        proxies=proxies,
                                        timeout=10).content.decode('gb18030')
                selector = etree.HTML(response)
            except Exception as e:
                print('load_get error:{}'.format(e))
            else:
                break
        else:
            print('load_get error: giving up on colid={} page={}'.format(
                colid, page))
            return

        print('第{}页'.format(page))
        try:
            li_ele_li = selector.xpath('//table[@class="list"]/tr')
        except Exception:
            return
        for li_ele in li_ele_li:
            tr = etree.tostring(li_ele,
                                pretty_print=True,
                                encoding='utf-8',
                                method='html').decode('utf-8')
            self.load_get_html(tr)

    def run(self):
        '''Crawl pages 1..all_page of every configured column, one page at a time.

        The original stepped through the pages two at a time (a leftover of
        a disabled greenlet fan-out) while fetching only the first page of
        each step, so page 2 of every column was never crawled.
        '''
        task_li = [
            {
                'colid': '37',
                'all_page': 3
            },
            {
                'colid': '38',
                'all_page': 3
            },
            {
                'colid': '81',
                'all_page': 3
            },
            {
                'colid': '29',
                'all_page': 3
            },
            {
                'colid': '101',
                'all_page': 3
            },
            {
                'colid': '122',
                'all_page': 3
            },
        ]
        for task in task_li:
            for page in range(1, task['all_page'] + 1):
                try:
                    self.load_get(task['colid'], page)
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point: run the crawl.'''
        self.run()
# Example #23
# 0
class GovBuy(object):
    '''Crawler for the Haikou public resource trading site (ggzy.haikou.gov.cn).

    `load_get` posts a list query and queues every not-yet-seen detail URL in
    `self.rq`; `init` consumes that queue in greenlet batches and
    `load_get_html` downloads, parses and stores one detail page.
    '''
    def __init__(self):
        name = 'haikou_ggzy_haikou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://ggzy.haikou.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # NOTE(review): the '^&' looks like a shell-escape artifact of a
            # copied command line -- confirm before changing the header.
            'Referer':
            'http://ggzy.haikou.gov.cn/login.do?method=newsecond^&param=431241696e6465783d3326747970653d5a435f4a59',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        # NOTE: no method of this class uses the session.
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost',
                             dblist='haikou_list1',
                             dbset='haikou_set1')

    def is_running(self):
        '''Return False when `rq.r_len()` is 0 while `rq.rset_info()` is non-empty, else True.'''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of `sign_str` encoded as UTF-8.'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as "YYYY-MM-DD HH:MM:SS".'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one record through `self.coll.saves`.'''
        self.coll.saves(result_dic)
        # The result of is_running() is discarded here; the call is kept
        # unchanged from the original implementation.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area name from free text.

        `strs` is passed to `transform` and the printed result is reduced to
        a '-'-separated string. Returns the first two parts joined by '-'
        when a non-empty second part exists, otherwise the first part;
        `pro` is used when nothing was recognised. Returns None if the
        transformation raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r' and '/n' look like typos for '\r' and '\n';
            # '\s' already removes those characters, so the pattern is kept.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Narrowed from a bare `except:` so that BaseException subclasses
            # (KeyboardInterrupt, SystemExit, ...) are no longer swallowed.
            return None
        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _parse_status(title):
        '''Return the notice type matched at the end of `title`, or the generic default.'''
        if not title:
            return '公告'
        match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
        return match.group() if match else '公告'

    @staticmethod
    def _parse_date(texts):
        '''Return the first YYYY-M-D date found in the joined `texts`, or None.'''
        if not texts:
            return None
        match = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(texts))
        # The original called .group() unconditionally and raised
        # AttributeError when the page carried no date, killing the worker.
        return match.group() if match else None

    def load_get_html(self, url):
        '''Download one detail page, extract the notice and store it.

        `url` may be None (ignored). Request and parse failures are printed
        and the URL is skipped.
        '''
        if url is None:
            return
        try:
            # A timeout keeps a stalled connection from blocking a worker.
            response = requests.get(url=url, headers=self.headers,
                                    timeout=10).text
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title = selector.xpath('//div[@class="part_1"]/div[1]/text()')
        title = title[0] if title != [] else None
        status = self._parse_status(title)

        publish_date = self._parse_date(
            selector.xpath('//div[@class="part_1"]/div[2]//text()'))

        # Name the parser explicitly: without it BeautifulSoup picks whatever
        # is installed and emits a warning. lxml is already a dependency of
        # this file (etree), so this is the parser it would select.
        soup = BeautifulSoup(response, 'lxml')
        # NOTE(review): when the element is missing, find() returns None and
        # the literal string 'None' is stored as content below.
        content_html = soup.find(class_='content_wrap')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': self.get_area('海口', title),
            'source': 'http://ggzy.haikou.gov.cn',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '海口公共资源交易网',
            'en_name': 'Hiakou Public resource',
        }

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(retult_dict)

    def load_get(self, data):
        '''Post one list query and queue every detail URL not seen before.

        `data` is the form payload built by `run`. For each entry of the
        response's 'result' list a detail URL is built from the entry's
        'FLAG' and 'KEYID' via `EncodeStr`; URLs not yet in the set are added
        to the set and pushed onto the work list.
        '''
        try:
            params = (('method', 'getSecondTableInfo'), )
            url = 'http://ggzy.haikou.gov.cn/login.do'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params,
                                     data=data,
                                     timeout=10).json()
        except Exception:
            print('load_post error')
            return

        for dic in response['result']:
            key_str = 'flag=3&name=' + dic['FLAG'] + '&key=' + dic['KEYID']
            encodestr = EncodeStr(key_str).encodes()
            urls = 'http://ggzy.haikou.gov.cn/login.do?method=newDetail&param=' + encodestr
            if not self.rq.in_rset(urls):
                self.rq.add_to_rset(urls)
                self.rq.pull_to_rlist(urls)

    def init(self):
        '''Consume queued URLs in greenlet batches while `is_running()` holds.

        The batch size starts at 6 and drops to 1 for good once the queue
        length is at or below it. Exceptions are printed and the loop goes on.
        '''
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start the consumer thread and list every page of every task type.

        A second consumer thread is started after listing when URLs are
        still queued (the first one returns immediately if `is_running()` is
        already False when it starts).
        '''
        threading.Thread(target=self.init).start()
        flag = 3
        task_li = [
            {
                'type': notice_type,
                'all_page': flag
            } for notice_type in ('GC_JY', 'GC_GS', 'GC_JG', 'ZC_JY', 'ZC_JG')
        ]
        count = 1
        for task in task_li:
            for page in range(1, task['all_page'] + 1, count):
                data = [
                    ('currentPage', str(page)),
                    ('pageSize', '20'),
                    ('flag', '3'),
                    ('type', task['type']),
                    ('notice_title', ''),
                ]
                try:
                    self.load_get(data)
                    print('第{}页'.format(page))
                except Exception as e:
                    print(e)

        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point: run the crawl.'''
        self.run()
# Example #24
# 0
class GovBuy(object):
    '''Crawler for the Guiyang public resource trading site (gcjs.gyggzy.cn).

    `load_get` downloads one index page of a section, `load_get_html`
    follows the link of one list entry, extracts the notice and stores it
    through `self.coll.saves`.
    '''
    def __init__(self):
        name = 'guiyang_gcjs_gyggzy_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://www.gcjs.gyggzy.cn/noticeconstruct/index.htm',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='guiyang_gcjs_gyggzy_cn_list1',
                             dbset='guiyang_gcjs_gyggzy_cn_set1')

    def is_running(self):
        '''Return False when `rq.r_len()` is 0 while `rq.rset_info()` is non-empty, else True.'''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hexadecimal MD5 digest of `sign_str` encoded as UTF-8.'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as "YYYY-MM-DD HH:MM:SS".'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one record through `self.coll.saves`.'''
        self.coll.saves(result_dic)
        # The result of is_running() is discarded here; the call is kept
        # unchanged from the original implementation.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive an area name from free text.

        `strs` is passed to `transform` and the printed result is reduced to
        a '-'-separated string. Returns the first two parts joined by '-'
        when a non-empty second part exists, otherwise the first part;
        `pro` is used when nothing was recognised. Returns None if the
        transformation raises.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r' and '/n' look like typos for '\r' and '\n';
            # '\s' already removes those characters, so the pattern is kept.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception:
            # Narrowed from a bare `except:` so that BaseException subclasses
            # (KeyboardInterrupt, SystemExit, ...) are no longer swallowed.
            return None
        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @staticmethod
    def _parse_status(title):
        '''Return the notice type matched at the end of `title`, or the generic default.

        The original pattern wrapped the keyword list in square brackets,
        which makes it a character class: any one or two of the listed
        characters (quotes and commas included) matched, so a title could
        yield a fragment that straddles two keywords. The keywords are now
        a proper alternation.
        '''
        if not title:
            return '公告'
        match = re.search(
            r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
        return match.group() if match else '公告'

    @staticmethod
    def _parse_date(texts):
        '''Return the first YYYY-M-D date found in the joined `texts`, or None.'''
        if not texts:
            return None
        match = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(texts))
        # The original called .group() unconditionally and raised
        # AttributeError on an entry without a date, killing the greenlet
        # and dropping the remaining entries of the page.
        return match.group() if match else None

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages 1..all_page into consecutive batches of at most `count` pages.'''
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(1, all_page + 1, count)
        ]

    def load_get_html(self, div):
        '''Follow the link of one list entry and store the notice it points to.

        `div` is the serialized HTML of a list entry, or None (ignored).
        Title and date come from the entry itself; the content is the first
        `div` with class "text_c" of the detail page. Request and parse
        failures are printed and the entry is skipped; a detail page without
        that element is skipped silently.
        '''
        if div is None:
            return
        try:
            selector_div = etree.HTML(str(div))
            url = selector_div.xpath('//div/div/a/@href')[0]
            # A timeout keeps a stalled connection from blocking the crawl.
            response = requests.get(
                url=url, headers=self.headers,
                timeout=10).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title = selector_div.xpath('//div/div/a/@title')
        title = re.sub(r'\r|\n|\s', '', ''.join(title)) if title != [] else None
        status = self._parse_status(title)

        publish_date = self._parse_date(
            selector_div.xpath('//div/div[2]/text()'))

        table_ele = selector.xpath('//div[@class="text_c"]')
        if table_ele == []:
            return

        content_html = etree.tostring(table_ele[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '贵州-贵阳',
            'source': 'http://www.gcjs.gyggzy.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '贵阳市公共资源交易监管网',
            'en_name': 'Guiyang City Public resource',
        }

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        '''Download index page `page` of section `types` and process each entry.

        `categoryId` is accepted for interface compatibility but not used.
        A failed request is printed and the page is skipped (no retry).
        '''
        try:
            url = 'http://www.gcjs.gyggzy.cn/{}/index_{}.htm'.format(
                types, page)
            response = requests.get(
                url=url, headers=self.headers,
                timeout=10).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        for div_ele in selector.xpath('//div[@class="c1-bline"]'):
            div = etree.tostring(div_ele,
                                 encoding="utf-8",
                                 pretty_print=True,
                                 method="html").decode('utf-8')
            self.load_get_html(div)

    def init(self):
        '''Consume items from `self.rq` in greenlet batches while `is_running()` holds.

        The batch size starts at 6 and drops to 1 for good once the queue
        length is at or below it. Exceptions are printed and the loop goes on.
        '''
        count = 6
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Crawl the index pages of every section, `count` pages at a time.

        Batches never go past a section's `all_page`; the original spawned
        `page + i` unconditionally and therefore requested `index_2.htm`
        for sections configured with a single page.
        '''
        # (section path, number of index pages to crawl)
        section_pages = [
            ('noticeconstruct', 2),
            ('noticeservice', 1),
            ('noticedesign', 1),
            ('noticereconnaissance', 1),
            ('noticequipment', 1),
            ('noticeContracting', 1),
            ('succonstruct', 2),
            ('succservice', 1),
            ('succdesign', 1),
            ('succreconnaissance', 1),
            ('succequipment', 1),
            ('contracting', 1),
            ('buildNewsConstruts', 2),
            ('buindNewsService', 1),
            ('buildNewsDesigner', 1),
            ('buildNewsrecon', 1),
            ('buildNewsEuqip', 1),
            ('buildContracting', 1),
            ('directPub', 1),
        ]
        task_li = [
            {
                'categoryId': '',
                'types': types,
                'all_page': all_page
            } for types, all_page in section_pages
        ]
        count = 2
        for task in task_li:
            for batch in self._page_batches(task['all_page'], count):
                try:
                    categoryId = task['categoryId']
                    types = task['types']
                    spawns = [
                        gevent.spawn(self.load_get, categoryId, types, page)
                        for page in batch
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point: run the crawl.'''
        self.run()
# Example #25
# 0
class GovBuy(object):
    '''Crawler for the Jinan public resource trading site (jnggzy.jinan.gov.cn).

    ``load_get`` posts to the search endpoint and records every detail-page
    URL it finds in ``self.rq``; ``init`` takes URLs back out of ``self.rq``
    and passes them to ``load_get_html``, which parses one notice and stores
    it through ``self.coll``.
    '''

    # Notice kinds accepted directly in front of a trailing "公告" in a title.
    # (An alternation, not a character class, so only whole words match.)
    _STATUS_PATTERN = r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$'
    # Publish date as it appears in the page text, e.g. 2018-06-08.
    _DATE_PATTERN = r'\d{4}-\d+-\d{1,2}'

    def __init__(self):
        '''Create the storage handle, request headers and the URL queue.'''
        name = 'jinan_jngp_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'jnggzy.jinan.gov.cn',
            # Origin/Referer previously contained embedded spaces
            # ('http: // ...'), which made them malformed URLs.
            'Origin': 'http://jnggzy.jinan.gov.cn',
            'Referer':
            'http://jnggzy.jinan.gov.cn/jnggzyztb/front/noticelist.do?type=1&xuanxiang=1&area=',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='jinan_jngp_gov_cn_list1',
                             dbset='jinan_jngp_gov_cn_set1')

    def is_running(self):
        '''Return False once ``rq.r_len()`` is 0 while ``rq.rset_info()`` is non-empty.

        In every other situation the consumer loop is considered live and
        True is returned.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hex MD5 digest of ``sign_str`` (encoded as UTF-8).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Store one parsed record through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; the call is kept because the
        # queue helper is not visible here - confirm it can be dropped.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` label from free text.

        :param pro: province name used as the fallback result.
        :param strs: text handed to ``transform`` for location lookup.
        :return: ``"province-city"`` when a city was recognised, otherwise the
            first recognised component, or ``pro`` when the lookup fails or
            finds nothing.
        '''
        if not strs:
            return pro
        try:
            df = transform([strs], umap={})
            # Strip all whitespace, drop the header/index residue, then turn
            # the province/city suffixes into separators.
            flat = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception as e:
            # Best-effort lookup: report and fall back instead of returning None.
            print('get_area error:{}'.format(e))
            return pro
        if area_str == '':
            return pro
        area_li = area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @classmethod
    def _extract_status(cls, title):
        '''Return the notice kind ending ``title`` (e.g. "招标公告"), else "公告".'''
        match = re.search(cls._STATUS_PATTERN, title) if title else None
        return match.group() if match else '公告'

    @classmethod
    def _extract_publish_date(cls, text_nodes):
        '''Return the first date found in the joined text nodes, or None.'''
        match = re.search(cls._DATE_PATTERN, ''.join(text_nodes))
        return match.group() if match else None

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages 1..all_page into consecutive batches of at most ``count``.'''
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(1, all_page + 1, count)
        ]

    def _enqueue(self, url):
        '''Queue ``url`` for detail crawling unless it was already recorded.'''
        if not self.rq.in_rset(url):
            self.rq.add_to_rset(url)
            self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        '''Download one notice page, parse it and save the resulting record.

        Nothing is saved when ``url`` is None, the download fails, or the page
        has no ``div.list`` content block.
        '''
        if url is None:
            return
        try:
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Guard in case the parser produced no tree for this response.
            return

        title_nodes = selector.xpath('//div[@class="list"]/h1//text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_nodes))
        else:
            title = None
        status = self._extract_status(title)

        publish_date = self._extract_publish_date(
            selector.xpath('//div[@class="list"]/div/span//text()'))

        content_nodes = selector.xpath('//div/div[@class="list"]')
        if not content_nodes:
            return
        content_html = etree.tostring(content_nodes[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '山东-济南',
            'source': 'http://jnggzy.jinan.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '济南公共资源交易中心',
            'en_name': 'Jinan Public resource',
        }
        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        '''Fetch one listing page and queue every notice URL found on it.

        :param categoryId: value sent as ``xuanxiang`` and echoed in detail URLs.
        :param types: value sent as ``type``.
        :param page: value sent as ``pagenum``.
        '''
        try:
            params = {
                'area': '',
                'type': types,
                'xuanxiang': categoryId,
                'subheading': '',
                'pagenum': page,
            }
            url = 'http://jnggzy.jinan.gov.cn/jnggzyztb/front/search.do'
            response = requests.post(url=url,
                                     headers=self.headers,
                                     data=params).json()
            selector = etree.HTML(response['params']['str'])
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        onclick_li = selector.xpath('//ul/li/a/@onclick')
        if onclick_li:
            for onclick in onclick_li:
                # Keep only what sits between the parentheses of the handler.
                iid = re.sub(r'.*?\(|\).*', '', onclick)
                self._enqueue(
                    'http://jnggzy.jinan.gov.cn/jnggzyztb/front/showNotice.do?iid={}&xuanxiang={}'
                    .format(iid, categoryId))
        else:
            # No onclick handlers: fall back to plain relative links.
            for href in selector.xpath('//ul/li/a/@href'):
                self._enqueue('http://jnggzy.jinan.gov.cn' + href)

    def init(self):
        '''Consumer loop: pop queued URLs and crawl them until ``is_running`` is False.'''
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start the consumer thread, then walk every listing page of every task.'''
        threading.Thread(target=self.init).start()
        task_li = [
            {
                'categoryId': '招标公告',
                'types': '1',
                'all_page': 4
            },
            {
                'categoryId': '中标公示',
                'types': '1',
                'all_page': 4
            },
            {
                'categoryId': '变更公告',
                'types': '1',
                'all_page': 4
            },
            {
                'categoryId': '废标公告',
                'types': '1',
                'all_page': 4
            },
        ]
        count = 2
        for task in task_li:
            # Batches never run past all_page, even when it is not a
            # multiple of count.
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, task['categoryId'],
                                     task['types'], page) for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # The first consumer may already have stopped; restart it if URLs remain.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point; delegates to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Anhui government procurement site (www.ahzfcg.gov.cn).

    ``load_get`` fetches one listing page and immediately crawls each notice
    it links to with ``load_get_html``, which parses the notice and stores it
    through ``self.coll``.
    '''

    # Two CJK characters directly in front of a trailing "公告".
    _STATUS_PATTERN = r'[\u4e00-\u9fa5]{2}公告$'
    # Publish date as it appears in the page text, e.g. 2018-06-08.
    _DATE_PATTERN = r'\d{4}-\d+-\d{1,2}'
    # Upper bound on download attempts; replaces unbounded recursive retries.
    _MAX_ATTEMPTS = 3

    def __init__(self):
        '''Create the storage handle, request headers and the URL queue.'''
        name = 'anhui_ahzfcg_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html, */*; q=0.01',
            'Referer': 'http://www.ahzfcg.gov.cn/',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        self.rq = Rdis_Queue(host='localhost',
                             dblist='anhui_list1',
                             dbset='anhui_set1')

    def is_running(self):
        '''Return False once ``rq.r_len()`` is 0 while ``rq.rset_info()`` is non-empty.

        In every other situation True is returned.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hex MD5 digest of ``sign_str`` (encoded as UTF-8).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Store one parsed record through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; the call is kept because the
        # queue helper is not visible here - confirm it can be dropped.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` label from free text.

        :param pro: province name used as the fallback result.
        :param strs: text handed to ``transform`` for location lookup.
        :return: ``"province-city"`` when a city was recognised, otherwise the
            first recognised component, or ``pro`` when the lookup fails or
            finds nothing.
        '''
        if not strs:
            return pro
        try:
            df = transform([strs], umap={})
            # Strip all whitespace, drop the header/index residue, then turn
            # the province/city suffixes into separators.
            flat = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception as e:
            # Best-effort lookup: report and fall back instead of returning None.
            print('get_area error:{}'.format(e))
            return pro
        if area_str == '':
            return pro
        area_li = area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @classmethod
    def _extract_status(cls, title):
        '''Return the notice kind ending ``title`` (e.g. "中标公告"), else "公告".'''
        match = re.search(cls._STATUS_PATTERN, title) if title else None
        return match.group() if match else '公告'

    @classmethod
    def _extract_publish_date(cls, text_nodes):
        '''Return the first date found in the joined text nodes, or None.'''
        match = re.search(cls._DATE_PATTERN, ''.join(text_nodes))
        return match.group() if match else None

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages 1..all_page into consecutive batches of at most ``count``.'''
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(1, all_page + 1, count)
        ]

    def load_get_html(self, url):
        '''Download one notice page, parse it and save the resulting record.

        The download is attempted at most ``_MAX_ATTEMPTS`` times. Nothing is
        saved when ``url`` is None, every attempt fails, or the page has no
        ``div.frameNews`` content block.
        '''
        if url is None:
            return
        selector = None
        for _ in range(self._MAX_ATTEMPTS):
            try:
                response = requests.get(
                    url=url, headers=self.headers).content.decode('utf-8')
                selector = etree.HTML(response)
            except Exception as e:
                print('laod_get_html error:{}'.format(e))
            else:
                if selector is not None:
                    break
        if selector is None:
            return

        title_nodes = selector.xpath('//div[@class="frameNews"]/h1/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', title_nodes[0])
        else:
            title = None
        status = self._extract_status(title)

        publish_date = self._extract_publish_date(
            selector.xpath('//div[@class="source"]/span[1]/text()'))

        content_nodes = selector.xpath('//div[@class="frameNews"]')
        if not content_nodes:
            # A page without the content block must not abort the caller's
            # loop over the remaining notices of the listing page.
            return
        content_html = etree.tostring(content_nodes[0],
                                      encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': self.get_area('安徽', title),
            'source': 'http://www.ahzfcg.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '安徽省政府采购网',
            'en_name': 'Anhui Province Government Procurement',
        }
        self.save_to_mongo(retult_dict)

    def load_get(self, page):
        '''Fetch listing page ``page`` and crawl every notice linked from it.

        The listing request is attempted at most ``_MAX_ATTEMPTS`` times.
        '''
        params = (
            ('pageNum', page),
            ('numPerPage', '20'),
            ('title', ''),
            ('buyer_name', ''),
            ('agent_name', ''),
            ('proj_code', ''),
            ('bid_type', ''),
            ('type', ''),
            ('dist_code', '340000'),
            ('pubDateStart', ''),
            ('pubDateEnd', ''),
            ('pProviceCode', '340000'),
            ('areacode_city', ''),
            ('areacode_dist', ''),
            ('channelCode', 'sjcg_cggg'),
        )
        url = 'http://www.ahzfcg.gov.cn/cmsNewsController/getCgggNewsList.do'
        selector = None
        for _ in range(self._MAX_ATTEMPTS):
            try:
                response = requests.post(url=url,
                                         headers=self.headers,
                                         params=params).text
                selector = etree.HTML(response)
            except Exception as e:
                print('load_get error:{}'.format(e))
            else:
                if selector is not None:
                    break
        if selector is None:
            return

        print('第{}页'.format(page))
        href_li = selector.xpath(
            '//div[@class="zc_content1"]/div[3]/table/tr/td[1]/a/@href')
        for href in href_li:
            self.load_get_html('http://www.ahzfcg.gov.cn/' + href)

    def init(self):
        '''Consumer loop: pop queued URLs and crawl them until ``is_running`` is False.'''
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for _ in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Walk the listing pages 1..all_page of every task, ``count`` at a time.'''
        task_li = [
            {
                'all_page': 3
            },
        ]
        count = 2
        for task in task_li:
            # Batches stop at all_page; the previous stepping also requested
            # page all_page + 1 whenever all_page was odd.
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get, page) for page in pages
                    ]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point; delegates to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Nanchang public resource trading site (ncztb.nc.gov.cn).

    ``load_get`` posts to a category listing page and records every notice URL
    it finds in ``self.rq``; ``init`` takes URLs back out of ``self.rq`` and
    passes them to ``load_get_html``, which parses one notice and stores it
    through ``self.coll``.
    '''

    # Notice kinds accepted directly in front of a trailing "公告" in a title.
    # (An alternation, not a character class, so only whole words match.)
    _STATUS_PATTERN = r'(?:招标|中标|预|采购|更正|结果|补充|询价)公告$'
    # Either eight consecutive digits or a slash-separated date.
    _DATE_PATTERN = r'\d{8}|\d{4}/\d+/\d{1,2}'

    def __init__(self):
        '''Create the storage handle, cookies, request headers and the URL queue.'''
        name = 'nanchang_ncztb_nc_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.cookies = {
            'ASP.NET_SessionId': 'kxgkxo45v04bzs55ie3tib55',
            '__CSRFCOOKIE': 'ad60f543-41c8-481d-b0cf-accadc73c516',
        }

        self.headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://ncztb.nc.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://ncztb.nc.gov.cn/nczbw/jyxx/002001/002001002/MoreInfo.aspx?CategoryNum=002001002',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }

        self.rq = Rdis_Queue(host='localhost', dblist='nanchang_ncztb_nc_gov_cn_list1', dbset='nanchang_ncztb_nc_gov_cn_set1')

    def is_running(self):
        '''Return False once ``rq.r_len()`` is 0 while ``rq.rset_info()`` is non-empty.

        In every other situation the consumer loop is considered live and
        True is returned.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hex MD5 digest of ``sign_str`` (encoded as UTF-8).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Store one parsed record through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; the call is kept because the
        # queue helper is not visible here - confirm it can be dropped.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` label from free text.

        :param pro: province name used as the fallback result.
        :param strs: text handed to ``transform`` for location lookup.
        :return: ``"province-city"`` when a city was recognised, otherwise the
            first recognised component, or ``pro`` when the lookup fails or
            finds nothing.
        '''
        if not strs:
            return pro
        try:
            df = transform([strs], umap={})
            # Strip all whitespace, drop the header/index residue, then turn
            # the province/city suffixes into separators.
            flat = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception as e:
            # Best-effort lookup: report and fall back instead of returning None.
            print('get_area error:{}'.format(e))
            return pro
        if area_str == '':
            return pro
        area_li = area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @classmethod
    def _extract_status(cls, title):
        '''Return the notice kind ending ``title`` (e.g. "招标公告"), else "公告".'''
        match = re.search(cls._STATUS_PATTERN, title) if title else None
        return match.group() if match else '公告'

    @classmethod
    def _extract_publish_date(cls, text_nodes):
        '''Return the first date in the joined text nodes with "/" turned into "-".

        Returns None when no date is present.
        '''
        match = re.search(cls._DATE_PATTERN, ''.join(text_nodes))
        return match.group().replace('/', '-') if match else None

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages 1..all_page into consecutive batches of at most ``count``.'''
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(1, all_page + 1, count)
        ]

    def _enqueue(self, url):
        '''Queue ``url`` for detail crawling unless it was already recorded.'''
        if not self.rq.in_rset(url):
            self.rq.add_to_rset(url)
            self.rq.pull_to_rlist(url)

    def load_get_html(self, url):
        '''Download one notice page, parse it and save the resulting record.

        Nothing is saved when ``url`` is None, the download fails, or the page
        has no ``table#tblInfo`` content block.
        '''
        if url is None:
            return
        try:
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Guard in case the parser produced no tree for this response.
            return

        title_nodes = selector.xpath('//td[@id="tdTitle"]/font//text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', ''.join(title_nodes))
        else:
            title = None
        status = self._extract_status(title)

        publish_date = self._extract_publish_date(
            selector.xpath('//td[@id="tdTitle"]/font[2]//text()'))

        content_nodes = selector.xpath('//table[@id="tblInfo"]')
        if not content_nodes:
            return
        content_html = etree.tostring(content_nodes[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': status,
            'area_name': '江西-南昌',
            'source': 'http://ncztb.nc.gov.cn',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '江西省南昌公共资源交易网',
            'en_name': 'Nanchang Public resource',
        }

        print('列表长度为={}'.format(self.rq.r_len()))

        self.save_to_mongo(retult_dict)

    def load_get(self, categoryId, types, page):
        '''Fetch one category listing page and queue every notice URL on it.

        :param categoryId: path fragment inserted into the listing URL.
        :param types: value sent as the ``CategoryNum`` query parameter.
        :param page: value posted as ``__EVENTARGUMENT``.
        '''
        try:
            params = (
                ('CategoryNum', types),
            )
            data = {
                '__CSRFTOKEN': '/wEFJGFkNjBmNTQzLTQxYzgtNDgxZC1iMGNmLWFjY2FkYzczYzUxNg==',
                '__VIEWSTATE': '',
                '__EVENTTARGET': 'MoreInfoList1$Pager',
                '__EVENTARGUMENT': page
            }
            url = 'http://ncztb.nc.gov.cn/nczbw/jyxx/{}/MoreInfo.aspx'.format(categoryId)
            response = requests.post(url=url, headers=self.headers, params=params, data=data, cookies=self.cookies).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            return

        print('第{}页'.format(page))
        for href in selector.xpath('//table[@id="MoreInfoList1_DataGrid1"]/tr/td[2]/a/@href'):
            self._enqueue('http://ncztb.nc.gov.cn' + href)

    def init(self):
        '''Consumer loop: pop queued URLs and crawl them until ``is_running`` is False.'''
        count = 2
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Start the consumer thread, then walk every listing page of every category.'''
        threading.Thread(target=self.init).start()
        flag = 2
        # Each category id is '<first six digits>/<full code>' of its code.
        category_codes = [
            '002001002', '002001004', '002001005',
            '002002002', '002002005',
            '002003001', '002003004',
            '002009001', '002009004',
            '002004001', '002004002', '002004003', '002004004', '002004005',
            '002005002',
            '002010001', '002010002', '002010004',
        ]
        task_li = [
            {'categoryId': '{}/{}'.format(code[:6], code), 'types': code, 'all_page': flag}
            for code in category_codes
        ]
        count = 1
        for task in task_li:
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, task['categoryId'], task['types'], page) for page in pages]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)
        # Start another consumer when a backlog of more than ten URLs remains.
        if self.rq.r_len() > 10:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point; delegates to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Jiangxi government procurement site (ccgp-jiangxi.gov.cn).

    ``load_get`` queries the JSON list service for one page and immediately
    crawls each returned row with ``load_get_html``, which downloads the
    notice page and stores it through ``self.coll``.
    '''

    # Two CJK characters directly in front of a trailing "公告".
    _STATUS_PATTERN = r'[\u4e00-\u9fa5]{2}公告$'
    # Upper bound on list-request attempts; replaces unbounded recursive retries.
    _MAX_ATTEMPTS = 3

    def __init__(self):
        '''Create the storage handle, request headers, a session and the URL queue.'''
        name = 'jiangxi_ccgp-jiangxi_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://ccgp-jiangxi.gov.cn/web/jyxx/002006/jyxx.html',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }
        self.session = requests.session()

        self.rq = Rdis_Queue(host='localhost', dblist='jiangxi_list1', dbset='jiangxi_set1')

    def is_running(self):
        '''Return False once ``rq.r_len()`` is 0 while ``rq.rset_info()`` is non-empty.

        In every other situation True is returned.
        '''
        drained = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not drained

    def hash_to_md5(self, sign_str):
        '''Return the hex MD5 digest of ``sign_str`` (encoded as UTF-8).'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Store one parsed record through ``self.coll.saves``.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; the call is kept because the
        # queue helper is not visible here - confirm it can be dropped.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` label from free text.

        :param pro: province name used as the fallback result.
        :param strs: text handed to ``transform`` for location lookup.
        :return: ``"province-city"`` when a city was recognised, otherwise the
            first recognised component, or ``pro`` when the lookup fails or
            finds nothing.
        '''
        if not strs:
            return pro
        try:
            df = transform([strs], umap={})
            # Strip all whitespace, drop the header/index residue, then turn
            # the province/city suffixes into separators.
            flat = re.sub(r'\s', '', str(df))
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', flat))
        except Exception as e:
            # Best-effort lookup: report and fall back instead of returning None.
            print('get_area error:{}'.format(e))
            return pro
        if area_str == '':
            return pro
        area_li = area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    @classmethod
    def _extract_status(cls, title):
        '''Return the notice kind ending ``title`` (e.g. "中标公告"), else "公告".'''
        if not isinstance(title, str):
            return '公告'
        match = re.search(cls._STATUS_PATTERN, title)
        return match.group() if match else '公告'

    @staticmethod
    def _detail_url(data_dict):
        '''Build the notice URL from a row's ``categorynum``, ``postdate`` and ``infoid``.'''
        return 'http://ccgp-jiangxi.gov.cn/web/jyxx/002006/{}/{}/{}.html'.format(
            data_dict['categorynum'],
            data_dict['postdate'].replace('-', ''),
            data_dict['infoid'])

    @staticmethod
    def _parse_table(payload):
        '''Decode the service's ``return`` string and give back its ``Table`` rows.

        SECURITY: this text comes from a remote server and used to be passed
        to ``eval()``, which would execute whatever the server sent. It is
        now parsed as JSON, falling back to ``ast.literal_eval`` for
        Python-literal style payloads; neither can run code.

        :raises ValueError, SyntaxError: when the payload is neither form.
        :raises KeyError: when the decoded object has no ``Table`` entry.
        '''
        import ast
        import json
        try:
            parsed = json.loads(payload)
        except ValueError:
            parsed = ast.literal_eval(payload)
        return parsed['Table']

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages 1..all_page into consecutive batches of at most ``count``.'''
        return [
            list(range(start, min(start + count, all_page + 1)))
            for start in range(1, all_page + 1, count)
        ]

    def load_get_html(self, data_dict):
        '''Download the notice described by ``data_dict`` and save it.

        ``data_dict`` is one row of the list service and must provide the keys
        ``categorynum``, ``infoid``, ``postdate`` and ``title``. Nothing is
        saved when the download fails, the server answers 404, or the page
        has no ``div.ewb-detail-box`` block.
        '''
        try:
            publish_date = data_dict['postdate']
            url = self._detail_url(data_dict)
            response = requests.get(url=url, headers=self.headers)
            if response.status_code == 404:
                return
            selector = etree.HTML(response.text)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            # Guard in case the parser produced no tree for this response.
            return

        content_nodes = selector.xpath('//div[@class="ewb-detail-box"]')
        if not content_nodes:
            # A page without the detail box must not abort the caller's loop
            # over the remaining rows.
            return
        content_html = etree.tostring(content_nodes[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        title = data_dict['title']
        retult_dict = {
            '_id': self.hash_to_md5(url),
            'title': title,
            'status': self._extract_status(title),
            'area_name': '江西',
            'source': 'http://ccgp-jiangxi.gov.cn/',
            'publish_date': publish_date,
            'detail_url': url,
            'content_html': str(content_html),
            'create_time': self.now_time(),
            'zh_name': '江西省政府采购网',
            'en_name': 'Jiangxi Province Government Procurement',
        }
        self.save_to_mongo(retult_dict)

    def load_get(self, page):
        '''Fetch list page ``page`` from the JSON service and crawl each row.

        The list request is attempted at most ``_MAX_ATTEMPTS`` times.
        '''
        params = (
            ('response', 'application/json'),
            ('pageIndex', page),
            ('pageSize', '22'),
            ('area', ''),
            ('prepostDate', ''),
            ('nxtpostDate', ''),
            ('xxTitle', ''),
            ('categorynum', '002006'),
        )
        url = 'http://ccgp-jiangxi.gov.cn/jxzfcg/services/JyxxWebservice/getList'
        response = None
        for _ in range(self._MAX_ATTEMPTS):
            try:
                response = requests.get(url=url, headers=self.headers, params=params).json()
            except Exception:
                print('load_post error')
            else:
                break
        if response is None:
            return

        print('第{}页'.format(page))
        try:
            response_li = self._parse_table(response['return'])
        except (KeyError, TypeError, ValueError, SyntaxError) as e:
            print('load_get parse error:{}'.format(e))
            return

        for data_dict in response_li:
            self.load_get_html(data_dict)

    def init(self):
        '''Consumer loop: pop queued items and crawl them until ``is_running`` is False.'''
        count = 8
        while self.is_running():
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for _ in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    def run(self):
        '''Walk the list pages 1..all_page of every task, ``count`` at a time.'''
        task_li = [
            {'all_page': 3},
        ]
        count = 1
        for task in task_li:
            for pages in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, page) for page in pages]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point; delegates to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Suzhou government procurement site.

    ``load_get`` reads the search listing and queues unseen project ids on
    ``self.rq``; ``init`` drains that queue and ``load_get_html`` scrapes each
    project page and stores the result through ``self.coll``.
    '''
    def __init__(self):
        '''Create the storage handle, request headers, session and work queue.'''
        name = 'suzhou_zfcg_suzhou_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Origin': 'http://www.zfcg.suzhou.gov.cn',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'http://www.zfcg.suzhou.gov.cn/html/search.shtml?title=&choose=&projectType=0&zbCode=&appcode=',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
        }

        # Kept as an attribute; the methods below call requests directly.
        self.session = requests.session()

        # Project helper holding a work list and a "seen" set
        # (presumably Redis-backed -- confirm against Rdis_Queue).
        self.rq = Rdis_Queue(host='localhost', dblist='suzhou_list1', dbset='suzhou_set1')

    def is_running(self):
        '''Return False once the work list is empty while the seen-set is not.'''
        finished = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not finished

    def hash_to_md5(self, sign_str):
        '''Return the hex MD5 digest of ``sign_str`` encoded as UTF-8.'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one scraped record through the storage handle.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; the call is kept only because
        # side effects of the queue helper cannot be ruled out from here.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` style area name from free text.

        Args:
            pro: Fallback area name used when nothing is recognised in ``strs``.
            strs: Text to analyse with ``transform``.

        Returns:
            The first two dash-separated parts when a second part exists,
            otherwise the first part; None when the lookup itself fails.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r' and '/n' match literal slash sequences; they
            # were probably meant as \r and \n, which \s already covers.
            area_str = re.sub(r'省|市', '-', re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception as e:
            # Best-effort lookup: report the problem instead of hiding it.
            print('get_area error:{}'.format(e))
            return None

        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, pid):
        '''Download one project page, extract its fields and store them.

        Args:
            pid: Project id used to build the detail URL; None is ignored.
        '''
        if pid is None:
            return
        try:
            url = 'http://www.zfcg.suzhou.gov.cn/html/project/' + pid + '.shtml'
            response = requests.get(url=url, headers=self.headers).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return
        if selector is None:
            print('laod_get_html error:{}'.format('empty document ' + url))
            return

        title_nodes = selector.xpath('//div[@class="M_title"]/text()')
        if title_nodes:
            title = re.sub(r'\r|\n|\s', '', title_nodes[0])
            # Status is the trailing "..公告" of the title when present.
            status_match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            status = status_match.group() if status_match else '公告'
        else:
            title = None
            status = '公告'

        _id = self.hash_to_md5(url)

        # A missing or malformed date yields None instead of raising.
        date_nodes = selector.xpath('//div[@class="date"]/span/text()')
        date_match = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(date_nodes))
        publish_date = date_match.group() if date_match else None

        area_name = '江苏-苏州'
        source = 'http://www.zfcg.suzhou.gov.cn/'

        content_nodes = selector.xpath('//div[@id="tab1"]')
        if not content_nodes:
            # Previously an IndexError killed the greenlet here; the record is
            # still not saved, but the failure is now reported cleanly.
            print('laod_get_html error:{}'.format('no div#tab1 in ' + url))
            return
        content_html = etree.tostring(
            content_nodes[0], encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '苏州市政府采购网'
        result_dict['en_name'] = 'Suzhou City Government Procurement'

        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result_dict)

    def load_get(self, types, page, retries=3):
        '''Fetch one listing page and queue every project id not seen before.

        Args:
            types: Value sent as the ``type`` form field.
            page: Value sent as the ``page`` form field.
            retries: Maximum number of request attempts before the page is
                skipped (the request used to be retried by unbounded
                recursion).
        '''
        data = [
            ('title', ''),
            ('choose', ''),
            ('type', types),
            ('zbCode', ''),
            ('appcode', ''),
            ('page', page),
            ('rows', '30'),
        ]
        url = 'http://www.zfcg.suzhou.gov.cn/content/searchContents.action'

        response = None
        for _ in range(retries):
            try:
                response = requests.post(url=url, headers=self.headers, data=data).json()
            except Exception as e:
                print('load_post error')
                print(e)
            else:
                break
        if response is None:
            print('load_get gave up: type={} page={}'.format(types, page))
            return

        print('第{}页'.format(page))
        response_li = response['rows']
        if not response_li:
            return

        for project in response_li:
            pid = project['PROJECTID']
            if not self.rq.in_rset(pid):
                self.rq.add_to_rset(pid)
                self.rq.pull_to_rlist(pid)

    def init(self):
        '''Drain the work queue, scraping items in gevent batches.'''
        count = 3
        while self.is_running():
            # Near the end of the backlog, process one item per pass.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [gevent.spawn(self.load_get_html, self.rq.get_to_rlist()) for i in range(count)]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages 1..all_page into consecutive batches of at most ``count``.'''
        pages = list(range(1, all_page + 1))
        return [pages[start:start + count] for start in range(0, len(pages), count)]

    def run(self):
        '''Start the queue consumer, crawl every listing task, then restart
        the consumer if items are still queued.'''
        threading.Thread(target=self.init).start()
        task_li = [
                {'type':'0', 'all_page': 2},
                {'type':'1', 'all_page': 2},
                {'type':'2', 'all_page': 2},
            ]
        count = 3
        for task in task_li:
            types = task['type']
            # Batches never go past all_page (the old stepping requested up to
            # count - 1 extra pages per task).
            for batch in self._page_batches(task['all_page'], count):
                try:
                    spawns = [gevent.spawn(self.load_get, types, page) for page in batch]
                    gevent.joinall(spawns)
                except Exception as e:
                    print(e)

        # The first consumer thread may already have exited; start another one
        # when work is still queued.
        if self.rq.r_len() > 0:
            threading.Thread(target=self.init).start()

    def main(self):
        '''Entry point of the crawler: delegates to ``run``.'''
        self.run()
class GovBuy(object):
    '''Crawler for the Wuhan government procurement site.

    ``run`` walks the notice index pages, ``load_get`` extracts the ``<li>``
    entries of one index page and ``load_get_html`` scrapes and stores the
    notice each entry links to.
    '''
    def __init__(self):
        '''Create the storage handle, request headers, session and work queue.'''
        name = 'wuhan_cgb_wuhan_gov_cn'
        self.coll = StorageSetting(name)
        self.collection = self.coll.find_collection

        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'http://cgb.wuhan.gov.cn/notice/zbgg//index_2.html',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh,zh-CN;q=0.9',
        }
        # Kept as an attribute; the methods below call requests directly.
        self.session = requests.session()

        # Project helper holding a work list and a "seen" set
        # (presumably Redis-backed -- confirm against Rdis_Queue).
        self.rq = Rdis_Queue(host='localhost',
                             dblist='wuhan_list1',
                             dbset='wuhan_set1')

    def is_running(self):
        '''Return False once the work list is empty while the seen-set is not.'''
        finished = self.rq.r_len() == 0 and len(self.rq.rset_info()) > 0
        return not finished

    def hash_to_md5(self, sign_str):
        '''Return the hex MD5 digest of ``sign_str`` encoded as UTF-8.'''
        return hashlib.md5(sign_str.encode('utf-8')).hexdigest()

    def now_time(self):
        '''Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``.'''
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def save_to_mongo(self, result_dic):
        '''Persist one scraped record through the storage handle.'''
        self.coll.saves(result_dic)
        # NOTE(review): the result is discarded; the call is kept only because
        # side effects of the queue helper cannot be ruled out from here.
        self.is_running()

    def get_area(self, pro, strs):
        '''Derive a ``province-city`` style area name from free text.

        Args:
            pro: Fallback area name used when nothing is recognised in ``strs``.
            strs: Text to analyse with ``transform``.

        Returns:
            The first two dash-separated parts when a second part exists,
            otherwise the first part; None when the lookup itself fails.
        '''
        try:
            df = transform([strs], umap={})
            # NOTE(review): '/r' and '/n' match literal slash sequences; they
            # were probably meant as \r and \n, which \s already covers.
            area_str = re.sub(
                r'省|市', '-',
                re.sub(r'省市区0', '', re.sub(r'/r|/n|\s', '', str(df))))
        except Exception as e:
            # Best-effort lookup: report the problem instead of hiding it.
            print('get_area error:{}'.format(e))
            return None

        area_li = [pro] if area_str == '' else area_str.split('-')
        if len(area_li) >= 2 and area_li[1] != '':
            return '-'.join(area_li[:2])
        return area_li[0]

    def load_get_html(self, li):
        '''Scrape the notice linked from one index entry and store it.

        Args:
            li: One ``<li>`` entry of an index page; it is converted with
                ``str()`` and re-parsed, so any object whose string form is
                that markup works.
        '''
        try:
            selector_li = etree.HTML(str(li))
            url = 'http://cgb.wuhan.gov.cn' + selector_li.xpath(
                '//li/a/@href')[0]
            response = requests.get(
                url=url, headers=self.headers).content.decode('utf-8')
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            return

        title_nodes = selector_li.xpath('//li/a/text()')
        title = title_nodes[0] if title_nodes else None

        status_nodes = selector_li.xpath('//li/div/span[3]/font/text()')
        status = str(status_nodes[0]) if status_nodes else None

        _id = self.hash_to_md5(url)

        # A missing or malformed date yields None instead of raising.
        publish_date_li = selector_li.xpath('//li/span/text()')
        date_match = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(publish_date_li))
        publish_date = date_match.group() if date_match else None

        area_name = '武汉'
        source = 'http://cgb.wuhan.gov.cn/'

        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed; lxml is already used by this class.
        soup = BeautifulSoup(response, 'lxml')
        content_html = soup.find(class_='art_con')

        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '武汉政府采购网'
        result_dict['en_name'] = 'Wuhan Government Procurement'

        print(result_dict)

        self.save_to_mongo(result_dict)

    def load_get(self, url, retries=3):
        '''Fetch one index page and scrape every notice it lists.

        Args:
            url: Address of the index page.
            retries: Maximum number of request attempts before the page is
                skipped (the request used to be retried by unbounded
                recursion).
        '''
        soup = None
        for _ in range(retries):
            try:
                response = requests.post(
                    url=url, headers=self.headers).content.decode('utf-8')
                print(response)
                soup = BeautifulSoup(response, 'lxml')
            except Exception as e:
                print('load_post error')
                print(e)
            else:
                break
        if soup is None:
            print('load_get gave up: {}'.format(url))
            return

        ul = soup.find(class_="news-list-content list-unstyled")
        if ul is None:
            # E.g. an error page: there is no list to walk.
            print('load_get: no notice list in {}'.format(url))
            return
        for li in ul.find_all('li'):
            self.load_get_html(li)

    def init(self):
        '''Drain the work queue, scraping items in gevent batches.'''
        count = 8
        while self.is_running():
            # Near the end of the backlog, process one item per pass.
            if self.rq.r_len() <= count:
                count = 1
            try:
                spawns = [
                    gevent.spawn(self.load_get_html, self.rq.get_to_rlist())
                    for i in range(count)
                ]
                gevent.joinall(spawns)
            except Exception as e:
                print(e)

    @staticmethod
    def _page_batches(all_page, count):
        '''Split pages 1..all_page into consecutive batches of at most ``count``.'''
        pages = list(range(1, all_page + 1))
        return [pages[start:start + count] for start in range(0, len(pages), count)]

    def run(self):
        '''Crawl every configured notice category, ``count`` pages at a time.'''
        task_li = [
            {
                'url': 'http://cgb.wuhan.gov.cn/notice/zbgg//index_',
                'all_page': 3
            },
            {
                'url': 'http://cgb.wuhan.gov.cn/notice/cggg/index_',
                'all_page': 3
            },
            {
                'url': 'http://cgb.wuhan.gov.cn/notice/gzgg/index_',
                'all_page': 3
            },
            {
                'url': 'http://cgb.wuhan.gov.cn/notice/fbgg/index_',
                'all_page': 3
            },
            {
                'url': 'http://cgb.wuhan.gov.cn/notice/dylygg/index_',
                'all_page': 2
            },
            {
                'url': 'http://cgb.wuhan.gov.cn/notice/qtgg/index_',
                'all_page': 2
            },
            {
                'url': 'http://cgb.wuhan.gov.cn/notice/jkcpgg/index_',
                'all_page': 1
            },
            {
                'url': 'http://cgb.wuhan.gov.cn/notice/dzscgg/index_',
                'all_page': 2
            },
            {
                'url': 'http://cgb.wuhan.gov.cn/contract/index_',
                'all_page': 2
            },
        ]
        count = 3
        for task in task_li:
            # Batches never go past all_page (the old stepping requested
            # index_3.html even for tasks configured with 1 or 2 pages).
            for batch in self._page_batches(task['all_page'], count):
                try:
                    spawns = [
                        gevent.spawn(self.load_get,
                                     task['url'] + str(page) + '.html')
                        for page in batch
                    ]
                    gevent.joinall(spawns)
                    print('第{}页'.format(batch[0]))
                except Exception as e:
                    print(e)

    def main(self):
        '''Entry point of the crawler: delegates to ``run``.'''
        self.run()