Ejemplo n.º 1
0
    def load_get(self, page, retries=3):
        """Fetch one page of zycg.gov.cn search results and dispatch rows.

        Args:
            page: 1-based page number of the article search.
            retries: remaining retry attempts on network/parse failure;
                bounds what was previously unbounded recursion.
        """
        try:
            params = {
                'category_id': '',
                'keyword': '',
                'page': str(page),
                'px': '2'
            }

            url = 'http://www.zycg.gov.cn/article/article_search'
            proxies = proxy_pool.proxies()
            # POST with `params` sends them in the query string, matching
            # the endpoint's original behaviour.
            response = requests.post(url=url,
                                     headers=self.headers,
                                     params=params,
                                     proxies=proxies,
                                     timeout=10).text
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get error:{}'.format(e))
            if retries > 0:  # bounded retry instead of unbounded recursion
                self.load_get(page, retries - 1)
        else:
            li_ele_li = selector.xpath('//ul[@class="lby-list"]/li')
            print('第{}页'.format(page))
            for li_ele in li_ele_li:
                # Re-serialise each <li> so the detail parser gets raw HTML.
                li = etree.tostring(li_ele,
                                    encoding="utf-8",
                                    pretty_print=True,
                                    method="html").decode('utf-8')
                self.load_get_html(li)
Ejemplo n.º 2
0
 def load_get(self, colid, page, retries=3):
     """Fetch one Jinan listing page and dispatch each table row.

     Args:
         colid: column/category id forwarded to the site.
         page: 1-based page number ('curpage').
         retries: remaining retry attempts on failure; bounds what was
             previously unbounded recursion.
     """
     try:
         params = (
             ('curpage', page),
             ('colid', colid),
         )
         url = 'http://119.164.253.173:8080/jngp2016/site/list.jsp'
         proxies = proxy_pool.proxies()
         # gb18030 is a superset of the site's GBK-family encoding.
         response = requests.get(url=url,
                                 headers=self.headers,
                                 params=params,
                                 proxies=proxies,
                                 timeout=10).content.decode('gb18030')
         selector = etree.HTML(response)
     except Exception as e:
         print('load_get error:{}'.format(e))
         if retries > 0:  # bounded retry instead of unbounded recursion
             self.load_get(colid, page, retries - 1)
     else:
         print('第{}页'.format(page))
         try:
             li_ele_li = selector.xpath('//table[@class="list"]/tr')
         except Exception:
             # selector is None when the body was empty/unparseable.
             return
         for li_ele in li_ele_li:
             tr = etree.tostring(li_ele,
                                 pretty_print=True,
                                 encoding='utf-8',
                                 method='html').decode('utf-8')
             self.load_get_html(tr)
Ejemplo n.º 3
0
 def load_get(self, page, retries=3):
     """Fetch one page of the Chongqing notices API and queue new ids.

     Args:
         page: 1-based page index ('pi' query parameter).
         retries: remaining retry attempts on failure; bounds what was
             previously unbounded recursion.
     """
     try:
         params = (
             ('pi', page),
             ('ps', '20'),
             # Cache-busting timestamp in milliseconds.
             ('timestamp', str(int(time.time() * 1000))),
         )
         proxies = proxy_pool.proxies()
         url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable'
         response = requests.get(url=url,
                                 headers=self.headers,
                                 params=params,
                                 proxies=proxies,
                                 timeout=5).json()
     except Exception as e:
         print('load_get error:{}'.format(e))
         if retries > 0:  # bounded retry instead of unbounded recursion
             self.load_get(page, retries - 1)
     else:
         print('第{}页'.format(page))
         response_li = response['notices']
         for data_dict in response_li:
             pid = data_dict['id']
             # Deduplicate against the redis set before queueing the id
             # for the detail fetcher.
             if not self.rq.in_rset(pid):
                 self.rq.add_to_rset(pid)
                 self.rq.pull_to_rlist(pid)
Ejemplo n.º 4
0
    def load_get(self, categoryId, page, retries=3):
        """Fetch one page of the Sichuan GGZY list API and dispatch rows.

        Args:
            categoryId: kept for caller compatibility; not sent in the
                request, only threaded through the retry call.
            page: 1-based page number.
            retries: remaining retry attempts on failure; bounds what was
                previously unbounded recursion.
        """
        try:
            params = (
                ('keywords', ''),
                ('times', '4'),
                ('timesStart', ''),
                ('timesEnd', ''),
                ('area', ''),
                ('businessType', ''),
                ('informationType', ''),
                ('industryType', ''),
                ('page', page),
                # Cache-busting timestamp in milliseconds.
                ('parm', str(int(time.time() * 1000))),
            )
            url = 'http://www.scggzy.gov.cn/Info/GetInfoListNew'
            proxies = proxy_pool.proxies()
            # timeout added for consistency with the other loaders; without
            # it a dead proxy hangs the worker indefinitely.
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params,
                                    proxies=proxies,
                                    timeout=10).json()
        except Exception as e:
            print('load_get error:{}'.format(e))
            if retries > 0:  # bounded retry instead of unbounded recursion
                self.load_get(categoryId, page, retries - 1)
        else:
            print('第{}页'.format(page))
            # 'data' is itself a JSON-encoded string, hence the second parse.
            response_li = json.loads(response['data'])
            for data_dic in response_li:
                self.load_get_html(data_dic)
Ejemplo n.º 5
0
    def load_get_html(self, data_dict):
        """Fetch and store the detail page for one Zhejiang notice.

        Args:
            data_dict: item from the list API; must contain 'id', 'title',
                'typeName', 'pubDate' (epoch ms) and 'districtName'.
        """
        try:
            proxies = proxy_pool.proxies()
            params = {
                'noticeId': data_dict['id'],
                'url': 'http://notice.zcy.gov.cn/new/noticeDetail',
            }
            url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/getRemoteResults'
            # timeout added so a dead proxy cannot hang the worker.
            response = requests.get(url=url,
                                    headers=self.headers,
                                    params=params,
                                    proxies=proxies,
                                    timeout=10)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = data_dict['title']
            status = data_dict['typeName']

            # NOTE(review): _id hashes response.url (endpoint + query string)
            # while detail_url below stores the bare endpoint — confirm this
            # asymmetry is intended.
            _id = self.hash_to_md5(response.url)

            # pubDate is epoch milliseconds.
            publish_date = time.strftime(
                "%Y-%m-%d", time.localtime(int(data_dict['pubDate']) / 1000))

            area_name = data_dict['districtName']

            source = 'http://www.zjzfcg.gov.cn/'
            try:
                content_html = response.json()['noticeContent']
            except Exception:
                # Non-JSON body or missing key: skip this notice.
                return

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '浙江政府采购网'
            retult_dict['en_name'] = 'Zhejiang government Procurement'

            print(retult_dict)

            self.save_to_mongo(retult_dict)
Ejemplo n.º 6
0
 def load_get(self, base_url, page, retries=3):
     """Fetch one static listing page and dispatch each detail URL.

     Args:
         base_url: section root URL ending with '/'.
         page: 0 for the index page, otherwise N for 'index_N.html'.
         retries: retry attempts on failure (the original had none and
             silently dropped the page).
     """
     try:
         if page == 0:
             url = base_url
         else:
             url = base_url + 'index_' + str(page) + '.html'
         proxies = proxy_pool.proxies()
         # timeout added so a dead proxy cannot hang the worker.
         response = requests.get(url=url, headers=self.headers,
                                 proxies=proxies,
                                 timeout=10).content.decode('utf-8')
         selector = etree.HTML(response)
     except Exception as e:
         # Bare except narrowed and the message fixed: this is load_get,
         # and the original swallowed the actual error.
         print('load_get error:{}'.format(e))
         if retries > 0:
             self.load_get(base_url, page, retries - 1)
     else:
         print('第{}页'.format(page))
         url_li = selector.xpath('//div[@class="R_cont_detail"]/ul/li/a/@href')
         for url in url_li:
             # Links are relative ('./...'); rebase onto the section root.
             urls = base_url + url.replace('./', '')
             print(urls)
             self.load_get_html(urls)
Ejemplo n.º 7
0
 def load_get(self, categoryId, types, page, retries=3):
     """Fetch one XJBT listing page and dispatch each detail URL.

     Args:
         categoryId: path segment selecting the notice category.
         types: unused here; kept for caller compatibility.
         page: value of the 'Paging' query parameter.
         retries: remaining retry attempts on failure; bounds what was
             previously unbounded recursion.
     """
     try:
         params = (('Paging', page), )
         proxies = proxy_pool.proxies()
         url = 'http://ggzy.xjbt.gov.cn/TPFront/jyxx/{}/'.format(categoryId)
         # timeout added so a dead proxy cannot hang the worker.
         response = requests.get(url=url,
                                 headers=self.headers,
                                 params=params,
                                 proxies=proxies,
                                 timeout=10).content.decode('utf-8')
         selector = etree.HTML(response)
     except Exception as e:
         print('load_get error:{}'.format(e))
         if retries > 0:  # bounded retry instead of unbounded recursion
             self.load_get(categoryId, types, page, retries - 1)
     else:
         print('第{}页'.format(page))
         url_li = selector.xpath(
             '//td[@class="border"]/div/table/tr/td/a/@href')
         for url in url_li:
             # Hrefs are site-relative; rebase onto the host.
             urls = 'http://ggzy.xjbt.gov.cn' + url
             self.load_get_html(urls)
Ejemplo n.º 8
0
 def load_get(self, page, retries=3):
     """Fetch one page of the Zhejiang notice-search proxy API.

     Args:
         page: value for 'pageNo'.
         retries: remaining retry attempts on failure; bounds what was
             previously unbounded recursion.
     """
     try:
         params = {
             'pageSize': '15',
             'pageNo': page,
             'url': 'http://notice.zcy.gov.cn/new/noticeSearch',
             'noticeType': '0',
         }
         url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/getRemoteResults'
         proxies = proxy_pool.proxies()
         response = requests.get(url=url,
                                 headers=self.headers,
                                 params=params,
                                 proxies=proxies,
                                 timeout=5).json()
     except Exception as e:
         # Message fixed: this method is load_get, not load_post.
         print('load_get error:{}'.format(e))
         if retries > 0:  # bounded retry instead of unbounded recursion
             self.load_get(page, retries - 1)
     else:
         print('第{}页'.format(page))
         response_li = response['articles']
         for data_dict in response_li:
             self.load_get_html(data_dict)
Ejemplo n.º 9
0
    def load_get_html(self, url):
        """Fetch one XJBT detail page, extract fields and store to mongo.

        Args:
            url: absolute detail-page URL; ignored when None.
        """
        print(url)
        if url is None:
            return
        try:
            proxies = proxy_pool.proxies()
            response = requests.get(url=url,
                                    headers=self.headers,
                                    proxies=proxies).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            print(url)
        else:
            title = selector.xpath('//td[@id="tdTitle"]/font[1]/b/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                # Fixed: the original ["招标","中标",...]{1,2} was a character
                # class, matching any 1-2 of those characters (including the
                # quote/comma chars). Alternation matches the intended words.
                match = re.search(
                    r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath(
                '//td[@id="tdTitle"]/font[2]//text()')
            if publish_date != []:
                # Accept '20190101' or '2019/1/01'; normalise '/' to '-'.
                # Guarded: the original crashed with AttributeError when no
                # date was present.
                date_match = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',
                                       ''.join(publish_date))
                if date_match:
                    publish_date = re.sub(r'\/', '-', date_match.group())
                else:
                    publish_date = None
            else:
                publish_date = None

            area_name = '新疆-建设兵团'

            source = 'http://ggzy.xjbt.gov.cn/'

            table_ele = selector.xpath('//table[@id="tblInfo"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                # No recognisable content table: skip this notice.
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '新疆生产建设兵团公共资源交易信息网'
            retult_dict[
                'en_name'] = 'Xinjiang Construction Corps Public resource'
            print(retult_dict)

            self.save_to_mongo(retult_dict)
Ejemplo n.º 10
0
    def load_get_html(self, tr):
        """Parse one Jinan listing row, fetch its detail page, store to mongo.

        Args:
            tr: HTML string of a single <tr> from the listing table;
                ignored when None.
        """
        if tr is None:
            return
        try:
            selector_li = etree.HTML(str(tr))
            tr_li = selector_li.xpath('//tr/td[2]/a/@href')[0]
            url = 'http://119.164.253.173:8080' + tr_li
            proxies = proxy_pool.proxies()
            # gb18030 is a superset of the site's GBK-family encoding.
            response = requests.get(url=url,
                                    headers=self.headers,
                                    proxies=proxies,
                                    timeout=10).content.decode('gb18030')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector_li.xpath('//tr/td[2]/a/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                # Guarded search replaces try/except around .group().
                match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector_li.xpath('//tr/td/text()')
            if publish_date != []:
                # The matched group cannot contain brackets, so the original
                # re.sub(r'\[|\]', ...) was a no-op; guarded so a missing
                # date no longer raises AttributeError.
                date_match = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                       ''.join(publish_date))
                publish_date = date_match.group() if date_match else None
            else:
                publish_date = None

            area_name = '山东-济南'

            source = 'http://jncz.jinan.gov.cn/'

            try:
                table_ele = selector.xpath('//body/table')
            except Exception:
                # selector is None when the detail body was unparseable.
                return
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                return

            content_html = etree.tostring(table_ele,
                                          encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '济南市财政局'
            retult_dict['en_name'] = 'Jinan Finance Bureau'

            self.save_to_mongo(retult_dict)
Ejemplo n.º 11
0
    def load_get_html(self, pid, retries=3):
        """Fetch one Chongqing notice by id and store it to mongo.

        Args:
            pid: notice id from the list API; ignored when None.
            retries: remaining retry attempts on failure; bounds what was
                previously unbounded recursion.
        """
        if pid is None:
            return
        try:
            proxies = proxy_pool.proxies()
            url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable/{}'.format(
                pid)
            response = requests.get(url=url,
                                    headers=self.headers,
                                    proxies=proxies,
                                    timeout=10).json()
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            if retries > 0:  # bounded retry instead of unbounded recursion
                self.load_get_html(pid, retries - 1)
        else:
            title = response['notice']['title']
            try:
                status = response['notice']['projectPurchaseWayName']
            except KeyError:
                status = '公告'

            _id = self.hash_to_md5(url)

            # issueTime is a string; the original compared it to [] (always
            # true) and crashed with AttributeError when no date matched.
            publish_date = response['notice']['issueTime']
            date_match = (re.search(r'(\d{4}\-\d+\-\d{1,2})', publish_date)
                          if publish_date else None)
            publish_date = date_match.group() if date_match else None

            area_name = '重庆'

            source = 'https://www.cqgp.gov.cn/'

            content_html = response['notice']['html']

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '重庆市政府采购网'
            retult_dict['en_name'] = 'Chongqing City Government Procurement'

            print('列表长度为={}'.format(self.rq.r_len()))

            self.save_to_mongo(retult_dict)
Ejemplo n.º 12
0
    def load_get_html(self, url):
        """Fetch one Nanjing detail page, extract fields and store to mongo.

        Args:
            url: absolute detail-page URL; ignored when None.
        """
        if url is None:
            return
        try:
            proxies = proxy_pool.proxies()
            # timeout added so a dead proxy cannot hang the worker.
            response = requests.get(url=url, headers=self.headers,
                                    proxies=proxies,
                                    timeout=10).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="title"]/h1/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', title[0])
                # Guarded search replaces try/except around .group().
                match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//div[@class="extra"]/text()')
            if publish_date != []:
                # Guarded: the original crashed when no date was present.
                date_match = re.search(r'(\d{4}\-\d+\-\d+)',
                                       ''.join(publish_date))
                publish_date = date_match.group() if date_match else None
            else:
                publish_date = None

            area_name = '江苏-南京'

            source = 'http://www.njgp.gov.cn/'

            # The notice body lives in child divs 2-4 of div.cont.
            table_ele_li = selector.xpath('//div[@class="cont"]/div')
            content_html = ''
            for table_ele in table_ele_li[1:4]:
                content_html += etree.tostring(table_ele, encoding="utf-8",
                                               pretty_print=True,
                                               method="html").decode('utf-8')

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '南京市政府采购网'
            retult_dict['en_name'] = 'Nanjing City Government Procurement'

            self.save_to_mongo(retult_dict)
Ejemplo n.º 13
0
    def load_get_html(self, li):
        """Fetch the zycg.gov.cn detail page for one listing <li>, store it.

        Args:
            li: HTML string of a single listing <li>; ignored when None.
        """
        # Fixed: the None guard now precedes use — the original called
        # etree.HTML(li) before checking, so None raised before the guard.
        if li is None:
            return
        sele_li = etree.HTML(li)
        time.sleep(0.5)  # throttle detail requests
        try:
            url_li = sele_li.xpath('//li/a/@href')
            url = 'http://www.zycg.gov.cn' + url_li[0]
            proxies = proxy_pool.proxies()
            response = requests.get(url=url,
                                    headers=self.headers,
                                    proxies=proxies,
                                    timeout=10).text
            selector = etree.HTML(response)
            # Some notices expose a print-preview page with the full body;
            # follow it when offered.
            if '打印预览' in response:
                url_li = selector.xpath('//span[@id="btnPrint"]/a/@href')
                url = 'http://www.zycg.gov.cn' + url_li[0]
                response = requests.get(url=url,
                                        headers=self.headers,
                                        proxies=proxies,
                                        timeout=10).text
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
            # Requeue the raw item so it can be retried later.
            self.rq.pull_to_rlist(li)
        else:
            print(url)
            title = sele_li.xpath('//li/a/@title')
            if title != []:
                title = re.sub(r'\r|\n|\s', '', ''.join(title))
            else:
                title = ''
            # Guarded search replaces try/except around .group().
            match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            status = match.group() if match else '公告'

            _id = self.hash_to_md5(url)

            publish_date = sele_li.xpath('//li/span/text()')
            if publish_date != []:
                publish_date = re.sub(r'\r|\n|\s|\[|\]', '',
                                      ''.join(publish_date))
            else:
                publish_date = ''

            print(publish_date, title)

            # NOTE(review): no explicit parser — bs4 picks the best one
            # installed, which can differ between machines; consider
            # passing 'html.parser' explicitly.
            soup = BeautifulSoup(response)

            # Fall through the known container candidates for the body.
            content_html = soup.find(class_="detail_gg")
            if content_html is None:
                content_html = soup.find(class_='frame-pane')
            if content_html is None:
                content_html = soup.find(name='Frm_Order')
            if content_html is None:
                print(content_html)
                return

            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = '中央'
            retult_dict['source'] = 'http://www.zycg.gov.cn/'

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '中央政府采购网'
            retult_dict['en_name'] = 'Central Government Procurement'

            print('列表长度为={}'.format(self.rq.r_len()))
            self.save_to_mongo(retult_dict)
Ejemplo n.º 14
0
    def load_get_html(self, data_dic):
        """Fetch one Sichuan GGZY detail page and store it to mongo.

        Args:
            data_dic: list-API item with 'Link', 'Title' and
                'CreateDateStr'; ignored when None.
        """
        if data_dic is None:
            return
        try:
            url = 'http://www.scggzy.gov.cn' + data_dic['Link']
            proxies = proxy_pool.proxies()
            # timeout added so a dead proxy cannot hang the worker.
            response = requests.get(url=url, headers=self.headers,
                                    proxies=proxies,
                                    timeout=10).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('laod_get_html error:{}'.format(e))
        else:
            title = data_dic['Title']
            if title != '':
                title = re.sub(r'\r|\n|\s', '', title)
                # Fixed: the original ["招标","预",...]{1,2} was a character
                # class, matching any 1-2 of those characters (including the
                # quote/comma chars). Alternation matches the intended words.
                match = re.search(r'(招标|预|采购|更正|结果|补充)公告$', title)
                status = match.group() if match else '公告'
            else:
                title = None
                status = '公告'

            _id = self.hash_to_md5(url)

            # CreateDateStr is already the display date string.
            publish_date = data_dic['CreateDateStr']

            area_name = self.get_area('四川', title)

            source = 'http://www.scggzy.gov.cn'

            table_ele = selector.xpath('//div[@class="ChangeMidle"]')
            if table_ele != []:
                table_ele = table_ele[0]
            else:
                # No recognisable content container: skip this notice.
                return

            content_html = etree.tostring(table_ele, encoding="utf-8",
                                          pretty_print=True,
                                          method="html").decode('utf-8')
            retult_dict = dict()
            retult_dict['_id'] = _id
            retult_dict['title'] = title
            retult_dict['status'] = status
            retult_dict['area_name'] = area_name
            retult_dict['source'] = source

            retult_dict['publish_date'] = publish_date

            retult_dict['detail_url'] = url
            retult_dict['content_html'] = str(content_html)
            retult_dict['create_time'] = self.now_time()
            retult_dict['zh_name'] = '四川省公共资源交易平台'
            retult_dict['en_name'] = 'Sichuan Public resource'

            self.save_to_mongo(retult_dict)