def get_detail_info(url, headers, proxies, ipdata, msg):
    childresponse = get_response_get(url, headers, proxies, ipdata, msg)
    if childresponse:
        print('Detail page:', url)
        selector = etree.HTML(childresponse.text, etree.HTMLParser())
        supplier_info = {'source_url': url}
        try:
            # Main info: the spans alternate label/value, so walk them in pairs.
            brief_info = selector.xpath(
                '//div[@class="dljgContainer"]/div[@class="dljg_infor"]/div/ul/li/p/span')
            for i in range(0, len(brief_info) - 1, 2):
                title = brief_info[i].xpath('text()')[0].strip()
                value = brief_info[i + 1].xpath('text()')[0].strip()
                supplier_info[title] = value
        except Exception as e:
            fail_time = trans_date_str(datetime.datetime.now())
            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed for this URL: <{e}>\n')
        # Basic information: the page holds three tables.
        try:
            table_info = selector.xpath('//div[@class="byf_table_jibenxinxi"]/table[1]')
            for table_ele in table_info:
                tr_ele = table_ele.xpath('tr')
                if len(tr_ele) > 1:
                    for row in tr_ele:
                        detail_info = row.xpath('td/text()')
                        if len(detail_info) > 1:
                            # Cells alternate label/value within a row.
                            for j in range(0, len(detail_info) - 1, 2):
                                supplier_info[detail_info[j]] = detail_info[j + 1]
            table_info_type2 = selector.xpath(
                '//div[@class="byf_table_jibenxinxi"]/table[position()>1]')
            for table_ele in table_info_type2:
                tr_ele = table_ele.xpath('tr')
                if len(tr_ele) > 1:
                    # The first row holds the column headers; later rows hold the values.
                    titles = [td.xpath('text()')[0] for td in tr_ele[0].xpath('td')]
                    for value_ele in tr_ele[1:]:
                        values = [td.xpath('text()')[0] if td.xpath('text()') else ''
                                  for td in value_ele.xpath('td')]
                        supplier_info.update(dict(zip(titles, values)))
        except Exception as e:
            fail_time = trans_date_str(datetime.datetime.now())
            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed for this URL: <{e}>\n')
        with open('government_procurement/neimenggu/supplier.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(supplier_info, ensure_ascii=False))
            f.write(',')
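
# A minimal driver sketch for get_detail_info. IpData, get_response_get and
# trans_date_str come from the surrounding project; the headers dict and the
# URL below are placeholders, not values from the original code.
ipdata = IpData()
msg = ipdata.get_ipdata()        # pick one proxy record from the pool
proxies = ipdata.get_proxy(msg)  # shape it into requests' proxies dict
headers = {'User-Agent': 'Mozilla/5.0'}  # hypothetical minimal headers
get_detail_info('http://example.com/supplier_detail', headers, proxies, ipdata, msg)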

# Example no. 2

def get_response(url, proxies, ipdata, msg, trynum=0):
    if trynum == 10:  # 10 consecutive IP rotations all failed
        return None
    try:
        response = requests.get(url=url,
                                headers=headers,  # module-level headers dict
                                proxies=proxies,
                                timeout=20,
                                allow_redirects=False)
        if response.status_code == 200:
            return response
        elif response.status_code == 404:
            ipdata.update_ipdata(msg)  # drop the current IP
            msg = ipdata.get_ipdata()
            proxies = ipdata.get_proxy(msg)
            return get_response(url, proxies, ipdata, msg, trynum=trynum + 1)
        else:
            fail_time = trans_date_str(datetime.datetime.now())
            with open('failed_url.txt', 'a', encoding='utf-8') as f:
                f.write(
                    f'{fail_time}:<{url}>: fetching this site failed: <{response.status_code}>\n'
                )
            return None
    except (requests.exceptions.ProxyError, requests.exceptions.Timeout):
        # The proxy refused the connection or timed out: rotate to a fresh IP.
        ipdata.update_ipdata(msg)  # drop the current IP
        msg = ipdata.get_ipdata()
        proxies = ipdata.get_proxy(msg)
        return get_response(url, proxies, ipdata, msg, trynum=trynum + 1)
    except Exception as e:
        print(f'Error while crawling page <{url}>: <{e}>')
        ipdata.update_ipdata(msg)  # drop the current IP
        msg = ipdata.get_ipdata()
        proxies = ipdata.get_proxy(msg)
        return get_response(url, proxies, ipdata, msg, trynum=trynum + 1)
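
# A hedged usage sketch for the retry wrapper above: on 404s, proxy errors and
# timeouts it rotates to a fresh proxy up to 10 times, then returns None.
# IpData is assumed from the other examples; the URL is a placeholder.
ipdata = IpData()
msg = ipdata.get_ipdata()
proxies = ipdata.get_proxy(msg)
page = get_response('http://example.com/list', proxies, ipdata, msg)
if page is not None:
    print(page.status_code, len(page.text))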

# Example no. 4

def get_content(url, bid_id, proj_id):  # fetch the full content for one announcement url
    ipdata = IpData()
    msg = ipdata.get_ipdata()
    proxies = ipdata.get_proxy(msg)
    create_time = datetime.datetime.now()
    create_time = trans_date_str(create_time)
    pattern = re.compile('(style.*/style)', re.I)  # used below to strip inline <style> blocks

    def deal_response_content(response):
        def deal_table(selector):  # extract the summary table
            info_addr = "ABSTRACT"
            table_list = selector.xpath('//div[@class="table"]/table/tr/td')
            if len(table_list) > 0:  # the target elements were found
                title_list = selector.xpath(
                    '//div[@class="table"]/table/tr/td[contains(@class, "title")]'
                )
                titles = []
                values = []
                for ele in table_list:
                    if ele in title_list:
                        titles.append(ele.xpath('text()')[0])
                    elif len(titles) > len(values):
                        value = ele.xpath('text()')
                        values.append(value[0] if value else '')
                href_list = selector.xpath(
                    '//div[@class="table"]/table/tr/td/a[contains(@title, "点击下载")]'
                )
                file_url_list = []
                if len(href_list) > 0:
                    for href_ele in href_list:
                        url_suffix = href_ele.xpath('@id')[0]
                        file_url = "/oss/download?uuid={}".format(
                            url_suffix)  # 目前前缀应该为 http://www.ccgp.gov.cn
                        file_url_list.append(file_url)
                    for i in range(1, len(file_url_list) + 1):
                        values[-i] = file_url_list[-i]
                table = dict(zip(titles, values))
                build_bid_info_data = _format_bid_info(table, bid_id,
                                                       info_addr, create_time)
                file_attach_ele = selector.xpath(
                    '//a[@class="bizDownload"]')  # attachments listed in the summary
                files = []
                if len(file_attach_ele) > 0:
                    for file in file_attach_ele:
                        file_name = file.xpath('text()')[0].strip()
                        file_url = file.xpath('@id')[0]
                        file_id = get_uuid()
                        files.append(
                            (file_name, file_url, file_id, bid_id, proj_id))
                return build_bid_info_data, files
            return [], []  # no summary table on this page; keeps the unpack at the call site safe

        def deal_content(selector):  # extract the body text
            content_list = selector.xpath(
                '//div[@class="vF_detail_content"]//text()')
            content_value = ''
            if len(content_list) > 0:  # the target elements were found
                for content in content_list:
                    # the '***' markers injected below stand in for paragraph breaks
                    content = content.replace('***',
                                              '\r\n').replace('&nbsp;', ' ')
                    if not content.strip().startswith('<'):
                        content_value += content
            file_attach_ele = selector.xpath(
                "//a[contains(@ignore, '1')]")  # attachments inside the body
            files = []
            if len(file_attach_ele) > 0:
                for file in file_attach_ele:
                    file_name = file.xpath('text()')
                    if len(file_name) > 0:
                        file_name = file_name[0]
                        file_url = file.xpath('@href')[0]
                        if file_url == 'javascript:;':
                            continue
                        if file_url == '':
                            file_url = file.xpath('@id')[0]
                        file_id = get_uuid()
                        files.append(
                            (file_name, file_url, file_id, bid_id, proj_id))

            origin_bid_text_data = (0, bid_id, content_value, 1, 0,
                                    create_time, operator)
            return origin_bid_text_data, files

        # web_encoding is a module-level constant; re-decode the mis-decoded payload
        response_text = response.text.encode(web_encoding).decode("utf-8")
        # inject '***' markers at block ends so paragraph breaks survive text extraction
        response_text = response_text.replace('</p>', '***</p>').replace(
            '</h>', '***</h>').replace('<br>', '***<br>')
        style_value = pattern.findall(response_text)
        if len(style_value) > 0:
            for style in style_value:
                response_text = response_text.replace(style, '')
        baseSelector = etree.HTML(response_text, etree.HTMLParser())
        bid_info_data_table, file_table = deal_table(baseSelector)  # summary
        bid_text_data, file_text = deal_content(baseSelector)  # body text
        file_table.extend(file_text)
        return bid_info_data_table, bid_text_data, file_table

    # print('current url:', url)
    response = get_response(url, proxies, ipdata, msg)
    if response:
        return deal_response_content(response)
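
# A hedged usage sketch for get_content: it returns (summary rows, body-text
# tuple, attachment tuples), or None when the page could not be fetched.
# get_uuid comes from the project; the URL below is a placeholder.
result = get_content('http://example.com/notice.htm', get_uuid(), get_uuid())
if result:
    bid_info_rows, bid_text_row, attachments = result
    print(len(attachments), 'attachments collected')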

# Example no. 5

def deal_list_page(response):
    create_time = datetime.datetime.now()
    create_time = trans_date_str(create_time)
    info_addr = 'LIST'
    selector = etree.HTML(response.text, etree.HTMLParser())
    url_list_href = selector.xpath(
        '//ul[@class="vT-srch-result-list-bid"]/li/a')
    ori_list_href = selector.xpath(
        '//ul[@class="vT-srch-result-list-bid"]/li/span')
    proj_bid_list = []
    origin_bid_data = []  # accumulated and flushed one page at a time
    origin_bid_info_data = []
    origin_text_data_list = []
    file_attach_list = []
    for i in range(len(url_list_href)):
        url_href = url_list_href[i]
        ori_href = ori_list_href[i]
        url = url_href.xpath("@href")[0]  # announcement link
        bid_title = url_href.xpath("text()")[0].strip()  # announcement title
        proj_id = get_uuid()  # project id  # TODO: also persist a (project id, project title) table
        bid_id = get_uuid()
        ori_info = ori_href.xpath('text()')
        ori_parts = ori_info[0].strip().split('|')
        release_time = ori_parts[0].strip()  # publication time
        purchasing_agent = ori_parts[1].strip().split(':')[1]  # purchaser
        agency = ori_parts[2].strip().split(':')[1]  # procurement agency
        strong_info = ori_href.xpath('strong//text()')
        bid_type = strong_info[0].strip().split('|')[0]  # announcement type
        project_type = strong_info[1].strip().split('|')[0]  # project type
        region = ori_href.xpath('a//text()')  # region
        if len(region) != 0:
            region = region[0]
        content_result = get_content(url, bid_id, proj_id)
        if content_result is None:  # the detail page could not be fetched; skip it
            continue
        origin_bid_info_data_table, origin_bid_text_data, files_attach = content_result
        proj_bid_list.append((proj_id, bid_id, create_time, operator))
        origin_bid_data.append((0, bid_id, bid_title, bid_type, source,
                                url, create_time, operator))
        bid_info_dict = {
            'release_time': release_time,
            'purchasing_agent': purchasing_agent,
            'agency': agency,
            'bid_type': bid_type,
            'project_type': project_type,
            'region': region
        }

        origin_bid_info_data_list = _format_bid_info(
            bid_info_dict, bid_id, info_addr, create_time)
        origin_bid_info_data.extend(origin_bid_info_data_list)
        origin_bid_info_data.extend(origin_bid_info_data_table)
        origin_text_data_list.append(origin_bid_text_data)
        file_attach_list.extend(files_attach)
    proj_bid_data = _format_r_projbid(proj_bid_list)
    rprojbid = RProjBid()  # r_proj_bid
    rprojbid.insertmany(proj_bid_data)
    origin_bid_info = OriginBidInfo()  # t_origin_bid_info
    origin_bid_info.insertmany(tuple(origin_bid_info_data))
    origin_bid_text = OriginBidText()  # t_origin_bid_text
    origin_bid_text.insertmany(tuple(origin_text_data_list))
    origin_bid = OriginBid()  # t_origin_bid
    print(len(origin_bid_data), 'announcements collected on the current page')
    origin_bid.insertmany(tuple(origin_bid_data))  # insert into t_origin_bid
    file_attach_data = _format_file_attach(file_attach_list, create_time,
                                           operator)
    if len(file_attach_data) > 0:
        tfileattach = TFileAttach()  # t_file_attach
        tfileattach.insertmany(file_attach_data)
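
# A hedged driver sketch for deal_list_page, assuming get_response from the
# earlier example and a placeholder search URL. Note deal_list_page performs
# database inserts, so the wrapper classes (RProjBid, OriginBid, ...) must be
# importable and configured for this to run.
ipdata = IpData()
msg = ipdata.get_ipdata()
proxies = ipdata.get_proxy(msg)
list_response = get_response('http://example.com/search?page=1', proxies, ipdata, msg)
if list_response:
    deal_list_page(list_response)
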
    def get_agency(self):

        def get_agency_info(childresponse):  # extract info from the agency detail page
            childselector = etree.HTML(childresponse.text, etree.HTMLParser())
            tr_eleselector = childselector.xpath('/html/body/div[2]/table[1]/tbody/tr')

            def get_dict(tr):  # collect {label: value} pairs from one row of the detail page
                names = tr.xpath('th')
                if not names:
                    return None
                values = tr.xpath('td/text()')
                return {names[i].xpath('text()')[0].strip():
                            (values[i].strip() if i < len(values) else '')
                        for i in range(len(names))}
            if tr_eleselector:
                supplier_info = list(map(get_dict, tr_eleselector))
                return supplier_info
            else:
                print(childresponse.request.url, 'failed to extract data from this agency page')

        ipdata = IpData()
        msg = ipdata.get_ipdata()
        proxies = ipdata.get_proxy(msg)
        form_data = {
            'pointPageIndexId': '1',
            'pageIndex': '1',  # the maximum page count is read from the first page
            'pageSize': '10',
        }
        # max_page = self.get_max_page(self.agency_url, proxies, form_data, ipdata, msg)
        headers['referer'] = 'http://www.ccgp-guangdong.gov.cn/organization/queryPerformOrgList.do'
        # headers['cookie'] = 'Ks8ae9gdPofpF0yrRJi1UrDsaM-hm8uARsgRyaj46O9l8dsmqJyJ!-1509577578'
        headers['cookie'] = 'hlce-4C6jnLFpch9x2dUya_0eBJR--2owaXh62fo9E2FQRFQfWrf!-1509577578'
        # for page in range(19, max_page+1):
        for page in range(28, 114):  # hard-coded resume range for this crawl run
            print('Agency page {}'.format(page))
            form_data['pageIndex'] = page
            # headers['Content-Length'] = str(len(form_data))
            response = get_response_post(self.agency_url, headers, proxies, form_data, ipdata, msg, trynum=0)
            print('List page:', form_data)
            if response:  # parse agency detail URLs from the list page
                selector = etree.HTML(response.text, etree.HTMLParser())
                childurls = selector.xpath('//td[@align="center"]/a')
                unduplicate_childurls = []
                for childurl in childurls:
                    childurl = self.base_url + childurl.xpath('@href')[0]
                    if childurl not in unduplicate_childurls:
                        unduplicate_childurls.append(childurl)
                        childresponse = get_response_get(childurl, headers, proxies, ipdata, msg)
                        print('Detail page:', childurl)
                        if childresponse:
                            try:
                                supplier_info = get_agency_info(childresponse)
                                if supplier_info is not None:
                                    supplier_info.append({'url_source': childurl})
                                    with open('government_procurement/guangdong/agency.txt', 'a', encoding='utf-8') as f:
                                        f.write(json.dumps(supplier_info, ensure_ascii=False))
                                        f.write(',')
                            except Exception as e:
                                fail_time = trans_date_str(datetime.datetime.now())
                                with open('failed_url.txt', 'a', encoding='utf-8') as f:
                                    f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed for this URL: <{e}>\n')
                            finally:
                                time.sleep(3)
                        else:
                            print('Failed to fetch the detail page')
                            time.sleep(3)

            else:
                print('Failed to parse the agency list page')
                continue
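
# A hedged, self-contained sketch of the pagination pattern used in get_agency:
# POST each page index, collect the detail links, and deduplicate with a set
# (the original uses a list, which gets slow on large runs). get_response_post
# comes from the surrounding project; list_url and base_url are caller-supplied.
def iter_detail_urls(list_url, base_url, headers, proxies, ipdata, msg, pages):
    seen = set()
    for page in pages:
        form_data = {'pointPageIndexId': '1',
                     'pageIndex': str(page),
                     'pageSize': '10'}
        response = get_response_post(list_url, headers, proxies, form_data,
                                     ipdata, msg, trynum=0)
        if not response:
            continue
        selector = etree.HTML(response.text, etree.HTMLParser())
        for a in selector.xpath('//td[@align="center"]/a'):
            url = base_url + a.xpath('@href')[0]
            if url not in seen:
                seen.add(url)
                yield url
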
    def get_suppliers(self):

        def get_supplier_info(childresponse):  # extract info from the supplier detail page
            childselector = etree.HTML(childresponse.text, etree.HTMLParser())
            try:
                tr_eleselector = childselector.xpath('/html/body/div[2]/table/tbody/tr')

                def get_dict(tr):  # collect {label: value} pairs from one row of the detail page
                    names = tr.xpath('th')
                    if not names:
                        return None
                    values = tr.xpath('td/text()')
                    return {names[i].xpath('text()')[0].strip():
                                (values[i].strip() if i < len(values) else '')
                            for i in range(len(names))}

                supplier_info = list(map(get_dict, tr_eleselector))
                return supplier_info
            except Exception as e:
                fail_time = trans_date_str(datetime.datetime.now())
                with open('failed_url.txt', 'a', encoding='utf-8') as f:
                    f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed for this URL: <{e}>\n')
                return None

        ipdata = IpData()
        msg = ipdata.get_ipdata()
        proxies = ipdata.get_proxy(msg)
        form_data = {
            'pointPageIndexId': '1',
            'pageIndex': '1',  # the maximum page count is read from the first page
            'pageSize': '10',
        }
        max_page = self.get_max_page(self.supplier_url, proxies, form_data, ipdata, msg)
        headers['referer'] = 'http://www.ccgp-guangdong.gov.cn/organization/querySellerOrgList.do'
        headers['cookie'] = 'Ks8ae9gdPofpF0yrRJi1UrDsaM-hm8uARsgRyaj46O9l8dsmqJyJ!-1509577578'
        for page in range(178, max_page + 1):  # resume from page 178 and run to the last page
            print('Supplier page {}'.format(page))
            form_data['pageIndex'] = page
            # headers['Content-Length'] = str(len(form_data))
            response = get_response_post(self.supplier_url, headers, proxies, form_data, ipdata, msg, trynum=0)
            if response:  # parse supplier detail URLs from the list page
                selector = etree.HTML(response.text, etree.HTMLParser())
                childurls = selector.xpath('//div[@class="m_m_cont"]//tr/td[3]/a')
                real_childurls = []
                for childurl in childurls:
                    childurl = self.base_url + childurl.xpath('@href')[0]
                    if childurl not in real_childurls:
                        real_childurls.append(childurl)
                        childresponse = get_response_get(childurl, headers, proxies, ipdata, msg)
                        if childresponse:
                            print('Detail page:', childurl)
                            try:
                                supplier_info = get_supplier_info(childresponse)
                                if supplier_info is not None:
                                    supplier_info.append({'url_source': childurl})
                                    with open('government_procurement/guangdong/supplier.txt', 'a', encoding='utf-8') as f:
                                        f.write(json.dumps(supplier_info, ensure_ascii=False))
                                        f.write(',')
                            except Exception as e:
                                fail_time = trans_date_str(datetime.datetime.now())
                                with open('failed_url.txt', 'a', encoding='utf-8') as f:
                                    f.write(f'{fail_time}:<{childresponse.request.url}>: xpath parsing failed for this URL: <{e}>\n')
                        else:
                            print('Failed to fetch the detail page')
                        time.sleep(3)
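
# A hedged post-processing sketch: the scrapers above append one JSON value plus
# a trailing comma per record, so supplier.txt/agency.txt are not valid JSON on
# their own. Stripping the last comma and wrapping in brackets recovers a list.
def load_records(path='government_procurement/guangdong/supplier.txt'):
    with open(path, encoding='utf-8') as f:
        return json.loads('[' + f.read().rstrip().rstrip(',') + ']')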