Example #1
class CorpNamePipeline(object):
    redis_tools = RedisTools()

    def __init__(self):
        self.cnt = 1

    def process_item(self, item_contains, spider):
        for item in item_contains['item_contains']:
            spider_name = spider.name

            compass_name = item['compass_name']
            detail_link = item['detail_link']
            out_province = item['out_province']  # 'None' means in-province; otherwise holds the name of the outside province

            if detail_link is None or detail_link.upper() in ('NONE', ''):
                finger = compass_name  # no usable link, fall back to the company name as fingerprint
            else:
                finger = detail_link  # use the detail link as fingerprint
            self.redis_tools.store_finger(ALL_FINGER_CONTAINS, finger)
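            # pack name, link and province flag into one '##'-delimited record, keyed by the spider name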
            common_info = '##'.join([compass_name, detail_link, out_province])
            self.redis_tools.store_finger(spider_name, common_info)
            print('---------%d个信息item被保存' % self.cnt)
            self.cnt += 1
        return item_contains

    def close_spider(self, spider):
        """爬虫结束后执行(一次)"""
        self.db = None
        self.conn = None
        print('关闭爬虫,本次一次存了 %d个信息item' % self.cnt)
        pass
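RedisTools itself is not shown in these examples; a minimal sketch of what its fingerprint helpers might look like, assuming one Redis set per collection (only the method names store_finger/check_finger come from the calls above, the rest is an assumption):

class RedisTools(object):
    """Hypothetical helper: keeps crawl fingerprints in Redis sets for de-duplication."""

    def __init__(self, host='localhost', port=6379, db=0):
        import redis  # redis-py client
        self.client = redis.StrictRedis(host=host, port=port, db=db)

    def store_finger(self, name, finger):
        # remember that `finger` has been seen under the collection `name`
        self.client.sadd(name, finger)

    def check_finger(self, finger, name='all_fingers'):
        # True if the fingerprint was stored before, i.e. the page/company was already crawled
        return self.client.sismember(name, finger)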
Example #2
class YunNanSpider(scrapy.Spider):
    name = 'yun_nan_spider'
    allowed_domains = ['220.163.15.148']
    start_urls = ['http://220.163.15.148/InfoQuery/EnterpriseList?page=1']
    redis_tools = RedisTools()
    LOG_FILE = 'logs/{}_{}_{}_{}.log'.format(name, now_date_time.year,
                                             now_date_time.month,
                                             now_date_time.day)
    log_path = os.path.join(os.path.abspath('..'), LOG_FILE)
    log_dir = os.path.dirname(log_path)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    def parse(self, response):
        url = response.url
        line_links = response.xpath(
            '//tbody/tr/td[@class="left"]/a/@href').extract()
        line_links = ['http://220.163.15.148' + link for link in line_links]
        for link in line_links:
            is_crawled = self.redis_tools.check_finger(
                link, name=ALL_FINGER_CONTAINS)
            if is_crawled:
                print('%s已经抓取过!' % link)
                logging.info('%s has already been crawled!', link)
                continue
            print('parse detail page info.....')
            headers = {
                'Referer':
                url,
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            }
            yield scrapy.Request(url=link,
                                 callback=self.parse_detail,
                                 headers=headers)
        # pagination
        ss = response.xpath(
            '//div[@class="jump fl"]/span[1]/text()').extract_first()
        [total_line, per_page] = re.findall(r'\d+', ss)  # total record count, rows per page
        total_page = (int(total_line) // int(per_page) + 1) if int(
            total_line) % int(per_page) else int(total_line) // int(per_page)
        next_page_num = int(response.meta.get('cur_page_num', '1')) + 1
        if next_page_num > total_page:
            logging.info('不能继续翻页啦, 当前是第{}页,已经是最后一页啦'.format(next_page_num))
            return
        link = 'http://220.163.15.148/InfoQuery/EnterpriseList?page={}'
        next_link = link.format(next_page_num)
        # print '下一页...', next_link
        yield scrapy.Request(next_link,
                             callback=self.parse,
                             meta={'cur_page_num': next_page_num})

    def parse_detail(self, response):
        url = response.url
        compass_items = self.get_company_info(response)
        quality_items = self.get_qualification_info(response)
        # quality_items = self.get_project_info(response)
        yield JianzhuprojectItem({
            'compass_items': compass_items,
            'qualification_items': None,
            'project_items': None,
            'staff_items': None,
            'change_items': None,
            'behavior_items': None,
            'crawl_time': self.fmt_time(),
            'source_link': url,
            'compass_name': compass_items[0]['compass_name'],
            'honor_code': compass_items[0]['honor_code'],
            # 'quality_link': url,
            # 'project_link': url,
            # 'staff_link': url,
            'other': None,
        })

    def get_company_info(self, response):
        compass_name = ''.join(
            response.xpath(
                '//div[@class="tLayer-1"]/h3/text()').extract()).strip()
        honor_code, register_capital = response.xpath(
            '//div[@class="tLayer-1"]/table/tr[1]/td[not(@class)]/text()'
        ).extract()
        honor_code = 'None' if len(honor_code) < 7 else honor_code
        representive = ''.join(
            response.xpath(
                '//div[@class="tLayer-1"]/table/tr[2]/td[not(@class)][1]/text()'
            ).extract())
        compass_type = response.xpath(
            '//div[@class="tLayer-1"]/table/tr[3]/td[not(@class)]/text()'
        ).extract()[0]
        establish_time = ''.join(
            response.xpath(
                '//div[@class="tLayer-1"]/table/tr[4]/td[not(@class)][2]/text()'
            ).extract()).strip()
        provice = ''.join(
            response.xpath(
                '//div[@class="tLayer-1"]/table/tr[5]/td[not(@class)][2]/text()'
            ).extract())
        operating_addr = ''.join(
            response.xpath(
                '//div[@class="tLayer-1"]/table/tr[6]/td[not(@class)][1]/text()'
            ).extract())
        company_item = CompassItem({  # the Item class validates the keys
            'compass_name': compass_name,
            'compass_link': response.url,
            'honor_code': honor_code,  # unified social credit code
            'representative': representive,  # legal representative
            'compass_type': compass_type,  # company type
            'provice': provice,
            'operating_addr': operating_addr,  # operating address
            'establish_time': establish_time,
            'register_capital': register_capital,
            'net_asset': None,
        })
        # print company_item
        return [company_item]

    def get_qualification_info(self, response):
        pass

    def get_project_info(self, response):
        pass

    def fmt_time(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def get_headers(self):
        headers = {
            'Referer':
            'http://220.163.15.148/InfoQuery/EnterpriseList?page=769',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        return headers
Example #3
class ShangHaiCompass(BaseCompass):
    name = 'shanghai_compass'
    allow_domain = ['']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [(
        'http://www.ciac.sh.cn/SHCreditInfoInterWeb/CreditBookAnounce/GetQyCreditReportAll?page=-1',
        sit_list[0])]

    extract_dict = {
        'inner': {
            'nodes': '//table[contains(@class, "tablelist")]/tbody/tr',
            'cname': './td[2]/text()',
            'detail_link': 'None',
        },
    }

    redis_tools = RedisTools()

    def start_requests(self):
        for link, _ in self.start_urls:
            yield scrapy.Request(link,
                                 callback=self.parse_list,
                                 meta={'cur_page': '1'},
                                 dont_filter=True)

    def parse_list(self, response):
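        # the endpoint returns JSON whose 'resultdata' field is an HTML fragment; parse that fragment with lxml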

        data = json.loads(response.text)['resultdata']
        html = etree.HTML(data)
        ext_rules = self.extract_dict['inner']
        nodes = html.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname'])[0])
            item['detail_link'] = 'None'
            item['out_province'] = 'waisheng'
            if self.redis_tools.check_finger(item['compass_name']):
                print(u'{}已经爬取过'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}

        total_page_num = html.xpath('//label[@id="zongyeshu"]/text()')[0]
        meta = response.meta
        if int(total_page_num) > int(meta['cur_page']):
            print(u'当前页码:{}'.format(meta['cur_page']))
            yield self.turn_page(response)
        else:
            print(u'不能再翻页了, 当前最大页码:{}'.format(meta['cur_page']))
            return

    def turn_page(self, response):
        meta = response.meta
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        meta['cur_page'] = str(int(meta['cur_page']) + 1)

        return scrapy.FormRequest(response.url,
                                  formdata=formdata,
                                  callback=self.parse_list,
                                  headers=headers,
                                  meta=meta)

    def handle_cname(self, cname, flag='inner'):
        return cname.replace('企业基本信息', '').strip('\n\t\r ')

    def handle_cdetail_link(self, link, flag='inner', url=''):
        if 'javascript:window' in link:
            import re
            pp = re.compile(r"\('(.*?)'\)")
            return 'http://218.14.207.72:8082/PublicPage/' + re.search(
                pp, link).group(1)
        if link.startswith('.'):
            return link.replace('.',
                                'http://zjj.jiangmen.gov.cn/public/licensing')
        else:
            return 'http://www.stjs.org.cn/xxgk/' + link

    def get_form_data(self, resp):
        meta = resp.meta
        formdata = {
            'mainZZ': '0',
            'aptText': '',
            'areaCode': '0',
            'entName': '',
            'pageSize': '10',
            'pageIndex': str(meta['cur_page']),
        }
        return formdata
Example #4
class SiChuanCompass(BaseCompass):
    name = 'sichuan_compass'
    allow_domain = ['xmgk.scjst.gov.cn']
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'ITEM_PIPELINES': {'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300, }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        ('http://xmgk.scjst.gov.cn/QueryInfo/Ente/EnteList.aspx', sit_list[0])
    ]

    extract_dict = {
        'inner': {
            'nodes': '//table[contains(@class, "list")]//tr[position()>1]',
            'cname': './/a[contains(@href, "EnteZsxx") and @title]/@title',
            'detail_link': './/a[contains(@href, "EnteZsxx") and @title]/@href',
            # detail link is relative: 'http://xmgk.scjst.gov.cn/QueryInfo/Ente/' + href
            'next_page_flag': u'//a[@disabled="disabled" and contains(text(), "下页")]/text()',
        },
        '__VIEWSTATE': '//input[@id="__VIEWSTATE"]/@value',
        '__EVENTVALIDATION': '//input[@id="__EVENTVALIDATION"]/@value',

    }

    redis_tools = RedisTools()

    def start_requests(self):
        for link, _ in self.start_urls:
            yield scrapy.Request(link, callback=self.parse_list, meta={'cur_page': '1'},
                                 dont_filter=True)

    def parse_list(self, response):

        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []

        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'waisheng'
            if self.redis_tools.check_finger(item['compass_name']):
                print(u'{}已经爬取过'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}

        next_page_flag = response.xpath(ext_rules['next_page_flag'])
        meta = response.meta
        if not next_page_flag:
            print(u'当前页码:{}'.format(meta['cur_page']))
            yield self.turn_page(response)
        else:
            print(u'不能再翻页了, 当前最大页码:{}'.format(meta['cur_page']))
            return

    def turn_page(self, response):
        meta = response.meta
        headers = self.get_header(response.url, flag='2')
        if int(meta['cur_page']) % 10:
            time.sleep(random.random() * 4)
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        formdata = self.get_form_data(response)
        return scrapy.FormRequest(response.url, formdata=formdata, callback=self.parse_list, headers=headers, meta=meta)

    def handle_cdetail_link(self, link, flag='inner', url=''):
        if link.startswith('.'):
            return link.replace('.', 'http://xmgk.scjst.gov.cn/QueryInfo/Ente/')
        else:
            return 'http://xmgk.scjst.gov.cn/QueryInfo/Ente/' + link

    def get_form_data(self, resp):
        meta = resp.meta
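        # ASP.NET WebForms postback: __VIEWSTATE/__EVENTVALIDATION are echoed back from the current
        # page, __EVENTTARGET names the pager control and __EVENTARGUMENT carries the target page number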
        formdata = {
            '__VIEWSTATE': resp.xpath(self.extract_dict['__VIEWSTATE']).extract_first(),
            '__EVENTVALIDATION': resp.xpath(self.extract_dict['__EVENTVALIDATION']).extract_first(),
            '__EVENTARGUMENT': meta['cur_page'],  # cur_page was already advanced to the next page in turn_page
            '__VIEWSTATEGENERATOR': 'E1A883C9',
            '__EVENTTARGET': 'ctl00$mainContent$gvPager',
            'ctl00$mainContent$txt_entname': '',
            'ctl00$mainContent$lx114': '',
            'ctl00$mainContent$cxtj': '',
            'UBottom1:dg1': '',
            'UBottom1:dg2': '',
            'UBottom1:dg3': '',
            'UBottom1:dg4': '',
            'UBottom1:dg5': '',
            'UBottom1:dg6': '',
        }
        return formdata
Example #5
class LiaoLinCompass(BaseCompass):
    name = 'liaolin_compass'
    allow_domain = ['218.60.144.163']
    custom_settings = {
        'ITEM_PIPELINES': {'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300, }
    }
    start_urls = [
        'http://218.60.144.163/LNJGPublisher/corpinfo/CorpInfo.aspx',
    ]
    log_file = '../logs/{}_log.log'.format(name)
    redis_tools = RedisTools()

    inner_extract_dict = {
        'nodes': '//div[@id="div_Province"]//tr[@class="odd" or @class="even"]',
        'cname': './td[contains(@class, "company_name")]/@title',
        'detail_link': './td[contains(@class, "company_name")]/a[contains(@onclick, "OpenCorpDetail")]/@onclick',
        'out_province': 'None',

        '__VIEWSTATE': '//input[@id="__VIEWSTATE"]/@value',
        '__EVENTVALIDATION': '//input[@id="__EVENTVALIDATION"]/@value',
    }

    outer_extract_dict = {
        'nodes': '//div[@id="div_outCast"]//tr[@class="odd" or @class="even"]',
        'detail_link': './td[last()]/a[contains(@onclick, "onshow")]/@onclick',
        # onshow('30a48514-d54e-4a38-bafd-0d05296f1a01')
        'cname': './td[2]/text()',
        'out_province': './td[4]/text()'
    }

    def start_requests(self):
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Referer': 'http://218.60.144.163/LNJGPublisher/corpinfo/CorpInfo.aspx',
            'Host': '218.60.144.163',  # Host must match the site actually being requested
        }
        for link in self.start_urls:
            yield scrapy.Request(link, headers=headers, callback=self.parse_list, meta={'cur_page_num': 1})

    def parse_list(self, response):
        item_contains = []

        node1 = response.xpath(self.inner_extract_dict['nodes'])
        node2 = response.xpath(self.outer_extract_dict['nodes'])
        try:
            for node in node1:
                inner_item = NameItem()
                inner_item['compass_name'] = self.handle_cname(
                    node.xpath(self.inner_extract_dict['cname']).extract_first())
                inner_item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(self.inner_extract_dict['detail_link']).extract_first())
                inner_item['out_province'] = 'liaolin'
                if not self.redis_tools.check_finger(inner_item['detail_link']):
                    item_contains.append(inner_item)
                else:
                    print('{}已经爬取过'.format(inner_item['detail_link']))

            for node in node2:
                outer_item = NameItem()
                outer_item['compass_name'] = self.handle_cname(
                    node.xpath(self.outer_extract_dict['cname']).extract_first())
                outer_item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(self.outer_extract_dict['detail_link']).extract_first())
                outer_item['out_province'] = self.handle_out_province(
                    node.xpath(self.outer_extract_dict['out_province']).extract_first())

                if not self.redis_tools.check_finger(outer_item['detail_link']):
                    item_contains.append(outer_item)
                else:
                    print(u'{}已经爬取过'.format(outer_item['detail_link']))
        except Exception as e:
            with open(self.log_file, 'a') as fp:
                fp.write(str(e))
        yield {'item_contains': item_contains}

        # pagination
        meta = response.meta
        cur_page_num = meta['cur_page_num']
        next_page_flag = response.xpath('//a[@id="Linkbutton3" and contains(@class, "aspNetDisabled")]').extract()
        if next_page_flag:
            print(u'不能继续翻页了,当前最大页码:{}'.format(cur_page_num))
            return
        print(u'翻页....')
        next_page = int(cur_page_num) + 1
        meta['cur_page_num'] = str(next_page)
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        yield scrapy.FormRequest(response.url, formdata=formdata, callback=self.parse_list, meta=meta, headers=headers)


    def handle_cdetail_link(self, clink):
        """
        处理进入公司详细页的链接
        :param clink: 字符串链接, 最原始
        :return: 直接能够使用的链接,(无论是post还是get)
        """

        if 'OpenCorpDetail' in clink:
            pp = re.compile(ur"OpenCorpDetail\('(.*?)','(.*?)','(.*)'\)")
            [rowGuid, CorpCode, CorpName] = re.search(pp, clink).groups()
            good_link = 'http://218.60.144.163/LNJGPublisher/corpinfo/CorpDetailInfo.aspx?rowGuid={}&CorpCode={}&CorpName={}&VType=1'.format(
                rowGuid, CorpCode, CorpName)
        else:
            pp = re.compile(ur"onshow\('(.*?)'")
            fid = re.search(pp, clink).group(1)
            good_link = 'http://218.60.144.163/LNJGPublisher/corpinfo/outCaseCorpDetailInfo.aspx?Fid=' + fid
        return good_link

    def get_form_data(self, resp):
        formdata = {
            '__VIEWSTATE': resp.xpath(self.inner_extract_dict['__VIEWSTATE']).extract_first(),
            '__EVENTVALIDATION': resp.xpath(self.inner_extract_dict['__EVENTVALIDATION']).extract_first(),
            'hidd_type': '1',
            'txtCorpName': '',
            'ddlZzlx': '',
            'txtFOrgCode': '',
            'txtCertNum': '',
            'newpage': resp.meta['cur_page_num'],
            'newpage1': '',
            '__EVENTTARGET': 'Linkbutton3',
            '__EVENTARGUMENT': '',
        }
        return formdata

    def handle_out_province(self, s):
        return s.strip('\r\n\t ')
Example #6
class ShanDongSpider(scrapy.Spider):
    name = 'shan_dong_spider'
    allowed_domains = ['www.sdjs.gov.cn', '221.214.94.41']
    start_urls = ['http://221.214.94.41:81/InformationReleasing/Ashx/InformationReleasing.ashx']

    redis_tools = RedisTools()

    pp = re.compile(r'\((.*)\)', re.S)
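    # self.pp captures the payload inside the JSONP wrapper the ashx endpoint returns, e.g. jQuery123({...})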

    def start_requests(self):
        url = self.start_urls[0] + '?' + self.get_query_string(1)
        yield scrapy.Request(url, callback=self.parse, headers=self.get_headers())

    def parse(self, response):
        url = response.url
        txt_str = response.text
        print(url)
        data = eval(re.search(self.pp, txt_str).group(1))
        detail_link = 'http://221.214.94.41:81/InformationReleasing/Ashx/InformationReleasing.ashx?callback=jQuery17108474795947085398&methodname=GetCorpQualificationCertInfo&CorpCode={}&CurrPageIndex=1&PageSize=5'
        for unit in data['data']['CorpInfoList']:
            url1 = 'http://www.sdjs.gov.cn/xyzj/DTFront/ZongHeSearch/Detail_Company.aspx?CorpCode={}&searchType=0'.format(unit['CorpCode'])
            headers = {
                'Referer': url1,
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            }
            detail_url = detail_link.format(unit['CorpCode'])

            compass_items = self.parse_compass_info(unit, url1)
            quality_items = self.get_qualification_info(unit, detail_url, headers)
            print(JianzhuprojectItem({
                'compass_items': compass_items,
                'qualification_items': quality_items,
                'project_items': None,
                'staff_items': None,
                'change_items': None,
                'behavior_items': None,
                'crawl_time': self.fmt_time(),
                'source_link': url,
                'compass_name': compass_items[0]['compass_name'],
                'honor_code': compass_items[0]['honor_code'],
                'other': None,
            }))
            break

    def parse_compass_info(self, unit, url):
        company_item = CompassItem({  # the Item class validates the keys
            'compass_name': unit['CorpName'],
            'compass_link': url,
            'honor_code': unit['CorpCode'],  # unified social credit code
            'representative': unit['LegalMan'],  # legal representative
            'compass_type': unit['EconomicNum'],  # company type
            'provice': ''.join(unit['AreaName'].split('·')[:1]),
            'operating_addr': unit['Address'],  # operating address
            'establish_time': 'None',
            'register_capital': unit['RegPrin'],
            'net_asset': None,
        })
        return [company_item]

    def get_qualification_info(self, unit, url, headers):
        response = requests.get(url, headers=headers)
        txt_str = response.text
        qua_data = eval(re.search(self.pp, txt_str).group(1))
        quality_name_list = unit['QualificationScope'].split(';')
        quality_code_list = unit['CertCode'].split(';')
        quality_type_list = unit['DanWeiType'].split(';')
        item_list = [QualityItem({'quality_type': qtype,
                                'quality_code': qcode,
                                'quality_name': qname.replace('(新)', '').replace('(新)', ''),
                                'quality_start_date': 'None',
                                'quality_end_date': 'None',
                                'quality_detail_link': None,
                                'authority': 'None',
                                }) for (qname, qcode, qtype) in zip(quality_name_list, quality_code_list, quality_type_list) if qcode.upper()[0] in ['A', 'B', 'C']]
        return item_list

    def fmt_time(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def get_headers(self):
        headers = {
            'Referer': 'http://www.sdjs.gov.cn/xyzj/DTFront/ZongHeSearch/Detail_Company.aspx?CorpCode=913716261671905552&searchType=0',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        return headers

    def get_query_string(self, cur_page):
        query_dict = {
            "callback": "jQuery17106983271474465658",
            "methodname": "GetCorpInfo",
            "CurrPageIndex": str(cur_page),
            "PageSize": "12",
        }
        return urllib.urlencode(query_dict)
Example #7
class GuangDongPart03Compass(BaseCompass):
    name = 'guangdong03_compass'
    allow_domain = ['218.13.12.85']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        ('http://218.13.12.85/cxpt/web/enterprise/getEnterpriseList.do',
         sit_list[0])
    ]
    refers = ['http://218.13.12.85/cxpt/website/enterpriseList.jsp']

    now_time = datetime.datetime.now().strftime('%Y-%m-%d')

    redis_tools = RedisTools()

    def start_requests(self):
        for link, sit in self.start_urls:
            headers = self.get_header(self.refers[0], flag='2')
            formdata = self.get_form_data(0)
            yield scrapy.FormRequest(link,
                                     headers=headers,
                                     formdata=formdata,
                                     callback=self.parse_list,
                                     meta={
                                         'pageIndex': '0',
                                         'sit': sit
                                     })

    def parse_list(self, response):
        json_resp = json.loads(response.text)
        item_contains = []
        for unit in json_resp['data']:
            cname, cid, _id, bid, province = unit['corpName'], unit[
                'corpCode'], unit['id'], unit['bid'], unit['areacode']
            detail_link = 'http://218.13.12.85/cxpt/website/enterpriseInfo.jsp?entID={}&eid={}&bid={}'.format(
                cid, _id, bid)
            out_province = self.handle_out_province(province)

            if self.redis_tools.check_finger(cname):
                print(u'{}已经爬取过'.format(cname))
                continue
            item = NameItem({
                'compass_name': cname,
                'detail_link': detail_link,
                'out_province': out_province
            })
            item_contains.append(item)
        yield {'item_contains': item_contains}
        if 'total_page_num' not in response.meta:
            response.meta['total_page_num'] = (int(json_resp['total']) +
                                               9) // 10
        if int(response.meta['pageIndex']) < int(
                response.meta['total_page_num']):
            yield self.turn_page(response)
        else:
            print('不能继续翻页了, 当前最大页码:{}'.format(response.meta['pageIndex']))
            return

    def turn_page(self, response):
        meta = response.meta

        link = response.url
        meta['pageIndex'] = str(int(meta['pageIndex']) + 1)
        formdata = self.get_form_data(meta['pageIndex'])
        headers = self.get_header(self.refers[0], flag='2')
        return scrapy.FormRequest(link,
                                  headers=headers,
                                  formdata=formdata,
                                  meta=meta,
                                  callback=self.parse_list)

    def get_form_data(self, next_page_num):
        formdata = {
            'mainZZ': '0',
            'aptText': '',
            'areaCode': '0',
            'entName': '',
            'pageSize': '10',
            'pageIndex': str(next_page_num),
        }
        return formdata

    def handle_out_province(self, s):
        if not s:
            return 'waisheng'
        return s.split('-')[0]
Example #8
class BaseCompass(scrapy.Spider):
    name = ''
    allow_domain = ['']
    start_urls = ['']
    extract_dict = None
    redis_tools = RedisTools()

    def start_requests(self):
        print('start_requests.....')
        for url, sit in self.start_urls:
            headers = self.get_header(url, flag='1')
            yield scrapy.Request(url=url,
                                 callback=self.parse_list,
                                 headers=headers,
                                 meta={
                                     'sit': sit,
                                     'pre_page_num': '0'
                                 })

    def parse_list(self, response):
        # print('parse_list....', response.text)
        item_contains = []
        url = response.url
        sit = response.meta['sit']
        try:
            if sit == sit_list[0]:
                inner_nodes = response.xpath(
                    self.extract_dict['inner']['nodes'])
                inner = self.extract_dict['inner']
                print("inner_nodes:", len(inner_nodes))
                for node in inner_nodes:
                    item = NameItem()
                    item['compass_name'] = self.handle_cname(
                        node.xpath(inner['cname']).extract_first(), 'inner')
                    item['detail_link'] = self.handle_cdetail_link(
                        node.xpath(inner['detail_link']).extract_first(),
                        'inner', url)
                    if self.redis_tools.check_finger(item['detail_link']):
                        print('{}已经爬取过'.format(item['detail_link']))
                        continue
                    item['out_province'] = inner[
                        'out_province'][1] if isinstance(
                            inner['out_province'], list) else 'None'
                    item_contains.append(item)

            if sit == sit_list[1]:
                print(u'解析外省....')
                outer_nodes = response.xpath(
                    self.extract_dict['outer']['nodes'])
                outer = self.extract_dict['outer']
                print("outer_nodes:", len(outer_nodes))
                for node in outer_nodes:
                    item = NameItem()
                    print(node.xpath(outer['cname']).extract_first())
                    item['compass_name'] = self.handle_cname(
                        node.xpath(outer['cname']).extract_first(), 'outer')
                    item['detail_link'] = self.handle_cdetail_link(
                        node.xpath(outer['detail_link']).extract_first(),
                        'outer', url)
                    if self.redis_tools.check_finger(item['detail_link']):
                        print('{}已经爬取过'.format(item['detail_link']))
                        continue
                    if isinstance(outer['out_province'],
                                  list) and len(outer['out_province']) > 1:
                        item['out_province'] = outer['out_province'][1]
                    else:
                        item['out_province'] = self.handle_out_province(
                            node.xpath(outer['out_province']).extract_first())
                    item_contains.append(item)
        except Exception as e:
            print(response.text)
            with open(self.log_file, 'a') as fp:
                fp.write(str(e))
            exit(0)
        yield {'item_contains': item_contains}

        yield self.turn_page(response)

    def turn_page(self, response):
        print('必须重写turn_page方法')
        pass

    def handle_out_province(self, s):
        return s.strip('\r\n\t ')

    def handle_cname(self, cname, flag='inner'):
        """
        处理公司名称
        :param cname: 字符串公司名
        :return: 干净的名字
        """
        return cname.strip('\r\n\t ')

    def handle_cdetail_link(self, clink, flag='inner', url=''):
        """
        处理进入公司详细页的链接
        :param clink: 字符串链接, 最原始
        :return: 直接能够使用的链接,(无论是post还是get)
        """
        if clink.startswith('http'):
            good_link = clink
        else:
            domain_str = self.get_domain_info(url)  # may need overriding per site; domain_str never ends with '/'
            if clink.startswith('..'):
                good_link = clink.replace('..', domain_str, 1)
            elif clink.startswith('.'):
                good_link = clink.replace('.', domain_str, 1)
            elif clink.startswith('/'):
                good_link = domain_str + clink
            else:
                print('请重写该方法handle_cdetail_link')
                good_link = ''
        return good_link

    def handles_province(self, cprovice):
        """
        处理省份信息
        :param cprovice:
        :return: 只有省信息
        """
        return cprovice.strip('\r\n\t ')

    def get_domain_info(self, link):
        # may need overriding depending on how the links start
        # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
        import platform
        v = platform.python_version()
        if v.startswith('2'):
            import urlparse
            res = urlparse.urlparse(link)
        else:
            from urllib import parse
            res = parse.urlparse(link)
        return res.scheme + '://' + res.netloc
        # return 'jzjg.gzjs.gov.cn:8088'

    def get_header(self, url, flag='1'):
        domain_str = self.get_domain_info(url)
        header = {
            'Host':
            domain_str.split('//')[-1],
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        if flag not in (1, '1'):
            header['Origin'], header['Referer'] = domain_str, url
        return header

    def run(self):
        cmdline.execute(['scrapy', 'crawl', self.name])
Example #9
class BeiJingCompass(BaseCompass):
    name = 'beijing_compass'
    allow_domain = ['xpt.bcactc.com']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        ("http://xpt.bcactc.com/G2/basic/gfm/info!performancePublicList.do?data&filter_params_=enterpriseName",
         sit_list[1]),
        ("http://xpt.bcactc.com/G2/basic/gfm/info!entOrganizationList.do?data&filter_params_=enterpriseName",
         sit_list[1]),
        ("http://xpt.bcactc.com/G2/basic/gfm/info!entPersonInfoList.do?data&filter_params_=enterpriseName",
         sit_list[0]),
        ("http://xpt.bcactc.com/G2/basic/gfm/info!entPerformanceList.do?data&filter_params_=enterpriseName",
         sit_list[1]),
    ]

    redis_tools = RedisTools()

    def start_requests(self):
        for url, sit in self.start_urls:
            headers = self.get_header(url, flag='1')
            yield scrapy.Request(url=url,
                                 callback=self.parse_list,
                                 headers=headers,
                                 meta={
                                     'sit': sit,
                                     'cur_page_num': '1'
                                 })

    def parse_list(self, response):

        meta = response.meta
        sit = meta['sit']
        out_province = 'beijing' if sit_list[0] == sit else 'waisheng'

        json_data = json.loads(response.body_as_unicode())['data']
        item_contains = []
        for unit in json_data:
            item = NameItem({
                'compass_name': unit['enterpriseName'],
                'detail_link': 'None',
                'out_province': out_province
            })
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        meta = response.meta
        if 'total_page' not in meta:
            _ = json.loads(response.body_as_unicode())
            meta['total_page'], meta['cur_page_num'] = _['total'], _['page']

        print('当前页:{}, 总页码:{}'.format(meta['cur_page_num'],
                                      meta['total_page']))
        if int(meta['cur_page_num']) >= int(meta['total_page']):
            print('不能翻页了,当前最大页码:{}'.format(meta['cur_page_num']))
            return
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        meta['cur_page_num'] = str(int(meta['cur_page_num']) + 1)
        return scrapy.FormRequest(response.url,
                                  headers=headers,
                                  formdata=formdata,
                                  callback=self.parse_list,
                                  meta=meta)

    def get_form_data(self, response):
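        # grid paging parameters posted back to the endpoint; 'nd' looks like a cache-busting
        # millisecond timestamp, PAGE/PAGESIZE select which slice of records to return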
        form_data = {
            'gridSearch': 'false',
            'nd': str(int(time.time() * 1000)),
            'PAGESIZE': '15',
            'PAGE': str(response.meta['cur_page_num']),
            'sortField': '',
            'sortDirection': 'asc',
        }
        return form_data

    def get_header(self, url, flag='1'):
        headers = {
            "Host":
            "xpt.bcactc.com",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        if flag not in (1, '1'):
            headers["Referer"], headers["Origin"] = url, self.get_domain_info(
                url)  # 二次进入才有
        return headers

    def handle_cdetail_link(self, clink, flag='inner', url=''):
        if clink.startswith('http'):
            good_link = clink
        else:
            good_link = "" + clink
        return good_link
Example #10
class GuangDongPart01Compass(BaseCompass):
    name = 'guangdong01_compass'
    allow_domain = [
        '219.129.189.10:8080', 'www.jyjzcx.com', 'www.zsjs.gov.cn',
        'mmzjcx.maoming.gov.cn'
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        # ("http://219.129.189.10:8080/yjcxk/web-nav/enterprises?pageNumber=1", sit_list[0]),
        # ("http://219.129.189.10:8080/yjcxk/web-nav/enterprises?pageNumber=0", sit_list[1]),
        # ("http://219.129.189.10:8080/yjcxk/web-nav/persons?pageNumber=1&pageSize=17550", sit_list[0])
        # ('http://www.jyjzcx.com/web/companylist.action?pageNum=1&pageSize=15', sit_list[0])
        # ('http://www.zsjs.gov.cn/web/enterprise/findEnterprises?page=1&start=45', sit_list[0]),
        # ('https://gcjs.sg.gov.cn/website/buildproject/buildProjectSjAction!proMainList.action?pager.offset=20',
        #  sit_list[0]),
        ('http://mmzjcx.maoming.gov.cn/PublicPage/CorpMoreList.aspx?clearPaging=true&strNav=4',
         sit_list[0])
    ]
    ctypes = [3, 2, 1, 4, 6, 5, 7, 8, 9, 10, 11, 12]
    tot = [2, 3, 2, 10, 3, 0, 2, 5, 1, 1, 5, 2]

    extract_dict = {
        'inner': {
            'nodes': '//table[contains(@id, "GridView1")]//tr[position()>1]',
            'cname': './td/a/text()',
            'detail_link': './td/a/@onclick',  #
            'next_page': '//input[contains(@id, "btnNext") and @disabled]'  #
        },
        '__VIEWSTATE': '//input[@id="__VIEWSTATE"]/@value',
        '__EVENTVALIDATION': '//input[@id="__EVENTVALIDATION"]/@value',
        '__VIEWSTATEENCRYPTED': '//input[@id="__VIEWSTATEENCRYPTED"]/@value',
    }

    redis_tools = RedisTools()

    def start_requests(self):
        link = self.start_urls[0][0]
        for ctype in self.ctypes[:1]:
            yield scrapy.Request(link,
                                 callback=self.parse_list1,
                                 meta={
                                     'cur_page': '1',
                                     'ctype': ctype
                                 },
                                 dont_filter=True)

    def parse_list1(self, response):
        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'waisheng'
            if self.redis_tools.check_finger(item['detail_link']):
                print(u'{}已经爬取过'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        meta = response.meta
        # the "next" button carries a disabled attribute on the last page
        if response.xpath(self.extract_dict['inner']['next_page']):
            print(u'不能再翻页了')
            return
        headers = self.get_header(response.url, flag='2')
        form_data = self.get_form_data(response)
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        print(u'下一页:', meta['cur_page'])
        return scrapy.FormRequest(response.url,
                                  formdata=form_data,
                                  callback=self.parse_list1,
                                  headers=headers,
                                  meta=meta)

    def parse_list2(self, response):
        json_data = json.loads(response.body_as_unicode())

        item_contains = []
        for row in json_data['rows']:
            item = NameItem()
            item['compass_name'] = row['cxaa05']
            item['detail_link'] = row['link']
            item['out_province'] = 'waisheng'
            item_contains.append(item)
        yield {'item_contains': item_contains}
        meta = response.meta
        total_page = (json_data['total'] + 14) // 15
        cur_page = meta['cur_page']
        if int(cur_page) >= int(total_page):
            print(u'不能继续翻页了,当前最大页码为:', cur_page)
            return
        yield self.turn_page1(response)

    def turn_page1(self, resp):
        meta = resp.meta
        meta['cur_page'], start_row = int(
            meta['cur_page']) + 1, int(meta['cur_page']) * 15
        link = 'http://www.zsjs.gov.cn/web/enterprise/findEnterprises?page={}&start={}'.format(
            meta['cur_page'], start_row)
        headers = self.get_header(resp.url, flag='2')
        return scrapy.Request(link,
                              callback=self.parse_list2,
                              meta=meta,
                              headers=headers)

        # def parse_list(self, response):
        #     data = json.loads(response.body_as_unicode())['data']['rows']
        #     item_contains = []
        #     for unit in data:
        #         if 'persons' in response.url:
        #             compass_name = unit['entName']
        #             detail_link = 'None'
        #             out_province = 'waisheng'
        #         else:
        #             compass_name = unit['companyName']
        #             detail_link = 'http://219.129.189.10:8080/yjcxk/vueStatic/html/companyDetail.jsp?id=' + unit['id']
        #             out_province = 'guangdong'
        #         if detail_link in ('', 'None'):
        #             if self.redis_tools.check_finger(compass_name):
        #                 continue
        #         else:
        #             if self.redis_tools.check_finger(detail_link):
        #                 continue
        #         item = NameItem({
        #             'compass_name': compass_name,
        #             'detail_link': detail_link,
        #             'out_province': out_province
        #         })
        #         if '测试企业' in item['compass_name']:
        #             continue
        #         item_contains.append(item)
        #     yield {'item_contains': item_contains}
        #
        #     # def turn_page(self, response):
        #     #     next_page_num = response['']
        #     #     "http://219.129.189.10:8080/yjcxk/web-nav/persons?pageNumber={}&pageSize=5000".format(next_page_num)
        #     #     return

    def handle_cname(self, cname):
        return cname.replace('企业基本信息', '').strip('\n\t\r ')

    def handle_cdetail_link(self, link):
        if 'javascript:window' in link:
            import re
            pp = re.compile(r"\('(.*?)'\)")
            return 'http://mmzjcx.maoming.gov.cn/PublicPage/' + re.search(
                pp, link).group(1)
        return link  # fall back to the raw value instead of returning None

    def get_form_data(self, resp):
        meta = resp.meta
        formdata = {
            'ctl00$cph_context$ScriptManager1':
            'ctl00$cph_context$UpdatePanel1|ctl00$cph_context$GridViewPaging1$btnNext',
            '__EVENTTARGET':
            '',
            '__EVENTARGUMENT':
            '',
            '__LASTFOCUS':
            '',
            '__VIEWSTATE':
            resp.xpath(self.extract_dict['__VIEWSTATE']).extract_first(),
            '__EVENTVALIDATION':
            resp.xpath(self.extract_dict['__EVENTVALIDATION']).extract_first(),
            '__VIEWSTATEENCRYPTED':
            resp.xpath(
                self.extract_dict['__VIEWSTATEENCRYPTED']).extract_first(),
            'ctl00$cph_context$ddlCorpType':
            str(meta['ctype']),
            'ctl00$cph_context$ddlCorpSincerityGrade':
            '',
            'ctl00$cph_context$txtCorpName':
            u'请输入相关的企业名称',
            'ctl00$cph_context$GridViewPaging1$txtGridViewPagingForwardTo':
            str(meta['cur_page']),
            'ctl00$cph_context$GridViewPaging1$btnNext.x':
            '12',
            'ctl00$cph_context$GridViewPaging1$btnNext.y':
            '5',
        }

        return formdata
Example #11
class JinLinCompass(BaseCompass):
    name = 'jilin_compass'
    allow_domain = ['cx.jljsw.gov.cn']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    start_urls = [
        ('http://cx.jljsw.gov.cn/handle/NewHandler.ashx?method=SnCorpData&nPageIndex=1&nPageSize=20',
         sit_list[0]),
        # ('http://cx.jljsw.gov.cn/handle/NewHandler.ashx?method=SwCorpData&nPageIndex=1&nPageSize=20', sit_list[1])
    ]
    redis_tools = RedisTools()
    extract_dict = {
        'nodes':
        '//tr',
        'cname':
        './td[@title and contains(@class, "company_name")]/@title',
        'detail_link':
        './td[@title and contains(@class, "company_name")]/a/@href',
        'out_province':
        u'./td[@title and contains(@class, "company_name")]/following-sibling::td[1]/text()'
    }

    def start_requests(self):
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Referer': 'http://cx.jljsw.gov.cn/corpinfo/CorpInfo.aspx',
            'Host': 'cx.jljsw.gov.cn',
        }
        for link, sit in self.start_urls:
            yield scrapy.Request(link,
                                 headers=headers,
                                 callback=self.parse_list,
                                 meta={
                                     'sit': sit,
                                     'base_link': ''
                                 })

    def parse_list(self, response):
        sit = response.meta['sit']
        json_data = json.loads(response.text)
        html = etree.HTML(json_data['tb'])

        nodes = html.xpath(self.extract_dict['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(self.extract_dict['cname'])[0])
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(self.extract_dict['detail_link'])[0])
            item['out_province'] = 'jilin' if sit == sit_list[
                0] else node.xpath(self.extract_dict['out_province'])[0]
            if not self.redis_tools.check_finger(item['detail_link']):
                item_contains.append(item)
            else:
                print('{}已经爬取过'.format(item['detail_link']))

        yield {'item_contains': item_contains}

        # pagination
        total_page = int(json_data['nPageCount'])
        cur_page = int(json_data['nPageIndex'])

        if int(total_page) > int(cur_page):
            print('翻页....')
            next_page = cur_page + 1
            mpara = 'SnCorpData' if sit == sit_list[0] else 'SwCorpData'
            next_link = 'http://cx.jljsw.gov.cn/handle/NewHandler.ashx?method={}&nPageIndex={}&nPageSize=20'.format(
                mpara, next_page)
            response.meta['cur_page'] = next_page
            yield scrapy.Request(next_link,
                                 callback=self.parse_list,
                                 meta=response.meta)
        else:
            print('不能继续翻页了,当前页码:', cur_page)

    def handle_cname(self, cname):
        """
        处理公司名称
        :param cname: 字符串公司名
        :return: 干净的名字
        """
        return cname

    def handle_cdetail_link(self, clink):
        """
        处理进入公司详细页的链接
        :param clink: 字符串链接, 最原始
        :return: 直接能够使用的链接,(无论是post还是get)
        """
        if clink.startswith('http'):
            good_link = clink
        else:
            domain_str = 'http://cx.jljsw.gov.cn'  # site root; never ends with '/'
            if clink.startswith('..'):
                good_link = clink.replace('..', domain_str, 1)
            elif clink.startswith('.'):
                good_link = clink.replace('.', domain_str, 1)
            elif clink.startswith('/'):
                good_link = domain_str + clink
            else:
                print('请重写该方法')
                good_link = ''
        return good_link

    def handles_province(self, cprovice):
        """
        处理省份信息
        :param cprovice:
        :return: 只有省信息
        """
        pass
Example #12
class JzscQualitySpider(scrapy.Spider):
    name = 'jzsc_quality'
    allowed_domains = ['jzsc.mohurd.gov.cn']
    start_urls = ['http://jzsc.mohurd.gov.cn/dataservice/query/comp/list']
    default_headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    }
    page_num = 1
    mongo_tools = MongoTools()
    redis_tools = RedisTools()

    def start_requests(self):
        skip = 0
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Host": "jzsc.mohurd.gov.cn",
        }

        result = self.mongo_tools.get_documents(batch_size=50, skip_num=skip)
        redis_collections = ['compass', 'quality', 'staff', 'project']
        for i, data in enumerate(result):
            quality_link, staff_link, project_link, compass_link = data[
                'quality_link'], data['staff_link'], data[
                    'project_link'], data['entry_link']
            for url in [quality_link]:
                if self.redis_tools.check_finger(finger=url,
                                                 name=redis_collections[1]):
                    yield scrapy.FormRequest(url,
                                             callback=self.parse,
                                             headers=headers,
                                             meta={
                                                 'compass_link': compass_link,
                                                 'quality_info_list': []
                                             })
                else:
                    print('%s 已经抓取过了' % url)

    def parse(self, response):
        """
        实质是翻页控制
        :param response: scrapy 响应对象
        :return: 该公司所有的资质信息 [{}, {}, {}.....]
        """
        meta, url = response.meta, response.url
        print url
        headers = {
            'Referer':
            url,
            'Origin':
            "http://jzsc.mohurd.gov.cn",
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }
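        # the page embeds its paging state as __pgfm({...}) with $total/$pg/$pgsz fields;
        # extract that dict to learn how many pages of qualification records to request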
        if '__pgfm' in response.text:
            page_pattern = re.compile(r'__pgfm\(.*?({.*?})\)')
            res = re.search(page_pattern, response.text).group(1)
            json_page_data = eval(res)
            total, cur_page, page_size = json_page_data[
                '$total'], json_page_data['$pg'], json_page_data['$pgsz']
        else:
            total = 1
        compass_link = meta['compass_link']
        items_list = []
        for pnum in range(int(total)):  # one request per page
            form_data = {
                "$total": total,
                "$reload": "0",
                "$pg": str(int(pnum) + 1),
                "$pgsz": "25",
            }
            response = requests.post(url=url, headers=headers, data=form_data)
            quality_info_list = self.parse_page(response, compass_link)
            items_list.extend(quality_info_list)
        # print '打印len(items_list)', len(items_list)
        return {'items_list': items_list}

    def parse_page(self, response, compass_link):
        """
        解析资质信息,以页面为单位
        :param response:
        :param compass_link: 公司连接
        :return: [item1, item2]
        """
        if 'caDetailList' in response.url:
            return self.parse_quality(response, compass_link)
        elif 'regStaffList' in response.url:
            pass
            # return self.parse_staff(response, compass_link)
        else:
            # 'compPerformanceListSys': construction project info
            # return self.parse_project(response, compass_link)
            pass

    def parse_quality(self, response, compass_link):
        print('解析资质资格信息.....')
        html = etree.HTML(response.text)
        line_nodes = html.xpath('//tbody/tr')
        quality_info_list = []
        print(len(line_nodes))
        for i, node in enumerate(line_nodes):
            quality = QualityItem()
            quality['quality_type'] = ''.join(
                node.xpath(u'./td[@data-header="资质类别"]/text()'))
            quality['quality_code'] = ''.join(
                node.xpath(u'./td[@data-header="资质证书号"]/text()'))
            quality['quality_name'] = ''.join(
                node.xpath(u'./td[@data-header="资质名称"]/text()')).strip()
            quality['quality_date'] = ''.join(
                node.xpath(u'./td[@data-header="发证日期"]/text()'))
            quality['validity_date'] = ''.join(
                node.xpath(u'./td[@data-header="证书有效期"]/text()'))
            quality['authority'] = ''.join(
                node.xpath(u'./td[@data-header="发证机关"]/text()'))
            quality['compass_link'] = compass_link
            quality['quality_link'] = response.url
            quality['crawl_time'] = self.fmt_time()
            quality_info_list.append(quality)
        return quality_info_list

    def parse_staff(self, response, compass_link):
        print('解析注冊員工信息....')
        html = etree.HTML(response.content)
        staff_nodes = html.xpath('//tbody/tr')
        staff_info_list = []
        for i, node in enumerate(staff_nodes[:-1]):
            staff = StaffItem()
            staff['name'] = node.xpath(u'.//a[@onclick]/text()')[0]
            staff['id_card'] = node.xpath(
                u'./td[@data-header="身份证号"]/text()')[0]
            staff['title'] = node.xpath(u'./td[@data-header="注册类别"]/text()')[0]
            staff['title_code'] = node.xpath(
                u'./td[contains(@data-header, "注册号")]/text()')[0]
            staff['profession'] = ''.join(
                node.xpath(
                    u'./td[contains(@data-header, "注册专业")]/text()')) or 'None'
            staff['html_link'] = response.url
            staff['person_link'] = node.xpath('.//a[@onclick]/@onclick')[0]
            staff['compass_link'] = compass_link
            staff['crawl_time'] = self.fmt_time()
            staff_info_list.append(staff)
        print('打印员工信息..', staff_info_list)
        return staff_info_list

    def parse_project(self, response, compass_link):
        print('解析工程項目信息.....')
        html = etree.HTML(response.text)
        project_info_list = []
        line_nodes = html.xpath('//tbody/tr[position()<26]')
        for i, node in enumerate(line_nodes):
            project = ProjectItem()
            project['proj_code'] = node.xpath(
                u'./td[@data-header="项目编码"]/text()')[0]
            project['proj_name'] = node.xpath(
                u'./td[@data-header="项目名称"]//text()')[0]
            project['proj_site'] = ''.join(
                node.xpath(
                    u'./td[@data-header="项目属地"]/text()')).strip() or 'None'
            project['proj_type'] = ''.join(
                node.xpath(u'./td[@data-header="项目类别"]/text()')) or 'None'
            project['employer'] = ''.join(
                node.xpath(u'./td[@data-header="建设单位"]/text()')) or 'None'
            project['proj_link'] = node.xpath('.//a[@onclick]/@onclick')[0]
            project['compass_link'] = compass_link
            project['crawl_time'] = self.fmt_time()
            project_info_list.append(project)
        print 'project info:', project_info_list
        return project_info_list

    def fmt_time(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
Ejemplo n.º 13
0
class GuangDongPart02Compass(BaseCompass):
    name = 'guangdong02_compass'
    allow_domain = ['www.stjs.org.cn', 'zjj.jiangmen.gov.cn']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        # ('http://zjj.jiangmen.gov.cn/public/licensing/index_1.html', sit_list[0]),
        ('http://www.stjs.org.cn/xxgk/xxgk_cxgs.aspx?page=1', sit_list[0])
    ]

    extract_dict = {
        'inner': {
            'nodes': '//div[@class="a_table"]//table//tr[position()>1]',
            'cname': './td/a/text()',
            'detail_link': './td/a/@href',  # 'http://www.stjs.org.cn/xxgk/' + link
            'next_page': '//a[contains(text(), "Next") and not(@disabled)]/@href',  # xxgk_cxgs.aspx?page=4
        },
        # hidden ASP.NET form fields referenced by get_form_data() below
        '__VIEWSTATE': '//input[@name="__VIEWSTATE"]/@value',
        '__EVENTVALIDATION': '//input[@name="__EVENTVALIDATION"]/@value',
    }

    redis_tools = RedisTools()

    def start_requests(self):
        for link, _ in self.start_urls:
            yield scrapy.Request(link,
                                 callback=self.parse_list,
                                 meta={'cur_page': '1'},
                                 dont_filter=True)

    def parse_list(self, response):
        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'guangdong'
            if self.redis_tools.check_finger(item['detail_link']):
                print(u'{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        meta = response.meta
        next_page_link = response.xpath(
            self.extract_dict['inner']['next_page']).extract_first()
        if next_page_link is None:
            print(u'No more pages to turn')
            return
        headers = self.get_header(response.url, flag='2')
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        link = 'http://www.stjs.org.cn/xxgk/{}'.format(next_page_link)
        return scrapy.Request(link,
                              callback=self.parse_list,
                              headers=headers,
                              meta=meta)

    def handle_cname(self, cname, flag='inner'):
        return cname.replace('企业基本信息', '').strip('\n\t\r ')

    def handle_cdetail_link(self, link, flag='inner', url=''):
        if 'javascript:window' in link:
            import re
            pp = re.compile(r"\('(.*?)'\)")
            return 'http://218.14.207.72:8082/PublicPage/' + re.search(
                pp, link).group(1)
        if link.startswith('.'):
            # replace only the leading '.' so dots later in the path survive
            return link.replace('.',
                                'http://zjj.jiangmen.gov.cn/public/licensing',
                                1)
        else:
            return 'http://www.stjs.org.cn/xxgk/' + link
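
    # Illustration of handle_cdetail_link (example inputs below are assumptions,
    # not values taken from the target sites):
    #   "javascript:window.open('ShowDetail.aspx?id=1')" -> 'http://218.14.207.72:8082/PublicPage/ShowDetail.aspx?id=1'
    #   './index_2.html' -> 'http://zjj.jiangmen.gov.cn/public/licensing/index_2.html'
    #   'xxgk_cxgs_detail.aspx?id=1' -> 'http://www.stjs.org.cn/xxgk/xxgk_cxgs_detail.aspx?id=1'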

    def get_form_data(self, resp):
        meta = resp.meta
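        # Emulates clicking the GridViewPaging1 "next" image button in the
        # ASP.NET form: the btnNext.x/.y keys are the click coordinates an
        # <input type="image"> submits, and __VIEWSTATE / __EVENTVALIDATION
        # are echoed back from the current page so the server accepts the post.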
        formdata = {
            'ctl00$cph_context$ScriptManager1':
                'ctl00$cph_context$UpdatePanel1|ctl00$cph_context$GridViewPaging1$btnNext',
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE': resp.xpath(self.extract_dict['__VIEWSTATE']).extract_first(),
            '__VIEWSTATEGENERATOR': '8D94C66F',
            '__VIEWSTATEENCRYPTED': '',
            '__EVENTVALIDATION': resp.xpath(
                self.extract_dict['__EVENTVALIDATION']).extract_first(),
            'ctl00$cph_context$corType': str(meta['ctype']),
            'ctl00$cph_context$corGrade': '全部',
            'ctl00$cph_context$corName': u'请输入相关的企业名称',
            'ctl00$cph_context$GridViewPaging1$txtGridViewPagingForwardTo': str(meta['cur_page']),
            'ctl00$cph_context$GridViewPaging1$btnNext.x': '12',
            'ctl00$cph_context$GridViewPaging1$btnNext.y': '5',
        }

        return formdata
Ejemplo n.º 14
0
class ChongQingCompass(BaseCompass):
    name = 'chongqing_compass'
    allow_domain = ['jzzb.cqjsxx.com']
    custom_settings = {
        'ITEM_PIPELINES': {'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300}
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/YhzSgqy/YhzSgqy_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Sgqy/Sgqy_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Zljcjg/Zljcjg_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Zjzxjg/Zjzxjg_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Hntqy/Hntqy_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Ryxxbs/Rybabs_List.aspx', sit_list[1], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Zbdljg/Zbdljg_List.aspx', sit_list[1], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Zjzxjg/Wd_Zjzxjg_List.aspx', sit_list[1], 'rule1'),

        # ================== rule2
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Jlqy/Jlqy_List.aspx', sit_list[0], 'rule2'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Jlqy/WdJlqy_List.aspx', sit_list[1], 'rule2'),
    ]

    redis_tools = RedisTools()

    extract_dict = {
        'rule1': {  # acsOutNetQueryPageList  qualificationCertificateListForPublic
            'nodes': '//table[@id="DataGrid1" or @rules="all"]//tr[position()>1]',
            'cname': u'.//a[contains(@href, "doPostBack") and not(contains(string(), "查看"))]//text()',
            'detail_link': '',  # deliberately left empty
            # 'out_province': ['chongqing', 'waidi'],
        },
        'rule2': {
            'nodes': '//table[@id="DataGrid1"]/tbody/tr[position()>1]',
            'cname': './td[2]//text()',
            'detail_link': '',  # deliberately left empty
            # 'out_province': ['chongqing', 'waidi']
        },
        'total_page': '//span[@id="TurnPage1_pagecount" or @id="Pager1_Pages"]//text()',
        '__VIEWSTATE': '//input[@name="__VIEWSTATE"]/@value',
        '__VIEWSTATEGENERATOR': '//input[@name="__VIEWSTATEGENERATOR"]/@value',
        '__EVENTTARGET': '//input[@name="__EVENTTARGET"]/@value',

    }

    def start_requests(self):
        for url, sit, rule in self.start_urls:
            headers = self.get_header(url, flag='1')
            yield scrapy.Request(url=url, callback=self.parse_list, headers=headers,
                                 meta={'sit': sit, 'cur_page_num': '1', 'rule': rule})

    def parse_list(self, response):

        meta = response.meta
        rule, sit = meta['rule'], meta['sit']
        out_province = 'chongqing' if sit_list[0] == sit else 'waisheng'
        ext_rule = self.extract_dict[rule]
        nodes = response.xpath(ext_rule['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(node.xpath(ext_rule['cname']).extract_first())
            item['detail_link'] = 'None'
            item['out_province'] = out_province
            if self.redis_tools.check_finger(item['compass_name']):
                print(u'{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        meta = response.meta
        if 'total_page' not in meta:
            meta['total_page'] = response.xpath(
                self.extract_dict['total_page']).extract_first()

        cur_page_num = meta['cur_page_num']
        if int(cur_page_num) >= int(meta['total_page']):
            print('No more pages; current max page: {}'.format(cur_page_num))
            return
        print('Current page: {}, total pages: {}'.format(cur_page_num, meta['total_page']))
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        meta['cur_page_num'] = int(meta['cur_page_num']) + 1
        return scrapy.FormRequest(response.url, headers=headers, formdata=formdata, callback=self.parse_list, meta=meta)

    def get_form_data(self, response):
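        # Emulates the ASP.NET __doPostBack() call behind the "next page"
        # LinkButton: __EVENTTARGET names the control being "clicked" and the
        # page's __VIEWSTATE / __VIEWSTATEGENERATOR values are echoed back.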
        form_data = {
            'TurnPage1:PageNum': '',
            'FName': '',
            '__EVENTARGUMENT': '',
            '__EVENTTARGET': 'TurnPage1:LB_Next',
            '__VIEWSTATE': ''.join(response.xpath(self.extract_dict['__VIEWSTATE']).extract()),
            '__VIEWSTATEGENERATOR': ''.join(response.xpath(self.extract_dict['__VIEWSTATEGENERATOR']).extract()),
        }
        return form_data

    def get_header(self, url, flag='1'):
        headers = {
            "Host": "jzzb.cqjsxx.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",

        }
        if flag not in (1, '1'):
            headers["Referer"], headers["Origin"] = url, self.get_domain_info(url)  # 二次进入才有
        return headers
Ejemplo n.º 15
0
class ParentSpider(scrapy.Spider):
    name = ''

    allowed_domains = []
    start_urls = []

    extract_dict = None  # master dict of field-extraction rules
    cnt = 1
    redis_tools = RedisTools()

    def __init__(self):
        super(ParentSpider, self).__init__()
        assert all([
            self.name, self.allowed_domains, self.start_urls, self.extract_dict
        ]), 'One of the 4 key class attributes is not set; please check the subclass'

    def start_requests(self):
        print('start_requests.....')
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parse_list(self, resp_list_page):
        print('parse_list......')
        url = resp_list_page.url
        meta = resp_list_page.meta
        line_nodes = resp_list_page.xpath(
            self.extract_dict['list_page']['lines_rule'])
        print('\t\tline_nodes:', len(line_nodes))
        for node in line_nodes:
            position = Postion[1]  # DetailPage
            link = node.xpath(self.extract_dict['list_page']
                              ['detail_link_rule']).extract_first()
            good_link = self.handle_detail_link(url, link)
            is_crawled = self.redis_tools.check_finger(good_link)
            if is_crawled:
                print('%s has already been crawled' % good_link)
                continue
            detail_headers = self.get_headers(url, good_link, position)
            listpage_data = self.get_info_listPage(resp_list_page)
            meta['listpage_data'] = listpage_data
            if self.extract_dict['detail_page']['method'].upper() == 'GET':
                print('\t\tsending detail-page request: GET')
                yield scrapy.Request(url=good_link,
                                     callback=self.parse_detail,
                                     headers=detail_headers,
                                     meta=meta)
            else:
                # Note: for POST requests whose URL query parameters change dynamically, override handle_detail_link() as needed
                formdata = self.get_form_data(resp_list_page, position)
                yield scrapy.FormRequest(url=good_link,
                                         callback=self.parse_detail,
                                         headers=detail_headers,
                                         formdata=formdata,
                                         meta=meta)
            print('Crawling related info of company #%d' % self.cnt)
            self.cnt += 1
            # break

        if self.judge_next_page(resp_list_page):
            print('Turning page....')
            yield self.parse_turn_page(
                resp_list_page,
                method=self.extract_dict['list_page']['method'])
        else:
            print('Paging finished; current page: {}'.format(
                resp_list_page.meta.get('cur_page', '1')))

    def parse_detail(self, resp_detail):
        print('parse_detail......')
        url = resp_detail.url
        compass_items = self.extract_compass_info(
            resp_detail, self.extract_dict['detail_page']['compass'])  # [item]
        print('len(compass_items): ', len(compass_items))

        quality_items = self.extract_qualification_info(
            resp_detail, self.extract_dict['detail_page']
            ['qualification'])  # [item, item...]
        print('len(quality_items): ', len(quality_items))

        project_link = self.get_project_link(resp_detail, compass_items)
        print('len(project_link): ', len([project_link]))
        staff_link = self.get_staff_link(resp_detail, compass_items)
        print('len(staff_link): ', len([staff_link]))
        behavior_link = self.get_behavior_link(resp_detail,
                                               compass_items)  # good / bad conduct records
        print('len(behavior_link): ', len([behavior_link]))
        change_link = self.get_change_link(resp_detail, compass_items)
        print('len(change_link): ', len([change_link]))

        same_seq = self.get_same_seq(
            [project_link, staff_link, behavior_link, change_link], url)
        yield JianzhuprojectItem({
            'compass_items': compass_items,
            'qualification_items': quality_items,
            'project_items': None,
            'staff_items': None,
            'change_items': None,
            'behavior_items': None,
            'crawl_time': self.fmt_time(),
            'compass_name': compass_items[0]['compass_name'],
            'honor_code': compass_items[0]['honor_code'],
            'source_link': url,
            'project_link': project_link,
            'staff_link': staff_link,
            'behavior_link': behavior_link,
            'change_link': change_link,
            'same_seq': same_seq,
        })

    def judge_next_page(self, resp):
        cur_page_num = resp.meta.get('cur_page', '1')  # 'cur_page' is what parse_turn_page stores
        is_have = resp.xpath(
            self.extract_dict['list_page']['have_next_page_rule'])
        total_page_num = resp.xpath(
            self.extract_dict['list_page']['total_page_num_rule']).extract_first()
        return bool(is_have) and total_page_num is not None and int(
            cur_page_num) < int(total_page_num)

    def parse_turn_page(self, resp_list, method):
        # build the next-page request parameters
        print('parse_turn_page')
        response = resp_list
        url = response.url
        cur_page = int(response.meta.get('cur_page', 1))
        query_data = self.get_query_data(response, cur_page)  # query-string dict
        form_data = self.get_form_data(response, Postion[0])  # form-data dict
        headers = self.get_headers(url, url, position=Postion[0])
        print('Requesting the next page (current page: {}).......'.format(cur_page))
        url = self.handle_url(url, query_data)
        response.meta['cur_page'] = str(cur_page + 1)
        if method.upper() == 'POST':
            return scrapy.FormRequest(url,
                                      callback=self.parse_list,
                                      formdata=form_data,
                                      headers=headers,
                                      meta=response.meta,
                                      dont_filter=True)
        else:
            # Note: if the URL query parameters change dynamically, override handle_url() as needed
            return scrapy.Request(url,
                                  callback=self.parse_list,
                                  headers=headers,
                                  meta=response.meta)

    def get_same_seq(self, link_list, source_url):
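        # Illustration: get_same_seq([url, other, url, other], url) -> '1010';
        # a '1' marks a link that is identical to the source page URL.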
        seq = ''
        for link in link_list:
            if link == source_url:
                seq += '1'
            else:
                seq += '0'
        assert len(seq) == len(link_list), 'seq length mismatch; check get_same_seq'
        return seq

    def handle_detail_link(self, url, link):
        if link.startswith('http'):
            good_link = link
        else:
            # override as needed: domain_str varies per site and never ends with '/'
            domain_str = self.get_domain_info(url)
            if link.startswith('..'):
                good_link = link.replace('..', domain_str, 1)
            elif link.startswith('.'):
                good_link = link.replace('.', domain_str, 1)
            elif link.startswith('/'):
                good_link = domain_str + link
            else:
                print('Please override this method')
                good_link = ''
        return good_link
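
    # Illustration of handle_detail_link (assuming domain_str == 'http://www.example.com'):
    #   '../a/b.aspx' -> 'http://www.example.com/a/b.aspx'
    #   './a/b.aspx'  -> 'http://www.example.com/a/b.aspx'
    #   '/a/b.aspx'   -> 'http://www.example.com/a/b.aspx'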

    def handle_url(self, url, query_data):
        # Override this method if the request URL's query parameters change dynamically
        return url

    def fmt_time(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def get_form_data(self, response, postion):
        cur_page = response.meta.get('cur_page', 1)
        __VIEWSTATE = response.xpath(
            '//input[@id="__VIEWSTATE"]/@value').extract_first()
        __EVENTVALIDATION = response.xpath(
            '//input[@id="__EVENTVALIDATION"]/@value').extract_first()
        hidd_type = "1"
        newpage = str(int(cur_page) + 1)
        __EVENTTARGET = "Linkbutton3"
        return {
            "__VIEWSTATE": __EVENTVALIDATION,
            "__EVENTVALIDATION": __EVENTVALIDATION,
            "hidd_type": hidd_type,
            "newpage": newpage,
            "__EVENTTARGET": __EVENTTARGET
        }

    def get_query_data(self, response, cur_page):
        return {}

    def get_headers(self, url, link, position):
        # position: one of ListPage (list page), DetailPage (detail page), QualPage (qualification page), ProjPage (project page), StaffPage (staff page)
        return {}

    def get_domain_info(self, link):
        # override as needed, depending on how the links begin
        # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
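        # e.g. 'http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Sgqy/Sgqy_List.aspx'
        #      -> 'http://jzzb.cqjsxx.com'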
        import urlparse
        res = urlparse.urlparse(link)
        return res.scheme + '://' + res.netloc

    def get_info_listPage(self, resp_list_page):
        """
        备用, 部分数据须从列表页才能获取完整数据, 自实现(一般不建议)
        :param resp_list_page: 列表页的response
        :return: 字典数据
        """
        return {}

    def extract_qualification_info(self, resp_detail, qual_rules):
        # may need to issue another request
        return {"name": "override required"}

    def extract_compass_info(self, resp_detail, com_rules):
        return {}

    def get_staff_link(self, resp_detail, compass_items):
        return ''

    def get_behavior_link(self, resp_detail, compass_items):
        return ''

    def get_change_link(self, resp_detail, compass_items):
        return ''

    def get_project_link(self, resp_detail, compass_items):
        return ''

    def run(self):
        cmdline.execute(['scrapy', 'crawl', self.name])
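

# A minimal sketch (not part of the original project) showing how ParentSpider
# is meant to be subclassed: the four class attributes checked in __init__ must
# be provided, and the hook methods are overridden as needed. The spider name,
# domain, URL and all XPath rules below are hypothetical placeholders.
class ExampleProvinceSpider(ParentSpider):
    name = 'example_province_spider'
    allowed_domains = ['example.gov.cn']
    start_urls = ['http://example.gov.cn/corp/list?page=1']
    extract_dict = {
        'list_page': {
            'method': 'GET',
            'lines_rule': '//tbody/tr',
            'detail_link_rule': './td[1]/a/@href',
            'have_next_page_rule': u'//a[contains(text(), "下页")]',
            'total_page_num_rule': '//span[@id="totalPage"]/text()',
        },
        'detail_page': {
            'method': 'GET',
            'compass': {},        # rules consumed by extract_compass_info()
            'qualification': {},  # rules consumed by extract_qualification_info()
        },
    }

    def get_headers(self, url, link, position):
        # most of these sites only require a Referer on follow-up requests
        return {'Referer': url}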
Ejemplo n.º 16
0
class JiangSuCompass(BaseCompass):
    name = 'jiangsu_compass'
    allow_domain = ['58.213.147.230:7001']
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [(
        'http://58.213.147.230:7001/Jsjzyxyglpt/faces/public/companyies.jsp?qylx=jlqy',
        sit_list[0])]

    extract_dict = {
        'inner': {
            'nodes':
            '//table[@mainbody]//tr[@onclick]',
            'cname':
            './td[2]/div[@title]/nobr//text()',
            'detail_link':
            './td[2]/div[@title]//a[contains(@href, "corp")]/@href',
            # 'http://xmgk.scjst.gov.cn/QueryInfo/Ente/' + xxx
            'next_page_flag':
            u'//a[@disabled="disabled" and contains(text(), "下页")]/text()',
        },
        'view': '//input[@name="com.sun.faces.VIEW"]/@value',
    }

    redis_tools = RedisTools()

    def start_requests(self):
        for link, _ in self.start_urls:
            yield scrapy.Request(link,
                                 callback=self.parse_list,
                                 meta={
                                     'cur_page': '1',
                                     'total_page_num': 35
                                 },
                                 dont_filter=True)

    def parse_list(self, response):

        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []

        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'jiangsu'
            if self.redis_tools.check_finger(item['compass_name']):
                print(u'{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}

        meta = response.meta
        if int(meta['total_page_num']) > int(meta['cur_page']):
            print(u'Current page: {}'.format(meta['cur_page']))
            yield self.turn_page(response)
        else:
            print(u'No more pages; current max page: {}'.format(meta['cur_page']))
            return

    def turn_page(self, response):
        meta = response.meta
        headers = self.get_header(response.url, flag='2')
        if int(meta['cur_page']) % 10:
            time.sleep(random.random() * 4)
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        formdata = self.get_form_data(response)
        return scrapy.FormRequest(response.url,
                                  formdata=formdata,
                                  callback=self.parse_list,
                                  headers=headers,
                                  meta=meta)

    def handle_cdetail_link(self, link, flag='inner', url=''):
        # javascript:newWindow('jlqy/basicInfoView.jsp?action=viewJlqyJbxx&corpCode=71629845-5',1024,0,'jlqyView');
        # http://58.213.147.230:7001/Jsjzyxyglpt/faces/public/jlqy/basicInfoView.jsp?action=viewJlqyJbxx&corpCode=71628806-2
        import re
        pp = re.compile(r"\('(.*?)'\);")
        _ = re.search(pp, link).group(1)
        return 'http://58.213.147.230:7001/Jsjzyxyglpt/faces/public/' + _
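        # With the example onclick above, group(1) is
        # 'jlqy/basicInfoView.jsp?action=viewJlqyJbxx&corpCode=71629845-5'.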

    def get_form_data(self, resp):
        meta = resp.meta
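        # JSF (JavaServer Faces) postback: 'form:page' carries the requested
        # page number and 'com.sun.faces.VIEW' echoes the view-state token
        # captured from the current page.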
        formdata = {
            'projectWinSelectedTabPageIndex': '1',
            'basicWinSelectedTabPageIndex': '1',
            'peopleWinSelectedTabPageIndex': '1',
            'form:refreshAct': '',
            'form:page': meta['cur_page'],
            'form:_id0': 'jlqy',
            'form:_id2': '',
            'form:_id3': '',
            'form:_id4': '',
            'form:checkCode': '',
            'pageSize': '30',
            'com.sun.faces.VIEW':
                resp.xpath(self.extract_dict['view']).extract_first(),
            'form': 'form',
        }
        return formdata