Beispiel #1
0
    def parse_detail(self, response):
        """Build one ``GsxtItem`` from a company detail page.

        Every ``<dl>`` found under ``div.details`` contributes one line made
        of its ``<dt>`` text immediately followed by its ``<dd>`` text.  The
        newline-terminated lines are concatenated into ``item['zhizhao']``.
        The company name is read from ``response.meta['company_name']``.

        Yields:
            A single ``GsxtItem`` with ``zhizhao`` and ``cname`` populated.

        Raises:
            KeyError: if ``response.meta`` has no ``'company_name'`` entry.
        """
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('parse_detail now')
        page = PyQuery(response.body)

        # A selection made on a PyQuery object is already a PyQuery object,
        # so it can be queried directly without being wrapped again.
        lines = [
            entry('dt').text() + entry('dd').text() + '\n'
            for entry in page('div.details')('dl').items()
        ]

        item = GsxtItem()
        item['zhizhao'] = ''.join(lines)
        item['cname'] = response.meta['company_name']
        yield item
Beispiel #2
0
    def parse_getcontent(self, response):
        """Handle a search-result page and log every company it lists.

        The first value produced by ``self.return_begin(response)`` is
        yielded unchanged, then ``self.countsucc`` is incremented.  If the
        page contains ``div.main-layout a.search_list_item`` entries, the
        ``<h1>`` text of each entry (with all whitespace removed) is logged
        together with the current counter value.

        Nothing else is yielded: following the result links and emitting
        summary items are not performed by this callback.

        Any exception raised while processing is logged through
        ``self.logger.exception`` and is not re-raised.
        """
        print('getcontent now')
        try:
            # The builtin next() works on Python 2.6+ and Python 3, whereas
            # the generator ``.next()`` method exists only on Python 2.
            yield next(self.return_begin(response))
            self.countsucc += 1

            results = PyQuery(response.body)(
                'div.main-layout a.search_list_item')
            if not results:
                print('没有符合的公司')
                return

            # The search term matched at least one company.
            print('成功搜索到公司')

            # NOTE: requesting each result's detail page (callback
            # ``parse_detail``, with ``company_name`` added to a copy of
            # ``response.meta``) and yielding a summary ``GsxtItem`` per
            # result are both switched off here; only the names are logged.
            for entry in results.items():
                name = ''.join(entry('h1').text().split())
                # Lazy %-style arguments keep the message identical and stay
                # safe when the name is a non-ASCII unicode string.
                self.logger.info('%s, %s', self.countsucc, name)

        except Exception as e:
            self.logger.exception(e)
            print('error parse_getcontent')
Beispiel #3
0
    def save_content(self, response):
        """Check that a fetched page is the expected kind, then emit it.

        ``response.meta['info']`` names the page kind.  For the kinds
        ``'basic'``, ``'JCXX'`` and ``'XZXK'`` the body must contain a
        kind-specific marker text:

        * marker present: an info line is logged and processing continues;
        * marker missing: an info line is logged, the body is printed, the
          value returned by ``self.man_retry(response)`` is yielded
          (presumably a retry request -- confirm against ``man_retry``) and
          the generator stops without producing a result.

        Any other kind skips the check.  On success the object returned by
        ``GsxtItem.get_result_from_response(response)`` is yielded with its
        ``'content'`` set to ``"<name> <list_name>\\n<body>"``.

        Raises:
            KeyError: if ``meta`` lacks ``'info'``, or lacks ``'name'`` /
                ``'list_name'`` when they are needed for a message.
        """
        meta = response.meta

        # Page kind -> text that has to appear in the body of a good page.
        # NOTE(review): the markers are native ``str`` literals; if this ever
        # runs where ``response.body`` is ``bytes`` (Python 3) the containment
        # test needs bytes markers -- confirm before porting.
        required_markers = (
            ('basic', 'company_detail_basic.html'),
            ('JCXX', '营业执照信息'),
            ('XZXK', '行政许可信息'),
        )

        for kind, marker in required_markers:
            if meta['info'] != kind:
                continue

            if marker not in response.body:
                self.logger.info(
                    'not get {}, name:{} list_name:{}'.format(
                        kind, meta['name'], meta['list_name']))
                print(response.body)
                yield self.man_retry(response)
                return

            self.logger.info(
                'get content {}, name:{} list_name:{}'.format(
                    kind, meta['name'], meta['list_name']))
            # The kinds are mutually exclusive, so no further check applies.
            break

        result = GsxtItem.get_result_from_response(response)
        result['content'] = '%s %s\n%s' % (meta['name'], meta['list_name'],
                                           response.body)
        yield result
Beispiel #4
0
    def parse_getcontent(self, response):
        """Handle a search-result page: follow every hit, then summarise it.

        For each ``div.main-layout a.search_list_item`` entry a GET
        ``Request`` to ``self.detail_url + href`` is yielded first, with
        ``self.parse_detail`` as callback and the entry's ``<h1>`` text
        (whitespace removed) stored as ``company_name`` in a copy of the
        incoming meta.  After all requests, one ``GsxtItem`` per entry is
        yielded carrying ``cname``, ``status``, ``ccode``, ``lawuser`` and
        ``etime`` read from the entry.

        If the page lists no entries, only a message is printed.

        Any exception raised while processing is logged with its traceback
        through ``self.logger.exception`` and is not re-raised.
        """
        print('getcontent now')
        try:
            results = PyQuery(response.body)(
                'div.main-layout a.search_list_item')
            if not results:
                print('没有符合的公司')
                return

            # The search term matched at least one company.
            print('成功搜索到公司')

            # Follow every result to its detail page.
            base_meta = deepcopy(response.meta)
            for entry in results.items():
                name = ''.join(entry('h1').text().split())
                yield Request(
                    method='GET',
                    # A fresh dict per request, so no request depends on a
                    # mapping that is mutated on the next iteration.
                    meta=dict(base_meta, company_name=name),
                    url=self.detail_url + entry.attr('href'),
                    callback=self.parse_detail,
                    dont_filter=True,
                )

            # Emit the summary shown on the result list for every company.
            for entry in results.items():
                item = GsxtItem()
                item['cname'] = ''.join(entry('h1').text().split())
                item['status'] = entry('div.wrap-corpStatus span').text()
                item['ccode'] = entry('div.div-map2 span').text()
                item['lawuser'] = entry('div.div-user2 span').text()
                item['etime'] = entry('div.div-info-circle2 span').text()
                yield item

        except Exception:
            # Record what went wrong instead of discarding the exception.
            self.logger.exception('error parse_getcontent')
            print('error parse_getcontent')
Beispiel #5
0
 def parse(self, response):
     """Yield one ``GsxtItem`` whose ``text`` field holds the first result
     of the ``//text()`` XPath query on the response, as returned by
     ``extract_first()``.
     """
     first_text = response.xpath("//text()").extract_first()
     item = GsxtItem()
     item['text'] = first_text
     yield item