def parse(self, response):
    """Parse a Wangjia news page (show-box layout) into a XinwenItem.

    Returns the populated item, or None when the URL carries no valid
    thread/category or the page is an access-limited notice.
    """
    tc = self.get_thread_category_from_url(response.url)
    if not tc[0] or not tc[1]:
        self.logger.warning('Invalid Wangjia News Item From <%s>.' % response.url)
        return None
    symbol = (tc[0], self.tab[tc[1]], response.url)
    # An alert box instead of the article body marks an access-limited page.
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning('No.%s Wangjia News %s Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia News %s Item From <%s>.' % symbol)
    item = XinwenItem()
    item['thread'] = int(symbol[0])
    item['category_id'] = tc[1]
    item['source'] = symbol[2]
    article = response.xpath('//div[@class="show-box"]')
    item['title'] = get_content(article.xpath('h1/text()').extract())
    subtitle = article.xpath('div[@class="s-bq"]/span')
    item['created'] = subtitle[0].xpath('text()').extract()[0]
    if len(subtitle) >= 3:
        # Author span looks like "label:value"; keep the part after the colon.
        item['author'] = get_content(subtitle[2].xpath('text()').extract()).split(u':')[1]
    item['summary'] = get_content(article.xpath('div[@class="s-zy"]/span/text()').extract())
    body = article.xpath('div[@class="c-cen"]')
    item['content'] = ''.join([get_trunk(c) for c in body.xpath('.//text()').extract()])
    item['raw_content'] = get_content(body.extract())
    item['image_url'] = '#'.join([self.modify_image_url(get_trunk(c))
                                  for c in body.xpath('.//img/@src').extract()]) or None
    self.logger.info(item)
    # BUG FIX: the original logged the item and then returned None, so the
    # fully-built item never reached the pipeline. Return it, as the
    # sibling Wangjia news parser in this file does.
    return item
def parse_news_detail(self, response):
    """Build a NewsItem from a news detail page and yield it."""
    first_meta = lambda attr: response.xpath(
        '//meta[@name="%s"]/@content' % attr).extract_first()
    news = NewsItem()
    news['thread'] = self.get_thread_from_url(response.url)
    news['source'] = response.url
    news['title'] = get_content(response.xpath('//title/text()').extract())
    news['created'] = get_content(
        response.xpath('//small/span[last()]/text()').extract())
    news['author'] = first_meta('author')
    news['summary'] = first_meta('description')
    news['keywords'] = first_meta('keywords')
    news['category'] = get_content(
        response.xpath('//small/span[1]/a/text()').extract())
    article = response.xpath('//div[@class="article-txt"]')
    news['raw_content'] = article.extract_first()
    # Flatten every text node of the article into one plain-text string.
    pieces = [get_trunk(t) for t in article.xpath('.//text()').extract()]
    news['content'] = ''.join(pieces)
    images = [get_trunk(u) for u in article.xpath('.//img/@src').extract()]
    news['image_url'] = '#'.join(images) or None
    yield news
def parse_news_detail(self, response):
    """Assemble a NewsItem from the detail page plus fields forwarded in response.meta."""
    news = NewsItem()
    news['thread'] = self.get_thread_from_url(response.url)
    news['source'] = response.url
    news['title'] = get_content(response.xpath('//h1/text()').extract())
    # These fields were scraped on the listing page and passed along via meta.
    for field in ('created', 'author', 'category', 'summary'):
        news[field] = response.meta[field]
    article = response.xpath('//td[@id="article_content"]')
    news['raw_content'] = article.extract_first()
    paragraph_texts = article.xpath(
        './/p[contains(@class, "ke-editor-inner-p")]/text()').extract()
    news['content'] = ''.join(get_trunk(t) for t in paragraph_texts)
    image_urls = [self.modify_image_url(get_trunk(u))
                  for u in article.xpath('.//img/@src').extract()]
    news['image_url'] = '#'.join(image_urls) or None
    yield news
def parse_exposure_detail(self, response):
    """Extract one ExposureItem from a forum exposure thread page."""
    exposure = ExposureItem()
    exposure['thread'] = self.get_thread_from_url(response.url)
    exposure['source'] = response.url
    exposure['title'] = get_content(
        response.xpath('//span[@id="thread_subject"]/text()').extract())
    # Keep everything after the first space (drops the leading label of the post-time text).
    poston = response.xpath('(//div[@class="authi"])[2]/em/text()').extract_first()
    exposure['created'] = poston[poston.index(' ') + 1:]
    option_row = '//div[@class="typeoption"]//tr[%d]/td'
    exposure['name'] = get_content(response.xpath(option_row % 1 + '/text()').extract())
    exposure['link'] = get_content(response.xpath(option_row % 2 + '/a/text()').extract())
    exposure['reason'] = get_content(response.xpath(option_row % 3 + '/text()').extract())
    body = response.xpath('//td[contains(@id, "postmessage")]')
    exposure['content'] = ''.join(get_trunk(t) for t in body.xpath('.//text()').extract())
    exposure['raw_content'] = body.extract_first()
    zoomfiles = response.xpath(
        '//ignore_js_op//img[re:test(@zoomfile, "^data")]/@zoomfile').extract()
    exposure['image_url'] = '#'.join(
        self.modify_image_url(get_trunk(u)) for u in zoomfiles) or None
    yield exposure
def parse(self, response):
    """Parse a Wangjia news article page (con_news layout) into a XinwenItem."""
    tc = self.get_thread_category_from_url(response.url)
    if not (tc[0] and tc[1]):
        self.logger.warning("Invalid Wangjia News Item From <%s>." % response.url)
        return None
    symbol = (tc[0], self.tab[tc[1]], response.url)
    # An alert box instead of the article means the page is access-limited.
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning("No.%s Wangjia News %s Item From <%s> Maybe Limited." % symbol)
        return None
    self.logger.info("Parsing No.%s Wangjia News %s Item From <%s>." % symbol)
    item = XinwenItem()
    item["thread"] = int(symbol[0])
    item["category_id"] = tc[1]
    item["source"] = symbol[2]
    article = response.xpath('//div[@class="con_news"]')
    item["title"] = get_content(article.xpath("h1/text()").extract())
    # The n_time line packs time and author into one colon-separated string
    # (Python 2: encode to a utf-8 byte string before splitting on ':').
    time_line = article.xpath('ul/li[@class="n_time"]/text()').extract()[0]
    fields = time_line.encode("utf8").split(":")
    item["created"] = get_content(fields[1].split())
    item["author"] = get_content(fields[-1].split())
    item["summary"] = get_content(
        article.xpath('ul/li[@class="a_abstract"]/span/text()').extract())
    body = article.xpath('ul/li[@class="news_con_p"]')
    item["content"] = "".join(get_trunk(t) for t in body.xpath(".//text()").extract())
    item["raw_content"] = get_content(body.extract())
    urls = [self.modify_image_url(get_trunk(u)) for u in body.xpath(".//img/@src").extract()]
    item["image_url"] = "#".join(urls) or None
    return item
def parse_govern_info(self, response):
    """Parse the governance tab of a company page into a GovernInfoItem.

    Under #govern-info, tables 2-5 hold (by position) the relation pairs,
    the controller, the shareholder rows and the manager rows.
    """
    govern_info = response.xpath('//div[@id="govern-info"]')

    def rows_as_lists(table_index):
        # One list per <tr>, containing its non-blank trimmed cell texts.
        # (Extracted as a helper: shareholder and manager tables used two
        # copy-pasted identical loops.)
        rows = []
        for tr in govern_info.xpath('table[%d]/tbody/tr' % table_index):
            cells = [get_trunk(cell) for cell in tr.xpath('td//text()').extract()]
            rows.append([cell for cell in cells if cell != ''])
        return rows

    company = GovernInfoItem()
    company['link'] = response.url
    company['name'] = get_content(
        response.xpath('//div[@class="comp-intro"]').xpath(
            './/div[@class="intro-txt"]/span')[0].xpath('string(.)').extract())
    company['code'] = response.meta['code']
    company['structure'] = get_content(
        response.xpath('//div[@class="mask"]/img[@class="mask-img"]/@src').extract())
    relation = dict()
    for tr in govern_info.xpath('table[2]/tbody/tr'):
        key = get_content(tr.xpath('string(td[1])').extract())
        relation[key] = get_content(tr.xpath('string(td[2])').extract())
    # NOTE: json.dumps(encoding=...) is Python 2 only, consistent with this
    # file's other py2 constructs (xrange).
    company['relation'] = json.dumps(relation, encoding="UTF-8", ensure_ascii=False)
    controller = govern_info.xpath('table[3]/tbody/tr[1]/td/text()').extract()
    company['controller'] = json.dumps(controller, encoding="UTF-8", ensure_ascii=False)
    company['shareholder_list'] = json.dumps(
        rows_as_lists(4), encoding="UTF-8", ensure_ascii=False)
    company['manager_list'] = json.dumps(
        rows_as_lists(5), encoding="UTF-8", ensure_ascii=False)
    return company
def parse(self, response):
    """Parse a Wangjia platform feature page into a TedianItem."""
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = TedianItem()
    item['name'] = self.object.name
    rtag = response.xpath('//div[@class="rTags"]')
    if rtag:
        item['status'] = get_content(rtag.xpath('./span[@class="tag3"]/text()').extract())
        item['company_tag'] = get_content(rtag.xpath('./span[@class="tag tag2"]/text()').extract())
        tags = rtag.xpath('./span[@class = "tag"]').xpath('text()').extract()
        item['illustration'] = '/'.join(get_trunk(t) for t in tags)
    comment_info = response.xpath('//div[contains(@class,"box commentBox")]')
    if comment_info:
        comment = comment_info.xpath('./dl[@class="comment"]')
        item['recommendation'] = get_content(comment.xpath('./dt/span/text()').extract())
        # Four numeric scores and four companion labels, in a fixed order.
        nums = comment.xpath('./dd/span[@class="num"]')
        for idx, field in enumerate(('withdraw_num', 'guard_num',
                                     'service_num', 'experience_num')):
            item[field] = get_content(nums[idx].xpath('text()').extract())
        labels = comment.xpath('.//span[not(@class="num")]')
        for idx, field in enumerate(('withdraw_day', 'guard_day',
                                     'service_status', 'experience_status')):
            item[field] = get_content(labels[idx].xpath('text()').extract())
        impressions = comment_info.xpath(
            './dl[@class="impression"]/dd//span').xpath('text()').extract()
        item['impression'] = '\001'.join(get_trunk(s) for s in impressions)
    return item
def parse(self, response):
    """Parse a Wangjia exposure page (news layout) into a BaoguangItem."""
    symbol = (self.get_thread_from_url(response.url), response.url)
    if not symbol[0]:
        self.logger.warning('Invalid Wangjia Exposure Item From <%s>.' % symbol[1])
        return None
    if response.xpath('//div[@class="wrap"]'):
        self.logger.warning('May Redirect To Warning Page Of Wangjia.')
        return None
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning('No.%s Wangjia Exposure Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia Exposure Item From <%s>.' % symbol)
    item = BaoguangItem()
    item['thread'] = int(symbol[0])
    item['source'] = symbol[1]
    item['title'] = get_content(
        response.xpath('//h1[@class="context-title"]/text()').extract())
    subtitle = response.xpath('//div[@class="post-time"]/span')
    poston = get_content(subtitle.xpath('text()').extract(), skipBlank=False)
    # Drop the label before the first space, keeping the remainder as the timestamp.
    item['created'] = poston[poston.index(' ') + 1:]
    header = response.xpath('//div[@class="post-pub-txt mb12"]/table/tbody/tr')
    if header:
        for row, field in ((1, 'name'), (2, 'link'), (3, 'reason')):
            item[field] = get_content(header[row].xpath('./td[2]/text()').extract())
    body = response.xpath('//div[@class="news_con_p"]')
    item['content'] = ''.join(get_trunk(t) for t in body.xpath('.//text()').extract())
    item['raw_content'] = get_content(body.extract())
    item['image_url'] = '#'.join(
        self.modify_image_url(get_trunk(u))
        for u in body.xpath('.//img/@src').extract()) or None
    return item
def parse(self, response):
    """Parse a 39Health drug-manual page into a YaopingItem."""
    symbol = (self.mapping.get(response.url), response.url)
    self.logger.info(
        "Parsing ID.%d 39Health Drug Informations From <%s>." % symbol)
    item = YaopingItem()
    item['manual_id'] = symbol[0]
    crumbs = response.xpath('//div[@class="subs"]//a').xpath('text()').extract()
    item['category_list'] = '>>'.join(get_trunk(c) for c in crumbs)
    parts = item['category_list'].split(">>")
    # Only breadcrumbs of length 3 or 4 are mapped; index 0 appears to be the
    # breadcrumb root (levels 1/2 are stored) — confirm against the live page.
    if len(parts) == 3:
        item['category_first'] = parts[1]
    elif len(parts) == 4:
        item['category_first'] = parts[1]
        item['category_second'] = parts[2]
    item['name'] = get_content(
        response.xpath('//div[@class="t1"]/h1/a/text()').extract())
    cite_texts = response.xpath('//div[@class="t1"]//cite').xpath('span/text()').extract()
    item['cites'] = '&&'.join(get_trunk(c) for c in cite_texts)
    item['english_name'] = get_content(
        response.xpath('//cite[@class="t2"]/text()').extract(), skipBlank=False)
    item['company'] = get_content(
        response.xpath('//li[@class="company"]/text()').extract())
    item['address'] = get_content(
        response.xpath('//li[@class="address"]/text()').extract())
    item['telephone'] = get_content(
        response.xpath('//li[@class="telephone"]/text()').extract(), skipBlank=False)
    for info in response.xpath('//div[@class="tab_box"]//dl'):
        key = get_content(info.xpath('dt/text()').extract())
        attr = self.detail_map.get(key)
        if attr:
            # string(.) flattens the <dd> subtree, stripping inner HTML tags.
            item[attr] = get_content(info.xpath('dd').xpath('string(.)').extract())
    return item
def parse(self, response):
    """Parse a Wangjia exposure forum thread into a BaoguangItem."""
    thread_id = self.get_thread_from_url(response.url)
    symbol = (thread_id, response.url)
    if not thread_id:
        self.logger.warning('Invalid Wangjia Exposure Item From <%s>.' % symbol[1])
        return None
    if response.xpath('//div[@class="wrap"]'):
        self.logger.warning('May Redirect To Warning Page Of Wangjia.')
        return None
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning(
            'No.%s Wangjia Exposure Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia Exposure Item From <%s>.' % symbol)
    item = BaoguangItem()
    item['thread'] = int(thread_id)
    item['source'] = response.url
    item['title'] = get_content(
        response.xpath('//span[@id="thread_subject"]').xpath('text()').extract())
    first_post_time = response.xpath('//em[starts-with(@id, "authorposton")]')[0]
    poston = get_content(first_post_time.xpath('text()').extract(), skipBlank=False)
    # Keep only what follows the first space of the post-time text.
    item['created'] = poston[poston.index(' ') + 1:]
    header = response.xpath('//div[@class="typeoption"]/table/tbody/tr/td')
    if header:
        item['name'] = get_content(header[0].xpath('.//text()').extract())
        item['link'] = get_content(header[1].xpath('.//text()').extract())
        item['reason'] = get_content(header[2].xpath('.//text()').extract())
    body = response.xpath('//td[starts-with(@id, "postmessage")]')[0]
    item['content'] = ''.join(get_trunk(t) for t in body.xpath('.//text()').extract())
    item['raw_content'] = get_content(body.extract())
    # Image URLs live in @file attributes on this layout.
    files = body.xpath('.//@file').extract()
    item['image_url'] = '#'.join(
        self.modify_image_url(get_trunk(f)) for f in files) or None
    return item
def parse_detail(self, response):
    """Parse a member-detail page into a MemberItem and yield it.

    The contact block appears in one of three layouts (paragraphs inside a
    div, a single multi-line paragraph, or bare sibling paragraphs); each
    line is matched against the known field labels.
    """
    member = MemberItem()
    info = response.xpath('//div[@id="tytext"]')
    member['name'] = get_content(info.xpath('h1/text()').extract())
    member['date'] = get_content(info.xpath('p[@class="tytdate"]/text()').extract())
    member['link'] = response.url

    def assign_fields(text):
        # Map a "label:value" line onto the matching item field.
        if text is None:  # was `== None` — use identity test for None
            return
        value = text.split(':')[-1]
        if text.find(u'网址') >= 0:
            member['website'] = value
        elif text.find(u'电话') >= 0:
            member['phone'] = value
        elif text.find(u'地址') >= 0:
            member['address'] = value
        elif text.find(u'邮编') >= 0:
            member['zip'] = value

    if len(info.xpath('./div/p')) > 0:
        for p in info.xpath('./div/p'):
            assign_fields(get_content(p.xpath('string(.)').extract()))
    elif len(info.xpath('./p')) < 4:
        # Single multi-line paragraph; fields use a full-width colon here.
        for line in info.xpath('string(./p[2])').extract_first().split('\n'):
            value = get_trunk(line.split(u'：')[-1])
            if line.find(u'网址') >= 0:
                member['website'] = value
            elif line.find(u'电话') >= 0:
                member['phone'] = value
            elif line.find(u'地址') >= 0:
                member['address'] = value
            elif line.find(u'邮编') >= 0:
                member['zip'] = value
    else:
        for p in info.xpath('./p'):
            assign_fields(get_content(p.xpath('string(.)').extract()))
    # BUG FIX: 'website' may never be set (or may be empty), which made the
    # original `member['website'][0]` crash with KeyError/IndexError.
    # Debug print() calls were also removed.
    if 'website' in member and member['website'].startswith('/'):
        member['website'] = 'http:' + member['website']
    yield member
def parse(self, response):
    """Extract a YaopingItem from a 39Health drug page."""
    manual_id = self.mapping.get(response.url)
    self.logger.info("Parsing ID.%d 39Health Drug Informations From <%s>."
                     % (manual_id, response.url))
    item = YaopingItem()
    item['manual_id'] = manual_id
    sub = response.xpath('//div[@class="subs"]//a')
    item['category_list'] = '>>'.join([get_trunk(s) for s in sub.xpath('text()').extract()])
    levels = item['category_list'].split(">>")
    # Only 3- or 4-part breadcrumbs map onto category fields.
    if len(levels) in (3, 4):
        item['category_first'] = levels[1]
        if len(levels) == 4:
            item['category_second'] = levels[2]
    item['name'] = get_content(response.xpath('//div[@class="t1"]/h1/a/text()').extract())
    cites = response.xpath('//div[@class="t1"]//cite')
    item['cites'] = '&&'.join([get_trunk(c) for c in cites.xpath('span/text()').extract()])
    item['english_name'] = get_content(
        response.xpath('//cite[@class="t2"]/text()').extract(), skipBlank=False)
    for field in ('company', 'address'):
        item[field] = get_content(
            response.xpath('//li[@class="%s"]/text()' % field).extract())
    item['telephone'] = get_content(
        response.xpath('//li[@class="telephone"]/text()').extract(), skipBlank=False)
    for block in response.xpath('//div[@class="tab_box"]//dl'):
        label = get_content(block.xpath('dt/text()').extract())
        if self.detail_map.get(label):
            # string(.) collapses the dd subtree to plain text (drops HTML tags).
            item[self.detail_map[label]] = get_content(
                block.xpath('dd').xpath('string(.)').extract())
    return item
def parse(self, response):
    """Parse the JSON area listing for one product into a ChanpinItem."""
    cpid = response.meta['cpid']
    self.logger.info('Parsing ID.%d Chinawealth Area Info From Pid:%s'
                     % (self.mapping[cpid], cpid))
    payload = json.loads(response.body)
    item = ChanpinItem()
    item['pid'] = cpid
    # Join the 'cpxsqy' field of every entry in 'List' (empty string when missing).
    item['area'] = '#'.join(
        get_trunk(entry.get('cpxsqy', '')) for entry in payload.get('List', []))
    return item
def parse_platform_honor(self, response, thread, name, link):
    """Collect the platform's honor list into a HonorItem."""
    platform_honor = HonorItem()
    for field, value in (('thread', thread), ('name', name), ('link', link)):
        platform_honor[field] = value
    honors = response.xpath(
        '//div[contains(@class, "honor")]/ul/li/text()').extract()
    platform_honor['honor_list'] = [get_trunk(h) for h in honors]
    return platform_honor
def parse_detail(self, response):
    """Yield a ReportItem scraped from a report detail page."""
    report = ReportItem()
    report['thread'] = self.get_id_from_url(response.url)
    report['category'] = response.meta['category']
    report['link'] = response.url
    report['title'] = get_content(
        response.xpath('//div[@class="report"]/h1/text()').extract())
    # Keep only the last 10 characters of the input time — presumably the
    # date portion; confirm against the live page.
    created_text = get_content(
        response.xpath('//span[@class="inputtime"]/text()').extract())
    report['created'] = created_text[-10:]
    article = response.xpath('//div[@class="dianping"]')
    report['raw_content'] = article.extract_first()
    report['content'] = ''.join(
        get_trunk(t) for t in article.xpath('.//text()').extract())
    images = [get_trunk(u) for u in article.xpath('.//img/@src').extract()]
    report['image_url'] = '#'.join(images) or None
    yield report
def parse(self, response):
    """Turn a Wangjia exposure forum thread into a BaoguangItem (or None)."""
    symbol = (self.get_thread_from_url(response.url), response.url)
    # Guard clauses: invalid URL, warning-page redirect, access-limited page.
    if not symbol[0]:
        self.logger.warning('Invalid Wangjia Exposure Item From <%s>.' % symbol[1])
        return None
    if response.xpath('//div[@class="wrap"]'):
        self.logger.warning('May Redirect To Warning Page Of Wangjia.')
        return None
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning('No.%s Wangjia Exposure Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia Exposure Item From <%s>.' % symbol)
    item = BaoguangItem()
    item['thread'], item['source'] = int(symbol[0]), symbol[1]
    item['title'] = get_content(
        response.xpath('//span[@id="thread_subject"]').xpath('text()').extract())
    poston = get_content(
        response.xpath('//em[starts-with(@id, "authorposton")]')[0].xpath('text()').extract(),
        skipBlank=False)
    item['created'] = poston[poston.index(' ') + 1:]
    cells = response.xpath('//div[@class="typeoption"]/table/tbody/tr/td')
    if cells:
        for idx, field in enumerate(('name', 'link', 'reason')):
            item[field] = get_content(cells[idx].xpath('.//text()').extract())
    body = response.xpath('//td[starts-with(@id, "postmessage")]')[0]
    item['content'] = ''.join([get_trunk(t) for t in body.xpath('.//text()').extract()])
    item['raw_content'] = get_content(body.extract())
    item['image_url'] = '#'.join([
        self.modify_image_url(get_trunk(f)) for f in body.xpath('.//@file').extract()
    ]) or None
    return item
def parse_news_detail(self, response):
    """Yield a NewsItem for one news detail page."""
    news = NewsItem()
    news['thread'] = self.get_thread_from_url(response.url)
    news['source'] = response.url
    news['title'] = response.meta['title']
    news['created'] = get_content(
        response.xpath('//div[@class="reInfo"]/div[1]/span[2]/text()').extract())
    news['keywords'] = get_content(
        response.xpath('//meta[@name="keywords"]/@content').extract())
    news['summary'] = get_content(
        response.xpath('//meta[@name="description"]/@content').extract())
    news['category'] = response.meta['category']
    # Prefer the regular article containers; fall back to the legacy one.
    article = response.xpath('//div[@class="article-content" or @id="ctrlfscont"]')
    if not article:
        article = response.xpath('//div[@class="Custom_UnionStyle"]')
    news['raw_content'] = article.extract_first()
    news['content'] = ''.join(
        get_trunk(t) for t in article.xpath('.//text()').extract())
    news['image_url'] = '#'.join(
        get_trunk(u) for u in article.xpath('.//img/@src').extract()) or None
    yield news
def parse_detail(self, response):
    """Yield one RegionItem per region row found in the report tables.

    Region rows are recognised by having exactly six <td> cells
    (region, amount, caichanxian, shouxian, yiwaixian, jiankangxian);
    rows whose first cell contains u'地区' are treated as headers and skipped.
    Cleanup vs. original: removed the unused `capital_structure` local and
    the redundant None pre-initialisations; flattened with guard clauses.
    """
    title = response.meta['title']
    year, month = self.parse_title(title)
    created = response.meta['created']
    link = response.url
    content = ' '.join(
        [get_trunk(c) for c in response.xpath('//p//text()').extract()])
    for tbody in response.xpath('//tbody'):
        # Only tables with more than five rows look like region tables.
        if len(tbody.xpath('tr')) <= 5:
            continue
        for tr in tbody.xpath('tr'):
            if len(tr.xpath('td')) != 6:
                continue
            # string(td[i]) flattens each cell to plain text.
            values = [get_content(tr.xpath('string(td[%d])' % i).extract())
                      for i in range(1, 7)]
            region_name = values[0]
            if not region_name or region_name.find(u'地区') >= 0:
                continue
            region = RegionItem()
            region['title'] = title
            region['year'] = year
            region['month'] = month
            region['link'] = link
            region['region'] = region_name
            region['amount'] = values[1]
            region['caichanxian'] = values[2]
            region['shouxian'] = values[3]
            region['yiwaixian'] = values[4]
            region['jiankangxian'] = values[5]
            region['content'] = content
            region['created'] = created
            yield region
def parse_trade_log(self, response):
    """Yield one TradeLogItem per reporting date in the trade-log tables.

    Table 1 holds the date labels; table 2 has a header row of attribute
    names (row 1) followed by one value row per date (rows 2..N).
    """
    attr_list = response.xpath(
        '//*[@id="trade-log"]/table[2]/tr[1]/td/text()').extract()
    date_list = response.xpath(
        '//*[@id="trade-log"]/table[1]/tr/td[@class="table-label"]/text()'
    ).extract()
    date_list = [
        get_trunk(date) for date in date_list
        if get_trunk(date) != '' and get_trunk(date).find(u'信息截止日期') < 0
    ]
    # PERF: the company name is loop-invariant; the original re-ran this
    # xpath chain for every date row.
    name = get_content(
        response.xpath('//div[@class="comp-intro"]').xpath(
            './/div[@class="intro-txt"]/span')[0].xpath('string(.)').extract())
    for i, date in enumerate(date_list):
        company = TradeLogItem()
        company['link'] = response.url
        company['code'] = response.meta['code']
        company['name'] = name
        company['date'] = date
        # Row i+2 of table 2 carries the values matching attr_list.
        raw_values = response.xpath(
            '//*[@id="trade-log"]/table[2]/tr[{}]/td/text()'.format(
                str(i + 2))).extract()
        values = [get_trunk(value) for value in raw_values]
        # zip truncates to the shorter list; the original indexed attr_list
        # by position and would raise IndexError on a longer value row.
        log = dict(zip(attr_list, values))
        # NOTE: json.dumps(encoding=...) is Python 2 only, consistent with
        # the rest of this file (which also uses xrange).
        company['log'] = json.dumps(log, encoding="UTF-8", ensure_ascii=False)
        yield company
def parse(self, response):
    """Parse the Wangjia feature page into a TedianItem."""
    pin = self.get_pin_from_url(response.url)
    symbol = (self.mapping.get(pin), response.url)
    self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = TedianItem()
    item['name'] = self.object.name
    tags_box = response.xpath('//div[@class="rTags"]')
    if tags_box:
        item['status'] = get_content(
            tags_box.xpath('./span[@class="tag3"]/text()').extract())
        item['company_tag'] = get_content(
            tags_box.xpath('./span[@class="tag tag2"]/text()').extract())
        plain_tags = tags_box.xpath('./span[@class = "tag"]').xpath('text()').extract()
        item['illustration'] = '/'.join(get_trunk(t) for t in plain_tags)
    box = response.xpath('//div[contains(@class,"box commentBox")]')
    if box:
        comment = box.xpath('./dl[@class="comment"]')
        item['recommendation'] = get_content(comment.xpath('./dt/span/text()').extract())
        # Numeric scores, then their companion labels, in fixed positions.
        nums = comment.xpath('./dd/span[@class="num"]')
        item['withdraw_num'] = get_content(nums[0].xpath('text()').extract())
        item['guard_num'] = get_content(nums[1].xpath('text()').extract())
        item['service_num'] = get_content(nums[2].xpath('text()').extract())
        item['experience_num'] = get_content(nums[3].xpath('text()').extract())
        others = comment.xpath('.//span[not(@class="num")]')
        item['withdraw_day'] = get_content(others[0].xpath('text()').extract())
        item['guard_day'] = get_content(others[1].xpath('text()').extract())
        item['service_status'] = get_content(others[2].xpath('text()').extract())
        item['experience_status'] = get_content(others[3].xpath('text()').extract())
        impressions = box.xpath('./dl[@class="impression"]/dd//span').xpath('text()').extract()
        item['impression'] = '\001'.join(get_trunk(s) for s in impressions)
    return item
def parse_investor(self, response, thread, name, link):
    """Build an InvestorItem describing the platform's investor profile."""
    investor = InvestorItem()
    investor['thread'] = thread
    investor['name'] = name
    investor['link'] = link
    investor['date'] = datetime.now().strftime('%Y-%m-%d')
    age_props = response.xpath(
        '//div[@id="ageList"]/dl/dd/span[1]/em/text()').extract()
    investor['age_distribution'] = [p + '%' for p in age_props]
    # The sex split is embedded in an inline <script>; pull the assigned literal.
    investor['sex_distribution'] = response.xpath(
        '//div[contains(@class, "index-investors-sex")]/script/text()'
    ).re_first(r'= (.*);')
    tags = response.xpath('//div[@id="index_tag"]/a/text()').extract()
    investor['tag_list'] = '#'.join(get_trunk(t) for t in tags)
    return investor
def parse(self, response):
    #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
    """Parse a Wangjia platform archive page into a DanganItem.

    Scrapes the logo, basic detail box, introduction, company registration
    info, domain (ICP) record, key people, fee schedule, contact block and
    the record list, then logs any empty fields before returning the item.
    """
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
    # Kept on self so pipeline code can reach the DaohangItem later (see NOTE above).
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = DanganItem()
    item['name'] = self.object.name
    item['logo_url'] = get_content(response.xpath('//div[@class="rLogo"]/a/img/@src').extract())
    detail = response.xpath('//div[contains(@class, "detailBox")]/p')
    if detail:
        # Fields are read by fixed paragraph position within the detail box.
        item['link'] = get_content(detail[1].xpath('a/@href').extract())
        item['location'] = get_content(detail[3].xpath('text()').extract())
        item['launch_time'] = get_content(detail[4].xpath('text()').extract())
    about = response.xpath('//div[contains(@class, "aboutBd")]/p')
    if about:
        item['introduction'] = ' '.join([get_trunk(c) for c in about.xpath('.//text()').extract()])
    info = response.xpath('//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]')
    if info:
        # Company registration details: one field per paragraph, fixed order.
        item['company_name'] = get_content(info[0].xpath('text()').extract())
        item['artificial_person'] = get_content(info[1].xpath('text()').extract())
        item['company_type'] = get_content(info[2].xpath('text()').extract())
        item['shareholder_stucture'] = get_content(info[3].xpath('text()').extract())
        item['registered_capital'] = get_content(info[4].xpath('text()').extract())
        item['contributed_capital'] = get_content(info[5].xpath('text()').extract())
        item['registered_address'] = get_content(info[6].xpath('text()').extract())
        item['opening_date'] = get_content(info[7].xpath('text()').extract())
        item['approved_date'] = get_content(info[8].xpath('text()').extract())
        item['registration_authority'] = get_content(info[9].xpath('text()').extract())
        item['business_licence'] = get_content(info[10].xpath('text()').extract())
        item['institutional_framework'] = get_content(info[11].xpath('text()').extract())
        item['tax_registration_num'] = get_content(info[12].xpath('text()').extract())
    # Second table row ([1]) holds the domain-record values.
    record = response.xpath('//div[contains(@class, "webRecordBd")]/table/tbody/tr')[1].xpath('td')
    if record:
        item['domain_name'] = get_content(record[0].xpath('text()').extract())
        item['domain_date'] = get_content(record[1].xpath('text()').extract())
        item['domain_company_type'] = get_content(record[2].xpath('text()').extract())
        item['domain_company_name'] = get_content(record[3].xpath('text()').extract())
        item['icp'] = get_content(record[4].xpath('text()').extract())
    people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
    if people:
        avatar_url = []
        content = []
        for i in xrange(len(people)):  # xrange: this file targets Python 2
            avatar_url.extend(people[i].xpath('div[@class="avatar"]/img/@src').extract())
            content.extend([get_trunk(c) for c in people[i].xpath('p//text()').extract()])
        item['company_person_avatar_url'] = '#'.join(avatar_url)
        item['company_person'] = ' '.join(content)
    # First costBd block: fee schedule, one field per paragraph.
    cost = response.xpath('//div[contains(@class, "costBd")]')[0].xpath('p')
    if cost:
        item['management_fee'] = get_content(cost[0].xpath('text()').extract())
        item['prepaid_fee'] = get_content(cost[1].xpath('text()').extract())
        item['cash_withdrawal_fee'] = get_content(cost[2].xpath('text()').extract())
        item['vip_fee'] = get_content(cost[3].xpath('text()').extract())
        item['transfer_fee'] = get_content(cost[4].xpath('text()').extract())
        item['mode_of_payment'] = get_content(cost[5].xpath('text()').extract())
    # Second costBd block: contact details.
    contact = response.xpath('//div[contains(@class, "costBd")]')[1].xpath('p')
    if contact:
        item['contact_address'] = get_content(contact[0].xpath('text()').extract())
        item['phone_400'] = get_content(contact[1].xpath('text()').extract())
        item['phone'] = get_content(contact[2].xpath('text()').extract())
        item['fax'] = get_content(contact[3].xpath('text()').extract())
        item['email'] = get_content(contact[4].xpath('text()').extract())
    record = response.xpath('//div[contains(@class, "recordListBox")]/ul/li')
    if record:
        # NOTE(review): entries 0-2 are skipped and reading starts at [3] —
        # presumably headers/labels on the live page; confirm before changing.
        item['is_automatic_bid'] = get_content(record[3].xpath('.//text()').extract(), skipFirst=True)
        item['is_equitable_assignment'] = get_content(record[4].xpath('.//text()').extract(), skipFirst=True)
        item['trust_fund'] = get_content(record[5].xpath('.//text()').extract(), skipFirst=True)
        item['tender_security'] = get_content(record[6].xpath('.//text()').extract(), skipFirst=True)
        item['security_mode'] = get_content(record[7].xpath('.//text()').extract(), skipFirst=True)
        item['guarantee_institution'] = get_content(record[8].xpath('.//text()').extract(), skipFirst=True)
        # Short-circuit: False when the 10th entry is absent on older pages.
        item['business_type'] = len(record) >= 10 and get_content(record[9].xpath('.//text()').extract(), skipFirst=True)
    log_empty_fields(item, self.logger)
    return item
def parse(self, response):
    """Parse a Wangjia platform archive page into a DanganItem.

    Fields are read positionally from fixed page sections (detailBox,
    aboutBd, inforBd, webRecordBd, peopleBd, costBd, recordListBox);
    each section is skipped entirely when absent.
    """
    # NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE
    # PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)),
              response.url)
    self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = DanganItem()
    item['name'] = self.object.name
    item['logo_url'] = get_content(
        response.xpath('//div[@class="rLogo"]/a/img/@src').extract())
    # Basic platform details: link, location, launch time (fixed <p> offsets).
    detail = response.xpath('//div[contains(@class, "detailBox")]/p')
    if detail:
        item['link'] = get_content(detail[1].xpath('a/@href').extract())
        item['location'] = get_content(detail[3].xpath('text()').extract())
        item['launch_time'] = get_content(
            detail[4].xpath('text()').extract())
    about = response.xpath('//div[contains(@class, "aboutBd")]/p')
    if about:
        item['introduction'] = ' '.join(
            [get_trunk(c) for c in about.xpath('.//text()').extract()])
    # Company registration block: one <p> per field, in fixed page order
    # (separator rows carrying class "line" are excluded by the XPath).
    info = response.xpath(
        '//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]'
    )
    if info:
        item['company_name'] = get_content(
            info[0].xpath('text()').extract())
        item['artificial_person'] = get_content(
            info[1].xpath('text()').extract())
        item['company_type'] = get_content(
            info[2].xpath('text()').extract())
        item['shareholder_stucture'] = get_content(
            info[3].xpath('text()').extract())
        item['registered_capital'] = get_content(
            info[4].xpath('text()').extract())
        item['contributed_capital'] = get_content(
            info[5].xpath('text()').extract())
        item['registered_address'] = get_content(
            info[6].xpath('text()').extract())
        item['opening_date'] = get_content(
            info[7].xpath('text()').extract())
        item['approved_date'] = get_content(
            info[8].xpath('text()').extract())
        item['registration_authority'] = get_content(
            info[9].xpath('text()').extract())
        item['business_licence'] = get_content(
            info[10].xpath('text()').extract())
        item['institutional_framework'] = get_content(
            info[11].xpath('text()').extract())
        item['tax_registration_num'] = get_content(
            info[12].xpath('text()').extract())
    # Domain/ICP record: second table row holds the data cells.
    # NOTE(review): [1] raises IndexError if the table has fewer than two
    # rows — presumably the page always renders a header row; confirm.
    record = response.xpath(
        '//div[contains(@class, "webRecordBd")]/table/tbody/tr')[1].xpath(
            'td')
    if record:
        item['domain_name'] = get_content(
            record[0].xpath('text()').extract())
        item['domain_date'] = get_content(
            record[1].xpath('text()').extract())
        item['domain_company_type'] = get_content(
            record[2].xpath('text()').extract())
        item['domain_company_name'] = get_content(
            record[3].xpath('text()').extract())
        item['icp'] = get_content(record[4].xpath('text()').extract())
    # Team members: collect avatar URLs ('#'-joined) and bio text.
    people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
    if people:
        avatar_url = []
        content = []
        for i in xrange(len(people)):
            avatar_url.extend(
                people[i].xpath('div[@class="avatar"]/img/@src').extract())
            content.extend([
                get_trunk(c) for c in people[i].xpath('p//text()').extract()
            ])
        item['company_person_avatar_url'] = '#'.join(avatar_url)
        item['company_person'] = ' '.join(content)
    # Fee schedule: first costBd div, one <p> per fee kind in fixed order.
    cost = response.xpath('//div[contains(@class, "costBd")]')[0].xpath(
        'p')
    if cost:
        item['management_fee'] = get_content(
            cost[0].xpath('text()').extract())
        item['prepaid_fee'] = get_content(
            cost[1].xpath('text()').extract())
        item['cash_withdrawal_fee'] = get_content(
            cost[2].xpath('text()').extract())
        item['vip_fee'] = get_content(cost[3].xpath('text()').extract())
        item['transfer_fee'] = get_content(
            cost[4].xpath('text()').extract())
        item['mode_of_payment'] = get_content(
            cost[5].xpath('text()').extract())
    # Contact details live in the *second* costBd div on the page.
    contact = response.xpath('//div[contains(@class, "costBd")]')[1].xpath(
        'p')
    if contact:
        item['contact_address'] = get_content(
            contact[0].xpath('text()').extract())
        item['phone_400'] = get_content(
            contact[1].xpath('text()').extract())
        item['phone'] = get_content(contact[2].xpath('text()').extract())
        item['fax'] = get_content(contact[3].xpath('text()').extract())
        item['email'] = get_content(contact[4].xpath('text()').extract())
    # Bid/guarantee attributes: fixed <li> offsets 3..9; skipFirst drops the
    # leading label text of each entry.
    record = response.xpath(
        '//div[contains(@class, "recordListBox")]/ul/li')
    if record:
        item['is_automatic_bid'] = get_content(
            record[3].xpath('.//text()').extract(), skipFirst=True)
        item['is_equitable_assignment'] = get_content(
            record[4].xpath('.//text()').extract(), skipFirst=True)
        item['trust_fund'] = get_content(
            record[5].xpath('.//text()').extract(), skipFirst=True)
        item['tender_security'] = get_content(
            record[6].xpath('.//text()').extract(), skipFirst=True)
        item['security_mode'] = get_content(
            record[7].xpath('.//text()').extract(), skipFirst=True)
        item['guarantee_institution'] = get_content(
            record[8].xpath('.//text()').extract(), skipFirst=True)
        # When fewer than 10 entries exist this stores False, not None.
        item['business_type'] = len(record) >= 10 and get_content(
            record[9].xpath('.//text()').extract(), skipFirst=True)
    log_empty_fields(item, self.logger)
    return item
def parse_detail(self, response):
    """Parse a monthly property-insurance (财产险) report page.

    Walks every sufficiently large table on the page, extracts one
    (company, income[, share]) row at a time, and yields a
    CaichanxianItem per company.  Table layouts vary (2/3/4 columns);
    once a '份额' (share) column is detected, 3-column rows are read
    with the shifted layout and the share value is recorded.
    """
    title = response.meta['title']
    year, month = self.parse_title(title)
    created = response.meta['created']
    link = response.url
    capital_structure = None
    # Full flattened page text, attached to every yielded item.
    content = ' '.join(
        [get_trunk(c) for c in response.xpath('//p//text()').extract()])
    share = None
    flag = False  # becomes True once a '份额' column has been seen
    for tbody in response.xpath('//tbody'):
        # Only tables with many rows are candidate data tables.
        if len(tbody.xpath('tr')) > 10:
            for tr in tbody.xpath('tr'):
                if len(tr.xpath('td')) == 3:
                    if flag:
                        # name / income / share layout
                        name = get_content(
                            tr.xpath('string(td[1])').extract())
                        income = get_content(
                            tr.xpath('string(td[2])').extract())
                        share = get_content(
                            tr.xpath('string(td[3])').extract())
                    else:
                        # td[1] may hold the capital-structure label
                        name = get_content(
                            tr.xpath('string(td[2])').extract())
                        income = get_content(
                            tr.xpath('string(td[3])').extract())
                        try:
                            if get_content(
                                    tr.xpath('string(td[1])').extract()
                            ).find(u'资') >= 0:
                                capital_structure = get_content(
                                    tr.xpath('string(td[1])').extract())
                        except Exception:
                            # td[1] may be empty/non-text; best-effort only.
                            pass
                elif len(tr.xpath('td')) == 2:
                    name = get_content(tr.xpath('string(td[1])').extract())
                    income = get_content(
                        tr.xpath('string(td[2])').extract())
                elif len(tr.xpath('td')) == 4:
                    try:
                        if get_content(
                                tr.xpath('string(td[1])').extract()).find(
                                    u'资') >= 0:
                            capital_structure = get_content(
                                tr.xpath('string(td[1])').extract())
                    except Exception:
                        # td[1] may be empty/non-text; best-effort only.
                        pass
                    name = get_content(tr.xpath('string(td[3])').extract())
                    income = get_content(
                        tr.xpath('string(td[4])').extract())
                else:
                    continue
                if income and income.find(u'份额') >= 0:
                    flag = True
                # Skip header/subtotal/total rows and unit-label rows.
                if name and income and name.find(
                        u'公司名称') < 0 and name.find(
                            u'小计') < 0 and name.find(u'合计') < 0:
                    if income.find(u'万元') >= 0 or income.find(
                            u'保费') >= 0 or income.find(u'份额') >= 0:
                        continue
                    company = CaichanxianItem()
                    company['title'] = title
                    company['year'] = year
                    company['month'] = month
                    company['link'] = link
                    company['company_name'] = name
                    company['income'] = income
                    company['capital_structure'] = capital_structure
                    company['content'] = content
                    company['created'] = created
                    if flag:
                        company['share'] = share
                    yield company
def _fill_business_info(self, item, business_info):
    """Fill company-registration (工商信息) fields from the given div.

    The div holds four stacked tables; fields are read at fixed row
    offsets (td[2] of each row).
    """
    part1 = business_info.xpath('table[1]//tr')
    item['company_name'] = get_content(
        part1[0].xpath('td[2]/text()').extract())
    item['artificial_person'] = get_content(
        part1[1].xpath('td[2]/text()').extract())
    item['company_type'] = get_content(
        part1[2].xpath('td[2]/text()').extract())
    # '--' is the page's placeholder for "not disclosed".
    item['ownership_structure'] = get_content(
        part1[3].xpath('td[2]/text()').extract()).replace("--", '')
    part2 = business_info.xpath('table[2]//tr')
    item['registered_capital'] = get_content(
        part2[0].xpath('td[2]/text()').extract())
    item['contributed_capital'] = get_content(
        part2[1].xpath('td[2]/text()').extract())
    item['registered_address'] = get_content(
        part2[2].xpath('td[2]/text()').extract())
    part3 = business_info.xpath('table[3]//tr')
    item['opening_date'] = get_content(
        part3[0].xpath('td[2]/text()').extract())
    item['approved_date'] = get_content(
        part3[1].xpath('td[2]/text()').extract())
    item['registration_authority'] = get_content(
        part3[2].xpath('td[2]/text()').extract())
    item['business_licence'] = get_content(
        part3[3].xpath('td[2]/text()').extract())
    item['institutional_framework'] = get_content(
        part3[4].xpath('td[2]/text()').extract())
    item['tax_registration_num'] = get_content(
        part3[5].xpath('td[2]/text()').extract())
    item['business_scope'] = get_content(
        business_info.xpath('table[4]/tr/td[2]/text()').extract())

def _fill_icp_info(self, item, icp_info):
    """Fill domain/ICP-record (备案信息) fields from fixed table rows."""
    item['domain_name'] = get_content(
        icp_info[0].xpath('td[2]/text()').extract())
    item['domain_date'] = get_content(
        icp_info[1].xpath('td[2]/text()').extract())
    item['domain_company_type'] = get_content(
        icp_info[2].xpath('td[2]/text()').extract())
    item['domain_company_name'] = get_content(
        icp_info[3].xpath('td[2]/text()').extract())
    item['ICP_number'] = get_content(
        icp_info[4].xpath('td[2]/text()').extract())
    item['ICP_approval_number'] = get_content(
        icp_info[5].xpath('td[2]/text()').extract())

def parse(self, response):
    """Parse a Wangjia platform archive page into a DanganItem.

    The page may carry two "da-ggxx" divs (business info + ICP info),
    or a single one that is either of the two — disambiguated by its
    row count (exactly 6 rows means ICP info).  Fee, contact, and
    qualification sections are mapped through self.map_ch2en.
    """
    self.logger.info('Parsing Wangjia Archive From <%s>.' % response.url)
    item = DanganItem()
    item['pin'] = response.meta.get('pin')
    item['logo_url'] = get_content(
        response.xpath('//div[@class="pt-logo"]/img/@src').extract())
    web_url = get_content(
        response.xpath('//div[@class="on4"]/a[1]/@href').extract())
    # 'javascript:...' pseudo-links are not real homepage URLs.
    if web_url and 'javascript' not in web_url:
        item['web_url'] = web_url
    # The last tag badge, if recognized, marks a problem/product state.
    if response.xpath('//div[@class="bq-box"]')[0].xpath('.//span'):
        tag = get_content(
            response.xpath('//div[@class="bq-box"]')[0].xpath('.//span')
            [-1].xpath('text()').extract())
        if tag in self.problem_label:
            item['product_state'] = tag
    intro = response.xpath('//div[@class="cen-zk"]')
    item['introduction'] = ''.join(
        [get_trunk(c) for c in intro.xpath('.//text()').extract()])
    title_div = response.xpath('//div[@class="title"]')
    item['launch_time'] = get_content(
        title_div.xpath('span[2]/em/text()').extract()).replace(u"上线", '')
    item['product_name'] = get_content(
        title_div.xpath('h1/text()').extract())
    # Location renders as "province·city" or just a province.
    location = get_content(title_div.xpath('span[1]/em/text()').extract())
    if len(location.split(u'·')) > 1:
        item['province'] = location.split(u'·')[0].strip()
        item['city'] = location.split(u'·')[1].strip()
    else:
        item['province'] = location.strip()
    business_icp = response.xpath('//div[@class="da-ggxx"]')
    if business_icp and len(business_icp) > 1:
        # Both sections present: first div is business info, second is ICP.
        business_info = response.xpath('//div[@class="da-ggxx"]')[0]
        if business_info:
            self._fill_business_info(item, business_info)
        icp_info = response.xpath('//div[@class="da-ggxx"]')[1].xpath(
            'table//tr')
        if icp_info:
            self._fill_icp_info(item, icp_info)
    elif business_icp and len(business_icp) == 1:
        # Single section: exactly 6 rows means it is the ICP table,
        # otherwise treat it as business info.
        icp_info = response.xpath('//div[@class="da-ggxx"]')[0].xpath(
            'table//tr')
        if len(icp_info) == 6:
            self._fill_icp_info(item, icp_info)
        else:
            self._fill_business_info(
                item, response.xpath('//div[@class="da-ggxx"]')[0])
    # Platform fees (平台费用): one <dl> per fee kind, in fixed order.
    plat_fee = response.xpath('//div[@class="da-ptfy"]//dl')
    if plat_fee:
        item['account_fee'] = get_content(
            plat_fee[0].xpath('dt/em/text()').extract())
        item['cash_fee'] = get_content(
            plat_fee[1].xpath('dt/em/text()').extract())
        item['fueling_fee'] = get_content(
            plat_fee[2].xpath('dt/em/text()').extract())
        item['transfer_fee'] = get_content(
            plat_fee[3].xpath('dt/em/text()').extract())
        item['vip_fee'] = get_content(
            plat_fee[4].xpath('dt/em/text()').extract())
    # Contact details (联系方式): label/value pairs mapped via map_ch2en.
    contact = response.xpath('//div[@class="da-lxfs zzfwbox"]//dd')
    for ele in contact:
        key = ele.xpath(
            ".//div[@class='l']/em/text()").extract()[0].strip()
        value = get_content(
            ele.xpath(".//div[@class='r']").xpath("string(.)").extract())
        if key in self.map_ch2en:
            item[self.map_ch2en[key]] = value
    # Qualifications / platform services (实力资质 平台服务).
    basic_info = response.xpath("//div[@class='bgbox-bt zzfwbox']//dd")
    for ele in basic_info:
        key = ele.xpath(
            ".//div[@class='l']/em/text()").extract()[0].strip()
        if key in self.map_ch2en:
            # The guarantor entry uses a differently-classed value div.
            if key == u'担保机构':
                value = get_content(
                    ele.xpath(".//div[@class='r dbjg']").xpath(
                        "string(.)").extract())
            else:
                value = get_content(
                    ele.xpath(".//div[@class='r']").xpath(
                        "string(.)").extract())
            item[self.map_ch2en[key]] = value
    return item
def _collect_pairs(self, rows, data, content_p=None):
    """Harvest (label, value) pairs from table rows into ``data``.

    Cells are read with XPath string() so nested markup is flattened.
    When ``content_p`` is given, labels that have no value are collected
    there (they are prose lines embedded in the table).
    """
    for row in rows:
        key = get_content(row.xpath('string(td[1])').extract())
        value = get_content(row.xpath('string(td[2])').extract())
        if key and value:
            data.append((key, value))
        if content_p is not None and key and not value:
            content_p.append(key)

def parse_detail(self, response):
    """Parse a monthly industry-operation (经营情况) report page.

    The key/value table appears under one of several layouts inside
    #zoom; the first matching layout wins.  Harvested labels are then
    mapped onto JingyingItem fields by substring matching.
    """
    report = JingyingItem()
    report['title'] = response.meta['title']
    year, month = self.parse_title(report['title'])
    report['year'] = year
    report['month'] = month
    report['id'] = response.meta['id']
    report['link'] = response.url
    report['created'] = response.meta['created']
    data = list()
    content_p = list()
    content = None
    primary = response.xpath(
        '//*[@id="zoom"]/div/table/tbody/tr/td/div[2]/table/tbody/tr')
    if len(primary) > 0:
        # Primary layout: data table in div[2], article body in div[3].
        self._collect_pairs(primary, data)
        content = response.xpath(
            '//*[@id="zoom"]/div/table/tbody/tr/td/div[3]/table/tbody/tr/td[2]/p'
        )
    else:
        # Fallback layouts, tried in order; the article body is the whole
        # #zoom span for all of them.
        fallbacks = (
            '//*[@id="zoom"]/table/tbody/tr[1]/td/table/tbody/tr',
            '//*[@id="zoom"]/table/tbody/tr',
            '//*[@id="zoom"]/table/tr',
            '//*[@id="zoom"]/div/table/tbody/tr',
        )
        for xp in fallbacks:
            rows = response.xpath(xp)
            if len(rows) > 0:
                break
        else:
            rows = response.xpath('//*[@id="zoom"]/strong/table/tbody/tr')
        self._collect_pairs(rows, data, content_p)
        content = response.xpath('//span[@id="zoom"]')
    # Map harvested labels onto report fields by substring.  Order
    # matters: e.g. u'保户投资' must be tested before the generic u'投资'.
    # ``flag`` records that the expense row was passed; the same label
    # appearing again afterwards goes into the *2 (payout) fields.
    flag = False
    for key, value in data:
        if key.find(u'收入') >= 0:
            report['income'] = value
        elif key.find(u'保户投资') >= 0:
            report['baohu_xz'] = value
        elif key.find(u'独立账户') >= 0:
            report['duli_xz'] = value
        elif key.find(u'给付') >= 0 or key.find(u'赔付支出') >= 0:
            report['expense'] = value
            flag = True
        elif key.find(u'年金缴费') >= 0:
            report['yanglao_cost'] = value
        elif key.find(u'受托') >= 0:
            report['yanglao_shoutuo'] = value
        elif key.find(u'年金投资管理') >= 0:
            report['yanglao_touzi'] = value
        elif key.find(u'业务') >= 0 or key.find(u'营业') >= 0:
            report['manage_fee'] = value
        elif key.find(u'银行存款') >= 0:
            report['bank_deposits'] = value
        elif key.find(u'投资') >= 0:
            report['invest'] = value
        elif key.find(u'资产总额') >= 0:
            report['amount'] = value
        elif key.find(u'财产险') >= 0:
            if flag:
                report['caichanxian2'] = value
            else:
                report['caichanxian1'] = value
        elif key.find(u'人身险') >= 0:
            if flag:
                report['renshenxian2'] = value
            else:
                report['renshenxian1'] = value
        elif key.find(u'寿险') >= 0:
            if flag:
                report['shouxian2'] = value
            else:
                report['shouxian1'] = value
        elif key.find(u'健康险') >= 0:
            if flag:
                report['jiankangxian2'] = value
            else:
                report['jiankangxian1'] = value
        elif key.find(u'意外') >= 0:
            if flag:
                report['yiwaixian2'] = value
            else:
                report['yiwaixian1'] = value
    # Python-2-only ``encoding`` kwarg dropped: 'utf-8' was already the
    # Py2 default, and json.dumps raises TypeError on it under Python 3.
    report['data'] = json.dumps(data, ensure_ascii=False)
    report['raw_content'] = content.extract_first()
    if len(content_p) > 1:
        report['content'] = ' '.join(content_p)
    else:
        # NOTE(review): 'or' inside XPath evaluates as a boolean, not a
        # node-set union ('|' may have been intended).  Kept verbatim to
        # preserve current behavior — confirm against real pages.
        report['content'] = ''.join([
            get_trunk(c) for c in content.xpath(
                './/p/text() or string(span)').extract()
        ])
    report['image_url'] = '#'.join(
        [get_trunk(c)
         for c in content.xpath('.//img/@src').extract()]) or None
    yield report
def parse_detail(self, response):
    """Parse a monthly pension-insurance (养老险) report page.

    Scans every sufficiently large table, reads one company per row
    (4-column layout: contributions only; 7-column layout:
    contributions + assets), and yields a YanglaoxianItem per company.
    """
    title = response.meta['title']
    year, month = self.parse_title(title)
    created = response.meta['created']
    link = response.url
    capital_structure = None
    # Full flattened page text, attached to every yielded item.
    content = ' '.join(
        [get_trunk(c) for c in response.xpath('//p//text()').extract()])
    # jf = 缴费 (contributions), zc = 资产 (assets);
    # shoutuo/touzi/weituo = entrusted / invested / commissioned.
    shoutuo_jf = None
    touzi_jf = None
    weituo_jf = None
    shoutuo_zc = None
    touzi_zc = None
    weituo_zc = None
    for tbody in response.xpath('//tbody'):
        # Only tables with several rows are candidate data tables.
        if len(tbody.xpath('tr')) > 5:
            for tr in tbody.xpath('tr'):
                if len(tr.xpath('td')) == 4:
                    # 4 columns: name, weituo, shoutuo, touzi contributions.
                    name = get_content(tr.xpath('string(td[1])').extract())
                    weituo_jf = get_content(
                        tr.xpath('string(td[2])').extract())
                    shoutuo_jf = get_content(
                        tr.xpath('string(td[3])').extract())
                    touzi_jf = get_content(
                        tr.xpath('string(td[4])').extract())
                elif len(tr.xpath('td')) == 7:
                    # 7 columns: name, then jf and zc triples (note the
                    # column order differs from the 4-column layout).
                    name = get_content(tr.xpath('string(td[1])').extract())
                    shoutuo_jf = get_content(
                        tr.xpath('string(td[2])').extract())
                    touzi_jf = get_content(
                        tr.xpath('string(td[3])').extract())
                    weituo_jf = get_content(
                        tr.xpath('string(td[4])').extract())
                    shoutuo_zc = get_content(
                        tr.xpath('string(td[5])').extract())
                    touzi_zc = get_content(
                        tr.xpath('string(td[6])').extract())
                    weituo_zc = get_content(
                        tr.xpath('string(td[7])').extract())
                else:
                    continue
                # Skip category rows (u'企业') and header rows (u'简称').
                if shoutuo_jf and shoutuo_jf.find(u'企业') >= 0:
                    continue
                if name and name.find(u'简称') < 0:
                    company = YanglaoxianItem()
                    company['title'] = title
                    company['year'] = year
                    company['month'] = month
                    company['link'] = link
                    company['company_name'] = name
                    company['shoutuo_jf'] = shoutuo_jf
                    company['touzi_jf'] = touzi_jf
                    company['weituo_jf'] = weituo_jf
                    company['shoutuo_zc'] = shoutuo_zc
                    company['touzi_zc'] = touzi_zc
                    company['weituo_zc'] = weituo_zc
                    company['content'] = content
                    company['created'] = created
                    yield company